/*
   md.c : Multiple Devices driver for Linux
	  Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#include <linux/module.h>
#include <linux/config.h>
#include <linux/linkage.h>
#include <linux/raid/md.h>
#include <linux/raid/bitmap.h>
#include <linux/sysctl.h>
#include <linux/devfs_fs_kernel.h>
#include <linux/buffer_head.h> /* for invalidate_bdev */
#include <linux/suspend.h>

#include <linux/init.h>

#include <linux/file.h>

#ifdef CONFIG_KMOD
#include <linux/kmod.h>
#endif

#include <asm/unaligned.h>

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER

/* 63 partitions with the alternate major number (mdp) */
#define MdpMinorShift 6

#define DEBUG 0
#define dprintk(x...) ((void)(DEBUG && printk(x)))


#ifndef MODULE
static void autostart_arrays (int part);
#endif

static mdk_personality_t *pers[MAX_PERSONALITY];
static DEFINE_SPINLOCK(pers_lock);

/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
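 *
 * As an illustrative (untested) example, assuming procfs is mounted at
 * /proc, the current limits can be inspected and raised at run time;
 * both values are interpreted in KB/sec:
 *
 *	cat /proc/sys/dev/raid/speed_limit_min
 *	echo 50000 > /proc/sys/dev/raid/speed_limit_min
 *	echo 400000 > /proc/sys/dev/raid/speed_limit_max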
 */

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;

static struct ctl_table_header *raid_table_header;

static ctl_table raid_table[] = {
	{
		.ctl_name	= DEV_RAID_SPEED_LIMIT_MIN,
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= DEV_RAID_SPEED_LIMIT_MAX,
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{ .ctl_name = 0 }
};

static ctl_table raid_dir_table[] = {
	{
		.ctl_name	= DEV_RAID,
		.procname	= "raid",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_table,
	},
	{ .ctl_name = 0 }
};

static ctl_table raid_root_table[] = {
	{
		.ctl_name	= CTL_DEV,
		.procname	= "dev",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_dir_table,
	},
	{ .ctl_name = 0 }
};

static struct block_device_operations md_fops;

/*
 * Enables iteration over all existing md arrays.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);


/*
 * iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop will own a reference
 * to the current mddev and must mddev_put it.
 */
#define ITERATE_MDDEV(mddev,tmp)					\
									\
	for (({ spin_lock(&all_mddevs_lock);				\
		tmp = all_mddevs.next;					\
		mddev = NULL;});					\
	     ({ if (tmp != &all_mddevs)					\
			mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
		spin_unlock(&all_mddevs_lock);				\
		if (mddev) mddev_put(mddev);				\
		mddev = list_entry(tmp, mddev_t, all_mddevs);		\
		tmp != &all_mddevs;});					\
	     ({ spin_lock(&all_mddevs_lock);				\
		tmp = tmp->next;})					\
		)


static int md_fail_request (request_queue_t *q, struct bio *bio)
{
	bio_io_error(bio, bio->bi_size);
	return 0;
}

static inline mddev_t *mddev_get(mddev_t *mddev)
{
	atomic_inc(&mddev->active);
	return mddev;
}

static void mddev_put(mddev_t *mddev)
{
	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
		return;
	if (!mddev->raid_disks && list_empty(&mddev->disks)) {
		list_del(&mddev->all_mddevs);
		blk_put_queue(mddev->queue);
		kfree(mddev);
	}
	spin_unlock(&all_mddevs_lock);
}

static mddev_t * mddev_find(dev_t unit)
{
	mddev_t *mddev, *new = NULL;

 retry:
	spin_lock(&all_mddevs_lock);
	list_for_each_entry(mddev, &all_mddevs, all_mddevs)
		if (mddev->unit == unit) {
			mddev_get(mddev);
			spin_unlock(&all_mddevs_lock);
			kfree(new);
			return mddev;
		}

	if (new) {
		list_add(&new->all_mddevs, &all_mddevs);
		spin_unlock(&all_mddevs_lock);
		return new;
	}
	spin_unlock(&all_mddevs_lock);

	new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return NULL;

	memset(new, 0, sizeof(*new));

	new->unit = unit;
	if (MAJOR(unit) == MD_MAJOR)
		new->md_minor = MINOR(unit);
	else
		new->md_minor = MINOR(unit) >> MdpMinorShift;

	init_MUTEX(&new->reconfig_sem);
	INIT_LIST_HEAD(&new->disks);
	INIT_LIST_HEAD(&new->all_mddevs);
	init_timer(&new->safemode_timer);
atomic_set(&new->active, 1); 226 spin_lock_init(&new->write_lock); 227 init_waitqueue_head(&new->sb_wait); 228 229 new->queue = blk_alloc_queue(GFP_KERNEL); 230 if (!new->queue) { 231 kfree(new); 232 return NULL; 233 } 234 235 blk_queue_make_request(new->queue, md_fail_request); 236 237 goto retry; 238 } 239 240 static inline int mddev_lock(mddev_t * mddev) 241 { 242 return down_interruptible(&mddev->reconfig_sem); 243 } 244 245 static inline void mddev_lock_uninterruptible(mddev_t * mddev) 246 { 247 down(&mddev->reconfig_sem); 248 } 249 250 static inline int mddev_trylock(mddev_t * mddev) 251 { 252 return down_trylock(&mddev->reconfig_sem); 253 } 254 255 static inline void mddev_unlock(mddev_t * mddev) 256 { 257 up(&mddev->reconfig_sem); 258 259 if (mddev->thread) 260 md_wakeup_thread(mddev->thread); 261 } 262 263 mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) 264 { 265 mdk_rdev_t * rdev; 266 struct list_head *tmp; 267 268 ITERATE_RDEV(mddev,rdev,tmp) { 269 if (rdev->desc_nr == nr) 270 return rdev; 271 } 272 return NULL; 273 } 274 275 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) 276 { 277 struct list_head *tmp; 278 mdk_rdev_t *rdev; 279 280 ITERATE_RDEV(mddev,rdev,tmp) { 281 if (rdev->bdev->bd_dev == dev) 282 return rdev; 283 } 284 return NULL; 285 } 286 287 inline static sector_t calc_dev_sboffset(struct block_device *bdev) 288 { 289 sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 290 return MD_NEW_SIZE_BLOCKS(size); 291 } 292 293 static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size) 294 { 295 sector_t size; 296 297 size = rdev->sb_offset; 298 299 if (chunk_size) 300 size &= ~((sector_t)chunk_size/1024 - 1); 301 return size; 302 } 303 304 static int alloc_disk_sb(mdk_rdev_t * rdev) 305 { 306 if (rdev->sb_page) 307 MD_BUG(); 308 309 rdev->sb_page = alloc_page(GFP_KERNEL); 310 if (!rdev->sb_page) { 311 printk(KERN_ALERT "md: out of memory.\n"); 312 return -EINVAL; 313 } 314 315 return 0; 316 } 317 318 static void free_disk_sb(mdk_rdev_t * rdev) 319 { 320 if (rdev->sb_page) { 321 page_cache_release(rdev->sb_page); 322 rdev->sb_loaded = 0; 323 rdev->sb_page = NULL; 324 rdev->sb_offset = 0; 325 rdev->size = 0; 326 } 327 } 328 329 330 static int super_written(struct bio *bio, unsigned int bytes_done, int error) 331 { 332 mdk_rdev_t *rdev = bio->bi_private; 333 if (bio->bi_size) 334 return 1; 335 336 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) 337 md_error(rdev->mddev, rdev); 338 339 if (atomic_dec_and_test(&rdev->mddev->pending_writes)) 340 wake_up(&rdev->mddev->sb_wait); 341 return 0; 342 } 343 344 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 345 sector_t sector, int size, struct page *page) 346 { 347 /* write first size bytes of page to sector of rdev 348 * Increment mddev->pending_writes before returning 349 * and decrement it on completion, waking up sb_wait 350 * if zero is reached. 
351 * If an error occurred, call md_error 352 */ 353 struct bio *bio = bio_alloc(GFP_NOIO, 1); 354 355 bio->bi_bdev = rdev->bdev; 356 bio->bi_sector = sector; 357 bio_add_page(bio, page, size, 0); 358 bio->bi_private = rdev; 359 bio->bi_end_io = super_written; 360 atomic_inc(&mddev->pending_writes); 361 submit_bio((1<<BIO_RW)|(1<<BIO_RW_SYNC), bio); 362 } 363 364 static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) 365 { 366 if (bio->bi_size) 367 return 1; 368 369 complete((struct completion*)bio->bi_private); 370 return 0; 371 } 372 373 int sync_page_io(struct block_device *bdev, sector_t sector, int size, 374 struct page *page, int rw) 375 { 376 struct bio *bio = bio_alloc(GFP_NOIO, 1); 377 struct completion event; 378 int ret; 379 380 rw |= (1 << BIO_RW_SYNC); 381 382 bio->bi_bdev = bdev; 383 bio->bi_sector = sector; 384 bio_add_page(bio, page, size, 0); 385 init_completion(&event); 386 bio->bi_private = &event; 387 bio->bi_end_io = bi_complete; 388 submit_bio(rw, bio); 389 wait_for_completion(&event); 390 391 ret = test_bit(BIO_UPTODATE, &bio->bi_flags); 392 bio_put(bio); 393 return ret; 394 } 395 396 static int read_disk_sb(mdk_rdev_t * rdev) 397 { 398 char b[BDEVNAME_SIZE]; 399 if (!rdev->sb_page) { 400 MD_BUG(); 401 return -EINVAL; 402 } 403 if (rdev->sb_loaded) 404 return 0; 405 406 407 if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) 408 goto fail; 409 rdev->sb_loaded = 1; 410 return 0; 411 412 fail: 413 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", 414 bdevname(rdev->bdev,b)); 415 return -EINVAL; 416 } 417 418 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 419 { 420 if ( (sb1->set_uuid0 == sb2->set_uuid0) && 421 (sb1->set_uuid1 == sb2->set_uuid1) && 422 (sb1->set_uuid2 == sb2->set_uuid2) && 423 (sb1->set_uuid3 == sb2->set_uuid3)) 424 425 return 1; 426 427 return 0; 428 } 429 430 431 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 432 { 433 int ret; 434 mdp_super_t *tmp1, *tmp2; 435 436 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 437 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 438 439 if (!tmp1 || !tmp2) { 440 ret = 0; 441 printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); 442 goto abort; 443 } 444 445 *tmp1 = *sb1; 446 *tmp2 = *sb2; 447 448 /* 449 * nr_disks is not constant 450 */ 451 tmp1->nr_disks = 0; 452 tmp2->nr_disks = 0; 453 454 if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) 455 ret = 0; 456 else 457 ret = 1; 458 459 abort: 460 kfree(tmp1); 461 kfree(tmp2); 462 return ret; 463 } 464 465 static unsigned int calc_sb_csum(mdp_super_t * sb) 466 { 467 unsigned int disk_csum, csum; 468 469 disk_csum = sb->sb_csum; 470 sb->sb_csum = 0; 471 csum = csum_partial((void *)sb, MD_SB_BYTES, 0); 472 sb->sb_csum = disk_csum; 473 return csum; 474 } 475 476 477 /* 478 * Handle superblock details. 479 * We want to be able to handle multiple superblock formats 480 * so we have a common interface to them all, and an array of 481 * different handlers. 482 * We rely on user-space to write the initial superblock, and support 483 * reading and updating of superblocks. 484 * Interface methods are: 485 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) 486 * loads and validates a superblock on dev. 
487 * if refdev != NULL, compare superblocks on both devices 488 * Return: 489 * 0 - dev has a superblock that is compatible with refdev 490 * 1 - dev has a superblock that is compatible and newer than refdev 491 * so dev should be used as the refdev in future 492 * -EINVAL superblock incompatible or invalid 493 * -othererror e.g. -EIO 494 * 495 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) 496 * Verify that dev is acceptable into mddev. 497 * The first time, mddev->raid_disks will be 0, and data from 498 * dev should be merged in. Subsequent calls check that dev 499 * is new enough. Return 0 or -EINVAL 500 * 501 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) 502 * Update the superblock for rdev with data in mddev 503 * This does not write to disc. 504 * 505 */ 506 507 struct super_type { 508 char *name; 509 struct module *owner; 510 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); 511 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); 512 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); 513 }; 514 515 /* 516 * load_super for 0.90.0 517 */ 518 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 519 { 520 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 521 mdp_super_t *sb; 522 int ret; 523 sector_t sb_offset; 524 525 /* 526 * Calculate the position of the superblock, 527 * it's at the end of the disk. 528 * 529 * It also happens to be a multiple of 4Kb. 530 */ 531 sb_offset = calc_dev_sboffset(rdev->bdev); 532 rdev->sb_offset = sb_offset; 533 534 ret = read_disk_sb(rdev); 535 if (ret) return ret; 536 537 ret = -EINVAL; 538 539 bdevname(rdev->bdev, b); 540 sb = (mdp_super_t*)page_address(rdev->sb_page); 541 542 if (sb->md_magic != MD_SB_MAGIC) { 543 printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 544 b); 545 goto abort; 546 } 547 548 if (sb->major_version != 0 || 549 sb->minor_version != 90) { 550 printk(KERN_WARNING "Bad version number %d.%d on %s\n", 551 sb->major_version, sb->minor_version, 552 b); 553 goto abort; 554 } 555 556 if (sb->raid_disks <= 0) 557 goto abort; 558 559 if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) { 560 printk(KERN_WARNING "md: invalid superblock checksum on %s\n", 561 b); 562 goto abort; 563 } 564 565 rdev->preferred_minor = sb->md_minor; 566 rdev->data_offset = 0; 567 568 if (sb->level == LEVEL_MULTIPATH) 569 rdev->desc_nr = -1; 570 else 571 rdev->desc_nr = sb->this_disk.number; 572 573 if (refdev == 0) 574 ret = 1; 575 else { 576 __u64 ev1, ev2; 577 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); 578 if (!uuid_equal(refsb, sb)) { 579 printk(KERN_WARNING "md: %s has different UUID to %s\n", 580 b, bdevname(refdev->bdev,b2)); 581 goto abort; 582 } 583 if (!sb_equal(refsb, sb)) { 584 printk(KERN_WARNING "md: %s has same UUID" 585 " but different superblock to %s\n", 586 b, bdevname(refdev->bdev, b2)); 587 goto abort; 588 } 589 ev1 = md_event(sb); 590 ev2 = md_event(refsb); 591 if (ev1 > ev2) 592 ret = 1; 593 else 594 ret = 0; 595 } 596 rdev->size = calc_dev_size(rdev, sb->chunk_size); 597 598 abort: 599 return ret; 600 } 601 602 /* 603 * validate_super for 0.90.0 604 */ 605 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) 606 { 607 mdp_disk_t *desc; 608 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 609 610 rdev->raid_disk = -1; 611 rdev->in_sync = 0; 612 if (mddev->raid_disks == 0) { 613 mddev->major_version = 0; 614 mddev->minor_version = sb->minor_version; 615 mddev->patch_version = sb->patch_version; 616 
		mddev->persistent = ! sb->not_persistent;
		mddev->chunk_size = sb->chunk_size;
		mddev->ctime = sb->ctime;
		mddev->utime = sb->utime;
		mddev->level = sb->level;
		mddev->layout = sb->layout;
		mddev->raid_disks = sb->raid_disks;
		mddev->size = sb->size;
		mddev->events = md_event(sb);

		if (sb->state & (1<<MD_SB_CLEAN))
			mddev->recovery_cp = MaxSector;
		else {
			if (sb->events_hi == sb->cp_events_hi &&
			    sb->events_lo == sb->cp_events_lo) {
				mddev->recovery_cp = sb->recovery_cp;
			} else
				mddev->recovery_cp = 0;
		}

		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);

		mddev->max_disks = MD_SB_DISKS;

		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
		    mddev->bitmap_file == NULL) {
			if (mddev->level != 1) {
				/* FIXME use a better test */
				printk(KERN_WARNING "md: bitmaps only supported for raid1\n");
				return -EINVAL;
			}
			mddev->bitmap_offset = (MD_SB_BYTES >> 9);
		}

	} else if (mddev->pers == NULL) {
		/* Insist on good event counter while assembling */
		__u64 ev1 = md_event(sb);
		++ev1;
		if (ev1 < mddev->events)
			return -EINVAL;
	} else if (mddev->bitmap) {
		/* if adding to array with a bitmap, then we can accept an
		 * older device ... but not too old.
		 */
		__u64 ev1 = md_event(sb);
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
	} else /* just a hot-add of a new device, leave raid_disk at -1 */
		return 0;

	if (mddev->level != LEVEL_MULTIPATH) {
		rdev->faulty = 0;
		desc = sb->disks + rdev->desc_nr;

		if (desc->state & (1<<MD_DISK_FAULTY))
			rdev->faulty = 1;
		else if (desc->state & (1<<MD_DISK_SYNC) &&
			 desc->raid_disk < mddev->raid_disks) {
			rdev->in_sync = 1;
			rdev->raid_disk = desc->raid_disk;
		}
	} else /* MULTIPATH are always insync */
		rdev->in_sync = 1;
	return 0;
}

/*
 * sync_super for 0.90.0
 */
static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
{
	mdp_super_t *sb;
	struct list_head *tmp;
	mdk_rdev_t *rdev2;
	int next_spare = mddev->raid_disks;

	/* make rdev->sb match mddev data..
	 *
	 * 1/ zero out disks
	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
	 * 3/ any empty disks < next_spare become removed
	 *
	 * disks[0] gets initialised to REMOVED because
	 * we cannot be sure from other fields if it has
	 * been initialised or not.
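	 *
	 * An illustrative (non-normative) example: on an array with
	 * raid_disks == 4 plus one spare, the four in-sync members keep
	 * desc_nr 0..3 (their raid_disk), while the spare is pushed out
	 * to desc_nr 4 by next_spare++ below.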
	 */
	int i;
	int active=0, working=0,failed=0,spare=0,nr_disks=0;

	sb = (mdp_super_t*)page_address(rdev->sb_page);

	memset(sb, 0, sizeof(*sb));

	sb->md_magic = MD_SB_MAGIC;
	sb->major_version = mddev->major_version;
	sb->minor_version = mddev->minor_version;
	sb->patch_version = mddev->patch_version;
	sb->gvalid_words  = 0; /* ignored */
	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
	memcpy(&sb->set_uuid3, mddev->uuid+12,4);

	sb->ctime = mddev->ctime;
	sb->level = mddev->level;
	sb->size  = mddev->size;
	sb->raid_disks = mddev->raid_disks;
	sb->md_minor = mddev->md_minor;
	sb->not_persistent = !mddev->persistent;
	sb->utime = mddev->utime;
	sb->state = 0;
	sb->events_hi = (mddev->events>>32);
	sb->events_lo = (u32)mddev->events;

	if (mddev->in_sync)
	{
		sb->recovery_cp = mddev->recovery_cp;
		sb->cp_events_hi = (mddev->events>>32);
		sb->cp_events_lo = (u32)mddev->events;
		if (mddev->recovery_cp == MaxSector)
			sb->state = (1<< MD_SB_CLEAN);
	} else
		sb->recovery_cp = 0;

	sb->layout = mddev->layout;
	sb->chunk_size = mddev->chunk_size;

	if (mddev->bitmap && mddev->bitmap_file == NULL)
		sb->state |= (1<<MD_SB_BITMAP_PRESENT);

	sb->disks[0].state = (1<<MD_DISK_REMOVED);
	ITERATE_RDEV(mddev,rdev2,tmp) {
		mdp_disk_t *d;
		if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty)
			rdev2->desc_nr = rdev2->raid_disk;
		else
			rdev2->desc_nr = next_spare++;
		d = &sb->disks[rdev2->desc_nr];
		nr_disks++;
		d->number = rdev2->desc_nr;
		d->major = MAJOR(rdev2->bdev->bd_dev);
		d->minor = MINOR(rdev2->bdev->bd_dev);
		if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty)
			d->raid_disk = rdev2->raid_disk;
		else
			d->raid_disk = rdev2->desc_nr; /* compatibility */
		if (rdev2->faulty) {
			d->state = (1<<MD_DISK_FAULTY);
			failed++;
		} else if (rdev2->in_sync) {
			d->state = (1<<MD_DISK_ACTIVE);
			d->state |= (1<<MD_DISK_SYNC);
			active++;
			working++;
		} else {
			d->state = 0;
			spare++;
			working++;
		}
	}

	/* now set the "removed" and "faulty" bits on any missing devices */
	for (i=0 ; i < mddev->raid_disks ; i++) {
		mdp_disk_t *d = &sb->disks[i];
		if (d->state == 0 && d->number == 0) {
			d->number = i;
			d->raid_disk = i;
			d->state = (1<<MD_DISK_REMOVED);
			d->state |= (1<<MD_DISK_FAULTY);
			failed++;
		}
	}
	sb->nr_disks = nr_disks;
	sb->active_disks = active;
	sb->working_disks = working;
	sb->failed_disks = failed;
	sb->spare_disks = spare;

	sb->this_disk = sb->disks[rdev->desc_nr];
	sb->sb_csum = calc_sb_csum(sb);
}

/*
 * version 1 superblock
 */

static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
{
	unsigned int disk_csum, csum;
	unsigned long long newcsum;
	int size = 256 + le32_to_cpu(sb->max_dev)*2;
	unsigned int *isuper = (unsigned int*)sb;
	int i;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
	newcsum = 0;
	for (i=0; size>=4; size -= 4 )
		newcsum += le32_to_cpu(*isuper++);

	if (size == 2)
		newcsum += le16_to_cpu(*(unsigned short*) isuper);

	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
	sb->sb_csum = disk_csum;
	return cpu_to_le32(csum);
}

static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
{
829 struct mdp_superblock_1 *sb; 830 int ret; 831 sector_t sb_offset; 832 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 833 834 /* 835 * Calculate the position of the superblock. 836 * It is always aligned to a 4K boundary and 837 * depeding on minor_version, it can be: 838 * 0: At least 8K, but less than 12K, from end of device 839 * 1: At start of device 840 * 2: 4K from start of device. 841 */ 842 switch(minor_version) { 843 case 0: 844 sb_offset = rdev->bdev->bd_inode->i_size >> 9; 845 sb_offset -= 8*2; 846 sb_offset &= ~(sector_t)(4*2-1); 847 /* convert from sectors to K */ 848 sb_offset /= 2; 849 break; 850 case 1: 851 sb_offset = 0; 852 break; 853 case 2: 854 sb_offset = 4; 855 break; 856 default: 857 return -EINVAL; 858 } 859 rdev->sb_offset = sb_offset; 860 861 ret = read_disk_sb(rdev); 862 if (ret) return ret; 863 864 865 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 866 867 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 868 sb->major_version != cpu_to_le32(1) || 869 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 870 le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || 871 sb->feature_map != 0) 872 return -EINVAL; 873 874 if (calc_sb_1_csum(sb) != sb->sb_csum) { 875 printk("md: invalid superblock checksum on %s\n", 876 bdevname(rdev->bdev,b)); 877 return -EINVAL; 878 } 879 if (le64_to_cpu(sb->data_size) < 10) { 880 printk("md: data_size too small on %s\n", 881 bdevname(rdev->bdev,b)); 882 return -EINVAL; 883 } 884 rdev->preferred_minor = 0xffff; 885 rdev->data_offset = le64_to_cpu(sb->data_offset); 886 887 if (refdev == 0) 888 return 1; 889 else { 890 __u64 ev1, ev2; 891 struct mdp_superblock_1 *refsb = 892 (struct mdp_superblock_1*)page_address(refdev->sb_page); 893 894 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 895 sb->level != refsb->level || 896 sb->layout != refsb->layout || 897 sb->chunksize != refsb->chunksize) { 898 printk(KERN_WARNING "md: %s has strangely different" 899 " superblock to %s\n", 900 bdevname(rdev->bdev,b), 901 bdevname(refdev->bdev,b2)); 902 return -EINVAL; 903 } 904 ev1 = le64_to_cpu(sb->events); 905 ev2 = le64_to_cpu(refsb->events); 906 907 if (ev1 > ev2) 908 return 1; 909 } 910 if (minor_version) 911 rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; 912 else 913 rdev->size = rdev->sb_offset; 914 if (rdev->size < le64_to_cpu(sb->data_size)/2) 915 return -EINVAL; 916 rdev->size = le64_to_cpu(sb->data_size)/2; 917 if (le32_to_cpu(sb->chunksize)) 918 rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); 919 return 0; 920 } 921 922 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) 923 { 924 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 925 926 rdev->raid_disk = -1; 927 rdev->in_sync = 0; 928 if (mddev->raid_disks == 0) { 929 mddev->major_version = 1; 930 mddev->patch_version = 0; 931 mddev->persistent = 1; 932 mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; 933 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); 934 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); 935 mddev->level = le32_to_cpu(sb->level); 936 mddev->layout = le32_to_cpu(sb->layout); 937 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 938 mddev->size = le64_to_cpu(sb->size)/2; 939 mddev->events = le64_to_cpu(sb->events); 940 941 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 942 memcpy(mddev->uuid, sb->set_uuid, 16); 943 944 mddev->max_disks = (4096-256)/2; 945 946 if ((le32_to_cpu(sb->feature_map) & 1) && 947 mddev->bitmap_file == NULL ) { 948 if 
(mddev->level != 1) { 949 printk(KERN_WARNING "md: bitmaps only supported for raid1\n"); 950 return -EINVAL; 951 } 952 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); 953 } 954 } else if (mddev->pers == NULL) { 955 /* Insist of good event counter while assembling */ 956 __u64 ev1 = le64_to_cpu(sb->events); 957 ++ev1; 958 if (ev1 < mddev->events) 959 return -EINVAL; 960 } else if (mddev->bitmap) { 961 /* If adding to array with a bitmap, then we can accept an 962 * older device, but not too old. 963 */ 964 __u64 ev1 = le64_to_cpu(sb->events); 965 if (ev1 < mddev->bitmap->events_cleared) 966 return 0; 967 } else /* just a hot-add of a new device, leave raid_disk at -1 */ 968 return 0; 969 970 if (mddev->level != LEVEL_MULTIPATH) { 971 int role; 972 rdev->desc_nr = le32_to_cpu(sb->dev_number); 973 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 974 switch(role) { 975 case 0xffff: /* spare */ 976 rdev->faulty = 0; 977 break; 978 case 0xfffe: /* faulty */ 979 rdev->faulty = 1; 980 break; 981 default: 982 rdev->in_sync = 1; 983 rdev->faulty = 0; 984 rdev->raid_disk = role; 985 break; 986 } 987 } else /* MULTIPATH are always insync */ 988 rdev->in_sync = 1; 989 990 return 0; 991 } 992 993 static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) 994 { 995 struct mdp_superblock_1 *sb; 996 struct list_head *tmp; 997 mdk_rdev_t *rdev2; 998 int max_dev, i; 999 /* make rdev->sb match mddev and rdev data. */ 1000 1001 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1002 1003 sb->feature_map = 0; 1004 sb->pad0 = 0; 1005 memset(sb->pad1, 0, sizeof(sb->pad1)); 1006 memset(sb->pad2, 0, sizeof(sb->pad2)); 1007 memset(sb->pad3, 0, sizeof(sb->pad3)); 1008 1009 sb->utime = cpu_to_le64((__u64)mddev->utime); 1010 sb->events = cpu_to_le64(mddev->events); 1011 if (mddev->in_sync) 1012 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 1013 else 1014 sb->resync_offset = cpu_to_le64(0); 1015 1016 if (mddev->bitmap && mddev->bitmap_file == NULL) { 1017 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1018 sb->feature_map = cpu_to_le32(1); 1019 } 1020 1021 max_dev = 0; 1022 ITERATE_RDEV(mddev,rdev2,tmp) 1023 if (rdev2->desc_nr+1 > max_dev) 1024 max_dev = rdev2->desc_nr+1; 1025 1026 sb->max_dev = cpu_to_le32(max_dev); 1027 for (i=0; i<max_dev;i++) 1028 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1029 1030 ITERATE_RDEV(mddev,rdev2,tmp) { 1031 i = rdev2->desc_nr; 1032 if (rdev2->faulty) 1033 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1034 else if (rdev2->in_sync) 1035 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1036 else 1037 sb->dev_roles[i] = cpu_to_le16(0xffff); 1038 } 1039 1040 sb->recovery_offset = cpu_to_le64(0); /* not supported yet */ 1041 sb->sb_csum = calc_sb_1_csum(sb); 1042 } 1043 1044 1045 static struct super_type super_types[] = { 1046 [0] = { 1047 .name = "0.90.0", 1048 .owner = THIS_MODULE, 1049 .load_super = super_90_load, 1050 .validate_super = super_90_validate, 1051 .sync_super = super_90_sync, 1052 }, 1053 [1] = { 1054 .name = "md-1", 1055 .owner = THIS_MODULE, 1056 .load_super = super_1_load, 1057 .validate_super = super_1_validate, 1058 .sync_super = super_1_sync, 1059 }, 1060 }; 1061 1062 static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev) 1063 { 1064 struct list_head *tmp; 1065 mdk_rdev_t *rdev; 1066 1067 ITERATE_RDEV(mddev,rdev,tmp) 1068 if (rdev->bdev->bd_contains == dev->bdev->bd_contains) 1069 return rdev; 1070 1071 return NULL; 1072 } 1073 1074 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) 1075 { 1076 struct 
list_head *tmp; 1077 mdk_rdev_t *rdev; 1078 1079 ITERATE_RDEV(mddev1,rdev,tmp) 1080 if (match_dev_unit(mddev2, rdev)) 1081 return 1; 1082 1083 return 0; 1084 } 1085 1086 static LIST_HEAD(pending_raid_disks); 1087 1088 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) 1089 { 1090 mdk_rdev_t *same_pdev; 1091 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1092 1093 if (rdev->mddev) { 1094 MD_BUG(); 1095 return -EINVAL; 1096 } 1097 same_pdev = match_dev_unit(mddev, rdev); 1098 if (same_pdev) 1099 printk(KERN_WARNING 1100 "%s: WARNING: %s appears to be on the same physical" 1101 " disk as %s. True\n protection against single-disk" 1102 " failure might be compromised.\n", 1103 mdname(mddev), bdevname(rdev->bdev,b), 1104 bdevname(same_pdev->bdev,b2)); 1105 1106 /* Verify rdev->desc_nr is unique. 1107 * If it is -1, assign a free number, else 1108 * check number is not in use 1109 */ 1110 if (rdev->desc_nr < 0) { 1111 int choice = 0; 1112 if (mddev->pers) choice = mddev->raid_disks; 1113 while (find_rdev_nr(mddev, choice)) 1114 choice++; 1115 rdev->desc_nr = choice; 1116 } else { 1117 if (find_rdev_nr(mddev, rdev->desc_nr)) 1118 return -EBUSY; 1119 } 1120 1121 list_add(&rdev->same_set, &mddev->disks); 1122 rdev->mddev = mddev; 1123 printk(KERN_INFO "md: bind<%s>\n", bdevname(rdev->bdev,b)); 1124 return 0; 1125 } 1126 1127 static void unbind_rdev_from_array(mdk_rdev_t * rdev) 1128 { 1129 char b[BDEVNAME_SIZE]; 1130 if (!rdev->mddev) { 1131 MD_BUG(); 1132 return; 1133 } 1134 list_del_init(&rdev->same_set); 1135 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 1136 rdev->mddev = NULL; 1137 } 1138 1139 /* 1140 * prevent the device from being mounted, repartitioned or 1141 * otherwise reused by a RAID array (or any other kernel 1142 * subsystem), by bd_claiming the device. 
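 *
 * Note: the rdev pointer itself is used as the bd_claim() holder, so
 * the matching bd_release() in unlock_rdev() below drops exactly the
 * claim taken here.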
1143 */ 1144 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) 1145 { 1146 int err = 0; 1147 struct block_device *bdev; 1148 char b[BDEVNAME_SIZE]; 1149 1150 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 1151 if (IS_ERR(bdev)) { 1152 printk(KERN_ERR "md: could not open %s.\n", 1153 __bdevname(dev, b)); 1154 return PTR_ERR(bdev); 1155 } 1156 err = bd_claim(bdev, rdev); 1157 if (err) { 1158 printk(KERN_ERR "md: could not bd_claim %s.\n", 1159 bdevname(bdev, b)); 1160 blkdev_put(bdev); 1161 return err; 1162 } 1163 rdev->bdev = bdev; 1164 return err; 1165 } 1166 1167 static void unlock_rdev(mdk_rdev_t *rdev) 1168 { 1169 struct block_device *bdev = rdev->bdev; 1170 rdev->bdev = NULL; 1171 if (!bdev) 1172 MD_BUG(); 1173 bd_release(bdev); 1174 blkdev_put(bdev); 1175 } 1176 1177 void md_autodetect_dev(dev_t dev); 1178 1179 static void export_rdev(mdk_rdev_t * rdev) 1180 { 1181 char b[BDEVNAME_SIZE]; 1182 printk(KERN_INFO "md: export_rdev(%s)\n", 1183 bdevname(rdev->bdev,b)); 1184 if (rdev->mddev) 1185 MD_BUG(); 1186 free_disk_sb(rdev); 1187 list_del_init(&rdev->same_set); 1188 #ifndef MODULE 1189 md_autodetect_dev(rdev->bdev->bd_dev); 1190 #endif 1191 unlock_rdev(rdev); 1192 kfree(rdev); 1193 } 1194 1195 static void kick_rdev_from_array(mdk_rdev_t * rdev) 1196 { 1197 unbind_rdev_from_array(rdev); 1198 export_rdev(rdev); 1199 } 1200 1201 static void export_array(mddev_t *mddev) 1202 { 1203 struct list_head *tmp; 1204 mdk_rdev_t *rdev; 1205 1206 ITERATE_RDEV(mddev,rdev,tmp) { 1207 if (!rdev->mddev) { 1208 MD_BUG(); 1209 continue; 1210 } 1211 kick_rdev_from_array(rdev); 1212 } 1213 if (!list_empty(&mddev->disks)) 1214 MD_BUG(); 1215 mddev->raid_disks = 0; 1216 mddev->major_version = 0; 1217 } 1218 1219 static void print_desc(mdp_disk_t *desc) 1220 { 1221 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, 1222 desc->major,desc->minor,desc->raid_disk,desc->state); 1223 } 1224 1225 static void print_sb(mdp_super_t *sb) 1226 { 1227 int i; 1228 1229 printk(KERN_INFO 1230 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", 1231 sb->major_version, sb->minor_version, sb->patch_version, 1232 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, 1233 sb->ctime); 1234 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", 1235 sb->level, sb->size, sb->nr_disks, sb->raid_disks, 1236 sb->md_minor, sb->layout, sb->chunk_size); 1237 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" 1238 " FD:%d SD:%d CSUM:%08x E:%08lx\n", 1239 sb->utime, sb->state, sb->active_disks, sb->working_disks, 1240 sb->failed_disks, sb->spare_disks, 1241 sb->sb_csum, (unsigned long)sb->events_lo); 1242 1243 printk(KERN_INFO); 1244 for (i = 0; i < MD_SB_DISKS; i++) { 1245 mdp_disk_t *desc; 1246 1247 desc = sb->disks + i; 1248 if (desc->number || desc->major || desc->minor || 1249 desc->raid_disk || (desc->state && (desc->state != 4))) { 1250 printk(" D %2d: ", i); 1251 print_desc(desc); 1252 } 1253 } 1254 printk(KERN_INFO "md: THIS: "); 1255 print_desc(&sb->this_disk); 1256 1257 } 1258 1259 static void print_rdev(mdk_rdev_t *rdev) 1260 { 1261 char b[BDEVNAME_SIZE]; 1262 printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", 1263 bdevname(rdev->bdev,b), (unsigned long long)rdev->size, 1264 rdev->faulty, rdev->in_sync, rdev->desc_nr); 1265 if (rdev->sb_loaded) { 1266 printk(KERN_INFO "md: rdev superblock:\n"); 1267 print_sb((mdp_super_t*)page_address(rdev->sb_page)); 1268 } else 1269 printk(KERN_INFO "md: no rdev superblock!\n"); 1270 } 1271 1272 void md_print_devices(void) 1273 { 1274 struct list_head 
*tmp, *tmp2; 1275 mdk_rdev_t *rdev; 1276 mddev_t *mddev; 1277 char b[BDEVNAME_SIZE]; 1278 1279 printk("\n"); 1280 printk("md: **********************************\n"); 1281 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); 1282 printk("md: **********************************\n"); 1283 ITERATE_MDDEV(mddev,tmp) { 1284 1285 if (mddev->bitmap) 1286 bitmap_print_sb(mddev->bitmap); 1287 else 1288 printk("%s: ", mdname(mddev)); 1289 ITERATE_RDEV(mddev,rdev,tmp2) 1290 printk("<%s>", bdevname(rdev->bdev,b)); 1291 printk("\n"); 1292 1293 ITERATE_RDEV(mddev,rdev,tmp2) 1294 print_rdev(rdev); 1295 } 1296 printk("md: **********************************\n"); 1297 printk("\n"); 1298 } 1299 1300 1301 static void sync_sbs(mddev_t * mddev) 1302 { 1303 mdk_rdev_t *rdev; 1304 struct list_head *tmp; 1305 1306 ITERATE_RDEV(mddev,rdev,tmp) { 1307 super_types[mddev->major_version]. 1308 sync_super(mddev, rdev); 1309 rdev->sb_loaded = 1; 1310 } 1311 } 1312 1313 static void md_update_sb(mddev_t * mddev) 1314 { 1315 int err; 1316 struct list_head *tmp; 1317 mdk_rdev_t *rdev; 1318 int sync_req; 1319 1320 repeat: 1321 spin_lock(&mddev->write_lock); 1322 sync_req = mddev->in_sync; 1323 mddev->utime = get_seconds(); 1324 mddev->events ++; 1325 1326 if (!mddev->events) { 1327 /* 1328 * oops, this 64-bit counter should never wrap. 1329 * Either we are in around ~1 trillion A.C., assuming 1330 * 1 reboot per second, or we have a bug: 1331 */ 1332 MD_BUG(); 1333 mddev->events --; 1334 } 1335 mddev->sb_dirty = 2; 1336 sync_sbs(mddev); 1337 1338 /* 1339 * do not write anything to disk if using 1340 * nonpersistent superblocks 1341 */ 1342 if (!mddev->persistent) { 1343 mddev->sb_dirty = 0; 1344 spin_unlock(&mddev->write_lock); 1345 wake_up(&mddev->sb_wait); 1346 return; 1347 } 1348 spin_unlock(&mddev->write_lock); 1349 1350 dprintk(KERN_INFO 1351 "md: updating %s RAID superblock on device (in sync %d)\n", 1352 mdname(mddev),mddev->in_sync); 1353 1354 err = bitmap_update_sb(mddev->bitmap); 1355 ITERATE_RDEV(mddev,rdev,tmp) { 1356 char b[BDEVNAME_SIZE]; 1357 dprintk(KERN_INFO "md: "); 1358 if (rdev->faulty) 1359 dprintk("(skipping faulty "); 1360 1361 dprintk("%s ", bdevname(rdev->bdev,b)); 1362 if (!rdev->faulty) { 1363 md_super_write(mddev,rdev, 1364 rdev->sb_offset<<1, MD_SB_BYTES, 1365 rdev->sb_page); 1366 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", 1367 bdevname(rdev->bdev,b), 1368 (unsigned long long)rdev->sb_offset); 1369 1370 } else 1371 dprintk(")\n"); 1372 if (mddev->level == LEVEL_MULTIPATH) 1373 /* only need to write one superblock... */ 1374 break; 1375 } 1376 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); 1377 /* if there was a failure, sb_dirty was set to 1, and we re-write super */ 1378 1379 spin_lock(&mddev->write_lock); 1380 if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) { 1381 /* have to write it out again */ 1382 spin_unlock(&mddev->write_lock); 1383 goto repeat; 1384 } 1385 mddev->sb_dirty = 0; 1386 spin_unlock(&mddev->write_lock); 1387 wake_up(&mddev->sb_wait); 1388 1389 } 1390 1391 /* 1392 * Import a device. If 'super_format' >= 0, then sanity check the superblock 1393 * 1394 * mark the device faulty if: 1395 * 1396 * - the device is nonexistent (zero size) 1397 * - the device has no valid superblock 1398 * 1399 * a faulty rdev _never_ has rdev->sb set. 
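 *
 * Typical use, as in autostart_array() and add_new_disk() below
 * (shown here only as a sketch):
 *
 *	rdev = md_import_device(dev, 0, 0);
 *	if (IS_ERR(rdev))
 *		return PTR_ERR(rdev);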
1400 */ 1401 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) 1402 { 1403 char b[BDEVNAME_SIZE]; 1404 int err; 1405 mdk_rdev_t *rdev; 1406 sector_t size; 1407 1408 rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); 1409 if (!rdev) { 1410 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 1411 return ERR_PTR(-ENOMEM); 1412 } 1413 memset(rdev, 0, sizeof(*rdev)); 1414 1415 if ((err = alloc_disk_sb(rdev))) 1416 goto abort_free; 1417 1418 err = lock_rdev(rdev, newdev); 1419 if (err) 1420 goto abort_free; 1421 1422 rdev->desc_nr = -1; 1423 rdev->faulty = 0; 1424 rdev->in_sync = 0; 1425 rdev->data_offset = 0; 1426 atomic_set(&rdev->nr_pending, 0); 1427 1428 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 1429 if (!size) { 1430 printk(KERN_WARNING 1431 "md: %s has zero or unknown size, marking faulty!\n", 1432 bdevname(rdev->bdev,b)); 1433 err = -EINVAL; 1434 goto abort_free; 1435 } 1436 1437 if (super_format >= 0) { 1438 err = super_types[super_format]. 1439 load_super(rdev, NULL, super_minor); 1440 if (err == -EINVAL) { 1441 printk(KERN_WARNING 1442 "md: %s has invalid sb, not importing!\n", 1443 bdevname(rdev->bdev,b)); 1444 goto abort_free; 1445 } 1446 if (err < 0) { 1447 printk(KERN_WARNING 1448 "md: could not read %s's sb, not importing!\n", 1449 bdevname(rdev->bdev,b)); 1450 goto abort_free; 1451 } 1452 } 1453 INIT_LIST_HEAD(&rdev->same_set); 1454 1455 return rdev; 1456 1457 abort_free: 1458 if (rdev->sb_page) { 1459 if (rdev->bdev) 1460 unlock_rdev(rdev); 1461 free_disk_sb(rdev); 1462 } 1463 kfree(rdev); 1464 return ERR_PTR(err); 1465 } 1466 1467 /* 1468 * Check a full RAID array for plausibility 1469 */ 1470 1471 1472 static void analyze_sbs(mddev_t * mddev) 1473 { 1474 int i; 1475 struct list_head *tmp; 1476 mdk_rdev_t *rdev, *freshest; 1477 char b[BDEVNAME_SIZE]; 1478 1479 freshest = NULL; 1480 ITERATE_RDEV(mddev,rdev,tmp) 1481 switch (super_types[mddev->major_version]. 1482 load_super(rdev, freshest, mddev->minor_version)) { 1483 case 1: 1484 freshest = rdev; 1485 break; 1486 case 0: 1487 break; 1488 default: 1489 printk( KERN_ERR \ 1490 "md: fatal superblock inconsistency in %s" 1491 " -- removing from array\n", 1492 bdevname(rdev->bdev,b)); 1493 kick_rdev_from_array(rdev); 1494 } 1495 1496 1497 super_types[mddev->major_version]. 1498 validate_super(mddev, freshest); 1499 1500 i = 0; 1501 ITERATE_RDEV(mddev,rdev,tmp) { 1502 if (rdev != freshest) 1503 if (super_types[mddev->major_version]. 1504 validate_super(mddev, rdev)) { 1505 printk(KERN_WARNING "md: kicking non-fresh %s" 1506 " from array!\n", 1507 bdevname(rdev->bdev,b)); 1508 kick_rdev_from_array(rdev); 1509 continue; 1510 } 1511 if (mddev->level == LEVEL_MULTIPATH) { 1512 rdev->desc_nr = i++; 1513 rdev->raid_disk = rdev->desc_nr; 1514 rdev->in_sync = 1; 1515 } 1516 } 1517 1518 1519 1520 if (mddev->recovery_cp != MaxSector && 1521 mddev->level >= 1) 1522 printk(KERN_ERR "md: %s: raid array is not clean" 1523 " -- starting background reconstruction\n", 1524 mdname(mddev)); 1525 1526 } 1527 1528 int mdp_major = 0; 1529 1530 static struct kobject *md_probe(dev_t dev, int *part, void *data) 1531 { 1532 static DECLARE_MUTEX(disks_sem); 1533 mddev_t *mddev = mddev_find(dev); 1534 struct gendisk *disk; 1535 int partitioned = (MAJOR(dev) != MD_MAJOR); 1536 int shift = partitioned ? 
MdpMinorShift : 0; 1537 int unit = MINOR(dev) >> shift; 1538 1539 if (!mddev) 1540 return NULL; 1541 1542 down(&disks_sem); 1543 if (mddev->gendisk) { 1544 up(&disks_sem); 1545 mddev_put(mddev); 1546 return NULL; 1547 } 1548 disk = alloc_disk(1 << shift); 1549 if (!disk) { 1550 up(&disks_sem); 1551 mddev_put(mddev); 1552 return NULL; 1553 } 1554 disk->major = MAJOR(dev); 1555 disk->first_minor = unit << shift; 1556 if (partitioned) { 1557 sprintf(disk->disk_name, "md_d%d", unit); 1558 sprintf(disk->devfs_name, "md/d%d", unit); 1559 } else { 1560 sprintf(disk->disk_name, "md%d", unit); 1561 sprintf(disk->devfs_name, "md/%d", unit); 1562 } 1563 disk->fops = &md_fops; 1564 disk->private_data = mddev; 1565 disk->queue = mddev->queue; 1566 add_disk(disk); 1567 mddev->gendisk = disk; 1568 up(&disks_sem); 1569 return NULL; 1570 } 1571 1572 void md_wakeup_thread(mdk_thread_t *thread); 1573 1574 static void md_safemode_timeout(unsigned long data) 1575 { 1576 mddev_t *mddev = (mddev_t *) data; 1577 1578 mddev->safemode = 1; 1579 md_wakeup_thread(mddev->thread); 1580 } 1581 1582 1583 static int do_md_run(mddev_t * mddev) 1584 { 1585 int pnum, err; 1586 int chunk_size; 1587 struct list_head *tmp; 1588 mdk_rdev_t *rdev; 1589 struct gendisk *disk; 1590 char b[BDEVNAME_SIZE]; 1591 1592 if (list_empty(&mddev->disks)) 1593 /* cannot run an array with no devices.. */ 1594 return -EINVAL; 1595 1596 if (mddev->pers) 1597 return -EBUSY; 1598 1599 /* 1600 * Analyze all RAID superblock(s) 1601 */ 1602 if (!mddev->raid_disks) 1603 analyze_sbs(mddev); 1604 1605 chunk_size = mddev->chunk_size; 1606 pnum = level_to_pers(mddev->level); 1607 1608 if ((pnum != MULTIPATH) && (pnum != RAID1)) { 1609 if (!chunk_size) { 1610 /* 1611 * 'default chunksize' in the old md code used to 1612 * be PAGE_SIZE, baaad. 1613 * we abort here to be on the safe side. We don't 1614 * want to continue the bad practice. 1615 */ 1616 printk(KERN_ERR 1617 "no chunksize specified, see 'man raidtab'\n"); 1618 return -EINVAL; 1619 } 1620 if (chunk_size > MAX_CHUNK_SIZE) { 1621 printk(KERN_ERR "too big chunk_size: %d > %d\n", 1622 chunk_size, MAX_CHUNK_SIZE); 1623 return -EINVAL; 1624 } 1625 /* 1626 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE 1627 */ 1628 if ( (1 << ffz(~chunk_size)) != chunk_size) { 1629 printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size); 1630 return -EINVAL; 1631 } 1632 if (chunk_size < PAGE_SIZE) { 1633 printk(KERN_ERR "too small chunk_size: %d < %ld\n", 1634 chunk_size, PAGE_SIZE); 1635 return -EINVAL; 1636 } 1637 1638 /* devices must have minimum size of one chunk */ 1639 ITERATE_RDEV(mddev,rdev,tmp) { 1640 if (rdev->faulty) 1641 continue; 1642 if (rdev->size < chunk_size / 1024) { 1643 printk(KERN_WARNING 1644 "md: Dev %s smaller than chunk_size:" 1645 " %lluk < %dk\n", 1646 bdevname(rdev->bdev,b), 1647 (unsigned long long)rdev->size, 1648 chunk_size / 1024); 1649 return -EINVAL; 1650 } 1651 } 1652 } 1653 1654 #ifdef CONFIG_KMOD 1655 if (!pers[pnum]) 1656 { 1657 request_module("md-personality-%d", pnum); 1658 } 1659 #endif 1660 1661 /* 1662 * Drop all container device buffers, from now on 1663 * the only valid external interface is through the md 1664 * device. 
	 * Also find largest hardsector size
	 */
	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->faulty)
			continue;
		sync_blockdev(rdev->bdev);
		invalidate_bdev(rdev->bdev, 0);
	}

	md_probe(mddev->unit, NULL, NULL);
	disk = mddev->gendisk;
	if (!disk)
		return -ENOMEM;

	spin_lock(&pers_lock);
	if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) {
		spin_unlock(&pers_lock);
		printk(KERN_WARNING "md: personality %d is not loaded!\n",
		       pnum);
		return -EINVAL;
	}

	mddev->pers = pers[pnum];
	spin_unlock(&pers_lock);

	mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */

	/* before we start the array running, initialise the bitmap */
	err = bitmap_create(mddev);
	if (err)
		printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
		       mdname(mddev), err);
	else
		err = mddev->pers->run(mddev);
	if (err) {
		printk(KERN_ERR "md: pers->run() failed ...\n");
		module_put(mddev->pers->owner);
		mddev->pers = NULL;
		bitmap_destroy(mddev);
		return err;
	}
	atomic_set(&mddev->writes_pending,0);
	mddev->safemode = 0;
	mddev->safemode_timer.function = md_safemode_timeout;
	mddev->safemode_timer.data = (unsigned long) mddev;
	mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */
	mddev->in_sync = 1;

	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);

	if (mddev->sb_dirty)
		md_update_sb(mddev);

	set_capacity(disk, mddev->array_size<<1);

	/* If we call blk_queue_make_request here, it will
	 * re-initialise max_sectors etc which may have been
	 * refined inside ->run().  So just set the bits we need to set.
	 * Most initialisation happened when we called
	 * blk_queue_make_request(..., md_fail_request)
	 * earlier.
1726 */ 1727 mddev->queue->queuedata = mddev; 1728 mddev->queue->make_request_fn = mddev->pers->make_request; 1729 1730 mddev->changed = 1; 1731 return 0; 1732 } 1733 1734 static int restart_array(mddev_t *mddev) 1735 { 1736 struct gendisk *disk = mddev->gendisk; 1737 int err; 1738 1739 /* 1740 * Complain if it has no devices 1741 */ 1742 err = -ENXIO; 1743 if (list_empty(&mddev->disks)) 1744 goto out; 1745 1746 if (mddev->pers) { 1747 err = -EBUSY; 1748 if (!mddev->ro) 1749 goto out; 1750 1751 mddev->safemode = 0; 1752 mddev->ro = 0; 1753 set_disk_ro(disk, 0); 1754 1755 printk(KERN_INFO "md: %s switched to read-write mode.\n", 1756 mdname(mddev)); 1757 /* 1758 * Kick recovery or resync if necessary 1759 */ 1760 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 1761 md_wakeup_thread(mddev->thread); 1762 err = 0; 1763 } else { 1764 printk(KERN_ERR "md: %s has no personality assigned.\n", 1765 mdname(mddev)); 1766 err = -EINVAL; 1767 } 1768 1769 out: 1770 return err; 1771 } 1772 1773 static int do_md_stop(mddev_t * mddev, int ro) 1774 { 1775 int err = 0; 1776 struct gendisk *disk = mddev->gendisk; 1777 1778 if (mddev->pers) { 1779 if (atomic_read(&mddev->active)>2) { 1780 printk("md: %s still in use.\n",mdname(mddev)); 1781 return -EBUSY; 1782 } 1783 1784 if (mddev->sync_thread) { 1785 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1786 md_unregister_thread(mddev->sync_thread); 1787 mddev->sync_thread = NULL; 1788 } 1789 1790 del_timer_sync(&mddev->safemode_timer); 1791 1792 invalidate_partition(disk, 0); 1793 1794 if (ro) { 1795 err = -ENXIO; 1796 if (mddev->ro) 1797 goto out; 1798 mddev->ro = 1; 1799 } else { 1800 if (mddev->ro) 1801 set_disk_ro(disk, 0); 1802 blk_queue_make_request(mddev->queue, md_fail_request); 1803 mddev->pers->stop(mddev); 1804 module_put(mddev->pers->owner); 1805 mddev->pers = NULL; 1806 if (mddev->ro) 1807 mddev->ro = 0; 1808 } 1809 if (!mddev->in_sync) { 1810 /* mark array as shutdown cleanly */ 1811 mddev->in_sync = 1; 1812 md_update_sb(mddev); 1813 } 1814 if (ro) 1815 set_disk_ro(disk, 1); 1816 } 1817 1818 bitmap_destroy(mddev); 1819 if (mddev->bitmap_file) { 1820 atomic_set(&mddev->bitmap_file->f_dentry->d_inode->i_writecount, 1); 1821 fput(mddev->bitmap_file); 1822 mddev->bitmap_file = NULL; 1823 } 1824 1825 /* 1826 * Free resources if final stop 1827 */ 1828 if (!ro) { 1829 struct gendisk *disk; 1830 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 1831 1832 export_array(mddev); 1833 1834 mddev->array_size = 0; 1835 disk = mddev->gendisk; 1836 if (disk) 1837 set_capacity(disk, 0); 1838 mddev->changed = 1; 1839 } else 1840 printk(KERN_INFO "md: %s switched to read-only mode.\n", 1841 mdname(mddev)); 1842 err = 0; 1843 out: 1844 return err; 1845 } 1846 1847 static void autorun_array(mddev_t *mddev) 1848 { 1849 mdk_rdev_t *rdev; 1850 struct list_head *tmp; 1851 int err; 1852 1853 if (list_empty(&mddev->disks)) 1854 return; 1855 1856 printk(KERN_INFO "md: running: "); 1857 1858 ITERATE_RDEV(mddev,rdev,tmp) { 1859 char b[BDEVNAME_SIZE]; 1860 printk("<%s>", bdevname(rdev->bdev,b)); 1861 } 1862 printk("\n"); 1863 1864 err = do_md_run (mddev); 1865 if (err) { 1866 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 1867 do_md_stop (mddev, 0); 1868 } 1869 } 1870 1871 /* 1872 * lets try to run arrays based on all disks that have arrived 1873 * until now. 
(those are in pending_raid_disks) 1874 * 1875 * the method: pick the first pending disk, collect all disks with 1876 * the same UUID, remove all from the pending list and put them into 1877 * the 'same_array' list. Then order this list based on superblock 1878 * update time (freshest comes first), kick out 'old' disks and 1879 * compare superblocks. If everything's fine then run it. 1880 * 1881 * If "unit" is allocated, then bump its reference count 1882 */ 1883 static void autorun_devices(int part) 1884 { 1885 struct list_head candidates; 1886 struct list_head *tmp; 1887 mdk_rdev_t *rdev0, *rdev; 1888 mddev_t *mddev; 1889 char b[BDEVNAME_SIZE]; 1890 1891 printk(KERN_INFO "md: autorun ...\n"); 1892 while (!list_empty(&pending_raid_disks)) { 1893 dev_t dev; 1894 rdev0 = list_entry(pending_raid_disks.next, 1895 mdk_rdev_t, same_set); 1896 1897 printk(KERN_INFO "md: considering %s ...\n", 1898 bdevname(rdev0->bdev,b)); 1899 INIT_LIST_HEAD(&candidates); 1900 ITERATE_RDEV_PENDING(rdev,tmp) 1901 if (super_90_load(rdev, rdev0, 0) >= 0) { 1902 printk(KERN_INFO "md: adding %s ...\n", 1903 bdevname(rdev->bdev,b)); 1904 list_move(&rdev->same_set, &candidates); 1905 } 1906 /* 1907 * now we have a set of devices, with all of them having 1908 * mostly sane superblocks. It's time to allocate the 1909 * mddev. 1910 */ 1911 if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) { 1912 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 1913 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 1914 break; 1915 } 1916 if (part) 1917 dev = MKDEV(mdp_major, 1918 rdev0->preferred_minor << MdpMinorShift); 1919 else 1920 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 1921 1922 md_probe(dev, NULL, NULL); 1923 mddev = mddev_find(dev); 1924 if (!mddev) { 1925 printk(KERN_ERR 1926 "md: cannot allocate memory for md drive.\n"); 1927 break; 1928 } 1929 if (mddev_lock(mddev)) 1930 printk(KERN_WARNING "md: %s locked, cannot run\n", 1931 mdname(mddev)); 1932 else if (mddev->raid_disks || mddev->major_version 1933 || !list_empty(&mddev->disks)) { 1934 printk(KERN_WARNING 1935 "md: %s already running, cannot run %s\n", 1936 mdname(mddev), bdevname(rdev0->bdev,b)); 1937 mddev_unlock(mddev); 1938 } else { 1939 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 1940 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { 1941 list_del_init(&rdev->same_set); 1942 if (bind_rdev_to_array(rdev, mddev)) 1943 export_rdev(rdev); 1944 } 1945 autorun_array(mddev); 1946 mddev_unlock(mddev); 1947 } 1948 /* on success, candidates will be empty, on error 1949 * it won't... 1950 */ 1951 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) 1952 export_rdev(rdev); 1953 mddev_put(mddev); 1954 } 1955 printk(KERN_INFO "md: ... autorun DONE.\n"); 1956 } 1957 1958 /* 1959 * import RAID devices based on one partition 1960 * if possible, the array gets run as well. 
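 *
 * The devices collected here are queued on pending_raid_disks and then
 * handed to autorun_devices(0), which groups them by UUID and tries to
 * assemble and run each resulting set (see above).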
1961 */ 1962 1963 static int autostart_array(dev_t startdev) 1964 { 1965 char b[BDEVNAME_SIZE]; 1966 int err = -EINVAL, i; 1967 mdp_super_t *sb = NULL; 1968 mdk_rdev_t *start_rdev = NULL, *rdev; 1969 1970 start_rdev = md_import_device(startdev, 0, 0); 1971 if (IS_ERR(start_rdev)) 1972 return err; 1973 1974 1975 /* NOTE: this can only work for 0.90.0 superblocks */ 1976 sb = (mdp_super_t*)page_address(start_rdev->sb_page); 1977 if (sb->major_version != 0 || 1978 sb->minor_version != 90 ) { 1979 printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n"); 1980 export_rdev(start_rdev); 1981 return err; 1982 } 1983 1984 if (start_rdev->faulty) { 1985 printk(KERN_WARNING 1986 "md: can not autostart based on faulty %s!\n", 1987 bdevname(start_rdev->bdev,b)); 1988 export_rdev(start_rdev); 1989 return err; 1990 } 1991 list_add(&start_rdev->same_set, &pending_raid_disks); 1992 1993 for (i = 0; i < MD_SB_DISKS; i++) { 1994 mdp_disk_t *desc = sb->disks + i; 1995 dev_t dev = MKDEV(desc->major, desc->minor); 1996 1997 if (!dev) 1998 continue; 1999 if (dev == startdev) 2000 continue; 2001 if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor) 2002 continue; 2003 rdev = md_import_device(dev, 0, 0); 2004 if (IS_ERR(rdev)) 2005 continue; 2006 2007 list_add(&rdev->same_set, &pending_raid_disks); 2008 } 2009 2010 /* 2011 * possibly return codes 2012 */ 2013 autorun_devices(0); 2014 return 0; 2015 2016 } 2017 2018 2019 static int get_version(void __user * arg) 2020 { 2021 mdu_version_t ver; 2022 2023 ver.major = MD_MAJOR_VERSION; 2024 ver.minor = MD_MINOR_VERSION; 2025 ver.patchlevel = MD_PATCHLEVEL_VERSION; 2026 2027 if (copy_to_user(arg, &ver, sizeof(ver))) 2028 return -EFAULT; 2029 2030 return 0; 2031 } 2032 2033 static int get_array_info(mddev_t * mddev, void __user * arg) 2034 { 2035 mdu_array_info_t info; 2036 int nr,working,active,failed,spare; 2037 mdk_rdev_t *rdev; 2038 struct list_head *tmp; 2039 2040 nr=working=active=failed=spare=0; 2041 ITERATE_RDEV(mddev,rdev,tmp) { 2042 nr++; 2043 if (rdev->faulty) 2044 failed++; 2045 else { 2046 working++; 2047 if (rdev->in_sync) 2048 active++; 2049 else 2050 spare++; 2051 } 2052 } 2053 2054 info.major_version = mddev->major_version; 2055 info.minor_version = mddev->minor_version; 2056 info.patch_version = MD_PATCHLEVEL_VERSION; 2057 info.ctime = mddev->ctime; 2058 info.level = mddev->level; 2059 info.size = mddev->size; 2060 info.nr_disks = nr; 2061 info.raid_disks = mddev->raid_disks; 2062 info.md_minor = mddev->md_minor; 2063 info.not_persistent= !mddev->persistent; 2064 2065 info.utime = mddev->utime; 2066 info.state = 0; 2067 if (mddev->in_sync) 2068 info.state = (1<<MD_SB_CLEAN); 2069 info.active_disks = active; 2070 info.working_disks = working; 2071 info.failed_disks = failed; 2072 info.spare_disks = spare; 2073 2074 info.layout = mddev->layout; 2075 info.chunk_size = mddev->chunk_size; 2076 2077 if (copy_to_user(arg, &info, sizeof(info))) 2078 return -EFAULT; 2079 2080 return 0; 2081 } 2082 2083 static int get_bitmap_file(mddev_t * mddev, void * arg) 2084 { 2085 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 2086 char *ptr, *buf = NULL; 2087 int err = -ENOMEM; 2088 2089 file = kmalloc(sizeof(*file), GFP_KERNEL); 2090 if (!file) 2091 goto out; 2092 2093 /* bitmap disabled, zero the first byte and copy out */ 2094 if (!mddev->bitmap || !mddev->bitmap->file) { 2095 file->pathname[0] = '\0'; 2096 goto copy_out; 2097 } 2098 2099 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); 2100 if (!buf) 2101 goto out; 2102 2103 ptr = 
file_path(mddev->bitmap->file, buf, sizeof(file->pathname)); 2104 if (!ptr) 2105 goto out; 2106 2107 strcpy(file->pathname, ptr); 2108 2109 copy_out: 2110 err = 0; 2111 if (copy_to_user(arg, file, sizeof(*file))) 2112 err = -EFAULT; 2113 out: 2114 kfree(buf); 2115 kfree(file); 2116 return err; 2117 } 2118 2119 static int get_disk_info(mddev_t * mddev, void __user * arg) 2120 { 2121 mdu_disk_info_t info; 2122 unsigned int nr; 2123 mdk_rdev_t *rdev; 2124 2125 if (copy_from_user(&info, arg, sizeof(info))) 2126 return -EFAULT; 2127 2128 nr = info.number; 2129 2130 rdev = find_rdev_nr(mddev, nr); 2131 if (rdev) { 2132 info.major = MAJOR(rdev->bdev->bd_dev); 2133 info.minor = MINOR(rdev->bdev->bd_dev); 2134 info.raid_disk = rdev->raid_disk; 2135 info.state = 0; 2136 if (rdev->faulty) 2137 info.state |= (1<<MD_DISK_FAULTY); 2138 else if (rdev->in_sync) { 2139 info.state |= (1<<MD_DISK_ACTIVE); 2140 info.state |= (1<<MD_DISK_SYNC); 2141 } 2142 } else { 2143 info.major = info.minor = 0; 2144 info.raid_disk = -1; 2145 info.state = (1<<MD_DISK_REMOVED); 2146 } 2147 2148 if (copy_to_user(arg, &info, sizeof(info))) 2149 return -EFAULT; 2150 2151 return 0; 2152 } 2153 2154 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) 2155 { 2156 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 2157 mdk_rdev_t *rdev; 2158 dev_t dev = MKDEV(info->major,info->minor); 2159 2160 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 2161 return -EOVERFLOW; 2162 2163 if (!mddev->raid_disks) { 2164 int err; 2165 /* expecting a device which has a superblock */ 2166 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 2167 if (IS_ERR(rdev)) { 2168 printk(KERN_WARNING 2169 "md: md_import_device returned %ld\n", 2170 PTR_ERR(rdev)); 2171 return PTR_ERR(rdev); 2172 } 2173 if (!list_empty(&mddev->disks)) { 2174 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 2175 mdk_rdev_t, same_set); 2176 int err = super_types[mddev->major_version] 2177 .load_super(rdev, rdev0, mddev->minor_version); 2178 if (err < 0) { 2179 printk(KERN_WARNING 2180 "md: %s has different UUID to %s\n", 2181 bdevname(rdev->bdev,b), 2182 bdevname(rdev0->bdev,b2)); 2183 export_rdev(rdev); 2184 return -EINVAL; 2185 } 2186 } 2187 err = bind_rdev_to_array(rdev, mddev); 2188 if (err) 2189 export_rdev(rdev); 2190 return err; 2191 } 2192 2193 /* 2194 * add_new_disk can be used once the array is assembled 2195 * to add "hot spares". They must already have a superblock 2196 * written 2197 */ 2198 if (mddev->pers) { 2199 int err; 2200 if (!mddev->pers->hot_add_disk) { 2201 printk(KERN_WARNING 2202 "%s: personality does not support diskops!\n", 2203 mdname(mddev)); 2204 return -EINVAL; 2205 } 2206 rdev = md_import_device(dev, mddev->major_version, 2207 mddev->minor_version); 2208 if (IS_ERR(rdev)) { 2209 printk(KERN_WARNING 2210 "md: md_import_device returned %ld\n", 2211 PTR_ERR(rdev)); 2212 return PTR_ERR(rdev); 2213 } 2214 /* set save_raid_disk if appropriate */ 2215 if (!mddev->persistent) { 2216 if (info->state & (1<<MD_DISK_SYNC) && 2217 info->raid_disk < mddev->raid_disks) 2218 rdev->raid_disk = info->raid_disk; 2219 else 2220 rdev->raid_disk = -1; 2221 } else 2222 super_types[mddev->major_version]. 
2223 validate_super(mddev, rdev); 2224 rdev->saved_raid_disk = rdev->raid_disk; 2225 2226 rdev->in_sync = 0; /* just to be sure */ 2227 rdev->raid_disk = -1; 2228 err = bind_rdev_to_array(rdev, mddev); 2229 if (err) 2230 export_rdev(rdev); 2231 2232 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2233 if (mddev->thread) 2234 md_wakeup_thread(mddev->thread); 2235 return err; 2236 } 2237 2238 /* otherwise, add_new_disk is only allowed 2239 * for major_version==0 superblocks 2240 */ 2241 if (mddev->major_version != 0) { 2242 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", 2243 mdname(mddev)); 2244 return -EINVAL; 2245 } 2246 2247 if (!(info->state & (1<<MD_DISK_FAULTY))) { 2248 int err; 2249 rdev = md_import_device (dev, -1, 0); 2250 if (IS_ERR(rdev)) { 2251 printk(KERN_WARNING 2252 "md: error, md_import_device() returned %ld\n", 2253 PTR_ERR(rdev)); 2254 return PTR_ERR(rdev); 2255 } 2256 rdev->desc_nr = info->number; 2257 if (info->raid_disk < mddev->raid_disks) 2258 rdev->raid_disk = info->raid_disk; 2259 else 2260 rdev->raid_disk = -1; 2261 2262 rdev->faulty = 0; 2263 if (rdev->raid_disk < mddev->raid_disks) 2264 rdev->in_sync = (info->state & (1<<MD_DISK_SYNC)); 2265 else 2266 rdev->in_sync = 0; 2267 2268 err = bind_rdev_to_array(rdev, mddev); 2269 if (err) { 2270 export_rdev(rdev); 2271 return err; 2272 } 2273 2274 if (!mddev->persistent) { 2275 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 2276 rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2277 } else 2278 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 2279 rdev->size = calc_dev_size(rdev, mddev->chunk_size); 2280 2281 if (!mddev->size || (mddev->size > rdev->size)) 2282 mddev->size = rdev->size; 2283 } 2284 2285 return 0; 2286 } 2287 2288 static int hot_remove_disk(mddev_t * mddev, dev_t dev) 2289 { 2290 char b[BDEVNAME_SIZE]; 2291 mdk_rdev_t *rdev; 2292 2293 if (!mddev->pers) 2294 return -ENODEV; 2295 2296 rdev = find_rdev(mddev, dev); 2297 if (!rdev) 2298 return -ENXIO; 2299 2300 if (rdev->raid_disk >= 0) 2301 goto busy; 2302 2303 kick_rdev_from_array(rdev); 2304 md_update_sb(mddev); 2305 2306 return 0; 2307 busy: 2308 printk(KERN_WARNING "md: cannot remove active disk %s from %s ... 
\n", 2309 bdevname(rdev->bdev,b), mdname(mddev)); 2310 return -EBUSY; 2311 } 2312 2313 static int hot_add_disk(mddev_t * mddev, dev_t dev) 2314 { 2315 char b[BDEVNAME_SIZE]; 2316 int err; 2317 unsigned int size; 2318 mdk_rdev_t *rdev; 2319 2320 if (!mddev->pers) 2321 return -ENODEV; 2322 2323 if (mddev->major_version != 0) { 2324 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 2325 " version-0 superblocks.\n", 2326 mdname(mddev)); 2327 return -EINVAL; 2328 } 2329 if (!mddev->pers->hot_add_disk) { 2330 printk(KERN_WARNING 2331 "%s: personality does not support diskops!\n", 2332 mdname(mddev)); 2333 return -EINVAL; 2334 } 2335 2336 rdev = md_import_device (dev, -1, 0); 2337 if (IS_ERR(rdev)) { 2338 printk(KERN_WARNING 2339 "md: error, md_import_device() returned %ld\n", 2340 PTR_ERR(rdev)); 2341 return -EINVAL; 2342 } 2343 2344 if (mddev->persistent) 2345 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 2346 else 2347 rdev->sb_offset = 2348 rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2349 2350 size = calc_dev_size(rdev, mddev->chunk_size); 2351 rdev->size = size; 2352 2353 if (size < mddev->size) { 2354 printk(KERN_WARNING 2355 "%s: disk size %llu blocks < array size %llu\n", 2356 mdname(mddev), (unsigned long long)size, 2357 (unsigned long long)mddev->size); 2358 err = -ENOSPC; 2359 goto abort_export; 2360 } 2361 2362 if (rdev->faulty) { 2363 printk(KERN_WARNING 2364 "md: can not hot-add faulty %s disk to %s!\n", 2365 bdevname(rdev->bdev,b), mdname(mddev)); 2366 err = -EINVAL; 2367 goto abort_export; 2368 } 2369 rdev->in_sync = 0; 2370 rdev->desc_nr = -1; 2371 bind_rdev_to_array(rdev, mddev); 2372 2373 /* 2374 * The rest should better be atomic, we can have disk failures 2375 * noticed in interrupt contexts ... 2376 */ 2377 2378 if (rdev->desc_nr == mddev->max_disks) { 2379 printk(KERN_WARNING "%s: can not hot-add to full array!\n", 2380 mdname(mddev)); 2381 err = -EBUSY; 2382 goto abort_unbind_export; 2383 } 2384 2385 rdev->raid_disk = -1; 2386 2387 md_update_sb(mddev); 2388 2389 /* 2390 * Kick recovery, maybe this spare has to be added to the 2391 * array immediately. 
	 */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);

	return 0;

abort_unbind_export:
	unbind_rdev_from_array(rdev);

abort_export:
	export_rdev(rdev);
	return err;
}

/* similar to deny_write_access, but accounts for our holding a reference
 * to the file ourselves */
static int deny_bitmap_write_access(struct file * file)
{
	struct inode *inode = file->f_mapping->host;

	spin_lock(&inode->i_lock);
	if (atomic_read(&inode->i_writecount) > 1) {
		spin_unlock(&inode->i_lock);
		return -ETXTBSY;
	}
	atomic_set(&inode->i_writecount, -1);
	spin_unlock(&inode->i_lock);

	return 0;
}

static int set_bitmap_file(mddev_t *mddev, int fd)
{
	int err;

	if (mddev->pers)
		return -EBUSY;

	mddev->bitmap_file = fget(fd);

	if (mddev->bitmap_file == NULL) {
		printk(KERN_ERR "%s: error: failed to get bitmap file\n",
			mdname(mddev));
		return -EBADF;
	}

	err = deny_bitmap_write_access(mddev->bitmap_file);
	if (err) {
		printk(KERN_ERR "%s: error: bitmap file is already in use\n",
			mdname(mddev));
		fput(mddev->bitmap_file);
		mddev->bitmap_file = NULL;
	} else
		mddev->bitmap_offset = 0; /* file overrides offset */
	return err;
}

/*
 * set_array_info is used in two different ways.
 * The original usage is when creating a new array.
 * In this usage, raid_disks is > 0 and it, together with
 * level, size, not_persistent, layout and chunksize, determines the
 * shape of the array.
 * This will always create an array with a type-0.90.0 superblock.
 * The newer usage is when assembling an array.
 * In this case raid_disks will be 0, and the major_version field is
 * used to determine which style of superblock is to be found on the devices.
 * The minor_version and patch_version numbers are also kept in case the
 * superblock handler wishes to interpret them.
 */
static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
{

	if (info->raid_disks == 0) {
		/* just setting version number for superblock loading */
		if (info->major_version < 0 ||
		    info->major_version >= sizeof(super_types)/sizeof(super_types[0]) ||
		    super_types[info->major_version].name == NULL) {
			/* maybe try to auto-load a module? */
			printk(KERN_INFO
				"md: superblock version %d not known\n",
				info->major_version);
			return -EINVAL;
		}
		mddev->major_version = info->major_version;
		mddev->minor_version = info->minor_version;
		mddev->patch_version = info->patch_version;
		return 0;
	}
	mddev->major_version = MD_MAJOR_VERSION;
	mddev->minor_version = MD_MINOR_VERSION;
	mddev->patch_version = MD_PATCHLEVEL_VERSION;
	mddev->ctime = get_seconds();

	mddev->level = info->level;
	mddev->size = info->size;
	mddev->raid_disks = info->raid_disks;
	/* don't set md_minor, it is determined by which /dev/md* was
	 * opened
	 */
	if (info->state & (1<<MD_SB_CLEAN))
		mddev->recovery_cp = MaxSector;
	else
		mddev->recovery_cp = 0;
	mddev->persistent = !
info->not_persistent; 2497 2498 mddev->layout = info->layout; 2499 mddev->chunk_size = info->chunk_size; 2500 2501 mddev->max_disks = MD_SB_DISKS; 2502 2503 mddev->sb_dirty = 1; 2504 2505 /* 2506 * Generate a 128 bit UUID 2507 */ 2508 get_random_bytes(mddev->uuid, 16); 2509 2510 return 0; 2511 } 2512 2513 /* 2514 * update_array_info is used to change the configuration of an 2515 * on-line array. 2516 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 2517 * fields in the info are checked against the array. 2518 * Any differences that cannot be handled will cause an error. 2519 * Normally, only one change can be managed at a time. 2520 */ 2521 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) 2522 { 2523 int rv = 0; 2524 int cnt = 0; 2525 2526 if (mddev->major_version != info->major_version || 2527 mddev->minor_version != info->minor_version || 2528 /* mddev->patch_version != info->patch_version || */ 2529 mddev->ctime != info->ctime || 2530 mddev->level != info->level || 2531 /* mddev->layout != info->layout || */ 2532 !mddev->persistent != info->not_persistent|| 2533 mddev->chunk_size != info->chunk_size ) 2534 return -EINVAL; 2535 /* Check there is only one change */ 2536 if (mddev->size != info->size) cnt++; 2537 if (mddev->raid_disks != info->raid_disks) cnt++; 2538 if (mddev->layout != info->layout) cnt++; 2539 if (cnt == 0) return 0; 2540 if (cnt > 1) return -EINVAL; 2541 2542 if (mddev->layout != info->layout) { 2543 /* Change layout 2544 * we don't need to do anything at the md level, the 2545 * personality will take care of it all. 2546 */ 2547 if (mddev->pers->reconfig == NULL) 2548 return -EINVAL; 2549 else 2550 return mddev->pers->reconfig(mddev, info->layout, -1); 2551 } 2552 if (mddev->size != info->size) { 2553 mdk_rdev_t * rdev; 2554 struct list_head *tmp; 2555 if (mddev->pers->resize == NULL) 2556 return -EINVAL; 2557 /* The "size" is the amount of each device that is used. 2558 * This can only make sense for arrays with redundancy. 2559 * linear and raid0 always use whatever space is available 2560 * We can only consider changing the size if no resync 2561 * or reconstruction is happening, and if the new size 2562 * is acceptable. It must fit before the sb_offset or, 2563 * if that is <data_offset, it must fit before the 2564 * size of each device. 2565 * If size is zero, we find the largest size that fits. 
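		 *
		 * Illustrative example (made-up numbers): if a component
		 * device ends up with avail = 2,000,000 usable sectors, the
		 * largest acceptable "size" is avail/2 = 1,000,000 KB;
		 * asking for size = 1,200,000 fails with -ENOSPC, while
		 * size = 0 means "use the maximum" and becomes 1,000,000.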
2566 */ 2567 if (mddev->sync_thread) 2568 return -EBUSY; 2569 ITERATE_RDEV(mddev,rdev,tmp) { 2570 sector_t avail; 2571 int fit = (info->size == 0); 2572 if (rdev->sb_offset > rdev->data_offset) 2573 avail = (rdev->sb_offset*2) - rdev->data_offset; 2574 else 2575 avail = get_capacity(rdev->bdev->bd_disk) 2576 - rdev->data_offset; 2577 if (fit && (info->size == 0 || info->size > avail/2)) 2578 info->size = avail/2; 2579 if (avail < ((sector_t)info->size << 1)) 2580 return -ENOSPC; 2581 } 2582 rv = mddev->pers->resize(mddev, (sector_t)info->size *2); 2583 if (!rv) { 2584 struct block_device *bdev; 2585 2586 bdev = bdget_disk(mddev->gendisk, 0); 2587 if (bdev) { 2588 down(&bdev->bd_inode->i_sem); 2589 i_size_write(bdev->bd_inode, mddev->array_size << 10); 2590 up(&bdev->bd_inode->i_sem); 2591 bdput(bdev); 2592 } 2593 } 2594 } 2595 if (mddev->raid_disks != info->raid_disks) { 2596 /* change the number of raid disks */ 2597 if (mddev->pers->reshape == NULL) 2598 return -EINVAL; 2599 if (info->raid_disks <= 0 || 2600 info->raid_disks >= mddev->max_disks) 2601 return -EINVAL; 2602 if (mddev->sync_thread) 2603 return -EBUSY; 2604 rv = mddev->pers->reshape(mddev, info->raid_disks); 2605 if (!rv) { 2606 struct block_device *bdev; 2607 2608 bdev = bdget_disk(mddev->gendisk, 0); 2609 if (bdev) { 2610 down(&bdev->bd_inode->i_sem); 2611 i_size_write(bdev->bd_inode, mddev->array_size << 10); 2612 up(&bdev->bd_inode->i_sem); 2613 bdput(bdev); 2614 } 2615 } 2616 } 2617 md_update_sb(mddev); 2618 return rv; 2619 } 2620 2621 static int set_disk_faulty(mddev_t *mddev, dev_t dev) 2622 { 2623 mdk_rdev_t *rdev; 2624 2625 if (mddev->pers == NULL) 2626 return -ENODEV; 2627 2628 rdev = find_rdev(mddev, dev); 2629 if (!rdev) 2630 return -ENODEV; 2631 2632 md_error(mddev, rdev); 2633 return 0; 2634 } 2635 2636 static int md_ioctl(struct inode *inode, struct file *file, 2637 unsigned int cmd, unsigned long arg) 2638 { 2639 int err = 0; 2640 void __user *argp = (void __user *)arg; 2641 struct hd_geometry __user *loc = argp; 2642 mddev_t *mddev = NULL; 2643 2644 if (!capable(CAP_SYS_ADMIN)) 2645 return -EACCES; 2646 2647 /* 2648 * Commands dealing with the RAID driver but not any 2649 * particular array: 2650 */ 2651 switch (cmd) 2652 { 2653 case RAID_VERSION: 2654 err = get_version(argp); 2655 goto done; 2656 2657 case PRINT_RAID_DEBUG: 2658 err = 0; 2659 md_print_devices(); 2660 goto done; 2661 2662 #ifndef MODULE 2663 case RAID_AUTORUN: 2664 err = 0; 2665 autostart_arrays(arg); 2666 goto done; 2667 #endif 2668 default:; 2669 } 2670 2671 /* 2672 * Commands creating/starting a new array: 2673 */ 2674 2675 mddev = inode->i_bdev->bd_disk->private_data; 2676 2677 if (!mddev) { 2678 BUG(); 2679 goto abort; 2680 } 2681 2682 2683 if (cmd == START_ARRAY) { 2684 /* START_ARRAY doesn't need to lock the array as autostart_array 2685 * does the locking, and it could even be a different array 2686 */ 2687 static int cnt = 3; 2688 if (cnt > 0 ) { 2689 printk(KERN_WARNING 2690 "md: %s(pid %d) used deprecated START_ARRAY ioctl. 
" 2691 "This will not be supported beyond 2.6\n", 2692 current->comm, current->pid); 2693 cnt--; 2694 } 2695 err = autostart_array(new_decode_dev(arg)); 2696 if (err) { 2697 printk(KERN_WARNING "md: autostart failed!\n"); 2698 goto abort; 2699 } 2700 goto done; 2701 } 2702 2703 err = mddev_lock(mddev); 2704 if (err) { 2705 printk(KERN_INFO 2706 "md: ioctl lock interrupted, reason %d, cmd %d\n", 2707 err, cmd); 2708 goto abort; 2709 } 2710 2711 switch (cmd) 2712 { 2713 case SET_ARRAY_INFO: 2714 { 2715 mdu_array_info_t info; 2716 if (!arg) 2717 memset(&info, 0, sizeof(info)); 2718 else if (copy_from_user(&info, argp, sizeof(info))) { 2719 err = -EFAULT; 2720 goto abort_unlock; 2721 } 2722 if (mddev->pers) { 2723 err = update_array_info(mddev, &info); 2724 if (err) { 2725 printk(KERN_WARNING "md: couldn't update" 2726 " array info. %d\n", err); 2727 goto abort_unlock; 2728 } 2729 goto done_unlock; 2730 } 2731 if (!list_empty(&mddev->disks)) { 2732 printk(KERN_WARNING 2733 "md: array %s already has disks!\n", 2734 mdname(mddev)); 2735 err = -EBUSY; 2736 goto abort_unlock; 2737 } 2738 if (mddev->raid_disks) { 2739 printk(KERN_WARNING 2740 "md: array %s already initialised!\n", 2741 mdname(mddev)); 2742 err = -EBUSY; 2743 goto abort_unlock; 2744 } 2745 err = set_array_info(mddev, &info); 2746 if (err) { 2747 printk(KERN_WARNING "md: couldn't set" 2748 " array info. %d\n", err); 2749 goto abort_unlock; 2750 } 2751 } 2752 goto done_unlock; 2753 2754 default:; 2755 } 2756 2757 /* 2758 * Commands querying/configuring an existing array: 2759 */ 2760 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 2761 * RUN_ARRAY, and SET_BITMAP_FILE are allowed */ 2762 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 2763 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) { 2764 err = -ENODEV; 2765 goto abort_unlock; 2766 } 2767 2768 /* 2769 * Commands even a read-only array can execute: 2770 */ 2771 switch (cmd) 2772 { 2773 case GET_ARRAY_INFO: 2774 err = get_array_info(mddev, argp); 2775 goto done_unlock; 2776 2777 case GET_BITMAP_FILE: 2778 err = get_bitmap_file(mddev, (void *)arg); 2779 goto done_unlock; 2780 2781 case GET_DISK_INFO: 2782 err = get_disk_info(mddev, argp); 2783 goto done_unlock; 2784 2785 case RESTART_ARRAY_RW: 2786 err = restart_array(mddev); 2787 goto done_unlock; 2788 2789 case STOP_ARRAY: 2790 err = do_md_stop (mddev, 0); 2791 goto done_unlock; 2792 2793 case STOP_ARRAY_RO: 2794 err = do_md_stop (mddev, 1); 2795 goto done_unlock; 2796 2797 /* 2798 * We have a problem here : there is no easy way to give a CHS 2799 * virtual geometry. We currently pretend that we have a 2 heads 2800 * 4 sectors (with a BIG number of cylinders...). This drives 2801 * dosfs just mad... 
;-) 2802 */ 2803 case HDIO_GETGEO: 2804 if (!loc) { 2805 err = -EINVAL; 2806 goto abort_unlock; 2807 } 2808 err = put_user (2, (char __user *) &loc->heads); 2809 if (err) 2810 goto abort_unlock; 2811 err = put_user (4, (char __user *) &loc->sectors); 2812 if (err) 2813 goto abort_unlock; 2814 err = put_user(get_capacity(mddev->gendisk)/8, 2815 (short __user *) &loc->cylinders); 2816 if (err) 2817 goto abort_unlock; 2818 err = put_user (get_start_sect(inode->i_bdev), 2819 (long __user *) &loc->start); 2820 goto done_unlock; 2821 } 2822 2823 /* 2824 * The remaining ioctls are changing the state of the 2825 * superblock, so we do not allow read-only arrays 2826 * here: 2827 */ 2828 if (mddev->ro) { 2829 err = -EROFS; 2830 goto abort_unlock; 2831 } 2832 2833 switch (cmd) 2834 { 2835 case ADD_NEW_DISK: 2836 { 2837 mdu_disk_info_t info; 2838 if (copy_from_user(&info, argp, sizeof(info))) 2839 err = -EFAULT; 2840 else 2841 err = add_new_disk(mddev, &info); 2842 goto done_unlock; 2843 } 2844 2845 case HOT_REMOVE_DISK: 2846 err = hot_remove_disk(mddev, new_decode_dev(arg)); 2847 goto done_unlock; 2848 2849 case HOT_ADD_DISK: 2850 err = hot_add_disk(mddev, new_decode_dev(arg)); 2851 goto done_unlock; 2852 2853 case SET_DISK_FAULTY: 2854 err = set_disk_faulty(mddev, new_decode_dev(arg)); 2855 goto done_unlock; 2856 2857 case RUN_ARRAY: 2858 err = do_md_run (mddev); 2859 goto done_unlock; 2860 2861 case SET_BITMAP_FILE: 2862 err = set_bitmap_file(mddev, (int)arg); 2863 goto done_unlock; 2864 2865 default: 2866 if (_IOC_TYPE(cmd) == MD_MAJOR) 2867 printk(KERN_WARNING "md: %s(pid %d) used" 2868 " obsolete MD ioctl, upgrade your" 2869 " software to use new ictls.\n", 2870 current->comm, current->pid); 2871 err = -EINVAL; 2872 goto abort_unlock; 2873 } 2874 2875 done_unlock: 2876 abort_unlock: 2877 mddev_unlock(mddev); 2878 2879 return err; 2880 done: 2881 if (err) 2882 MD_BUG(); 2883 abort: 2884 return err; 2885 } 2886 2887 static int md_open(struct inode *inode, struct file *file) 2888 { 2889 /* 2890 * Succeed if we can lock the mddev, which confirms that 2891 * it isn't being stopped right now. 
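	 * (STOP_ARRAY is handled in md_ioctl() with this same lock held,
	 * so a successful mddev_lock() here cannot race with an
	 * in-progress do_md_stop().)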
2892 */ 2893 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 2894 int err; 2895 2896 if ((err = mddev_lock(mddev))) 2897 goto out; 2898 2899 err = 0; 2900 mddev_get(mddev); 2901 mddev_unlock(mddev); 2902 2903 check_disk_change(inode->i_bdev); 2904 out: 2905 return err; 2906 } 2907 2908 static int md_release(struct inode *inode, struct file * file) 2909 { 2910 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 2911 2912 if (!mddev) 2913 BUG(); 2914 mddev_put(mddev); 2915 2916 return 0; 2917 } 2918 2919 static int md_media_changed(struct gendisk *disk) 2920 { 2921 mddev_t *mddev = disk->private_data; 2922 2923 return mddev->changed; 2924 } 2925 2926 static int md_revalidate(struct gendisk *disk) 2927 { 2928 mddev_t *mddev = disk->private_data; 2929 2930 mddev->changed = 0; 2931 return 0; 2932 } 2933 static struct block_device_operations md_fops = 2934 { 2935 .owner = THIS_MODULE, 2936 .open = md_open, 2937 .release = md_release, 2938 .ioctl = md_ioctl, 2939 .media_changed = md_media_changed, 2940 .revalidate_disk= md_revalidate, 2941 }; 2942 2943 static int md_thread(void * arg) 2944 { 2945 mdk_thread_t *thread = arg; 2946 2947 lock_kernel(); 2948 2949 /* 2950 * Detach thread 2951 */ 2952 2953 daemonize(thread->name, mdname(thread->mddev)); 2954 2955 current->exit_signal = SIGCHLD; 2956 allow_signal(SIGKILL); 2957 thread->tsk = current; 2958 2959 /* 2960 * md_thread is a 'system-thread', it's priority should be very 2961 * high. We avoid resource deadlocks individually in each 2962 * raid personality. (RAID5 does preallocation) We also use RR and 2963 * the very same RT priority as kswapd, thus we will never get 2964 * into a priority inversion deadlock. 2965 * 2966 * we definitely have to have equal or higher priority than 2967 * bdflush, otherwise bdflush will deadlock if there are too 2968 * many dirty RAID5 blocks. 
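	 *
	 * (The loop below simply sleeps on thread->wqueue until THREAD_WAKEUP
	 * is set or thread->timeout expires, then calls thread->run() and
	 * repeats until md_unregister_thread() clears thread->run.)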
2969 */ 2970 unlock_kernel(); 2971 2972 complete(thread->event); 2973 while (thread->run) { 2974 void (*run)(mddev_t *); 2975 2976 wait_event_interruptible_timeout(thread->wqueue, 2977 test_bit(THREAD_WAKEUP, &thread->flags), 2978 thread->timeout); 2979 if (current->flags & PF_FREEZE) 2980 refrigerator(PF_FREEZE); 2981 2982 clear_bit(THREAD_WAKEUP, &thread->flags); 2983 2984 run = thread->run; 2985 if (run) 2986 run(thread->mddev); 2987 2988 if (signal_pending(current)) 2989 flush_signals(current); 2990 } 2991 complete(thread->event); 2992 return 0; 2993 } 2994 2995 void md_wakeup_thread(mdk_thread_t *thread) 2996 { 2997 if (thread) { 2998 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm); 2999 set_bit(THREAD_WAKEUP, &thread->flags); 3000 wake_up(&thread->wqueue); 3001 } 3002 } 3003 3004 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, 3005 const char *name) 3006 { 3007 mdk_thread_t *thread; 3008 int ret; 3009 struct completion event; 3010 3011 thread = (mdk_thread_t *) kmalloc 3012 (sizeof(mdk_thread_t), GFP_KERNEL); 3013 if (!thread) 3014 return NULL; 3015 3016 memset(thread, 0, sizeof(mdk_thread_t)); 3017 init_waitqueue_head(&thread->wqueue); 3018 3019 init_completion(&event); 3020 thread->event = &event; 3021 thread->run = run; 3022 thread->mddev = mddev; 3023 thread->name = name; 3024 thread->timeout = MAX_SCHEDULE_TIMEOUT; 3025 ret = kernel_thread(md_thread, thread, 0); 3026 if (ret < 0) { 3027 kfree(thread); 3028 return NULL; 3029 } 3030 wait_for_completion(&event); 3031 return thread; 3032 } 3033 3034 void md_unregister_thread(mdk_thread_t *thread) 3035 { 3036 struct completion event; 3037 3038 init_completion(&event); 3039 3040 thread->event = &event; 3041 3042 /* As soon as ->run is set to NULL, the task could disappear, 3043 * so we need to hold tasklist_lock until we have sent the signal 3044 */ 3045 dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); 3046 read_lock(&tasklist_lock); 3047 thread->run = NULL; 3048 send_sig(SIGKILL, thread->tsk, 1); 3049 read_unlock(&tasklist_lock); 3050 wait_for_completion(&event); 3051 kfree(thread); 3052 } 3053 3054 void md_error(mddev_t *mddev, mdk_rdev_t *rdev) 3055 { 3056 if (!mddev) { 3057 MD_BUG(); 3058 return; 3059 } 3060 3061 if (!rdev || rdev->faulty) 3062 return; 3063 /* 3064 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 3065 mdname(mddev), 3066 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), 3067 __builtin_return_address(0),__builtin_return_address(1), 3068 __builtin_return_address(2),__builtin_return_address(3)); 3069 */ 3070 if (!mddev->pers->error_handler) 3071 return; 3072 mddev->pers->error_handler(mddev,rdev); 3073 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3074 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3075 md_wakeup_thread(mddev->thread); 3076 } 3077 3078 /* seq_file implementation /proc/mdstat */ 3079 3080 static void status_unused(struct seq_file *seq) 3081 { 3082 int i = 0; 3083 mdk_rdev_t *rdev; 3084 struct list_head *tmp; 3085 3086 seq_printf(seq, "unused devices: "); 3087 3088 ITERATE_RDEV_PENDING(rdev,tmp) { 3089 char b[BDEVNAME_SIZE]; 3090 i++; 3091 seq_printf(seq, "%s ", 3092 bdevname(rdev->bdev,b)); 3093 } 3094 if (!i) 3095 seq_printf(seq, "<none>"); 3096 3097 seq_printf(seq, "\n"); 3098 } 3099 3100 3101 static void status_resync(struct seq_file *seq, mddev_t * mddev) 3102 { 3103 unsigned long max_blocks, resync, res, dt, db, rt; 3104 3105 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; 3106 3107 if 
(test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3108 max_blocks = mddev->resync_max_sectors >> 1; 3109 else 3110 max_blocks = mddev->size; 3111 3112 /* 3113 * Should not happen. 3114 */ 3115 if (!max_blocks) { 3116 MD_BUG(); 3117 return; 3118 } 3119 res = (resync/1024)*1000/(max_blocks/1024 + 1); 3120 { 3121 int i, x = res/50, y = 20-x; 3122 seq_printf(seq, "["); 3123 for (i = 0; i < x; i++) 3124 seq_printf(seq, "="); 3125 seq_printf(seq, ">"); 3126 for (i = 0; i < y; i++) 3127 seq_printf(seq, "."); 3128 seq_printf(seq, "] "); 3129 } 3130 seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)", 3131 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 3132 "resync" : "recovery"), 3133 res/10, res % 10, resync, max_blocks); 3134 3135 /* 3136 * We do not want to overflow, so the order of operands and 3137 * the * 100 / 100 trick are important. We do a +1 to be 3138 * safe against division by zero. We only estimate anyway. 3139 * 3140 * dt: time from mark until now 3141 * db: blocks written from mark until now 3142 * rt: remaining time 3143 */ 3144 dt = ((jiffies - mddev->resync_mark) / HZ); 3145 if (!dt) dt++; 3146 db = resync - (mddev->resync_mark_cnt/2); 3147 rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; 3148 3149 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); 3150 3151 seq_printf(seq, " speed=%ldK/sec", db/dt); 3152 } 3153 3154 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 3155 { 3156 struct list_head *tmp; 3157 loff_t l = *pos; 3158 mddev_t *mddev; 3159 3160 if (l >= 0x10000) 3161 return NULL; 3162 if (!l--) 3163 /* header */ 3164 return (void*)1; 3165 3166 spin_lock(&all_mddevs_lock); 3167 list_for_each(tmp,&all_mddevs) 3168 if (!l--) { 3169 mddev = list_entry(tmp, mddev_t, all_mddevs); 3170 mddev_get(mddev); 3171 spin_unlock(&all_mddevs_lock); 3172 return mddev; 3173 } 3174 spin_unlock(&all_mddevs_lock); 3175 if (!l--) 3176 return (void*)2;/* tail */ 3177 return NULL; 3178 } 3179 3180 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3181 { 3182 struct list_head *tmp; 3183 mddev_t *next_mddev, *mddev = v; 3184 3185 ++*pos; 3186 if (v == (void*)2) 3187 return NULL; 3188 3189 spin_lock(&all_mddevs_lock); 3190 if (v == (void*)1) 3191 tmp = all_mddevs.next; 3192 else 3193 tmp = mddev->all_mddevs.next; 3194 if (tmp != &all_mddevs) 3195 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); 3196 else { 3197 next_mddev = (void*)2; 3198 *pos = 0x10000; 3199 } 3200 spin_unlock(&all_mddevs_lock); 3201 3202 if (v != (void*)1) 3203 mddev_put(mddev); 3204 return next_mddev; 3205 3206 } 3207 3208 static void md_seq_stop(struct seq_file *seq, void *v) 3209 { 3210 mddev_t *mddev = v; 3211 3212 if (mddev && v != (void*)1 && v != (void*)2) 3213 mddev_put(mddev); 3214 } 3215 3216 static int md_seq_show(struct seq_file *seq, void *v) 3217 { 3218 mddev_t *mddev = v; 3219 sector_t size; 3220 struct list_head *tmp2; 3221 mdk_rdev_t *rdev; 3222 int i; 3223 struct bitmap *bitmap; 3224 3225 if (v == (void*)1) { 3226 seq_printf(seq, "Personalities : "); 3227 spin_lock(&pers_lock); 3228 for (i = 0; i < MAX_PERSONALITY; i++) 3229 if (pers[i]) 3230 seq_printf(seq, "[%s] ", pers[i]->name); 3231 3232 spin_unlock(&pers_lock); 3233 seq_printf(seq, "\n"); 3234 return 0; 3235 } 3236 if (v == (void*)2) { 3237 status_unused(seq); 3238 return 0; 3239 } 3240 3241 if (mddev_lock(mddev)!=0) 3242 return -EINTR; 3243 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 3244 seq_printf(seq, "%s : %sactive", mdname(mddev), 3245 mddev->pers ? 
"" : "in"); 3246 if (mddev->pers) { 3247 if (mddev->ro) 3248 seq_printf(seq, " (read-only)"); 3249 seq_printf(seq, " %s", mddev->pers->name); 3250 } 3251 3252 size = 0; 3253 ITERATE_RDEV(mddev,rdev,tmp2) { 3254 char b[BDEVNAME_SIZE]; 3255 seq_printf(seq, " %s[%d]", 3256 bdevname(rdev->bdev,b), rdev->desc_nr); 3257 if (rdev->faulty) { 3258 seq_printf(seq, "(F)"); 3259 continue; 3260 } 3261 size += rdev->size; 3262 } 3263 3264 if (!list_empty(&mddev->disks)) { 3265 if (mddev->pers) 3266 seq_printf(seq, "\n %llu blocks", 3267 (unsigned long long)mddev->array_size); 3268 else 3269 seq_printf(seq, "\n %llu blocks", 3270 (unsigned long long)size); 3271 } 3272 3273 if (mddev->pers) { 3274 mddev->pers->status (seq, mddev); 3275 seq_printf(seq, "\n "); 3276 if (mddev->curr_resync > 2) { 3277 status_resync (seq, mddev); 3278 seq_printf(seq, "\n "); 3279 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 3280 seq_printf(seq, " resync=DELAYED\n "); 3281 } else 3282 seq_printf(seq, "\n "); 3283 3284 if ((bitmap = mddev->bitmap)) { 3285 unsigned long chunk_kb; 3286 unsigned long flags; 3287 spin_lock_irqsave(&bitmap->lock, flags); 3288 chunk_kb = bitmap->chunksize >> 10; 3289 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " 3290 "%lu%s chunk", 3291 bitmap->pages - bitmap->missing_pages, 3292 bitmap->pages, 3293 (bitmap->pages - bitmap->missing_pages) 3294 << (PAGE_SHIFT - 10), 3295 chunk_kb ? chunk_kb : bitmap->chunksize, 3296 chunk_kb ? "KB" : "B"); 3297 if (bitmap->file) { 3298 seq_printf(seq, ", file: "); 3299 seq_path(seq, bitmap->file->f_vfsmnt, 3300 bitmap->file->f_dentry," \t\n"); 3301 } 3302 3303 seq_printf(seq, "\n"); 3304 spin_unlock_irqrestore(&bitmap->lock, flags); 3305 } 3306 3307 seq_printf(seq, "\n"); 3308 } 3309 mddev_unlock(mddev); 3310 3311 return 0; 3312 } 3313 3314 static struct seq_operations md_seq_ops = { 3315 .start = md_seq_start, 3316 .next = md_seq_next, 3317 .stop = md_seq_stop, 3318 .show = md_seq_show, 3319 }; 3320 3321 static int md_seq_open(struct inode *inode, struct file *file) 3322 { 3323 int error; 3324 3325 error = seq_open(file, &md_seq_ops); 3326 return error; 3327 } 3328 3329 static struct file_operations md_seq_fops = { 3330 .open = md_seq_open, 3331 .read = seq_read, 3332 .llseek = seq_lseek, 3333 .release = seq_release, 3334 }; 3335 3336 int register_md_personality(int pnum, mdk_personality_t *p) 3337 { 3338 if (pnum >= MAX_PERSONALITY) { 3339 printk(KERN_ERR 3340 "md: tried to install personality %s as nr %d, but max is %lu\n", 3341 p->name, pnum, MAX_PERSONALITY-1); 3342 return -EINVAL; 3343 } 3344 3345 spin_lock(&pers_lock); 3346 if (pers[pnum]) { 3347 spin_unlock(&pers_lock); 3348 return -EBUSY; 3349 } 3350 3351 pers[pnum] = p; 3352 printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum); 3353 spin_unlock(&pers_lock); 3354 return 0; 3355 } 3356 3357 int unregister_md_personality(int pnum) 3358 { 3359 if (pnum >= MAX_PERSONALITY) 3360 return -EINVAL; 3361 3362 printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name); 3363 spin_lock(&pers_lock); 3364 pers[pnum] = NULL; 3365 spin_unlock(&pers_lock); 3366 return 0; 3367 } 3368 3369 static int is_mddev_idle(mddev_t *mddev) 3370 { 3371 mdk_rdev_t * rdev; 3372 struct list_head *tmp; 3373 int idle; 3374 unsigned long curr_events; 3375 3376 idle = 1; 3377 ITERATE_RDEV(mddev,rdev,tmp) { 3378 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 3379 curr_events = disk_stat_read(disk, read_sectors) + 3380 disk_stat_read(disk, write_sectors) - 3381 
			      atomic_read(&disk->sync_io);
		/* Allow some slack between the value of curr_events and
		 * last_events, as there are some uninteresting races.
		 * Note: the following is an unsigned comparison.
		 */
		if ((curr_events - rdev->last_events + 32) > 64) {
			rdev->last_events = curr_events;
			idle = 0;
		}
	}
	return idle;
}

void md_done_sync(mddev_t *mddev, int blocks, int ok)
{
	/* another "blocks" (512 byte) blocks have been synced */
	atomic_sub(blocks, &mddev->recovery_active);
	wake_up(&mddev->recovery_wait);
	if (!ok) {
		set_bit(MD_RECOVERY_ERR, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		// stop recovery, signal do_sync ....
	}
}


/* md_write_start(mddev, bi)
 * If we need to update some array metadata (e.g. 'active' flag
 * in superblock) before writing, schedule a superblock update
 * and wait for it to complete.
 */
void md_write_start(mddev_t *mddev, struct bio *bi)
{
	DEFINE_WAIT(w);
	if (bio_data_dir(bi) != WRITE)
		return;

	atomic_inc(&mddev->writes_pending);
	if (mddev->in_sync) {
		spin_lock(&mddev->write_lock);
		if (mddev->in_sync) {
			mddev->in_sync = 0;
			mddev->sb_dirty = 1;
			md_wakeup_thread(mddev->thread);
		}
		spin_unlock(&mddev->write_lock);
	}
	wait_event(mddev->sb_wait, mddev->sb_dirty==0);
}

void md_write_end(mddev_t *mddev)
{
	if (atomic_dec_and_test(&mddev->writes_pending)) {
		if (mddev->safemode == 2)
			md_wakeup_thread(mddev->thread);
		else
			mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
	}
}

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);

#define SYNC_MARKS	10
#define SYNC_MARK_STEP	(3*HZ)
static void md_do_sync(mddev_t *mddev)
{
	mddev_t *mddev2;
	unsigned int currspeed = 0,
		 window;
	sector_t max_sectors,j, io_sectors;
	unsigned long mark[SYNC_MARKS];
	sector_t mark_cnt[SYNC_MARKS];
	int last_mark,m;
	struct list_head *tmp;
	sector_t last_check;
	int skipped = 0;

	/* just in case the thread restarts... */
	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
		return;

	/* we overload curr_resync somewhat here.
	 * 0 == not engaged in resync at all
	 * 2 == checking that there is no conflict with another sync
	 * 1 == like 2, but have yielded to allow conflicting resync to
	 *		commence
	 * other == active in resync - this many blocks
	 *
	 * Before starting a resync we must have set curr_resync to
	 * 2, and then checked that every "conflicting" array has curr_resync
	 * less than ours.  When we find one that is the same or higher
	 * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
	 * to 1 if we choose to yield (based arbitrarily on the address of
	 * the mddev structure).
	 * This will mean we have to start checking from the beginning again.
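	 *
	 * For example, if two arrays share a physical device and both reach
	 * curr_resync == 2 at the same time, the mddev at the lower address
	 * drops to 1 and sleeps on resync_wait while the other proceeds;
	 * when the winner finishes it sets curr_resync back to 0 and wakes
	 * resync_wait, and the waiter re-runs the conflict check.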
3475 * 3476 */ 3477 3478 do { 3479 mddev->curr_resync = 2; 3480 3481 try_again: 3482 if (signal_pending(current)) { 3483 flush_signals(current); 3484 goto skip; 3485 } 3486 ITERATE_MDDEV(mddev2,tmp) { 3487 printk("."); 3488 if (mddev2 == mddev) 3489 continue; 3490 if (mddev2->curr_resync && 3491 match_mddev_units(mddev,mddev2)) { 3492 DEFINE_WAIT(wq); 3493 if (mddev < mddev2 && mddev->curr_resync == 2) { 3494 /* arbitrarily yield */ 3495 mddev->curr_resync = 1; 3496 wake_up(&resync_wait); 3497 } 3498 if (mddev > mddev2 && mddev->curr_resync == 1) 3499 /* no need to wait here, we can wait the next 3500 * time 'round when curr_resync == 2 3501 */ 3502 continue; 3503 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 3504 if (!signal_pending(current) 3505 && mddev2->curr_resync >= mddev->curr_resync) { 3506 printk(KERN_INFO "md: delaying resync of %s" 3507 " until %s has finished resync (they" 3508 " share one or more physical units)\n", 3509 mdname(mddev), mdname(mddev2)); 3510 mddev_put(mddev2); 3511 schedule(); 3512 finish_wait(&resync_wait, &wq); 3513 goto try_again; 3514 } 3515 finish_wait(&resync_wait, &wq); 3516 } 3517 } 3518 } while (mddev->curr_resync < 2); 3519 3520 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3521 /* resync follows the size requested by the personality, 3522 * which defaults to physical size, but can be virtual size 3523 */ 3524 max_sectors = mddev->resync_max_sectors; 3525 else 3526 /* recovery follows the physical size of devices */ 3527 max_sectors = mddev->size << 1; 3528 3529 printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); 3530 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" 3531 " %d KB/sec/disc.\n", sysctl_speed_limit_min); 3532 printk(KERN_INFO "md: using maximum available idle IO bandwith " 3533 "(but not more than %d KB/sec) for reconstruction.\n", 3534 sysctl_speed_limit_max); 3535 3536 is_mddev_idle(mddev); /* this also initializes IO event counters */ 3537 /* we don't use the checkpoint if there's a bitmap */ 3538 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap) 3539 j = mddev->recovery_cp; 3540 else 3541 j = 0; 3542 io_sectors = 0; 3543 for (m = 0; m < SYNC_MARKS; m++) { 3544 mark[m] = jiffies; 3545 mark_cnt[m] = io_sectors; 3546 } 3547 last_mark = 0; 3548 mddev->resync_mark = mark[last_mark]; 3549 mddev->resync_mark_cnt = mark_cnt[last_mark]; 3550 3551 /* 3552 * Tune reconstruction: 3553 */ 3554 window = 32*(PAGE_SIZE/512); 3555 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", 3556 window/2,(unsigned long long) max_sectors/2); 3557 3558 atomic_set(&mddev->recovery_active, 0); 3559 init_waitqueue_head(&mddev->recovery_wait); 3560 last_check = 0; 3561 3562 if (j>2) { 3563 printk(KERN_INFO 3564 "md: resuming recovery of %s from checkpoint.\n", 3565 mdname(mddev)); 3566 mddev->curr_resync = j; 3567 } 3568 3569 while (j < max_sectors) { 3570 sector_t sectors; 3571 3572 skipped = 0; 3573 sectors = mddev->pers->sync_request(mddev, j, &skipped, 3574 currspeed < sysctl_speed_limit_min); 3575 if (sectors == 0) { 3576 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 3577 goto out; 3578 } 3579 3580 if (!skipped) { /* actual IO requested */ 3581 io_sectors += sectors; 3582 atomic_add(sectors, &mddev->recovery_active); 3583 } 3584 3585 j += sectors; 3586 if (j>1) mddev->curr_resync = j; 3587 3588 3589 if (last_check + window > io_sectors || j == max_sectors) 3590 continue; 3591 3592 last_check = io_sectors; 3593 3594 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || 3595 
		    test_bit(MD_RECOVERY_ERR, &mddev->recovery))
			break;

	repeat:
		if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
			/* step marks */
			int next = (last_mark+1) % SYNC_MARKS;

			mddev->resync_mark = mark[next];
			mddev->resync_mark_cnt = mark_cnt[next];
			mark[next] = jiffies;
			mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
			last_mark = next;
		}


		if (signal_pending(current)) {
			/*
			 * got a signal, exit.
			 */
			printk(KERN_INFO
				"md: md_do_sync() got signal ... exiting\n");
			flush_signals(current);
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			goto out;
		}

		/*
		 * This loop exits only when either we are slower than
		 * the 'hard' speed limit, or the system was IO-idle for
		 * a jiffy.
		 * The system might be non-idle CPU-wise, but we only care
		 * about not overloading the IO subsystem (things like an
		 * e2fsck being done on the RAID array should execute fast).
		 */
		mddev->queue->unplug_fn(mddev->queue);
		cond_resched();

		currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
			/((jiffies-mddev->resync_mark)/HZ +1) +1;

		if (currspeed > sysctl_speed_limit_min) {
			if ((currspeed > sysctl_speed_limit_max) ||
					!is_mddev_idle(mddev)) {
				msleep_interruptible(250);
				goto repeat;
			}
		}
	}
	printk(KERN_INFO "md: %s: sync done.\n", mdname(mddev));
	/*
	 * this also signals 'finished resyncing' to md_stop
	 */
 out:
	mddev->queue->unplug_fn(mddev->queue);

	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));

	/* tell personality that we are finished */
	mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);

	if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
	    mddev->curr_resync > 2 &&
	    mddev->curr_resync >= mddev->recovery_cp) {
		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
			printk(KERN_INFO
				"md: checkpointing recovery of %s.\n",
				mdname(mddev));
			mddev->recovery_cp = mddev->curr_resync;
		} else
			mddev->recovery_cp = MaxSector;
	}

 skip:
	mddev->curr_resync = 0;
	wake_up(&resync_wait);
	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
}


/*
 * This routine is regularly called by all per-raid-array threads to
 * deal with generic issues like resync and super-block update.
 * Raid personalities that don't have a thread (linear/raid0) do not
 * need this as they never do any recovery or update the superblock.
 *
 * It does not do any resync itself, but rather "forks" off other threads
 * to do that as needed.
 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
 * "->recovery" and create a thread at ->sync_thread.
 * When the thread finishes it sets MD_RECOVERY_DONE (and might set
 * MD_RECOVERY_ERR) and wakes up this thread, which will reap it and finish up.
 * This thread also removes any faulty devices (with nr_pending == 0).
 *
 * The overall approach is:
 *  1/ if the superblock needs updating, update it.
 *  2/ If a recovery thread is running, don't do anything else.
 *  3/ If recovery has finished, clean up, possibly marking spares active.
 *  4/ If there are any faulty devices, remove them.
3695 * 5/ If array is degraded, try to add spares devices 3696 * 6/ If array has spares or is not in-sync, start a resync thread. 3697 */ 3698 void md_check_recovery(mddev_t *mddev) 3699 { 3700 mdk_rdev_t *rdev; 3701 struct list_head *rtmp; 3702 3703 3704 if (mddev->bitmap) 3705 bitmap_daemon_work(mddev->bitmap); 3706 3707 if (mddev->ro) 3708 return; 3709 3710 if (signal_pending(current)) { 3711 if (mddev->pers->sync_request) { 3712 printk(KERN_INFO "md: %s in immediate safe mode\n", 3713 mdname(mddev)); 3714 mddev->safemode = 2; 3715 } 3716 flush_signals(current); 3717 } 3718 3719 if ( ! ( 3720 mddev->sb_dirty || 3721 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 3722 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 3723 (mddev->safemode == 1) || 3724 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) 3725 && !mddev->in_sync && mddev->recovery_cp == MaxSector) 3726 )) 3727 return; 3728 3729 if (mddev_trylock(mddev)==0) { 3730 int spares =0; 3731 3732 spin_lock(&mddev->write_lock); 3733 if (mddev->safemode && !atomic_read(&mddev->writes_pending) && 3734 !mddev->in_sync && mddev->recovery_cp == MaxSector) { 3735 mddev->in_sync = 1; 3736 mddev->sb_dirty = 1; 3737 } 3738 if (mddev->safemode == 1) 3739 mddev->safemode = 0; 3740 spin_unlock(&mddev->write_lock); 3741 3742 if (mddev->sb_dirty) 3743 md_update_sb(mddev); 3744 3745 3746 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 3747 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 3748 /* resync/recovery still happening */ 3749 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3750 goto unlock; 3751 } 3752 if (mddev->sync_thread) { 3753 /* resync has finished, collect result */ 3754 md_unregister_thread(mddev->sync_thread); 3755 mddev->sync_thread = NULL; 3756 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 3757 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 3758 /* success...*/ 3759 /* activate any spares */ 3760 mddev->pers->spare_active(mddev); 3761 } 3762 md_update_sb(mddev); 3763 3764 /* if array is no-longer degraded, then any saved_raid_disk 3765 * information must be scrapped 3766 */ 3767 if (!mddev->degraded) 3768 ITERATE_RDEV(mddev,rdev,rtmp) 3769 rdev->saved_raid_disk = -1; 3770 3771 mddev->recovery = 0; 3772 /* flag recovery needed just to double check */ 3773 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3774 goto unlock; 3775 } 3776 if (mddev->recovery) 3777 /* probably just the RECOVERY_NEEDED flag */ 3778 mddev->recovery = 0; 3779 3780 /* no recovery is running. 3781 * remove any failed drives, then 3782 * add spares if possible. 3783 * Spare are also removed and re-added, to allow 3784 * the personality to fail the re-add. 3785 */ 3786 ITERATE_RDEV(mddev,rdev,rtmp) 3787 if (rdev->raid_disk >= 0 && 3788 (rdev->faulty || ! rdev->in_sync) && 3789 atomic_read(&rdev->nr_pending)==0) { 3790 if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) 3791 rdev->raid_disk = -1; 3792 } 3793 3794 if (mddev->degraded) { 3795 ITERATE_RDEV(mddev,rdev,rtmp) 3796 if (rdev->raid_disk < 0 3797 && !rdev->faulty) { 3798 if (mddev->pers->hot_add_disk(mddev,rdev)) 3799 spares++; 3800 else 3801 break; 3802 } 3803 } 3804 3805 if (!spares && (mddev->recovery_cp == MaxSector )) { 3806 /* nothing we can do ... */ 3807 goto unlock; 3808 } 3809 if (mddev->pers->sync_request) { 3810 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 3811 if (!spares) 3812 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 3813 if (spares && mddev->bitmap && ! 
mddev->bitmap->file) { 3814 /* We are adding a device or devices to an array 3815 * which has the bitmap stored on all devices. 3816 * So make sure all bitmap pages get written 3817 */ 3818 bitmap_write_all(mddev->bitmap); 3819 } 3820 mddev->sync_thread = md_register_thread(md_do_sync, 3821 mddev, 3822 "%s_resync"); 3823 if (!mddev->sync_thread) { 3824 printk(KERN_ERR "%s: could not start resync" 3825 " thread...\n", 3826 mdname(mddev)); 3827 /* leave the spares where they are, it shouldn't hurt */ 3828 mddev->recovery = 0; 3829 } else { 3830 md_wakeup_thread(mddev->sync_thread); 3831 } 3832 } 3833 unlock: 3834 mddev_unlock(mddev); 3835 } 3836 } 3837 3838 static int md_notify_reboot(struct notifier_block *this, 3839 unsigned long code, void *x) 3840 { 3841 struct list_head *tmp; 3842 mddev_t *mddev; 3843 3844 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { 3845 3846 printk(KERN_INFO "md: stopping all md devices.\n"); 3847 3848 ITERATE_MDDEV(mddev,tmp) 3849 if (mddev_trylock(mddev)==0) 3850 do_md_stop (mddev, 1); 3851 /* 3852 * certain more exotic SCSI devices are known to be 3853 * volatile wrt too early system reboots. While the 3854 * right place to handle this issue is the given 3855 * driver, we do want to have a safe RAID driver ... 3856 */ 3857 mdelay(1000*1); 3858 } 3859 return NOTIFY_DONE; 3860 } 3861 3862 static struct notifier_block md_notifier = { 3863 .notifier_call = md_notify_reboot, 3864 .next = NULL, 3865 .priority = INT_MAX, /* before any real devices */ 3866 }; 3867 3868 static void md_geninit(void) 3869 { 3870 struct proc_dir_entry *p; 3871 3872 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 3873 3874 p = create_proc_entry("mdstat", S_IRUGO, NULL); 3875 if (p) 3876 p->proc_fops = &md_seq_fops; 3877 } 3878 3879 static int __init md_init(void) 3880 { 3881 int minor; 3882 3883 printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d," 3884 " MD_SB_DISKS=%d\n", 3885 MD_MAJOR_VERSION, MD_MINOR_VERSION, 3886 MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); 3887 printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR, 3888 BITMAP_MINOR); 3889 3890 if (register_blkdev(MAJOR_NR, "md")) 3891 return -1; 3892 if ((mdp_major=register_blkdev(0, "mdp"))<=0) { 3893 unregister_blkdev(MAJOR_NR, "md"); 3894 return -1; 3895 } 3896 devfs_mk_dir("md"); 3897 blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE, 3898 md_probe, NULL, NULL); 3899 blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE, 3900 md_probe, NULL, NULL); 3901 3902 for (minor=0; minor < MAX_MD_DEVS; ++minor) 3903 devfs_mk_bdev(MKDEV(MAJOR_NR, minor), 3904 S_IFBLK|S_IRUSR|S_IWUSR, 3905 "md/%d", minor); 3906 3907 for (minor=0; minor < MAX_MD_DEVS; ++minor) 3908 devfs_mk_bdev(MKDEV(mdp_major, minor<<MdpMinorShift), 3909 S_IFBLK|S_IRUSR|S_IWUSR, 3910 "md/mdp%d", minor); 3911 3912 3913 register_reboot_notifier(&md_notifier); 3914 raid_table_header = register_sysctl_table(raid_root_table, 1); 3915 3916 md_geninit(); 3917 return (0); 3918 } 3919 3920 3921 #ifndef MODULE 3922 3923 /* 3924 * Searches all registered partitions for autorun RAID arrays 3925 * at boot time. 
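 *
 * (md_autodetect_dev() below is the hook used to report such candidate
 * devices as they are discovered during partition scanning; it simply
 * remembers up to 127 of them in detected_devices[] until
 * autostart_arrays() imports them and calls autorun_devices().)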
3926 */ 3927 static dev_t detected_devices[128]; 3928 static int dev_cnt; 3929 3930 void md_autodetect_dev(dev_t dev) 3931 { 3932 if (dev_cnt >= 0 && dev_cnt < 127) 3933 detected_devices[dev_cnt++] = dev; 3934 } 3935 3936 3937 static void autostart_arrays(int part) 3938 { 3939 mdk_rdev_t *rdev; 3940 int i; 3941 3942 printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 3943 3944 for (i = 0; i < dev_cnt; i++) { 3945 dev_t dev = detected_devices[i]; 3946 3947 rdev = md_import_device(dev,0, 0); 3948 if (IS_ERR(rdev)) 3949 continue; 3950 3951 if (rdev->faulty) { 3952 MD_BUG(); 3953 continue; 3954 } 3955 list_add(&rdev->same_set, &pending_raid_disks); 3956 } 3957 dev_cnt = 0; 3958 3959 autorun_devices(part); 3960 } 3961 3962 #endif 3963 3964 static __exit void md_exit(void) 3965 { 3966 mddev_t *mddev; 3967 struct list_head *tmp; 3968 int i; 3969 blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS); 3970 blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift); 3971 for (i=0; i < MAX_MD_DEVS; i++) 3972 devfs_remove("md/%d", i); 3973 for (i=0; i < MAX_MD_DEVS; i++) 3974 devfs_remove("md/d%d", i); 3975 3976 devfs_remove("md"); 3977 3978 unregister_blkdev(MAJOR_NR,"md"); 3979 unregister_blkdev(mdp_major, "mdp"); 3980 unregister_reboot_notifier(&md_notifier); 3981 unregister_sysctl_table(raid_table_header); 3982 remove_proc_entry("mdstat", NULL); 3983 ITERATE_MDDEV(mddev,tmp) { 3984 struct gendisk *disk = mddev->gendisk; 3985 if (!disk) 3986 continue; 3987 export_array(mddev); 3988 del_gendisk(disk); 3989 put_disk(disk); 3990 mddev->gendisk = NULL; 3991 mddev_put(mddev); 3992 } 3993 } 3994 3995 module_init(md_init) 3996 module_exit(md_exit) 3997 3998 EXPORT_SYMBOL(register_md_personality); 3999 EXPORT_SYMBOL(unregister_md_personality); 4000 EXPORT_SYMBOL(md_error); 4001 EXPORT_SYMBOL(md_done_sync); 4002 EXPORT_SYMBOL(md_write_start); 4003 EXPORT_SYMBOL(md_write_end); 4004 EXPORT_SYMBOL(md_register_thread); 4005 EXPORT_SYMBOL(md_unregister_thread); 4006 EXPORT_SYMBOL(md_wakeup_thread); 4007 EXPORT_SYMBOL(md_print_devices); 4008 EXPORT_SYMBOL(md_check_recovery); 4009 MODULE_LICENSE("GPL"); 4010