1 /* 2 md.c : Multiple Devices driver for Linux 3 Copyright (C) 1998, 1999, 2000 Ingo Molnar 4 5 completely rewritten, based on the MD driver code from Marc Zyngier 6 7 Changes: 8 9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar 10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> 11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> 12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su> 13 - kmod support by: Cyrus Durgin 14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> 15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> 16 17 - lots of fixes and improvements to the RAID1/RAID5 and generic 18 RAID code (such as request based resynchronization): 19 20 Neil Brown <neilb@cse.unsw.edu.au>. 21 22 - persistent bitmap code 23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. 24 25 This program is free software; you can redistribute it and/or modify 26 it under the terms of the GNU General Public License as published by 27 the Free Software Foundation; either version 2, or (at your option) 28 any later version. 29 30 You should have received a copy of the GNU General Public License 31 (for example /usr/src/linux/COPYING); if not, write to the Free 32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 33 */ 34 35 #include <linux/module.h> 36 #include <linux/kthread.h> 37 #include <linux/linkage.h> 38 #include <linux/raid/md.h> 39 #include <linux/raid/bitmap.h> 40 #include <linux/sysctl.h> 41 #include <linux/buffer_head.h> /* for invalidate_bdev */ 42 #include <linux/suspend.h> 43 #include <linux/poll.h> 44 #include <linux/mutex.h> 45 #include <linux/ctype.h> 46 47 #include <linux/init.h> 48 49 #include <linux/file.h> 50 51 #ifdef CONFIG_KMOD 52 #include <linux/kmod.h> 53 #endif 54 55 #include <asm/unaligned.h> 56 57 #define MAJOR_NR MD_MAJOR 58 #define MD_DRIVER 59 60 /* 63 partitions with the alternate major number (mdp) */ 61 #define MdpMinorShift 6 62 63 #define DEBUG 0 64 #define dprintk(x...) ((void)(DEBUG && printk(x))) 65 66 67 #ifndef MODULE 68 static void autostart_arrays (int part); 69 #endif 70 71 static LIST_HEAD(pers_list); 72 static DEFINE_SPINLOCK(pers_lock); 73 74 static void md_print_devices(void); 75 76 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } 77 78 /* 79 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 80 * is 1000 KB/sec, so the extra system load does not show up that much. 81 * Increase it if you want to have more _guaranteed_ speed. Note that 82 * the RAID driver will use the maximum available bandwidth if the IO 83 * subsystem is idle. There is also an 'absolute maximum' reconstruction 84 * speed limit - in case reconstruction slows down your system despite 85 * idle IO detection. 86 * 87 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. 88 * or /sys/block/mdX/md/sync_speed_{min,max} 89 */ 90 91 static int sysctl_speed_limit_min = 1000; 92 static int sysctl_speed_limit_max = 200000; 93 static inline int speed_min(mddev_t *mddev) 94 { 95 return mddev->sync_speed_min ? 96 mddev->sync_speed_min : sysctl_speed_limit_min; 97 } 98 99 static inline int speed_max(mddev_t *mddev) 100 { 101 return mddev->sync_speed_max ? 
		mddev->sync_speed_max : sysctl_speed_limit_max;
}

static struct ctl_table_header *raid_table_header;

static ctl_table raid_table[] = {
	{
		.ctl_name	= DEV_RAID_SPEED_LIMIT_MIN,
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= DEV_RAID_SPEED_LIMIT_MAX,
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= &proc_dointvec,
	},
	{ .ctl_name = 0 }
};

static ctl_table raid_dir_table[] = {
	{
		.ctl_name	= DEV_RAID,
		.procname	= "raid",
		.maxlen		= 0,
		.mode		= S_IRUGO|S_IXUGO,
		.child		= raid_table,
	},
	{ .ctl_name = 0 }
};

static ctl_table raid_root_table[] = {
	{
		.ctl_name	= CTL_DEV,
		.procname	= "dev",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_dir_table,
	},
	{ .ctl_name = 0 }
};

static struct block_device_operations md_fops;

static int start_readonly;

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
void md_new_event(mddev_t *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
	sysfs_notify(&mddev->kobj, NULL, "sync_action");
}
EXPORT_SYMBOL_GPL(md_new_event);

/* Alternate version that can be called from interrupts
 * when calling sysfs_notify isn't needed.
 */
static void md_new_event_inintr(mddev_t *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}

/*
 * Enables iteration over all existing md arrays.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);


/*
 * iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while owning
 * a reference to the current mddev must mddev_put it.
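 *
 * Illustrative usage sketch (editor's addition, not part of the driver;
 * 'want_to_stop' is a placeholder condition), mirroring how
 * md_print_devices() below walks the list:
 *
 *	mddev_t *mddev;
 *	struct list_head *tmp;
 *
 *	ITERATE_MDDEV(mddev, tmp) {
 *		printk("%s\n", mdname(mddev));
 *		if (want_to_stop) {
 *			mddev_put(mddev);	(drop the reference before breaking out)
 *			break;
 *		}
 *	}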
196 */ 197 #define ITERATE_MDDEV(mddev,tmp) \ 198 \ 199 for (({ spin_lock(&all_mddevs_lock); \ 200 tmp = all_mddevs.next; \ 201 mddev = NULL;}); \ 202 ({ if (tmp != &all_mddevs) \ 203 mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ 204 spin_unlock(&all_mddevs_lock); \ 205 if (mddev) mddev_put(mddev); \ 206 mddev = list_entry(tmp, mddev_t, all_mddevs); \ 207 tmp != &all_mddevs;}); \ 208 ({ spin_lock(&all_mddevs_lock); \ 209 tmp = tmp->next;}) \ 210 ) 211 212 213 static int md_fail_request (request_queue_t *q, struct bio *bio) 214 { 215 bio_io_error(bio, bio->bi_size); 216 return 0; 217 } 218 219 static inline mddev_t *mddev_get(mddev_t *mddev) 220 { 221 atomic_inc(&mddev->active); 222 return mddev; 223 } 224 225 static void mddev_put(mddev_t *mddev) 226 { 227 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 228 return; 229 if (!mddev->raid_disks && list_empty(&mddev->disks)) { 230 list_del(&mddev->all_mddevs); 231 spin_unlock(&all_mddevs_lock); 232 blk_cleanup_queue(mddev->queue); 233 kobject_unregister(&mddev->kobj); 234 } else 235 spin_unlock(&all_mddevs_lock); 236 } 237 238 static mddev_t * mddev_find(dev_t unit) 239 { 240 mddev_t *mddev, *new = NULL; 241 242 retry: 243 spin_lock(&all_mddevs_lock); 244 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 245 if (mddev->unit == unit) { 246 mddev_get(mddev); 247 spin_unlock(&all_mddevs_lock); 248 kfree(new); 249 return mddev; 250 } 251 252 if (new) { 253 list_add(&new->all_mddevs, &all_mddevs); 254 spin_unlock(&all_mddevs_lock); 255 return new; 256 } 257 spin_unlock(&all_mddevs_lock); 258 259 new = kzalloc(sizeof(*new), GFP_KERNEL); 260 if (!new) 261 return NULL; 262 263 new->unit = unit; 264 if (MAJOR(unit) == MD_MAJOR) 265 new->md_minor = MINOR(unit); 266 else 267 new->md_minor = MINOR(unit) >> MdpMinorShift; 268 269 mutex_init(&new->reconfig_mutex); 270 INIT_LIST_HEAD(&new->disks); 271 INIT_LIST_HEAD(&new->all_mddevs); 272 init_timer(&new->safemode_timer); 273 atomic_set(&new->active, 1); 274 spin_lock_init(&new->write_lock); 275 init_waitqueue_head(&new->sb_wait); 276 277 new->queue = blk_alloc_queue(GFP_KERNEL); 278 if (!new->queue) { 279 kfree(new); 280 return NULL; 281 } 282 set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags); 283 284 blk_queue_make_request(new->queue, md_fail_request); 285 286 goto retry; 287 } 288 289 static inline int mddev_lock(mddev_t * mddev) 290 { 291 return mutex_lock_interruptible(&mddev->reconfig_mutex); 292 } 293 294 static inline int mddev_trylock(mddev_t * mddev) 295 { 296 return mutex_trylock(&mddev->reconfig_mutex); 297 } 298 299 static inline void mddev_unlock(mddev_t * mddev) 300 { 301 mutex_unlock(&mddev->reconfig_mutex); 302 303 md_wakeup_thread(mddev->thread); 304 } 305 306 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) 307 { 308 mdk_rdev_t * rdev; 309 struct list_head *tmp; 310 311 ITERATE_RDEV(mddev,rdev,tmp) { 312 if (rdev->desc_nr == nr) 313 return rdev; 314 } 315 return NULL; 316 } 317 318 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) 319 { 320 struct list_head *tmp; 321 mdk_rdev_t *rdev; 322 323 ITERATE_RDEV(mddev,rdev,tmp) { 324 if (rdev->bdev->bd_dev == dev) 325 return rdev; 326 } 327 return NULL; 328 } 329 330 static struct mdk_personality *find_pers(int level, char *clevel) 331 { 332 struct mdk_personality *pers; 333 list_for_each_entry(pers, &pers_list, list) { 334 if (level != LEVEL_NONE && pers->level == level) 335 return pers; 336 if (strcmp(pers->name, clevel)==0) 337 return pers; 338 } 339 return NULL; 340 } 341 342 static inline sector_t 
calc_dev_sboffset(struct block_device *bdev) 343 { 344 sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 345 return MD_NEW_SIZE_BLOCKS(size); 346 } 347 348 static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size) 349 { 350 sector_t size; 351 352 size = rdev->sb_offset; 353 354 if (chunk_size) 355 size &= ~((sector_t)chunk_size/1024 - 1); 356 return size; 357 } 358 359 static int alloc_disk_sb(mdk_rdev_t * rdev) 360 { 361 if (rdev->sb_page) 362 MD_BUG(); 363 364 rdev->sb_page = alloc_page(GFP_KERNEL); 365 if (!rdev->sb_page) { 366 printk(KERN_ALERT "md: out of memory.\n"); 367 return -EINVAL; 368 } 369 370 return 0; 371 } 372 373 static void free_disk_sb(mdk_rdev_t * rdev) 374 { 375 if (rdev->sb_page) { 376 put_page(rdev->sb_page); 377 rdev->sb_loaded = 0; 378 rdev->sb_page = NULL; 379 rdev->sb_offset = 0; 380 rdev->size = 0; 381 } 382 } 383 384 385 static int super_written(struct bio *bio, unsigned int bytes_done, int error) 386 { 387 mdk_rdev_t *rdev = bio->bi_private; 388 mddev_t *mddev = rdev->mddev; 389 if (bio->bi_size) 390 return 1; 391 392 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) 393 md_error(mddev, rdev); 394 395 if (atomic_dec_and_test(&mddev->pending_writes)) 396 wake_up(&mddev->sb_wait); 397 bio_put(bio); 398 return 0; 399 } 400 401 static int super_written_barrier(struct bio *bio, unsigned int bytes_done, int error) 402 { 403 struct bio *bio2 = bio->bi_private; 404 mdk_rdev_t *rdev = bio2->bi_private; 405 mddev_t *mddev = rdev->mddev; 406 if (bio->bi_size) 407 return 1; 408 409 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && 410 error == -EOPNOTSUPP) { 411 unsigned long flags; 412 /* barriers don't appear to be supported :-( */ 413 set_bit(BarriersNotsupp, &rdev->flags); 414 mddev->barriers_work = 0; 415 spin_lock_irqsave(&mddev->write_lock, flags); 416 bio2->bi_next = mddev->biolist; 417 mddev->biolist = bio2; 418 spin_unlock_irqrestore(&mddev->write_lock, flags); 419 wake_up(&mddev->sb_wait); 420 bio_put(bio); 421 return 0; 422 } 423 bio_put(bio2); 424 bio->bi_private = rdev; 425 return super_written(bio, bytes_done, error); 426 } 427 428 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 429 sector_t sector, int size, struct page *page) 430 { 431 /* write first size bytes of page to sector of rdev 432 * Increment mddev->pending_writes before returning 433 * and decrement it on completion, waking up sb_wait 434 * if zero is reached. 435 * If an error occurred, call md_error 436 * 437 * As we might need to resubmit the request if BIO_RW_BARRIER 438 * causes ENOTSUPP, we allocate a spare bio... 439 */ 440 struct bio *bio = bio_alloc(GFP_NOIO, 1); 441 int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC); 442 443 bio->bi_bdev = rdev->bdev; 444 bio->bi_sector = sector; 445 bio_add_page(bio, page, size, 0); 446 bio->bi_private = rdev; 447 bio->bi_end_io = super_written; 448 bio->bi_rw = rw; 449 450 atomic_inc(&mddev->pending_writes); 451 if (!test_bit(BarriersNotsupp, &rdev->flags)) { 452 struct bio *rbio; 453 rw |= (1<<BIO_RW_BARRIER); 454 rbio = bio_clone(bio, GFP_NOIO); 455 rbio->bi_private = bio; 456 rbio->bi_end_io = super_written_barrier; 457 submit_bio(rw, rbio); 458 } else 459 submit_bio(rw, bio); 460 } 461 462 void md_super_wait(mddev_t *mddev) 463 { 464 /* wait for all superblock writes that were scheduled to complete. 
465 * if any had to be retried (due to BARRIER problems), retry them 466 */ 467 DEFINE_WAIT(wq); 468 for(;;) { 469 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); 470 if (atomic_read(&mddev->pending_writes)==0) 471 break; 472 while (mddev->biolist) { 473 struct bio *bio; 474 spin_lock_irq(&mddev->write_lock); 475 bio = mddev->biolist; 476 mddev->biolist = bio->bi_next ; 477 bio->bi_next = NULL; 478 spin_unlock_irq(&mddev->write_lock); 479 submit_bio(bio->bi_rw, bio); 480 } 481 schedule(); 482 } 483 finish_wait(&mddev->sb_wait, &wq); 484 } 485 486 static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) 487 { 488 if (bio->bi_size) 489 return 1; 490 491 complete((struct completion*)bio->bi_private); 492 return 0; 493 } 494 495 int sync_page_io(struct block_device *bdev, sector_t sector, int size, 496 struct page *page, int rw) 497 { 498 struct bio *bio = bio_alloc(GFP_NOIO, 1); 499 struct completion event; 500 int ret; 501 502 rw |= (1 << BIO_RW_SYNC); 503 504 bio->bi_bdev = bdev; 505 bio->bi_sector = sector; 506 bio_add_page(bio, page, size, 0); 507 init_completion(&event); 508 bio->bi_private = &event; 509 bio->bi_end_io = bi_complete; 510 submit_bio(rw, bio); 511 wait_for_completion(&event); 512 513 ret = test_bit(BIO_UPTODATE, &bio->bi_flags); 514 bio_put(bio); 515 return ret; 516 } 517 EXPORT_SYMBOL_GPL(sync_page_io); 518 519 static int read_disk_sb(mdk_rdev_t * rdev, int size) 520 { 521 char b[BDEVNAME_SIZE]; 522 if (!rdev->sb_page) { 523 MD_BUG(); 524 return -EINVAL; 525 } 526 if (rdev->sb_loaded) 527 return 0; 528 529 530 if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ)) 531 goto fail; 532 rdev->sb_loaded = 1; 533 return 0; 534 535 fail: 536 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", 537 bdevname(rdev->bdev,b)); 538 return -EINVAL; 539 } 540 541 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 542 { 543 if ( (sb1->set_uuid0 == sb2->set_uuid0) && 544 (sb1->set_uuid1 == sb2->set_uuid1) && 545 (sb1->set_uuid2 == sb2->set_uuid2) && 546 (sb1->set_uuid3 == sb2->set_uuid3)) 547 548 return 1; 549 550 return 0; 551 } 552 553 554 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 555 { 556 int ret; 557 mdp_super_t *tmp1, *tmp2; 558 559 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 560 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 561 562 if (!tmp1 || !tmp2) { 563 ret = 0; 564 printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); 565 goto abort; 566 } 567 568 *tmp1 = *sb1; 569 *tmp2 = *sb2; 570 571 /* 572 * nr_disks is not constant 573 */ 574 tmp1->nr_disks = 0; 575 tmp2->nr_disks = 0; 576 577 if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) 578 ret = 0; 579 else 580 ret = 1; 581 582 abort: 583 kfree(tmp1); 584 kfree(tmp2); 585 return ret; 586 } 587 588 static unsigned int calc_sb_csum(mdp_super_t * sb) 589 { 590 unsigned int disk_csum, csum; 591 592 disk_csum = sb->sb_csum; 593 sb->sb_csum = 0; 594 csum = csum_partial((void *)sb, MD_SB_BYTES, 0); 595 sb->sb_csum = disk_csum; 596 return csum; 597 } 598 599 600 /* 601 * Handle superblock details. 602 * We want to be able to handle multiple superblock formats 603 * so we have a common interface to them all, and an array of 604 * different handlers. 605 * We rely on user-space to write the initial superblock, and support 606 * reading and updating of superblocks. 607 * Interface methods are: 608 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) 609 * loads and validates a superblock on dev. 
610 * if refdev != NULL, compare superblocks on both devices 611 * Return: 612 * 0 - dev has a superblock that is compatible with refdev 613 * 1 - dev has a superblock that is compatible and newer than refdev 614 * so dev should be used as the refdev in future 615 * -EINVAL superblock incompatible or invalid 616 * -othererror e.g. -EIO 617 * 618 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) 619 * Verify that dev is acceptable into mddev. 620 * The first time, mddev->raid_disks will be 0, and data from 621 * dev should be merged in. Subsequent calls check that dev 622 * is new enough. Return 0 or -EINVAL 623 * 624 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) 625 * Update the superblock for rdev with data in mddev 626 * This does not write to disc. 627 * 628 */ 629 630 struct super_type { 631 char *name; 632 struct module *owner; 633 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); 634 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); 635 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); 636 }; 637 638 /* 639 * load_super for 0.90.0 640 */ 641 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 642 { 643 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 644 mdp_super_t *sb; 645 int ret; 646 sector_t sb_offset; 647 648 /* 649 * Calculate the position of the superblock, 650 * it's at the end of the disk. 651 * 652 * It also happens to be a multiple of 4Kb. 653 */ 654 sb_offset = calc_dev_sboffset(rdev->bdev); 655 rdev->sb_offset = sb_offset; 656 657 ret = read_disk_sb(rdev, MD_SB_BYTES); 658 if (ret) return ret; 659 660 ret = -EINVAL; 661 662 bdevname(rdev->bdev, b); 663 sb = (mdp_super_t*)page_address(rdev->sb_page); 664 665 if (sb->md_magic != MD_SB_MAGIC) { 666 printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 667 b); 668 goto abort; 669 } 670 671 if (sb->major_version != 0 || 672 sb->minor_version < 90 || 673 sb->minor_version > 91) { 674 printk(KERN_WARNING "Bad version number %d.%d on %s\n", 675 sb->major_version, sb->minor_version, 676 b); 677 goto abort; 678 } 679 680 if (sb->raid_disks <= 0) 681 goto abort; 682 683 if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) { 684 printk(KERN_WARNING "md: invalid superblock checksum on %s\n", 685 b); 686 goto abort; 687 } 688 689 rdev->preferred_minor = sb->md_minor; 690 rdev->data_offset = 0; 691 rdev->sb_size = MD_SB_BYTES; 692 693 if (sb->level == LEVEL_MULTIPATH) 694 rdev->desc_nr = -1; 695 else 696 rdev->desc_nr = sb->this_disk.number; 697 698 if (refdev == 0) 699 ret = 1; 700 else { 701 __u64 ev1, ev2; 702 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); 703 if (!uuid_equal(refsb, sb)) { 704 printk(KERN_WARNING "md: %s has different UUID to %s\n", 705 b, bdevname(refdev->bdev,b2)); 706 goto abort; 707 } 708 if (!sb_equal(refsb, sb)) { 709 printk(KERN_WARNING "md: %s has same UUID" 710 " but different superblock to %s\n", 711 b, bdevname(refdev->bdev, b2)); 712 goto abort; 713 } 714 ev1 = md_event(sb); 715 ev2 = md_event(refsb); 716 if (ev1 > ev2) 717 ret = 1; 718 else 719 ret = 0; 720 } 721 rdev->size = calc_dev_size(rdev, sb->chunk_size); 722 723 if (rdev->size < sb->size && sb->level > 1) 724 /* "this cannot possibly happen" ... 
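 * (editor's note: it would mean the device is smaller than the
 *  per-device size recorded in its own superblock)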
*/ 725 ret = -EINVAL; 726 727 abort: 728 return ret; 729 } 730 731 /* 732 * validate_super for 0.90.0 733 */ 734 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) 735 { 736 mdp_disk_t *desc; 737 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 738 __u64 ev1 = md_event(sb); 739 740 rdev->raid_disk = -1; 741 rdev->flags = 0; 742 if (mddev->raid_disks == 0) { 743 mddev->major_version = 0; 744 mddev->minor_version = sb->minor_version; 745 mddev->patch_version = sb->patch_version; 746 mddev->persistent = ! sb->not_persistent; 747 mddev->chunk_size = sb->chunk_size; 748 mddev->ctime = sb->ctime; 749 mddev->utime = sb->utime; 750 mddev->level = sb->level; 751 mddev->clevel[0] = 0; 752 mddev->layout = sb->layout; 753 mddev->raid_disks = sb->raid_disks; 754 mddev->size = sb->size; 755 mddev->events = ev1; 756 mddev->bitmap_offset = 0; 757 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 758 759 if (mddev->minor_version >= 91) { 760 mddev->reshape_position = sb->reshape_position; 761 mddev->delta_disks = sb->delta_disks; 762 mddev->new_level = sb->new_level; 763 mddev->new_layout = sb->new_layout; 764 mddev->new_chunk = sb->new_chunk; 765 } else { 766 mddev->reshape_position = MaxSector; 767 mddev->delta_disks = 0; 768 mddev->new_level = mddev->level; 769 mddev->new_layout = mddev->layout; 770 mddev->new_chunk = mddev->chunk_size; 771 } 772 773 if (sb->state & (1<<MD_SB_CLEAN)) 774 mddev->recovery_cp = MaxSector; 775 else { 776 if (sb->events_hi == sb->cp_events_hi && 777 sb->events_lo == sb->cp_events_lo) { 778 mddev->recovery_cp = sb->recovery_cp; 779 } else 780 mddev->recovery_cp = 0; 781 } 782 783 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 784 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 785 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 786 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 787 788 mddev->max_disks = MD_SB_DISKS; 789 790 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 791 mddev->bitmap_file == NULL) { 792 if (mddev->level != 1 && mddev->level != 4 793 && mddev->level != 5 && mddev->level != 6 794 && mddev->level != 10) { 795 /* FIXME use a better test */ 796 printk(KERN_WARNING "md: bitmaps not supported for this level.\n"); 797 return -EINVAL; 798 } 799 mddev->bitmap_offset = mddev->default_bitmap_offset; 800 } 801 802 } else if (mddev->pers == NULL) { 803 /* Insist on good event counter while assembling */ 804 ++ev1; 805 if (ev1 < mddev->events) 806 return -EINVAL; 807 } else if (mddev->bitmap) { 808 /* if adding to array with a bitmap, then we can accept an 809 * older device ... but not too old. 
810 */ 811 if (ev1 < mddev->bitmap->events_cleared) 812 return 0; 813 } else { 814 if (ev1 < mddev->events) 815 /* just a hot-add of a new device, leave raid_disk at -1 */ 816 return 0; 817 } 818 819 if (mddev->level != LEVEL_MULTIPATH) { 820 desc = sb->disks + rdev->desc_nr; 821 822 if (desc->state & (1<<MD_DISK_FAULTY)) 823 set_bit(Faulty, &rdev->flags); 824 else if (desc->state & (1<<MD_DISK_SYNC) /* && 825 desc->raid_disk < mddev->raid_disks */) { 826 set_bit(In_sync, &rdev->flags); 827 rdev->raid_disk = desc->raid_disk; 828 } 829 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 830 set_bit(WriteMostly, &rdev->flags); 831 } else /* MULTIPATH are always insync */ 832 set_bit(In_sync, &rdev->flags); 833 return 0; 834 } 835 836 /* 837 * sync_super for 0.90.0 838 */ 839 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) 840 { 841 mdp_super_t *sb; 842 struct list_head *tmp; 843 mdk_rdev_t *rdev2; 844 int next_spare = mddev->raid_disks; 845 846 847 /* make rdev->sb match mddev data.. 848 * 849 * 1/ zero out disks 850 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 851 * 3/ any empty disks < next_spare become removed 852 * 853 * disks[0] gets initialised to REMOVED because 854 * we cannot be sure from other fields if it has 855 * been initialised or not. 856 */ 857 int i; 858 int active=0, working=0,failed=0,spare=0,nr_disks=0; 859 860 rdev->sb_size = MD_SB_BYTES; 861 862 sb = (mdp_super_t*)page_address(rdev->sb_page); 863 864 memset(sb, 0, sizeof(*sb)); 865 866 sb->md_magic = MD_SB_MAGIC; 867 sb->major_version = mddev->major_version; 868 sb->patch_version = mddev->patch_version; 869 sb->gvalid_words = 0; /* ignored */ 870 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 871 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 872 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 873 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 874 875 sb->ctime = mddev->ctime; 876 sb->level = mddev->level; 877 sb->size = mddev->size; 878 sb->raid_disks = mddev->raid_disks; 879 sb->md_minor = mddev->md_minor; 880 sb->not_persistent = !mddev->persistent; 881 sb->utime = mddev->utime; 882 sb->state = 0; 883 sb->events_hi = (mddev->events>>32); 884 sb->events_lo = (u32)mddev->events; 885 886 if (mddev->reshape_position == MaxSector) 887 sb->minor_version = 90; 888 else { 889 sb->minor_version = 91; 890 sb->reshape_position = mddev->reshape_position; 891 sb->new_level = mddev->new_level; 892 sb->delta_disks = mddev->delta_disks; 893 sb->new_layout = mddev->new_layout; 894 sb->new_chunk = mddev->new_chunk; 895 } 896 mddev->minor_version = sb->minor_version; 897 if (mddev->in_sync) 898 { 899 sb->recovery_cp = mddev->recovery_cp; 900 sb->cp_events_hi = (mddev->events>>32); 901 sb->cp_events_lo = (u32)mddev->events; 902 if (mddev->recovery_cp == MaxSector) 903 sb->state = (1<< MD_SB_CLEAN); 904 } else 905 sb->recovery_cp = 0; 906 907 sb->layout = mddev->layout; 908 sb->chunk_size = mddev->chunk_size; 909 910 if (mddev->bitmap && mddev->bitmap_file == NULL) 911 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 912 913 sb->disks[0].state = (1<<MD_DISK_REMOVED); 914 ITERATE_RDEV(mddev,rdev2,tmp) { 915 mdp_disk_t *d; 916 int desc_nr; 917 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 918 && !test_bit(Faulty, &rdev2->flags)) 919 desc_nr = rdev2->raid_disk; 920 else 921 desc_nr = next_spare++; 922 rdev2->desc_nr = desc_nr; 923 d = &sb->disks[rdev2->desc_nr]; 924 nr_disks++; 925 d->number = rdev2->desc_nr; 926 d->major = MAJOR(rdev2->bdev->bd_dev); 927 d->minor = MINOR(rdev2->bdev->bd_dev); 928 if (rdev2->raid_disk 
>= 0 && test_bit(In_sync, &rdev2->flags) 929 && !test_bit(Faulty, &rdev2->flags)) 930 d->raid_disk = rdev2->raid_disk; 931 else 932 d->raid_disk = rdev2->desc_nr; /* compatibility */ 933 if (test_bit(Faulty, &rdev2->flags)) 934 d->state = (1<<MD_DISK_FAULTY); 935 else if (test_bit(In_sync, &rdev2->flags)) { 936 d->state = (1<<MD_DISK_ACTIVE); 937 d->state |= (1<<MD_DISK_SYNC); 938 active++; 939 working++; 940 } else { 941 d->state = 0; 942 spare++; 943 working++; 944 } 945 if (test_bit(WriteMostly, &rdev2->flags)) 946 d->state |= (1<<MD_DISK_WRITEMOSTLY); 947 } 948 /* now set the "removed" and "faulty" bits on any missing devices */ 949 for (i=0 ; i < mddev->raid_disks ; i++) { 950 mdp_disk_t *d = &sb->disks[i]; 951 if (d->state == 0 && d->number == 0) { 952 d->number = i; 953 d->raid_disk = i; 954 d->state = (1<<MD_DISK_REMOVED); 955 d->state |= (1<<MD_DISK_FAULTY); 956 failed++; 957 } 958 } 959 sb->nr_disks = nr_disks; 960 sb->active_disks = active; 961 sb->working_disks = working; 962 sb->failed_disks = failed; 963 sb->spare_disks = spare; 964 965 sb->this_disk = sb->disks[rdev->desc_nr]; 966 sb->sb_csum = calc_sb_csum(sb); 967 } 968 969 /* 970 * version 1 superblock 971 */ 972 973 static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) 974 { 975 unsigned int disk_csum, csum; 976 unsigned long long newcsum; 977 int size = 256 + le32_to_cpu(sb->max_dev)*2; 978 unsigned int *isuper = (unsigned int*)sb; 979 int i; 980 981 disk_csum = sb->sb_csum; 982 sb->sb_csum = 0; 983 newcsum = 0; 984 for (i=0; size>=4; size -= 4 ) 985 newcsum += le32_to_cpu(*isuper++); 986 987 if (size == 2) 988 newcsum += le16_to_cpu(*(unsigned short*) isuper); 989 990 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 991 sb->sb_csum = disk_csum; 992 return cpu_to_le32(csum); 993 } 994 995 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 996 { 997 struct mdp_superblock_1 *sb; 998 int ret; 999 sector_t sb_offset; 1000 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1001 int bmask; 1002 1003 /* 1004 * Calculate the position of the superblock. 1005 * It is always aligned to a 4K boundary and 1006 * depeding on minor_version, it can be: 1007 * 0: At least 8K, but less than 12K, from end of device 1008 * 1: At start of device 1009 * 2: 4K from start of device. 
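 *
 * Worked example for minor_version 0 (editor's illustration): on an
 * 8 GiB device (16777216 512-byte sectors) the calculation below gives
 *
 *	16777216 - 16  = 16777200 sectors
 *	16777200 & ~7  = 16777200 (already 4K aligned)
 *	16777200 / 2   = 8388600 KiB
 *
 * i.e. the superblock starts 8 KiB below the end of the device.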
1010 */ 1011 switch(minor_version) { 1012 case 0: 1013 sb_offset = rdev->bdev->bd_inode->i_size >> 9; 1014 sb_offset -= 8*2; 1015 sb_offset &= ~(sector_t)(4*2-1); 1016 /* convert from sectors to K */ 1017 sb_offset /= 2; 1018 break; 1019 case 1: 1020 sb_offset = 0; 1021 break; 1022 case 2: 1023 sb_offset = 4; 1024 break; 1025 default: 1026 return -EINVAL; 1027 } 1028 rdev->sb_offset = sb_offset; 1029 1030 /* superblock is rarely larger than 1K, but it can be larger, 1031 * and it is safe to read 4k, so we do that 1032 */ 1033 ret = read_disk_sb(rdev, 4096); 1034 if (ret) return ret; 1035 1036 1037 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1038 1039 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1040 sb->major_version != cpu_to_le32(1) || 1041 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1042 le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || 1043 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1044 return -EINVAL; 1045 1046 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1047 printk("md: invalid superblock checksum on %s\n", 1048 bdevname(rdev->bdev,b)); 1049 return -EINVAL; 1050 } 1051 if (le64_to_cpu(sb->data_size) < 10) { 1052 printk("md: data_size too small on %s\n", 1053 bdevname(rdev->bdev,b)); 1054 return -EINVAL; 1055 } 1056 rdev->preferred_minor = 0xffff; 1057 rdev->data_offset = le64_to_cpu(sb->data_offset); 1058 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1059 1060 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1061 bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; 1062 if (rdev->sb_size & bmask) 1063 rdev-> sb_size = (rdev->sb_size | bmask)+1; 1064 1065 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) 1066 rdev->desc_nr = -1; 1067 else 1068 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1069 1070 if (refdev == 0) 1071 ret = 1; 1072 else { 1073 __u64 ev1, ev2; 1074 struct mdp_superblock_1 *refsb = 1075 (struct mdp_superblock_1*)page_address(refdev->sb_page); 1076 1077 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1078 sb->level != refsb->level || 1079 sb->layout != refsb->layout || 1080 sb->chunksize != refsb->chunksize) { 1081 printk(KERN_WARNING "md: %s has strangely different" 1082 " superblock to %s\n", 1083 bdevname(rdev->bdev,b), 1084 bdevname(refdev->bdev,b2)); 1085 return -EINVAL; 1086 } 1087 ev1 = le64_to_cpu(sb->events); 1088 ev2 = le64_to_cpu(refsb->events); 1089 1090 if (ev1 > ev2) 1091 ret = 1; 1092 else 1093 ret = 0; 1094 } 1095 if (minor_version) 1096 rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; 1097 else 1098 rdev->size = rdev->sb_offset; 1099 if (rdev->size < le64_to_cpu(sb->data_size)/2) 1100 return -EINVAL; 1101 rdev->size = le64_to_cpu(sb->data_size)/2; 1102 if (le32_to_cpu(sb->chunksize)) 1103 rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); 1104 1105 if (le32_to_cpu(sb->size) > rdev->size*2) 1106 return -EINVAL; 1107 return ret; 1108 } 1109 1110 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) 1111 { 1112 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1113 __u64 ev1 = le64_to_cpu(sb->events); 1114 1115 rdev->raid_disk = -1; 1116 rdev->flags = 0; 1117 if (mddev->raid_disks == 0) { 1118 mddev->major_version = 1; 1119 mddev->patch_version = 0; 1120 mddev->persistent = 1; 1121 mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; 1122 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); 1123 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); 1124 mddev->level = 
le32_to_cpu(sb->level); 1125 mddev->clevel[0] = 0; 1126 mddev->layout = le32_to_cpu(sb->layout); 1127 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1128 mddev->size = le64_to_cpu(sb->size)/2; 1129 mddev->events = ev1; 1130 mddev->bitmap_offset = 0; 1131 mddev->default_bitmap_offset = 1024 >> 9; 1132 1133 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 1134 memcpy(mddev->uuid, sb->set_uuid, 16); 1135 1136 mddev->max_disks = (4096-256)/2; 1137 1138 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1139 mddev->bitmap_file == NULL ) { 1140 if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6 1141 && mddev->level != 10) { 1142 printk(KERN_WARNING "md: bitmaps not supported for this level.\n"); 1143 return -EINVAL; 1144 } 1145 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); 1146 } 1147 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 1148 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 1149 mddev->delta_disks = le32_to_cpu(sb->delta_disks); 1150 mddev->new_level = le32_to_cpu(sb->new_level); 1151 mddev->new_layout = le32_to_cpu(sb->new_layout); 1152 mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9; 1153 } else { 1154 mddev->reshape_position = MaxSector; 1155 mddev->delta_disks = 0; 1156 mddev->new_level = mddev->level; 1157 mddev->new_layout = mddev->layout; 1158 mddev->new_chunk = mddev->chunk_size; 1159 } 1160 1161 } else if (mddev->pers == NULL) { 1162 /* Insist of good event counter while assembling */ 1163 ++ev1; 1164 if (ev1 < mddev->events) 1165 return -EINVAL; 1166 } else if (mddev->bitmap) { 1167 /* If adding to array with a bitmap, then we can accept an 1168 * older device, but not too old. 1169 */ 1170 if (ev1 < mddev->bitmap->events_cleared) 1171 return 0; 1172 } else { 1173 if (ev1 < mddev->events) 1174 /* just a hot-add of a new device, leave raid_disk at -1 */ 1175 return 0; 1176 } 1177 if (mddev->level != LEVEL_MULTIPATH) { 1178 int role; 1179 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1180 switch(role) { 1181 case 0xffff: /* spare */ 1182 break; 1183 case 0xfffe: /* faulty */ 1184 set_bit(Faulty, &rdev->flags); 1185 break; 1186 default: 1187 if ((le32_to_cpu(sb->feature_map) & 1188 MD_FEATURE_RECOVERY_OFFSET)) 1189 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 1190 else 1191 set_bit(In_sync, &rdev->flags); 1192 rdev->raid_disk = role; 1193 break; 1194 } 1195 if (sb->devflags & WriteMostly1) 1196 set_bit(WriteMostly, &rdev->flags); 1197 } else /* MULTIPATH are always insync */ 1198 set_bit(In_sync, &rdev->flags); 1199 1200 return 0; 1201 } 1202 1203 static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) 1204 { 1205 struct mdp_superblock_1 *sb; 1206 struct list_head *tmp; 1207 mdk_rdev_t *rdev2; 1208 int max_dev, i; 1209 /* make rdev->sb match mddev and rdev data. 
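 *
 * For reference (editor's summary of the code below): each entry of
 * sb->dev_roles[] holds 0xffff for a spare, 0xfffe for a faulty or
 * missing device, and otherwise the raid_disk slot the device fills;
 * super_1_validate() above decodes the same values.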
*/ 1210 1211 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1212 1213 sb->feature_map = 0; 1214 sb->pad0 = 0; 1215 sb->recovery_offset = cpu_to_le64(0); 1216 memset(sb->pad1, 0, sizeof(sb->pad1)); 1217 memset(sb->pad2, 0, sizeof(sb->pad2)); 1218 memset(sb->pad3, 0, sizeof(sb->pad3)); 1219 1220 sb->utime = cpu_to_le64((__u64)mddev->utime); 1221 sb->events = cpu_to_le64(mddev->events); 1222 if (mddev->in_sync) 1223 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 1224 else 1225 sb->resync_offset = cpu_to_le64(0); 1226 1227 sb->cnt_corrected_read = atomic_read(&rdev->corrected_errors); 1228 1229 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 1230 sb->size = cpu_to_le64(mddev->size<<1); 1231 1232 if (mddev->bitmap && mddev->bitmap_file == NULL) { 1233 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1234 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1235 } 1236 1237 if (rdev->raid_disk >= 0 && 1238 !test_bit(In_sync, &rdev->flags) && 1239 rdev->recovery_offset > 0) { 1240 sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 1241 sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); 1242 } 1243 1244 if (mddev->reshape_position != MaxSector) { 1245 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 1246 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 1247 sb->new_layout = cpu_to_le32(mddev->new_layout); 1248 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 1249 sb->new_level = cpu_to_le32(mddev->new_level); 1250 sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9); 1251 } 1252 1253 max_dev = 0; 1254 ITERATE_RDEV(mddev,rdev2,tmp) 1255 if (rdev2->desc_nr+1 > max_dev) 1256 max_dev = rdev2->desc_nr+1; 1257 1258 sb->max_dev = cpu_to_le32(max_dev); 1259 for (i=0; i<max_dev;i++) 1260 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1261 1262 ITERATE_RDEV(mddev,rdev2,tmp) { 1263 i = rdev2->desc_nr; 1264 if (test_bit(Faulty, &rdev2->flags)) 1265 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1266 else if (test_bit(In_sync, &rdev2->flags)) 1267 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1268 else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0) 1269 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1270 else 1271 sb->dev_roles[i] = cpu_to_le16(0xffff); 1272 } 1273 1274 sb->sb_csum = calc_sb_1_csum(sb); 1275 } 1276 1277 1278 static struct super_type super_types[] = { 1279 [0] = { 1280 .name = "0.90.0", 1281 .owner = THIS_MODULE, 1282 .load_super = super_90_load, 1283 .validate_super = super_90_validate, 1284 .sync_super = super_90_sync, 1285 }, 1286 [1] = { 1287 .name = "md-1", 1288 .owner = THIS_MODULE, 1289 .load_super = super_1_load, 1290 .validate_super = super_1_validate, 1291 .sync_super = super_1_sync, 1292 }, 1293 }; 1294 1295 static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev) 1296 { 1297 struct list_head *tmp; 1298 mdk_rdev_t *rdev; 1299 1300 ITERATE_RDEV(mddev,rdev,tmp) 1301 if (rdev->bdev->bd_contains == dev->bdev->bd_contains) 1302 return rdev; 1303 1304 return NULL; 1305 } 1306 1307 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) 1308 { 1309 struct list_head *tmp; 1310 mdk_rdev_t *rdev; 1311 1312 ITERATE_RDEV(mddev1,rdev,tmp) 1313 if (match_dev_unit(mddev2, rdev)) 1314 return 1; 1315 1316 return 0; 1317 } 1318 1319 static LIST_HEAD(pending_raid_disks); 1320 1321 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) 1322 { 1323 mdk_rdev_t *same_pdev; 1324 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1325 struct kobject *ko; 1326 char *s; 1327 1328 if (rdev->mddev) { 1329 MD_BUG(); 
1330 return -EINVAL; 1331 } 1332 /* make sure rdev->size exceeds mddev->size */ 1333 if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) { 1334 if (mddev->pers) 1335 /* Cannot change size, so fail */ 1336 return -ENOSPC; 1337 else 1338 mddev->size = rdev->size; 1339 } 1340 same_pdev = match_dev_unit(mddev, rdev); 1341 if (same_pdev) 1342 printk(KERN_WARNING 1343 "%s: WARNING: %s appears to be on the same physical" 1344 " disk as %s. True\n protection against single-disk" 1345 " failure might be compromised.\n", 1346 mdname(mddev), bdevname(rdev->bdev,b), 1347 bdevname(same_pdev->bdev,b2)); 1348 1349 /* Verify rdev->desc_nr is unique. 1350 * If it is -1, assign a free number, else 1351 * check number is not in use 1352 */ 1353 if (rdev->desc_nr < 0) { 1354 int choice = 0; 1355 if (mddev->pers) choice = mddev->raid_disks; 1356 while (find_rdev_nr(mddev, choice)) 1357 choice++; 1358 rdev->desc_nr = choice; 1359 } else { 1360 if (find_rdev_nr(mddev, rdev->desc_nr)) 1361 return -EBUSY; 1362 } 1363 bdevname(rdev->bdev,b); 1364 if (kobject_set_name(&rdev->kobj, "dev-%s", b) < 0) 1365 return -ENOMEM; 1366 while ( (s=strchr(rdev->kobj.k_name, '/')) != NULL) 1367 *s = '!'; 1368 1369 list_add(&rdev->same_set, &mddev->disks); 1370 rdev->mddev = mddev; 1371 printk(KERN_INFO "md: bind<%s>\n", b); 1372 1373 rdev->kobj.parent = &mddev->kobj; 1374 kobject_add(&rdev->kobj); 1375 1376 if (rdev->bdev->bd_part) 1377 ko = &rdev->bdev->bd_part->kobj; 1378 else 1379 ko = &rdev->bdev->bd_disk->kobj; 1380 sysfs_create_link(&rdev->kobj, ko, "block"); 1381 bd_claim_by_disk(rdev->bdev, rdev, mddev->gendisk); 1382 return 0; 1383 } 1384 1385 static void unbind_rdev_from_array(mdk_rdev_t * rdev) 1386 { 1387 char b[BDEVNAME_SIZE]; 1388 if (!rdev->mddev) { 1389 MD_BUG(); 1390 return; 1391 } 1392 bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); 1393 list_del_init(&rdev->same_set); 1394 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 1395 rdev->mddev = NULL; 1396 sysfs_remove_link(&rdev->kobj, "block"); 1397 kobject_del(&rdev->kobj); 1398 } 1399 1400 /* 1401 * prevent the device from being mounted, repartitioned or 1402 * otherwise reused by a RAID array (or any other kernel 1403 * subsystem), by bd_claiming the device. 
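 *
 * (Editor's note: lock_rdev() below takes the claim via
 *  open_partition_by_devnum() + bd_claim(); unlock_rdev() drops it
 *  again with bd_release() + blkdev_put_partition().  A claim must
 *  always be released through unlock_rdev(), as export_rdev() does.)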
1404 */ 1405 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) 1406 { 1407 int err = 0; 1408 struct block_device *bdev; 1409 char b[BDEVNAME_SIZE]; 1410 1411 bdev = open_partition_by_devnum(dev, FMODE_READ|FMODE_WRITE); 1412 if (IS_ERR(bdev)) { 1413 printk(KERN_ERR "md: could not open %s.\n", 1414 __bdevname(dev, b)); 1415 return PTR_ERR(bdev); 1416 } 1417 err = bd_claim(bdev, rdev); 1418 if (err) { 1419 printk(KERN_ERR "md: could not bd_claim %s.\n", 1420 bdevname(bdev, b)); 1421 blkdev_put_partition(bdev); 1422 return err; 1423 } 1424 rdev->bdev = bdev; 1425 return err; 1426 } 1427 1428 static void unlock_rdev(mdk_rdev_t *rdev) 1429 { 1430 struct block_device *bdev = rdev->bdev; 1431 rdev->bdev = NULL; 1432 if (!bdev) 1433 MD_BUG(); 1434 bd_release(bdev); 1435 blkdev_put_partition(bdev); 1436 } 1437 1438 void md_autodetect_dev(dev_t dev); 1439 1440 static void export_rdev(mdk_rdev_t * rdev) 1441 { 1442 char b[BDEVNAME_SIZE]; 1443 printk(KERN_INFO "md: export_rdev(%s)\n", 1444 bdevname(rdev->bdev,b)); 1445 if (rdev->mddev) 1446 MD_BUG(); 1447 free_disk_sb(rdev); 1448 list_del_init(&rdev->same_set); 1449 #ifndef MODULE 1450 md_autodetect_dev(rdev->bdev->bd_dev); 1451 #endif 1452 unlock_rdev(rdev); 1453 kobject_put(&rdev->kobj); 1454 } 1455 1456 static void kick_rdev_from_array(mdk_rdev_t * rdev) 1457 { 1458 unbind_rdev_from_array(rdev); 1459 export_rdev(rdev); 1460 } 1461 1462 static void export_array(mddev_t *mddev) 1463 { 1464 struct list_head *tmp; 1465 mdk_rdev_t *rdev; 1466 1467 ITERATE_RDEV(mddev,rdev,tmp) { 1468 if (!rdev->mddev) { 1469 MD_BUG(); 1470 continue; 1471 } 1472 kick_rdev_from_array(rdev); 1473 } 1474 if (!list_empty(&mddev->disks)) 1475 MD_BUG(); 1476 mddev->raid_disks = 0; 1477 mddev->major_version = 0; 1478 } 1479 1480 static void print_desc(mdp_disk_t *desc) 1481 { 1482 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, 1483 desc->major,desc->minor,desc->raid_disk,desc->state); 1484 } 1485 1486 static void print_sb(mdp_super_t *sb) 1487 { 1488 int i; 1489 1490 printk(KERN_INFO 1491 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", 1492 sb->major_version, sb->minor_version, sb->patch_version, 1493 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, 1494 sb->ctime); 1495 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", 1496 sb->level, sb->size, sb->nr_disks, sb->raid_disks, 1497 sb->md_minor, sb->layout, sb->chunk_size); 1498 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" 1499 " FD:%d SD:%d CSUM:%08x E:%08lx\n", 1500 sb->utime, sb->state, sb->active_disks, sb->working_disks, 1501 sb->failed_disks, sb->spare_disks, 1502 sb->sb_csum, (unsigned long)sb->events_lo); 1503 1504 printk(KERN_INFO); 1505 for (i = 0; i < MD_SB_DISKS; i++) { 1506 mdp_disk_t *desc; 1507 1508 desc = sb->disks + i; 1509 if (desc->number || desc->major || desc->minor || 1510 desc->raid_disk || (desc->state && (desc->state != 4))) { 1511 printk(" D %2d: ", i); 1512 print_desc(desc); 1513 } 1514 } 1515 printk(KERN_INFO "md: THIS: "); 1516 print_desc(&sb->this_disk); 1517 1518 } 1519 1520 static void print_rdev(mdk_rdev_t *rdev) 1521 { 1522 char b[BDEVNAME_SIZE]; 1523 printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", 1524 bdevname(rdev->bdev,b), (unsigned long long)rdev->size, 1525 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), 1526 rdev->desc_nr); 1527 if (rdev->sb_loaded) { 1528 printk(KERN_INFO "md: rdev superblock:\n"); 1529 print_sb((mdp_super_t*)page_address(rdev->sb_page)); 1530 } else 1531 printk(KERN_INFO "md: no rdev 
superblock!\n"); 1532 } 1533 1534 static void md_print_devices(void) 1535 { 1536 struct list_head *tmp, *tmp2; 1537 mdk_rdev_t *rdev; 1538 mddev_t *mddev; 1539 char b[BDEVNAME_SIZE]; 1540 1541 printk("\n"); 1542 printk("md: **********************************\n"); 1543 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); 1544 printk("md: **********************************\n"); 1545 ITERATE_MDDEV(mddev,tmp) { 1546 1547 if (mddev->bitmap) 1548 bitmap_print_sb(mddev->bitmap); 1549 else 1550 printk("%s: ", mdname(mddev)); 1551 ITERATE_RDEV(mddev,rdev,tmp2) 1552 printk("<%s>", bdevname(rdev->bdev,b)); 1553 printk("\n"); 1554 1555 ITERATE_RDEV(mddev,rdev,tmp2) 1556 print_rdev(rdev); 1557 } 1558 printk("md: **********************************\n"); 1559 printk("\n"); 1560 } 1561 1562 1563 static void sync_sbs(mddev_t * mddev, int nospares) 1564 { 1565 /* Update each superblock (in-memory image), but 1566 * if we are allowed to, skip spares which already 1567 * have the right event counter, or have one earlier 1568 * (which would mean they aren't being marked as dirty 1569 * with the rest of the array) 1570 */ 1571 mdk_rdev_t *rdev; 1572 struct list_head *tmp; 1573 1574 ITERATE_RDEV(mddev,rdev,tmp) { 1575 if (rdev->sb_events == mddev->events || 1576 (nospares && 1577 rdev->raid_disk < 0 && 1578 (rdev->sb_events&1)==0 && 1579 rdev->sb_events+1 == mddev->events)) { 1580 /* Don't update this superblock */ 1581 rdev->sb_loaded = 2; 1582 } else { 1583 super_types[mddev->major_version]. 1584 sync_super(mddev, rdev); 1585 rdev->sb_loaded = 1; 1586 } 1587 } 1588 } 1589 1590 void md_update_sb(mddev_t * mddev) 1591 { 1592 int err; 1593 struct list_head *tmp; 1594 mdk_rdev_t *rdev; 1595 int sync_req; 1596 int nospares = 0; 1597 1598 repeat: 1599 spin_lock_irq(&mddev->write_lock); 1600 1601 if (mddev->degraded && mddev->sb_dirty == 3) 1602 /* If the array is degraded, then skipping spares is both 1603 * dangerous and fairly pointless. 1604 * Dangerous because a device that was removed from the array 1605 * might have a event_count that still looks up-to-date, 1606 * so it can be re-added without a resync. 1607 * Pointless because if there are any spares to skip, 1608 * then a recovery will happen and soon that array won't 1609 * be degraded any more and the spare can go back to sleep then. 1610 */ 1611 mddev->sb_dirty = 1; 1612 1613 sync_req = mddev->in_sync; 1614 mddev->utime = get_seconds(); 1615 if (mddev->sb_dirty == 3) 1616 /* just a clean<-> dirty transition, possibly leave spares alone, 1617 * though if events isn't the right even/odd, we will have to do 1618 * spares after all 1619 */ 1620 nospares = 1; 1621 1622 /* If this is just a dirty<->clean transition, and the array is clean 1623 * and 'events' is odd, we can roll back to the previous clean state */ 1624 if (mddev->sb_dirty == 3 1625 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 1626 && (mddev->events & 1)) 1627 mddev->events--; 1628 else { 1629 /* otherwise we have to go forward and ... */ 1630 mddev->events ++; 1631 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ 1632 /* .. if the array isn't clean, insist on an odd 'events' */ 1633 if ((mddev->events&1)==0) { 1634 mddev->events++; 1635 nospares = 0; 1636 } 1637 } else { 1638 /* otherwise insist on an even 'events' (for clean states) */ 1639 if ((mddev->events&1)) { 1640 mddev->events++; 1641 nospares = 0; 1642 } 1643 } 1644 } 1645 1646 if (!mddev->events) { 1647 /* 1648 * oops, this 64-bit counter should never wrap. 
		 * Either we are around ~1 trillion A.D., assuming
		 * 1 reboot per second, or we have a bug:
		 */
		MD_BUG();
		mddev->events--;
	}
	mddev->sb_dirty = 2;
	sync_sbs(mddev, nospares);

	/*
	 * do not write anything to disk if using
	 * nonpersistent superblocks
	 */
	if (!mddev->persistent) {
		mddev->sb_dirty = 0;
		spin_unlock_irq(&mddev->write_lock);
		wake_up(&mddev->sb_wait);
		return;
	}
	spin_unlock_irq(&mddev->write_lock);

	dprintk(KERN_INFO
		"md: updating %s RAID superblock on device (in sync %d)\n",
		mdname(mddev), mddev->in_sync);

	err = bitmap_update_sb(mddev->bitmap);
	ITERATE_RDEV(mddev,rdev,tmp) {
		char b[BDEVNAME_SIZE];
		dprintk(KERN_INFO "md: ");
		if (rdev->sb_loaded != 1)
			continue; /* no noise on spare devices */
		if (test_bit(Faulty, &rdev->flags))
			dprintk("(skipping faulty ");

		dprintk("%s ", bdevname(rdev->bdev,b));
		if (!test_bit(Faulty, &rdev->flags)) {
			md_super_write(mddev, rdev,
				       rdev->sb_offset<<1, rdev->sb_size,
				       rdev->sb_page);
			dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
				bdevname(rdev->bdev,b),
				(unsigned long long)rdev->sb_offset);
			rdev->sb_events = mddev->events;

		} else
			dprintk(")\n");
		if (mddev->level == LEVEL_MULTIPATH)
			/* only need to write one superblock... */
			break;
	}
	md_super_wait(mddev);
	/* if there was a failure, sb_dirty was set to 1, and we re-write super */

	spin_lock_irq(&mddev->write_lock);
	if (mddev->in_sync != sync_req || mddev->sb_dirty == 1) {
		/* have to write it out again */
		spin_unlock_irq(&mddev->write_lock);
		goto repeat;
	}
	mddev->sb_dirty = 0;
	spin_unlock_irq(&mddev->write_lock);
	wake_up(&mddev->sb_wait);

}
EXPORT_SYMBOL_GPL(md_update_sb);
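/*
 * Illustrative note (editor's sketch, not authoritative): one way to read
 * the 'events' parity logic above.  Clean states keep an even event count,
 * dirty states an odd one.  A quick clean->dirty->clean bounce can then be
 * rolled back instead of moving forward, e.g.:
 *
 *	events == 100 (clean, even)
 *	write arrives, sb_dirty = 3  ->  events = 101 (dirty, odd)
 *	array becomes clean again    ->  events = 100 (rolled back)
 *
 * so spares whose superblocks still record 100 need not be rewritten
 * (see the nospares handling in sync_sbs()).
 */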
/* words written to sysfs files may, or may not, be \n terminated.
 * We want to accept either case. For this we use cmd_match.
 */
static int cmd_match(const char *cmd, const char *str)
{
	/* See if cmd, written into a sysfs file, matches
	 * str.  They must either be the same, or cmd can
	 * have a trailing newline
	 */
	while (*cmd && *str && *cmd == *str) {
		cmd++;
		str++;
	}
	if (*cmd == '\n')
		cmd++;
	if (*str || *cmd)
		return 0;
	return 1;
}

struct rdev_sysfs_entry {
	struct attribute attr;
	ssize_t (*show)(mdk_rdev_t *, char *);
	ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
};

static ssize_t
state_show(mdk_rdev_t *rdev, char *page)
{
	char *sep = "";
	int len = 0;

	if (test_bit(Faulty, &rdev->flags)) {
		len += sprintf(page+len, "%sfaulty", sep);
		sep = ",";
	}
	if (test_bit(In_sync, &rdev->flags)) {
		len += sprintf(page+len, "%sin_sync", sep);
		sep = ",";
	}
	if (test_bit(WriteMostly, &rdev->flags)) {
		len += sprintf(page+len, "%swrite_mostly", sep);
		sep = ",";
	}
	if (!test_bit(Faulty, &rdev->flags) &&
	    !test_bit(In_sync, &rdev->flags)) {
		len += sprintf(page+len, "%sspare", sep);
		sep = ",";
	}
	return len + sprintf(page+len, "\n");
}

static ssize_t
state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
{
	/* can write
	 *  faulty - simulates an error
	 *  remove - disconnects the device
	 *  writemostly - sets write_mostly
	 *  -writemostly - clears write_mostly
	 */
	int err = -EINVAL;
	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
		md_error(rdev->mddev, rdev);
		err = 0;
	} else if (cmd_match(buf, "remove")) {
		if (rdev->raid_disk >= 0)
			err = -EBUSY;
		else {
			mddev_t *mddev = rdev->mddev;
			kick_rdev_from_array(rdev);
			md_update_sb(mddev);
			md_new_event(mddev);
			err = 0;
		}
	} else if (cmd_match(buf, "writemostly")) {
		set_bit(WriteMostly, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "-writemostly")) {
		clear_bit(WriteMostly, &rdev->flags);
		err = 0;
	}
	return err ?
err : len; 1798 } 1799 static struct rdev_sysfs_entry rdev_state = 1800 __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store); 1801 1802 static ssize_t 1803 super_show(mdk_rdev_t *rdev, char *page) 1804 { 1805 if (rdev->sb_loaded && rdev->sb_size) { 1806 memcpy(page, page_address(rdev->sb_page), rdev->sb_size); 1807 return rdev->sb_size; 1808 } else 1809 return 0; 1810 } 1811 static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super); 1812 1813 static ssize_t 1814 errors_show(mdk_rdev_t *rdev, char *page) 1815 { 1816 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 1817 } 1818 1819 static ssize_t 1820 errors_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1821 { 1822 char *e; 1823 unsigned long n = simple_strtoul(buf, &e, 10); 1824 if (*buf && (*e == 0 || *e == '\n')) { 1825 atomic_set(&rdev->corrected_errors, n); 1826 return len; 1827 } 1828 return -EINVAL; 1829 } 1830 static struct rdev_sysfs_entry rdev_errors = 1831 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 1832 1833 static ssize_t 1834 slot_show(mdk_rdev_t *rdev, char *page) 1835 { 1836 if (rdev->raid_disk < 0) 1837 return sprintf(page, "none\n"); 1838 else 1839 return sprintf(page, "%d\n", rdev->raid_disk); 1840 } 1841 1842 static ssize_t 1843 slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1844 { 1845 char *e; 1846 int slot = simple_strtoul(buf, &e, 10); 1847 if (strncmp(buf, "none", 4)==0) 1848 slot = -1; 1849 else if (e==buf || (*e && *e!= '\n')) 1850 return -EINVAL; 1851 if (rdev->mddev->pers) 1852 /* Cannot set slot in active array (yet) */ 1853 return -EBUSY; 1854 if (slot >= rdev->mddev->raid_disks) 1855 return -ENOSPC; 1856 rdev->raid_disk = slot; 1857 /* assume it is working */ 1858 rdev->flags = 0; 1859 set_bit(In_sync, &rdev->flags); 1860 return len; 1861 } 1862 1863 1864 static struct rdev_sysfs_entry rdev_slot = 1865 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 1866 1867 static ssize_t 1868 offset_show(mdk_rdev_t *rdev, char *page) 1869 { 1870 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 1871 } 1872 1873 static ssize_t 1874 offset_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1875 { 1876 char *e; 1877 unsigned long long offset = simple_strtoull(buf, &e, 10); 1878 if (e==buf || (*e && *e != '\n')) 1879 return -EINVAL; 1880 if (rdev->mddev->pers) 1881 return -EBUSY; 1882 rdev->data_offset = offset; 1883 return len; 1884 } 1885 1886 static struct rdev_sysfs_entry rdev_offset = 1887 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 1888 1889 static ssize_t 1890 rdev_size_show(mdk_rdev_t *rdev, char *page) 1891 { 1892 return sprintf(page, "%llu\n", (unsigned long long)rdev->size); 1893 } 1894 1895 static ssize_t 1896 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1897 { 1898 char *e; 1899 unsigned long long size = simple_strtoull(buf, &e, 10); 1900 if (e==buf || (*e && *e != '\n')) 1901 return -EINVAL; 1902 if (rdev->mddev->pers) 1903 return -EBUSY; 1904 rdev->size = size; 1905 if (size < rdev->mddev->size || rdev->mddev->size == 0) 1906 rdev->mddev->size = size; 1907 return len; 1908 } 1909 1910 static struct rdev_sysfs_entry rdev_size = 1911 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 1912 1913 static struct attribute *rdev_default_attrs[] = { 1914 &rdev_state.attr, 1915 &rdev_super.attr, 1916 &rdev_errors.attr, 1917 &rdev_slot.attr, 1918 &rdev_offset.attr, 1919 &rdev_size.attr, 1920 NULL, 1921 }; 1922 static ssize_t 1923 rdev_attr_show(struct kobject *kobj, struct attribute *attr, 
char *page) 1924 { 1925 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 1926 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 1927 1928 if (!entry->show) 1929 return -EIO; 1930 return entry->show(rdev, page); 1931 } 1932 1933 static ssize_t 1934 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 1935 const char *page, size_t length) 1936 { 1937 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 1938 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 1939 1940 if (!entry->store) 1941 return -EIO; 1942 if (!capable(CAP_SYS_ADMIN)) 1943 return -EACCES; 1944 return entry->store(rdev, page, length); 1945 } 1946 1947 static void rdev_free(struct kobject *ko) 1948 { 1949 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); 1950 kfree(rdev); 1951 } 1952 static struct sysfs_ops rdev_sysfs_ops = { 1953 .show = rdev_attr_show, 1954 .store = rdev_attr_store, 1955 }; 1956 static struct kobj_type rdev_ktype = { 1957 .release = rdev_free, 1958 .sysfs_ops = &rdev_sysfs_ops, 1959 .default_attrs = rdev_default_attrs, 1960 }; 1961 1962 /* 1963 * Import a device. If 'super_format' >= 0, then sanity check the superblock 1964 * 1965 * mark the device faulty if: 1966 * 1967 * - the device is nonexistent (zero size) 1968 * - the device has no valid superblock 1969 * 1970 * a faulty rdev _never_ has rdev->sb set. 1971 */ 1972 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) 1973 { 1974 char b[BDEVNAME_SIZE]; 1975 int err; 1976 mdk_rdev_t *rdev; 1977 sector_t size; 1978 1979 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 1980 if (!rdev) { 1981 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 1982 return ERR_PTR(-ENOMEM); 1983 } 1984 1985 if ((err = alloc_disk_sb(rdev))) 1986 goto abort_free; 1987 1988 err = lock_rdev(rdev, newdev); 1989 if (err) 1990 goto abort_free; 1991 1992 rdev->kobj.parent = NULL; 1993 rdev->kobj.ktype = &rdev_ktype; 1994 kobject_init(&rdev->kobj); 1995 1996 rdev->desc_nr = -1; 1997 rdev->flags = 0; 1998 rdev->data_offset = 0; 1999 rdev->sb_events = 0; 2000 atomic_set(&rdev->nr_pending, 0); 2001 atomic_set(&rdev->read_errors, 0); 2002 atomic_set(&rdev->corrected_errors, 0); 2003 2004 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2005 if (!size) { 2006 printk(KERN_WARNING 2007 "md: %s has zero or unknown size, marking faulty!\n", 2008 bdevname(rdev->bdev,b)); 2009 err = -EINVAL; 2010 goto abort_free; 2011 } 2012 2013 if (super_format >= 0) { 2014 err = super_types[super_format]. 2015 load_super(rdev, NULL, super_minor); 2016 if (err == -EINVAL) { 2017 printk(KERN_WARNING 2018 "md: %s has invalid sb, not importing!\n", 2019 bdevname(rdev->bdev,b)); 2020 goto abort_free; 2021 } 2022 if (err < 0) { 2023 printk(KERN_WARNING 2024 "md: could not read %s's sb, not importing!\n", 2025 bdevname(rdev->bdev,b)); 2026 goto abort_free; 2027 } 2028 } 2029 INIT_LIST_HEAD(&rdev->same_set); 2030 2031 return rdev; 2032 2033 abort_free: 2034 if (rdev->sb_page) { 2035 if (rdev->bdev) 2036 unlock_rdev(rdev); 2037 free_disk_sb(rdev); 2038 } 2039 kfree(rdev); 2040 return ERR_PTR(err); 2041 } 2042 2043 /* 2044 * Check a full RAID array for plausibility 2045 */ 2046 2047 2048 static void analyze_sbs(mddev_t * mddev) 2049 { 2050 int i; 2051 struct list_head *tmp; 2052 mdk_rdev_t *rdev, *freshest; 2053 char b[BDEVNAME_SIZE]; 2054 2055 freshest = NULL; 2056 ITERATE_RDEV(mddev,rdev,tmp) 2057 switch (super_types[mddev->major_version]. 
2058 load_super(rdev, freshest, mddev->minor_version)) { 2059 case 1: 2060 freshest = rdev; 2061 break; 2062 case 0: 2063 break; 2064 default: 2065 printk( KERN_ERR \ 2066 "md: fatal superblock inconsistency in %s" 2067 " -- removing from array\n", 2068 bdevname(rdev->bdev,b)); 2069 kick_rdev_from_array(rdev); 2070 } 2071 2072 2073 super_types[mddev->major_version]. 2074 validate_super(mddev, freshest); 2075 2076 i = 0; 2077 ITERATE_RDEV(mddev,rdev,tmp) { 2078 if (rdev != freshest) 2079 if (super_types[mddev->major_version]. 2080 validate_super(mddev, rdev)) { 2081 printk(KERN_WARNING "md: kicking non-fresh %s" 2082 " from array!\n", 2083 bdevname(rdev->bdev,b)); 2084 kick_rdev_from_array(rdev); 2085 continue; 2086 } 2087 if (mddev->level == LEVEL_MULTIPATH) { 2088 rdev->desc_nr = i++; 2089 rdev->raid_disk = rdev->desc_nr; 2090 set_bit(In_sync, &rdev->flags); 2091 } 2092 } 2093 2094 2095 2096 if (mddev->recovery_cp != MaxSector && 2097 mddev->level >= 1) 2098 printk(KERN_ERR "md: %s: raid array is not clean" 2099 " -- starting background reconstruction\n", 2100 mdname(mddev)); 2101 2102 } 2103 2104 static ssize_t 2105 safe_delay_show(mddev_t *mddev, char *page) 2106 { 2107 int msec = (mddev->safemode_delay*1000)/HZ; 2108 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); 2109 } 2110 static ssize_t 2111 safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) 2112 { 2113 int scale=1; 2114 int dot=0; 2115 int i; 2116 unsigned long msec; 2117 char buf[30]; 2118 char *e; 2119 /* remove a period, and count digits after it */ 2120 if (len >= sizeof(buf)) 2121 return -EINVAL; 2122 strlcpy(buf, cbuf, len); 2123 buf[len] = 0; 2124 for (i=0; i<len; i++) { 2125 if (dot) { 2126 if (isdigit(buf[i])) { 2127 buf[i-1] = buf[i]; 2128 scale *= 10; 2129 } 2130 buf[i] = 0; 2131 } else if (buf[i] == '.') { 2132 dot=1; 2133 buf[i] = 0; 2134 } 2135 } 2136 msec = simple_strtoul(buf, &e, 10); 2137 if (e == buf || (*e && *e != '\n')) 2138 return -EINVAL; 2139 msec = (msec * 1000) / scale; 2140 if (msec == 0) 2141 mddev->safemode_delay = 0; 2142 else { 2143 mddev->safemode_delay = (msec*HZ)/1000; 2144 if (mddev->safemode_delay == 0) 2145 mddev->safemode_delay = 1; 2146 } 2147 return len; 2148 } 2149 static struct md_sysfs_entry md_safe_delay = 2150 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 2151 2152 static ssize_t 2153 level_show(mddev_t *mddev, char *page) 2154 { 2155 struct mdk_personality *p = mddev->pers; 2156 if (p) 2157 return sprintf(page, "%s\n", p->name); 2158 else if (mddev->clevel[0]) 2159 return sprintf(page, "%s\n", mddev->clevel); 2160 else if (mddev->level != LEVEL_NONE) 2161 return sprintf(page, "%d\n", mddev->level); 2162 else 2163 return 0; 2164 } 2165 2166 static ssize_t 2167 level_store(mddev_t *mddev, const char *buf, size_t len) 2168 { 2169 int rv = len; 2170 if (mddev->pers) 2171 return -EBUSY; 2172 if (len == 0) 2173 return 0; 2174 if (len >= sizeof(mddev->clevel)) 2175 return -ENOSPC; 2176 strncpy(mddev->clevel, buf, len); 2177 if (mddev->clevel[len-1] == '\n') 2178 len--; 2179 mddev->clevel[len] = 0; 2180 mddev->level = LEVEL_NONE; 2181 return rv; 2182 } 2183 2184 static struct md_sysfs_entry md_level = 2185 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 2186 2187 2188 static ssize_t 2189 layout_show(mddev_t *mddev, char *page) 2190 { 2191 /* just a number, not meaningful for all levels */ 2192 return sprintf(page, "%d\n", mddev->layout); 2193 } 2194 2195 static ssize_t 2196 layout_store(mddev_t *mddev, const char *buf, size_t len) 
2197 { 2198 char *e; 2199 unsigned long n = simple_strtoul(buf, &e, 10); 2200 if (mddev->pers) 2201 return -EBUSY; 2202 2203 if (!*buf || (*e && *e != '\n')) 2204 return -EINVAL; 2205 2206 mddev->layout = n; 2207 return len; 2208 } 2209 static struct md_sysfs_entry md_layout = 2210 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 2211 2212 2213 static ssize_t 2214 raid_disks_show(mddev_t *mddev, char *page) 2215 { 2216 if (mddev->raid_disks == 0) 2217 return 0; 2218 return sprintf(page, "%d\n", mddev->raid_disks); 2219 } 2220 2221 static int update_raid_disks(mddev_t *mddev, int raid_disks); 2222 2223 static ssize_t 2224 raid_disks_store(mddev_t *mddev, const char *buf, size_t len) 2225 { 2226 /* can only set raid_disks if array is not yet active */ 2227 char *e; 2228 int rv = 0; 2229 unsigned long n = simple_strtoul(buf, &e, 10); 2230 2231 if (!*buf || (*e && *e != '\n')) 2232 return -EINVAL; 2233 2234 if (mddev->pers) 2235 rv = update_raid_disks(mddev, n); 2236 else 2237 mddev->raid_disks = n; 2238 return rv ? rv : len; 2239 } 2240 static struct md_sysfs_entry md_raid_disks = 2241 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 2242 2243 static ssize_t 2244 chunk_size_show(mddev_t *mddev, char *page) 2245 { 2246 return sprintf(page, "%d\n", mddev->chunk_size); 2247 } 2248 2249 static ssize_t 2250 chunk_size_store(mddev_t *mddev, const char *buf, size_t len) 2251 { 2252 /* can only set chunk_size if array is not yet active */ 2253 char *e; 2254 unsigned long n = simple_strtoul(buf, &e, 10); 2255 2256 if (mddev->pers) 2257 return -EBUSY; 2258 if (!*buf || (*e && *e != '\n')) 2259 return -EINVAL; 2260 2261 mddev->chunk_size = n; 2262 return len; 2263 } 2264 static struct md_sysfs_entry md_chunk_size = 2265 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 2266 2267 static ssize_t 2268 resync_start_show(mddev_t *mddev, char *page) 2269 { 2270 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 2271 } 2272 2273 static ssize_t 2274 resync_start_store(mddev_t *mddev, const char *buf, size_t len) 2275 { 2276 /* can only set chunk_size if array is not yet active */ 2277 char *e; 2278 unsigned long long n = simple_strtoull(buf, &e, 10); 2279 2280 if (mddev->pers) 2281 return -EBUSY; 2282 if (!*buf || (*e && *e != '\n')) 2283 return -EINVAL; 2284 2285 mddev->recovery_cp = n; 2286 return len; 2287 } 2288 static struct md_sysfs_entry md_resync_start = 2289 __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); 2290 2291 /* 2292 * The array state can be: 2293 * 2294 * clear 2295 * No devices, no size, no level 2296 * Equivalent to STOP_ARRAY ioctl 2297 * inactive 2298 * May have some settings, but array is not active 2299 * all IO results in error 2300 * When written, doesn't tear down array, but just stops it 2301 * suspended (not supported yet) 2302 * All IO requests will block. The array can be reconfigured. 2303 * Writing this, if accepted, will block until array is quiessent 2304 * readonly 2305 * no resync can happen. no superblocks get written. 2306 * write requests fail 2307 * read-auto 2308 * like readonly, but behaves like 'clean' on a write request. 2309 * 2310 * clean - no pending writes, but otherwise active. 2311 * When written to inactive array, starts without resync 2312 * If a write request arrives then 2313 * if metadata is known, mark 'dirty' and switch to 'active'. 
2314 * if not known, block and switch to write-pending 2315 * If written to an active array that has pending writes, then fails. 2316 * active 2317 * fully active: IO and resync can be happening. 2318 * When written to inactive array, starts with resync 2319 * 2320 * write-pending 2321 * clean, but writes are blocked waiting for 'active' to be written. 2322 * 2323 * active-idle 2324 * like active, but no writes have been seen for a while (100msec). 2325 * 2326 */ 2327 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 2328 write_pending, active_idle, bad_word}; 2329 static char *array_states[] = { 2330 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 2331 "write-pending", "active-idle", NULL }; 2332 2333 static int match_word(const char *word, char **list) 2334 { 2335 int n; 2336 for (n=0; list[n]; n++) 2337 if (cmd_match(word, list[n])) 2338 break; 2339 return n; 2340 } 2341 2342 static ssize_t 2343 array_state_show(mddev_t *mddev, char *page) 2344 { 2345 enum array_state st = inactive; 2346 2347 if (mddev->pers) 2348 switch(mddev->ro) { 2349 case 1: 2350 st = readonly; 2351 break; 2352 case 2: 2353 st = read_auto; 2354 break; 2355 case 0: 2356 if (mddev->in_sync) 2357 st = clean; 2358 else if (mddev->safemode) 2359 st = active_idle; 2360 else 2361 st = active; 2362 } 2363 else { 2364 if (list_empty(&mddev->disks) && 2365 mddev->raid_disks == 0 && 2366 mddev->size == 0) 2367 st = clear; 2368 else 2369 st = inactive; 2370 } 2371 return sprintf(page, "%s\n", array_states[st]); 2372 } 2373 2374 static int do_md_stop(mddev_t * mddev, int ro); 2375 static int do_md_run(mddev_t * mddev); 2376 static int restart_array(mddev_t *mddev); 2377 2378 static ssize_t 2379 array_state_store(mddev_t *mddev, const char *buf, size_t len) 2380 { 2381 int err = -EINVAL; 2382 enum array_state st = match_word(buf, array_states); 2383 switch(st) { 2384 case bad_word: 2385 break; 2386 case clear: 2387 /* stopping an active array */ 2388 if (mddev->pers) { 2389 if (atomic_read(&mddev->active) > 1) 2390 return -EBUSY; 2391 err = do_md_stop(mddev, 0); 2392 } 2393 break; 2394 case inactive: 2395 /* stopping an active array */ 2396 if (mddev->pers) { 2397 if (atomic_read(&mddev->active) > 1) 2398 return -EBUSY; 2399 err = do_md_stop(mddev, 2); 2400 } 2401 break; 2402 case suspended: 2403 break; /* not supported yet */ 2404 case readonly: 2405 if (mddev->pers) 2406 err = do_md_stop(mddev, 1); 2407 else { 2408 mddev->ro = 1; 2409 err = do_md_run(mddev); 2410 } 2411 break; 2412 case read_auto: 2413 /* stopping an active array */ 2414 if (mddev->pers) { 2415 err = do_md_stop(mddev, 1); 2416 if (err == 0) 2417 mddev->ro = 2; /* FIXME mark devices writable */ 2418 } else { 2419 mddev->ro = 2; 2420 err = do_md_run(mddev); 2421 } 2422 break; 2423 case clean: 2424 if (mddev->pers) { 2425 restart_array(mddev); 2426 spin_lock_irq(&mddev->write_lock); 2427 if (atomic_read(&mddev->writes_pending) == 0) { 2428 mddev->in_sync = 1; 2429 mddev->sb_dirty = 1; 2430 } 2431 spin_unlock_irq(&mddev->write_lock); 2432 } else { 2433 mddev->ro = 0; 2434 mddev->recovery_cp = MaxSector; 2435 err = do_md_run(mddev); 2436 } 2437 break; 2438 case active: 2439 if (mddev->pers) { 2440 restart_array(mddev); 2441 mddev->sb_dirty = 0; 2442 wake_up(&mddev->sb_wait); 2443 err = 0; 2444 } else { 2445 mddev->ro = 0; 2446 err = do_md_run(mddev); 2447 } 2448 break; 2449 case write_pending: 2450 case active_idle: 2451 /* these cannot be set */ 2452 break; 2453 } 2454 if (err) 2455 return err; 2456 
else 2457 return len; 2458 } 2459 static struct md_sysfs_entry md_array_state = 2460 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 2461 2462 static ssize_t 2463 null_show(mddev_t *mddev, char *page) 2464 { 2465 return -EINVAL; 2466 } 2467 2468 static ssize_t 2469 new_dev_store(mddev_t *mddev, const char *buf, size_t len) 2470 { 2471 /* buf must be %d:%d\n? giving major and minor numbers */ 2472 /* The new device is added to the array. 2473 * If the array has a persistent superblock, we read the 2474 * superblock to initialise info and check validity. 2475 * Otherwise, only checking done is that in bind_rdev_to_array, 2476 * which mainly checks size. 2477 */ 2478 char *e; 2479 int major = simple_strtoul(buf, &e, 10); 2480 int minor; 2481 dev_t dev; 2482 mdk_rdev_t *rdev; 2483 int err; 2484 2485 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 2486 return -EINVAL; 2487 minor = simple_strtoul(e+1, &e, 10); 2488 if (*e && *e != '\n') 2489 return -EINVAL; 2490 dev = MKDEV(major, minor); 2491 if (major != MAJOR(dev) || 2492 minor != MINOR(dev)) 2493 return -EOVERFLOW; 2494 2495 2496 if (mddev->persistent) { 2497 rdev = md_import_device(dev, mddev->major_version, 2498 mddev->minor_version); 2499 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 2500 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 2501 mdk_rdev_t, same_set); 2502 err = super_types[mddev->major_version] 2503 .load_super(rdev, rdev0, mddev->minor_version); 2504 if (err < 0) 2505 goto out; 2506 } 2507 } else 2508 rdev = md_import_device(dev, -1, -1); 2509 2510 if (IS_ERR(rdev)) 2511 return PTR_ERR(rdev); 2512 err = bind_rdev_to_array(rdev, mddev); 2513 out: 2514 if (err) 2515 export_rdev(rdev); 2516 return err ? err : len; 2517 } 2518 2519 static struct md_sysfs_entry md_new_device = 2520 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 2521 2522 static ssize_t 2523 size_show(mddev_t *mddev, char *page) 2524 { 2525 return sprintf(page, "%llu\n", (unsigned long long)mddev->size); 2526 } 2527 2528 static int update_size(mddev_t *mddev, unsigned long size); 2529 2530 static ssize_t 2531 size_store(mddev_t *mddev, const char *buf, size_t len) 2532 { 2533 /* If array is inactive, we can reduce the component size, but 2534 * not increase it (except from 0). 2535 * If array is active, we can try an on-line resize 2536 */ 2537 char *e; 2538 int err = 0; 2539 unsigned long long size = simple_strtoull(buf, &e, 10); 2540 if (!*buf || *buf == '\n' || 2541 (*e && *e != '\n')) 2542 return -EINVAL; 2543 2544 if (mddev->pers) { 2545 err = update_size(mddev, size); 2546 md_update_sb(mddev); 2547 } else { 2548 if (mddev->size == 0 || 2549 mddev->size > size) 2550 mddev->size = size; 2551 else 2552 err = -ENOSPC; 2553 } 2554 return err ? err : len; 2555 } 2556 2557 static struct md_sysfs_entry md_size = 2558 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 2559 2560 2561 /* Metdata version. 
2562 * This is either 'none' for arrays with externally managed metadata, 2563 * or N.M for internally known formats 2564 */ 2565 static ssize_t 2566 metadata_show(mddev_t *mddev, char *page) 2567 { 2568 if (mddev->persistent) 2569 return sprintf(page, "%d.%d\n", 2570 mddev->major_version, mddev->minor_version); 2571 else 2572 return sprintf(page, "none\n"); 2573 } 2574 2575 static ssize_t 2576 metadata_store(mddev_t *mddev, const char *buf, size_t len) 2577 { 2578 int major, minor; 2579 char *e; 2580 if (!list_empty(&mddev->disks)) 2581 return -EBUSY; 2582 2583 if (cmd_match(buf, "none")) { 2584 mddev->persistent = 0; 2585 mddev->major_version = 0; 2586 mddev->minor_version = 90; 2587 return len; 2588 } 2589 major = simple_strtoul(buf, &e, 10); 2590 if (e==buf || *e != '.') 2591 return -EINVAL; 2592 buf = e+1; 2593 minor = simple_strtoul(buf, &e, 10); 2594 if (e==buf || *e != '\n') 2595 return -EINVAL; 2596 if (major >= sizeof(super_types)/sizeof(super_types[0]) || 2597 super_types[major].name == NULL) 2598 return -ENOENT; 2599 mddev->major_version = major; 2600 mddev->minor_version = minor; 2601 mddev->persistent = 1; 2602 return len; 2603 } 2604 2605 static struct md_sysfs_entry md_metadata = 2606 __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 2607 2608 static ssize_t 2609 action_show(mddev_t *mddev, char *page) 2610 { 2611 char *type = "idle"; 2612 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 2613 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) { 2614 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 2615 type = "reshape"; 2616 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 2617 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 2618 type = "resync"; 2619 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 2620 type = "check"; 2621 else 2622 type = "repair"; 2623 } else 2624 type = "recover"; 2625 } 2626 return sprintf(page, "%s\n", type); 2627 } 2628 2629 static ssize_t 2630 action_store(mddev_t *mddev, const char *page, size_t len) 2631 { 2632 if (!mddev->pers || !mddev->pers->sync_request) 2633 return -EINVAL; 2634 2635 if (cmd_match(page, "idle")) { 2636 if (mddev->sync_thread) { 2637 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2638 md_unregister_thread(mddev->sync_thread); 2639 mddev->sync_thread = NULL; 2640 mddev->recovery = 0; 2641 } 2642 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 2643 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 2644 return -EBUSY; 2645 else if (cmd_match(page, "resync") || cmd_match(page, "recover")) 2646 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2647 else if (cmd_match(page, "reshape")) { 2648 int err; 2649 if (mddev->pers->start_reshape == NULL) 2650 return -EINVAL; 2651 err = mddev->pers->start_reshape(mddev); 2652 if (err) 2653 return err; 2654 } else { 2655 if (cmd_match(page, "check")) 2656 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 2657 else if (!cmd_match(page, "repair")) 2658 return -EINVAL; 2659 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 2660 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 2661 } 2662 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2663 md_wakeup_thread(mddev->thread); 2664 return len; 2665 } 2666 2667 static ssize_t 2668 mismatch_cnt_show(mddev_t *mddev, char *page) 2669 { 2670 return sprintf(page, "%llu\n", 2671 (unsigned long long) mddev->resync_mismatches); 2672 } 2673 2674 static struct md_sysfs_entry md_scan_mode = 2675 __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 2676 2677 2678 static struct md_sysfs_entry 
md_mismatches = __ATTR_RO(mismatch_cnt); 2679 2680 static ssize_t 2681 sync_min_show(mddev_t *mddev, char *page) 2682 { 2683 return sprintf(page, "%d (%s)\n", speed_min(mddev), 2684 mddev->sync_speed_min ? "local": "system"); 2685 } 2686 2687 static ssize_t 2688 sync_min_store(mddev_t *mddev, const char *buf, size_t len) 2689 { 2690 int min; 2691 char *e; 2692 if (strncmp(buf, "system", 6)==0) { 2693 mddev->sync_speed_min = 0; 2694 return len; 2695 } 2696 min = simple_strtoul(buf, &e, 10); 2697 if (buf == e || (*e && *e != '\n') || min <= 0) 2698 return -EINVAL; 2699 mddev->sync_speed_min = min; 2700 return len; 2701 } 2702 2703 static struct md_sysfs_entry md_sync_min = 2704 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 2705 2706 static ssize_t 2707 sync_max_show(mddev_t *mddev, char *page) 2708 { 2709 return sprintf(page, "%d (%s)\n", speed_max(mddev), 2710 mddev->sync_speed_max ? "local": "system"); 2711 } 2712 2713 static ssize_t 2714 sync_max_store(mddev_t *mddev, const char *buf, size_t len) 2715 { 2716 int max; 2717 char *e; 2718 if (strncmp(buf, "system", 6)==0) { 2719 mddev->sync_speed_max = 0; 2720 return len; 2721 } 2722 max = simple_strtoul(buf, &e, 10); 2723 if (buf == e || (*e && *e != '\n') || max <= 0) 2724 return -EINVAL; 2725 mddev->sync_speed_max = max; 2726 return len; 2727 } 2728 2729 static struct md_sysfs_entry md_sync_max = 2730 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 2731 2732 2733 static ssize_t 2734 sync_speed_show(mddev_t *mddev, char *page) 2735 { 2736 unsigned long resync, dt, db; 2737 resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)); 2738 dt = ((jiffies - mddev->resync_mark) / HZ); 2739 if (!dt) dt++; 2740 db = resync - (mddev->resync_mark_cnt); 2741 return sprintf(page, "%ld\n", db/dt/2); /* K/sec */ 2742 } 2743 2744 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 2745 2746 static ssize_t 2747 sync_completed_show(mddev_t *mddev, char *page) 2748 { 2749 unsigned long max_blocks, resync; 2750 2751 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 2752 max_blocks = mddev->resync_max_sectors; 2753 else 2754 max_blocks = mddev->size << 1; 2755 2756 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active)); 2757 return sprintf(page, "%lu / %lu\n", resync, max_blocks); 2758 } 2759 2760 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); 2761 2762 static ssize_t 2763 suspend_lo_show(mddev_t *mddev, char *page) 2764 { 2765 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 2766 } 2767 2768 static ssize_t 2769 suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) 2770 { 2771 char *e; 2772 unsigned long long new = simple_strtoull(buf, &e, 10); 2773 2774 if (mddev->pers->quiesce == NULL) 2775 return -EINVAL; 2776 if (buf == e || (*e && *e != '\n')) 2777 return -EINVAL; 2778 if (new >= mddev->suspend_hi || 2779 (new > mddev->suspend_lo && new < mddev->suspend_hi)) { 2780 mddev->suspend_lo = new; 2781 mddev->pers->quiesce(mddev, 2); 2782 return len; 2783 } else 2784 return -EINVAL; 2785 } 2786 static struct md_sysfs_entry md_suspend_lo = 2787 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 2788 2789 2790 static ssize_t 2791 suspend_hi_show(mddev_t *mddev, char *page) 2792 { 2793 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); 2794 } 2795 2796 static ssize_t 2797 suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) 2798 { 2799 char *e; 2800 unsigned long 
long new = simple_strtoull(buf, &e, 10); 2801 2802 if (mddev->pers->quiesce == NULL) 2803 return -EINVAL; 2804 if (buf == e || (*e && *e != '\n')) 2805 return -EINVAL; 2806 if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) || 2807 (new > mddev->suspend_lo && new > mddev->suspend_hi)) { 2808 mddev->suspend_hi = new; 2809 mddev->pers->quiesce(mddev, 1); 2810 mddev->pers->quiesce(mddev, 0); 2811 return len; 2812 } else 2813 return -EINVAL; 2814 } 2815 static struct md_sysfs_entry md_suspend_hi = 2816 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 2817 2818 2819 static struct attribute *md_default_attrs[] = { 2820 &md_level.attr, 2821 &md_layout.attr, 2822 &md_raid_disks.attr, 2823 &md_chunk_size.attr, 2824 &md_size.attr, 2825 &md_resync_start.attr, 2826 &md_metadata.attr, 2827 &md_new_device.attr, 2828 &md_safe_delay.attr, 2829 &md_array_state.attr, 2830 NULL, 2831 }; 2832 2833 static struct attribute *md_redundancy_attrs[] = { 2834 &md_scan_mode.attr, 2835 &md_mismatches.attr, 2836 &md_sync_min.attr, 2837 &md_sync_max.attr, 2838 &md_sync_speed.attr, 2839 &md_sync_completed.attr, 2840 &md_suspend_lo.attr, 2841 &md_suspend_hi.attr, 2842 NULL, 2843 }; 2844 static struct attribute_group md_redundancy_group = { 2845 .name = NULL, 2846 .attrs = md_redundancy_attrs, 2847 }; 2848 2849 2850 static ssize_t 2851 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 2852 { 2853 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 2854 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 2855 ssize_t rv; 2856 2857 if (!entry->show) 2858 return -EIO; 2859 rv = mddev_lock(mddev); 2860 if (!rv) { 2861 rv = entry->show(mddev, page); 2862 mddev_unlock(mddev); 2863 } 2864 return rv; 2865 } 2866 2867 static ssize_t 2868 md_attr_store(struct kobject *kobj, struct attribute *attr, 2869 const char *page, size_t length) 2870 { 2871 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 2872 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 2873 ssize_t rv; 2874 2875 if (!entry->store) 2876 return -EIO; 2877 if (!capable(CAP_SYS_ADMIN)) 2878 return -EACCES; 2879 rv = mddev_lock(mddev); 2880 if (!rv) { 2881 rv = entry->store(mddev, page, length); 2882 mddev_unlock(mddev); 2883 } 2884 return rv; 2885 } 2886 2887 static void md_free(struct kobject *ko) 2888 { 2889 mddev_t *mddev = container_of(ko, mddev_t, kobj); 2890 kfree(mddev); 2891 } 2892 2893 static struct sysfs_ops md_sysfs_ops = { 2894 .show = md_attr_show, 2895 .store = md_attr_store, 2896 }; 2897 static struct kobj_type md_ktype = { 2898 .release = md_free, 2899 .sysfs_ops = &md_sysfs_ops, 2900 .default_attrs = md_default_attrs, 2901 }; 2902 2903 int mdp_major = 0; 2904 2905 static struct kobject *md_probe(dev_t dev, int *part, void *data) 2906 { 2907 static DEFINE_MUTEX(disks_mutex); 2908 mddev_t *mddev = mddev_find(dev); 2909 struct gendisk *disk; 2910 int partitioned = (MAJOR(dev) != MD_MAJOR); 2911 int shift = partitioned ? 
MdpMinorShift : 0; 2912 int unit = MINOR(dev) >> shift; 2913 2914 if (!mddev) 2915 return NULL; 2916 2917 mutex_lock(&disks_mutex); 2918 if (mddev->gendisk) { 2919 mutex_unlock(&disks_mutex); 2920 mddev_put(mddev); 2921 return NULL; 2922 } 2923 disk = alloc_disk(1 << shift); 2924 if (!disk) { 2925 mutex_unlock(&disks_mutex); 2926 mddev_put(mddev); 2927 return NULL; 2928 } 2929 disk->major = MAJOR(dev); 2930 disk->first_minor = unit << shift; 2931 if (partitioned) 2932 sprintf(disk->disk_name, "md_d%d", unit); 2933 else 2934 sprintf(disk->disk_name, "md%d", unit); 2935 disk->fops = &md_fops; 2936 disk->private_data = mddev; 2937 disk->queue = mddev->queue; 2938 add_disk(disk); 2939 mddev->gendisk = disk; 2940 mutex_unlock(&disks_mutex); 2941 mddev->kobj.parent = &disk->kobj; 2942 mddev->kobj.k_name = NULL; 2943 snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md"); 2944 mddev->kobj.ktype = &md_ktype; 2945 kobject_register(&mddev->kobj); 2946 return NULL; 2947 } 2948 2949 static void md_safemode_timeout(unsigned long data) 2950 { 2951 mddev_t *mddev = (mddev_t *) data; 2952 2953 mddev->safemode = 1; 2954 md_wakeup_thread(mddev->thread); 2955 } 2956 2957 static int start_dirty_degraded; 2958 2959 static int do_md_run(mddev_t * mddev) 2960 { 2961 int err; 2962 int chunk_size; 2963 struct list_head *tmp; 2964 mdk_rdev_t *rdev; 2965 struct gendisk *disk; 2966 struct mdk_personality *pers; 2967 char b[BDEVNAME_SIZE]; 2968 2969 if (list_empty(&mddev->disks)) 2970 /* cannot run an array with no devices.. */ 2971 return -EINVAL; 2972 2973 if (mddev->pers) 2974 return -EBUSY; 2975 2976 /* 2977 * Analyze all RAID superblock(s) 2978 */ 2979 if (!mddev->raid_disks) 2980 analyze_sbs(mddev); 2981 2982 chunk_size = mddev->chunk_size; 2983 2984 if (chunk_size) { 2985 if (chunk_size > MAX_CHUNK_SIZE) { 2986 printk(KERN_ERR "too big chunk_size: %d > %d\n", 2987 chunk_size, MAX_CHUNK_SIZE); 2988 return -EINVAL; 2989 } 2990 /* 2991 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE 2992 */ 2993 if ( (1 << ffz(~chunk_size)) != chunk_size) { 2994 printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size); 2995 return -EINVAL; 2996 } 2997 if (chunk_size < PAGE_SIZE) { 2998 printk(KERN_ERR "too small chunk_size: %d < %ld\n", 2999 chunk_size, PAGE_SIZE); 3000 return -EINVAL; 3001 } 3002 3003 /* devices must have minimum size of one chunk */ 3004 ITERATE_RDEV(mddev,rdev,tmp) { 3005 if (test_bit(Faulty, &rdev->flags)) 3006 continue; 3007 if (rdev->size < chunk_size / 1024) { 3008 printk(KERN_WARNING 3009 "md: Dev %s smaller than chunk_size:" 3010 " %lluk < %dk\n", 3011 bdevname(rdev->bdev,b), 3012 (unsigned long long)rdev->size, 3013 chunk_size / 1024); 3014 return -EINVAL; 3015 } 3016 } 3017 } 3018 3019 #ifdef CONFIG_KMOD 3020 if (mddev->level != LEVEL_NONE) 3021 request_module("md-level-%d", mddev->level); 3022 else if (mddev->clevel[0]) 3023 request_module("md-%s", mddev->clevel); 3024 #endif 3025 3026 /* 3027 * Drop all container device buffers, from now on 3028 * the only valid external interface is through the md 3029 * device. 
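 * (sync_blockdev() flushes any dirty pages and invalidate_bdev() drops
 * the cached buffers of every non-faulty member device below)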
3030 * Also find largest hardsector size 3031 */ 3032 ITERATE_RDEV(mddev,rdev,tmp) { 3033 if (test_bit(Faulty, &rdev->flags)) 3034 continue; 3035 sync_blockdev(rdev->bdev); 3036 invalidate_bdev(rdev->bdev, 0); 3037 } 3038 3039 md_probe(mddev->unit, NULL, NULL); 3040 disk = mddev->gendisk; 3041 if (!disk) 3042 return -ENOMEM; 3043 3044 spin_lock(&pers_lock); 3045 pers = find_pers(mddev->level, mddev->clevel); 3046 if (!pers || !try_module_get(pers->owner)) { 3047 spin_unlock(&pers_lock); 3048 if (mddev->level != LEVEL_NONE) 3049 printk(KERN_WARNING "md: personality for level %d is not loaded!\n", 3050 mddev->level); 3051 else 3052 printk(KERN_WARNING "md: personality for level %s is not loaded!\n", 3053 mddev->clevel); 3054 return -EINVAL; 3055 } 3056 mddev->pers = pers; 3057 spin_unlock(&pers_lock); 3058 mddev->level = pers->level; 3059 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 3060 3061 if (mddev->reshape_position != MaxSector && 3062 pers->start_reshape == NULL) { 3063 /* This personality cannot handle reshaping... */ 3064 mddev->pers = NULL; 3065 module_put(pers->owner); 3066 return -EINVAL; 3067 } 3068 3069 mddev->recovery = 0; 3070 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ 3071 mddev->barriers_work = 1; 3072 mddev->ok_start_degraded = start_dirty_degraded; 3073 3074 if (start_readonly) 3075 mddev->ro = 2; /* read-only, but switch on first write */ 3076 3077 err = mddev->pers->run(mddev); 3078 if (!err && mddev->pers->sync_request) { 3079 err = bitmap_create(mddev); 3080 if (err) { 3081 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 3082 mdname(mddev), err); 3083 mddev->pers->stop(mddev); 3084 } 3085 } 3086 if (err) { 3087 printk(KERN_ERR "md: pers->run() failed ...\n"); 3088 module_put(mddev->pers->owner); 3089 mddev->pers = NULL; 3090 bitmap_destroy(mddev); 3091 return err; 3092 } 3093 if (mddev->pers->sync_request) 3094 sysfs_create_group(&mddev->kobj, &md_redundancy_group); 3095 else if (mddev->ro == 2) /* auto-readonly not meaningful */ 3096 mddev->ro = 0; 3097 3098 atomic_set(&mddev->writes_pending,0); 3099 mddev->safemode = 0; 3100 mddev->safemode_timer.function = md_safemode_timeout; 3101 mddev->safemode_timer.data = (unsigned long) mddev; 3102 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 3103 mddev->in_sync = 1; 3104 3105 ITERATE_RDEV(mddev,rdev,tmp) 3106 if (rdev->raid_disk >= 0) { 3107 char nm[20]; 3108 sprintf(nm, "rd%d", rdev->raid_disk); 3109 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); 3110 } 3111 3112 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3113 3114 if (mddev->sb_dirty) 3115 md_update_sb(mddev); 3116 3117 set_capacity(disk, mddev->array_size<<1); 3118 3119 /* If we call blk_queue_make_request here, it will 3120 * re-initialise max_sectors etc which may have been 3121 * refined inside -> run. So just set the bits we need to set. 3122 * Most initialisation happended when we called 3123 * blk_queue_make_request(..., md_fail_request) 3124 * earlier. 3125 */ 3126 mddev->queue->queuedata = mddev; 3127 mddev->queue->make_request_fn = mddev->pers->make_request; 3128 3129 /* If there is a partially-recovered drive we need to 3130 * start recovery here. 
If we leave it to md_check_recovery, 3131 * it will remove the drives and not do the right thing 3132 */ 3133 if (mddev->degraded && !mddev->sync_thread) { 3134 struct list_head *rtmp; 3135 int spares = 0; 3136 ITERATE_RDEV(mddev,rdev,rtmp) 3137 if (rdev->raid_disk >= 0 && 3138 !test_bit(In_sync, &rdev->flags) && 3139 !test_bit(Faulty, &rdev->flags)) 3140 /* complete an interrupted recovery */ 3141 spares++; 3142 if (spares && mddev->pers->sync_request) { 3143 mddev->recovery = 0; 3144 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 3145 mddev->sync_thread = md_register_thread(md_do_sync, 3146 mddev, 3147 "%s_resync"); 3148 if (!mddev->sync_thread) { 3149 printk(KERN_ERR "%s: could not start resync" 3150 " thread...\n", 3151 mdname(mddev)); 3152 /* leave the spares where they are, it shouldn't hurt */ 3153 mddev->recovery = 0; 3154 } 3155 } 3156 } 3157 md_wakeup_thread(mddev->thread); 3158 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 3159 3160 mddev->changed = 1; 3161 md_new_event(mddev); 3162 return 0; 3163 } 3164 3165 static int restart_array(mddev_t *mddev) 3166 { 3167 struct gendisk *disk = mddev->gendisk; 3168 int err; 3169 3170 /* 3171 * Complain if it has no devices 3172 */ 3173 err = -ENXIO; 3174 if (list_empty(&mddev->disks)) 3175 goto out; 3176 3177 if (mddev->pers) { 3178 err = -EBUSY; 3179 if (!mddev->ro) 3180 goto out; 3181 3182 mddev->safemode = 0; 3183 mddev->ro = 0; 3184 set_disk_ro(disk, 0); 3185 3186 printk(KERN_INFO "md: %s switched to read-write mode.\n", 3187 mdname(mddev)); 3188 /* 3189 * Kick recovery or resync if necessary 3190 */ 3191 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3192 md_wakeup_thread(mddev->thread); 3193 md_wakeup_thread(mddev->sync_thread); 3194 err = 0; 3195 } else 3196 err = -EINVAL; 3197 3198 out: 3199 return err; 3200 } 3201 3202 /* similar to deny_write_access, but accounts for our holding a reference 3203 * to the file ourselves */ 3204 static int deny_bitmap_write_access(struct file * file) 3205 { 3206 struct inode *inode = file->f_mapping->host; 3207 3208 spin_lock(&inode->i_lock); 3209 if (atomic_read(&inode->i_writecount) > 1) { 3210 spin_unlock(&inode->i_lock); 3211 return -ETXTBSY; 3212 } 3213 atomic_set(&inode->i_writecount, -1); 3214 spin_unlock(&inode->i_lock); 3215 3216 return 0; 3217 } 3218 3219 static void restore_bitmap_write_access(struct file *file) 3220 { 3221 struct inode *inode = file->f_mapping->host; 3222 3223 spin_lock(&inode->i_lock); 3224 atomic_set(&inode->i_writecount, 1); 3225 spin_unlock(&inode->i_lock); 3226 } 3227 3228 /* mode: 3229 * 0 - completely stop and dis-assemble array 3230 * 1 - switch to readonly 3231 * 2 - stop but do not disassemble array 3232 */ 3233 static int do_md_stop(mddev_t * mddev, int mode) 3234 { 3235 int err = 0; 3236 struct gendisk *disk = mddev->gendisk; 3237 3238 if (mddev->pers) { 3239 if (atomic_read(&mddev->active)>2) { 3240 printk("md: %s still in use.\n",mdname(mddev)); 3241 return -EBUSY; 3242 } 3243 3244 if (mddev->sync_thread) { 3245 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3246 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3247 md_unregister_thread(mddev->sync_thread); 3248 mddev->sync_thread = NULL; 3249 } 3250 3251 del_timer_sync(&mddev->safemode_timer); 3252 3253 invalidate_partition(disk, 0); 3254 3255 switch(mode) { 3256 case 1: /* readonly */ 3257 err = -ENXIO; 3258 if (mddev->ro==1) 3259 goto out; 3260 mddev->ro = 1; 3261 break; 3262 case 0: /* disassemble */ 3263 case 2: /* stop */ 3264 bitmap_flush(mddev); 3265 md_super_wait(mddev); 
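			/* bitmap and superblock writes have completed; fail any
			 * new requests, stop the personality and drop its
			 * module reference
			 */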
3266 if (mddev->ro) 3267 set_disk_ro(disk, 0); 3268 blk_queue_make_request(mddev->queue, md_fail_request); 3269 mddev->pers->stop(mddev); 3270 if (mddev->pers->sync_request) 3271 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 3272 3273 module_put(mddev->pers->owner); 3274 mddev->pers = NULL; 3275 if (mddev->ro) 3276 mddev->ro = 0; 3277 } 3278 if (!mddev->in_sync || mddev->sb_dirty) { 3279 /* mark array as shutdown cleanly */ 3280 mddev->in_sync = 1; 3281 md_update_sb(mddev); 3282 } 3283 if (mode == 1) 3284 set_disk_ro(disk, 1); 3285 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3286 } 3287 3288 /* 3289 * Free resources if final stop 3290 */ 3291 if (mode == 0) { 3292 mdk_rdev_t *rdev; 3293 struct list_head *tmp; 3294 struct gendisk *disk; 3295 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 3296 3297 bitmap_destroy(mddev); 3298 if (mddev->bitmap_file) { 3299 restore_bitmap_write_access(mddev->bitmap_file); 3300 fput(mddev->bitmap_file); 3301 mddev->bitmap_file = NULL; 3302 } 3303 mddev->bitmap_offset = 0; 3304 3305 ITERATE_RDEV(mddev,rdev,tmp) 3306 if (rdev->raid_disk >= 0) { 3307 char nm[20]; 3308 sprintf(nm, "rd%d", rdev->raid_disk); 3309 sysfs_remove_link(&mddev->kobj, nm); 3310 } 3311 3312 export_array(mddev); 3313 3314 mddev->array_size = 0; 3315 mddev->size = 0; 3316 mddev->raid_disks = 0; 3317 mddev->recovery_cp = 0; 3318 3319 disk = mddev->gendisk; 3320 if (disk) 3321 set_capacity(disk, 0); 3322 mddev->changed = 1; 3323 } else if (mddev->pers) 3324 printk(KERN_INFO "md: %s switched to read-only mode.\n", 3325 mdname(mddev)); 3326 err = 0; 3327 md_new_event(mddev); 3328 out: 3329 return err; 3330 } 3331 3332 static void autorun_array(mddev_t *mddev) 3333 { 3334 mdk_rdev_t *rdev; 3335 struct list_head *tmp; 3336 int err; 3337 3338 if (list_empty(&mddev->disks)) 3339 return; 3340 3341 printk(KERN_INFO "md: running: "); 3342 3343 ITERATE_RDEV(mddev,rdev,tmp) { 3344 char b[BDEVNAME_SIZE]; 3345 printk("<%s>", bdevname(rdev->bdev,b)); 3346 } 3347 printk("\n"); 3348 3349 err = do_md_run (mddev); 3350 if (err) { 3351 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 3352 do_md_stop (mddev, 0); 3353 } 3354 } 3355 3356 /* 3357 * lets try to run arrays based on all disks that have arrived 3358 * until now. (those are in pending_raid_disks) 3359 * 3360 * the method: pick the first pending disk, collect all disks with 3361 * the same UUID, remove all from the pending list and put them into 3362 * the 'same_array' list. Then order this list based on superblock 3363 * update time (freshest comes first), kick out 'old' disks and 3364 * compare superblocks. If everything's fine then run it. 3365 * 3366 * If "unit" is allocated, then bump its reference count 3367 */ 3368 static void autorun_devices(int part) 3369 { 3370 struct list_head *tmp; 3371 mdk_rdev_t *rdev0, *rdev; 3372 mddev_t *mddev; 3373 char b[BDEVNAME_SIZE]; 3374 3375 printk(KERN_INFO "md: autorun ...\n"); 3376 while (!list_empty(&pending_raid_disks)) { 3377 dev_t dev; 3378 LIST_HEAD(candidates); 3379 rdev0 = list_entry(pending_raid_disks.next, 3380 mdk_rdev_t, same_set); 3381 3382 printk(KERN_INFO "md: considering %s ...\n", 3383 bdevname(rdev0->bdev,b)); 3384 INIT_LIST_HEAD(&candidates); 3385 ITERATE_RDEV_PENDING(rdev,tmp) 3386 if (super_90_load(rdev, rdev0, 0) >= 0) { 3387 printk(KERN_INFO "md: adding %s ...\n", 3388 bdevname(rdev->bdev,b)); 3389 list_move(&rdev->same_set, &candidates); 3390 } 3391 /* 3392 * now we have a set of devices, with all of them having 3393 * mostly sane superblocks. 
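 * (each rdev whose 0.90 superblock matched rdev0 has been moved from
 * the pending list onto 'candidates' above)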
It's time to allocate the 3394 * mddev. 3395 */ 3396 if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) { 3397 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 3398 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 3399 break; 3400 } 3401 if (part) 3402 dev = MKDEV(mdp_major, 3403 rdev0->preferred_minor << MdpMinorShift); 3404 else 3405 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 3406 3407 md_probe(dev, NULL, NULL); 3408 mddev = mddev_find(dev); 3409 if (!mddev) { 3410 printk(KERN_ERR 3411 "md: cannot allocate memory for md drive.\n"); 3412 break; 3413 } 3414 if (mddev_lock(mddev)) 3415 printk(KERN_WARNING "md: %s locked, cannot run\n", 3416 mdname(mddev)); 3417 else if (mddev->raid_disks || mddev->major_version 3418 || !list_empty(&mddev->disks)) { 3419 printk(KERN_WARNING 3420 "md: %s already running, cannot run %s\n", 3421 mdname(mddev), bdevname(rdev0->bdev,b)); 3422 mddev_unlock(mddev); 3423 } else { 3424 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 3425 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { 3426 list_del_init(&rdev->same_set); 3427 if (bind_rdev_to_array(rdev, mddev)) 3428 export_rdev(rdev); 3429 } 3430 autorun_array(mddev); 3431 mddev_unlock(mddev); 3432 } 3433 /* on success, candidates will be empty, on error 3434 * it won't... 3435 */ 3436 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) 3437 export_rdev(rdev); 3438 mddev_put(mddev); 3439 } 3440 printk(KERN_INFO "md: ... autorun DONE.\n"); 3441 } 3442 3443 /* 3444 * import RAID devices based on one partition 3445 * if possible, the array gets run as well. 3446 */ 3447 3448 static int autostart_array(dev_t startdev) 3449 { 3450 char b[BDEVNAME_SIZE]; 3451 int err = -EINVAL, i; 3452 mdp_super_t *sb = NULL; 3453 mdk_rdev_t *start_rdev = NULL, *rdev; 3454 3455 start_rdev = md_import_device(startdev, 0, 0); 3456 if (IS_ERR(start_rdev)) 3457 return err; 3458 3459 3460 /* NOTE: this can only work for 0.90.0 superblocks */ 3461 sb = (mdp_super_t*)page_address(start_rdev->sb_page); 3462 if (sb->major_version != 0 || 3463 sb->minor_version != 90 ) { 3464 printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n"); 3465 export_rdev(start_rdev); 3466 return err; 3467 } 3468 3469 if (test_bit(Faulty, &start_rdev->flags)) { 3470 printk(KERN_WARNING 3471 "md: can not autostart based on faulty %s!\n", 3472 bdevname(start_rdev->bdev,b)); 3473 export_rdev(start_rdev); 3474 return err; 3475 } 3476 list_add(&start_rdev->same_set, &pending_raid_disks); 3477 3478 for (i = 0; i < MD_SB_DISKS; i++) { 3479 mdp_disk_t *desc = sb->disks + i; 3480 dev_t dev = MKDEV(desc->major, desc->minor); 3481 3482 if (!dev) 3483 continue; 3484 if (dev == startdev) 3485 continue; 3486 if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor) 3487 continue; 3488 rdev = md_import_device(dev, 0, 0); 3489 if (IS_ERR(rdev)) 3490 continue; 3491 3492 list_add(&rdev->same_set, &pending_raid_disks); 3493 } 3494 3495 /* 3496 * possibly return codes 3497 */ 3498 autorun_devices(0); 3499 return 0; 3500 3501 } 3502 3503 3504 static int get_version(void __user * arg) 3505 { 3506 mdu_version_t ver; 3507 3508 ver.major = MD_MAJOR_VERSION; 3509 ver.minor = MD_MINOR_VERSION; 3510 ver.patchlevel = MD_PATCHLEVEL_VERSION; 3511 3512 if (copy_to_user(arg, &ver, sizeof(ver))) 3513 return -EFAULT; 3514 3515 return 0; 3516 } 3517 3518 static int get_array_info(mddev_t * mddev, void __user * arg) 3519 { 3520 mdu_array_info_t info; 3521 int nr,working,active,failed,spare; 3522 mdk_rdev_t *rdev; 3523 struct list_head *tmp; 3524 3525 
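	/* count member devices: Faulty ones are 'failed'; the rest are
	 * 'working', split further into 'active' (In_sync) and 'spare'
	 */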
nr=working=active=failed=spare=0; 3526 ITERATE_RDEV(mddev,rdev,tmp) { 3527 nr++; 3528 if (test_bit(Faulty, &rdev->flags)) 3529 failed++; 3530 else { 3531 working++; 3532 if (test_bit(In_sync, &rdev->flags)) 3533 active++; 3534 else 3535 spare++; 3536 } 3537 } 3538 3539 info.major_version = mddev->major_version; 3540 info.minor_version = mddev->minor_version; 3541 info.patch_version = MD_PATCHLEVEL_VERSION; 3542 info.ctime = mddev->ctime; 3543 info.level = mddev->level; 3544 info.size = mddev->size; 3545 if (info.size != mddev->size) /* overflow */ 3546 info.size = -1; 3547 info.nr_disks = nr; 3548 info.raid_disks = mddev->raid_disks; 3549 info.md_minor = mddev->md_minor; 3550 info.not_persistent= !mddev->persistent; 3551 3552 info.utime = mddev->utime; 3553 info.state = 0; 3554 if (mddev->in_sync) 3555 info.state = (1<<MD_SB_CLEAN); 3556 if (mddev->bitmap && mddev->bitmap_offset) 3557 info.state = (1<<MD_SB_BITMAP_PRESENT); 3558 info.active_disks = active; 3559 info.working_disks = working; 3560 info.failed_disks = failed; 3561 info.spare_disks = spare; 3562 3563 info.layout = mddev->layout; 3564 info.chunk_size = mddev->chunk_size; 3565 3566 if (copy_to_user(arg, &info, sizeof(info))) 3567 return -EFAULT; 3568 3569 return 0; 3570 } 3571 3572 static int get_bitmap_file(mddev_t * mddev, void __user * arg) 3573 { 3574 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 3575 char *ptr, *buf = NULL; 3576 int err = -ENOMEM; 3577 3578 file = kmalloc(sizeof(*file), GFP_KERNEL); 3579 if (!file) 3580 goto out; 3581 3582 /* bitmap disabled, zero the first byte and copy out */ 3583 if (!mddev->bitmap || !mddev->bitmap->file) { 3584 file->pathname[0] = '\0'; 3585 goto copy_out; 3586 } 3587 3588 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); 3589 if (!buf) 3590 goto out; 3591 3592 ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname)); 3593 if (!ptr) 3594 goto out; 3595 3596 strcpy(file->pathname, ptr); 3597 3598 copy_out: 3599 err = 0; 3600 if (copy_to_user(arg, file, sizeof(*file))) 3601 err = -EFAULT; 3602 out: 3603 kfree(buf); 3604 kfree(file); 3605 return err; 3606 } 3607 3608 static int get_disk_info(mddev_t * mddev, void __user * arg) 3609 { 3610 mdu_disk_info_t info; 3611 unsigned int nr; 3612 mdk_rdev_t *rdev; 3613 3614 if (copy_from_user(&info, arg, sizeof(info))) 3615 return -EFAULT; 3616 3617 nr = info.number; 3618 3619 rdev = find_rdev_nr(mddev, nr); 3620 if (rdev) { 3621 info.major = MAJOR(rdev->bdev->bd_dev); 3622 info.minor = MINOR(rdev->bdev->bd_dev); 3623 info.raid_disk = rdev->raid_disk; 3624 info.state = 0; 3625 if (test_bit(Faulty, &rdev->flags)) 3626 info.state |= (1<<MD_DISK_FAULTY); 3627 else if (test_bit(In_sync, &rdev->flags)) { 3628 info.state |= (1<<MD_DISK_ACTIVE); 3629 info.state |= (1<<MD_DISK_SYNC); 3630 } 3631 if (test_bit(WriteMostly, &rdev->flags)) 3632 info.state |= (1<<MD_DISK_WRITEMOSTLY); 3633 } else { 3634 info.major = info.minor = 0; 3635 info.raid_disk = -1; 3636 info.state = (1<<MD_DISK_REMOVED); 3637 } 3638 3639 if (copy_to_user(arg, &info, sizeof(info))) 3640 return -EFAULT; 3641 3642 return 0; 3643 } 3644 3645 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) 3646 { 3647 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 3648 mdk_rdev_t *rdev; 3649 dev_t dev = MKDEV(info->major,info->minor); 3650 3651 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 3652 return -EOVERFLOW; 3653 3654 if (!mddev->raid_disks) { 3655 int err; 3656 /* expecting a device which has a superblock */ 3657 rdev = md_import_device(dev, 
mddev->major_version, mddev->minor_version); 3658 if (IS_ERR(rdev)) { 3659 printk(KERN_WARNING 3660 "md: md_import_device returned %ld\n", 3661 PTR_ERR(rdev)); 3662 return PTR_ERR(rdev); 3663 } 3664 if (!list_empty(&mddev->disks)) { 3665 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 3666 mdk_rdev_t, same_set); 3667 int err = super_types[mddev->major_version] 3668 .load_super(rdev, rdev0, mddev->minor_version); 3669 if (err < 0) { 3670 printk(KERN_WARNING 3671 "md: %s has different UUID to %s\n", 3672 bdevname(rdev->bdev,b), 3673 bdevname(rdev0->bdev,b2)); 3674 export_rdev(rdev); 3675 return -EINVAL; 3676 } 3677 } 3678 err = bind_rdev_to_array(rdev, mddev); 3679 if (err) 3680 export_rdev(rdev); 3681 return err; 3682 } 3683 3684 /* 3685 * add_new_disk can be used once the array is assembled 3686 * to add "hot spares". They must already have a superblock 3687 * written 3688 */ 3689 if (mddev->pers) { 3690 int err; 3691 if (!mddev->pers->hot_add_disk) { 3692 printk(KERN_WARNING 3693 "%s: personality does not support diskops!\n", 3694 mdname(mddev)); 3695 return -EINVAL; 3696 } 3697 if (mddev->persistent) 3698 rdev = md_import_device(dev, mddev->major_version, 3699 mddev->minor_version); 3700 else 3701 rdev = md_import_device(dev, -1, -1); 3702 if (IS_ERR(rdev)) { 3703 printk(KERN_WARNING 3704 "md: md_import_device returned %ld\n", 3705 PTR_ERR(rdev)); 3706 return PTR_ERR(rdev); 3707 } 3708 /* set save_raid_disk if appropriate */ 3709 if (!mddev->persistent) { 3710 if (info->state & (1<<MD_DISK_SYNC) && 3711 info->raid_disk < mddev->raid_disks) 3712 rdev->raid_disk = info->raid_disk; 3713 else 3714 rdev->raid_disk = -1; 3715 } else 3716 super_types[mddev->major_version]. 3717 validate_super(mddev, rdev); 3718 rdev->saved_raid_disk = rdev->raid_disk; 3719 3720 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 3721 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 3722 set_bit(WriteMostly, &rdev->flags); 3723 3724 rdev->raid_disk = -1; 3725 err = bind_rdev_to_array(rdev, mddev); 3726 if (!err && !mddev->pers->hot_remove_disk) { 3727 /* If there is hot_add_disk but no hot_remove_disk 3728 * then added disks for geometry changes, 3729 * and should be added immediately. 3730 */ 3731 super_types[mddev->major_version]. 
3732 validate_super(mddev, rdev); 3733 err = mddev->pers->hot_add_disk(mddev, rdev); 3734 if (err) 3735 unbind_rdev_from_array(rdev); 3736 } 3737 if (err) 3738 export_rdev(rdev); 3739 3740 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3741 md_wakeup_thread(mddev->thread); 3742 return err; 3743 } 3744 3745 /* otherwise, add_new_disk is only allowed 3746 * for major_version==0 superblocks 3747 */ 3748 if (mddev->major_version != 0) { 3749 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", 3750 mdname(mddev)); 3751 return -EINVAL; 3752 } 3753 3754 if (!(info->state & (1<<MD_DISK_FAULTY))) { 3755 int err; 3756 rdev = md_import_device (dev, -1, 0); 3757 if (IS_ERR(rdev)) { 3758 printk(KERN_WARNING 3759 "md: error, md_import_device() returned %ld\n", 3760 PTR_ERR(rdev)); 3761 return PTR_ERR(rdev); 3762 } 3763 rdev->desc_nr = info->number; 3764 if (info->raid_disk < mddev->raid_disks) 3765 rdev->raid_disk = info->raid_disk; 3766 else 3767 rdev->raid_disk = -1; 3768 3769 rdev->flags = 0; 3770 3771 if (rdev->raid_disk < mddev->raid_disks) 3772 if (info->state & (1<<MD_DISK_SYNC)) 3773 set_bit(In_sync, &rdev->flags); 3774 3775 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 3776 set_bit(WriteMostly, &rdev->flags); 3777 3778 if (!mddev->persistent) { 3779 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 3780 rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 3781 } else 3782 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 3783 rdev->size = calc_dev_size(rdev, mddev->chunk_size); 3784 3785 err = bind_rdev_to_array(rdev, mddev); 3786 if (err) { 3787 export_rdev(rdev); 3788 return err; 3789 } 3790 } 3791 3792 return 0; 3793 } 3794 3795 static int hot_remove_disk(mddev_t * mddev, dev_t dev) 3796 { 3797 char b[BDEVNAME_SIZE]; 3798 mdk_rdev_t *rdev; 3799 3800 if (!mddev->pers) 3801 return -ENODEV; 3802 3803 rdev = find_rdev(mddev, dev); 3804 if (!rdev) 3805 return -ENXIO; 3806 3807 if (rdev->raid_disk >= 0) 3808 goto busy; 3809 3810 kick_rdev_from_array(rdev); 3811 md_update_sb(mddev); 3812 md_new_event(mddev); 3813 3814 return 0; 3815 busy: 3816 printk(KERN_WARNING "md: cannot remove active disk %s from %s ... 
\n", 3817 bdevname(rdev->bdev,b), mdname(mddev)); 3818 return -EBUSY; 3819 } 3820 3821 static int hot_add_disk(mddev_t * mddev, dev_t dev) 3822 { 3823 char b[BDEVNAME_SIZE]; 3824 int err; 3825 unsigned int size; 3826 mdk_rdev_t *rdev; 3827 3828 if (!mddev->pers) 3829 return -ENODEV; 3830 3831 if (mddev->major_version != 0) { 3832 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 3833 " version-0 superblocks.\n", 3834 mdname(mddev)); 3835 return -EINVAL; 3836 } 3837 if (!mddev->pers->hot_add_disk) { 3838 printk(KERN_WARNING 3839 "%s: personality does not support diskops!\n", 3840 mdname(mddev)); 3841 return -EINVAL; 3842 } 3843 3844 rdev = md_import_device (dev, -1, 0); 3845 if (IS_ERR(rdev)) { 3846 printk(KERN_WARNING 3847 "md: error, md_import_device() returned %ld\n", 3848 PTR_ERR(rdev)); 3849 return -EINVAL; 3850 } 3851 3852 if (mddev->persistent) 3853 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 3854 else 3855 rdev->sb_offset = 3856 rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 3857 3858 size = calc_dev_size(rdev, mddev->chunk_size); 3859 rdev->size = size; 3860 3861 if (test_bit(Faulty, &rdev->flags)) { 3862 printk(KERN_WARNING 3863 "md: can not hot-add faulty %s disk to %s!\n", 3864 bdevname(rdev->bdev,b), mdname(mddev)); 3865 err = -EINVAL; 3866 goto abort_export; 3867 } 3868 clear_bit(In_sync, &rdev->flags); 3869 rdev->desc_nr = -1; 3870 err = bind_rdev_to_array(rdev, mddev); 3871 if (err) 3872 goto abort_export; 3873 3874 /* 3875 * The rest should better be atomic, we can have disk failures 3876 * noticed in interrupt contexts ... 3877 */ 3878 3879 if (rdev->desc_nr == mddev->max_disks) { 3880 printk(KERN_WARNING "%s: can not hot-add to full array!\n", 3881 mdname(mddev)); 3882 err = -EBUSY; 3883 goto abort_unbind_export; 3884 } 3885 3886 rdev->raid_disk = -1; 3887 3888 md_update_sb(mddev); 3889 3890 /* 3891 * Kick recovery, maybe this spare has to be added to the 3892 * array immediately. 3893 */ 3894 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3895 md_wakeup_thread(mddev->thread); 3896 md_new_event(mddev); 3897 return 0; 3898 3899 abort_unbind_export: 3900 unbind_rdev_from_array(rdev); 3901 3902 abort_export: 3903 export_rdev(rdev); 3904 return err; 3905 } 3906 3907 static int set_bitmap_file(mddev_t *mddev, int fd) 3908 { 3909 int err; 3910 3911 if (mddev->pers) { 3912 if (!mddev->pers->quiesce) 3913 return -EBUSY; 3914 if (mddev->recovery || mddev->sync_thread) 3915 return -EBUSY; 3916 /* we should be able to change the bitmap.. 
*/ 3917 } 3918 3919 3920 if (fd >= 0) { 3921 if (mddev->bitmap) 3922 return -EEXIST; /* cannot add when bitmap is present */ 3923 mddev->bitmap_file = fget(fd); 3924 3925 if (mddev->bitmap_file == NULL) { 3926 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 3927 mdname(mddev)); 3928 return -EBADF; 3929 } 3930 3931 err = deny_bitmap_write_access(mddev->bitmap_file); 3932 if (err) { 3933 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 3934 mdname(mddev)); 3935 fput(mddev->bitmap_file); 3936 mddev->bitmap_file = NULL; 3937 return err; 3938 } 3939 mddev->bitmap_offset = 0; /* file overrides offset */ 3940 } else if (mddev->bitmap == NULL) 3941 return -ENOENT; /* cannot remove what isn't there */ 3942 err = 0; 3943 if (mddev->pers) { 3944 mddev->pers->quiesce(mddev, 1); 3945 if (fd >= 0) 3946 err = bitmap_create(mddev); 3947 if (fd < 0 || err) { 3948 bitmap_destroy(mddev); 3949 fd = -1; /* make sure to put the file */ 3950 } 3951 mddev->pers->quiesce(mddev, 0); 3952 } 3953 if (fd < 0) { 3954 if (mddev->bitmap_file) { 3955 restore_bitmap_write_access(mddev->bitmap_file); 3956 fput(mddev->bitmap_file); 3957 } 3958 mddev->bitmap_file = NULL; 3959 } 3960 3961 return err; 3962 } 3963 3964 /* 3965 * set_array_info is used two different ways 3966 * The original usage is when creating a new array. 3967 * In this usage, raid_disks is > 0 and it together with 3968 * level, size, not_persistent,layout,chunksize determine the 3969 * shape of the array. 3970 * This will always create an array with a type-0.90.0 superblock. 3971 * The newer usage is when assembling an array. 3972 * In this case raid_disks will be 0, and the major_version field is 3973 * use to determine which style super-blocks are to be found on the devices. 3974 * The minor and patch _version numbers are also kept incase the 3975 * super_block handler wishes to interpret them. 3976 */ 3977 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) 3978 { 3979 3980 if (info->raid_disks == 0) { 3981 /* just setting version number for superblock loading */ 3982 if (info->major_version < 0 || 3983 info->major_version >= sizeof(super_types)/sizeof(super_types[0]) || 3984 super_types[info->major_version].name == NULL) { 3985 /* maybe try to auto-load a module? */ 3986 printk(KERN_INFO 3987 "md: superblock version %d not known\n", 3988 info->major_version); 3989 return -EINVAL; 3990 } 3991 mddev->major_version = info->major_version; 3992 mddev->minor_version = info->minor_version; 3993 mddev->patch_version = info->patch_version; 3994 return 0; 3995 } 3996 mddev->major_version = MD_MAJOR_VERSION; 3997 mddev->minor_version = MD_MINOR_VERSION; 3998 mddev->patch_version = MD_PATCHLEVEL_VERSION; 3999 mddev->ctime = get_seconds(); 4000 4001 mddev->level = info->level; 4002 mddev->clevel[0] = 0; 4003 mddev->size = info->size; 4004 mddev->raid_disks = info->raid_disks; 4005 /* don't set md_minor, it is determined by which /dev/md* was 4006 * openned 4007 */ 4008 if (info->state & (1<<MD_SB_CLEAN)) 4009 mddev->recovery_cp = MaxSector; 4010 else 4011 mddev->recovery_cp = 0; 4012 mddev->persistent = ! 
info->not_persistent; 4013 4014 mddev->layout = info->layout; 4015 mddev->chunk_size = info->chunk_size; 4016 4017 mddev->max_disks = MD_SB_DISKS; 4018 4019 mddev->sb_dirty = 1; 4020 4021 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 4022 mddev->bitmap_offset = 0; 4023 4024 mddev->reshape_position = MaxSector; 4025 4026 /* 4027 * Generate a 128 bit UUID 4028 */ 4029 get_random_bytes(mddev->uuid, 16); 4030 4031 mddev->new_level = mddev->level; 4032 mddev->new_chunk = mddev->chunk_size; 4033 mddev->new_layout = mddev->layout; 4034 mddev->delta_disks = 0; 4035 4036 return 0; 4037 } 4038 4039 static int update_size(mddev_t *mddev, unsigned long size) 4040 { 4041 mdk_rdev_t * rdev; 4042 int rv; 4043 struct list_head *tmp; 4044 int fit = (size == 0); 4045 4046 if (mddev->pers->resize == NULL) 4047 return -EINVAL; 4048 /* The "size" is the amount of each device that is used. 4049 * This can only make sense for arrays with redundancy. 4050 * linear and raid0 always use whatever space is available 4051 * We can only consider changing the size if no resync 4052 * or reconstruction is happening, and if the new size 4053 * is acceptable. It must fit before the sb_offset or, 4054 * if that is <data_offset, it must fit before the 4055 * size of each device. 4056 * If size is zero, we find the largest size that fits. 4057 */ 4058 if (mddev->sync_thread) 4059 return -EBUSY; 4060 ITERATE_RDEV(mddev,rdev,tmp) { 4061 sector_t avail; 4062 if (rdev->sb_offset > rdev->data_offset) 4063 avail = (rdev->sb_offset*2) - rdev->data_offset; 4064 else 4065 avail = get_capacity(rdev->bdev->bd_disk) 4066 - rdev->data_offset; 4067 if (fit && (size == 0 || size > avail/2)) 4068 size = avail/2; 4069 if (avail < ((sector_t)size << 1)) 4070 return -ENOSPC; 4071 } 4072 rv = mddev->pers->resize(mddev, (sector_t)size *2); 4073 if (!rv) { 4074 struct block_device *bdev; 4075 4076 bdev = bdget_disk(mddev->gendisk, 0); 4077 if (bdev) { 4078 mutex_lock(&bdev->bd_inode->i_mutex); 4079 i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10); 4080 mutex_unlock(&bdev->bd_inode->i_mutex); 4081 bdput(bdev); 4082 } 4083 } 4084 return rv; 4085 } 4086 4087 static int update_raid_disks(mddev_t *mddev, int raid_disks) 4088 { 4089 int rv; 4090 /* change the number of raid disks */ 4091 if (mddev->pers->check_reshape == NULL) 4092 return -EINVAL; 4093 if (raid_disks <= 0 || 4094 raid_disks >= mddev->max_disks) 4095 return -EINVAL; 4096 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 4097 return -EBUSY; 4098 mddev->delta_disks = raid_disks - mddev->raid_disks; 4099 4100 rv = mddev->pers->check_reshape(mddev); 4101 return rv; 4102 } 4103 4104 4105 /* 4106 * update_array_info is used to change the configuration of an 4107 * on-line array. 4108 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 4109 * fields in the info are checked against the array. 4110 * Any differences that cannot be handled will cause an error. 4111 * Normally, only one change can be managed at a time. 
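 * (requesting more than one of size, raid_disks, layout or a bitmap
 * add/remove change in a single call is rejected with -EINVAL)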
4112 */ 4113 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) 4114 { 4115 int rv = 0; 4116 int cnt = 0; 4117 int state = 0; 4118 4119 /* calculate expected state,ignoring low bits */ 4120 if (mddev->bitmap && mddev->bitmap_offset) 4121 state |= (1 << MD_SB_BITMAP_PRESENT); 4122 4123 if (mddev->major_version != info->major_version || 4124 mddev->minor_version != info->minor_version || 4125 /* mddev->patch_version != info->patch_version || */ 4126 mddev->ctime != info->ctime || 4127 mddev->level != info->level || 4128 /* mddev->layout != info->layout || */ 4129 !mddev->persistent != info->not_persistent|| 4130 mddev->chunk_size != info->chunk_size || 4131 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 4132 ((state^info->state) & 0xfffffe00) 4133 ) 4134 return -EINVAL; 4135 /* Check there is only one change */ 4136 if (info->size >= 0 && mddev->size != info->size) cnt++; 4137 if (mddev->raid_disks != info->raid_disks) cnt++; 4138 if (mddev->layout != info->layout) cnt++; 4139 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++; 4140 if (cnt == 0) return 0; 4141 if (cnt > 1) return -EINVAL; 4142 4143 if (mddev->layout != info->layout) { 4144 /* Change layout 4145 * we don't need to do anything at the md level, the 4146 * personality will take care of it all. 4147 */ 4148 if (mddev->pers->reconfig == NULL) 4149 return -EINVAL; 4150 else 4151 return mddev->pers->reconfig(mddev, info->layout, -1); 4152 } 4153 if (info->size >= 0 && mddev->size != info->size) 4154 rv = update_size(mddev, info->size); 4155 4156 if (mddev->raid_disks != info->raid_disks) 4157 rv = update_raid_disks(mddev, info->raid_disks); 4158 4159 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 4160 if (mddev->pers->quiesce == NULL) 4161 return -EINVAL; 4162 if (mddev->recovery || mddev->sync_thread) 4163 return -EBUSY; 4164 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 4165 /* add the bitmap */ 4166 if (mddev->bitmap) 4167 return -EEXIST; 4168 if (mddev->default_bitmap_offset == 0) 4169 return -EINVAL; 4170 mddev->bitmap_offset = mddev->default_bitmap_offset; 4171 mddev->pers->quiesce(mddev, 1); 4172 rv = bitmap_create(mddev); 4173 if (rv) 4174 bitmap_destroy(mddev); 4175 mddev->pers->quiesce(mddev, 0); 4176 } else { 4177 /* remove the bitmap */ 4178 if (!mddev->bitmap) 4179 return -ENOENT; 4180 if (mddev->bitmap->file) 4181 return -EINVAL; 4182 mddev->pers->quiesce(mddev, 1); 4183 bitmap_destroy(mddev); 4184 mddev->pers->quiesce(mddev, 0); 4185 mddev->bitmap_offset = 0; 4186 } 4187 } 4188 md_update_sb(mddev); 4189 return rv; 4190 } 4191 4192 static int set_disk_faulty(mddev_t *mddev, dev_t dev) 4193 { 4194 mdk_rdev_t *rdev; 4195 4196 if (mddev->pers == NULL) 4197 return -ENODEV; 4198 4199 rdev = find_rdev(mddev, dev); 4200 if (!rdev) 4201 return -ENODEV; 4202 4203 md_error(mddev, rdev); 4204 return 0; 4205 } 4206 4207 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 4208 { 4209 mddev_t *mddev = bdev->bd_disk->private_data; 4210 4211 geo->heads = 2; 4212 geo->sectors = 4; 4213 geo->cylinders = get_capacity(mddev->gendisk) / 8; 4214 return 0; 4215 } 4216 4217 static int md_ioctl(struct inode *inode, struct file *file, 4218 unsigned int cmd, unsigned long arg) 4219 { 4220 int err = 0; 4221 void __user *argp = (void __user *)arg; 4222 mddev_t *mddev = NULL; 4223 4224 if (!capable(CAP_SYS_ADMIN)) 4225 return -EACCES; 4226 4227 /* 4228 * Commands dealing with the RAID driver but not any 4229 * particular array: 4230 */ 4231 switch (cmd) 4232 { 
4233 case RAID_VERSION: 4234 err = get_version(argp); 4235 goto done; 4236 4237 case PRINT_RAID_DEBUG: 4238 err = 0; 4239 md_print_devices(); 4240 goto done; 4241 4242 #ifndef MODULE 4243 case RAID_AUTORUN: 4244 err = 0; 4245 autostart_arrays(arg); 4246 goto done; 4247 #endif 4248 default:; 4249 } 4250 4251 /* 4252 * Commands creating/starting a new array: 4253 */ 4254 4255 mddev = inode->i_bdev->bd_disk->private_data; 4256 4257 if (!mddev) { 4258 BUG(); 4259 goto abort; 4260 } 4261 4262 4263 if (cmd == START_ARRAY) { 4264 /* START_ARRAY doesn't need to lock the array as autostart_array 4265 * does the locking, and it could even be a different array 4266 */ 4267 static int cnt = 3; 4268 if (cnt > 0 ) { 4269 printk(KERN_WARNING 4270 "md: %s(pid %d) used deprecated START_ARRAY ioctl. " 4271 "This will not be supported beyond July 2006\n", 4272 current->comm, current->pid); 4273 cnt--; 4274 } 4275 err = autostart_array(new_decode_dev(arg)); 4276 if (err) { 4277 printk(KERN_WARNING "md: autostart failed!\n"); 4278 goto abort; 4279 } 4280 goto done; 4281 } 4282 4283 err = mddev_lock(mddev); 4284 if (err) { 4285 printk(KERN_INFO 4286 "md: ioctl lock interrupted, reason %d, cmd %d\n", 4287 err, cmd); 4288 goto abort; 4289 } 4290 4291 switch (cmd) 4292 { 4293 case SET_ARRAY_INFO: 4294 { 4295 mdu_array_info_t info; 4296 if (!arg) 4297 memset(&info, 0, sizeof(info)); 4298 else if (copy_from_user(&info, argp, sizeof(info))) { 4299 err = -EFAULT; 4300 goto abort_unlock; 4301 } 4302 if (mddev->pers) { 4303 err = update_array_info(mddev, &info); 4304 if (err) { 4305 printk(KERN_WARNING "md: couldn't update" 4306 " array info. %d\n", err); 4307 goto abort_unlock; 4308 } 4309 goto done_unlock; 4310 } 4311 if (!list_empty(&mddev->disks)) { 4312 printk(KERN_WARNING 4313 "md: array %s already has disks!\n", 4314 mdname(mddev)); 4315 err = -EBUSY; 4316 goto abort_unlock; 4317 } 4318 if (mddev->raid_disks) { 4319 printk(KERN_WARNING 4320 "md: array %s already initialised!\n", 4321 mdname(mddev)); 4322 err = -EBUSY; 4323 goto abort_unlock; 4324 } 4325 err = set_array_info(mddev, &info); 4326 if (err) { 4327 printk(KERN_WARNING "md: couldn't set" 4328 " array info. %d\n", err); 4329 goto abort_unlock; 4330 } 4331 } 4332 goto done_unlock; 4333 4334 default:; 4335 } 4336 4337 /* 4338 * Commands querying/configuring an existing array: 4339 */ 4340 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 4341 * RUN_ARRAY, and SET_BITMAP_FILE are allowed */ 4342 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 4343 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) { 4344 err = -ENODEV; 4345 goto abort_unlock; 4346 } 4347 4348 /* 4349 * Commands even a read-only array can execute: 4350 */ 4351 switch (cmd) 4352 { 4353 case GET_ARRAY_INFO: 4354 err = get_array_info(mddev, argp); 4355 goto done_unlock; 4356 4357 case GET_BITMAP_FILE: 4358 err = get_bitmap_file(mddev, argp); 4359 goto done_unlock; 4360 4361 case GET_DISK_INFO: 4362 err = get_disk_info(mddev, argp); 4363 goto done_unlock; 4364 4365 case RESTART_ARRAY_RW: 4366 err = restart_array(mddev); 4367 goto done_unlock; 4368 4369 case STOP_ARRAY: 4370 err = do_md_stop (mddev, 0); 4371 goto done_unlock; 4372 4373 case STOP_ARRAY_RO: 4374 err = do_md_stop (mddev, 1); 4375 goto done_unlock; 4376 4377 /* 4378 * We have a problem here : there is no easy way to give a CHS 4379 * virtual geometry. We currently pretend that we have a 2 heads 4380 * 4 sectors (with a BIG number of cylinders...). This drives 4381 * dosfs just mad... 
;-) 4382 */ 4383 } 4384 4385 /* 4386 * The remaining ioctls are changing the state of the 4387 * superblock, so we do not allow them on read-only arrays. 4388 * However non-MD ioctls (e.g. get-size) will still come through 4389 * here and hit the 'default' below, so only disallow 4390 * 'md' ioctls, and switch to rw mode if started auto-readonly. 4391 */ 4392 if (_IOC_TYPE(cmd) == MD_MAJOR && 4393 mddev->ro && mddev->pers) { 4394 if (mddev->ro == 2) { 4395 mddev->ro = 0; 4396 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4397 md_wakeup_thread(mddev->thread); 4398 4399 } else { 4400 err = -EROFS; 4401 goto abort_unlock; 4402 } 4403 } 4404 4405 switch (cmd) 4406 { 4407 case ADD_NEW_DISK: 4408 { 4409 mdu_disk_info_t info; 4410 if (copy_from_user(&info, argp, sizeof(info))) 4411 err = -EFAULT; 4412 else 4413 err = add_new_disk(mddev, &info); 4414 goto done_unlock; 4415 } 4416 4417 case HOT_REMOVE_DISK: 4418 err = hot_remove_disk(mddev, new_decode_dev(arg)); 4419 goto done_unlock; 4420 4421 case HOT_ADD_DISK: 4422 err = hot_add_disk(mddev, new_decode_dev(arg)); 4423 goto done_unlock; 4424 4425 case SET_DISK_FAULTY: 4426 err = set_disk_faulty(mddev, new_decode_dev(arg)); 4427 goto done_unlock; 4428 4429 case RUN_ARRAY: 4430 err = do_md_run (mddev); 4431 goto done_unlock; 4432 4433 case SET_BITMAP_FILE: 4434 err = set_bitmap_file(mddev, (int)arg); 4435 goto done_unlock; 4436 4437 default: 4438 err = -EINVAL; 4439 goto abort_unlock; 4440 } 4441 4442 done_unlock: 4443 abort_unlock: 4444 mddev_unlock(mddev); 4445 4446 return err; 4447 done: 4448 if (err) 4449 MD_BUG(); 4450 abort: 4451 return err; 4452 } 4453 4454 static int md_open(struct inode *inode, struct file *file) 4455 { 4456 /* 4457 * Succeed if we can lock the mddev, which confirms that 4458 * it isn't being stopped right now. 4459 */ 4460 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 4461 int err; 4462 4463 if ((err = mddev_lock(mddev))) 4464 goto out; 4465 4466 err = 0; 4467 mddev_get(mddev); 4468 mddev_unlock(mddev); 4469 4470 check_disk_change(inode->i_bdev); 4471 out: 4472 return err; 4473 } 4474 4475 static int md_release(struct inode *inode, struct file * file) 4476 { 4477 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 4478 4479 if (!mddev) 4480 BUG(); 4481 mddev_put(mddev); 4482 4483 return 0; 4484 } 4485 4486 static int md_media_changed(struct gendisk *disk) 4487 { 4488 mddev_t *mddev = disk->private_data; 4489 4490 return mddev->changed; 4491 } 4492 4493 static int md_revalidate(struct gendisk *disk) 4494 { 4495 mddev_t *mddev = disk->private_data; 4496 4497 mddev->changed = 0; 4498 return 0; 4499 } 4500 static struct block_device_operations md_fops = 4501 { 4502 .owner = THIS_MODULE, 4503 .open = md_open, 4504 .release = md_release, 4505 .ioctl = md_ioctl, 4506 .getgeo = md_getgeo, 4507 .media_changed = md_media_changed, 4508 .revalidate_disk= md_revalidate, 4509 }; 4510 4511 static int md_thread(void * arg) 4512 { 4513 mdk_thread_t *thread = arg; 4514 4515 /* 4516 * md_thread is a 'system-thread', it's priority should be very 4517 * high. We avoid resource deadlocks individually in each 4518 * raid personality. (RAID5 does preallocation) We also use RR and 4519 * the very same RT priority as kswapd, thus we will never get 4520 * into a priority inversion deadlock. 4521 * 4522 * we definitely have to have equal or higher priority than 4523 * bdflush, otherwise bdflush will deadlock if there are too 4524 * many dirty RAID5 blocks. 
4525 */ 4526 4527 allow_signal(SIGKILL); 4528 while (!kthread_should_stop()) { 4529 4530 /* We need to wait INTERRUPTIBLE so that 4531 * we don't add to the load-average. 4532 * That means we need to be sure no signals are 4533 * pending 4534 */ 4535 if (signal_pending(current)) 4536 flush_signals(current); 4537 4538 wait_event_interruptible_timeout 4539 (thread->wqueue, 4540 test_bit(THREAD_WAKEUP, &thread->flags) 4541 || kthread_should_stop(), 4542 thread->timeout); 4543 try_to_freeze(); 4544 4545 clear_bit(THREAD_WAKEUP, &thread->flags); 4546 4547 thread->run(thread->mddev); 4548 } 4549 4550 return 0; 4551 } 4552 4553 void md_wakeup_thread(mdk_thread_t *thread) 4554 { 4555 if (thread) { 4556 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm); 4557 set_bit(THREAD_WAKEUP, &thread->flags); 4558 wake_up(&thread->wqueue); 4559 } 4560 } 4561 4562 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, 4563 const char *name) 4564 { 4565 mdk_thread_t *thread; 4566 4567 thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL); 4568 if (!thread) 4569 return NULL; 4570 4571 init_waitqueue_head(&thread->wqueue); 4572 4573 thread->run = run; 4574 thread->mddev = mddev; 4575 thread->timeout = MAX_SCHEDULE_TIMEOUT; 4576 thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev)); 4577 if (IS_ERR(thread->tsk)) { 4578 kfree(thread); 4579 return NULL; 4580 } 4581 return thread; 4582 } 4583 4584 void md_unregister_thread(mdk_thread_t *thread) 4585 { 4586 dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); 4587 4588 kthread_stop(thread->tsk); 4589 kfree(thread); 4590 } 4591 4592 void md_error(mddev_t *mddev, mdk_rdev_t *rdev) 4593 { 4594 if (!mddev) { 4595 MD_BUG(); 4596 return; 4597 } 4598 4599 if (!rdev || test_bit(Faulty, &rdev->flags)) 4600 return; 4601 /* 4602 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 4603 mdname(mddev), 4604 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), 4605 __builtin_return_address(0),__builtin_return_address(1), 4606 __builtin_return_address(2),__builtin_return_address(3)); 4607 */ 4608 if (!mddev->pers) 4609 return; 4610 if (!mddev->pers->error_handler) 4611 return; 4612 mddev->pers->error_handler(mddev,rdev); 4613 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4614 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4615 md_wakeup_thread(mddev->thread); 4616 md_new_event_inintr(mddev); 4617 } 4618 4619 /* seq_file implementation /proc/mdstat */ 4620 4621 static void status_unused(struct seq_file *seq) 4622 { 4623 int i = 0; 4624 mdk_rdev_t *rdev; 4625 struct list_head *tmp; 4626 4627 seq_printf(seq, "unused devices: "); 4628 4629 ITERATE_RDEV_PENDING(rdev,tmp) { 4630 char b[BDEVNAME_SIZE]; 4631 i++; 4632 seq_printf(seq, "%s ", 4633 bdevname(rdev->bdev,b)); 4634 } 4635 if (!i) 4636 seq_printf(seq, "<none>"); 4637 4638 seq_printf(seq, "\n"); 4639 } 4640 4641 4642 static void status_resync(struct seq_file *seq, mddev_t * mddev) 4643 { 4644 sector_t max_blocks, resync, res; 4645 unsigned long dt, db, rt; 4646 int scale; 4647 unsigned int per_milli; 4648 4649 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; 4650 4651 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 4652 max_blocks = mddev->resync_max_sectors >> 1; 4653 else 4654 max_blocks = mddev->size; 4655 4656 /* 4657 * Should not happen. 
4658 */ 4659 if (!max_blocks) { 4660 MD_BUG(); 4661 return; 4662 } 4663 /* Pick 'scale' such that (resync>>scale)*1000 will fit 4664 * in a sector_t, and (max_blocks>>scale) will fit in a 4665 * u32, as those are the requirements for sector_div. 4666 * Thus 'scale' must be at least 10 4667 */ 4668 scale = 10; 4669 if (sizeof(sector_t) > sizeof(unsigned long)) { 4670 while ( max_blocks/2 > (1ULL<<(scale+32))) 4671 scale++; 4672 } 4673 res = (resync>>scale)*1000; 4674 sector_div(res, (u32)((max_blocks>>scale)+1)); 4675 4676 per_milli = res; 4677 { 4678 int i, x = per_milli/50, y = 20-x; 4679 seq_printf(seq, "["); 4680 for (i = 0; i < x; i++) 4681 seq_printf(seq, "="); 4682 seq_printf(seq, ">"); 4683 for (i = 0; i < y; i++) 4684 seq_printf(seq, "."); 4685 seq_printf(seq, "] "); 4686 } 4687 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 4688 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 4689 "reshape" : 4690 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 4691 "resync" : "recovery")), 4692 per_milli/10, per_milli % 10, 4693 (unsigned long long) resync, 4694 (unsigned long long) max_blocks); 4695 4696 /* 4697 * We do not want to overflow, so the order of operands and 4698 * the * 100 / 100 trick are important. We do a +1 to be 4699 * safe against division by zero. We only estimate anyway. 4700 * 4701 * dt: time from mark until now 4702 * db: blocks written from mark until now 4703 * rt: remaining time 4704 */ 4705 dt = ((jiffies - mddev->resync_mark) / HZ); 4706 if (!dt) dt++; 4707 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) 4708 - mddev->resync_mark_cnt; 4709 rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100; 4710 4711 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); 4712 4713 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 4714 } 4715 4716 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 4717 { 4718 struct list_head *tmp; 4719 loff_t l = *pos; 4720 mddev_t *mddev; 4721 4722 if (l >= 0x10000) 4723 return NULL; 4724 if (!l--) 4725 /* header */ 4726 return (void*)1; 4727 4728 spin_lock(&all_mddevs_lock); 4729 list_for_each(tmp,&all_mddevs) 4730 if (!l--) { 4731 mddev = list_entry(tmp, mddev_t, all_mddevs); 4732 mddev_get(mddev); 4733 spin_unlock(&all_mddevs_lock); 4734 return mddev; 4735 } 4736 spin_unlock(&all_mddevs_lock); 4737 if (!l--) 4738 return (void*)2;/* tail */ 4739 return NULL; 4740 } 4741 4742 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 4743 { 4744 struct list_head *tmp; 4745 mddev_t *next_mddev, *mddev = v; 4746 4747 ++*pos; 4748 if (v == (void*)2) 4749 return NULL; 4750 4751 spin_lock(&all_mddevs_lock); 4752 if (v == (void*)1) 4753 tmp = all_mddevs.next; 4754 else 4755 tmp = mddev->all_mddevs.next; 4756 if (tmp != &all_mddevs) 4757 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); 4758 else { 4759 next_mddev = (void*)2; 4760 *pos = 0x10000; 4761 } 4762 spin_unlock(&all_mddevs_lock); 4763 4764 if (v != (void*)1) 4765 mddev_put(mddev); 4766 return next_mddev; 4767 4768 } 4769 4770 static void md_seq_stop(struct seq_file *seq, void *v) 4771 { 4772 mddev_t *mddev = v; 4773 4774 if (mddev && v != (void*)1 && v != (void*)2) 4775 mddev_put(mddev); 4776 } 4777 4778 struct mdstat_info { 4779 int event; 4780 }; 4781 4782 static int md_seq_show(struct seq_file *seq, void *v) 4783 { 4784 mddev_t *mddev = v; 4785 sector_t size; 4786 struct list_head *tmp2; 4787 mdk_rdev_t *rdev; 4788 struct mdstat_info *mi = seq->private; 4789 struct bitmap *bitmap; 4790 4791 if (v == (void*)1) { 4792 struct 
mdk_personality *pers; 4793 seq_printf(seq, "Personalities : "); 4794 spin_lock(&pers_lock); 4795 list_for_each_entry(pers, &pers_list, list) 4796 seq_printf(seq, "[%s] ", pers->name); 4797 4798 spin_unlock(&pers_lock); 4799 seq_printf(seq, "\n"); 4800 mi->event = atomic_read(&md_event_count); 4801 return 0; 4802 } 4803 if (v == (void*)2) { 4804 status_unused(seq); 4805 return 0; 4806 } 4807 4808 if (mddev_lock(mddev) < 0) 4809 return -EINTR; 4810 4811 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 4812 seq_printf(seq, "%s : %sactive", mdname(mddev), 4813 mddev->pers ? "" : "in"); 4814 if (mddev->pers) { 4815 if (mddev->ro==1) 4816 seq_printf(seq, " (read-only)"); 4817 if (mddev->ro==2) 4818 seq_printf(seq, "(auto-read-only)"); 4819 seq_printf(seq, " %s", mddev->pers->name); 4820 } 4821 4822 size = 0; 4823 ITERATE_RDEV(mddev,rdev,tmp2) { 4824 char b[BDEVNAME_SIZE]; 4825 seq_printf(seq, " %s[%d]", 4826 bdevname(rdev->bdev,b), rdev->desc_nr); 4827 if (test_bit(WriteMostly, &rdev->flags)) 4828 seq_printf(seq, "(W)"); 4829 if (test_bit(Faulty, &rdev->flags)) { 4830 seq_printf(seq, "(F)"); 4831 continue; 4832 } else if (rdev->raid_disk < 0) 4833 seq_printf(seq, "(S)"); /* spare */ 4834 size += rdev->size; 4835 } 4836 4837 if (!list_empty(&mddev->disks)) { 4838 if (mddev->pers) 4839 seq_printf(seq, "\n %llu blocks", 4840 (unsigned long long)mddev->array_size); 4841 else 4842 seq_printf(seq, "\n %llu blocks", 4843 (unsigned long long)size); 4844 } 4845 if (mddev->persistent) { 4846 if (mddev->major_version != 0 || 4847 mddev->minor_version != 90) { 4848 seq_printf(seq," super %d.%d", 4849 mddev->major_version, 4850 mddev->minor_version); 4851 } 4852 } else 4853 seq_printf(seq, " super non-persistent"); 4854 4855 if (mddev->pers) { 4856 mddev->pers->status (seq, mddev); 4857 seq_printf(seq, "\n "); 4858 if (mddev->pers->sync_request) { 4859 if (mddev->curr_resync > 2) { 4860 status_resync (seq, mddev); 4861 seq_printf(seq, "\n "); 4862 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 4863 seq_printf(seq, "\tresync=DELAYED\n "); 4864 else if (mddev->recovery_cp < MaxSector) 4865 seq_printf(seq, "\tresync=PENDING\n "); 4866 } 4867 } else 4868 seq_printf(seq, "\n "); 4869 4870 if ((bitmap = mddev->bitmap)) { 4871 unsigned long chunk_kb; 4872 unsigned long flags; 4873 spin_lock_irqsave(&bitmap->lock, flags); 4874 chunk_kb = bitmap->chunksize >> 10; 4875 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " 4876 "%lu%s chunk", 4877 bitmap->pages - bitmap->missing_pages, 4878 bitmap->pages, 4879 (bitmap->pages - bitmap->missing_pages) 4880 << (PAGE_SHIFT - 10), 4881 chunk_kb ? chunk_kb : bitmap->chunksize, 4882 chunk_kb ? 
"KB" : "B"); 4883 if (bitmap->file) { 4884 seq_printf(seq, ", file: "); 4885 seq_path(seq, bitmap->file->f_vfsmnt, 4886 bitmap->file->f_dentry," \t\n"); 4887 } 4888 4889 seq_printf(seq, "\n"); 4890 spin_unlock_irqrestore(&bitmap->lock, flags); 4891 } 4892 4893 seq_printf(seq, "\n"); 4894 } 4895 mddev_unlock(mddev); 4896 4897 return 0; 4898 } 4899 4900 static struct seq_operations md_seq_ops = { 4901 .start = md_seq_start, 4902 .next = md_seq_next, 4903 .stop = md_seq_stop, 4904 .show = md_seq_show, 4905 }; 4906 4907 static int md_seq_open(struct inode *inode, struct file *file) 4908 { 4909 int error; 4910 struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL); 4911 if (mi == NULL) 4912 return -ENOMEM; 4913 4914 error = seq_open(file, &md_seq_ops); 4915 if (error) 4916 kfree(mi); 4917 else { 4918 struct seq_file *p = file->private_data; 4919 p->private = mi; 4920 mi->event = atomic_read(&md_event_count); 4921 } 4922 return error; 4923 } 4924 4925 static int md_seq_release(struct inode *inode, struct file *file) 4926 { 4927 struct seq_file *m = file->private_data; 4928 struct mdstat_info *mi = m->private; 4929 m->private = NULL; 4930 kfree(mi); 4931 return seq_release(inode, file); 4932 } 4933 4934 static unsigned int mdstat_poll(struct file *filp, poll_table *wait) 4935 { 4936 struct seq_file *m = filp->private_data; 4937 struct mdstat_info *mi = m->private; 4938 int mask; 4939 4940 poll_wait(filp, &md_event_waiters, wait); 4941 4942 /* always allow read */ 4943 mask = POLLIN | POLLRDNORM; 4944 4945 if (mi->event != atomic_read(&md_event_count)) 4946 mask |= POLLERR | POLLPRI; 4947 return mask; 4948 } 4949 4950 static struct file_operations md_seq_fops = { 4951 .open = md_seq_open, 4952 .read = seq_read, 4953 .llseek = seq_lseek, 4954 .release = md_seq_release, 4955 .poll = mdstat_poll, 4956 }; 4957 4958 int register_md_personality(struct mdk_personality *p) 4959 { 4960 spin_lock(&pers_lock); 4961 list_add_tail(&p->list, &pers_list); 4962 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level); 4963 spin_unlock(&pers_lock); 4964 return 0; 4965 } 4966 4967 int unregister_md_personality(struct mdk_personality *p) 4968 { 4969 printk(KERN_INFO "md: %s personality unregistered\n", p->name); 4970 spin_lock(&pers_lock); 4971 list_del_init(&p->list); 4972 spin_unlock(&pers_lock); 4973 return 0; 4974 } 4975 4976 static int is_mddev_idle(mddev_t *mddev) 4977 { 4978 mdk_rdev_t * rdev; 4979 struct list_head *tmp; 4980 int idle; 4981 unsigned long curr_events; 4982 4983 idle = 1; 4984 ITERATE_RDEV(mddev,rdev,tmp) { 4985 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 4986 curr_events = disk_stat_read(disk, sectors[0]) + 4987 disk_stat_read(disk, sectors[1]) - 4988 atomic_read(&disk->sync_io); 4989 /* The difference between curr_events and last_events 4990 * will be affected by any new non-sync IO (making 4991 * curr_events bigger) and any difference in the amount of 4992 * in-flight syncio (making current_events bigger or smaller) 4993 * The amount in-flight is currently limited to 4994 * 32*64K in raid1/10 and 256*PAGE_SIZE in raid5/6 4995 * which is at most 4096 sectors. 4996 * These numbers are fairly fragile and should be made 4997 * more robust, probably by enforcing the 4998 * 'window size' that md_do_sync sort-of uses. 4999 * 5000 * Note: the following is an unsigned comparison. 
5001 */
5002 if ((curr_events - rdev->last_events + 4096) > 8192) {
5003 rdev->last_events = curr_events;
5004 idle = 0;
5005 }
5006 }
5007 return idle;
5008 }
5009
5010 void md_done_sync(mddev_t *mddev, int blocks, int ok)
5011 {
5012 /* another "blocks" (512byte) blocks have been synced */
5013 atomic_sub(blocks, &mddev->recovery_active);
5014 wake_up(&mddev->recovery_wait);
5015 if (!ok) {
5016 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
5017 md_wakeup_thread(mddev->thread);
5018 // stop recovery, signal do_sync ....
5019 }
5020 }
5021
5022
5023 /* md_write_start(mddev, bi)
5024 * If we need to update some array metadata (e.g. 'active' flag
5025 * in superblock) before writing, schedule a superblock update
5026 * and wait for it to complete.
5027 */
5028 void md_write_start(mddev_t *mddev, struct bio *bi)
5029 {
5030 if (bio_data_dir(bi) != WRITE)
5031 return;
5032
5033 BUG_ON(mddev->ro == 1);
5034 if (mddev->ro == 2) {
5035 /* need to switch to read/write */
5036 mddev->ro = 0;
5037 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5038 md_wakeup_thread(mddev->thread);
5039 }
5040 atomic_inc(&mddev->writes_pending);
5041 if (mddev->in_sync) {
5042 spin_lock_irq(&mddev->write_lock);
5043 if (mddev->in_sync) {
5044 mddev->in_sync = 0;
5045 mddev->sb_dirty = 3;
5046 md_wakeup_thread(mddev->thread);
5047 }
5048 spin_unlock_irq(&mddev->write_lock);
5049 }
5050 wait_event(mddev->sb_wait, mddev->sb_dirty==0);
5051 }
5052
5053 void md_write_end(mddev_t *mddev)
5054 {
5055 if (atomic_dec_and_test(&mddev->writes_pending)) {
5056 if (mddev->safemode == 2)
5057 md_wakeup_thread(mddev->thread);
5058 else if (mddev->safemode_delay)
5059 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
5060 }
5061 }
5062
5063 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
5064
5065 #define SYNC_MARKS 10
5066 #define SYNC_MARK_STEP (3*HZ)
5067 void md_do_sync(mddev_t *mddev)
5068 {
5069 mddev_t *mddev2;
5070 unsigned int currspeed = 0,
5071 window;
5072 sector_t max_sectors,j, io_sectors;
5073 unsigned long mark[SYNC_MARKS];
5074 sector_t mark_cnt[SYNC_MARKS];
5075 int last_mark,m;
5076 struct list_head *tmp;
5077 sector_t last_check;
5078 int skipped = 0;
5079 struct list_head *rtmp;
5080 mdk_rdev_t *rdev;
5081
5082 /* just in case the thread restarts... */
5083 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
5084 return;
5085 if (mddev->ro) /* never try to sync a read-only array */
5086 return;
5087
5088 /* we overload curr_resync somewhat here.
5089 * 0 == not engaged in resync at all
5090 * 2 == checking that there is no conflict with another sync
5091 * 1 == like 2, but have yielded to allow conflicting resync to
5092 * commence
5093 * other == active in resync - this many blocks
5094 *
5095 * Before starting a resync we must have set curr_resync to
5096 * 2, and then checked that every "conflicting" array has curr_resync
5097 * less than ours. When we find one that is the same or higher
5098 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync
5099 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
5100 * This will mean we have to start checking from the beginning again.
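* (For example, if two arrays share a physical disk and both reach
* curr_resync == 2, the one with the lower mddev address drops to 1 and
* waits on resync_wait; the other keeps 2 and gets to resync first.)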
5101 * 5102 */ 5103 5104 do { 5105 mddev->curr_resync = 2; 5106 5107 try_again: 5108 if (kthread_should_stop()) { 5109 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5110 goto skip; 5111 } 5112 ITERATE_MDDEV(mddev2,tmp) { 5113 if (mddev2 == mddev) 5114 continue; 5115 if (mddev2->curr_resync && 5116 match_mddev_units(mddev,mddev2)) { 5117 DEFINE_WAIT(wq); 5118 if (mddev < mddev2 && mddev->curr_resync == 2) { 5119 /* arbitrarily yield */ 5120 mddev->curr_resync = 1; 5121 wake_up(&resync_wait); 5122 } 5123 if (mddev > mddev2 && mddev->curr_resync == 1) 5124 /* no need to wait here, we can wait the next 5125 * time 'round when curr_resync == 2 5126 */ 5127 continue; 5128 prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE); 5129 if (!kthread_should_stop() && 5130 mddev2->curr_resync >= mddev->curr_resync) { 5131 printk(KERN_INFO "md: delaying resync of %s" 5132 " until %s has finished resync (they" 5133 " share one or more physical units)\n", 5134 mdname(mddev), mdname(mddev2)); 5135 mddev_put(mddev2); 5136 schedule(); 5137 finish_wait(&resync_wait, &wq); 5138 goto try_again; 5139 } 5140 finish_wait(&resync_wait, &wq); 5141 } 5142 } 5143 } while (mddev->curr_resync < 2); 5144 5145 j = 0; 5146 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5147 /* resync follows the size requested by the personality, 5148 * which defaults to physical size, but can be virtual size 5149 */ 5150 max_sectors = mddev->resync_max_sectors; 5151 mddev->resync_mismatches = 0; 5152 /* we don't use the checkpoint if there's a bitmap */ 5153 if (!mddev->bitmap && 5154 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 5155 j = mddev->recovery_cp; 5156 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5157 max_sectors = mddev->size << 1; 5158 else { 5159 /* recovery follows the physical size of devices */ 5160 max_sectors = mddev->size << 1; 5161 j = MaxSector; 5162 ITERATE_RDEV(mddev,rdev,rtmp) 5163 if (rdev->raid_disk >= 0 && 5164 !test_bit(Faulty, &rdev->flags) && 5165 !test_bit(In_sync, &rdev->flags) && 5166 rdev->recovery_offset < j) 5167 j = rdev->recovery_offset; 5168 } 5169 5170 printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); 5171 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" 5172 " %d KB/sec/disc.\n", speed_min(mddev)); 5173 printk(KERN_INFO "md: using maximum available idle IO bandwidth " 5174 "(but not more than %d KB/sec) for reconstruction.\n", 5175 speed_max(mddev)); 5176 5177 is_mddev_idle(mddev); /* this also initializes IO event counters */ 5178 5179 io_sectors = 0; 5180 for (m = 0; m < SYNC_MARKS; m++) { 5181 mark[m] = jiffies; 5182 mark_cnt[m] = io_sectors; 5183 } 5184 last_mark = 0; 5185 mddev->resync_mark = mark[last_mark]; 5186 mddev->resync_mark_cnt = mark_cnt[last_mark]; 5187 5188 /* 5189 * Tune reconstruction: 5190 */ 5191 window = 32*(PAGE_SIZE/512); 5192 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", 5193 window/2,(unsigned long long) max_sectors/2); 5194 5195 atomic_set(&mddev->recovery_active, 0); 5196 init_waitqueue_head(&mddev->recovery_wait); 5197 last_check = 0; 5198 5199 if (j>2) { 5200 printk(KERN_INFO 5201 "md: resuming recovery of %s from checkpoint.\n", 5202 mdname(mddev)); 5203 mddev->curr_resync = j; 5204 } 5205 5206 while (j < max_sectors) { 5207 sector_t sectors; 5208 5209 skipped = 0; 5210 sectors = mddev->pers->sync_request(mddev, j, &skipped, 5211 currspeed < speed_min(mddev)); 5212 if (sectors == 0) { 5213 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 5214 goto out; 5215 } 5216 5217 if (!skipped) { /* 
actual IO requested */ 5218 io_sectors += sectors; 5219 atomic_add(sectors, &mddev->recovery_active); 5220 } 5221 5222 j += sectors; 5223 if (j>1) mddev->curr_resync = j; 5224 mddev->curr_mark_cnt = io_sectors; 5225 if (last_check == 0) 5226 /* this is the earliers that rebuilt will be 5227 * visible in /proc/mdstat 5228 */ 5229 md_new_event(mddev); 5230 5231 if (last_check + window > io_sectors || j == max_sectors) 5232 continue; 5233 5234 last_check = io_sectors; 5235 5236 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || 5237 test_bit(MD_RECOVERY_ERR, &mddev->recovery)) 5238 break; 5239 5240 repeat: 5241 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 5242 /* step marks */ 5243 int next = (last_mark+1) % SYNC_MARKS; 5244 5245 mddev->resync_mark = mark[next]; 5246 mddev->resync_mark_cnt = mark_cnt[next]; 5247 mark[next] = jiffies; 5248 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 5249 last_mark = next; 5250 } 5251 5252 5253 if (kthread_should_stop()) { 5254 /* 5255 * got a signal, exit. 5256 */ 5257 printk(KERN_INFO 5258 "md: md_do_sync() got signal ... exiting\n"); 5259 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5260 goto out; 5261 } 5262 5263 /* 5264 * this loop exits only if either when we are slower than 5265 * the 'hard' speed limit, or the system was IO-idle for 5266 * a jiffy. 5267 * the system might be non-idle CPU-wise, but we only care 5268 * about not overloading the IO subsystem. (things like an 5269 * e2fsck being done on the RAID array should execute fast) 5270 */ 5271 mddev->queue->unplug_fn(mddev->queue); 5272 cond_resched(); 5273 5274 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 5275 /((jiffies-mddev->resync_mark)/HZ +1) +1; 5276 5277 if (currspeed > speed_min(mddev)) { 5278 if ((currspeed > speed_max(mddev)) || 5279 !is_mddev_idle(mddev)) { 5280 msleep(500); 5281 goto repeat; 5282 } 5283 } 5284 } 5285 printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev)); 5286 /* 5287 * this also signals 'finished resyncing' to md_stop 5288 */ 5289 out: 5290 mddev->queue->unplug_fn(mddev->queue); 5291 5292 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 5293 5294 /* tell personality that we are finished */ 5295 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); 5296 5297 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 5298 test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 5299 !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 5300 mddev->curr_resync > 2) { 5301 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5302 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 5303 if (mddev->curr_resync >= mddev->recovery_cp) { 5304 printk(KERN_INFO 5305 "md: checkpointing recovery of %s.\n", 5306 mdname(mddev)); 5307 mddev->recovery_cp = mddev->curr_resync; 5308 } 5309 } else 5310 mddev->recovery_cp = MaxSector; 5311 } else { 5312 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5313 mddev->curr_resync = MaxSector; 5314 ITERATE_RDEV(mddev,rdev,rtmp) 5315 if (rdev->raid_disk >= 0 && 5316 !test_bit(Faulty, &rdev->flags) && 5317 !test_bit(In_sync, &rdev->flags) && 5318 rdev->recovery_offset < mddev->curr_resync) 5319 rdev->recovery_offset = mddev->curr_resync; 5320 mddev->sb_dirty = 1; 5321 } 5322 } 5323 5324 skip: 5325 mddev->curr_resync = 0; 5326 wake_up(&resync_wait); 5327 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 5328 md_wakeup_thread(mddev->thread); 5329 } 5330 EXPORT_SYMBOL_GPL(md_do_sync); 5331 5332 5333 /* 5334 * This routine is regularly called by all per-raid-array threads to 5335 
* deal with generic issues like resync and super-block update.
5336 * Raid personalities that don't have a thread (linear/raid0) do not
5337 * need this as they never do any recovery or update the superblock.
5338 *
5339 * It does not do any resync itself, but rather "forks" off other threads
5340 * to do that as needed.
5341 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
5342 * "->recovery" and create a thread at ->sync_thread.
5343 * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
5344 * and wakes up this thread which will reap the thread and finish up.
5345 * This thread also removes any faulty devices (with nr_pending == 0).
5346 *
5347 * The overall approach is:
5348 * 1/ if the superblock needs updating, update it.
5349 * 2/ If a recovery thread is running, don't do anything else.
5350 * 3/ If recovery has finished, clean up, possibly marking spares active.
5351 * 4/ If there are any faulty devices, remove them.
5352 * 5/ If the array is degraded, try to add spare devices.
5353 * 6/ If the array has spares or is not in-sync, start a resync thread.
5354 */
5355 void md_check_recovery(mddev_t *mddev)
5356 {
5357 mdk_rdev_t *rdev;
5358 struct list_head *rtmp;
5359
5360
5361 if (mddev->bitmap)
5362 bitmap_daemon_work(mddev->bitmap);
5363
5364 if (mddev->ro)
5365 return;
5366
5367 if (signal_pending(current)) {
5368 if (mddev->pers->sync_request) {
5369 printk(KERN_INFO "md: %s in immediate safe mode\n",
5370 mdname(mddev));
5371 mddev->safemode = 2;
5372 }
5373 flush_signals(current);
5374 }
5375
5376 if ( ! (
5377 mddev->sb_dirty ||
5378 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
5379 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
5380 (mddev->safemode == 1) ||
5381 (mddev->safemode == 2 && !
atomic_read(&mddev->writes_pending) 5382 && !mddev->in_sync && mddev->recovery_cp == MaxSector) 5383 )) 5384 return; 5385 5386 if (mddev_trylock(mddev)) { 5387 int spares =0; 5388 5389 spin_lock_irq(&mddev->write_lock); 5390 if (mddev->safemode && !atomic_read(&mddev->writes_pending) && 5391 !mddev->in_sync && mddev->recovery_cp == MaxSector) { 5392 mddev->in_sync = 1; 5393 mddev->sb_dirty = 3; 5394 } 5395 if (mddev->safemode == 1) 5396 mddev->safemode = 0; 5397 spin_unlock_irq(&mddev->write_lock); 5398 5399 if (mddev->sb_dirty) 5400 md_update_sb(mddev); 5401 5402 5403 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 5404 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 5405 /* resync/recovery still happening */ 5406 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5407 goto unlock; 5408 } 5409 if (mddev->sync_thread) { 5410 /* resync has finished, collect result */ 5411 md_unregister_thread(mddev->sync_thread); 5412 mddev->sync_thread = NULL; 5413 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 5414 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 5415 /* success...*/ 5416 /* activate any spares */ 5417 mddev->pers->spare_active(mddev); 5418 } 5419 md_update_sb(mddev); 5420 5421 /* if array is no-longer degraded, then any saved_raid_disk 5422 * information must be scrapped 5423 */ 5424 if (!mddev->degraded) 5425 ITERATE_RDEV(mddev,rdev,rtmp) 5426 rdev->saved_raid_disk = -1; 5427 5428 mddev->recovery = 0; 5429 /* flag recovery needed just to double check */ 5430 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5431 md_new_event(mddev); 5432 goto unlock; 5433 } 5434 /* Clear some bits that don't mean anything, but 5435 * might be left set 5436 */ 5437 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5438 clear_bit(MD_RECOVERY_ERR, &mddev->recovery); 5439 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 5440 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 5441 5442 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 5443 goto unlock; 5444 /* no recovery is running. 5445 * remove any failed drives, then 5446 * add spares if possible. 5447 * Spare are also removed and re-added, to allow 5448 * the personality to fail the re-add. 5449 */ 5450 ITERATE_RDEV(mddev,rdev,rtmp) 5451 if (rdev->raid_disk >= 0 && 5452 (test_bit(Faulty, &rdev->flags) || ! test_bit(In_sync, &rdev->flags)) && 5453 atomic_read(&rdev->nr_pending)==0) { 5454 if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) { 5455 char nm[20]; 5456 sprintf(nm,"rd%d", rdev->raid_disk); 5457 sysfs_remove_link(&mddev->kobj, nm); 5458 rdev->raid_disk = -1; 5459 } 5460 } 5461 5462 if (mddev->degraded) { 5463 ITERATE_RDEV(mddev,rdev,rtmp) 5464 if (rdev->raid_disk < 0 5465 && !test_bit(Faulty, &rdev->flags)) { 5466 rdev->recovery_offset = 0; 5467 if (mddev->pers->hot_add_disk(mddev,rdev)) { 5468 char nm[20]; 5469 sprintf(nm, "rd%d", rdev->raid_disk); 5470 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); 5471 spares++; 5472 md_new_event(mddev); 5473 } else 5474 break; 5475 } 5476 } 5477 5478 if (spares) { 5479 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5480 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5481 } else if (mddev->recovery_cp < MaxSector) { 5482 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5483 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 5484 /* nothing to be done ... */ 5485 goto unlock; 5486 5487 if (mddev->pers->sync_request) { 5488 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 5489 if (spares && mddev->bitmap && ! 
mddev->bitmap->file) { 5490 /* We are adding a device or devices to an array 5491 * which has the bitmap stored on all devices. 5492 * So make sure all bitmap pages get written 5493 */ 5494 bitmap_write_all(mddev->bitmap); 5495 } 5496 mddev->sync_thread = md_register_thread(md_do_sync, 5497 mddev, 5498 "%s_resync"); 5499 if (!mddev->sync_thread) { 5500 printk(KERN_ERR "%s: could not start resync" 5501 " thread...\n", 5502 mdname(mddev)); 5503 /* leave the spares where they are, it shouldn't hurt */ 5504 mddev->recovery = 0; 5505 } else 5506 md_wakeup_thread(mddev->sync_thread); 5507 md_new_event(mddev); 5508 } 5509 unlock: 5510 mddev_unlock(mddev); 5511 } 5512 } 5513 5514 static int md_notify_reboot(struct notifier_block *this, 5515 unsigned long code, void *x) 5516 { 5517 struct list_head *tmp; 5518 mddev_t *mddev; 5519 5520 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { 5521 5522 printk(KERN_INFO "md: stopping all md devices.\n"); 5523 5524 ITERATE_MDDEV(mddev,tmp) 5525 if (mddev_trylock(mddev)) { 5526 do_md_stop (mddev, 1); 5527 mddev_unlock(mddev); 5528 } 5529 /* 5530 * certain more exotic SCSI devices are known to be 5531 * volatile wrt too early system reboots. While the 5532 * right place to handle this issue is the given 5533 * driver, we do want to have a safe RAID driver ... 5534 */ 5535 mdelay(1000*1); 5536 } 5537 return NOTIFY_DONE; 5538 } 5539 5540 static struct notifier_block md_notifier = { 5541 .notifier_call = md_notify_reboot, 5542 .next = NULL, 5543 .priority = INT_MAX, /* before any real devices */ 5544 }; 5545 5546 static void md_geninit(void) 5547 { 5548 struct proc_dir_entry *p; 5549 5550 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 5551 5552 p = create_proc_entry("mdstat", S_IRUGO, NULL); 5553 if (p) 5554 p->proc_fops = &md_seq_fops; 5555 } 5556 5557 static int __init md_init(void) 5558 { 5559 printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d," 5560 " MD_SB_DISKS=%d\n", 5561 MD_MAJOR_VERSION, MD_MINOR_VERSION, 5562 MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); 5563 printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR_HI, 5564 BITMAP_MINOR); 5565 5566 if (register_blkdev(MAJOR_NR, "md")) 5567 return -1; 5568 if ((mdp_major=register_blkdev(0, "mdp"))<=0) { 5569 unregister_blkdev(MAJOR_NR, "md"); 5570 return -1; 5571 } 5572 blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE, 5573 md_probe, NULL, NULL); 5574 blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE, 5575 md_probe, NULL, NULL); 5576 5577 register_reboot_notifier(&md_notifier); 5578 raid_table_header = register_sysctl_table(raid_root_table, 1); 5579 5580 md_geninit(); 5581 return (0); 5582 } 5583 5584 5585 #ifndef MODULE 5586 5587 /* 5588 * Searches all registered partitions for autorun RAID arrays 5589 * at boot time. 
5590 */ 5591 static dev_t detected_devices[128]; 5592 static int dev_cnt; 5593 5594 void md_autodetect_dev(dev_t dev) 5595 { 5596 if (dev_cnt >= 0 && dev_cnt < 127) 5597 detected_devices[dev_cnt++] = dev; 5598 } 5599 5600 5601 static void autostart_arrays(int part) 5602 { 5603 mdk_rdev_t *rdev; 5604 int i; 5605 5606 printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 5607 5608 for (i = 0; i < dev_cnt; i++) { 5609 dev_t dev = detected_devices[i]; 5610 5611 rdev = md_import_device(dev,0, 0); 5612 if (IS_ERR(rdev)) 5613 continue; 5614 5615 if (test_bit(Faulty, &rdev->flags)) { 5616 MD_BUG(); 5617 continue; 5618 } 5619 list_add(&rdev->same_set, &pending_raid_disks); 5620 } 5621 dev_cnt = 0; 5622 5623 autorun_devices(part); 5624 } 5625 5626 #endif 5627 5628 static __exit void md_exit(void) 5629 { 5630 mddev_t *mddev; 5631 struct list_head *tmp; 5632 5633 blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS); 5634 blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift); 5635 5636 unregister_blkdev(MAJOR_NR,"md"); 5637 unregister_blkdev(mdp_major, "mdp"); 5638 unregister_reboot_notifier(&md_notifier); 5639 unregister_sysctl_table(raid_table_header); 5640 remove_proc_entry("mdstat", NULL); 5641 ITERATE_MDDEV(mddev,tmp) { 5642 struct gendisk *disk = mddev->gendisk; 5643 if (!disk) 5644 continue; 5645 export_array(mddev); 5646 del_gendisk(disk); 5647 put_disk(disk); 5648 mddev->gendisk = NULL; 5649 mddev_put(mddev); 5650 } 5651 } 5652 5653 module_init(md_init) 5654 module_exit(md_exit) 5655 5656 static int get_ro(char *buffer, struct kernel_param *kp) 5657 { 5658 return sprintf(buffer, "%d", start_readonly); 5659 } 5660 static int set_ro(const char *val, struct kernel_param *kp) 5661 { 5662 char *e; 5663 int num = simple_strtoul(val, &e, 10); 5664 if (*val && (*e == '\0' || *e == '\n')) { 5665 start_readonly = num; 5666 return 0; 5667 } 5668 return -EINVAL; 5669 } 5670 5671 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); 5672 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); 5673 5674 5675 EXPORT_SYMBOL(register_md_personality); 5676 EXPORT_SYMBOL(unregister_md_personality); 5677 EXPORT_SYMBOL(md_error); 5678 EXPORT_SYMBOL(md_done_sync); 5679 EXPORT_SYMBOL(md_write_start); 5680 EXPORT_SYMBOL(md_write_end); 5681 EXPORT_SYMBOL(md_register_thread); 5682 EXPORT_SYMBOL(md_unregister_thread); 5683 EXPORT_SYMBOL(md_wakeup_thread); 5684 EXPORT_SYMBOL(md_check_recovery); 5685 MODULE_LICENSE("GPL"); 5686 MODULE_ALIAS("md"); 5687 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); 5688
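/*
 * Example (user-space sketch): waiting for md events by polling
 * /proc/mdstat, which mdstat_poll() above flags with POLLERR|POLLPRI
 * whenever md_event_count changes.  Re-reading from offset 0 refreshes
 * the status snapshot and re-arms the event counter; poll() then blocks
 * until the next event.  Error handling and parsing are omitted.
 *
 *   #include <fcntl.h>
 *   #include <poll.h>
 *   #include <unistd.h>
 *
 *   int main(void)
 *   {
 *       char buf[4096];
 *       struct pollfd pfd;
 *       int fd = open("/proc/mdstat", O_RDONLY);
 *
 *       for (;;) {
 *           lseek(fd, 0, SEEK_SET);
 *           while (read(fd, buf, sizeof(buf)) > 0)
 *               ;
 *           pfd.fd = fd;
 *           pfd.events = POLLPRI;
 *           poll(&pfd, 1, -1);
 *       }
 *   }
 */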