1 /* 2 md.c : Multiple Devices driver for Linux 3 Copyright (C) 1998, 1999, 2000 Ingo Molnar 4 5 completely rewritten, based on the MD driver code from Marc Zyngier 6 7 Changes: 8 9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar 10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> 11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> 12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su> 13 - kmod support by: Cyrus Durgin 14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> 15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> 16 17 - lots of fixes and improvements to the RAID1/RAID5 and generic 18 RAID code (such as request based resynchronization): 19 20 Neil Brown <neilb@cse.unsw.edu.au>. 21 22 - persistent bitmap code 23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. 24 25 This program is free software; you can redistribute it and/or modify 26 it under the terms of the GNU General Public License as published by 27 the Free Software Foundation; either version 2, or (at your option) 28 any later version. 29 30 You should have received a copy of the GNU General Public License 31 (for example /usr/src/linux/COPYING); if not, write to the Free 32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 33 */ 34 35 #include <linux/module.h> 36 #include <linux/config.h> 37 #include <linux/kthread.h> 38 #include <linux/linkage.h> 39 #include <linux/raid/md.h> 40 #include <linux/raid/bitmap.h> 41 #include <linux/sysctl.h> 42 #include <linux/buffer_head.h> /* for invalidate_bdev */ 43 #include <linux/suspend.h> 44 #include <linux/poll.h> 45 #include <linux/mutex.h> 46 #include <linux/ctype.h> 47 48 #include <linux/init.h> 49 50 #include <linux/file.h> 51 52 #ifdef CONFIG_KMOD 53 #include <linux/kmod.h> 54 #endif 55 56 #include <asm/unaligned.h> 57 58 #define MAJOR_NR MD_MAJOR 59 #define MD_DRIVER 60 61 /* 63 partitions with the alternate major number (mdp) */ 62 #define MdpMinorShift 6 63 64 #define DEBUG 0 65 #define dprintk(x...) ((void)(DEBUG && printk(x))) 66 67 68 #ifndef MODULE 69 static void autostart_arrays (int part); 70 #endif 71 72 static LIST_HEAD(pers_list); 73 static DEFINE_SPINLOCK(pers_lock); 74 75 static void md_print_devices(void); 76 77 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } 78 79 /* 80 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 81 * is 1000 KB/sec, so the extra system load does not show up that much. 82 * Increase it if you want to have more _guaranteed_ speed. Note that 83 * the RAID driver will use the maximum available bandwidth if the IO 84 * subsystem is idle. There is also an 'absolute maximum' reconstruction 85 * speed limit - in case reconstruction slows down your system despite 86 * idle IO detection. 87 * 88 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. 89 * or /sys/block/mdX/md/sync_speed_{min,max} 90 */ 91 92 static int sysctl_speed_limit_min = 1000; 93 static int sysctl_speed_limit_max = 200000; 94 static inline int speed_min(mddev_t *mddev) 95 { 96 return mddev->sync_speed_min ? 97 mddev->sync_speed_min : sysctl_speed_limit_min; 98 } 99 100 static inline int speed_max(mddev_t *mddev) 101 { 102 return mddev->sync_speed_max ? 
103 mddev->sync_speed_max : sysctl_speed_limit_max; 104 } 105 106 static struct ctl_table_header *raid_table_header; 107 108 static ctl_table raid_table[] = { 109 { 110 .ctl_name = DEV_RAID_SPEED_LIMIT_MIN, 111 .procname = "speed_limit_min", 112 .data = &sysctl_speed_limit_min, 113 .maxlen = sizeof(int), 114 .mode = 0644, 115 .proc_handler = &proc_dointvec, 116 }, 117 { 118 .ctl_name = DEV_RAID_SPEED_LIMIT_MAX, 119 .procname = "speed_limit_max", 120 .data = &sysctl_speed_limit_max, 121 .maxlen = sizeof(int), 122 .mode = 0644, 123 .proc_handler = &proc_dointvec, 124 }, 125 { .ctl_name = 0 } 126 }; 127 128 static ctl_table raid_dir_table[] = { 129 { 130 .ctl_name = DEV_RAID, 131 .procname = "raid", 132 .maxlen = 0, 133 .mode = 0555, 134 .child = raid_table, 135 }, 136 { .ctl_name = 0 } 137 }; 138 139 static ctl_table raid_root_table[] = { 140 { 141 .ctl_name = CTL_DEV, 142 .procname = "dev", 143 .maxlen = 0, 144 .mode = 0555, 145 .child = raid_dir_table, 146 }, 147 { .ctl_name = 0 } 148 }; 149 150 static struct block_device_operations md_fops; 151 152 static int start_readonly; 153 154 /* 155 * We have a system wide 'event count' that is incremented 156 * on any 'interesting' event, and readers of /proc/mdstat 157 * can use 'poll' or 'select' to find out when the event 158 * count increases. 159 * 160 * Events are: 161 * start array, stop array, error, add device, remove device, 162 * start build, activate spare 163 */ 164 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); 165 static atomic_t md_event_count; 166 void md_new_event(mddev_t *mddev) 167 { 168 atomic_inc(&md_event_count); 169 wake_up(&md_event_waiters); 170 sysfs_notify(&mddev->kobj, NULL, "sync_action"); 171 } 172 EXPORT_SYMBOL_GPL(md_new_event); 173 174 /* Alternate version that can be called from interrupts 175 * when calling sysfs_notify isn't needed. 176 */ 177 static void md_new_event_inintr(mddev_t *mddev) 178 { 179 atomic_inc(&md_event_count); 180 wake_up(&md_event_waiters); 181 } 182 183 /* 184 * Enables iteration over all existing md arrays. 185 * all_mddevs_lock protects this list. 186 */ 187 static LIST_HEAD(all_mddevs); 188 static DEFINE_SPINLOCK(all_mddevs_lock); 189 190 191 /* 192 * iterates through all used mddevs in the system. 193 * We take care to grab the all_mddevs_lock whenever navigating 194 * the list, and to always hold a refcount when unlocked. 195 * Any code which breaks out of this loop while owning 196 a reference to the current mddev must mddev_put it.
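 *
 * A minimal usage sketch (illustrative only: some_condition() is a
 * hypothetical predicate, and the body is loosely modelled on
 * md_print_devices() further down).  Breaking out early keeps the
 * reference, hence the explicit mddev_put():
 *
 *	mddev_t *mddev;
 *	struct list_head *tmp;
 *
 *	ITERATE_MDDEV(mddev, tmp) {
 *		printk("md: considering %s\n", mdname(mddev));
 *		if (some_condition(mddev)) {
 *			mddev_put(mddev);
 *			break;
 *		}
 *	}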
197 */ 198 #define ITERATE_MDDEV(mddev,tmp) \ 199 \ 200 for (({ spin_lock(&all_mddevs_lock); \ 201 tmp = all_mddevs.next; \ 202 mddev = NULL;}); \ 203 ({ if (tmp != &all_mddevs) \ 204 mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ 205 spin_unlock(&all_mddevs_lock); \ 206 if (mddev) mddev_put(mddev); \ 207 mddev = list_entry(tmp, mddev_t, all_mddevs); \ 208 tmp != &all_mddevs;}); \ 209 ({ spin_lock(&all_mddevs_lock); \ 210 tmp = tmp->next;}) \ 211 ) 212 213 214 static int md_fail_request (request_queue_t *q, struct bio *bio) 215 { 216 bio_io_error(bio, bio->bi_size); 217 return 0; 218 } 219 220 static inline mddev_t *mddev_get(mddev_t *mddev) 221 { 222 atomic_inc(&mddev->active); 223 return mddev; 224 } 225 226 static void mddev_put(mddev_t *mddev) 227 { 228 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 229 return; 230 if (!mddev->raid_disks && list_empty(&mddev->disks)) { 231 list_del(&mddev->all_mddevs); 232 spin_unlock(&all_mddevs_lock); 233 blk_cleanup_queue(mddev->queue); 234 kobject_unregister(&mddev->kobj); 235 } else 236 spin_unlock(&all_mddevs_lock); 237 } 238 239 static mddev_t * mddev_find(dev_t unit) 240 { 241 mddev_t *mddev, *new = NULL; 242 243 retry: 244 spin_lock(&all_mddevs_lock); 245 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 246 if (mddev->unit == unit) { 247 mddev_get(mddev); 248 spin_unlock(&all_mddevs_lock); 249 kfree(new); 250 return mddev; 251 } 252 253 if (new) { 254 list_add(&new->all_mddevs, &all_mddevs); 255 spin_unlock(&all_mddevs_lock); 256 return new; 257 } 258 spin_unlock(&all_mddevs_lock); 259 260 new = kzalloc(sizeof(*new), GFP_KERNEL); 261 if (!new) 262 return NULL; 263 264 new->unit = unit; 265 if (MAJOR(unit) == MD_MAJOR) 266 new->md_minor = MINOR(unit); 267 else 268 new->md_minor = MINOR(unit) >> MdpMinorShift; 269 270 mutex_init(&new->reconfig_mutex); 271 INIT_LIST_HEAD(&new->disks); 272 INIT_LIST_HEAD(&new->all_mddevs); 273 init_timer(&new->safemode_timer); 274 atomic_set(&new->active, 1); 275 spin_lock_init(&new->write_lock); 276 init_waitqueue_head(&new->sb_wait); 277 278 new->queue = blk_alloc_queue(GFP_KERNEL); 279 if (!new->queue) { 280 kfree(new); 281 return NULL; 282 } 283 set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags); 284 285 blk_queue_make_request(new->queue, md_fail_request); 286 287 goto retry; 288 } 289 290 static inline int mddev_lock(mddev_t * mddev) 291 { 292 return mutex_lock_interruptible(&mddev->reconfig_mutex); 293 } 294 295 static inline int mddev_trylock(mddev_t * mddev) 296 { 297 return mutex_trylock(&mddev->reconfig_mutex); 298 } 299 300 static inline void mddev_unlock(mddev_t * mddev) 301 { 302 mutex_unlock(&mddev->reconfig_mutex); 303 304 md_wakeup_thread(mddev->thread); 305 } 306 307 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) 308 { 309 mdk_rdev_t * rdev; 310 struct list_head *tmp; 311 312 ITERATE_RDEV(mddev,rdev,tmp) { 313 if (rdev->desc_nr == nr) 314 return rdev; 315 } 316 return NULL; 317 } 318 319 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) 320 { 321 struct list_head *tmp; 322 mdk_rdev_t *rdev; 323 324 ITERATE_RDEV(mddev,rdev,tmp) { 325 if (rdev->bdev->bd_dev == dev) 326 return rdev; 327 } 328 return NULL; 329 } 330 331 static struct mdk_personality *find_pers(int level, char *clevel) 332 { 333 struct mdk_personality *pers; 334 list_for_each_entry(pers, &pers_list, list) { 335 if (level != LEVEL_NONE && pers->level == level) 336 return pers; 337 if (strcmp(pers->name, clevel)==0) 338 return pers; 339 } 340 return NULL; 341 } 342 343 static inline sector_t 
calc_dev_sboffset(struct block_device *bdev) 344 { 345 sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 346 return MD_NEW_SIZE_BLOCKS(size); 347 } 348 349 static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size) 350 { 351 sector_t size; 352 353 size = rdev->sb_offset; 354 355 if (chunk_size) 356 size &= ~((sector_t)chunk_size/1024 - 1); 357 return size; 358 } 359 360 static int alloc_disk_sb(mdk_rdev_t * rdev) 361 { 362 if (rdev->sb_page) 363 MD_BUG(); 364 365 rdev->sb_page = alloc_page(GFP_KERNEL); 366 if (!rdev->sb_page) { 367 printk(KERN_ALERT "md: out of memory.\n"); 368 return -EINVAL; 369 } 370 371 return 0; 372 } 373 374 static void free_disk_sb(mdk_rdev_t * rdev) 375 { 376 if (rdev->sb_page) { 377 put_page(rdev->sb_page); 378 rdev->sb_loaded = 0; 379 rdev->sb_page = NULL; 380 rdev->sb_offset = 0; 381 rdev->size = 0; 382 } 383 } 384 385 386 static int super_written(struct bio *bio, unsigned int bytes_done, int error) 387 { 388 mdk_rdev_t *rdev = bio->bi_private; 389 mddev_t *mddev = rdev->mddev; 390 if (bio->bi_size) 391 return 1; 392 393 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) 394 md_error(mddev, rdev); 395 396 if (atomic_dec_and_test(&mddev->pending_writes)) 397 wake_up(&mddev->sb_wait); 398 bio_put(bio); 399 return 0; 400 } 401 402 static int super_written_barrier(struct bio *bio, unsigned int bytes_done, int error) 403 { 404 struct bio *bio2 = bio->bi_private; 405 mdk_rdev_t *rdev = bio2->bi_private; 406 mddev_t *mddev = rdev->mddev; 407 if (bio->bi_size) 408 return 1; 409 410 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && 411 error == -EOPNOTSUPP) { 412 unsigned long flags; 413 /* barriers don't appear to be supported :-( */ 414 set_bit(BarriersNotsupp, &rdev->flags); 415 mddev->barriers_work = 0; 416 spin_lock_irqsave(&mddev->write_lock, flags); 417 bio2->bi_next = mddev->biolist; 418 mddev->biolist = bio2; 419 spin_unlock_irqrestore(&mddev->write_lock, flags); 420 wake_up(&mddev->sb_wait); 421 bio_put(bio); 422 return 0; 423 } 424 bio_put(bio2); 425 bio->bi_private = rdev; 426 return super_written(bio, bytes_done, error); 427 } 428 429 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 430 sector_t sector, int size, struct page *page) 431 { 432 /* write first size bytes of page to sector of rdev 433 * Increment mddev->pending_writes before returning 434 * and decrement it on completion, waking up sb_wait 435 * if zero is reached. 436 * If an error occurred, call md_error 437 * 438 * As we might need to resubmit the request if BIO_RW_BARRIER 439 * causes ENOTSUPP, we allocate a spare bio... 440 */ 441 struct bio *bio = bio_alloc(GFP_NOIO, 1); 442 int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC); 443 444 bio->bi_bdev = rdev->bdev; 445 bio->bi_sector = sector; 446 bio_add_page(bio, page, size, 0); 447 bio->bi_private = rdev; 448 bio->bi_end_io = super_written; 449 bio->bi_rw = rw; 450 451 atomic_inc(&mddev->pending_writes); 452 if (!test_bit(BarriersNotsupp, &rdev->flags)) { 453 struct bio *rbio; 454 rw |= (1<<BIO_RW_BARRIER); 455 rbio = bio_clone(bio, GFP_NOIO); 456 rbio->bi_private = bio; 457 rbio->bi_end_io = super_written_barrier; 458 submit_bio(rw, rbio); 459 } else 460 submit_bio(rw, bio); 461 } 462 463 void md_super_wait(mddev_t *mddev) 464 { 465 /* wait for all superblock writes that were scheduled to complete. 
466 * if any had to be retried (due to BARRIER problems), retry them 467 */ 468 DEFINE_WAIT(wq); 469 for(;;) { 470 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); 471 if (atomic_read(&mddev->pending_writes)==0) 472 break; 473 while (mddev->biolist) { 474 struct bio *bio; 475 spin_lock_irq(&mddev->write_lock); 476 bio = mddev->biolist; 477 mddev->biolist = bio->bi_next ; 478 bio->bi_next = NULL; 479 spin_unlock_irq(&mddev->write_lock); 480 submit_bio(bio->bi_rw, bio); 481 } 482 schedule(); 483 } 484 finish_wait(&mddev->sb_wait, &wq); 485 } 486 487 static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) 488 { 489 if (bio->bi_size) 490 return 1; 491 492 complete((struct completion*)bio->bi_private); 493 return 0; 494 } 495 496 int sync_page_io(struct block_device *bdev, sector_t sector, int size, 497 struct page *page, int rw) 498 { 499 struct bio *bio = bio_alloc(GFP_NOIO, 1); 500 struct completion event; 501 int ret; 502 503 rw |= (1 << BIO_RW_SYNC); 504 505 bio->bi_bdev = bdev; 506 bio->bi_sector = sector; 507 bio_add_page(bio, page, size, 0); 508 init_completion(&event); 509 bio->bi_private = &event; 510 bio->bi_end_io = bi_complete; 511 submit_bio(rw, bio); 512 wait_for_completion(&event); 513 514 ret = test_bit(BIO_UPTODATE, &bio->bi_flags); 515 bio_put(bio); 516 return ret; 517 } 518 EXPORT_SYMBOL_GPL(sync_page_io); 519 520 static int read_disk_sb(mdk_rdev_t * rdev, int size) 521 { 522 char b[BDEVNAME_SIZE]; 523 if (!rdev->sb_page) { 524 MD_BUG(); 525 return -EINVAL; 526 } 527 if (rdev->sb_loaded) 528 return 0; 529 530 531 if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ)) 532 goto fail; 533 rdev->sb_loaded = 1; 534 return 0; 535 536 fail: 537 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", 538 bdevname(rdev->bdev,b)); 539 return -EINVAL; 540 } 541 542 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 543 { 544 if ( (sb1->set_uuid0 == sb2->set_uuid0) && 545 (sb1->set_uuid1 == sb2->set_uuid1) && 546 (sb1->set_uuid2 == sb2->set_uuid2) && 547 (sb1->set_uuid3 == sb2->set_uuid3)) 548 549 return 1; 550 551 return 0; 552 } 553 554 555 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 556 { 557 int ret; 558 mdp_super_t *tmp1, *tmp2; 559 560 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 561 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 562 563 if (!tmp1 || !tmp2) { 564 ret = 0; 565 printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); 566 goto abort; 567 } 568 569 *tmp1 = *sb1; 570 *tmp2 = *sb2; 571 572 /* 573 * nr_disks is not constant 574 */ 575 tmp1->nr_disks = 0; 576 tmp2->nr_disks = 0; 577 578 if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) 579 ret = 0; 580 else 581 ret = 1; 582 583 abort: 584 kfree(tmp1); 585 kfree(tmp2); 586 return ret; 587 } 588 589 static unsigned int calc_sb_csum(mdp_super_t * sb) 590 { 591 unsigned int disk_csum, csum; 592 593 disk_csum = sb->sb_csum; 594 sb->sb_csum = 0; 595 csum = csum_partial((void *)sb, MD_SB_BYTES, 0); 596 sb->sb_csum = disk_csum; 597 return csum; 598 } 599 600 601 /* 602 * Handle superblock details. 603 * We want to be able to handle multiple superblock formats 604 * so we have a common interface to them all, and an array of 605 * different handlers. 606 * We rely on user-space to write the initial superblock, and support 607 * reading and updating of superblocks. 608 * Interface methods are: 609 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) 610 * loads and validates a superblock on dev. 
611 * if refdev != NULL, compare superblocks on both devices 612 * Return: 613 * 0 - dev has a superblock that is compatible with refdev 614 * 1 - dev has a superblock that is compatible and newer than refdev 615 * so dev should be used as the refdev in future 616 * -EINVAL superblock incompatible or invalid 617 * -othererror e.g. -EIO 618 * 619 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) 620 * Verify that dev is acceptable into mddev. 621 * The first time, mddev->raid_disks will be 0, and data from 622 * dev should be merged in. Subsequent calls check that dev 623 * is new enough. Return 0 or -EINVAL 624 * 625 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) 626 * Update the superblock for rdev with data in mddev 627 * This does not write to disc. 628 * 629 */ 630 631 struct super_type { 632 char *name; 633 struct module *owner; 634 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); 635 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); 636 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); 637 }; 638 639 /* 640 * load_super for 0.90.0 641 */ 642 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 643 { 644 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 645 mdp_super_t *sb; 646 int ret; 647 sector_t sb_offset; 648 649 /* 650 * Calculate the position of the superblock, 651 * it's at the end of the disk. 652 * 653 * It also happens to be a multiple of 4Kb. 654 */ 655 sb_offset = calc_dev_sboffset(rdev->bdev); 656 rdev->sb_offset = sb_offset; 657 658 ret = read_disk_sb(rdev, MD_SB_BYTES); 659 if (ret) return ret; 660 661 ret = -EINVAL; 662 663 bdevname(rdev->bdev, b); 664 sb = (mdp_super_t*)page_address(rdev->sb_page); 665 666 if (sb->md_magic != MD_SB_MAGIC) { 667 printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 668 b); 669 goto abort; 670 } 671 672 if (sb->major_version != 0 || 673 sb->minor_version < 90 || 674 sb->minor_version > 91) { 675 printk(KERN_WARNING "Bad version number %d.%d on %s\n", 676 sb->major_version, sb->minor_version, 677 b); 678 goto abort; 679 } 680 681 if (sb->raid_disks <= 0) 682 goto abort; 683 684 if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) { 685 printk(KERN_WARNING "md: invalid superblock checksum on %s\n", 686 b); 687 goto abort; 688 } 689 690 rdev->preferred_minor = sb->md_minor; 691 rdev->data_offset = 0; 692 rdev->sb_size = MD_SB_BYTES; 693 694 if (sb->level == LEVEL_MULTIPATH) 695 rdev->desc_nr = -1; 696 else 697 rdev->desc_nr = sb->this_disk.number; 698 699 if (refdev == 0) 700 ret = 1; 701 else { 702 __u64 ev1, ev2; 703 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); 704 if (!uuid_equal(refsb, sb)) { 705 printk(KERN_WARNING "md: %s has different UUID to %s\n", 706 b, bdevname(refdev->bdev,b2)); 707 goto abort; 708 } 709 if (!sb_equal(refsb, sb)) { 710 printk(KERN_WARNING "md: %s has same UUID" 711 " but different superblock to %s\n", 712 b, bdevname(refdev->bdev, b2)); 713 goto abort; 714 } 715 ev1 = md_event(sb); 716 ev2 = md_event(refsb); 717 if (ev1 > ev2) 718 ret = 1; 719 else 720 ret = 0; 721 } 722 rdev->size = calc_dev_size(rdev, sb->chunk_size); 723 724 if (rdev->size < sb->size && sb->level > 1) 725 /* "this cannot possibly happen" ... 
*/ 726 ret = -EINVAL; 727 728 abort: 729 return ret; 730 } 731 732 /* 733 * validate_super for 0.90.0 734 */ 735 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) 736 { 737 mdp_disk_t *desc; 738 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 739 __u64 ev1 = md_event(sb); 740 741 rdev->raid_disk = -1; 742 rdev->flags = 0; 743 if (mddev->raid_disks == 0) { 744 mddev->major_version = 0; 745 mddev->minor_version = sb->minor_version; 746 mddev->patch_version = sb->patch_version; 747 mddev->persistent = ! sb->not_persistent; 748 mddev->chunk_size = sb->chunk_size; 749 mddev->ctime = sb->ctime; 750 mddev->utime = sb->utime; 751 mddev->level = sb->level; 752 mddev->clevel[0] = 0; 753 mddev->layout = sb->layout; 754 mddev->raid_disks = sb->raid_disks; 755 mddev->size = sb->size; 756 mddev->events = ev1; 757 mddev->bitmap_offset = 0; 758 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 759 760 if (mddev->minor_version >= 91) { 761 mddev->reshape_position = sb->reshape_position; 762 mddev->delta_disks = sb->delta_disks; 763 mddev->new_level = sb->new_level; 764 mddev->new_layout = sb->new_layout; 765 mddev->new_chunk = sb->new_chunk; 766 } else { 767 mddev->reshape_position = MaxSector; 768 mddev->delta_disks = 0; 769 mddev->new_level = mddev->level; 770 mddev->new_layout = mddev->layout; 771 mddev->new_chunk = mddev->chunk_size; 772 } 773 774 if (sb->state & (1<<MD_SB_CLEAN)) 775 mddev->recovery_cp = MaxSector; 776 else { 777 if (sb->events_hi == sb->cp_events_hi && 778 sb->events_lo == sb->cp_events_lo) { 779 mddev->recovery_cp = sb->recovery_cp; 780 } else 781 mddev->recovery_cp = 0; 782 } 783 784 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 785 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 786 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 787 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 788 789 mddev->max_disks = MD_SB_DISKS; 790 791 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 792 mddev->bitmap_file == NULL) { 793 if (mddev->level != 1 && mddev->level != 4 794 && mddev->level != 5 && mddev->level != 6 795 && mddev->level != 10) { 796 /* FIXME use a better test */ 797 printk(KERN_WARNING "md: bitmaps not supported for this level.\n"); 798 return -EINVAL; 799 } 800 mddev->bitmap_offset = mddev->default_bitmap_offset; 801 } 802 803 } else if (mddev->pers == NULL) { 804 /* Insist on good event counter while assembling */ 805 ++ev1; 806 if (ev1 < mddev->events) 807 return -EINVAL; 808 } else if (mddev->bitmap) { 809 /* if adding to array with a bitmap, then we can accept an 810 * older device ... but not too old. 
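 *
 * As an illustration (invented numbers): with mddev->events at 100 and
 * the bitmap recording events_cleared == 90, a member whose superblock
 * stopped at event 95 falls through to the slot assignment below and
 * can be brought back with a bitmap-based resync, whereas one that
 * stopped at event 85 is below events_cleared and is only taken on as
 * a spare (raid_disk stays -1).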
811 */ 812 if (ev1 < mddev->bitmap->events_cleared) 813 return 0; 814 } else { 815 if (ev1 < mddev->events) 816 /* just a hot-add of a new device, leave raid_disk at -1 */ 817 return 0; 818 } 819 820 if (mddev->level != LEVEL_MULTIPATH) { 821 desc = sb->disks + rdev->desc_nr; 822 823 if (desc->state & (1<<MD_DISK_FAULTY)) 824 set_bit(Faulty, &rdev->flags); 825 else if (desc->state & (1<<MD_DISK_SYNC) /* && 826 desc->raid_disk < mddev->raid_disks */) { 827 set_bit(In_sync, &rdev->flags); 828 rdev->raid_disk = desc->raid_disk; 829 } 830 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 831 set_bit(WriteMostly, &rdev->flags); 832 } else /* MULTIPATH are always insync */ 833 set_bit(In_sync, &rdev->flags); 834 return 0; 835 } 836 837 /* 838 * sync_super for 0.90.0 839 */ 840 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) 841 { 842 mdp_super_t *sb; 843 struct list_head *tmp; 844 mdk_rdev_t *rdev2; 845 int next_spare = mddev->raid_disks; 846 847 848 /* make rdev->sb match mddev data.. 849 * 850 * 1/ zero out disks 851 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 852 * 3/ any empty disks < next_spare become removed 853 * 854 * disks[0] gets initialised to REMOVED because 855 * we cannot be sure from other fields if it has 856 * been initialised or not. 857 */ 858 int i; 859 int active=0, working=0,failed=0,spare=0,nr_disks=0; 860 861 rdev->sb_size = MD_SB_BYTES; 862 863 sb = (mdp_super_t*)page_address(rdev->sb_page); 864 865 memset(sb, 0, sizeof(*sb)); 866 867 sb->md_magic = MD_SB_MAGIC; 868 sb->major_version = mddev->major_version; 869 sb->patch_version = mddev->patch_version; 870 sb->gvalid_words = 0; /* ignored */ 871 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 872 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 873 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 874 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 875 876 sb->ctime = mddev->ctime; 877 sb->level = mddev->level; 878 sb->size = mddev->size; 879 sb->raid_disks = mddev->raid_disks; 880 sb->md_minor = mddev->md_minor; 881 sb->not_persistent = !mddev->persistent; 882 sb->utime = mddev->utime; 883 sb->state = 0; 884 sb->events_hi = (mddev->events>>32); 885 sb->events_lo = (u32)mddev->events; 886 887 if (mddev->reshape_position == MaxSector) 888 sb->minor_version = 90; 889 else { 890 sb->minor_version = 91; 891 sb->reshape_position = mddev->reshape_position; 892 sb->new_level = mddev->new_level; 893 sb->delta_disks = mddev->delta_disks; 894 sb->new_layout = mddev->new_layout; 895 sb->new_chunk = mddev->new_chunk; 896 } 897 mddev->minor_version = sb->minor_version; 898 if (mddev->in_sync) 899 { 900 sb->recovery_cp = mddev->recovery_cp; 901 sb->cp_events_hi = (mddev->events>>32); 902 sb->cp_events_lo = (u32)mddev->events; 903 if (mddev->recovery_cp == MaxSector) 904 sb->state = (1<< MD_SB_CLEAN); 905 } else 906 sb->recovery_cp = 0; 907 908 sb->layout = mddev->layout; 909 sb->chunk_size = mddev->chunk_size; 910 911 if (mddev->bitmap && mddev->bitmap_file == NULL) 912 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 913 914 sb->disks[0].state = (1<<MD_DISK_REMOVED); 915 ITERATE_RDEV(mddev,rdev2,tmp) { 916 mdp_disk_t *d; 917 int desc_nr; 918 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 919 && !test_bit(Faulty, &rdev2->flags)) 920 desc_nr = rdev2->raid_disk; 921 else 922 desc_nr = next_spare++; 923 rdev2->desc_nr = desc_nr; 924 d = &sb->disks[rdev2->desc_nr]; 925 nr_disks++; 926 d->number = rdev2->desc_nr; 927 d->major = MAJOR(rdev2->bdev->bd_dev); 928 d->minor = MINOR(rdev2->bdev->bd_dev); 929 if (rdev2->raid_disk 
>= 0 && test_bit(In_sync, &rdev2->flags) 930 && !test_bit(Faulty, &rdev2->flags)) 931 d->raid_disk = rdev2->raid_disk; 932 else 933 d->raid_disk = rdev2->desc_nr; /* compatibility */ 934 if (test_bit(Faulty, &rdev2->flags)) 935 d->state = (1<<MD_DISK_FAULTY); 936 else if (test_bit(In_sync, &rdev2->flags)) { 937 d->state = (1<<MD_DISK_ACTIVE); 938 d->state |= (1<<MD_DISK_SYNC); 939 active++; 940 working++; 941 } else { 942 d->state = 0; 943 spare++; 944 working++; 945 } 946 if (test_bit(WriteMostly, &rdev2->flags)) 947 d->state |= (1<<MD_DISK_WRITEMOSTLY); 948 } 949 /* now set the "removed" and "faulty" bits on any missing devices */ 950 for (i=0 ; i < mddev->raid_disks ; i++) { 951 mdp_disk_t *d = &sb->disks[i]; 952 if (d->state == 0 && d->number == 0) { 953 d->number = i; 954 d->raid_disk = i; 955 d->state = (1<<MD_DISK_REMOVED); 956 d->state |= (1<<MD_DISK_FAULTY); 957 failed++; 958 } 959 } 960 sb->nr_disks = nr_disks; 961 sb->active_disks = active; 962 sb->working_disks = working; 963 sb->failed_disks = failed; 964 sb->spare_disks = spare; 965 966 sb->this_disk = sb->disks[rdev->desc_nr]; 967 sb->sb_csum = calc_sb_csum(sb); 968 } 969 970 /* 971 * version 1 superblock 972 */ 973 974 static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) 975 { 976 unsigned int disk_csum, csum; 977 unsigned long long newcsum; 978 int size = 256 + le32_to_cpu(sb->max_dev)*2; 979 unsigned int *isuper = (unsigned int*)sb; 980 int i; 981 982 disk_csum = sb->sb_csum; 983 sb->sb_csum = 0; 984 newcsum = 0; 985 for (i=0; size>=4; size -= 4 ) 986 newcsum += le32_to_cpu(*isuper++); 987 988 if (size == 2) 989 newcsum += le16_to_cpu(*(unsigned short*) isuper); 990 991 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 992 sb->sb_csum = disk_csum; 993 return cpu_to_le32(csum); 994 } 995 996 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 997 { 998 struct mdp_superblock_1 *sb; 999 int ret; 1000 sector_t sb_offset; 1001 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1002 int bmask; 1003 1004 /* 1005 * Calculate the position of the superblock. 1006 * It is always aligned to a 4K boundary and 1007 * depending on minor_version, it can be: 1008 * 0: At least 8K, but less than 12K, from end of device 1009 * 1: At start of device 1010 * 2: 4K from start of device.
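 *
 * A worked example for minor_version 0 (invented device size): a
 * 16777216-sector (8 GiB) device gives 16777216 - 16 = 16777200
 * sectors, which is already a multiple of 8, so after the divide by
 * two the superblock lands at offset 8388600K, i.e. exactly 8K short
 * of the end of the device.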
1011 */ 1012 switch(minor_version) { 1013 case 0: 1014 sb_offset = rdev->bdev->bd_inode->i_size >> 9; 1015 sb_offset -= 8*2; 1016 sb_offset &= ~(sector_t)(4*2-1); 1017 /* convert from sectors to K */ 1018 sb_offset /= 2; 1019 break; 1020 case 1: 1021 sb_offset = 0; 1022 break; 1023 case 2: 1024 sb_offset = 4; 1025 break; 1026 default: 1027 return -EINVAL; 1028 } 1029 rdev->sb_offset = sb_offset; 1030 1031 /* superblock is rarely larger than 1K, but it can be larger, 1032 * and it is safe to read 4k, so we do that 1033 */ 1034 ret = read_disk_sb(rdev, 4096); 1035 if (ret) return ret; 1036 1037 1038 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1039 1040 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1041 sb->major_version != cpu_to_le32(1) || 1042 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1043 le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || 1044 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1045 return -EINVAL; 1046 1047 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1048 printk("md: invalid superblock checksum on %s\n", 1049 bdevname(rdev->bdev,b)); 1050 return -EINVAL; 1051 } 1052 if (le64_to_cpu(sb->data_size) < 10) { 1053 printk("md: data_size too small on %s\n", 1054 bdevname(rdev->bdev,b)); 1055 return -EINVAL; 1056 } 1057 rdev->preferred_minor = 0xffff; 1058 rdev->data_offset = le64_to_cpu(sb->data_offset); 1059 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1060 1061 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1062 bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; 1063 if (rdev->sb_size & bmask) 1064 rdev-> sb_size = (rdev->sb_size | bmask)+1; 1065 1066 if (refdev == 0) 1067 ret = 1; 1068 else { 1069 __u64 ev1, ev2; 1070 struct mdp_superblock_1 *refsb = 1071 (struct mdp_superblock_1*)page_address(refdev->sb_page); 1072 1073 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1074 sb->level != refsb->level || 1075 sb->layout != refsb->layout || 1076 sb->chunksize != refsb->chunksize) { 1077 printk(KERN_WARNING "md: %s has strangely different" 1078 " superblock to %s\n", 1079 bdevname(rdev->bdev,b), 1080 bdevname(refdev->bdev,b2)); 1081 return -EINVAL; 1082 } 1083 ev1 = le64_to_cpu(sb->events); 1084 ev2 = le64_to_cpu(refsb->events); 1085 1086 if (ev1 > ev2) 1087 ret = 1; 1088 else 1089 ret = 0; 1090 } 1091 if (minor_version) 1092 rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; 1093 else 1094 rdev->size = rdev->sb_offset; 1095 if (rdev->size < le64_to_cpu(sb->data_size)/2) 1096 return -EINVAL; 1097 rdev->size = le64_to_cpu(sb->data_size)/2; 1098 if (le32_to_cpu(sb->chunksize)) 1099 rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); 1100 1101 if (le32_to_cpu(sb->size) > rdev->size*2) 1102 return -EINVAL; 1103 return ret; 1104 } 1105 1106 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) 1107 { 1108 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1109 __u64 ev1 = le64_to_cpu(sb->events); 1110 1111 rdev->raid_disk = -1; 1112 rdev->flags = 0; 1113 if (mddev->raid_disks == 0) { 1114 mddev->major_version = 1; 1115 mddev->patch_version = 0; 1116 mddev->persistent = 1; 1117 mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; 1118 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); 1119 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); 1120 mddev->level = le32_to_cpu(sb->level); 1121 mddev->clevel[0] = 0; 1122 mddev->layout = le32_to_cpu(sb->layout); 1123 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1124 
mddev->size = le64_to_cpu(sb->size)/2; 1125 mddev->events = ev1; 1126 mddev->bitmap_offset = 0; 1127 mddev->default_bitmap_offset = 1024 >> 9; 1128 1129 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 1130 memcpy(mddev->uuid, sb->set_uuid, 16); 1131 1132 mddev->max_disks = (4096-256)/2; 1133 1134 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1135 mddev->bitmap_file == NULL ) { 1136 if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6 1137 && mddev->level != 10) { 1138 printk(KERN_WARNING "md: bitmaps not supported for this level.\n"); 1139 return -EINVAL; 1140 } 1141 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); 1142 } 1143 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 1144 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 1145 mddev->delta_disks = le32_to_cpu(sb->delta_disks); 1146 mddev->new_level = le32_to_cpu(sb->new_level); 1147 mddev->new_layout = le32_to_cpu(sb->new_layout); 1148 mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9; 1149 } else { 1150 mddev->reshape_position = MaxSector; 1151 mddev->delta_disks = 0; 1152 mddev->new_level = mddev->level; 1153 mddev->new_layout = mddev->layout; 1154 mddev->new_chunk = mddev->chunk_size; 1155 } 1156 1157 } else if (mddev->pers == NULL) { 1158 /* Insist on good event counter while assembling */ 1159 ++ev1; 1160 if (ev1 < mddev->events) 1161 return -EINVAL; 1162 } else if (mddev->bitmap) { 1163 /* If adding to array with a bitmap, then we can accept an 1164 * older device, but not too old. 1165 */ 1166 if (ev1 < mddev->bitmap->events_cleared) 1167 return 0; 1168 } else { 1169 if (ev1 < mddev->events) 1170 /* just a hot-add of a new device, leave raid_disk at -1 */ 1171 return 0; 1172 } 1173 if (mddev->level != LEVEL_MULTIPATH) { 1174 int role; 1175 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1176 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1177 switch(role) { 1178 case 0xffff: /* spare */ 1179 break; 1180 case 0xfffe: /* faulty */ 1181 set_bit(Faulty, &rdev->flags); 1182 break; 1183 default: 1184 if ((le32_to_cpu(sb->feature_map) & 1185 MD_FEATURE_RECOVERY_OFFSET)) 1186 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 1187 else 1188 set_bit(In_sync, &rdev->flags); 1189 rdev->raid_disk = role; 1190 break; 1191 } 1192 if (sb->devflags & WriteMostly1) 1193 set_bit(WriteMostly, &rdev->flags); 1194 } else /* MULTIPATH are always insync */ 1195 set_bit(In_sync, &rdev->flags); 1196 1197 return 0; 1198 } 1199 1200 static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) 1201 { 1202 struct mdp_superblock_1 *sb; 1203 struct list_head *tmp; 1204 mdk_rdev_t *rdev2; 1205 int max_dev, i; 1206 /* make rdev->sb match mddev and rdev data.
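 *
 * (The dev_roles[] words filled in below use the same encoding that
 * super_1_validate() above decodes: 0xffff marks a spare, 0xfffe marks
 * a faulty device, and anything else is the raid_disk slot number.)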
*/ 1207 1208 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1209 1210 sb->feature_map = 0; 1211 sb->pad0 = 0; 1212 sb->recovery_offset = cpu_to_le64(0); 1213 memset(sb->pad1, 0, sizeof(sb->pad1)); 1214 memset(sb->pad2, 0, sizeof(sb->pad2)); 1215 memset(sb->pad3, 0, sizeof(sb->pad3)); 1216 1217 sb->utime = cpu_to_le64((__u64)mddev->utime); 1218 sb->events = cpu_to_le64(mddev->events); 1219 if (mddev->in_sync) 1220 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 1221 else 1222 sb->resync_offset = cpu_to_le64(0); 1223 1224 sb->cnt_corrected_read = atomic_read(&rdev->corrected_errors); 1225 1226 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 1227 sb->size = cpu_to_le64(mddev->size<<1); 1228 1229 if (mddev->bitmap && mddev->bitmap_file == NULL) { 1230 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1231 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1232 } 1233 1234 if (rdev->raid_disk >= 0 && 1235 !test_bit(In_sync, &rdev->flags) && 1236 rdev->recovery_offset > 0) { 1237 sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 1238 sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); 1239 } 1240 1241 if (mddev->reshape_position != MaxSector) { 1242 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 1243 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 1244 sb->new_layout = cpu_to_le32(mddev->new_layout); 1245 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 1246 sb->new_level = cpu_to_le32(mddev->new_level); 1247 sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9); 1248 } 1249 1250 max_dev = 0; 1251 ITERATE_RDEV(mddev,rdev2,tmp) 1252 if (rdev2->desc_nr+1 > max_dev) 1253 max_dev = rdev2->desc_nr+1; 1254 1255 sb->max_dev = cpu_to_le32(max_dev); 1256 for (i=0; i<max_dev;i++) 1257 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1258 1259 ITERATE_RDEV(mddev,rdev2,tmp) { 1260 i = rdev2->desc_nr; 1261 if (test_bit(Faulty, &rdev2->flags)) 1262 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1263 else if (test_bit(In_sync, &rdev2->flags)) 1264 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1265 else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0) 1266 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1267 else 1268 sb->dev_roles[i] = cpu_to_le16(0xffff); 1269 } 1270 1271 sb->sb_csum = calc_sb_1_csum(sb); 1272 } 1273 1274 1275 static struct super_type super_types[] = { 1276 [0] = { 1277 .name = "0.90.0", 1278 .owner = THIS_MODULE, 1279 .load_super = super_90_load, 1280 .validate_super = super_90_validate, 1281 .sync_super = super_90_sync, 1282 }, 1283 [1] = { 1284 .name = "md-1", 1285 .owner = THIS_MODULE, 1286 .load_super = super_1_load, 1287 .validate_super = super_1_validate, 1288 .sync_super = super_1_sync, 1289 }, 1290 }; 1291 1292 static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev) 1293 { 1294 struct list_head *tmp; 1295 mdk_rdev_t *rdev; 1296 1297 ITERATE_RDEV(mddev,rdev,tmp) 1298 if (rdev->bdev->bd_contains == dev->bdev->bd_contains) 1299 return rdev; 1300 1301 return NULL; 1302 } 1303 1304 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) 1305 { 1306 struct list_head *tmp; 1307 mdk_rdev_t *rdev; 1308 1309 ITERATE_RDEV(mddev1,rdev,tmp) 1310 if (match_dev_unit(mddev2, rdev)) 1311 return 1; 1312 1313 return 0; 1314 } 1315 1316 static LIST_HEAD(pending_raid_disks); 1317 1318 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) 1319 { 1320 mdk_rdev_t *same_pdev; 1321 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1322 struct kobject *ko; 1323 char *s; 1324 1325 if (rdev->mddev) { 1326 MD_BUG(); 
1327 return -EINVAL; 1328 } 1329 /* make sure rdev->size exceeds mddev->size */ 1330 if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) { 1331 if (mddev->pers) 1332 /* Cannot change size, so fail */ 1333 return -ENOSPC; 1334 else 1335 mddev->size = rdev->size; 1336 } 1337 same_pdev = match_dev_unit(mddev, rdev); 1338 if (same_pdev) 1339 printk(KERN_WARNING 1340 "%s: WARNING: %s appears to be on the same physical" 1341 " disk as %s. True\n protection against single-disk" 1342 " failure might be compromised.\n", 1343 mdname(mddev), bdevname(rdev->bdev,b), 1344 bdevname(same_pdev->bdev,b2)); 1345 1346 /* Verify rdev->desc_nr is unique. 1347 * If it is -1, assign a free number, else 1348 * check number is not in use 1349 */ 1350 if (rdev->desc_nr < 0) { 1351 int choice = 0; 1352 if (mddev->pers) choice = mddev->raid_disks; 1353 while (find_rdev_nr(mddev, choice)) 1354 choice++; 1355 rdev->desc_nr = choice; 1356 } else { 1357 if (find_rdev_nr(mddev, rdev->desc_nr)) 1358 return -EBUSY; 1359 } 1360 bdevname(rdev->bdev,b); 1361 if (kobject_set_name(&rdev->kobj, "dev-%s", b) < 0) 1362 return -ENOMEM; 1363 while ( (s=strchr(rdev->kobj.k_name, '/')) != NULL) 1364 *s = '!'; 1365 1366 list_add(&rdev->same_set, &mddev->disks); 1367 rdev->mddev = mddev; 1368 printk(KERN_INFO "md: bind<%s>\n", b); 1369 1370 rdev->kobj.parent = &mddev->kobj; 1371 kobject_add(&rdev->kobj); 1372 1373 if (rdev->bdev->bd_part) 1374 ko = &rdev->bdev->bd_part->kobj; 1375 else 1376 ko = &rdev->bdev->bd_disk->kobj; 1377 sysfs_create_link(&rdev->kobj, ko, "block"); 1378 bd_claim_by_disk(rdev->bdev, rdev, mddev->gendisk); 1379 return 0; 1380 } 1381 1382 static void unbind_rdev_from_array(mdk_rdev_t * rdev) 1383 { 1384 char b[BDEVNAME_SIZE]; 1385 if (!rdev->mddev) { 1386 MD_BUG(); 1387 return; 1388 } 1389 bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); 1390 list_del_init(&rdev->same_set); 1391 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 1392 rdev->mddev = NULL; 1393 sysfs_remove_link(&rdev->kobj, "block"); 1394 kobject_del(&rdev->kobj); 1395 } 1396 1397 /* 1398 * prevent the device from being mounted, repartitioned or 1399 * otherwise reused by a RAID array (or any other kernel 1400 * subsystem), by bd_claiming the device. 
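 *
 * Roughly, the claim/release lifecycle implemented by lock_rdev() and
 * unlock_rdev() below looks like this (a sketch only, error handling
 * omitted):
 *
 *	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
 *	bd_claim(bdev, rdev);
 *	rdev->bdev = bdev;
 *		... the device is in use as an array member ...
 *	bd_release(bdev);
 *	blkdev_put(bdev);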
1401 */ 1402 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) 1403 { 1404 int err = 0; 1405 struct block_device *bdev; 1406 char b[BDEVNAME_SIZE]; 1407 1408 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 1409 if (IS_ERR(bdev)) { 1410 printk(KERN_ERR "md: could not open %s.\n", 1411 __bdevname(dev, b)); 1412 return PTR_ERR(bdev); 1413 } 1414 err = bd_claim(bdev, rdev); 1415 if (err) { 1416 printk(KERN_ERR "md: could not bd_claim %s.\n", 1417 bdevname(bdev, b)); 1418 blkdev_put(bdev); 1419 return err; 1420 } 1421 rdev->bdev = bdev; 1422 return err; 1423 } 1424 1425 static void unlock_rdev(mdk_rdev_t *rdev) 1426 { 1427 struct block_device *bdev = rdev->bdev; 1428 rdev->bdev = NULL; 1429 if (!bdev) 1430 MD_BUG(); 1431 bd_release(bdev); 1432 blkdev_put(bdev); 1433 } 1434 1435 void md_autodetect_dev(dev_t dev); 1436 1437 static void export_rdev(mdk_rdev_t * rdev) 1438 { 1439 char b[BDEVNAME_SIZE]; 1440 printk(KERN_INFO "md: export_rdev(%s)\n", 1441 bdevname(rdev->bdev,b)); 1442 if (rdev->mddev) 1443 MD_BUG(); 1444 free_disk_sb(rdev); 1445 list_del_init(&rdev->same_set); 1446 #ifndef MODULE 1447 md_autodetect_dev(rdev->bdev->bd_dev); 1448 #endif 1449 unlock_rdev(rdev); 1450 kobject_put(&rdev->kobj); 1451 } 1452 1453 static void kick_rdev_from_array(mdk_rdev_t * rdev) 1454 { 1455 unbind_rdev_from_array(rdev); 1456 export_rdev(rdev); 1457 } 1458 1459 static void export_array(mddev_t *mddev) 1460 { 1461 struct list_head *tmp; 1462 mdk_rdev_t *rdev; 1463 1464 ITERATE_RDEV(mddev,rdev,tmp) { 1465 if (!rdev->mddev) { 1466 MD_BUG(); 1467 continue; 1468 } 1469 kick_rdev_from_array(rdev); 1470 } 1471 if (!list_empty(&mddev->disks)) 1472 MD_BUG(); 1473 mddev->raid_disks = 0; 1474 mddev->major_version = 0; 1475 } 1476 1477 static void print_desc(mdp_disk_t *desc) 1478 { 1479 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, 1480 desc->major,desc->minor,desc->raid_disk,desc->state); 1481 } 1482 1483 static void print_sb(mdp_super_t *sb) 1484 { 1485 int i; 1486 1487 printk(KERN_INFO 1488 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", 1489 sb->major_version, sb->minor_version, sb->patch_version, 1490 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, 1491 sb->ctime); 1492 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", 1493 sb->level, sb->size, sb->nr_disks, sb->raid_disks, 1494 sb->md_minor, sb->layout, sb->chunk_size); 1495 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" 1496 " FD:%d SD:%d CSUM:%08x E:%08lx\n", 1497 sb->utime, sb->state, sb->active_disks, sb->working_disks, 1498 sb->failed_disks, sb->spare_disks, 1499 sb->sb_csum, (unsigned long)sb->events_lo); 1500 1501 printk(KERN_INFO); 1502 for (i = 0; i < MD_SB_DISKS; i++) { 1503 mdp_disk_t *desc; 1504 1505 desc = sb->disks + i; 1506 if (desc->number || desc->major || desc->minor || 1507 desc->raid_disk || (desc->state && (desc->state != 4))) { 1508 printk(" D %2d: ", i); 1509 print_desc(desc); 1510 } 1511 } 1512 printk(KERN_INFO "md: THIS: "); 1513 print_desc(&sb->this_disk); 1514 1515 } 1516 1517 static void print_rdev(mdk_rdev_t *rdev) 1518 { 1519 char b[BDEVNAME_SIZE]; 1520 printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", 1521 bdevname(rdev->bdev,b), (unsigned long long)rdev->size, 1522 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), 1523 rdev->desc_nr); 1524 if (rdev->sb_loaded) { 1525 printk(KERN_INFO "md: rdev superblock:\n"); 1526 print_sb((mdp_super_t*)page_address(rdev->sb_page)); 1527 } else 1528 printk(KERN_INFO "md: no rdev superblock!\n"); 1529 } 1530 1531 
static void md_print_devices(void) 1532 { 1533 struct list_head *tmp, *tmp2; 1534 mdk_rdev_t *rdev; 1535 mddev_t *mddev; 1536 char b[BDEVNAME_SIZE]; 1537 1538 printk("\n"); 1539 printk("md: **********************************\n"); 1540 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); 1541 printk("md: **********************************\n"); 1542 ITERATE_MDDEV(mddev,tmp) { 1543 1544 if (mddev->bitmap) 1545 bitmap_print_sb(mddev->bitmap); 1546 else 1547 printk("%s: ", mdname(mddev)); 1548 ITERATE_RDEV(mddev,rdev,tmp2) 1549 printk("<%s>", bdevname(rdev->bdev,b)); 1550 printk("\n"); 1551 1552 ITERATE_RDEV(mddev,rdev,tmp2) 1553 print_rdev(rdev); 1554 } 1555 printk("md: **********************************\n"); 1556 printk("\n"); 1557 } 1558 1559 1560 static void sync_sbs(mddev_t * mddev, int nospares) 1561 { 1562 /* Update each superblock (in-memory image), but 1563 * if we are allowed to, skip spares which already 1564 * have the right event counter, or have one earlier 1565 * (which would mean they aren't being marked as dirty 1566 * with the rest of the array) 1567 */ 1568 mdk_rdev_t *rdev; 1569 struct list_head *tmp; 1570 1571 ITERATE_RDEV(mddev,rdev,tmp) { 1572 if (rdev->sb_events == mddev->events || 1573 (nospares && 1574 rdev->raid_disk < 0 && 1575 (rdev->sb_events&1)==0 && 1576 rdev->sb_events+1 == mddev->events)) { 1577 /* Don't update this superblock */ 1578 rdev->sb_loaded = 2; 1579 } else { 1580 super_types[mddev->major_version]. 1581 sync_super(mddev, rdev); 1582 rdev->sb_loaded = 1; 1583 } 1584 } 1585 } 1586 1587 void md_update_sb(mddev_t * mddev) 1588 { 1589 int err; 1590 struct list_head *tmp; 1591 mdk_rdev_t *rdev; 1592 int sync_req; 1593 int nospares = 0; 1594 1595 repeat: 1596 spin_lock_irq(&mddev->write_lock); 1597 sync_req = mddev->in_sync; 1598 mddev->utime = get_seconds(); 1599 if (mddev->sb_dirty == 3) 1600 /* just a clean<-> dirty transition, possibly leave spares alone, 1601 * though if events isn't the right even/odd, we will have to do 1602 * spares after all 1603 */ 1604 nospares = 1; 1605 1606 /* If this is just a dirty<->clean transition, and the array is clean 1607 * and 'events' is odd, we can roll back to the previous clean state */ 1608 if (mddev->sb_dirty == 3 1609 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 1610 && (mddev->events & 1)) 1611 mddev->events--; 1612 else { 1613 /* otherwise we have to go forward and ... */ 1614 mddev->events ++; 1615 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ 1616 /* .. if the array isn't clean, insist on an odd 'events' */ 1617 if ((mddev->events&1)==0) { 1618 mddev->events++; 1619 nospares = 0; 1620 } 1621 } else { 1622 /* otherwise insist on an even 'events' (for clean states) */ 1623 if ((mddev->events&1)) { 1624 mddev->events++; 1625 nospares = 0; 1626 } 1627 } 1628 } 1629 1630 if (!mddev->events) { 1631 /* 1632 * oops, this 64-bit counter should never wrap. 
1633 * Either we are in around ~1 trillion A.C., assuming 1634 1 reboot per second, or we have a bug: 1635 */ 1636 MD_BUG(); 1637 mddev->events --; 1638 } 1639 mddev->sb_dirty = 2; 1640 sync_sbs(mddev, nospares); 1641 1642 /* 1643 * do not write anything to disk if using 1644 * nonpersistent superblocks 1645 */ 1646 if (!mddev->persistent) { 1647 mddev->sb_dirty = 0; 1648 spin_unlock_irq(&mddev->write_lock); 1649 wake_up(&mddev->sb_wait); 1650 return; 1651 } 1652 spin_unlock_irq(&mddev->write_lock); 1653 1654 dprintk(KERN_INFO 1655 "md: updating %s RAID superblock on device (in sync %d)\n", 1656 mdname(mddev),mddev->in_sync); 1657 1658 err = bitmap_update_sb(mddev->bitmap); 1659 ITERATE_RDEV(mddev,rdev,tmp) { 1660 char b[BDEVNAME_SIZE]; 1661 dprintk(KERN_INFO "md: "); 1662 if (rdev->sb_loaded != 1) 1663 continue; /* no noise on spare devices */ 1664 if (test_bit(Faulty, &rdev->flags)) 1665 dprintk("(skipping faulty "); 1666 1667 dprintk("%s ", bdevname(rdev->bdev,b)); 1668 if (!test_bit(Faulty, &rdev->flags)) { 1669 md_super_write(mddev,rdev, 1670 rdev->sb_offset<<1, rdev->sb_size, 1671 rdev->sb_page); 1672 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", 1673 bdevname(rdev->bdev,b), 1674 (unsigned long long)rdev->sb_offset); 1675 rdev->sb_events = mddev->events; 1676 1677 } else 1678 dprintk(")\n"); 1679 if (mddev->level == LEVEL_MULTIPATH) 1680 /* only need to write one superblock... */ 1681 break; 1682 } 1683 md_super_wait(mddev); 1684 /* if there was a failure, sb_dirty was set to 1, and we re-write super */ 1685 1686 spin_lock_irq(&mddev->write_lock); 1687 if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) { 1688 /* have to write it out again */ 1689 spin_unlock_irq(&mddev->write_lock); 1690 goto repeat; 1691 } 1692 mddev->sb_dirty = 0; 1693 spin_unlock_irq(&mddev->write_lock); 1694 wake_up(&mddev->sb_wait); 1695 1696 } 1697 EXPORT_SYMBOL_GPL(md_update_sb); 1698 1699 /* words written to sysfs files may, or may not, be \n terminated. 1700 * We want to accept either case. For this we use cmd_match. 1701 */ 1702 static int cmd_match(const char *cmd, const char *str) 1703 { 1704 /* See if cmd, written into a sysfs file, matches 1705 * str.
They must either be the same, or cmd can 1706 * have a trailing newline 1707 */ 1708 while (*cmd && *str && *cmd == *str) { 1709 cmd++; 1710 str++; 1711 } 1712 if (*cmd == '\n') 1713 cmd++; 1714 if (*str || *cmd) 1715 return 0; 1716 return 1; 1717 } 1718 1719 struct rdev_sysfs_entry { 1720 struct attribute attr; 1721 ssize_t (*show)(mdk_rdev_t *, char *); 1722 ssize_t (*store)(mdk_rdev_t *, const char *, size_t); 1723 }; 1724 1725 static ssize_t 1726 state_show(mdk_rdev_t *rdev, char *page) 1727 { 1728 char *sep = ""; 1729 int len=0; 1730 1731 if (test_bit(Faulty, &rdev->flags)) { 1732 len+= sprintf(page+len, "%sfaulty",sep); 1733 sep = ","; 1734 } 1735 if (test_bit(In_sync, &rdev->flags)) { 1736 len += sprintf(page+len, "%sin_sync",sep); 1737 sep = ","; 1738 } 1739 if (test_bit(WriteMostly, &rdev->flags)) { 1740 len += sprintf(page+len, "%swrite_mostly",sep); 1741 sep = ","; 1742 } 1743 if (!test_bit(Faulty, &rdev->flags) && 1744 !test_bit(In_sync, &rdev->flags)) { 1745 len += sprintf(page+len, "%sspare", sep); 1746 sep = ","; 1747 } 1748 return len+sprintf(page+len, "\n"); 1749 } 1750 1751 static ssize_t 1752 state_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1753 { 1754 /* can write 1755 * faulty - simulates an error 1756 * remove - disconnects the device 1757 * writemostly - sets write_mostly 1758 * -writemostly - clears write_mostly 1759 */ 1760 int err = -EINVAL; 1761 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 1762 md_error(rdev->mddev, rdev); 1763 err = 0; 1764 } else if (cmd_match(buf, "remove")) { 1765 if (rdev->raid_disk >= 0) 1766 err = -EBUSY; 1767 else { 1768 mddev_t *mddev = rdev->mddev; 1769 kick_rdev_from_array(rdev); 1770 md_update_sb(mddev); 1771 md_new_event(mddev); 1772 err = 0; 1773 } 1774 } else if (cmd_match(buf, "writemostly")) { 1775 set_bit(WriteMostly, &rdev->flags); 1776 err = 0; 1777 } else if (cmd_match(buf, "-writemostly")) { 1778 clear_bit(WriteMostly, &rdev->flags); 1779 err = 0; 1780 } 1781 return err ?
err : len; 1782 } 1783 static struct rdev_sysfs_entry 1784 rdev_state = __ATTR(state, 0644, state_show, state_store); 1785 1786 static ssize_t 1787 super_show(mdk_rdev_t *rdev, char *page) 1788 { 1789 if (rdev->sb_loaded && rdev->sb_size) { 1790 memcpy(page, page_address(rdev->sb_page), rdev->sb_size); 1791 return rdev->sb_size; 1792 } else 1793 return 0; 1794 } 1795 static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super); 1796 1797 static ssize_t 1798 errors_show(mdk_rdev_t *rdev, char *page) 1799 { 1800 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 1801 } 1802 1803 static ssize_t 1804 errors_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1805 { 1806 char *e; 1807 unsigned long n = simple_strtoul(buf, &e, 10); 1808 if (*buf && (*e == 0 || *e == '\n')) { 1809 atomic_set(&rdev->corrected_errors, n); 1810 return len; 1811 } 1812 return -EINVAL; 1813 } 1814 static struct rdev_sysfs_entry rdev_errors = 1815 __ATTR(errors, 0644, errors_show, errors_store); 1816 1817 static ssize_t 1818 slot_show(mdk_rdev_t *rdev, char *page) 1819 { 1820 if (rdev->raid_disk < 0) 1821 return sprintf(page, "none\n"); 1822 else 1823 return sprintf(page, "%d\n", rdev->raid_disk); 1824 } 1825 1826 static ssize_t 1827 slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1828 { 1829 char *e; 1830 int slot = simple_strtoul(buf, &e, 10); 1831 if (strncmp(buf, "none", 4)==0) 1832 slot = -1; 1833 else if (e==buf || (*e && *e!= '\n')) 1834 return -EINVAL; 1835 if (rdev->mddev->pers) 1836 /* Cannot set slot in active array (yet) */ 1837 return -EBUSY; 1838 if (slot >= rdev->mddev->raid_disks) 1839 return -ENOSPC; 1840 rdev->raid_disk = slot; 1841 /* assume it is working */ 1842 rdev->flags = 0; 1843 set_bit(In_sync, &rdev->flags); 1844 return len; 1845 } 1846 1847 1848 static struct rdev_sysfs_entry rdev_slot = 1849 __ATTR(slot, 0644, slot_show, slot_store); 1850 1851 static ssize_t 1852 offset_show(mdk_rdev_t *rdev, char *page) 1853 { 1854 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 1855 } 1856 1857 static ssize_t 1858 offset_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1859 { 1860 char *e; 1861 unsigned long long offset = simple_strtoull(buf, &e, 10); 1862 if (e==buf || (*e && *e != '\n')) 1863 return -EINVAL; 1864 if (rdev->mddev->pers) 1865 return -EBUSY; 1866 rdev->data_offset = offset; 1867 return len; 1868 } 1869 1870 static struct rdev_sysfs_entry rdev_offset = 1871 __ATTR(offset, 0644, offset_show, offset_store); 1872 1873 static ssize_t 1874 rdev_size_show(mdk_rdev_t *rdev, char *page) 1875 { 1876 return sprintf(page, "%llu\n", (unsigned long long)rdev->size); 1877 } 1878 1879 static ssize_t 1880 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1881 { 1882 char *e; 1883 unsigned long long size = simple_strtoull(buf, &e, 10); 1884 if (e==buf || (*e && *e != '\n')) 1885 return -EINVAL; 1886 if (rdev->mddev->pers) 1887 return -EBUSY; 1888 rdev->size = size; 1889 if (size < rdev->mddev->size || rdev->mddev->size == 0) 1890 rdev->mddev->size = size; 1891 return len; 1892 } 1893 1894 static struct rdev_sysfs_entry rdev_size = 1895 __ATTR(size, 0644, rdev_size_show, rdev_size_store); 1896 1897 static struct attribute *rdev_default_attrs[] = { 1898 &rdev_state.attr, 1899 &rdev_super.attr, 1900 &rdev_errors.attr, 1901 &rdev_slot.attr, 1902 &rdev_offset.attr, 1903 &rdev_size.attr, 1904 NULL, 1905 }; 1906 static ssize_t 1907 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 1908 { 1909 struct rdev_sysfs_entry *entry = 
container_of(attr, struct rdev_sysfs_entry, attr); 1910 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 1911 1912 if (!entry->show) 1913 return -EIO; 1914 return entry->show(rdev, page); 1915 } 1916 1917 static ssize_t 1918 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 1919 const char *page, size_t length) 1920 { 1921 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 1922 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 1923 1924 if (!entry->store) 1925 return -EIO; 1926 return entry->store(rdev, page, length); 1927 } 1928 1929 static void rdev_free(struct kobject *ko) 1930 { 1931 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); 1932 kfree(rdev); 1933 } 1934 static struct sysfs_ops rdev_sysfs_ops = { 1935 .show = rdev_attr_show, 1936 .store = rdev_attr_store, 1937 }; 1938 static struct kobj_type rdev_ktype = { 1939 .release = rdev_free, 1940 .sysfs_ops = &rdev_sysfs_ops, 1941 .default_attrs = rdev_default_attrs, 1942 }; 1943 1944 /* 1945 * Import a device. If 'super_format' >= 0, then sanity check the superblock 1946 * 1947 * mark the device faulty if: 1948 * 1949 * - the device is nonexistent (zero size) 1950 * - the device has no valid superblock 1951 * 1952 * a faulty rdev _never_ has rdev->sb set. 1953 */ 1954 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) 1955 { 1956 char b[BDEVNAME_SIZE]; 1957 int err; 1958 mdk_rdev_t *rdev; 1959 sector_t size; 1960 1961 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 1962 if (!rdev) { 1963 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 1964 return ERR_PTR(-ENOMEM); 1965 } 1966 1967 if ((err = alloc_disk_sb(rdev))) 1968 goto abort_free; 1969 1970 err = lock_rdev(rdev, newdev); 1971 if (err) 1972 goto abort_free; 1973 1974 rdev->kobj.parent = NULL; 1975 rdev->kobj.ktype = &rdev_ktype; 1976 kobject_init(&rdev->kobj); 1977 1978 rdev->desc_nr = -1; 1979 rdev->flags = 0; 1980 rdev->data_offset = 0; 1981 rdev->sb_events = 0; 1982 atomic_set(&rdev->nr_pending, 0); 1983 atomic_set(&rdev->read_errors, 0); 1984 atomic_set(&rdev->corrected_errors, 0); 1985 1986 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 1987 if (!size) { 1988 printk(KERN_WARNING 1989 "md: %s has zero or unknown size, marking faulty!\n", 1990 bdevname(rdev->bdev,b)); 1991 err = -EINVAL; 1992 goto abort_free; 1993 } 1994 1995 if (super_format >= 0) { 1996 err = super_types[super_format]. 1997 load_super(rdev, NULL, super_minor); 1998 if (err == -EINVAL) { 1999 printk(KERN_WARNING 2000 "md: %s has invalid sb, not importing!\n", 2001 bdevname(rdev->bdev,b)); 2002 goto abort_free; 2003 } 2004 if (err < 0) { 2005 printk(KERN_WARNING 2006 "md: could not read %s's sb, not importing!\n", 2007 bdevname(rdev->bdev,b)); 2008 goto abort_free; 2009 } 2010 } 2011 INIT_LIST_HEAD(&rdev->same_set); 2012 2013 return rdev; 2014 2015 abort_free: 2016 if (rdev->sb_page) { 2017 if (rdev->bdev) 2018 unlock_rdev(rdev); 2019 free_disk_sb(rdev); 2020 } 2021 kfree(rdev); 2022 return ERR_PTR(err); 2023 } 2024 2025 /* 2026 * Check a full RAID array for plausibility 2027 */ 2028 2029 2030 static void analyze_sbs(mddev_t * mddev) 2031 { 2032 int i; 2033 struct list_head *tmp; 2034 mdk_rdev_t *rdev, *freshest; 2035 char b[BDEVNAME_SIZE]; 2036 2037 freshest = NULL; 2038 ITERATE_RDEV(mddev,rdev,tmp) 2039 switch (super_types[mddev->major_version]. 
2040 load_super(rdev, freshest, mddev->minor_version)) { 2041 case 1: 2042 freshest = rdev; 2043 break; 2044 case 0: 2045 break; 2046 default: 2047 printk( KERN_ERR \ 2048 "md: fatal superblock inconsistency in %s" 2049 " -- removing from array\n", 2050 bdevname(rdev->bdev,b)); 2051 kick_rdev_from_array(rdev); 2052 } 2053 2054 2055 super_types[mddev->major_version]. 2056 validate_super(mddev, freshest); 2057 2058 i = 0; 2059 ITERATE_RDEV(mddev,rdev,tmp) { 2060 if (rdev != freshest) 2061 if (super_types[mddev->major_version]. 2062 validate_super(mddev, rdev)) { 2063 printk(KERN_WARNING "md: kicking non-fresh %s" 2064 " from array!\n", 2065 bdevname(rdev->bdev,b)); 2066 kick_rdev_from_array(rdev); 2067 continue; 2068 } 2069 if (mddev->level == LEVEL_MULTIPATH) { 2070 rdev->desc_nr = i++; 2071 rdev->raid_disk = rdev->desc_nr; 2072 set_bit(In_sync, &rdev->flags); 2073 } 2074 } 2075 2076 2077 2078 if (mddev->recovery_cp != MaxSector && 2079 mddev->level >= 1) 2080 printk(KERN_ERR "md: %s: raid array is not clean" 2081 " -- starting background reconstruction\n", 2082 mdname(mddev)); 2083 2084 } 2085 2086 static ssize_t 2087 safe_delay_show(mddev_t *mddev, char *page) 2088 { 2089 int msec = (mddev->safemode_delay*1000)/HZ; 2090 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); 2091 } 2092 static ssize_t 2093 safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) 2094 { 2095 int scale=1; 2096 int dot=0; 2097 int i; 2098 unsigned long msec; 2099 char buf[30]; 2100 char *e; 2101 /* remove a period, and count digits after it */ 2102 if (len >= sizeof(buf)) 2103 return -EINVAL; 2104 strlcpy(buf, cbuf, len); 2105 buf[len] = 0; 2106 for (i=0; i<len; i++) { 2107 if (dot) { 2108 if (isdigit(buf[i])) { 2109 buf[i-1] = buf[i]; 2110 scale *= 10; 2111 } 2112 buf[i] = 0; 2113 } else if (buf[i] == '.') { 2114 dot=1; 2115 buf[i] = 0; 2116 } 2117 } 2118 msec = simple_strtoul(buf, &e, 10); 2119 if (e == buf || (*e && *e != '\n')) 2120 return -EINVAL; 2121 msec = (msec * 1000) / scale; 2122 if (msec == 0) 2123 mddev->safemode_delay = 0; 2124 else { 2125 mddev->safemode_delay = (msec*HZ)/1000; 2126 if (mddev->safemode_delay == 0) 2127 mddev->safemode_delay = 1; 2128 } 2129 return len; 2130 } 2131 static struct md_sysfs_entry md_safe_delay = 2132 __ATTR(safe_mode_delay, 0644,safe_delay_show, safe_delay_store); 2133 2134 static ssize_t 2135 level_show(mddev_t *mddev, char *page) 2136 { 2137 struct mdk_personality *p = mddev->pers; 2138 if (p) 2139 return sprintf(page, "%s\n", p->name); 2140 else if (mddev->clevel[0]) 2141 return sprintf(page, "%s\n", mddev->clevel); 2142 else if (mddev->level != LEVEL_NONE) 2143 return sprintf(page, "%d\n", mddev->level); 2144 else 2145 return 0; 2146 } 2147 2148 static ssize_t 2149 level_store(mddev_t *mddev, const char *buf, size_t len) 2150 { 2151 int rv = len; 2152 if (mddev->pers) 2153 return -EBUSY; 2154 if (len == 0) 2155 return 0; 2156 if (len >= sizeof(mddev->clevel)) 2157 return -ENOSPC; 2158 strncpy(mddev->clevel, buf, len); 2159 if (mddev->clevel[len-1] == '\n') 2160 len--; 2161 mddev->clevel[len] = 0; 2162 mddev->level = LEVEL_NONE; 2163 return rv; 2164 } 2165 2166 static struct md_sysfs_entry md_level = 2167 __ATTR(level, 0644, level_show, level_store); 2168 2169 2170 static ssize_t 2171 layout_show(mddev_t *mddev, char *page) 2172 { 2173 /* just a number, not meaningful for all levels */ 2174 return sprintf(page, "%d\n", mddev->layout); 2175 } 2176 2177 static ssize_t 2178 layout_store(mddev_t *mddev, const char *buf, size_t len) 2179 { 2180 char *e; 2181 
	unsigned long n = simple_strtoul(buf, &e, 10);
	if (mddev->pers)
		return -EBUSY;

	if (!*buf || (*e && *e != '\n'))
		return -EINVAL;

	mddev->layout = n;
	return len;
}
static struct md_sysfs_entry md_layout =
__ATTR(layout, 0644, layout_show, layout_store);


static ssize_t
raid_disks_show(mddev_t *mddev, char *page)
{
	if (mddev->raid_disks == 0)
		return 0;
	return sprintf(page, "%d\n", mddev->raid_disks);
}

static int update_raid_disks(mddev_t *mddev, int raid_disks);

static ssize_t
raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
{
	/* can only set raid_disks if array is not yet active */
	char *e;
	int rv = 0;
	unsigned long n = simple_strtoul(buf, &e, 10);

	if (!*buf || (*e && *e != '\n'))
		return -EINVAL;

	if (mddev->pers)
		rv = update_raid_disks(mddev, n);
	else
		mddev->raid_disks = n;
	return rv ? rv : len;
}
static struct md_sysfs_entry md_raid_disks =
__ATTR(raid_disks, 0644, raid_disks_show, raid_disks_store);

static ssize_t
chunk_size_show(mddev_t *mddev, char *page)
{
	return sprintf(page, "%d\n", mddev->chunk_size);
}

static ssize_t
chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
{
	/* can only set chunk_size if array is not yet active */
	char *e;
	unsigned long n = simple_strtoul(buf, &e, 10);

	if (mddev->pers)
		return -EBUSY;
	if (!*buf || (*e && *e != '\n'))
		return -EINVAL;

	mddev->chunk_size = n;
	return len;
}
static struct md_sysfs_entry md_chunk_size =
__ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store);

static ssize_t
resync_start_show(mddev_t *mddev, char *page)
{
	return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
}

static ssize_t
resync_start_store(mddev_t *mddev, const char *buf, size_t len)
{
	/* can only set resync_start if array is not yet active */
	char *e;
	unsigned long long n = simple_strtoull(buf, &e, 10);

	if (mddev->pers)
		return -EBUSY;
	if (!*buf || (*e && *e != '\n'))
		return -EINVAL;

	mddev->recovery_cp = n;
	return len;
}
static struct md_sysfs_entry md_resync_start =
__ATTR(resync_start, 0644, resync_start_show, resync_start_store);

/*
 * The array state can be:
 *
 * clear
 *     No devices, no size, no level
 *     Equivalent to STOP_ARRAY ioctl
 * inactive
 *     May have some settings, but array is not active
 *        all IO results in error
 *     When written, doesn't tear down array, but just stops it
 * suspended (not supported yet)
 *     All IO requests will block. The array can be reconfigured.
 *     Writing this, if accepted, will block until array is quiescent
 * readonly
 *     no resync can happen.  no superblocks get written.
 *     write requests fail
 * read-auto
 *     like readonly, but behaves like 'clean' on a write request.
 *
 * clean - no pending writes, but otherwise active.
 *     When written to inactive array, starts without resync
 *     If a write request arrives then
 *       if metadata is known, mark 'dirty' and switch to 'active'.
 *       if not known, block and switch to write-pending
 *     If written to an active array that has pending writes, then fails.
2298 * active 2299 * fully active: IO and resync can be happening. 2300 * When written to inactive array, starts with resync 2301 * 2302 * write-pending 2303 * clean, but writes are blocked waiting for 'active' to be written. 2304 * 2305 * active-idle 2306 * like active, but no writes have been seen for a while (100msec). 2307 * 2308 */ 2309 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 2310 write_pending, active_idle, bad_word}; 2311 static char *array_states[] = { 2312 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 2313 "write-pending", "active-idle", NULL }; 2314 2315 static int match_word(const char *word, char **list) 2316 { 2317 int n; 2318 for (n=0; list[n]; n++) 2319 if (cmd_match(word, list[n])) 2320 break; 2321 return n; 2322 } 2323 2324 static ssize_t 2325 array_state_show(mddev_t *mddev, char *page) 2326 { 2327 enum array_state st = inactive; 2328 2329 if (mddev->pers) 2330 switch(mddev->ro) { 2331 case 1: 2332 st = readonly; 2333 break; 2334 case 2: 2335 st = read_auto; 2336 break; 2337 case 0: 2338 if (mddev->in_sync) 2339 st = clean; 2340 else if (mddev->safemode) 2341 st = active_idle; 2342 else 2343 st = active; 2344 } 2345 else { 2346 if (list_empty(&mddev->disks) && 2347 mddev->raid_disks == 0 && 2348 mddev->size == 0) 2349 st = clear; 2350 else 2351 st = inactive; 2352 } 2353 return sprintf(page, "%s\n", array_states[st]); 2354 } 2355 2356 static int do_md_stop(mddev_t * mddev, int ro); 2357 static int do_md_run(mddev_t * mddev); 2358 static int restart_array(mddev_t *mddev); 2359 2360 static ssize_t 2361 array_state_store(mddev_t *mddev, const char *buf, size_t len) 2362 { 2363 int err = -EINVAL; 2364 enum array_state st = match_word(buf, array_states); 2365 switch(st) { 2366 case bad_word: 2367 break; 2368 case clear: 2369 /* stopping an active array */ 2370 if (mddev->pers) { 2371 if (atomic_read(&mddev->active) > 1) 2372 return -EBUSY; 2373 err = do_md_stop(mddev, 0); 2374 } 2375 break; 2376 case inactive: 2377 /* stopping an active array */ 2378 if (mddev->pers) { 2379 if (atomic_read(&mddev->active) > 1) 2380 return -EBUSY; 2381 err = do_md_stop(mddev, 2); 2382 } 2383 break; 2384 case suspended: 2385 break; /* not supported yet */ 2386 case readonly: 2387 if (mddev->pers) 2388 err = do_md_stop(mddev, 1); 2389 else { 2390 mddev->ro = 1; 2391 err = do_md_run(mddev); 2392 } 2393 break; 2394 case read_auto: 2395 /* stopping an active array */ 2396 if (mddev->pers) { 2397 err = do_md_stop(mddev, 1); 2398 if (err == 0) 2399 mddev->ro = 2; /* FIXME mark devices writable */ 2400 } else { 2401 mddev->ro = 2; 2402 err = do_md_run(mddev); 2403 } 2404 break; 2405 case clean: 2406 if (mddev->pers) { 2407 restart_array(mddev); 2408 spin_lock_irq(&mddev->write_lock); 2409 if (atomic_read(&mddev->writes_pending) == 0) { 2410 mddev->in_sync = 1; 2411 mddev->sb_dirty = 1; 2412 } 2413 spin_unlock_irq(&mddev->write_lock); 2414 } else { 2415 mddev->ro = 0; 2416 mddev->recovery_cp = MaxSector; 2417 err = do_md_run(mddev); 2418 } 2419 break; 2420 case active: 2421 if (mddev->pers) { 2422 restart_array(mddev); 2423 mddev->sb_dirty = 0; 2424 wake_up(&mddev->sb_wait); 2425 err = 0; 2426 } else { 2427 mddev->ro = 0; 2428 err = do_md_run(mddev); 2429 } 2430 break; 2431 case write_pending: 2432 case active_idle: 2433 /* these cannot be set */ 2434 break; 2435 } 2436 if (err) 2437 return err; 2438 else 2439 return len; 2440 } 2441 static struct md_sysfs_entry md_array_state = __ATTR(array_state, 0644, array_state_show, 
array_state_store);

static ssize_t
null_show(mddev_t *mddev, char *page)
{
	return -EINVAL;
}

static ssize_t
new_dev_store(mddev_t *mddev, const char *buf, size_t len)
{
	/* buf must be of the form %d:%d (optionally followed by a newline),
	 * giving major and minor numbers.
	 */
	/* The new device is added to the array.
	 * If the array has a persistent superblock, we read the
	 * superblock to initialise info and check validity.
	 * Otherwise, only checking done is that in bind_rdev_to_array,
	 * which mainly checks size.
	 */
	char *e;
	int major = simple_strtoul(buf, &e, 10);
	int minor;
	dev_t dev;
	mdk_rdev_t *rdev;
	int err;

	if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
		return -EINVAL;
	minor = simple_strtoul(e+1, &e, 10);
	if (*e && *e != '\n')
		return -EINVAL;
	dev = MKDEV(major, minor);
	if (major != MAJOR(dev) ||
	    minor != MINOR(dev))
		return -EOVERFLOW;


	if (mddev->persistent) {
		rdev = md_import_device(dev, mddev->major_version,
					mddev->minor_version);
		if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
			mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
						       mdk_rdev_t, same_set);
			err = super_types[mddev->major_version]
				.load_super(rdev, rdev0, mddev->minor_version);
			if (err < 0)
				goto out;
		}
	} else
		rdev = md_import_device(dev, -1, -1);

	if (IS_ERR(rdev))
		return PTR_ERR(rdev);
	err = bind_rdev_to_array(rdev, mddev);
 out:
	if (err)
		export_rdev(rdev);
	return err ? err : len;
}

static struct md_sysfs_entry md_new_device =
__ATTR(new_dev, 0200, null_show, new_dev_store);

static ssize_t
size_show(mddev_t *mddev, char *page)
{
	return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
}

static int update_size(mddev_t *mddev, unsigned long size);

static ssize_t
size_store(mddev_t *mddev, const char *buf, size_t len)
{
	/* If array is inactive, we can reduce the component size, but
	 * not increase it (except from 0).
	 * If array is active, we can try an on-line resize
	 */
	char *e;
	int err = 0;
	unsigned long long size = simple_strtoull(buf, &e, 10);
	if (!*buf || *buf == '\n' ||
	    (*e && *e != '\n'))
		return -EINVAL;

	if (mddev->pers) {
		err = update_size(mddev, size);
		md_update_sb(mddev);
	} else {
		if (mddev->size == 0 ||
		    mddev->size > size)
			mddev->size = size;
		else
			err = -ENOSPC;
	}
	return err ? err : len;
}

static struct md_sysfs_entry md_size =
__ATTR(component_size, 0644, size_show, size_store);


/* Metadata version.
2543 * This is either 'none' for arrays with externally managed metadata, 2544 * or N.M for internally known formats 2545 */ 2546 static ssize_t 2547 metadata_show(mddev_t *mddev, char *page) 2548 { 2549 if (mddev->persistent) 2550 return sprintf(page, "%d.%d\n", 2551 mddev->major_version, mddev->minor_version); 2552 else 2553 return sprintf(page, "none\n"); 2554 } 2555 2556 static ssize_t 2557 metadata_store(mddev_t *mddev, const char *buf, size_t len) 2558 { 2559 int major, minor; 2560 char *e; 2561 if (!list_empty(&mddev->disks)) 2562 return -EBUSY; 2563 2564 if (cmd_match(buf, "none")) { 2565 mddev->persistent = 0; 2566 mddev->major_version = 0; 2567 mddev->minor_version = 90; 2568 return len; 2569 } 2570 major = simple_strtoul(buf, &e, 10); 2571 if (e==buf || *e != '.') 2572 return -EINVAL; 2573 buf = e+1; 2574 minor = simple_strtoul(buf, &e, 10); 2575 if (e==buf || *e != '\n') 2576 return -EINVAL; 2577 if (major >= sizeof(super_types)/sizeof(super_types[0]) || 2578 super_types[major].name == NULL) 2579 return -ENOENT; 2580 mddev->major_version = major; 2581 mddev->minor_version = minor; 2582 mddev->persistent = 1; 2583 return len; 2584 } 2585 2586 static struct md_sysfs_entry md_metadata = 2587 __ATTR(metadata_version, 0644, metadata_show, metadata_store); 2588 2589 static ssize_t 2590 action_show(mddev_t *mddev, char *page) 2591 { 2592 char *type = "idle"; 2593 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 2594 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) { 2595 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 2596 type = "reshape"; 2597 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 2598 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 2599 type = "resync"; 2600 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 2601 type = "check"; 2602 else 2603 type = "repair"; 2604 } else 2605 type = "recover"; 2606 } 2607 return sprintf(page, "%s\n", type); 2608 } 2609 2610 static ssize_t 2611 action_store(mddev_t *mddev, const char *page, size_t len) 2612 { 2613 if (!mddev->pers || !mddev->pers->sync_request) 2614 return -EINVAL; 2615 2616 if (cmd_match(page, "idle")) { 2617 if (mddev->sync_thread) { 2618 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2619 md_unregister_thread(mddev->sync_thread); 2620 mddev->sync_thread = NULL; 2621 mddev->recovery = 0; 2622 } 2623 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 2624 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 2625 return -EBUSY; 2626 else if (cmd_match(page, "resync") || cmd_match(page, "recover")) 2627 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2628 else if (cmd_match(page, "reshape")) { 2629 int err; 2630 if (mddev->pers->start_reshape == NULL) 2631 return -EINVAL; 2632 err = mddev->pers->start_reshape(mddev); 2633 if (err) 2634 return err; 2635 } else { 2636 if (cmd_match(page, "check")) 2637 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 2638 else if (!cmd_match(page, "repair")) 2639 return -EINVAL; 2640 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 2641 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 2642 } 2643 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2644 md_wakeup_thread(mddev->thread); 2645 return len; 2646 } 2647 2648 static ssize_t 2649 mismatch_cnt_show(mddev_t *mddev, char *page) 2650 { 2651 return sprintf(page, "%llu\n", 2652 (unsigned long long) mddev->resync_mismatches); 2653 } 2654 2655 static struct md_sysfs_entry 2656 md_scan_mode = __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 2657 2658 2659 static struct md_sysfs_entry 2660 md_mismatches 
= __ATTR_RO(mismatch_cnt); 2661 2662 static ssize_t 2663 sync_min_show(mddev_t *mddev, char *page) 2664 { 2665 return sprintf(page, "%d (%s)\n", speed_min(mddev), 2666 mddev->sync_speed_min ? "local": "system"); 2667 } 2668 2669 static ssize_t 2670 sync_min_store(mddev_t *mddev, const char *buf, size_t len) 2671 { 2672 int min; 2673 char *e; 2674 if (strncmp(buf, "system", 6)==0) { 2675 mddev->sync_speed_min = 0; 2676 return len; 2677 } 2678 min = simple_strtoul(buf, &e, 10); 2679 if (buf == e || (*e && *e != '\n') || min <= 0) 2680 return -EINVAL; 2681 mddev->sync_speed_min = min; 2682 return len; 2683 } 2684 2685 static struct md_sysfs_entry md_sync_min = 2686 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 2687 2688 static ssize_t 2689 sync_max_show(mddev_t *mddev, char *page) 2690 { 2691 return sprintf(page, "%d (%s)\n", speed_max(mddev), 2692 mddev->sync_speed_max ? "local": "system"); 2693 } 2694 2695 static ssize_t 2696 sync_max_store(mddev_t *mddev, const char *buf, size_t len) 2697 { 2698 int max; 2699 char *e; 2700 if (strncmp(buf, "system", 6)==0) { 2701 mddev->sync_speed_max = 0; 2702 return len; 2703 } 2704 max = simple_strtoul(buf, &e, 10); 2705 if (buf == e || (*e && *e != '\n') || max <= 0) 2706 return -EINVAL; 2707 mddev->sync_speed_max = max; 2708 return len; 2709 } 2710 2711 static struct md_sysfs_entry md_sync_max = 2712 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 2713 2714 2715 static ssize_t 2716 sync_speed_show(mddev_t *mddev, char *page) 2717 { 2718 unsigned long resync, dt, db; 2719 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active)); 2720 dt = ((jiffies - mddev->resync_mark) / HZ); 2721 if (!dt) dt++; 2722 db = resync - (mddev->resync_mark_cnt); 2723 return sprintf(page, "%ld\n", db/dt/2); /* K/sec */ 2724 } 2725 2726 static struct md_sysfs_entry 2727 md_sync_speed = __ATTR_RO(sync_speed); 2728 2729 static ssize_t 2730 sync_completed_show(mddev_t *mddev, char *page) 2731 { 2732 unsigned long max_blocks, resync; 2733 2734 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 2735 max_blocks = mddev->resync_max_sectors; 2736 else 2737 max_blocks = mddev->size << 1; 2738 2739 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active)); 2740 return sprintf(page, "%lu / %lu\n", resync, max_blocks); 2741 } 2742 2743 static struct md_sysfs_entry 2744 md_sync_completed = __ATTR_RO(sync_completed); 2745 2746 static ssize_t 2747 suspend_lo_show(mddev_t *mddev, char *page) 2748 { 2749 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 2750 } 2751 2752 static ssize_t 2753 suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) 2754 { 2755 char *e; 2756 unsigned long long new = simple_strtoull(buf, &e, 10); 2757 2758 if (mddev->pers->quiesce == NULL) 2759 return -EINVAL; 2760 if (buf == e || (*e && *e != '\n')) 2761 return -EINVAL; 2762 if (new >= mddev->suspend_hi || 2763 (new > mddev->suspend_lo && new < mddev->suspend_hi)) { 2764 mddev->suspend_lo = new; 2765 mddev->pers->quiesce(mddev, 2); 2766 return len; 2767 } else 2768 return -EINVAL; 2769 } 2770 static struct md_sysfs_entry md_suspend_lo = 2771 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 2772 2773 2774 static ssize_t 2775 suspend_hi_show(mddev_t *mddev, char *page) 2776 { 2777 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); 2778 } 2779 2780 static ssize_t 2781 suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) 2782 { 2783 char *e; 2784 unsigned long long new 
= simple_strtoull(buf, &e, 10); 2785 2786 if (mddev->pers->quiesce == NULL) 2787 return -EINVAL; 2788 if (buf == e || (*e && *e != '\n')) 2789 return -EINVAL; 2790 if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) || 2791 (new > mddev->suspend_lo && new > mddev->suspend_hi)) { 2792 mddev->suspend_hi = new; 2793 mddev->pers->quiesce(mddev, 1); 2794 mddev->pers->quiesce(mddev, 0); 2795 return len; 2796 } else 2797 return -EINVAL; 2798 } 2799 static struct md_sysfs_entry md_suspend_hi = 2800 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 2801 2802 2803 static struct attribute *md_default_attrs[] = { 2804 &md_level.attr, 2805 &md_layout.attr, 2806 &md_raid_disks.attr, 2807 &md_chunk_size.attr, 2808 &md_size.attr, 2809 &md_resync_start.attr, 2810 &md_metadata.attr, 2811 &md_new_device.attr, 2812 &md_safe_delay.attr, 2813 &md_array_state.attr, 2814 NULL, 2815 }; 2816 2817 static struct attribute *md_redundancy_attrs[] = { 2818 &md_scan_mode.attr, 2819 &md_mismatches.attr, 2820 &md_sync_min.attr, 2821 &md_sync_max.attr, 2822 &md_sync_speed.attr, 2823 &md_sync_completed.attr, 2824 &md_suspend_lo.attr, 2825 &md_suspend_hi.attr, 2826 NULL, 2827 }; 2828 static struct attribute_group md_redundancy_group = { 2829 .name = NULL, 2830 .attrs = md_redundancy_attrs, 2831 }; 2832 2833 2834 static ssize_t 2835 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 2836 { 2837 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 2838 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 2839 ssize_t rv; 2840 2841 if (!entry->show) 2842 return -EIO; 2843 rv = mddev_lock(mddev); 2844 if (!rv) { 2845 rv = entry->show(mddev, page); 2846 mddev_unlock(mddev); 2847 } 2848 return rv; 2849 } 2850 2851 static ssize_t 2852 md_attr_store(struct kobject *kobj, struct attribute *attr, 2853 const char *page, size_t length) 2854 { 2855 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 2856 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 2857 ssize_t rv; 2858 2859 if (!entry->store) 2860 return -EIO; 2861 rv = mddev_lock(mddev); 2862 if (!rv) { 2863 rv = entry->store(mddev, page, length); 2864 mddev_unlock(mddev); 2865 } 2866 return rv; 2867 } 2868 2869 static void md_free(struct kobject *ko) 2870 { 2871 mddev_t *mddev = container_of(ko, mddev_t, kobj); 2872 kfree(mddev); 2873 } 2874 2875 static struct sysfs_ops md_sysfs_ops = { 2876 .show = md_attr_show, 2877 .store = md_attr_store, 2878 }; 2879 static struct kobj_type md_ktype = { 2880 .release = md_free, 2881 .sysfs_ops = &md_sysfs_ops, 2882 .default_attrs = md_default_attrs, 2883 }; 2884 2885 int mdp_major = 0; 2886 2887 static struct kobject *md_probe(dev_t dev, int *part, void *data) 2888 { 2889 static DEFINE_MUTEX(disks_mutex); 2890 mddev_t *mddev = mddev_find(dev); 2891 struct gendisk *disk; 2892 int partitioned = (MAJOR(dev) != MD_MAJOR); 2893 int shift = partitioned ? 
MdpMinorShift : 0; 2894 int unit = MINOR(dev) >> shift; 2895 2896 if (!mddev) 2897 return NULL; 2898 2899 mutex_lock(&disks_mutex); 2900 if (mddev->gendisk) { 2901 mutex_unlock(&disks_mutex); 2902 mddev_put(mddev); 2903 return NULL; 2904 } 2905 disk = alloc_disk(1 << shift); 2906 if (!disk) { 2907 mutex_unlock(&disks_mutex); 2908 mddev_put(mddev); 2909 return NULL; 2910 } 2911 disk->major = MAJOR(dev); 2912 disk->first_minor = unit << shift; 2913 if (partitioned) 2914 sprintf(disk->disk_name, "md_d%d", unit); 2915 else 2916 sprintf(disk->disk_name, "md%d", unit); 2917 disk->fops = &md_fops; 2918 disk->private_data = mddev; 2919 disk->queue = mddev->queue; 2920 add_disk(disk); 2921 mddev->gendisk = disk; 2922 mutex_unlock(&disks_mutex); 2923 mddev->kobj.parent = &disk->kobj; 2924 mddev->kobj.k_name = NULL; 2925 snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md"); 2926 mddev->kobj.ktype = &md_ktype; 2927 kobject_register(&mddev->kobj); 2928 return NULL; 2929 } 2930 2931 static void md_safemode_timeout(unsigned long data) 2932 { 2933 mddev_t *mddev = (mddev_t *) data; 2934 2935 mddev->safemode = 1; 2936 md_wakeup_thread(mddev->thread); 2937 } 2938 2939 static int start_dirty_degraded; 2940 2941 static int do_md_run(mddev_t * mddev) 2942 { 2943 int err; 2944 int chunk_size; 2945 struct list_head *tmp; 2946 mdk_rdev_t *rdev; 2947 struct gendisk *disk; 2948 struct mdk_personality *pers; 2949 char b[BDEVNAME_SIZE]; 2950 2951 if (list_empty(&mddev->disks)) 2952 /* cannot run an array with no devices.. */ 2953 return -EINVAL; 2954 2955 if (mddev->pers) 2956 return -EBUSY; 2957 2958 /* 2959 * Analyze all RAID superblock(s) 2960 */ 2961 if (!mddev->raid_disks) 2962 analyze_sbs(mddev); 2963 2964 chunk_size = mddev->chunk_size; 2965 2966 if (chunk_size) { 2967 if (chunk_size > MAX_CHUNK_SIZE) { 2968 printk(KERN_ERR "too big chunk_size: %d > %d\n", 2969 chunk_size, MAX_CHUNK_SIZE); 2970 return -EINVAL; 2971 } 2972 /* 2973 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE 2974 */ 2975 if ( (1 << ffz(~chunk_size)) != chunk_size) { 2976 printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size); 2977 return -EINVAL; 2978 } 2979 if (chunk_size < PAGE_SIZE) { 2980 printk(KERN_ERR "too small chunk_size: %d < %ld\n", 2981 chunk_size, PAGE_SIZE); 2982 return -EINVAL; 2983 } 2984 2985 /* devices must have minimum size of one chunk */ 2986 ITERATE_RDEV(mddev,rdev,tmp) { 2987 if (test_bit(Faulty, &rdev->flags)) 2988 continue; 2989 if (rdev->size < chunk_size / 1024) { 2990 printk(KERN_WARNING 2991 "md: Dev %s smaller than chunk_size:" 2992 " %lluk < %dk\n", 2993 bdevname(rdev->bdev,b), 2994 (unsigned long long)rdev->size, 2995 chunk_size / 1024); 2996 return -EINVAL; 2997 } 2998 } 2999 } 3000 3001 #ifdef CONFIG_KMOD 3002 if (mddev->level != LEVEL_NONE) 3003 request_module("md-level-%d", mddev->level); 3004 else if (mddev->clevel[0]) 3005 request_module("md-%s", mddev->clevel); 3006 #endif 3007 3008 /* 3009 * Drop all container device buffers, from now on 3010 * the only valid external interface is through the md 3011 * device. 
3012 * Also find largest hardsector size 3013 */ 3014 ITERATE_RDEV(mddev,rdev,tmp) { 3015 if (test_bit(Faulty, &rdev->flags)) 3016 continue; 3017 sync_blockdev(rdev->bdev); 3018 invalidate_bdev(rdev->bdev, 0); 3019 } 3020 3021 md_probe(mddev->unit, NULL, NULL); 3022 disk = mddev->gendisk; 3023 if (!disk) 3024 return -ENOMEM; 3025 3026 spin_lock(&pers_lock); 3027 pers = find_pers(mddev->level, mddev->clevel); 3028 if (!pers || !try_module_get(pers->owner)) { 3029 spin_unlock(&pers_lock); 3030 if (mddev->level != LEVEL_NONE) 3031 printk(KERN_WARNING "md: personality for level %d is not loaded!\n", 3032 mddev->level); 3033 else 3034 printk(KERN_WARNING "md: personality for level %s is not loaded!\n", 3035 mddev->clevel); 3036 return -EINVAL; 3037 } 3038 mddev->pers = pers; 3039 spin_unlock(&pers_lock); 3040 mddev->level = pers->level; 3041 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 3042 3043 if (mddev->reshape_position != MaxSector && 3044 pers->start_reshape == NULL) { 3045 /* This personality cannot handle reshaping... */ 3046 mddev->pers = NULL; 3047 module_put(pers->owner); 3048 return -EINVAL; 3049 } 3050 3051 mddev->recovery = 0; 3052 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ 3053 mddev->barriers_work = 1; 3054 mddev->ok_start_degraded = start_dirty_degraded; 3055 3056 if (start_readonly) 3057 mddev->ro = 2; /* read-only, but switch on first write */ 3058 3059 err = mddev->pers->run(mddev); 3060 if (!err && mddev->pers->sync_request) { 3061 err = bitmap_create(mddev); 3062 if (err) { 3063 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 3064 mdname(mddev), err); 3065 mddev->pers->stop(mddev); 3066 } 3067 } 3068 if (err) { 3069 printk(KERN_ERR "md: pers->run() failed ...\n"); 3070 module_put(mddev->pers->owner); 3071 mddev->pers = NULL; 3072 bitmap_destroy(mddev); 3073 return err; 3074 } 3075 if (mddev->pers->sync_request) 3076 sysfs_create_group(&mddev->kobj, &md_redundancy_group); 3077 else if (mddev->ro == 2) /* auto-readonly not meaningful */ 3078 mddev->ro = 0; 3079 3080 atomic_set(&mddev->writes_pending,0); 3081 mddev->safemode = 0; 3082 mddev->safemode_timer.function = md_safemode_timeout; 3083 mddev->safemode_timer.data = (unsigned long) mddev; 3084 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 3085 mddev->in_sync = 1; 3086 3087 ITERATE_RDEV(mddev,rdev,tmp) 3088 if (rdev->raid_disk >= 0) { 3089 char nm[20]; 3090 sprintf(nm, "rd%d", rdev->raid_disk); 3091 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); 3092 } 3093 3094 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3095 md_wakeup_thread(mddev->thread); 3096 3097 if (mddev->sb_dirty) 3098 md_update_sb(mddev); 3099 3100 set_capacity(disk, mddev->array_size<<1); 3101 3102 /* If we call blk_queue_make_request here, it will 3103 * re-initialise max_sectors etc which may have been 3104 * refined inside -> run. So just set the bits we need to set. 3105 * Most initialisation happended when we called 3106 * blk_queue_make_request(..., md_fail_request) 3107 * earlier. 3108 */ 3109 mddev->queue->queuedata = mddev; 3110 mddev->queue->make_request_fn = mddev->pers->make_request; 3111 3112 /* If there is a partially-recovered drive we need to 3113 * start recovery here. 
If we leave it to md_check_recovery, 3114 * it will remove the drives and not do the right thing 3115 */ 3116 if (mddev->degraded) { 3117 struct list_head *rtmp; 3118 int spares = 0; 3119 ITERATE_RDEV(mddev,rdev,rtmp) 3120 if (rdev->raid_disk >= 0 && 3121 !test_bit(In_sync, &rdev->flags) && 3122 !test_bit(Faulty, &rdev->flags)) 3123 /* complete an interrupted recovery */ 3124 spares++; 3125 if (spares && mddev->pers->sync_request) { 3126 mddev->recovery = 0; 3127 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 3128 mddev->sync_thread = md_register_thread(md_do_sync, 3129 mddev, 3130 "%s_resync"); 3131 if (!mddev->sync_thread) { 3132 printk(KERN_ERR "%s: could not start resync" 3133 " thread...\n", 3134 mdname(mddev)); 3135 /* leave the spares where they are, it shouldn't hurt */ 3136 mddev->recovery = 0; 3137 } else 3138 md_wakeup_thread(mddev->sync_thread); 3139 } 3140 } 3141 3142 mddev->changed = 1; 3143 md_new_event(mddev); 3144 return 0; 3145 } 3146 3147 static int restart_array(mddev_t *mddev) 3148 { 3149 struct gendisk *disk = mddev->gendisk; 3150 int err; 3151 3152 /* 3153 * Complain if it has no devices 3154 */ 3155 err = -ENXIO; 3156 if (list_empty(&mddev->disks)) 3157 goto out; 3158 3159 if (mddev->pers) { 3160 err = -EBUSY; 3161 if (!mddev->ro) 3162 goto out; 3163 3164 mddev->safemode = 0; 3165 mddev->ro = 0; 3166 set_disk_ro(disk, 0); 3167 3168 printk(KERN_INFO "md: %s switched to read-write mode.\n", 3169 mdname(mddev)); 3170 /* 3171 * Kick recovery or resync if necessary 3172 */ 3173 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3174 md_wakeup_thread(mddev->thread); 3175 md_wakeup_thread(mddev->sync_thread); 3176 err = 0; 3177 } else 3178 err = -EINVAL; 3179 3180 out: 3181 return err; 3182 } 3183 3184 /* similar to deny_write_access, but accounts for our holding a reference 3185 * to the file ourselves */ 3186 static int deny_bitmap_write_access(struct file * file) 3187 { 3188 struct inode *inode = file->f_mapping->host; 3189 3190 spin_lock(&inode->i_lock); 3191 if (atomic_read(&inode->i_writecount) > 1) { 3192 spin_unlock(&inode->i_lock); 3193 return -ETXTBSY; 3194 } 3195 atomic_set(&inode->i_writecount, -1); 3196 spin_unlock(&inode->i_lock); 3197 3198 return 0; 3199 } 3200 3201 static void restore_bitmap_write_access(struct file *file) 3202 { 3203 struct inode *inode = file->f_mapping->host; 3204 3205 spin_lock(&inode->i_lock); 3206 atomic_set(&inode->i_writecount, 1); 3207 spin_unlock(&inode->i_lock); 3208 } 3209 3210 /* mode: 3211 * 0 - completely stop and dis-assemble array 3212 * 1 - switch to readonly 3213 * 2 - stop but do not disassemble array 3214 */ 3215 static int do_md_stop(mddev_t * mddev, int mode) 3216 { 3217 int err = 0; 3218 struct gendisk *disk = mddev->gendisk; 3219 3220 if (mddev->pers) { 3221 if (atomic_read(&mddev->active)>2) { 3222 printk("md: %s still in use.\n",mdname(mddev)); 3223 return -EBUSY; 3224 } 3225 3226 if (mddev->sync_thread) { 3227 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3228 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3229 md_unregister_thread(mddev->sync_thread); 3230 mddev->sync_thread = NULL; 3231 } 3232 3233 del_timer_sync(&mddev->safemode_timer); 3234 3235 invalidate_partition(disk, 0); 3236 3237 switch(mode) { 3238 case 1: /* readonly */ 3239 err = -ENXIO; 3240 if (mddev->ro==1) 3241 goto out; 3242 mddev->ro = 1; 3243 break; 3244 case 0: /* disassemble */ 3245 case 2: /* stop */ 3246 bitmap_flush(mddev); 3247 md_super_wait(mddev); 3248 if (mddev->ro) 3249 set_disk_ro(disk, 0); 3250 blk_queue_make_request(mddev->queue, 
md_fail_request); 3251 mddev->pers->stop(mddev); 3252 if (mddev->pers->sync_request) 3253 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 3254 3255 module_put(mddev->pers->owner); 3256 mddev->pers = NULL; 3257 if (mddev->ro) 3258 mddev->ro = 0; 3259 } 3260 if (!mddev->in_sync || mddev->sb_dirty) { 3261 /* mark array as shutdown cleanly */ 3262 mddev->in_sync = 1; 3263 md_update_sb(mddev); 3264 } 3265 if (mode == 1) 3266 set_disk_ro(disk, 1); 3267 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3268 } 3269 3270 /* 3271 * Free resources if final stop 3272 */ 3273 if (mode == 0) { 3274 mdk_rdev_t *rdev; 3275 struct list_head *tmp; 3276 struct gendisk *disk; 3277 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 3278 3279 bitmap_destroy(mddev); 3280 if (mddev->bitmap_file) { 3281 restore_bitmap_write_access(mddev->bitmap_file); 3282 fput(mddev->bitmap_file); 3283 mddev->bitmap_file = NULL; 3284 } 3285 mddev->bitmap_offset = 0; 3286 3287 ITERATE_RDEV(mddev,rdev,tmp) 3288 if (rdev->raid_disk >= 0) { 3289 char nm[20]; 3290 sprintf(nm, "rd%d", rdev->raid_disk); 3291 sysfs_remove_link(&mddev->kobj, nm); 3292 } 3293 3294 export_array(mddev); 3295 3296 mddev->array_size = 0; 3297 mddev->size = 0; 3298 mddev->raid_disks = 0; 3299 mddev->recovery_cp = 0; 3300 3301 disk = mddev->gendisk; 3302 if (disk) 3303 set_capacity(disk, 0); 3304 mddev->changed = 1; 3305 } else if (mddev->pers) 3306 printk(KERN_INFO "md: %s switched to read-only mode.\n", 3307 mdname(mddev)); 3308 err = 0; 3309 md_new_event(mddev); 3310 out: 3311 return err; 3312 } 3313 3314 static void autorun_array(mddev_t *mddev) 3315 { 3316 mdk_rdev_t *rdev; 3317 struct list_head *tmp; 3318 int err; 3319 3320 if (list_empty(&mddev->disks)) 3321 return; 3322 3323 printk(KERN_INFO "md: running: "); 3324 3325 ITERATE_RDEV(mddev,rdev,tmp) { 3326 char b[BDEVNAME_SIZE]; 3327 printk("<%s>", bdevname(rdev->bdev,b)); 3328 } 3329 printk("\n"); 3330 3331 err = do_md_run (mddev); 3332 if (err) { 3333 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 3334 do_md_stop (mddev, 0); 3335 } 3336 } 3337 3338 /* 3339 * lets try to run arrays based on all disks that have arrived 3340 * until now. (those are in pending_raid_disks) 3341 * 3342 * the method: pick the first pending disk, collect all disks with 3343 * the same UUID, remove all from the pending list and put them into 3344 * the 'same_array' list. Then order this list based on superblock 3345 * update time (freshest comes first), kick out 'old' disks and 3346 * compare superblocks. If everything's fine then run it. 3347 * 3348 * If "unit" is allocated, then bump its reference count 3349 */ 3350 static void autorun_devices(int part) 3351 { 3352 struct list_head *tmp; 3353 mdk_rdev_t *rdev0, *rdev; 3354 mddev_t *mddev; 3355 char b[BDEVNAME_SIZE]; 3356 3357 printk(KERN_INFO "md: autorun ...\n"); 3358 while (!list_empty(&pending_raid_disks)) { 3359 dev_t dev; 3360 LIST_HEAD(candidates); 3361 rdev0 = list_entry(pending_raid_disks.next, 3362 mdk_rdev_t, same_set); 3363 3364 printk(KERN_INFO "md: considering %s ...\n", 3365 bdevname(rdev0->bdev,b)); 3366 INIT_LIST_HEAD(&candidates); 3367 ITERATE_RDEV_PENDING(rdev,tmp) 3368 if (super_90_load(rdev, rdev0, 0) >= 0) { 3369 printk(KERN_INFO "md: adding %s ...\n", 3370 bdevname(rdev->bdev,b)); 3371 list_move(&rdev->same_set, &candidates); 3372 } 3373 /* 3374 * now we have a set of devices, with all of them having 3375 * mostly sane superblocks. It's time to allocate the 3376 * mddev. 
3377 */ 3378 if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) { 3379 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 3380 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 3381 break; 3382 } 3383 if (part) 3384 dev = MKDEV(mdp_major, 3385 rdev0->preferred_minor << MdpMinorShift); 3386 else 3387 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 3388 3389 md_probe(dev, NULL, NULL); 3390 mddev = mddev_find(dev); 3391 if (!mddev) { 3392 printk(KERN_ERR 3393 "md: cannot allocate memory for md drive.\n"); 3394 break; 3395 } 3396 if (mddev_lock(mddev)) 3397 printk(KERN_WARNING "md: %s locked, cannot run\n", 3398 mdname(mddev)); 3399 else if (mddev->raid_disks || mddev->major_version 3400 || !list_empty(&mddev->disks)) { 3401 printk(KERN_WARNING 3402 "md: %s already running, cannot run %s\n", 3403 mdname(mddev), bdevname(rdev0->bdev,b)); 3404 mddev_unlock(mddev); 3405 } else { 3406 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 3407 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { 3408 list_del_init(&rdev->same_set); 3409 if (bind_rdev_to_array(rdev, mddev)) 3410 export_rdev(rdev); 3411 } 3412 autorun_array(mddev); 3413 mddev_unlock(mddev); 3414 } 3415 /* on success, candidates will be empty, on error 3416 * it won't... 3417 */ 3418 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) 3419 export_rdev(rdev); 3420 mddev_put(mddev); 3421 } 3422 printk(KERN_INFO "md: ... autorun DONE.\n"); 3423 } 3424 3425 /* 3426 * import RAID devices based on one partition 3427 * if possible, the array gets run as well. 3428 */ 3429 3430 static int autostart_array(dev_t startdev) 3431 { 3432 char b[BDEVNAME_SIZE]; 3433 int err = -EINVAL, i; 3434 mdp_super_t *sb = NULL; 3435 mdk_rdev_t *start_rdev = NULL, *rdev; 3436 3437 start_rdev = md_import_device(startdev, 0, 0); 3438 if (IS_ERR(start_rdev)) 3439 return err; 3440 3441 3442 /* NOTE: this can only work for 0.90.0 superblocks */ 3443 sb = (mdp_super_t*)page_address(start_rdev->sb_page); 3444 if (sb->major_version != 0 || 3445 sb->minor_version != 90 ) { 3446 printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n"); 3447 export_rdev(start_rdev); 3448 return err; 3449 } 3450 3451 if (test_bit(Faulty, &start_rdev->flags)) { 3452 printk(KERN_WARNING 3453 "md: can not autostart based on faulty %s!\n", 3454 bdevname(start_rdev->bdev,b)); 3455 export_rdev(start_rdev); 3456 return err; 3457 } 3458 list_add(&start_rdev->same_set, &pending_raid_disks); 3459 3460 for (i = 0; i < MD_SB_DISKS; i++) { 3461 mdp_disk_t *desc = sb->disks + i; 3462 dev_t dev = MKDEV(desc->major, desc->minor); 3463 3464 if (!dev) 3465 continue; 3466 if (dev == startdev) 3467 continue; 3468 if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor) 3469 continue; 3470 rdev = md_import_device(dev, 0, 0); 3471 if (IS_ERR(rdev)) 3472 continue; 3473 3474 list_add(&rdev->same_set, &pending_raid_disks); 3475 } 3476 3477 /* 3478 * possibly return codes 3479 */ 3480 autorun_devices(0); 3481 return 0; 3482 3483 } 3484 3485 3486 static int get_version(void __user * arg) 3487 { 3488 mdu_version_t ver; 3489 3490 ver.major = MD_MAJOR_VERSION; 3491 ver.minor = MD_MINOR_VERSION; 3492 ver.patchlevel = MD_PATCHLEVEL_VERSION; 3493 3494 if (copy_to_user(arg, &ver, sizeof(ver))) 3495 return -EFAULT; 3496 3497 return 0; 3498 } 3499 3500 static int get_array_info(mddev_t * mddev, void __user * arg) 3501 { 3502 mdu_array_info_t info; 3503 int nr,working,active,failed,spare; 3504 mdk_rdev_t *rdev; 3505 struct list_head *tmp; 3506 3507 nr=working=active=failed=spare=0; 3508 
ITERATE_RDEV(mddev,rdev,tmp) { 3509 nr++; 3510 if (test_bit(Faulty, &rdev->flags)) 3511 failed++; 3512 else { 3513 working++; 3514 if (test_bit(In_sync, &rdev->flags)) 3515 active++; 3516 else 3517 spare++; 3518 } 3519 } 3520 3521 info.major_version = mddev->major_version; 3522 info.minor_version = mddev->minor_version; 3523 info.patch_version = MD_PATCHLEVEL_VERSION; 3524 info.ctime = mddev->ctime; 3525 info.level = mddev->level; 3526 info.size = mddev->size; 3527 if (info.size != mddev->size) /* overflow */ 3528 info.size = -1; 3529 info.nr_disks = nr; 3530 info.raid_disks = mddev->raid_disks; 3531 info.md_minor = mddev->md_minor; 3532 info.not_persistent= !mddev->persistent; 3533 3534 info.utime = mddev->utime; 3535 info.state = 0; 3536 if (mddev->in_sync) 3537 info.state = (1<<MD_SB_CLEAN); 3538 if (mddev->bitmap && mddev->bitmap_offset) 3539 info.state = (1<<MD_SB_BITMAP_PRESENT); 3540 info.active_disks = active; 3541 info.working_disks = working; 3542 info.failed_disks = failed; 3543 info.spare_disks = spare; 3544 3545 info.layout = mddev->layout; 3546 info.chunk_size = mddev->chunk_size; 3547 3548 if (copy_to_user(arg, &info, sizeof(info))) 3549 return -EFAULT; 3550 3551 return 0; 3552 } 3553 3554 static int get_bitmap_file(mddev_t * mddev, void __user * arg) 3555 { 3556 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 3557 char *ptr, *buf = NULL; 3558 int err = -ENOMEM; 3559 3560 file = kmalloc(sizeof(*file), GFP_KERNEL); 3561 if (!file) 3562 goto out; 3563 3564 /* bitmap disabled, zero the first byte and copy out */ 3565 if (!mddev->bitmap || !mddev->bitmap->file) { 3566 file->pathname[0] = '\0'; 3567 goto copy_out; 3568 } 3569 3570 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); 3571 if (!buf) 3572 goto out; 3573 3574 ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname)); 3575 if (!ptr) 3576 goto out; 3577 3578 strcpy(file->pathname, ptr); 3579 3580 copy_out: 3581 err = 0; 3582 if (copy_to_user(arg, file, sizeof(*file))) 3583 err = -EFAULT; 3584 out: 3585 kfree(buf); 3586 kfree(file); 3587 return err; 3588 } 3589 3590 static int get_disk_info(mddev_t * mddev, void __user * arg) 3591 { 3592 mdu_disk_info_t info; 3593 unsigned int nr; 3594 mdk_rdev_t *rdev; 3595 3596 if (copy_from_user(&info, arg, sizeof(info))) 3597 return -EFAULT; 3598 3599 nr = info.number; 3600 3601 rdev = find_rdev_nr(mddev, nr); 3602 if (rdev) { 3603 info.major = MAJOR(rdev->bdev->bd_dev); 3604 info.minor = MINOR(rdev->bdev->bd_dev); 3605 info.raid_disk = rdev->raid_disk; 3606 info.state = 0; 3607 if (test_bit(Faulty, &rdev->flags)) 3608 info.state |= (1<<MD_DISK_FAULTY); 3609 else if (test_bit(In_sync, &rdev->flags)) { 3610 info.state |= (1<<MD_DISK_ACTIVE); 3611 info.state |= (1<<MD_DISK_SYNC); 3612 } 3613 if (test_bit(WriteMostly, &rdev->flags)) 3614 info.state |= (1<<MD_DISK_WRITEMOSTLY); 3615 } else { 3616 info.major = info.minor = 0; 3617 info.raid_disk = -1; 3618 info.state = (1<<MD_DISK_REMOVED); 3619 } 3620 3621 if (copy_to_user(arg, &info, sizeof(info))) 3622 return -EFAULT; 3623 3624 return 0; 3625 } 3626 3627 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) 3628 { 3629 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 3630 mdk_rdev_t *rdev; 3631 dev_t dev = MKDEV(info->major,info->minor); 3632 3633 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 3634 return -EOVERFLOW; 3635 3636 if (!mddev->raid_disks) { 3637 int err; 3638 /* expecting a device which has a superblock */ 3639 rdev = md_import_device(dev, mddev->major_version, 
mddev->minor_version); 3640 if (IS_ERR(rdev)) { 3641 printk(KERN_WARNING 3642 "md: md_import_device returned %ld\n", 3643 PTR_ERR(rdev)); 3644 return PTR_ERR(rdev); 3645 } 3646 if (!list_empty(&mddev->disks)) { 3647 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 3648 mdk_rdev_t, same_set); 3649 int err = super_types[mddev->major_version] 3650 .load_super(rdev, rdev0, mddev->minor_version); 3651 if (err < 0) { 3652 printk(KERN_WARNING 3653 "md: %s has different UUID to %s\n", 3654 bdevname(rdev->bdev,b), 3655 bdevname(rdev0->bdev,b2)); 3656 export_rdev(rdev); 3657 return -EINVAL; 3658 } 3659 } 3660 err = bind_rdev_to_array(rdev, mddev); 3661 if (err) 3662 export_rdev(rdev); 3663 return err; 3664 } 3665 3666 /* 3667 * add_new_disk can be used once the array is assembled 3668 * to add "hot spares". They must already have a superblock 3669 * written 3670 */ 3671 if (mddev->pers) { 3672 int err; 3673 if (!mddev->pers->hot_add_disk) { 3674 printk(KERN_WARNING 3675 "%s: personality does not support diskops!\n", 3676 mdname(mddev)); 3677 return -EINVAL; 3678 } 3679 if (mddev->persistent) 3680 rdev = md_import_device(dev, mddev->major_version, 3681 mddev->minor_version); 3682 else 3683 rdev = md_import_device(dev, -1, -1); 3684 if (IS_ERR(rdev)) { 3685 printk(KERN_WARNING 3686 "md: md_import_device returned %ld\n", 3687 PTR_ERR(rdev)); 3688 return PTR_ERR(rdev); 3689 } 3690 /* set save_raid_disk if appropriate */ 3691 if (!mddev->persistent) { 3692 if (info->state & (1<<MD_DISK_SYNC) && 3693 info->raid_disk < mddev->raid_disks) 3694 rdev->raid_disk = info->raid_disk; 3695 else 3696 rdev->raid_disk = -1; 3697 } else 3698 super_types[mddev->major_version]. 3699 validate_super(mddev, rdev); 3700 rdev->saved_raid_disk = rdev->raid_disk; 3701 3702 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 3703 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 3704 set_bit(WriteMostly, &rdev->flags); 3705 3706 rdev->raid_disk = -1; 3707 err = bind_rdev_to_array(rdev, mddev); 3708 if (!err && !mddev->pers->hot_remove_disk) { 3709 /* If there is hot_add_disk but no hot_remove_disk 3710 * then added disks for geometry changes, 3711 * and should be added immediately. 3712 */ 3713 super_types[mddev->major_version]. 
3714 validate_super(mddev, rdev); 3715 err = mddev->pers->hot_add_disk(mddev, rdev); 3716 if (err) 3717 unbind_rdev_from_array(rdev); 3718 } 3719 if (err) 3720 export_rdev(rdev); 3721 3722 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3723 md_wakeup_thread(mddev->thread); 3724 return err; 3725 } 3726 3727 /* otherwise, add_new_disk is only allowed 3728 * for major_version==0 superblocks 3729 */ 3730 if (mddev->major_version != 0) { 3731 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", 3732 mdname(mddev)); 3733 return -EINVAL; 3734 } 3735 3736 if (!(info->state & (1<<MD_DISK_FAULTY))) { 3737 int err; 3738 rdev = md_import_device (dev, -1, 0); 3739 if (IS_ERR(rdev)) { 3740 printk(KERN_WARNING 3741 "md: error, md_import_device() returned %ld\n", 3742 PTR_ERR(rdev)); 3743 return PTR_ERR(rdev); 3744 } 3745 rdev->desc_nr = info->number; 3746 if (info->raid_disk < mddev->raid_disks) 3747 rdev->raid_disk = info->raid_disk; 3748 else 3749 rdev->raid_disk = -1; 3750 3751 rdev->flags = 0; 3752 3753 if (rdev->raid_disk < mddev->raid_disks) 3754 if (info->state & (1<<MD_DISK_SYNC)) 3755 set_bit(In_sync, &rdev->flags); 3756 3757 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 3758 set_bit(WriteMostly, &rdev->flags); 3759 3760 if (!mddev->persistent) { 3761 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 3762 rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 3763 } else 3764 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 3765 rdev->size = calc_dev_size(rdev, mddev->chunk_size); 3766 3767 err = bind_rdev_to_array(rdev, mddev); 3768 if (err) { 3769 export_rdev(rdev); 3770 return err; 3771 } 3772 } 3773 3774 return 0; 3775 } 3776 3777 static int hot_remove_disk(mddev_t * mddev, dev_t dev) 3778 { 3779 char b[BDEVNAME_SIZE]; 3780 mdk_rdev_t *rdev; 3781 3782 if (!mddev->pers) 3783 return -ENODEV; 3784 3785 rdev = find_rdev(mddev, dev); 3786 if (!rdev) 3787 return -ENXIO; 3788 3789 if (rdev->raid_disk >= 0) 3790 goto busy; 3791 3792 kick_rdev_from_array(rdev); 3793 md_update_sb(mddev); 3794 md_new_event(mddev); 3795 3796 return 0; 3797 busy: 3798 printk(KERN_WARNING "md: cannot remove active disk %s from %s ... 
\n", 3799 bdevname(rdev->bdev,b), mdname(mddev)); 3800 return -EBUSY; 3801 } 3802 3803 static int hot_add_disk(mddev_t * mddev, dev_t dev) 3804 { 3805 char b[BDEVNAME_SIZE]; 3806 int err; 3807 unsigned int size; 3808 mdk_rdev_t *rdev; 3809 3810 if (!mddev->pers) 3811 return -ENODEV; 3812 3813 if (mddev->major_version != 0) { 3814 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 3815 " version-0 superblocks.\n", 3816 mdname(mddev)); 3817 return -EINVAL; 3818 } 3819 if (!mddev->pers->hot_add_disk) { 3820 printk(KERN_WARNING 3821 "%s: personality does not support diskops!\n", 3822 mdname(mddev)); 3823 return -EINVAL; 3824 } 3825 3826 rdev = md_import_device (dev, -1, 0); 3827 if (IS_ERR(rdev)) { 3828 printk(KERN_WARNING 3829 "md: error, md_import_device() returned %ld\n", 3830 PTR_ERR(rdev)); 3831 return -EINVAL; 3832 } 3833 3834 if (mddev->persistent) 3835 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 3836 else 3837 rdev->sb_offset = 3838 rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 3839 3840 size = calc_dev_size(rdev, mddev->chunk_size); 3841 rdev->size = size; 3842 3843 if (test_bit(Faulty, &rdev->flags)) { 3844 printk(KERN_WARNING 3845 "md: can not hot-add faulty %s disk to %s!\n", 3846 bdevname(rdev->bdev,b), mdname(mddev)); 3847 err = -EINVAL; 3848 goto abort_export; 3849 } 3850 clear_bit(In_sync, &rdev->flags); 3851 rdev->desc_nr = -1; 3852 err = bind_rdev_to_array(rdev, mddev); 3853 if (err) 3854 goto abort_export; 3855 3856 /* 3857 * The rest should better be atomic, we can have disk failures 3858 * noticed in interrupt contexts ... 3859 */ 3860 3861 if (rdev->desc_nr == mddev->max_disks) { 3862 printk(KERN_WARNING "%s: can not hot-add to full array!\n", 3863 mdname(mddev)); 3864 err = -EBUSY; 3865 goto abort_unbind_export; 3866 } 3867 3868 rdev->raid_disk = -1; 3869 3870 md_update_sb(mddev); 3871 3872 /* 3873 * Kick recovery, maybe this spare has to be added to the 3874 * array immediately. 3875 */ 3876 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3877 md_wakeup_thread(mddev->thread); 3878 md_new_event(mddev); 3879 return 0; 3880 3881 abort_unbind_export: 3882 unbind_rdev_from_array(rdev); 3883 3884 abort_export: 3885 export_rdev(rdev); 3886 return err; 3887 } 3888 3889 static int set_bitmap_file(mddev_t *mddev, int fd) 3890 { 3891 int err; 3892 3893 if (mddev->pers) { 3894 if (!mddev->pers->quiesce) 3895 return -EBUSY; 3896 if (mddev->recovery || mddev->sync_thread) 3897 return -EBUSY; 3898 /* we should be able to change the bitmap.. 
 */
	}


	if (fd >= 0) {
		if (mddev->bitmap)
			return -EEXIST; /* cannot add when bitmap is present */
		mddev->bitmap_file = fget(fd);

		if (mddev->bitmap_file == NULL) {
			printk(KERN_ERR "%s: error: failed to get bitmap file\n",
			       mdname(mddev));
			return -EBADF;
		}

		err = deny_bitmap_write_access(mddev->bitmap_file);
		if (err) {
			printk(KERN_ERR "%s: error: bitmap file is already in use\n",
			       mdname(mddev));
			fput(mddev->bitmap_file);
			mddev->bitmap_file = NULL;
			return err;
		}
		mddev->bitmap_offset = 0; /* file overrides offset */
	} else if (mddev->bitmap == NULL)
		return -ENOENT; /* cannot remove what isn't there */
	err = 0;
	if (mddev->pers) {
		mddev->pers->quiesce(mddev, 1);
		if (fd >= 0)
			err = bitmap_create(mddev);
		if (fd < 0 || err) {
			bitmap_destroy(mddev);
			fd = -1; /* make sure to put the file */
		}
		mddev->pers->quiesce(mddev, 0);
	}
	if (fd < 0) {
		if (mddev->bitmap_file) {
			restore_bitmap_write_access(mddev->bitmap_file);
			fput(mddev->bitmap_file);
		}
		mddev->bitmap_file = NULL;
	}

	return err;
}

/*
 * set_array_info is used in two different ways.
 * The original usage is when creating a new array.
 * In this usage, raid_disks is > 0 and it together with
 *  level, size, not_persistent, layout, chunksize determine the
 *  shape of the array.
 *  This will always create an array with a type-0.90.0 superblock.
 * The newer usage is when assembling an array.
 *  In this case raid_disks will be 0, and the major_version field is
 *  used to determine which style superblocks are to be found on the devices.
 *  The minor and patch _version numbers are also kept in case the
 *  super_block handler wishes to interpret them.
 */
static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
{

	if (info->raid_disks == 0) {
		/* just setting version number for superblock loading */
		if (info->major_version < 0 ||
		    info->major_version >= sizeof(super_types)/sizeof(super_types[0]) ||
		    super_types[info->major_version].name == NULL) {
			/* maybe try to auto-load a module? */
			printk(KERN_INFO
				"md: superblock version %d not known\n",
				info->major_version);
			return -EINVAL;
		}
		mddev->major_version = info->major_version;
		mddev->minor_version = info->minor_version;
		mddev->patch_version = info->patch_version;
		return 0;
	}
	mddev->major_version = MD_MAJOR_VERSION;
	mddev->minor_version = MD_MINOR_VERSION;
	mddev->patch_version = MD_PATCHLEVEL_VERSION;
	mddev->ctime = get_seconds();

	mddev->level = info->level;
	mddev->clevel[0] = 0;
	mddev->size = info->size;
	mddev->raid_disks = info->raid_disks;
	/* don't set md_minor, it is determined by which /dev/md* was
	 * opened
	 */
	if (info->state & (1<<MD_SB_CLEAN))
		mddev->recovery_cp = MaxSector;
	else
		mddev->recovery_cp = 0;
	mddev->persistent = !
info->not_persistent; 3995 3996 mddev->layout = info->layout; 3997 mddev->chunk_size = info->chunk_size; 3998 3999 mddev->max_disks = MD_SB_DISKS; 4000 4001 mddev->sb_dirty = 1; 4002 4003 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 4004 mddev->bitmap_offset = 0; 4005 4006 mddev->reshape_position = MaxSector; 4007 4008 /* 4009 * Generate a 128 bit UUID 4010 */ 4011 get_random_bytes(mddev->uuid, 16); 4012 4013 mddev->new_level = mddev->level; 4014 mddev->new_chunk = mddev->chunk_size; 4015 mddev->new_layout = mddev->layout; 4016 mddev->delta_disks = 0; 4017 4018 return 0; 4019 } 4020 4021 static int update_size(mddev_t *mddev, unsigned long size) 4022 { 4023 mdk_rdev_t * rdev; 4024 int rv; 4025 struct list_head *tmp; 4026 int fit = (size == 0); 4027 4028 if (mddev->pers->resize == NULL) 4029 return -EINVAL; 4030 /* The "size" is the amount of each device that is used. 4031 * This can only make sense for arrays with redundancy. 4032 * linear and raid0 always use whatever space is available 4033 * We can only consider changing the size if no resync 4034 * or reconstruction is happening, and if the new size 4035 * is acceptable. It must fit before the sb_offset or, 4036 * if that is <data_offset, it must fit before the 4037 * size of each device. 4038 * If size is zero, we find the largest size that fits. 4039 */ 4040 if (mddev->sync_thread) 4041 return -EBUSY; 4042 ITERATE_RDEV(mddev,rdev,tmp) { 4043 sector_t avail; 4044 if (rdev->sb_offset > rdev->data_offset) 4045 avail = (rdev->sb_offset*2) - rdev->data_offset; 4046 else 4047 avail = get_capacity(rdev->bdev->bd_disk) 4048 - rdev->data_offset; 4049 if (fit && (size == 0 || size > avail/2)) 4050 size = avail/2; 4051 if (avail < ((sector_t)size << 1)) 4052 return -ENOSPC; 4053 } 4054 rv = mddev->pers->resize(mddev, (sector_t)size *2); 4055 if (!rv) { 4056 struct block_device *bdev; 4057 4058 bdev = bdget_disk(mddev->gendisk, 0); 4059 if (bdev) { 4060 mutex_lock(&bdev->bd_inode->i_mutex); 4061 i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10); 4062 mutex_unlock(&bdev->bd_inode->i_mutex); 4063 bdput(bdev); 4064 } 4065 } 4066 return rv; 4067 } 4068 4069 static int update_raid_disks(mddev_t *mddev, int raid_disks) 4070 { 4071 int rv; 4072 /* change the number of raid disks */ 4073 if (mddev->pers->check_reshape == NULL) 4074 return -EINVAL; 4075 if (raid_disks <= 0 || 4076 raid_disks >= mddev->max_disks) 4077 return -EINVAL; 4078 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 4079 return -EBUSY; 4080 mddev->delta_disks = raid_disks - mddev->raid_disks; 4081 4082 rv = mddev->pers->check_reshape(mddev); 4083 return rv; 4084 } 4085 4086 4087 /* 4088 * update_array_info is used to change the configuration of an 4089 * on-line array. 4090 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 4091 * fields in the info are checked against the array. 4092 * Any differences that cannot be handled will cause an error. 4093 * Normally, only one change can be managed at a time. 
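 *
 * Illustrative sketch only (not part of the original driver): a userspace
 * tool would typically read the current configuration with GET_ARRAY_INFO,
 * change exactly one field, and write it back with SET_ARRAY_INFO; e.g.
 * (the device node and new_layout value are made-up placeholders, and the
 * caller needs <fcntl.h>, <sys/ioctl.h> and <linux/raid/md_u.h>):
 *
 *	mdu_array_info_t info;
 *	int fd = open("/dev/md0", O_RDONLY);
 *	if (fd >= 0 && ioctl(fd, GET_ARRAY_INFO, &info) == 0) {
 *		info.layout = new_layout;
 *		ioctl(fd, SET_ARRAY_INFO, &info);
 *	}
 *
 * For an active array such a request ends up here; anything that changes
 * more than one of size, raid_disks, layout or the bitmap state in a single
 * call is rejected with -EINVAL below.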
4094 */ 4095 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) 4096 { 4097 int rv = 0; 4098 int cnt = 0; 4099 int state = 0; 4100 4101 /* calculate expected state,ignoring low bits */ 4102 if (mddev->bitmap && mddev->bitmap_offset) 4103 state |= (1 << MD_SB_BITMAP_PRESENT); 4104 4105 if (mddev->major_version != info->major_version || 4106 mddev->minor_version != info->minor_version || 4107 /* mddev->patch_version != info->patch_version || */ 4108 mddev->ctime != info->ctime || 4109 mddev->level != info->level || 4110 /* mddev->layout != info->layout || */ 4111 !mddev->persistent != info->not_persistent|| 4112 mddev->chunk_size != info->chunk_size || 4113 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 4114 ((state^info->state) & 0xfffffe00) 4115 ) 4116 return -EINVAL; 4117 /* Check there is only one change */ 4118 if (info->size >= 0 && mddev->size != info->size) cnt++; 4119 if (mddev->raid_disks != info->raid_disks) cnt++; 4120 if (mddev->layout != info->layout) cnt++; 4121 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++; 4122 if (cnt == 0) return 0; 4123 if (cnt > 1) return -EINVAL; 4124 4125 if (mddev->layout != info->layout) { 4126 /* Change layout 4127 * we don't need to do anything at the md level, the 4128 * personality will take care of it all. 4129 */ 4130 if (mddev->pers->reconfig == NULL) 4131 return -EINVAL; 4132 else 4133 return mddev->pers->reconfig(mddev, info->layout, -1); 4134 } 4135 if (info->size >= 0 && mddev->size != info->size) 4136 rv = update_size(mddev, info->size); 4137 4138 if (mddev->raid_disks != info->raid_disks) 4139 rv = update_raid_disks(mddev, info->raid_disks); 4140 4141 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 4142 if (mddev->pers->quiesce == NULL) 4143 return -EINVAL; 4144 if (mddev->recovery || mddev->sync_thread) 4145 return -EBUSY; 4146 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 4147 /* add the bitmap */ 4148 if (mddev->bitmap) 4149 return -EEXIST; 4150 if (mddev->default_bitmap_offset == 0) 4151 return -EINVAL; 4152 mddev->bitmap_offset = mddev->default_bitmap_offset; 4153 mddev->pers->quiesce(mddev, 1); 4154 rv = bitmap_create(mddev); 4155 if (rv) 4156 bitmap_destroy(mddev); 4157 mddev->pers->quiesce(mddev, 0); 4158 } else { 4159 /* remove the bitmap */ 4160 if (!mddev->bitmap) 4161 return -ENOENT; 4162 if (mddev->bitmap->file) 4163 return -EINVAL; 4164 mddev->pers->quiesce(mddev, 1); 4165 bitmap_destroy(mddev); 4166 mddev->pers->quiesce(mddev, 0); 4167 mddev->bitmap_offset = 0; 4168 } 4169 } 4170 md_update_sb(mddev); 4171 return rv; 4172 } 4173 4174 static int set_disk_faulty(mddev_t *mddev, dev_t dev) 4175 { 4176 mdk_rdev_t *rdev; 4177 4178 if (mddev->pers == NULL) 4179 return -ENODEV; 4180 4181 rdev = find_rdev(mddev, dev); 4182 if (!rdev) 4183 return -ENODEV; 4184 4185 md_error(mddev, rdev); 4186 return 0; 4187 } 4188 4189 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 4190 { 4191 mddev_t *mddev = bdev->bd_disk->private_data; 4192 4193 geo->heads = 2; 4194 geo->sectors = 4; 4195 geo->cylinders = get_capacity(mddev->gendisk) / 8; 4196 return 0; 4197 } 4198 4199 static int md_ioctl(struct inode *inode, struct file *file, 4200 unsigned int cmd, unsigned long arg) 4201 { 4202 int err = 0; 4203 void __user *argp = (void __user *)arg; 4204 mddev_t *mddev = NULL; 4205 4206 if (!capable(CAP_SYS_ADMIN)) 4207 return -EACCES; 4208 4209 /* 4210 * Commands dealing with the RAID driver but not any 4211 * particular array: 4212 */ 4213 switch (cmd) 4214 { 
4215 case RAID_VERSION: 4216 err = get_version(argp); 4217 goto done; 4218 4219 case PRINT_RAID_DEBUG: 4220 err = 0; 4221 md_print_devices(); 4222 goto done; 4223 4224 #ifndef MODULE 4225 case RAID_AUTORUN: 4226 err = 0; 4227 autostart_arrays(arg); 4228 goto done; 4229 #endif 4230 default:; 4231 } 4232 4233 /* 4234 * Commands creating/starting a new array: 4235 */ 4236 4237 mddev = inode->i_bdev->bd_disk->private_data; 4238 4239 if (!mddev) { 4240 BUG(); 4241 goto abort; 4242 } 4243 4244 4245 if (cmd == START_ARRAY) { 4246 /* START_ARRAY doesn't need to lock the array as autostart_array 4247 * does the locking, and it could even be a different array 4248 */ 4249 static int cnt = 3; 4250 if (cnt > 0 ) { 4251 printk(KERN_WARNING 4252 "md: %s(pid %d) used deprecated START_ARRAY ioctl. " 4253 "This will not be supported beyond July 2006\n", 4254 current->comm, current->pid); 4255 cnt--; 4256 } 4257 err = autostart_array(new_decode_dev(arg)); 4258 if (err) { 4259 printk(KERN_WARNING "md: autostart failed!\n"); 4260 goto abort; 4261 } 4262 goto done; 4263 } 4264 4265 err = mddev_lock(mddev); 4266 if (err) { 4267 printk(KERN_INFO 4268 "md: ioctl lock interrupted, reason %d, cmd %d\n", 4269 err, cmd); 4270 goto abort; 4271 } 4272 4273 switch (cmd) 4274 { 4275 case SET_ARRAY_INFO: 4276 { 4277 mdu_array_info_t info; 4278 if (!arg) 4279 memset(&info, 0, sizeof(info)); 4280 else if (copy_from_user(&info, argp, sizeof(info))) { 4281 err = -EFAULT; 4282 goto abort_unlock; 4283 } 4284 if (mddev->pers) { 4285 err = update_array_info(mddev, &info); 4286 if (err) { 4287 printk(KERN_WARNING "md: couldn't update" 4288 " array info. %d\n", err); 4289 goto abort_unlock; 4290 } 4291 goto done_unlock; 4292 } 4293 if (!list_empty(&mddev->disks)) { 4294 printk(KERN_WARNING 4295 "md: array %s already has disks!\n", 4296 mdname(mddev)); 4297 err = -EBUSY; 4298 goto abort_unlock; 4299 } 4300 if (mddev->raid_disks) { 4301 printk(KERN_WARNING 4302 "md: array %s already initialised!\n", 4303 mdname(mddev)); 4304 err = -EBUSY; 4305 goto abort_unlock; 4306 } 4307 err = set_array_info(mddev, &info); 4308 if (err) { 4309 printk(KERN_WARNING "md: couldn't set" 4310 " array info. %d\n", err); 4311 goto abort_unlock; 4312 } 4313 } 4314 goto done_unlock; 4315 4316 default:; 4317 } 4318 4319 /* 4320 * Commands querying/configuring an existing array: 4321 */ 4322 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 4323 * RUN_ARRAY, and SET_BITMAP_FILE are allowed */ 4324 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 4325 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) { 4326 err = -ENODEV; 4327 goto abort_unlock; 4328 } 4329 4330 /* 4331 * Commands even a read-only array can execute: 4332 */ 4333 switch (cmd) 4334 { 4335 case GET_ARRAY_INFO: 4336 err = get_array_info(mddev, argp); 4337 goto done_unlock; 4338 4339 case GET_BITMAP_FILE: 4340 err = get_bitmap_file(mddev, argp); 4341 goto done_unlock; 4342 4343 case GET_DISK_INFO: 4344 err = get_disk_info(mddev, argp); 4345 goto done_unlock; 4346 4347 case RESTART_ARRAY_RW: 4348 err = restart_array(mddev); 4349 goto done_unlock; 4350 4351 case STOP_ARRAY: 4352 err = do_md_stop (mddev, 0); 4353 goto done_unlock; 4354 4355 case STOP_ARRAY_RO: 4356 err = do_md_stop (mddev, 1); 4357 goto done_unlock; 4358 4359 /* 4360 * We have a problem here : there is no easy way to give a CHS 4361 * virtual geometry. We currently pretend that we have a 2 heads 4362 * 4 sectors (with a BIG number of cylinders...). This drives 4363 * dosfs just mad... 
;-) 4364 */ 4365 } 4366 4367 /* 4368 * The remaining ioctls are changing the state of the 4369 * superblock, so we do not allow them on read-only arrays. 4370 * However non-MD ioctls (e.g. get-size) will still come through 4371 * here and hit the 'default' below, so only disallow 4372 * 'md' ioctls, and switch to rw mode if started auto-readonly. 4373 */ 4374 if (_IOC_TYPE(cmd) == MD_MAJOR && 4375 mddev->ro && mddev->pers) { 4376 if (mddev->ro == 2) { 4377 mddev->ro = 0; 4378 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4379 md_wakeup_thread(mddev->thread); 4380 4381 } else { 4382 err = -EROFS; 4383 goto abort_unlock; 4384 } 4385 } 4386 4387 switch (cmd) 4388 { 4389 case ADD_NEW_DISK: 4390 { 4391 mdu_disk_info_t info; 4392 if (copy_from_user(&info, argp, sizeof(info))) 4393 err = -EFAULT; 4394 else 4395 err = add_new_disk(mddev, &info); 4396 goto done_unlock; 4397 } 4398 4399 case HOT_REMOVE_DISK: 4400 err = hot_remove_disk(mddev, new_decode_dev(arg)); 4401 goto done_unlock; 4402 4403 case HOT_ADD_DISK: 4404 err = hot_add_disk(mddev, new_decode_dev(arg)); 4405 goto done_unlock; 4406 4407 case SET_DISK_FAULTY: 4408 err = set_disk_faulty(mddev, new_decode_dev(arg)); 4409 goto done_unlock; 4410 4411 case RUN_ARRAY: 4412 err = do_md_run (mddev); 4413 goto done_unlock; 4414 4415 case SET_BITMAP_FILE: 4416 err = set_bitmap_file(mddev, (int)arg); 4417 goto done_unlock; 4418 4419 default: 4420 err = -EINVAL; 4421 goto abort_unlock; 4422 } 4423 4424 done_unlock: 4425 abort_unlock: 4426 mddev_unlock(mddev); 4427 4428 return err; 4429 done: 4430 if (err) 4431 MD_BUG(); 4432 abort: 4433 return err; 4434 } 4435 4436 static int md_open(struct inode *inode, struct file *file) 4437 { 4438 /* 4439 * Succeed if we can lock the mddev, which confirms that 4440 * it isn't being stopped right now. 4441 */ 4442 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 4443 int err; 4444 4445 if ((err = mddev_lock(mddev))) 4446 goto out; 4447 4448 err = 0; 4449 mddev_get(mddev); 4450 mddev_unlock(mddev); 4451 4452 check_disk_change(inode->i_bdev); 4453 out: 4454 return err; 4455 } 4456 4457 static int md_release(struct inode *inode, struct file * file) 4458 { 4459 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 4460 4461 if (!mddev) 4462 BUG(); 4463 mddev_put(mddev); 4464 4465 return 0; 4466 } 4467 4468 static int md_media_changed(struct gendisk *disk) 4469 { 4470 mddev_t *mddev = disk->private_data; 4471 4472 return mddev->changed; 4473 } 4474 4475 static int md_revalidate(struct gendisk *disk) 4476 { 4477 mddev_t *mddev = disk->private_data; 4478 4479 mddev->changed = 0; 4480 return 0; 4481 } 4482 static struct block_device_operations md_fops = 4483 { 4484 .owner = THIS_MODULE, 4485 .open = md_open, 4486 .release = md_release, 4487 .ioctl = md_ioctl, 4488 .getgeo = md_getgeo, 4489 .media_changed = md_media_changed, 4490 .revalidate_disk= md_revalidate, 4491 }; 4492 4493 static int md_thread(void * arg) 4494 { 4495 mdk_thread_t *thread = arg; 4496 4497 /* 4498 * md_thread is a 'system-thread', it's priority should be very 4499 * high. We avoid resource deadlocks individually in each 4500 * raid personality. (RAID5 does preallocation) We also use RR and 4501 * the very same RT priority as kswapd, thus we will never get 4502 * into a priority inversion deadlock. 4503 * 4504 * we definitely have to have equal or higher priority than 4505 * bdflush, otherwise bdflush will deadlock if there are too 4506 * many dirty RAID5 blocks. 
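 * Wake-ups are effectively level-triggered: md_wakeup_thread() sets
 * THREAD_WAKEUP and wakes the queue, and the bit is only cleared
 * right before thread->run() is called, so a wake-up that races with
 * a running iteration just causes one extra pass through the loop
 * instead of being lost.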
4507 */ 4508 4509 allow_signal(SIGKILL); 4510 while (!kthread_should_stop()) { 4511 4512 /* We need to wait INTERRUPTIBLE so that 4513 * we don't add to the load-average. 4514 * That means we need to be sure no signals are 4515 * pending 4516 */ 4517 if (signal_pending(current)) 4518 flush_signals(current); 4519 4520 wait_event_interruptible_timeout 4521 (thread->wqueue, 4522 test_bit(THREAD_WAKEUP, &thread->flags) 4523 || kthread_should_stop(), 4524 thread->timeout); 4525 try_to_freeze(); 4526 4527 clear_bit(THREAD_WAKEUP, &thread->flags); 4528 4529 thread->run(thread->mddev); 4530 } 4531 4532 return 0; 4533 } 4534 4535 void md_wakeup_thread(mdk_thread_t *thread) 4536 { 4537 if (thread) { 4538 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm); 4539 set_bit(THREAD_WAKEUP, &thread->flags); 4540 wake_up(&thread->wqueue); 4541 } 4542 } 4543 4544 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, 4545 const char *name) 4546 { 4547 mdk_thread_t *thread; 4548 4549 thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL); 4550 if (!thread) 4551 return NULL; 4552 4553 init_waitqueue_head(&thread->wqueue); 4554 4555 thread->run = run; 4556 thread->mddev = mddev; 4557 thread->timeout = MAX_SCHEDULE_TIMEOUT; 4558 thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev)); 4559 if (IS_ERR(thread->tsk)) { 4560 kfree(thread); 4561 return NULL; 4562 } 4563 return thread; 4564 } 4565 4566 void md_unregister_thread(mdk_thread_t *thread) 4567 { 4568 dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); 4569 4570 kthread_stop(thread->tsk); 4571 kfree(thread); 4572 } 4573 4574 void md_error(mddev_t *mddev, mdk_rdev_t *rdev) 4575 { 4576 if (!mddev) { 4577 MD_BUG(); 4578 return; 4579 } 4580 4581 if (!rdev || test_bit(Faulty, &rdev->flags)) 4582 return; 4583 /* 4584 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 4585 mdname(mddev), 4586 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), 4587 __builtin_return_address(0),__builtin_return_address(1), 4588 __builtin_return_address(2),__builtin_return_address(3)); 4589 */ 4590 if (!mddev->pers->error_handler) 4591 return; 4592 mddev->pers->error_handler(mddev,rdev); 4593 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4594 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4595 md_wakeup_thread(mddev->thread); 4596 md_new_event_inintr(mddev); 4597 } 4598 4599 /* seq_file implementation /proc/mdstat */ 4600 4601 static void status_unused(struct seq_file *seq) 4602 { 4603 int i = 0; 4604 mdk_rdev_t *rdev; 4605 struct list_head *tmp; 4606 4607 seq_printf(seq, "unused devices: "); 4608 4609 ITERATE_RDEV_PENDING(rdev,tmp) { 4610 char b[BDEVNAME_SIZE]; 4611 i++; 4612 seq_printf(seq, "%s ", 4613 bdevname(rdev->bdev,b)); 4614 } 4615 if (!i) 4616 seq_printf(seq, "<none>"); 4617 4618 seq_printf(seq, "\n"); 4619 } 4620 4621 4622 static void status_resync(struct seq_file *seq, mddev_t * mddev) 4623 { 4624 sector_t max_blocks, resync, res; 4625 unsigned long dt, db, rt; 4626 int scale; 4627 unsigned int per_milli; 4628 4629 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; 4630 4631 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 4632 max_blocks = mddev->resync_max_sectors >> 1; 4633 else 4634 max_blocks = mddev->size; 4635 4636 /* 4637 * Should not happen. 
4638 */ 4639 if (!max_blocks) { 4640 MD_BUG(); 4641 return; 4642 } 4643 /* Pick 'scale' such that (resync>>scale)*1000 will fit 4644 * in a sector_t, and (max_blocks>>scale) will fit in a 4645 * u32, as those are the requirements for sector_div. 4646 * Thus 'scale' must be at least 10 4647 */ 4648 scale = 10; 4649 if (sizeof(sector_t) > sizeof(unsigned long)) { 4650 while ( max_blocks/2 > (1ULL<<(scale+32))) 4651 scale++; 4652 } 4653 res = (resync>>scale)*1000; 4654 sector_div(res, (u32)((max_blocks>>scale)+1)); 4655 4656 per_milli = res; 4657 { 4658 int i, x = per_milli/50, y = 20-x; 4659 seq_printf(seq, "["); 4660 for (i = 0; i < x; i++) 4661 seq_printf(seq, "="); 4662 seq_printf(seq, ">"); 4663 for (i = 0; i < y; i++) 4664 seq_printf(seq, "."); 4665 seq_printf(seq, "] "); 4666 } 4667 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 4668 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 4669 "reshape" : 4670 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 4671 "resync" : "recovery")), 4672 per_milli/10, per_milli % 10, 4673 (unsigned long long) resync, 4674 (unsigned long long) max_blocks); 4675 4676 /* 4677 * We do not want to overflow, so the order of operands and 4678 * the * 100 / 100 trick are important. We do a +1 to be 4679 * safe against division by zero. We only estimate anyway. 4680 * 4681 * dt: time from mark until now 4682 * db: blocks written from mark until now 4683 * rt: remaining time 4684 */ 4685 dt = ((jiffies - mddev->resync_mark) / HZ); 4686 if (!dt) dt++; 4687 db = resync - (mddev->resync_mark_cnt/2); 4688 rt = (dt * ((unsigned long)(max_blocks-resync) / (db/100+1)))/100; 4689 4690 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); 4691 4692 seq_printf(seq, " speed=%ldK/sec", db/dt); 4693 } 4694 4695 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 4696 { 4697 struct list_head *tmp; 4698 loff_t l = *pos; 4699 mddev_t *mddev; 4700 4701 if (l >= 0x10000) 4702 return NULL; 4703 if (!l--) 4704 /* header */ 4705 return (void*)1; 4706 4707 spin_lock(&all_mddevs_lock); 4708 list_for_each(tmp,&all_mddevs) 4709 if (!l--) { 4710 mddev = list_entry(tmp, mddev_t, all_mddevs); 4711 mddev_get(mddev); 4712 spin_unlock(&all_mddevs_lock); 4713 return mddev; 4714 } 4715 spin_unlock(&all_mddevs_lock); 4716 if (!l--) 4717 return (void*)2;/* tail */ 4718 return NULL; 4719 } 4720 4721 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 4722 { 4723 struct list_head *tmp; 4724 mddev_t *next_mddev, *mddev = v; 4725 4726 ++*pos; 4727 if (v == (void*)2) 4728 return NULL; 4729 4730 spin_lock(&all_mddevs_lock); 4731 if (v == (void*)1) 4732 tmp = all_mddevs.next; 4733 else 4734 tmp = mddev->all_mddevs.next; 4735 if (tmp != &all_mddevs) 4736 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); 4737 else { 4738 next_mddev = (void*)2; 4739 *pos = 0x10000; 4740 } 4741 spin_unlock(&all_mddevs_lock); 4742 4743 if (v != (void*)1) 4744 mddev_put(mddev); 4745 return next_mddev; 4746 4747 } 4748 4749 static void md_seq_stop(struct seq_file *seq, void *v) 4750 { 4751 mddev_t *mddev = v; 4752 4753 if (mddev && v != (void*)1 && v != (void*)2) 4754 mddev_put(mddev); 4755 } 4756 4757 struct mdstat_info { 4758 int event; 4759 }; 4760 4761 static int md_seq_show(struct seq_file *seq, void *v) 4762 { 4763 mddev_t *mddev = v; 4764 sector_t size; 4765 struct list_head *tmp2; 4766 mdk_rdev_t *rdev; 4767 struct mdstat_info *mi = seq->private; 4768 struct bitmap *bitmap; 4769 4770 if (v == (void*)1) { 4771 struct mdk_personality *pers; 4772 seq_printf(seq, "Personalities : "); 
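		/*
		 * v is (void*)1 for the header line and (void*)2 for the
		 * trailing "unused devices:" line (see md_seq_start() and
		 * md_seq_next()); real mddev entries take the path below.
		 */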
4773 spin_lock(&pers_lock); 4774 list_for_each_entry(pers, &pers_list, list) 4775 seq_printf(seq, "[%s] ", pers->name); 4776 4777 spin_unlock(&pers_lock); 4778 seq_printf(seq, "\n"); 4779 mi->event = atomic_read(&md_event_count); 4780 return 0; 4781 } 4782 if (v == (void*)2) { 4783 status_unused(seq); 4784 return 0; 4785 } 4786 4787 if (mddev_lock(mddev) < 0) 4788 return -EINTR; 4789 4790 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 4791 seq_printf(seq, "%s : %sactive", mdname(mddev), 4792 mddev->pers ? "" : "in"); 4793 if (mddev->pers) { 4794 if (mddev->ro==1) 4795 seq_printf(seq, " (read-only)"); 4796 if (mddev->ro==2) 4797 seq_printf(seq, "(auto-read-only)"); 4798 seq_printf(seq, " %s", mddev->pers->name); 4799 } 4800 4801 size = 0; 4802 ITERATE_RDEV(mddev,rdev,tmp2) { 4803 char b[BDEVNAME_SIZE]; 4804 seq_printf(seq, " %s[%d]", 4805 bdevname(rdev->bdev,b), rdev->desc_nr); 4806 if (test_bit(WriteMostly, &rdev->flags)) 4807 seq_printf(seq, "(W)"); 4808 if (test_bit(Faulty, &rdev->flags)) { 4809 seq_printf(seq, "(F)"); 4810 continue; 4811 } else if (rdev->raid_disk < 0) 4812 seq_printf(seq, "(S)"); /* spare */ 4813 size += rdev->size; 4814 } 4815 4816 if (!list_empty(&mddev->disks)) { 4817 if (mddev->pers) 4818 seq_printf(seq, "\n %llu blocks", 4819 (unsigned long long)mddev->array_size); 4820 else 4821 seq_printf(seq, "\n %llu blocks", 4822 (unsigned long long)size); 4823 } 4824 if (mddev->persistent) { 4825 if (mddev->major_version != 0 || 4826 mddev->minor_version != 90) { 4827 seq_printf(seq," super %d.%d", 4828 mddev->major_version, 4829 mddev->minor_version); 4830 } 4831 } else 4832 seq_printf(seq, " super non-persistent"); 4833 4834 if (mddev->pers) { 4835 mddev->pers->status (seq, mddev); 4836 seq_printf(seq, "\n "); 4837 if (mddev->pers->sync_request) { 4838 if (mddev->curr_resync > 2) { 4839 status_resync (seq, mddev); 4840 seq_printf(seq, "\n "); 4841 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 4842 seq_printf(seq, "\tresync=DELAYED\n "); 4843 else if (mddev->recovery_cp < MaxSector) 4844 seq_printf(seq, "\tresync=PENDING\n "); 4845 } 4846 } else 4847 seq_printf(seq, "\n "); 4848 4849 if ((bitmap = mddev->bitmap)) { 4850 unsigned long chunk_kb; 4851 unsigned long flags; 4852 spin_lock_irqsave(&bitmap->lock, flags); 4853 chunk_kb = bitmap->chunksize >> 10; 4854 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " 4855 "%lu%s chunk", 4856 bitmap->pages - bitmap->missing_pages, 4857 bitmap->pages, 4858 (bitmap->pages - bitmap->missing_pages) 4859 << (PAGE_SHIFT - 10), 4860 chunk_kb ? chunk_kb : bitmap->chunksize, 4861 chunk_kb ? 
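/*
 * The per-array block built up here ends up in /proc/mdstat looking
 * roughly like the following (the personality's ->status() adds the
 * "[2/2] [UU]" style summary; exact layout depends on the personality):
 *
 *	md0 : active raid1 sdb1[1] sda1[0]
 *	      1048576 blocks [2/2] [UU]
 *	      bitmap: 5/38 pages [20KB], 1024KB chunk
 */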
"KB" : "B"); 4862 if (bitmap->file) { 4863 seq_printf(seq, ", file: "); 4864 seq_path(seq, bitmap->file->f_vfsmnt, 4865 bitmap->file->f_dentry," \t\n"); 4866 } 4867 4868 seq_printf(seq, "\n"); 4869 spin_unlock_irqrestore(&bitmap->lock, flags); 4870 } 4871 4872 seq_printf(seq, "\n"); 4873 } 4874 mddev_unlock(mddev); 4875 4876 return 0; 4877 } 4878 4879 static struct seq_operations md_seq_ops = { 4880 .start = md_seq_start, 4881 .next = md_seq_next, 4882 .stop = md_seq_stop, 4883 .show = md_seq_show, 4884 }; 4885 4886 static int md_seq_open(struct inode *inode, struct file *file) 4887 { 4888 int error; 4889 struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL); 4890 if (mi == NULL) 4891 return -ENOMEM; 4892 4893 error = seq_open(file, &md_seq_ops); 4894 if (error) 4895 kfree(mi); 4896 else { 4897 struct seq_file *p = file->private_data; 4898 p->private = mi; 4899 mi->event = atomic_read(&md_event_count); 4900 } 4901 return error; 4902 } 4903 4904 static int md_seq_release(struct inode *inode, struct file *file) 4905 { 4906 struct seq_file *m = file->private_data; 4907 struct mdstat_info *mi = m->private; 4908 m->private = NULL; 4909 kfree(mi); 4910 return seq_release(inode, file); 4911 } 4912 4913 static unsigned int mdstat_poll(struct file *filp, poll_table *wait) 4914 { 4915 struct seq_file *m = filp->private_data; 4916 struct mdstat_info *mi = m->private; 4917 int mask; 4918 4919 poll_wait(filp, &md_event_waiters, wait); 4920 4921 /* always allow read */ 4922 mask = POLLIN | POLLRDNORM; 4923 4924 if (mi->event != atomic_read(&md_event_count)) 4925 mask |= POLLERR | POLLPRI; 4926 return mask; 4927 } 4928 4929 static struct file_operations md_seq_fops = { 4930 .open = md_seq_open, 4931 .read = seq_read, 4932 .llseek = seq_lseek, 4933 .release = md_seq_release, 4934 .poll = mdstat_poll, 4935 }; 4936 4937 int register_md_personality(struct mdk_personality *p) 4938 { 4939 spin_lock(&pers_lock); 4940 list_add_tail(&p->list, &pers_list); 4941 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level); 4942 spin_unlock(&pers_lock); 4943 return 0; 4944 } 4945 4946 int unregister_md_personality(struct mdk_personality *p) 4947 { 4948 printk(KERN_INFO "md: %s personality unregistered\n", p->name); 4949 spin_lock(&pers_lock); 4950 list_del_init(&p->list); 4951 spin_unlock(&pers_lock); 4952 return 0; 4953 } 4954 4955 static int is_mddev_idle(mddev_t *mddev) 4956 { 4957 mdk_rdev_t * rdev; 4958 struct list_head *tmp; 4959 int idle; 4960 unsigned long curr_events; 4961 4962 idle = 1; 4963 ITERATE_RDEV(mddev,rdev,tmp) { 4964 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 4965 curr_events = disk_stat_read(disk, sectors[0]) + 4966 disk_stat_read(disk, sectors[1]) - 4967 atomic_read(&disk->sync_io); 4968 /* The difference between curr_events and last_events 4969 * will be affected by any new non-sync IO (making 4970 * curr_events bigger) and any difference in the amount of 4971 * in-flight syncio (making current_events bigger or smaller) 4972 * The amount in-flight is currently limited to 4973 * 32*64K in raid1/10 and 256*PAGE_SIZE in raid5/6 4974 * which is at most 4096 sectors. 4975 * These numbers are fairly fragile and should be made 4976 * more robust, probably by enforcing the 4977 * 'window size' that md_do_sync sort-of uses. 4978 * 4979 * Note: the following is an unsigned comparison. 
4980 */ 4981 if ((curr_events - rdev->last_events + 4096) > 8192) { 4982 rdev->last_events = curr_events; 4983 idle = 0; 4984 } 4985 } 4986 return idle; 4987 } 4988 4989 void md_done_sync(mddev_t *mddev, int blocks, int ok) 4990 { 4991 /* another "blocks" (512byte) blocks have been synced */ 4992 atomic_sub(blocks, &mddev->recovery_active); 4993 wake_up(&mddev->recovery_wait); 4994 if (!ok) { 4995 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 4996 md_wakeup_thread(mddev->thread); 4997 // stop recovery, signal do_sync .... 4998 } 4999 } 5000 5001 5002 /* md_write_start(mddev, bi) 5003 * If we need to update some array metadata (e.g. 'active' flag 5004 * in superblock) before writing, schedule a superblock update 5005 * and wait for it to complete. 5006 */ 5007 void md_write_start(mddev_t *mddev, struct bio *bi) 5008 { 5009 if (bio_data_dir(bi) != WRITE) 5010 return; 5011 5012 BUG_ON(mddev->ro == 1); 5013 if (mddev->ro == 2) { 5014 /* need to switch to read/write */ 5015 mddev->ro = 0; 5016 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5017 md_wakeup_thread(mddev->thread); 5018 } 5019 atomic_inc(&mddev->writes_pending); 5020 if (mddev->in_sync) { 5021 spin_lock_irq(&mddev->write_lock); 5022 if (mddev->in_sync) { 5023 mddev->in_sync = 0; 5024 mddev->sb_dirty = 3; 5025 md_wakeup_thread(mddev->thread); 5026 } 5027 spin_unlock_irq(&mddev->write_lock); 5028 } 5029 wait_event(mddev->sb_wait, mddev->sb_dirty==0); 5030 } 5031 5032 void md_write_end(mddev_t *mddev) 5033 { 5034 if (atomic_dec_and_test(&mddev->writes_pending)) { 5035 if (mddev->safemode == 2) 5036 md_wakeup_thread(mddev->thread); 5037 else if (mddev->safemode_delay) 5038 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); 5039 } 5040 } 5041 5042 static DECLARE_WAIT_QUEUE_HEAD(resync_wait); 5043 5044 #define SYNC_MARKS 10 5045 #define SYNC_MARK_STEP (3*HZ) 5046 void md_do_sync(mddev_t *mddev) 5047 { 5048 mddev_t *mddev2; 5049 unsigned int currspeed = 0, 5050 window; 5051 sector_t max_sectors,j, io_sectors; 5052 unsigned long mark[SYNC_MARKS]; 5053 sector_t mark_cnt[SYNC_MARKS]; 5054 int last_mark,m; 5055 struct list_head *tmp; 5056 sector_t last_check; 5057 int skipped = 0; 5058 struct list_head *rtmp; 5059 mdk_rdev_t *rdev; 5060 5061 /* just incase thread restarts... */ 5062 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 5063 return; 5064 if (mddev->ro) /* never try to sync a read-only array */ 5065 return; 5066 5067 /* we overload curr_resync somewhat here. 5068 * 0 == not engaged in resync at all 5069 * 2 == checking that there is no conflict with another sync 5070 * 1 == like 2, but have yielded to allow conflicting resync to 5071 * commense 5072 * other == active in resync - this many blocks 5073 * 5074 * Before starting a resync we must have set curr_resync to 5075 * 2, and then checked that every "conflicting" array has curr_resync 5076 * less than ours. When we find one that is the same or higher 5077 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 5078 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 5079 * This will mean we have to start checking from the beginning again. 
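 * The comparison of mddev pointers is only an arbitrary but stable
 * tie-breaker: of two arrays that share devices, the one with the
 * lower address is the one that yields (drops back to 1), so exactly
 * one of them keeps making progress at a time.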
5080 * 5081 */ 5082 5083 do { 5084 mddev->curr_resync = 2; 5085 5086 try_again: 5087 if (kthread_should_stop()) { 5088 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5089 goto skip; 5090 } 5091 ITERATE_MDDEV(mddev2,tmp) { 5092 if (mddev2 == mddev) 5093 continue; 5094 if (mddev2->curr_resync && 5095 match_mddev_units(mddev,mddev2)) { 5096 DEFINE_WAIT(wq); 5097 if (mddev < mddev2 && mddev->curr_resync == 2) { 5098 /* arbitrarily yield */ 5099 mddev->curr_resync = 1; 5100 wake_up(&resync_wait); 5101 } 5102 if (mddev > mddev2 && mddev->curr_resync == 1) 5103 /* no need to wait here, we can wait the next 5104 * time 'round when curr_resync == 2 5105 */ 5106 continue; 5107 prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE); 5108 if (!kthread_should_stop() && 5109 mddev2->curr_resync >= mddev->curr_resync) { 5110 printk(KERN_INFO "md: delaying resync of %s" 5111 " until %s has finished resync (they" 5112 " share one or more physical units)\n", 5113 mdname(mddev), mdname(mddev2)); 5114 mddev_put(mddev2); 5115 schedule(); 5116 finish_wait(&resync_wait, &wq); 5117 goto try_again; 5118 } 5119 finish_wait(&resync_wait, &wq); 5120 } 5121 } 5122 } while (mddev->curr_resync < 2); 5123 5124 j = 0; 5125 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5126 /* resync follows the size requested by the personality, 5127 * which defaults to physical size, but can be virtual size 5128 */ 5129 max_sectors = mddev->resync_max_sectors; 5130 mddev->resync_mismatches = 0; 5131 /* we don't use the checkpoint if there's a bitmap */ 5132 if (!mddev->bitmap && 5133 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 5134 j = mddev->recovery_cp; 5135 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5136 max_sectors = mddev->size << 1; 5137 else { 5138 /* recovery follows the physical size of devices */ 5139 max_sectors = mddev->size << 1; 5140 j = MaxSector; 5141 ITERATE_RDEV(mddev,rdev,rtmp) 5142 if (rdev->raid_disk >= 0 && 5143 !test_bit(Faulty, &rdev->flags) && 5144 !test_bit(In_sync, &rdev->flags) && 5145 rdev->recovery_offset < j) 5146 j = rdev->recovery_offset; 5147 } 5148 5149 printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); 5150 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" 5151 " %d KB/sec/disc.\n", speed_min(mddev)); 5152 printk(KERN_INFO "md: using maximum available idle IO bandwidth " 5153 "(but not more than %d KB/sec) for reconstruction.\n", 5154 speed_max(mddev)); 5155 5156 is_mddev_idle(mddev); /* this also initializes IO event counters */ 5157 5158 io_sectors = 0; 5159 for (m = 0; m < SYNC_MARKS; m++) { 5160 mark[m] = jiffies; 5161 mark_cnt[m] = io_sectors; 5162 } 5163 last_mark = 0; 5164 mddev->resync_mark = mark[last_mark]; 5165 mddev->resync_mark_cnt = mark_cnt[last_mark]; 5166 5167 /* 5168 * Tune reconstruction: 5169 */ 5170 window = 32*(PAGE_SIZE/512); 5171 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", 5172 window/2,(unsigned long long) max_sectors/2); 5173 5174 atomic_set(&mddev->recovery_active, 0); 5175 init_waitqueue_head(&mddev->recovery_wait); 5176 last_check = 0; 5177 5178 if (j>2) { 5179 printk(KERN_INFO 5180 "md: resuming recovery of %s from checkpoint.\n", 5181 mdname(mddev)); 5182 mddev->curr_resync = j; 5183 } 5184 5185 while (j < max_sectors) { 5186 sector_t sectors; 5187 5188 skipped = 0; 5189 sectors = mddev->pers->sync_request(mddev, j, &skipped, 5190 currspeed < speed_min(mddev)); 5191 if (sectors == 0) { 5192 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 5193 goto out; 5194 } 5195 5196 if (!skipped) { /* 
actual IO requested */ 5197 io_sectors += sectors; 5198 atomic_add(sectors, &mddev->recovery_active); 5199 } 5200 5201 j += sectors; 5202 if (j>1) mddev->curr_resync = j; 5203 if (last_check == 0) 5204 /* this is the earliers that rebuilt will be 5205 * visible in /proc/mdstat 5206 */ 5207 md_new_event(mddev); 5208 5209 if (last_check + window > io_sectors || j == max_sectors) 5210 continue; 5211 5212 last_check = io_sectors; 5213 5214 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || 5215 test_bit(MD_RECOVERY_ERR, &mddev->recovery)) 5216 break; 5217 5218 repeat: 5219 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 5220 /* step marks */ 5221 int next = (last_mark+1) % SYNC_MARKS; 5222 5223 mddev->resync_mark = mark[next]; 5224 mddev->resync_mark_cnt = mark_cnt[next]; 5225 mark[next] = jiffies; 5226 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 5227 last_mark = next; 5228 } 5229 5230 5231 if (kthread_should_stop()) { 5232 /* 5233 * got a signal, exit. 5234 */ 5235 printk(KERN_INFO 5236 "md: md_do_sync() got signal ... exiting\n"); 5237 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5238 goto out; 5239 } 5240 5241 /* 5242 * this loop exits only if either when we are slower than 5243 * the 'hard' speed limit, or the system was IO-idle for 5244 * a jiffy. 5245 * the system might be non-idle CPU-wise, but we only care 5246 * about not overloading the IO subsystem. (things like an 5247 * e2fsck being done on the RAID array should execute fast) 5248 */ 5249 mddev->queue->unplug_fn(mddev->queue); 5250 cond_resched(); 5251 5252 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 5253 /((jiffies-mddev->resync_mark)/HZ +1) +1; 5254 5255 if (currspeed > speed_min(mddev)) { 5256 if ((currspeed > speed_max(mddev)) || 5257 !is_mddev_idle(mddev)) { 5258 msleep(500); 5259 goto repeat; 5260 } 5261 } 5262 } 5263 printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev)); 5264 /* 5265 * this also signals 'finished resyncing' to md_stop 5266 */ 5267 out: 5268 mddev->queue->unplug_fn(mddev->queue); 5269 5270 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 5271 5272 /* tell personality that we are finished */ 5273 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); 5274 5275 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 5276 test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 5277 !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 5278 mddev->curr_resync > 2) { 5279 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5280 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 5281 if (mddev->curr_resync >= mddev->recovery_cp) { 5282 printk(KERN_INFO 5283 "md: checkpointing recovery of %s.\n", 5284 mdname(mddev)); 5285 mddev->recovery_cp = mddev->curr_resync; 5286 } 5287 } else 5288 mddev->recovery_cp = MaxSector; 5289 } else { 5290 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5291 mddev->curr_resync = MaxSector; 5292 ITERATE_RDEV(mddev,rdev,rtmp) 5293 if (rdev->raid_disk >= 0 && 5294 !test_bit(Faulty, &rdev->flags) && 5295 !test_bit(In_sync, &rdev->flags) && 5296 rdev->recovery_offset < mddev->curr_resync) 5297 rdev->recovery_offset = mddev->curr_resync; 5298 mddev->sb_dirty = 1; 5299 } 5300 } 5301 5302 skip: 5303 mddev->curr_resync = 0; 5304 wake_up(&resync_wait); 5305 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 5306 md_wakeup_thread(mddev->thread); 5307 } 5308 EXPORT_SYMBOL_GPL(md_do_sync); 5309 5310 5311 /* 5312 * This routine is regularly called by all per-raid-array threads to 5313 * deal with generic issues like resync 
and super-block update. 5314 * RAID personalities that don't have a thread (linear/raid0) do not 5315 * need this as they never do any recovery or update the superblock. 5316 * 5317 * It does not do any resync itself, but rather "forks" off other threads 5318 * to do that as needed. 5319 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in 5320 * "->recovery" and create a thread at ->sync_thread. 5321 * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR) 5322 * and wakes up this thread which will reap the thread and finish up. 5323 * This thread also removes any faulty devices (with nr_pending == 0). 5324 * 5325 * The overall approach is: 5326 * 1/ if the superblock needs updating, update it. 5327 * 2/ If a recovery thread is running, don't do anything else. 5328 * 3/ If recovery has finished, clean up, possibly marking spares active. 5329 * 4/ If there are any faulty devices, remove them. 5330 * 5/ If array is degraded, try to add spare devices 5331 * 6/ If array has spares or is not in-sync, start a resync thread. 5332 */ 5333 void md_check_recovery(mddev_t *mddev) 5334 { 5335 mdk_rdev_t *rdev; 5336 struct list_head *rtmp; 5337 5338 5339 if (mddev->bitmap) 5340 bitmap_daemon_work(mddev->bitmap); 5341 5342 if (mddev->ro) 5343 return; 5344 5345 if (signal_pending(current)) { 5346 if (mddev->pers->sync_request) { 5347 printk(KERN_INFO "md: %s in immediate safe mode\n", 5348 mdname(mddev)); 5349 mddev->safemode = 2; 5350 } 5351 flush_signals(current); 5352 } 5353 5354 if ( ! ( 5355 mddev->sb_dirty || 5356 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 5357 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 5358 (mddev->safemode == 1) || 5359 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) 5360 && !mddev->in_sync && mddev->recovery_cp == MaxSector) 5361 )) 5362 return; 5363 5364 if (mddev_trylock(mddev)) { 5365 int spares =0; 5366 5367 spin_lock_irq(&mddev->write_lock); 5368 if (mddev->safemode && !atomic_read(&mddev->writes_pending) && 5369 !mddev->in_sync && mddev->recovery_cp == MaxSector) { 5370 mddev->in_sync = 1; 5371 mddev->sb_dirty = 3; 5372 } 5373 if (mddev->safemode == 1) 5374 mddev->safemode = 0; 5375 spin_unlock_irq(&mddev->write_lock); 5376 5377 if (mddev->sb_dirty) 5378 md_update_sb(mddev); 5379 5380 5381 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 5382 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 5383 /* resync/recovery still happening */ 5384 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5385 goto unlock; 5386 } 5387 if (mddev->sync_thread) { 5388 /* resync has finished, collect result */ 5389 md_unregister_thread(mddev->sync_thread); 5390 mddev->sync_thread = NULL; 5391 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 5392 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 5393 /* success...*/ 5394 /* activate any spares */ 5395 mddev->pers->spare_active(mddev); 5396 } 5397 md_update_sb(mddev); 5398 5399 /* if array is no longer degraded, then any saved_raid_disk 5400 * information must be scrapped 5401 */ 5402 if (!mddev->degraded) 5403 ITERATE_RDEV(mddev,rdev,rtmp) 5404 rdev->saved_raid_disk = -1; 5405 5406 mddev->recovery = 0; 5407 /* flag recovery needed just to double check */ 5408 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5409 md_new_event(mddev); 5410 goto unlock; 5411 } 5412 /* Clear some bits that don't mean anything, but 5413 * might be left set 5414 */ 5415 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5416 clear_bit(MD_RECOVERY_ERR, &mddev->recovery); 5417
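		/*
		 * In the hot-remove/hot-add loops further below, the "rd%d"
		 * symlinks under /sys/block/mdX/md/ are what user space sees
		 * as the slot-to-member-device mapping: they are removed for
		 * failed members and created again when a spare is slotted in.
		 */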
clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 5418 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 5419 5420 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 5421 goto unlock; 5422 /* no recovery is running. 5423 * remove any failed drives, then 5424 * add spares if possible. 5425 * Spare are also removed and re-added, to allow 5426 * the personality to fail the re-add. 5427 */ 5428 ITERATE_RDEV(mddev,rdev,rtmp) 5429 if (rdev->raid_disk >= 0 && 5430 (test_bit(Faulty, &rdev->flags) || ! test_bit(In_sync, &rdev->flags)) && 5431 atomic_read(&rdev->nr_pending)==0) { 5432 if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) { 5433 char nm[20]; 5434 sprintf(nm,"rd%d", rdev->raid_disk); 5435 sysfs_remove_link(&mddev->kobj, nm); 5436 rdev->raid_disk = -1; 5437 } 5438 } 5439 5440 if (mddev->degraded) { 5441 ITERATE_RDEV(mddev,rdev,rtmp) 5442 if (rdev->raid_disk < 0 5443 && !test_bit(Faulty, &rdev->flags)) { 5444 rdev->recovery_offset = 0; 5445 if (mddev->pers->hot_add_disk(mddev,rdev)) { 5446 char nm[20]; 5447 sprintf(nm, "rd%d", rdev->raid_disk); 5448 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); 5449 spares++; 5450 md_new_event(mddev); 5451 } else 5452 break; 5453 } 5454 } 5455 5456 if (spares) { 5457 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5458 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5459 } else if (mddev->recovery_cp < MaxSector) { 5460 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5461 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 5462 /* nothing to be done ... */ 5463 goto unlock; 5464 5465 if (mddev->pers->sync_request) { 5466 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 5467 if (spares && mddev->bitmap && ! mddev->bitmap->file) { 5468 /* We are adding a device or devices to an array 5469 * which has the bitmap stored on all devices. 5470 * So make sure all bitmap pages get written 5471 */ 5472 bitmap_write_all(mddev->bitmap); 5473 } 5474 mddev->sync_thread = md_register_thread(md_do_sync, 5475 mddev, 5476 "%s_resync"); 5477 if (!mddev->sync_thread) { 5478 printk(KERN_ERR "%s: could not start resync" 5479 " thread...\n", 5480 mdname(mddev)); 5481 /* leave the spares where they are, it shouldn't hurt */ 5482 mddev->recovery = 0; 5483 } else 5484 md_wakeup_thread(mddev->sync_thread); 5485 md_new_event(mddev); 5486 } 5487 unlock: 5488 mddev_unlock(mddev); 5489 } 5490 } 5491 5492 static int md_notify_reboot(struct notifier_block *this, 5493 unsigned long code, void *x) 5494 { 5495 struct list_head *tmp; 5496 mddev_t *mddev; 5497 5498 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { 5499 5500 printk(KERN_INFO "md: stopping all md devices.\n"); 5501 5502 ITERATE_MDDEV(mddev,tmp) 5503 if (mddev_trylock(mddev)) { 5504 do_md_stop (mddev, 1); 5505 mddev_unlock(mddev); 5506 } 5507 /* 5508 * certain more exotic SCSI devices are known to be 5509 * volatile wrt too early system reboots. While the 5510 * right place to handle this issue is the given 5511 * driver, we do want to have a safe RAID driver ... 
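 * The mdelay() below therefore gives such devices a full second to
 * quiesce before the reboot or halt continues.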
5512 */ 5513 mdelay(1000*1); 5514 } 5515 return NOTIFY_DONE; 5516 } 5517 5518 static struct notifier_block md_notifier = { 5519 .notifier_call = md_notify_reboot, 5520 .next = NULL, 5521 .priority = INT_MAX, /* before any real devices */ 5522 }; 5523 5524 static void md_geninit(void) 5525 { 5526 struct proc_dir_entry *p; 5527 5528 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 5529 5530 p = create_proc_entry("mdstat", S_IRUGO, NULL); 5531 if (p) 5532 p->proc_fops = &md_seq_fops; 5533 } 5534 5535 static int __init md_init(void) 5536 { 5537 printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d," 5538 " MD_SB_DISKS=%d\n", 5539 MD_MAJOR_VERSION, MD_MINOR_VERSION, 5540 MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); 5541 printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR_HI, 5542 BITMAP_MINOR); 5543 5544 if (register_blkdev(MAJOR_NR, "md")) 5545 return -1; 5546 if ((mdp_major=register_blkdev(0, "mdp"))<=0) { 5547 unregister_blkdev(MAJOR_NR, "md"); 5548 return -1; 5549 } 5550 blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE, 5551 md_probe, NULL, NULL); 5552 blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE, 5553 md_probe, NULL, NULL); 5554 5555 register_reboot_notifier(&md_notifier); 5556 raid_table_header = register_sysctl_table(raid_root_table, 1); 5557 5558 md_geninit(); 5559 return (0); 5560 } 5561 5562 5563 #ifndef MODULE 5564 5565 /* 5566 * Searches all registered partitions for autorun RAID arrays 5567 * at boot time. 5568 */ 5569 static dev_t detected_devices[128]; 5570 static int dev_cnt; 5571 5572 void md_autodetect_dev(dev_t dev) 5573 { 5574 if (dev_cnt >= 0 && dev_cnt < 127) 5575 detected_devices[dev_cnt++] = dev; 5576 } 5577 5578 5579 static void autostart_arrays(int part) 5580 { 5581 mdk_rdev_t *rdev; 5582 int i; 5583 5584 printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 5585 5586 for (i = 0; i < dev_cnt; i++) { 5587 dev_t dev = detected_devices[i]; 5588 5589 rdev = md_import_device(dev,0, 0); 5590 if (IS_ERR(rdev)) 5591 continue; 5592 5593 if (test_bit(Faulty, &rdev->flags)) { 5594 MD_BUG(); 5595 continue; 5596 } 5597 list_add(&rdev->same_set, &pending_raid_disks); 5598 } 5599 dev_cnt = 0; 5600 5601 autorun_devices(part); 5602 } 5603 5604 #endif 5605 5606 static __exit void md_exit(void) 5607 { 5608 mddev_t *mddev; 5609 struct list_head *tmp; 5610 5611 blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS); 5612 blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift); 5613 5614 unregister_blkdev(MAJOR_NR,"md"); 5615 unregister_blkdev(mdp_major, "mdp"); 5616 unregister_reboot_notifier(&md_notifier); 5617 unregister_sysctl_table(raid_table_header); 5618 remove_proc_entry("mdstat", NULL); 5619 ITERATE_MDDEV(mddev,tmp) { 5620 struct gendisk *disk = mddev->gendisk; 5621 if (!disk) 5622 continue; 5623 export_array(mddev); 5624 del_gendisk(disk); 5625 put_disk(disk); 5626 mddev->gendisk = NULL; 5627 mddev_put(mddev); 5628 } 5629 } 5630 5631 module_init(md_init) 5632 module_exit(md_exit) 5633 5634 static int get_ro(char *buffer, struct kernel_param *kp) 5635 { 5636 return sprintf(buffer, "%d", start_readonly); 5637 } 5638 static int set_ro(const char *val, struct kernel_param *kp) 5639 { 5640 char *e; 5641 int num = simple_strtoul(val, &e, 10); 5642 if (*val && (*e == '\0' || *e == '\n')) { 5643 start_readonly = num; 5644 return 0; 5645 } 5646 return -EINVAL; 5647 } 5648 5649 module_param_call(start_ro, set_ro, get_ro, NULL, 0600); 5650 module_param(start_dirty_degraded, int, 0644); 5651 
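/*
 * Typical usage (assuming the usual md-mod module name): both knobs can
 * be set at load time, e.g. "modprobe md-mod start_ro=1", or changed at
 * runtime through /sys/module/md_mod/parameters/{start_ro,
 * start_dirty_degraded}, subject to the 0600/0644 permissions requested
 * above.
 */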
5652
5653 EXPORT_SYMBOL(register_md_personality);
5654 EXPORT_SYMBOL(unregister_md_personality);
5655 EXPORT_SYMBOL(md_error);
5656 EXPORT_SYMBOL(md_done_sync);
5657 EXPORT_SYMBOL(md_write_start);
5658 EXPORT_SYMBOL(md_write_end);
5659 EXPORT_SYMBOL(md_register_thread);
5660 EXPORT_SYMBOL(md_unregister_thread);
5661 EXPORT_SYMBOL(md_wakeup_thread);
5662 EXPORT_SYMBOL(md_check_recovery);
5663 MODULE_LICENSE("GPL");
5664 MODULE_ALIAS("md");
5665 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
5666