1 /* 2 md.c : Multiple Devices driver for Linux 3 Copyright (C) 1998, 1999, 2000 Ingo Molnar 4 5 completely rewritten, based on the MD driver code from Marc Zyngier 6 7 Changes: 8 9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar 10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> 11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> 12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su> 13 - kmod support by: Cyrus Durgin 14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> 15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> 16 17 - lots of fixes and improvements to the RAID1/RAID5 and generic 18 RAID code (such as request based resynchronization): 19 20 Neil Brown <neilb@cse.unsw.edu.au>. 21 22 - persistent bitmap code 23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. 24 25 This program is free software; you can redistribute it and/or modify 26 it under the terms of the GNU General Public License as published by 27 the Free Software Foundation; either version 2, or (at your option) 28 any later version. 29 30 You should have received a copy of the GNU General Public License 31 (for example /usr/src/linux/COPYING); if not, write to the Free 32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 33 */ 34 35 #include <linux/kthread.h> 36 #include <linux/blkdev.h> 37 #include <linux/sysctl.h> 38 #include <linux/seq_file.h> 39 #include <linux/buffer_head.h> /* for invalidate_bdev */ 40 #include <linux/poll.h> 41 #include <linux/ctype.h> 42 #include <linux/hdreg.h> 43 #include <linux/proc_fs.h> 44 #include <linux/random.h> 45 #include <linux/reboot.h> 46 #include <linux/file.h> 47 #include <linux/delay.h> 48 #include <linux/raid/md_p.h> 49 #include <linux/raid/md_u.h> 50 #include "md.h" 51 #include "bitmap.h" 52 53 #define DEBUG 0 54 #define dprintk(x...) ((void)(DEBUG && printk(x))) 55 56 57 #ifndef MODULE 58 static void autostart_arrays(int part); 59 #endif 60 61 static LIST_HEAD(pers_list); 62 static DEFINE_SPINLOCK(pers_lock); 63 64 static void md_print_devices(void); 65 66 static DECLARE_WAIT_QUEUE_HEAD(resync_wait); 67 68 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } 69 70 /* 71 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 72 * is 1000 KB/sec, so the extra system load does not show up that much. 73 * Increase it if you want to have more _guaranteed_ speed. Note that 74 * the RAID driver will use the maximum available bandwidth if the IO 75 * subsystem is idle. There is also an 'absolute maximum' reconstruction 76 * speed limit - in case reconstruction slows down your system despite 77 * idle IO detection. 78 * 79 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. 80 * or /sys/block/mdX/md/sync_speed_{min,max} 81 */ 82 83 static int sysctl_speed_limit_min = 1000; 84 static int sysctl_speed_limit_max = 200000; 85 static inline int speed_min(mddev_t *mddev) 86 { 87 return mddev->sync_speed_min ? 88 mddev->sync_speed_min : sysctl_speed_limit_min; 89 } 90 91 static inline int speed_max(mddev_t *mddev) 92 { 93 return mddev->sync_speed_max ? 
		mddev->sync_speed_max : sysctl_speed_limit_max;
}

static struct ctl_table_header *raid_table_header;

static ctl_table raid_table[] = {
	{
		.ctl_name	= DEV_RAID_SPEED_LIMIT_MIN,
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= DEV_RAID_SPEED_LIMIT_MAX,
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= &proc_dointvec,
	},
	{ .ctl_name = 0 }
};

static ctl_table raid_dir_table[] = {
	{
		.ctl_name	= DEV_RAID,
		.procname	= "raid",
		.maxlen		= 0,
		.mode		= S_IRUGO|S_IXUGO,
		.child		= raid_table,
	},
	{ .ctl_name = 0 }
};

static ctl_table raid_root_table[] = {
	{
		.ctl_name	= CTL_DEV,
		.procname	= "dev",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_dir_table,
	},
	{ .ctl_name = 0 }
};

static struct block_device_operations md_fops;

static int start_readonly;

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
void md_new_event(mddev_t *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);

/* Alternate version that can be called from interrupts
 * when calling sysfs_notify isn't needed.
 */
static void md_new_event_inintr(mddev_t *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}

/*
 * Enables iteration over all existing md arrays.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);


/*
 * iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while owning
 * a reference to the current mddev must mddev_put it.
 */
#define for_each_mddev(mddev,tmp)					\
									\
	for (({ spin_lock(&all_mddevs_lock);				\
		tmp = all_mddevs.next;					\
		mddev = NULL;});					\
	     ({ if (tmp != &all_mddevs)					\
			mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
		spin_unlock(&all_mddevs_lock);				\
		if (mddev) mddev_put(mddev);				\
		mddev = list_entry(tmp, mddev_t, all_mddevs);		\
		tmp != &all_mddevs;});					\
	     ({ spin_lock(&all_mddevs_lock);				\
		tmp = tmp->next;})					\
	     )


/* Rather than calling directly into the personality make_request function,
 * IO requests come here first so that we can check if the device is
 * being suspended pending a reconfiguration.
 * We hold a refcount over the call to ->make_request. By the time that
 * call has finished, the bio has been linked into some internal structure
 * and so is visible to ->quiesce(), so we don't need the refcount any more.
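 *
 * A minimal sketch of how that suspend window is used (illustrative
 * only; the caller shown here is hypothetical, but mddev_suspend()
 * and mddev_resume() are the helpers defined below):
 *
 *	mddev_suspend(mddev);	// new IO blocks in md_make_request(),
 *				// in-flight IO drains via active_io
 *	... reconfigure the array / personality ...
 *	mddev_resume(mddev);	// wakes waiters on sb_wait, unquiesces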
210 */ 211 static int md_make_request(struct request_queue *q, struct bio *bio) 212 { 213 mddev_t *mddev = q->queuedata; 214 int rv; 215 if (mddev == NULL || mddev->pers == NULL) { 216 bio_io_error(bio); 217 return 0; 218 } 219 rcu_read_lock(); 220 if (mddev->suspended) { 221 DEFINE_WAIT(__wait); 222 for (;;) { 223 prepare_to_wait(&mddev->sb_wait, &__wait, 224 TASK_UNINTERRUPTIBLE); 225 if (!mddev->suspended) 226 break; 227 rcu_read_unlock(); 228 schedule(); 229 rcu_read_lock(); 230 } 231 finish_wait(&mddev->sb_wait, &__wait); 232 } 233 atomic_inc(&mddev->active_io); 234 rcu_read_unlock(); 235 rv = mddev->pers->make_request(q, bio); 236 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) 237 wake_up(&mddev->sb_wait); 238 239 return rv; 240 } 241 242 static void mddev_suspend(mddev_t *mddev) 243 { 244 BUG_ON(mddev->suspended); 245 mddev->suspended = 1; 246 synchronize_rcu(); 247 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); 248 mddev->pers->quiesce(mddev, 1); 249 md_unregister_thread(mddev->thread); 250 mddev->thread = NULL; 251 /* we now know that no code is executing in the personality module, 252 * except possibly the tail end of a ->bi_end_io function, but that 253 * is certain to complete before the module has a chance to get 254 * unloaded 255 */ 256 } 257 258 static void mddev_resume(mddev_t *mddev) 259 { 260 mddev->suspended = 0; 261 wake_up(&mddev->sb_wait); 262 mddev->pers->quiesce(mddev, 0); 263 } 264 265 266 static inline mddev_t *mddev_get(mddev_t *mddev) 267 { 268 atomic_inc(&mddev->active); 269 return mddev; 270 } 271 272 static void mddev_delayed_delete(struct work_struct *ws); 273 274 static void mddev_put(mddev_t *mddev) 275 { 276 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 277 return; 278 if (!mddev->raid_disks && list_empty(&mddev->disks) && 279 !mddev->hold_active) { 280 list_del(&mddev->all_mddevs); 281 if (mddev->gendisk) { 282 /* we did a probe so need to clean up. 283 * Call schedule_work inside the spinlock 284 * so that flush_scheduled_work() after 285 * mddev_find will succeed in waiting for the 286 * work to be done. 287 */ 288 INIT_WORK(&mddev->del_work, mddev_delayed_delete); 289 schedule_work(&mddev->del_work); 290 } else 291 kfree(mddev); 292 } 293 spin_unlock(&all_mddevs_lock); 294 } 295 296 static mddev_t * mddev_find(dev_t unit) 297 { 298 mddev_t *mddev, *new = NULL; 299 300 retry: 301 spin_lock(&all_mddevs_lock); 302 303 if (unit) { 304 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 305 if (mddev->unit == unit) { 306 mddev_get(mddev); 307 spin_unlock(&all_mddevs_lock); 308 kfree(new); 309 return mddev; 310 } 311 312 if (new) { 313 list_add(&new->all_mddevs, &all_mddevs); 314 spin_unlock(&all_mddevs_lock); 315 new->hold_active = UNTIL_IOCTL; 316 return new; 317 } 318 } else if (new) { 319 /* find an unused unit number */ 320 static int next_minor = 512; 321 int start = next_minor; 322 int is_free = 0; 323 int dev = 0; 324 while (!is_free) { 325 dev = MKDEV(MD_MAJOR, next_minor); 326 next_minor++; 327 if (next_minor > MINORMASK) 328 next_minor = 0; 329 if (next_minor == start) { 330 /* Oh dear, all in use. 
*/ 331 spin_unlock(&all_mddevs_lock); 332 kfree(new); 333 return NULL; 334 } 335 336 is_free = 1; 337 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 338 if (mddev->unit == dev) { 339 is_free = 0; 340 break; 341 } 342 } 343 new->unit = dev; 344 new->md_minor = MINOR(dev); 345 new->hold_active = UNTIL_STOP; 346 list_add(&new->all_mddevs, &all_mddevs); 347 spin_unlock(&all_mddevs_lock); 348 return new; 349 } 350 spin_unlock(&all_mddevs_lock); 351 352 new = kzalloc(sizeof(*new), GFP_KERNEL); 353 if (!new) 354 return NULL; 355 356 new->unit = unit; 357 if (MAJOR(unit) == MD_MAJOR) 358 new->md_minor = MINOR(unit); 359 else 360 new->md_minor = MINOR(unit) >> MdpMinorShift; 361 362 mutex_init(&new->reconfig_mutex); 363 INIT_LIST_HEAD(&new->disks); 364 INIT_LIST_HEAD(&new->all_mddevs); 365 init_timer(&new->safemode_timer); 366 atomic_set(&new->active, 1); 367 atomic_set(&new->openers, 0); 368 atomic_set(&new->active_io, 0); 369 spin_lock_init(&new->write_lock); 370 init_waitqueue_head(&new->sb_wait); 371 init_waitqueue_head(&new->recovery_wait); 372 new->reshape_position = MaxSector; 373 new->resync_min = 0; 374 new->resync_max = MaxSector; 375 new->level = LEVEL_NONE; 376 377 goto retry; 378 } 379 380 static inline int mddev_lock(mddev_t * mddev) 381 { 382 return mutex_lock_interruptible(&mddev->reconfig_mutex); 383 } 384 385 static inline int mddev_is_locked(mddev_t *mddev) 386 { 387 return mutex_is_locked(&mddev->reconfig_mutex); 388 } 389 390 static inline int mddev_trylock(mddev_t * mddev) 391 { 392 return mutex_trylock(&mddev->reconfig_mutex); 393 } 394 395 static inline void mddev_unlock(mddev_t * mddev) 396 { 397 mutex_unlock(&mddev->reconfig_mutex); 398 399 md_wakeup_thread(mddev->thread); 400 } 401 402 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) 403 { 404 mdk_rdev_t *rdev; 405 406 list_for_each_entry(rdev, &mddev->disks, same_set) 407 if (rdev->desc_nr == nr) 408 return rdev; 409 410 return NULL; 411 } 412 413 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) 414 { 415 mdk_rdev_t *rdev; 416 417 list_for_each_entry(rdev, &mddev->disks, same_set) 418 if (rdev->bdev->bd_dev == dev) 419 return rdev; 420 421 return NULL; 422 } 423 424 static struct mdk_personality *find_pers(int level, char *clevel) 425 { 426 struct mdk_personality *pers; 427 list_for_each_entry(pers, &pers_list, list) { 428 if (level != LEVEL_NONE && pers->level == level) 429 return pers; 430 if (strcmp(pers->name, clevel)==0) 431 return pers; 432 } 433 return NULL; 434 } 435 436 /* return the offset of the super block in 512byte sectors */ 437 static inline sector_t calc_dev_sboffset(struct block_device *bdev) 438 { 439 sector_t num_sectors = bdev->bd_inode->i_size / 512; 440 return MD_NEW_SIZE_SECTORS(num_sectors); 441 } 442 443 static int alloc_disk_sb(mdk_rdev_t * rdev) 444 { 445 if (rdev->sb_page) 446 MD_BUG(); 447 448 rdev->sb_page = alloc_page(GFP_KERNEL); 449 if (!rdev->sb_page) { 450 printk(KERN_ALERT "md: out of memory.\n"); 451 return -ENOMEM; 452 } 453 454 return 0; 455 } 456 457 static void free_disk_sb(mdk_rdev_t * rdev) 458 { 459 if (rdev->sb_page) { 460 put_page(rdev->sb_page); 461 rdev->sb_loaded = 0; 462 rdev->sb_page = NULL; 463 rdev->sb_start = 0; 464 rdev->sectors = 0; 465 } 466 } 467 468 469 static void super_written(struct bio *bio, int error) 470 { 471 mdk_rdev_t *rdev = bio->bi_private; 472 mddev_t *mddev = rdev->mddev; 473 474 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) { 475 printk("md: super_written gets error=%d, uptodate=%d\n", 476 error, test_bit(BIO_UPTODATE, 
&bio->bi_flags)); 477 WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags)); 478 md_error(mddev, rdev); 479 } 480 481 if (atomic_dec_and_test(&mddev->pending_writes)) 482 wake_up(&mddev->sb_wait); 483 bio_put(bio); 484 } 485 486 static void super_written_barrier(struct bio *bio, int error) 487 { 488 struct bio *bio2 = bio->bi_private; 489 mdk_rdev_t *rdev = bio2->bi_private; 490 mddev_t *mddev = rdev->mddev; 491 492 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && 493 error == -EOPNOTSUPP) { 494 unsigned long flags; 495 /* barriers don't appear to be supported :-( */ 496 set_bit(BarriersNotsupp, &rdev->flags); 497 mddev->barriers_work = 0; 498 spin_lock_irqsave(&mddev->write_lock, flags); 499 bio2->bi_next = mddev->biolist; 500 mddev->biolist = bio2; 501 spin_unlock_irqrestore(&mddev->write_lock, flags); 502 wake_up(&mddev->sb_wait); 503 bio_put(bio); 504 } else { 505 bio_put(bio2); 506 bio->bi_private = rdev; 507 super_written(bio, error); 508 } 509 } 510 511 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 512 sector_t sector, int size, struct page *page) 513 { 514 /* write first size bytes of page to sector of rdev 515 * Increment mddev->pending_writes before returning 516 * and decrement it on completion, waking up sb_wait 517 * if zero is reached. 518 * If an error occurred, call md_error 519 * 520 * As we might need to resubmit the request if BIO_RW_BARRIER 521 * causes ENOTSUPP, we allocate a spare bio... 522 */ 523 struct bio *bio = bio_alloc(GFP_NOIO, 1); 524 int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG); 525 526 bio->bi_bdev = rdev->bdev; 527 bio->bi_sector = sector; 528 bio_add_page(bio, page, size, 0); 529 bio->bi_private = rdev; 530 bio->bi_end_io = super_written; 531 bio->bi_rw = rw; 532 533 atomic_inc(&mddev->pending_writes); 534 if (!test_bit(BarriersNotsupp, &rdev->flags)) { 535 struct bio *rbio; 536 rw |= (1<<BIO_RW_BARRIER); 537 rbio = bio_clone(bio, GFP_NOIO); 538 rbio->bi_private = bio; 539 rbio->bi_end_io = super_written_barrier; 540 submit_bio(rw, rbio); 541 } else 542 submit_bio(rw, bio); 543 } 544 545 void md_super_wait(mddev_t *mddev) 546 { 547 /* wait for all superblock writes that were scheduled to complete. 
548 * if any had to be retried (due to BARRIER problems), retry them 549 */ 550 DEFINE_WAIT(wq); 551 for(;;) { 552 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); 553 if (atomic_read(&mddev->pending_writes)==0) 554 break; 555 while (mddev->biolist) { 556 struct bio *bio; 557 spin_lock_irq(&mddev->write_lock); 558 bio = mddev->biolist; 559 mddev->biolist = bio->bi_next ; 560 bio->bi_next = NULL; 561 spin_unlock_irq(&mddev->write_lock); 562 submit_bio(bio->bi_rw, bio); 563 } 564 schedule(); 565 } 566 finish_wait(&mddev->sb_wait, &wq); 567 } 568 569 static void bi_complete(struct bio *bio, int error) 570 { 571 complete((struct completion*)bio->bi_private); 572 } 573 574 int sync_page_io(struct block_device *bdev, sector_t sector, int size, 575 struct page *page, int rw) 576 { 577 struct bio *bio = bio_alloc(GFP_NOIO, 1); 578 struct completion event; 579 int ret; 580 581 rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); 582 583 bio->bi_bdev = bdev; 584 bio->bi_sector = sector; 585 bio_add_page(bio, page, size, 0); 586 init_completion(&event); 587 bio->bi_private = &event; 588 bio->bi_end_io = bi_complete; 589 submit_bio(rw, bio); 590 wait_for_completion(&event); 591 592 ret = test_bit(BIO_UPTODATE, &bio->bi_flags); 593 bio_put(bio); 594 return ret; 595 } 596 EXPORT_SYMBOL_GPL(sync_page_io); 597 598 static int read_disk_sb(mdk_rdev_t * rdev, int size) 599 { 600 char b[BDEVNAME_SIZE]; 601 if (!rdev->sb_page) { 602 MD_BUG(); 603 return -EINVAL; 604 } 605 if (rdev->sb_loaded) 606 return 0; 607 608 609 if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ)) 610 goto fail; 611 rdev->sb_loaded = 1; 612 return 0; 613 614 fail: 615 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", 616 bdevname(rdev->bdev,b)); 617 return -EINVAL; 618 } 619 620 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 621 { 622 return sb1->set_uuid0 == sb2->set_uuid0 && 623 sb1->set_uuid1 == sb2->set_uuid1 && 624 sb1->set_uuid2 == sb2->set_uuid2 && 625 sb1->set_uuid3 == sb2->set_uuid3; 626 } 627 628 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 629 { 630 int ret; 631 mdp_super_t *tmp1, *tmp2; 632 633 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 634 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 635 636 if (!tmp1 || !tmp2) { 637 ret = 0; 638 printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n"); 639 goto abort; 640 } 641 642 *tmp1 = *sb1; 643 *tmp2 = *sb2; 644 645 /* 646 * nr_disks is not constant 647 */ 648 tmp1->nr_disks = 0; 649 tmp2->nr_disks = 0; 650 651 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); 652 abort: 653 kfree(tmp1); 654 kfree(tmp2); 655 return ret; 656 } 657 658 659 static u32 md_csum_fold(u32 csum) 660 { 661 csum = (csum & 0xffff) + (csum >> 16); 662 return (csum & 0xffff) + (csum >> 16); 663 } 664 665 static unsigned int calc_sb_csum(mdp_super_t * sb) 666 { 667 u64 newcsum = 0; 668 u32 *sb32 = (u32*)sb; 669 int i; 670 unsigned int disk_csum, csum; 671 672 disk_csum = sb->sb_csum; 673 sb->sb_csum = 0; 674 675 for (i = 0; i < MD_SB_BYTES/4 ; i++) 676 newcsum += sb32[i]; 677 csum = (newcsum & 0xffffffff) + (newcsum>>32); 678 679 680 #ifdef CONFIG_ALPHA 681 /* This used to use csum_partial, which was wrong for several 682 * reasons including that different results are returned on 683 * different architectures. It isn't critical that we get exactly 684 * the same return value as before (we always csum_fold before 685 * testing, and that removes any differences). 
However as we 686 * know that csum_partial always returned a 16bit value on 687 * alphas, do a fold to maximise conformity to previous behaviour. 688 */ 689 sb->sb_csum = md_csum_fold(disk_csum); 690 #else 691 sb->sb_csum = disk_csum; 692 #endif 693 return csum; 694 } 695 696 697 /* 698 * Handle superblock details. 699 * We want to be able to handle multiple superblock formats 700 * so we have a common interface to them all, and an array of 701 * different handlers. 702 * We rely on user-space to write the initial superblock, and support 703 * reading and updating of superblocks. 704 * Interface methods are: 705 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) 706 * loads and validates a superblock on dev. 707 * if refdev != NULL, compare superblocks on both devices 708 * Return: 709 * 0 - dev has a superblock that is compatible with refdev 710 * 1 - dev has a superblock that is compatible and newer than refdev 711 * so dev should be used as the refdev in future 712 * -EINVAL superblock incompatible or invalid 713 * -othererror e.g. -EIO 714 * 715 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) 716 * Verify that dev is acceptable into mddev. 717 * The first time, mddev->raid_disks will be 0, and data from 718 * dev should be merged in. Subsequent calls check that dev 719 * is new enough. Return 0 or -EINVAL 720 * 721 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) 722 * Update the superblock for rdev with data in mddev 723 * This does not write to disc. 724 * 725 */ 726 727 struct super_type { 728 char *name; 729 struct module *owner; 730 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, 731 int minor_version); 732 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); 733 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); 734 unsigned long long (*rdev_size_change)(mdk_rdev_t *rdev, 735 sector_t num_sectors); 736 }; 737 738 /* 739 * Check that the given mddev has no bitmap. 740 * 741 * This function is called from the run method of all personalities that do not 742 * support bitmaps. It prints an error message and returns non-zero if mddev 743 * has a bitmap. Otherwise, it returns 0. 744 * 745 */ 746 int md_check_no_bitmap(mddev_t *mddev) 747 { 748 if (!mddev->bitmap_file && !mddev->bitmap_offset) 749 return 0; 750 printk(KERN_ERR "%s: bitmaps are not supported for %s\n", 751 mdname(mddev), mddev->pers->name); 752 return 1; 753 } 754 EXPORT_SYMBOL(md_check_no_bitmap); 755 756 /* 757 * load_super for 0.90.0 758 */ 759 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 760 { 761 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 762 mdp_super_t *sb; 763 int ret; 764 765 /* 766 * Calculate the position of the superblock (512byte sectors), 767 * it's at the end of the disk. 768 * 769 * It also happens to be a multiple of 4Kb. 
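	 *
	 * Worked example (assuming the usual MD_NEW_SIZE_SECTORS()
	 * definition from md_p.h: round the device size down to a 64K
	 * boundary, then step back one 64K block): a 1000000-sector
	 * device gives
	 *	calc_dev_sboffset() = (1000000 & ~127) - 128 = 999808
	 * so the 0.90 superblock occupies the last full 64K of the device.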
770 */ 771 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 772 773 ret = read_disk_sb(rdev, MD_SB_BYTES); 774 if (ret) return ret; 775 776 ret = -EINVAL; 777 778 bdevname(rdev->bdev, b); 779 sb = (mdp_super_t*)page_address(rdev->sb_page); 780 781 if (sb->md_magic != MD_SB_MAGIC) { 782 printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 783 b); 784 goto abort; 785 } 786 787 if (sb->major_version != 0 || 788 sb->minor_version < 90 || 789 sb->minor_version > 91) { 790 printk(KERN_WARNING "Bad version number %d.%d on %s\n", 791 sb->major_version, sb->minor_version, 792 b); 793 goto abort; 794 } 795 796 if (sb->raid_disks <= 0) 797 goto abort; 798 799 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { 800 printk(KERN_WARNING "md: invalid superblock checksum on %s\n", 801 b); 802 goto abort; 803 } 804 805 rdev->preferred_minor = sb->md_minor; 806 rdev->data_offset = 0; 807 rdev->sb_size = MD_SB_BYTES; 808 809 if (sb->level == LEVEL_MULTIPATH) 810 rdev->desc_nr = -1; 811 else 812 rdev->desc_nr = sb->this_disk.number; 813 814 if (!refdev) { 815 ret = 1; 816 } else { 817 __u64 ev1, ev2; 818 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); 819 if (!uuid_equal(refsb, sb)) { 820 printk(KERN_WARNING "md: %s has different UUID to %s\n", 821 b, bdevname(refdev->bdev,b2)); 822 goto abort; 823 } 824 if (!sb_equal(refsb, sb)) { 825 printk(KERN_WARNING "md: %s has same UUID" 826 " but different superblock to %s\n", 827 b, bdevname(refdev->bdev, b2)); 828 goto abort; 829 } 830 ev1 = md_event(sb); 831 ev2 = md_event(refsb); 832 if (ev1 > ev2) 833 ret = 1; 834 else 835 ret = 0; 836 } 837 rdev->sectors = rdev->sb_start; 838 839 if (rdev->sectors < sb->size * 2 && sb->level > 1) 840 /* "this cannot possibly happen" ... */ 841 ret = -EINVAL; 842 843 abort: 844 return ret; 845 } 846 847 /* 848 * validate_super for 0.90.0 849 */ 850 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) 851 { 852 mdp_disk_t *desc; 853 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 854 __u64 ev1 = md_event(sb); 855 856 rdev->raid_disk = -1; 857 clear_bit(Faulty, &rdev->flags); 858 clear_bit(In_sync, &rdev->flags); 859 clear_bit(WriteMostly, &rdev->flags); 860 clear_bit(BarriersNotsupp, &rdev->flags); 861 862 if (mddev->raid_disks == 0) { 863 mddev->major_version = 0; 864 mddev->minor_version = sb->minor_version; 865 mddev->patch_version = sb->patch_version; 866 mddev->external = 0; 867 mddev->chunk_sectors = sb->chunk_size >> 9; 868 mddev->ctime = sb->ctime; 869 mddev->utime = sb->utime; 870 mddev->level = sb->level; 871 mddev->clevel[0] = 0; 872 mddev->layout = sb->layout; 873 mddev->raid_disks = sb->raid_disks; 874 mddev->dev_sectors = sb->size * 2; 875 mddev->events = ev1; 876 mddev->bitmap_offset = 0; 877 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 878 879 if (mddev->minor_version >= 91) { 880 mddev->reshape_position = sb->reshape_position; 881 mddev->delta_disks = sb->delta_disks; 882 mddev->new_level = sb->new_level; 883 mddev->new_layout = sb->new_layout; 884 mddev->new_chunk_sectors = sb->new_chunk >> 9; 885 } else { 886 mddev->reshape_position = MaxSector; 887 mddev->delta_disks = 0; 888 mddev->new_level = mddev->level; 889 mddev->new_layout = mddev->layout; 890 mddev->new_chunk_sectors = mddev->chunk_sectors; 891 } 892 893 if (sb->state & (1<<MD_SB_CLEAN)) 894 mddev->recovery_cp = MaxSector; 895 else { 896 if (sb->events_hi == sb->cp_events_hi && 897 sb->events_lo == sb->cp_events_lo) { 898 mddev->recovery_cp = sb->recovery_cp; 899 } else 900 
mddev->recovery_cp = 0; 901 } 902 903 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 904 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 905 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 906 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 907 908 mddev->max_disks = MD_SB_DISKS; 909 910 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 911 mddev->bitmap_file == NULL) 912 mddev->bitmap_offset = mddev->default_bitmap_offset; 913 914 } else if (mddev->pers == NULL) { 915 /* Insist on good event counter while assembling */ 916 ++ev1; 917 if (ev1 < mddev->events) 918 return -EINVAL; 919 } else if (mddev->bitmap) { 920 /* if adding to array with a bitmap, then we can accept an 921 * older device ... but not too old. 922 */ 923 if (ev1 < mddev->bitmap->events_cleared) 924 return 0; 925 } else { 926 if (ev1 < mddev->events) 927 /* just a hot-add of a new device, leave raid_disk at -1 */ 928 return 0; 929 } 930 931 if (mddev->level != LEVEL_MULTIPATH) { 932 desc = sb->disks + rdev->desc_nr; 933 934 if (desc->state & (1<<MD_DISK_FAULTY)) 935 set_bit(Faulty, &rdev->flags); 936 else if (desc->state & (1<<MD_DISK_SYNC) /* && 937 desc->raid_disk < mddev->raid_disks */) { 938 set_bit(In_sync, &rdev->flags); 939 rdev->raid_disk = desc->raid_disk; 940 } 941 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 942 set_bit(WriteMostly, &rdev->flags); 943 } else /* MULTIPATH are always insync */ 944 set_bit(In_sync, &rdev->flags); 945 return 0; 946 } 947 948 /* 949 * sync_super for 0.90.0 950 */ 951 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) 952 { 953 mdp_super_t *sb; 954 mdk_rdev_t *rdev2; 955 int next_spare = mddev->raid_disks; 956 957 958 /* make rdev->sb match mddev data.. 959 * 960 * 1/ zero out disks 961 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 962 * 3/ any empty disks < next_spare become removed 963 * 964 * disks[0] gets initialised to REMOVED because 965 * we cannot be sure from other fields if it has 966 * been initialised or not. 
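	 *
	 * Concretely (summarising the loop below, not new behaviour):
	 * active, in-sync devices keep desc_nr == raid_disk, while spares
	 * and faulty devices are packed above them starting at
	 * desc_nr == mddev->raid_disks (next_spare).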
967 */ 968 int i; 969 int active=0, working=0,failed=0,spare=0,nr_disks=0; 970 971 rdev->sb_size = MD_SB_BYTES; 972 973 sb = (mdp_super_t*)page_address(rdev->sb_page); 974 975 memset(sb, 0, sizeof(*sb)); 976 977 sb->md_magic = MD_SB_MAGIC; 978 sb->major_version = mddev->major_version; 979 sb->patch_version = mddev->patch_version; 980 sb->gvalid_words = 0; /* ignored */ 981 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 982 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 983 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 984 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 985 986 sb->ctime = mddev->ctime; 987 sb->level = mddev->level; 988 sb->size = mddev->dev_sectors / 2; 989 sb->raid_disks = mddev->raid_disks; 990 sb->md_minor = mddev->md_minor; 991 sb->not_persistent = 0; 992 sb->utime = mddev->utime; 993 sb->state = 0; 994 sb->events_hi = (mddev->events>>32); 995 sb->events_lo = (u32)mddev->events; 996 997 if (mddev->reshape_position == MaxSector) 998 sb->minor_version = 90; 999 else { 1000 sb->minor_version = 91; 1001 sb->reshape_position = mddev->reshape_position; 1002 sb->new_level = mddev->new_level; 1003 sb->delta_disks = mddev->delta_disks; 1004 sb->new_layout = mddev->new_layout; 1005 sb->new_chunk = mddev->new_chunk_sectors << 9; 1006 } 1007 mddev->minor_version = sb->minor_version; 1008 if (mddev->in_sync) 1009 { 1010 sb->recovery_cp = mddev->recovery_cp; 1011 sb->cp_events_hi = (mddev->events>>32); 1012 sb->cp_events_lo = (u32)mddev->events; 1013 if (mddev->recovery_cp == MaxSector) 1014 sb->state = (1<< MD_SB_CLEAN); 1015 } else 1016 sb->recovery_cp = 0; 1017 1018 sb->layout = mddev->layout; 1019 sb->chunk_size = mddev->chunk_sectors << 9; 1020 1021 if (mddev->bitmap && mddev->bitmap_file == NULL) 1022 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 1023 1024 sb->disks[0].state = (1<<MD_DISK_REMOVED); 1025 list_for_each_entry(rdev2, &mddev->disks, same_set) { 1026 mdp_disk_t *d; 1027 int desc_nr; 1028 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 1029 && !test_bit(Faulty, &rdev2->flags)) 1030 desc_nr = rdev2->raid_disk; 1031 else 1032 desc_nr = next_spare++; 1033 rdev2->desc_nr = desc_nr; 1034 d = &sb->disks[rdev2->desc_nr]; 1035 nr_disks++; 1036 d->number = rdev2->desc_nr; 1037 d->major = MAJOR(rdev2->bdev->bd_dev); 1038 d->minor = MINOR(rdev2->bdev->bd_dev); 1039 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 1040 && !test_bit(Faulty, &rdev2->flags)) 1041 d->raid_disk = rdev2->raid_disk; 1042 else 1043 d->raid_disk = rdev2->desc_nr; /* compatibility */ 1044 if (test_bit(Faulty, &rdev2->flags)) 1045 d->state = (1<<MD_DISK_FAULTY); 1046 else if (test_bit(In_sync, &rdev2->flags)) { 1047 d->state = (1<<MD_DISK_ACTIVE); 1048 d->state |= (1<<MD_DISK_SYNC); 1049 active++; 1050 working++; 1051 } else { 1052 d->state = 0; 1053 spare++; 1054 working++; 1055 } 1056 if (test_bit(WriteMostly, &rdev2->flags)) 1057 d->state |= (1<<MD_DISK_WRITEMOSTLY); 1058 } 1059 /* now set the "removed" and "faulty" bits on any missing devices */ 1060 for (i=0 ; i < mddev->raid_disks ; i++) { 1061 mdp_disk_t *d = &sb->disks[i]; 1062 if (d->state == 0 && d->number == 0) { 1063 d->number = i; 1064 d->raid_disk = i; 1065 d->state = (1<<MD_DISK_REMOVED); 1066 d->state |= (1<<MD_DISK_FAULTY); 1067 failed++; 1068 } 1069 } 1070 sb->nr_disks = nr_disks; 1071 sb->active_disks = active; 1072 sb->working_disks = working; 1073 sb->failed_disks = failed; 1074 sb->spare_disks = spare; 1075 1076 sb->this_disk = sb->disks[rdev->desc_nr]; 1077 sb->sb_csum = calc_sb_csum(sb); 1078 } 1079 1080 /* 1081 * rdev_size_change 
 * for 0.90.0
 */
static unsigned long long
super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
{
	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
		return 0; /* component must fit device */
	if (rdev->mddev->bitmap_offset)
		return 0; /* can't move bitmap */
	rdev->sb_start = calc_dev_sboffset(rdev->bdev);
	if (!num_sectors || num_sectors > rdev->sb_start)
		num_sectors = rdev->sb_start;
	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
		       rdev->sb_page);
	md_super_wait(rdev->mddev);
	return num_sectors / 2; /* kB for sysfs */
}


/*
 * version 1 superblock
 */

static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
{
	__le32 disk_csum;
	u32 csum;
	unsigned long long newcsum;
	int size = 256 + le32_to_cpu(sb->max_dev)*2;
	__le32 *isuper = (__le32*)sb;
	int i;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
	newcsum = 0;
	for (i=0; size>=4; size -= 4 )
		newcsum += le32_to_cpu(*isuper++);

	if (size == 2)
		newcsum += le16_to_cpu(*(__le16*) isuper);

	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
	sb->sb_csum = disk_csum;
	return cpu_to_le32(csum);
}

static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
{
	struct mdp_superblock_1 *sb;
	int ret;
	sector_t sb_start;
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	int bmask;

	/*
	 * Calculate the position of the superblock in 512byte sectors.
	 * It is always aligned to a 4K boundary and
	 * depending on minor_version, it can be:
	 *  0: At least 8K, but less than 12K, from end of device
	 *  1: At start of device
	 *  2: 4K from start of device.
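	 *
	 * For example, with minor_version 0 on a 1000003-sector device the
	 * code below computes sb_start = (1000003 - 16) & ~7 = 999984,
	 * i.e. the superblock sits between 8K and 12K from the end of the
	 * device, aligned down to a 4K (8-sector) boundary.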
1142 */ 1143 switch(minor_version) { 1144 case 0: 1145 sb_start = rdev->bdev->bd_inode->i_size >> 9; 1146 sb_start -= 8*2; 1147 sb_start &= ~(sector_t)(4*2-1); 1148 break; 1149 case 1: 1150 sb_start = 0; 1151 break; 1152 case 2: 1153 sb_start = 8; 1154 break; 1155 default: 1156 return -EINVAL; 1157 } 1158 rdev->sb_start = sb_start; 1159 1160 /* superblock is rarely larger than 1K, but it can be larger, 1161 * and it is safe to read 4k, so we do that 1162 */ 1163 ret = read_disk_sb(rdev, 4096); 1164 if (ret) return ret; 1165 1166 1167 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1168 1169 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1170 sb->major_version != cpu_to_le32(1) || 1171 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1172 le64_to_cpu(sb->super_offset) != rdev->sb_start || 1173 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1174 return -EINVAL; 1175 1176 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1177 printk("md: invalid superblock checksum on %s\n", 1178 bdevname(rdev->bdev,b)); 1179 return -EINVAL; 1180 } 1181 if (le64_to_cpu(sb->data_size) < 10) { 1182 printk("md: data_size too small on %s\n", 1183 bdevname(rdev->bdev,b)); 1184 return -EINVAL; 1185 } 1186 1187 rdev->preferred_minor = 0xffff; 1188 rdev->data_offset = le64_to_cpu(sb->data_offset); 1189 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1190 1191 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1192 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 1193 if (rdev->sb_size & bmask) 1194 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1195 1196 if (minor_version 1197 && rdev->data_offset < sb_start + (rdev->sb_size/512)) 1198 return -EINVAL; 1199 1200 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) 1201 rdev->desc_nr = -1; 1202 else 1203 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1204 1205 if (!refdev) { 1206 ret = 1; 1207 } else { 1208 __u64 ev1, ev2; 1209 struct mdp_superblock_1 *refsb = 1210 (struct mdp_superblock_1*)page_address(refdev->sb_page); 1211 1212 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1213 sb->level != refsb->level || 1214 sb->layout != refsb->layout || 1215 sb->chunksize != refsb->chunksize) { 1216 printk(KERN_WARNING "md: %s has strangely different" 1217 " superblock to %s\n", 1218 bdevname(rdev->bdev,b), 1219 bdevname(refdev->bdev,b2)); 1220 return -EINVAL; 1221 } 1222 ev1 = le64_to_cpu(sb->events); 1223 ev2 = le64_to_cpu(refsb->events); 1224 1225 if (ev1 > ev2) 1226 ret = 1; 1227 else 1228 ret = 0; 1229 } 1230 if (minor_version) 1231 rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) - 1232 le64_to_cpu(sb->data_offset); 1233 else 1234 rdev->sectors = rdev->sb_start; 1235 if (rdev->sectors < le64_to_cpu(sb->data_size)) 1236 return -EINVAL; 1237 rdev->sectors = le64_to_cpu(sb->data_size); 1238 if (le64_to_cpu(sb->size) > rdev->sectors) 1239 return -EINVAL; 1240 return ret; 1241 } 1242 1243 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) 1244 { 1245 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1246 __u64 ev1 = le64_to_cpu(sb->events); 1247 1248 rdev->raid_disk = -1; 1249 clear_bit(Faulty, &rdev->flags); 1250 clear_bit(In_sync, &rdev->flags); 1251 clear_bit(WriteMostly, &rdev->flags); 1252 clear_bit(BarriersNotsupp, &rdev->flags); 1253 1254 if (mddev->raid_disks == 0) { 1255 mddev->major_version = 1; 1256 mddev->patch_version = 0; 1257 mddev->external = 0; 1258 mddev->chunk_sectors = le32_to_cpu(sb->chunksize); 1259 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); 1260 
		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
		mddev->level = le32_to_cpu(sb->level);
		mddev->clevel[0] = 0;
		mddev->layout = le32_to_cpu(sb->layout);
		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
		mddev->dev_sectors = le64_to_cpu(sb->size);
		mddev->events = ev1;
		mddev->bitmap_offset = 0;
		mddev->default_bitmap_offset = 1024 >> 9;

		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
		memcpy(mddev->uuid, sb->set_uuid, 16);

		mddev->max_disks = (4096-256)/2;

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
		    mddev->bitmap_file == NULL )
			mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
			mddev->new_level = le32_to_cpu(sb->new_level);
			mddev->new_layout = le32_to_cpu(sb->new_layout);
			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}

	} else if (mddev->pers == NULL) {
		/* Insist on good event counter while assembling */
		++ev1;
		if (ev1 < mddev->events)
			return -EINVAL;
	} else if (mddev->bitmap) {
		/* If adding to array with a bitmap, then we can accept an
		 * older device, but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}
	if (mddev->level != LEVEL_MULTIPATH) {
		int role;
		role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
		switch(role) {
		case 0xffff: /* spare */
			break;
		case 0xfffe: /* faulty */
			set_bit(Faulty, &rdev->flags);
			break;
		default:
			if ((le32_to_cpu(sb->feature_map) &
			     MD_FEATURE_RECOVERY_OFFSET))
				rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
			else
				set_bit(In_sync, &rdev->flags);
			rdev->raid_disk = role;
			break;
		}
		if (sb->devflags & WriteMostly1)
			set_bit(WriteMostly, &rdev->flags);
	} else /* MULTIPATH are always insync */
		set_bit(In_sync, &rdev->flags);

	return 0;
}

static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
{
	struct mdp_superblock_1 *sb;
	mdk_rdev_t *rdev2;
	int max_dev, i;
	/* make rdev->sb match mddev and rdev data.
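	 *
	 * dev_roles[] below uses the same encoding that super_1_validate()
	 * above decodes (summary only, not new behaviour):
	 *	0xffff	spare (no slot assigned)
	 *	0xfffe	faulty
	 *	other	active slot number (rdev->raid_disk)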
*/ 1341 1342 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1343 1344 sb->feature_map = 0; 1345 sb->pad0 = 0; 1346 sb->recovery_offset = cpu_to_le64(0); 1347 memset(sb->pad1, 0, sizeof(sb->pad1)); 1348 memset(sb->pad2, 0, sizeof(sb->pad2)); 1349 memset(sb->pad3, 0, sizeof(sb->pad3)); 1350 1351 sb->utime = cpu_to_le64((__u64)mddev->utime); 1352 sb->events = cpu_to_le64(mddev->events); 1353 if (mddev->in_sync) 1354 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 1355 else 1356 sb->resync_offset = cpu_to_le64(0); 1357 1358 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); 1359 1360 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 1361 sb->size = cpu_to_le64(mddev->dev_sectors); 1362 sb->chunksize = cpu_to_le32(mddev->chunk_sectors); 1363 sb->level = cpu_to_le32(mddev->level); 1364 sb->layout = cpu_to_le32(mddev->layout); 1365 1366 if (mddev->bitmap && mddev->bitmap_file == NULL) { 1367 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1368 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1369 } 1370 1371 if (rdev->raid_disk >= 0 && 1372 !test_bit(In_sync, &rdev->flags)) { 1373 if (mddev->curr_resync_completed > rdev->recovery_offset) 1374 rdev->recovery_offset = mddev->curr_resync_completed; 1375 if (rdev->recovery_offset > 0) { 1376 sb->feature_map |= 1377 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 1378 sb->recovery_offset = 1379 cpu_to_le64(rdev->recovery_offset); 1380 } 1381 } 1382 1383 if (mddev->reshape_position != MaxSector) { 1384 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 1385 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 1386 sb->new_layout = cpu_to_le32(mddev->new_layout); 1387 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 1388 sb->new_level = cpu_to_le32(mddev->new_level); 1389 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 1390 } 1391 1392 max_dev = 0; 1393 list_for_each_entry(rdev2, &mddev->disks, same_set) 1394 if (rdev2->desc_nr+1 > max_dev) 1395 max_dev = rdev2->desc_nr+1; 1396 1397 if (max_dev > le32_to_cpu(sb->max_dev)) 1398 sb->max_dev = cpu_to_le32(max_dev); 1399 for (i=0; i<max_dev;i++) 1400 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1401 1402 list_for_each_entry(rdev2, &mddev->disks, same_set) { 1403 i = rdev2->desc_nr; 1404 if (test_bit(Faulty, &rdev2->flags)) 1405 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1406 else if (test_bit(In_sync, &rdev2->flags)) 1407 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1408 else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0) 1409 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1410 else 1411 sb->dev_roles[i] = cpu_to_le16(0xffff); 1412 } 1413 1414 sb->sb_csum = calc_sb_1_csum(sb); 1415 } 1416 1417 static unsigned long long 1418 super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) 1419 { 1420 struct mdp_superblock_1 *sb; 1421 sector_t max_sectors; 1422 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1423 return 0; /* component must fit device */ 1424 if (rdev->sb_start < rdev->data_offset) { 1425 /* minor versions 1 and 2; superblock before data */ 1426 max_sectors = rdev->bdev->bd_inode->i_size >> 9; 1427 max_sectors -= rdev->data_offset; 1428 if (!num_sectors || num_sectors > max_sectors) 1429 num_sectors = max_sectors; 1430 } else if (rdev->mddev->bitmap_offset) { 1431 /* minor version 0 with bitmap we can't move */ 1432 return 0; 1433 } else { 1434 /* minor version 0; superblock after data */ 1435 sector_t sb_start; 1436 sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2; 1437 sb_start &= 
~(sector_t)(4*2 - 1); 1438 max_sectors = rdev->sectors + sb_start - rdev->sb_start; 1439 if (!num_sectors || num_sectors > max_sectors) 1440 num_sectors = max_sectors; 1441 rdev->sb_start = sb_start; 1442 } 1443 sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page); 1444 sb->data_size = cpu_to_le64(num_sectors); 1445 sb->super_offset = rdev->sb_start; 1446 sb->sb_csum = calc_sb_1_csum(sb); 1447 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1448 rdev->sb_page); 1449 md_super_wait(rdev->mddev); 1450 return num_sectors / 2; /* kB for sysfs */ 1451 } 1452 1453 static struct super_type super_types[] = { 1454 [0] = { 1455 .name = "0.90.0", 1456 .owner = THIS_MODULE, 1457 .load_super = super_90_load, 1458 .validate_super = super_90_validate, 1459 .sync_super = super_90_sync, 1460 .rdev_size_change = super_90_rdev_size_change, 1461 }, 1462 [1] = { 1463 .name = "md-1", 1464 .owner = THIS_MODULE, 1465 .load_super = super_1_load, 1466 .validate_super = super_1_validate, 1467 .sync_super = super_1_sync, 1468 .rdev_size_change = super_1_rdev_size_change, 1469 }, 1470 }; 1471 1472 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) 1473 { 1474 mdk_rdev_t *rdev, *rdev2; 1475 1476 rcu_read_lock(); 1477 rdev_for_each_rcu(rdev, mddev1) 1478 rdev_for_each_rcu(rdev2, mddev2) 1479 if (rdev->bdev->bd_contains == 1480 rdev2->bdev->bd_contains) { 1481 rcu_read_unlock(); 1482 return 1; 1483 } 1484 rcu_read_unlock(); 1485 return 0; 1486 } 1487 1488 static LIST_HEAD(pending_raid_disks); 1489 1490 static void md_integrity_check(mdk_rdev_t *rdev, mddev_t *mddev) 1491 { 1492 struct mdk_personality *pers = mddev->pers; 1493 struct gendisk *disk = mddev->gendisk; 1494 struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev); 1495 struct blk_integrity *bi_mddev = blk_get_integrity(disk); 1496 1497 /* Data integrity passthrough not supported on RAID 4, 5 and 6 */ 1498 if (pers && pers->level >= 4 && pers->level <= 6) 1499 return; 1500 1501 /* If rdev is integrity capable, register profile for mddev */ 1502 if (!bi_mddev && bi_rdev) { 1503 if (blk_integrity_register(disk, bi_rdev)) 1504 printk(KERN_ERR "%s: %s Could not register integrity!\n", 1505 __func__, disk->disk_name); 1506 else 1507 printk(KERN_NOTICE "Enabling data integrity on %s\n", 1508 disk->disk_name); 1509 return; 1510 } 1511 1512 /* Check that mddev and rdev have matching profiles */ 1513 if (blk_integrity_compare(disk, rdev->bdev->bd_disk) < 0) { 1514 printk(KERN_ERR "%s: %s/%s integrity mismatch!\n", __func__, 1515 disk->disk_name, rdev->bdev->bd_disk->disk_name); 1516 printk(KERN_NOTICE "Disabling data integrity on %s\n", 1517 disk->disk_name); 1518 blk_integrity_unregister(disk); 1519 } 1520 } 1521 1522 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) 1523 { 1524 char b[BDEVNAME_SIZE]; 1525 struct kobject *ko; 1526 char *s; 1527 int err; 1528 1529 if (rdev->mddev) { 1530 MD_BUG(); 1531 return -EINVAL; 1532 } 1533 1534 /* prevent duplicates */ 1535 if (find_rdev(mddev, rdev->bdev->bd_dev)) 1536 return -EEXIST; 1537 1538 /* make sure rdev->sectors exceeds mddev->dev_sectors */ 1539 if (rdev->sectors && (mddev->dev_sectors == 0 || 1540 rdev->sectors < mddev->dev_sectors)) { 1541 if (mddev->pers) { 1542 /* Cannot change size, so fail 1543 * If mddev->level <= 0, then we don't care 1544 * about aligning sizes (e.g. linear) 1545 */ 1546 if (mddev->level > 0) 1547 return -ENOSPC; 1548 } else 1549 mddev->dev_sectors = rdev->sectors; 1550 } 1551 1552 /* Verify rdev->desc_nr is unique. 
1553 * If it is -1, assign a free number, else 1554 * check number is not in use 1555 */ 1556 if (rdev->desc_nr < 0) { 1557 int choice = 0; 1558 if (mddev->pers) choice = mddev->raid_disks; 1559 while (find_rdev_nr(mddev, choice)) 1560 choice++; 1561 rdev->desc_nr = choice; 1562 } else { 1563 if (find_rdev_nr(mddev, rdev->desc_nr)) 1564 return -EBUSY; 1565 } 1566 if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { 1567 printk(KERN_WARNING "md: %s: array is limited to %d devices\n", 1568 mdname(mddev), mddev->max_disks); 1569 return -EBUSY; 1570 } 1571 bdevname(rdev->bdev,b); 1572 while ( (s=strchr(b, '/')) != NULL) 1573 *s = '!'; 1574 1575 rdev->mddev = mddev; 1576 printk(KERN_INFO "md: bind<%s>\n", b); 1577 1578 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 1579 goto fail; 1580 1581 ko = &part_to_dev(rdev->bdev->bd_part)->kobj; 1582 if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) { 1583 kobject_del(&rdev->kobj); 1584 goto fail; 1585 } 1586 rdev->sysfs_state = sysfs_get_dirent(rdev->kobj.sd, "state"); 1587 1588 list_add_rcu(&rdev->same_set, &mddev->disks); 1589 bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); 1590 1591 /* May as well allow recovery to be retried once */ 1592 mddev->recovery_disabled = 0; 1593 1594 md_integrity_check(rdev, mddev); 1595 return 0; 1596 1597 fail: 1598 printk(KERN_WARNING "md: failed to register dev-%s for %s\n", 1599 b, mdname(mddev)); 1600 return err; 1601 } 1602 1603 static void md_delayed_delete(struct work_struct *ws) 1604 { 1605 mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work); 1606 kobject_del(&rdev->kobj); 1607 kobject_put(&rdev->kobj); 1608 } 1609 1610 static void unbind_rdev_from_array(mdk_rdev_t * rdev) 1611 { 1612 char b[BDEVNAME_SIZE]; 1613 if (!rdev->mddev) { 1614 MD_BUG(); 1615 return; 1616 } 1617 bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); 1618 list_del_rcu(&rdev->same_set); 1619 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 1620 rdev->mddev = NULL; 1621 sysfs_remove_link(&rdev->kobj, "block"); 1622 sysfs_put(rdev->sysfs_state); 1623 rdev->sysfs_state = NULL; 1624 /* We need to delay this, otherwise we can deadlock when 1625 * writing to 'remove' to "dev/state". We also need 1626 * to delay it due to rcu usage. 1627 */ 1628 synchronize_rcu(); 1629 INIT_WORK(&rdev->del_work, md_delayed_delete); 1630 kobject_get(&rdev->kobj); 1631 schedule_work(&rdev->del_work); 1632 } 1633 1634 /* 1635 * prevent the device from being mounted, repartitioned or 1636 * otherwise reused by a RAID array (or any other kernel 1637 * subsystem), by bd_claiming the device. 1638 */ 1639 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared) 1640 { 1641 int err = 0; 1642 struct block_device *bdev; 1643 char b[BDEVNAME_SIZE]; 1644 1645 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 1646 if (IS_ERR(bdev)) { 1647 printk(KERN_ERR "md: could not open %s.\n", 1648 __bdevname(dev, b)); 1649 return PTR_ERR(bdev); 1650 } 1651 err = bd_claim(bdev, shared ? 
(mdk_rdev_t *)lock_rdev : rdev); 1652 if (err) { 1653 printk(KERN_ERR "md: could not bd_claim %s.\n", 1654 bdevname(bdev, b)); 1655 blkdev_put(bdev, FMODE_READ|FMODE_WRITE); 1656 return err; 1657 } 1658 if (!shared) 1659 set_bit(AllReserved, &rdev->flags); 1660 rdev->bdev = bdev; 1661 return err; 1662 } 1663 1664 static void unlock_rdev(mdk_rdev_t *rdev) 1665 { 1666 struct block_device *bdev = rdev->bdev; 1667 rdev->bdev = NULL; 1668 if (!bdev) 1669 MD_BUG(); 1670 bd_release(bdev); 1671 blkdev_put(bdev, FMODE_READ|FMODE_WRITE); 1672 } 1673 1674 void md_autodetect_dev(dev_t dev); 1675 1676 static void export_rdev(mdk_rdev_t * rdev) 1677 { 1678 char b[BDEVNAME_SIZE]; 1679 printk(KERN_INFO "md: export_rdev(%s)\n", 1680 bdevname(rdev->bdev,b)); 1681 if (rdev->mddev) 1682 MD_BUG(); 1683 free_disk_sb(rdev); 1684 #ifndef MODULE 1685 if (test_bit(AutoDetected, &rdev->flags)) 1686 md_autodetect_dev(rdev->bdev->bd_dev); 1687 #endif 1688 unlock_rdev(rdev); 1689 kobject_put(&rdev->kobj); 1690 } 1691 1692 static void kick_rdev_from_array(mdk_rdev_t * rdev) 1693 { 1694 unbind_rdev_from_array(rdev); 1695 export_rdev(rdev); 1696 } 1697 1698 static void export_array(mddev_t *mddev) 1699 { 1700 mdk_rdev_t *rdev, *tmp; 1701 1702 rdev_for_each(rdev, tmp, mddev) { 1703 if (!rdev->mddev) { 1704 MD_BUG(); 1705 continue; 1706 } 1707 kick_rdev_from_array(rdev); 1708 } 1709 if (!list_empty(&mddev->disks)) 1710 MD_BUG(); 1711 mddev->raid_disks = 0; 1712 mddev->major_version = 0; 1713 } 1714 1715 static void print_desc(mdp_disk_t *desc) 1716 { 1717 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, 1718 desc->major,desc->minor,desc->raid_disk,desc->state); 1719 } 1720 1721 static void print_sb_90(mdp_super_t *sb) 1722 { 1723 int i; 1724 1725 printk(KERN_INFO 1726 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", 1727 sb->major_version, sb->minor_version, sb->patch_version, 1728 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, 1729 sb->ctime); 1730 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", 1731 sb->level, sb->size, sb->nr_disks, sb->raid_disks, 1732 sb->md_minor, sb->layout, sb->chunk_size); 1733 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" 1734 " FD:%d SD:%d CSUM:%08x E:%08lx\n", 1735 sb->utime, sb->state, sb->active_disks, sb->working_disks, 1736 sb->failed_disks, sb->spare_disks, 1737 sb->sb_csum, (unsigned long)sb->events_lo); 1738 1739 printk(KERN_INFO); 1740 for (i = 0; i < MD_SB_DISKS; i++) { 1741 mdp_disk_t *desc; 1742 1743 desc = sb->disks + i; 1744 if (desc->number || desc->major || desc->minor || 1745 desc->raid_disk || (desc->state && (desc->state != 4))) { 1746 printk(" D %2d: ", i); 1747 print_desc(desc); 1748 } 1749 } 1750 printk(KERN_INFO "md: THIS: "); 1751 print_desc(&sb->this_disk); 1752 } 1753 1754 static void print_sb_1(struct mdp_superblock_1 *sb) 1755 { 1756 __u8 *uuid; 1757 1758 uuid = sb->set_uuid; 1759 printk(KERN_INFO 1760 "md: SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x" 1761 ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n" 1762 "md: Name: \"%s\" CT:%llu\n", 1763 le32_to_cpu(sb->major_version), 1764 le32_to_cpu(sb->feature_map), 1765 uuid[0], uuid[1], uuid[2], uuid[3], 1766 uuid[4], uuid[5], uuid[6], uuid[7], 1767 uuid[8], uuid[9], uuid[10], uuid[11], 1768 uuid[12], uuid[13], uuid[14], uuid[15], 1769 sb->set_name, 1770 (unsigned long long)le64_to_cpu(sb->ctime) 1771 & MD_SUPERBLOCK_1_TIME_SEC_MASK); 1772 1773 uuid = sb->device_uuid; 1774 printk(KERN_INFO 1775 "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" 1776 " 
RO:%llu\n" 1777 "md: Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x" 1778 ":%02x%02x%02x%02x%02x%02x\n" 1779 "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" 1780 "md: (MaxDev:%u) \n", 1781 le32_to_cpu(sb->level), 1782 (unsigned long long)le64_to_cpu(sb->size), 1783 le32_to_cpu(sb->raid_disks), 1784 le32_to_cpu(sb->layout), 1785 le32_to_cpu(sb->chunksize), 1786 (unsigned long long)le64_to_cpu(sb->data_offset), 1787 (unsigned long long)le64_to_cpu(sb->data_size), 1788 (unsigned long long)le64_to_cpu(sb->super_offset), 1789 (unsigned long long)le64_to_cpu(sb->recovery_offset), 1790 le32_to_cpu(sb->dev_number), 1791 uuid[0], uuid[1], uuid[2], uuid[3], 1792 uuid[4], uuid[5], uuid[6], uuid[7], 1793 uuid[8], uuid[9], uuid[10], uuid[11], 1794 uuid[12], uuid[13], uuid[14], uuid[15], 1795 sb->devflags, 1796 (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK, 1797 (unsigned long long)le64_to_cpu(sb->events), 1798 (unsigned long long)le64_to_cpu(sb->resync_offset), 1799 le32_to_cpu(sb->sb_csum), 1800 le32_to_cpu(sb->max_dev) 1801 ); 1802 } 1803 1804 static void print_rdev(mdk_rdev_t *rdev, int major_version) 1805 { 1806 char b[BDEVNAME_SIZE]; 1807 printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n", 1808 bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors, 1809 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), 1810 rdev->desc_nr); 1811 if (rdev->sb_loaded) { 1812 printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); 1813 switch (major_version) { 1814 case 0: 1815 print_sb_90((mdp_super_t*)page_address(rdev->sb_page)); 1816 break; 1817 case 1: 1818 print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page)); 1819 break; 1820 } 1821 } else 1822 printk(KERN_INFO "md: no rdev superblock!\n"); 1823 } 1824 1825 static void md_print_devices(void) 1826 { 1827 struct list_head *tmp; 1828 mdk_rdev_t *rdev; 1829 mddev_t *mddev; 1830 char b[BDEVNAME_SIZE]; 1831 1832 printk("\n"); 1833 printk("md: **********************************\n"); 1834 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); 1835 printk("md: **********************************\n"); 1836 for_each_mddev(mddev, tmp) { 1837 1838 if (mddev->bitmap) 1839 bitmap_print_sb(mddev->bitmap); 1840 else 1841 printk("%s: ", mdname(mddev)); 1842 list_for_each_entry(rdev, &mddev->disks, same_set) 1843 printk("<%s>", bdevname(rdev->bdev,b)); 1844 printk("\n"); 1845 1846 list_for_each_entry(rdev, &mddev->disks, same_set) 1847 print_rdev(rdev, mddev->major_version); 1848 } 1849 printk("md: **********************************\n"); 1850 printk("\n"); 1851 } 1852 1853 1854 static void sync_sbs(mddev_t * mddev, int nospares) 1855 { 1856 /* Update each superblock (in-memory image), but 1857 * if we are allowed to, skip spares which already 1858 * have the right event counter, or have one earlier 1859 * (which would mean they aren't being marked as dirty 1860 * with the rest of the array) 1861 */ 1862 mdk_rdev_t *rdev; 1863 1864 list_for_each_entry(rdev, &mddev->disks, same_set) { 1865 if (rdev->sb_events == mddev->events || 1866 (nospares && 1867 rdev->raid_disk < 0 && 1868 (rdev->sb_events&1)==0 && 1869 rdev->sb_events+1 == mddev->events)) { 1870 /* Don't update this superblock */ 1871 rdev->sb_loaded = 2; 1872 } else { 1873 super_types[mddev->major_version]. 
1874 sync_super(mddev, rdev); 1875 rdev->sb_loaded = 1; 1876 } 1877 } 1878 } 1879 1880 static void md_update_sb(mddev_t * mddev, int force_change) 1881 { 1882 mdk_rdev_t *rdev; 1883 int sync_req; 1884 int nospares = 0; 1885 1886 mddev->utime = get_seconds(); 1887 if (mddev->external) 1888 return; 1889 repeat: 1890 spin_lock_irq(&mddev->write_lock); 1891 1892 set_bit(MD_CHANGE_PENDING, &mddev->flags); 1893 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) 1894 force_change = 1; 1895 if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) 1896 /* just a clean<-> dirty transition, possibly leave spares alone, 1897 * though if events isn't the right even/odd, we will have to do 1898 * spares after all 1899 */ 1900 nospares = 1; 1901 if (force_change) 1902 nospares = 0; 1903 if (mddev->degraded) 1904 /* If the array is degraded, then skipping spares is both 1905 * dangerous and fairly pointless. 1906 * Dangerous because a device that was removed from the array 1907 * might have a event_count that still looks up-to-date, 1908 * so it can be re-added without a resync. 1909 * Pointless because if there are any spares to skip, 1910 * then a recovery will happen and soon that array won't 1911 * be degraded any more and the spare can go back to sleep then. 1912 */ 1913 nospares = 0; 1914 1915 sync_req = mddev->in_sync; 1916 1917 /* If this is just a dirty<->clean transition, and the array is clean 1918 * and 'events' is odd, we can roll back to the previous clean state */ 1919 if (nospares 1920 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 1921 && (mddev->events & 1) 1922 && mddev->events != 1) 1923 mddev->events--; 1924 else { 1925 /* otherwise we have to go forward and ... */ 1926 mddev->events ++; 1927 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ 1928 /* .. if the array isn't clean, insist on an odd 'events' */ 1929 if ((mddev->events&1)==0) { 1930 mddev->events++; 1931 nospares = 0; 1932 } 1933 } else { 1934 /* otherwise insist on an even 'events' (for clean states) */ 1935 if ((mddev->events&1)) { 1936 mddev->events++; 1937 nospares = 0; 1938 } 1939 } 1940 } 1941 1942 if (!mddev->events) { 1943 /* 1944 * oops, this 64-bit counter should never wrap. 
1945 * Either we are in around ~1 trillion A.C., assuming 1946 * 1 reboot per second, or we have a bug: 1947 */ 1948 MD_BUG(); 1949 mddev->events --; 1950 } 1951 1952 /* 1953 * do not write anything to disk if using 1954 * nonpersistent superblocks 1955 */ 1956 if (!mddev->persistent) { 1957 if (!mddev->external) 1958 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 1959 1960 spin_unlock_irq(&mddev->write_lock); 1961 wake_up(&mddev->sb_wait); 1962 return; 1963 } 1964 sync_sbs(mddev, nospares); 1965 spin_unlock_irq(&mddev->write_lock); 1966 1967 dprintk(KERN_INFO 1968 "md: updating %s RAID superblock on device (in sync %d)\n", 1969 mdname(mddev),mddev->in_sync); 1970 1971 bitmap_update_sb(mddev->bitmap); 1972 list_for_each_entry(rdev, &mddev->disks, same_set) { 1973 char b[BDEVNAME_SIZE]; 1974 dprintk(KERN_INFO "md: "); 1975 if (rdev->sb_loaded != 1) 1976 continue; /* no noise on spare devices */ 1977 if (test_bit(Faulty, &rdev->flags)) 1978 dprintk("(skipping faulty "); 1979 1980 dprintk("%s ", bdevname(rdev->bdev,b)); 1981 if (!test_bit(Faulty, &rdev->flags)) { 1982 md_super_write(mddev,rdev, 1983 rdev->sb_start, rdev->sb_size, 1984 rdev->sb_page); 1985 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", 1986 bdevname(rdev->bdev,b), 1987 (unsigned long long)rdev->sb_start); 1988 rdev->sb_events = mddev->events; 1989 1990 } else 1991 dprintk(")\n"); 1992 if (mddev->level == LEVEL_MULTIPATH) 1993 /* only need to write one superblock... */ 1994 break; 1995 } 1996 md_super_wait(mddev); 1997 /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ 1998 1999 spin_lock_irq(&mddev->write_lock); 2000 if (mddev->in_sync != sync_req || 2001 test_bit(MD_CHANGE_DEVS, &mddev->flags)) { 2002 /* have to write it out again */ 2003 spin_unlock_irq(&mddev->write_lock); 2004 goto repeat; 2005 } 2006 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2007 spin_unlock_irq(&mddev->write_lock); 2008 wake_up(&mddev->sb_wait); 2009 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2010 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 2011 2012 } 2013 2014 /* words written to sysfs files may, or may not, be \n terminated. 2015 * We want to accept with case. For this we use cmd_match. 2016 */ 2017 static int cmd_match(const char *cmd, const char *str) 2018 { 2019 /* See if cmd, written into a sysfs file, matches 2020 * str. 
They must either be the same, or cmd can 2021 * have a trailing newline 2022 */ 2023 while (*cmd && *str && *cmd == *str) { 2024 cmd++; 2025 str++; 2026 } 2027 if (*cmd == '\n') 2028 cmd++; 2029 if (*str || *cmd) 2030 return 0; 2031 return 1; 2032 } 2033 2034 struct rdev_sysfs_entry { 2035 struct attribute attr; 2036 ssize_t (*show)(mdk_rdev_t *, char *); 2037 ssize_t (*store)(mdk_rdev_t *, const char *, size_t); 2038 }; 2039 2040 static ssize_t 2041 state_show(mdk_rdev_t *rdev, char *page) 2042 { 2043 char *sep = ""; 2044 size_t len = 0; 2045 2046 if (test_bit(Faulty, &rdev->flags)) { 2047 len+= sprintf(page+len, "%sfaulty",sep); 2048 sep = ","; 2049 } 2050 if (test_bit(In_sync, &rdev->flags)) { 2051 len += sprintf(page+len, "%sin_sync",sep); 2052 sep = ","; 2053 } 2054 if (test_bit(WriteMostly, &rdev->flags)) { 2055 len += sprintf(page+len, "%swrite_mostly",sep); 2056 sep = ","; 2057 } 2058 if (test_bit(Blocked, &rdev->flags)) { 2059 len += sprintf(page+len, "%sblocked", sep); 2060 sep = ","; 2061 } 2062 if (!test_bit(Faulty, &rdev->flags) && 2063 !test_bit(In_sync, &rdev->flags)) { 2064 len += sprintf(page+len, "%sspare", sep); 2065 sep = ","; 2066 } 2067 return len+sprintf(page+len, "\n"); 2068 } 2069 2070 static ssize_t 2071 state_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2072 { 2073 /* can write 2074 * faulty - simulates an error 2075 * remove - disconnects the device 2076 * writemostly - sets write_mostly 2077 * -writemostly - clears write_mostly 2078 * blocked - sets the Blocked flag 2079 * -blocked - clears the Blocked flag 2080 * insync - sets In_sync provided the device isn't active 2081 */ 2082 int err = -EINVAL; 2083 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 2084 md_error(rdev->mddev, rdev); 2085 err = 0; 2086 } else if (cmd_match(buf, "remove")) { 2087 if (rdev->raid_disk >= 0) 2088 err = -EBUSY; 2089 else { 2090 mddev_t *mddev = rdev->mddev; 2091 kick_rdev_from_array(rdev); 2092 if (mddev->pers) 2093 md_update_sb(mddev, 1); 2094 md_new_event(mddev); 2095 err = 0; 2096 } 2097 } else if (cmd_match(buf, "writemostly")) { 2098 set_bit(WriteMostly, &rdev->flags); 2099 err = 0; 2100 } else if (cmd_match(buf, "-writemostly")) { 2101 clear_bit(WriteMostly, &rdev->flags); 2102 err = 0; 2103 } else if (cmd_match(buf, "blocked")) { 2104 set_bit(Blocked, &rdev->flags); 2105 err = 0; 2106 } else if (cmd_match(buf, "-blocked")) { 2107 clear_bit(Blocked, &rdev->flags); 2108 wake_up(&rdev->blocked_wait); 2109 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2110 md_wakeup_thread(rdev->mddev->thread); 2111 2112 err = 0; 2113 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 2114 set_bit(In_sync, &rdev->flags); 2115 err = 0; 2116 } 2117 if (!err && rdev->sysfs_state) 2118 sysfs_notify_dirent(rdev->sysfs_state); 2119 return err ?
err : len; 2120 } 2121 static struct rdev_sysfs_entry rdev_state = 2122 __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store); 2123 2124 static ssize_t 2125 errors_show(mdk_rdev_t *rdev, char *page) 2126 { 2127 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 2128 } 2129 2130 static ssize_t 2131 errors_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2132 { 2133 char *e; 2134 unsigned long n = simple_strtoul(buf, &e, 10); 2135 if (*buf && (*e == 0 || *e == '\n')) { 2136 atomic_set(&rdev->corrected_errors, n); 2137 return len; 2138 } 2139 return -EINVAL; 2140 } 2141 static struct rdev_sysfs_entry rdev_errors = 2142 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 2143 2144 static ssize_t 2145 slot_show(mdk_rdev_t *rdev, char *page) 2146 { 2147 if (rdev->raid_disk < 0) 2148 return sprintf(page, "none\n"); 2149 else 2150 return sprintf(page, "%d\n", rdev->raid_disk); 2151 } 2152 2153 static ssize_t 2154 slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2155 { 2156 char *e; 2157 int err; 2158 char nm[20]; 2159 int slot = simple_strtoul(buf, &e, 10); 2160 if (strncmp(buf, "none", 4)==0) 2161 slot = -1; 2162 else if (e==buf || (*e && *e!= '\n')) 2163 return -EINVAL; 2164 if (rdev->mddev->pers && slot == -1) { 2165 /* Setting 'slot' on an active array requires also 2166 * updating the 'rd%d' link, and communicating 2167 * with the personality with ->hot_*_disk. 2168 * For now we only support removing 2169 * failed/spare devices. This normally happens automatically, 2170 * but not when the metadata is externally managed. 2171 */ 2172 if (rdev->raid_disk == -1) 2173 return -EEXIST; 2174 /* personality does all needed checks */ 2175 if (rdev->mddev->pers->hot_add_disk == NULL) 2176 return -EINVAL; 2177 err = rdev->mddev->pers-> 2178 hot_remove_disk(rdev->mddev, rdev->raid_disk); 2179 if (err) 2180 return err; 2181 sprintf(nm, "rd%d", rdev->raid_disk); 2182 sysfs_remove_link(&rdev->mddev->kobj, nm); 2183 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2184 md_wakeup_thread(rdev->mddev->thread); 2185 } else if (rdev->mddev->pers) { 2186 mdk_rdev_t *rdev2; 2187 /* Activating a spare .. or possibly reactivating 2188 * if we ever get bitmaps working here. 2189 */ 2190 2191 if (rdev->raid_disk != -1) 2192 return -EBUSY; 2193 2194 if (rdev->mddev->pers->hot_add_disk == NULL) 2195 return -EINVAL; 2196 2197 list_for_each_entry(rdev2, &rdev->mddev->disks, same_set) 2198 if (rdev2->raid_disk == slot) 2199 return -EEXIST; 2200 2201 rdev->raid_disk = slot; 2202 if (test_bit(In_sync, &rdev->flags)) 2203 rdev->saved_raid_disk = slot; 2204 else 2205 rdev->saved_raid_disk = -1; 2206 err = rdev->mddev->pers-> 2207 hot_add_disk(rdev->mddev, rdev); 2208 if (err) { 2209 rdev->raid_disk = -1; 2210 return err; 2211 } else 2212 sysfs_notify_dirent(rdev->sysfs_state); 2213 sprintf(nm, "rd%d", rdev->raid_disk); 2214 if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm)) 2215 printk(KERN_WARNING 2216 "md: cannot register " 2217 "%s for %s\n", 2218 nm, mdname(rdev->mddev)); 2219 2220 /* don't wakeup anyone, leave that to userspace. 
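 * (With externally managed metadata, userspace - typically the metadata
 * handler such as mdmon, or mdadm itself - is expected to notice the new
 * slot assignment and start recovery; nothing is triggered here.)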
*/ 2221 } else { 2222 if (slot >= rdev->mddev->raid_disks) 2223 return -ENOSPC; 2224 rdev->raid_disk = slot; 2225 /* assume it is working */ 2226 clear_bit(Faulty, &rdev->flags); 2227 clear_bit(WriteMostly, &rdev->flags); 2228 set_bit(In_sync, &rdev->flags); 2229 sysfs_notify_dirent(rdev->sysfs_state); 2230 } 2231 return len; 2232 } 2233 2234 2235 static struct rdev_sysfs_entry rdev_slot = 2236 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 2237 2238 static ssize_t 2239 offset_show(mdk_rdev_t *rdev, char *page) 2240 { 2241 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 2242 } 2243 2244 static ssize_t 2245 offset_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2246 { 2247 char *e; 2248 unsigned long long offset = simple_strtoull(buf, &e, 10); 2249 if (e==buf || (*e && *e != '\n')) 2250 return -EINVAL; 2251 if (rdev->mddev->pers && rdev->raid_disk >= 0) 2252 return -EBUSY; 2253 if (rdev->sectors && rdev->mddev->external) 2254 /* Must set offset before size, so overlap checks 2255 * can be sane */ 2256 return -EBUSY; 2257 rdev->data_offset = offset; 2258 return len; 2259 } 2260 2261 static struct rdev_sysfs_entry rdev_offset = 2262 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 2263 2264 static ssize_t 2265 rdev_size_show(mdk_rdev_t *rdev, char *page) 2266 { 2267 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); 2268 } 2269 2270 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) 2271 { 2272 /* check if two start/length pairs overlap */ 2273 if (s1+l1 <= s2) 2274 return 0; 2275 if (s2+l2 <= s1) 2276 return 0; 2277 return 1; 2278 } 2279 2280 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) 2281 { 2282 unsigned long long blocks; 2283 sector_t new; 2284 2285 if (strict_strtoull(buf, 10, &blocks) < 0) 2286 return -EINVAL; 2287 2288 if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) 2289 return -EINVAL; /* sector conversion overflow */ 2290 2291 new = blocks * 2; 2292 if (new != blocks * 2) 2293 return -EINVAL; /* unsigned long long to sector_t overflow */ 2294 2295 *sectors = new; 2296 return 0; 2297 } 2298 2299 static ssize_t 2300 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2301 { 2302 mddev_t *my_mddev = rdev->mddev; 2303 sector_t oldsectors = rdev->sectors; 2304 sector_t sectors; 2305 2306 if (strict_blocks_to_sectors(buf, &sectors) < 0) 2307 return -EINVAL; 2308 if (my_mddev->pers && rdev->raid_disk >= 0) { 2309 if (my_mddev->persistent) { 2310 sectors = super_types[my_mddev->major_version]. 2311 rdev_size_change(rdev, sectors); 2312 if (!sectors) 2313 return -EBUSY; 2314 } else if (!sectors) 2315 sectors = (rdev->bdev->bd_inode->i_size >> 9) - 2316 rdev->data_offset; 2317 } 2318 if (sectors < my_mddev->dev_sectors) 2319 return -EINVAL; /* component must fit device */ 2320 2321 rdev->sectors = sectors; 2322 if (sectors > oldsectors && my_mddev->external) { 2323 /* need to check that all other rdevs with the same ->bdev 2324 * do not overlap. We need to unlock the mddev to avoid 2325 * a deadlock. We have already changed rdev->sectors, and if 2326 * we have to change it back, we will have the lock again.
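 * The loop below does exactly that: it walks every array with
 * for_each_mddev(), compares ranges on the same underlying bdev with
 * overlaps(), and rolls rdev->sectors back to oldsectors if anything
 * intersects.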
2327 */ 2328 mddev_t *mddev; 2329 int overlap = 0; 2330 struct list_head *tmp; 2331 2332 mddev_unlock(my_mddev); 2333 for_each_mddev(mddev, tmp) { 2334 mdk_rdev_t *rdev2; 2335 2336 mddev_lock(mddev); 2337 list_for_each_entry(rdev2, &mddev->disks, same_set) 2338 if (test_bit(AllReserved, &rdev2->flags) || 2339 (rdev->bdev == rdev2->bdev && 2340 rdev != rdev2 && 2341 overlaps(rdev->data_offset, rdev->sectors, 2342 rdev2->data_offset, 2343 rdev2->sectors))) { 2344 overlap = 1; 2345 break; 2346 } 2347 mddev_unlock(mddev); 2348 if (overlap) { 2349 mddev_put(mddev); 2350 break; 2351 } 2352 } 2353 mddev_lock(my_mddev); 2354 if (overlap) { 2355 /* Someone else could have slipped in a size 2356 * change here, but doing so is just silly. 2357 * We put oldsectors back because we *know* it is 2358 * safe, and trust userspace not to race with 2359 * itself 2360 */ 2361 rdev->sectors = oldsectors; 2362 return -EBUSY; 2363 } 2364 } 2365 return len; 2366 } 2367 2368 static struct rdev_sysfs_entry rdev_size = 2369 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 2370 2371 static struct attribute *rdev_default_attrs[] = { 2372 &rdev_state.attr, 2373 &rdev_errors.attr, 2374 &rdev_slot.attr, 2375 &rdev_offset.attr, 2376 &rdev_size.attr, 2377 NULL, 2378 }; 2379 static ssize_t 2380 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 2381 { 2382 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 2383 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 2384 mddev_t *mddev = rdev->mddev; 2385 ssize_t rv; 2386 2387 if (!entry->show) 2388 return -EIO; 2389 2390 rv = mddev ? mddev_lock(mddev) : -EBUSY; 2391 if (!rv) { 2392 if (rdev->mddev == NULL) 2393 rv = -EBUSY; 2394 else 2395 rv = entry->show(rdev, page); 2396 mddev_unlock(mddev); 2397 } 2398 return rv; 2399 } 2400 2401 static ssize_t 2402 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 2403 const char *page, size_t length) 2404 { 2405 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 2406 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 2407 ssize_t rv; 2408 mddev_t *mddev = rdev->mddev; 2409 2410 if (!entry->store) 2411 return -EIO; 2412 if (!capable(CAP_SYS_ADMIN)) 2413 return -EACCES; 2414 rv = mddev ? mddev_lock(mddev): -EBUSY; 2415 if (!rv) { 2416 if (rdev->mddev == NULL) 2417 rv = -EBUSY; 2418 else 2419 rv = entry->store(rdev, page, length); 2420 mddev_unlock(mddev); 2421 } 2422 return rv; 2423 } 2424 2425 static void rdev_free(struct kobject *ko) 2426 { 2427 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); 2428 kfree(rdev); 2429 } 2430 static struct sysfs_ops rdev_sysfs_ops = { 2431 .show = rdev_attr_show, 2432 .store = rdev_attr_store, 2433 }; 2434 static struct kobj_type rdev_ktype = { 2435 .release = rdev_free, 2436 .sysfs_ops = &rdev_sysfs_ops, 2437 .default_attrs = rdev_default_attrs, 2438 }; 2439 2440 /* 2441 * Import a device. If 'super_format' >= 0, then sanity check the superblock 2442 * 2443 * mark the device faulty if: 2444 * 2445 * - the device is nonexistent (zero size) 2446 * - the device has no valid superblock 2447 * 2448 * a faulty rdev _never_ has rdev->sb set. 
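 * On failure an ERR_PTR() is returned rather than NULL, so callers
 * (for example new_dev_store() below) must test the result with IS_ERR().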
2449 */ 2450 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) 2451 { 2452 char b[BDEVNAME_SIZE]; 2453 int err; 2454 mdk_rdev_t *rdev; 2455 sector_t size; 2456 2457 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 2458 if (!rdev) { 2459 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 2460 return ERR_PTR(-ENOMEM); 2461 } 2462 2463 if ((err = alloc_disk_sb(rdev))) 2464 goto abort_free; 2465 2466 err = lock_rdev(rdev, newdev, super_format == -2); 2467 if (err) 2468 goto abort_free; 2469 2470 kobject_init(&rdev->kobj, &rdev_ktype); 2471 2472 rdev->desc_nr = -1; 2473 rdev->saved_raid_disk = -1; 2474 rdev->raid_disk = -1; 2475 rdev->flags = 0; 2476 rdev->data_offset = 0; 2477 rdev->sb_events = 0; 2478 atomic_set(&rdev->nr_pending, 0); 2479 atomic_set(&rdev->read_errors, 0); 2480 atomic_set(&rdev->corrected_errors, 0); 2481 2482 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2483 if (!size) { 2484 printk(KERN_WARNING 2485 "md: %s has zero or unknown size, marking faulty!\n", 2486 bdevname(rdev->bdev,b)); 2487 err = -EINVAL; 2488 goto abort_free; 2489 } 2490 2491 if (super_format >= 0) { 2492 err = super_types[super_format]. 2493 load_super(rdev, NULL, super_minor); 2494 if (err == -EINVAL) { 2495 printk(KERN_WARNING 2496 "md: %s does not have a valid v%d.%d " 2497 "superblock, not importing!\n", 2498 bdevname(rdev->bdev,b), 2499 super_format, super_minor); 2500 goto abort_free; 2501 } 2502 if (err < 0) { 2503 printk(KERN_WARNING 2504 "md: could not read %s's sb, not importing!\n", 2505 bdevname(rdev->bdev,b)); 2506 goto abort_free; 2507 } 2508 } 2509 2510 INIT_LIST_HEAD(&rdev->same_set); 2511 init_waitqueue_head(&rdev->blocked_wait); 2512 2513 return rdev; 2514 2515 abort_free: 2516 if (rdev->sb_page) { 2517 if (rdev->bdev) 2518 unlock_rdev(rdev); 2519 free_disk_sb(rdev); 2520 } 2521 kfree(rdev); 2522 return ERR_PTR(err); 2523 } 2524 2525 /* 2526 * Check a full RAID array for plausibility 2527 */ 2528 2529 2530 static void analyze_sbs(mddev_t * mddev) 2531 { 2532 int i; 2533 mdk_rdev_t *rdev, *freshest, *tmp; 2534 char b[BDEVNAME_SIZE]; 2535 2536 freshest = NULL; 2537 rdev_for_each(rdev, tmp, mddev) 2538 switch (super_types[mddev->major_version]. 2539 load_super(rdev, freshest, mddev->minor_version)) { 2540 case 1: 2541 freshest = rdev; 2542 break; 2543 case 0: 2544 break; 2545 default: 2546 printk( KERN_ERR \ 2547 "md: fatal superblock inconsistency in %s" 2548 " -- removing from array\n", 2549 bdevname(rdev->bdev,b)); 2550 kick_rdev_from_array(rdev); 2551 } 2552 2553 2554 super_types[mddev->major_version]. 2555 validate_super(mddev, freshest); 2556 2557 i = 0; 2558 rdev_for_each(rdev, tmp, mddev) { 2559 if (rdev->desc_nr >= mddev->max_disks || 2560 i > mddev->max_disks) { 2561 printk(KERN_WARNING 2562 "md: %s: %s: only %d devices permitted\n", 2563 mdname(mddev), bdevname(rdev->bdev, b), 2564 mddev->max_disks); 2565 kick_rdev_from_array(rdev); 2566 continue; 2567 } 2568 if (rdev != freshest) 2569 if (super_types[mddev->major_version]. 
2570 validate_super(mddev, rdev)) { 2571 printk(KERN_WARNING "md: kicking non-fresh %s" 2572 " from array!\n", 2573 bdevname(rdev->bdev,b)); 2574 kick_rdev_from_array(rdev); 2575 continue; 2576 } 2577 if (mddev->level == LEVEL_MULTIPATH) { 2578 rdev->desc_nr = i++; 2579 rdev->raid_disk = rdev->desc_nr; 2580 set_bit(In_sync, &rdev->flags); 2581 } else if (rdev->raid_disk >= mddev->raid_disks) { 2582 rdev->raid_disk = -1; 2583 clear_bit(In_sync, &rdev->flags); 2584 } 2585 } 2586 } 2587 2588 static void md_safemode_timeout(unsigned long data); 2589 2590 static ssize_t 2591 safe_delay_show(mddev_t *mddev, char *page) 2592 { 2593 int msec = (mddev->safemode_delay*1000)/HZ; 2594 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); 2595 } 2596 static ssize_t 2597 safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) 2598 { 2599 int scale=1; 2600 int dot=0; 2601 int i; 2602 unsigned long msec; 2603 char buf[30]; 2604 2605 /* remove a period, and count digits after it */ 2606 if (len >= sizeof(buf)) 2607 return -EINVAL; 2608 strlcpy(buf, cbuf, sizeof(buf)); 2609 for (i=0; i<len; i++) { 2610 if (dot) { 2611 if (isdigit(buf[i])) { 2612 buf[i-1] = buf[i]; 2613 scale *= 10; 2614 } 2615 buf[i] = 0; 2616 } else if (buf[i] == '.') { 2617 dot=1; 2618 buf[i] = 0; 2619 } 2620 } 2621 if (strict_strtoul(buf, 10, &msec) < 0) 2622 return -EINVAL; 2623 msec = (msec * 1000) / scale; 2624 if (msec == 0) 2625 mddev->safemode_delay = 0; 2626 else { 2627 unsigned long old_delay = mddev->safemode_delay; 2628 mddev->safemode_delay = (msec*HZ)/1000; 2629 if (mddev->safemode_delay == 0) 2630 mddev->safemode_delay = 1; 2631 if (mddev->safemode_delay < old_delay) 2632 md_safemode_timeout((unsigned long)mddev); 2633 } 2634 return len; 2635 } 2636 static struct md_sysfs_entry md_safe_delay = 2637 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 2638 2639 static ssize_t 2640 level_show(mddev_t *mddev, char *page) 2641 { 2642 struct mdk_personality *p = mddev->pers; 2643 if (p) 2644 return sprintf(page, "%s\n", p->name); 2645 else if (mddev->clevel[0]) 2646 return sprintf(page, "%s\n", mddev->clevel); 2647 else if (mddev->level != LEVEL_NONE) 2648 return sprintf(page, "%d\n", mddev->level); 2649 else 2650 return 0; 2651 } 2652 2653 static ssize_t 2654 level_store(mddev_t *mddev, const char *buf, size_t len) 2655 { 2656 char level[16]; 2657 ssize_t rv = len; 2658 struct mdk_personality *pers; 2659 void *priv; 2660 2661 if (mddev->pers == NULL) { 2662 if (len == 0) 2663 return 0; 2664 if (len >= sizeof(mddev->clevel)) 2665 return -ENOSPC; 2666 strncpy(mddev->clevel, buf, len); 2667 if (mddev->clevel[len-1] == '\n') 2668 len--; 2669 mddev->clevel[len] = 0; 2670 mddev->level = LEVEL_NONE; 2671 return rv; 2672 } 2673 2674 /* request to change the personality. Need to ensure: 2675 * - array is not engaged in resync/recovery/reshape 2676 * - old personality can be suspended 2677 * - new personality will access other array. 
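 *
 * Whether the new personality accepts the existing layout is decided by
 * its ->takeover() method further down. As an illustrative sketch only
 * (assuming an array named md0 and a target personality whose module
 * implements takeover), the change is requested from userspace with:
 *
 *   echo raid6 > /sys/block/md0/md/level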
2678 */ 2679 2680 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 2681 return -EBUSY; 2682 2683 if (!mddev->pers->quiesce) { 2684 printk(KERN_WARNING "md: %s: %s does not support online personality change\n", 2685 mdname(mddev), mddev->pers->name); 2686 return -EINVAL; 2687 } 2688 2689 /* Now find the new personality */ 2690 if (len == 0 || len >= sizeof(level)) 2691 return -EINVAL; 2692 strncpy(level, buf, len); 2693 if (level[len-1] == '\n') 2694 len--; 2695 level[len] = 0; 2696 2697 request_module("md-%s", level); 2698 spin_lock(&pers_lock); 2699 pers = find_pers(LEVEL_NONE, level); 2700 if (!pers || !try_module_get(pers->owner)) { 2701 spin_unlock(&pers_lock); 2702 printk(KERN_WARNING "md: personality %s not loaded\n", level); 2703 return -EINVAL; 2704 } 2705 spin_unlock(&pers_lock); 2706 2707 if (pers == mddev->pers) { 2708 /* Nothing to do! */ 2709 module_put(pers->owner); 2710 return rv; 2711 } 2712 if (!pers->takeover) { 2713 module_put(pers->owner); 2714 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", 2715 mdname(mddev), level); 2716 return -EINVAL; 2717 } 2718 2719 /* ->takeover must set new_* and/or delta_disks 2720 * if it succeeds, and may set them when it fails. 2721 */ 2722 priv = pers->takeover(mddev); 2723 if (IS_ERR(priv)) { 2724 mddev->new_level = mddev->level; 2725 mddev->new_layout = mddev->layout; 2726 mddev->new_chunk_sectors = mddev->chunk_sectors; 2727 mddev->raid_disks -= mddev->delta_disks; 2728 mddev->delta_disks = 0; 2729 module_put(pers->owner); 2730 printk(KERN_WARNING "md: %s: %s would not accept array\n", 2731 mdname(mddev), level); 2732 return PTR_ERR(priv); 2733 } 2734 2735 /* Looks like we have a winner */ 2736 mddev_suspend(mddev); 2737 mddev->pers->stop(mddev); 2738 module_put(mddev->pers->owner); 2739 mddev->pers = pers; 2740 mddev->private = priv; 2741 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 2742 mddev->level = mddev->new_level; 2743 mddev->layout = mddev->new_layout; 2744 mddev->chunk_sectors = mddev->new_chunk_sectors; 2745 mddev->delta_disks = 0; 2746 pers->run(mddev); 2747 mddev_resume(mddev); 2748 set_bit(MD_CHANGE_DEVS, &mddev->flags); 2749 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2750 md_wakeup_thread(mddev->thread); 2751 return rv; 2752 } 2753 2754 static struct md_sysfs_entry md_level = 2755 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 2756 2757 2758 static ssize_t 2759 layout_show(mddev_t *mddev, char *page) 2760 { 2761 /* just a number, not meaningful for all levels */ 2762 if (mddev->reshape_position != MaxSector && 2763 mddev->layout != mddev->new_layout) 2764 return sprintf(page, "%d (%d)\n", 2765 mddev->new_layout, mddev->layout); 2766 return sprintf(page, "%d\n", mddev->layout); 2767 } 2768 2769 static ssize_t 2770 layout_store(mddev_t *mddev, const char *buf, size_t len) 2771 { 2772 char *e; 2773 unsigned long n = simple_strtoul(buf, &e, 10); 2774 2775 if (!*buf || (*e && *e != '\n')) 2776 return -EINVAL; 2777 2778 if (mddev->pers) { 2779 int err; 2780 if (mddev->pers->check_reshape == NULL) 2781 return -EBUSY; 2782 mddev->new_layout = n; 2783 err = mddev->pers->check_reshape(mddev); 2784 if (err) { 2785 mddev->new_layout = mddev->layout; 2786 return err; 2787 } 2788 } else { 2789 mddev->new_layout = n; 2790 if (mddev->reshape_position == MaxSector) 2791 mddev->layout = n; 2792 } 2793 return len; 2794 } 2795 static struct md_sysfs_entry md_layout = 2796 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 2797 2798 2799 static ssize_t 2800 
raid_disks_show(mddev_t *mddev, char *page) 2801 { 2802 if (mddev->raid_disks == 0) 2803 return 0; 2804 if (mddev->reshape_position != MaxSector && 2805 mddev->delta_disks != 0) 2806 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 2807 mddev->raid_disks - mddev->delta_disks); 2808 return sprintf(page, "%d\n", mddev->raid_disks); 2809 } 2810 2811 static int update_raid_disks(mddev_t *mddev, int raid_disks); 2812 2813 static ssize_t 2814 raid_disks_store(mddev_t *mddev, const char *buf, size_t len) 2815 { 2816 char *e; 2817 int rv = 0; 2818 unsigned long n = simple_strtoul(buf, &e, 10); 2819 2820 if (!*buf || (*e && *e != '\n')) 2821 return -EINVAL; 2822 2823 if (mddev->pers) 2824 rv = update_raid_disks(mddev, n); 2825 else if (mddev->reshape_position != MaxSector) { 2826 int olddisks = mddev->raid_disks - mddev->delta_disks; 2827 mddev->delta_disks = n - olddisks; 2828 mddev->raid_disks = n; 2829 } else 2830 mddev->raid_disks = n; 2831 return rv ? rv : len; 2832 } 2833 static struct md_sysfs_entry md_raid_disks = 2834 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 2835 2836 static ssize_t 2837 chunk_size_show(mddev_t *mddev, char *page) 2838 { 2839 if (mddev->reshape_position != MaxSector && 2840 mddev->chunk_sectors != mddev->new_chunk_sectors) 2841 return sprintf(page, "%d (%d)\n", 2842 mddev->new_chunk_sectors << 9, 2843 mddev->chunk_sectors << 9); 2844 return sprintf(page, "%d\n", mddev->chunk_sectors << 9); 2845 } 2846 2847 static ssize_t 2848 chunk_size_store(mddev_t *mddev, const char *buf, size_t len) 2849 { 2850 char *e; 2851 unsigned long n = simple_strtoul(buf, &e, 10); 2852 2853 if (!*buf || (*e && *e != '\n')) 2854 return -EINVAL; 2855 2856 if (mddev->pers) { 2857 int err; 2858 if (mddev->pers->check_reshape == NULL) 2859 return -EBUSY; 2860 mddev->new_chunk_sectors = n >> 9; 2861 err = mddev->pers->check_reshape(mddev); 2862 if (err) { 2863 mddev->new_chunk_sectors = mddev->chunk_sectors; 2864 return err; 2865 } 2866 } else { 2867 mddev->new_chunk_sectors = n >> 9; 2868 if (mddev->reshape_position == MaxSector) 2869 mddev->chunk_sectors = n >> 9; 2870 } 2871 return len; 2872 } 2873 static struct md_sysfs_entry md_chunk_size = 2874 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 2875 2876 static ssize_t 2877 resync_start_show(mddev_t *mddev, char *page) 2878 { 2879 if (mddev->recovery_cp == MaxSector) 2880 return sprintf(page, "none\n"); 2881 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 2882 } 2883 2884 static ssize_t 2885 resync_start_store(mddev_t *mddev, const char *buf, size_t len) 2886 { 2887 char *e; 2888 unsigned long long n = simple_strtoull(buf, &e, 10); 2889 2890 if (mddev->pers) 2891 return -EBUSY; 2892 if (!*buf || (*e && *e != '\n')) 2893 return -EINVAL; 2894 2895 mddev->recovery_cp = n; 2896 return len; 2897 } 2898 static struct md_sysfs_entry md_resync_start = 2899 __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); 2900 2901 /* 2902 * The array state can be: 2903 * 2904 * clear 2905 * No devices, no size, no level 2906 * Equivalent to STOP_ARRAY ioctl 2907 * inactive 2908 * May have some settings, but array is not active 2909 * all IO results in error 2910 * When written, doesn't tear down array, but just stops it 2911 * suspended (not supported yet) 2912 * All IO requests will block. The array can be reconfigured. 2913 * Writing this, if accepted, will block until array is quiescent 2914 * readonly 2915 * no resync can happen. no superblocks get written. 
2916 * write requests fail 2917 * read-auto 2918 * like readonly, but behaves like 'clean' on a write request. 2919 * 2920 * clean - no pending writes, but otherwise active. 2921 * When written to inactive array, starts without resync 2922 * If a write request arrives then 2923 * if metadata is known, mark 'dirty' and switch to 'active'. 2924 * if not known, block and switch to write-pending 2925 * If written to an active array that has pending writes, then fails. 2926 * active 2927 * fully active: IO and resync can be happening. 2928 * When written to inactive array, starts with resync 2929 * 2930 * write-pending 2931 * clean, but writes are blocked waiting for 'active' to be written. 2932 * 2933 * active-idle 2934 * like active, but no writes have been seen for a while (100msec). 2935 * 2936 */ 2937 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 2938 write_pending, active_idle, bad_word}; 2939 static char *array_states[] = { 2940 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 2941 "write-pending", "active-idle", NULL }; 2942 2943 static int match_word(const char *word, char **list) 2944 { 2945 int n; 2946 for (n=0; list[n]; n++) 2947 if (cmd_match(word, list[n])) 2948 break; 2949 return n; 2950 } 2951 2952 static ssize_t 2953 array_state_show(mddev_t *mddev, char *page) 2954 { 2955 enum array_state st = inactive; 2956 2957 if (mddev->pers) 2958 switch(mddev->ro) { 2959 case 1: 2960 st = readonly; 2961 break; 2962 case 2: 2963 st = read_auto; 2964 break; 2965 case 0: 2966 if (mddev->in_sync) 2967 st = clean; 2968 else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) 2969 st = write_pending; 2970 else if (mddev->safemode) 2971 st = active_idle; 2972 else 2973 st = active; 2974 } 2975 else { 2976 if (list_empty(&mddev->disks) && 2977 mddev->raid_disks == 0 && 2978 mddev->dev_sectors == 0) 2979 st = clear; 2980 else 2981 st = inactive; 2982 } 2983 return sprintf(page, "%s\n", array_states[st]); 2984 } 2985 2986 static int do_md_stop(mddev_t * mddev, int ro, int is_open); 2987 static int do_md_run(mddev_t * mddev); 2988 static int restart_array(mddev_t *mddev); 2989 2990 static ssize_t 2991 array_state_store(mddev_t *mddev, const char *buf, size_t len) 2992 { 2993 int err = -EINVAL; 2994 enum array_state st = match_word(buf, array_states); 2995 switch(st) { 2996 case bad_word: 2997 break; 2998 case clear: 2999 /* stopping an active array */ 3000 if (atomic_read(&mddev->openers) > 0) 3001 return -EBUSY; 3002 err = do_md_stop(mddev, 0, 0); 3003 break; 3004 case inactive: 3005 /* stopping an active array */ 3006 if (mddev->pers) { 3007 if (atomic_read(&mddev->openers) > 0) 3008 return -EBUSY; 3009 err = do_md_stop(mddev, 2, 0); 3010 } else 3011 err = 0; /* already inactive */ 3012 break; 3013 case suspended: 3014 break; /* not supported yet */ 3015 case readonly: 3016 if (mddev->pers) 3017 err = do_md_stop(mddev, 1, 0); 3018 else { 3019 mddev->ro = 1; 3020 set_disk_ro(mddev->gendisk, 1); 3021 err = do_md_run(mddev); 3022 } 3023 break; 3024 case read_auto: 3025 if (mddev->pers) { 3026 if (mddev->ro == 0) 3027 err = do_md_stop(mddev, 1, 0); 3028 else if (mddev->ro == 1) 3029 err = restart_array(mddev); 3030 if (err == 0) { 3031 mddev->ro = 2; 3032 set_disk_ro(mddev->gendisk, 0); 3033 } 3034 } else { 3035 mddev->ro = 2; 3036 err = do_md_run(mddev); 3037 } 3038 break; 3039 case clean: 3040 if (mddev->pers) { 3041 restart_array(mddev); 3042 spin_lock_irq(&mddev->write_lock); 3043 if (atomic_read(&mddev->writes_pending) == 0) { 3044 if 
(mddev->in_sync == 0) { 3045 mddev->in_sync = 1; 3046 if (mddev->safemode == 1) 3047 mddev->safemode = 0; 3048 if (mddev->persistent) 3049 set_bit(MD_CHANGE_CLEAN, 3050 &mddev->flags); 3051 } 3052 err = 0; 3053 } else 3054 err = -EBUSY; 3055 spin_unlock_irq(&mddev->write_lock); 3056 } else 3057 err = -EINVAL; 3058 break; 3059 case active: 3060 if (mddev->pers) { 3061 restart_array(mddev); 3062 if (mddev->external) 3063 clear_bit(MD_CHANGE_CLEAN, &mddev->flags); 3064 wake_up(&mddev->sb_wait); 3065 err = 0; 3066 } else { 3067 mddev->ro = 0; 3068 set_disk_ro(mddev->gendisk, 0); 3069 err = do_md_run(mddev); 3070 } 3071 break; 3072 case write_pending: 3073 case active_idle: 3074 /* these cannot be set */ 3075 break; 3076 } 3077 if (err) 3078 return err; 3079 else { 3080 sysfs_notify_dirent(mddev->sysfs_state); 3081 return len; 3082 } 3083 } 3084 static struct md_sysfs_entry md_array_state = 3085 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 3086 3087 static ssize_t 3088 null_show(mddev_t *mddev, char *page) 3089 { 3090 return -EINVAL; 3091 } 3092 3093 static ssize_t 3094 new_dev_store(mddev_t *mddev, const char *buf, size_t len) 3095 { 3096 /* buf must be %d:%d\n? giving major and minor numbers */ 3097 /* The new device is added to the array. 3098 * If the array has a persistent superblock, we read the 3099 * superblock to initialise info and check validity. 3100 * Otherwise, only checking done is that in bind_rdev_to_array, 3101 * which mainly checks size. 3102 */ 3103 char *e; 3104 int major = simple_strtoul(buf, &e, 10); 3105 int minor; 3106 dev_t dev; 3107 mdk_rdev_t *rdev; 3108 int err; 3109 3110 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 3111 return -EINVAL; 3112 minor = simple_strtoul(e+1, &e, 10); 3113 if (*e && *e != '\n') 3114 return -EINVAL; 3115 dev = MKDEV(major, minor); 3116 if (major != MAJOR(dev) || 3117 minor != MINOR(dev)) 3118 return -EOVERFLOW; 3119 3120 3121 if (mddev->persistent) { 3122 rdev = md_import_device(dev, mddev->major_version, 3123 mddev->minor_version); 3124 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 3125 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 3126 mdk_rdev_t, same_set); 3127 err = super_types[mddev->major_version] 3128 .load_super(rdev, rdev0, mddev->minor_version); 3129 if (err < 0) 3130 goto out; 3131 } 3132 } else if (mddev->external) 3133 rdev = md_import_device(dev, -2, -1); 3134 else 3135 rdev = md_import_device(dev, -1, -1); 3136 3137 if (IS_ERR(rdev)) 3138 return PTR_ERR(rdev); 3139 err = bind_rdev_to_array(rdev, mddev); 3140 out: 3141 if (err) 3142 export_rdev(rdev); 3143 return err ? err : len; 3144 } 3145 3146 static struct md_sysfs_entry md_new_device = 3147 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 3148 3149 static ssize_t 3150 bitmap_store(mddev_t *mddev, const char *buf, size_t len) 3151 { 3152 char *end; 3153 unsigned long chunk, end_chunk; 3154 3155 if (!mddev->bitmap) 3156 goto out; 3157 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... 
(range) */ 3158 while (*buf) { 3159 chunk = end_chunk = simple_strtoul(buf, &end, 0); 3160 if (buf == end) break; 3161 if (*end == '-') { /* range */ 3162 buf = end + 1; 3163 end_chunk = simple_strtoul(buf, &end, 0); 3164 if (buf == end) break; 3165 } 3166 if (*end && !isspace(*end)) break; 3167 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); 3168 buf = end; 3169 while (isspace(*buf)) buf++; 3170 } 3171 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ 3172 out: 3173 return len; 3174 } 3175 3176 static struct md_sysfs_entry md_bitmap = 3177 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 3178 3179 static ssize_t 3180 size_show(mddev_t *mddev, char *page) 3181 { 3182 return sprintf(page, "%llu\n", 3183 (unsigned long long)mddev->dev_sectors / 2); 3184 } 3185 3186 static int update_size(mddev_t *mddev, sector_t num_sectors); 3187 3188 static ssize_t 3189 size_store(mddev_t *mddev, const char *buf, size_t len) 3190 { 3191 /* If array is inactive, we can reduce the component size, but 3192 * not increase it (except from 0). 3193 * If array is active, we can try an on-line resize 3194 */ 3195 sector_t sectors; 3196 int err = strict_blocks_to_sectors(buf, &sectors); 3197 3198 if (err < 0) 3199 return err; 3200 if (mddev->pers) { 3201 err = update_size(mddev, sectors); 3202 md_update_sb(mddev, 1); 3203 } else { 3204 if (mddev->dev_sectors == 0 || 3205 mddev->dev_sectors > sectors) 3206 mddev->dev_sectors = sectors; 3207 else 3208 err = -ENOSPC; 3209 } 3210 return err ? err : len; 3211 } 3212 3213 static struct md_sysfs_entry md_size = 3214 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 3215 3216 3217 /* Metadata version. 3218 * This is one of 3219 * 'none' for arrays with no metadata (good luck...) 3220 * 'external' for arrays with externally managed metadata, 3221 * or N.M for internally known formats 3222 */ 3223 static ssize_t 3224 metadata_show(mddev_t *mddev, char *page) 3225 { 3226 if (mddev->persistent) 3227 return sprintf(page, "%d.%d\n", 3228 mddev->major_version, mddev->minor_version); 3229 else if (mddev->external) 3230 return sprintf(page, "external:%s\n", mddev->metadata_type); 3231 else 3232 return sprintf(page, "none\n"); 3233 } 3234 3235 static ssize_t 3236 metadata_store(mddev_t *mddev, const char *buf, size_t len) 3237 { 3238 int major, minor; 3239 char *e; 3240 /* Changing the details of 'external' metadata is 3241 * always permitted. Otherwise there must be 3242 * no devices attached to the array.
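 *
 * Illustrative writes only (an array named md0 and the imsm external
 * format are assumed), matching the parsing below:
 *
 *   echo none          > /sys/block/md0/md/metadata_version
 *   echo external:imsm > /sys/block/md0/md/metadata_version
 *   echo 1.2           > /sys/block/md0/md/metadata_version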
3243 */ 3244 if (mddev->external && strncmp(buf, "external:", 9) == 0) 3245 ; 3246 else if (!list_empty(&mddev->disks)) 3247 return -EBUSY; 3248 3249 if (cmd_match(buf, "none")) { 3250 mddev->persistent = 0; 3251 mddev->external = 0; 3252 mddev->major_version = 0; 3253 mddev->minor_version = 90; 3254 return len; 3255 } 3256 if (strncmp(buf, "external:", 9) == 0) { 3257 size_t namelen = len-9; 3258 if (namelen >= sizeof(mddev->metadata_type)) 3259 namelen = sizeof(mddev->metadata_type)-1; 3260 strncpy(mddev->metadata_type, buf+9, namelen); 3261 mddev->metadata_type[namelen] = 0; 3262 if (namelen && mddev->metadata_type[namelen-1] == '\n') 3263 mddev->metadata_type[--namelen] = 0; 3264 mddev->persistent = 0; 3265 mddev->external = 1; 3266 mddev->major_version = 0; 3267 mddev->minor_version = 90; 3268 return len; 3269 } 3270 major = simple_strtoul(buf, &e, 10); 3271 if (e==buf || *e != '.') 3272 return -EINVAL; 3273 buf = e+1; 3274 minor = simple_strtoul(buf, &e, 10); 3275 if (e==buf || (*e && *e != '\n') ) 3276 return -EINVAL; 3277 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 3278 return -ENOENT; 3279 mddev->major_version = major; 3280 mddev->minor_version = minor; 3281 mddev->persistent = 1; 3282 mddev->external = 0; 3283 return len; 3284 } 3285 3286 static struct md_sysfs_entry md_metadata = 3287 __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 3288 3289 static ssize_t 3290 action_show(mddev_t *mddev, char *page) 3291 { 3292 char *type = "idle"; 3293 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 3294 type = "frozen"; 3295 else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3296 (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { 3297 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 3298 type = "reshape"; 3299 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 3300 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 3301 type = "resync"; 3302 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 3303 type = "check"; 3304 else 3305 type = "repair"; 3306 } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) 3307 type = "recover"; 3308 } 3309 return sprintf(page, "%s\n", type); 3310 } 3311 3312 static ssize_t 3313 action_store(mddev_t *mddev, const char *page, size_t len) 3314 { 3315 if (!mddev->pers || !mddev->pers->sync_request) 3316 return -EINVAL; 3317 3318 if (cmd_match(page, "frozen")) 3319 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3320 else 3321 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3322 3323 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { 3324 if (mddev->sync_thread) { 3325 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3326 md_unregister_thread(mddev->sync_thread); 3327 mddev->sync_thread = NULL; 3328 mddev->recovery = 0; 3329 } 3330 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3331 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 3332 return -EBUSY; 3333 else if (cmd_match(page, "resync")) 3334 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3335 else if (cmd_match(page, "recover")) { 3336 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 3337 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3338 } else if (cmd_match(page, "reshape")) { 3339 int err; 3340 if (mddev->pers->start_reshape == NULL) 3341 return -EINVAL; 3342 err = mddev->pers->start_reshape(mddev); 3343 if (err) 3344 return err; 3345 sysfs_notify(&mddev->kobj, NULL, "degraded"); 3346 } else { 3347 if (cmd_match(page, "check")) 3348 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 3349 
else if (!cmd_match(page, "repair")) 3350 return -EINVAL; 3351 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 3352 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 3353 } 3354 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3355 md_wakeup_thread(mddev->thread); 3356 sysfs_notify_dirent(mddev->sysfs_action); 3357 return len; 3358 } 3359 3360 static ssize_t 3361 mismatch_cnt_show(mddev_t *mddev, char *page) 3362 { 3363 return sprintf(page, "%llu\n", 3364 (unsigned long long) mddev->resync_mismatches); 3365 } 3366 3367 static struct md_sysfs_entry md_scan_mode = 3368 __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 3369 3370 3371 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 3372 3373 static ssize_t 3374 sync_min_show(mddev_t *mddev, char *page) 3375 { 3376 return sprintf(page, "%d (%s)\n", speed_min(mddev), 3377 mddev->sync_speed_min ? "local": "system"); 3378 } 3379 3380 static ssize_t 3381 sync_min_store(mddev_t *mddev, const char *buf, size_t len) 3382 { 3383 int min; 3384 char *e; 3385 if (strncmp(buf, "system", 6)==0) { 3386 mddev->sync_speed_min = 0; 3387 return len; 3388 } 3389 min = simple_strtoul(buf, &e, 10); 3390 if (buf == e || (*e && *e != '\n') || min <= 0) 3391 return -EINVAL; 3392 mddev->sync_speed_min = min; 3393 return len; 3394 } 3395 3396 static struct md_sysfs_entry md_sync_min = 3397 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 3398 3399 static ssize_t 3400 sync_max_show(mddev_t *mddev, char *page) 3401 { 3402 return sprintf(page, "%d (%s)\n", speed_max(mddev), 3403 mddev->sync_speed_max ? "local": "system"); 3404 } 3405 3406 static ssize_t 3407 sync_max_store(mddev_t *mddev, const char *buf, size_t len) 3408 { 3409 int max; 3410 char *e; 3411 if (strncmp(buf, "system", 6)==0) { 3412 mddev->sync_speed_max = 0; 3413 return len; 3414 } 3415 max = simple_strtoul(buf, &e, 10); 3416 if (buf == e || (*e && *e != '\n') || max <= 0) 3417 return -EINVAL; 3418 mddev->sync_speed_max = max; 3419 return len; 3420 } 3421 3422 static struct md_sysfs_entry md_sync_max = 3423 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 3424 3425 static ssize_t 3426 degraded_show(mddev_t *mddev, char *page) 3427 { 3428 return sprintf(page, "%d\n", mddev->degraded); 3429 } 3430 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 3431 3432 static ssize_t 3433 sync_force_parallel_show(mddev_t *mddev, char *page) 3434 { 3435 return sprintf(page, "%d\n", mddev->parallel_resync); 3436 } 3437 3438 static ssize_t 3439 sync_force_parallel_store(mddev_t *mddev, const char *buf, size_t len) 3440 { 3441 long n; 3442 3443 if (strict_strtol(buf, 10, &n)) 3444 return -EINVAL; 3445 3446 if (n != 0 && n != 1) 3447 return -EINVAL; 3448 3449 mddev->parallel_resync = n; 3450 3451 if (mddev->sync_thread) 3452 wake_up(&resync_wait); 3453 3454 return len; 3455 } 3456 3457 /* force parallel resync, even with shared block devices */ 3458 static struct md_sysfs_entry md_sync_force_parallel = 3459 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 3460 sync_force_parallel_show, sync_force_parallel_store); 3461 3462 static ssize_t 3463 sync_speed_show(mddev_t *mddev, char *page) 3464 { 3465 unsigned long resync, dt, db; 3466 if (mddev->curr_resync == 0) 3467 return sprintf(page, "none\n"); 3468 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 3469 dt = (jiffies - mddev->resync_mark) / HZ; 3470 if (!dt) dt++; 3471 db = resync - mddev->resync_mark_cnt; 3472 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 3473 } 
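/* In sync_speed_show() above, db is the number of sectors handled since the
 * last resync mark and dt the seconds elapsed, so db/dt/2 converts 512-byte
 * sectors per second into KiB per second. Illustrative read (an array named
 * md0 is assumed):
 *
 *   cat /sys/block/md0/md/sync_speed
 */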
3474 3475 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 3476 3477 static ssize_t 3478 sync_completed_show(mddev_t *mddev, char *page) 3479 { 3480 unsigned long max_sectors, resync; 3481 3482 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3483 return sprintf(page, "none\n"); 3484 3485 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3486 max_sectors = mddev->resync_max_sectors; 3487 else 3488 max_sectors = mddev->dev_sectors; 3489 3490 resync = mddev->curr_resync_completed; 3491 return sprintf(page, "%lu / %lu\n", resync, max_sectors); 3492 } 3493 3494 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); 3495 3496 static ssize_t 3497 min_sync_show(mddev_t *mddev, char *page) 3498 { 3499 return sprintf(page, "%llu\n", 3500 (unsigned long long)mddev->resync_min); 3501 } 3502 static ssize_t 3503 min_sync_store(mddev_t *mddev, const char *buf, size_t len) 3504 { 3505 unsigned long long min; 3506 if (strict_strtoull(buf, 10, &min)) 3507 return -EINVAL; 3508 if (min > mddev->resync_max) 3509 return -EINVAL; 3510 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3511 return -EBUSY; 3512 3513 /* Must be a multiple of chunk_size */ 3514 if (mddev->chunk_sectors) { 3515 sector_t temp = min; 3516 if (sector_div(temp, mddev->chunk_sectors)) 3517 return -EINVAL; 3518 } 3519 mddev->resync_min = min; 3520 3521 return len; 3522 } 3523 3524 static struct md_sysfs_entry md_min_sync = 3525 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 3526 3527 static ssize_t 3528 max_sync_show(mddev_t *mddev, char *page) 3529 { 3530 if (mddev->resync_max == MaxSector) 3531 return sprintf(page, "max\n"); 3532 else 3533 return sprintf(page, "%llu\n", 3534 (unsigned long long)mddev->resync_max); 3535 } 3536 static ssize_t 3537 max_sync_store(mddev_t *mddev, const char *buf, size_t len) 3538 { 3539 if (strncmp(buf, "max", 3) == 0) 3540 mddev->resync_max = MaxSector; 3541 else { 3542 unsigned long long max; 3543 if (strict_strtoull(buf, 10, &max)) 3544 return -EINVAL; 3545 if (max < mddev->resync_min) 3546 return -EINVAL; 3547 if (max < mddev->resync_max && 3548 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3549 return -EBUSY; 3550 3551 /* Must be a multiple of chunk_size */ 3552 if (mddev->chunk_sectors) { 3553 sector_t temp = max; 3554 if (sector_div(temp, mddev->chunk_sectors)) 3555 return -EINVAL; 3556 } 3557 mddev->resync_max = max; 3558 } 3559 wake_up(&mddev->recovery_wait); 3560 return len; 3561 } 3562 3563 static struct md_sysfs_entry md_max_sync = 3564 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 3565 3566 static ssize_t 3567 suspend_lo_show(mddev_t *mddev, char *page) 3568 { 3569 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 3570 } 3571 3572 static ssize_t 3573 suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) 3574 { 3575 char *e; 3576 unsigned long long new = simple_strtoull(buf, &e, 10); 3577 3578 if (mddev->pers == NULL || 3579 mddev->pers->quiesce == NULL) 3580 return -EINVAL; 3581 if (buf == e || (*e && *e != '\n')) 3582 return -EINVAL; 3583 if (new >= mddev->suspend_hi || 3584 (new > mddev->suspend_lo && new < mddev->suspend_hi)) { 3585 mddev->suspend_lo = new; 3586 mddev->pers->quiesce(mddev, 2); 3587 return len; 3588 } else 3589 return -EINVAL; 3590 } 3591 static struct md_sysfs_entry md_suspend_lo = 3592 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 3593 3594 3595 static ssize_t 3596 suspend_hi_show(mddev_t *mddev, char *page) 3597 { 3598 return 
sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); 3599 } 3600 3601 static ssize_t 3602 suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) 3603 { 3604 char *e; 3605 unsigned long long new = simple_strtoull(buf, &e, 10); 3606 3607 if (mddev->pers == NULL || 3608 mddev->pers->quiesce == NULL) 3609 return -EINVAL; 3610 if (buf == e || (*e && *e != '\n')) 3611 return -EINVAL; 3612 if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) || 3613 (new > mddev->suspend_lo && new > mddev->suspend_hi)) { 3614 mddev->suspend_hi = new; 3615 mddev->pers->quiesce(mddev, 1); 3616 mddev->pers->quiesce(mddev, 0); 3617 return len; 3618 } else 3619 return -EINVAL; 3620 } 3621 static struct md_sysfs_entry md_suspend_hi = 3622 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 3623 3624 static ssize_t 3625 reshape_position_show(mddev_t *mddev, char *page) 3626 { 3627 if (mddev->reshape_position != MaxSector) 3628 return sprintf(page, "%llu\n", 3629 (unsigned long long)mddev->reshape_position); 3630 strcpy(page, "none\n"); 3631 return 5; 3632 } 3633 3634 static ssize_t 3635 reshape_position_store(mddev_t *mddev, const char *buf, size_t len) 3636 { 3637 char *e; 3638 unsigned long long new = simple_strtoull(buf, &e, 10); 3639 if (mddev->pers) 3640 return -EBUSY; 3641 if (buf == e || (*e && *e != '\n')) 3642 return -EINVAL; 3643 mddev->reshape_position = new; 3644 mddev->delta_disks = 0; 3645 mddev->new_level = mddev->level; 3646 mddev->new_layout = mddev->layout; 3647 mddev->new_chunk_sectors = mddev->chunk_sectors; 3648 return len; 3649 } 3650 3651 static struct md_sysfs_entry md_reshape_position = 3652 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 3653 reshape_position_store); 3654 3655 static ssize_t 3656 array_size_show(mddev_t *mddev, char *page) 3657 { 3658 if (mddev->external_size) 3659 return sprintf(page, "%llu\n", 3660 (unsigned long long)mddev->array_sectors/2); 3661 else 3662 return sprintf(page, "default\n"); 3663 } 3664 3665 static ssize_t 3666 array_size_store(mddev_t *mddev, const char *buf, size_t len) 3667 { 3668 sector_t sectors; 3669 3670 if (strncmp(buf, "default", 7) == 0) { 3671 if (mddev->pers) 3672 sectors = mddev->pers->size(mddev, 0, 0); 3673 else 3674 sectors = mddev->array_sectors; 3675 3676 mddev->external_size = 0; 3677 } else { 3678 if (strict_blocks_to_sectors(buf, §ors) < 0) 3679 return -EINVAL; 3680 if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) 3681 return -E2BIG; 3682 3683 mddev->external_size = 1; 3684 } 3685 3686 mddev->array_sectors = sectors; 3687 set_capacity(mddev->gendisk, mddev->array_sectors); 3688 if (mddev->pers) { 3689 struct block_device *bdev = bdget_disk(mddev->gendisk, 0); 3690 3691 if (bdev) { 3692 mutex_lock(&bdev->bd_inode->i_mutex); 3693 i_size_write(bdev->bd_inode, 3694 (loff_t)mddev->array_sectors << 9); 3695 mutex_unlock(&bdev->bd_inode->i_mutex); 3696 bdput(bdev); 3697 } 3698 } 3699 3700 return len; 3701 } 3702 3703 static struct md_sysfs_entry md_array_size = 3704 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, 3705 array_size_store); 3706 3707 static struct attribute *md_default_attrs[] = { 3708 &md_level.attr, 3709 &md_layout.attr, 3710 &md_raid_disks.attr, 3711 &md_chunk_size.attr, 3712 &md_size.attr, 3713 &md_resync_start.attr, 3714 &md_metadata.attr, 3715 &md_new_device.attr, 3716 &md_safe_delay.attr, 3717 &md_array_state.attr, 3718 &md_reshape_position.attr, 3719 &md_array_size.attr, 3720 NULL, 3721 }; 3722 3723 static struct attribute 
*md_redundancy_attrs[] = { 3724 &md_scan_mode.attr, 3725 &md_mismatches.attr, 3726 &md_sync_min.attr, 3727 &md_sync_max.attr, 3728 &md_sync_speed.attr, 3729 &md_sync_force_parallel.attr, 3730 &md_sync_completed.attr, 3731 &md_min_sync.attr, 3732 &md_max_sync.attr, 3733 &md_suspend_lo.attr, 3734 &md_suspend_hi.attr, 3735 &md_bitmap.attr, 3736 &md_degraded.attr, 3737 NULL, 3738 }; 3739 static struct attribute_group md_redundancy_group = { 3740 .name = NULL, 3741 .attrs = md_redundancy_attrs, 3742 }; 3743 3744 3745 static ssize_t 3746 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3747 { 3748 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 3749 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 3750 ssize_t rv; 3751 3752 if (!entry->show) 3753 return -EIO; 3754 rv = mddev_lock(mddev); 3755 if (!rv) { 3756 rv = entry->show(mddev, page); 3757 mddev_unlock(mddev); 3758 } 3759 return rv; 3760 } 3761 3762 static ssize_t 3763 md_attr_store(struct kobject *kobj, struct attribute *attr, 3764 const char *page, size_t length) 3765 { 3766 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 3767 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 3768 ssize_t rv; 3769 3770 if (!entry->store) 3771 return -EIO; 3772 if (!capable(CAP_SYS_ADMIN)) 3773 return -EACCES; 3774 rv = mddev_lock(mddev); 3775 if (mddev->hold_active == UNTIL_IOCTL) 3776 mddev->hold_active = 0; 3777 if (!rv) { 3778 rv = entry->store(mddev, page, length); 3779 mddev_unlock(mddev); 3780 } 3781 return rv; 3782 } 3783 3784 static void md_free(struct kobject *ko) 3785 { 3786 mddev_t *mddev = container_of(ko, mddev_t, kobj); 3787 3788 if (mddev->sysfs_state) 3789 sysfs_put(mddev->sysfs_state); 3790 3791 if (mddev->gendisk) { 3792 del_gendisk(mddev->gendisk); 3793 put_disk(mddev->gendisk); 3794 } 3795 if (mddev->queue) 3796 blk_cleanup_queue(mddev->queue); 3797 3798 kfree(mddev); 3799 } 3800 3801 static struct sysfs_ops md_sysfs_ops = { 3802 .show = md_attr_show, 3803 .store = md_attr_store, 3804 }; 3805 static struct kobj_type md_ktype = { 3806 .release = md_free, 3807 .sysfs_ops = &md_sysfs_ops, 3808 .default_attrs = md_default_attrs, 3809 }; 3810 3811 int mdp_major = 0; 3812 3813 static void mddev_delayed_delete(struct work_struct *ws) 3814 { 3815 mddev_t *mddev = container_of(ws, mddev_t, del_work); 3816 3817 if (mddev->private == &md_redundancy_group) { 3818 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 3819 if (mddev->sysfs_action) 3820 sysfs_put(mddev->sysfs_action); 3821 mddev->sysfs_action = NULL; 3822 mddev->private = NULL; 3823 } 3824 kobject_del(&mddev->kobj); 3825 kobject_put(&mddev->kobj); 3826 } 3827 3828 static int md_alloc(dev_t dev, char *name) 3829 { 3830 static DEFINE_MUTEX(disks_mutex); 3831 mddev_t *mddev = mddev_find(dev); 3832 struct gendisk *disk; 3833 int partitioned; 3834 int shift; 3835 int unit; 3836 int error; 3837 3838 if (!mddev) 3839 return -ENODEV; 3840 3841 partitioned = (MAJOR(mddev->unit) != MD_MAJOR); 3842 shift = partitioned ? MdpMinorShift : 0; 3843 unit = MINOR(mddev->unit) >> shift; 3844 3845 /* wait for any previous instance if this device 3846 * to be completed removed (mddev_delayed_delete). 3847 */ 3848 flush_scheduled_work(); 3849 3850 mutex_lock(&disks_mutex); 3851 error = -EEXIST; 3852 if (mddev->gendisk) 3853 goto abort; 3854 3855 if (name) { 3856 /* Need to ensure that 'name' is not a duplicate. 
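 * (The scan below compares the requested name against every existing
 * gendisk name under all_mddevs_lock and bails out with -EEXIST on a
 * match.)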
3857 */ 3858 mddev_t *mddev2; 3859 spin_lock(&all_mddevs_lock); 3860 3861 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 3862 if (mddev2->gendisk && 3863 strcmp(mddev2->gendisk->disk_name, name) == 0) { 3864 spin_unlock(&all_mddevs_lock); 3865 goto abort; 3866 } 3867 spin_unlock(&all_mddevs_lock); 3868 } 3869 3870 error = -ENOMEM; 3871 mddev->queue = blk_alloc_queue(GFP_KERNEL); 3872 if (!mddev->queue) 3873 goto abort; 3874 mddev->queue->queuedata = mddev; 3875 3876 /* Can be unlocked because the queue is new: no concurrency */ 3877 queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue); 3878 3879 blk_queue_make_request(mddev->queue, md_make_request); 3880 3881 disk = alloc_disk(1 << shift); 3882 if (!disk) { 3883 blk_cleanup_queue(mddev->queue); 3884 mddev->queue = NULL; 3885 goto abort; 3886 } 3887 disk->major = MAJOR(mddev->unit); 3888 disk->first_minor = unit << shift; 3889 if (name) 3890 strcpy(disk->disk_name, name); 3891 else if (partitioned) 3892 sprintf(disk->disk_name, "md_d%d", unit); 3893 else 3894 sprintf(disk->disk_name, "md%d", unit); 3895 disk->fops = &md_fops; 3896 disk->private_data = mddev; 3897 disk->queue = mddev->queue; 3898 /* Allow extended partitions. This makes the 3899 * 'mdp' device redundant, but we can't really 3900 * remove it now. 3901 */ 3902 disk->flags |= GENHD_FL_EXT_DEVT; 3903 add_disk(disk); 3904 mddev->gendisk = disk; 3905 error = kobject_init_and_add(&mddev->kobj, &md_ktype, 3906 &disk_to_dev(disk)->kobj, "%s", "md"); 3907 if (error) { 3908 /* This isn't possible, but as kobject_init_and_add is marked 3909 * __must_check, we must do something with the result 3910 */ 3911 printk(KERN_WARNING "md: cannot register %s/md - name in use\n", 3912 disk->disk_name); 3913 error = 0; 3914 } 3915 abort: 3916 mutex_unlock(&disks_mutex); 3917 if (!error) { 3918 kobject_uevent(&mddev->kobj, KOBJ_ADD); 3919 mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state"); 3920 } 3921 mddev_put(mddev); 3922 return error; 3923 } 3924 3925 static struct kobject *md_probe(dev_t dev, int *part, void *data) 3926 { 3927 md_alloc(dev, NULL); 3928 return NULL; 3929 } 3930 3931 static int add_named_array(const char *val, struct kernel_param *kp) 3932 { 3933 /* val must be "md_*" where * is not all digits. 3934 * We allocate an array with a large free minor number, and 3935 * set the name to val. val must not already be an active name. 3936 */ 3937 int len = strlen(val); 3938 char buf[DISK_NAME_LEN]; 3939 3940 while (len && val[len-1] == '\n') 3941 len--; 3942 if (len >= DISK_NAME_LEN) 3943 return -E2BIG; 3944 strlcpy(buf, val, len+1); 3945 if (strncmp(buf, "md_", 3) != 0) 3946 return -EINVAL; 3947 return md_alloc(0, buf); 3948 } 3949 3950 static void md_safemode_timeout(unsigned long data) 3951 { 3952 mddev_t *mddev = (mddev_t *) data; 3953 3954 if (!atomic_read(&mddev->writes_pending)) { 3955 mddev->safemode = 1; 3956 if (mddev->external) 3957 sysfs_notify_dirent(mddev->sysfs_state); 3958 } 3959 md_wakeup_thread(mddev->thread); 3960 } 3961 3962 static int start_dirty_degraded; 3963 3964 static int do_md_run(mddev_t * mddev) 3965 { 3966 int err; 3967 mdk_rdev_t *rdev; 3968 struct gendisk *disk; 3969 struct mdk_personality *pers; 3970 3971 if (list_empty(&mddev->disks)) 3972 /* cannot run an array with no devices.. 
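 * (devices are attached beforehand, e.g. via the new_dev sysfs attribute
 * above, which ends up in bind_rdev_to_array())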
*/ 3973 return -EINVAL; 3974 3975 if (mddev->pers) 3976 return -EBUSY; 3977 3978 /* 3979 * Analyze all RAID superblock(s) 3980 */ 3981 if (!mddev->raid_disks) { 3982 if (!mddev->persistent) 3983 return -EINVAL; 3984 analyze_sbs(mddev); 3985 } 3986 3987 if (mddev->level != LEVEL_NONE) 3988 request_module("md-level-%d", mddev->level); 3989 else if (mddev->clevel[0]) 3990 request_module("md-%s", mddev->clevel); 3991 3992 /* 3993 * Drop all container device buffers, from now on 3994 * the only valid external interface is through the md 3995 * device. 3996 */ 3997 list_for_each_entry(rdev, &mddev->disks, same_set) { 3998 if (test_bit(Faulty, &rdev->flags)) 3999 continue; 4000 sync_blockdev(rdev->bdev); 4001 invalidate_bdev(rdev->bdev); 4002 4003 /* perform some consistency tests on the device. 4004 * We don't want the data to overlap the metadata, 4005 * Internal Bitmap issues have been handled elsewhere. 4006 */ 4007 if (rdev->data_offset < rdev->sb_start) { 4008 if (mddev->dev_sectors && 4009 rdev->data_offset + mddev->dev_sectors 4010 > rdev->sb_start) { 4011 printk("md: %s: data overlaps metadata\n", 4012 mdname(mddev)); 4013 return -EINVAL; 4014 } 4015 } else { 4016 if (rdev->sb_start + rdev->sb_size/512 4017 > rdev->data_offset) { 4018 printk("md: %s: metadata overlaps data\n", 4019 mdname(mddev)); 4020 return -EINVAL; 4021 } 4022 } 4023 sysfs_notify_dirent(rdev->sysfs_state); 4024 } 4025 4026 md_probe(mddev->unit, NULL, NULL); 4027 disk = mddev->gendisk; 4028 if (!disk) 4029 return -ENOMEM; 4030 4031 spin_lock(&pers_lock); 4032 pers = find_pers(mddev->level, mddev->clevel); 4033 if (!pers || !try_module_get(pers->owner)) { 4034 spin_unlock(&pers_lock); 4035 if (mddev->level != LEVEL_NONE) 4036 printk(KERN_WARNING "md: personality for level %d is not loaded!\n", 4037 mddev->level); 4038 else 4039 printk(KERN_WARNING "md: personality for level %s is not loaded!\n", 4040 mddev->clevel); 4041 return -EINVAL; 4042 } 4043 mddev->pers = pers; 4044 spin_unlock(&pers_lock); 4045 if (mddev->level != pers->level) { 4046 mddev->level = pers->level; 4047 mddev->new_level = pers->level; 4048 } 4049 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 4050 4051 if (pers->level >= 4 && pers->level <= 6) 4052 /* Cannot support integrity (yet) */ 4053 blk_integrity_unregister(mddev->gendisk); 4054 4055 if (mddev->reshape_position != MaxSector && 4056 pers->start_reshape == NULL) { 4057 /* This personality cannot handle reshaping... */ 4058 mddev->pers = NULL; 4059 module_put(pers->owner); 4060 return -EINVAL; 4061 } 4062 4063 if (pers->sync_request) { 4064 /* Warn if this is a potentially silly 4065 * configuration. 
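 * (For example, two rdevs that turn out to be partitions of the same
 * underlying disk will share bd_contains; a single spindle failure
 * would then take out both members at once, defeating the redundancy
 * the personality is supposed to provide, hence the warning below.)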
4066 */ 4067 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 4068 mdk_rdev_t *rdev2; 4069 int warned = 0; 4070 4071 list_for_each_entry(rdev, &mddev->disks, same_set) 4072 list_for_each_entry(rdev2, &mddev->disks, same_set) { 4073 if (rdev < rdev2 && 4074 rdev->bdev->bd_contains == 4075 rdev2->bdev->bd_contains) { 4076 printk(KERN_WARNING 4077 "%s: WARNING: %s appears to be" 4078 " on the same physical disk as" 4079 " %s.\n", 4080 mdname(mddev), 4081 bdevname(rdev->bdev,b), 4082 bdevname(rdev2->bdev,b2)); 4083 warned = 1; 4084 } 4085 } 4086 4087 if (warned) 4088 printk(KERN_WARNING 4089 "True protection against single-disk" 4090 " failure might be compromised.\n"); 4091 } 4092 4093 mddev->recovery = 0; 4094 /* may be over-ridden by personality */ 4095 mddev->resync_max_sectors = mddev->dev_sectors; 4096 4097 mddev->barriers_work = 1; 4098 mddev->ok_start_degraded = start_dirty_degraded; 4099 4100 if (start_readonly) 4101 mddev->ro = 2; /* read-only, but switch on first write */ 4102 4103 err = mddev->pers->run(mddev); 4104 if (err) 4105 printk(KERN_ERR "md: pers->run() failed ...\n"); 4106 else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) { 4107 WARN_ONCE(!mddev->external_size, "%s: default size too small," 4108 " but 'external_size' not in effect?\n", __func__); 4109 printk(KERN_ERR 4110 "md: invalid array_size %llu > default size %llu\n", 4111 (unsigned long long)mddev->array_sectors / 2, 4112 (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2); 4113 err = -EINVAL; 4114 mddev->pers->stop(mddev); 4115 } 4116 if (err == 0 && mddev->pers->sync_request) { 4117 err = bitmap_create(mddev); 4118 if (err) { 4119 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 4120 mdname(mddev), err); 4121 mddev->pers->stop(mddev); 4122 } 4123 } 4124 if (err) { 4125 module_put(mddev->pers->owner); 4126 mddev->pers = NULL; 4127 bitmap_destroy(mddev); 4128 return err; 4129 } 4130 if (mddev->pers->sync_request) { 4131 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 4132 printk(KERN_WARNING 4133 "md: cannot register extra attributes for %s\n", 4134 mdname(mddev)); 4135 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 4136 } else if (mddev->ro == 2) /* auto-readonly not meaningful */ 4137 mddev->ro = 0; 4138 4139 atomic_set(&mddev->writes_pending,0); 4140 mddev->safemode = 0; 4141 mddev->safemode_timer.function = md_safemode_timeout; 4142 mddev->safemode_timer.data = (unsigned long) mddev; 4143 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 4144 mddev->in_sync = 1; 4145 4146 list_for_each_entry(rdev, &mddev->disks, same_set) 4147 if (rdev->raid_disk >= 0) { 4148 char nm[20]; 4149 sprintf(nm, "rd%d", rdev->raid_disk); 4150 if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) 4151 printk("md: cannot register %s for %s\n", 4152 nm, mdname(mddev)); 4153 } 4154 4155 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4156 4157 if (mddev->flags) 4158 md_update_sb(mddev, 0); 4159 4160 set_capacity(disk, mddev->array_sectors); 4161 4162 /* If there is a partially-recovered drive we need to 4163 * start recovery here. 
If we leave it to md_check_recovery, 4164 * it will remove the drives and not do the right thing 4165 */ 4166 if (mddev->degraded && !mddev->sync_thread) { 4167 int spares = 0; 4168 list_for_each_entry(rdev, &mddev->disks, same_set) 4169 if (rdev->raid_disk >= 0 && 4170 !test_bit(In_sync, &rdev->flags) && 4171 !test_bit(Faulty, &rdev->flags)) 4172 /* complete an interrupted recovery */ 4173 spares++; 4174 if (spares && mddev->pers->sync_request) { 4175 mddev->recovery = 0; 4176 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 4177 mddev->sync_thread = md_register_thread(md_do_sync, 4178 mddev, 4179 "%s_resync"); 4180 if (!mddev->sync_thread) { 4181 printk(KERN_ERR "%s: could not start resync" 4182 " thread...\n", 4183 mdname(mddev)); 4184 /* leave the spares where they are, it shouldn't hurt */ 4185 mddev->recovery = 0; 4186 } 4187 } 4188 } 4189 md_wakeup_thread(mddev->thread); 4190 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 4191 4192 mddev->changed = 1; 4193 md_new_event(mddev); 4194 sysfs_notify_dirent(mddev->sysfs_state); 4195 if (mddev->sysfs_action) 4196 sysfs_notify_dirent(mddev->sysfs_action); 4197 sysfs_notify(&mddev->kobj, NULL, "degraded"); 4198 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 4199 return 0; 4200 } 4201 4202 static int restart_array(mddev_t *mddev) 4203 { 4204 struct gendisk *disk = mddev->gendisk; 4205 4206 /* Complain if it has no devices */ 4207 if (list_empty(&mddev->disks)) 4208 return -ENXIO; 4209 if (!mddev->pers) 4210 return -EINVAL; 4211 if (!mddev->ro) 4212 return -EBUSY; 4213 mddev->safemode = 0; 4214 mddev->ro = 0; 4215 set_disk_ro(disk, 0); 4216 printk(KERN_INFO "md: %s switched to read-write mode.\n", 4217 mdname(mddev)); 4218 /* Kick recovery or resync if necessary */ 4219 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4220 md_wakeup_thread(mddev->thread); 4221 md_wakeup_thread(mddev->sync_thread); 4222 sysfs_notify_dirent(mddev->sysfs_state); 4223 return 0; 4224 } 4225 4226 /* similar to deny_write_access, but accounts for our holding a reference 4227 * to the file ourselves */ 4228 static int deny_bitmap_write_access(struct file * file) 4229 { 4230 struct inode *inode = file->f_mapping->host; 4231 4232 spin_lock(&inode->i_lock); 4233 if (atomic_read(&inode->i_writecount) > 1) { 4234 spin_unlock(&inode->i_lock); 4235 return -ETXTBSY; 4236 } 4237 atomic_set(&inode->i_writecount, -1); 4238 spin_unlock(&inode->i_lock); 4239 4240 return 0; 4241 } 4242 4243 static void restore_bitmap_write_access(struct file *file) 4244 { 4245 struct inode *inode = file->f_mapping->host; 4246 4247 spin_lock(&inode->i_lock); 4248 atomic_set(&inode->i_writecount, 1); 4249 spin_unlock(&inode->i_lock); 4250 } 4251 4252 /* mode: 4253 * 0 - completely stop and dis-assemble array 4254 * 1 - switch to readonly 4255 * 2 - stop but do not disassemble array 4256 */ 4257 static int do_md_stop(mddev_t * mddev, int mode, int is_open) 4258 { 4259 int err = 0; 4260 struct gendisk *disk = mddev->gendisk; 4261 mdk_rdev_t *rdev; 4262 4263 if (atomic_read(&mddev->openers) > is_open) { 4264 printk("md: %s still in use.\n",mdname(mddev)); 4265 return -EBUSY; 4266 } 4267 4268 if (mddev->pers) { 4269 4270 if (mddev->sync_thread) { 4271 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4272 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4273 md_unregister_thread(mddev->sync_thread); 4274 mddev->sync_thread = NULL; 4275 } 4276 4277 del_timer_sync(&mddev->safemode_timer); 4278 4279 switch(mode) { 4280 case 1: /* readonly */ 4281 err = -ENXIO; 4282 if 
(mddev->ro==1) 4283 goto out; 4284 mddev->ro = 1; 4285 break; 4286 case 0: /* disassemble */ 4287 case 2: /* stop */ 4288 bitmap_flush(mddev); 4289 md_super_wait(mddev); 4290 if (mddev->ro) 4291 set_disk_ro(disk, 0); 4292 4293 mddev->pers->stop(mddev); 4294 mddev->queue->merge_bvec_fn = NULL; 4295 mddev->queue->unplug_fn = NULL; 4296 mddev->queue->backing_dev_info.congested_fn = NULL; 4297 module_put(mddev->pers->owner); 4298 if (mddev->pers->sync_request) 4299 mddev->private = &md_redundancy_group; 4300 mddev->pers = NULL; 4301 /* tell userspace to handle 'inactive' */ 4302 sysfs_notify_dirent(mddev->sysfs_state); 4303 4304 list_for_each_entry(rdev, &mddev->disks, same_set) 4305 if (rdev->raid_disk >= 0) { 4306 char nm[20]; 4307 sprintf(nm, "rd%d", rdev->raid_disk); 4308 sysfs_remove_link(&mddev->kobj, nm); 4309 } 4310 4311 set_capacity(disk, 0); 4312 mddev->changed = 1; 4313 4314 if (mddev->ro) 4315 mddev->ro = 0; 4316 } 4317 if (!mddev->in_sync || mddev->flags) { 4318 /* mark array as shutdown cleanly */ 4319 mddev->in_sync = 1; 4320 md_update_sb(mddev, 1); 4321 } 4322 if (mode == 1) 4323 set_disk_ro(disk, 1); 4324 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4325 } 4326 4327 /* 4328 * Free resources if final stop 4329 */ 4330 if (mode == 0) { 4331 4332 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 4333 4334 bitmap_destroy(mddev); 4335 if (mddev->bitmap_file) { 4336 restore_bitmap_write_access(mddev->bitmap_file); 4337 fput(mddev->bitmap_file); 4338 mddev->bitmap_file = NULL; 4339 } 4340 mddev->bitmap_offset = 0; 4341 4342 /* make sure all md_delayed_delete calls have finished */ 4343 flush_scheduled_work(); 4344 4345 export_array(mddev); 4346 4347 mddev->array_sectors = 0; 4348 mddev->external_size = 0; 4349 mddev->dev_sectors = 0; 4350 mddev->raid_disks = 0; 4351 mddev->recovery_cp = 0; 4352 mddev->resync_min = 0; 4353 mddev->resync_max = MaxSector; 4354 mddev->reshape_position = MaxSector; 4355 mddev->external = 0; 4356 mddev->persistent = 0; 4357 mddev->level = LEVEL_NONE; 4358 mddev->clevel[0] = 0; 4359 mddev->flags = 0; 4360 mddev->ro = 0; 4361 mddev->metadata_type[0] = 0; 4362 mddev->chunk_sectors = 0; 4363 mddev->ctime = mddev->utime = 0; 4364 mddev->layout = 0; 4365 mddev->max_disks = 0; 4366 mddev->events = 0; 4367 mddev->delta_disks = 0; 4368 mddev->new_level = LEVEL_NONE; 4369 mddev->new_layout = 0; 4370 mddev->new_chunk_sectors = 0; 4371 mddev->curr_resync = 0; 4372 mddev->resync_mismatches = 0; 4373 mddev->suspend_lo = mddev->suspend_hi = 0; 4374 mddev->sync_speed_min = mddev->sync_speed_max = 0; 4375 mddev->recovery = 0; 4376 mddev->in_sync = 0; 4377 mddev->changed = 0; 4378 mddev->degraded = 0; 4379 mddev->barriers_work = 0; 4380 mddev->safemode = 0; 4381 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 4382 if (mddev->hold_active == UNTIL_STOP) 4383 mddev->hold_active = 0; 4384 4385 } else if (mddev->pers) 4386 printk(KERN_INFO "md: %s switched to read-only mode.\n", 4387 mdname(mddev)); 4388 err = 0; 4389 blk_integrity_unregister(disk); 4390 md_new_event(mddev); 4391 sysfs_notify_dirent(mddev->sysfs_state); 4392 out: 4393 return err; 4394 } 4395 4396 #ifndef MODULE 4397 static void autorun_array(mddev_t *mddev) 4398 { 4399 mdk_rdev_t *rdev; 4400 int err; 4401 4402 if (list_empty(&mddev->disks)) 4403 return; 4404 4405 printk(KERN_INFO "md: running: "); 4406 4407 list_for_each_entry(rdev, &mddev->disks, same_set) { 4408 char b[BDEVNAME_SIZE]; 4409 printk("<%s>", bdevname(rdev->bdev,b)); 4410 } 4411 printk("\n"); 4412 4413 err = 
do_md_run(mddev); 4414 if (err) { 4415 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 4416 do_md_stop(mddev, 0, 0); 4417 } 4418 } 4419 4420 /* 4421 * lets try to run arrays based on all disks that have arrived 4422 * until now. (those are in pending_raid_disks) 4423 * 4424 * the method: pick the first pending disk, collect all disks with 4425 * the same UUID, remove all from the pending list and put them into 4426 * the 'same_array' list. Then order this list based on superblock 4427 * update time (freshest comes first), kick out 'old' disks and 4428 * compare superblocks. If everything's fine then run it. 4429 * 4430 * If "unit" is allocated, then bump its reference count 4431 */ 4432 static void autorun_devices(int part) 4433 { 4434 mdk_rdev_t *rdev0, *rdev, *tmp; 4435 mddev_t *mddev; 4436 char b[BDEVNAME_SIZE]; 4437 4438 printk(KERN_INFO "md: autorun ...\n"); 4439 while (!list_empty(&pending_raid_disks)) { 4440 int unit; 4441 dev_t dev; 4442 LIST_HEAD(candidates); 4443 rdev0 = list_entry(pending_raid_disks.next, 4444 mdk_rdev_t, same_set); 4445 4446 printk(KERN_INFO "md: considering %s ...\n", 4447 bdevname(rdev0->bdev,b)); 4448 INIT_LIST_HEAD(&candidates); 4449 rdev_for_each_list(rdev, tmp, &pending_raid_disks) 4450 if (super_90_load(rdev, rdev0, 0) >= 0) { 4451 printk(KERN_INFO "md: adding %s ...\n", 4452 bdevname(rdev->bdev,b)); 4453 list_move(&rdev->same_set, &candidates); 4454 } 4455 /* 4456 * now we have a set of devices, with all of them having 4457 * mostly sane superblocks. It's time to allocate the 4458 * mddev. 4459 */ 4460 if (part) { 4461 dev = MKDEV(mdp_major, 4462 rdev0->preferred_minor << MdpMinorShift); 4463 unit = MINOR(dev) >> MdpMinorShift; 4464 } else { 4465 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 4466 unit = MINOR(dev); 4467 } 4468 if (rdev0->preferred_minor != unit) { 4469 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 4470 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 4471 break; 4472 } 4473 4474 md_probe(dev, NULL, NULL); 4475 mddev = mddev_find(dev); 4476 if (!mddev || !mddev->gendisk) { 4477 if (mddev) 4478 mddev_put(mddev); 4479 printk(KERN_ERR 4480 "md: cannot allocate memory for md drive.\n"); 4481 break; 4482 } 4483 if (mddev_lock(mddev)) 4484 printk(KERN_WARNING "md: %s locked, cannot run\n", 4485 mdname(mddev)); 4486 else if (mddev->raid_disks || mddev->major_version 4487 || !list_empty(&mddev->disks)) { 4488 printk(KERN_WARNING 4489 "md: %s already running, cannot run %s\n", 4490 mdname(mddev), bdevname(rdev0->bdev,b)); 4491 mddev_unlock(mddev); 4492 } else { 4493 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 4494 mddev->persistent = 1; 4495 rdev_for_each_list(rdev, tmp, &candidates) { 4496 list_del_init(&rdev->same_set); 4497 if (bind_rdev_to_array(rdev, mddev)) 4498 export_rdev(rdev); 4499 } 4500 autorun_array(mddev); 4501 mddev_unlock(mddev); 4502 } 4503 /* on success, candidates will be empty, on error 4504 * it won't... 4505 */ 4506 rdev_for_each_list(rdev, tmp, &candidates) { 4507 list_del_init(&rdev->same_set); 4508 export_rdev(rdev); 4509 } 4510 mddev_put(mddev); 4511 } 4512 printk(KERN_INFO "md: ... 
autorun DONE.\n"); 4513 } 4514 #endif /* !MODULE */ 4515 4516 static int get_version(void __user * arg) 4517 { 4518 mdu_version_t ver; 4519 4520 ver.major = MD_MAJOR_VERSION; 4521 ver.minor = MD_MINOR_VERSION; 4522 ver.patchlevel = MD_PATCHLEVEL_VERSION; 4523 4524 if (copy_to_user(arg, &ver, sizeof(ver))) 4525 return -EFAULT; 4526 4527 return 0; 4528 } 4529 4530 static int get_array_info(mddev_t * mddev, void __user * arg) 4531 { 4532 mdu_array_info_t info; 4533 int nr,working,active,failed,spare; 4534 mdk_rdev_t *rdev; 4535 4536 nr=working=active=failed=spare=0; 4537 list_for_each_entry(rdev, &mddev->disks, same_set) { 4538 nr++; 4539 if (test_bit(Faulty, &rdev->flags)) 4540 failed++; 4541 else { 4542 working++; 4543 if (test_bit(In_sync, &rdev->flags)) 4544 active++; 4545 else 4546 spare++; 4547 } 4548 } 4549 4550 info.major_version = mddev->major_version; 4551 info.minor_version = mddev->minor_version; 4552 info.patch_version = MD_PATCHLEVEL_VERSION; 4553 info.ctime = mddev->ctime; 4554 info.level = mddev->level; 4555 info.size = mddev->dev_sectors / 2; 4556 if (info.size != mddev->dev_sectors / 2) /* overflow */ 4557 info.size = -1; 4558 info.nr_disks = nr; 4559 info.raid_disks = mddev->raid_disks; 4560 info.md_minor = mddev->md_minor; 4561 info.not_persistent= !mddev->persistent; 4562 4563 info.utime = mddev->utime; 4564 info.state = 0; 4565 if (mddev->in_sync) 4566 info.state = (1<<MD_SB_CLEAN); 4567 if (mddev->bitmap && mddev->bitmap_offset) 4568 info.state = (1<<MD_SB_BITMAP_PRESENT); 4569 info.active_disks = active; 4570 info.working_disks = working; 4571 info.failed_disks = failed; 4572 info.spare_disks = spare; 4573 4574 info.layout = mddev->layout; 4575 info.chunk_size = mddev->chunk_sectors << 9; 4576 4577 if (copy_to_user(arg, &info, sizeof(info))) 4578 return -EFAULT; 4579 4580 return 0; 4581 } 4582 4583 static int get_bitmap_file(mddev_t * mddev, void __user * arg) 4584 { 4585 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 4586 char *ptr, *buf = NULL; 4587 int err = -ENOMEM; 4588 4589 if (md_allow_write(mddev)) 4590 file = kmalloc(sizeof(*file), GFP_NOIO); 4591 else 4592 file = kmalloc(sizeof(*file), GFP_KERNEL); 4593 4594 if (!file) 4595 goto out; 4596 4597 /* bitmap disabled, zero the first byte and copy out */ 4598 if (!mddev->bitmap || !mddev->bitmap->file) { 4599 file->pathname[0] = '\0'; 4600 goto copy_out; 4601 } 4602 4603 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); 4604 if (!buf) 4605 goto out; 4606 4607 ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname)); 4608 if (IS_ERR(ptr)) 4609 goto out; 4610 4611 strcpy(file->pathname, ptr); 4612 4613 copy_out: 4614 err = 0; 4615 if (copy_to_user(arg, file, sizeof(*file))) 4616 err = -EFAULT; 4617 out: 4618 kfree(buf); 4619 kfree(file); 4620 return err; 4621 } 4622 4623 static int get_disk_info(mddev_t * mddev, void __user * arg) 4624 { 4625 mdu_disk_info_t info; 4626 mdk_rdev_t *rdev; 4627 4628 if (copy_from_user(&info, arg, sizeof(info))) 4629 return -EFAULT; 4630 4631 rdev = find_rdev_nr(mddev, info.number); 4632 if (rdev) { 4633 info.major = MAJOR(rdev->bdev->bd_dev); 4634 info.minor = MINOR(rdev->bdev->bd_dev); 4635 info.raid_disk = rdev->raid_disk; 4636 info.state = 0; 4637 if (test_bit(Faulty, &rdev->flags)) 4638 info.state |= (1<<MD_DISK_FAULTY); 4639 else if (test_bit(In_sync, &rdev->flags)) { 4640 info.state |= (1<<MD_DISK_ACTIVE); 4641 info.state |= (1<<MD_DISK_SYNC); 4642 } 4643 if (test_bit(WriteMostly, &rdev->flags)) 4644 info.state |= (1<<MD_DISK_WRITEMOSTLY); 4645 
} else { 4646 info.major = info.minor = 0; 4647 info.raid_disk = -1; 4648 info.state = (1<<MD_DISK_REMOVED); 4649 } 4650 4651 if (copy_to_user(arg, &info, sizeof(info))) 4652 return -EFAULT; 4653 4654 return 0; 4655 } 4656 4657 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) 4658 { 4659 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 4660 mdk_rdev_t *rdev; 4661 dev_t dev = MKDEV(info->major,info->minor); 4662 4663 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 4664 return -EOVERFLOW; 4665 4666 if (!mddev->raid_disks) { 4667 int err; 4668 /* expecting a device which has a superblock */ 4669 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 4670 if (IS_ERR(rdev)) { 4671 printk(KERN_WARNING 4672 "md: md_import_device returned %ld\n", 4673 PTR_ERR(rdev)); 4674 return PTR_ERR(rdev); 4675 } 4676 if (!list_empty(&mddev->disks)) { 4677 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 4678 mdk_rdev_t, same_set); 4679 int err = super_types[mddev->major_version] 4680 .load_super(rdev, rdev0, mddev->minor_version); 4681 if (err < 0) { 4682 printk(KERN_WARNING 4683 "md: %s has different UUID to %s\n", 4684 bdevname(rdev->bdev,b), 4685 bdevname(rdev0->bdev,b2)); 4686 export_rdev(rdev); 4687 return -EINVAL; 4688 } 4689 } 4690 err = bind_rdev_to_array(rdev, mddev); 4691 if (err) 4692 export_rdev(rdev); 4693 return err; 4694 } 4695 4696 /* 4697 * add_new_disk can be used once the array is assembled 4698 * to add "hot spares". They must already have a superblock 4699 * written 4700 */ 4701 if (mddev->pers) { 4702 int err; 4703 if (!mddev->pers->hot_add_disk) { 4704 printk(KERN_WARNING 4705 "%s: personality does not support diskops!\n", 4706 mdname(mddev)); 4707 return -EINVAL; 4708 } 4709 if (mddev->persistent) 4710 rdev = md_import_device(dev, mddev->major_version, 4711 mddev->minor_version); 4712 else 4713 rdev = md_import_device(dev, -1, -1); 4714 if (IS_ERR(rdev)) { 4715 printk(KERN_WARNING 4716 "md: md_import_device returned %ld\n", 4717 PTR_ERR(rdev)); 4718 return PTR_ERR(rdev); 4719 } 4720 /* set save_raid_disk if appropriate */ 4721 if (!mddev->persistent) { 4722 if (info->state & (1<<MD_DISK_SYNC) && 4723 info->raid_disk < mddev->raid_disks) 4724 rdev->raid_disk = info->raid_disk; 4725 else 4726 rdev->raid_disk = -1; 4727 } else 4728 super_types[mddev->major_version]. 4729 validate_super(mddev, rdev); 4730 rdev->saved_raid_disk = rdev->raid_disk; 4731 4732 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 4733 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 4734 set_bit(WriteMostly, &rdev->flags); 4735 else 4736 clear_bit(WriteMostly, &rdev->flags); 4737 4738 rdev->raid_disk = -1; 4739 err = bind_rdev_to_array(rdev, mddev); 4740 if (!err && !mddev->pers->hot_remove_disk) { 4741 /* If there is hot_add_disk but no hot_remove_disk 4742 * then added disks for geometry changes, 4743 * and should be added immediately. 4744 */ 4745 super_types[mddev->major_version]. 
4746 validate_super(mddev, rdev); 4747 err = mddev->pers->hot_add_disk(mddev, rdev); 4748 if (err) 4749 unbind_rdev_from_array(rdev); 4750 } 4751 if (err) 4752 export_rdev(rdev); 4753 else 4754 sysfs_notify_dirent(rdev->sysfs_state); 4755 4756 md_update_sb(mddev, 1); 4757 if (mddev->degraded) 4758 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 4759 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4760 md_wakeup_thread(mddev->thread); 4761 return err; 4762 } 4763 4764 /* otherwise, add_new_disk is only allowed 4765 * for major_version==0 superblocks 4766 */ 4767 if (mddev->major_version != 0) { 4768 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", 4769 mdname(mddev)); 4770 return -EINVAL; 4771 } 4772 4773 if (!(info->state & (1<<MD_DISK_FAULTY))) { 4774 int err; 4775 rdev = md_import_device(dev, -1, 0); 4776 if (IS_ERR(rdev)) { 4777 printk(KERN_WARNING 4778 "md: error, md_import_device() returned %ld\n", 4779 PTR_ERR(rdev)); 4780 return PTR_ERR(rdev); 4781 } 4782 rdev->desc_nr = info->number; 4783 if (info->raid_disk < mddev->raid_disks) 4784 rdev->raid_disk = info->raid_disk; 4785 else 4786 rdev->raid_disk = -1; 4787 4788 if (rdev->raid_disk < mddev->raid_disks) 4789 if (info->state & (1<<MD_DISK_SYNC)) 4790 set_bit(In_sync, &rdev->flags); 4791 4792 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 4793 set_bit(WriteMostly, &rdev->flags); 4794 4795 if (!mddev->persistent) { 4796 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 4797 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; 4798 } else 4799 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 4800 rdev->sectors = rdev->sb_start; 4801 4802 err = bind_rdev_to_array(rdev, mddev); 4803 if (err) { 4804 export_rdev(rdev); 4805 return err; 4806 } 4807 } 4808 4809 return 0; 4810 } 4811 4812 static int hot_remove_disk(mddev_t * mddev, dev_t dev) 4813 { 4814 char b[BDEVNAME_SIZE]; 4815 mdk_rdev_t *rdev; 4816 4817 rdev = find_rdev(mddev, dev); 4818 if (!rdev) 4819 return -ENXIO; 4820 4821 if (rdev->raid_disk >= 0) 4822 goto busy; 4823 4824 kick_rdev_from_array(rdev); 4825 md_update_sb(mddev, 1); 4826 md_new_event(mddev); 4827 4828 return 0; 4829 busy: 4830 printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", 4831 bdevname(rdev->bdev,b), mdname(mddev)); 4832 return -EBUSY; 4833 } 4834 4835 static int hot_add_disk(mddev_t * mddev, dev_t dev) 4836 { 4837 char b[BDEVNAME_SIZE]; 4838 int err; 4839 mdk_rdev_t *rdev; 4840 4841 if (!mddev->pers) 4842 return -ENODEV; 4843 4844 if (mddev->major_version != 0) { 4845 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 4846 " version-0 superblocks.\n", 4847 mdname(mddev)); 4848 return -EINVAL; 4849 } 4850 if (!mddev->pers->hot_add_disk) { 4851 printk(KERN_WARNING 4852 "%s: personality does not support diskops!\n", 4853 mdname(mddev)); 4854 return -EINVAL; 4855 } 4856 4857 rdev = md_import_device(dev, -1, 0); 4858 if (IS_ERR(rdev)) { 4859 printk(KERN_WARNING 4860 "md: error, md_import_device() returned %ld\n", 4861 PTR_ERR(rdev)); 4862 return -EINVAL; 4863 } 4864 4865 if (mddev->persistent) 4866 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 4867 else 4868 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; 4869 4870 rdev->sectors = rdev->sb_start; 4871 4872 if (test_bit(Faulty, &rdev->flags)) { 4873 printk(KERN_WARNING 4874 "md: can not hot-add faulty %s disk to %s!\n", 4875 bdevname(rdev->bdev,b), mdname(mddev)); 4876 err = -EINVAL; 4877 goto abort_export; 4878 } 4879 clear_bit(In_sync, &rdev->flags); 4880 rdev->desc_nr = -1; 4881 rdev->saved_raid_disk = -1; 4882 err = 
bind_rdev_to_array(rdev, mddev); 4883 if (err) 4884 goto abort_export; 4885 4886 /* 4887 * The rest should better be atomic, we can have disk failures 4888 * noticed in interrupt contexts ... 4889 */ 4890 4891 rdev->raid_disk = -1; 4892 4893 md_update_sb(mddev, 1); 4894 4895 /* 4896 * Kick recovery, maybe this spare has to be added to the 4897 * array immediately. 4898 */ 4899 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4900 md_wakeup_thread(mddev->thread); 4901 md_new_event(mddev); 4902 return 0; 4903 4904 abort_export: 4905 export_rdev(rdev); 4906 return err; 4907 } 4908 4909 static int set_bitmap_file(mddev_t *mddev, int fd) 4910 { 4911 int err; 4912 4913 if (mddev->pers) { 4914 if (!mddev->pers->quiesce) 4915 return -EBUSY; 4916 if (mddev->recovery || mddev->sync_thread) 4917 return -EBUSY; 4918 /* we should be able to change the bitmap.. */ 4919 } 4920 4921 4922 if (fd >= 0) { 4923 if (mddev->bitmap) 4924 return -EEXIST; /* cannot add when bitmap is present */ 4925 mddev->bitmap_file = fget(fd); 4926 4927 if (mddev->bitmap_file == NULL) { 4928 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 4929 mdname(mddev)); 4930 return -EBADF; 4931 } 4932 4933 err = deny_bitmap_write_access(mddev->bitmap_file); 4934 if (err) { 4935 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 4936 mdname(mddev)); 4937 fput(mddev->bitmap_file); 4938 mddev->bitmap_file = NULL; 4939 return err; 4940 } 4941 mddev->bitmap_offset = 0; /* file overrides offset */ 4942 } else if (mddev->bitmap == NULL) 4943 return -ENOENT; /* cannot remove what isn't there */ 4944 err = 0; 4945 if (mddev->pers) { 4946 mddev->pers->quiesce(mddev, 1); 4947 if (fd >= 0) 4948 err = bitmap_create(mddev); 4949 if (fd < 0 || err) { 4950 bitmap_destroy(mddev); 4951 fd = -1; /* make sure to put the file */ 4952 } 4953 mddev->pers->quiesce(mddev, 0); 4954 } 4955 if (fd < 0) { 4956 if (mddev->bitmap_file) { 4957 restore_bitmap_write_access(mddev->bitmap_file); 4958 fput(mddev->bitmap_file); 4959 } 4960 mddev->bitmap_file = NULL; 4961 } 4962 4963 return err; 4964 } 4965 4966 /* 4967 * set_array_info is used two different ways 4968 * The original usage is when creating a new array. 4969 * In this usage, raid_disks is > 0 and it together with 4970 * level, size, not_persistent,layout,chunksize determine the 4971 * shape of the array. 4972 * This will always create an array with a type-0.90.0 superblock. 4973 * The newer usage is when assembling an array. 4974 * In this case raid_disks will be 0, and the major_version field is 4975 * use to determine which style super-blocks are to be found on the devices. 4976 * The minor and patch _version numbers are also kept incase the 4977 * super_block handler wishes to interpret them. 4978 */ 4979 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) 4980 { 4981 4982 if (info->raid_disks == 0) { 4983 /* just setting version number for superblock loading */ 4984 if (info->major_version < 0 || 4985 info->major_version >= ARRAY_SIZE(super_types) || 4986 super_types[info->major_version].name == NULL) { 4987 /* maybe try to auto-load a module? 
*/ 4988 printk(KERN_INFO 4989 "md: superblock version %d not known\n", 4990 info->major_version); 4991 return -EINVAL; 4992 } 4993 mddev->major_version = info->major_version; 4994 mddev->minor_version = info->minor_version; 4995 mddev->patch_version = info->patch_version; 4996 mddev->persistent = !info->not_persistent; 4997 return 0; 4998 } 4999 mddev->major_version = MD_MAJOR_VERSION; 5000 mddev->minor_version = MD_MINOR_VERSION; 5001 mddev->patch_version = MD_PATCHLEVEL_VERSION; 5002 mddev->ctime = get_seconds(); 5003 5004 mddev->level = info->level; 5005 mddev->clevel[0] = 0; 5006 mddev->dev_sectors = 2 * (sector_t)info->size; 5007 mddev->raid_disks = info->raid_disks; 5008 /* don't set md_minor, it is determined by which /dev/md* was 5009 * openned 5010 */ 5011 if (info->state & (1<<MD_SB_CLEAN)) 5012 mddev->recovery_cp = MaxSector; 5013 else 5014 mddev->recovery_cp = 0; 5015 mddev->persistent = ! info->not_persistent; 5016 mddev->external = 0; 5017 5018 mddev->layout = info->layout; 5019 mddev->chunk_sectors = info->chunk_size >> 9; 5020 5021 mddev->max_disks = MD_SB_DISKS; 5022 5023 if (mddev->persistent) 5024 mddev->flags = 0; 5025 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5026 5027 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 5028 mddev->bitmap_offset = 0; 5029 5030 mddev->reshape_position = MaxSector; 5031 5032 /* 5033 * Generate a 128 bit UUID 5034 */ 5035 get_random_bytes(mddev->uuid, 16); 5036 5037 mddev->new_level = mddev->level; 5038 mddev->new_chunk_sectors = mddev->chunk_sectors; 5039 mddev->new_layout = mddev->layout; 5040 mddev->delta_disks = 0; 5041 5042 return 0; 5043 } 5044 5045 void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors) 5046 { 5047 WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__); 5048 5049 if (mddev->external_size) 5050 return; 5051 5052 mddev->array_sectors = array_sectors; 5053 } 5054 EXPORT_SYMBOL(md_set_array_sectors); 5055 5056 static int update_size(mddev_t *mddev, sector_t num_sectors) 5057 { 5058 mdk_rdev_t *rdev; 5059 int rv; 5060 int fit = (num_sectors == 0); 5061 5062 if (mddev->pers->resize == NULL) 5063 return -EINVAL; 5064 /* The "num_sectors" is the number of sectors of each device that 5065 * is used. This can only make sense for arrays with redundancy. 5066 * linear and raid0 always use whatever space is available. We can only 5067 * consider changing this number if no resync or reconstruction is 5068 * happening, and if the new size is acceptable. It must fit before the 5069 * sb_start or, if that is <data_offset, it must fit before the size 5070 * of each device. If num_sectors is zero, we find the largest size 5071 * that fits. 5072 5073 */ 5074 if (mddev->sync_thread) 5075 return -EBUSY; 5076 if (mddev->bitmap) 5077 /* Sorry, cannot grow a bitmap yet, just remove it, 5078 * grow, and re-add. 
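 * (Illustrative userspace workflow only, assuming the usual mdadm
 * tooling rather than anything this driver mandates:
 *	mdadm --grow /dev/md0 --bitmap=none
 *	mdadm --grow /dev/md0 --size=max
 *	mdadm --grow /dev/md0 --bitmap=internal
 * i.e. drop the bitmap, resize, then re-create the bitmap.)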
5079 */ 5080 return -EBUSY; 5081 list_for_each_entry(rdev, &mddev->disks, same_set) { 5082 sector_t avail = rdev->sectors; 5083 5084 if (fit && (num_sectors == 0 || num_sectors > avail)) 5085 num_sectors = avail; 5086 if (avail < num_sectors) 5087 return -ENOSPC; 5088 } 5089 rv = mddev->pers->resize(mddev, num_sectors); 5090 if (!rv) { 5091 struct block_device *bdev; 5092 5093 bdev = bdget_disk(mddev->gendisk, 0); 5094 if (bdev) { 5095 mutex_lock(&bdev->bd_inode->i_mutex); 5096 i_size_write(bdev->bd_inode, 5097 (loff_t)mddev->array_sectors << 9); 5098 mutex_unlock(&bdev->bd_inode->i_mutex); 5099 bdput(bdev); 5100 } 5101 } 5102 return rv; 5103 } 5104 5105 static int update_raid_disks(mddev_t *mddev, int raid_disks) 5106 { 5107 int rv; 5108 /* change the number of raid disks */ 5109 if (mddev->pers->check_reshape == NULL) 5110 return -EINVAL; 5111 if (raid_disks <= 0 || 5112 raid_disks >= mddev->max_disks) 5113 return -EINVAL; 5114 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 5115 return -EBUSY; 5116 mddev->delta_disks = raid_disks - mddev->raid_disks; 5117 5118 rv = mddev->pers->check_reshape(mddev); 5119 return rv; 5120 } 5121 5122 5123 /* 5124 * update_array_info is used to change the configuration of an 5125 * on-line array. 5126 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 5127 * fields in the info are checked against the array. 5128 * Any differences that cannot be handled will cause an error. 5129 * Normally, only one change can be managed at a time. 5130 */ 5131 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) 5132 { 5133 int rv = 0; 5134 int cnt = 0; 5135 int state = 0; 5136 5137 /* calculate expected state,ignoring low bits */ 5138 if (mddev->bitmap && mddev->bitmap_offset) 5139 state |= (1 << MD_SB_BITMAP_PRESENT); 5140 5141 if (mddev->major_version != info->major_version || 5142 mddev->minor_version != info->minor_version || 5143 /* mddev->patch_version != info->patch_version || */ 5144 mddev->ctime != info->ctime || 5145 mddev->level != info->level || 5146 /* mddev->layout != info->layout || */ 5147 !mddev->persistent != info->not_persistent|| 5148 mddev->chunk_sectors != info->chunk_size >> 9 || 5149 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 5150 ((state^info->state) & 0xfffffe00) 5151 ) 5152 return -EINVAL; 5153 /* Check there is only one change */ 5154 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 5155 cnt++; 5156 if (mddev->raid_disks != info->raid_disks) 5157 cnt++; 5158 if (mddev->layout != info->layout) 5159 cnt++; 5160 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) 5161 cnt++; 5162 if (cnt == 0) 5163 return 0; 5164 if (cnt > 1) 5165 return -EINVAL; 5166 5167 if (mddev->layout != info->layout) { 5168 /* Change layout 5169 * we don't need to do anything at the md level, the 5170 * personality will take care of it all. 
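 * (In outline: the code below stages the request in new_layout, asks
 * the personality via check_reshape() whether it can handle it, and
 * rolls new_layout back to the current layout on failure, so a
 * rejected change leaves the array untouched.)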
5171 */ 5172 if (mddev->pers->check_reshape == NULL) 5173 return -EINVAL; 5174 else { 5175 mddev->new_layout = info->layout; 5176 rv = mddev->pers->check_reshape(mddev); 5177 if (rv) 5178 mddev->new_layout = mddev->layout; 5179 return rv; 5180 } 5181 } 5182 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 5183 rv = update_size(mddev, (sector_t)info->size * 2); 5184 5185 if (mddev->raid_disks != info->raid_disks) 5186 rv = update_raid_disks(mddev, info->raid_disks); 5187 5188 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 5189 if (mddev->pers->quiesce == NULL) 5190 return -EINVAL; 5191 if (mddev->recovery || mddev->sync_thread) 5192 return -EBUSY; 5193 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 5194 /* add the bitmap */ 5195 if (mddev->bitmap) 5196 return -EEXIST; 5197 if (mddev->default_bitmap_offset == 0) 5198 return -EINVAL; 5199 mddev->bitmap_offset = mddev->default_bitmap_offset; 5200 mddev->pers->quiesce(mddev, 1); 5201 rv = bitmap_create(mddev); 5202 if (rv) 5203 bitmap_destroy(mddev); 5204 mddev->pers->quiesce(mddev, 0); 5205 } else { 5206 /* remove the bitmap */ 5207 if (!mddev->bitmap) 5208 return -ENOENT; 5209 if (mddev->bitmap->file) 5210 return -EINVAL; 5211 mddev->pers->quiesce(mddev, 1); 5212 bitmap_destroy(mddev); 5213 mddev->pers->quiesce(mddev, 0); 5214 mddev->bitmap_offset = 0; 5215 } 5216 } 5217 md_update_sb(mddev, 1); 5218 return rv; 5219 } 5220 5221 static int set_disk_faulty(mddev_t *mddev, dev_t dev) 5222 { 5223 mdk_rdev_t *rdev; 5224 5225 if (mddev->pers == NULL) 5226 return -ENODEV; 5227 5228 rdev = find_rdev(mddev, dev); 5229 if (!rdev) 5230 return -ENODEV; 5231 5232 md_error(mddev, rdev); 5233 return 0; 5234 } 5235 5236 /* 5237 * We have a problem here : there is no easy way to give a CHS 5238 * virtual geometry. We currently pretend that we have a 2 heads 5239 * 4 sectors (with a BIG number of cylinders...). This drives 5240 * dosfs just mad... 
;-) 5241 */ 5242 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 5243 { 5244 mddev_t *mddev = bdev->bd_disk->private_data; 5245 5246 geo->heads = 2; 5247 geo->sectors = 4; 5248 geo->cylinders = get_capacity(mddev->gendisk) / 8; 5249 return 0; 5250 } 5251 5252 static int md_ioctl(struct block_device *bdev, fmode_t mode, 5253 unsigned int cmd, unsigned long arg) 5254 { 5255 int err = 0; 5256 void __user *argp = (void __user *)arg; 5257 mddev_t *mddev = NULL; 5258 5259 if (!capable(CAP_SYS_ADMIN)) 5260 return -EACCES; 5261 5262 /* 5263 * Commands dealing with the RAID driver but not any 5264 * particular array: 5265 */ 5266 switch (cmd) 5267 { 5268 case RAID_VERSION: 5269 err = get_version(argp); 5270 goto done; 5271 5272 case PRINT_RAID_DEBUG: 5273 err = 0; 5274 md_print_devices(); 5275 goto done; 5276 5277 #ifndef MODULE 5278 case RAID_AUTORUN: 5279 err = 0; 5280 autostart_arrays(arg); 5281 goto done; 5282 #endif 5283 default:; 5284 } 5285 5286 /* 5287 * Commands creating/starting a new array: 5288 */ 5289 5290 mddev = bdev->bd_disk->private_data; 5291 5292 if (!mddev) { 5293 BUG(); 5294 goto abort; 5295 } 5296 5297 err = mddev_lock(mddev); 5298 if (err) { 5299 printk(KERN_INFO 5300 "md: ioctl lock interrupted, reason %d, cmd %d\n", 5301 err, cmd); 5302 goto abort; 5303 } 5304 5305 switch (cmd) 5306 { 5307 case SET_ARRAY_INFO: 5308 { 5309 mdu_array_info_t info; 5310 if (!arg) 5311 memset(&info, 0, sizeof(info)); 5312 else if (copy_from_user(&info, argp, sizeof(info))) { 5313 err = -EFAULT; 5314 goto abort_unlock; 5315 } 5316 if (mddev->pers) { 5317 err = update_array_info(mddev, &info); 5318 if (err) { 5319 printk(KERN_WARNING "md: couldn't update" 5320 " array info. %d\n", err); 5321 goto abort_unlock; 5322 } 5323 goto done_unlock; 5324 } 5325 if (!list_empty(&mddev->disks)) { 5326 printk(KERN_WARNING 5327 "md: array %s already has disks!\n", 5328 mdname(mddev)); 5329 err = -EBUSY; 5330 goto abort_unlock; 5331 } 5332 if (mddev->raid_disks) { 5333 printk(KERN_WARNING 5334 "md: array %s already initialised!\n", 5335 mdname(mddev)); 5336 err = -EBUSY; 5337 goto abort_unlock; 5338 } 5339 err = set_array_info(mddev, &info); 5340 if (err) { 5341 printk(KERN_WARNING "md: couldn't set" 5342 " array info. 
%d\n", err); 5343 goto abort_unlock; 5344 } 5345 } 5346 goto done_unlock; 5347 5348 default:; 5349 } 5350 5351 /* 5352 * Commands querying/configuring an existing array: 5353 */ 5354 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 5355 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 5356 if ((!mddev->raid_disks && !mddev->external) 5357 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 5358 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 5359 && cmd != GET_BITMAP_FILE) { 5360 err = -ENODEV; 5361 goto abort_unlock; 5362 } 5363 5364 /* 5365 * Commands even a read-only array can execute: 5366 */ 5367 switch (cmd) 5368 { 5369 case GET_ARRAY_INFO: 5370 err = get_array_info(mddev, argp); 5371 goto done_unlock; 5372 5373 case GET_BITMAP_FILE: 5374 err = get_bitmap_file(mddev, argp); 5375 goto done_unlock; 5376 5377 case GET_DISK_INFO: 5378 err = get_disk_info(mddev, argp); 5379 goto done_unlock; 5380 5381 case RESTART_ARRAY_RW: 5382 err = restart_array(mddev); 5383 goto done_unlock; 5384 5385 case STOP_ARRAY: 5386 err = do_md_stop(mddev, 0, 1); 5387 goto done_unlock; 5388 5389 case STOP_ARRAY_RO: 5390 err = do_md_stop(mddev, 1, 1); 5391 goto done_unlock; 5392 5393 } 5394 5395 /* 5396 * The remaining ioctls are changing the state of the 5397 * superblock, so we do not allow them on read-only arrays. 5398 * However non-MD ioctls (e.g. get-size) will still come through 5399 * here and hit the 'default' below, so only disallow 5400 * 'md' ioctls, and switch to rw mode if started auto-readonly. 5401 */ 5402 if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) { 5403 if (mddev->ro == 2) { 5404 mddev->ro = 0; 5405 sysfs_notify_dirent(mddev->sysfs_state); 5406 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5407 md_wakeup_thread(mddev->thread); 5408 } else { 5409 err = -EROFS; 5410 goto abort_unlock; 5411 } 5412 } 5413 5414 switch (cmd) 5415 { 5416 case ADD_NEW_DISK: 5417 { 5418 mdu_disk_info_t info; 5419 if (copy_from_user(&info, argp, sizeof(info))) 5420 err = -EFAULT; 5421 else 5422 err = add_new_disk(mddev, &info); 5423 goto done_unlock; 5424 } 5425 5426 case HOT_REMOVE_DISK: 5427 err = hot_remove_disk(mddev, new_decode_dev(arg)); 5428 goto done_unlock; 5429 5430 case HOT_ADD_DISK: 5431 err = hot_add_disk(mddev, new_decode_dev(arg)); 5432 goto done_unlock; 5433 5434 case SET_DISK_FAULTY: 5435 err = set_disk_faulty(mddev, new_decode_dev(arg)); 5436 goto done_unlock; 5437 5438 case RUN_ARRAY: 5439 err = do_md_run(mddev); 5440 goto done_unlock; 5441 5442 case SET_BITMAP_FILE: 5443 err = set_bitmap_file(mddev, (int)arg); 5444 goto done_unlock; 5445 5446 default: 5447 err = -EINVAL; 5448 goto abort_unlock; 5449 } 5450 5451 done_unlock: 5452 abort_unlock: 5453 if (mddev->hold_active == UNTIL_IOCTL && 5454 err != -EINVAL) 5455 mddev->hold_active = 0; 5456 mddev_unlock(mddev); 5457 5458 return err; 5459 done: 5460 if (err) 5461 MD_BUG(); 5462 abort: 5463 return err; 5464 } 5465 5466 static int md_open(struct block_device *bdev, fmode_t mode) 5467 { 5468 /* 5469 * Succeed if we can lock the mddev, which confirms that 5470 * it isn't being stopped right now. 5471 */ 5472 mddev_t *mddev = mddev_find(bdev->bd_dev); 5473 int err; 5474 5475 if (mddev->gendisk != bdev->bd_disk) { 5476 /* we are racing with mddev_put which is discarding this 5477 * bd_disk. 
5478 */ 5479 mddev_put(mddev); 5480 /* Wait until bdev->bd_disk is definitely gone */ 5481 flush_scheduled_work(); 5482 /* Then retry the open from the top */ 5483 return -ERESTARTSYS; 5484 } 5485 BUG_ON(mddev != bdev->bd_disk->private_data); 5486 5487 if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1))) 5488 goto out; 5489 5490 err = 0; 5491 atomic_inc(&mddev->openers); 5492 mddev_unlock(mddev); 5493 5494 check_disk_change(bdev); 5495 out: 5496 return err; 5497 } 5498 5499 static int md_release(struct gendisk *disk, fmode_t mode) 5500 { 5501 mddev_t *mddev = disk->private_data; 5502 5503 BUG_ON(!mddev); 5504 atomic_dec(&mddev->openers); 5505 mddev_put(mddev); 5506 5507 return 0; 5508 } 5509 5510 static int md_media_changed(struct gendisk *disk) 5511 { 5512 mddev_t *mddev = disk->private_data; 5513 5514 return mddev->changed; 5515 } 5516 5517 static int md_revalidate(struct gendisk *disk) 5518 { 5519 mddev_t *mddev = disk->private_data; 5520 5521 mddev->changed = 0; 5522 return 0; 5523 } 5524 static struct block_device_operations md_fops = 5525 { 5526 .owner = THIS_MODULE, 5527 .open = md_open, 5528 .release = md_release, 5529 .ioctl = md_ioctl, 5530 .getgeo = md_getgeo, 5531 .media_changed = md_media_changed, 5532 .revalidate_disk= md_revalidate, 5533 }; 5534 5535 static int md_thread(void * arg) 5536 { 5537 mdk_thread_t *thread = arg; 5538 5539 /* 5540 * md_thread is a 'system-thread', it's priority should be very 5541 * high. We avoid resource deadlocks individually in each 5542 * raid personality. (RAID5 does preallocation) We also use RR and 5543 * the very same RT priority as kswapd, thus we will never get 5544 * into a priority inversion deadlock. 5545 * 5546 * we definitely have to have equal or higher priority than 5547 * bdflush, otherwise bdflush will deadlock if there are too 5548 * many dirty RAID5 blocks. 5549 */ 5550 5551 allow_signal(SIGKILL); 5552 while (!kthread_should_stop()) { 5553 5554 /* We need to wait INTERRUPTIBLE so that 5555 * we don't add to the load-average. 
5556 * That means we need to be sure no signals are 5557 * pending 5558 */ 5559 if (signal_pending(current)) 5560 flush_signals(current); 5561 5562 wait_event_interruptible_timeout 5563 (thread->wqueue, 5564 test_bit(THREAD_WAKEUP, &thread->flags) 5565 || kthread_should_stop(), 5566 thread->timeout); 5567 5568 clear_bit(THREAD_WAKEUP, &thread->flags); 5569 5570 thread->run(thread->mddev); 5571 } 5572 5573 return 0; 5574 } 5575 5576 void md_wakeup_thread(mdk_thread_t *thread) 5577 { 5578 if (thread) { 5579 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm); 5580 set_bit(THREAD_WAKEUP, &thread->flags); 5581 wake_up(&thread->wqueue); 5582 } 5583 } 5584 5585 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, 5586 const char *name) 5587 { 5588 mdk_thread_t *thread; 5589 5590 thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL); 5591 if (!thread) 5592 return NULL; 5593 5594 init_waitqueue_head(&thread->wqueue); 5595 5596 thread->run = run; 5597 thread->mddev = mddev; 5598 thread->timeout = MAX_SCHEDULE_TIMEOUT; 5599 thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev)); 5600 if (IS_ERR(thread->tsk)) { 5601 kfree(thread); 5602 return NULL; 5603 } 5604 return thread; 5605 } 5606 5607 void md_unregister_thread(mdk_thread_t *thread) 5608 { 5609 if (!thread) 5610 return; 5611 dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 5612 5613 kthread_stop(thread->tsk); 5614 kfree(thread); 5615 } 5616 5617 void md_error(mddev_t *mddev, mdk_rdev_t *rdev) 5618 { 5619 if (!mddev) { 5620 MD_BUG(); 5621 return; 5622 } 5623 5624 if (!rdev || test_bit(Faulty, &rdev->flags)) 5625 return; 5626 5627 if (mddev->external) 5628 set_bit(Blocked, &rdev->flags); 5629 /* 5630 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 5631 mdname(mddev), 5632 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), 5633 __builtin_return_address(0),__builtin_return_address(1), 5634 __builtin_return_address(2),__builtin_return_address(3)); 5635 */ 5636 if (!mddev->pers) 5637 return; 5638 if (!mddev->pers->error_handler) 5639 return; 5640 mddev->pers->error_handler(mddev,rdev); 5641 if (mddev->degraded) 5642 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 5643 set_bit(StateChanged, &rdev->flags); 5644 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5645 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5646 md_wakeup_thread(mddev->thread); 5647 md_new_event_inintr(mddev); 5648 } 5649 5650 /* seq_file implementation /proc/mdstat */ 5651 5652 static void status_unused(struct seq_file *seq) 5653 { 5654 int i = 0; 5655 mdk_rdev_t *rdev; 5656 5657 seq_printf(seq, "unused devices: "); 5658 5659 list_for_each_entry(rdev, &pending_raid_disks, same_set) { 5660 char b[BDEVNAME_SIZE]; 5661 i++; 5662 seq_printf(seq, "%s ", 5663 bdevname(rdev->bdev,b)); 5664 } 5665 if (!i) 5666 seq_printf(seq, "<none>"); 5667 5668 seq_printf(seq, "\n"); 5669 } 5670 5671 5672 static void status_resync(struct seq_file *seq, mddev_t * mddev) 5673 { 5674 sector_t max_sectors, resync, res; 5675 unsigned long dt, db; 5676 sector_t rt; 5677 int scale; 5678 unsigned int per_milli; 5679 5680 resync = mddev->curr_resync - atomic_read(&mddev->recovery_active); 5681 5682 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 5683 max_sectors = mddev->resync_max_sectors; 5684 else 5685 max_sectors = mddev->dev_sectors; 5686 5687 /* 5688 * Should not happen. 
5689 */ 5690 if (!max_sectors) { 5691 MD_BUG(); 5692 return; 5693 } 5694 /* Pick 'scale' such that (resync>>scale)*1000 will fit 5695 * in a sector_t, and (max_sectors>>scale) will fit in a 5696 * u32, as those are the requirements for sector_div. 5697 * Thus 'scale' must be at least 10 5698 */ 5699 scale = 10; 5700 if (sizeof(sector_t) > sizeof(unsigned long)) { 5701 while ( max_sectors/2 > (1ULL<<(scale+32))) 5702 scale++; 5703 } 5704 res = (resync>>scale)*1000; 5705 sector_div(res, (u32)((max_sectors>>scale)+1)); 5706 5707 per_milli = res; 5708 { 5709 int i, x = per_milli/50, y = 20-x; 5710 seq_printf(seq, "["); 5711 for (i = 0; i < x; i++) 5712 seq_printf(seq, "="); 5713 seq_printf(seq, ">"); 5714 for (i = 0; i < y; i++) 5715 seq_printf(seq, "."); 5716 seq_printf(seq, "] "); 5717 } 5718 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 5719 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 5720 "reshape" : 5721 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 5722 "check" : 5723 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 5724 "resync" : "recovery"))), 5725 per_milli/10, per_milli % 10, 5726 (unsigned long long) resync/2, 5727 (unsigned long long) max_sectors/2); 5728 5729 /* 5730 * dt: time from mark until now 5731 * db: blocks written from mark until now 5732 * rt: remaining time 5733 * 5734 * rt is a sector_t, so could be 32bit or 64bit. 5735 * So we divide before multiply in case it is 32bit and close 5736 * to the limit. 5737 * We scale the divisor (db) by 32 to avoid loosing precision 5738 * near the end of resync when the number of remaining sectors 5739 * is close to 'db'. 5740 * We then divide rt by 32 after multiplying by db to compensate. 5741 * The '+1' avoids division by zero if db is very small. 5742 */ 5743 dt = ((jiffies - mddev->resync_mark) / HZ); 5744 if (!dt) dt++; 5745 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) 5746 - mddev->resync_mark_cnt; 5747 5748 rt = max_sectors - resync; /* number of remaining sectors */ 5749 sector_div(rt, db/32+1); 5750 rt *= dt; 5751 rt >>= 5; 5752 5753 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, 5754 ((unsigned long)rt % 60)/6); 5755 5756 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 5757 } 5758 5759 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 5760 { 5761 struct list_head *tmp; 5762 loff_t l = *pos; 5763 mddev_t *mddev; 5764 5765 if (l >= 0x10000) 5766 return NULL; 5767 if (!l--) 5768 /* header */ 5769 return (void*)1; 5770 5771 spin_lock(&all_mddevs_lock); 5772 list_for_each(tmp,&all_mddevs) 5773 if (!l--) { 5774 mddev = list_entry(tmp, mddev_t, all_mddevs); 5775 mddev_get(mddev); 5776 spin_unlock(&all_mddevs_lock); 5777 return mddev; 5778 } 5779 spin_unlock(&all_mddevs_lock); 5780 if (!l--) 5781 return (void*)2;/* tail */ 5782 return NULL; 5783 } 5784 5785 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 5786 { 5787 struct list_head *tmp; 5788 mddev_t *next_mddev, *mddev = v; 5789 5790 ++*pos; 5791 if (v == (void*)2) 5792 return NULL; 5793 5794 spin_lock(&all_mddevs_lock); 5795 if (v == (void*)1) 5796 tmp = all_mddevs.next; 5797 else 5798 tmp = mddev->all_mddevs.next; 5799 if (tmp != &all_mddevs) 5800 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); 5801 else { 5802 next_mddev = (void*)2; 5803 *pos = 0x10000; 5804 } 5805 spin_unlock(&all_mddevs_lock); 5806 5807 if (v != (void*)1) 5808 mddev_put(mddev); 5809 return next_mddev; 5810 5811 } 5812 5813 static void md_seq_stop(struct seq_file *seq, void *v) 5814 { 5815 mddev_t *mddev = v; 5816 5817 if 
(mddev && v != (void*)1 && v != (void*)2) 5818 mddev_put(mddev); 5819 } 5820 5821 struct mdstat_info { 5822 int event; 5823 }; 5824 5825 static int md_seq_show(struct seq_file *seq, void *v) 5826 { 5827 mddev_t *mddev = v; 5828 sector_t sectors; 5829 mdk_rdev_t *rdev; 5830 struct mdstat_info *mi = seq->private; 5831 struct bitmap *bitmap; 5832 5833 if (v == (void*)1) { 5834 struct mdk_personality *pers; 5835 seq_printf(seq, "Personalities : "); 5836 spin_lock(&pers_lock); 5837 list_for_each_entry(pers, &pers_list, list) 5838 seq_printf(seq, "[%s] ", pers->name); 5839 5840 spin_unlock(&pers_lock); 5841 seq_printf(seq, "\n"); 5842 mi->event = atomic_read(&md_event_count); 5843 return 0; 5844 } 5845 if (v == (void*)2) { 5846 status_unused(seq); 5847 return 0; 5848 } 5849 5850 if (mddev_lock(mddev) < 0) 5851 return -EINTR; 5852 5853 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 5854 seq_printf(seq, "%s : %sactive", mdname(mddev), 5855 mddev->pers ? "" : "in"); 5856 if (mddev->pers) { 5857 if (mddev->ro==1) 5858 seq_printf(seq, " (read-only)"); 5859 if (mddev->ro==2) 5860 seq_printf(seq, " (auto-read-only)"); 5861 seq_printf(seq, " %s", mddev->pers->name); 5862 } 5863 5864 sectors = 0; 5865 list_for_each_entry(rdev, &mddev->disks, same_set) { 5866 char b[BDEVNAME_SIZE]; 5867 seq_printf(seq, " %s[%d]", 5868 bdevname(rdev->bdev,b), rdev->desc_nr); 5869 if (test_bit(WriteMostly, &rdev->flags)) 5870 seq_printf(seq, "(W)"); 5871 if (test_bit(Faulty, &rdev->flags)) { 5872 seq_printf(seq, "(F)"); 5873 continue; 5874 } else if (rdev->raid_disk < 0) 5875 seq_printf(seq, "(S)"); /* spare */ 5876 sectors += rdev->sectors; 5877 } 5878 5879 if (!list_empty(&mddev->disks)) { 5880 if (mddev->pers) 5881 seq_printf(seq, "\n %llu blocks", 5882 (unsigned long long) 5883 mddev->array_sectors / 2); 5884 else 5885 seq_printf(seq, "\n %llu blocks", 5886 (unsigned long long)sectors / 2); 5887 } 5888 if (mddev->persistent) { 5889 if (mddev->major_version != 0 || 5890 mddev->minor_version != 90) { 5891 seq_printf(seq," super %d.%d", 5892 mddev->major_version, 5893 mddev->minor_version); 5894 } 5895 } else if (mddev->external) 5896 seq_printf(seq, " super external:%s", 5897 mddev->metadata_type); 5898 else 5899 seq_printf(seq, " super non-persistent"); 5900 5901 if (mddev->pers) { 5902 mddev->pers->status(seq, mddev); 5903 seq_printf(seq, "\n "); 5904 if (mddev->pers->sync_request) { 5905 if (mddev->curr_resync > 2) { 5906 status_resync(seq, mddev); 5907 seq_printf(seq, "\n "); 5908 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 5909 seq_printf(seq, "\tresync=DELAYED\n "); 5910 else if (mddev->recovery_cp < MaxSector) 5911 seq_printf(seq, "\tresync=PENDING\n "); 5912 } 5913 } else 5914 seq_printf(seq, "\n "); 5915 5916 if ((bitmap = mddev->bitmap)) { 5917 unsigned long chunk_kb; 5918 unsigned long flags; 5919 spin_lock_irqsave(&bitmap->lock, flags); 5920 chunk_kb = bitmap->chunksize >> 10; 5921 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " 5922 "%lu%s chunk", 5923 bitmap->pages - bitmap->missing_pages, 5924 bitmap->pages, 5925 (bitmap->pages - bitmap->missing_pages) 5926 << (PAGE_SHIFT - 10), 5927 chunk_kb ? chunk_kb : bitmap->chunksize, 5928 chunk_kb ? 
"KB" : "B"); 5929 if (bitmap->file) { 5930 seq_printf(seq, ", file: "); 5931 seq_path(seq, &bitmap->file->f_path, " \t\n"); 5932 } 5933 5934 seq_printf(seq, "\n"); 5935 spin_unlock_irqrestore(&bitmap->lock, flags); 5936 } 5937 5938 seq_printf(seq, "\n"); 5939 } 5940 mddev_unlock(mddev); 5941 5942 return 0; 5943 } 5944 5945 static const struct seq_operations md_seq_ops = { 5946 .start = md_seq_start, 5947 .next = md_seq_next, 5948 .stop = md_seq_stop, 5949 .show = md_seq_show, 5950 }; 5951 5952 static int md_seq_open(struct inode *inode, struct file *file) 5953 { 5954 int error; 5955 struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL); 5956 if (mi == NULL) 5957 return -ENOMEM; 5958 5959 error = seq_open(file, &md_seq_ops); 5960 if (error) 5961 kfree(mi); 5962 else { 5963 struct seq_file *p = file->private_data; 5964 p->private = mi; 5965 mi->event = atomic_read(&md_event_count); 5966 } 5967 return error; 5968 } 5969 5970 static unsigned int mdstat_poll(struct file *filp, poll_table *wait) 5971 { 5972 struct seq_file *m = filp->private_data; 5973 struct mdstat_info *mi = m->private; 5974 int mask; 5975 5976 poll_wait(filp, &md_event_waiters, wait); 5977 5978 /* always allow read */ 5979 mask = POLLIN | POLLRDNORM; 5980 5981 if (mi->event != atomic_read(&md_event_count)) 5982 mask |= POLLERR | POLLPRI; 5983 return mask; 5984 } 5985 5986 static const struct file_operations md_seq_fops = { 5987 .owner = THIS_MODULE, 5988 .open = md_seq_open, 5989 .read = seq_read, 5990 .llseek = seq_lseek, 5991 .release = seq_release_private, 5992 .poll = mdstat_poll, 5993 }; 5994 5995 int register_md_personality(struct mdk_personality *p) 5996 { 5997 spin_lock(&pers_lock); 5998 list_add_tail(&p->list, &pers_list); 5999 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level); 6000 spin_unlock(&pers_lock); 6001 return 0; 6002 } 6003 6004 int unregister_md_personality(struct mdk_personality *p) 6005 { 6006 printk(KERN_INFO "md: %s personality unregistered\n", p->name); 6007 spin_lock(&pers_lock); 6008 list_del_init(&p->list); 6009 spin_unlock(&pers_lock); 6010 return 0; 6011 } 6012 6013 static int is_mddev_idle(mddev_t *mddev, int init) 6014 { 6015 mdk_rdev_t * rdev; 6016 int idle; 6017 int curr_events; 6018 6019 idle = 1; 6020 rcu_read_lock(); 6021 rdev_for_each_rcu(rdev, mddev) { 6022 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 6023 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + 6024 (int)part_stat_read(&disk->part0, sectors[1]) - 6025 atomic_read(&disk->sync_io); 6026 /* sync IO will cause sync_io to increase before the disk_stats 6027 * as sync_io is counted when a request starts, and 6028 * disk_stats is counted when it completes. 6029 * So resync activity will cause curr_events to be smaller than 6030 * when there was no such activity. 6031 * non-sync IO will cause disk_stat to increase without 6032 * increasing sync_io so curr_events will (eventually) 6033 * be larger than it was before. Once it becomes 6034 * substantially larger, the test below will cause 6035 * the array to appear non-idle, and resync will slow 6036 * down. 6037 * If there is a lot of outstanding resync activity when 6038 * we set last_event to curr_events, then all that activity 6039 * completing might cause the array to appear non-idle 6040 * and resync will be slowed down even though there might 6041 * not have been non-resync activity. This will only 6042 * happen once though. 
'last_events' will soon reflect 6043 * the state where there are few or no outstanding 6044 * resync requests, and further resync activity will 6045 * always make curr_events less than last_events. 6046 * 6047 */ 6048 if (init || curr_events - rdev->last_events > 64) { 6049 rdev->last_events = curr_events; 6050 idle = 0; 6051 } 6052 } 6053 rcu_read_unlock(); 6054 return idle; 6055 } 6056 6057 void md_done_sync(mddev_t *mddev, int blocks, int ok) 6058 { 6059 /* another "blocks" (512-byte) blocks have been synced */ 6060 atomic_sub(blocks, &mddev->recovery_active); 6061 wake_up(&mddev->recovery_wait); 6062 if (!ok) { 6063 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6064 md_wakeup_thread(mddev->thread); 6065 // stop recovery, signal do_sync .... 6066 } 6067 } 6068 6069 6070 /* md_write_start(mddev, bi) 6071 * If we need to update some array metadata (e.g. 'active' flag 6072 * in superblock) before writing, schedule a superblock update 6073 * and wait for it to complete. 6074 */ 6075 void md_write_start(mddev_t *mddev, struct bio *bi) 6076 { 6077 int did_change = 0; 6078 if (bio_data_dir(bi) != WRITE) 6079 return; 6080 6081 BUG_ON(mddev->ro == 1); 6082 if (mddev->ro == 2) { 6083 /* need to switch to read/write */ 6084 mddev->ro = 0; 6085 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6086 md_wakeup_thread(mddev->thread); 6087 md_wakeup_thread(mddev->sync_thread); 6088 did_change = 1; 6089 } 6090 atomic_inc(&mddev->writes_pending); 6091 if (mddev->safemode == 1) 6092 mddev->safemode = 0; 6093 if (mddev->in_sync) { 6094 spin_lock_irq(&mddev->write_lock); 6095 if (mddev->in_sync) { 6096 mddev->in_sync = 0; 6097 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 6098 md_wakeup_thread(mddev->thread); 6099 did_change = 1; 6100 } 6101 spin_unlock_irq(&mddev->write_lock); 6102 } 6103 if (did_change) 6104 sysfs_notify_dirent(mddev->sysfs_state); 6105 wait_event(mddev->sb_wait, 6106 !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && 6107 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 6108 } 6109 6110 void md_write_end(mddev_t *mddev) 6111 { 6112 if (atomic_dec_and_test(&mddev->writes_pending)) { 6113 if (mddev->safemode == 2) 6114 md_wakeup_thread(mddev->thread); 6115 else if (mddev->safemode_delay) 6116 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); 6117 } 6118 } 6119 6120 /* md_allow_write(mddev) 6121 * Calling this ensures that the array is marked 'active' so that writes 6122 * may proceed without blocking. It is important to call this before 6123 * attempting a GFP_KERNEL allocation while holding the mddev lock. 6124 * Must be called with mddev_lock held. 6125 * 6126 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock 6127 * is dropped, so return -EAGAIN after notifying userspace.
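 *
 * Purely illustrative call pattern (the caller and the names "err", "buf"
 * and "len" are hypothetical, not taken from this driver):
 *
 *	err = md_allow_write(mddev);		-- mddev_lock already held
 *	if (err)				-- -EAGAIN: external metadata,
 *						--  userspace must finish the update
 *		... caller decides whether to proceed or back off ...
 *	buf = kmalloc(len, GFP_KERNEL);		-- may now block without deadlocking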
6128 */ 6129 int md_allow_write(mddev_t *mddev) 6130 { 6131 if (!mddev->pers) 6132 return 0; 6133 if (mddev->ro) 6134 return 0; 6135 if (!mddev->pers->sync_request) 6136 return 0; 6137 6138 spin_lock_irq(&mddev->write_lock); 6139 if (mddev->in_sync) { 6140 mddev->in_sync = 0; 6141 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 6142 if (mddev->safemode_delay && 6143 mddev->safemode == 0) 6144 mddev->safemode = 1; 6145 spin_unlock_irq(&mddev->write_lock); 6146 md_update_sb(mddev, 0); 6147 sysfs_notify_dirent(mddev->sysfs_state); 6148 } else 6149 spin_unlock_irq(&mddev->write_lock); 6150 6151 if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) 6152 return -EAGAIN; 6153 else 6154 return 0; 6155 } 6156 EXPORT_SYMBOL_GPL(md_allow_write); 6157 6158 #define SYNC_MARKS 10 6159 #define SYNC_MARK_STEP (3*HZ) 6160 void md_do_sync(mddev_t *mddev) 6161 { 6162 mddev_t *mddev2; 6163 unsigned int currspeed = 0, 6164 window; 6165 sector_t max_sectors,j, io_sectors; 6166 unsigned long mark[SYNC_MARKS]; 6167 sector_t mark_cnt[SYNC_MARKS]; 6168 int last_mark,m; 6169 struct list_head *tmp; 6170 sector_t last_check; 6171 int skipped = 0; 6172 mdk_rdev_t *rdev; 6173 char *desc; 6174 6175 /* just in case the thread restarts... */ 6176 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 6177 return; 6178 if (mddev->ro) /* never try to sync a read-only array */ 6179 return; 6180 6181 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 6182 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 6183 desc = "data-check"; 6184 else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 6185 desc = "requested-resync"; 6186 else 6187 desc = "resync"; 6188 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 6189 desc = "reshape"; 6190 else 6191 desc = "recovery"; 6192 6193 /* we overload curr_resync somewhat here. 6194 * 0 == not engaged in resync at all 6195 * 2 == checking that there is no conflict with another sync 6196 * 1 == like 2, but have yielded to allow conflicting resync to 6197 * commence 6198 * other == active in resync - this many blocks 6199 * 6200 * Before starting a resync we must have set curr_resync to 6201 * 2, and then checked that every "conflicting" array has curr_resync 6202 * less than ours. When we find one that is the same or higher 6203 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 6204 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 6205 * This will mean we have to start checking from the beginning again.
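 *
 * Worked example, derived from the loop below (addresses are illustrative):
 * arrays A and B share a physical device and &A < &B.  Both set curr_resync
 * to 2.  A sees the conflict and, being the lower address, yields by dropping
 * to 1 and waking resync_wait; B, still at 2, passes the check and proceeds.
 * A then sleeps on resync_wait until B's resync no longer conflicts, after
 * which A starts over from curr_resync == 2.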
6206 * 6207 */ 6208 6209 do { 6210 mddev->curr_resync = 2; 6211 6212 try_again: 6213 if (kthread_should_stop()) { 6214 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6215 goto skip; 6216 } 6217 for_each_mddev(mddev2, tmp) { 6218 if (mddev2 == mddev) 6219 continue; 6220 if (!mddev->parallel_resync 6221 && mddev2->curr_resync 6222 && match_mddev_units(mddev, mddev2)) { 6223 DEFINE_WAIT(wq); 6224 if (mddev < mddev2 && mddev->curr_resync == 2) { 6225 /* arbitrarily yield */ 6226 mddev->curr_resync = 1; 6227 wake_up(&resync_wait); 6228 } 6229 if (mddev > mddev2 && mddev->curr_resync == 1) 6230 /* no need to wait here, we can wait the next 6231 * time 'round when curr_resync == 2 6232 */ 6233 continue; 6234 /* We need to wait 'interruptible' so as not to 6235 * contribute to the load average, and not to 6236 * be caught by 'softlockup' 6237 */ 6238 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 6239 if (!kthread_should_stop() && 6240 mddev2->curr_resync >= mddev->curr_resync) { 6241 printk(KERN_INFO "md: delaying %s of %s" 6242 " until %s has finished (they" 6243 " share one or more physical units)\n", 6244 desc, mdname(mddev), mdname(mddev2)); 6245 mddev_put(mddev2); 6246 if (signal_pending(current)) 6247 flush_signals(current); 6248 schedule(); 6249 finish_wait(&resync_wait, &wq); 6250 goto try_again; 6251 } 6252 finish_wait(&resync_wait, &wq); 6253 } 6254 } 6255 } while (mddev->curr_resync < 2); 6256 6257 j = 0; 6258 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 6259 /* resync follows the size requested by the personality, 6260 * which defaults to physical size, but can be virtual size 6261 */ 6262 max_sectors = mddev->resync_max_sectors; 6263 mddev->resync_mismatches = 0; 6264 /* we don't use the checkpoint if there's a bitmap */ 6265 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 6266 j = mddev->resync_min; 6267 else if (!mddev->bitmap) 6268 j = mddev->recovery_cp; 6269 6270 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 6271 max_sectors = mddev->dev_sectors; 6272 else { 6273 /* recovery follows the physical size of devices */ 6274 max_sectors = mddev->dev_sectors; 6275 j = MaxSector; 6276 list_for_each_entry(rdev, &mddev->disks, same_set) 6277 if (rdev->raid_disk >= 0 && 6278 !test_bit(Faulty, &rdev->flags) && 6279 !test_bit(In_sync, &rdev->flags) && 6280 rdev->recovery_offset < j) 6281 j = rdev->recovery_offset; 6282 } 6283 6284 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); 6285 printk(KERN_INFO "md: minimum _guaranteed_ speed:" 6286 " %d KB/sec/disk.\n", speed_min(mddev)); 6287 printk(KERN_INFO "md: using maximum available idle IO bandwidth " 6288 "(but not more than %d KB/sec) for %s.\n", 6289 speed_max(mddev), desc); 6290 6291 is_mddev_idle(mddev, 1); /* this initializes IO event counters */ 6292 6293 io_sectors = 0; 6294 for (m = 0; m < SYNC_MARKS; m++) { 6295 mark[m] = jiffies; 6296 mark_cnt[m] = io_sectors; 6297 } 6298 last_mark = 0; 6299 mddev->resync_mark = mark[last_mark]; 6300 mddev->resync_mark_cnt = mark_cnt[last_mark]; 6301 6302 /* 6303 * Tune reconstruction: 6304 */ 6305 window = 32*(PAGE_SIZE/512); 6306 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", 6307 window/2,(unsigned long long) max_sectors/2); 6308 6309 atomic_set(&mddev->recovery_active, 0); 6310 last_check = 0; 6311 6312 if (j>2) { 6313 printk(KERN_INFO 6314 "md: resuming %s of %s from checkpoint.\n", 6315 desc, mdname(mddev)); 6316 mddev->curr_resync = j; 6317 } 6318 6319 while (j < max_sectors) { 6320 sector_t sectors; 6321 6322 
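		/* Each pass through this loop: publish resync progress once
		 * enough has completed, honour the user-settable resync_max
		 * throttle, ask the personality to sync the next chunk via
		 * ->sync_request(), account the IO, step the speed-measurement
		 * marks, and back off (msleep) when above speed_min and either
		 * over speed_max or the array is not idle.
		 */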
skipped = 0; 6323 6324 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 6325 ((mddev->curr_resync > mddev->curr_resync_completed && 6326 (mddev->curr_resync - mddev->curr_resync_completed) 6327 > (max_sectors >> 4)) || 6328 (j - mddev->curr_resync_completed)*2 6329 >= mddev->resync_max - mddev->curr_resync_completed 6330 )) { 6331 /* time to update curr_resync_completed */ 6332 blk_unplug(mddev->queue); 6333 wait_event(mddev->recovery_wait, 6334 atomic_read(&mddev->recovery_active) == 0); 6335 mddev->curr_resync_completed = 6336 mddev->curr_resync; 6337 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 6338 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 6339 } 6340 6341 while (j >= mddev->resync_max && !kthread_should_stop()) { 6342 /* As this condition is controlled by user-space, 6343 * we can block indefinitely, so use '_interruptible' 6344 * to avoid triggering warnings. 6345 */ 6346 flush_signals(current); /* just in case */ 6347 wait_event_interruptible(mddev->recovery_wait, 6348 mddev->resync_max > j 6349 || kthread_should_stop()); 6350 } 6351 6352 if (kthread_should_stop()) 6353 goto interrupted; 6354 6355 sectors = mddev->pers->sync_request(mddev, j, &skipped, 6356 currspeed < speed_min(mddev)); 6357 if (sectors == 0) { 6358 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6359 goto out; 6360 } 6361 6362 if (!skipped) { /* actual IO requested */ 6363 io_sectors += sectors; 6364 atomic_add(sectors, &mddev->recovery_active); 6365 } 6366 6367 j += sectors; 6368 if (j>1) mddev->curr_resync = j; 6369 mddev->curr_mark_cnt = io_sectors; 6370 if (last_check == 0) 6371 /* this is the earliest that the rebuild will be 6372 * visible in /proc/mdstat 6373 */ 6374 md_new_event(mddev); 6375 6376 if (last_check + window > io_sectors || j == max_sectors) 6377 continue; 6378 6379 last_check = io_sectors; 6380 6381 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 6382 break; 6383 6384 repeat: 6385 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 6386 /* step marks */ 6387 int next = (last_mark+1) % SYNC_MARKS; 6388 6389 mddev->resync_mark = mark[next]; 6390 mddev->resync_mark_cnt = mark_cnt[next]; 6391 mark[next] = jiffies; 6392 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 6393 last_mark = next; 6394 } 6395 6396 6397 if (kthread_should_stop()) 6398 goto interrupted; 6399 6400 6401 /* 6402 * this loop exits only when we are slower than 6403 * the 'hard' speed limit, or the system was IO-idle for 6404 * a jiffy. 6405 * the system might be non-idle CPU-wise, but we only care 6406 * about not overloading the IO subsystem.
(things like an 6407 * e2fsck being done on the RAID array should execute fast) 6408 */ 6409 blk_unplug(mddev->queue); 6410 cond_resched(); 6411 6412 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 6413 /((jiffies-mddev->resync_mark)/HZ +1) +1; 6414 6415 if (currspeed > speed_min(mddev)) { 6416 if ((currspeed > speed_max(mddev)) || 6417 !is_mddev_idle(mddev, 0)) { 6418 msleep(500); 6419 goto repeat; 6420 } 6421 } 6422 } 6423 printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); 6424 /* 6425 * this also signals 'finished resyncing' to md_stop 6426 */ 6427 out: 6428 blk_unplug(mddev->queue); 6429 6430 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 6431 6432 /* tell personality that we are finished */ 6433 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); 6434 6435 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 6436 mddev->curr_resync > 2) { 6437 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 6438 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 6439 if (mddev->curr_resync >= mddev->recovery_cp) { 6440 printk(KERN_INFO 6441 "md: checkpointing %s of %s.\n", 6442 desc, mdname(mddev)); 6443 mddev->recovery_cp = mddev->curr_resync; 6444 } 6445 } else 6446 mddev->recovery_cp = MaxSector; 6447 } else { 6448 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 6449 mddev->curr_resync = MaxSector; 6450 list_for_each_entry(rdev, &mddev->disks, same_set) 6451 if (rdev->raid_disk >= 0 && 6452 !test_bit(Faulty, &rdev->flags) && 6453 !test_bit(In_sync, &rdev->flags) && 6454 rdev->recovery_offset < mddev->curr_resync) 6455 rdev->recovery_offset = mddev->curr_resync; 6456 } 6457 } 6458 set_bit(MD_CHANGE_DEVS, &mddev->flags); 6459 6460 skip: 6461 mddev->curr_resync = 0; 6462 mddev->curr_resync_completed = 0; 6463 mddev->resync_min = 0; 6464 mddev->resync_max = MaxSector; 6465 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 6466 wake_up(&resync_wait); 6467 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 6468 md_wakeup_thread(mddev->thread); 6469 return; 6470 6471 interrupted: 6472 /* 6473 * got a signal, exit. 6474 */ 6475 printk(KERN_INFO 6476 "md: md_do_sync() got signal ... exiting\n"); 6477 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6478 goto out; 6479 6480 } 6481 EXPORT_SYMBOL_GPL(md_do_sync); 6482 6483 6484 static int remove_and_add_spares(mddev_t *mddev) 6485 { 6486 mdk_rdev_t *rdev; 6487 int spares = 0; 6488 6489 mddev->curr_resync_completed = 0; 6490 6491 list_for_each_entry(rdev, &mddev->disks, same_set) 6492 if (rdev->raid_disk >= 0 && 6493 !test_bit(Blocked, &rdev->flags) && 6494 (test_bit(Faulty, &rdev->flags) || 6495 ! test_bit(In_sync, &rdev->flags)) && 6496 atomic_read(&rdev->nr_pending)==0) { 6497 if (mddev->pers->hot_remove_disk( 6498 mddev, rdev->raid_disk)==0) { 6499 char nm[20]; 6500 sprintf(nm,"rd%d", rdev->raid_disk); 6501 sysfs_remove_link(&mddev->kobj, nm); 6502 rdev->raid_disk = -1; 6503 } 6504 } 6505 6506 if (mddev->degraded && ! 
mddev->ro && !mddev->recovery_disabled) { 6507 list_for_each_entry(rdev, &mddev->disks, same_set) { 6508 if (rdev->raid_disk >= 0 && 6509 !test_bit(In_sync, &rdev->flags) && 6510 !test_bit(Blocked, &rdev->flags)) 6511 spares++; 6512 if (rdev->raid_disk < 0 6513 && !test_bit(Faulty, &rdev->flags)) { 6514 rdev->recovery_offset = 0; 6515 if (mddev->pers-> 6516 hot_add_disk(mddev, rdev) == 0) { 6517 char nm[20]; 6518 sprintf(nm, "rd%d", rdev->raid_disk); 6519 if (sysfs_create_link(&mddev->kobj, 6520 &rdev->kobj, nm)) 6521 printk(KERN_WARNING 6522 "md: cannot register " 6523 "%s for %s\n", 6524 nm, mdname(mddev)); 6525 spares++; 6526 md_new_event(mddev); 6527 } else 6528 break; 6529 } 6530 } 6531 } 6532 return spares; 6533 } 6534 /* 6535 * This routine is regularly called by all per-raid-array threads to 6536 * deal with generic issues like resync and super-block update. 6537 * Raid personalities that don't have a thread (linear/raid0) do not 6538 * need this as they never do any recovery or update the superblock. 6539 * 6540 * It does not do any resync itself, but rather "forks" off other threads 6541 * to do that as needed. 6542 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in 6543 * "->recovery" and create a thread at ->sync_thread. 6544 * When the thread finishes it sets MD_RECOVERY_DONE 6545 * and wakes up this thread which will reap the thread and finish up. 6546 * This thread also removes any faulty devices (with nr_pending == 0). 6547 * 6548 * The overall approach is: 6549 * 1/ if the superblock needs updating, update it. 6550 * 2/ If a recovery thread is running, don't do anything else. 6551 * 3/ If recovery has finished, clean up, possibly marking spares active. 6552 * 4/ If there are any faulty devices, remove them. 6553 * 5/ If array is degraded, try to add spare devices 6554 * 6/ If array has spares or is not in-sync, start a resync thread. 6555 */ 6556 void md_check_recovery(mddev_t *mddev) 6557 { 6558 mdk_rdev_t *rdev; 6559 6560 6561 if (mddev->bitmap) 6562 bitmap_daemon_work(mddev->bitmap); 6563 6564 if (mddev->ro) 6565 return; 6566 6567 if (signal_pending(current)) { 6568 if (mddev->pers->sync_request && !mddev->external) { 6569 printk(KERN_INFO "md: %s in immediate safe mode\n", 6570 mdname(mddev)); 6571 mddev->safemode = 2; 6572 } 6573 flush_signals(current); 6574 } 6575 6576 if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 6577 return; 6578 if ( ! ( 6579 (mddev->flags && !mddev->external) || 6580 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 6581 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 6582 (mddev->external == 0 && mddev->safemode == 1) || 6583 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) 6584 && !mddev->in_sync && mddev->recovery_cp == MaxSector) 6585 )) 6586 return; 6587 6588 if (mddev_trylock(mddev)) { 6589 int spares = 0; 6590 6591 if (mddev->ro) { 6592 /* Only thing we do on a ro array is remove 6593 * failed devices.
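			 * (remove_and_add_spares() will not hot-add anything
			 * in this case: its add branch is gated on !mddev->ro.)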
6594 */ 6595 remove_and_add_spares(mddev); 6596 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6597 goto unlock; 6598 } 6599 6600 if (!mddev->external) { 6601 int did_change = 0; 6602 spin_lock_irq(&mddev->write_lock); 6603 if (mddev->safemode && 6604 !atomic_read(&mddev->writes_pending) && 6605 !mddev->in_sync && 6606 mddev->recovery_cp == MaxSector) { 6607 mddev->in_sync = 1; 6608 did_change = 1; 6609 if (mddev->persistent) 6610 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 6611 } 6612 if (mddev->safemode == 1) 6613 mddev->safemode = 0; 6614 spin_unlock_irq(&mddev->write_lock); 6615 if (did_change) 6616 sysfs_notify_dirent(mddev->sysfs_state); 6617 } 6618 6619 if (mddev->flags) 6620 md_update_sb(mddev, 0); 6621 6622 list_for_each_entry(rdev, &mddev->disks, same_set) 6623 if (test_and_clear_bit(StateChanged, &rdev->flags)) 6624 sysfs_notify_dirent(rdev->sysfs_state); 6625 6626 6627 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 6628 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 6629 /* resync/recovery still happening */ 6630 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6631 goto unlock; 6632 } 6633 if (mddev->sync_thread) { 6634 /* resync has finished, collect result */ 6635 md_unregister_thread(mddev->sync_thread); 6636 mddev->sync_thread = NULL; 6637 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 6638 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 6639 /* success...*/ 6640 /* activate any spares */ 6641 if (mddev->pers->spare_active(mddev)) 6642 sysfs_notify(&mddev->kobj, NULL, 6643 "degraded"); 6644 } 6645 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 6646 mddev->pers->finish_reshape) 6647 mddev->pers->finish_reshape(mddev); 6648 md_update_sb(mddev, 1); 6649 6650 /* if array is no longer degraded, then any saved_raid_disk 6651 * information must be scrapped 6652 */ 6653 if (!mddev->degraded) 6654 list_for_each_entry(rdev, &mddev->disks, same_set) 6655 rdev->saved_raid_disk = -1; 6656 6657 mddev->recovery = 0; 6658 /* flag recovery needed just to double check */ 6659 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6660 sysfs_notify_dirent(mddev->sysfs_action); 6661 md_new_event(mddev); 6662 goto unlock; 6663 } 6664 /* Set RUNNING before clearing NEEDED to avoid 6665 * any transients in the value of "sync_action". 6666 */ 6667 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 6668 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6669 /* Clear some bits that don't mean anything, but 6670 * might be left set 6671 */ 6672 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 6673 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 6674 6675 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 6676 goto unlock; 6677 /* no recovery is running. 6678 * remove any failed drives, then 6679 * add spares if possible. 6680 * Spares are also removed and re-added, to allow 6681 * the personality to fail the re-add.
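		 * The checks below run in priority order: a pending reshape is
		 * resumed first; otherwise, if remove_and_add_spares() found
		 * devices to rebuild onto, a recovery is started; otherwise a
		 * resync is started if recovery_cp is not yet MaxSector; else
		 * there is nothing to do.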
6682 */ 6683 6684 if (mddev->reshape_position != MaxSector) { 6685 if (mddev->pers->check_reshape == NULL || 6686 mddev->pers->check_reshape(mddev) != 0) 6687 /* Cannot proceed */ 6688 goto unlock; 6689 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 6690 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6691 } else if ((spares = remove_and_add_spares(mddev))) { 6692 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 6693 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 6694 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 6695 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6696 } else if (mddev->recovery_cp < MaxSector) { 6697 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 6698 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6699 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 6700 /* nothing to be done ... */ 6701 goto unlock; 6702 6703 if (mddev->pers->sync_request) { 6704 if (spares && mddev->bitmap && ! mddev->bitmap->file) { 6705 /* We are adding a device or devices to an array 6706 * which has the bitmap stored on all devices. 6707 * So make sure all bitmap pages get written 6708 */ 6709 bitmap_write_all(mddev->bitmap); 6710 } 6711 mddev->sync_thread = md_register_thread(md_do_sync, 6712 mddev, 6713 "%s_resync"); 6714 if (!mddev->sync_thread) { 6715 printk(KERN_ERR "%s: could not start resync" 6716 " thread...\n", 6717 mdname(mddev)); 6718 /* leave the spares where they are, it shouldn't hurt */ 6719 mddev->recovery = 0; 6720 } else 6721 md_wakeup_thread(mddev->sync_thread); 6722 sysfs_notify_dirent(mddev->sysfs_action); 6723 md_new_event(mddev); 6724 } 6725 unlock: 6726 if (!mddev->sync_thread) { 6727 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 6728 if (test_and_clear_bit(MD_RECOVERY_RECOVER, 6729 &mddev->recovery)) 6730 if (mddev->sysfs_action) 6731 sysfs_notify_dirent(mddev->sysfs_action); 6732 } 6733 mddev_unlock(mddev); 6734 } 6735 } 6736 6737 void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev) 6738 { 6739 sysfs_notify_dirent(rdev->sysfs_state); 6740 wait_event_timeout(rdev->blocked_wait, 6741 !test_bit(Blocked, &rdev->flags), 6742 msecs_to_jiffies(5000)); 6743 rdev_dec_pending(rdev, mddev); 6744 } 6745 EXPORT_SYMBOL(md_wait_for_blocked_rdev); 6746 6747 static int md_notify_reboot(struct notifier_block *this, 6748 unsigned long code, void *x) 6749 { 6750 struct list_head *tmp; 6751 mddev_t *mddev; 6752 6753 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { 6754 6755 printk(KERN_INFO "md: stopping all md devices.\n"); 6756 6757 for_each_mddev(mddev, tmp) 6758 if (mddev_trylock(mddev)) { 6759 /* Force a switch to readonly even if the array 6760 * appears to still be in use. Hence 6761 * the '100'. 6762 */ 6763 do_md_stop(mddev, 1, 100); 6764 mddev_unlock(mddev); 6765 } 6766 /* 6767 * certain more exotic SCSI devices are known to be 6768 * volatile wrt too early system reboots. While the 6769 * right place to handle this issue is the given 6770 * driver, we do want to have a safe RAID driver ...
6771 */ 6772 mdelay(1000*1); 6773 } 6774 return NOTIFY_DONE; 6775 } 6776 6777 static struct notifier_block md_notifier = { 6778 .notifier_call = md_notify_reboot, 6779 .next = NULL, 6780 .priority = INT_MAX, /* before any real devices */ 6781 }; 6782 6783 static void md_geninit(void) 6784 { 6785 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 6786 6787 proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops); 6788 } 6789 6790 static int __init md_init(void) 6791 { 6792 if (register_blkdev(MD_MAJOR, "md")) 6793 return -1; 6794 if ((mdp_major=register_blkdev(0, "mdp"))<=0) { 6795 unregister_blkdev(MD_MAJOR, "md"); 6796 return -1; 6797 } 6798 blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE, 6799 md_probe, NULL, NULL); 6800 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, 6801 md_probe, NULL, NULL); 6802 6803 register_reboot_notifier(&md_notifier); 6804 raid_table_header = register_sysctl_table(raid_root_table); 6805 6806 md_geninit(); 6807 return 0; 6808 } 6809 6810 6811 #ifndef MODULE 6812 6813 /* 6814 * Searches all registered partitions for autorun RAID arrays 6815 * at boot time. 6816 */ 6817 6818 static LIST_HEAD(all_detected_devices); 6819 struct detected_devices_node { 6820 struct list_head list; 6821 dev_t dev; 6822 }; 6823 6824 void md_autodetect_dev(dev_t dev) 6825 { 6826 struct detected_devices_node *node_detected_dev; 6827 6828 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL); 6829 if (node_detected_dev) { 6830 node_detected_dev->dev = dev; 6831 list_add_tail(&node_detected_dev->list, &all_detected_devices); 6832 } else { 6833 printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed" 6834 ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev)); 6835 } 6836 } 6837 6838 6839 static void autostart_arrays(int part) 6840 { 6841 mdk_rdev_t *rdev; 6842 struct detected_devices_node *node_detected_dev; 6843 dev_t dev; 6844 int i_scanned, i_passed; 6845 6846 i_scanned = 0; 6847 i_passed = 0; 6848 6849 printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 6850 6851 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { 6852 i_scanned++; 6853 node_detected_dev = list_entry(all_detected_devices.next, 6854 struct detected_devices_node, list); 6855 list_del(&node_detected_dev->list); 6856 dev = node_detected_dev->dev; 6857 kfree(node_detected_dev); 6858 rdev = md_import_device(dev,0, 90); 6859 if (IS_ERR(rdev)) 6860 continue; 6861 6862 if (test_bit(Faulty, &rdev->flags)) { 6863 MD_BUG(); 6864 continue; 6865 } 6866 set_bit(AutoDetected, &rdev->flags); 6867 list_add(&rdev->same_set, &pending_raid_disks); 6868 i_passed++; 6869 } 6870 6871 printk(KERN_INFO "md: Scanned %d and added %d devices.\n", 6872 i_scanned, i_passed); 6873 6874 autorun_devices(part); 6875 } 6876 6877 #endif /* !MODULE */ 6878 6879 static __exit void md_exit(void) 6880 { 6881 mddev_t *mddev; 6882 struct list_head *tmp; 6883 6884 blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS); 6885 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); 6886 6887 unregister_blkdev(MD_MAJOR,"md"); 6888 unregister_blkdev(mdp_major, "mdp"); 6889 unregister_reboot_notifier(&md_notifier); 6890 unregister_sysctl_table(raid_table_header); 6891 remove_proc_entry("mdstat", NULL); 6892 for_each_mddev(mddev, tmp) { 6893 export_array(mddev); 6894 mddev->hold_active = 0; 6895 } 6896 } 6897 6898 subsys_initcall(md_init); 6899 module_exit(md_exit) 6900 6901 static int get_ro(char *buffer, struct kernel_param *kp) 6902 { 6903 return sprintf(buffer, "%d", 
start_readonly); 6904 } 6905 static int set_ro(const char *val, struct kernel_param *kp) 6906 { 6907 char *e; 6908 int num = simple_strtoul(val, &e, 10); 6909 if (*val && (*e == '\0' || *e == '\n')) { 6910 start_readonly = num; 6911 return 0; 6912 } 6913 return -EINVAL; 6914 } 6915 6916 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); 6917 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); 6918 6919 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); 6920 6921 EXPORT_SYMBOL(register_md_personality); 6922 EXPORT_SYMBOL(unregister_md_personality); 6923 EXPORT_SYMBOL(md_error); 6924 EXPORT_SYMBOL(md_done_sync); 6925 EXPORT_SYMBOL(md_write_start); 6926 EXPORT_SYMBOL(md_write_end); 6927 EXPORT_SYMBOL(md_register_thread); 6928 EXPORT_SYMBOL(md_unregister_thread); 6929 EXPORT_SYMBOL(md_wakeup_thread); 6930 EXPORT_SYMBOL(md_check_recovery); 6931 MODULE_LICENSE("GPL"); 6932 MODULE_ALIAS("md"); 6933 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); 6934
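/*
 * Illustrative sketch only -- not part of md.c.  A RAID personality module
 * registers itself with the md core roughly as below, filling in the
 * struct mdk_personality callbacks this file invokes (->make_request, ->run,
 * ->stop, ->status, ...).  The names "example", example_personality and the
 * level number are hypothetical.
 *
 *	static struct mdk_personality example_personality = {
 *		.name	= "example",
 *		.level	= -1000,	// hypothetical level number
 *		.owner	= THIS_MODULE,
 *		// mandatory callbacks (->make_request, ->run, ->stop, ...) omitted
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		return register_md_personality(&example_personality);
 *	}
 *
 *	static void __exit example_exit(void)
 *	{
 *		unregister_md_personality(&example_personality);
 *	}
 *
 *	module_init(example_init);
 *	module_exit(example_exit);
 */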