/*
   md.c : Multiple Devices driver for Linux
	  Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/buffer_head.h> /* for invalidate_bdev */
#include <linux/poll.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/slab.h>
#include "md.h"
#include "bitmap.h"

#define DEBUG 0
#define dprintk(x...) ((void)(DEBUG && printk(x)))


#ifndef MODULE
static void autostart_arrays(int part);
#endif

static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

static void md_print_devices(void);

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);

#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }

/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max,
 * or /sys/block/mdX/md/sync_speed_{min,max}
 */

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static inline int speed_min(mddev_t *mddev)
{
	return mddev->sync_speed_min ?
		mddev->sync_speed_min : sysctl_speed_limit_min;
}

static inline int speed_max(mddev_t *mddev)
{
	return mddev->sync_speed_max ?
		mddev->sync_speed_max : sysctl_speed_limit_max;
}
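/*
 * Note (added comment, not from the original source): the per-array
 * sysfs knobs mentioned above take precedence over the global sysctl
 * defaults.  For example, assuming an array md0:
 *
 *	echo 50000 > /sys/block/md0/md/sync_speed_min
 *
 * makes speed_min() return 50000 for that array, while arrays whose
 * sync_speed_min is still 0 keep using sysctl_speed_limit_min.
 */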
static struct ctl_table_header *raid_table_header;

static ctl_table raid_table[] = {
	{
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static ctl_table raid_dir_table[] = {
	{
		.procname	= "raid",
		.maxlen		= 0,
		.mode		= S_IRUGO|S_IXUGO,
		.child		= raid_table,
	},
	{ }
};

static ctl_table raid_root_table[] = {
	{
		.procname	= "dev",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_dir_table,
	},
	{ }
};

static const struct block_device_operations md_fops;

static int start_readonly;

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
void md_new_event(mddev_t *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);

/* Alternate version that can be called from interrupts
 * when calling sysfs_notify isn't needed.
 */
static void md_new_event_inintr(mddev_t *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}

/*
 * Enables iteration over all existing md arrays;
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);


/*
 * iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while owning
 * a reference to the current mddev must mddev_put it.
 */
#define for_each_mddev(mddev,tmp)					\
									\
	for (({ spin_lock(&all_mddevs_lock);				\
		tmp = all_mddevs.next;					\
		mddev = NULL;});					\
	     ({ if (tmp != &all_mddevs)					\
			mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
		spin_unlock(&all_mddevs_lock);				\
		if (mddev) mddev_put(mddev);				\
		mddev = list_entry(tmp, mddev_t, all_mddevs);		\
		tmp != &all_mddevs;});					\
	     ({ spin_lock(&all_mddevs_lock);				\
		tmp = tmp->next;})					\
		)
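/*
 * Usage sketch (added comment, not from the original source): a caller
 * iterates arrays the way md_print_devices(), later in this file, does:
 *
 *	mddev_t *mddev;
 *	struct list_head *tmp;
 *
 *	for_each_mddev(mddev, tmp) {
 *		... use mddev under its temporary reference ...
 *	}
 *
 * A caller that breaks out of the loop early still owns a reference to
 * the current mddev and must drop it with mddev_put().
 */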
/* Rather than calling directly into the personality make_request function,
 * IO requests come here first so that we can check if the device
 * is being suspended pending a reconfiguration.
 * We hold a refcount over the call to ->make_request.  By the time that
 * call has finished, the bio has been linked into some internal structure
 * and so is visible to ->quiesce(), so we don't need the refcount any more.
 */
static int md_make_request(struct request_queue *q, struct bio *bio)
{
	mddev_t *mddev = q->queuedata;
	int rv;
	if (mddev == NULL || mddev->pers == NULL) {
		bio_io_error(bio);
		return 0;
	}
	rcu_read_lock();
	if (mddev->suspended || mddev->barrier) {
		DEFINE_WAIT(__wait);
		for (;;) {
			prepare_to_wait(&mddev->sb_wait, &__wait,
					TASK_UNINTERRUPTIBLE);
			if (!mddev->suspended && !mddev->barrier)
				break;
			rcu_read_unlock();
			schedule();
			rcu_read_lock();
		}
		finish_wait(&mddev->sb_wait, &__wait);
	}
	atomic_inc(&mddev->active_io);
	rcu_read_unlock();
	rv = mddev->pers->make_request(q, bio);
	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
		wake_up(&mddev->sb_wait);

	return rv;
}

static void mddev_suspend(mddev_t *mddev)
{
	BUG_ON(mddev->suspended);
	mddev->suspended = 1;
	synchronize_rcu();
	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
	mddev->pers->quiesce(mddev, 1);
	md_unregister_thread(mddev->thread);
	mddev->thread = NULL;
	/* we now know that no code is executing in the personality module,
	 * except possibly the tail end of a ->bi_end_io function, but that
	 * is certain to complete before the module has a chance to get
	 * unloaded
	 */
}

static void mddev_resume(mddev_t *mddev)
{
	mddev->suspended = 0;
	wake_up(&mddev->sb_wait);
	mddev->pers->quiesce(mddev, 0);
}

int mddev_congested(mddev_t *mddev, int bits)
{
	if (mddev->barrier)
		return 1;
	return mddev->suspended;
}
EXPORT_SYMBOL(mddev_congested);

/*
 * Generic barrier handling for md
 */

#define POST_REQUEST_BARRIER ((void*)1)

static void md_end_barrier(struct bio *bio, int err)
{
	mdk_rdev_t *rdev = bio->bi_private;
	mddev_t *mddev = rdev->mddev;
	if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER)
		set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);

	rdev_dec_pending(rdev, mddev);

	if (atomic_dec_and_test(&mddev->flush_pending)) {
		if (mddev->barrier == POST_REQUEST_BARRIER) {
			/* This was a post-request barrier */
			mddev->barrier = NULL;
			wake_up(&mddev->sb_wait);
		} else
			/* The pre-request barrier has finished */
			schedule_work(&mddev->barrier_work);
	}
	bio_put(bio);
}

static void submit_barriers(mddev_t *mddev)
{
	mdk_rdev_t *rdev;

	rcu_read_lock();
	list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Faulty, &rdev->flags)) {
			/* Take two references, one is dropped
			 * when request finishes, one after
			 * we reclaim rcu_read_lock
			 */
			struct bio *bi;
			atomic_inc(&rdev->nr_pending);
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			bi = bio_alloc(GFP_KERNEL, 0);
			bi->bi_end_io = md_end_barrier;
			bi->bi_private = rdev;
			bi->bi_bdev = rdev->bdev;
			atomic_inc(&mddev->flush_pending);
			submit_bio(WRITE_BARRIER, bi);
			rcu_read_lock();
			rdev_dec_pending(rdev, mddev);
		}
	rcu_read_unlock();
}

static void md_submit_barrier(struct work_struct *ws)
{
	mddev_t *mddev = container_of(ws, mddev_t, barrier_work);
	struct bio *bio = mddev->barrier;

	atomic_set(&mddev->flush_pending, 1);

	if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
		bio_endio(bio, -EOPNOTSUPP);
	else if (bio->bi_size == 0)
		/* an empty barrier - all done */
		bio_endio(bio, 0);
	else {
		bio->bi_rw &= ~(1<<BIO_RW_BARRIER);
		if (mddev->pers->make_request(mddev->queue, bio))
			generic_make_request(bio);
		mddev->barrier = POST_REQUEST_BARRIER;
		submit_barriers(mddev);
	}
	if (atomic_dec_and_test(&mddev->flush_pending)) {
		mddev->barrier = NULL;
		wake_up(&mddev->sb_wait);
	}
}

void md_barrier_request(mddev_t *mddev, struct bio *bio)
{
	spin_lock_irq(&mddev->write_lock);
	wait_event_lock_irq(mddev->sb_wait,
			    !mddev->barrier,
			    mddev->write_lock, /*nothing*/);
	mddev->barrier = bio;
	spin_unlock_irq(&mddev->write_lock);

	atomic_set(&mddev->flush_pending, 1);
	INIT_WORK(&mddev->barrier_work, md_submit_barrier);

	submit_barriers(mddev);

	if (atomic_dec_and_test(&mddev->flush_pending))
		schedule_work(&mddev->barrier_work);
}
EXPORT_SYMBOL(md_barrier_request);
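/*
 * Summary (added comment, not from the original source): in the normal
 * data-carrying case a barrier bio handed to md_barrier_request() goes
 * through two phases.  First, submit_barriers() sends empty
 * WRITE_BARRIER bios to every active component device (the pre-request
 * barrier); when the last of those completes, md_end_barrier()
 * schedules md_submit_barrier().  That worker re-issues the original
 * bio with BIO_RW_BARRIER stripped, sets mddev->barrier to
 * POST_REQUEST_BARRIER and runs submit_barriers() again to flush the
 * data just written.  Only when that second wave completes is
 * mddev->barrier cleared and sb_wait woken.
 */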
static inline mddev_t *mddev_get(mddev_t *mddev)
{
	atomic_inc(&mddev->active);
	return mddev;
}

static void mddev_delayed_delete(struct work_struct *ws);

static void mddev_put(mddev_t *mddev)
{
	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
		return;
	if (!mddev->raid_disks && list_empty(&mddev->disks) &&
	    mddev->ctime == 0 && !mddev->hold_active) {
		/* Array is not configured at all, and not held active,
		 * so destroy it */
		list_del(&mddev->all_mddevs);
		if (mddev->gendisk) {
			/* we did a probe so need to clean up.
			 * Call schedule_work inside the spinlock
			 * so that flush_scheduled_work() after
			 * mddev_find will succeed in waiting for the
			 * work to be done.
			 */
			INIT_WORK(&mddev->del_work, mddev_delayed_delete);
			schedule_work(&mddev->del_work);
		} else
			kfree(mddev);
	}
	spin_unlock(&all_mddevs_lock);
}

static mddev_t * mddev_find(dev_t unit)
{
	mddev_t *mddev, *new = NULL;

 retry:
	spin_lock(&all_mddevs_lock);

	if (unit) {
		list_for_each_entry(mddev, &all_mddevs, all_mddevs)
			if (mddev->unit == unit) {
				mddev_get(mddev);
				spin_unlock(&all_mddevs_lock);
				kfree(new);
				return mddev;
			}

		if (new) {
			list_add(&new->all_mddevs, &all_mddevs);
			spin_unlock(&all_mddevs_lock);
			new->hold_active = UNTIL_IOCTL;
			return new;
		}
	} else if (new) {
		/* find an unused unit number */
		static int next_minor = 512;
		int start = next_minor;
		int is_free = 0;
		int dev = 0;
		while (!is_free) {
			dev = MKDEV(MD_MAJOR, next_minor);
			next_minor++;
			if (next_minor > MINORMASK)
				next_minor = 0;
			if (next_minor == start) {
				/* Oh dear, all in use. */
				spin_unlock(&all_mddevs_lock);
				kfree(new);
				return NULL;
			}

			is_free = 1;
			list_for_each_entry(mddev, &all_mddevs, all_mddevs)
				if (mddev->unit == dev) {
					is_free = 0;
					break;
				}
		}
		new->unit = dev;
		new->md_minor = MINOR(dev);
		new->hold_active = UNTIL_STOP;
		list_add(&new->all_mddevs, &all_mddevs);
		spin_unlock(&all_mddevs_lock);
		return new;
	}
	spin_unlock(&all_mddevs_lock);

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return NULL;

	new->unit = unit;
	if (MAJOR(unit) == MD_MAJOR)
		new->md_minor = MINOR(unit);
	else
		new->md_minor = MINOR(unit) >> MdpMinorShift;

	mutex_init(&new->open_mutex);
	mutex_init(&new->reconfig_mutex);
	mutex_init(&new->bitmap_info.mutex);
	INIT_LIST_HEAD(&new->disks);
	INIT_LIST_HEAD(&new->all_mddevs);
	init_timer(&new->safemode_timer);
	atomic_set(&new->active, 1);
	atomic_set(&new->openers, 0);
	atomic_set(&new->active_io, 0);
	spin_lock_init(&new->write_lock);
	atomic_set(&new->flush_pending, 0);
	init_waitqueue_head(&new->sb_wait);
	init_waitqueue_head(&new->recovery_wait);
	new->reshape_position = MaxSector;
	new->resync_min = 0;
	new->resync_max = MaxSector;
	new->level = LEVEL_NONE;

	goto retry;
}

static inline int mddev_lock(mddev_t * mddev)
{
	return mutex_lock_interruptible(&mddev->reconfig_mutex);
}

static inline int mddev_is_locked(mddev_t *mddev)
{
	return mutex_is_locked(&mddev->reconfig_mutex);
}

static inline int mddev_trylock(mddev_t * mddev)
{
	return mutex_trylock(&mddev->reconfig_mutex);
}

static inline void mddev_unlock(mddev_t * mddev)
{
	mutex_unlock(&mddev->reconfig_mutex);

	md_wakeup_thread(mddev->thread);
}

static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
{
	mdk_rdev_t *rdev;

	list_for_each_entry(rdev, &mddev->disks, same_set)
		if (rdev->desc_nr == nr)
			return rdev;

	return NULL;
}

static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
{
	mdk_rdev_t *rdev;

	list_for_each_entry(rdev, &mddev->disks, same_set)
		if (rdev->bdev->bd_dev == dev)
			return rdev;

	return NULL;
}

static struct mdk_personality *find_pers(int level, char *clevel)
{
	struct mdk_personality *pers;
	list_for_each_entry(pers, &pers_list, list) {
		if (level != LEVEL_NONE && pers->level == level)
			return pers;
		if (strcmp(pers->name, clevel)==0)
			return pers;
	}
	return NULL;
}

/* return the offset of the super block in 512byte sectors */
static inline sector_t calc_dev_sboffset(struct block_device *bdev)
{
	sector_t num_sectors = bdev->bd_inode->i_size / 512;
	return MD_NEW_SIZE_SECTORS(num_sectors);
}
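/*
 * Worked example (added comment; MD_NEW_SIZE_SECTORS comes from
 * linux/raid/md_p.h, which rounds down to a 64KiB boundary and steps
 * back one 64KiB block): for a device of 1000000 sectors,
 *
 *	(1000000 & ~127) - 128 = 999936 - 128 = 999808
 *
 * so a 0.90 superblock would live at sector 999808, i.e. in the last
 * fully-aligned 64KiB of the device.
 */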
static int alloc_disk_sb(mdk_rdev_t * rdev)
{
	if (rdev->sb_page)
		MD_BUG();

	rdev->sb_page = alloc_page(GFP_KERNEL);
	if (!rdev->sb_page) {
		printk(KERN_ALERT "md: out of memory.\n");
		return -ENOMEM;
	}

	return 0;
}

static void free_disk_sb(mdk_rdev_t * rdev)
{
	if (rdev->sb_page) {
		put_page(rdev->sb_page);
		rdev->sb_loaded = 0;
		rdev->sb_page = NULL;
		rdev->sb_start = 0;
		rdev->sectors = 0;
	}
}


static void super_written(struct bio *bio, int error)
{
	mdk_rdev_t *rdev = bio->bi_private;
	mddev_t *mddev = rdev->mddev;

	if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
		printk("md: super_written gets error=%d, uptodate=%d\n",
		       error, test_bit(BIO_UPTODATE, &bio->bi_flags));
		WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
		md_error(mddev, rdev);
	}

	if (atomic_dec_and_test(&mddev->pending_writes))
		wake_up(&mddev->sb_wait);
	bio_put(bio);
}

static void super_written_barrier(struct bio *bio, int error)
{
	struct bio *bio2 = bio->bi_private;
	mdk_rdev_t *rdev = bio2->bi_private;
	mddev_t *mddev = rdev->mddev;

	if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
	    error == -EOPNOTSUPP) {
		unsigned long flags;
		/* barriers don't appear to be supported :-( */
		set_bit(BarriersNotsupp, &rdev->flags);
		mddev->barriers_work = 0;
		spin_lock_irqsave(&mddev->write_lock, flags);
		bio2->bi_next = mddev->biolist;
		mddev->biolist = bio2;
		spin_unlock_irqrestore(&mddev->write_lock, flags);
		wake_up(&mddev->sb_wait);
		bio_put(bio);
	} else {
		bio_put(bio2);
		bio->bi_private = rdev;
		super_written(bio, error);
	}
}

void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
		    sector_t sector, int size, struct page *page)
{
	/* write first size bytes of page to sector of rdev
	 * Increment mddev->pending_writes before returning
	 * and decrement it on completion, waking up sb_wait
	 * if zero is reached.
	 * If an error occurred, call md_error
	 *
	 * As we might need to resubmit the request if BIO_RW_BARRIER
	 * causes ENOTSUPP, we allocate a spare bio...
	 */
	struct bio *bio = bio_alloc(GFP_NOIO, 1);
	int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG);

	bio->bi_bdev = rdev->bdev;
	bio->bi_sector = sector;
	bio_add_page(bio, page, size, 0);
	bio->bi_private = rdev;
	bio->bi_end_io = super_written;
	bio->bi_rw = rw;

	atomic_inc(&mddev->pending_writes);
	if (!test_bit(BarriersNotsupp, &rdev->flags)) {
		struct bio *rbio;
		rw |= (1<<BIO_RW_BARRIER);
		rbio = bio_clone(bio, GFP_NOIO);
		rbio->bi_private = bio;
		rbio->bi_end_io = super_written_barrier;
		submit_bio(rw, rbio);
	} else
		submit_bio(rw, bio);
}
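/*
 * Usage sketch (added comment, not from the original source): callers
 * pair md_super_write() with md_super_wait(), as the rdev_size_change
 * methods later in this file do:
 *
 *	md_super_write(rdev->mddev, rdev, rdev->sb_start,
 *		       rdev->sb_size, rdev->sb_page);
 *	md_super_wait(rdev->mddev);
 *
 * md_super_wait() both waits for pending_writes to drain and re-issues
 * any writes that were queued on mddev->biolist because the device
 * rejected the barrier variant with -EOPNOTSUPP.
 */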
void md_super_wait(mddev_t *mddev)
{
	/* wait for all superblock writes that were scheduled to complete.
	 * if any had to be retried (due to BARRIER problems), retry them
	 */
	DEFINE_WAIT(wq);
	for(;;) {
		prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
		if (atomic_read(&mddev->pending_writes)==0)
			break;
		while (mddev->biolist) {
			struct bio *bio;
			spin_lock_irq(&mddev->write_lock);
			bio = mddev->biolist;
			mddev->biolist = bio->bi_next;
			bio->bi_next = NULL;
			spin_unlock_irq(&mddev->write_lock);
			submit_bio(bio->bi_rw, bio);
		}
		schedule();
	}
	finish_wait(&mddev->sb_wait, &wq);
}

static void bi_complete(struct bio *bio, int error)
{
	complete((struct completion*)bio->bi_private);
}

int sync_page_io(struct block_device *bdev, sector_t sector, int size,
		 struct page *page, int rw)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);
	struct completion event;
	int ret;

	rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio_add_page(bio, page, size, 0);
	init_completion(&event);
	bio->bi_private = &event;
	bio->bi_end_io = bi_complete;
	submit_bio(rw, bio);
	wait_for_completion(&event);

	ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
	bio_put(bio);
	return ret;
}
EXPORT_SYMBOL_GPL(sync_page_io);

static int read_disk_sb(mdk_rdev_t * rdev, int size)
{
	char b[BDEVNAME_SIZE];
	if (!rdev->sb_page) {
		MD_BUG();
		return -EINVAL;
	}
	if (rdev->sb_loaded)
		return 0;


	if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ))
		goto fail;
	rdev->sb_loaded = 1;
	return 0;

fail:
	printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
		bdevname(rdev->bdev,b));
	return -EINVAL;
}

static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	return	sb1->set_uuid0 == sb2->set_uuid0 &&
		sb1->set_uuid1 == sb2->set_uuid1 &&
		sb1->set_uuid2 == sb2->set_uuid2 &&
		sb1->set_uuid3 == sb2->set_uuid3;
}

static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	int ret;
	mdp_super_t *tmp1, *tmp2;

	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);

	if (!tmp1 || !tmp2) {
		ret = 0;
		printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
		goto abort;
	}

	*tmp1 = *sb1;
	*tmp2 = *sb2;

	/*
	 * nr_disks is not constant
	 */
	tmp1->nr_disks = 0;
	tmp2->nr_disks = 0;

	ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
abort:
	kfree(tmp1);
	kfree(tmp2);
	return ret;
}


static u32 md_csum_fold(u32 csum)
{
	csum = (csum & 0xffff) + (csum >> 16);
	return (csum & 0xffff) + (csum >> 16);
}

static unsigned int calc_sb_csum(mdp_super_t * sb)
{
	u64 newcsum = 0;
	u32 *sb32 = (u32*)sb;
	int i;
	unsigned int disk_csum, csum;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;

	for (i = 0; i < MD_SB_BYTES/4 ; i++)
		newcsum += sb32[i];
	csum = (newcsum & 0xffffffff) + (newcsum>>32);


#ifdef CONFIG_ALPHA
	/* This used to use csum_partial, which was wrong for several
	 * reasons including that different results are returned on
	 * different architectures.  It isn't critical that we get exactly
	 * the same return value as before (we always csum_fold before
	 * testing, and that removes any differences).  However as we
	 * know that csum_partial always returned a 16bit value on
	 * alphas, do a fold to maximise conformity to previous behaviour.
	 */
	sb->sb_csum = md_csum_fold(disk_csum);
#else
	sb->sb_csum = disk_csum;
#endif
	return csum;
}
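/*
 * Worked example (added comment, not from the original source):
 * md_csum_fold(0x12345678) computes 0x5678 + 0x1234 = 0x68ac in the
 * first round; the second round adds a zero high half, so the result
 * is 0x68ac.  The double fold matters when the first addition itself
 * carries into bit 16: md_csum_fold(0xffffffff) gives
 * 0xffff + 0xffff = 0x1fffe, then 0xfffe + 0x1 = 0xffff.
 */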
/*
 * Handle superblock details.
 * We want to be able to handle multiple superblock formats
 * so we have a common interface to them all, and an array of
 * different handlers.
 * We rely on user-space to write the initial superblock, and support
 * reading and updating of superblocks.
 * Interface methods are:
 *   int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
 *      loads and validates a superblock on dev.
 *      if refdev != NULL, compare superblocks on both devices
 *    Return:
 *      0 - dev has a superblock that is compatible with refdev
 *      1 - dev has a superblock that is compatible and newer than refdev
 *          so dev should be used as the refdev in future
 *     -EINVAL superblock incompatible or invalid
 *     -othererror e.g. -EIO
 *
 *   int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
 *      Verify that dev is acceptable into mddev.
 *       The first time, mddev->raid_disks will be 0, and data from
 *       dev should be merged in.  Subsequent calls check that dev
 *       is new enough.  Return 0 or -EINVAL
 *
 *   void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
 *     Update the superblock for rdev with data in mddev
 *     This does not write to disc.
 *
 */

struct super_type  {
	char		    *name;
	struct module	    *owner;
	int		    (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev,
					  int minor_version);
	int		    (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
	void		    (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
	unsigned long long  (*rdev_size_change)(mdk_rdev_t *rdev,
						sector_t num_sectors);
};

/*
 * Check that the given mddev has no bitmap.
 *
 * This function is called from the run method of all personalities that do not
 * support bitmaps. It prints an error message and returns non-zero if mddev
 * has a bitmap. Otherwise, it returns 0.
 *
 */
int md_check_no_bitmap(mddev_t *mddev)
{
	if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
		return 0;
	printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
		mdname(mddev), mddev->pers->name);
	return 1;
}
EXPORT_SYMBOL(md_check_no_bitmap);
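/*
 * Usage sketch (added comment, not from the original source): a
 * personality that cannot honour a write-intent bitmap typically
 * rejects the array early in its run() method:
 *
 *	static int run(mddev_t *mddev)
 *	{
 *		if (md_check_no_bitmap(mddev))
 *			return -EINVAL;
 *		...
 *	}
 */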
/*
 * load_super for 0.90.0
 */
static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
{
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	mdp_super_t *sb;
	int ret;

	/*
	 * Calculate the position of the superblock (512byte sectors),
	 * it's at the end of the disk.
	 *
	 * It also happens to be a multiple of 4Kb.
	 */
	rdev->sb_start = calc_dev_sboffset(rdev->bdev);

	ret = read_disk_sb(rdev, MD_SB_BYTES);
	if (ret) return ret;

	ret = -EINVAL;

	bdevname(rdev->bdev, b);
	sb = (mdp_super_t*)page_address(rdev->sb_page);

	if (sb->md_magic != MD_SB_MAGIC) {
		printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
		       b);
		goto abort;
	}

	if (sb->major_version != 0 ||
	    sb->minor_version < 90 ||
	    sb->minor_version > 91) {
		printk(KERN_WARNING "Bad version number %d.%d on %s\n",
			sb->major_version, sb->minor_version,
			b);
		goto abort;
	}

	if (sb->raid_disks <= 0)
		goto abort;

	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
		printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
			b);
		goto abort;
	}

	rdev->preferred_minor = sb->md_minor;
	rdev->data_offset = 0;
	rdev->sb_size = MD_SB_BYTES;

	if (sb->level == LEVEL_MULTIPATH)
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = sb->this_disk.number;

	if (!refdev) {
		ret = 1;
	} else {
		__u64 ev1, ev2;
		mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
		if (!uuid_equal(refsb, sb)) {
			printk(KERN_WARNING "md: %s has different UUID to %s\n",
				b, bdevname(refdev->bdev,b2));
			goto abort;
		}
		if (!sb_equal(refsb, sb)) {
			printk(KERN_WARNING "md: %s has same UUID"
			       " but different superblock to %s\n",
			       b, bdevname(refdev->bdev, b2));
			goto abort;
		}
		ev1 = md_event(sb);
		ev2 = md_event(refsb);
		if (ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	rdev->sectors = rdev->sb_start;

	if (rdev->sectors < sb->size * 2 && sb->level > 1)
		/* "this cannot possibly happen" ... */
		ret = -EINVAL;

 abort:
	return ret;
}

/*
 * validate_super for 0.90.0
 */
static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
	mdp_disk_t *desc;
	mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
	__u64 ev1 = md_event(sb);

	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);
	clear_bit(BarriersNotsupp, &rdev->flags);

	if (mddev->raid_disks == 0) {
		mddev->major_version = 0;
		mddev->minor_version = sb->minor_version;
		mddev->patch_version = sb->patch_version;
		mddev->external = 0;
		mddev->chunk_sectors = sb->chunk_size >> 9;
		mddev->ctime = sb->ctime;
		mddev->utime = sb->utime;
		mddev->level = sb->level;
		mddev->clevel[0] = 0;
		mddev->layout = sb->layout;
		mddev->raid_disks = sb->raid_disks;
		mddev->dev_sectors = sb->size * 2;
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;

		if (mddev->minor_version >= 91) {
			mddev->reshape_position = sb->reshape_position;
			mddev->delta_disks = sb->delta_disks;
			mddev->new_level = sb->new_level;
			mddev->new_layout = sb->new_layout;
			mddev->new_chunk_sectors = sb->new_chunk >> 9;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}

		if (sb->state & (1<<MD_SB_CLEAN))
			mddev->recovery_cp = MaxSector;
		else {
			if (sb->events_hi == sb->cp_events_hi &&
			    sb->events_lo == sb->cp_events_lo) {
				mddev->recovery_cp = sb->recovery_cp;
			} else
				mddev->recovery_cp = 0;
		}

		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);

		mddev->max_disks = MD_SB_DISKS;

		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
		    mddev->bitmap_info.file == NULL)
			mddev->bitmap_info.offset =
				mddev->bitmap_info.default_offset;

	} else if (mddev->pers == NULL) {
		/* Insist on a good event counter while assembling */
		++ev1;
		if (ev1 < mddev->events)
			return -EINVAL;
	} else if (mddev->bitmap) {
		/* if adding to array with a bitmap, then we can accept an
		 * older device ... but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}

	if (mddev->level != LEVEL_MULTIPATH) {
		desc = sb->disks + rdev->desc_nr;

		if (desc->state & (1<<MD_DISK_FAULTY))
			set_bit(Faulty, &rdev->flags);
		else if (desc->state & (1<<MD_DISK_SYNC) /* &&
			    desc->raid_disk < mddev->raid_disks */) {
			set_bit(In_sync, &rdev->flags);
			rdev->raid_disk = desc->raid_disk;
		} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
			/* active but not in sync implies recovery up to
			 * reshape position.  We don't know exactly where
			 * that is, so set to zero for now */
			if (mddev->minor_version >= 91) {
				rdev->recovery_offset = 0;
				rdev->raid_disk = desc->raid_disk;
			}
		}
		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
	} else /* MULTIPATH are always insync */
		set_bit(In_sync, &rdev->flags);
	return 0;
}

/*
 * sync_super for 0.90.0
 */
static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
{
	mdp_super_t *sb;
	mdk_rdev_t *rdev2;
	int next_spare = mddev->raid_disks;


	/* make rdev->sb match mddev data..
	 *
	 * 1/ zero out disks
	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
	 * 3/ any empty disks < next_spare become removed
	 *
	 * disks[0] gets initialised to REMOVED because
	 * we cannot be sure from other fields if it has
	 * been initialised or not.
	 */
	int i;
	int active=0, working=0,failed=0,spare=0,nr_disks=0;

	rdev->sb_size = MD_SB_BYTES;

	sb = (mdp_super_t*)page_address(rdev->sb_page);

	memset(sb, 0, sizeof(*sb));

	sb->md_magic = MD_SB_MAGIC;
	sb->major_version = mddev->major_version;
	sb->patch_version = mddev->patch_version;
	sb->gvalid_words  = 0; /* ignored */
	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
	memcpy(&sb->set_uuid3, mddev->uuid+12,4);

	sb->ctime = mddev->ctime;
	sb->level = mddev->level;
	sb->size = mddev->dev_sectors / 2;
	sb->raid_disks = mddev->raid_disks;
	sb->md_minor = mddev->md_minor;
	sb->not_persistent = 0;
	sb->utime = mddev->utime;
	sb->state = 0;
	sb->events_hi = (mddev->events>>32);
	sb->events_lo = (u32)mddev->events;

	if (mddev->reshape_position == MaxSector)
		sb->minor_version = 90;
	else {
		sb->minor_version = 91;
		sb->reshape_position = mddev->reshape_position;
		sb->new_level = mddev->new_level;
		sb->delta_disks = mddev->delta_disks;
		sb->new_layout = mddev->new_layout;
		sb->new_chunk = mddev->new_chunk_sectors << 9;
	}
	mddev->minor_version = sb->minor_version;
	if (mddev->in_sync)
	{
		sb->recovery_cp = mddev->recovery_cp;
		sb->cp_events_hi = (mddev->events>>32);
		sb->cp_events_lo = (u32)mddev->events;
		if (mddev->recovery_cp == MaxSector)
			sb->state = (1<< MD_SB_CLEAN);
	} else
		sb->recovery_cp = 0;

	sb->layout = mddev->layout;
	sb->chunk_size = mddev->chunk_sectors << 9;

	if (mddev->bitmap && mddev->bitmap_info.file == NULL)
		sb->state |= (1<<MD_SB_BITMAP_PRESENT);

	sb->disks[0].state = (1<<MD_DISK_REMOVED);
	list_for_each_entry(rdev2, &mddev->disks, same_set) {
		mdp_disk_t *d;
		int desc_nr;
		int is_active = test_bit(In_sync, &rdev2->flags);

		if (rdev2->raid_disk >= 0 &&
		    sb->minor_version >= 91)
			/* we have nowhere to store the recovery_offset,
			 * but if it is not below the reshape_position,
			 * we can piggy-back on that.
			 */
			is_active = 1;
		if (rdev2->raid_disk < 0 ||
		    test_bit(Faulty, &rdev2->flags))
			is_active = 0;
		if (is_active)
			desc_nr = rdev2->raid_disk;
		else
			desc_nr = next_spare++;
		rdev2->desc_nr = desc_nr;
		d = &sb->disks[rdev2->desc_nr];
		nr_disks++;
		d->number = rdev2->desc_nr;
		d->major = MAJOR(rdev2->bdev->bd_dev);
		d->minor = MINOR(rdev2->bdev->bd_dev);
		if (is_active)
			d->raid_disk = rdev2->raid_disk;
		else
			d->raid_disk = rdev2->desc_nr; /* compatibility */
		if (test_bit(Faulty, &rdev2->flags))
			d->state = (1<<MD_DISK_FAULTY);
		else if (is_active) {
			d->state = (1<<MD_DISK_ACTIVE);
			if (test_bit(In_sync, &rdev2->flags))
				d->state |= (1<<MD_DISK_SYNC);
			active++;
			working++;
		} else {
			d->state = 0;
			spare++;
			working++;
		}
		if (test_bit(WriteMostly, &rdev2->flags))
			d->state |= (1<<MD_DISK_WRITEMOSTLY);
	}
	/* now set the "removed" and "faulty" bits on any missing devices */
	for (i=0 ; i < mddev->raid_disks ; i++) {
		mdp_disk_t *d = &sb->disks[i];
		if (d->state == 0 && d->number == 0) {
			d->number = i;
			d->raid_disk = i;
			d->state = (1<<MD_DISK_REMOVED);
			d->state |= (1<<MD_DISK_FAULTY);
			failed++;
		}
	}
	sb->nr_disks = nr_disks;
	sb->active_disks = active;
	sb->working_disks = working;
	sb->failed_disks = failed;
	sb->spare_disks = spare;

	sb->this_disk = sb->disks[rdev->desc_nr];
	sb->sb_csum = calc_sb_csum(sb);
}

/*
 * rdev_size_change for 0.90.0
 */
static unsigned long long
super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
{
	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
		return 0; /* component must fit device */
	if (rdev->mddev->bitmap_info.offset)
		return 0; /* can't move bitmap */
	rdev->sb_start = calc_dev_sboffset(rdev->bdev);
	if (!num_sectors || num_sectors > rdev->sb_start)
		num_sectors = rdev->sb_start;
	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
		       rdev->sb_page);
	md_super_wait(rdev->mddev);
	return num_sectors / 2; /* kB for sysfs */
}


/*
 * version 1 superblock
 */

static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
{
	__le32 disk_csum;
	u32 csum;
	unsigned long long newcsum;
	int size = 256 + le32_to_cpu(sb->max_dev)*2;
	__le32 *isuper = (__le32*)sb;
	int i;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
	newcsum = 0;
	for (i=0; size>=4; size -= 4 )
		newcsum += le32_to_cpu(*isuper++);

	if (size == 2)
		newcsum += le16_to_cpu(*(__le16*) isuper);

	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
	sb->sb_csum = disk_csum;
	return cpu_to_le32(csum);
}

static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
{
	struct mdp_superblock_1 *sb;
	int ret;
	sector_t sb_start;
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	int bmask;

	/*
	 * Calculate the position of the superblock in 512byte sectors.
	 * It is always aligned to a 4K boundary and
	 * depending on minor_version, it can be:
	 * 0: At least 8K, but less than 12K, from end of device
	 * 1: At start of device
	 * 2: 4K from start of device.
	 */
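	/*
	 * Worked example (added comment, not from the original source):
	 * for minor_version 0 on a device of 1000005 sectors, the code
	 * below computes
	 *
	 *	sb_start = (1000005 - 16) & ~7 = 999984
	 *
	 * which is 21 sectors (10.5K) from the end, rounded down to a
	 * 4K boundary, satisfying "at least 8K, but less than 12K,
	 * from end of device".
	 */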
	switch(minor_version) {
	case 0:
		sb_start = rdev->bdev->bd_inode->i_size >> 9;
		sb_start -= 8*2;
		sb_start &= ~(sector_t)(4*2-1);
		break;
	case 1:
		sb_start = 0;
		break;
	case 2:
		sb_start = 8;
		break;
	default:
		return -EINVAL;
	}
	rdev->sb_start = sb_start;

	/* superblock is rarely larger than 1K, but it can be larger,
	 * and it is safe to read 4k, so we do that
	 */
	ret = read_disk_sb(rdev, 4096);
	if (ret) return ret;


	sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);

	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
	    sb->major_version != cpu_to_le32(1) ||
	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
		return -EINVAL;

	if (calc_sb_1_csum(sb) != sb->sb_csum) {
		printk("md: invalid superblock checksum on %s\n",
			bdevname(rdev->bdev,b));
		return -EINVAL;
	}
	if (le64_to_cpu(sb->data_size) < 10) {
		printk("md: data_size too small on %s\n",
		       bdevname(rdev->bdev,b));
		return -EINVAL;
	}

	rdev->preferred_minor = 0xffff;
	rdev->data_offset = le64_to_cpu(sb->data_offset);
	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));

	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
	if (rdev->sb_size & bmask)
		rdev->sb_size = (rdev->sb_size | bmask) + 1;

	if (minor_version
	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;

	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = le32_to_cpu(sb->dev_number);

	if (!refdev) {
		ret = 1;
	} else {
		__u64 ev1, ev2;
		struct mdp_superblock_1 *refsb =
			(struct mdp_superblock_1*)page_address(refdev->sb_page);

		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
		    sb->level != refsb->level ||
		    sb->layout != refsb->layout ||
		    sb->chunksize != refsb->chunksize) {
			printk(KERN_WARNING "md: %s has strangely different"
				" superblock to %s\n",
				bdevname(rdev->bdev,b),
				bdevname(refdev->bdev,b2));
			return -EINVAL;
		}
		ev1 = le64_to_cpu(sb->events);
		ev2 = le64_to_cpu(refsb->events);

		if (ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	if (minor_version)
		rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) -
			le64_to_cpu(sb->data_offset);
	else
		rdev->sectors = rdev->sb_start;
	if (rdev->sectors < le64_to_cpu(sb->data_size))
		return -EINVAL;
	rdev->sectors = le64_to_cpu(sb->data_size);
	if (le64_to_cpu(sb->size) > rdev->sectors)
		return -EINVAL;
	return ret;
}

static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
	struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
	__u64 ev1 = le64_to_cpu(sb->events);

	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);
	clear_bit(BarriersNotsupp, &rdev->flags);

	if (mddev->raid_disks == 0) {
		mddev->major_version = 1;
		mddev->patch_version = 0;
		mddev->external = 0;
		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
		mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
		mddev->level = le32_to_cpu(sb->level);
		mddev->clevel[0] = 0;
		mddev->layout = le32_to_cpu(sb->layout);
		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
		mddev->dev_sectors = le64_to_cpu(sb->size);
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.default_offset = 1024 >> 9;

		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
		memcpy(mddev->uuid, sb->set_uuid, 16);

		mddev->max_disks =  (4096-256)/2;

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
		    mddev->bitmap_info.file == NULL )
			mddev->bitmap_info.offset =
				(__s32)le32_to_cpu(sb->bitmap_offset);

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
			mddev->new_level = le32_to_cpu(sb->new_level);
			mddev->new_layout = le32_to_cpu(sb->new_layout);
			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}

	} else if (mddev->pers == NULL) {
		/* Insist on a good event counter while assembling */
		++ev1;
		if (ev1 < mddev->events)
			return -EINVAL;
	} else if (mddev->bitmap) {
		/* If adding to array with a bitmap, then we can accept an
		 * older device, but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}
	if (mddev->level != LEVEL_MULTIPATH) {
		int role;
		if (rdev->desc_nr < 0 ||
		    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
			role = 0xffff;
			rdev->desc_nr = -1;
		} else
			role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
		switch(role) {
		case 0xffff: /* spare */
			break;
		case 0xfffe: /* faulty */
			set_bit(Faulty, &rdev->flags);
			break;
		default:
			if ((le32_to_cpu(sb->feature_map) &
			     MD_FEATURE_RECOVERY_OFFSET))
				rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
			else
				set_bit(In_sync, &rdev->flags);
			rdev->raid_disk = role;
			break;
		}
		if (sb->devflags & WriteMostly1)
			set_bit(WriteMostly, &rdev->flags);
	} else /* MULTIPATH are always insync */
		set_bit(In_sync, &rdev->flags);

	return 0;
}
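/*
 * Note (added comment, not from the original source): the v1 dev_roles
 * table uses two reserved 16-bit values, as the switch above and
 * super_1_sync() below rely on:
 *
 *	0xffff  device is a spare
 *	0xfffe  device is faulty (or the slot is unused)
 *	other   the value is the device's raid_disk index
 */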
static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
{
	struct mdp_superblock_1 *sb;
	mdk_rdev_t *rdev2;
	int max_dev, i;
	/* make rdev->sb match mddev and rdev data. */

	sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);

	sb->feature_map = 0;
	sb->pad0 = 0;
	sb->recovery_offset = cpu_to_le64(0);
	memset(sb->pad1, 0, sizeof(sb->pad1));
	memset(sb->pad2, 0, sizeof(sb->pad2));
	memset(sb->pad3, 0, sizeof(sb->pad3));

	sb->utime = cpu_to_le64((__u64)mddev->utime);
	sb->events = cpu_to_le64(mddev->events);
	if (mddev->in_sync)
		sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
	else
		sb->resync_offset = cpu_to_le64(0);

	sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));

	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
	sb->size = cpu_to_le64(mddev->dev_sectors);
	sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
	sb->level = cpu_to_le32(mddev->level);
	sb->layout = cpu_to_le32(mddev->layout);

	if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
	}

	if (rdev->raid_disk >= 0 &&
	    !test_bit(In_sync, &rdev->flags)) {
		sb->feature_map |=
			cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
		sb->recovery_offset =
			cpu_to_le64(rdev->recovery_offset);
	}

	if (mddev->reshape_position != MaxSector) {
		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
		sb->reshape_position = cpu_to_le64(mddev->reshape_position);
		sb->new_layout = cpu_to_le32(mddev->new_layout);
		sb->delta_disks = cpu_to_le32(mddev->delta_disks);
		sb->new_level = cpu_to_le32(mddev->new_level);
		sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
	}

	max_dev = 0;
	list_for_each_entry(rdev2, &mddev->disks, same_set)
		if (rdev2->desc_nr+1 > max_dev)
			max_dev = rdev2->desc_nr+1;

	if (max_dev > le32_to_cpu(sb->max_dev)) {
		int bmask;
		sb->max_dev = cpu_to_le32(max_dev);
		rdev->sb_size = max_dev * 2 + 256;
		bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
		if (rdev->sb_size & bmask)
			rdev->sb_size = (rdev->sb_size | bmask) + 1;
	}
	for (i=0; i<max_dev;i++)
		sb->dev_roles[i] = cpu_to_le16(0xfffe);

	list_for_each_entry(rdev2, &mddev->disks, same_set) {
		i = rdev2->desc_nr;
		if (test_bit(Faulty, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(0xfffe);
		else if (test_bit(In_sync, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
		else if (rdev2->raid_disk >= 0)
			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
		else
			sb->dev_roles[i] = cpu_to_le16(0xffff);
	}

	sb->sb_csum = calc_sb_1_csum(sb);
}

static unsigned long long
super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
{
	struct mdp_superblock_1 *sb;
	sector_t max_sectors;
	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
		return 0; /* component must fit device */
	if (rdev->sb_start < rdev->data_offset) {
		/* minor versions 1 and 2; superblock before data */
		max_sectors = rdev->bdev->bd_inode->i_size >> 9;
		max_sectors -= rdev->data_offset;
		if (!num_sectors || num_sectors > max_sectors)
			num_sectors = max_sectors;
	} else if (rdev->mddev->bitmap_info.offset) {
		/* minor version 0 with bitmap we can't move */
		return 0;
	} else {
		/* minor version 0; superblock after data */
		sector_t sb_start;
		sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
		sb_start &= ~(sector_t)(4*2 - 1);
		max_sectors = rdev->sectors + sb_start - rdev->sb_start;
		if (!num_sectors || num_sectors > max_sectors)
			num_sectors = max_sectors;
		rdev->sb_start = sb_start;
	}
	sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page);
	sb->data_size = cpu_to_le64(num_sectors);
	sb->super_offset = rdev->sb_start;
	sb->sb_csum = calc_sb_1_csum(sb);
	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
		       rdev->sb_page);
	md_super_wait(rdev->mddev);
	return num_sectors / 2; /* kB for sysfs */
}

static struct super_type super_types[] = {
	[0] = {
		.name	= "0.90.0",
		.owner	= THIS_MODULE,
		.load_super	    = super_90_load,
		.validate_super	    = super_90_validate,
		.sync_super	    = super_90_sync,
		.rdev_size_change   = super_90_rdev_size_change,
	},
	[1] = {
		.name	= "md-1",
		.owner	= THIS_MODULE,
		.load_super	    = super_1_load,
		.validate_super	    = super_1_validate,
		.sync_super	    = super_1_sync,
		.rdev_size_change   = super_1_rdev_size_change,
	},
};
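/*
 * Usage sketch (added comment, not from the original source): handlers
 * are dispatched by indexing this array with mddev->major_version, as
 * sync_sbs() later in this file does:
 *
 *	super_types[mddev->major_version].sync_super(mddev, rdev);
 *
 * so major_version doubles as the slot number: 0 selects the 0.90.0
 * handlers, 1 the version-1 handlers.
 */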
static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
{
	mdk_rdev_t *rdev, *rdev2;

	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev1)
		rdev_for_each_rcu(rdev2, mddev2)
			if (rdev->bdev->bd_contains ==
			    rdev2->bdev->bd_contains) {
				rcu_read_unlock();
				return 1;
			}
	rcu_read_unlock();
	return 0;
}

static LIST_HEAD(pending_raid_disks);

/*
 * Try to register data integrity profile for an mddev
 *
 * This is called when an array is started and after a disk has been kicked
 * from the array. It only succeeds if all working and active component devices
 * are integrity capable with matching profiles.
 */
int md_integrity_register(mddev_t *mddev)
{
	mdk_rdev_t *rdev, *reference = NULL;

	if (list_empty(&mddev->disks))
		return 0; /* nothing to do */
	if (blk_get_integrity(mddev->gendisk))
		return 0; /* already registered */
	list_for_each_entry(rdev, &mddev->disks, same_set) {
		/* skip spares and non-functional disks */
		if (test_bit(Faulty, &rdev->flags))
			continue;
		if (rdev->raid_disk < 0)
			continue;
		/*
		 * If at least one rdev is not integrity capable, we can not
		 * enable data integrity for the md device.
		 */
		if (!bdev_get_integrity(rdev->bdev))
			return -EINVAL;
		if (!reference) {
			/* Use the first rdev as the reference */
			reference = rdev;
			continue;
		}
		/* does this rdev's profile match the reference profile? */
		if (blk_integrity_compare(reference->bdev->bd_disk,
				rdev->bdev->bd_disk) < 0)
			return -EINVAL;
	}
	/*
	 * All component devices are integrity capable and have matching
	 * profiles, register the common profile for the md device.
	 */
	if (blk_integrity_register(mddev->gendisk,
			bdev_get_integrity(reference->bdev)) != 0) {
		printk(KERN_ERR "md: failed to register integrity for %s\n",
			mdname(mddev));
		return -EINVAL;
	}
	printk(KERN_NOTICE "md: data integrity on %s enabled\n",
		mdname(mddev));
	return 0;
}
EXPORT_SYMBOL(md_integrity_register);

/* Disable data integrity if non-capable/non-matching disk is being added */
void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
{
	struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev);
	struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk);

	if (!bi_mddev) /* nothing to do */
		return;
	if (rdev->raid_disk < 0) /* skip spares */
		return;
	if (bi_rdev && blk_integrity_compare(mddev->gendisk,
					     rdev->bdev->bd_disk) >= 0)
		return;
	printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev));
	blk_integrity_unregister(mddev->gendisk);
}
EXPORT_SYMBOL(md_integrity_add_rdev);
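/*
 * Note (added comment, not from the original source): the two exported
 * helpers above are complementary.  A personality calls
 * md_integrity_register() when the array starts (and after kicking a
 * disk), and md_integrity_add_rdev() from its hot-add path, so the md
 * device's integrity profile is dropped as soon as a non-capable or
 * mismatched component joins.
 */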
static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
{
	char b[BDEVNAME_SIZE];
	struct kobject *ko;
	char *s;
	int err;

	if (rdev->mddev) {
		MD_BUG();
		return -EINVAL;
	}

	/* prevent duplicates */
	if (find_rdev(mddev, rdev->bdev->bd_dev))
		return -EEXIST;

	/* make sure rdev->sectors exceeds mddev->dev_sectors */
	if (rdev->sectors && (mddev->dev_sectors == 0 ||
			rdev->sectors < mddev->dev_sectors)) {
		if (mddev->pers) {
			/* Cannot change size, so fail
			 * If mddev->level <= 0, then we don't care
			 * about aligning sizes (e.g. linear)
			 */
			if (mddev->level > 0)
				return -ENOSPC;
		} else
			mddev->dev_sectors = rdev->sectors;
	}

	/* Verify rdev->desc_nr is unique.
	 * If it is -1, assign a free number, else
	 * check number is not in use
	 */
	if (rdev->desc_nr < 0) {
		int choice = 0;
		if (mddev->pers) choice = mddev->raid_disks;
		while (find_rdev_nr(mddev, choice))
			choice++;
		rdev->desc_nr = choice;
	} else {
		if (find_rdev_nr(mddev, rdev->desc_nr))
			return -EBUSY;
	}
	if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
		printk(KERN_WARNING "md: %s: array is limited to %d devices\n",
		       mdname(mddev), mddev->max_disks);
		return -EBUSY;
	}
	bdevname(rdev->bdev,b);
	while ( (s=strchr(b, '/')) != NULL)
		*s = '!';

	rdev->mddev = mddev;
	printk(KERN_INFO "md: bind<%s>\n", b);

	if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
		goto fail;

	ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
	if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) {
		kobject_del(&rdev->kobj);
		goto fail;
	}
	rdev->sysfs_state = sysfs_get_dirent(rdev->kobj.sd, "state");

	list_add_rcu(&rdev->same_set, &mddev->disks);
	bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);

	/* May as well allow recovery to be retried once */
	mddev->recovery_disabled = 0;

	return 0;

 fail:
	printk(KERN_WARNING "md: failed to register dev-%s for %s\n",
	       b, mdname(mddev));
	return err;
}

static void md_delayed_delete(struct work_struct *ws)
{
	mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work);
	kobject_del(&rdev->kobj);
	kobject_put(&rdev->kobj);
}

static void unbind_rdev_from_array(mdk_rdev_t * rdev)
{
	char b[BDEVNAME_SIZE];
	if (!rdev->mddev) {
		MD_BUG();
		return;
	}
	bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
	list_del_rcu(&rdev->same_set);
	printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
	rdev->mddev = NULL;
	sysfs_remove_link(&rdev->kobj, "block");
	sysfs_put(rdev->sysfs_state);
	rdev->sysfs_state = NULL;
	/* We need to delay this, otherwise we can deadlock when
	 * writing to 'remove' to "dev/state".  We also need
	 * to delay it due to rcu usage.
	 */
	synchronize_rcu();
	INIT_WORK(&rdev->del_work, md_delayed_delete);
	kobject_get(&rdev->kobj);
	schedule_work(&rdev->del_work);
}
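/*
 * Lifecycle sketch (added comment, not from the original source): an
 * rdev is claimed from the block layer with lock_rdev() below, attached
 * to an array with bind_rdev_to_array() above, and leaves through
 * kick_rdev_from_array(), which is unbind_rdev_from_array() followed by
 * export_rdev() (unlock and final kobject_put).
 */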
/*
 * prevent the device from being mounted, repartitioned or
 * otherwise reused by a RAID array (or any other kernel
 * subsystem), by bd_claiming the device.
 */
static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared)
{
	int err = 0;
	struct block_device *bdev;
	char b[BDEVNAME_SIZE];

	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
	if (IS_ERR(bdev)) {
		printk(KERN_ERR "md: could not open %s.\n",
			__bdevname(dev, b));
		return PTR_ERR(bdev);
	}
	err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev);
	if (err) {
		printk(KERN_ERR "md: could not bd_claim %s.\n",
			bdevname(bdev, b));
		blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
		return err;
	}
	if (!shared)
		set_bit(AllReserved, &rdev->flags);
	rdev->bdev = bdev;
	return err;
}

static void unlock_rdev(mdk_rdev_t *rdev)
{
	struct block_device *bdev = rdev->bdev;
	rdev->bdev = NULL;
	if (!bdev)
		MD_BUG();
	bd_release(bdev);
	blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
}

void md_autodetect_dev(dev_t dev);

static void export_rdev(mdk_rdev_t * rdev)
{
	char b[BDEVNAME_SIZE];
	printk(KERN_INFO "md: export_rdev(%s)\n",
		bdevname(rdev->bdev,b));
	if (rdev->mddev)
		MD_BUG();
	free_disk_sb(rdev);
#ifndef MODULE
	if (test_bit(AutoDetected, &rdev->flags))
		md_autodetect_dev(rdev->bdev->bd_dev);
#endif
	unlock_rdev(rdev);
	kobject_put(&rdev->kobj);
}

static void kick_rdev_from_array(mdk_rdev_t * rdev)
{
	unbind_rdev_from_array(rdev);
	export_rdev(rdev);
}

static void export_array(mddev_t *mddev)
{
	mdk_rdev_t *rdev, *tmp;

	rdev_for_each(rdev, tmp, mddev) {
		if (!rdev->mddev) {
			MD_BUG();
			continue;
		}
		kick_rdev_from_array(rdev);
	}
	if (!list_empty(&mddev->disks))
		MD_BUG();
	mddev->raid_disks = 0;
	mddev->major_version = 0;
}

static void print_desc(mdp_disk_t *desc)
{
	printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
		desc->major,desc->minor,desc->raid_disk,desc->state);
}

static void print_sb_90(mdp_super_t *sb)
{
	int i;

	printk(KERN_INFO
		"md:  SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
		sb->major_version, sb->minor_version, sb->patch_version,
		sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
		sb->ctime);
	printk(KERN_INFO "md:     L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
		sb->level, sb->size, sb->nr_disks, sb->raid_disks,
		sb->md_minor, sb->layout, sb->chunk_size);
	printk(KERN_INFO "md:     UT:%08x ST:%d AD:%d WD:%d"
		" FD:%d SD:%d CSUM:%08x E:%08lx\n",
		sb->utime, sb->state, sb->active_disks, sb->working_disks,
		sb->failed_disks, sb->spare_disks,
		sb->sb_csum, (unsigned long)sb->events_lo);

	printk(KERN_INFO);
	for (i = 0; i < MD_SB_DISKS; i++) {
		mdp_disk_t *desc;

		desc = sb->disks + i;
		if (desc->number || desc->major || desc->minor ||
		    desc->raid_disk || (desc->state && (desc->state != 4))) {
			printk("     D %2d: ", i);
			print_desc(desc);
		}
	}
	printk(KERN_INFO "md:     THIS: ");
	print_desc(&sb->this_disk);
}

static void print_sb_1(struct mdp_superblock_1 *sb)
{
	__u8 *uuid;

	uuid = sb->set_uuid;
	printk(KERN_INFO
	       "md:  SB: (V:%u) (F:0x%08x) Array-ID:<%pU>\n"
	       "md:    Name: \"%s\" CT:%llu\n",
		le32_to_cpu(sb->major_version),
		le32_to_cpu(sb->feature_map),
		uuid,
		sb->set_name,
		(unsigned long long)le64_to_cpu(sb->ctime)
		       & MD_SUPERBLOCK_1_TIME_SEC_MASK);

	uuid = sb->device_uuid;
	printk(KERN_INFO
	       "md:       L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu"
			" RO:%llu\n"
	       "md:     Dev:%08x UUID: %pU\n"
	       "md:       (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n"
	       "md:         (MaxDev:%u) \n",
		le32_to_cpu(sb->level),
		(unsigned long long)le64_to_cpu(sb->size),
le32_to_cpu(sb->raid_disks), 1961 le32_to_cpu(sb->layout), 1962 le32_to_cpu(sb->chunksize), 1963 (unsigned long long)le64_to_cpu(sb->data_offset), 1964 (unsigned long long)le64_to_cpu(sb->data_size), 1965 (unsigned long long)le64_to_cpu(sb->super_offset), 1966 (unsigned long long)le64_to_cpu(sb->recovery_offset), 1967 le32_to_cpu(sb->dev_number), 1968 uuid, 1969 sb->devflags, 1970 (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK, 1971 (unsigned long long)le64_to_cpu(sb->events), 1972 (unsigned long long)le64_to_cpu(sb->resync_offset), 1973 le32_to_cpu(sb->sb_csum), 1974 le32_to_cpu(sb->max_dev) 1975 ); 1976 } 1977 1978 static void print_rdev(mdk_rdev_t *rdev, int major_version) 1979 { 1980 char b[BDEVNAME_SIZE]; 1981 printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n", 1982 bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors, 1983 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), 1984 rdev->desc_nr); 1985 if (rdev->sb_loaded) { 1986 printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); 1987 switch (major_version) { 1988 case 0: 1989 print_sb_90((mdp_super_t*)page_address(rdev->sb_page)); 1990 break; 1991 case 1: 1992 print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page)); 1993 break; 1994 } 1995 } else 1996 printk(KERN_INFO "md: no rdev superblock!\n"); 1997 } 1998 1999 static void md_print_devices(void) 2000 { 2001 struct list_head *tmp; 2002 mdk_rdev_t *rdev; 2003 mddev_t *mddev; 2004 char b[BDEVNAME_SIZE]; 2005 2006 printk("\n"); 2007 printk("md: **********************************\n"); 2008 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); 2009 printk("md: **********************************\n"); 2010 for_each_mddev(mddev, tmp) { 2011 2012 if (mddev->bitmap) 2013 bitmap_print_sb(mddev->bitmap); 2014 else 2015 printk("%s: ", mdname(mddev)); 2016 list_for_each_entry(rdev, &mddev->disks, same_set) 2017 printk("<%s>", bdevname(rdev->bdev,b)); 2018 printk("\n"); 2019 2020 list_for_each_entry(rdev, &mddev->disks, same_set) 2021 print_rdev(rdev, mddev->major_version); 2022 } 2023 printk("md: **********************************\n"); 2024 printk("\n"); 2025 } 2026 2027 2028 static void sync_sbs(mddev_t * mddev, int nospares) 2029 { 2030 /* Update each superblock (in-memory image), but 2031 * if we are allowed to, skip spares which already 2032 * have the right event counter, or have one earlier 2033 * (which would mean they aren't being marked as dirty 2034 * with the rest of the array) 2035 */ 2036 mdk_rdev_t *rdev; 2037 2038 /* First make sure individual recovery_offsets are correct */ 2039 list_for_each_entry(rdev, &mddev->disks, same_set) { 2040 if (rdev->raid_disk >= 0 && 2041 !test_bit(In_sync, &rdev->flags) && 2042 mddev->curr_resync_completed > rdev->recovery_offset) 2043 rdev->recovery_offset = mddev->curr_resync_completed; 2044 2045 } 2046 list_for_each_entry(rdev, &mddev->disks, same_set) { 2047 if (rdev->sb_events == mddev->events || 2048 (nospares && 2049 rdev->raid_disk < 0 && 2050 (rdev->sb_events&1)==0 && 2051 rdev->sb_events+1 == mddev->events)) { 2052 /* Don't update this superblock */ 2053 rdev->sb_loaded = 2; 2054 } else { 2055 super_types[mddev->major_version]. 
2056 sync_super(mddev, rdev);
2057 rdev->sb_loaded = 1;
2058 }
2059 }
2060 }
2061 
2062 static void md_update_sb(mddev_t * mddev, int force_change)
2063 {
2064 mdk_rdev_t *rdev;
2065 int sync_req;
2066 int nospares = 0;
2067 
2068 mddev->utime = get_seconds();
2069 if (mddev->external)
2070 return;
2071 repeat:
2072 spin_lock_irq(&mddev->write_lock);
2073 
2074 set_bit(MD_CHANGE_PENDING, &mddev->flags);
2075 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
2076 force_change = 1;
2077 if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
2078 /* just a clean <-> dirty transition, possibly leave spares alone,
2079 * though if events isn't the right even/odd, we will have to do
2080 * spares after all
2081 */
2082 nospares = 1;
2083 if (force_change)
2084 nospares = 0;
2085 if (mddev->degraded)
2086 /* If the array is degraded, then skipping spares is both
2087 * dangerous and fairly pointless.
2088 * Dangerous because a device that was removed from the array
2089 * might have an event_count that still looks up-to-date,
2090 * so it can be re-added without a resync.
2091 * Pointless because if there are any spares to skip,
2092 * then a recovery will happen and soon that array won't
2093 * be degraded any more and the spare can go back to sleep then.
2094 */
2095 nospares = 0;
2096 
2097 sync_req = mddev->in_sync;
2098 
2099 /* If this is just a dirty<->clean transition, and the array is clean
2100 * and 'events' is odd, we can roll back to the previous clean state */
2101 if (nospares
2102 && (mddev->in_sync && mddev->recovery_cp == MaxSector)
2103 && (mddev->events & 1)
2104 && mddev->events != 1)
2105 mddev->events--;
2106 else {
2107 /* otherwise we have to go forward and ... */
2108 mddev->events ++;
2109 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
2110 /* .. if the array isn't clean, an 'even' event must also go
2111 * to spares. */
2112 if ((mddev->events&1)==0) {
2113 nospares = 0;
2114 sync_req = 2; /* force a second update to get the
2115 * even/odd in sync */
2116 }
2117 } else {
2118 /* otherwise an 'odd' event must go to spares */
2119 if ((mddev->events&1)) {
2120 nospares = 0;
2121 sync_req = 2; /* force a second update to get the
2122 * even/odd in sync */
2123 }
2124 }
2125 }
2126 
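/* (Editor's illustration, not in the original source.)
 * Net effect of the even/odd bookkeeping above: a dirty->clean
 * transition on a fully-synced array steps 'events' back to the
 * previous even value rather than forward, so spare superblocks
 * still holding that value need no rewrite.  For example, a spare
 * at sb_events==40 while the dirty array sits at 41 is skipped by
 * sync_sbs() (even, exactly one behind), and already matches once
 * the array rolls back to 40 on becoming clean.
 */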
2127 if (!mddev->events) {
2128 /*
2129 * oops, this 64-bit counter should never wrap.
2130 * Either we are in around ~1 trillion A.C., assuming
2131 * 1 reboot per second, or we have a bug:
2132 */
2133 MD_BUG();
2134 mddev->events --;
2135 }
2136 
2137 /*
2138 * do not write anything to disk if using
2139 * nonpersistent superblocks
2140 */
2141 if (!mddev->persistent) {
2142 if (!mddev->external)
2143 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2144 
2145 spin_unlock_irq(&mddev->write_lock);
2146 wake_up(&mddev->sb_wait);
2147 return;
2148 }
2149 sync_sbs(mddev, nospares);
2150 spin_unlock_irq(&mddev->write_lock);
2151 
2152 dprintk(KERN_INFO
2153 "md: updating %s RAID superblock on device (in sync %d)\n",
2154 mdname(mddev),mddev->in_sync);
2155 
2156 bitmap_update_sb(mddev->bitmap);
2157 list_for_each_entry(rdev, &mddev->disks, same_set) {
2158 char b[BDEVNAME_SIZE];
2159 dprintk(KERN_INFO "md: ");
2160 if (rdev->sb_loaded != 1)
2161 continue; /* no noise on spare devices */
2162 if (test_bit(Faulty, &rdev->flags))
2163 dprintk("(skipping faulty ");
2164 
2165 dprintk("%s ", bdevname(rdev->bdev,b));
2166 if (!test_bit(Faulty, &rdev->flags)) {
2167 md_super_write(mddev,rdev,
2168 rdev->sb_start, rdev->sb_size,
2169 rdev->sb_page);
2170 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
2171 bdevname(rdev->bdev,b),
2172 (unsigned long long)rdev->sb_start);
2173 rdev->sb_events = mddev->events;
2174 
2175 } else
2176 dprintk(")\n");
2177 if (mddev->level == LEVEL_MULTIPATH)
2178 /* only need to write one superblock... */
2179 break;
2180 }
2181 md_super_wait(mddev);
2182 /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
2183 
2184 spin_lock_irq(&mddev->write_lock);
2185 if (mddev->in_sync != sync_req ||
2186 test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
2187 /* have to write it out again */
2188 spin_unlock_irq(&mddev->write_lock);
2189 goto repeat;
2190 }
2191 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2192 spin_unlock_irq(&mddev->write_lock);
2193 wake_up(&mddev->sb_wait);
2194 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2195 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2196 
2197 }
2198 
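/* (Editor's summary, not in the original source.)
 * md_update_sb() in brief: MD_CHANGE_PENDING is held while the new
 * 'events' value is committed to disk; if in_sync changed under us,
 * or a racing update set MD_CHANGE_DEVS while the superblocks were
 * being written, the 'repeat' loop rewrites them before PENDING is
 * cleared and sb_wait waiters are woken.
 */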
2199 /* words written to sysfs files may, or may not, be \n terminated.
2200 * We want to accept both. For this we use cmd_match.
2201 */
2202 static int cmd_match(const char *cmd, const char *str)
2203 {
2204 /* See if cmd, written into a sysfs file, matches
2205 * str. They must either be the same, or cmd can
2206 * have a trailing newline
2207 */
2208 while (*cmd && *str && *cmd == *str) {
2209 cmd++;
2210 str++;
2211 }
2212 if (*cmd == '\n')
2213 cmd++;
2214 if (*str || *cmd)
2215 return 0;
2216 return 1;
2217 }
2218 
2219 struct rdev_sysfs_entry {
2220 struct attribute attr;
2221 ssize_t (*show)(mdk_rdev_t *, char *);
2222 ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
2223 };
2224 
2225 static ssize_t
2226 state_show(mdk_rdev_t *rdev, char *page)
2227 {
2228 char *sep = "";
2229 size_t len = 0;
2230 
2231 if (test_bit(Faulty, &rdev->flags)) {
2232 len+= sprintf(page+len, "%sfaulty",sep);
2233 sep = ",";
2234 }
2235 if (test_bit(In_sync, &rdev->flags)) {
2236 len += sprintf(page+len, "%sin_sync",sep);
2237 sep = ",";
2238 }
2239 if (test_bit(WriteMostly, &rdev->flags)) {
2240 len += sprintf(page+len, "%swrite_mostly",sep);
2241 sep = ",";
2242 }
2243 if (test_bit(Blocked, &rdev->flags)) {
2244 len += sprintf(page+len, "%sblocked", sep);
2245 sep = ",";
2246 }
2247 if (!test_bit(Faulty, &rdev->flags) &&
2248 !test_bit(In_sync, &rdev->flags)) {
2249 len += sprintf(page+len, "%sspare", sep);
2250 sep = ",";
2251 }
2252 return len+sprintf(page+len, "\n");
2253 }
2254 
2255 static ssize_t
2256 state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2257 {
2258 /* can write
2259 * faulty - simulates an error
2260 * remove - disconnects the device
2261 * writemostly - sets write_mostly
2262 * -writemostly - clears write_mostly
2263 * blocked - sets the Blocked flag
2264 * -blocked - clears the Blocked flag
2265 * insync - sets In_sync provided the device isn't active
2266 */
2267 int err = -EINVAL;
2268 if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2269 md_error(rdev->mddev, rdev);
2270 err = 0;
2271 } else if (cmd_match(buf, "remove")) {
2272 if (rdev->raid_disk >= 0)
2273 err = -EBUSY;
2274 else {
2275 mddev_t *mddev = rdev->mddev;
2276 kick_rdev_from_array(rdev);
2277 if (mddev->pers)
2278 md_update_sb(mddev, 1);
2279 md_new_event(mddev);
2280 err = 0;
2281 }
2282 } else if (cmd_match(buf, "writemostly")) {
2283 set_bit(WriteMostly, &rdev->flags);
2284 err = 0;
2285 } else if (cmd_match(buf, "-writemostly")) {
2286 clear_bit(WriteMostly, &rdev->flags);
2287 err = 0;
2288 } else if (cmd_match(buf, "blocked")) {
2289 set_bit(Blocked, &rdev->flags);
2290 err = 0;
2291 } else if (cmd_match(buf, "-blocked")) {
2292 clear_bit(Blocked, &rdev->flags);
2293 wake_up(&rdev->blocked_wait);
2294 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2295 md_wakeup_thread(rdev->mddev->thread);
2296 
2297 err = 0;
2298 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2299 set_bit(In_sync, &rdev->flags);
2300 err = 0;
2301 }
2302 if (!err && rdev->sysfs_state)
2303 sysfs_notify_dirent(rdev->sysfs_state);
2304 return err ? err : len;
2305 }
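/* (Editor's illustration, not in the original source; device names
 * are examples.)  Typical uses from userspace for member sdb1 of md0,
 * via the per-rdev "dev-%s" directory created in bind_rdev_to_array():
 *   echo faulty > /sys/block/md0/md/dev-sdb1/state    # simulate an error
 *   echo remove > /sys/block/md0/md/dev-sdb1/state    # then drop it
 */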
2306 static struct rdev_sysfs_entry rdev_state =
2307 __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
2308 
2309 static ssize_t
2310 errors_show(mdk_rdev_t *rdev, char *page)
2311 {
2312 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
2313 }
2314 
2315 static ssize_t
2316 errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2317 {
2318 char *e;
2319 unsigned long n = simple_strtoul(buf, &e, 10);
2320 if (*buf && (*e == 0 || *e == '\n')) {
2321 atomic_set(&rdev->corrected_errors, n);
2322 return len;
2323 }
2324 return -EINVAL;
2325 }
2326 static struct rdev_sysfs_entry rdev_errors =
2327 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
2328 
2329 static ssize_t
2330 slot_show(mdk_rdev_t *rdev, char *page)
2331 {
2332 if (rdev->raid_disk < 0)
2333 return sprintf(page, "none\n");
2334 else
2335 return sprintf(page, "%d\n", rdev->raid_disk);
2336 }
2337 
2338 static ssize_t
2339 slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2340 {
2341 char *e;
2342 int err;
2343 char nm[20];
2344 int slot = simple_strtoul(buf, &e, 10);
2345 if (strncmp(buf, "none", 4)==0)
2346 slot = -1;
2347 else if (e==buf || (*e && *e!= '\n'))
2348 return -EINVAL;
2349 if (rdev->mddev->pers && slot == -1) {
2350 /* Setting 'slot' on an active array also requires
2351 * updating the 'rd%d' link, and communicating
2352 * with the personality with ->hot_*_disk.
2353 * For now we only support removing
2354 * failed/spare devices. This normally happens automatically,
2355 * but not when the metadata is externally managed.
2356 */
2357 if (rdev->raid_disk == -1)
2358 return -EEXIST;
2359 /* personality does all needed checks */
2360 if (rdev->mddev->pers->hot_add_disk == NULL)
2361 return -EINVAL;
2362 err = rdev->mddev->pers->
2363 hot_remove_disk(rdev->mddev, rdev->raid_disk);
2364 if (err)
2365 return err;
2366 sprintf(nm, "rd%d", rdev->raid_disk);
2367 sysfs_remove_link(&rdev->mddev->kobj, nm);
2368 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2369 md_wakeup_thread(rdev->mddev->thread);
2370 } else if (rdev->mddev->pers) {
2371 mdk_rdev_t *rdev2;
2372 /* Activating a spare .. or possibly reactivating
2373 * if we ever get bitmaps working here.
2374 */
2375 
2376 if (rdev->raid_disk != -1)
2377 return -EBUSY;
2378 
2379 if (rdev->mddev->pers->hot_add_disk == NULL)
2380 return -EINVAL;
2381 
2382 list_for_each_entry(rdev2, &rdev->mddev->disks, same_set)
2383 if (rdev2->raid_disk == slot)
2384 return -EEXIST;
2385 
2386 rdev->raid_disk = slot;
2387 if (test_bit(In_sync, &rdev->flags))
2388 rdev->saved_raid_disk = slot;
2389 else
2390 rdev->saved_raid_disk = -1;
2391 err = rdev->mddev->pers->
2392 hot_add_disk(rdev->mddev, rdev);
2393 if (err) {
2394 rdev->raid_disk = -1;
2395 return err;
2396 } else
2397 sysfs_notify_dirent(rdev->sysfs_state);
2398 sprintf(nm, "rd%d", rdev->raid_disk);
2399 if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
2400 printk(KERN_WARNING
2401 "md: cannot register "
2402 "%s for %s\n",
2403 nm, mdname(rdev->mddev));
2404 
2405 /* don't wake anyone up, leave that to userspace.
*/
2406 } else {
2407 if (slot >= rdev->mddev->raid_disks)
2408 return -ENOSPC;
2409 rdev->raid_disk = slot;
2410 /* assume it is working */
2411 clear_bit(Faulty, &rdev->flags);
2412 clear_bit(WriteMostly, &rdev->flags);
2413 set_bit(In_sync, &rdev->flags);
2414 sysfs_notify_dirent(rdev->sysfs_state);
2415 }
2416 return len;
2417 }
2418 
2419 
2420 static struct rdev_sysfs_entry rdev_slot =
2421 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
2422 
2423 static ssize_t
2424 offset_show(mdk_rdev_t *rdev, char *page)
2425 {
2426 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
2427 }
2428 
2429 static ssize_t
2430 offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2431 {
2432 char *e;
2433 unsigned long long offset = simple_strtoull(buf, &e, 10);
2434 if (e==buf || (*e && *e != '\n'))
2435 return -EINVAL;
2436 if (rdev->mddev->pers && rdev->raid_disk >= 0)
2437 return -EBUSY;
2438 if (rdev->sectors && rdev->mddev->external)
2439 /* Must set offset before size, so overlap checks
2440 * can be sane */
2441 return -EBUSY;
2442 rdev->data_offset = offset;
2443 return len;
2444 }
2445 
2446 static struct rdev_sysfs_entry rdev_offset =
2447 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
2448 
2449 static ssize_t
2450 rdev_size_show(mdk_rdev_t *rdev, char *page)
2451 {
2452 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
2453 }
2454 
2455 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
2456 {
2457 /* check if two start/length pairs overlap */
2458 if (s1+l1 <= s2)
2459 return 0;
2460 if (s2+l2 <= s1)
2461 return 0;
2462 return 1;
2463 }
2464 
2465 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
2466 {
2467 unsigned long long blocks;
2468 sector_t new;
2469 
2470 if (strict_strtoull(buf, 10, &blocks) < 0)
2471 return -EINVAL;
2472 
2473 if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
2474 return -EINVAL; /* sector conversion overflow */
2475 
2476 new = blocks * 2;
2477 if (new != blocks * 2)
2478 return -EINVAL; /* unsigned long long to sector_t overflow */
2479 
2480 *sectors = new;
2481 return 0;
2482 }
2483 
2484 static ssize_t
2485 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2486 {
2487 mddev_t *my_mddev = rdev->mddev;
2488 sector_t oldsectors = rdev->sectors;
2489 sector_t sectors;
2490 
2491 if (strict_blocks_to_sectors(buf, &sectors) < 0)
2492 return -EINVAL;
2493 if (my_mddev->pers && rdev->raid_disk >= 0) {
2494 if (my_mddev->persistent) {
2495 sectors = super_types[my_mddev->major_version].
2496 rdev_size_change(rdev, sectors);
2497 if (!sectors)
2498 return -EBUSY;
2499 } else if (!sectors)
2500 sectors = (rdev->bdev->bd_inode->i_size >> 9) -
2501 rdev->data_offset;
2502 }
2503 if (sectors < my_mddev->dev_sectors)
2504 return -EINVAL; /* component must fit device */
2505 
2506 rdev->sectors = sectors;
2507 if (sectors > oldsectors && my_mddev->external) {
2508 /* need to check that all other rdevs with the same ->bdev
2509 * do not overlap. We need to unlock the mddev to avoid
2510 * a deadlock. We have already changed rdev->sectors, and if
2511 * we have to change it back, we will have the lock again.
2512 */ 2513 mddev_t *mddev; 2514 int overlap = 0; 2515 struct list_head *tmp; 2516 2517 mddev_unlock(my_mddev); 2518 for_each_mddev(mddev, tmp) { 2519 mdk_rdev_t *rdev2; 2520 2521 mddev_lock(mddev); 2522 list_for_each_entry(rdev2, &mddev->disks, same_set) 2523 if (test_bit(AllReserved, &rdev2->flags) || 2524 (rdev->bdev == rdev2->bdev && 2525 rdev != rdev2 && 2526 overlaps(rdev->data_offset, rdev->sectors, 2527 rdev2->data_offset, 2528 rdev2->sectors))) { 2529 overlap = 1; 2530 break; 2531 } 2532 mddev_unlock(mddev); 2533 if (overlap) { 2534 mddev_put(mddev); 2535 break; 2536 } 2537 } 2538 mddev_lock(my_mddev); 2539 if (overlap) { 2540 /* Someone else could have slipped in a size 2541 * change here, but doing so is just silly. 2542 * We put oldsectors back because we *know* it is 2543 * safe, and trust userspace not to race with 2544 * itself 2545 */ 2546 rdev->sectors = oldsectors; 2547 return -EBUSY; 2548 } 2549 } 2550 return len; 2551 } 2552 2553 static struct rdev_sysfs_entry rdev_size = 2554 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 2555 2556 2557 static ssize_t recovery_start_show(mdk_rdev_t *rdev, char *page) 2558 { 2559 unsigned long long recovery_start = rdev->recovery_offset; 2560 2561 if (test_bit(In_sync, &rdev->flags) || 2562 recovery_start == MaxSector) 2563 return sprintf(page, "none\n"); 2564 2565 return sprintf(page, "%llu\n", recovery_start); 2566 } 2567 2568 static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2569 { 2570 unsigned long long recovery_start; 2571 2572 if (cmd_match(buf, "none")) 2573 recovery_start = MaxSector; 2574 else if (strict_strtoull(buf, 10, &recovery_start)) 2575 return -EINVAL; 2576 2577 if (rdev->mddev->pers && 2578 rdev->raid_disk >= 0) 2579 return -EBUSY; 2580 2581 rdev->recovery_offset = recovery_start; 2582 if (recovery_start == MaxSector) 2583 set_bit(In_sync, &rdev->flags); 2584 else 2585 clear_bit(In_sync, &rdev->flags); 2586 return len; 2587 } 2588 2589 static struct rdev_sysfs_entry rdev_recovery_start = 2590 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); 2591 2592 static struct attribute *rdev_default_attrs[] = { 2593 &rdev_state.attr, 2594 &rdev_errors.attr, 2595 &rdev_slot.attr, 2596 &rdev_offset.attr, 2597 &rdev_size.attr, 2598 &rdev_recovery_start.attr, 2599 NULL, 2600 }; 2601 static ssize_t 2602 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 2603 { 2604 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 2605 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 2606 mddev_t *mddev = rdev->mddev; 2607 ssize_t rv; 2608 2609 if (!entry->show) 2610 return -EIO; 2611 2612 rv = mddev ? mddev_lock(mddev) : -EBUSY; 2613 if (!rv) { 2614 if (rdev->mddev == NULL) 2615 rv = -EBUSY; 2616 else 2617 rv = entry->show(rdev, page); 2618 mddev_unlock(mddev); 2619 } 2620 return rv; 2621 } 2622 2623 static ssize_t 2624 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 2625 const char *page, size_t length) 2626 { 2627 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 2628 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 2629 ssize_t rv; 2630 mddev_t *mddev = rdev->mddev; 2631 2632 if (!entry->store) 2633 return -EIO; 2634 if (!capable(CAP_SYS_ADMIN)) 2635 return -EACCES; 2636 rv = mddev ? 
mddev_lock(mddev): -EBUSY; 2637 if (!rv) { 2638 if (rdev->mddev == NULL) 2639 rv = -EBUSY; 2640 else 2641 rv = entry->store(rdev, page, length); 2642 mddev_unlock(mddev); 2643 } 2644 return rv; 2645 } 2646 2647 static void rdev_free(struct kobject *ko) 2648 { 2649 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); 2650 kfree(rdev); 2651 } 2652 static const struct sysfs_ops rdev_sysfs_ops = { 2653 .show = rdev_attr_show, 2654 .store = rdev_attr_store, 2655 }; 2656 static struct kobj_type rdev_ktype = { 2657 .release = rdev_free, 2658 .sysfs_ops = &rdev_sysfs_ops, 2659 .default_attrs = rdev_default_attrs, 2660 }; 2661 2662 /* 2663 * Import a device. If 'super_format' >= 0, then sanity check the superblock 2664 * 2665 * mark the device faulty if: 2666 * 2667 * - the device is nonexistent (zero size) 2668 * - the device has no valid superblock 2669 * 2670 * a faulty rdev _never_ has rdev->sb set. 2671 */ 2672 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) 2673 { 2674 char b[BDEVNAME_SIZE]; 2675 int err; 2676 mdk_rdev_t *rdev; 2677 sector_t size; 2678 2679 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 2680 if (!rdev) { 2681 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 2682 return ERR_PTR(-ENOMEM); 2683 } 2684 2685 if ((err = alloc_disk_sb(rdev))) 2686 goto abort_free; 2687 2688 err = lock_rdev(rdev, newdev, super_format == -2); 2689 if (err) 2690 goto abort_free; 2691 2692 kobject_init(&rdev->kobj, &rdev_ktype); 2693 2694 rdev->desc_nr = -1; 2695 rdev->saved_raid_disk = -1; 2696 rdev->raid_disk = -1; 2697 rdev->flags = 0; 2698 rdev->data_offset = 0; 2699 rdev->sb_events = 0; 2700 rdev->last_read_error.tv_sec = 0; 2701 rdev->last_read_error.tv_nsec = 0; 2702 atomic_set(&rdev->nr_pending, 0); 2703 atomic_set(&rdev->read_errors, 0); 2704 atomic_set(&rdev->corrected_errors, 0); 2705 2706 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2707 if (!size) { 2708 printk(KERN_WARNING 2709 "md: %s has zero or unknown size, marking faulty!\n", 2710 bdevname(rdev->bdev,b)); 2711 err = -EINVAL; 2712 goto abort_free; 2713 } 2714 2715 if (super_format >= 0) { 2716 err = super_types[super_format]. 2717 load_super(rdev, NULL, super_minor); 2718 if (err == -EINVAL) { 2719 printk(KERN_WARNING 2720 "md: %s does not have a valid v%d.%d " 2721 "superblock, not importing!\n", 2722 bdevname(rdev->bdev,b), 2723 super_format, super_minor); 2724 goto abort_free; 2725 } 2726 if (err < 0) { 2727 printk(KERN_WARNING 2728 "md: could not read %s's sb, not importing!\n", 2729 bdevname(rdev->bdev,b)); 2730 goto abort_free; 2731 } 2732 } 2733 2734 INIT_LIST_HEAD(&rdev->same_set); 2735 init_waitqueue_head(&rdev->blocked_wait); 2736 2737 return rdev; 2738 2739 abort_free: 2740 if (rdev->sb_page) { 2741 if (rdev->bdev) 2742 unlock_rdev(rdev); 2743 free_disk_sb(rdev); 2744 } 2745 kfree(rdev); 2746 return ERR_PTR(err); 2747 } 2748 2749 /* 2750 * Check a full RAID array for plausibility 2751 */ 2752 2753 2754 static void analyze_sbs(mddev_t * mddev) 2755 { 2756 int i; 2757 mdk_rdev_t *rdev, *freshest, *tmp; 2758 char b[BDEVNAME_SIZE]; 2759 2760 freshest = NULL; 2761 rdev_for_each(rdev, tmp, mddev) 2762 switch (super_types[mddev->major_version]. 
load_super(rdev, freshest, mddev->minor_version)) {
2764 case 1:
2765 freshest = rdev;
2766 break;
2767 case 0:
2768 break;
2769 default:
2770 printk( KERN_ERR \
2771 "md: fatal superblock inconsistency in %s"
2772 " -- removing from array\n",
2773 bdevname(rdev->bdev,b));
2774 kick_rdev_from_array(rdev);
2775 }
2776 
2777 
2778 super_types[mddev->major_version].
2779 validate_super(mddev, freshest);
2780 
2781 i = 0;
2782 rdev_for_each(rdev, tmp, mddev) {
2783 if (rdev->desc_nr >= mddev->max_disks ||
2784 i > mddev->max_disks) {
2785 printk(KERN_WARNING
2786 "md: %s: %s: only %d devices permitted\n",
2787 mdname(mddev), bdevname(rdev->bdev, b),
2788 mddev->max_disks);
2789 kick_rdev_from_array(rdev);
2790 continue;
2791 }
2792 if (rdev != freshest)
2793 if (super_types[mddev->major_version].
2794 validate_super(mddev, rdev)) {
2795 printk(KERN_WARNING "md: kicking non-fresh %s"
2796 " from array!\n",
2797 bdevname(rdev->bdev,b));
2798 kick_rdev_from_array(rdev);
2799 continue;
2800 }
2801 if (mddev->level == LEVEL_MULTIPATH) {
2802 rdev->desc_nr = i++;
2803 rdev->raid_disk = rdev->desc_nr;
2804 set_bit(In_sync, &rdev->flags);
2805 } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) {
2806 rdev->raid_disk = -1;
2807 clear_bit(In_sync, &rdev->flags);
2808 }
2809 }
2810 }
2811 
2812 /* Read a fixed-point number.
2813 * Numbers in sysfs attributes should be in "standard" units where
2814 * possible, so time should be in seconds.
2815 * However we internally use a much smaller unit such as
2816 * milliseconds or jiffies.
2817 * This function takes a decimal number with a possible fractional
2818 * component, and produces an integer which is the result of
2819 * multiplying that number by 10^'scale'.
2820 * all without any floating-point arithmetic.
2821 */
2822 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
2823 {
2824 unsigned long result = 0;
2825 long decimals = -1;
2826 while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
2827 if (*cp == '.')
2828 decimals = 0;
2829 else if (decimals < scale) {
2830 unsigned int value;
2831 value = *cp - '0';
2832 result = result * 10 + value;
2833 if (decimals >= 0)
2834 decimals++;
2835 }
2836 cp++;
2837 }
2838 if (*cp == '\n')
2839 cp++;
2840 if (*cp)
2841 return -EINVAL;
2842 if (decimals < 0)
2843 decimals = 0;
2844 while (decimals < scale) {
2845 result *= 10;
2846 decimals ++;
2847 }
2848 *res = result;
2849 return 0;
2850 }
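/* (Editor's illustration, not in the original source.)
 * Worked examples, using scale == 3 as safe_delay_store() below does:
 *   strict_strtoul_scaled("1.5", &res, 3)  -> res == 1500
 *   strict_strtoul_scaled("0.05", &res, 3) -> res == 50
 *   strict_strtoul_scaled("2", &res, 3)    -> res == 2000
 * i.e. a seconds string with up to 'scale' fractional digits is
 * turned into an integer count of milliseconds.
 */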
2851 
2852 
2853 static void md_safemode_timeout(unsigned long data);
2854 
2855 static ssize_t
2856 safe_delay_show(mddev_t *mddev, char *page)
2857 {
2858 int msec = (mddev->safemode_delay*1000)/HZ;
2859 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
2860 }
2861 static ssize_t
2862 safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
2863 {
2864 unsigned long msec;
2865 
2866 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
2867 return -EINVAL;
2868 if (msec == 0)
2869 mddev->safemode_delay = 0;
2870 else {
2871 unsigned long old_delay = mddev->safemode_delay;
2872 mddev->safemode_delay = (msec*HZ)/1000;
2873 if (mddev->safemode_delay == 0)
2874 mddev->safemode_delay = 1;
2875 if (mddev->safemode_delay < old_delay)
2876 md_safemode_timeout((unsigned long)mddev);
2877 }
2878 return len;
2879 }
2880 static struct md_sysfs_entry md_safe_delay =
2881 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
2882 
2883 static ssize_t
2884 level_show(mddev_t *mddev, char *page)
2885 {
2886 struct mdk_personality *p = mddev->pers;
2887 if (p)
2888 return sprintf(page, "%s\n", p->name);
2889 else if (mddev->clevel[0])
2890 return sprintf(page, "%s\n", mddev->clevel);
2891 else if (mddev->level != LEVEL_NONE)
2892 return sprintf(page, "%d\n", mddev->level);
2893 else
2894 return 0;
2895 }
2896 
2897 static ssize_t
2898 level_store(mddev_t *mddev, const char *buf, size_t len)
2899 {
2900 char level[16];
2901 ssize_t rv = len;
2902 struct mdk_personality *pers;
2903 void *priv;
2904 mdk_rdev_t *rdev;
2905 
2906 if (mddev->pers == NULL) {
2907 if (len == 0)
2908 return 0;
2909 if (len >= sizeof(mddev->clevel))
2910 return -ENOSPC;
2911 strncpy(mddev->clevel, buf, len);
2912 if (mddev->clevel[len-1] == '\n')
2913 len--;
2914 mddev->clevel[len] = 0;
2915 mddev->level = LEVEL_NONE;
2916 return rv;
2917 }
2918 
2919 /* request to change the personality. Need to ensure:
2920 * - array is not engaged in resync/recovery/reshape
2921 * - old personality can be suspended
2922 * - new personality can take over the array.
2923 */
2924 
2925 if (mddev->sync_thread || mddev->reshape_position != MaxSector)
2926 return -EBUSY;
2927 
2928 if (!mddev->pers->quiesce) {
2929 printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
2930 mdname(mddev), mddev->pers->name);
2931 return -EINVAL;
2932 }
2933 
2934 /* Now find the new personality */
2935 if (len == 0 || len >= sizeof(level))
2936 return -EINVAL;
2937 strncpy(level, buf, len);
2938 if (level[len-1] == '\n')
2939 len--;
2940 level[len] = 0;
2941 
2942 request_module("md-%s", level);
2943 spin_lock(&pers_lock);
2944 pers = find_pers(LEVEL_NONE, level);
2945 if (!pers || !try_module_get(pers->owner)) {
2946 spin_unlock(&pers_lock);
2947 printk(KERN_WARNING "md: personality %s not loaded\n", level);
2948 return -EINVAL;
2949 }
2950 spin_unlock(&pers_lock);
2951 
2952 if (pers == mddev->pers) {
2953 /* Nothing to do!
*/ 2954 module_put(pers->owner); 2955 return rv; 2956 } 2957 if (!pers->takeover) { 2958 module_put(pers->owner); 2959 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", 2960 mdname(mddev), level); 2961 return -EINVAL; 2962 } 2963 2964 /* ->takeover must set new_* and/or delta_disks 2965 * if it succeeds, and may set them when it fails. 2966 */ 2967 priv = pers->takeover(mddev); 2968 if (IS_ERR(priv)) { 2969 mddev->new_level = mddev->level; 2970 mddev->new_layout = mddev->layout; 2971 mddev->new_chunk_sectors = mddev->chunk_sectors; 2972 mddev->raid_disks -= mddev->delta_disks; 2973 mddev->delta_disks = 0; 2974 module_put(pers->owner); 2975 printk(KERN_WARNING "md: %s: %s would not accept array\n", 2976 mdname(mddev), level); 2977 return PTR_ERR(priv); 2978 } 2979 2980 /* Looks like we have a winner */ 2981 mddev_suspend(mddev); 2982 mddev->pers->stop(mddev); 2983 module_put(mddev->pers->owner); 2984 /* Invalidate devices that are now superfluous */ 2985 list_for_each_entry(rdev, &mddev->disks, same_set) 2986 if (rdev->raid_disk >= mddev->raid_disks) { 2987 rdev->raid_disk = -1; 2988 clear_bit(In_sync, &rdev->flags); 2989 } 2990 mddev->pers = pers; 2991 mddev->private = priv; 2992 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 2993 mddev->level = mddev->new_level; 2994 mddev->layout = mddev->new_layout; 2995 mddev->chunk_sectors = mddev->new_chunk_sectors; 2996 mddev->delta_disks = 0; 2997 pers->run(mddev); 2998 mddev_resume(mddev); 2999 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3000 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3001 md_wakeup_thread(mddev->thread); 3002 return rv; 3003 } 3004 3005 static struct md_sysfs_entry md_level = 3006 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 3007 3008 3009 static ssize_t 3010 layout_show(mddev_t *mddev, char *page) 3011 { 3012 /* just a number, not meaningful for all levels */ 3013 if (mddev->reshape_position != MaxSector && 3014 mddev->layout != mddev->new_layout) 3015 return sprintf(page, "%d (%d)\n", 3016 mddev->new_layout, mddev->layout); 3017 return sprintf(page, "%d\n", mddev->layout); 3018 } 3019 3020 static ssize_t 3021 layout_store(mddev_t *mddev, const char *buf, size_t len) 3022 { 3023 char *e; 3024 unsigned long n = simple_strtoul(buf, &e, 10); 3025 3026 if (!*buf || (*e && *e != '\n')) 3027 return -EINVAL; 3028 3029 if (mddev->pers) { 3030 int err; 3031 if (mddev->pers->check_reshape == NULL) 3032 return -EBUSY; 3033 mddev->new_layout = n; 3034 err = mddev->pers->check_reshape(mddev); 3035 if (err) { 3036 mddev->new_layout = mddev->layout; 3037 return err; 3038 } 3039 } else { 3040 mddev->new_layout = n; 3041 if (mddev->reshape_position == MaxSector) 3042 mddev->layout = n; 3043 } 3044 return len; 3045 } 3046 static struct md_sysfs_entry md_layout = 3047 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 3048 3049 3050 static ssize_t 3051 raid_disks_show(mddev_t *mddev, char *page) 3052 { 3053 if (mddev->raid_disks == 0) 3054 return 0; 3055 if (mddev->reshape_position != MaxSector && 3056 mddev->delta_disks != 0) 3057 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 3058 mddev->raid_disks - mddev->delta_disks); 3059 return sprintf(page, "%d\n", mddev->raid_disks); 3060 } 3061 3062 static int update_raid_disks(mddev_t *mddev, int raid_disks); 3063 3064 static ssize_t 3065 raid_disks_store(mddev_t *mddev, const char *buf, size_t len) 3066 { 3067 char *e; 3068 int rv = 0; 3069 unsigned long n = simple_strtoul(buf, &e, 10); 3070 3071 if (!*buf || (*e && *e != '\n')) 3072 return 
-EINVAL; 3073 3074 if (mddev->pers) 3075 rv = update_raid_disks(mddev, n); 3076 else if (mddev->reshape_position != MaxSector) { 3077 int olddisks = mddev->raid_disks - mddev->delta_disks; 3078 mddev->delta_disks = n - olddisks; 3079 mddev->raid_disks = n; 3080 } else 3081 mddev->raid_disks = n; 3082 return rv ? rv : len; 3083 } 3084 static struct md_sysfs_entry md_raid_disks = 3085 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 3086 3087 static ssize_t 3088 chunk_size_show(mddev_t *mddev, char *page) 3089 { 3090 if (mddev->reshape_position != MaxSector && 3091 mddev->chunk_sectors != mddev->new_chunk_sectors) 3092 return sprintf(page, "%d (%d)\n", 3093 mddev->new_chunk_sectors << 9, 3094 mddev->chunk_sectors << 9); 3095 return sprintf(page, "%d\n", mddev->chunk_sectors << 9); 3096 } 3097 3098 static ssize_t 3099 chunk_size_store(mddev_t *mddev, const char *buf, size_t len) 3100 { 3101 char *e; 3102 unsigned long n = simple_strtoul(buf, &e, 10); 3103 3104 if (!*buf || (*e && *e != '\n')) 3105 return -EINVAL; 3106 3107 if (mddev->pers) { 3108 int err; 3109 if (mddev->pers->check_reshape == NULL) 3110 return -EBUSY; 3111 mddev->new_chunk_sectors = n >> 9; 3112 err = mddev->pers->check_reshape(mddev); 3113 if (err) { 3114 mddev->new_chunk_sectors = mddev->chunk_sectors; 3115 return err; 3116 } 3117 } else { 3118 mddev->new_chunk_sectors = n >> 9; 3119 if (mddev->reshape_position == MaxSector) 3120 mddev->chunk_sectors = n >> 9; 3121 } 3122 return len; 3123 } 3124 static struct md_sysfs_entry md_chunk_size = 3125 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 3126 3127 static ssize_t 3128 resync_start_show(mddev_t *mddev, char *page) 3129 { 3130 if (mddev->recovery_cp == MaxSector) 3131 return sprintf(page, "none\n"); 3132 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 3133 } 3134 3135 static ssize_t 3136 resync_start_store(mddev_t *mddev, const char *buf, size_t len) 3137 { 3138 char *e; 3139 unsigned long long n = simple_strtoull(buf, &e, 10); 3140 3141 if (mddev->pers) 3142 return -EBUSY; 3143 if (cmd_match(buf, "none")) 3144 n = MaxSector; 3145 else if (!*buf || (*e && *e != '\n')) 3146 return -EINVAL; 3147 3148 mddev->recovery_cp = n; 3149 return len; 3150 } 3151 static struct md_sysfs_entry md_resync_start = 3152 __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); 3153 3154 /* 3155 * The array state can be: 3156 * 3157 * clear 3158 * No devices, no size, no level 3159 * Equivalent to STOP_ARRAY ioctl 3160 * inactive 3161 * May have some settings, but array is not active 3162 * all IO results in error 3163 * When written, doesn't tear down array, but just stops it 3164 * suspended (not supported yet) 3165 * All IO requests will block. The array can be reconfigured. 3166 * Writing this, if accepted, will block until array is quiescent 3167 * readonly 3168 * no resync can happen. no superblocks get written. 3169 * write requests fail 3170 * read-auto 3171 * like readonly, but behaves like 'clean' on a write request. 3172 * 3173 * clean - no pending writes, but otherwise active. 3174 * When written to inactive array, starts without resync 3175 * If a write request arrives then 3176 * if metadata is known, mark 'dirty' and switch to 'active'. 3177 * if not known, block and switch to write-pending 3178 * If written to an active array that has pending writes, then fails. 3179 * active 3180 * fully active: IO and resync can be happening. 
3181 * When written to inactive array, starts with resync 3182 * 3183 * write-pending 3184 * clean, but writes are blocked waiting for 'active' to be written. 3185 * 3186 * active-idle 3187 * like active, but no writes have been seen for a while (100msec). 3188 * 3189 */ 3190 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 3191 write_pending, active_idle, bad_word}; 3192 static char *array_states[] = { 3193 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 3194 "write-pending", "active-idle", NULL }; 3195 3196 static int match_word(const char *word, char **list) 3197 { 3198 int n; 3199 for (n=0; list[n]; n++) 3200 if (cmd_match(word, list[n])) 3201 break; 3202 return n; 3203 } 3204 3205 static ssize_t 3206 array_state_show(mddev_t *mddev, char *page) 3207 { 3208 enum array_state st = inactive; 3209 3210 if (mddev->pers) 3211 switch(mddev->ro) { 3212 case 1: 3213 st = readonly; 3214 break; 3215 case 2: 3216 st = read_auto; 3217 break; 3218 case 0: 3219 if (mddev->in_sync) 3220 st = clean; 3221 else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) 3222 st = write_pending; 3223 else if (mddev->safemode) 3224 st = active_idle; 3225 else 3226 st = active; 3227 } 3228 else { 3229 if (list_empty(&mddev->disks) && 3230 mddev->raid_disks == 0 && 3231 mddev->dev_sectors == 0) 3232 st = clear; 3233 else 3234 st = inactive; 3235 } 3236 return sprintf(page, "%s\n", array_states[st]); 3237 } 3238 3239 static int do_md_stop(mddev_t * mddev, int ro, int is_open); 3240 static int do_md_run(mddev_t * mddev); 3241 static int restart_array(mddev_t *mddev); 3242 3243 static ssize_t 3244 array_state_store(mddev_t *mddev, const char *buf, size_t len) 3245 { 3246 int err = -EINVAL; 3247 enum array_state st = match_word(buf, array_states); 3248 switch(st) { 3249 case bad_word: 3250 break; 3251 case clear: 3252 /* stopping an active array */ 3253 if (atomic_read(&mddev->openers) > 0) 3254 return -EBUSY; 3255 err = do_md_stop(mddev, 0, 0); 3256 break; 3257 case inactive: 3258 /* stopping an active array */ 3259 if (mddev->pers) { 3260 if (atomic_read(&mddev->openers) > 0) 3261 return -EBUSY; 3262 err = do_md_stop(mddev, 2, 0); 3263 } else 3264 err = 0; /* already inactive */ 3265 break; 3266 case suspended: 3267 break; /* not supported yet */ 3268 case readonly: 3269 if (mddev->pers) 3270 err = do_md_stop(mddev, 1, 0); 3271 else { 3272 mddev->ro = 1; 3273 set_disk_ro(mddev->gendisk, 1); 3274 err = do_md_run(mddev); 3275 } 3276 break; 3277 case read_auto: 3278 if (mddev->pers) { 3279 if (mddev->ro == 0) 3280 err = do_md_stop(mddev, 1, 0); 3281 else if (mddev->ro == 1) 3282 err = restart_array(mddev); 3283 if (err == 0) { 3284 mddev->ro = 2; 3285 set_disk_ro(mddev->gendisk, 0); 3286 } 3287 } else { 3288 mddev->ro = 2; 3289 err = do_md_run(mddev); 3290 } 3291 break; 3292 case clean: 3293 if (mddev->pers) { 3294 restart_array(mddev); 3295 spin_lock_irq(&mddev->write_lock); 3296 if (atomic_read(&mddev->writes_pending) == 0) { 3297 if (mddev->in_sync == 0) { 3298 mddev->in_sync = 1; 3299 if (mddev->safemode == 1) 3300 mddev->safemode = 0; 3301 if (mddev->persistent) 3302 set_bit(MD_CHANGE_CLEAN, 3303 &mddev->flags); 3304 } 3305 err = 0; 3306 } else 3307 err = -EBUSY; 3308 spin_unlock_irq(&mddev->write_lock); 3309 } else 3310 err = -EINVAL; 3311 break; 3312 case active: 3313 if (mddev->pers) { 3314 restart_array(mddev); 3315 if (mddev->external) 3316 clear_bit(MD_CHANGE_CLEAN, &mddev->flags); 3317 wake_up(&mddev->sb_wait); 3318 err = 0; 3319 } else { 3320 
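/* (Editor's illustration, not in the original source.)
 * Writing "active" to an inactive array starts it read-write,
 * e.g.:  echo active > /sys/block/md0/md/array_state
 */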
mddev->ro = 0;
3321 set_disk_ro(mddev->gendisk, 0);
3322 err = do_md_run(mddev);
3323 }
3324 break;
3325 case write_pending:
3326 case active_idle:
3327 /* these cannot be set */
3328 break;
3329 }
3330 if (err)
3331 return err;
3332 else {
3333 sysfs_notify_dirent(mddev->sysfs_state);
3334 return len;
3335 }
3336 }
3337 static struct md_sysfs_entry md_array_state =
3338 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
3339 
3340 static ssize_t
3341 max_corrected_read_errors_show(mddev_t *mddev, char *page) {
3342 return sprintf(page, "%d\n",
3343 atomic_read(&mddev->max_corr_read_errors));
3344 }
3345 
3346 static ssize_t
3347 max_corrected_read_errors_store(mddev_t *mddev, const char *buf, size_t len)
3348 {
3349 char *e;
3350 unsigned long n = simple_strtoul(buf, &e, 10);
3351 
3352 if (*buf && (*e == 0 || *e == '\n')) {
3353 atomic_set(&mddev->max_corr_read_errors, n);
3354 return len;
3355 }
3356 return -EINVAL;
3357 }
3358 
3359 static struct md_sysfs_entry max_corr_read_errors =
3360 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
3361 max_corrected_read_errors_store);
3362 
3363 static ssize_t
3364 null_show(mddev_t *mddev, char *page)
3365 {
3366 return -EINVAL;
3367 }
3368 
3369 static ssize_t
3370 new_dev_store(mddev_t *mddev, const char *buf, size_t len)
3371 {
3372 /* buf must be %d:%d, optionally followed by \n, giving major and minor numbers */
3373 /* The new device is added to the array.
3374 * If the array has a persistent superblock, we read the
3375 * superblock to initialise info and check validity.
3376 * Otherwise, the only checking done is that in bind_rdev_to_array,
3377 * which mainly checks size.
3378 */
3379 char *e;
3380 int major = simple_strtoul(buf, &e, 10);
3381 int minor;
3382 dev_t dev;
3383 mdk_rdev_t *rdev;
3384 int err;
3385 
3386 if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
3387 return -EINVAL;
3388 minor = simple_strtoul(e+1, &e, 10);
3389 if (*e && *e != '\n')
3390 return -EINVAL;
3391 dev = MKDEV(major, minor);
3392 if (major != MAJOR(dev) ||
3393 minor != MINOR(dev))
3394 return -EOVERFLOW;
3395 
3396 
3397 if (mddev->persistent) {
3398 rdev = md_import_device(dev, mddev->major_version,
3399 mddev->minor_version);
3400 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
3401 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
3402 mdk_rdev_t, same_set);
3403 err = super_types[mddev->major_version]
3404 .load_super(rdev, rdev0, mddev->minor_version);
3405 if (err < 0)
3406 goto out;
3407 }
3408 } else if (mddev->external)
3409 rdev = md_import_device(dev, -2, -1);
3410 else
3411 rdev = md_import_device(dev, -1, -1);
3412 
3413 if (IS_ERR(rdev))
3414 return PTR_ERR(rdev);
3415 err = bind_rdev_to_array(rdev, mddev);
3416 out:
3417 if (err)
3418 export_rdev(rdev);
3419 return err ? err : len;
3420 }
3421 
3422 static struct md_sysfs_entry md_new_device =
3423 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
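/* (Editor's illustration, not in the original source; numbers are
 * examples.)  Writing "8:16" (or "8:16\n") to .../md/new_dev asks md
 * to import the device with major 8, minor 16 -- conventionally
 * /dev/sdb -- and bind it to this array.
 */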
3424 
3425 static ssize_t
3426 bitmap_store(mddev_t *mddev, const char *buf, size_t len)
3427 {
3428 char *end;
3429 unsigned long chunk, end_chunk;
3430 
3431 if (!mddev->bitmap)
3432 goto out;
3433 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
3434 while (*buf) {
3435 chunk = end_chunk = simple_strtoul(buf, &end, 0);
3436 if (buf == end) break;
3437 if (*end == '-') { /* range */
3438 buf = end + 1;
3439 end_chunk = simple_strtoul(buf, &end, 0);
3440 if (buf == end) break;
3441 }
3442 if (*end && !isspace(*end)) break;
3443 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
3444 buf = skip_spaces(end);
3445 }
3446 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
3447 out:
3448 return len;
3449 }
3450 
3451 static struct md_sysfs_entry md_bitmap =
3452 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
3453 
3454 static ssize_t
3455 size_show(mddev_t *mddev, char *page)
3456 {
3457 return sprintf(page, "%llu\n",
3458 (unsigned long long)mddev->dev_sectors / 2);
3459 }
3460 
3461 static int update_size(mddev_t *mddev, sector_t num_sectors);
3462 
3463 static ssize_t
3464 size_store(mddev_t *mddev, const char *buf, size_t len)
3465 {
3466 /* If array is inactive, we can reduce the component size, but
3467 * not increase it (except from 0).
3468 * If array is active, we can try an on-line resize
3469 */
3470 sector_t sectors;
3471 int err = strict_blocks_to_sectors(buf, &sectors);
3472 
3473 if (err < 0)
3474 return err;
3475 if (mddev->pers) {
3476 err = update_size(mddev, sectors);
3477 md_update_sb(mddev, 1);
3478 } else {
3479 if (mddev->dev_sectors == 0 ||
3480 mddev->dev_sectors > sectors)
3481 mddev->dev_sectors = sectors;
3482 else
3483 err = -ENOSPC;
3484 }
3485 return err ? err : len;
3486 }
3487 
3488 static struct md_sysfs_entry md_size =
3489 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
3490 
3491 
3492 /* Metadata version.
3493 * This is one of
3494 * 'none' for arrays with no metadata (good luck...)
3495 * 'external' for arrays with externally managed metadata,
3496 * or N.M for internally known formats
3497 */
3498 static ssize_t
3499 metadata_show(mddev_t *mddev, char *page)
3500 {
3501 if (mddev->persistent)
3502 return sprintf(page, "%d.%d\n",
3503 mddev->major_version, mddev->minor_version);
3504 else if (mddev->external)
3505 return sprintf(page, "external:%s\n", mddev->metadata_type);
3506 else
3507 return sprintf(page, "none\n");
3508 }
3509 
3510 static ssize_t
3511 metadata_store(mddev_t *mddev, const char *buf, size_t len)
3512 {
3513 int major, minor;
3514 char *e;
3515 /* Changing the details of 'external' metadata is
3516 * always permitted. Otherwise there must be
3517 * no devices attached to the array.
3518 */ 3519 if (mddev->external && strncmp(buf, "external:", 9) == 0) 3520 ; 3521 else if (!list_empty(&mddev->disks)) 3522 return -EBUSY; 3523 3524 if (cmd_match(buf, "none")) { 3525 mddev->persistent = 0; 3526 mddev->external = 0; 3527 mddev->major_version = 0; 3528 mddev->minor_version = 90; 3529 return len; 3530 } 3531 if (strncmp(buf, "external:", 9) == 0) { 3532 size_t namelen = len-9; 3533 if (namelen >= sizeof(mddev->metadata_type)) 3534 namelen = sizeof(mddev->metadata_type)-1; 3535 strncpy(mddev->metadata_type, buf+9, namelen); 3536 mddev->metadata_type[namelen] = 0; 3537 if (namelen && mddev->metadata_type[namelen-1] == '\n') 3538 mddev->metadata_type[--namelen] = 0; 3539 mddev->persistent = 0; 3540 mddev->external = 1; 3541 mddev->major_version = 0; 3542 mddev->minor_version = 90; 3543 return len; 3544 } 3545 major = simple_strtoul(buf, &e, 10); 3546 if (e==buf || *e != '.') 3547 return -EINVAL; 3548 buf = e+1; 3549 minor = simple_strtoul(buf, &e, 10); 3550 if (e==buf || (*e && *e != '\n') ) 3551 return -EINVAL; 3552 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 3553 return -ENOENT; 3554 mddev->major_version = major; 3555 mddev->minor_version = minor; 3556 mddev->persistent = 1; 3557 mddev->external = 0; 3558 return len; 3559 } 3560 3561 static struct md_sysfs_entry md_metadata = 3562 __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 3563 3564 static ssize_t 3565 action_show(mddev_t *mddev, char *page) 3566 { 3567 char *type = "idle"; 3568 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 3569 type = "frozen"; 3570 else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3571 (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { 3572 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 3573 type = "reshape"; 3574 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 3575 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 3576 type = "resync"; 3577 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 3578 type = "check"; 3579 else 3580 type = "repair"; 3581 } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) 3582 type = "recover"; 3583 } 3584 return sprintf(page, "%s\n", type); 3585 } 3586 3587 static ssize_t 3588 action_store(mddev_t *mddev, const char *page, size_t len) 3589 { 3590 if (!mddev->pers || !mddev->pers->sync_request) 3591 return -EINVAL; 3592 3593 if (cmd_match(page, "frozen")) 3594 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3595 else 3596 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3597 3598 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { 3599 if (mddev->sync_thread) { 3600 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3601 md_unregister_thread(mddev->sync_thread); 3602 mddev->sync_thread = NULL; 3603 mddev->recovery = 0; 3604 } 3605 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3606 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 3607 return -EBUSY; 3608 else if (cmd_match(page, "resync")) 3609 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3610 else if (cmd_match(page, "recover")) { 3611 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 3612 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3613 } else if (cmd_match(page, "reshape")) { 3614 int err; 3615 if (mddev->pers->start_reshape == NULL) 3616 return -EINVAL; 3617 err = mddev->pers->start_reshape(mddev); 3618 if (err) 3619 return err; 3620 sysfs_notify(&mddev->kobj, NULL, "degraded"); 3621 } else { 3622 if (cmd_match(page, "check")) 3623 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 3624 
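/* (Editor's note, not in the original source.)  Both "check" and
 * "repair" fall through to a REQUESTED sync below; with
 * MD_RECOVERY_CHECK set the pass only counts mismatches (reported
 * via mismatch_cnt), while plain "repair" also rewrites them.
 */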
else if (!cmd_match(page, "repair")) 3625 return -EINVAL; 3626 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 3627 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 3628 } 3629 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3630 md_wakeup_thread(mddev->thread); 3631 sysfs_notify_dirent(mddev->sysfs_action); 3632 return len; 3633 } 3634 3635 static ssize_t 3636 mismatch_cnt_show(mddev_t *mddev, char *page) 3637 { 3638 return sprintf(page, "%llu\n", 3639 (unsigned long long) mddev->resync_mismatches); 3640 } 3641 3642 static struct md_sysfs_entry md_scan_mode = 3643 __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 3644 3645 3646 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 3647 3648 static ssize_t 3649 sync_min_show(mddev_t *mddev, char *page) 3650 { 3651 return sprintf(page, "%d (%s)\n", speed_min(mddev), 3652 mddev->sync_speed_min ? "local": "system"); 3653 } 3654 3655 static ssize_t 3656 sync_min_store(mddev_t *mddev, const char *buf, size_t len) 3657 { 3658 int min; 3659 char *e; 3660 if (strncmp(buf, "system", 6)==0) { 3661 mddev->sync_speed_min = 0; 3662 return len; 3663 } 3664 min = simple_strtoul(buf, &e, 10); 3665 if (buf == e || (*e && *e != '\n') || min <= 0) 3666 return -EINVAL; 3667 mddev->sync_speed_min = min; 3668 return len; 3669 } 3670 3671 static struct md_sysfs_entry md_sync_min = 3672 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 3673 3674 static ssize_t 3675 sync_max_show(mddev_t *mddev, char *page) 3676 { 3677 return sprintf(page, "%d (%s)\n", speed_max(mddev), 3678 mddev->sync_speed_max ? "local": "system"); 3679 } 3680 3681 static ssize_t 3682 sync_max_store(mddev_t *mddev, const char *buf, size_t len) 3683 { 3684 int max; 3685 char *e; 3686 if (strncmp(buf, "system", 6)==0) { 3687 mddev->sync_speed_max = 0; 3688 return len; 3689 } 3690 max = simple_strtoul(buf, &e, 10); 3691 if (buf == e || (*e && *e != '\n') || max <= 0) 3692 return -EINVAL; 3693 mddev->sync_speed_max = max; 3694 return len; 3695 } 3696 3697 static struct md_sysfs_entry md_sync_max = 3698 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 3699 3700 static ssize_t 3701 degraded_show(mddev_t *mddev, char *page) 3702 { 3703 return sprintf(page, "%d\n", mddev->degraded); 3704 } 3705 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 3706 3707 static ssize_t 3708 sync_force_parallel_show(mddev_t *mddev, char *page) 3709 { 3710 return sprintf(page, "%d\n", mddev->parallel_resync); 3711 } 3712 3713 static ssize_t 3714 sync_force_parallel_store(mddev_t *mddev, const char *buf, size_t len) 3715 { 3716 long n; 3717 3718 if (strict_strtol(buf, 10, &n)) 3719 return -EINVAL; 3720 3721 if (n != 0 && n != 1) 3722 return -EINVAL; 3723 3724 mddev->parallel_resync = n; 3725 3726 if (mddev->sync_thread) 3727 wake_up(&resync_wait); 3728 3729 return len; 3730 } 3731 3732 /* force parallel resync, even with shared block devices */ 3733 static struct md_sysfs_entry md_sync_force_parallel = 3734 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 3735 sync_force_parallel_show, sync_force_parallel_store); 3736 3737 static ssize_t 3738 sync_speed_show(mddev_t *mddev, char *page) 3739 { 3740 unsigned long resync, dt, db; 3741 if (mddev->curr_resync == 0) 3742 return sprintf(page, "none\n"); 3743 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 3744 dt = (jiffies - mddev->resync_mark) / HZ; 3745 if (!dt) dt++; 3746 db = resync - mddev->resync_mark_cnt; 3747 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 3748 } 
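/* (Editor's illustration, not in the original source.)
 * sync_speed_show() above reports the rate over the current marking
 * window: db is sectors completed since resync_mark, dt is elapsed
 * seconds, and db/dt/2 converts 512-byte sectors per second into
 * KiB/sec.  E.g. 20480 sectors in 10 seconds reads as 1024 (K/sec).
 */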
3749 3750 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 3751 3752 static ssize_t 3753 sync_completed_show(mddev_t *mddev, char *page) 3754 { 3755 unsigned long max_sectors, resync; 3756 3757 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3758 return sprintf(page, "none\n"); 3759 3760 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3761 max_sectors = mddev->resync_max_sectors; 3762 else 3763 max_sectors = mddev->dev_sectors; 3764 3765 resync = mddev->curr_resync_completed; 3766 return sprintf(page, "%lu / %lu\n", resync, max_sectors); 3767 } 3768 3769 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); 3770 3771 static ssize_t 3772 min_sync_show(mddev_t *mddev, char *page) 3773 { 3774 return sprintf(page, "%llu\n", 3775 (unsigned long long)mddev->resync_min); 3776 } 3777 static ssize_t 3778 min_sync_store(mddev_t *mddev, const char *buf, size_t len) 3779 { 3780 unsigned long long min; 3781 if (strict_strtoull(buf, 10, &min)) 3782 return -EINVAL; 3783 if (min > mddev->resync_max) 3784 return -EINVAL; 3785 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3786 return -EBUSY; 3787 3788 /* Must be a multiple of chunk_size */ 3789 if (mddev->chunk_sectors) { 3790 sector_t temp = min; 3791 if (sector_div(temp, mddev->chunk_sectors)) 3792 return -EINVAL; 3793 } 3794 mddev->resync_min = min; 3795 3796 return len; 3797 } 3798 3799 static struct md_sysfs_entry md_min_sync = 3800 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 3801 3802 static ssize_t 3803 max_sync_show(mddev_t *mddev, char *page) 3804 { 3805 if (mddev->resync_max == MaxSector) 3806 return sprintf(page, "max\n"); 3807 else 3808 return sprintf(page, "%llu\n", 3809 (unsigned long long)mddev->resync_max); 3810 } 3811 static ssize_t 3812 max_sync_store(mddev_t *mddev, const char *buf, size_t len) 3813 { 3814 if (strncmp(buf, "max", 3) == 0) 3815 mddev->resync_max = MaxSector; 3816 else { 3817 unsigned long long max; 3818 if (strict_strtoull(buf, 10, &max)) 3819 return -EINVAL; 3820 if (max < mddev->resync_min) 3821 return -EINVAL; 3822 if (max < mddev->resync_max && 3823 mddev->ro == 0 && 3824 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3825 return -EBUSY; 3826 3827 /* Must be a multiple of chunk_size */ 3828 if (mddev->chunk_sectors) { 3829 sector_t temp = max; 3830 if (sector_div(temp, mddev->chunk_sectors)) 3831 return -EINVAL; 3832 } 3833 mddev->resync_max = max; 3834 } 3835 wake_up(&mddev->recovery_wait); 3836 return len; 3837 } 3838 3839 static struct md_sysfs_entry md_max_sync = 3840 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 3841 3842 static ssize_t 3843 suspend_lo_show(mddev_t *mddev, char *page) 3844 { 3845 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 3846 } 3847 3848 static ssize_t 3849 suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) 3850 { 3851 char *e; 3852 unsigned long long new = simple_strtoull(buf, &e, 10); 3853 3854 if (mddev->pers == NULL || 3855 mddev->pers->quiesce == NULL) 3856 return -EINVAL; 3857 if (buf == e || (*e && *e != '\n')) 3858 return -EINVAL; 3859 if (new >= mddev->suspend_hi || 3860 (new > mddev->suspend_lo && new < mddev->suspend_hi)) { 3861 mddev->suspend_lo = new; 3862 mddev->pers->quiesce(mddev, 2); 3863 return len; 3864 } else 3865 return -EINVAL; 3866 } 3867 static struct md_sysfs_entry md_suspend_lo = 3868 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 3869 3870 3871 static ssize_t 3872 suspend_hi_show(mddev_t *mddev, char *page) 
3873 {
3874 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
3875 }
3876 
3877 static ssize_t
3878 suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
3879 {
3880 char *e;
3881 unsigned long long new = simple_strtoull(buf, &e, 10);
3882 
3883 if (mddev->pers == NULL ||
3884 mddev->pers->quiesce == NULL)
3885 return -EINVAL;
3886 if (buf == e || (*e && *e != '\n'))
3887 return -EINVAL;
3888 if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
3889 (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
3890 mddev->suspend_hi = new;
3891 mddev->pers->quiesce(mddev, 1);
3892 mddev->pers->quiesce(mddev, 0);
3893 return len;
3894 } else
3895 return -EINVAL;
3896 }
3897 static struct md_sysfs_entry md_suspend_hi =
3898 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
3899 
3900 static ssize_t
3901 reshape_position_show(mddev_t *mddev, char *page)
3902 {
3903 if (mddev->reshape_position != MaxSector)
3904 return sprintf(page, "%llu\n",
3905 (unsigned long long)mddev->reshape_position);
3906 strcpy(page, "none\n");
3907 return 5;
3908 }
3909 
3910 static ssize_t
3911 reshape_position_store(mddev_t *mddev, const char *buf, size_t len)
3912 {
3913 char *e;
3914 unsigned long long new = simple_strtoull(buf, &e, 10);
3915 if (mddev->pers)
3916 return -EBUSY;
3917 if (buf == e || (*e && *e != '\n'))
3918 return -EINVAL;
3919 mddev->reshape_position = new;
3920 mddev->delta_disks = 0;
3921 mddev->new_level = mddev->level;
3922 mddev->new_layout = mddev->layout;
3923 mddev->new_chunk_sectors = mddev->chunk_sectors;
3924 return len;
3925 }
3926 
3927 static struct md_sysfs_entry md_reshape_position =
3928 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
3929 reshape_position_store);
3930 
3931 static ssize_t
3932 array_size_show(mddev_t *mddev, char *page)
3933 {
3934 if (mddev->external_size)
3935 return sprintf(page, "%llu\n",
3936 (unsigned long long)mddev->array_sectors/2);
3937 else
3938 return sprintf(page, "default\n");
3939 }
3940 
3941 static ssize_t
3942 array_size_store(mddev_t *mddev, const char *buf, size_t len)
3943 {
3944 sector_t sectors;
3945 
3946 if (strncmp(buf, "default", 7) == 0) {
3947 if (mddev->pers)
3948 sectors = mddev->pers->size(mddev, 0, 0);
3949 else
3950 sectors = mddev->array_sectors;
3951 
3952 mddev->external_size = 0;
3953 } else {
3954 if (strict_blocks_to_sectors(buf, &sectors) < 0)
3955 return -EINVAL;
3956 if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
3957 return -E2BIG;
3958 
3959 mddev->external_size = 1;
3960 }
3961 
3962 mddev->array_sectors = sectors;
3963 set_capacity(mddev->gendisk, mddev->array_sectors);
3964 if (mddev->pers)
3965 revalidate_disk(mddev->gendisk);
3966 
3967 return len;
3968 }
3969 
3970 static struct md_sysfs_entry md_array_size =
3971 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
3972 array_size_store);
3973 
3974 static struct attribute *md_default_attrs[] = {
3975 &md_level.attr,
3976 &md_layout.attr,
3977 &md_raid_disks.attr,
3978 &md_chunk_size.attr,
3979 &md_size.attr,
3980 &md_resync_start.attr,
3981 &md_metadata.attr,
3982 &md_new_device.attr,
3983 &md_safe_delay.attr,
3984 &md_array_state.attr,
3985 &md_reshape_position.attr,
3986 &md_array_size.attr,
3987 &max_corr_read_errors.attr,
3988 NULL,
3989 };
3990 
3991 static struct attribute *md_redundancy_attrs[] = {
3992 &md_scan_mode.attr,
3993 &md_mismatches.attr,
3994 &md_sync_min.attr,
3995 &md_sync_max.attr,
3996 &md_sync_speed.attr,
3997 &md_sync_force_parallel.attr,
3998 &md_sync_completed.attr,
3999 &md_min_sync.attr,
4000 &md_max_sync.attr,
4001 &md_suspend_lo.attr,
4002 &md_suspend_hi.attr,
4003 &md_bitmap.attr,
4004 &md_degraded.attr,
4005 NULL,
4006 };
4007 static struct attribute_group md_redundancy_group = {
4008 .name = NULL,
4009 .attrs = md_redundancy_attrs,
4010 };
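/* (Editor's note, not in the original source.)  md_default_attrs is
 * part of md_ktype and so present for every array, while
 * md_redundancy_group is attached only once an array is running with
 * a personality that can resync, and is removed again in
 * mddev_delayed_delete() below.
 */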
&md_sync_completed.attr, 3999 &md_min_sync.attr, 4000 &md_max_sync.attr, 4001 &md_suspend_lo.attr, 4002 &md_suspend_hi.attr, 4003 &md_bitmap.attr, 4004 &md_degraded.attr, 4005 NULL, 4006 }; 4007 static struct attribute_group md_redundancy_group = { 4008 .name = NULL, 4009 .attrs = md_redundancy_attrs, 4010 }; 4011 4012 4013 static ssize_t 4014 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 4015 { 4016 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 4017 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 4018 ssize_t rv; 4019 4020 if (!entry->show) 4021 return -EIO; 4022 rv = mddev_lock(mddev); 4023 if (!rv) { 4024 rv = entry->show(mddev, page); 4025 mddev_unlock(mddev); 4026 } 4027 return rv; 4028 } 4029 4030 static ssize_t 4031 md_attr_store(struct kobject *kobj, struct attribute *attr, 4032 const char *page, size_t length) 4033 { 4034 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 4035 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 4036 ssize_t rv; 4037 4038 if (!entry->store) 4039 return -EIO; 4040 if (!capable(CAP_SYS_ADMIN)) 4041 return -EACCES; 4042 rv = mddev_lock(mddev); 4043 if (mddev->hold_active == UNTIL_IOCTL) 4044 mddev->hold_active = 0; 4045 if (!rv) { 4046 rv = entry->store(mddev, page, length); 4047 mddev_unlock(mddev); 4048 } 4049 return rv; 4050 } 4051 4052 static void md_free(struct kobject *ko) 4053 { 4054 mddev_t *mddev = container_of(ko, mddev_t, kobj); 4055 4056 if (mddev->sysfs_state) 4057 sysfs_put(mddev->sysfs_state); 4058 4059 if (mddev->gendisk) { 4060 del_gendisk(mddev->gendisk); 4061 put_disk(mddev->gendisk); 4062 } 4063 if (mddev->queue) 4064 blk_cleanup_queue(mddev->queue); 4065 4066 kfree(mddev); 4067 } 4068 4069 static const struct sysfs_ops md_sysfs_ops = { 4070 .show = md_attr_show, 4071 .store = md_attr_store, 4072 }; 4073 static struct kobj_type md_ktype = { 4074 .release = md_free, 4075 .sysfs_ops = &md_sysfs_ops, 4076 .default_attrs = md_default_attrs, 4077 }; 4078 4079 int mdp_major = 0; 4080 4081 static void mddev_delayed_delete(struct work_struct *ws) 4082 { 4083 mddev_t *mddev = container_of(ws, mddev_t, del_work); 4084 4085 if (mddev->private) { 4086 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 4087 if (mddev->private != (void*)1) 4088 sysfs_remove_group(&mddev->kobj, mddev->private); 4089 if (mddev->sysfs_action) 4090 sysfs_put(mddev->sysfs_action); 4091 mddev->sysfs_action = NULL; 4092 mddev->private = NULL; 4093 } 4094 sysfs_remove_group(&mddev->kobj, &md_bitmap_group); 4095 kobject_del(&mddev->kobj); 4096 kobject_put(&mddev->kobj); 4097 } 4098 4099 static int md_alloc(dev_t dev, char *name) 4100 { 4101 static DEFINE_MUTEX(disks_mutex); 4102 mddev_t *mddev = mddev_find(dev); 4103 struct gendisk *disk; 4104 int partitioned; 4105 int shift; 4106 int unit; 4107 int error; 4108 4109 if (!mddev) 4110 return -ENODEV; 4111 4112 partitioned = (MAJOR(mddev->unit) != MD_MAJOR); 4113 shift = partitioned ? MdpMinorShift : 0; 4114 unit = MINOR(mddev->unit) >> shift; 4115 4116 /* wait for any previous instance of this device 4117 * to be completely removed (mddev_delayed_delete). 4118 */ 4119 flush_scheduled_work(); 4120 4121 mutex_lock(&disks_mutex); 4122 error = -EEXIST; 4123 if (mddev->gendisk) 4124 goto abort; 4125 4126 if (name) { 4127 /* Need to ensure that 'name' is not a duplicate.
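 * We scan all_mddevs under all_mddevs_lock and bail out (with the
 * -EEXIST prepared above) if another array's gendisk already
 * carries this name.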
4128 */ 4129 mddev_t *mddev2; 4130 spin_lock(&all_mddevs_lock); 4131 4132 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 4133 if (mddev2->gendisk && 4134 strcmp(mddev2->gendisk->disk_name, name) == 0) { 4135 spin_unlock(&all_mddevs_lock); 4136 goto abort; 4137 } 4138 spin_unlock(&all_mddevs_lock); 4139 } 4140 4141 error = -ENOMEM; 4142 mddev->queue = blk_alloc_queue(GFP_KERNEL); 4143 if (!mddev->queue) 4144 goto abort; 4145 mddev->queue->queuedata = mddev; 4146 4147 /* Can be unlocked because the queue is new: no concurrency */ 4148 queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue); 4149 4150 blk_queue_make_request(mddev->queue, md_make_request); 4151 4152 disk = alloc_disk(1 << shift); 4153 if (!disk) { 4154 blk_cleanup_queue(mddev->queue); 4155 mddev->queue = NULL; 4156 goto abort; 4157 } 4158 disk->major = MAJOR(mddev->unit); 4159 disk->first_minor = unit << shift; 4160 if (name) 4161 strcpy(disk->disk_name, name); 4162 else if (partitioned) 4163 sprintf(disk->disk_name, "md_d%d", unit); 4164 else 4165 sprintf(disk->disk_name, "md%d", unit); 4166 disk->fops = &md_fops; 4167 disk->private_data = mddev; 4168 disk->queue = mddev->queue; 4169 /* Allow extended partitions. This makes the 4170 * 'mdp' device redundant, but we can't really 4171 * remove it now. 4172 */ 4173 disk->flags |= GENHD_FL_EXT_DEVT; 4174 add_disk(disk); 4175 mddev->gendisk = disk; 4176 error = kobject_init_and_add(&mddev->kobj, &md_ktype, 4177 &disk_to_dev(disk)->kobj, "%s", "md"); 4178 if (error) { 4179 /* This isn't possible, but as kobject_init_and_add is marked 4180 * __must_check, we must do something with the result 4181 */ 4182 printk(KERN_WARNING "md: cannot register %s/md - name in use\n", 4183 disk->disk_name); 4184 error = 0; 4185 } 4186 if (sysfs_create_group(&mddev->kobj, &md_bitmap_group)) 4187 printk(KERN_DEBUG "pointless warning\n"); 4188 abort: 4189 mutex_unlock(&disks_mutex); 4190 if (!error) { 4191 kobject_uevent(&mddev->kobj, KOBJ_ADD); 4192 mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state"); 4193 } 4194 mddev_put(mddev); 4195 return error; 4196 } 4197 4198 static struct kobject *md_probe(dev_t dev, int *part, void *data) 4199 { 4200 md_alloc(dev, NULL); 4201 return NULL; 4202 } 4203 4204 static int add_named_array(const char *val, struct kernel_param *kp) 4205 { 4206 /* val must be "md_*" where * is not all digits. 4207 * We allocate an array with a large free minor number, and 4208 * set the name to val. val must not already be an active name. 4209 */ 4210 int len = strlen(val); 4211 char buf[DISK_NAME_LEN]; 4212 4213 while (len && val[len-1] == '\n') 4214 len--; 4215 if (len >= DISK_NAME_LEN) 4216 return -E2BIG; 4217 strlcpy(buf, val, len+1); 4218 if (strncmp(buf, "md_", 3) != 0) 4219 return -EINVAL; 4220 return md_alloc(0, buf); 4221 } 4222 4223 static void md_safemode_timeout(unsigned long data) 4224 { 4225 mddev_t *mddev = (mddev_t *) data; 4226 4227 if (!atomic_read(&mddev->writes_pending)) { 4228 mddev->safemode = 1; 4229 if (mddev->external) 4230 sysfs_notify_dirent(mddev->sysfs_state); 4231 } 4232 md_wakeup_thread(mddev->thread); 4233 } 4234 4235 static int start_dirty_degraded; 4236 4237 static int do_md_run(mddev_t * mddev) 4238 { 4239 int err; 4240 mdk_rdev_t *rdev; 4241 struct gendisk *disk; 4242 struct mdk_personality *pers; 4243 4244 if (list_empty(&mddev->disks)) 4245 /* cannot run an array with no devices.. 
*/ 4246 return -EINVAL; 4247 4248 if (mddev->pers) 4249 return -EBUSY; 4250 4251 /* 4252 * Analyze all RAID superblock(s) 4253 */ 4254 if (!mddev->raid_disks) { 4255 if (!mddev->persistent) 4256 return -EINVAL; 4257 analyze_sbs(mddev); 4258 } 4259 4260 if (mddev->level != LEVEL_NONE) 4261 request_module("md-level-%d", mddev->level); 4262 else if (mddev->clevel[0]) 4263 request_module("md-%s", mddev->clevel); 4264 4265 /* 4266 * Drop all container device buffers, from now on 4267 * the only valid external interface is through the md 4268 * device. 4269 */ 4270 list_for_each_entry(rdev, &mddev->disks, same_set) { 4271 if (test_bit(Faulty, &rdev->flags)) 4272 continue; 4273 sync_blockdev(rdev->bdev); 4274 invalidate_bdev(rdev->bdev); 4275 4276 /* perform some consistency tests on the device. 4277 * We don't want the data to overlap the metadata; 4278 * internal bitmap issues have been handled elsewhere. 4279 */ 4280 if (rdev->data_offset < rdev->sb_start) { 4281 if (mddev->dev_sectors && 4282 rdev->data_offset + mddev->dev_sectors 4283 > rdev->sb_start) { 4284 printk("md: %s: data overlaps metadata\n", 4285 mdname(mddev)); 4286 return -EINVAL; 4287 } 4288 } else { 4289 if (rdev->sb_start + rdev->sb_size/512 4290 > rdev->data_offset) { 4291 printk("md: %s: metadata overlaps data\n", 4292 mdname(mddev)); 4293 return -EINVAL; 4294 } 4295 } 4296 sysfs_notify_dirent(rdev->sysfs_state); 4297 } 4298 4299 disk = mddev->gendisk; 4300 4301 spin_lock(&pers_lock); 4302 pers = find_pers(mddev->level, mddev->clevel); 4303 if (!pers || !try_module_get(pers->owner)) { 4304 spin_unlock(&pers_lock); 4305 if (mddev->level != LEVEL_NONE) 4306 printk(KERN_WARNING "md: personality for level %d is not loaded!\n", 4307 mddev->level); 4308 else 4309 printk(KERN_WARNING "md: personality for level %s is not loaded!\n", 4310 mddev->clevel); 4311 return -EINVAL; 4312 } 4313 mddev->pers = pers; 4314 spin_unlock(&pers_lock); 4315 if (mddev->level != pers->level) { 4316 mddev->level = pers->level; 4317 mddev->new_level = pers->level; 4318 } 4319 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 4320 4321 if (mddev->reshape_position != MaxSector && 4322 pers->start_reshape == NULL) { 4323 /* This personality cannot handle reshaping... */ 4324 mddev->pers = NULL; 4325 module_put(pers->owner); 4326 return -EINVAL; 4327 } 4328 4329 if (pers->sync_request) { 4330 /* Warn if this is a potentially silly 4331 * configuration.
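 * e.g. two members of one array sitting on partitions of the same
 * physical disk, where a single disk failure would take out both
 * copies.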
4332 */ 4333 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 4334 mdk_rdev_t *rdev2; 4335 int warned = 0; 4336 4337 list_for_each_entry(rdev, &mddev->disks, same_set) 4338 list_for_each_entry(rdev2, &mddev->disks, same_set) { 4339 if (rdev < rdev2 && 4340 rdev->bdev->bd_contains == 4341 rdev2->bdev->bd_contains) { 4342 printk(KERN_WARNING 4343 "%s: WARNING: %s appears to be" 4344 " on the same physical disk as" 4345 " %s.\n", 4346 mdname(mddev), 4347 bdevname(rdev->bdev,b), 4348 bdevname(rdev2->bdev,b2)); 4349 warned = 1; 4350 } 4351 } 4352 4353 if (warned) 4354 printk(KERN_WARNING 4355 "True protection against single-disk" 4356 " failure might be compromised.\n"); 4357 } 4358 4359 mddev->recovery = 0; 4360 /* may be over-ridden by personality */ 4361 mddev->resync_max_sectors = mddev->dev_sectors; 4362 4363 mddev->barriers_work = 1; 4364 mddev->ok_start_degraded = start_dirty_degraded; 4365 4366 if (start_readonly && mddev->ro == 0) 4367 mddev->ro = 2; /* read-only, but switch on first write */ 4368 4369 err = mddev->pers->run(mddev); 4370 if (err) 4371 printk(KERN_ERR "md: pers->run() failed ...\n"); 4372 else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) { 4373 WARN_ONCE(!mddev->external_size, "%s: default size too small," 4374 " but 'external_size' not in effect?\n", __func__); 4375 printk(KERN_ERR 4376 "md: invalid array_size %llu > default size %llu\n", 4377 (unsigned long long)mddev->array_sectors / 2, 4378 (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2); 4379 err = -EINVAL; 4380 mddev->pers->stop(mddev); 4381 } 4382 if (err == 0 && mddev->pers->sync_request) { 4383 err = bitmap_create(mddev); 4384 if (err) { 4385 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 4386 mdname(mddev), err); 4387 mddev->pers->stop(mddev); 4388 } 4389 } 4390 if (err) { 4391 module_put(mddev->pers->owner); 4392 mddev->pers = NULL; 4393 bitmap_destroy(mddev); 4394 return err; 4395 } 4396 if (mddev->pers->sync_request) { 4397 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 4398 printk(KERN_WARNING 4399 "md: cannot register extra attributes for %s\n", 4400 mdname(mddev)); 4401 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 4402 } else if (mddev->ro == 2) /* auto-readonly not meaningful */ 4403 mddev->ro = 0; 4404 4405 atomic_set(&mddev->writes_pending,0); 4406 atomic_set(&mddev->max_corr_read_errors, 4407 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); 4408 mddev->safemode = 0; 4409 mddev->safemode_timer.function = md_safemode_timeout; 4410 mddev->safemode_timer.data = (unsigned long) mddev; 4411 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 4412 mddev->in_sync = 1; 4413 4414 list_for_each_entry(rdev, &mddev->disks, same_set) 4415 if (rdev->raid_disk >= 0) { 4416 char nm[20]; 4417 sprintf(nm, "rd%d", rdev->raid_disk); 4418 if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) 4419 printk("md: cannot register %s for %s\n", 4420 nm, mdname(mddev)); 4421 } 4422 4423 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4424 4425 if (mddev->flags) 4426 md_update_sb(mddev, 0); 4427 4428 set_capacity(disk, mddev->array_sectors); 4429 4430 md_wakeup_thread(mddev->thread); 4431 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 4432 4433 revalidate_disk(mddev->gendisk); 4434 mddev->changed = 1; 4435 md_new_event(mddev); 4436 sysfs_notify_dirent(mddev->sysfs_state); 4437 if (mddev->sysfs_action) 4438 sysfs_notify_dirent(mddev->sysfs_action); 4439 sysfs_notify(&mddev->kobj, NULL, "degraded"); 4440 
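/* finally, a KOBJ_CHANGE uevent tells udev and other listeners to re-examine the now-running device */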
kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 4441 return 0; 4442 } 4443 4444 static int restart_array(mddev_t *mddev) 4445 { 4446 struct gendisk *disk = mddev->gendisk; 4447 4448 /* Complain if it has no devices */ 4449 if (list_empty(&mddev->disks)) 4450 return -ENXIO; 4451 if (!mddev->pers) 4452 return -EINVAL; 4453 if (!mddev->ro) 4454 return -EBUSY; 4455 mddev->safemode = 0; 4456 mddev->ro = 0; 4457 set_disk_ro(disk, 0); 4458 printk(KERN_INFO "md: %s switched to read-write mode.\n", 4459 mdname(mddev)); 4460 /* Kick recovery or resync if necessary */ 4461 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4462 md_wakeup_thread(mddev->thread); 4463 md_wakeup_thread(mddev->sync_thread); 4464 sysfs_notify_dirent(mddev->sysfs_state); 4465 return 0; 4466 } 4467 4468 /* similar to deny_write_access, but accounts for our holding a reference 4469 * to the file ourselves */ 4470 static int deny_bitmap_write_access(struct file * file) 4471 { 4472 struct inode *inode = file->f_mapping->host; 4473 4474 spin_lock(&inode->i_lock); 4475 if (atomic_read(&inode->i_writecount) > 1) { 4476 spin_unlock(&inode->i_lock); 4477 return -ETXTBSY; 4478 } 4479 atomic_set(&inode->i_writecount, -1); 4480 spin_unlock(&inode->i_lock); 4481 4482 return 0; 4483 } 4484 4485 void restore_bitmap_write_access(struct file *file) 4486 { 4487 struct inode *inode = file->f_mapping->host; 4488 4489 spin_lock(&inode->i_lock); 4490 atomic_set(&inode->i_writecount, 1); 4491 spin_unlock(&inode->i_lock); 4492 } 4493 4494 /* mode: 4495 * 0 - completely stop and dis-assemble array 4496 * 1 - switch to readonly 4497 * 2 - stop but do not disassemble array 4498 */ 4499 static int do_md_stop(mddev_t * mddev, int mode, int is_open) 4500 { 4501 int err = 0; 4502 struct gendisk *disk = mddev->gendisk; 4503 mdk_rdev_t *rdev; 4504 4505 mutex_lock(&mddev->open_mutex); 4506 if (atomic_read(&mddev->openers) > is_open) { 4507 printk("md: %s still in use.\n",mdname(mddev)); 4508 err = -EBUSY; 4509 } else if (mddev->pers) { 4510 4511 if (mddev->sync_thread) { 4512 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4513 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4514 md_unregister_thread(mddev->sync_thread); 4515 mddev->sync_thread = NULL; 4516 } 4517 4518 del_timer_sync(&mddev->safemode_timer); 4519 4520 switch(mode) { 4521 case 1: /* readonly */ 4522 err = -ENXIO; 4523 if (mddev->ro==1) 4524 goto out; 4525 mddev->ro = 1; 4526 break; 4527 case 0: /* disassemble */ 4528 case 2: /* stop */ 4529 bitmap_flush(mddev); 4530 md_super_wait(mddev); 4531 if (mddev->ro) 4532 set_disk_ro(disk, 0); 4533 4534 mddev->pers->stop(mddev); 4535 mddev->queue->merge_bvec_fn = NULL; 4536 mddev->queue->unplug_fn = NULL; 4537 mddev->queue->backing_dev_info.congested_fn = NULL; 4538 module_put(mddev->pers->owner); 4539 if (mddev->pers->sync_request && mddev->private == NULL) 4540 mddev->private = (void*)1; 4541 mddev->pers = NULL; 4542 /* tell userspace to handle 'inactive' */ 4543 sysfs_notify_dirent(mddev->sysfs_state); 4544 4545 list_for_each_entry(rdev, &mddev->disks, same_set) 4546 if (rdev->raid_disk >= 0) { 4547 char nm[20]; 4548 sprintf(nm, "rd%d", rdev->raid_disk); 4549 sysfs_remove_link(&mddev->kobj, nm); 4550 } 4551 4552 set_capacity(disk, 0); 4553 mddev->changed = 1; 4554 4555 if (mddev->ro) 4556 mddev->ro = 0; 4557 } 4558 if (!mddev->in_sync || mddev->flags) { 4559 /* mark array as shutdown cleanly */ 4560 mddev->in_sync = 1; 4561 md_update_sb(mddev, 1); 4562 } 4563 if (mode == 1) 4564 set_disk_ro(disk, 1); 4565 clear_bit(MD_RECOVERY_FROZEN, 
&mddev->recovery); 4566 err = 0; 4567 } 4568 out: 4569 mutex_unlock(&mddev->open_mutex); 4570 if (err) 4571 return err; 4572 /* 4573 * Free resources if final stop 4574 */ 4575 if (mode == 0) { 4576 4577 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 4578 4579 bitmap_destroy(mddev); 4580 if (mddev->bitmap_info.file) { 4581 restore_bitmap_write_access(mddev->bitmap_info.file); 4582 fput(mddev->bitmap_info.file); 4583 mddev->bitmap_info.file = NULL; 4584 } 4585 mddev->bitmap_info.offset = 0; 4586 4587 export_array(mddev); 4588 4589 mddev->array_sectors = 0; 4590 mddev->external_size = 0; 4591 mddev->dev_sectors = 0; 4592 mddev->raid_disks = 0; 4593 mddev->recovery_cp = 0; 4594 mddev->resync_min = 0; 4595 mddev->resync_max = MaxSector; 4596 mddev->reshape_position = MaxSector; 4597 mddev->external = 0; 4598 mddev->persistent = 0; 4599 mddev->level = LEVEL_NONE; 4600 mddev->clevel[0] = 0; 4601 mddev->flags = 0; 4602 mddev->ro = 0; 4603 mddev->metadata_type[0] = 0; 4604 mddev->chunk_sectors = 0; 4605 mddev->ctime = mddev->utime = 0; 4606 mddev->layout = 0; 4607 mddev->max_disks = 0; 4608 mddev->events = 0; 4609 mddev->delta_disks = 0; 4610 mddev->new_level = LEVEL_NONE; 4611 mddev->new_layout = 0; 4612 mddev->new_chunk_sectors = 0; 4613 mddev->curr_resync = 0; 4614 mddev->resync_mismatches = 0; 4615 mddev->suspend_lo = mddev->suspend_hi = 0; 4616 mddev->sync_speed_min = mddev->sync_speed_max = 0; 4617 mddev->recovery = 0; 4618 mddev->in_sync = 0; 4619 mddev->changed = 0; 4620 mddev->degraded = 0; 4621 mddev->barriers_work = 0; 4622 mddev->safemode = 0; 4623 mddev->bitmap_info.offset = 0; 4624 mddev->bitmap_info.default_offset = 0; 4625 mddev->bitmap_info.chunksize = 0; 4626 mddev->bitmap_info.daemon_sleep = 0; 4627 mddev->bitmap_info.max_write_behind = 0; 4628 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 4629 if (mddev->hold_active == UNTIL_STOP) 4630 mddev->hold_active = 0; 4631 4632 } else if (mddev->pers) 4633 printk(KERN_INFO "md: %s switched to read-only mode.\n", 4634 mdname(mddev)); 4635 err = 0; 4636 blk_integrity_unregister(disk); 4637 md_new_event(mddev); 4638 sysfs_notify_dirent(mddev->sysfs_state); 4639 return err; 4640 } 4641 4642 #ifndef MODULE 4643 static void autorun_array(mddev_t *mddev) 4644 { 4645 mdk_rdev_t *rdev; 4646 int err; 4647 4648 if (list_empty(&mddev->disks)) 4649 return; 4650 4651 printk(KERN_INFO "md: running: "); 4652 4653 list_for_each_entry(rdev, &mddev->disks, same_set) { 4654 char b[BDEVNAME_SIZE]; 4655 printk("<%s>", bdevname(rdev->bdev,b)); 4656 } 4657 printk("\n"); 4658 4659 err = do_md_run(mddev); 4660 if (err) { 4661 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 4662 do_md_stop(mddev, 0, 0); 4663 } 4664 } 4665 4666 /* 4667 * let's try to run arrays based on all disks that have arrived 4668 * until now. (those are in pending_raid_disks) 4669 * 4670 * the method: pick the first pending disk, collect all disks with 4671 * the same UUID, remove all from the pending list and put them into 4672 * the 'same_array' list. Then order this list based on superblock 4673 * update time (freshest comes first), kick out 'old' disks and 4674 * compare superblocks. If everything's fine then run it.
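 * (net effect: one autorun_array() call per UUID group found on the
 * pending list.)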
4675 * 4676 * If "unit" is allocated, then bump its reference count 4677 */ 4678 static void autorun_devices(int part) 4679 { 4680 mdk_rdev_t *rdev0, *rdev, *tmp; 4681 mddev_t *mddev; 4682 char b[BDEVNAME_SIZE]; 4683 4684 printk(KERN_INFO "md: autorun ...\n"); 4685 while (!list_empty(&pending_raid_disks)) { 4686 int unit; 4687 dev_t dev; 4688 LIST_HEAD(candidates); 4689 rdev0 = list_entry(pending_raid_disks.next, 4690 mdk_rdev_t, same_set); 4691 4692 printk(KERN_INFO "md: considering %s ...\n", 4693 bdevname(rdev0->bdev,b)); 4694 INIT_LIST_HEAD(&candidates); 4695 rdev_for_each_list(rdev, tmp, &pending_raid_disks) 4696 if (super_90_load(rdev, rdev0, 0) >= 0) { 4697 printk(KERN_INFO "md: adding %s ...\n", 4698 bdevname(rdev->bdev,b)); 4699 list_move(&rdev->same_set, &candidates); 4700 } 4701 /* 4702 * now we have a set of devices, with all of them having 4703 * mostly sane superblocks. It's time to allocate the 4704 * mddev. 4705 */ 4706 if (part) { 4707 dev = MKDEV(mdp_major, 4708 rdev0->preferred_minor << MdpMinorShift); 4709 unit = MINOR(dev) >> MdpMinorShift; 4710 } else { 4711 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 4712 unit = MINOR(dev); 4713 } 4714 if (rdev0->preferred_minor != unit) { 4715 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 4716 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 4717 break; 4718 } 4719 4720 md_probe(dev, NULL, NULL); 4721 mddev = mddev_find(dev); 4722 if (!mddev || !mddev->gendisk) { 4723 if (mddev) 4724 mddev_put(mddev); 4725 printk(KERN_ERR 4726 "md: cannot allocate memory for md drive.\n"); 4727 break; 4728 } 4729 if (mddev_lock(mddev)) 4730 printk(KERN_WARNING "md: %s locked, cannot run\n", 4731 mdname(mddev)); 4732 else if (mddev->raid_disks || mddev->major_version 4733 || !list_empty(&mddev->disks)) { 4734 printk(KERN_WARNING 4735 "md: %s already running, cannot run %s\n", 4736 mdname(mddev), bdevname(rdev0->bdev,b)); 4737 mddev_unlock(mddev); 4738 } else { 4739 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 4740 mddev->persistent = 1; 4741 rdev_for_each_list(rdev, tmp, &candidates) { 4742 list_del_init(&rdev->same_set); 4743 if (bind_rdev_to_array(rdev, mddev)) 4744 export_rdev(rdev); 4745 } 4746 autorun_array(mddev); 4747 mddev_unlock(mddev); 4748 } 4749 /* on success, candidates will be empty, on error 4750 * it won't... 4751 */ 4752 rdev_for_each_list(rdev, tmp, &candidates) { 4753 list_del_init(&rdev->same_set); 4754 export_rdev(rdev); 4755 } 4756 mddev_put(mddev); 4757 } 4758 printk(KERN_INFO "md: ... 
autorun DONE.\n"); 4759 } 4760 #endif /* !MODULE */ 4761 4762 static int get_version(void __user * arg) 4763 { 4764 mdu_version_t ver; 4765 4766 ver.major = MD_MAJOR_VERSION; 4767 ver.minor = MD_MINOR_VERSION; 4768 ver.patchlevel = MD_PATCHLEVEL_VERSION; 4769 4770 if (copy_to_user(arg, &ver, sizeof(ver))) 4771 return -EFAULT; 4772 4773 return 0; 4774 } 4775 4776 static int get_array_info(mddev_t * mddev, void __user * arg) 4777 { 4778 mdu_array_info_t info; 4779 int nr,working,insync,failed,spare; 4780 mdk_rdev_t *rdev; 4781 4782 nr=working=insync=failed=spare=0; 4783 list_for_each_entry(rdev, &mddev->disks, same_set) { 4784 nr++; 4785 if (test_bit(Faulty, &rdev->flags)) 4786 failed++; 4787 else { 4788 working++; 4789 if (test_bit(In_sync, &rdev->flags)) 4790 insync++; 4791 else 4792 spare++; 4793 } 4794 } 4795 4796 info.major_version = mddev->major_version; 4797 info.minor_version = mddev->minor_version; 4798 info.patch_version = MD_PATCHLEVEL_VERSION; 4799 info.ctime = mddev->ctime; 4800 info.level = mddev->level; 4801 info.size = mddev->dev_sectors / 2; 4802 if (info.size != mddev->dev_sectors / 2) /* overflow */ 4803 info.size = -1; 4804 info.nr_disks = nr; 4805 info.raid_disks = mddev->raid_disks; 4806 info.md_minor = mddev->md_minor; 4807 info.not_persistent= !mddev->persistent; 4808 4809 info.utime = mddev->utime; 4810 info.state = 0; 4811 if (mddev->in_sync) 4812 info.state = (1<<MD_SB_CLEAN); 4813 if (mddev->bitmap && mddev->bitmap_info.offset) 4814 info.state |= (1<<MD_SB_BITMAP_PRESENT); /* OR in, so MD_SB_CLEAN is not clobbered */ 4815 info.active_disks = insync; 4816 info.working_disks = working; 4817 info.failed_disks = failed; 4818 info.spare_disks = spare; 4819 4820 info.layout = mddev->layout; 4821 info.chunk_size = mddev->chunk_sectors << 9; 4822 4823 if (copy_to_user(arg, &info, sizeof(info))) 4824 return -EFAULT; 4825 4826 return 0; 4827 } 4828 4829 static int get_bitmap_file(mddev_t * mddev, void __user * arg) 4830 { 4831 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 4832 char *ptr, *buf = NULL; 4833 int err = -ENOMEM; 4834 4835 if (md_allow_write(mddev)) 4836 file = kmalloc(sizeof(*file), GFP_NOIO); 4837 else 4838 file = kmalloc(sizeof(*file), GFP_KERNEL); 4839 4840 if (!file) 4841 goto out; 4842 4843 /* bitmap disabled, zero the first byte and copy out */ 4844 if (!mddev->bitmap || !mddev->bitmap->file) { 4845 file->pathname[0] = '\0'; 4846 goto copy_out; 4847 } 4848 4849 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); 4850 if (!buf) 4851 goto out; 4852 4853 ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname)); 4854 if (IS_ERR(ptr)) 4855 goto out; 4856 4857 strcpy(file->pathname, ptr); 4858 4859 copy_out: 4860 err = 0; 4861 if (copy_to_user(arg, file, sizeof(*file))) 4862 err = -EFAULT; 4863 out: 4864 kfree(buf); 4865 kfree(file); 4866 return err; 4867 } 4868 4869 static int get_disk_info(mddev_t * mddev, void __user * arg) 4870 { 4871 mdu_disk_info_t info; 4872 mdk_rdev_t *rdev; 4873 4874 if (copy_from_user(&info, arg, sizeof(info))) 4875 return -EFAULT; 4876 4877 rdev = find_rdev_nr(mddev, info.number); 4878 if (rdev) { 4879 info.major = MAJOR(rdev->bdev->bd_dev); 4880 info.minor = MINOR(rdev->bdev->bd_dev); 4881 info.raid_disk = rdev->raid_disk; 4882 info.state = 0; 4883 if (test_bit(Faulty, &rdev->flags)) 4884 info.state |= (1<<MD_DISK_FAULTY); 4885 else if (test_bit(In_sync, &rdev->flags)) { 4886 info.state |= (1<<MD_DISK_ACTIVE); 4887 info.state |= (1<<MD_DISK_SYNC); 4888 } 4889 if (test_bit(WriteMostly, &rdev->flags)) 4890 info.state |= (1<<MD_DISK_WRITEMOSTLY);
4891 } else { 4892 info.major = info.minor = 0; 4893 info.raid_disk = -1; 4894 info.state = (1<<MD_DISK_REMOVED); 4895 } 4896 4897 if (copy_to_user(arg, &info, sizeof(info))) 4898 return -EFAULT; 4899 4900 return 0; 4901 } 4902 4903 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) 4904 { 4905 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 4906 mdk_rdev_t *rdev; 4907 dev_t dev = MKDEV(info->major,info->minor); 4908 4909 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 4910 return -EOVERFLOW; 4911 4912 if (!mddev->raid_disks) { 4913 int err; 4914 /* expecting a device which has a superblock */ 4915 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 4916 if (IS_ERR(rdev)) { 4917 printk(KERN_WARNING 4918 "md: md_import_device returned %ld\n", 4919 PTR_ERR(rdev)); 4920 return PTR_ERR(rdev); 4921 } 4922 if (!list_empty(&mddev->disks)) { 4923 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 4924 mdk_rdev_t, same_set); 4925 err = super_types[mddev->major_version] 4926 .load_super(rdev, rdev0, mddev->minor_version); 4927 if (err < 0) { 4928 printk(KERN_WARNING 4929 "md: %s has different UUID to %s\n", 4930 bdevname(rdev->bdev,b), 4931 bdevname(rdev0->bdev,b2)); 4932 export_rdev(rdev); 4933 return -EINVAL; 4934 } 4935 } 4936 err = bind_rdev_to_array(rdev, mddev); 4937 if (err) 4938 export_rdev(rdev); 4939 return err; 4940 } 4941 4942 /* 4943 * add_new_disk can be used once the array is assembled 4944 * to add "hot spares". They must already have a superblock 4945 * written 4946 */ 4947 if (mddev->pers) { 4948 int err; 4949 if (!mddev->pers->hot_add_disk) { 4950 printk(KERN_WARNING 4951 "%s: personality does not support diskops!\n", 4952 mdname(mddev)); 4953 return -EINVAL; 4954 } 4955 if (mddev->persistent) 4956 rdev = md_import_device(dev, mddev->major_version, 4957 mddev->minor_version); 4958 else 4959 rdev = md_import_device(dev, -1, -1); 4960 if (IS_ERR(rdev)) { 4961 printk(KERN_WARNING 4962 "md: md_import_device returned %ld\n", 4963 PTR_ERR(rdev)); 4964 return PTR_ERR(rdev); 4965 } 4966 /* set save_raid_disk if appropriate */ 4967 if (!mddev->persistent) { 4968 if (info->state & (1<<MD_DISK_SYNC) && 4969 info->raid_disk < mddev->raid_disks) 4970 rdev->raid_disk = info->raid_disk; 4971 else 4972 rdev->raid_disk = -1; 4973 } else 4974 super_types[mddev->major_version]. 4975 validate_super(mddev, rdev); 4976 rdev->saved_raid_disk = rdev->raid_disk; 4977 4978 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 4979 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 4980 set_bit(WriteMostly, &rdev->flags); 4981 else 4982 clear_bit(WriteMostly, &rdev->flags); 4983 4984 rdev->raid_disk = -1; 4985 err = bind_rdev_to_array(rdev, mddev); 4986 if (!err && !mddev->pers->hot_remove_disk) { 4987 /* If there is hot_add_disk but no hot_remove_disk 4988 * then added disks are for geometry changes, 4989 * and should be added immediately. 4990 */ 4991 super_types[mddev->major_version].
4992 validate_super(mddev, rdev); 4993 err = mddev->pers->hot_add_disk(mddev, rdev); 4994 if (err) 4995 unbind_rdev_from_array(rdev); 4996 } 4997 if (err) 4998 export_rdev(rdev); 4999 else 5000 sysfs_notify_dirent(rdev->sysfs_state); 5001 5002 md_update_sb(mddev, 1); 5003 if (mddev->degraded) 5004 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 5005 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5006 md_wakeup_thread(mddev->thread); 5007 return err; 5008 } 5009 5010 /* otherwise, add_new_disk is only allowed 5011 * for major_version==0 superblocks 5012 */ 5013 if (mddev->major_version != 0) { 5014 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", 5015 mdname(mddev)); 5016 return -EINVAL; 5017 } 5018 5019 if (!(info->state & (1<<MD_DISK_FAULTY))) { 5020 int err; 5021 rdev = md_import_device(dev, -1, 0); 5022 if (IS_ERR(rdev)) { 5023 printk(KERN_WARNING 5024 "md: error, md_import_device() returned %ld\n", 5025 PTR_ERR(rdev)); 5026 return PTR_ERR(rdev); 5027 } 5028 rdev->desc_nr = info->number; 5029 if (info->raid_disk < mddev->raid_disks) 5030 rdev->raid_disk = info->raid_disk; 5031 else 5032 rdev->raid_disk = -1; 5033 5034 if (rdev->raid_disk < mddev->raid_disks) 5035 if (info->state & (1<<MD_DISK_SYNC)) 5036 set_bit(In_sync, &rdev->flags); 5037 5038 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 5039 set_bit(WriteMostly, &rdev->flags); 5040 5041 if (!mddev->persistent) { 5042 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 5043 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; 5044 } else 5045 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 5046 rdev->sectors = rdev->sb_start; 5047 5048 err = bind_rdev_to_array(rdev, mddev); 5049 if (err) { 5050 export_rdev(rdev); 5051 return err; 5052 } 5053 } 5054 5055 return 0; 5056 } 5057 5058 static int hot_remove_disk(mddev_t * mddev, dev_t dev) 5059 { 5060 char b[BDEVNAME_SIZE]; 5061 mdk_rdev_t *rdev; 5062 5063 rdev = find_rdev(mddev, dev); 5064 if (!rdev) 5065 return -ENXIO; 5066 5067 if (rdev->raid_disk >= 0) 5068 goto busy; 5069 5070 kick_rdev_from_array(rdev); 5071 md_update_sb(mddev, 1); 5072 md_new_event(mddev); 5073 5074 return 0; 5075 busy: 5076 printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", 5077 bdevname(rdev->bdev,b), mdname(mddev)); 5078 return -EBUSY; 5079 } 5080 5081 static int hot_add_disk(mddev_t * mddev, dev_t dev) 5082 { 5083 char b[BDEVNAME_SIZE]; 5084 int err; 5085 mdk_rdev_t *rdev; 5086 5087 if (!mddev->pers) 5088 return -ENODEV; 5089 5090 if (mddev->major_version != 0) { 5091 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 5092 " version-0 superblocks.\n", 5093 mdname(mddev)); 5094 return -EINVAL; 5095 } 5096 if (!mddev->pers->hot_add_disk) { 5097 printk(KERN_WARNING 5098 "%s: personality does not support diskops!\n", 5099 mdname(mddev)); 5100 return -EINVAL; 5101 } 5102 5103 rdev = md_import_device(dev, -1, 0); 5104 if (IS_ERR(rdev)) { 5105 printk(KERN_WARNING 5106 "md: error, md_import_device() returned %ld\n", 5107 PTR_ERR(rdev)); 5108 return -EINVAL; 5109 } 5110 5111 if (mddev->persistent) 5112 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 5113 else 5114 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; 5115 5116 rdev->sectors = rdev->sb_start; 5117 5118 if (test_bit(Faulty, &rdev->flags)) { 5119 printk(KERN_WARNING 5120 "md: can not hot-add faulty %s disk to %s!\n", 5121 bdevname(rdev->bdev,b), mdname(mddev)); 5122 err = -EINVAL; 5123 goto abort_export; 5124 } 5125 clear_bit(In_sync, &rdev->flags); 5126 rdev->desc_nr = -1; 5127 rdev->saved_raid_disk = -1; 5128 err = 
bind_rdev_to_array(rdev, mddev); 5129 if (err) 5130 goto abort_export; 5131 5132 /* 5133 * The rest should better be atomic, we can have disk failures 5134 * noticed in interrupt contexts ... 5135 */ 5136 5137 rdev->raid_disk = -1; 5138 5139 md_update_sb(mddev, 1); 5140 5141 /* 5142 * Kick recovery, maybe this spare has to be added to the 5143 * array immediately. 5144 */ 5145 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5146 md_wakeup_thread(mddev->thread); 5147 md_new_event(mddev); 5148 return 0; 5149 5150 abort_export: 5151 export_rdev(rdev); 5152 return err; 5153 } 5154 5155 static int set_bitmap_file(mddev_t *mddev, int fd) 5156 { 5157 int err; 5158 5159 if (mddev->pers) { 5160 if (!mddev->pers->quiesce) 5161 return -EBUSY; 5162 if (mddev->recovery || mddev->sync_thread) 5163 return -EBUSY; 5164 /* we should be able to change the bitmap.. */ 5165 } 5166 5167 5168 if (fd >= 0) { 5169 if (mddev->bitmap) 5170 return -EEXIST; /* cannot add when bitmap is present */ 5171 mddev->bitmap_info.file = fget(fd); 5172 5173 if (mddev->bitmap_info.file == NULL) { 5174 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 5175 mdname(mddev)); 5176 return -EBADF; 5177 } 5178 5179 err = deny_bitmap_write_access(mddev->bitmap_info.file); 5180 if (err) { 5181 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 5182 mdname(mddev)); 5183 fput(mddev->bitmap_info.file); 5184 mddev->bitmap_info.file = NULL; 5185 return err; 5186 } 5187 mddev->bitmap_info.offset = 0; /* file overrides offset */ 5188 } else if (mddev->bitmap == NULL) 5189 return -ENOENT; /* cannot remove what isn't there */ 5190 err = 0; 5191 if (mddev->pers) { 5192 mddev->pers->quiesce(mddev, 1); 5193 if (fd >= 0) 5194 err = bitmap_create(mddev); 5195 if (fd < 0 || err) { 5196 bitmap_destroy(mddev); 5197 fd = -1; /* make sure to put the file */ 5198 } 5199 mddev->pers->quiesce(mddev, 0); 5200 } 5201 if (fd < 0) { 5202 if (mddev->bitmap_info.file) { 5203 restore_bitmap_write_access(mddev->bitmap_info.file); 5204 fput(mddev->bitmap_info.file); 5205 } 5206 mddev->bitmap_info.file = NULL; 5207 } 5208 5209 return err; 5210 } 5211 5212 /* 5213 * set_array_info is used in two different ways 5214 * The original usage is when creating a new array. 5215 * In this usage, raid_disks is > 0 and it together with 5216 * level, size, not_persistent, layout, chunksize determine the 5217 * shape of the array. 5218 * This will always create an array with a type-0.90.0 superblock. 5219 * The newer usage is when assembling an array. 5220 * In this case raid_disks will be 0, and the major_version field is 5221 * used to determine which style super-blocks are to be found on the devices. 5222 * The minor and patch _version numbers are also kept in case the 5223 * super_block handler wishes to interpret them. 5224 */ 5225 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) 5226 { 5227 5228 if (info->raid_disks == 0) { 5229 /* just setting version number for superblock loading */ 5230 if (info->major_version < 0 || 5231 info->major_version >= ARRAY_SIZE(super_types) || 5232 super_types[info->major_version].name == NULL) { 5233 /* maybe try to auto-load a module?
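 * (nothing does so far; a request_module() keyed on major_version,
 * in the spirit of the "md-level-%d" loading in do_md_run, would be
 * the obvious mechanism if such aliases existed.)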
*/ 5234 printk(KERN_INFO 5235 "md: superblock version %d not known\n", 5236 info->major_version); 5237 return -EINVAL; 5238 } 5239 mddev->major_version = info->major_version; 5240 mddev->minor_version = info->minor_version; 5241 mddev->patch_version = info->patch_version; 5242 mddev->persistent = !info->not_persistent; 5243 /* ensure mddev_put doesn't delete this now that there 5244 * is some minimal configuration. 5245 */ 5246 mddev->ctime = get_seconds(); 5247 return 0; 5248 } 5249 mddev->major_version = MD_MAJOR_VERSION; 5250 mddev->minor_version = MD_MINOR_VERSION; 5251 mddev->patch_version = MD_PATCHLEVEL_VERSION; 5252 mddev->ctime = get_seconds(); 5253 5254 mddev->level = info->level; 5255 mddev->clevel[0] = 0; 5256 mddev->dev_sectors = 2 * (sector_t)info->size; 5257 mddev->raid_disks = info->raid_disks; 5258 /* don't set md_minor, it is determined by which /dev/md* was 5259 * opened 5260 */ 5261 if (info->state & (1<<MD_SB_CLEAN)) 5262 mddev->recovery_cp = MaxSector; 5263 else 5264 mddev->recovery_cp = 0; 5265 mddev->persistent = ! info->not_persistent; 5266 mddev->external = 0; 5267 5268 mddev->layout = info->layout; 5269 mddev->chunk_sectors = info->chunk_size >> 9; 5270 5271 mddev->max_disks = MD_SB_DISKS; 5272 5273 if (mddev->persistent) 5274 mddev->flags = 0; 5275 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5276 5277 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 5278 mddev->bitmap_info.offset = 0; 5279 5280 mddev->reshape_position = MaxSector; 5281 5282 /* 5283 * Generate a 128 bit UUID 5284 */ 5285 get_random_bytes(mddev->uuid, 16); 5286 5287 mddev->new_level = mddev->level; 5288 mddev->new_chunk_sectors = mddev->chunk_sectors; 5289 mddev->new_layout = mddev->layout; 5290 mddev->delta_disks = 0; 5291 5292 return 0; 5293 } 5294 5295 void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors) 5296 { 5297 WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__); 5298 5299 if (mddev->external_size) 5300 return; 5301 5302 mddev->array_sectors = array_sectors; 5303 } 5304 EXPORT_SYMBOL(md_set_array_sectors); 5305 5306 static int update_size(mddev_t *mddev, sector_t num_sectors) 5307 { 5308 mdk_rdev_t *rdev; 5309 int rv; 5310 int fit = (num_sectors == 0); 5311 5312 if (mddev->pers->resize == NULL) 5313 return -EINVAL; 5314 /* The "num_sectors" is the number of sectors of each device that 5315 * is used. This can only make sense for arrays with redundancy. 5316 * linear and raid0 always use whatever space is available. We can only 5317 * consider changing this number if no resync or reconstruction is 5318 * happening, and if the new size is acceptable. It must fit before the 5319 * sb_start or, if that is <data_offset, it must fit before the size 5320 * of each device. If num_sectors is zero, we find the largest size 5321 * that fits. 5322 5323 */ 5324 if (mddev->sync_thread) 5325 return -EBUSY; 5326 if (mddev->bitmap) 5327 /* Sorry, cannot grow a bitmap yet, just remove it, 5328 * grow, and re-add.
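 * (with mdadm that procedure would roughly be --grow --bitmap=none,
 * resize, then --grow --bitmap=internal; the exact userspace steps
 * are outside this file.)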
5329 */ 5330 return -EBUSY; 5331 list_for_each_entry(rdev, &mddev->disks, same_set) { 5332 sector_t avail = rdev->sectors; 5333 5334 if (fit && (num_sectors == 0 || num_sectors > avail)) 5335 num_sectors = avail; 5336 if (avail < num_sectors) 5337 return -ENOSPC; 5338 } 5339 rv = mddev->pers->resize(mddev, num_sectors); 5340 if (!rv) 5341 revalidate_disk(mddev->gendisk); 5342 return rv; 5343 } 5344 5345 static int update_raid_disks(mddev_t *mddev, int raid_disks) 5346 { 5347 int rv; 5348 /* change the number of raid disks */ 5349 if (mddev->pers->check_reshape == NULL) 5350 return -EINVAL; 5351 if (raid_disks <= 0 || 5352 raid_disks >= mddev->max_disks) 5353 return -EINVAL; 5354 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 5355 return -EBUSY; 5356 mddev->delta_disks = raid_disks - mddev->raid_disks; 5357 5358 rv = mddev->pers->check_reshape(mddev); 5359 return rv; 5360 } 5361 5362 5363 /* 5364 * update_array_info is used to change the configuration of an 5365 * on-line array. 5366 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 5367 * fields in the info are checked against the array. 5368 * Any differences that cannot be handled will cause an error. 5369 * Normally, only one change can be managed at a time. 5370 */ 5371 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) 5372 { 5373 int rv = 0; 5374 int cnt = 0; 5375 int state = 0; 5376 5377 /* calculate expected state,ignoring low bits */ 5378 if (mddev->bitmap && mddev->bitmap_info.offset) 5379 state |= (1 << MD_SB_BITMAP_PRESENT); 5380 5381 if (mddev->major_version != info->major_version || 5382 mddev->minor_version != info->minor_version || 5383 /* mddev->patch_version != info->patch_version || */ 5384 mddev->ctime != info->ctime || 5385 mddev->level != info->level || 5386 /* mddev->layout != info->layout || */ 5387 !mddev->persistent != info->not_persistent|| 5388 mddev->chunk_sectors != info->chunk_size >> 9 || 5389 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 5390 ((state^info->state) & 0xfffffe00) 5391 ) 5392 return -EINVAL; 5393 /* Check there is only one change */ 5394 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 5395 cnt++; 5396 if (mddev->raid_disks != info->raid_disks) 5397 cnt++; 5398 if (mddev->layout != info->layout) 5399 cnt++; 5400 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) 5401 cnt++; 5402 if (cnt == 0) 5403 return 0; 5404 if (cnt > 1) 5405 return -EINVAL; 5406 5407 if (mddev->layout != info->layout) { 5408 /* Change layout 5409 * we don't need to do anything at the md level, the 5410 * personality will take care of it all. 
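 * (new_layout is staged below and the personality's check_reshape()
 * either adopts it or we roll it back on error.)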
5411 */ 5412 if (mddev->pers->check_reshape == NULL) 5413 return -EINVAL; 5414 else { 5415 mddev->new_layout = info->layout; 5416 rv = mddev->pers->check_reshape(mddev); 5417 if (rv) 5418 mddev->new_layout = mddev->layout; 5419 return rv; 5420 } 5421 } 5422 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 5423 rv = update_size(mddev, (sector_t)info->size * 2); 5424 5425 if (mddev->raid_disks != info->raid_disks) 5426 rv = update_raid_disks(mddev, info->raid_disks); 5427 5428 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 5429 if (mddev->pers->quiesce == NULL) 5430 return -EINVAL; 5431 if (mddev->recovery || mddev->sync_thread) 5432 return -EBUSY; 5433 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 5434 /* add the bitmap */ 5435 if (mddev->bitmap) 5436 return -EEXIST; 5437 if (mddev->bitmap_info.default_offset == 0) 5438 return -EINVAL; 5439 mddev->bitmap_info.offset = 5440 mddev->bitmap_info.default_offset; 5441 mddev->pers->quiesce(mddev, 1); 5442 rv = bitmap_create(mddev); 5443 if (rv) 5444 bitmap_destroy(mddev); 5445 mddev->pers->quiesce(mddev, 0); 5446 } else { 5447 /* remove the bitmap */ 5448 if (!mddev->bitmap) 5449 return -ENOENT; 5450 if (mddev->bitmap->file) 5451 return -EINVAL; 5452 mddev->pers->quiesce(mddev, 1); 5453 bitmap_destroy(mddev); 5454 mddev->pers->quiesce(mddev, 0); 5455 mddev->bitmap_info.offset = 0; 5456 } 5457 } 5458 md_update_sb(mddev, 1); 5459 return rv; 5460 } 5461 5462 static int set_disk_faulty(mddev_t *mddev, dev_t dev) 5463 { 5464 mdk_rdev_t *rdev; 5465 5466 if (mddev->pers == NULL) 5467 return -ENODEV; 5468 5469 rdev = find_rdev(mddev, dev); 5470 if (!rdev) 5471 return -ENODEV; 5472 5473 md_error(mddev, rdev); 5474 return 0; 5475 } 5476 5477 /* 5478 * We have a problem here : there is no easy way to give a CHS 5479 * virtual geometry. We currently pretend that we have a 2 heads 5480 * 4 sectors (with a BIG number of cylinders...). This drives 5481 * dosfs just mad... 
;-) 5482 */ 5483 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 5484 { 5485 mddev_t *mddev = bdev->bd_disk->private_data; 5486 5487 geo->heads = 2; 5488 geo->sectors = 4; 5489 geo->cylinders = get_capacity(mddev->gendisk) / 8; 5490 return 0; 5491 } 5492 5493 static int md_ioctl(struct block_device *bdev, fmode_t mode, 5494 unsigned int cmd, unsigned long arg) 5495 { 5496 int err = 0; 5497 void __user *argp = (void __user *)arg; 5498 mddev_t *mddev = NULL; 5499 5500 if (!capable(CAP_SYS_ADMIN)) 5501 return -EACCES; 5502 5503 /* 5504 * Commands dealing with the RAID driver but not any 5505 * particular array: 5506 */ 5507 switch (cmd) 5508 { 5509 case RAID_VERSION: 5510 err = get_version(argp); 5511 goto done; 5512 5513 case PRINT_RAID_DEBUG: 5514 err = 0; 5515 md_print_devices(); 5516 goto done; 5517 5518 #ifndef MODULE 5519 case RAID_AUTORUN: 5520 err = 0; 5521 autostart_arrays(arg); 5522 goto done; 5523 #endif 5524 default:; 5525 } 5526 5527 /* 5528 * Commands creating/starting a new array: 5529 */ 5530 5531 mddev = bdev->bd_disk->private_data; 5532 5533 if (!mddev) { 5534 BUG(); 5535 goto abort; 5536 } 5537 5538 err = mddev_lock(mddev); 5539 if (err) { 5540 printk(KERN_INFO 5541 "md: ioctl lock interrupted, reason %d, cmd %d\n", 5542 err, cmd); 5543 goto abort; 5544 } 5545 5546 switch (cmd) 5547 { 5548 case SET_ARRAY_INFO: 5549 { 5550 mdu_array_info_t info; 5551 if (!arg) 5552 memset(&info, 0, sizeof(info)); 5553 else if (copy_from_user(&info, argp, sizeof(info))) { 5554 err = -EFAULT; 5555 goto abort_unlock; 5556 } 5557 if (mddev->pers) { 5558 err = update_array_info(mddev, &info); 5559 if (err) { 5560 printk(KERN_WARNING "md: couldn't update" 5561 " array info. %d\n", err); 5562 goto abort_unlock; 5563 } 5564 goto done_unlock; 5565 } 5566 if (!list_empty(&mddev->disks)) { 5567 printk(KERN_WARNING 5568 "md: array %s already has disks!\n", 5569 mdname(mddev)); 5570 err = -EBUSY; 5571 goto abort_unlock; 5572 } 5573 if (mddev->raid_disks) { 5574 printk(KERN_WARNING 5575 "md: array %s already initialised!\n", 5576 mdname(mddev)); 5577 err = -EBUSY; 5578 goto abort_unlock; 5579 } 5580 err = set_array_info(mddev, &info); 5581 if (err) { 5582 printk(KERN_WARNING "md: couldn't set" 5583 " array info. 
%d\n", err); 5584 goto abort_unlock; 5585 } 5586 } 5587 goto done_unlock; 5588 5589 default:; 5590 } 5591 5592 /* 5593 * Commands querying/configuring an existing array: 5594 */ 5595 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 5596 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 5597 if ((!mddev->raid_disks && !mddev->external) 5598 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 5599 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 5600 && cmd != GET_BITMAP_FILE) { 5601 err = -ENODEV; 5602 goto abort_unlock; 5603 } 5604 5605 /* 5606 * Commands even a read-only array can execute: 5607 */ 5608 switch (cmd) 5609 { 5610 case GET_ARRAY_INFO: 5611 err = get_array_info(mddev, argp); 5612 goto done_unlock; 5613 5614 case GET_BITMAP_FILE: 5615 err = get_bitmap_file(mddev, argp); 5616 goto done_unlock; 5617 5618 case GET_DISK_INFO: 5619 err = get_disk_info(mddev, argp); 5620 goto done_unlock; 5621 5622 case RESTART_ARRAY_RW: 5623 err = restart_array(mddev); 5624 goto done_unlock; 5625 5626 case STOP_ARRAY: 5627 err = do_md_stop(mddev, 0, 1); 5628 goto done_unlock; 5629 5630 case STOP_ARRAY_RO: 5631 err = do_md_stop(mddev, 1, 1); 5632 goto done_unlock; 5633 5634 } 5635 5636 /* 5637 * The remaining ioctls are changing the state of the 5638 * superblock, so we do not allow them on read-only arrays. 5639 * However non-MD ioctls (e.g. get-size) will still come through 5640 * here and hit the 'default' below, so only disallow 5641 * 'md' ioctls, and switch to rw mode if started auto-readonly. 5642 */ 5643 if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) { 5644 if (mddev->ro == 2) { 5645 mddev->ro = 0; 5646 sysfs_notify_dirent(mddev->sysfs_state); 5647 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5648 md_wakeup_thread(mddev->thread); 5649 } else { 5650 err = -EROFS; 5651 goto abort_unlock; 5652 } 5653 } 5654 5655 switch (cmd) 5656 { 5657 case ADD_NEW_DISK: 5658 { 5659 mdu_disk_info_t info; 5660 if (copy_from_user(&info, argp, sizeof(info))) 5661 err = -EFAULT; 5662 else 5663 err = add_new_disk(mddev, &info); 5664 goto done_unlock; 5665 } 5666 5667 case HOT_REMOVE_DISK: 5668 err = hot_remove_disk(mddev, new_decode_dev(arg)); 5669 goto done_unlock; 5670 5671 case HOT_ADD_DISK: 5672 err = hot_add_disk(mddev, new_decode_dev(arg)); 5673 goto done_unlock; 5674 5675 case SET_DISK_FAULTY: 5676 err = set_disk_faulty(mddev, new_decode_dev(arg)); 5677 goto done_unlock; 5678 5679 case RUN_ARRAY: 5680 err = do_md_run(mddev); 5681 goto done_unlock; 5682 5683 case SET_BITMAP_FILE: 5684 err = set_bitmap_file(mddev, (int)arg); 5685 goto done_unlock; 5686 5687 default: 5688 err = -EINVAL; 5689 goto abort_unlock; 5690 } 5691 5692 done_unlock: 5693 abort_unlock: 5694 if (mddev->hold_active == UNTIL_IOCTL && 5695 err != -EINVAL) 5696 mddev->hold_active = 0; 5697 mddev_unlock(mddev); 5698 5699 return err; 5700 done: 5701 if (err) 5702 MD_BUG(); 5703 abort: 5704 return err; 5705 } 5706 #ifdef CONFIG_COMPAT 5707 static int md_compat_ioctl(struct block_device *bdev, fmode_t mode, 5708 unsigned int cmd, unsigned long arg) 5709 { 5710 switch (cmd) { 5711 case HOT_REMOVE_DISK: 5712 case HOT_ADD_DISK: 5713 case SET_DISK_FAULTY: 5714 case SET_BITMAP_FILE: 5715 /* These take in integer arg, do not convert */ 5716 break; 5717 default: 5718 arg = (unsigned long)compat_ptr(arg); 5719 break; 5720 } 5721 5722 return md_ioctl(bdev, mode, cmd, arg); 5723 } 5724 #endif /* CONFIG_COMPAT */ 5725 5726 static int md_open(struct block_device *bdev, fmode_t mode) 5727 { 5728 /* 5729 * Succeed if we can lock the 
mddev, which confirms that 5730 * it isn't being stopped right now. 5731 */ 5732 mddev_t *mddev = mddev_find(bdev->bd_dev); 5733 int err; 5734 5735 if (mddev->gendisk != bdev->bd_disk) { 5736 /* we are racing with mddev_put which is discarding this 5737 * bd_disk. 5738 */ 5739 mddev_put(mddev); 5740 /* Wait until bdev->bd_disk is definitely gone */ 5741 flush_scheduled_work(); 5742 /* Then retry the open from the top */ 5743 return -ERESTARTSYS; 5744 } 5745 BUG_ON(mddev != bdev->bd_disk->private_data); 5746 5747 if ((err = mutex_lock_interruptible(&mddev->open_mutex))) 5748 goto out; 5749 5750 err = 0; 5751 atomic_inc(&mddev->openers); 5752 mutex_unlock(&mddev->open_mutex); 5753 5754 check_disk_change(bdev); 5755 out: 5756 return err; 5757 } 5758 5759 static int md_release(struct gendisk *disk, fmode_t mode) 5760 { 5761 mddev_t *mddev = disk->private_data; 5762 5763 BUG_ON(!mddev); 5764 atomic_dec(&mddev->openers); 5765 mddev_put(mddev); 5766 5767 return 0; 5768 } 5769 5770 static int md_media_changed(struct gendisk *disk) 5771 { 5772 mddev_t *mddev = disk->private_data; 5773 5774 return mddev->changed; 5775 } 5776 5777 static int md_revalidate(struct gendisk *disk) 5778 { 5779 mddev_t *mddev = disk->private_data; 5780 5781 mddev->changed = 0; 5782 return 0; 5783 } 5784 static const struct block_device_operations md_fops = 5785 { 5786 .owner = THIS_MODULE, 5787 .open = md_open, 5788 .release = md_release, 5789 .ioctl = md_ioctl, 5790 #ifdef CONFIG_COMPAT 5791 .compat_ioctl = md_compat_ioctl, 5792 #endif 5793 .getgeo = md_getgeo, 5794 .media_changed = md_media_changed, 5795 .revalidate_disk= md_revalidate, 5796 }; 5797 5798 static int md_thread(void * arg) 5799 { 5800 mdk_thread_t *thread = arg; 5801 5802 /* 5803 * md_thread is a 'system-thread', its priority should be very 5804 * high. We avoid resource deadlocks individually in each 5805 * raid personality. (RAID5 does preallocation) We also use RR and 5806 * the very same RT priority as kswapd, thus we will never get 5807 * into a priority inversion deadlock. 5808 * 5809 * we definitely have to have equal or higher priority than 5810 * bdflush, otherwise bdflush will deadlock if there are too 5811 * many dirty RAID5 blocks. 5812 */ 5813 5814 allow_signal(SIGKILL); 5815 while (!kthread_should_stop()) { 5816 5817 /* We need to wait INTERRUPTIBLE so that 5818 * we don't add to the load-average.
5819 * That means we need to be sure no signals are 5820 * pending 5821 */ 5822 if (signal_pending(current)) 5823 flush_signals(current); 5824 5825 wait_event_interruptible_timeout 5826 (thread->wqueue, 5827 test_bit(THREAD_WAKEUP, &thread->flags) 5828 || kthread_should_stop(), 5829 thread->timeout); 5830 5831 clear_bit(THREAD_WAKEUP, &thread->flags); 5832 5833 thread->run(thread->mddev); 5834 } 5835 5836 return 0; 5837 } 5838 5839 void md_wakeup_thread(mdk_thread_t *thread) 5840 { 5841 if (thread) { 5842 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm); 5843 set_bit(THREAD_WAKEUP, &thread->flags); 5844 wake_up(&thread->wqueue); 5845 } 5846 } 5847 5848 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, 5849 const char *name) 5850 { 5851 mdk_thread_t *thread; 5852 5853 thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL); 5854 if (!thread) 5855 return NULL; 5856 5857 init_waitqueue_head(&thread->wqueue); 5858 5859 thread->run = run; 5860 thread->mddev = mddev; 5861 thread->timeout = MAX_SCHEDULE_TIMEOUT; 5862 thread->tsk = kthread_run(md_thread, thread, 5863 "%s_%s", 5864 mdname(thread->mddev), 5865 name ?: mddev->pers->name); 5866 if (IS_ERR(thread->tsk)) { 5867 kfree(thread); 5868 return NULL; 5869 } 5870 return thread; 5871 } 5872 5873 void md_unregister_thread(mdk_thread_t *thread) 5874 { 5875 if (!thread) 5876 return; 5877 dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 5878 5879 kthread_stop(thread->tsk); 5880 kfree(thread); 5881 } 5882 5883 void md_error(mddev_t *mddev, mdk_rdev_t *rdev) 5884 { 5885 if (!mddev) { 5886 MD_BUG(); 5887 return; 5888 } 5889 5890 if (!rdev || test_bit(Faulty, &rdev->flags)) 5891 return; 5892 5893 if (mddev->external) 5894 set_bit(Blocked, &rdev->flags); 5895 /* 5896 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 5897 mdname(mddev), 5898 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), 5899 __builtin_return_address(0),__builtin_return_address(1), 5900 __builtin_return_address(2),__builtin_return_address(3)); 5901 */ 5902 if (!mddev->pers) 5903 return; 5904 if (!mddev->pers->error_handler) 5905 return; 5906 mddev->pers->error_handler(mddev,rdev); 5907 if (mddev->degraded) 5908 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 5909 set_bit(StateChanged, &rdev->flags); 5910 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5911 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5912 md_wakeup_thread(mddev->thread); 5913 md_new_event_inintr(mddev); 5914 } 5915 5916 /* seq_file implementation /proc/mdstat */ 5917 5918 static void status_unused(struct seq_file *seq) 5919 { 5920 int i = 0; 5921 mdk_rdev_t *rdev; 5922 5923 seq_printf(seq, "unused devices: "); 5924 5925 list_for_each_entry(rdev, &pending_raid_disks, same_set) { 5926 char b[BDEVNAME_SIZE]; 5927 i++; 5928 seq_printf(seq, "%s ", 5929 bdevname(rdev->bdev,b)); 5930 } 5931 if (!i) 5932 seq_printf(seq, "<none>"); 5933 5934 seq_printf(seq, "\n"); 5935 } 5936 5937 5938 static void status_resync(struct seq_file *seq, mddev_t * mddev) 5939 { 5940 sector_t max_sectors, resync, res; 5941 unsigned long dt, db; 5942 sector_t rt; 5943 int scale; 5944 unsigned int per_milli; 5945 5946 resync = mddev->curr_resync - atomic_read(&mddev->recovery_active); 5947 5948 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 5949 max_sectors = mddev->resync_max_sectors; 5950 else 5951 max_sectors = mddev->dev_sectors; 5952 5953 /* 5954 * Should not happen. 
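 * (it would mean status_resync() ran against an array with no data
 * sectors at all.)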
5955 */ 5956 if (!max_sectors) { 5957 MD_BUG(); 5958 return; 5959 } 5960 /* Pick 'scale' such that (resync>>scale)*1000 will fit 5961 * in a sector_t, and (max_sectors>>scale) will fit in a 5962 * u32, as those are the requirements for sector_div. 5963 * Thus 'scale' must be at least 10 5964 */ 5965 scale = 10; 5966 if (sizeof(sector_t) > sizeof(unsigned long)) { 5967 while ( max_sectors/2 > (1ULL<<(scale+32))) 5968 scale++; 5969 } 5970 res = (resync>>scale)*1000; 5971 sector_div(res, (u32)((max_sectors>>scale)+1)); 5972 5973 per_milli = res; 5974 { 5975 int i, x = per_milli/50, y = 20-x; 5976 seq_printf(seq, "["); 5977 for (i = 0; i < x; i++) 5978 seq_printf(seq, "="); 5979 seq_printf(seq, ">"); 5980 for (i = 0; i < y; i++) 5981 seq_printf(seq, "."); 5982 seq_printf(seq, "] "); 5983 } 5984 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 5985 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 5986 "reshape" : 5987 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 5988 "check" : 5989 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 5990 "resync" : "recovery"))), 5991 per_milli/10, per_milli % 10, 5992 (unsigned long long) resync/2, 5993 (unsigned long long) max_sectors/2); 5994 5995 /* 5996 * dt: time from mark until now 5997 * db: blocks written from mark until now 5998 * rt: remaining time 5999 * 6000 * rt is a sector_t, so could be 32bit or 64bit. 6001 * So we divide before multiply in case it is 32bit and close 6002 * to the limit. 6003 * We scale the divisor (db) by 32 to avoid losing precision 6004 * near the end of resync when the number of remaining sectors 6005 * is close to 'db'. 6006 * We then divide rt by 32 after multiplying by db to compensate. 6007 * The '+1' avoids division by zero if db is very small. 6008 */ 6009 dt = ((jiffies - mddev->resync_mark) / HZ); 6010 if (!dt) dt++; 6011 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) 6012 - mddev->resync_mark_cnt; 6013 6014 rt = max_sectors - resync; /* number of remaining sectors */ 6015 sector_div(rt, db/32+1); 6016 rt *= dt; 6017 rt >>= 5; 6018 6019 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, 6020 ((unsigned long)rt % 60)/6); 6021 6022 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 6023 } 6024 6025 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 6026 { 6027 struct list_head *tmp; 6028 loff_t l = *pos; 6029 mddev_t *mddev; 6030 6031 if (l >= 0x10000) 6032 return NULL; 6033 if (!l--) 6034 /* header */ 6035 return (void*)1; 6036 6037 spin_lock(&all_mddevs_lock); 6038 list_for_each(tmp,&all_mddevs) 6039 if (!l--) { 6040 mddev = list_entry(tmp, mddev_t, all_mddevs); 6041 mddev_get(mddev); 6042 spin_unlock(&all_mddevs_lock); 6043 return mddev; 6044 } 6045 spin_unlock(&all_mddevs_lock); 6046 if (!l--) 6047 return (void*)2;/* tail */ 6048 return NULL; 6049 } 6050 6051 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 6052 { 6053 struct list_head *tmp; 6054 mddev_t *next_mddev, *mddev = v; 6055 6056 ++*pos; 6057 if (v == (void*)2) 6058 return NULL; 6059 6060 spin_lock(&all_mddevs_lock); 6061 if (v == (void*)1) 6062 tmp = all_mddevs.next; 6063 else 6064 tmp = mddev->all_mddevs.next; 6065 if (tmp != &all_mddevs) 6066 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); 6067 else { 6068 next_mddev = (void*)2; 6069 *pos = 0x10000; 6070 } 6071 spin_unlock(&all_mddevs_lock); 6072 6073 if (v != (void*)1) 6074 mddev_put(mddev); 6075 return next_mddev; 6076 6077 } 6078 6079 static void md_seq_stop(struct seq_file *seq, void *v) 6080 { 6081 mddev_t *mddev = v; 6082 6083 if
(mddev && v != (void*)1 && v != (void*)2) 6084 mddev_put(mddev); 6085 } 6086 6087 struct mdstat_info { 6088 int event; 6089 }; 6090 6091 static int md_seq_show(struct seq_file *seq, void *v) 6092 { 6093 mddev_t *mddev = v; 6094 sector_t sectors; 6095 mdk_rdev_t *rdev; 6096 struct mdstat_info *mi = seq->private; 6097 struct bitmap *bitmap; 6098 6099 if (v == (void*)1) { 6100 struct mdk_personality *pers; 6101 seq_printf(seq, "Personalities : "); 6102 spin_lock(&pers_lock); 6103 list_for_each_entry(pers, &pers_list, list) 6104 seq_printf(seq, "[%s] ", pers->name); 6105 6106 spin_unlock(&pers_lock); 6107 seq_printf(seq, "\n"); 6108 mi->event = atomic_read(&md_event_count); 6109 return 0; 6110 } 6111 if (v == (void*)2) { 6112 status_unused(seq); 6113 return 0; 6114 } 6115 6116 if (mddev_lock(mddev) < 0) 6117 return -EINTR; 6118 6119 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 6120 seq_printf(seq, "%s : %sactive", mdname(mddev), 6121 mddev->pers ? "" : "in"); 6122 if (mddev->pers) { 6123 if (mddev->ro==1) 6124 seq_printf(seq, " (read-only)"); 6125 if (mddev->ro==2) 6126 seq_printf(seq, " (auto-read-only)"); 6127 seq_printf(seq, " %s", mddev->pers->name); 6128 } 6129 6130 sectors = 0; 6131 list_for_each_entry(rdev, &mddev->disks, same_set) { 6132 char b[BDEVNAME_SIZE]; 6133 seq_printf(seq, " %s[%d]", 6134 bdevname(rdev->bdev,b), rdev->desc_nr); 6135 if (test_bit(WriteMostly, &rdev->flags)) 6136 seq_printf(seq, "(W)"); 6137 if (test_bit(Faulty, &rdev->flags)) { 6138 seq_printf(seq, "(F)"); 6139 continue; 6140 } else if (rdev->raid_disk < 0) 6141 seq_printf(seq, "(S)"); /* spare */ 6142 sectors += rdev->sectors; 6143 } 6144 6145 if (!list_empty(&mddev->disks)) { 6146 if (mddev->pers) 6147 seq_printf(seq, "\n %llu blocks", 6148 (unsigned long long) 6149 mddev->array_sectors / 2); 6150 else 6151 seq_printf(seq, "\n %llu blocks", 6152 (unsigned long long)sectors / 2); 6153 } 6154 if (mddev->persistent) { 6155 if (mddev->major_version != 0 || 6156 mddev->minor_version != 90) { 6157 seq_printf(seq," super %d.%d", 6158 mddev->major_version, 6159 mddev->minor_version); 6160 } 6161 } else if (mddev->external) 6162 seq_printf(seq, " super external:%s", 6163 mddev->metadata_type); 6164 else 6165 seq_printf(seq, " super non-persistent"); 6166 6167 if (mddev->pers) { 6168 mddev->pers->status(seq, mddev); 6169 seq_printf(seq, "\n "); 6170 if (mddev->pers->sync_request) { 6171 if (mddev->curr_resync > 2) { 6172 status_resync(seq, mddev); 6173 seq_printf(seq, "\n "); 6174 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 6175 seq_printf(seq, "\tresync=DELAYED\n "); 6176 else if (mddev->recovery_cp < MaxSector) 6177 seq_printf(seq, "\tresync=PENDING\n "); 6178 } 6179 } else 6180 seq_printf(seq, "\n "); 6181 6182 if ((bitmap = mddev->bitmap)) { 6183 unsigned long chunk_kb; 6184 unsigned long flags; 6185 spin_lock_irqsave(&bitmap->lock, flags); 6186 chunk_kb = mddev->bitmap_info.chunksize >> 10; 6187 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " 6188 "%lu%s chunk", 6189 bitmap->pages - bitmap->missing_pages, 6190 bitmap->pages, 6191 (bitmap->pages - bitmap->missing_pages) 6192 << (PAGE_SHIFT - 10), 6193 chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize, 6194 chunk_kb ? 
"KB" : "B"); 6195 if (bitmap->file) { 6196 seq_printf(seq, ", file: "); 6197 seq_path(seq, &bitmap->file->f_path, " \t\n"); 6198 } 6199 6200 seq_printf(seq, "\n"); 6201 spin_unlock_irqrestore(&bitmap->lock, flags); 6202 } 6203 6204 seq_printf(seq, "\n"); 6205 } 6206 mddev_unlock(mddev); 6207 6208 return 0; 6209 } 6210 6211 static const struct seq_operations md_seq_ops = { 6212 .start = md_seq_start, 6213 .next = md_seq_next, 6214 .stop = md_seq_stop, 6215 .show = md_seq_show, 6216 }; 6217 6218 static int md_seq_open(struct inode *inode, struct file *file) 6219 { 6220 int error; 6221 struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL); 6222 if (mi == NULL) 6223 return -ENOMEM; 6224 6225 error = seq_open(file, &md_seq_ops); 6226 if (error) 6227 kfree(mi); 6228 else { 6229 struct seq_file *p = file->private_data; 6230 p->private = mi; 6231 mi->event = atomic_read(&md_event_count); 6232 } 6233 return error; 6234 } 6235 6236 static unsigned int mdstat_poll(struct file *filp, poll_table *wait) 6237 { 6238 struct seq_file *m = filp->private_data; 6239 struct mdstat_info *mi = m->private; 6240 int mask; 6241 6242 poll_wait(filp, &md_event_waiters, wait); 6243 6244 /* always allow read */ 6245 mask = POLLIN | POLLRDNORM; 6246 6247 if (mi->event != atomic_read(&md_event_count)) 6248 mask |= POLLERR | POLLPRI; 6249 return mask; 6250 } 6251 6252 static const struct file_operations md_seq_fops = { 6253 .owner = THIS_MODULE, 6254 .open = md_seq_open, 6255 .read = seq_read, 6256 .llseek = seq_lseek, 6257 .release = seq_release_private, 6258 .poll = mdstat_poll, 6259 }; 6260 6261 int register_md_personality(struct mdk_personality *p) 6262 { 6263 spin_lock(&pers_lock); 6264 list_add_tail(&p->list, &pers_list); 6265 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level); 6266 spin_unlock(&pers_lock); 6267 return 0; 6268 } 6269 6270 int unregister_md_personality(struct mdk_personality *p) 6271 { 6272 printk(KERN_INFO "md: %s personality unregistered\n", p->name); 6273 spin_lock(&pers_lock); 6274 list_del_init(&p->list); 6275 spin_unlock(&pers_lock); 6276 return 0; 6277 } 6278 6279 static int is_mddev_idle(mddev_t *mddev, int init) 6280 { 6281 mdk_rdev_t * rdev; 6282 int idle; 6283 int curr_events; 6284 6285 idle = 1; 6286 rcu_read_lock(); 6287 rdev_for_each_rcu(rdev, mddev) { 6288 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 6289 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + 6290 (int)part_stat_read(&disk->part0, sectors[1]) - 6291 atomic_read(&disk->sync_io); 6292 /* sync IO will cause sync_io to increase before the disk_stats 6293 * as sync_io is counted when a request starts, and 6294 * disk_stats is counted when it completes. 6295 * So resync activity will cause curr_events to be smaller than 6296 * when there was no such activity. 6297 * non-sync IO will cause disk_stat to increase without 6298 * increasing sync_io so curr_events will (eventually) 6299 * be larger than it was before. Once it becomes 6300 * substantially larger, the test below will cause 6301 * the array to appear non-idle, and resync will slow 6302 * down. 6303 * If there is a lot of outstanding resync activity when 6304 * we set last_event to curr_events, then all that activity 6305 * completing might cause the array to appear non-idle 6306 * and resync will be slowed down even though there might 6307 * not have been non-resync activity. This will only 6308 * happen once though. 
'last_events' will soon reflect
6309 * the state where there are few or no outstanding
6310 * resync requests, and further resync activity will
6311 * always make curr_events less than last_events.
6312 *
6313 */
6314 if (init || curr_events - rdev->last_events > 64) {
6315 rdev->last_events = curr_events;
6316 idle = 0;
6317 }
6318 }
6319 rcu_read_unlock();
6320 return idle;
6321 }
6322
6323 void md_done_sync(mddev_t *mddev, int blocks, int ok)
6324 {
6325 /* another "blocks" (512-byte) blocks have been synced */
6326 atomic_sub(blocks, &mddev->recovery_active);
6327 wake_up(&mddev->recovery_wait);
6328 if (!ok) {
6329 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6330 md_wakeup_thread(mddev->thread);
6331 // stop recovery, signal do_sync ....
6332 }
6333 }
6334
6335
6336 /* md_write_start(mddev, bi)
6337 * If we need to update some array metadata (e.g. 'active' flag
6338 * in superblock) before writing, schedule a superblock update
6339 * and wait for it to complete.
6340 */
6341 void md_write_start(mddev_t *mddev, struct bio *bi)
6342 {
6343 int did_change = 0;
6344 if (bio_data_dir(bi) != WRITE)
6345 return;
6346
6347 BUG_ON(mddev->ro == 1);
6348 if (mddev->ro == 2) {
6349 /* need to switch to read/write */
6350 mddev->ro = 0;
6351 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6352 md_wakeup_thread(mddev->thread);
6353 md_wakeup_thread(mddev->sync_thread);
6354 did_change = 1;
6355 }
6356 atomic_inc(&mddev->writes_pending);
6357 if (mddev->safemode == 1)
6358 mddev->safemode = 0;
6359 if (mddev->in_sync) {
6360 spin_lock_irq(&mddev->write_lock);
6361 if (mddev->in_sync) {
6362 mddev->in_sync = 0;
6363 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6364 md_wakeup_thread(mddev->thread);
6365 did_change = 1;
6366 }
6367 spin_unlock_irq(&mddev->write_lock);
6368 }
6369 if (did_change)
6370 sysfs_notify_dirent(mddev->sysfs_state);
6371 wait_event(mddev->sb_wait,
6372 !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
6373 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
6374 }
6375
6376 void md_write_end(mddev_t *mddev)
6377 {
6378 if (atomic_dec_and_test(&mddev->writes_pending)) {
6379 if (mddev->safemode == 2)
6380 md_wakeup_thread(mddev->thread);
6381 else if (mddev->safemode_delay)
6382 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
6383 }
6384 }
6385
6386 /* md_allow_write(mddev)
6387 * Calling this ensures that the array is marked 'active' so that writes
6388 * may proceed without blocking. It is important to call this before
6389 * attempting a GFP_KERNEL allocation while holding the mddev lock.
6390 * Must be called with mddev_lock held.
6391 *
6392 * In the ->external case MD_CHANGE_CLEAN cannot be cleared until mddev->lock
6393 * is dropped, so return -EAGAIN after notifying userspace.
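 *
 * A sketch of a typical caller (illustrative only; the exact pattern
 * varies between personalities):
 *
 *	err = md_allow_write(mddev);
 *	if (err)
 *		return err;	(external metadata: propagate -EAGAIN)
 *	buf = kmalloc(len, GFP_KERNEL);	(will not block on a sb update)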
6394 */
6395 int md_allow_write(mddev_t *mddev)
6396 {
6397 if (!mddev->pers)
6398 return 0;
6399 if (mddev->ro)
6400 return 0;
6401 if (!mddev->pers->sync_request)
6402 return 0;
6403
6404 spin_lock_irq(&mddev->write_lock);
6405 if (mddev->in_sync) {
6406 mddev->in_sync = 0;
6407 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6408 if (mddev->safemode_delay &&
6409 mddev->safemode == 0)
6410 mddev->safemode = 1;
6411 spin_unlock_irq(&mddev->write_lock);
6412 md_update_sb(mddev, 0);
6413 sysfs_notify_dirent(mddev->sysfs_state);
6414 } else
6415 spin_unlock_irq(&mddev->write_lock);
6416
6417 if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
6418 return -EAGAIN;
6419 else
6420 return 0;
6421 }
6422 EXPORT_SYMBOL_GPL(md_allow_write);
6423
6424 #define SYNC_MARKS 10
6425 #define SYNC_MARK_STEP (3*HZ)
6426 void md_do_sync(mddev_t *mddev)
6427 {
6428 mddev_t *mddev2;
6429 unsigned int currspeed = 0,
6430 window;
6431 sector_t max_sectors,j, io_sectors;
6432 unsigned long mark[SYNC_MARKS];
6433 sector_t mark_cnt[SYNC_MARKS];
6434 int last_mark,m;
6435 struct list_head *tmp;
6436 sector_t last_check;
6437 int skipped = 0;
6438 mdk_rdev_t *rdev;
6439 char *desc;
6440
6441 /* just in case the thread restarts... */
6442 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
6443 return;
6444 if (mddev->ro) /* never try to sync a read-only array */
6445 return;
6446
6447 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6448 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
6449 desc = "data-check";
6450 else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
6451 desc = "requested-resync";
6452 else
6453 desc = "resync";
6454 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6455 desc = "reshape";
6456 else
6457 desc = "recovery";
6458
6459 /* we overload curr_resync somewhat here.
6460 * 0 == not engaged in resync at all
6461 * 2 == checking that there is no conflict with another sync
6462 * 1 == like 2, but have yielded to allow conflicting resync to
6463 * commence
6464 * other == active in resync - this many blocks
6465 *
6466 * Before starting a resync we must have set curr_resync to
6467 * 2, and then checked that every "conflicting" array has curr_resync
6468 * less than ours. When we find one that is the same or higher
6469 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync
6470 * to 1 if we choose to yield (based arbitrarily on the address of the mddev structure).
6471 * This will mean we have to start checking from the beginning again.
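 *
 * A made-up example of the yield protocol: md0 and md1 share a disk
 * and both reach curr_resync == 2.  The mddev at the lower address
 * (md0, say) drops curr_resync to 1 and sleeps on resync_wait; md1
 * proceeds, and when it finishes md0 is woken and restarts its
 * conflict check from the beginning.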
6472 * 6473 */ 6474 6475 do { 6476 mddev->curr_resync = 2; 6477 6478 try_again: 6479 if (kthread_should_stop()) 6480 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6481 6482 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 6483 goto skip; 6484 for_each_mddev(mddev2, tmp) { 6485 if (mddev2 == mddev) 6486 continue; 6487 if (!mddev->parallel_resync 6488 && mddev2->curr_resync 6489 && match_mddev_units(mddev, mddev2)) { 6490 DEFINE_WAIT(wq); 6491 if (mddev < mddev2 && mddev->curr_resync == 2) { 6492 /* arbitrarily yield */ 6493 mddev->curr_resync = 1; 6494 wake_up(&resync_wait); 6495 } 6496 if (mddev > mddev2 && mddev->curr_resync == 1) 6497 /* no need to wait here, we can wait the next 6498 * time 'round when curr_resync == 2 6499 */ 6500 continue; 6501 /* We need to wait 'interruptible' so as not to 6502 * contribute to the load average, and not to 6503 * be caught by 'softlockup' 6504 */ 6505 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 6506 if (!kthread_should_stop() && 6507 mddev2->curr_resync >= mddev->curr_resync) { 6508 printk(KERN_INFO "md: delaying %s of %s" 6509 " until %s has finished (they" 6510 " share one or more physical units)\n", 6511 desc, mdname(mddev), mdname(mddev2)); 6512 mddev_put(mddev2); 6513 if (signal_pending(current)) 6514 flush_signals(current); 6515 schedule(); 6516 finish_wait(&resync_wait, &wq); 6517 goto try_again; 6518 } 6519 finish_wait(&resync_wait, &wq); 6520 } 6521 } 6522 } while (mddev->curr_resync < 2); 6523 6524 j = 0; 6525 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 6526 /* resync follows the size requested by the personality, 6527 * which defaults to physical size, but can be virtual size 6528 */ 6529 max_sectors = mddev->resync_max_sectors; 6530 mddev->resync_mismatches = 0; 6531 /* we don't use the checkpoint if there's a bitmap */ 6532 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 6533 j = mddev->resync_min; 6534 else if (!mddev->bitmap) 6535 j = mddev->recovery_cp; 6536 6537 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 6538 max_sectors = mddev->dev_sectors; 6539 else { 6540 /* recovery follows the physical size of devices */ 6541 max_sectors = mddev->dev_sectors; 6542 j = MaxSector; 6543 rcu_read_lock(); 6544 list_for_each_entry_rcu(rdev, &mddev->disks, same_set) 6545 if (rdev->raid_disk >= 0 && 6546 !test_bit(Faulty, &rdev->flags) && 6547 !test_bit(In_sync, &rdev->flags) && 6548 rdev->recovery_offset < j) 6549 j = rdev->recovery_offset; 6550 rcu_read_unlock(); 6551 } 6552 6553 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); 6554 printk(KERN_INFO "md: minimum _guaranteed_ speed:" 6555 " %d KB/sec/disk.\n", speed_min(mddev)); 6556 printk(KERN_INFO "md: using maximum available idle IO bandwidth " 6557 "(but not more than %d KB/sec) for %s.\n", 6558 speed_max(mddev), desc); 6559 6560 is_mddev_idle(mddev, 1); /* this initializes IO event counters */ 6561 6562 io_sectors = 0; 6563 for (m = 0; m < SYNC_MARKS; m++) { 6564 mark[m] = jiffies; 6565 mark_cnt[m] = io_sectors; 6566 } 6567 last_mark = 0; 6568 mddev->resync_mark = mark[last_mark]; 6569 mddev->resync_mark_cnt = mark_cnt[last_mark]; 6570 6571 /* 6572 * Tune reconstruction: 6573 */ 6574 window = 32*(PAGE_SIZE/512); 6575 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", 6576 window/2,(unsigned long long) max_sectors/2); 6577 6578 atomic_set(&mddev->recovery_active, 0); 6579 last_check = 0; 6580 6581 if (j>2) { 6582 printk(KERN_INFO 6583 "md: resuming %s of %s from checkpoint.\n", 6584 desc, mdname(mddev)); 6585 
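		/* Illustrative, hypothetical numbers: a resync interrupted at
		 * the 1GB mark leaves recovery_cp at 2097152 sectors, so the
		 * main loop below resumes from there instead of sector 0.
		 */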
mddev->curr_resync = j;
6586 }
6587 mddev->curr_resync_completed = mddev->curr_resync;
6588
6589 while (j < max_sectors) {
6590 sector_t sectors;
6591
6592 skipped = 0;
6593
6594 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
6595 ((mddev->curr_resync > mddev->curr_resync_completed &&
6596 (mddev->curr_resync - mddev->curr_resync_completed)
6597 > (max_sectors >> 4)) ||
6598 (j - mddev->curr_resync_completed)*2
6599 >= mddev->resync_max - mddev->curr_resync_completed
6600 )) {
6601 /* time to update curr_resync_completed */
6602 blk_unplug(mddev->queue);
6603 wait_event(mddev->recovery_wait,
6604 atomic_read(&mddev->recovery_active) == 0);
6605 mddev->curr_resync_completed =
6606 mddev->curr_resync;
6607 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6608 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6609 }
6610
6611 while (j >= mddev->resync_max && !kthread_should_stop()) {
6612 /* As this condition is controlled by user-space,
6613 * we can block indefinitely, so use '_interruptible'
6614 * to avoid triggering warnings.
6615 */
6616 flush_signals(current); /* just in case */
6617 wait_event_interruptible(mddev->recovery_wait,
6618 mddev->resync_max > j
6619 || kthread_should_stop());
6620 }
6621
6622 if (kthread_should_stop())
6623 goto interrupted;
6624
6625 sectors = mddev->pers->sync_request(mddev, j, &skipped,
6626 currspeed < speed_min(mddev));
6627 if (sectors == 0) {
6628 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6629 goto out;
6630 }
6631
6632 if (!skipped) { /* actual IO requested */
6633 io_sectors += sectors;
6634 atomic_add(sectors, &mddev->recovery_active);
6635 }
6636
6637 j += sectors;
6638 if (j>1) mddev->curr_resync = j;
6639 mddev->curr_mark_cnt = io_sectors;
6640 if (last_check == 0)
6641 /* this is the earliest that the rebuild will be
6642 * visible in /proc/mdstat
6643 */
6644 md_new_event(mddev);
6645
6646 if (last_check + window > io_sectors || j == max_sectors)
6647 continue;
6648
6649 last_check = io_sectors;
6650
6651 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6652 break;
6653
6654 repeat:
6655 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
6656 /* step marks */
6657 int next = (last_mark+1) % SYNC_MARKS;
6658
6659 mddev->resync_mark = mark[next];
6660 mddev->resync_mark_cnt = mark_cnt[next];
6661 mark[next] = jiffies;
6662 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
6663 last_mark = next;
6664 }
6665
6666
6667 if (kthread_should_stop())
6668 goto interrupted;
6669
6670
6671 /*
6672 * this loop exits only when we are slower than
6673 * the 'hard' speed limit, or the system was IO-idle for
6674 * a jiffy.
6675 * the system might be non-idle CPU-wise, but we only care
6676 * about not overloading the IO subsystem.
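 * For illustration, with made-up figures: 40000 sectors written in
 * the ~10 seconds since the last mark gives a currspeed of roughly
 * 1800 KB/sec (20000 KB over 11 seconds, given the '+1' terms); while
 * above speed_min() we sleep 500ms whenever we also exceed speed_max()
 * or the array is serving other IO, so that foreground work stays
 * responsive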
(things like an 6677 * e2fsck being done on the RAID array should execute fast) 6678 */ 6679 blk_unplug(mddev->queue); 6680 cond_resched(); 6681 6682 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 6683 /((jiffies-mddev->resync_mark)/HZ +1) +1; 6684 6685 if (currspeed > speed_min(mddev)) { 6686 if ((currspeed > speed_max(mddev)) || 6687 !is_mddev_idle(mddev, 0)) { 6688 msleep(500); 6689 goto repeat; 6690 } 6691 } 6692 } 6693 printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); 6694 /* 6695 * this also signals 'finished resyncing' to md_stop 6696 */ 6697 out: 6698 blk_unplug(mddev->queue); 6699 6700 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 6701 6702 /* tell personality that we are finished */ 6703 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); 6704 6705 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 6706 mddev->curr_resync > 2) { 6707 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 6708 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 6709 if (mddev->curr_resync >= mddev->recovery_cp) { 6710 printk(KERN_INFO 6711 "md: checkpointing %s of %s.\n", 6712 desc, mdname(mddev)); 6713 mddev->recovery_cp = mddev->curr_resync; 6714 } 6715 } else 6716 mddev->recovery_cp = MaxSector; 6717 } else { 6718 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 6719 mddev->curr_resync = MaxSector; 6720 rcu_read_lock(); 6721 list_for_each_entry_rcu(rdev, &mddev->disks, same_set) 6722 if (rdev->raid_disk >= 0 && 6723 !test_bit(Faulty, &rdev->flags) && 6724 !test_bit(In_sync, &rdev->flags) && 6725 rdev->recovery_offset < mddev->curr_resync) 6726 rdev->recovery_offset = mddev->curr_resync; 6727 rcu_read_unlock(); 6728 } 6729 } 6730 set_bit(MD_CHANGE_DEVS, &mddev->flags); 6731 6732 skip: 6733 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 6734 /* We completed so min/max setting can be forgotten if used. */ 6735 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 6736 mddev->resync_min = 0; 6737 mddev->resync_max = MaxSector; 6738 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 6739 mddev->resync_min = mddev->curr_resync_completed; 6740 mddev->curr_resync = 0; 6741 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 6742 mddev->curr_resync_completed = 0; 6743 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 6744 wake_up(&resync_wait); 6745 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 6746 md_wakeup_thread(mddev->thread); 6747 return; 6748 6749 interrupted: 6750 /* 6751 * got a signal, exit. 6752 */ 6753 printk(KERN_INFO 6754 "md: md_do_sync() got signal ... exiting\n"); 6755 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6756 goto out; 6757 6758 } 6759 EXPORT_SYMBOL_GPL(md_do_sync); 6760 6761 6762 static int remove_and_add_spares(mddev_t *mddev) 6763 { 6764 mdk_rdev_t *rdev; 6765 int spares = 0; 6766 6767 mddev->curr_resync_completed = 0; 6768 6769 list_for_each_entry(rdev, &mddev->disks, same_set) 6770 if (rdev->raid_disk >= 0 && 6771 !test_bit(Blocked, &rdev->flags) && 6772 (test_bit(Faulty, &rdev->flags) || 6773 ! test_bit(In_sync, &rdev->flags)) && 6774 atomic_read(&rdev->nr_pending)==0) { 6775 if (mddev->pers->hot_remove_disk( 6776 mddev, rdev->raid_disk)==0) { 6777 char nm[20]; 6778 sprintf(nm,"rd%d", rdev->raid_disk); 6779 sysfs_remove_link(&mddev->kobj, nm); 6780 rdev->raid_disk = -1; 6781 } 6782 } 6783 6784 if (mddev->degraded && ! 
mddev->ro && !mddev->recovery_disabled) {
6785 list_for_each_entry(rdev, &mddev->disks, same_set) {
6786 if (rdev->raid_disk >= 0 &&
6787 !test_bit(In_sync, &rdev->flags) &&
6788 !test_bit(Blocked, &rdev->flags))
6789 spares++;
6790 if (rdev->raid_disk < 0
6791 && !test_bit(Faulty, &rdev->flags)) {
6792 rdev->recovery_offset = 0;
6793 if (mddev->pers->
6794 hot_add_disk(mddev, rdev) == 0) {
6795 char nm[20];
6796 sprintf(nm, "rd%d", rdev->raid_disk);
6797 if (sysfs_create_link(&mddev->kobj,
6798 &rdev->kobj, nm))
6799 printk(KERN_WARNING
6800 "md: cannot register "
6801 "%s for %s\n",
6802 nm, mdname(mddev));
6803 spares++;
6804 md_new_event(mddev);
6805 set_bit(MD_CHANGE_DEVS, &mddev->flags);
6806 } else
6807 break;
6808 }
6809 }
6810 }
6811 return spares;
6812 }
6813 /*
6814 * This routine is regularly called by all per-raid-array threads to
6815 * deal with generic issues like resync and super-block update.
6816 * Raid personalities that don't have a thread (linear/raid0) do not
6817 * need this as they never do any recovery or update the superblock.
6818 *
6819 * It does not do any resync itself, but rather "forks" off other threads
6820 * to do that as needed.
6821 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
6822 * "->recovery" and create a thread at ->sync_thread.
6823 * When the thread finishes it sets MD_RECOVERY_DONE
6824 * and wakes up this thread which will reap the thread and finish up.
6825 * This thread also removes any faulty devices (with nr_pending == 0).
6826 *
6827 * The overall approach is:
6828 * 1/ If the superblock needs updating, update it.
6829 * 2/ If a recovery thread is running, don't do anything else.
6830 * 3/ If recovery has finished, clean up, possibly marking spares active.
6831 * 4/ If there are any faulty devices, remove them.
6832 * 5/ If array is degraded, try to add spare devices
6833 * 6/ If array has spares or is not in-sync, start a resync thread.
6834 */
6835 void md_check_recovery(mddev_t *mddev)
6836 {
6837 mdk_rdev_t *rdev;
6838
6839
6840 if (mddev->bitmap)
6841 bitmap_daemon_work(mddev);
6842
6843 if (mddev->ro)
6844 return;
6845
6846 if (signal_pending(current)) {
6847 if (mddev->pers->sync_request && !mddev->external) {
6848 printk(KERN_INFO "md: %s in immediate safe mode\n",
6849 mdname(mddev));
6850 mddev->safemode = 2;
6851 }
6852 flush_signals(current);
6853 }
6854
6855 if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
6856 return;
6857 if ( ! (
6858 (mddev->flags && !mddev->external) ||
6859 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
6860 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
6861 (mddev->external == 0 && mddev->safemode == 1) ||
6862 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
6863 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
6864 ))
6865 return;
6866
6867 if (mddev_trylock(mddev)) {
6868 int spares = 0;
6869
6870 if (mddev->ro) {
6871 /* The only thing we do on a ro array is remove
6872 * failed devices.
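 * Hot-adding a spare would kick off recovery writes, which a
 * read-only array must not issue.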
6873 */
6874 remove_and_add_spares(mddev);
6875 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6876 goto unlock;
6877 }
6878
6879 if (!mddev->external) {
6880 int did_change = 0;
6881 spin_lock_irq(&mddev->write_lock);
6882 if (mddev->safemode &&
6883 !atomic_read(&mddev->writes_pending) &&
6884 !mddev->in_sync &&
6885 mddev->recovery_cp == MaxSector) {
6886 mddev->in_sync = 1;
6887 did_change = 1;
6888 if (mddev->persistent)
6889 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6890 }
6891 if (mddev->safemode == 1)
6892 mddev->safemode = 0;
6893 spin_unlock_irq(&mddev->write_lock);
6894 if (did_change)
6895 sysfs_notify_dirent(mddev->sysfs_state);
6896 }
6897
6898 if (mddev->flags)
6899 md_update_sb(mddev, 0);
6900
6901 list_for_each_entry(rdev, &mddev->disks, same_set)
6902 if (test_and_clear_bit(StateChanged, &rdev->flags))
6903 sysfs_notify_dirent(rdev->sysfs_state);
6904
6905
6906 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
6907 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
6908 /* resync/recovery still happening */
6909 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6910 goto unlock;
6911 }
6912 if (mddev->sync_thread) {
6913 /* resync has finished, collect result */
6914 md_unregister_thread(mddev->sync_thread);
6915 mddev->sync_thread = NULL;
6916 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
6917 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
6918 /* success...*/
6919 /* activate any spares */
6920 if (mddev->pers->spare_active(mddev))
6921 sysfs_notify(&mddev->kobj, NULL,
6922 "degraded");
6923 }
6924 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
6925 mddev->pers->finish_reshape)
6926 mddev->pers->finish_reshape(mddev);
6927 md_update_sb(mddev, 1);
6928
6929 /* if array is no longer degraded, then any saved_raid_disk
6930 * information must be scrapped
6931 */
6932 if (!mddev->degraded)
6933 list_for_each_entry(rdev, &mddev->disks, same_set)
6934 rdev->saved_raid_disk = -1;
6935
6936 mddev->recovery = 0;
6937 /* flag recovery needed just to double check */
6938 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6939 sysfs_notify_dirent(mddev->sysfs_action);
6940 md_new_event(mddev);
6941 goto unlock;
6942 }
6943 /* Set RUNNING before clearing NEEDED to avoid
6944 * any transients in the value of "sync_action".
6945 */
6946 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6947 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6948 /* Clear some bits that don't mean anything, but
6949 * might be left set
6950 */
6951 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
6952 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
6953
6954 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
6955 goto unlock;
6956 /* no recovery is running.
6957 * remove any failed drives, then
6958 * add spares if possible.
6959 * Spares are also removed and re-added, to allow
6960 * the personality to fail the re-add.
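 *
 * In outline, the ladder below picks exactly one action:
 *	reshape_position set	-> reshape (if ->check_reshape allows)
 *	spares removed/added	-> recover
 *	recovery_cp < MaxSector	-> resync
 *	otherwise		-> nothing to do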
6961 */
6962
6963 if (mddev->reshape_position != MaxSector) {
6964 if (mddev->pers->check_reshape == NULL ||
6965 mddev->pers->check_reshape(mddev) != 0)
6966 /* Cannot proceed */
6967 goto unlock;
6968 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
6969 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6970 } else if ((spares = remove_and_add_spares(mddev))) {
6971 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
6972 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
6973 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
6974 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6975 } else if (mddev->recovery_cp < MaxSector) {
6976 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
6977 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6978 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
6979 /* nothing to be done ... */
6980 goto unlock;
6981
6982 if (mddev->pers->sync_request) {
6983 if (spares && mddev->bitmap && ! mddev->bitmap->file) {
6984 /* We are adding a device or devices to an array
6985 * which has the bitmap stored on all devices.
6986 * So make sure all bitmap pages get written
6987 */
6988 bitmap_write_all(mddev->bitmap);
6989 }
6990 mddev->sync_thread = md_register_thread(md_do_sync,
6991 mddev,
6992 "resync");
6993 if (!mddev->sync_thread) {
6994 printk(KERN_ERR "%s: could not start resync"
6995 " thread...\n",
6996 mdname(mddev));
6997 /* leave the spares where they are, it shouldn't hurt */
6998 mddev->recovery = 0;
6999 } else
7000 md_wakeup_thread(mddev->sync_thread);
7001 sysfs_notify_dirent(mddev->sysfs_action);
7002 md_new_event(mddev);
7003 }
7004 unlock:
7005 if (!mddev->sync_thread) {
7006 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7007 if (test_and_clear_bit(MD_RECOVERY_RECOVER,
7008 &mddev->recovery))
7009 if (mddev->sysfs_action)
7010 sysfs_notify_dirent(mddev->sysfs_action);
7011 }
7012 mddev_unlock(mddev);
7013 }
7014 }
7015
7016 void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
7017 {
7018 sysfs_notify_dirent(rdev->sysfs_state);
7019 wait_event_timeout(rdev->blocked_wait,
7020 !test_bit(Blocked, &rdev->flags),
7021 msecs_to_jiffies(5000));
7022 rdev_dec_pending(rdev, mddev);
7023 }
7024 EXPORT_SYMBOL(md_wait_for_blocked_rdev);
7025
7026 static int md_notify_reboot(struct notifier_block *this,
7027 unsigned long code, void *x)
7028 {
7029 struct list_head *tmp;
7030 mddev_t *mddev;
7031
7032 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
7033
7034 printk(KERN_INFO "md: stopping all md devices.\n");
7035
7036 for_each_mddev(mddev, tmp)
7037 if (mddev_trylock(mddev)) {
7038 /* Force a switch to readonly even if the array
7039 * appears to still be in use. Hence
7040 * the '100'.
7041 */
7042 do_md_stop(mddev, 1, 100);
7043 mddev_unlock(mddev);
7044 }
7045 /*
7046 * certain more exotic SCSI devices are known to be
7047 * volatile wrt too-early system reboots. While the
7048 * right place to handle this issue is the given
7049 * driver, we do want to have a safe RAID driver ...
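 * The one-second delay below is a crude grace period to let such
 * devices settle before the reboot proceeds.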
7050 */ 7051 mdelay(1000*1); 7052 } 7053 return NOTIFY_DONE; 7054 } 7055 7056 static struct notifier_block md_notifier = { 7057 .notifier_call = md_notify_reboot, 7058 .next = NULL, 7059 .priority = INT_MAX, /* before any real devices */ 7060 }; 7061 7062 static void md_geninit(void) 7063 { 7064 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 7065 7066 proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops); 7067 } 7068 7069 static int __init md_init(void) 7070 { 7071 if (register_blkdev(MD_MAJOR, "md")) 7072 return -1; 7073 if ((mdp_major=register_blkdev(0, "mdp"))<=0) { 7074 unregister_blkdev(MD_MAJOR, "md"); 7075 return -1; 7076 } 7077 blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE, 7078 md_probe, NULL, NULL); 7079 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, 7080 md_probe, NULL, NULL); 7081 7082 register_reboot_notifier(&md_notifier); 7083 raid_table_header = register_sysctl_table(raid_root_table); 7084 7085 md_geninit(); 7086 return 0; 7087 } 7088 7089 7090 #ifndef MODULE 7091 7092 /* 7093 * Searches all registered partitions for autorun RAID arrays 7094 * at boot time. 7095 */ 7096 7097 static LIST_HEAD(all_detected_devices); 7098 struct detected_devices_node { 7099 struct list_head list; 7100 dev_t dev; 7101 }; 7102 7103 void md_autodetect_dev(dev_t dev) 7104 { 7105 struct detected_devices_node *node_detected_dev; 7106 7107 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL); 7108 if (node_detected_dev) { 7109 node_detected_dev->dev = dev; 7110 list_add_tail(&node_detected_dev->list, &all_detected_devices); 7111 } else { 7112 printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed" 7113 ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev)); 7114 } 7115 } 7116 7117 7118 static void autostart_arrays(int part) 7119 { 7120 mdk_rdev_t *rdev; 7121 struct detected_devices_node *node_detected_dev; 7122 dev_t dev; 7123 int i_scanned, i_passed; 7124 7125 i_scanned = 0; 7126 i_passed = 0; 7127 7128 printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 7129 7130 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { 7131 i_scanned++; 7132 node_detected_dev = list_entry(all_detected_devices.next, 7133 struct detected_devices_node, list); 7134 list_del(&node_detected_dev->list); 7135 dev = node_detected_dev->dev; 7136 kfree(node_detected_dev); 7137 rdev = md_import_device(dev,0, 90); 7138 if (IS_ERR(rdev)) 7139 continue; 7140 7141 if (test_bit(Faulty, &rdev->flags)) { 7142 MD_BUG(); 7143 continue; 7144 } 7145 set_bit(AutoDetected, &rdev->flags); 7146 list_add(&rdev->same_set, &pending_raid_disks); 7147 i_passed++; 7148 } 7149 7150 printk(KERN_INFO "md: Scanned %d and added %d devices.\n", 7151 i_scanned, i_passed); 7152 7153 autorun_devices(part); 7154 } 7155 7156 #endif /* !MODULE */ 7157 7158 static __exit void md_exit(void) 7159 { 7160 mddev_t *mddev; 7161 struct list_head *tmp; 7162 7163 blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS); 7164 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); 7165 7166 unregister_blkdev(MD_MAJOR,"md"); 7167 unregister_blkdev(mdp_major, "mdp"); 7168 unregister_reboot_notifier(&md_notifier); 7169 unregister_sysctl_table(raid_table_header); 7170 remove_proc_entry("mdstat", NULL); 7171 for_each_mddev(mddev, tmp) { 7172 export_array(mddev); 7173 mddev->hold_active = 0; 7174 } 7175 } 7176 7177 subsys_initcall(md_init); 7178 module_exit(md_exit) 7179 7180 static int get_ro(char *buffer, struct kernel_param *kp) 7181 { 7182 return sprintf(buffer, "%d", 
start_readonly); 7183 } 7184 static int set_ro(const char *val, struct kernel_param *kp) 7185 { 7186 char *e; 7187 int num = simple_strtoul(val, &e, 10); 7188 if (*val && (*e == '\0' || *e == '\n')) { 7189 start_readonly = num; 7190 return 0; 7191 } 7192 return -EINVAL; 7193 } 7194 7195 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); 7196 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); 7197 7198 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); 7199 7200 EXPORT_SYMBOL(register_md_personality); 7201 EXPORT_SYMBOL(unregister_md_personality); 7202 EXPORT_SYMBOL(md_error); 7203 EXPORT_SYMBOL(md_done_sync); 7204 EXPORT_SYMBOL(md_write_start); 7205 EXPORT_SYMBOL(md_write_end); 7206 EXPORT_SYMBOL(md_register_thread); 7207 EXPORT_SYMBOL(md_unregister_thread); 7208 EXPORT_SYMBOL(md_wakeup_thread); 7209 EXPORT_SYMBOL(md_check_recovery); 7210 MODULE_LICENSE("GPL"); 7211 MODULE_DESCRIPTION("MD RAID framework"); 7212 MODULE_ALIAS("md"); 7213 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); 7214
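
/*
 * Illustrative use of the module parameters above (a sketch only; the
 * "md_mod" module name and sysfs paths depend on how md is built):
 *
 *	# modprobe md-mod start_ro=1
 *	# cat /sys/module/md_mod/parameters/start_ro
 *	# echo 0 > /sys/module/md_mod/parameters/start_ro
 */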