/*
   md.c : Multiple Devices driver for Linux
	  Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/buffer_head.h> /* for invalidate_bdev */
#include <linux/poll.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/slab.h>
#include "md.h"
#include "bitmap.h"

#define DEBUG 0
#define dprintk(x...) ((void)(DEBUG && printk(x)))


#ifndef MODULE
static void autostart_arrays(int part);
#endif

static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

static void md_print_devices(void);

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);

#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }

/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max,
 * or via /sys/block/mdX/md/sync_speed_{min,max}
 */

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static inline int speed_min(mddev_t *mddev)
{
	return mddev->sync_speed_min ?
		mddev->sync_speed_min : sysctl_speed_limit_min;
}

static inline int speed_max(mddev_t *mddev)
{
	return mddev->sync_speed_max ?
		mddev->sync_speed_max : sysctl_speed_limit_max;
}
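/*
 * Illustrative sketch (not part of the driver): how a resync loop could
 * consult the limits above.  speed_min() is the floor the resync thread
 * tries to maintain even under load, speed_max() the ceiling it must not
 * exceed; both fall back to the global sysctl values whenever the
 * per-array sysfs knobs are 0.  The variable names below are hypothetical.
 */
#if 0
	int kb_per_sec_floor   = speed_min(mddev);	/* guaranteed rate  */
	int kb_per_sec_ceiling = speed_max(mddev);	/* hard upper bound */

	if (currspeed > kb_per_sec_ceiling)
		msleep(500);				/* throttle resync  */
#endif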

static struct ctl_table_header *raid_table_header;

static ctl_table raid_table[] = {
	{
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static ctl_table raid_dir_table[] = {
	{
		.procname	= "raid",
		.maxlen		= 0,
		.mode		= S_IRUGO|S_IXUGO,
		.child		= raid_table,
	},
	{ }
};

static ctl_table raid_root_table[] = {
	{
		.procname	= "dev",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_dir_table,
	},
	{ }
};

static const struct block_device_operations md_fops;

static int start_readonly;

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
void md_new_event(mddev_t *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);

/* Alternate version that can be called from interrupts
 * when calling sysfs_notify isn't needed.
 */
static void md_new_event_inintr(mddev_t *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}

/*
 * Enables iteration over all existing md arrays.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);


/*
 * Iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while still holding
 * a reference to the current mddev must mddev_put it.
 */
#define for_each_mddev(mddev,tmp)					\
									\
	for (({ spin_lock(&all_mddevs_lock);				\
		tmp = all_mddevs.next;					\
		mddev = NULL;});					\
	     ({ if (tmp != &all_mddevs)					\
			mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
		spin_unlock(&all_mddevs_lock);				\
		if (mddev) mddev_put(mddev);				\
		mddev = list_entry(tmp, mddev_t, all_mddevs);		\
		tmp != &all_mddevs;});					\
	     ({ spin_lock(&all_mddevs_lock);				\
		tmp = tmp->next;})					\
		)
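/*
 * Usage sketch for the iterator above (md_print_devices() below walks
 * arrays the same way).  The macro takes and drops the refcount itself
 * on each step, so a body that breaks out early must mddev_put() the
 * reference it is still holding.  some_condition() is a hypothetical
 * predicate, used only for illustration.
 */
#if 0
	mddev_t *mddev;
	struct list_head *tmp;

	for_each_mddev(mddev, tmp) {
		printk(KERN_INFO "md: considering %s\n", mdname(mddev));
		if (some_condition(mddev)) {
			mddev_put(mddev);	/* drop ref taken by the macro */
			break;
		}
	}
#endif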

/* Rather than calling directly into the personality make_request function,
 * IO requests come here first so that we can check if the device
 * is being suspended pending a reconfiguration.
 * We hold a refcount over the call to ->make_request.  By the time that
 * call has finished, the bio has been linked into some internal structure
 * and so is visible to ->quiesce(), so we don't need the refcount any more.
 */
static int md_make_request(struct request_queue *q, struct bio *bio)
{
	const int rw = bio_data_dir(bio);
	mddev_t *mddev = q->queuedata;
	int rv;
	int cpu;

	if (mddev == NULL || mddev->pers == NULL) {
		bio_io_error(bio);
		return 0;
	}
	rcu_read_lock();
	if (mddev->suspended || mddev->barrier) {
		DEFINE_WAIT(__wait);
		for (;;) {
			prepare_to_wait(&mddev->sb_wait, &__wait,
					TASK_UNINTERRUPTIBLE);
			if (!mddev->suspended && !mddev->barrier)
				break;
			rcu_read_unlock();
			schedule();
			rcu_read_lock();
		}
		finish_wait(&mddev->sb_wait, &__wait);
	}
	atomic_inc(&mddev->active_io);
	rcu_read_unlock();

	rv = mddev->pers->make_request(mddev, bio);

	cpu = part_stat_lock();
	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
		      bio_sectors(bio));
	part_stat_unlock();

	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
		wake_up(&mddev->sb_wait);

	return rv;
}

/* mddev_suspend makes sure no new requests are submitted
 * to the device, and that any requests that have been submitted
 * are completely handled.
 * Once ->stop is called and completes, the module will be completely
 * unused.
 */
static void mddev_suspend(mddev_t *mddev)
{
	BUG_ON(mddev->suspended);
	mddev->suspended = 1;
	synchronize_rcu();
	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
	mddev->pers->quiesce(mddev, 1);
}

static void mddev_resume(mddev_t *mddev)
{
	mddev->suspended = 0;
	wake_up(&mddev->sb_wait);
	mddev->pers->quiesce(mddev, 0);
}

int mddev_congested(mddev_t *mddev, int bits)
{
	if (mddev->barrier)
		return 1;
	return mddev->suspended;
}
EXPORT_SYMBOL(mddev_congested);

/*
 * Generic barrier handling for md
 */

#define POST_REQUEST_BARRIER ((void*)1)

static void md_end_barrier(struct bio *bio, int err)
{
	mdk_rdev_t *rdev = bio->bi_private;
	mddev_t *mddev = rdev->mddev;
	if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER)
		set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);

	rdev_dec_pending(rdev, mddev);

	if (atomic_dec_and_test(&mddev->flush_pending)) {
		if (mddev->barrier == POST_REQUEST_BARRIER) {
			/* This was a post-request barrier */
			mddev->barrier = NULL;
			wake_up(&mddev->sb_wait);
		} else
			/* The pre-request barrier has finished */
			schedule_work(&mddev->barrier_work);
	}
	bio_put(bio);
}

static void submit_barriers(mddev_t *mddev)
{
	mdk_rdev_t *rdev;

	rcu_read_lock();
	list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Faulty, &rdev->flags)) {
			/* Take two references, one is dropped
			 * when request finishes, one after
			 * we reclaim rcu_read_lock
			 */
			struct bio *bi;
			atomic_inc(&rdev->nr_pending);
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			bi = bio_alloc(GFP_KERNEL, 0);
			bi->bi_end_io = md_end_barrier;
			bi->bi_private = rdev;
			bi->bi_bdev = rdev->bdev;
			atomic_inc(&mddev->flush_pending);
			submit_bio(WRITE_BARRIER, bi);
			rcu_read_lock();
			rdev_dec_pending(rdev, mddev);
		}
	rcu_read_unlock();
}
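/*
 * Flow of a barrier request through the helpers above and the
 * md_submit_barrier()/md_barrier_request() functions below:
 *
 *  1. md_barrier_request() parks the bio in mddev->barrier and sends a
 *     zero-length WRITE_BARRIER to every active member (submit_barriers()).
 *  2. When the last of those completes, md_end_barrier() schedules
 *     md_submit_barrier(), which strips BIO_RW_BARRIER from the original
 *     bio and passes it to the personality.
 *  3. A second round of member barriers (mddev->barrier set to
 *     POST_REQUEST_BARRIER) then flushes the data, after which
 *     mddev->barrier is cleared and requests waiting in md_make_request()
 *     are allowed through again.
 */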
static void md_submit_barrier(struct work_struct *ws)
{
	mddev_t *mddev = container_of(ws, mddev_t, barrier_work);
	struct bio *bio = mddev->barrier;

	atomic_set(&mddev->flush_pending, 1);

	if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
		bio_endio(bio, -EOPNOTSUPP);
	else if (bio->bi_size == 0)
		/* an empty barrier - all done */
		bio_endio(bio, 0);
	else {
		bio->bi_rw &= ~(1<<BIO_RW_BARRIER);
		if (mddev->pers->make_request(mddev, bio))
			generic_make_request(bio);
		mddev->barrier = POST_REQUEST_BARRIER;
		submit_barriers(mddev);
	}
	if (atomic_dec_and_test(&mddev->flush_pending)) {
		mddev->barrier = NULL;
		wake_up(&mddev->sb_wait);
	}
}

void md_barrier_request(mddev_t *mddev, struct bio *bio)
{
	spin_lock_irq(&mddev->write_lock);
	wait_event_lock_irq(mddev->sb_wait,
			    !mddev->barrier,
			    mddev->write_lock, /*nothing*/);
	mddev->barrier = bio;
	spin_unlock_irq(&mddev->write_lock);

	atomic_set(&mddev->flush_pending, 1);
	INIT_WORK(&mddev->barrier_work, md_submit_barrier);

	submit_barriers(mddev);

	if (atomic_dec_and_test(&mddev->flush_pending))
		schedule_work(&mddev->barrier_work);
}
EXPORT_SYMBOL(md_barrier_request);

static inline mddev_t *mddev_get(mddev_t *mddev)
{
	atomic_inc(&mddev->active);
	return mddev;
}

static void mddev_delayed_delete(struct work_struct *ws);

static void mddev_put(mddev_t *mddev)
{
	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
		return;
	if (!mddev->raid_disks && list_empty(&mddev->disks) &&
	    mddev->ctime == 0 && !mddev->hold_active) {
		/* Array is not configured at all, and not held active,
		 * so destroy it */
		list_del(&mddev->all_mddevs);
		if (mddev->gendisk) {
			/* we did a probe so need to clean up.
			 * Call schedule_work inside the spinlock
			 * so that flush_scheduled_work() after
			 * mddev_find will succeed in waiting for the
			 * work to be done.
410 */ 411 INIT_WORK(&mddev->del_work, mddev_delayed_delete); 412 schedule_work(&mddev->del_work); 413 } else 414 kfree(mddev); 415 } 416 spin_unlock(&all_mddevs_lock); 417 } 418 419 static void mddev_init(mddev_t *mddev) 420 { 421 mutex_init(&mddev->open_mutex); 422 mutex_init(&mddev->reconfig_mutex); 423 mutex_init(&mddev->bitmap_info.mutex); 424 INIT_LIST_HEAD(&mddev->disks); 425 INIT_LIST_HEAD(&mddev->all_mddevs); 426 init_timer(&mddev->safemode_timer); 427 atomic_set(&mddev->active, 1); 428 atomic_set(&mddev->openers, 0); 429 atomic_set(&mddev->active_io, 0); 430 spin_lock_init(&mddev->write_lock); 431 atomic_set(&mddev->flush_pending, 0); 432 init_waitqueue_head(&mddev->sb_wait); 433 init_waitqueue_head(&mddev->recovery_wait); 434 mddev->reshape_position = MaxSector; 435 mddev->resync_min = 0; 436 mddev->resync_max = MaxSector; 437 mddev->level = LEVEL_NONE; 438 } 439 440 static mddev_t * mddev_find(dev_t unit) 441 { 442 mddev_t *mddev, *new = NULL; 443 444 retry: 445 spin_lock(&all_mddevs_lock); 446 447 if (unit) { 448 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 449 if (mddev->unit == unit) { 450 mddev_get(mddev); 451 spin_unlock(&all_mddevs_lock); 452 kfree(new); 453 return mddev; 454 } 455 456 if (new) { 457 list_add(&new->all_mddevs, &all_mddevs); 458 spin_unlock(&all_mddevs_lock); 459 new->hold_active = UNTIL_IOCTL; 460 return new; 461 } 462 } else if (new) { 463 /* find an unused unit number */ 464 static int next_minor = 512; 465 int start = next_minor; 466 int is_free = 0; 467 int dev = 0; 468 while (!is_free) { 469 dev = MKDEV(MD_MAJOR, next_minor); 470 next_minor++; 471 if (next_minor > MINORMASK) 472 next_minor = 0; 473 if (next_minor == start) { 474 /* Oh dear, all in use. */ 475 spin_unlock(&all_mddevs_lock); 476 kfree(new); 477 return NULL; 478 } 479 480 is_free = 1; 481 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 482 if (mddev->unit == dev) { 483 is_free = 0; 484 break; 485 } 486 } 487 new->unit = dev; 488 new->md_minor = MINOR(dev); 489 new->hold_active = UNTIL_STOP; 490 list_add(&new->all_mddevs, &all_mddevs); 491 spin_unlock(&all_mddevs_lock); 492 return new; 493 } 494 spin_unlock(&all_mddevs_lock); 495 496 new = kzalloc(sizeof(*new), GFP_KERNEL); 497 if (!new) 498 return NULL; 499 500 new->unit = unit; 501 if (MAJOR(unit) == MD_MAJOR) 502 new->md_minor = MINOR(unit); 503 else 504 new->md_minor = MINOR(unit) >> MdpMinorShift; 505 506 mddev_init(new); 507 508 goto retry; 509 } 510 511 static inline int mddev_lock(mddev_t * mddev) 512 { 513 return mutex_lock_interruptible(&mddev->reconfig_mutex); 514 } 515 516 static inline int mddev_is_locked(mddev_t *mddev) 517 { 518 return mutex_is_locked(&mddev->reconfig_mutex); 519 } 520 521 static inline int mddev_trylock(mddev_t * mddev) 522 { 523 return mutex_trylock(&mddev->reconfig_mutex); 524 } 525 526 static struct attribute_group md_redundancy_group; 527 528 static void mddev_unlock(mddev_t * mddev) 529 { 530 if (mddev->to_remove) { 531 /* These cannot be removed under reconfig_mutex as 532 * an access to the files will try to take reconfig_mutex 533 * while holding the file unremovable, which leads to 534 * a deadlock. 535 * So hold open_mutex instead - we are allowed to take 536 * it while holding reconfig_mutex, and md_run can 537 * use it to wait for the remove to complete. 
538 */ 539 struct attribute_group *to_remove = mddev->to_remove; 540 mddev->to_remove = NULL; 541 mutex_lock(&mddev->open_mutex); 542 mutex_unlock(&mddev->reconfig_mutex); 543 544 if (to_remove != &md_redundancy_group) 545 sysfs_remove_group(&mddev->kobj, to_remove); 546 if (mddev->pers == NULL || 547 mddev->pers->sync_request == NULL) { 548 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 549 if (mddev->sysfs_action) 550 sysfs_put(mddev->sysfs_action); 551 mddev->sysfs_action = NULL; 552 } 553 mutex_unlock(&mddev->open_mutex); 554 } else 555 mutex_unlock(&mddev->reconfig_mutex); 556 557 md_wakeup_thread(mddev->thread); 558 } 559 560 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) 561 { 562 mdk_rdev_t *rdev; 563 564 list_for_each_entry(rdev, &mddev->disks, same_set) 565 if (rdev->desc_nr == nr) 566 return rdev; 567 568 return NULL; 569 } 570 571 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) 572 { 573 mdk_rdev_t *rdev; 574 575 list_for_each_entry(rdev, &mddev->disks, same_set) 576 if (rdev->bdev->bd_dev == dev) 577 return rdev; 578 579 return NULL; 580 } 581 582 static struct mdk_personality *find_pers(int level, char *clevel) 583 { 584 struct mdk_personality *pers; 585 list_for_each_entry(pers, &pers_list, list) { 586 if (level != LEVEL_NONE && pers->level == level) 587 return pers; 588 if (strcmp(pers->name, clevel)==0) 589 return pers; 590 } 591 return NULL; 592 } 593 594 /* return the offset of the super block in 512byte sectors */ 595 static inline sector_t calc_dev_sboffset(struct block_device *bdev) 596 { 597 sector_t num_sectors = bdev->bd_inode->i_size / 512; 598 return MD_NEW_SIZE_SECTORS(num_sectors); 599 } 600 601 static int alloc_disk_sb(mdk_rdev_t * rdev) 602 { 603 if (rdev->sb_page) 604 MD_BUG(); 605 606 rdev->sb_page = alloc_page(GFP_KERNEL); 607 if (!rdev->sb_page) { 608 printk(KERN_ALERT "md: out of memory.\n"); 609 return -ENOMEM; 610 } 611 612 return 0; 613 } 614 615 static void free_disk_sb(mdk_rdev_t * rdev) 616 { 617 if (rdev->sb_page) { 618 put_page(rdev->sb_page); 619 rdev->sb_loaded = 0; 620 rdev->sb_page = NULL; 621 rdev->sb_start = 0; 622 rdev->sectors = 0; 623 } 624 } 625 626 627 static void super_written(struct bio *bio, int error) 628 { 629 mdk_rdev_t *rdev = bio->bi_private; 630 mddev_t *mddev = rdev->mddev; 631 632 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) { 633 printk("md: super_written gets error=%d, uptodate=%d\n", 634 error, test_bit(BIO_UPTODATE, &bio->bi_flags)); 635 WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags)); 636 md_error(mddev, rdev); 637 } 638 639 if (atomic_dec_and_test(&mddev->pending_writes)) 640 wake_up(&mddev->sb_wait); 641 bio_put(bio); 642 } 643 644 static void super_written_barrier(struct bio *bio, int error) 645 { 646 struct bio *bio2 = bio->bi_private; 647 mdk_rdev_t *rdev = bio2->bi_private; 648 mddev_t *mddev = rdev->mddev; 649 650 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && 651 error == -EOPNOTSUPP) { 652 unsigned long flags; 653 /* barriers don't appear to be supported :-( */ 654 set_bit(BarriersNotsupp, &rdev->flags); 655 mddev->barriers_work = 0; 656 spin_lock_irqsave(&mddev->write_lock, flags); 657 bio2->bi_next = mddev->biolist; 658 mddev->biolist = bio2; 659 spin_unlock_irqrestore(&mddev->write_lock, flags); 660 wake_up(&mddev->sb_wait); 661 bio_put(bio); 662 } else { 663 bio_put(bio2); 664 bio->bi_private = rdev; 665 super_written(bio, error); 666 } 667 } 668 669 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 670 sector_t sector, int size, struct page *page) 671 { 672 /* 
write first size bytes of page to sector of rdev 673 * Increment mddev->pending_writes before returning 674 * and decrement it on completion, waking up sb_wait 675 * if zero is reached. 676 * If an error occurred, call md_error 677 * 678 * As we might need to resubmit the request if BIO_RW_BARRIER 679 * causes ENOTSUPP, we allocate a spare bio... 680 */ 681 struct bio *bio = bio_alloc(GFP_NOIO, 1); 682 int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG); 683 684 bio->bi_bdev = rdev->bdev; 685 bio->bi_sector = sector; 686 bio_add_page(bio, page, size, 0); 687 bio->bi_private = rdev; 688 bio->bi_end_io = super_written; 689 bio->bi_rw = rw; 690 691 atomic_inc(&mddev->pending_writes); 692 if (!test_bit(BarriersNotsupp, &rdev->flags)) { 693 struct bio *rbio; 694 rw |= (1<<BIO_RW_BARRIER); 695 rbio = bio_clone(bio, GFP_NOIO); 696 rbio->bi_private = bio; 697 rbio->bi_end_io = super_written_barrier; 698 submit_bio(rw, rbio); 699 } else 700 submit_bio(rw, bio); 701 } 702 703 void md_super_wait(mddev_t *mddev) 704 { 705 /* wait for all superblock writes that were scheduled to complete. 706 * if any had to be retried (due to BARRIER problems), retry them 707 */ 708 DEFINE_WAIT(wq); 709 for(;;) { 710 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); 711 if (atomic_read(&mddev->pending_writes)==0) 712 break; 713 while (mddev->biolist) { 714 struct bio *bio; 715 spin_lock_irq(&mddev->write_lock); 716 bio = mddev->biolist; 717 mddev->biolist = bio->bi_next ; 718 bio->bi_next = NULL; 719 spin_unlock_irq(&mddev->write_lock); 720 submit_bio(bio->bi_rw, bio); 721 } 722 schedule(); 723 } 724 finish_wait(&mddev->sb_wait, &wq); 725 } 726 727 static void bi_complete(struct bio *bio, int error) 728 { 729 complete((struct completion*)bio->bi_private); 730 } 731 732 int sync_page_io(struct block_device *bdev, sector_t sector, int size, 733 struct page *page, int rw) 734 { 735 struct bio *bio = bio_alloc(GFP_NOIO, 1); 736 struct completion event; 737 int ret; 738 739 rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); 740 741 bio->bi_bdev = bdev; 742 bio->bi_sector = sector; 743 bio_add_page(bio, page, size, 0); 744 init_completion(&event); 745 bio->bi_private = &event; 746 bio->bi_end_io = bi_complete; 747 submit_bio(rw, bio); 748 wait_for_completion(&event); 749 750 ret = test_bit(BIO_UPTODATE, &bio->bi_flags); 751 bio_put(bio); 752 return ret; 753 } 754 EXPORT_SYMBOL_GPL(sync_page_io); 755 756 static int read_disk_sb(mdk_rdev_t * rdev, int size) 757 { 758 char b[BDEVNAME_SIZE]; 759 if (!rdev->sb_page) { 760 MD_BUG(); 761 return -EINVAL; 762 } 763 if (rdev->sb_loaded) 764 return 0; 765 766 767 if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ)) 768 goto fail; 769 rdev->sb_loaded = 1; 770 return 0; 771 772 fail: 773 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", 774 bdevname(rdev->bdev,b)); 775 return -EINVAL; 776 } 777 778 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 779 { 780 return sb1->set_uuid0 == sb2->set_uuid0 && 781 sb1->set_uuid1 == sb2->set_uuid1 && 782 sb1->set_uuid2 == sb2->set_uuid2 && 783 sb1->set_uuid3 == sb2->set_uuid3; 784 } 785 786 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 787 { 788 int ret; 789 mdp_super_t *tmp1, *tmp2; 790 791 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 792 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 793 794 if (!tmp1 || !tmp2) { 795 ret = 0; 796 printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n"); 797 goto abort; 798 } 799 800 *tmp1 = *sb1; 801 *tmp2 = *sb2; 802 803 /* 
804 * nr_disks is not constant 805 */ 806 tmp1->nr_disks = 0; 807 tmp2->nr_disks = 0; 808 809 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); 810 abort: 811 kfree(tmp1); 812 kfree(tmp2); 813 return ret; 814 } 815 816 817 static u32 md_csum_fold(u32 csum) 818 { 819 csum = (csum & 0xffff) + (csum >> 16); 820 return (csum & 0xffff) + (csum >> 16); 821 } 822 823 static unsigned int calc_sb_csum(mdp_super_t * sb) 824 { 825 u64 newcsum = 0; 826 u32 *sb32 = (u32*)sb; 827 int i; 828 unsigned int disk_csum, csum; 829 830 disk_csum = sb->sb_csum; 831 sb->sb_csum = 0; 832 833 for (i = 0; i < MD_SB_BYTES/4 ; i++) 834 newcsum += sb32[i]; 835 csum = (newcsum & 0xffffffff) + (newcsum>>32); 836 837 838 #ifdef CONFIG_ALPHA 839 /* This used to use csum_partial, which was wrong for several 840 * reasons including that different results are returned on 841 * different architectures. It isn't critical that we get exactly 842 * the same return value as before (we always csum_fold before 843 * testing, and that removes any differences). However as we 844 * know that csum_partial always returned a 16bit value on 845 * alphas, do a fold to maximise conformity to previous behaviour. 846 */ 847 sb->sb_csum = md_csum_fold(disk_csum); 848 #else 849 sb->sb_csum = disk_csum; 850 #endif 851 return csum; 852 } 853 854 855 /* 856 * Handle superblock details. 857 * We want to be able to handle multiple superblock formats 858 * so we have a common interface to them all, and an array of 859 * different handlers. 860 * We rely on user-space to write the initial superblock, and support 861 * reading and updating of superblocks. 862 * Interface methods are: 863 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) 864 * loads and validates a superblock on dev. 865 * if refdev != NULL, compare superblocks on both devices 866 * Return: 867 * 0 - dev has a superblock that is compatible with refdev 868 * 1 - dev has a superblock that is compatible and newer than refdev 869 * so dev should be used as the refdev in future 870 * -EINVAL superblock incompatible or invalid 871 * -othererror e.g. -EIO 872 * 873 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) 874 * Verify that dev is acceptable into mddev. 875 * The first time, mddev->raid_disks will be 0, and data from 876 * dev should be merged in. Subsequent calls check that dev 877 * is new enough. Return 0 or -EINVAL 878 * 879 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) 880 * Update the superblock for rdev with data in mddev 881 * This does not write to disc. 882 * 883 */ 884 885 struct super_type { 886 char *name; 887 struct module *owner; 888 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, 889 int minor_version); 890 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); 891 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); 892 unsigned long long (*rdev_size_change)(mdk_rdev_t *rdev, 893 sector_t num_sectors); 894 }; 895 896 /* 897 * Check that the given mddev has no bitmap. 898 * 899 * This function is called from the run method of all personalities that do not 900 * support bitmaps. It prints an error message and returns non-zero if mddev 901 * has a bitmap. Otherwise, it returns 0. 
902 * 903 */ 904 int md_check_no_bitmap(mddev_t *mddev) 905 { 906 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) 907 return 0; 908 printk(KERN_ERR "%s: bitmaps are not supported for %s\n", 909 mdname(mddev), mddev->pers->name); 910 return 1; 911 } 912 EXPORT_SYMBOL(md_check_no_bitmap); 913 914 /* 915 * load_super for 0.90.0 916 */ 917 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 918 { 919 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 920 mdp_super_t *sb; 921 int ret; 922 923 /* 924 * Calculate the position of the superblock (512byte sectors), 925 * it's at the end of the disk. 926 * 927 * It also happens to be a multiple of 4Kb. 928 */ 929 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 930 931 ret = read_disk_sb(rdev, MD_SB_BYTES); 932 if (ret) return ret; 933 934 ret = -EINVAL; 935 936 bdevname(rdev->bdev, b); 937 sb = (mdp_super_t*)page_address(rdev->sb_page); 938 939 if (sb->md_magic != MD_SB_MAGIC) { 940 printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 941 b); 942 goto abort; 943 } 944 945 if (sb->major_version != 0 || 946 sb->minor_version < 90 || 947 sb->minor_version > 91) { 948 printk(KERN_WARNING "Bad version number %d.%d on %s\n", 949 sb->major_version, sb->minor_version, 950 b); 951 goto abort; 952 } 953 954 if (sb->raid_disks <= 0) 955 goto abort; 956 957 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { 958 printk(KERN_WARNING "md: invalid superblock checksum on %s\n", 959 b); 960 goto abort; 961 } 962 963 rdev->preferred_minor = sb->md_minor; 964 rdev->data_offset = 0; 965 rdev->sb_size = MD_SB_BYTES; 966 967 if (sb->level == LEVEL_MULTIPATH) 968 rdev->desc_nr = -1; 969 else 970 rdev->desc_nr = sb->this_disk.number; 971 972 if (!refdev) { 973 ret = 1; 974 } else { 975 __u64 ev1, ev2; 976 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); 977 if (!uuid_equal(refsb, sb)) { 978 printk(KERN_WARNING "md: %s has different UUID to %s\n", 979 b, bdevname(refdev->bdev,b2)); 980 goto abort; 981 } 982 if (!sb_equal(refsb, sb)) { 983 printk(KERN_WARNING "md: %s has same UUID" 984 " but different superblock to %s\n", 985 b, bdevname(refdev->bdev, b2)); 986 goto abort; 987 } 988 ev1 = md_event(sb); 989 ev2 = md_event(refsb); 990 if (ev1 > ev2) 991 ret = 1; 992 else 993 ret = 0; 994 } 995 rdev->sectors = rdev->sb_start; 996 997 if (rdev->sectors < sb->size * 2 && sb->level > 1) 998 /* "this cannot possibly happen" ... 
*/ 999 ret = -EINVAL; 1000 1001 abort: 1002 return ret; 1003 } 1004 1005 /* 1006 * validate_super for 0.90.0 1007 */ 1008 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) 1009 { 1010 mdp_disk_t *desc; 1011 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 1012 __u64 ev1 = md_event(sb); 1013 1014 rdev->raid_disk = -1; 1015 clear_bit(Faulty, &rdev->flags); 1016 clear_bit(In_sync, &rdev->flags); 1017 clear_bit(WriteMostly, &rdev->flags); 1018 clear_bit(BarriersNotsupp, &rdev->flags); 1019 1020 if (mddev->raid_disks == 0) { 1021 mddev->major_version = 0; 1022 mddev->minor_version = sb->minor_version; 1023 mddev->patch_version = sb->patch_version; 1024 mddev->external = 0; 1025 mddev->chunk_sectors = sb->chunk_size >> 9; 1026 mddev->ctime = sb->ctime; 1027 mddev->utime = sb->utime; 1028 mddev->level = sb->level; 1029 mddev->clevel[0] = 0; 1030 mddev->layout = sb->layout; 1031 mddev->raid_disks = sb->raid_disks; 1032 mddev->dev_sectors = sb->size * 2; 1033 mddev->events = ev1; 1034 mddev->bitmap_info.offset = 0; 1035 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 1036 1037 if (mddev->minor_version >= 91) { 1038 mddev->reshape_position = sb->reshape_position; 1039 mddev->delta_disks = sb->delta_disks; 1040 mddev->new_level = sb->new_level; 1041 mddev->new_layout = sb->new_layout; 1042 mddev->new_chunk_sectors = sb->new_chunk >> 9; 1043 } else { 1044 mddev->reshape_position = MaxSector; 1045 mddev->delta_disks = 0; 1046 mddev->new_level = mddev->level; 1047 mddev->new_layout = mddev->layout; 1048 mddev->new_chunk_sectors = mddev->chunk_sectors; 1049 } 1050 1051 if (sb->state & (1<<MD_SB_CLEAN)) 1052 mddev->recovery_cp = MaxSector; 1053 else { 1054 if (sb->events_hi == sb->cp_events_hi && 1055 sb->events_lo == sb->cp_events_lo) { 1056 mddev->recovery_cp = sb->recovery_cp; 1057 } else 1058 mddev->recovery_cp = 0; 1059 } 1060 1061 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 1062 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 1063 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 1064 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 1065 1066 mddev->max_disks = MD_SB_DISKS; 1067 1068 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 1069 mddev->bitmap_info.file == NULL) 1070 mddev->bitmap_info.offset = 1071 mddev->bitmap_info.default_offset; 1072 1073 } else if (mddev->pers == NULL) { 1074 /* Insist on good event counter while assembling, except 1075 * for spares (which don't need an event count) */ 1076 ++ev1; 1077 if (sb->disks[rdev->desc_nr].state & ( 1078 (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) 1079 if (ev1 < mddev->events) 1080 return -EINVAL; 1081 } else if (mddev->bitmap) { 1082 /* if adding to array with a bitmap, then we can accept an 1083 * older device ... but not too old. 1084 */ 1085 if (ev1 < mddev->bitmap->events_cleared) 1086 return 0; 1087 } else { 1088 if (ev1 < mddev->events) 1089 /* just a hot-add of a new device, leave raid_disk at -1 */ 1090 return 0; 1091 } 1092 1093 if (mddev->level != LEVEL_MULTIPATH) { 1094 desc = sb->disks + rdev->desc_nr; 1095 1096 if (desc->state & (1<<MD_DISK_FAULTY)) 1097 set_bit(Faulty, &rdev->flags); 1098 else if (desc->state & (1<<MD_DISK_SYNC) /* && 1099 desc->raid_disk < mddev->raid_disks */) { 1100 set_bit(In_sync, &rdev->flags); 1101 rdev->raid_disk = desc->raid_disk; 1102 } else if (desc->state & (1<<MD_DISK_ACTIVE)) { 1103 /* active but not in sync implies recovery up to 1104 * reshape position. 
We don't know exactly where 1105 * that is, so set to zero for now */ 1106 if (mddev->minor_version >= 91) { 1107 rdev->recovery_offset = 0; 1108 rdev->raid_disk = desc->raid_disk; 1109 } 1110 } 1111 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 1112 set_bit(WriteMostly, &rdev->flags); 1113 } else /* MULTIPATH are always insync */ 1114 set_bit(In_sync, &rdev->flags); 1115 return 0; 1116 } 1117 1118 /* 1119 * sync_super for 0.90.0 1120 */ 1121 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) 1122 { 1123 mdp_super_t *sb; 1124 mdk_rdev_t *rdev2; 1125 int next_spare = mddev->raid_disks; 1126 1127 1128 /* make rdev->sb match mddev data.. 1129 * 1130 * 1/ zero out disks 1131 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 1132 * 3/ any empty disks < next_spare become removed 1133 * 1134 * disks[0] gets initialised to REMOVED because 1135 * we cannot be sure from other fields if it has 1136 * been initialised or not. 1137 */ 1138 int i; 1139 int active=0, working=0,failed=0,spare=0,nr_disks=0; 1140 1141 rdev->sb_size = MD_SB_BYTES; 1142 1143 sb = (mdp_super_t*)page_address(rdev->sb_page); 1144 1145 memset(sb, 0, sizeof(*sb)); 1146 1147 sb->md_magic = MD_SB_MAGIC; 1148 sb->major_version = mddev->major_version; 1149 sb->patch_version = mddev->patch_version; 1150 sb->gvalid_words = 0; /* ignored */ 1151 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 1152 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 1153 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 1154 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 1155 1156 sb->ctime = mddev->ctime; 1157 sb->level = mddev->level; 1158 sb->size = mddev->dev_sectors / 2; 1159 sb->raid_disks = mddev->raid_disks; 1160 sb->md_minor = mddev->md_minor; 1161 sb->not_persistent = 0; 1162 sb->utime = mddev->utime; 1163 sb->state = 0; 1164 sb->events_hi = (mddev->events>>32); 1165 sb->events_lo = (u32)mddev->events; 1166 1167 if (mddev->reshape_position == MaxSector) 1168 sb->minor_version = 90; 1169 else { 1170 sb->minor_version = 91; 1171 sb->reshape_position = mddev->reshape_position; 1172 sb->new_level = mddev->new_level; 1173 sb->delta_disks = mddev->delta_disks; 1174 sb->new_layout = mddev->new_layout; 1175 sb->new_chunk = mddev->new_chunk_sectors << 9; 1176 } 1177 mddev->minor_version = sb->minor_version; 1178 if (mddev->in_sync) 1179 { 1180 sb->recovery_cp = mddev->recovery_cp; 1181 sb->cp_events_hi = (mddev->events>>32); 1182 sb->cp_events_lo = (u32)mddev->events; 1183 if (mddev->recovery_cp == MaxSector) 1184 sb->state = (1<< MD_SB_CLEAN); 1185 } else 1186 sb->recovery_cp = 0; 1187 1188 sb->layout = mddev->layout; 1189 sb->chunk_size = mddev->chunk_sectors << 9; 1190 1191 if (mddev->bitmap && mddev->bitmap_info.file == NULL) 1192 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 1193 1194 sb->disks[0].state = (1<<MD_DISK_REMOVED); 1195 list_for_each_entry(rdev2, &mddev->disks, same_set) { 1196 mdp_disk_t *d; 1197 int desc_nr; 1198 int is_active = test_bit(In_sync, &rdev2->flags); 1199 1200 if (rdev2->raid_disk >= 0 && 1201 sb->minor_version >= 91) 1202 /* we have nowhere to store the recovery_offset, 1203 * but if it is not below the reshape_position, 1204 * we can piggy-back on that. 
1205 */ 1206 is_active = 1; 1207 if (rdev2->raid_disk < 0 || 1208 test_bit(Faulty, &rdev2->flags)) 1209 is_active = 0; 1210 if (is_active) 1211 desc_nr = rdev2->raid_disk; 1212 else 1213 desc_nr = next_spare++; 1214 rdev2->desc_nr = desc_nr; 1215 d = &sb->disks[rdev2->desc_nr]; 1216 nr_disks++; 1217 d->number = rdev2->desc_nr; 1218 d->major = MAJOR(rdev2->bdev->bd_dev); 1219 d->minor = MINOR(rdev2->bdev->bd_dev); 1220 if (is_active) 1221 d->raid_disk = rdev2->raid_disk; 1222 else 1223 d->raid_disk = rdev2->desc_nr; /* compatibility */ 1224 if (test_bit(Faulty, &rdev2->flags)) 1225 d->state = (1<<MD_DISK_FAULTY); 1226 else if (is_active) { 1227 d->state = (1<<MD_DISK_ACTIVE); 1228 if (test_bit(In_sync, &rdev2->flags)) 1229 d->state |= (1<<MD_DISK_SYNC); 1230 active++; 1231 working++; 1232 } else { 1233 d->state = 0; 1234 spare++; 1235 working++; 1236 } 1237 if (test_bit(WriteMostly, &rdev2->flags)) 1238 d->state |= (1<<MD_DISK_WRITEMOSTLY); 1239 } 1240 /* now set the "removed" and "faulty" bits on any missing devices */ 1241 for (i=0 ; i < mddev->raid_disks ; i++) { 1242 mdp_disk_t *d = &sb->disks[i]; 1243 if (d->state == 0 && d->number == 0) { 1244 d->number = i; 1245 d->raid_disk = i; 1246 d->state = (1<<MD_DISK_REMOVED); 1247 d->state |= (1<<MD_DISK_FAULTY); 1248 failed++; 1249 } 1250 } 1251 sb->nr_disks = nr_disks; 1252 sb->active_disks = active; 1253 sb->working_disks = working; 1254 sb->failed_disks = failed; 1255 sb->spare_disks = spare; 1256 1257 sb->this_disk = sb->disks[rdev->desc_nr]; 1258 sb->sb_csum = calc_sb_csum(sb); 1259 } 1260 1261 /* 1262 * rdev_size_change for 0.90.0 1263 */ 1264 static unsigned long long 1265 super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) 1266 { 1267 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1268 return 0; /* component must fit device */ 1269 if (rdev->mddev->bitmap_info.offset) 1270 return 0; /* can't move bitmap */ 1271 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 1272 if (!num_sectors || num_sectors > rdev->sb_start) 1273 num_sectors = rdev->sb_start; 1274 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1275 rdev->sb_page); 1276 md_super_wait(rdev->mddev); 1277 return num_sectors / 2; /* kB for sysfs */ 1278 } 1279 1280 1281 /* 1282 * version 1 superblock 1283 */ 1284 1285 static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) 1286 { 1287 __le32 disk_csum; 1288 u32 csum; 1289 unsigned long long newcsum; 1290 int size = 256 + le32_to_cpu(sb->max_dev)*2; 1291 __le32 *isuper = (__le32*)sb; 1292 int i; 1293 1294 disk_csum = sb->sb_csum; 1295 sb->sb_csum = 0; 1296 newcsum = 0; 1297 for (i=0; size>=4; size -= 4 ) 1298 newcsum += le32_to_cpu(*isuper++); 1299 1300 if (size == 2) 1301 newcsum += le16_to_cpu(*(__le16*) isuper); 1302 1303 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 1304 sb->sb_csum = disk_csum; 1305 return cpu_to_le32(csum); 1306 } 1307 1308 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 1309 { 1310 struct mdp_superblock_1 *sb; 1311 int ret; 1312 sector_t sb_start; 1313 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1314 int bmask; 1315 1316 /* 1317 * Calculate the position of the superblock in 512byte sectors. 1318 * It is always aligned to a 4K boundary and 1319 * depeding on minor_version, it can be: 1320 * 0: At least 8K, but less than 12K, from end of device 1321 * 1: At start of device 1322 * 2: 4K from start of device. 
1323 */ 1324 switch(minor_version) { 1325 case 0: 1326 sb_start = rdev->bdev->bd_inode->i_size >> 9; 1327 sb_start -= 8*2; 1328 sb_start &= ~(sector_t)(4*2-1); 1329 break; 1330 case 1: 1331 sb_start = 0; 1332 break; 1333 case 2: 1334 sb_start = 8; 1335 break; 1336 default: 1337 return -EINVAL; 1338 } 1339 rdev->sb_start = sb_start; 1340 1341 /* superblock is rarely larger than 1K, but it can be larger, 1342 * and it is safe to read 4k, so we do that 1343 */ 1344 ret = read_disk_sb(rdev, 4096); 1345 if (ret) return ret; 1346 1347 1348 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1349 1350 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1351 sb->major_version != cpu_to_le32(1) || 1352 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1353 le64_to_cpu(sb->super_offset) != rdev->sb_start || 1354 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1355 return -EINVAL; 1356 1357 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1358 printk("md: invalid superblock checksum on %s\n", 1359 bdevname(rdev->bdev,b)); 1360 return -EINVAL; 1361 } 1362 if (le64_to_cpu(sb->data_size) < 10) { 1363 printk("md: data_size too small on %s\n", 1364 bdevname(rdev->bdev,b)); 1365 return -EINVAL; 1366 } 1367 1368 rdev->preferred_minor = 0xffff; 1369 rdev->data_offset = le64_to_cpu(sb->data_offset); 1370 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1371 1372 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1373 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 1374 if (rdev->sb_size & bmask) 1375 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1376 1377 if (minor_version 1378 && rdev->data_offset < sb_start + (rdev->sb_size/512)) 1379 return -EINVAL; 1380 1381 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) 1382 rdev->desc_nr = -1; 1383 else 1384 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1385 1386 if (!refdev) { 1387 ret = 1; 1388 } else { 1389 __u64 ev1, ev2; 1390 struct mdp_superblock_1 *refsb = 1391 (struct mdp_superblock_1*)page_address(refdev->sb_page); 1392 1393 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1394 sb->level != refsb->level || 1395 sb->layout != refsb->layout || 1396 sb->chunksize != refsb->chunksize) { 1397 printk(KERN_WARNING "md: %s has strangely different" 1398 " superblock to %s\n", 1399 bdevname(rdev->bdev,b), 1400 bdevname(refdev->bdev,b2)); 1401 return -EINVAL; 1402 } 1403 ev1 = le64_to_cpu(sb->events); 1404 ev2 = le64_to_cpu(refsb->events); 1405 1406 if (ev1 > ev2) 1407 ret = 1; 1408 else 1409 ret = 0; 1410 } 1411 if (minor_version) 1412 rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) - 1413 le64_to_cpu(sb->data_offset); 1414 else 1415 rdev->sectors = rdev->sb_start; 1416 if (rdev->sectors < le64_to_cpu(sb->data_size)) 1417 return -EINVAL; 1418 rdev->sectors = le64_to_cpu(sb->data_size); 1419 if (le64_to_cpu(sb->size) > rdev->sectors) 1420 return -EINVAL; 1421 return ret; 1422 } 1423 1424 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) 1425 { 1426 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1427 __u64 ev1 = le64_to_cpu(sb->events); 1428 1429 rdev->raid_disk = -1; 1430 clear_bit(Faulty, &rdev->flags); 1431 clear_bit(In_sync, &rdev->flags); 1432 clear_bit(WriteMostly, &rdev->flags); 1433 clear_bit(BarriersNotsupp, &rdev->flags); 1434 1435 if (mddev->raid_disks == 0) { 1436 mddev->major_version = 1; 1437 mddev->patch_version = 0; 1438 mddev->external = 0; 1439 mddev->chunk_sectors = le32_to_cpu(sb->chunksize); 1440 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); 1441 
mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); 1442 mddev->level = le32_to_cpu(sb->level); 1443 mddev->clevel[0] = 0; 1444 mddev->layout = le32_to_cpu(sb->layout); 1445 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1446 mddev->dev_sectors = le64_to_cpu(sb->size); 1447 mddev->events = ev1; 1448 mddev->bitmap_info.offset = 0; 1449 mddev->bitmap_info.default_offset = 1024 >> 9; 1450 1451 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 1452 memcpy(mddev->uuid, sb->set_uuid, 16); 1453 1454 mddev->max_disks = (4096-256)/2; 1455 1456 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1457 mddev->bitmap_info.file == NULL ) 1458 mddev->bitmap_info.offset = 1459 (__s32)le32_to_cpu(sb->bitmap_offset); 1460 1461 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 1462 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 1463 mddev->delta_disks = le32_to_cpu(sb->delta_disks); 1464 mddev->new_level = le32_to_cpu(sb->new_level); 1465 mddev->new_layout = le32_to_cpu(sb->new_layout); 1466 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); 1467 } else { 1468 mddev->reshape_position = MaxSector; 1469 mddev->delta_disks = 0; 1470 mddev->new_level = mddev->level; 1471 mddev->new_layout = mddev->layout; 1472 mddev->new_chunk_sectors = mddev->chunk_sectors; 1473 } 1474 1475 } else if (mddev->pers == NULL) { 1476 /* Insist of good event counter while assembling, except for 1477 * spares (which don't need an event count) */ 1478 ++ev1; 1479 if (rdev->desc_nr >= 0 && 1480 rdev->desc_nr < le32_to_cpu(sb->max_dev) && 1481 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe) 1482 if (ev1 < mddev->events) 1483 return -EINVAL; 1484 } else if (mddev->bitmap) { 1485 /* If adding to array with a bitmap, then we can accept an 1486 * older device, but not too old. 1487 */ 1488 if (ev1 < mddev->bitmap->events_cleared) 1489 return 0; 1490 } else { 1491 if (ev1 < mddev->events) 1492 /* just a hot-add of a new device, leave raid_disk at -1 */ 1493 return 0; 1494 } 1495 if (mddev->level != LEVEL_MULTIPATH) { 1496 int role; 1497 if (rdev->desc_nr < 0 || 1498 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { 1499 role = 0xffff; 1500 rdev->desc_nr = -1; 1501 } else 1502 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1503 switch(role) { 1504 case 0xffff: /* spare */ 1505 break; 1506 case 0xfffe: /* faulty */ 1507 set_bit(Faulty, &rdev->flags); 1508 break; 1509 default: 1510 if ((le32_to_cpu(sb->feature_map) & 1511 MD_FEATURE_RECOVERY_OFFSET)) 1512 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 1513 else 1514 set_bit(In_sync, &rdev->flags); 1515 rdev->raid_disk = role; 1516 break; 1517 } 1518 if (sb->devflags & WriteMostly1) 1519 set_bit(WriteMostly, &rdev->flags); 1520 } else /* MULTIPATH are always insync */ 1521 set_bit(In_sync, &rdev->flags); 1522 1523 return 0; 1524 } 1525 1526 static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) 1527 { 1528 struct mdp_superblock_1 *sb; 1529 mdk_rdev_t *rdev2; 1530 int max_dev, i; 1531 /* make rdev->sb match mddev and rdev data. 
*/ 1532 1533 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1534 1535 sb->feature_map = 0; 1536 sb->pad0 = 0; 1537 sb->recovery_offset = cpu_to_le64(0); 1538 memset(sb->pad1, 0, sizeof(sb->pad1)); 1539 memset(sb->pad2, 0, sizeof(sb->pad2)); 1540 memset(sb->pad3, 0, sizeof(sb->pad3)); 1541 1542 sb->utime = cpu_to_le64((__u64)mddev->utime); 1543 sb->events = cpu_to_le64(mddev->events); 1544 if (mddev->in_sync) 1545 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 1546 else 1547 sb->resync_offset = cpu_to_le64(0); 1548 1549 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); 1550 1551 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 1552 sb->size = cpu_to_le64(mddev->dev_sectors); 1553 sb->chunksize = cpu_to_le32(mddev->chunk_sectors); 1554 sb->level = cpu_to_le32(mddev->level); 1555 sb->layout = cpu_to_le32(mddev->layout); 1556 1557 if (mddev->bitmap && mddev->bitmap_info.file == NULL) { 1558 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); 1559 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1560 } 1561 1562 if (rdev->raid_disk >= 0 && 1563 !test_bit(In_sync, &rdev->flags)) { 1564 sb->feature_map |= 1565 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 1566 sb->recovery_offset = 1567 cpu_to_le64(rdev->recovery_offset); 1568 } 1569 1570 if (mddev->reshape_position != MaxSector) { 1571 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 1572 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 1573 sb->new_layout = cpu_to_le32(mddev->new_layout); 1574 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 1575 sb->new_level = cpu_to_le32(mddev->new_level); 1576 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 1577 } 1578 1579 max_dev = 0; 1580 list_for_each_entry(rdev2, &mddev->disks, same_set) 1581 if (rdev2->desc_nr+1 > max_dev) 1582 max_dev = rdev2->desc_nr+1; 1583 1584 if (max_dev > le32_to_cpu(sb->max_dev)) { 1585 int bmask; 1586 sb->max_dev = cpu_to_le32(max_dev); 1587 rdev->sb_size = max_dev * 2 + 256; 1588 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 1589 if (rdev->sb_size & bmask) 1590 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1591 } 1592 for (i=0; i<max_dev;i++) 1593 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1594 1595 list_for_each_entry(rdev2, &mddev->disks, same_set) { 1596 i = rdev2->desc_nr; 1597 if (test_bit(Faulty, &rdev2->flags)) 1598 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1599 else if (test_bit(In_sync, &rdev2->flags)) 1600 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1601 else if (rdev2->raid_disk >= 0) 1602 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1603 else 1604 sb->dev_roles[i] = cpu_to_le16(0xffff); 1605 } 1606 1607 sb->sb_csum = calc_sb_1_csum(sb); 1608 } 1609 1610 static unsigned long long 1611 super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) 1612 { 1613 struct mdp_superblock_1 *sb; 1614 sector_t max_sectors; 1615 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1616 return 0; /* component must fit device */ 1617 if (rdev->sb_start < rdev->data_offset) { 1618 /* minor versions 1 and 2; superblock before data */ 1619 max_sectors = rdev->bdev->bd_inode->i_size >> 9; 1620 max_sectors -= rdev->data_offset; 1621 if (!num_sectors || num_sectors > max_sectors) 1622 num_sectors = max_sectors; 1623 } else if (rdev->mddev->bitmap_info.offset) { 1624 /* minor version 0 with bitmap we can't move */ 1625 return 0; 1626 } else { 1627 /* minor version 0; superblock after data */ 1628 sector_t sb_start; 1629 sb_start = (rdev->bdev->bd_inode->i_size 
>> 9) - 8*2; 1630 sb_start &= ~(sector_t)(4*2 - 1); 1631 max_sectors = rdev->sectors + sb_start - rdev->sb_start; 1632 if (!num_sectors || num_sectors > max_sectors) 1633 num_sectors = max_sectors; 1634 rdev->sb_start = sb_start; 1635 } 1636 sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page); 1637 sb->data_size = cpu_to_le64(num_sectors); 1638 sb->super_offset = rdev->sb_start; 1639 sb->sb_csum = calc_sb_1_csum(sb); 1640 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1641 rdev->sb_page); 1642 md_super_wait(rdev->mddev); 1643 return num_sectors / 2; /* kB for sysfs */ 1644 } 1645 1646 static struct super_type super_types[] = { 1647 [0] = { 1648 .name = "0.90.0", 1649 .owner = THIS_MODULE, 1650 .load_super = super_90_load, 1651 .validate_super = super_90_validate, 1652 .sync_super = super_90_sync, 1653 .rdev_size_change = super_90_rdev_size_change, 1654 }, 1655 [1] = { 1656 .name = "md-1", 1657 .owner = THIS_MODULE, 1658 .load_super = super_1_load, 1659 .validate_super = super_1_validate, 1660 .sync_super = super_1_sync, 1661 .rdev_size_change = super_1_rdev_size_change, 1662 }, 1663 }; 1664 1665 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) 1666 { 1667 mdk_rdev_t *rdev, *rdev2; 1668 1669 rcu_read_lock(); 1670 rdev_for_each_rcu(rdev, mddev1) 1671 rdev_for_each_rcu(rdev2, mddev2) 1672 if (rdev->bdev->bd_contains == 1673 rdev2->bdev->bd_contains) { 1674 rcu_read_unlock(); 1675 return 1; 1676 } 1677 rcu_read_unlock(); 1678 return 0; 1679 } 1680 1681 static LIST_HEAD(pending_raid_disks); 1682 1683 /* 1684 * Try to register data integrity profile for an mddev 1685 * 1686 * This is called when an array is started and after a disk has been kicked 1687 * from the array. It only succeeds if all working and active component devices 1688 * are integrity capable with matching profiles. 1689 */ 1690 int md_integrity_register(mddev_t *mddev) 1691 { 1692 mdk_rdev_t *rdev, *reference = NULL; 1693 1694 if (list_empty(&mddev->disks)) 1695 return 0; /* nothing to do */ 1696 if (blk_get_integrity(mddev->gendisk)) 1697 return 0; /* already registered */ 1698 list_for_each_entry(rdev, &mddev->disks, same_set) { 1699 /* skip spares and non-functional disks */ 1700 if (test_bit(Faulty, &rdev->flags)) 1701 continue; 1702 if (rdev->raid_disk < 0) 1703 continue; 1704 /* 1705 * If at least one rdev is not integrity capable, we can not 1706 * enable data integrity for the md device. 1707 */ 1708 if (!bdev_get_integrity(rdev->bdev)) 1709 return -EINVAL; 1710 if (!reference) { 1711 /* Use the first rdev as the reference */ 1712 reference = rdev; 1713 continue; 1714 } 1715 /* does this rdev's profile match the reference profile? */ 1716 if (blk_integrity_compare(reference->bdev->bd_disk, 1717 rdev->bdev->bd_disk) < 0) 1718 return -EINVAL; 1719 } 1720 /* 1721 * All component devices are integrity capable and have matching 1722 * profiles, register the common profile for the md device. 
1723 */ 1724 if (blk_integrity_register(mddev->gendisk, 1725 bdev_get_integrity(reference->bdev)) != 0) { 1726 printk(KERN_ERR "md: failed to register integrity for %s\n", 1727 mdname(mddev)); 1728 return -EINVAL; 1729 } 1730 printk(KERN_NOTICE "md: data integrity on %s enabled\n", 1731 mdname(mddev)); 1732 return 0; 1733 } 1734 EXPORT_SYMBOL(md_integrity_register); 1735 1736 /* Disable data integrity if non-capable/non-matching disk is being added */ 1737 void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev) 1738 { 1739 struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev); 1740 struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk); 1741 1742 if (!bi_mddev) /* nothing to do */ 1743 return; 1744 if (rdev->raid_disk < 0) /* skip spares */ 1745 return; 1746 if (bi_rdev && blk_integrity_compare(mddev->gendisk, 1747 rdev->bdev->bd_disk) >= 0) 1748 return; 1749 printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev)); 1750 blk_integrity_unregister(mddev->gendisk); 1751 } 1752 EXPORT_SYMBOL(md_integrity_add_rdev); 1753 1754 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) 1755 { 1756 char b[BDEVNAME_SIZE]; 1757 struct kobject *ko; 1758 char *s; 1759 int err; 1760 1761 if (rdev->mddev) { 1762 MD_BUG(); 1763 return -EINVAL; 1764 } 1765 1766 /* prevent duplicates */ 1767 if (find_rdev(mddev, rdev->bdev->bd_dev)) 1768 return -EEXIST; 1769 1770 /* make sure rdev->sectors exceeds mddev->dev_sectors */ 1771 if (rdev->sectors && (mddev->dev_sectors == 0 || 1772 rdev->sectors < mddev->dev_sectors)) { 1773 if (mddev->pers) { 1774 /* Cannot change size, so fail 1775 * If mddev->level <= 0, then we don't care 1776 * about aligning sizes (e.g. linear) 1777 */ 1778 if (mddev->level > 0) 1779 return -ENOSPC; 1780 } else 1781 mddev->dev_sectors = rdev->sectors; 1782 } 1783 1784 /* Verify rdev->desc_nr is unique. 
1785 * If it is -1, assign a free number, else 1786 * check number is not in use 1787 */ 1788 if (rdev->desc_nr < 0) { 1789 int choice = 0; 1790 if (mddev->pers) choice = mddev->raid_disks; 1791 while (find_rdev_nr(mddev, choice)) 1792 choice++; 1793 rdev->desc_nr = choice; 1794 } else { 1795 if (find_rdev_nr(mddev, rdev->desc_nr)) 1796 return -EBUSY; 1797 } 1798 if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { 1799 printk(KERN_WARNING "md: %s: array is limited to %d devices\n", 1800 mdname(mddev), mddev->max_disks); 1801 return -EBUSY; 1802 } 1803 bdevname(rdev->bdev,b); 1804 while ( (s=strchr(b, '/')) != NULL) 1805 *s = '!'; 1806 1807 rdev->mddev = mddev; 1808 printk(KERN_INFO "md: bind<%s>\n", b); 1809 1810 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 1811 goto fail; 1812 1813 ko = &part_to_dev(rdev->bdev->bd_part)->kobj; 1814 if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) { 1815 kobject_del(&rdev->kobj); 1816 goto fail; 1817 } 1818 rdev->sysfs_state = sysfs_get_dirent(rdev->kobj.sd, NULL, "state"); 1819 1820 list_add_rcu(&rdev->same_set, &mddev->disks); 1821 bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); 1822 1823 /* May as well allow recovery to be retried once */ 1824 mddev->recovery_disabled = 0; 1825 1826 return 0; 1827 1828 fail: 1829 printk(KERN_WARNING "md: failed to register dev-%s for %s\n", 1830 b, mdname(mddev)); 1831 return err; 1832 } 1833 1834 static void md_delayed_delete(struct work_struct *ws) 1835 { 1836 mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work); 1837 kobject_del(&rdev->kobj); 1838 kobject_put(&rdev->kobj); 1839 } 1840 1841 static void unbind_rdev_from_array(mdk_rdev_t * rdev) 1842 { 1843 char b[BDEVNAME_SIZE]; 1844 if (!rdev->mddev) { 1845 MD_BUG(); 1846 return; 1847 } 1848 bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); 1849 list_del_rcu(&rdev->same_set); 1850 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 1851 rdev->mddev = NULL; 1852 sysfs_remove_link(&rdev->kobj, "block"); 1853 sysfs_put(rdev->sysfs_state); 1854 rdev->sysfs_state = NULL; 1855 /* We need to delay this, otherwise we can deadlock when 1856 * writing to 'remove' to "dev/state". We also need 1857 * to delay it due to rcu usage. 1858 */ 1859 synchronize_rcu(); 1860 INIT_WORK(&rdev->del_work, md_delayed_delete); 1861 kobject_get(&rdev->kobj); 1862 schedule_work(&rdev->del_work); 1863 } 1864 1865 /* 1866 * prevent the device from being mounted, repartitioned or 1867 * otherwise reused by a RAID array (or any other kernel 1868 * subsystem), by bd_claiming the device. 1869 */ 1870 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared) 1871 { 1872 int err = 0; 1873 struct block_device *bdev; 1874 char b[BDEVNAME_SIZE]; 1875 1876 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 1877 if (IS_ERR(bdev)) { 1878 printk(KERN_ERR "md: could not open %s.\n", 1879 __bdevname(dev, b)); 1880 return PTR_ERR(bdev); 1881 } 1882 err = bd_claim(bdev, shared ? 
(mdk_rdev_t *)lock_rdev : rdev); 1883 if (err) { 1884 printk(KERN_ERR "md: could not bd_claim %s.\n", 1885 bdevname(bdev, b)); 1886 blkdev_put(bdev, FMODE_READ|FMODE_WRITE); 1887 return err; 1888 } 1889 if (!shared) 1890 set_bit(AllReserved, &rdev->flags); 1891 rdev->bdev = bdev; 1892 return err; 1893 } 1894 1895 static void unlock_rdev(mdk_rdev_t *rdev) 1896 { 1897 struct block_device *bdev = rdev->bdev; 1898 rdev->bdev = NULL; 1899 if (!bdev) 1900 MD_BUG(); 1901 bd_release(bdev); 1902 blkdev_put(bdev, FMODE_READ|FMODE_WRITE); 1903 } 1904 1905 void md_autodetect_dev(dev_t dev); 1906 1907 static void export_rdev(mdk_rdev_t * rdev) 1908 { 1909 char b[BDEVNAME_SIZE]; 1910 printk(KERN_INFO "md: export_rdev(%s)\n", 1911 bdevname(rdev->bdev,b)); 1912 if (rdev->mddev) 1913 MD_BUG(); 1914 free_disk_sb(rdev); 1915 #ifndef MODULE 1916 if (test_bit(AutoDetected, &rdev->flags)) 1917 md_autodetect_dev(rdev->bdev->bd_dev); 1918 #endif 1919 unlock_rdev(rdev); 1920 kobject_put(&rdev->kobj); 1921 } 1922 1923 static void kick_rdev_from_array(mdk_rdev_t * rdev) 1924 { 1925 unbind_rdev_from_array(rdev); 1926 export_rdev(rdev); 1927 } 1928 1929 static void export_array(mddev_t *mddev) 1930 { 1931 mdk_rdev_t *rdev, *tmp; 1932 1933 rdev_for_each(rdev, tmp, mddev) { 1934 if (!rdev->mddev) { 1935 MD_BUG(); 1936 continue; 1937 } 1938 kick_rdev_from_array(rdev); 1939 } 1940 if (!list_empty(&mddev->disks)) 1941 MD_BUG(); 1942 mddev->raid_disks = 0; 1943 mddev->major_version = 0; 1944 } 1945 1946 static void print_desc(mdp_disk_t *desc) 1947 { 1948 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, 1949 desc->major,desc->minor,desc->raid_disk,desc->state); 1950 } 1951 1952 static void print_sb_90(mdp_super_t *sb) 1953 { 1954 int i; 1955 1956 printk(KERN_INFO 1957 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", 1958 sb->major_version, sb->minor_version, sb->patch_version, 1959 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, 1960 sb->ctime); 1961 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", 1962 sb->level, sb->size, sb->nr_disks, sb->raid_disks, 1963 sb->md_minor, sb->layout, sb->chunk_size); 1964 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" 1965 " FD:%d SD:%d CSUM:%08x E:%08lx\n", 1966 sb->utime, sb->state, sb->active_disks, sb->working_disks, 1967 sb->failed_disks, sb->spare_disks, 1968 sb->sb_csum, (unsigned long)sb->events_lo); 1969 1970 printk(KERN_INFO); 1971 for (i = 0; i < MD_SB_DISKS; i++) { 1972 mdp_disk_t *desc; 1973 1974 desc = sb->disks + i; 1975 if (desc->number || desc->major || desc->minor || 1976 desc->raid_disk || (desc->state && (desc->state != 4))) { 1977 printk(" D %2d: ", i); 1978 print_desc(desc); 1979 } 1980 } 1981 printk(KERN_INFO "md: THIS: "); 1982 print_desc(&sb->this_disk); 1983 } 1984 1985 static void print_sb_1(struct mdp_superblock_1 *sb) 1986 { 1987 __u8 *uuid; 1988 1989 uuid = sb->set_uuid; 1990 printk(KERN_INFO 1991 "md: SB: (V:%u) (F:0x%08x) Array-ID:<%pU>\n" 1992 "md: Name: \"%s\" CT:%llu\n", 1993 le32_to_cpu(sb->major_version), 1994 le32_to_cpu(sb->feature_map), 1995 uuid, 1996 sb->set_name, 1997 (unsigned long long)le64_to_cpu(sb->ctime) 1998 & MD_SUPERBLOCK_1_TIME_SEC_MASK); 1999 2000 uuid = sb->device_uuid; 2001 printk(KERN_INFO 2002 "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" 2003 " RO:%llu\n" 2004 "md: Dev:%08x UUID: %pU\n" 2005 "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" 2006 "md: (MaxDev:%u) \n", 2007 le32_to_cpu(sb->level), 2008 (unsigned long long)le64_to_cpu(sb->size), 2009 
le32_to_cpu(sb->raid_disks), 2010 le32_to_cpu(sb->layout), 2011 le32_to_cpu(sb->chunksize), 2012 (unsigned long long)le64_to_cpu(sb->data_offset), 2013 (unsigned long long)le64_to_cpu(sb->data_size), 2014 (unsigned long long)le64_to_cpu(sb->super_offset), 2015 (unsigned long long)le64_to_cpu(sb->recovery_offset), 2016 le32_to_cpu(sb->dev_number), 2017 uuid, 2018 sb->devflags, 2019 (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK, 2020 (unsigned long long)le64_to_cpu(sb->events), 2021 (unsigned long long)le64_to_cpu(sb->resync_offset), 2022 le32_to_cpu(sb->sb_csum), 2023 le32_to_cpu(sb->max_dev) 2024 ); 2025 } 2026 2027 static void print_rdev(mdk_rdev_t *rdev, int major_version) 2028 { 2029 char b[BDEVNAME_SIZE]; 2030 printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n", 2031 bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors, 2032 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), 2033 rdev->desc_nr); 2034 if (rdev->sb_loaded) { 2035 printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); 2036 switch (major_version) { 2037 case 0: 2038 print_sb_90((mdp_super_t*)page_address(rdev->sb_page)); 2039 break; 2040 case 1: 2041 print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page)); 2042 break; 2043 } 2044 } else 2045 printk(KERN_INFO "md: no rdev superblock!\n"); 2046 } 2047 2048 static void md_print_devices(void) 2049 { 2050 struct list_head *tmp; 2051 mdk_rdev_t *rdev; 2052 mddev_t *mddev; 2053 char b[BDEVNAME_SIZE]; 2054 2055 printk("\n"); 2056 printk("md: **********************************\n"); 2057 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); 2058 printk("md: **********************************\n"); 2059 for_each_mddev(mddev, tmp) { 2060 2061 if (mddev->bitmap) 2062 bitmap_print_sb(mddev->bitmap); 2063 else 2064 printk("%s: ", mdname(mddev)); 2065 list_for_each_entry(rdev, &mddev->disks, same_set) 2066 printk("<%s>", bdevname(rdev->bdev,b)); 2067 printk("\n"); 2068 2069 list_for_each_entry(rdev, &mddev->disks, same_set) 2070 print_rdev(rdev, mddev->major_version); 2071 } 2072 printk("md: **********************************\n"); 2073 printk("\n"); 2074 } 2075 2076 2077 static void sync_sbs(mddev_t * mddev, int nospares) 2078 { 2079 /* Update each superblock (in-memory image), but 2080 * if we are allowed to, skip spares which already 2081 * have the right event counter, or have one earlier 2082 * (which would mean they aren't being marked as dirty 2083 * with the rest of the array) 2084 */ 2085 mdk_rdev_t *rdev; 2086 2087 /* First make sure individual recovery_offsets are correct */ 2088 list_for_each_entry(rdev, &mddev->disks, same_set) { 2089 if (rdev->raid_disk >= 0 && 2090 mddev->delta_disks >= 0 && 2091 !test_bit(In_sync, &rdev->flags) && 2092 mddev->curr_resync_completed > rdev->recovery_offset) 2093 rdev->recovery_offset = mddev->curr_resync_completed; 2094 2095 } 2096 list_for_each_entry(rdev, &mddev->disks, same_set) { 2097 if (rdev->sb_events == mddev->events || 2098 (nospares && 2099 rdev->raid_disk < 0 && 2100 rdev->sb_events+1 == mddev->events)) { 2101 /* Don't update this superblock */ 2102 rdev->sb_loaded = 2; 2103 } else { 2104 super_types[mddev->major_version]. 
2105 sync_super(mddev, rdev); 2106 rdev->sb_loaded = 1; 2107 } 2108 } 2109 } 2110 2111 static void md_update_sb(mddev_t * mddev, int force_change) 2112 { 2113 mdk_rdev_t *rdev; 2114 int sync_req; 2115 int nospares = 0; 2116 2117 mddev->utime = get_seconds(); 2118 if (mddev->external) 2119 return; 2120 repeat: 2121 spin_lock_irq(&mddev->write_lock); 2122 2123 set_bit(MD_CHANGE_PENDING, &mddev->flags); 2124 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) 2125 force_change = 1; 2126 if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) 2127 /* just a clean <-> dirty transition, possibly leave spares alone, 2128 * though if events isn't the right even/odd, we will have to do 2129 * spares after all 2130 */ 2131 nospares = 1; 2132 if (force_change) 2133 nospares = 0; 2134 if (mddev->degraded) 2135 /* If the array is degraded, then skipping spares is both 2136 * dangerous and fairly pointless. 2137 * Dangerous because a device that was removed from the array 2138 * might have an event_count that still looks up-to-date, 2139 * so it can be re-added without a resync. 2140 * Pointless because if there are any spares to skip, 2141 * then a recovery will happen and soon that array won't 2142 * be degraded any more and the spare can go back to sleep then. 2143 */ 2144 nospares = 0; 2145 2146 sync_req = mddev->in_sync; 2147 2148 /* If this is just a dirty<->clean transition, and the array is clean 2149 * and 'events' is odd, we can roll back to the previous clean state */ 2150 if (nospares 2151 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 2152 && mddev->can_decrease_events 2153 && mddev->events != 1) { 2154 mddev->events--; 2155 mddev->can_decrease_events = 0; 2156 } else { 2157 /* otherwise we have to go forward and ... */ 2158 mddev->events ++; 2159 mddev->can_decrease_events = nospares; 2160 } 2161 2162 if (!mddev->events) { 2163 /* 2164 * oops, this 64-bit counter should never wrap. 2165 * Either we are in around ~1 trillion A.C., assuming 2166 * 1 reboot per second, or we have a bug: 2167 */ 2168 MD_BUG(); 2169 mddev->events --; 2170 } 2171 2172 /* 2173 * do not write anything to disk if using 2174 * nonpersistent superblocks 2175 */ 2176 if (!mddev->persistent) { 2177 if (!mddev->external) 2178 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2179 2180 spin_unlock_irq(&mddev->write_lock); 2181 wake_up(&mddev->sb_wait); 2182 return; 2183 } 2184 sync_sbs(mddev, nospares); 2185 spin_unlock_irq(&mddev->write_lock); 2186 2187 dprintk(KERN_INFO 2188 "md: updating %s RAID superblock on device (in sync %d)\n", 2189 mdname(mddev),mddev->in_sync); 2190 2191 bitmap_update_sb(mddev->bitmap); 2192 list_for_each_entry(rdev, &mddev->disks, same_set) { 2193 char b[BDEVNAME_SIZE]; 2194 dprintk(KERN_INFO "md: "); 2195 if (rdev->sb_loaded != 1) 2196 continue; /* no noise on spare devices */ 2197 if (test_bit(Faulty, &rdev->flags)) 2198 dprintk("(skipping faulty "); 2199 2200 dprintk("%s ", bdevname(rdev->bdev,b)); 2201 if (!test_bit(Faulty, &rdev->flags)) { 2202 md_super_write(mddev,rdev, 2203 rdev->sb_start, rdev->sb_size, 2204 rdev->sb_page); 2205 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", 2206 bdevname(rdev->bdev,b), 2207 (unsigned long long)rdev->sb_start); 2208 rdev->sb_events = mddev->events; 2209 2210 } else 2211 dprintk(")\n"); 2212 if (mddev->level == LEVEL_MULTIPATH) 2213 /* only need to write one superblock...
*/ 2214 break; 2215 } 2216 md_super_wait(mddev); 2217 /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ 2218 2219 spin_lock_irq(&mddev->write_lock); 2220 if (mddev->in_sync != sync_req || 2221 test_bit(MD_CHANGE_DEVS, &mddev->flags)) { 2222 /* have to write it out again */ 2223 spin_unlock_irq(&mddev->write_lock); 2224 goto repeat; 2225 } 2226 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2227 spin_unlock_irq(&mddev->write_lock); 2228 wake_up(&mddev->sb_wait); 2229 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2230 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 2231 2232 } 2233 2234 /* words written to sysfs files may, or may not, be \n terminated. 2235 * We want to accept either case. For this we use cmd_match. 2236 */ 2237 static int cmd_match(const char *cmd, const char *str) 2238 { 2239 /* See if cmd, written into a sysfs file, matches 2240 * str. They must either be the same, or cmd can 2241 * have a trailing newline 2242 */ 2243 while (*cmd && *str && *cmd == *str) { 2244 cmd++; 2245 str++; 2246 } 2247 if (*cmd == '\n') 2248 cmd++; 2249 if (*str || *cmd) 2250 return 0; 2251 return 1; 2252 } 2253 2254 struct rdev_sysfs_entry { 2255 struct attribute attr; 2256 ssize_t (*show)(mdk_rdev_t *, char *); 2257 ssize_t (*store)(mdk_rdev_t *, const char *, size_t); 2258 }; 2259 2260 static ssize_t 2261 state_show(mdk_rdev_t *rdev, char *page) 2262 { 2263 char *sep = ""; 2264 size_t len = 0; 2265 2266 if (test_bit(Faulty, &rdev->flags)) { 2267 len+= sprintf(page+len, "%sfaulty",sep); 2268 sep = ","; 2269 } 2270 if (test_bit(In_sync, &rdev->flags)) { 2271 len += sprintf(page+len, "%sin_sync",sep); 2272 sep = ","; 2273 } 2274 if (test_bit(WriteMostly, &rdev->flags)) { 2275 len += sprintf(page+len, "%swrite_mostly",sep); 2276 sep = ","; 2277 } 2278 if (test_bit(Blocked, &rdev->flags)) { 2279 len += sprintf(page+len, "%sblocked", sep); 2280 sep = ","; 2281 } 2282 if (!test_bit(Faulty, &rdev->flags) && 2283 !test_bit(In_sync, &rdev->flags)) { 2284 len += sprintf(page+len, "%sspare", sep); 2285 sep = ","; 2286 } 2287 return len+sprintf(page+len, "\n"); 2288 } 2289 2290 static ssize_t 2291 state_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2292 { 2293 /* can write 2294 * faulty - simulates an error 2295 * remove - disconnects the device 2296 * writemostly - sets write_mostly 2297 * -writemostly - clears write_mostly 2298 * blocked - sets the Blocked flag 2299 * -blocked - clears the Blocked flag 2300 * insync - sets In_sync provided device isn't active 2301 */ 2302 int err = -EINVAL; 2303 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 2304 md_error(rdev->mddev, rdev); 2305 err = 0; 2306 } else if (cmd_match(buf, "remove")) { 2307 if (rdev->raid_disk >= 0) 2308 err = -EBUSY; 2309 else { 2310 mddev_t *mddev = rdev->mddev; 2311 kick_rdev_from_array(rdev); 2312 if (mddev->pers) 2313 md_update_sb(mddev, 1); 2314 md_new_event(mddev); 2315 err = 0; 2316 } 2317 } else if (cmd_match(buf, "writemostly")) { 2318 set_bit(WriteMostly, &rdev->flags); 2319 err = 0; 2320 } else if (cmd_match(buf, "-writemostly")) { 2321 clear_bit(WriteMostly, &rdev->flags); 2322 err = 0; 2323 } else if (cmd_match(buf, "blocked")) { 2324 set_bit(Blocked, &rdev->flags); 2325 err = 0; 2326 } else if (cmd_match(buf, "-blocked")) { 2327 clear_bit(Blocked, &rdev->flags); 2328 wake_up(&rdev->blocked_wait); 2329 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2330 md_wakeup_thread(rdev->mddev->thread); 2331 2332 err = 0; 2333 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2334 set_bit(In_sync, &rdev->flags); 2335 err = 0; 2336 } 2337 if (!err && rdev->sysfs_state) 2338 sysfs_notify_dirent(rdev->sysfs_state); 2339 return err ? err : len; 2340 } 2341 static struct rdev_sysfs_entry rdev_state = 2342 __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store); 2343 2344 static ssize_t 2345 errors_show(mdk_rdev_t *rdev, char *page) 2346 { 2347 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 2348 } 2349 2350 static ssize_t 2351 errors_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2352 { 2353 char *e; 2354 unsigned long n = simple_strtoul(buf, &e, 10); 2355 if (*buf && (*e == 0 || *e == '\n')) { 2356 atomic_set(&rdev->corrected_errors, n); 2357 return len; 2358 } 2359 return -EINVAL; 2360 } 2361 static struct rdev_sysfs_entry rdev_errors = 2362 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 2363 2364 static ssize_t 2365 slot_show(mdk_rdev_t *rdev, char *page) 2366 { 2367 if (rdev->raid_disk < 0) 2368 return sprintf(page, "none\n"); 2369 else 2370 return sprintf(page, "%d\n", rdev->raid_disk); 2371 } 2372 2373 static ssize_t 2374 slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2375 { 2376 char *e; 2377 int err; 2378 char nm[20]; 2379 int slot = simple_strtoul(buf, &e, 10); 2380 if (strncmp(buf, "none", 4)==0) 2381 slot = -1; 2382 else if (e==buf || (*e && *e!= '\n')) 2383 return -EINVAL; 2384 if (rdev->mddev->pers && slot == -1) { 2385 /* Setting 'slot' on an active array requires also 2386 * updating the 'rd%d' link, and communicating 2387 * with the personality with ->hot_*_disk. 2388 * For now we only support removing 2389 * failed/spare devices. This normally happens automatically, 2390 * but not when the metadata is externally managed. 2391 */ 2392 if (rdev->raid_disk == -1) 2393 return -EEXIST; 2394 /* personality does all needed checks */ 2395 if (rdev->mddev->pers->hot_add_disk == NULL) 2396 return -EINVAL; 2397 err = rdev->mddev->pers-> 2398 hot_remove_disk(rdev->mddev, rdev->raid_disk); 2399 if (err) 2400 return err; 2401 sprintf(nm, "rd%d", rdev->raid_disk); 2402 sysfs_remove_link(&rdev->mddev->kobj, nm); 2403 rdev->raid_disk = -1; 2404 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2405 md_wakeup_thread(rdev->mddev->thread); 2406 } else if (rdev->mddev->pers) { 2407 mdk_rdev_t *rdev2; 2408 /* Activating a spare .. or possibly reactivating 2409 * if we ever get bitmaps working here. 2410 */ 2411 2412 if (rdev->raid_disk != -1) 2413 return -EBUSY; 2414 2415 if (rdev->mddev->pers->hot_add_disk == NULL) 2416 return -EINVAL; 2417 2418 list_for_each_entry(rdev2, &rdev->mddev->disks, same_set) 2419 if (rdev2->raid_disk == slot) 2420 return -EEXIST; 2421 2422 rdev->raid_disk = slot; 2423 if (test_bit(In_sync, &rdev->flags)) 2424 rdev->saved_raid_disk = slot; 2425 else 2426 rdev->saved_raid_disk = -1; 2427 err = rdev->mddev->pers-> 2428 hot_add_disk(rdev->mddev, rdev); 2429 if (err) { 2430 rdev->raid_disk = -1; 2431 return err; 2432 } else 2433 sysfs_notify_dirent(rdev->sysfs_state); 2434 sprintf(nm, "rd%d", rdev->raid_disk); 2435 if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm)) 2436 printk(KERN_WARNING 2437 "md: cannot register " 2438 "%s for %s\n", 2439 nm, mdname(rdev->mddev)); 2440 2441 /* don't wakeup anyone, leave that to userspace. 
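 * (Illustrative usage, not part of the original source: on an active
 * array userspace might write a raid slot number such as "2" to this
 * attribute to activate a spare in that slot via ->hot_add_disk, or
 * write "none" to drop a failed/spare device, matching the checks
 * earlier in this function.)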
*/ 2442 } else { 2443 if (slot >= rdev->mddev->raid_disks) 2444 return -ENOSPC; 2445 rdev->raid_disk = slot; 2446 /* assume it is working */ 2447 clear_bit(Faulty, &rdev->flags); 2448 clear_bit(WriteMostly, &rdev->flags); 2449 set_bit(In_sync, &rdev->flags); 2450 sysfs_notify_dirent(rdev->sysfs_state); 2451 } 2452 return len; 2453 } 2454 2455 2456 static struct rdev_sysfs_entry rdev_slot = 2457 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 2458 2459 static ssize_t 2460 offset_show(mdk_rdev_t *rdev, char *page) 2461 { 2462 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 2463 } 2464 2465 static ssize_t 2466 offset_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2467 { 2468 char *e; 2469 unsigned long long offset = simple_strtoull(buf, &e, 10); 2470 if (e==buf || (*e && *e != '\n')) 2471 return -EINVAL; 2472 if (rdev->mddev->pers && rdev->raid_disk >= 0) 2473 return -EBUSY; 2474 if (rdev->sectors && rdev->mddev->external) 2475 /* Must set offset before size, so overlap checks 2476 * can be sane */ 2477 return -EBUSY; 2478 rdev->data_offset = offset; 2479 return len; 2480 } 2481 2482 static struct rdev_sysfs_entry rdev_offset = 2483 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 2484 2485 static ssize_t 2486 rdev_size_show(mdk_rdev_t *rdev, char *page) 2487 { 2488 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); 2489 } 2490 2491 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) 2492 { 2493 /* check if two start/length pairs overlap */ 2494 if (s1+l1 <= s2) 2495 return 0; 2496 if (s2+l2 <= s1) 2497 return 0; 2498 return 1; 2499 } 2500 2501 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) 2502 { 2503 unsigned long long blocks; 2504 sector_t new; 2505 2506 if (strict_strtoull(buf, 10, &blocks) < 0) 2507 return -EINVAL; 2508 2509 if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) 2510 return -EINVAL; /* sector conversion overflow */ 2511 2512 new = blocks * 2; 2513 if (new != blocks * 2) 2514 return -EINVAL; /* unsigned long long to sector_t overflow */ 2515 2516 *sectors = new; 2517 return 0; 2518 } 2519 2520 static ssize_t 2521 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2522 { 2523 mddev_t *my_mddev = rdev->mddev; 2524 sector_t oldsectors = rdev->sectors; 2525 sector_t sectors; 2526 2527 if (strict_blocks_to_sectors(buf, &sectors) < 0) 2528 return -EINVAL; 2529 if (my_mddev->pers && rdev->raid_disk >= 0) { 2530 if (my_mddev->persistent) { 2531 sectors = super_types[my_mddev->major_version]. 2532 rdev_size_change(rdev, sectors); 2533 if (!sectors) 2534 return -EBUSY; 2535 } else if (!sectors) 2536 sectors = (rdev->bdev->bd_inode->i_size >> 9) - 2537 rdev->data_offset; 2538 } 2539 if (sectors < my_mddev->dev_sectors) 2540 return -EINVAL; /* component must fit device */ 2541 2542 rdev->sectors = sectors; 2543 if (sectors > oldsectors && my_mddev->external) { 2544 /* need to check that all other rdevs with the same ->bdev 2545 * do not overlap. We need to unlock the mddev to avoid 2546 * a deadlock. We have already changed rdev->sectors, and if 2547 * we have to change it back, we will have the lock again.
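 * (Illustrative example of the overlaps() check above, assuming two rdevs
 * share one bdev: data ranges starting at sectors 0 and 1024, each 2048
 * sectors long, overlap because 0 + 2048 > 1024, so the size change would
 * be rolled back and -EBUSY returned.)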
2548 */ 2549 mddev_t *mddev; 2550 int overlap = 0; 2551 struct list_head *tmp; 2552 2553 mddev_unlock(my_mddev); 2554 for_each_mddev(mddev, tmp) { 2555 mdk_rdev_t *rdev2; 2556 2557 mddev_lock(mddev); 2558 list_for_each_entry(rdev2, &mddev->disks, same_set) 2559 if (test_bit(AllReserved, &rdev2->flags) || 2560 (rdev->bdev == rdev2->bdev && 2561 rdev != rdev2 && 2562 overlaps(rdev->data_offset, rdev->sectors, 2563 rdev2->data_offset, 2564 rdev2->sectors))) { 2565 overlap = 1; 2566 break; 2567 } 2568 mddev_unlock(mddev); 2569 if (overlap) { 2570 mddev_put(mddev); 2571 break; 2572 } 2573 } 2574 mddev_lock(my_mddev); 2575 if (overlap) { 2576 /* Someone else could have slipped in a size 2577 * change here, but doing so is just silly. 2578 * We put oldsectors back because we *know* it is 2579 * safe, and trust userspace not to race with 2580 * itself 2581 */ 2582 rdev->sectors = oldsectors; 2583 return -EBUSY; 2584 } 2585 } 2586 return len; 2587 } 2588 2589 static struct rdev_sysfs_entry rdev_size = 2590 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 2591 2592 2593 static ssize_t recovery_start_show(mdk_rdev_t *rdev, char *page) 2594 { 2595 unsigned long long recovery_start = rdev->recovery_offset; 2596 2597 if (test_bit(In_sync, &rdev->flags) || 2598 recovery_start == MaxSector) 2599 return sprintf(page, "none\n"); 2600 2601 return sprintf(page, "%llu\n", recovery_start); 2602 } 2603 2604 static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2605 { 2606 unsigned long long recovery_start; 2607 2608 if (cmd_match(buf, "none")) 2609 recovery_start = MaxSector; 2610 else if (strict_strtoull(buf, 10, &recovery_start)) 2611 return -EINVAL; 2612 2613 if (rdev->mddev->pers && 2614 rdev->raid_disk >= 0) 2615 return -EBUSY; 2616 2617 rdev->recovery_offset = recovery_start; 2618 if (recovery_start == MaxSector) 2619 set_bit(In_sync, &rdev->flags); 2620 else 2621 clear_bit(In_sync, &rdev->flags); 2622 return len; 2623 } 2624 2625 static struct rdev_sysfs_entry rdev_recovery_start = 2626 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); 2627 2628 static struct attribute *rdev_default_attrs[] = { 2629 &rdev_state.attr, 2630 &rdev_errors.attr, 2631 &rdev_slot.attr, 2632 &rdev_offset.attr, 2633 &rdev_size.attr, 2634 &rdev_recovery_start.attr, 2635 NULL, 2636 }; 2637 static ssize_t 2638 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 2639 { 2640 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 2641 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 2642 mddev_t *mddev = rdev->mddev; 2643 ssize_t rv; 2644 2645 if (!entry->show) 2646 return -EIO; 2647 2648 rv = mddev ? mddev_lock(mddev) : -EBUSY; 2649 if (!rv) { 2650 if (rdev->mddev == NULL) 2651 rv = -EBUSY; 2652 else 2653 rv = entry->show(rdev, page); 2654 mddev_unlock(mddev); 2655 } 2656 return rv; 2657 } 2658 2659 static ssize_t 2660 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 2661 const char *page, size_t length) 2662 { 2663 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 2664 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 2665 ssize_t rv; 2666 mddev_t *mddev = rdev->mddev; 2667 2668 if (!entry->store) 2669 return -EIO; 2670 if (!capable(CAP_SYS_ADMIN)) 2671 return -EACCES; 2672 rv = mddev ? 
mddev_lock(mddev): -EBUSY; 2673 if (!rv) { 2674 if (rdev->mddev == NULL) 2675 rv = -EBUSY; 2676 else 2677 rv = entry->store(rdev, page, length); 2678 mddev_unlock(mddev); 2679 } 2680 return rv; 2681 } 2682 2683 static void rdev_free(struct kobject *ko) 2684 { 2685 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); 2686 kfree(rdev); 2687 } 2688 static const struct sysfs_ops rdev_sysfs_ops = { 2689 .show = rdev_attr_show, 2690 .store = rdev_attr_store, 2691 }; 2692 static struct kobj_type rdev_ktype = { 2693 .release = rdev_free, 2694 .sysfs_ops = &rdev_sysfs_ops, 2695 .default_attrs = rdev_default_attrs, 2696 }; 2697 2698 /* 2699 * Import a device. If 'super_format' >= 0, then sanity check the superblock 2700 * 2701 * mark the device faulty if: 2702 * 2703 * - the device is nonexistent (zero size) 2704 * - the device has no valid superblock 2705 * 2706 * a faulty rdev _never_ has rdev->sb set. 2707 */ 2708 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) 2709 { 2710 char b[BDEVNAME_SIZE]; 2711 int err; 2712 mdk_rdev_t *rdev; 2713 sector_t size; 2714 2715 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 2716 if (!rdev) { 2717 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 2718 return ERR_PTR(-ENOMEM); 2719 } 2720 2721 if ((err = alloc_disk_sb(rdev))) 2722 goto abort_free; 2723 2724 err = lock_rdev(rdev, newdev, super_format == -2); 2725 if (err) 2726 goto abort_free; 2727 2728 kobject_init(&rdev->kobj, &rdev_ktype); 2729 2730 rdev->desc_nr = -1; 2731 rdev->saved_raid_disk = -1; 2732 rdev->raid_disk = -1; 2733 rdev->flags = 0; 2734 rdev->data_offset = 0; 2735 rdev->sb_events = 0; 2736 rdev->last_read_error.tv_sec = 0; 2737 rdev->last_read_error.tv_nsec = 0; 2738 atomic_set(&rdev->nr_pending, 0); 2739 atomic_set(&rdev->read_errors, 0); 2740 atomic_set(&rdev->corrected_errors, 0); 2741 2742 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2743 if (!size) { 2744 printk(KERN_WARNING 2745 "md: %s has zero or unknown size, marking faulty!\n", 2746 bdevname(rdev->bdev,b)); 2747 err = -EINVAL; 2748 goto abort_free; 2749 } 2750 2751 if (super_format >= 0) { 2752 err = super_types[super_format]. 2753 load_super(rdev, NULL, super_minor); 2754 if (err == -EINVAL) { 2755 printk(KERN_WARNING 2756 "md: %s does not have a valid v%d.%d " 2757 "superblock, not importing!\n", 2758 bdevname(rdev->bdev,b), 2759 super_format, super_minor); 2760 goto abort_free; 2761 } 2762 if (err < 0) { 2763 printk(KERN_WARNING 2764 "md: could not read %s's sb, not importing!\n", 2765 bdevname(rdev->bdev,b)); 2766 goto abort_free; 2767 } 2768 } 2769 2770 INIT_LIST_HEAD(&rdev->same_set); 2771 init_waitqueue_head(&rdev->blocked_wait); 2772 2773 return rdev; 2774 2775 abort_free: 2776 if (rdev->sb_page) { 2777 if (rdev->bdev) 2778 unlock_rdev(rdev); 2779 free_disk_sb(rdev); 2780 } 2781 kfree(rdev); 2782 return ERR_PTR(err); 2783 } 2784 2785 /* 2786 * Check a full RAID array for plausibility 2787 */ 2788 2789 2790 static void analyze_sbs(mddev_t * mddev) 2791 { 2792 int i; 2793 mdk_rdev_t *rdev, *freshest, *tmp; 2794 char b[BDEVNAME_SIZE]; 2795 2796 freshest = NULL; 2797 rdev_for_each(rdev, tmp, mddev) 2798 switch (super_types[mddev->major_version]. 
2799 load_super(rdev, freshest, mddev->minor_version)) { 2800 case 1: 2801 freshest = rdev; 2802 break; 2803 case 0: 2804 break; 2805 default: 2806 printk( KERN_ERR \ 2807 "md: fatal superblock inconsistency in %s" 2808 " -- removing from array\n", 2809 bdevname(rdev->bdev,b)); 2810 kick_rdev_from_array(rdev); 2811 } 2812 2813 2814 super_types[mddev->major_version]. 2815 validate_super(mddev, freshest); 2816 2817 i = 0; 2818 rdev_for_each(rdev, tmp, mddev) { 2819 if (mddev->max_disks && 2820 (rdev->desc_nr >= mddev->max_disks || 2821 i > mddev->max_disks)) { 2822 printk(KERN_WARNING 2823 "md: %s: %s: only %d devices permitted\n", 2824 mdname(mddev), bdevname(rdev->bdev, b), 2825 mddev->max_disks); 2826 kick_rdev_from_array(rdev); 2827 continue; 2828 } 2829 if (rdev != freshest) 2830 if (super_types[mddev->major_version]. 2831 validate_super(mddev, rdev)) { 2832 printk(KERN_WARNING "md: kicking non-fresh %s" 2833 " from array!\n", 2834 bdevname(rdev->bdev,b)); 2835 kick_rdev_from_array(rdev); 2836 continue; 2837 } 2838 if (mddev->level == LEVEL_MULTIPATH) { 2839 rdev->desc_nr = i++; 2840 rdev->raid_disk = rdev->desc_nr; 2841 set_bit(In_sync, &rdev->flags); 2842 } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) { 2843 rdev->raid_disk = -1; 2844 clear_bit(In_sync, &rdev->flags); 2845 } 2846 } 2847 } 2848 2849 /* Read a fixed-point number. 2850 * Numbers in sysfs attributes should be in "standard" units where 2851 * possible, so time should be in seconds. 2852 * However we internally use a much smaller unit such as 2853 * milliseconds or jiffies. 2854 * This function takes a decimal number with a possible fractional 2855 * component, and produces an integer which is the result of 2856 * multiplying that number by 10^'scale', 2857 * all without any floating-point arithmetic. For example, "1.25" with a scale of 3 yields 1250. 2858 */ 2859 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) 2860 { 2861 unsigned long result = 0; 2862 long decimals = -1; 2863 while (isdigit(*cp) || (*cp == '.
&& decimals < 0)) { 2864 if (*cp == '.') 2865 decimals = 0; 2866 else if (decimals < scale) { 2867 unsigned int value; 2868 value = *cp - '0'; 2869 result = result * 10 + value; 2870 if (decimals >= 0) 2871 decimals++; 2872 } 2873 cp++; 2874 } 2875 if (*cp == '\n') 2876 cp++; 2877 if (*cp) 2878 return -EINVAL; 2879 if (decimals < 0) 2880 decimals = 0; 2881 while (decimals < scale) { 2882 result *= 10; 2883 decimals ++; 2884 } 2885 *res = result; 2886 return 0; 2887 } 2888 2889 2890 static void md_safemode_timeout(unsigned long data); 2891 2892 static ssize_t 2893 safe_delay_show(mddev_t *mddev, char *page) 2894 { 2895 int msec = (mddev->safemode_delay*1000)/HZ; 2896 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); 2897 } 2898 static ssize_t 2899 safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) 2900 { 2901 unsigned long msec; 2902 2903 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0) 2904 return -EINVAL; 2905 if (msec == 0) 2906 mddev->safemode_delay = 0; 2907 else { 2908 unsigned long old_delay = mddev->safemode_delay; 2909 mddev->safemode_delay = (msec*HZ)/1000; 2910 if (mddev->safemode_delay == 0) 2911 mddev->safemode_delay = 1; 2912 if (mddev->safemode_delay < old_delay) 2913 md_safemode_timeout((unsigned long)mddev); 2914 } 2915 return len; 2916 } 2917 static struct md_sysfs_entry md_safe_delay = 2918 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 2919 2920 static ssize_t 2921 level_show(mddev_t *mddev, char *page) 2922 { 2923 struct mdk_personality *p = mddev->pers; 2924 if (p) 2925 return sprintf(page, "%s\n", p->name); 2926 else if (mddev->clevel[0]) 2927 return sprintf(page, "%s\n", mddev->clevel); 2928 else if (mddev->level != LEVEL_NONE) 2929 return sprintf(page, "%d\n", mddev->level); 2930 else 2931 return 0; 2932 } 2933 2934 static ssize_t 2935 level_store(mddev_t *mddev, const char *buf, size_t len) 2936 { 2937 char clevel[16]; 2938 ssize_t rv = len; 2939 struct mdk_personality *pers; 2940 long level; 2941 void *priv; 2942 mdk_rdev_t *rdev; 2943 2944 if (mddev->pers == NULL) { 2945 if (len == 0) 2946 return 0; 2947 if (len >= sizeof(mddev->clevel)) 2948 return -ENOSPC; 2949 strncpy(mddev->clevel, buf, len); 2950 if (mddev->clevel[len-1] == '\n') 2951 len--; 2952 mddev->clevel[len] = 0; 2953 mddev->level = LEVEL_NONE; 2954 return rv; 2955 } 2956 2957 /* request to change the personality. Need to ensure: 2958 * - array is not engaged in resync/recovery/reshape 2959 * - old personality can be suspended 2960 * - new personality will access other array. 2961 */ 2962 2963 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 2964 return -EBUSY; 2965 2966 if (!mddev->pers->quiesce) { 2967 printk(KERN_WARNING "md: %s: %s does not support online personality change\n", 2968 mdname(mddev), mddev->pers->name); 2969 return -EINVAL; 2970 } 2971 2972 /* Now find the new personality */ 2973 if (len == 0 || len >= sizeof(clevel)) 2974 return -EINVAL; 2975 strncpy(clevel, buf, len); 2976 if (clevel[len-1] == '\n') 2977 len--; 2978 clevel[len] = 0; 2979 if (strict_strtol(clevel, 10, &level)) 2980 level = LEVEL_NONE; 2981 2982 if (request_module("md-%s", clevel) != 0) 2983 request_module("md-level-%s", clevel); 2984 spin_lock(&pers_lock); 2985 pers = find_pers(level, clevel); 2986 if (!pers || !try_module_get(pers->owner)) { 2987 spin_unlock(&pers_lock); 2988 printk(KERN_WARNING "md: personality %s not loaded\n", clevel); 2989 return -EINVAL; 2990 } 2991 spin_unlock(&pers_lock); 2992 2993 if (pers == mddev->pers) { 2994 /* Nothing to do! 
*/ 2995 module_put(pers->owner); 2996 return rv; 2997 } 2998 if (!pers->takeover) { 2999 module_put(pers->owner); 3000 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", 3001 mdname(mddev), clevel); 3002 return -EINVAL; 3003 } 3004 3005 list_for_each_entry(rdev, &mddev->disks, same_set) 3006 rdev->new_raid_disk = rdev->raid_disk; 3007 3008 /* ->takeover must set new_* and/or delta_disks 3009 * if it succeeds, and may set them when it fails. 3010 */ 3011 priv = pers->takeover(mddev); 3012 if (IS_ERR(priv)) { 3013 mddev->new_level = mddev->level; 3014 mddev->new_layout = mddev->layout; 3015 mddev->new_chunk_sectors = mddev->chunk_sectors; 3016 mddev->raid_disks -= mddev->delta_disks; 3017 mddev->delta_disks = 0; 3018 module_put(pers->owner); 3019 printk(KERN_WARNING "md: %s: %s would not accept array\n", 3020 mdname(mddev), clevel); 3021 return PTR_ERR(priv); 3022 } 3023 3024 /* Looks like we have a winner */ 3025 mddev_suspend(mddev); 3026 mddev->pers->stop(mddev); 3027 3028 if (mddev->pers->sync_request == NULL && 3029 pers->sync_request != NULL) { 3030 /* need to add the md_redundancy_group */ 3031 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 3032 printk(KERN_WARNING 3033 "md: cannot register extra attributes for %s\n", 3034 mdname(mddev)); 3035 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action"); 3036 } 3037 if (mddev->pers->sync_request != NULL && 3038 pers->sync_request == NULL) { 3039 /* need to remove the md_redundancy_group */ 3040 if (mddev->to_remove == NULL) 3041 mddev->to_remove = &md_redundancy_group; 3042 } 3043 3044 if (mddev->pers->sync_request == NULL && 3045 mddev->external) { 3046 /* We are converting from a no-redundancy array 3047 * to a redundancy array and metadata is managed 3048 * externally so we need to be sure that writes 3049 * won't block due to a need to transition 3050 * clean->dirty 3051 * until external management is started. 
3052 */ 3053 mddev->in_sync = 0; 3054 mddev->safemode_delay = 0; 3055 mddev->safemode = 0; 3056 } 3057 3058 list_for_each_entry(rdev, &mddev->disks, same_set) { 3059 char nm[20]; 3060 if (rdev->raid_disk < 0) 3061 continue; 3062 if (rdev->new_raid_disk > mddev->raid_disks) 3063 rdev->new_raid_disk = -1; 3064 if (rdev->new_raid_disk == rdev->raid_disk) 3065 continue; 3066 sprintf(nm, "rd%d", rdev->raid_disk); 3067 sysfs_remove_link(&mddev->kobj, nm); 3068 } 3069 list_for_each_entry(rdev, &mddev->disks, same_set) { 3070 if (rdev->raid_disk < 0) 3071 continue; 3072 if (rdev->new_raid_disk == rdev->raid_disk) 3073 continue; 3074 rdev->raid_disk = rdev->new_raid_disk; 3075 if (rdev->raid_disk < 0) 3076 clear_bit(In_sync, &rdev->flags); 3077 else { 3078 char nm[20]; 3079 sprintf(nm, "rd%d", rdev->raid_disk); 3080 if(sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) 3081 printk("md: cannot register %s for %s after level change\n", 3082 nm, mdname(mddev)); 3083 } 3084 } 3085 3086 module_put(mddev->pers->owner); 3087 mddev->pers = pers; 3088 mddev->private = priv; 3089 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 3090 mddev->level = mddev->new_level; 3091 mddev->layout = mddev->new_layout; 3092 mddev->chunk_sectors = mddev->new_chunk_sectors; 3093 mddev->delta_disks = 0; 3094 if (mddev->pers->sync_request == NULL) { 3095 /* this is now an array without redundancy, so 3096 * it must always be in_sync 3097 */ 3098 mddev->in_sync = 1; 3099 del_timer_sync(&mddev->safemode_timer); 3100 } 3101 pers->run(mddev); 3102 mddev_resume(mddev); 3103 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3104 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3105 md_wakeup_thread(mddev->thread); 3106 sysfs_notify(&mddev->kobj, NULL, "level"); 3107 md_new_event(mddev); 3108 return rv; 3109 } 3110 3111 static struct md_sysfs_entry md_level = 3112 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 3113 3114 3115 static ssize_t 3116 layout_show(mddev_t *mddev, char *page) 3117 { 3118 /* just a number, not meaningful for all levels */ 3119 if (mddev->reshape_position != MaxSector && 3120 mddev->layout != mddev->new_layout) 3121 return sprintf(page, "%d (%d)\n", 3122 mddev->new_layout, mddev->layout); 3123 return sprintf(page, "%d\n", mddev->layout); 3124 } 3125 3126 static ssize_t 3127 layout_store(mddev_t *mddev, const char *buf, size_t len) 3128 { 3129 char *e; 3130 unsigned long n = simple_strtoul(buf, &e, 10); 3131 3132 if (!*buf || (*e && *e != '\n')) 3133 return -EINVAL; 3134 3135 if (mddev->pers) { 3136 int err; 3137 if (mddev->pers->check_reshape == NULL) 3138 return -EBUSY; 3139 mddev->new_layout = n; 3140 err = mddev->pers->check_reshape(mddev); 3141 if (err) { 3142 mddev->new_layout = mddev->layout; 3143 return err; 3144 } 3145 } else { 3146 mddev->new_layout = n; 3147 if (mddev->reshape_position == MaxSector) 3148 mddev->layout = n; 3149 } 3150 return len; 3151 } 3152 static struct md_sysfs_entry md_layout = 3153 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 3154 3155 3156 static ssize_t 3157 raid_disks_show(mddev_t *mddev, char *page) 3158 { 3159 if (mddev->raid_disks == 0) 3160 return 0; 3161 if (mddev->reshape_position != MaxSector && 3162 mddev->delta_disks != 0) 3163 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 3164 mddev->raid_disks - mddev->delta_disks); 3165 return sprintf(page, "%d\n", mddev->raid_disks); 3166 } 3167 3168 static int update_raid_disks(mddev_t *mddev, int raid_disks); 3169 3170 static ssize_t 3171 raid_disks_store(mddev_t *mddev, const char *buf, size_t len) 
3172 { 3173 char *e; 3174 int rv = 0; 3175 unsigned long n = simple_strtoul(buf, &e, 10); 3176 3177 if (!*buf || (*e && *e != '\n')) 3178 return -EINVAL; 3179 3180 if (mddev->pers) 3181 rv = update_raid_disks(mddev, n); 3182 else if (mddev->reshape_position != MaxSector) { 3183 int olddisks = mddev->raid_disks - mddev->delta_disks; 3184 mddev->delta_disks = n - olddisks; 3185 mddev->raid_disks = n; 3186 } else 3187 mddev->raid_disks = n; 3188 return rv ? rv : len; 3189 } 3190 static struct md_sysfs_entry md_raid_disks = 3191 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 3192 3193 static ssize_t 3194 chunk_size_show(mddev_t *mddev, char *page) 3195 { 3196 if (mddev->reshape_position != MaxSector && 3197 mddev->chunk_sectors != mddev->new_chunk_sectors) 3198 return sprintf(page, "%d (%d)\n", 3199 mddev->new_chunk_sectors << 9, 3200 mddev->chunk_sectors << 9); 3201 return sprintf(page, "%d\n", mddev->chunk_sectors << 9); 3202 } 3203 3204 static ssize_t 3205 chunk_size_store(mddev_t *mddev, const char *buf, size_t len) 3206 { 3207 char *e; 3208 unsigned long n = simple_strtoul(buf, &e, 10); 3209 3210 if (!*buf || (*e && *e != '\n')) 3211 return -EINVAL; 3212 3213 if (mddev->pers) { 3214 int err; 3215 if (mddev->pers->check_reshape == NULL) 3216 return -EBUSY; 3217 mddev->new_chunk_sectors = n >> 9; 3218 err = mddev->pers->check_reshape(mddev); 3219 if (err) { 3220 mddev->new_chunk_sectors = mddev->chunk_sectors; 3221 return err; 3222 } 3223 } else { 3224 mddev->new_chunk_sectors = n >> 9; 3225 if (mddev->reshape_position == MaxSector) 3226 mddev->chunk_sectors = n >> 9; 3227 } 3228 return len; 3229 } 3230 static struct md_sysfs_entry md_chunk_size = 3231 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 3232 3233 static ssize_t 3234 resync_start_show(mddev_t *mddev, char *page) 3235 { 3236 if (mddev->recovery_cp == MaxSector) 3237 return sprintf(page, "none\n"); 3238 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 3239 } 3240 3241 static ssize_t 3242 resync_start_store(mddev_t *mddev, const char *buf, size_t len) 3243 { 3244 char *e; 3245 unsigned long long n = simple_strtoull(buf, &e, 10); 3246 3247 if (mddev->pers) 3248 return -EBUSY; 3249 if (cmd_match(buf, "none")) 3250 n = MaxSector; 3251 else if (!*buf || (*e && *e != '\n')) 3252 return -EINVAL; 3253 3254 mddev->recovery_cp = n; 3255 return len; 3256 } 3257 static struct md_sysfs_entry md_resync_start = 3258 __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); 3259 3260 /* 3261 * The array state can be: 3262 * 3263 * clear 3264 * No devices, no size, no level 3265 * Equivalent to STOP_ARRAY ioctl 3266 * inactive 3267 * May have some settings, but array is not active 3268 * all IO results in error 3269 * When written, doesn't tear down array, but just stops it 3270 * suspended (not supported yet) 3271 * All IO requests will block. The array can be reconfigured. 3272 * Writing this, if accepted, will block until array is quiescent 3273 * readonly 3274 * no resync can happen. no superblocks get written. 3275 * write requests fail 3276 * read-auto 3277 * like readonly, but behaves like 'clean' on a write request. 3278 * 3279 * clean - no pending writes, but otherwise active. 3280 * When written to inactive array, starts without resync 3281 * If a write request arrives then 3282 * if metadata is known, mark 'dirty' and switch to 'active'. 
3283 * if not known, block and switch to write-pending 3284 * If written to an active array that has pending writes, then fails. 3285 * active 3286 * fully active: IO and resync can be happening. 3287 * When written to inactive array, starts with resync 3288 * 3289 * write-pending 3290 * clean, but writes are blocked waiting for 'active' to be written. 3291 * 3292 * active-idle 3293 * like active, but no writes have been seen for a while (100msec). 3294 * 3295 */ 3296 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 3297 write_pending, active_idle, bad_word}; 3298 static char *array_states[] = { 3299 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 3300 "write-pending", "active-idle", NULL }; 3301 3302 static int match_word(const char *word, char **list) 3303 { 3304 int n; 3305 for (n=0; list[n]; n++) 3306 if (cmd_match(word, list[n])) 3307 break; 3308 return n; 3309 } 3310 3311 static ssize_t 3312 array_state_show(mddev_t *mddev, char *page) 3313 { 3314 enum array_state st = inactive; 3315 3316 if (mddev->pers) 3317 switch(mddev->ro) { 3318 case 1: 3319 st = readonly; 3320 break; 3321 case 2: 3322 st = read_auto; 3323 break; 3324 case 0: 3325 if (mddev->in_sync) 3326 st = clean; 3327 else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) 3328 st = write_pending; 3329 else if (mddev->safemode) 3330 st = active_idle; 3331 else 3332 st = active; 3333 } 3334 else { 3335 if (list_empty(&mddev->disks) && 3336 mddev->raid_disks == 0 && 3337 mddev->dev_sectors == 0) 3338 st = clear; 3339 else 3340 st = inactive; 3341 } 3342 return sprintf(page, "%s\n", array_states[st]); 3343 } 3344 3345 static int do_md_stop(mddev_t * mddev, int ro, int is_open); 3346 static int md_set_readonly(mddev_t * mddev, int is_open); 3347 static int do_md_run(mddev_t * mddev); 3348 static int restart_array(mddev_t *mddev); 3349 3350 static ssize_t 3351 array_state_store(mddev_t *mddev, const char *buf, size_t len) 3352 { 3353 int err = -EINVAL; 3354 enum array_state st = match_word(buf, array_states); 3355 switch(st) { 3356 case bad_word: 3357 break; 3358 case clear: 3359 /* stopping an active array */ 3360 if (atomic_read(&mddev->openers) > 0) 3361 return -EBUSY; 3362 err = do_md_stop(mddev, 0, 0); 3363 break; 3364 case inactive: 3365 /* stopping an active array */ 3366 if (mddev->pers) { 3367 if (atomic_read(&mddev->openers) > 0) 3368 return -EBUSY; 3369 err = do_md_stop(mddev, 2, 0); 3370 } else 3371 err = 0; /* already inactive */ 3372 break; 3373 case suspended: 3374 break; /* not supported yet */ 3375 case readonly: 3376 if (mddev->pers) 3377 err = md_set_readonly(mddev, 0); 3378 else { 3379 mddev->ro = 1; 3380 set_disk_ro(mddev->gendisk, 1); 3381 err = do_md_run(mddev); 3382 } 3383 break; 3384 case read_auto: 3385 if (mddev->pers) { 3386 if (mddev->ro == 0) 3387 err = md_set_readonly(mddev, 0); 3388 else if (mddev->ro == 1) 3389 err = restart_array(mddev); 3390 if (err == 0) { 3391 mddev->ro = 2; 3392 set_disk_ro(mddev->gendisk, 0); 3393 } 3394 } else { 3395 mddev->ro = 2; 3396 err = do_md_run(mddev); 3397 } 3398 break; 3399 case clean: 3400 if (mddev->pers) { 3401 restart_array(mddev); 3402 spin_lock_irq(&mddev->write_lock); 3403 if (atomic_read(&mddev->writes_pending) == 0) { 3404 if (mddev->in_sync == 0) { 3405 mddev->in_sync = 1; 3406 if (mddev->safemode == 1) 3407 mddev->safemode = 0; 3408 if (mddev->persistent) 3409 set_bit(MD_CHANGE_CLEAN, 3410 &mddev->flags); 3411 } 3412 err = 0; 3413 } else 3414 err = -EBUSY; 3415 
spin_unlock_irq(&mddev->write_lock); 3416 } else 3417 err = -EINVAL; 3418 break; 3419 case active: 3420 if (mddev->pers) { 3421 restart_array(mddev); 3422 if (mddev->external) 3423 clear_bit(MD_CHANGE_CLEAN, &mddev->flags); 3424 wake_up(&mddev->sb_wait); 3425 err = 0; 3426 } else { 3427 mddev->ro = 0; 3428 set_disk_ro(mddev->gendisk, 0); 3429 err = do_md_run(mddev); 3430 } 3431 break; 3432 case write_pending: 3433 case active_idle: 3434 /* these cannot be set */ 3435 break; 3436 } 3437 if (err) 3438 return err; 3439 else { 3440 sysfs_notify_dirent(mddev->sysfs_state); 3441 return len; 3442 } 3443 } 3444 static struct md_sysfs_entry md_array_state = 3445 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 3446 3447 static ssize_t 3448 max_corrected_read_errors_show(mddev_t *mddev, char *page) { 3449 return sprintf(page, "%d\n", 3450 atomic_read(&mddev->max_corr_read_errors)); 3451 } 3452 3453 static ssize_t 3454 max_corrected_read_errors_store(mddev_t *mddev, const char *buf, size_t len) 3455 { 3456 char *e; 3457 unsigned long n = simple_strtoul(buf, &e, 10); 3458 3459 if (*buf && (*e == 0 || *e == '\n')) { 3460 atomic_set(&mddev->max_corr_read_errors, n); 3461 return len; 3462 } 3463 return -EINVAL; 3464 } 3465 3466 static struct md_sysfs_entry max_corr_read_errors = 3467 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, 3468 max_corrected_read_errors_store); 3469 3470 static ssize_t 3471 null_show(mddev_t *mddev, char *page) 3472 { 3473 return -EINVAL; 3474 } 3475 3476 static ssize_t 3477 new_dev_store(mddev_t *mddev, const char *buf, size_t len) 3478 { 3479 /* buf must be %d:%d\n? giving major and minor numbers */ 3480 /* The new device is added to the array. 3481 * If the array has a persistent superblock, we read the 3482 * superblock to initialise info and check validity. 3483 * Otherwise, only checking done is that in bind_rdev_to_array, 3484 * which mainly checks size. 3485 */ 3486 char *e; 3487 int major = simple_strtoul(buf, &e, 10); 3488 int minor; 3489 dev_t dev; 3490 mdk_rdev_t *rdev; 3491 int err; 3492 3493 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 3494 return -EINVAL; 3495 minor = simple_strtoul(e+1, &e, 10); 3496 if (*e && *e != '\n') 3497 return -EINVAL; 3498 dev = MKDEV(major, minor); 3499 if (major != MAJOR(dev) || 3500 minor != MINOR(dev)) 3501 return -EOVERFLOW; 3502 3503 3504 if (mddev->persistent) { 3505 rdev = md_import_device(dev, mddev->major_version, 3506 mddev->minor_version); 3507 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 3508 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 3509 mdk_rdev_t, same_set); 3510 err = super_types[mddev->major_version] 3511 .load_super(rdev, rdev0, mddev->minor_version); 3512 if (err < 0) 3513 goto out; 3514 } 3515 } else if (mddev->external) 3516 rdev = md_import_device(dev, -2, -1); 3517 else 3518 rdev = md_import_device(dev, -1, -1); 3519 3520 if (IS_ERR(rdev)) 3521 return PTR_ERR(rdev); 3522 err = bind_rdev_to_array(rdev, mddev); 3523 out: 3524 if (err) 3525 export_rdev(rdev); 3526 return err ? err : len; 3527 } 3528 3529 static struct md_sysfs_entry md_new_device = 3530 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 3531 3532 static ssize_t 3533 bitmap_store(mddev_t *mddev, const char *buf, size_t len) 3534 { 3535 char *end; 3536 unsigned long chunk, end_chunk; 3537 3538 if (!mddev->bitmap) 3539 goto out; 3540 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... 
(range) */ 3541 while (*buf) { 3542 chunk = end_chunk = simple_strtoul(buf, &end, 0); 3543 if (buf == end) break; 3544 if (*end == '-') { /* range */ 3545 buf = end + 1; 3546 end_chunk = simple_strtoul(buf, &end, 0); 3547 if (buf == end) break; 3548 } 3549 if (*end && !isspace(*end)) break; 3550 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); 3551 buf = skip_spaces(end); 3552 } 3553 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ 3554 out: 3555 return len; 3556 } 3557 3558 static struct md_sysfs_entry md_bitmap = 3559 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 3560 3561 static ssize_t 3562 size_show(mddev_t *mddev, char *page) 3563 { 3564 return sprintf(page, "%llu\n", 3565 (unsigned long long)mddev->dev_sectors / 2); 3566 } 3567 3568 static int update_size(mddev_t *mddev, sector_t num_sectors); 3569 3570 static ssize_t 3571 size_store(mddev_t *mddev, const char *buf, size_t len) 3572 { 3573 /* If array is inactive, we can reduce the component size, but 3574 * not increase it (except from 0). 3575 * If array is active, we can try an on-line resize 3576 */ 3577 sector_t sectors; 3578 int err = strict_blocks_to_sectors(buf, &sectors); 3579 3580 if (err < 0) 3581 return err; 3582 if (mddev->pers) { 3583 err = update_size(mddev, sectors); 3584 md_update_sb(mddev, 1); 3585 } else { 3586 if (mddev->dev_sectors == 0 || 3587 mddev->dev_sectors > sectors) 3588 mddev->dev_sectors = sectors; 3589 else 3590 err = -ENOSPC; 3591 } 3592 return err ? err : len; 3593 } 3594 3595 static struct md_sysfs_entry md_size = 3596 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 3597 3598 3599 /* Metadata version. 3600 * This is one of 3601 * 'none' for arrays with no metadata (good luck...) 3602 * 'external' for arrays with externally managed metadata, 3603 * or N.M for internally known formats 3604 */ 3605 static ssize_t 3606 metadata_show(mddev_t *mddev, char *page) 3607 { 3608 if (mddev->persistent) 3609 return sprintf(page, "%d.%d\n", 3610 mddev->major_version, mddev->minor_version); 3611 else if (mddev->external) 3612 return sprintf(page, "external:%s\n", mddev->metadata_type); 3613 else 3614 return sprintf(page, "none\n"); 3615 } 3616 3617 static ssize_t 3618 metadata_store(mddev_t *mddev, const char *buf, size_t len) 3619 { 3620 int major, minor; 3621 char *e; 3622 /* Changing the details of 'external' metadata is 3623 * always permitted. Otherwise there must be 3624 * no devices attached to the array.
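 * Illustrative values accepted by the parsing below: "none" selects
 * non-persistent metadata, "external:somename" records an externally
 * managed metadata type (the name here is just an example), and "0.90"
 * or "1.2" select an in-kernel superblock format.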
3625 */ 3626 if (mddev->external && strncmp(buf, "external:", 9) == 0) 3627 ; 3628 else if (!list_empty(&mddev->disks)) 3629 return -EBUSY; 3630 3631 if (cmd_match(buf, "none")) { 3632 mddev->persistent = 0; 3633 mddev->external = 0; 3634 mddev->major_version = 0; 3635 mddev->minor_version = 90; 3636 return len; 3637 } 3638 if (strncmp(buf, "external:", 9) == 0) { 3639 size_t namelen = len-9; 3640 if (namelen >= sizeof(mddev->metadata_type)) 3641 namelen = sizeof(mddev->metadata_type)-1; 3642 strncpy(mddev->metadata_type, buf+9, namelen); 3643 mddev->metadata_type[namelen] = 0; 3644 if (namelen && mddev->metadata_type[namelen-1] == '\n') 3645 mddev->metadata_type[--namelen] = 0; 3646 mddev->persistent = 0; 3647 mddev->external = 1; 3648 mddev->major_version = 0; 3649 mddev->minor_version = 90; 3650 return len; 3651 } 3652 major = simple_strtoul(buf, &e, 10); 3653 if (e==buf || *e != '.') 3654 return -EINVAL; 3655 buf = e+1; 3656 minor = simple_strtoul(buf, &e, 10); 3657 if (e==buf || (*e && *e != '\n') ) 3658 return -EINVAL; 3659 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 3660 return -ENOENT; 3661 mddev->major_version = major; 3662 mddev->minor_version = minor; 3663 mddev->persistent = 1; 3664 mddev->external = 0; 3665 return len; 3666 } 3667 3668 static struct md_sysfs_entry md_metadata = 3669 __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 3670 3671 static ssize_t 3672 action_show(mddev_t *mddev, char *page) 3673 { 3674 char *type = "idle"; 3675 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 3676 type = "frozen"; 3677 else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3678 (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { 3679 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 3680 type = "reshape"; 3681 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 3682 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 3683 type = "resync"; 3684 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 3685 type = "check"; 3686 else 3687 type = "repair"; 3688 } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) 3689 type = "recover"; 3690 } 3691 return sprintf(page, "%s\n", type); 3692 } 3693 3694 static ssize_t 3695 action_store(mddev_t *mddev, const char *page, size_t len) 3696 { 3697 if (!mddev->pers || !mddev->pers->sync_request) 3698 return -EINVAL; 3699 3700 if (cmd_match(page, "frozen")) 3701 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3702 else 3703 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3704 3705 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { 3706 if (mddev->sync_thread) { 3707 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3708 md_unregister_thread(mddev->sync_thread); 3709 mddev->sync_thread = NULL; 3710 mddev->recovery = 0; 3711 } 3712 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3713 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 3714 return -EBUSY; 3715 else if (cmd_match(page, "resync")) 3716 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3717 else if (cmd_match(page, "recover")) { 3718 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 3719 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3720 } else if (cmd_match(page, "reshape")) { 3721 int err; 3722 if (mddev->pers->start_reshape == NULL) 3723 return -EINVAL; 3724 err = mddev->pers->start_reshape(mddev); 3725 if (err) 3726 return err; 3727 sysfs_notify(&mddev->kobj, NULL, "degraded"); 3728 } else { 3729 if (cmd_match(page, "check")) 3730 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 3731 
else if (!cmd_match(page, "repair")) 3732 return -EINVAL; 3733 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 3734 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 3735 } 3736 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3737 md_wakeup_thread(mddev->thread); 3738 sysfs_notify_dirent(mddev->sysfs_action); 3739 return len; 3740 } 3741 3742 static ssize_t 3743 mismatch_cnt_show(mddev_t *mddev, char *page) 3744 { 3745 return sprintf(page, "%llu\n", 3746 (unsigned long long) mddev->resync_mismatches); 3747 } 3748 3749 static struct md_sysfs_entry md_scan_mode = 3750 __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 3751 3752 3753 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 3754 3755 static ssize_t 3756 sync_min_show(mddev_t *mddev, char *page) 3757 { 3758 return sprintf(page, "%d (%s)\n", speed_min(mddev), 3759 mddev->sync_speed_min ? "local": "system"); 3760 } 3761 3762 static ssize_t 3763 sync_min_store(mddev_t *mddev, const char *buf, size_t len) 3764 { 3765 int min; 3766 char *e; 3767 if (strncmp(buf, "system", 6)==0) { 3768 mddev->sync_speed_min = 0; 3769 return len; 3770 } 3771 min = simple_strtoul(buf, &e, 10); 3772 if (buf == e || (*e && *e != '\n') || min <= 0) 3773 return -EINVAL; 3774 mddev->sync_speed_min = min; 3775 return len; 3776 } 3777 3778 static struct md_sysfs_entry md_sync_min = 3779 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 3780 3781 static ssize_t 3782 sync_max_show(mddev_t *mddev, char *page) 3783 { 3784 return sprintf(page, "%d (%s)\n", speed_max(mddev), 3785 mddev->sync_speed_max ? "local": "system"); 3786 } 3787 3788 static ssize_t 3789 sync_max_store(mddev_t *mddev, const char *buf, size_t len) 3790 { 3791 int max; 3792 char *e; 3793 if (strncmp(buf, "system", 6)==0) { 3794 mddev->sync_speed_max = 0; 3795 return len; 3796 } 3797 max = simple_strtoul(buf, &e, 10); 3798 if (buf == e || (*e && *e != '\n') || max <= 0) 3799 return -EINVAL; 3800 mddev->sync_speed_max = max; 3801 return len; 3802 } 3803 3804 static struct md_sysfs_entry md_sync_max = 3805 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 3806 3807 static ssize_t 3808 degraded_show(mddev_t *mddev, char *page) 3809 { 3810 return sprintf(page, "%d\n", mddev->degraded); 3811 } 3812 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 3813 3814 static ssize_t 3815 sync_force_parallel_show(mddev_t *mddev, char *page) 3816 { 3817 return sprintf(page, "%d\n", mddev->parallel_resync); 3818 } 3819 3820 static ssize_t 3821 sync_force_parallel_store(mddev_t *mddev, const char *buf, size_t len) 3822 { 3823 long n; 3824 3825 if (strict_strtol(buf, 10, &n)) 3826 return -EINVAL; 3827 3828 if (n != 0 && n != 1) 3829 return -EINVAL; 3830 3831 mddev->parallel_resync = n; 3832 3833 if (mddev->sync_thread) 3834 wake_up(&resync_wait); 3835 3836 return len; 3837 } 3838 3839 /* force parallel resync, even with shared block devices */ 3840 static struct md_sysfs_entry md_sync_force_parallel = 3841 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 3842 sync_force_parallel_show, sync_force_parallel_store); 3843 3844 static ssize_t 3845 sync_speed_show(mddev_t *mddev, char *page) 3846 { 3847 unsigned long resync, dt, db; 3848 if (mddev->curr_resync == 0) 3849 return sprintf(page, "none\n"); 3850 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 3851 dt = (jiffies - mddev->resync_mark) / HZ; 3852 if (!dt) dt++; 3853 db = resync - mddev->resync_mark_cnt; 3854 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 3855 } 
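/* Illustrative worked example (editorial, not from the original source):
 * sync_speed above reports the recent resync rate in KiB/sec. db counts
 * 512-byte sectors completed since the last rate mark and dt is the
 * elapsed time in seconds, so 409600 sectors resynced over the last
 * 20 seconds would be reported as 409600 / 20 / 2 = 10240 (K/sec).
 */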
3856 3857 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 3858 3859 static ssize_t 3860 sync_completed_show(mddev_t *mddev, char *page) 3861 { 3862 unsigned long max_sectors, resync; 3863 3864 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3865 return sprintf(page, "none\n"); 3866 3867 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3868 max_sectors = mddev->resync_max_sectors; 3869 else 3870 max_sectors = mddev->dev_sectors; 3871 3872 resync = mddev->curr_resync_completed; 3873 return sprintf(page, "%lu / %lu\n", resync, max_sectors); 3874 } 3875 3876 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); 3877 3878 static ssize_t 3879 min_sync_show(mddev_t *mddev, char *page) 3880 { 3881 return sprintf(page, "%llu\n", 3882 (unsigned long long)mddev->resync_min); 3883 } 3884 static ssize_t 3885 min_sync_store(mddev_t *mddev, const char *buf, size_t len) 3886 { 3887 unsigned long long min; 3888 if (strict_strtoull(buf, 10, &min)) 3889 return -EINVAL; 3890 if (min > mddev->resync_max) 3891 return -EINVAL; 3892 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3893 return -EBUSY; 3894 3895 /* Must be a multiple of chunk_size */ 3896 if (mddev->chunk_sectors) { 3897 sector_t temp = min; 3898 if (sector_div(temp, mddev->chunk_sectors)) 3899 return -EINVAL; 3900 } 3901 mddev->resync_min = min; 3902 3903 return len; 3904 } 3905 3906 static struct md_sysfs_entry md_min_sync = 3907 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 3908 3909 static ssize_t 3910 max_sync_show(mddev_t *mddev, char *page) 3911 { 3912 if (mddev->resync_max == MaxSector) 3913 return sprintf(page, "max\n"); 3914 else 3915 return sprintf(page, "%llu\n", 3916 (unsigned long long)mddev->resync_max); 3917 } 3918 static ssize_t 3919 max_sync_store(mddev_t *mddev, const char *buf, size_t len) 3920 { 3921 if (strncmp(buf, "max", 3) == 0) 3922 mddev->resync_max = MaxSector; 3923 else { 3924 unsigned long long max; 3925 if (strict_strtoull(buf, 10, &max)) 3926 return -EINVAL; 3927 if (max < mddev->resync_min) 3928 return -EINVAL; 3929 if (max < mddev->resync_max && 3930 mddev->ro == 0 && 3931 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3932 return -EBUSY; 3933 3934 /* Must be a multiple of chunk_size */ 3935 if (mddev->chunk_sectors) { 3936 sector_t temp = max; 3937 if (sector_div(temp, mddev->chunk_sectors)) 3938 return -EINVAL; 3939 } 3940 mddev->resync_max = max; 3941 } 3942 wake_up(&mddev->recovery_wait); 3943 return len; 3944 } 3945 3946 static struct md_sysfs_entry md_max_sync = 3947 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 3948 3949 static ssize_t 3950 suspend_lo_show(mddev_t *mddev, char *page) 3951 { 3952 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 3953 } 3954 3955 static ssize_t 3956 suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) 3957 { 3958 char *e; 3959 unsigned long long new = simple_strtoull(buf, &e, 10); 3960 3961 if (mddev->pers == NULL || 3962 mddev->pers->quiesce == NULL) 3963 return -EINVAL; 3964 if (buf == e || (*e && *e != '\n')) 3965 return -EINVAL; 3966 if (new >= mddev->suspend_hi || 3967 (new > mddev->suspend_lo && new < mddev->suspend_hi)) { 3968 mddev->suspend_lo = new; 3969 mddev->pers->quiesce(mddev, 2); 3970 return len; 3971 } else 3972 return -EINVAL; 3973 } 3974 static struct md_sysfs_entry md_suspend_lo = 3975 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 3976 3977 3978 static ssize_t 3979 suspend_hi_show(mddev_t *mddev, char *page) 
3980 { 3981 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); 3982 } 3983 3984 static ssize_t 3985 suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) 3986 { 3987 char *e; 3988 unsigned long long new = simple_strtoull(buf, &e, 10); 3989 3990 if (mddev->pers == NULL || 3991 mddev->pers->quiesce == NULL) 3992 return -EINVAL; 3993 if (buf == e || (*e && *e != '\n')) 3994 return -EINVAL; 3995 if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) || 3996 (new > mddev->suspend_lo && new > mddev->suspend_hi)) { 3997 mddev->suspend_hi = new; 3998 mddev->pers->quiesce(mddev, 1); 3999 mddev->pers->quiesce(mddev, 0); 4000 return len; 4001 } else 4002 return -EINVAL; 4003 } 4004 static struct md_sysfs_entry md_suspend_hi = 4005 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 4006 4007 static ssize_t 4008 reshape_position_show(mddev_t *mddev, char *page) 4009 { 4010 if (mddev->reshape_position != MaxSector) 4011 return sprintf(page, "%llu\n", 4012 (unsigned long long)mddev->reshape_position); 4013 strcpy(page, "none\n"); 4014 return 5; 4015 } 4016 4017 static ssize_t 4018 reshape_position_store(mddev_t *mddev, const char *buf, size_t len) 4019 { 4020 char *e; 4021 unsigned long long new = simple_strtoull(buf, &e, 10); 4022 if (mddev->pers) 4023 return -EBUSY; 4024 if (buf == e || (*e && *e != '\n')) 4025 return -EINVAL; 4026 mddev->reshape_position = new; 4027 mddev->delta_disks = 0; 4028 mddev->new_level = mddev->level; 4029 mddev->new_layout = mddev->layout; 4030 mddev->new_chunk_sectors = mddev->chunk_sectors; 4031 return len; 4032 } 4033 4034 static struct md_sysfs_entry md_reshape_position = 4035 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 4036 reshape_position_store); 4037 4038 static ssize_t 4039 array_size_show(mddev_t *mddev, char *page) 4040 { 4041 if (mddev->external_size) 4042 return sprintf(page, "%llu\n", 4043 (unsigned long long)mddev->array_sectors/2); 4044 else 4045 return sprintf(page, "default\n"); 4046 } 4047 4048 static ssize_t 4049 array_size_store(mddev_t *mddev, const char *buf, size_t len) 4050 { 4051 sector_t sectors; 4052 4053 if (strncmp(buf, "default", 7) == 0) { 4054 if (mddev->pers) 4055 sectors = mddev->pers->size(mddev, 0, 0); 4056 else 4057 sectors = mddev->array_sectors; 4058 4059 mddev->external_size = 0; 4060 } else { 4061 if (strict_blocks_to_sectors(buf, &sectors) < 0) 4062 return -EINVAL; 4063 if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) 4064 return -E2BIG; 4065 4066 mddev->external_size = 1; 4067 } 4068 4069 mddev->array_sectors = sectors; 4070 set_capacity(mddev->gendisk, mddev->array_sectors); 4071 if (mddev->pers) 4072 revalidate_disk(mddev->gendisk); 4073 4074 return len; 4075 } 4076 4077 static struct md_sysfs_entry md_array_size = 4078 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, 4079 array_size_store); 4080 4081 static struct attribute *md_default_attrs[] = { 4082 &md_level.attr, 4083 &md_layout.attr, 4084 &md_raid_disks.attr, 4085 &md_chunk_size.attr, 4086 &md_size.attr, 4087 &md_resync_start.attr, 4088 &md_metadata.attr, 4089 &md_new_device.attr, 4090 &md_safe_delay.attr, 4091 &md_array_state.attr, 4092 &md_reshape_position.attr, 4093 &md_array_size.attr, 4094 &max_corr_read_errors.attr, 4095 NULL, 4096 }; 4097 4098 static struct attribute *md_redundancy_attrs[] = { 4099 &md_scan_mode.attr, 4100 &md_mismatches.attr, 4101 &md_sync_min.attr, 4102 &md_sync_max.attr, 4103 &md_sync_speed.attr, 4104 &md_sync_force_parallel.attr, 4105
&md_sync_completed.attr, 4106 &md_min_sync.attr, 4107 &md_max_sync.attr, 4108 &md_suspend_lo.attr, 4109 &md_suspend_hi.attr, 4110 &md_bitmap.attr, 4111 &md_degraded.attr, 4112 NULL, 4113 }; 4114 static struct attribute_group md_redundancy_group = { 4115 .name = NULL, 4116 .attrs = md_redundancy_attrs, 4117 }; 4118 4119 4120 static ssize_t 4121 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 4122 { 4123 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 4124 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 4125 ssize_t rv; 4126 4127 if (!entry->show) 4128 return -EIO; 4129 rv = mddev_lock(mddev); 4130 if (!rv) { 4131 rv = entry->show(mddev, page); 4132 mddev_unlock(mddev); 4133 } 4134 return rv; 4135 } 4136 4137 static ssize_t 4138 md_attr_store(struct kobject *kobj, struct attribute *attr, 4139 const char *page, size_t length) 4140 { 4141 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 4142 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 4143 ssize_t rv; 4144 4145 if (!entry->store) 4146 return -EIO; 4147 if (!capable(CAP_SYS_ADMIN)) 4148 return -EACCES; 4149 rv = mddev_lock(mddev); 4150 if (mddev->hold_active == UNTIL_IOCTL) 4151 mddev->hold_active = 0; 4152 if (!rv) { 4153 rv = entry->store(mddev, page, length); 4154 mddev_unlock(mddev); 4155 } 4156 return rv; 4157 } 4158 4159 static void md_free(struct kobject *ko) 4160 { 4161 mddev_t *mddev = container_of(ko, mddev_t, kobj); 4162 4163 if (mddev->sysfs_state) 4164 sysfs_put(mddev->sysfs_state); 4165 4166 if (mddev->gendisk) { 4167 del_gendisk(mddev->gendisk); 4168 put_disk(mddev->gendisk); 4169 } 4170 if (mddev->queue) 4171 blk_cleanup_queue(mddev->queue); 4172 4173 kfree(mddev); 4174 } 4175 4176 static const struct sysfs_ops md_sysfs_ops = { 4177 .show = md_attr_show, 4178 .store = md_attr_store, 4179 }; 4180 static struct kobj_type md_ktype = { 4181 .release = md_free, 4182 .sysfs_ops = &md_sysfs_ops, 4183 .default_attrs = md_default_attrs, 4184 }; 4185 4186 int mdp_major = 0; 4187 4188 static void mddev_delayed_delete(struct work_struct *ws) 4189 { 4190 mddev_t *mddev = container_of(ws, mddev_t, del_work); 4191 4192 sysfs_remove_group(&mddev->kobj, &md_bitmap_group); 4193 kobject_del(&mddev->kobj); 4194 kobject_put(&mddev->kobj); 4195 } 4196 4197 static int md_alloc(dev_t dev, char *name) 4198 { 4199 static DEFINE_MUTEX(disks_mutex); 4200 mddev_t *mddev = mddev_find(dev); 4201 struct gendisk *disk; 4202 int partitioned; 4203 int shift; 4204 int unit; 4205 int error; 4206 4207 if (!mddev) 4208 return -ENODEV; 4209 4210 partitioned = (MAJOR(mddev->unit) != MD_MAJOR); 4211 shift = partitioned ? MdpMinorShift : 0; 4212 unit = MINOR(mddev->unit) >> shift; 4213 4214 /* wait for any previous instance if this device 4215 * to be completed removed (mddev_delayed_delete). 4216 */ 4217 flush_scheduled_work(); 4218 4219 mutex_lock(&disks_mutex); 4220 error = -EEXIST; 4221 if (mddev->gendisk) 4222 goto abort; 4223 4224 if (name) { 4225 /* Need to ensure that 'name' is not a duplicate. 
4226 */ 4227 mddev_t *mddev2; 4228 spin_lock(&all_mddevs_lock); 4229 4230 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 4231 if (mddev2->gendisk && 4232 strcmp(mddev2->gendisk->disk_name, name) == 0) { 4233 spin_unlock(&all_mddevs_lock); 4234 goto abort; 4235 } 4236 spin_unlock(&all_mddevs_lock); 4237 } 4238 4239 error = -ENOMEM; 4240 mddev->queue = blk_alloc_queue(GFP_KERNEL); 4241 if (!mddev->queue) 4242 goto abort; 4243 mddev->queue->queuedata = mddev; 4244 4245 /* Can be unlocked because the queue is new: no concurrency */ 4246 queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue); 4247 4248 blk_queue_make_request(mddev->queue, md_make_request); 4249 4250 disk = alloc_disk(1 << shift); 4251 if (!disk) { 4252 blk_cleanup_queue(mddev->queue); 4253 mddev->queue = NULL; 4254 goto abort; 4255 } 4256 disk->major = MAJOR(mddev->unit); 4257 disk->first_minor = unit << shift; 4258 if (name) 4259 strcpy(disk->disk_name, name); 4260 else if (partitioned) 4261 sprintf(disk->disk_name, "md_d%d", unit); 4262 else 4263 sprintf(disk->disk_name, "md%d", unit); 4264 disk->fops = &md_fops; 4265 disk->private_data = mddev; 4266 disk->queue = mddev->queue; 4267 /* Allow extended partitions. This makes the 4268 * 'mdp' device redundant, but we can't really 4269 * remove it now. 4270 */ 4271 disk->flags |= GENHD_FL_EXT_DEVT; 4272 add_disk(disk); 4273 mddev->gendisk = disk; 4274 error = kobject_init_and_add(&mddev->kobj, &md_ktype, 4275 &disk_to_dev(disk)->kobj, "%s", "md"); 4276 if (error) { 4277 /* This isn't possible, but as kobject_init_and_add is marked 4278 * __must_check, we must do something with the result 4279 */ 4280 printk(KERN_WARNING "md: cannot register %s/md - name in use\n", 4281 disk->disk_name); 4282 error = 0; 4283 } 4284 if (sysfs_create_group(&mddev->kobj, &md_bitmap_group)) 4285 printk(KERN_DEBUG "pointless warning\n"); 4286 abort: 4287 mutex_unlock(&disks_mutex); 4288 if (!error) { 4289 kobject_uevent(&mddev->kobj, KOBJ_ADD); 4290 mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, NULL, "array_state"); 4291 } 4292 mddev_put(mddev); 4293 return error; 4294 } 4295 4296 static struct kobject *md_probe(dev_t dev, int *part, void *data) 4297 { 4298 md_alloc(dev, NULL); 4299 return NULL; 4300 } 4301 4302 static int add_named_array(const char *val, struct kernel_param *kp) 4303 { 4304 /* val must be "md_*" where * is not all digits. 4305 * We allocate an array with a large free minor number, and 4306 * set the name to val. val must not already be an active name. 4307 */ 4308 int len = strlen(val); 4309 char buf[DISK_NAME_LEN]; 4310 4311 while (len && val[len-1] == '\n') 4312 len--; 4313 if (len >= DISK_NAME_LEN) 4314 return -E2BIG; 4315 strlcpy(buf, val, len+1); 4316 if (strncmp(buf, "md_", 3) != 0) 4317 return -EINVAL; 4318 return md_alloc(0, buf); 4319 } 4320 4321 static void md_safemode_timeout(unsigned long data) 4322 { 4323 mddev_t *mddev = (mddev_t *) data; 4324 4325 if (!atomic_read(&mddev->writes_pending)) { 4326 mddev->safemode = 1; 4327 if (mddev->external) 4328 sysfs_notify_dirent(mddev->sysfs_state); 4329 } 4330 md_wakeup_thread(mddev->thread); 4331 } 4332 4333 static int start_dirty_degraded; 4334 4335 static int md_run(mddev_t *mddev) 4336 { 4337 int err; 4338 mdk_rdev_t *rdev; 4339 struct mdk_personality *pers; 4340 4341 if (list_empty(&mddev->disks)) 4342 /* cannot run an array with no devices.. 
*/ 4343 return -EINVAL; 4344 4345 if (mddev->pers) 4346 return -EBUSY; 4347 4348 /* These two calls synchronise us with the 4349 * sysfs_remove_group calls in mddev_unlock, 4350 * so they must have completed. 4351 */ 4352 mutex_lock(&mddev->open_mutex); 4353 mutex_unlock(&mddev->open_mutex); 4354 4355 /* 4356 * Analyze all RAID superblock(s) 4357 */ 4358 if (!mddev->raid_disks) { 4359 if (!mddev->persistent) 4360 return -EINVAL; 4361 analyze_sbs(mddev); 4362 } 4363 4364 if (mddev->level != LEVEL_NONE) 4365 request_module("md-level-%d", mddev->level); 4366 else if (mddev->clevel[0]) 4367 request_module("md-%s", mddev->clevel); 4368 4369 /* 4370 * Drop all container device buffers, from now on 4371 * the only valid external interface is through the md 4372 * device. 4373 */ 4374 list_for_each_entry(rdev, &mddev->disks, same_set) { 4375 if (test_bit(Faulty, &rdev->flags)) 4376 continue; 4377 sync_blockdev(rdev->bdev); 4378 invalidate_bdev(rdev->bdev); 4379 4380 /* perform some consistency tests on the device. 4381 * We don't want the data to overlap the metadata, 4382 * Internal Bitmap issues have been handled elsewhere. 4383 */ 4384 if (rdev->data_offset < rdev->sb_start) { 4385 if (mddev->dev_sectors && 4386 rdev->data_offset + mddev->dev_sectors 4387 > rdev->sb_start) { 4388 printk("md: %s: data overlaps metadata\n", 4389 mdname(mddev)); 4390 return -EINVAL; 4391 } 4392 } else { 4393 if (rdev->sb_start + rdev->sb_size/512 4394 > rdev->data_offset) { 4395 printk("md: %s: metadata overlaps data\n", 4396 mdname(mddev)); 4397 return -EINVAL; 4398 } 4399 } 4400 sysfs_notify_dirent(rdev->sysfs_state); 4401 } 4402 4403 spin_lock(&pers_lock); 4404 pers = find_pers(mddev->level, mddev->clevel); 4405 if (!pers || !try_module_get(pers->owner)) { 4406 spin_unlock(&pers_lock); 4407 if (mddev->level != LEVEL_NONE) 4408 printk(KERN_WARNING "md: personality for level %d is not loaded!\n", 4409 mddev->level); 4410 else 4411 printk(KERN_WARNING "md: personality for level %s is not loaded!\n", 4412 mddev->clevel); 4413 return -EINVAL; 4414 } 4415 mddev->pers = pers; 4416 spin_unlock(&pers_lock); 4417 if (mddev->level != pers->level) { 4418 mddev->level = pers->level; 4419 mddev->new_level = pers->level; 4420 } 4421 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 4422 4423 if (mddev->reshape_position != MaxSector && 4424 pers->start_reshape == NULL) { 4425 /* This personality cannot handle reshaping... */ 4426 mddev->pers = NULL; 4427 module_put(pers->owner); 4428 return -EINVAL; 4429 } 4430 4431 if (pers->sync_request) { 4432 /* Warn if this is a potentially silly 4433 * configuration. 
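 * An illustrative case: assembling a RAID1 from /dev/sda1 and /dev/sda2
 * trips the check below, because both rdevs share the same bd_contains
 * (the whole disk), so one physical drive failure would still take out
 * every copy of the data.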
4434 */ 4435 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 4436 mdk_rdev_t *rdev2; 4437 int warned = 0; 4438 4439 list_for_each_entry(rdev, &mddev->disks, same_set) 4440 list_for_each_entry(rdev2, &mddev->disks, same_set) { 4441 if (rdev < rdev2 && 4442 rdev->bdev->bd_contains == 4443 rdev2->bdev->bd_contains) { 4444 printk(KERN_WARNING 4445 "%s: WARNING: %s appears to be" 4446 " on the same physical disk as" 4447 " %s.\n", 4448 mdname(mddev), 4449 bdevname(rdev->bdev,b), 4450 bdevname(rdev2->bdev,b2)); 4451 warned = 1; 4452 } 4453 } 4454 4455 if (warned) 4456 printk(KERN_WARNING 4457 "True protection against single-disk" 4458 " failure might be compromised.\n"); 4459 } 4460 4461 mddev->recovery = 0; 4462 /* may be over-ridden by personality */ 4463 mddev->resync_max_sectors = mddev->dev_sectors; 4464 4465 mddev->barriers_work = 1; 4466 mddev->ok_start_degraded = start_dirty_degraded; 4467 4468 if (start_readonly && mddev->ro == 0) 4469 mddev->ro = 2; /* read-only, but switch on first write */ 4470 4471 err = mddev->pers->run(mddev); 4472 if (err) 4473 printk(KERN_ERR "md: pers->run() failed ...\n"); 4474 else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) { 4475 WARN_ONCE(!mddev->external_size, "%s: default size too small," 4476 " but 'external_size' not in effect?\n", __func__); 4477 printk(KERN_ERR 4478 "md: invalid array_size %llu > default size %llu\n", 4479 (unsigned long long)mddev->array_sectors / 2, 4480 (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2); 4481 err = -EINVAL; 4482 mddev->pers->stop(mddev); 4483 } 4484 if (err == 0 && mddev->pers->sync_request) { 4485 err = bitmap_create(mddev); 4486 if (err) { 4487 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 4488 mdname(mddev), err); 4489 mddev->pers->stop(mddev); 4490 } 4491 } 4492 if (err) { 4493 module_put(mddev->pers->owner); 4494 mddev->pers = NULL; 4495 bitmap_destroy(mddev); 4496 return err; 4497 } 4498 if (mddev->pers->sync_request) { 4499 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 4500 printk(KERN_WARNING 4501 "md: cannot register extra attributes for %s\n", 4502 mdname(mddev)); 4503 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action"); 4504 } else if (mddev->ro == 2) /* auto-readonly not meaningful */ 4505 mddev->ro = 0; 4506 4507 atomic_set(&mddev->writes_pending,0); 4508 atomic_set(&mddev->max_corr_read_errors, 4509 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); 4510 mddev->safemode = 0; 4511 mddev->safemode_timer.function = md_safemode_timeout; 4512 mddev->safemode_timer.data = (unsigned long) mddev; 4513 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 4514 mddev->in_sync = 1; 4515 4516 list_for_each_entry(rdev, &mddev->disks, same_set) 4517 if (rdev->raid_disk >= 0) { 4518 char nm[20]; 4519 sprintf(nm, "rd%d", rdev->raid_disk); 4520 if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) 4521 printk("md: cannot register %s for %s\n", 4522 nm, mdname(mddev)); 4523 } 4524 4525 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4526 4527 if (mddev->flags) 4528 md_update_sb(mddev, 0); 4529 4530 md_wakeup_thread(mddev->thread); 4531 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 4532 4533 md_new_event(mddev); 4534 sysfs_notify_dirent(mddev->sysfs_state); 4535 if (mddev->sysfs_action) 4536 sysfs_notify_dirent(mddev->sysfs_action); 4537 sysfs_notify(&mddev->kobj, NULL, "degraded"); 4538 return 0; 4539 } 4540 4541 static int do_md_run(mddev_t *mddev) 4542 { 4543 int err; 4544 4545 err = md_run(mddev); 4546 if (err) 4547 goto out; 4548 
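	/*
	 * Worked example for the capacity set below (illustrative numbers):
	 * array_sectors counts 512-byte sectors, the same unit
	 * set_capacity() expects, so array_sectors == 2097152 presents a
	 * 1 GiB block device (2097152 * 512 = 1073741824 bytes).
	 */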
4549 set_capacity(mddev->gendisk, mddev->array_sectors); 4550 revalidate_disk(mddev->gendisk); 4551 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 4552 out: 4553 return err; 4554 } 4555 4556 static int restart_array(mddev_t *mddev) 4557 { 4558 struct gendisk *disk = mddev->gendisk; 4559 4560 /* Complain if it has no devices */ 4561 if (list_empty(&mddev->disks)) 4562 return -ENXIO; 4563 if (!mddev->pers) 4564 return -EINVAL; 4565 if (!mddev->ro) 4566 return -EBUSY; 4567 mddev->safemode = 0; 4568 mddev->ro = 0; 4569 set_disk_ro(disk, 0); 4570 printk(KERN_INFO "md: %s switched to read-write mode.\n", 4571 mdname(mddev)); 4572 /* Kick recovery or resync if necessary */ 4573 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4574 md_wakeup_thread(mddev->thread); 4575 md_wakeup_thread(mddev->sync_thread); 4576 sysfs_notify_dirent(mddev->sysfs_state); 4577 return 0; 4578 } 4579 4580 /* similar to deny_write_access, but accounts for our holding a reference 4581 * to the file ourselves */ 4582 static int deny_bitmap_write_access(struct file * file) 4583 { 4584 struct inode *inode = file->f_mapping->host; 4585 4586 spin_lock(&inode->i_lock); 4587 if (atomic_read(&inode->i_writecount) > 1) { 4588 spin_unlock(&inode->i_lock); 4589 return -ETXTBSY; 4590 } 4591 atomic_set(&inode->i_writecount, -1); 4592 spin_unlock(&inode->i_lock); 4593 4594 return 0; 4595 } 4596 4597 void restore_bitmap_write_access(struct file *file) 4598 { 4599 struct inode *inode = file->f_mapping->host; 4600 4601 spin_lock(&inode->i_lock); 4602 atomic_set(&inode->i_writecount, 1); 4603 spin_unlock(&inode->i_lock); 4604 } 4605 4606 static void md_clean(mddev_t *mddev) 4607 { 4608 mddev->array_sectors = 0; 4609 mddev->external_size = 0; 4610 mddev->dev_sectors = 0; 4611 mddev->raid_disks = 0; 4612 mddev->recovery_cp = 0; 4613 mddev->resync_min = 0; 4614 mddev->resync_max = MaxSector; 4615 mddev->reshape_position = MaxSector; 4616 mddev->external = 0; 4617 mddev->persistent = 0; 4618 mddev->level = LEVEL_NONE; 4619 mddev->clevel[0] = 0; 4620 mddev->flags = 0; 4621 mddev->ro = 0; 4622 mddev->metadata_type[0] = 0; 4623 mddev->chunk_sectors = 0; 4624 mddev->ctime = mddev->utime = 0; 4625 mddev->layout = 0; 4626 mddev->max_disks = 0; 4627 mddev->events = 0; 4628 mddev->can_decrease_events = 0; 4629 mddev->delta_disks = 0; 4630 mddev->new_level = LEVEL_NONE; 4631 mddev->new_layout = 0; 4632 mddev->new_chunk_sectors = 0; 4633 mddev->curr_resync = 0; 4634 mddev->resync_mismatches = 0; 4635 mddev->suspend_lo = mddev->suspend_hi = 0; 4636 mddev->sync_speed_min = mddev->sync_speed_max = 0; 4637 mddev->recovery = 0; 4638 mddev->in_sync = 0; 4639 mddev->degraded = 0; 4640 mddev->barriers_work = 0; 4641 mddev->safemode = 0; 4642 mddev->bitmap_info.offset = 0; 4643 mddev->bitmap_info.default_offset = 0; 4644 mddev->bitmap_info.chunksize = 0; 4645 mddev->bitmap_info.daemon_sleep = 0; 4646 mddev->bitmap_info.max_write_behind = 0; 4647 } 4648 4649 static void md_stop_writes(mddev_t *mddev) 4650 { 4651 if (mddev->sync_thread) { 4652 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4653 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4654 md_unregister_thread(mddev->sync_thread); 4655 mddev->sync_thread = NULL; 4656 } 4657 4658 del_timer_sync(&mddev->safemode_timer); 4659 4660 bitmap_flush(mddev); 4661 md_super_wait(mddev); 4662 4663 if (!mddev->in_sync || mddev->flags) { 4664 /* mark array as shutdown cleanly */ 4665 mddev->in_sync = 1; 4666 md_update_sb(mddev, 1); 4667 } 4668 } 4669 4670 static void md_stop(mddev_t *mddev) 4671 { 4672 
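	/*
	 * Tear-down order: stop writes and the sync thread first
	 * (md_stop_writes), then let the personality shut itself down;
	 * removal of the redundancy sysfs group is deferred through
	 * ->to_remove so that mddev_unlock() can do it once the lock is
	 * dropped, and only then is the personality module reference put.
	 */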
md_stop_writes(mddev); 4673 4674 mddev->pers->stop(mddev); 4675 if (mddev->pers->sync_request && mddev->to_remove == NULL) 4676 mddev->to_remove = &md_redundancy_group; 4677 module_put(mddev->pers->owner); 4678 mddev->pers = NULL; 4679 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4680 } 4681 4682 static int md_set_readonly(mddev_t *mddev, int is_open) 4683 { 4684 int err = 0; 4685 mutex_lock(&mddev->open_mutex); 4686 if (atomic_read(&mddev->openers) > is_open) { 4687 printk("md: %s still in use.\n",mdname(mddev)); 4688 err = -EBUSY; 4689 goto out; 4690 } 4691 if (mddev->pers) { 4692 md_stop_writes(mddev); 4693 4694 err = -ENXIO; 4695 if (mddev->ro==1) 4696 goto out; 4697 mddev->ro = 1; 4698 set_disk_ro(mddev->gendisk, 1); 4699 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4700 sysfs_notify_dirent(mddev->sysfs_state); 4701 err = 0; 4702 } 4703 out: 4704 mutex_unlock(&mddev->open_mutex); 4705 return err; 4706 } 4707 4708 /* mode: 4709 * 0 - completely stop and dis-assemble array 4710 * 2 - stop but do not disassemble array 4711 */ 4712 static int do_md_stop(mddev_t * mddev, int mode, int is_open) 4713 { 4714 int err = 0; 4715 struct gendisk *disk = mddev->gendisk; 4716 mdk_rdev_t *rdev; 4717 4718 mutex_lock(&mddev->open_mutex); 4719 if (atomic_read(&mddev->openers) > is_open) { 4720 printk("md: %s still in use.\n",mdname(mddev)); 4721 err = -EBUSY; 4722 } else if (mddev->pers) { 4723 4724 if (mddev->ro) 4725 set_disk_ro(disk, 0); 4726 4727 md_stop(mddev); 4728 mddev->queue->merge_bvec_fn = NULL; 4729 mddev->queue->unplug_fn = NULL; 4730 mddev->queue->backing_dev_info.congested_fn = NULL; 4731 4732 /* tell userspace to handle 'inactive' */ 4733 sysfs_notify_dirent(mddev->sysfs_state); 4734 4735 list_for_each_entry(rdev, &mddev->disks, same_set) 4736 if (rdev->raid_disk >= 0) { 4737 char nm[20]; 4738 sprintf(nm, "rd%d", rdev->raid_disk); 4739 sysfs_remove_link(&mddev->kobj, nm); 4740 } 4741 4742 set_capacity(disk, 0); 4743 revalidate_disk(disk); 4744 4745 if (mddev->ro) 4746 mddev->ro = 0; 4747 4748 err = 0; 4749 } 4750 mutex_unlock(&mddev->open_mutex); 4751 if (err) 4752 return err; 4753 /* 4754 * Free resources if final stop 4755 */ 4756 if (mode == 0) { 4757 4758 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 4759 4760 bitmap_destroy(mddev); 4761 if (mddev->bitmap_info.file) { 4762 restore_bitmap_write_access(mddev->bitmap_info.file); 4763 fput(mddev->bitmap_info.file); 4764 mddev->bitmap_info.file = NULL; 4765 } 4766 mddev->bitmap_info.offset = 0; 4767 4768 export_array(mddev); 4769 4770 md_clean(mddev); 4771 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 4772 if (mddev->hold_active == UNTIL_STOP) 4773 mddev->hold_active = 0; 4774 4775 } 4776 err = 0; 4777 blk_integrity_unregister(disk); 4778 md_new_event(mddev); 4779 sysfs_notify_dirent(mddev->sysfs_state); 4780 return err; 4781 } 4782 4783 #ifndef MODULE 4784 static void autorun_array(mddev_t *mddev) 4785 { 4786 mdk_rdev_t *rdev; 4787 int err; 4788 4789 if (list_empty(&mddev->disks)) 4790 return; 4791 4792 printk(KERN_INFO "md: running: "); 4793 4794 list_for_each_entry(rdev, &mddev->disks, same_set) { 4795 char b[BDEVNAME_SIZE]; 4796 printk("<%s>", bdevname(rdev->bdev,b)); 4797 } 4798 printk("\n"); 4799 4800 err = do_md_run(mddev); 4801 if (err) { 4802 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 4803 do_md_stop(mddev, 0, 0); 4804 } 4805 } 4806 4807 /* 4808 * lets try to run arrays based on all disks that have arrived 4809 * until now. 
(those are in pending_raid_disks) 4810 * 4811 * the method: pick the first pending disk, collect all disks with 4812 * the same UUID, remove all from the pending list and put them into 4813 * the 'same_array' list. Then order this list based on superblock 4814 * update time (freshest comes first), kick out 'old' disks and 4815 * compare superblocks. If everything's fine then run it. 4816 * 4817 * If "unit" is allocated, then bump its reference count 4818 */ 4819 static void autorun_devices(int part) 4820 { 4821 mdk_rdev_t *rdev0, *rdev, *tmp; 4822 mddev_t *mddev; 4823 char b[BDEVNAME_SIZE]; 4824 4825 printk(KERN_INFO "md: autorun ...\n"); 4826 while (!list_empty(&pending_raid_disks)) { 4827 int unit; 4828 dev_t dev; 4829 LIST_HEAD(candidates); 4830 rdev0 = list_entry(pending_raid_disks.next, 4831 mdk_rdev_t, same_set); 4832 4833 printk(KERN_INFO "md: considering %s ...\n", 4834 bdevname(rdev0->bdev,b)); 4835 INIT_LIST_HEAD(&candidates); 4836 rdev_for_each_list(rdev, tmp, &pending_raid_disks) 4837 if (super_90_load(rdev, rdev0, 0) >= 0) { 4838 printk(KERN_INFO "md: adding %s ...\n", 4839 bdevname(rdev->bdev,b)); 4840 list_move(&rdev->same_set, &candidates); 4841 } 4842 /* 4843 * now we have a set of devices, with all of them having 4844 * mostly sane superblocks. It's time to allocate the 4845 * mddev. 4846 */ 4847 if (part) { 4848 dev = MKDEV(mdp_major, 4849 rdev0->preferred_minor << MdpMinorShift); 4850 unit = MINOR(dev) >> MdpMinorShift; 4851 } else { 4852 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 4853 unit = MINOR(dev); 4854 } 4855 if (rdev0->preferred_minor != unit) { 4856 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 4857 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 4858 break; 4859 } 4860 4861 md_probe(dev, NULL, NULL); 4862 mddev = mddev_find(dev); 4863 if (!mddev || !mddev->gendisk) { 4864 if (mddev) 4865 mddev_put(mddev); 4866 printk(KERN_ERR 4867 "md: cannot allocate memory for md drive.\n"); 4868 break; 4869 } 4870 if (mddev_lock(mddev)) 4871 printk(KERN_WARNING "md: %s locked, cannot run\n", 4872 mdname(mddev)); 4873 else if (mddev->raid_disks || mddev->major_version 4874 || !list_empty(&mddev->disks)) { 4875 printk(KERN_WARNING 4876 "md: %s already running, cannot run %s\n", 4877 mdname(mddev), bdevname(rdev0->bdev,b)); 4878 mddev_unlock(mddev); 4879 } else { 4880 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 4881 mddev->persistent = 1; 4882 rdev_for_each_list(rdev, tmp, &candidates) { 4883 list_del_init(&rdev->same_set); 4884 if (bind_rdev_to_array(rdev, mddev)) 4885 export_rdev(rdev); 4886 } 4887 autorun_array(mddev); 4888 mddev_unlock(mddev); 4889 } 4890 /* on success, candidates will be empty, on error 4891 * it won't... 4892 */ 4893 rdev_for_each_list(rdev, tmp, &candidates) { 4894 list_del_init(&rdev->same_set); 4895 export_rdev(rdev); 4896 } 4897 mddev_put(mddev); 4898 } 4899 printk(KERN_INFO "md: ... 
autorun DONE.\n"); 4900 } 4901 #endif /* !MODULE */ 4902 4903 static int get_version(void __user * arg) 4904 { 4905 mdu_version_t ver; 4906 4907 ver.major = MD_MAJOR_VERSION; 4908 ver.minor = MD_MINOR_VERSION; 4909 ver.patchlevel = MD_PATCHLEVEL_VERSION; 4910 4911 if (copy_to_user(arg, &ver, sizeof(ver))) 4912 return -EFAULT; 4913 4914 return 0; 4915 } 4916 4917 static int get_array_info(mddev_t * mddev, void __user * arg) 4918 { 4919 mdu_array_info_t info; 4920 int nr,working,insync,failed,spare; 4921 mdk_rdev_t *rdev; 4922 4923 nr=working=insync=failed=spare=0; 4924 list_for_each_entry(rdev, &mddev->disks, same_set) { 4925 nr++; 4926 if (test_bit(Faulty, &rdev->flags)) 4927 failed++; 4928 else { 4929 working++; 4930 if (test_bit(In_sync, &rdev->flags)) 4931 insync++; 4932 else 4933 spare++; 4934 } 4935 } 4936 4937 info.major_version = mddev->major_version; 4938 info.minor_version = mddev->minor_version; 4939 info.patch_version = MD_PATCHLEVEL_VERSION; 4940 info.ctime = mddev->ctime; 4941 info.level = mddev->level; 4942 info.size = mddev->dev_sectors / 2; 4943 if (info.size != mddev->dev_sectors / 2) /* overflow */ 4944 info.size = -1; 4945 info.nr_disks = nr; 4946 info.raid_disks = mddev->raid_disks; 4947 info.md_minor = mddev->md_minor; 4948 info.not_persistent= !mddev->persistent; 4949 4950 info.utime = mddev->utime; 4951 info.state = 0; 4952 if (mddev->in_sync) 4953 info.state = (1<<MD_SB_CLEAN); 4954 if (mddev->bitmap && mddev->bitmap_info.offset) 4955 info.state = (1<<MD_SB_BITMAP_PRESENT); 4956 info.active_disks = insync; 4957 info.working_disks = working; 4958 info.failed_disks = failed; 4959 info.spare_disks = spare; 4960 4961 info.layout = mddev->layout; 4962 info.chunk_size = mddev->chunk_sectors << 9; 4963 4964 if (copy_to_user(arg, &info, sizeof(info))) 4965 return -EFAULT; 4966 4967 return 0; 4968 } 4969 4970 static int get_bitmap_file(mddev_t * mddev, void __user * arg) 4971 { 4972 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 4973 char *ptr, *buf = NULL; 4974 int err = -ENOMEM; 4975 4976 if (md_allow_write(mddev)) 4977 file = kmalloc(sizeof(*file), GFP_NOIO); 4978 else 4979 file = kmalloc(sizeof(*file), GFP_KERNEL); 4980 4981 if (!file) 4982 goto out; 4983 4984 /* bitmap disabled, zero the first byte and copy out */ 4985 if (!mddev->bitmap || !mddev->bitmap->file) { 4986 file->pathname[0] = '\0'; 4987 goto copy_out; 4988 } 4989 4990 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); 4991 if (!buf) 4992 goto out; 4993 4994 ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname)); 4995 if (IS_ERR(ptr)) 4996 goto out; 4997 4998 strcpy(file->pathname, ptr); 4999 5000 copy_out: 5001 err = 0; 5002 if (copy_to_user(arg, file, sizeof(*file))) 5003 err = -EFAULT; 5004 out: 5005 kfree(buf); 5006 kfree(file); 5007 return err; 5008 } 5009 5010 static int get_disk_info(mddev_t * mddev, void __user * arg) 5011 { 5012 mdu_disk_info_t info; 5013 mdk_rdev_t *rdev; 5014 5015 if (copy_from_user(&info, arg, sizeof(info))) 5016 return -EFAULT; 5017 5018 rdev = find_rdev_nr(mddev, info.number); 5019 if (rdev) { 5020 info.major = MAJOR(rdev->bdev->bd_dev); 5021 info.minor = MINOR(rdev->bdev->bd_dev); 5022 info.raid_disk = rdev->raid_disk; 5023 info.state = 0; 5024 if (test_bit(Faulty, &rdev->flags)) 5025 info.state |= (1<<MD_DISK_FAULTY); 5026 else if (test_bit(In_sync, &rdev->flags)) { 5027 info.state |= (1<<MD_DISK_ACTIVE); 5028 info.state |= (1<<MD_DISK_SYNC); 5029 } 5030 if (test_bit(WriteMostly, &rdev->flags)) 5031 info.state |= (1<<MD_DISK_WRITEMOSTLY); 
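		/*
		 * Worked example: with the md_p.h bit numbers, an active
		 * in-sync member reports state =
		 * (1<<MD_DISK_ACTIVE) | (1<<MD_DISK_SYNC) = 0x6, plus the
		 * MD_DISK_WRITEMOSTLY bit when set above.  A slot with no
		 * device is reported below as MD_DISK_REMOVED with
		 * major/minor 0 and raid_disk -1.
		 */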
5032 } else { 5033 info.major = info.minor = 0; 5034 info.raid_disk = -1; 5035 info.state = (1<<MD_DISK_REMOVED); 5036 } 5037 5038 if (copy_to_user(arg, &info, sizeof(info))) 5039 return -EFAULT; 5040 5041 return 0; 5042 } 5043 5044 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) 5045 { 5046 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 5047 mdk_rdev_t *rdev; 5048 dev_t dev = MKDEV(info->major,info->minor); 5049 5050 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 5051 return -EOVERFLOW; 5052 5053 if (!mddev->raid_disks) { 5054 int err; 5055 /* expecting a device which has a superblock */ 5056 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 5057 if (IS_ERR(rdev)) { 5058 printk(KERN_WARNING 5059 "md: md_import_device returned %ld\n", 5060 PTR_ERR(rdev)); 5061 return PTR_ERR(rdev); 5062 } 5063 if (!list_empty(&mddev->disks)) { 5064 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 5065 mdk_rdev_t, same_set); 5066 err = super_types[mddev->major_version] 5067 .load_super(rdev, rdev0, mddev->minor_version); 5068 if (err < 0) { 5069 printk(KERN_WARNING 5070 "md: %s has different UUID to %s\n", 5071 bdevname(rdev->bdev,b), 5072 bdevname(rdev0->bdev,b2)); 5073 export_rdev(rdev); 5074 return -EINVAL; 5075 } 5076 } 5077 err = bind_rdev_to_array(rdev, mddev); 5078 if (err) 5079 export_rdev(rdev); 5080 return err; 5081 } 5082 5083 /* 5084 * add_new_disk can be used once the array is assembled 5085 * to add "hot spares". They must already have a superblock 5086 * written 5087 */ 5088 if (mddev->pers) { 5089 int err; 5090 if (!mddev->pers->hot_add_disk) { 5091 printk(KERN_WARNING 5092 "%s: personality does not support diskops!\n", 5093 mdname(mddev)); 5094 return -EINVAL; 5095 } 5096 if (mddev->persistent) 5097 rdev = md_import_device(dev, mddev->major_version, 5098 mddev->minor_version); 5099 else 5100 rdev = md_import_device(dev, -1, -1); 5101 if (IS_ERR(rdev)) { 5102 printk(KERN_WARNING 5103 "md: md_import_device returned %ld\n", 5104 PTR_ERR(rdev)); 5105 return PTR_ERR(rdev); 5106 } 5107 /* set save_raid_disk if appropriate */ 5108 if (!mddev->persistent) { 5109 if (info->state & (1<<MD_DISK_SYNC) && 5110 info->raid_disk < mddev->raid_disks) 5111 rdev->raid_disk = info->raid_disk; 5112 else 5113 rdev->raid_disk = -1; 5114 } else 5115 super_types[mddev->major_version]. 5116 validate_super(mddev, rdev); 5117 rdev->saved_raid_disk = rdev->raid_disk; 5118 5119 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 5120 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 5121 set_bit(WriteMostly, &rdev->flags); 5122 else 5123 clear_bit(WriteMostly, &rdev->flags); 5124 5125 rdev->raid_disk = -1; 5126 err = bind_rdev_to_array(rdev, mddev); 5127 if (!err && !mddev->pers->hot_remove_disk) { 5128 /* If there is hot_add_disk but no hot_remove_disk 5129 * then added disks for geometry changes, 5130 * and should be added immediately. 5131 */ 5132 super_types[mddev->major_version]. 
5133 validate_super(mddev, rdev); 5134 err = mddev->pers->hot_add_disk(mddev, rdev); 5135 if (err) 5136 unbind_rdev_from_array(rdev); 5137 } 5138 if (err) 5139 export_rdev(rdev); 5140 else 5141 sysfs_notify_dirent(rdev->sysfs_state); 5142 5143 md_update_sb(mddev, 1); 5144 if (mddev->degraded) 5145 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 5146 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5147 md_wakeup_thread(mddev->thread); 5148 return err; 5149 } 5150 5151 /* otherwise, add_new_disk is only allowed 5152 * for major_version==0 superblocks 5153 */ 5154 if (mddev->major_version != 0) { 5155 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", 5156 mdname(mddev)); 5157 return -EINVAL; 5158 } 5159 5160 if (!(info->state & (1<<MD_DISK_FAULTY))) { 5161 int err; 5162 rdev = md_import_device(dev, -1, 0); 5163 if (IS_ERR(rdev)) { 5164 printk(KERN_WARNING 5165 "md: error, md_import_device() returned %ld\n", 5166 PTR_ERR(rdev)); 5167 return PTR_ERR(rdev); 5168 } 5169 rdev->desc_nr = info->number; 5170 if (info->raid_disk < mddev->raid_disks) 5171 rdev->raid_disk = info->raid_disk; 5172 else 5173 rdev->raid_disk = -1; 5174 5175 if (rdev->raid_disk < mddev->raid_disks) 5176 if (info->state & (1<<MD_DISK_SYNC)) 5177 set_bit(In_sync, &rdev->flags); 5178 5179 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 5180 set_bit(WriteMostly, &rdev->flags); 5181 5182 if (!mddev->persistent) { 5183 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 5184 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; 5185 } else 5186 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 5187 rdev->sectors = rdev->sb_start; 5188 5189 err = bind_rdev_to_array(rdev, mddev); 5190 if (err) { 5191 export_rdev(rdev); 5192 return err; 5193 } 5194 } 5195 5196 return 0; 5197 } 5198 5199 static int hot_remove_disk(mddev_t * mddev, dev_t dev) 5200 { 5201 char b[BDEVNAME_SIZE]; 5202 mdk_rdev_t *rdev; 5203 5204 rdev = find_rdev(mddev, dev); 5205 if (!rdev) 5206 return -ENXIO; 5207 5208 if (rdev->raid_disk >= 0) 5209 goto busy; 5210 5211 kick_rdev_from_array(rdev); 5212 md_update_sb(mddev, 1); 5213 md_new_event(mddev); 5214 5215 return 0; 5216 busy: 5217 printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", 5218 bdevname(rdev->bdev,b), mdname(mddev)); 5219 return -EBUSY; 5220 } 5221 5222 static int hot_add_disk(mddev_t * mddev, dev_t dev) 5223 { 5224 char b[BDEVNAME_SIZE]; 5225 int err; 5226 mdk_rdev_t *rdev; 5227 5228 if (!mddev->pers) 5229 return -ENODEV; 5230 5231 if (mddev->major_version != 0) { 5232 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 5233 " version-0 superblocks.\n", 5234 mdname(mddev)); 5235 return -EINVAL; 5236 } 5237 if (!mddev->pers->hot_add_disk) { 5238 printk(KERN_WARNING 5239 "%s: personality does not support diskops!\n", 5240 mdname(mddev)); 5241 return -EINVAL; 5242 } 5243 5244 rdev = md_import_device(dev, -1, 0); 5245 if (IS_ERR(rdev)) { 5246 printk(KERN_WARNING 5247 "md: error, md_import_device() returned %ld\n", 5248 PTR_ERR(rdev)); 5249 return -EINVAL; 5250 } 5251 5252 if (mddev->persistent) 5253 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 5254 else 5255 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; 5256 5257 rdev->sectors = rdev->sb_start; 5258 5259 if (test_bit(Faulty, &rdev->flags)) { 5260 printk(KERN_WARNING 5261 "md: can not hot-add faulty %s disk to %s!\n", 5262 bdevname(rdev->bdev,b), mdname(mddev)); 5263 err = -EINVAL; 5264 goto abort_export; 5265 } 5266 clear_bit(In_sync, &rdev->flags); 5267 rdev->desc_nr = -1; 5268 rdev->saved_raid_disk = -1; 5269 err = 
bind_rdev_to_array(rdev, mddev); 5270 if (err) 5271 goto abort_export; 5272 5273 /* 5274 * The rest should better be atomic, we can have disk failures 5275 * noticed in interrupt contexts ... 5276 */ 5277 5278 rdev->raid_disk = -1; 5279 5280 md_update_sb(mddev, 1); 5281 5282 /* 5283 * Kick recovery, maybe this spare has to be added to the 5284 * array immediately. 5285 */ 5286 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5287 md_wakeup_thread(mddev->thread); 5288 md_new_event(mddev); 5289 return 0; 5290 5291 abort_export: 5292 export_rdev(rdev); 5293 return err; 5294 } 5295 5296 static int set_bitmap_file(mddev_t *mddev, int fd) 5297 { 5298 int err; 5299 5300 if (mddev->pers) { 5301 if (!mddev->pers->quiesce) 5302 return -EBUSY; 5303 if (mddev->recovery || mddev->sync_thread) 5304 return -EBUSY; 5305 /* we should be able to change the bitmap.. */ 5306 } 5307 5308 5309 if (fd >= 0) { 5310 if (mddev->bitmap) 5311 return -EEXIST; /* cannot add when bitmap is present */ 5312 mddev->bitmap_info.file = fget(fd); 5313 5314 if (mddev->bitmap_info.file == NULL) { 5315 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 5316 mdname(mddev)); 5317 return -EBADF; 5318 } 5319 5320 err = deny_bitmap_write_access(mddev->bitmap_info.file); 5321 if (err) { 5322 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 5323 mdname(mddev)); 5324 fput(mddev->bitmap_info.file); 5325 mddev->bitmap_info.file = NULL; 5326 return err; 5327 } 5328 mddev->bitmap_info.offset = 0; /* file overrides offset */ 5329 } else if (mddev->bitmap == NULL) 5330 return -ENOENT; /* cannot remove what isn't there */ 5331 err = 0; 5332 if (mddev->pers) { 5333 mddev->pers->quiesce(mddev, 1); 5334 if (fd >= 0) 5335 err = bitmap_create(mddev); 5336 if (fd < 0 || err) { 5337 bitmap_destroy(mddev); 5338 fd = -1; /* make sure to put the file */ 5339 } 5340 mddev->pers->quiesce(mddev, 0); 5341 } 5342 if (fd < 0) { 5343 if (mddev->bitmap_info.file) { 5344 restore_bitmap_write_access(mddev->bitmap_info.file); 5345 fput(mddev->bitmap_info.file); 5346 } 5347 mddev->bitmap_info.file = NULL; 5348 } 5349 5350 return err; 5351 } 5352 5353 /* 5354 * set_array_info is used two different ways 5355 * The original usage is when creating a new array. 5356 * In this usage, raid_disks is > 0 and it together with 5357 * level, size, not_persistent,layout,chunksize determine the 5358 * shape of the array. 5359 * This will always create an array with a type-0.90.0 superblock. 5360 * The newer usage is when assembling an array. 5361 * In this case raid_disks will be 0, and the major_version field is 5362 * use to determine which style super-blocks are to be found on the devices. 5363 * The minor and patch _version numbers are also kept incase the 5364 * super_block handler wishes to interpret them. 5365 */ 5366 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) 5367 { 5368 5369 if (info->raid_disks == 0) { 5370 /* just setting version number for superblock loading */ 5371 if (info->major_version < 0 || 5372 info->major_version >= ARRAY_SIZE(super_types) || 5373 super_types[info->major_version].name == NULL) { 5374 /* maybe try to auto-load a module? 
*/ 5375 printk(KERN_INFO 5376 "md: superblock version %d not known\n", 5377 info->major_version); 5378 return -EINVAL; 5379 } 5380 mddev->major_version = info->major_version; 5381 mddev->minor_version = info->minor_version; 5382 mddev->patch_version = info->patch_version; 5383 mddev->persistent = !info->not_persistent; 5384 /* ensure mddev_put doesn't delete this now that there 5385 * is some minimal configuration. 5386 */ 5387 mddev->ctime = get_seconds(); 5388 return 0; 5389 } 5390 mddev->major_version = MD_MAJOR_VERSION; 5391 mddev->minor_version = MD_MINOR_VERSION; 5392 mddev->patch_version = MD_PATCHLEVEL_VERSION; 5393 mddev->ctime = get_seconds(); 5394 5395 mddev->level = info->level; 5396 mddev->clevel[0] = 0; 5397 mddev->dev_sectors = 2 * (sector_t)info->size; 5398 mddev->raid_disks = info->raid_disks; 5399 /* don't set md_minor, it is determined by which /dev/md* was 5400 * openned 5401 */ 5402 if (info->state & (1<<MD_SB_CLEAN)) 5403 mddev->recovery_cp = MaxSector; 5404 else 5405 mddev->recovery_cp = 0; 5406 mddev->persistent = ! info->not_persistent; 5407 mddev->external = 0; 5408 5409 mddev->layout = info->layout; 5410 mddev->chunk_sectors = info->chunk_size >> 9; 5411 5412 mddev->max_disks = MD_SB_DISKS; 5413 5414 if (mddev->persistent) 5415 mddev->flags = 0; 5416 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5417 5418 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 5419 mddev->bitmap_info.offset = 0; 5420 5421 mddev->reshape_position = MaxSector; 5422 5423 /* 5424 * Generate a 128 bit UUID 5425 */ 5426 get_random_bytes(mddev->uuid, 16); 5427 5428 mddev->new_level = mddev->level; 5429 mddev->new_chunk_sectors = mddev->chunk_sectors; 5430 mddev->new_layout = mddev->layout; 5431 mddev->delta_disks = 0; 5432 5433 return 0; 5434 } 5435 5436 void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors) 5437 { 5438 WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__); 5439 5440 if (mddev->external_size) 5441 return; 5442 5443 mddev->array_sectors = array_sectors; 5444 } 5445 EXPORT_SYMBOL(md_set_array_sectors); 5446 5447 static int update_size(mddev_t *mddev, sector_t num_sectors) 5448 { 5449 mdk_rdev_t *rdev; 5450 int rv; 5451 int fit = (num_sectors == 0); 5452 5453 if (mddev->pers->resize == NULL) 5454 return -EINVAL; 5455 /* The "num_sectors" is the number of sectors of each device that 5456 * is used. This can only make sense for arrays with redundancy. 5457 * linear and raid0 always use whatever space is available. We can only 5458 * consider changing this number if no resync or reconstruction is 5459 * happening, and if the new size is acceptable. It must fit before the 5460 * sb_start or, if that is <data_offset, it must fit before the size 5461 * of each device. If num_sectors is zero, we find the largest size 5462 * that fits. 5463 5464 */ 5465 if (mddev->sync_thread) 5466 return -EBUSY; 5467 if (mddev->bitmap) 5468 /* Sorry, cannot grow a bitmap yet, just remove it, 5469 * grow, and re-add. 
5470 */ 5471 return -EBUSY; 5472 list_for_each_entry(rdev, &mddev->disks, same_set) { 5473 sector_t avail = rdev->sectors; 5474 5475 if (fit && (num_sectors == 0 || num_sectors > avail)) 5476 num_sectors = avail; 5477 if (avail < num_sectors) 5478 return -ENOSPC; 5479 } 5480 rv = mddev->pers->resize(mddev, num_sectors); 5481 if (!rv) 5482 revalidate_disk(mddev->gendisk); 5483 return rv; 5484 } 5485 5486 static int update_raid_disks(mddev_t *mddev, int raid_disks) 5487 { 5488 int rv; 5489 /* change the number of raid disks */ 5490 if (mddev->pers->check_reshape == NULL) 5491 return -EINVAL; 5492 if (raid_disks <= 0 || 5493 (mddev->max_disks && raid_disks >= mddev->max_disks)) 5494 return -EINVAL; 5495 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 5496 return -EBUSY; 5497 mddev->delta_disks = raid_disks - mddev->raid_disks; 5498 5499 rv = mddev->pers->check_reshape(mddev); 5500 return rv; 5501 } 5502 5503 5504 /* 5505 * update_array_info is used to change the configuration of an 5506 * on-line array. 5507 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 5508 * fields in the info are checked against the array. 5509 * Any differences that cannot be handled will cause an error. 5510 * Normally, only one change can be managed at a time. 5511 */ 5512 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) 5513 { 5514 int rv = 0; 5515 int cnt = 0; 5516 int state = 0; 5517 5518 /* calculate expected state,ignoring low bits */ 5519 if (mddev->bitmap && mddev->bitmap_info.offset) 5520 state |= (1 << MD_SB_BITMAP_PRESENT); 5521 5522 if (mddev->major_version != info->major_version || 5523 mddev->minor_version != info->minor_version || 5524 /* mddev->patch_version != info->patch_version || */ 5525 mddev->ctime != info->ctime || 5526 mddev->level != info->level || 5527 /* mddev->layout != info->layout || */ 5528 !mddev->persistent != info->not_persistent|| 5529 mddev->chunk_sectors != info->chunk_size >> 9 || 5530 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 5531 ((state^info->state) & 0xfffffe00) 5532 ) 5533 return -EINVAL; 5534 /* Check there is only one change */ 5535 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 5536 cnt++; 5537 if (mddev->raid_disks != info->raid_disks) 5538 cnt++; 5539 if (mddev->layout != info->layout) 5540 cnt++; 5541 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) 5542 cnt++; 5543 if (cnt == 0) 5544 return 0; 5545 if (cnt > 1) 5546 return -EINVAL; 5547 5548 if (mddev->layout != info->layout) { 5549 /* Change layout 5550 * we don't need to do anything at the md level, the 5551 * personality will take care of it all. 
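 * Illustrative example: switching just the layout of a running array
 * only needs the personality's check_reshape() below, whereas a request
 * that also changes size or raid_disks in the same call was already
 * rejected above with -EINVAL, since only one change is handled at a
 * time.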
5552 */ 5553 if (mddev->pers->check_reshape == NULL) 5554 return -EINVAL; 5555 else { 5556 mddev->new_layout = info->layout; 5557 rv = mddev->pers->check_reshape(mddev); 5558 if (rv) 5559 mddev->new_layout = mddev->layout; 5560 return rv; 5561 } 5562 } 5563 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 5564 rv = update_size(mddev, (sector_t)info->size * 2); 5565 5566 if (mddev->raid_disks != info->raid_disks) 5567 rv = update_raid_disks(mddev, info->raid_disks); 5568 5569 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 5570 if (mddev->pers->quiesce == NULL) 5571 return -EINVAL; 5572 if (mddev->recovery || mddev->sync_thread) 5573 return -EBUSY; 5574 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 5575 /* add the bitmap */ 5576 if (mddev->bitmap) 5577 return -EEXIST; 5578 if (mddev->bitmap_info.default_offset == 0) 5579 return -EINVAL; 5580 mddev->bitmap_info.offset = 5581 mddev->bitmap_info.default_offset; 5582 mddev->pers->quiesce(mddev, 1); 5583 rv = bitmap_create(mddev); 5584 if (rv) 5585 bitmap_destroy(mddev); 5586 mddev->pers->quiesce(mddev, 0); 5587 } else { 5588 /* remove the bitmap */ 5589 if (!mddev->bitmap) 5590 return -ENOENT; 5591 if (mddev->bitmap->file) 5592 return -EINVAL; 5593 mddev->pers->quiesce(mddev, 1); 5594 bitmap_destroy(mddev); 5595 mddev->pers->quiesce(mddev, 0); 5596 mddev->bitmap_info.offset = 0; 5597 } 5598 } 5599 md_update_sb(mddev, 1); 5600 return rv; 5601 } 5602 5603 static int set_disk_faulty(mddev_t *mddev, dev_t dev) 5604 { 5605 mdk_rdev_t *rdev; 5606 5607 if (mddev->pers == NULL) 5608 return -ENODEV; 5609 5610 rdev = find_rdev(mddev, dev); 5611 if (!rdev) 5612 return -ENODEV; 5613 5614 md_error(mddev, rdev); 5615 return 0; 5616 } 5617 5618 /* 5619 * We have a problem here : there is no easy way to give a CHS 5620 * virtual geometry. We currently pretend that we have a 2 heads 5621 * 4 sectors (with a BIG number of cylinders...). This drives 5622 * dosfs just mad... 
;-) 5623 */ 5624 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 5625 { 5626 mddev_t *mddev = bdev->bd_disk->private_data; 5627 5628 geo->heads = 2; 5629 geo->sectors = 4; 5630 geo->cylinders = mddev->array_sectors / 8; 5631 return 0; 5632 } 5633 5634 static int md_ioctl(struct block_device *bdev, fmode_t mode, 5635 unsigned int cmd, unsigned long arg) 5636 { 5637 int err = 0; 5638 void __user *argp = (void __user *)arg; 5639 mddev_t *mddev = NULL; 5640 int ro; 5641 5642 if (!capable(CAP_SYS_ADMIN)) 5643 return -EACCES; 5644 5645 /* 5646 * Commands dealing with the RAID driver but not any 5647 * particular array: 5648 */ 5649 switch (cmd) 5650 { 5651 case RAID_VERSION: 5652 err = get_version(argp); 5653 goto done; 5654 5655 case PRINT_RAID_DEBUG: 5656 err = 0; 5657 md_print_devices(); 5658 goto done; 5659 5660 #ifndef MODULE 5661 case RAID_AUTORUN: 5662 err = 0; 5663 autostart_arrays(arg); 5664 goto done; 5665 #endif 5666 default:; 5667 } 5668 5669 /* 5670 * Commands creating/starting a new array: 5671 */ 5672 5673 mddev = bdev->bd_disk->private_data; 5674 5675 if (!mddev) { 5676 BUG(); 5677 goto abort; 5678 } 5679 5680 err = mddev_lock(mddev); 5681 if (err) { 5682 printk(KERN_INFO 5683 "md: ioctl lock interrupted, reason %d, cmd %d\n", 5684 err, cmd); 5685 goto abort; 5686 } 5687 5688 switch (cmd) 5689 { 5690 case SET_ARRAY_INFO: 5691 { 5692 mdu_array_info_t info; 5693 if (!arg) 5694 memset(&info, 0, sizeof(info)); 5695 else if (copy_from_user(&info, argp, sizeof(info))) { 5696 err = -EFAULT; 5697 goto abort_unlock; 5698 } 5699 if (mddev->pers) { 5700 err = update_array_info(mddev, &info); 5701 if (err) { 5702 printk(KERN_WARNING "md: couldn't update" 5703 " array info. %d\n", err); 5704 goto abort_unlock; 5705 } 5706 goto done_unlock; 5707 } 5708 if (!list_empty(&mddev->disks)) { 5709 printk(KERN_WARNING 5710 "md: array %s already has disks!\n", 5711 mdname(mddev)); 5712 err = -EBUSY; 5713 goto abort_unlock; 5714 } 5715 if (mddev->raid_disks) { 5716 printk(KERN_WARNING 5717 "md: array %s already initialised!\n", 5718 mdname(mddev)); 5719 err = -EBUSY; 5720 goto abort_unlock; 5721 } 5722 err = set_array_info(mddev, &info); 5723 if (err) { 5724 printk(KERN_WARNING "md: couldn't set" 5725 " array info. 
%d\n", err); 5726 goto abort_unlock; 5727 } 5728 } 5729 goto done_unlock; 5730 5731 default:; 5732 } 5733 5734 /* 5735 * Commands querying/configuring an existing array: 5736 */ 5737 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 5738 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 5739 if ((!mddev->raid_disks && !mddev->external) 5740 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 5741 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 5742 && cmd != GET_BITMAP_FILE) { 5743 err = -ENODEV; 5744 goto abort_unlock; 5745 } 5746 5747 /* 5748 * Commands even a read-only array can execute: 5749 */ 5750 switch (cmd) 5751 { 5752 case GET_ARRAY_INFO: 5753 err = get_array_info(mddev, argp); 5754 goto done_unlock; 5755 5756 case GET_BITMAP_FILE: 5757 err = get_bitmap_file(mddev, argp); 5758 goto done_unlock; 5759 5760 case GET_DISK_INFO: 5761 err = get_disk_info(mddev, argp); 5762 goto done_unlock; 5763 5764 case RESTART_ARRAY_RW: 5765 err = restart_array(mddev); 5766 goto done_unlock; 5767 5768 case STOP_ARRAY: 5769 err = do_md_stop(mddev, 0, 1); 5770 goto done_unlock; 5771 5772 case STOP_ARRAY_RO: 5773 err = md_set_readonly(mddev, 1); 5774 goto done_unlock; 5775 5776 case BLKROSET: 5777 if (get_user(ro, (int __user *)(arg))) { 5778 err = -EFAULT; 5779 goto done_unlock; 5780 } 5781 err = -EINVAL; 5782 5783 /* if the bdev is going readonly the value of mddev->ro 5784 * does not matter, no writes are coming 5785 */ 5786 if (ro) 5787 goto done_unlock; 5788 5789 /* are we are already prepared for writes? */ 5790 if (mddev->ro != 1) 5791 goto done_unlock; 5792 5793 /* transitioning to readauto need only happen for 5794 * arrays that call md_write_start 5795 */ 5796 if (mddev->pers) { 5797 err = restart_array(mddev); 5798 if (err == 0) { 5799 mddev->ro = 2; 5800 set_disk_ro(mddev->gendisk, 0); 5801 } 5802 } 5803 goto done_unlock; 5804 } 5805 5806 /* 5807 * The remaining ioctls are changing the state of the 5808 * superblock, so we do not allow them on read-only arrays. 5809 * However non-MD ioctls (e.g. get-size) will still come through 5810 * here and hit the 'default' below, so only disallow 5811 * 'md' ioctls, and switch to rw mode if started auto-readonly. 
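 * Concrete example (illustrative): ADD_NEW_DISK is an MD_MAJOR-type
 * ioctl, so on an auto-readonly array (ro == 2) the check below first
 * switches the array to read-write, whereas on an explicitly read-only
 * array (ro == 1) it fails with -EROFS.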
5812 */ 5813 if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) { 5814 if (mddev->ro == 2) { 5815 mddev->ro = 0; 5816 sysfs_notify_dirent(mddev->sysfs_state); 5817 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5818 md_wakeup_thread(mddev->thread); 5819 } else { 5820 err = -EROFS; 5821 goto abort_unlock; 5822 } 5823 } 5824 5825 switch (cmd) 5826 { 5827 case ADD_NEW_DISK: 5828 { 5829 mdu_disk_info_t info; 5830 if (copy_from_user(&info, argp, sizeof(info))) 5831 err = -EFAULT; 5832 else 5833 err = add_new_disk(mddev, &info); 5834 goto done_unlock; 5835 } 5836 5837 case HOT_REMOVE_DISK: 5838 err = hot_remove_disk(mddev, new_decode_dev(arg)); 5839 goto done_unlock; 5840 5841 case HOT_ADD_DISK: 5842 err = hot_add_disk(mddev, new_decode_dev(arg)); 5843 goto done_unlock; 5844 5845 case SET_DISK_FAULTY: 5846 err = set_disk_faulty(mddev, new_decode_dev(arg)); 5847 goto done_unlock; 5848 5849 case RUN_ARRAY: 5850 err = do_md_run(mddev); 5851 goto done_unlock; 5852 5853 case SET_BITMAP_FILE: 5854 err = set_bitmap_file(mddev, (int)arg); 5855 goto done_unlock; 5856 5857 default: 5858 err = -EINVAL; 5859 goto abort_unlock; 5860 } 5861 5862 done_unlock: 5863 abort_unlock: 5864 if (mddev->hold_active == UNTIL_IOCTL && 5865 err != -EINVAL) 5866 mddev->hold_active = 0; 5867 mddev_unlock(mddev); 5868 5869 return err; 5870 done: 5871 if (err) 5872 MD_BUG(); 5873 abort: 5874 return err; 5875 } 5876 #ifdef CONFIG_COMPAT 5877 static int md_compat_ioctl(struct block_device *bdev, fmode_t mode, 5878 unsigned int cmd, unsigned long arg) 5879 { 5880 switch (cmd) { 5881 case HOT_REMOVE_DISK: 5882 case HOT_ADD_DISK: 5883 case SET_DISK_FAULTY: 5884 case SET_BITMAP_FILE: 5885 /* These take in integer arg, do not convert */ 5886 break; 5887 default: 5888 arg = (unsigned long)compat_ptr(arg); 5889 break; 5890 } 5891 5892 return md_ioctl(bdev, mode, cmd, arg); 5893 } 5894 #endif /* CONFIG_COMPAT */ 5895 5896 static int md_open(struct block_device *bdev, fmode_t mode) 5897 { 5898 /* 5899 * Succeed if we can lock the mddev, which confirms that 5900 * it isn't being stopped right now. 5901 */ 5902 mddev_t *mddev = mddev_find(bdev->bd_dev); 5903 int err; 5904 5905 if (mddev->gendisk != bdev->bd_disk) { 5906 /* we are racing with mddev_put which is discarding this 5907 * bd_disk. 5908 */ 5909 mddev_put(mddev); 5910 /* Wait until bdev->bd_disk is definitely gone */ 5911 flush_scheduled_work(); 5912 /* Then retry the open from the top */ 5913 return -ERESTARTSYS; 5914 } 5915 BUG_ON(mddev != bdev->bd_disk->private_data); 5916 5917 if ((err = mutex_lock_interruptible(&mddev->open_mutex))) 5918 goto out; 5919 5920 err = 0; 5921 atomic_inc(&mddev->openers); 5922 mutex_unlock(&mddev->open_mutex); 5923 5924 check_disk_size_change(mddev->gendisk, bdev); 5925 out: 5926 return err; 5927 } 5928 5929 static int md_release(struct gendisk *disk, fmode_t mode) 5930 { 5931 mddev_t *mddev = disk->private_data; 5932 5933 BUG_ON(!mddev); 5934 atomic_dec(&mddev->openers); 5935 mddev_put(mddev); 5936 5937 return 0; 5938 } 5939 static const struct block_device_operations md_fops = 5940 { 5941 .owner = THIS_MODULE, 5942 .open = md_open, 5943 .release = md_release, 5944 .ioctl = md_ioctl, 5945 #ifdef CONFIG_COMPAT 5946 .compat_ioctl = md_compat_ioctl, 5947 #endif 5948 .getgeo = md_getgeo, 5949 }; 5950 5951 static int md_thread(void * arg) 5952 { 5953 mdk_thread_t *thread = arg; 5954 5955 /* 5956 * md_thread is a 'system-thread', it's priority should be very 5957 * high. We avoid resource deadlocks individually in each 5958 * raid personality. 
(RAID5 does preallocation) We also use RR and 5959 * the very same RT priority as kswapd, thus we will never get 5960 * into a priority inversion deadlock. 5961 * 5962 * we definitely have to have equal or higher priority than 5963 * bdflush, otherwise bdflush will deadlock if there are too 5964 * many dirty RAID5 blocks. 5965 */ 5966 5967 allow_signal(SIGKILL); 5968 while (!kthread_should_stop()) { 5969 5970 /* We need to wait INTERRUPTIBLE so that 5971 * we don't add to the load-average. 5972 * That means we need to be sure no signals are 5973 * pending 5974 */ 5975 if (signal_pending(current)) 5976 flush_signals(current); 5977 5978 wait_event_interruptible_timeout 5979 (thread->wqueue, 5980 test_bit(THREAD_WAKEUP, &thread->flags) 5981 || kthread_should_stop(), 5982 thread->timeout); 5983 5984 clear_bit(THREAD_WAKEUP, &thread->flags); 5985 5986 thread->run(thread->mddev); 5987 } 5988 5989 return 0; 5990 } 5991 5992 void md_wakeup_thread(mdk_thread_t *thread) 5993 { 5994 if (thread) { 5995 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm); 5996 set_bit(THREAD_WAKEUP, &thread->flags); 5997 wake_up(&thread->wqueue); 5998 } 5999 } 6000 6001 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, 6002 const char *name) 6003 { 6004 mdk_thread_t *thread; 6005 6006 thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL); 6007 if (!thread) 6008 return NULL; 6009 6010 init_waitqueue_head(&thread->wqueue); 6011 6012 thread->run = run; 6013 thread->mddev = mddev; 6014 thread->timeout = MAX_SCHEDULE_TIMEOUT; 6015 thread->tsk = kthread_run(md_thread, thread, 6016 "%s_%s", 6017 mdname(thread->mddev), 6018 name ?: mddev->pers->name); 6019 if (IS_ERR(thread->tsk)) { 6020 kfree(thread); 6021 return NULL; 6022 } 6023 return thread; 6024 } 6025 6026 void md_unregister_thread(mdk_thread_t *thread) 6027 { 6028 if (!thread) 6029 return; 6030 dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 6031 6032 kthread_stop(thread->tsk); 6033 kfree(thread); 6034 } 6035 6036 void md_error(mddev_t *mddev, mdk_rdev_t *rdev) 6037 { 6038 if (!mddev) { 6039 MD_BUG(); 6040 return; 6041 } 6042 6043 if (!rdev || test_bit(Faulty, &rdev->flags)) 6044 return; 6045 6046 if (mddev->external) 6047 set_bit(Blocked, &rdev->flags); 6048 /* 6049 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 6050 mdname(mddev), 6051 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), 6052 __builtin_return_address(0),__builtin_return_address(1), 6053 __builtin_return_address(2),__builtin_return_address(3)); 6054 */ 6055 if (!mddev->pers) 6056 return; 6057 if (!mddev->pers->error_handler) 6058 return; 6059 mddev->pers->error_handler(mddev,rdev); 6060 if (mddev->degraded) 6061 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6062 sysfs_notify_dirent(rdev->sysfs_state); 6063 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6064 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6065 md_wakeup_thread(mddev->thread); 6066 md_new_event_inintr(mddev); 6067 } 6068 6069 /* seq_file implementation /proc/mdstat */ 6070 6071 static void status_unused(struct seq_file *seq) 6072 { 6073 int i = 0; 6074 mdk_rdev_t *rdev; 6075 6076 seq_printf(seq, "unused devices: "); 6077 6078 list_for_each_entry(rdev, &pending_raid_disks, same_set) { 6079 char b[BDEVNAME_SIZE]; 6080 i++; 6081 seq_printf(seq, "%s ", 6082 bdevname(rdev->bdev,b)); 6083 } 6084 if (!i) 6085 seq_printf(seq, "<none>"); 6086 6087 seq_printf(seq, "\n"); 6088 } 6089 6090 6091 static void status_resync(struct seq_file *seq, mddev_t * mddev) 6092 { 6093 sector_t 
max_sectors, resync, res; 6094 unsigned long dt, db; 6095 sector_t rt; 6096 int scale; 6097 unsigned int per_milli; 6098 6099 resync = mddev->curr_resync - atomic_read(&mddev->recovery_active); 6100 6101 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 6102 max_sectors = mddev->resync_max_sectors; 6103 else 6104 max_sectors = mddev->dev_sectors; 6105 6106 /* 6107 * Should not happen. 6108 */ 6109 if (!max_sectors) { 6110 MD_BUG(); 6111 return; 6112 } 6113 /* Pick 'scale' such that (resync>>scale)*1000 will fit 6114 * in a sector_t, and (max_sectors>>scale) will fit in a 6115 * u32, as those are the requirements for sector_div. 6116 * Thus 'scale' must be at least 10 6117 */ 6118 scale = 10; 6119 if (sizeof(sector_t) > sizeof(unsigned long)) { 6120 while ( max_sectors/2 > (1ULL<<(scale+32))) 6121 scale++; 6122 } 6123 res = (resync>>scale)*1000; 6124 sector_div(res, (u32)((max_sectors>>scale)+1)); 6125 6126 per_milli = res; 6127 { 6128 int i, x = per_milli/50, y = 20-x; 6129 seq_printf(seq, "["); 6130 for (i = 0; i < x; i++) 6131 seq_printf(seq, "="); 6132 seq_printf(seq, ">"); 6133 for (i = 0; i < y; i++) 6134 seq_printf(seq, "."); 6135 seq_printf(seq, "] "); 6136 } 6137 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 6138 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 6139 "reshape" : 6140 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 6141 "check" : 6142 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 6143 "resync" : "recovery"))), 6144 per_milli/10, per_milli % 10, 6145 (unsigned long long) resync/2, 6146 (unsigned long long) max_sectors/2); 6147 6148 /* 6149 * dt: time from mark until now 6150 * db: blocks written from mark until now 6151 * rt: remaining time 6152 * 6153 * rt is a sector_t, so could be 32bit or 64bit. 6154 * So we divide before multiply in case it is 32bit and close 6155 * to the limit. 6156 * We scale the divisor (db) by 32 to avoid losing precision 6157 * near the end of resync when the number of remaining sectors 6158 * is close to 'db'. 6159 * We then divide rt by 32 after multiplying by dt to compensate. 6160 * The '+1' avoids division by zero if db is very small.
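 *
 * Purely illustrative worked example (numbers invented): with
 * 1,500,000 sectors still to go, db = 600,000 sectors written since
 * the last mark and dt = 30 seconds, rt becomes
 * 1,500,000 / (600,000/32 + 1) = 79, then 79 * 30 = 2370, and
 * 2370 >> 5 = 74 seconds remaining - within rounding of the exact
 * (1,500,000 / 600,000) * 30 = 75.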
6161 */ 6162 dt = ((jiffies - mddev->resync_mark) / HZ); 6163 if (!dt) dt++; 6164 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) 6165 - mddev->resync_mark_cnt; 6166 6167 rt = max_sectors - resync; /* number of remaining sectors */ 6168 sector_div(rt, db/32+1); 6169 rt *= dt; 6170 rt >>= 5; 6171 6172 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, 6173 ((unsigned long)rt % 60)/6); 6174 6175 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 6176 } 6177 6178 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 6179 { 6180 struct list_head *tmp; 6181 loff_t l = *pos; 6182 mddev_t *mddev; 6183 6184 if (l >= 0x10000) 6185 return NULL; 6186 if (!l--) 6187 /* header */ 6188 return (void*)1; 6189 6190 spin_lock(&all_mddevs_lock); 6191 list_for_each(tmp,&all_mddevs) 6192 if (!l--) { 6193 mddev = list_entry(tmp, mddev_t, all_mddevs); 6194 mddev_get(mddev); 6195 spin_unlock(&all_mddevs_lock); 6196 return mddev; 6197 } 6198 spin_unlock(&all_mddevs_lock); 6199 if (!l--) 6200 return (void*)2;/* tail */ 6201 return NULL; 6202 } 6203 6204 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 6205 { 6206 struct list_head *tmp; 6207 mddev_t *next_mddev, *mddev = v; 6208 6209 ++*pos; 6210 if (v == (void*)2) 6211 return NULL; 6212 6213 spin_lock(&all_mddevs_lock); 6214 if (v == (void*)1) 6215 tmp = all_mddevs.next; 6216 else 6217 tmp = mddev->all_mddevs.next; 6218 if (tmp != &all_mddevs) 6219 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); 6220 else { 6221 next_mddev = (void*)2; 6222 *pos = 0x10000; 6223 } 6224 spin_unlock(&all_mddevs_lock); 6225 6226 if (v != (void*)1) 6227 mddev_put(mddev); 6228 return next_mddev; 6229 6230 } 6231 6232 static void md_seq_stop(struct seq_file *seq, void *v) 6233 { 6234 mddev_t *mddev = v; 6235 6236 if (mddev && v != (void*)1 && v != (void*)2) 6237 mddev_put(mddev); 6238 } 6239 6240 struct mdstat_info { 6241 int event; 6242 }; 6243 6244 static int md_seq_show(struct seq_file *seq, void *v) 6245 { 6246 mddev_t *mddev = v; 6247 sector_t sectors; 6248 mdk_rdev_t *rdev; 6249 struct mdstat_info *mi = seq->private; 6250 struct bitmap *bitmap; 6251 6252 if (v == (void*)1) { 6253 struct mdk_personality *pers; 6254 seq_printf(seq, "Personalities : "); 6255 spin_lock(&pers_lock); 6256 list_for_each_entry(pers, &pers_list, list) 6257 seq_printf(seq, "[%s] ", pers->name); 6258 6259 spin_unlock(&pers_lock); 6260 seq_printf(seq, "\n"); 6261 mi->event = atomic_read(&md_event_count); 6262 return 0; 6263 } 6264 if (v == (void*)2) { 6265 status_unused(seq); 6266 return 0; 6267 } 6268 6269 if (mddev_lock(mddev) < 0) 6270 return -EINTR; 6271 6272 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 6273 seq_printf(seq, "%s : %sactive", mdname(mddev), 6274 mddev->pers ? 
"" : "in"); 6275 if (mddev->pers) { 6276 if (mddev->ro==1) 6277 seq_printf(seq, " (read-only)"); 6278 if (mddev->ro==2) 6279 seq_printf(seq, " (auto-read-only)"); 6280 seq_printf(seq, " %s", mddev->pers->name); 6281 } 6282 6283 sectors = 0; 6284 list_for_each_entry(rdev, &mddev->disks, same_set) { 6285 char b[BDEVNAME_SIZE]; 6286 seq_printf(seq, " %s[%d]", 6287 bdevname(rdev->bdev,b), rdev->desc_nr); 6288 if (test_bit(WriteMostly, &rdev->flags)) 6289 seq_printf(seq, "(W)"); 6290 if (test_bit(Faulty, &rdev->flags)) { 6291 seq_printf(seq, "(F)"); 6292 continue; 6293 } else if (rdev->raid_disk < 0) 6294 seq_printf(seq, "(S)"); /* spare */ 6295 sectors += rdev->sectors; 6296 } 6297 6298 if (!list_empty(&mddev->disks)) { 6299 if (mddev->pers) 6300 seq_printf(seq, "\n %llu blocks", 6301 (unsigned long long) 6302 mddev->array_sectors / 2); 6303 else 6304 seq_printf(seq, "\n %llu blocks", 6305 (unsigned long long)sectors / 2); 6306 } 6307 if (mddev->persistent) { 6308 if (mddev->major_version != 0 || 6309 mddev->minor_version != 90) { 6310 seq_printf(seq," super %d.%d", 6311 mddev->major_version, 6312 mddev->minor_version); 6313 } 6314 } else if (mddev->external) 6315 seq_printf(seq, " super external:%s", 6316 mddev->metadata_type); 6317 else 6318 seq_printf(seq, " super non-persistent"); 6319 6320 if (mddev->pers) { 6321 mddev->pers->status(seq, mddev); 6322 seq_printf(seq, "\n "); 6323 if (mddev->pers->sync_request) { 6324 if (mddev->curr_resync > 2) { 6325 status_resync(seq, mddev); 6326 seq_printf(seq, "\n "); 6327 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 6328 seq_printf(seq, "\tresync=DELAYED\n "); 6329 else if (mddev->recovery_cp < MaxSector) 6330 seq_printf(seq, "\tresync=PENDING\n "); 6331 } 6332 } else 6333 seq_printf(seq, "\n "); 6334 6335 if ((bitmap = mddev->bitmap)) { 6336 unsigned long chunk_kb; 6337 unsigned long flags; 6338 spin_lock_irqsave(&bitmap->lock, flags); 6339 chunk_kb = mddev->bitmap_info.chunksize >> 10; 6340 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " 6341 "%lu%s chunk", 6342 bitmap->pages - bitmap->missing_pages, 6343 bitmap->pages, 6344 (bitmap->pages - bitmap->missing_pages) 6345 << (PAGE_SHIFT - 10), 6346 chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize, 6347 chunk_kb ? 
"KB" : "B"); 6348 if (bitmap->file) { 6349 seq_printf(seq, ", file: "); 6350 seq_path(seq, &bitmap->file->f_path, " \t\n"); 6351 } 6352 6353 seq_printf(seq, "\n"); 6354 spin_unlock_irqrestore(&bitmap->lock, flags); 6355 } 6356 6357 seq_printf(seq, "\n"); 6358 } 6359 mddev_unlock(mddev); 6360 6361 return 0; 6362 } 6363 6364 static const struct seq_operations md_seq_ops = { 6365 .start = md_seq_start, 6366 .next = md_seq_next, 6367 .stop = md_seq_stop, 6368 .show = md_seq_show, 6369 }; 6370 6371 static int md_seq_open(struct inode *inode, struct file *file) 6372 { 6373 int error; 6374 struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL); 6375 if (mi == NULL) 6376 return -ENOMEM; 6377 6378 error = seq_open(file, &md_seq_ops); 6379 if (error) 6380 kfree(mi); 6381 else { 6382 struct seq_file *p = file->private_data; 6383 p->private = mi; 6384 mi->event = atomic_read(&md_event_count); 6385 } 6386 return error; 6387 } 6388 6389 static unsigned int mdstat_poll(struct file *filp, poll_table *wait) 6390 { 6391 struct seq_file *m = filp->private_data; 6392 struct mdstat_info *mi = m->private; 6393 int mask; 6394 6395 poll_wait(filp, &md_event_waiters, wait); 6396 6397 /* always allow read */ 6398 mask = POLLIN | POLLRDNORM; 6399 6400 if (mi->event != atomic_read(&md_event_count)) 6401 mask |= POLLERR | POLLPRI; 6402 return mask; 6403 } 6404 6405 static const struct file_operations md_seq_fops = { 6406 .owner = THIS_MODULE, 6407 .open = md_seq_open, 6408 .read = seq_read, 6409 .llseek = seq_lseek, 6410 .release = seq_release_private, 6411 .poll = mdstat_poll, 6412 }; 6413 6414 int register_md_personality(struct mdk_personality *p) 6415 { 6416 spin_lock(&pers_lock); 6417 list_add_tail(&p->list, &pers_list); 6418 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level); 6419 spin_unlock(&pers_lock); 6420 return 0; 6421 } 6422 6423 int unregister_md_personality(struct mdk_personality *p) 6424 { 6425 printk(KERN_INFO "md: %s personality unregistered\n", p->name); 6426 spin_lock(&pers_lock); 6427 list_del_init(&p->list); 6428 spin_unlock(&pers_lock); 6429 return 0; 6430 } 6431 6432 static int is_mddev_idle(mddev_t *mddev, int init) 6433 { 6434 mdk_rdev_t * rdev; 6435 int idle; 6436 int curr_events; 6437 6438 idle = 1; 6439 rcu_read_lock(); 6440 rdev_for_each_rcu(rdev, mddev) { 6441 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 6442 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + 6443 (int)part_stat_read(&disk->part0, sectors[1]) - 6444 atomic_read(&disk->sync_io); 6445 /* sync IO will cause sync_io to increase before the disk_stats 6446 * as sync_io is counted when a request starts, and 6447 * disk_stats is counted when it completes. 6448 * So resync activity will cause curr_events to be smaller than 6449 * when there was no such activity. 6450 * non-sync IO will cause disk_stat to increase without 6451 * increasing sync_io so curr_events will (eventually) 6452 * be larger than it was before. Once it becomes 6453 * substantially larger, the test below will cause 6454 * the array to appear non-idle, and resync will slow 6455 * down. 6456 * If there is a lot of outstanding resync activity when 6457 * we set last_event to curr_events, then all that activity 6458 * completing might cause the array to appear non-idle 6459 * and resync will be slowed down even though there might 6460 * not have been non-resync activity. This will only 6461 * happen once though. 
'last_events' will soon reflect 6462 * the state where there is little or no outstanding 6463 * resync requests, and further resync activity will 6464 * always make curr_events less than last_events. 6465 * 6466 */ 6467 if (init || curr_events - rdev->last_events > 64) { 6468 rdev->last_events = curr_events; 6469 idle = 0; 6470 } 6471 } 6472 rcu_read_unlock(); 6473 return idle; 6474 } 6475 6476 void md_done_sync(mddev_t *mddev, int blocks, int ok) 6477 { 6478 /* another "blocks" (512byte) blocks have been synced */ 6479 atomic_sub(blocks, &mddev->recovery_active); 6480 wake_up(&mddev->recovery_wait); 6481 if (!ok) { 6482 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6483 md_wakeup_thread(mddev->thread); 6484 // stop recovery, signal do_sync .... 6485 } 6486 } 6487 6488 6489 /* md_write_start(mddev, bi) 6490 * If we need to update some array metadata (e.g. 'active' flag 6491 * in superblock) before writing, schedule a superblock update 6492 * and wait for it to complete. 6493 */ 6494 void md_write_start(mddev_t *mddev, struct bio *bi) 6495 { 6496 int did_change = 0; 6497 if (bio_data_dir(bi) != WRITE) 6498 return; 6499 6500 BUG_ON(mddev->ro == 1); 6501 if (mddev->ro == 2) { 6502 /* need to switch to read/write */ 6503 mddev->ro = 0; 6504 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6505 md_wakeup_thread(mddev->thread); 6506 md_wakeup_thread(mddev->sync_thread); 6507 did_change = 1; 6508 } 6509 atomic_inc(&mddev->writes_pending); 6510 if (mddev->safemode == 1) 6511 mddev->safemode = 0; 6512 if (mddev->in_sync) { 6513 spin_lock_irq(&mddev->write_lock); 6514 if (mddev->in_sync) { 6515 mddev->in_sync = 0; 6516 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 6517 md_wakeup_thread(mddev->thread); 6518 did_change = 1; 6519 } 6520 spin_unlock_irq(&mddev->write_lock); 6521 } 6522 if (did_change) 6523 sysfs_notify_dirent(mddev->sysfs_state); 6524 wait_event(mddev->sb_wait, 6525 !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && 6526 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 6527 } 6528 6529 void md_write_end(mddev_t *mddev) 6530 { 6531 if (atomic_dec_and_test(&mddev->writes_pending)) { 6532 if (mddev->safemode == 2) 6533 md_wakeup_thread(mddev->thread); 6534 else if (mddev->safemode_delay) 6535 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); 6536 } 6537 } 6538 6539 /* md_allow_write(mddev) 6540 * Calling this ensures that the array is marked 'active' so that writes 6541 * may proceed without blocking. It is important to call this before 6542 * attempting a GFP_KERNEL allocation while holding the mddev lock. 6543 * Must be called with mddev_lock held. 6544 * 6545 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock 6546 * is dropped, so return -EAGAIN after notifying userspace. 
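 *
 * Illustrative sketch only (err, buf and len below are placeholder
 * identifiers, not names used in this file): a personality holding
 * the mddev lock might do
 *
 *	err = md_allow_write(mddev);
 *	if (err == -EAGAIN)
 *		... the caller decides whether to retry or proceed ...
 *	buf = kmalloc(len, GFP_KERNEL);
 *
 * so the array is marked active before the allocation can block.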
6547 */ 6548 int md_allow_write(mddev_t *mddev) 6549 { 6550 if (!mddev->pers) 6551 return 0; 6552 if (mddev->ro) 6553 return 0; 6554 if (!mddev->pers->sync_request) 6555 return 0; 6556 6557 spin_lock_irq(&mddev->write_lock); 6558 if (mddev->in_sync) { 6559 mddev->in_sync = 0; 6560 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 6561 if (mddev->safemode_delay && 6562 mddev->safemode == 0) 6563 mddev->safemode = 1; 6564 spin_unlock_irq(&mddev->write_lock); 6565 md_update_sb(mddev, 0); 6566 sysfs_notify_dirent(mddev->sysfs_state); 6567 } else 6568 spin_unlock_irq(&mddev->write_lock); 6569 6570 if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) 6571 return -EAGAIN; 6572 else 6573 return 0; 6574 } 6575 EXPORT_SYMBOL_GPL(md_allow_write); 6576 6577 #define SYNC_MARKS 10 6578 #define SYNC_MARK_STEP (3*HZ) 6579 void md_do_sync(mddev_t *mddev) 6580 { 6581 mddev_t *mddev2; 6582 unsigned int currspeed = 0, 6583 window; 6584 sector_t max_sectors,j, io_sectors; 6585 unsigned long mark[SYNC_MARKS]; 6586 sector_t mark_cnt[SYNC_MARKS]; 6587 int last_mark,m; 6588 struct list_head *tmp; 6589 sector_t last_check; 6590 int skipped = 0; 6591 mdk_rdev_t *rdev; 6592 char *desc; 6593 6594 /* just in case the thread restarts... */ 6595 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 6596 return; 6597 if (mddev->ro) /* never try to sync a read-only array */ 6598 return; 6599 6600 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 6601 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 6602 desc = "data-check"; 6603 else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 6604 desc = "requested-resync"; 6605 else 6606 desc = "resync"; 6607 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 6608 desc = "reshape"; 6609 else 6610 desc = "recovery"; 6611 6612 /* we overload curr_resync somewhat here. 6613 * 0 == not engaged in resync at all 6614 * 2 == checking that there is no conflict with another sync 6615 * 1 == like 2, but have yielded to allow conflicting resync to 6616 * commence 6617 * other == active in resync - this many blocks 6618 * 6619 * Before starting a resync we must have set curr_resync to 6620 * 2, and then checked that every "conflicting" array has curr_resync 6621 * less than ours. When we find one that is the same or higher 6622 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 6623 * to 1 if we choose to yield (based arbitrarily on the address of the mddev structure). 6624 * This will mean we have to start checking from the beginning again.
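 *
 * Illustrative scenario (device names invented): if md0 and md1 share
 * a physical device and both reach curr_resync == 2, the mddev at the
 * lower address drops back to 1 and waits on resync_wait while the
 * other proceeds; once woken it starts its conflict checks over.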
6625 * 6626 */ 6627 6628 do { 6629 mddev->curr_resync = 2; 6630 6631 try_again: 6632 if (kthread_should_stop()) 6633 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6634 6635 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 6636 goto skip; 6637 for_each_mddev(mddev2, tmp) { 6638 if (mddev2 == mddev) 6639 continue; 6640 if (!mddev->parallel_resync 6641 && mddev2->curr_resync 6642 && match_mddev_units(mddev, mddev2)) { 6643 DEFINE_WAIT(wq); 6644 if (mddev < mddev2 && mddev->curr_resync == 2) { 6645 /* arbitrarily yield */ 6646 mddev->curr_resync = 1; 6647 wake_up(&resync_wait); 6648 } 6649 if (mddev > mddev2 && mddev->curr_resync == 1) 6650 /* no need to wait here, we can wait the next 6651 * time 'round when curr_resync == 2 6652 */ 6653 continue; 6654 /* We need to wait 'interruptible' so as not to 6655 * contribute to the load average, and not to 6656 * be caught by 'softlockup' 6657 */ 6658 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 6659 if (!kthread_should_stop() && 6660 mddev2->curr_resync >= mddev->curr_resync) { 6661 printk(KERN_INFO "md: delaying %s of %s" 6662 " until %s has finished (they" 6663 " share one or more physical units)\n", 6664 desc, mdname(mddev), mdname(mddev2)); 6665 mddev_put(mddev2); 6666 if (signal_pending(current)) 6667 flush_signals(current); 6668 schedule(); 6669 finish_wait(&resync_wait, &wq); 6670 goto try_again; 6671 } 6672 finish_wait(&resync_wait, &wq); 6673 } 6674 } 6675 } while (mddev->curr_resync < 2); 6676 6677 j = 0; 6678 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 6679 /* resync follows the size requested by the personality, 6680 * which defaults to physical size, but can be virtual size 6681 */ 6682 max_sectors = mddev->resync_max_sectors; 6683 mddev->resync_mismatches = 0; 6684 /* we don't use the checkpoint if there's a bitmap */ 6685 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 6686 j = mddev->resync_min; 6687 else if (!mddev->bitmap) 6688 j = mddev->recovery_cp; 6689 6690 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 6691 max_sectors = mddev->dev_sectors; 6692 else { 6693 /* recovery follows the physical size of devices */ 6694 max_sectors = mddev->dev_sectors; 6695 j = MaxSector; 6696 rcu_read_lock(); 6697 list_for_each_entry_rcu(rdev, &mddev->disks, same_set) 6698 if (rdev->raid_disk >= 0 && 6699 !test_bit(Faulty, &rdev->flags) && 6700 !test_bit(In_sync, &rdev->flags) && 6701 rdev->recovery_offset < j) 6702 j = rdev->recovery_offset; 6703 rcu_read_unlock(); 6704 } 6705 6706 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); 6707 printk(KERN_INFO "md: minimum _guaranteed_ speed:" 6708 " %d KB/sec/disk.\n", speed_min(mddev)); 6709 printk(KERN_INFO "md: using maximum available idle IO bandwidth " 6710 "(but not more than %d KB/sec) for %s.\n", 6711 speed_max(mddev), desc); 6712 6713 is_mddev_idle(mddev, 1); /* this initializes IO event counters */ 6714 6715 io_sectors = 0; 6716 for (m = 0; m < SYNC_MARKS; m++) { 6717 mark[m] = jiffies; 6718 mark_cnt[m] = io_sectors; 6719 } 6720 last_mark = 0; 6721 mddev->resync_mark = mark[last_mark]; 6722 mddev->resync_mark_cnt = mark_cnt[last_mark]; 6723 6724 /* 6725 * Tune reconstruction: 6726 */ 6727 window = 32*(PAGE_SIZE/512); 6728 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", 6729 window/2,(unsigned long long) max_sectors/2); 6730 6731 atomic_set(&mddev->recovery_active, 0); 6732 last_check = 0; 6733 6734 if (j>2) { 6735 printk(KERN_INFO 6736 "md: resuming %s of %s from checkpoint.\n", 6737 desc, mdname(mddev)); 6738 
mddev->curr_resync = j; 6739 } 6740 mddev->curr_resync_completed = mddev->curr_resync; 6741 6742 while (j < max_sectors) { 6743 sector_t sectors; 6744 6745 skipped = 0; 6746 6747 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 6748 ((mddev->curr_resync > mddev->curr_resync_completed && 6749 (mddev->curr_resync - mddev->curr_resync_completed) 6750 > (max_sectors >> 4)) || 6751 (j - mddev->curr_resync_completed)*2 6752 >= mddev->resync_max - mddev->curr_resync_completed 6753 )) { 6754 /* time to update curr_resync_completed */ 6755 blk_unplug(mddev->queue); 6756 wait_event(mddev->recovery_wait, 6757 atomic_read(&mddev->recovery_active) == 0); 6758 mddev->curr_resync_completed = 6759 mddev->curr_resync; 6760 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 6761 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 6762 } 6763 6764 while (j >= mddev->resync_max && !kthread_should_stop()) { 6765 /* As this condition is controlled by user-space, 6766 * we can block indefinitely, so use '_interruptible' 6767 * to avoid triggering warnings. 6768 */ 6769 flush_signals(current); /* just in case */ 6770 wait_event_interruptible(mddev->recovery_wait, 6771 mddev->resync_max > j 6772 || kthread_should_stop()); 6773 } 6774 6775 if (kthread_should_stop()) 6776 goto interrupted; 6777 6778 sectors = mddev->pers->sync_request(mddev, j, &skipped, 6779 currspeed < speed_min(mddev)); 6780 if (sectors == 0) { 6781 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6782 goto out; 6783 } 6784 6785 if (!skipped) { /* actual IO requested */ 6786 io_sectors += sectors; 6787 atomic_add(sectors, &mddev->recovery_active); 6788 } 6789 6790 j += sectors; 6791 if (j>1) mddev->curr_resync = j; 6792 mddev->curr_mark_cnt = io_sectors; 6793 if (last_check == 0) 6794 /* this is the earliest that the rebuild will be 6795 * visible in /proc/mdstat 6796 */ 6797 md_new_event(mddev); 6798 6799 if (last_check + window > io_sectors || j == max_sectors) 6800 continue; 6801 6802 last_check = io_sectors; 6803 6804 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 6805 break; 6806 6807 repeat: 6808 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 6809 /* step marks */ 6810 int next = (last_mark+1) % SYNC_MARKS; 6811 6812 mddev->resync_mark = mark[next]; 6813 mddev->resync_mark_cnt = mark_cnt[next]; 6814 mark[next] = jiffies; 6815 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 6816 last_mark = next; 6817 } 6818 6819 6820 if (kthread_should_stop()) 6821 goto interrupted; 6822 6823 6824 /* 6825 * this loop exits only if we are either slower than 6826 * the 'hard' speed limit, or the system was IO-idle for 6827 * a jiffy. 6828 * the system might be non-idle CPU-wise, but we only care 6829 * about not overloading the IO subsystem.
(things like an 6830 * e2fsck being done on the RAID array should execute fast) 6831 */ 6832 blk_unplug(mddev->queue); 6833 cond_resched(); 6834 6835 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 6836 /((jiffies-mddev->resync_mark)/HZ +1) +1; 6837 6838 if (currspeed > speed_min(mddev)) { 6839 if ((currspeed > speed_max(mddev)) || 6840 !is_mddev_idle(mddev, 0)) { 6841 msleep(500); 6842 goto repeat; 6843 } 6844 } 6845 } 6846 printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); 6847 /* 6848 * this also signals 'finished resyncing' to md_stop 6849 */ 6850 out: 6851 blk_unplug(mddev->queue); 6852 6853 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 6854 6855 /* tell personality that we are finished */ 6856 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); 6857 6858 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 6859 mddev->curr_resync > 2) { 6860 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 6861 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 6862 if (mddev->curr_resync >= mddev->recovery_cp) { 6863 printk(KERN_INFO 6864 "md: checkpointing %s of %s.\n", 6865 desc, mdname(mddev)); 6866 mddev->recovery_cp = mddev->curr_resync; 6867 } 6868 } else 6869 mddev->recovery_cp = MaxSector; 6870 } else { 6871 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 6872 mddev->curr_resync = MaxSector; 6873 rcu_read_lock(); 6874 list_for_each_entry_rcu(rdev, &mddev->disks, same_set) 6875 if (rdev->raid_disk >= 0 && 6876 mddev->delta_disks >= 0 && 6877 !test_bit(Faulty, &rdev->flags) && 6878 !test_bit(In_sync, &rdev->flags) && 6879 rdev->recovery_offset < mddev->curr_resync) 6880 rdev->recovery_offset = mddev->curr_resync; 6881 rcu_read_unlock(); 6882 } 6883 } 6884 set_bit(MD_CHANGE_DEVS, &mddev->flags); 6885 6886 skip: 6887 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 6888 /* We completed so min/max setting can be forgotten if used. */ 6889 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 6890 mddev->resync_min = 0; 6891 mddev->resync_max = MaxSector; 6892 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 6893 mddev->resync_min = mddev->curr_resync_completed; 6894 mddev->curr_resync = 0; 6895 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 6896 mddev->curr_resync_completed = 0; 6897 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 6898 wake_up(&resync_wait); 6899 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 6900 md_wakeup_thread(mddev->thread); 6901 return; 6902 6903 interrupted: 6904 /* 6905 * got a signal, exit. 6906 */ 6907 printk(KERN_INFO 6908 "md: md_do_sync() got signal ... exiting\n"); 6909 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6910 goto out; 6911 6912 } 6913 EXPORT_SYMBOL_GPL(md_do_sync); 6914 6915 6916 static int remove_and_add_spares(mddev_t *mddev) 6917 { 6918 mdk_rdev_t *rdev; 6919 int spares = 0; 6920 6921 mddev->curr_resync_completed = 0; 6922 6923 list_for_each_entry(rdev, &mddev->disks, same_set) 6924 if (rdev->raid_disk >= 0 && 6925 !test_bit(Blocked, &rdev->flags) && 6926 (test_bit(Faulty, &rdev->flags) || 6927 ! test_bit(In_sync, &rdev->flags)) && 6928 atomic_read(&rdev->nr_pending)==0) { 6929 if (mddev->pers->hot_remove_disk( 6930 mddev, rdev->raid_disk)==0) { 6931 char nm[20]; 6932 sprintf(nm,"rd%d", rdev->raid_disk); 6933 sysfs_remove_link(&mddev->kobj, nm); 6934 rdev->raid_disk = -1; 6935 } 6936 } 6937 6938 if (mddev->degraded && ! 
mddev->ro && !mddev->recovery_disabled) { 6939 list_for_each_entry(rdev, &mddev->disks, same_set) { 6940 if (rdev->raid_disk >= 0 && 6941 !test_bit(In_sync, &rdev->flags) && 6942 !test_bit(Blocked, &rdev->flags)) 6943 spares++; 6944 if (rdev->raid_disk < 0 6945 && !test_bit(Faulty, &rdev->flags)) { 6946 rdev->recovery_offset = 0; 6947 if (mddev->pers-> 6948 hot_add_disk(mddev, rdev) == 0) { 6949 char nm[20]; 6950 sprintf(nm, "rd%d", rdev->raid_disk); 6951 if (sysfs_create_link(&mddev->kobj, 6952 &rdev->kobj, nm)) 6953 printk(KERN_WARNING 6954 "md: cannot register " 6955 "%s for %s\n", 6956 nm, mdname(mddev)); 6957 spares++; 6958 md_new_event(mddev); 6959 set_bit(MD_CHANGE_DEVS, &mddev->flags); 6960 } else 6961 break; 6962 } 6963 } 6964 } 6965 return spares; 6966 } 6967 /* 6968 * This routine is regularly called by all per-raid-array threads to 6969 * deal with generic issues like resync and super-block update. 6970 * Raid personalities that don't have a thread (linear/raid0) do not 6971 * need this as they never do any recovery or update the superblock. 6972 * 6973 * It does not do any resync itself, but rather "forks" off other threads 6974 * to do that as needed. 6975 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in 6976 * "->recovery" and create a thread at ->sync_thread. 6977 * When the thread finishes it sets MD_RECOVERY_DONE 6978 * and wakes up this thread, which will reap the thread and finish up. 6979 * This thread also removes any faulty devices (with nr_pending == 0). 6980 * 6981 * The overall approach is: 6982 * 1/ If the superblock needs updating, update it. 6983 * 2/ If a recovery thread is running, don't do anything else. 6984 * 3/ If recovery has finished, clean up, possibly marking spares active. 6985 * 4/ If there are any faulty devices, remove them. 6986 * 5/ If the array is degraded, try to add spare devices. 6987 * 6/ If the array has spares or is not in-sync, start a resync thread. 6988 */ 6989 void md_check_recovery(mddev_t *mddev) 6990 { 6991 mdk_rdev_t *rdev; 6992 6993 6994 if (mddev->bitmap) 6995 bitmap_daemon_work(mddev); 6996 6997 if (mddev->ro) 6998 return; 6999 7000 if (signal_pending(current)) { 7001 if (mddev->pers->sync_request && !mddev->external) { 7002 printk(KERN_INFO "md: %s in immediate safe mode\n", 7003 mdname(mddev)); 7004 mddev->safemode = 2; 7005 } 7006 flush_signals(current); 7007 } 7008 7009 if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 7010 return; 7011 if ( ! ( 7012 (mddev->flags && !mddev->external) || 7013 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 7014 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 7015 (mddev->external == 0 && mddev->safemode == 1) || 7016 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) 7017 && !mddev->in_sync && mddev->recovery_cp == MaxSector) 7018 )) 7019 return; 7020 7021 if (mddev_trylock(mddev)) { 7022 int spares = 0; 7023 7024 if (mddev->ro) { 7025 /* Only thing we do on a ro array is remove 7026 * failed devices.
7027 */ 7028 remove_and_add_spares(mddev); 7029 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7030 goto unlock; 7031 } 7032 7033 if (!mddev->external) { 7034 int did_change = 0; 7035 spin_lock_irq(&mddev->write_lock); 7036 if (mddev->safemode && 7037 !atomic_read(&mddev->writes_pending) && 7038 !mddev->in_sync && 7039 mddev->recovery_cp == MaxSector) { 7040 mddev->in_sync = 1; 7041 did_change = 1; 7042 if (mddev->persistent) 7043 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 7044 } 7045 if (mddev->safemode == 1) 7046 mddev->safemode = 0; 7047 spin_unlock_irq(&mddev->write_lock); 7048 if (did_change) 7049 sysfs_notify_dirent(mddev->sysfs_state); 7050 } 7051 7052 if (mddev->flags) 7053 md_update_sb(mddev, 0); 7054 7055 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 7056 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 7057 /* resync/recovery still happening */ 7058 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7059 goto unlock; 7060 } 7061 if (mddev->sync_thread) { 7062 /* resync has finished, collect result */ 7063 md_unregister_thread(mddev->sync_thread); 7064 mddev->sync_thread = NULL; 7065 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 7066 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 7067 /* success...*/ 7068 /* activate any spares */ 7069 if (mddev->pers->spare_active(mddev)) 7070 sysfs_notify(&mddev->kobj, NULL, 7071 "degraded"); 7072 } 7073 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 7074 mddev->pers->finish_reshape) 7075 mddev->pers->finish_reshape(mddev); 7076 md_update_sb(mddev, 1); 7077 7078 /* if array is no-longer degraded, then any saved_raid_disk 7079 * information must be scrapped 7080 */ 7081 if (!mddev->degraded) 7082 list_for_each_entry(rdev, &mddev->disks, same_set) 7083 rdev->saved_raid_disk = -1; 7084 7085 mddev->recovery = 0; 7086 /* flag recovery needed just to double check */ 7087 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7088 sysfs_notify_dirent(mddev->sysfs_action); 7089 md_new_event(mddev); 7090 goto unlock; 7091 } 7092 /* Set RUNNING before clearing NEEDED to avoid 7093 * any transients in the value of "sync_action". 7094 */ 7095 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7096 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7097 /* Clear some bits that don't mean anything, but 7098 * might be left set 7099 */ 7100 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 7101 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 7102 7103 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 7104 goto unlock; 7105 /* no recovery is running. 7106 * remove any failed drives, then 7107 * add spares if possible. 7108 * Spare are also removed and re-added, to allow 7109 * the personality to fail the re-add. 7110 */ 7111 7112 if (mddev->reshape_position != MaxSector) { 7113 if (mddev->pers->check_reshape == NULL || 7114 mddev->pers->check_reshape(mddev) != 0) 7115 /* Cannot proceed */ 7116 goto unlock; 7117 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 7118 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 7119 } else if ((spares = remove_and_add_spares(mddev))) { 7120 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 7121 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 7122 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 7123 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 7124 } else if (mddev->recovery_cp < MaxSector) { 7125 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 7126 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 7127 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 7128 /* nothing to be done ... 
*/ 7129 goto unlock; 7130 7131 if (mddev->pers->sync_request) { 7132 if (spares && mddev->bitmap && ! mddev->bitmap->file) { 7133 /* We are adding a device or devices to an array 7134 * which has the bitmap stored on all devices. 7135 * So make sure all bitmap pages get written 7136 */ 7137 bitmap_write_all(mddev->bitmap); 7138 } 7139 mddev->sync_thread = md_register_thread(md_do_sync, 7140 mddev, 7141 "resync"); 7142 if (!mddev->sync_thread) { 7143 printk(KERN_ERR "%s: could not start resync" 7144 " thread...\n", 7145 mdname(mddev)); 7146 /* leave the spares where they are, it shouldn't hurt */ 7147 mddev->recovery = 0; 7148 } else 7149 md_wakeup_thread(mddev->sync_thread); 7150 sysfs_notify_dirent(mddev->sysfs_action); 7151 md_new_event(mddev); 7152 } 7153 unlock: 7154 if (!mddev->sync_thread) { 7155 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7156 if (test_and_clear_bit(MD_RECOVERY_RECOVER, 7157 &mddev->recovery)) 7158 if (mddev->sysfs_action) 7159 sysfs_notify_dirent(mddev->sysfs_action); 7160 } 7161 mddev_unlock(mddev); 7162 } 7163 } 7164 7165 void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev) 7166 { 7167 sysfs_notify_dirent(rdev->sysfs_state); 7168 wait_event_timeout(rdev->blocked_wait, 7169 !test_bit(Blocked, &rdev->flags), 7170 msecs_to_jiffies(5000)); 7171 rdev_dec_pending(rdev, mddev); 7172 } 7173 EXPORT_SYMBOL(md_wait_for_blocked_rdev); 7174 7175 static int md_notify_reboot(struct notifier_block *this, 7176 unsigned long code, void *x) 7177 { 7178 struct list_head *tmp; 7179 mddev_t *mddev; 7180 7181 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { 7182 7183 printk(KERN_INFO "md: stopping all md devices.\n"); 7184 7185 for_each_mddev(mddev, tmp) 7186 if (mddev_trylock(mddev)) { 7187 /* Force a switch to readonly even array 7188 * appears to still be in use. Hence 7189 * the '100'. 7190 */ 7191 md_set_readonly(mddev, 100); 7192 mddev_unlock(mddev); 7193 } 7194 /* 7195 * certain more exotic SCSI devices are known to be 7196 * volatile wrt too early system reboots. While the 7197 * right place to handle this issue is the given 7198 * driver, we do want to have a safe RAID driver ... 7199 */ 7200 mdelay(1000*1); 7201 } 7202 return NOTIFY_DONE; 7203 } 7204 7205 static struct notifier_block md_notifier = { 7206 .notifier_call = md_notify_reboot, 7207 .next = NULL, 7208 .priority = INT_MAX, /* before any real devices */ 7209 }; 7210 7211 static void md_geninit(void) 7212 { 7213 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 7214 7215 proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops); 7216 } 7217 7218 static int __init md_init(void) 7219 { 7220 if (register_blkdev(MD_MAJOR, "md")) 7221 return -1; 7222 if ((mdp_major=register_blkdev(0, "mdp"))<=0) { 7223 unregister_blkdev(MD_MAJOR, "md"); 7224 return -1; 7225 } 7226 blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE, 7227 md_probe, NULL, NULL); 7228 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, 7229 md_probe, NULL, NULL); 7230 7231 register_reboot_notifier(&md_notifier); 7232 raid_table_header = register_sysctl_table(raid_root_table); 7233 7234 md_geninit(); 7235 return 0; 7236 } 7237 7238 7239 #ifndef MODULE 7240 7241 /* 7242 * Searches all registered partitions for autorun RAID arrays 7243 * at boot time. 
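 *
 * (Devices typically land on the list below because the partition
 * scanning code calls md_autodetect_dev() for partitions marked with
 * the Linux raid autodetect type; each one is then imported and queued
 * on pending_raid_disks before autorun_devices() assembles the arrays.)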
7244 */ 7245 7246 static LIST_HEAD(all_detected_devices); 7247 struct detected_devices_node { 7248 struct list_head list; 7249 dev_t dev; 7250 }; 7251 7252 void md_autodetect_dev(dev_t dev) 7253 { 7254 struct detected_devices_node *node_detected_dev; 7255 7256 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL); 7257 if (node_detected_dev) { 7258 node_detected_dev->dev = dev; 7259 list_add_tail(&node_detected_dev->list, &all_detected_devices); 7260 } else { 7261 printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed" 7262 ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev)); 7263 } 7264 } 7265 7266 7267 static void autostart_arrays(int part) 7268 { 7269 mdk_rdev_t *rdev; 7270 struct detected_devices_node *node_detected_dev; 7271 dev_t dev; 7272 int i_scanned, i_passed; 7273 7274 i_scanned = 0; 7275 i_passed = 0; 7276 7277 printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 7278 7279 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { 7280 i_scanned++; 7281 node_detected_dev = list_entry(all_detected_devices.next, 7282 struct detected_devices_node, list); 7283 list_del(&node_detected_dev->list); 7284 dev = node_detected_dev->dev; 7285 kfree(node_detected_dev); 7286 rdev = md_import_device(dev,0, 90); 7287 if (IS_ERR(rdev)) 7288 continue; 7289 7290 if (test_bit(Faulty, &rdev->flags)) { 7291 MD_BUG(); 7292 continue; 7293 } 7294 set_bit(AutoDetected, &rdev->flags); 7295 list_add(&rdev->same_set, &pending_raid_disks); 7296 i_passed++; 7297 } 7298 7299 printk(KERN_INFO "md: Scanned %d and added %d devices.\n", 7300 i_scanned, i_passed); 7301 7302 autorun_devices(part); 7303 } 7304 7305 #endif /* !MODULE */ 7306 7307 static __exit void md_exit(void) 7308 { 7309 mddev_t *mddev; 7310 struct list_head *tmp; 7311 7312 blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS); 7313 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); 7314 7315 unregister_blkdev(MD_MAJOR,"md"); 7316 unregister_blkdev(mdp_major, "mdp"); 7317 unregister_reboot_notifier(&md_notifier); 7318 unregister_sysctl_table(raid_table_header); 7319 remove_proc_entry("mdstat", NULL); 7320 for_each_mddev(mddev, tmp) { 7321 export_array(mddev); 7322 mddev->hold_active = 0; 7323 } 7324 } 7325 7326 subsys_initcall(md_init); 7327 module_exit(md_exit) 7328 7329 static int get_ro(char *buffer, struct kernel_param *kp) 7330 { 7331 return sprintf(buffer, "%d", start_readonly); 7332 } 7333 static int set_ro(const char *val, struct kernel_param *kp) 7334 { 7335 char *e; 7336 int num = simple_strtoul(val, &e, 10); 7337 if (*val && (*e == '\0' || *e == '\n')) { 7338 start_readonly = num; 7339 return 0; 7340 } 7341 return -EINVAL; 7342 } 7343 7344 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); 7345 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); 7346 7347 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); 7348 7349 EXPORT_SYMBOL(register_md_personality); 7350 EXPORT_SYMBOL(unregister_md_personality); 7351 EXPORT_SYMBOL(md_error); 7352 EXPORT_SYMBOL(md_done_sync); 7353 EXPORT_SYMBOL(md_write_start); 7354 EXPORT_SYMBOL(md_write_end); 7355 EXPORT_SYMBOL(md_register_thread); 7356 EXPORT_SYMBOL(md_unregister_thread); 7357 EXPORT_SYMBOL(md_wakeup_thread); 7358 EXPORT_SYMBOL(md_check_recovery); 7359 MODULE_LICENSE("GPL"); 7360 MODULE_DESCRIPTION("MD RAID framework"); 7361 MODULE_ALIAS("md"); 7362 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); 7363
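
/*
 * Example (illustrative, assuming the usual build where this file
 * becomes the md-mod module): the parameters declared above can be
 * set at load time, e.g.
 *
 *	modprobe md-mod start_ro=1
 *
 * which makes newly started arrays come up auto-read-only until the
 * first write arrives.
 */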