/*
   md.c : Multiple Devices driver for Linux
     Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

   Errors, Warnings, etc.
   Please use:
     pr_crit() for error conditions that risk data loss
     pr_err() for error conditions that are unexpected, like an IO error
         or internal inconsistency
     pr_warn() for error conditions that could have been predicted, like
         adding a device to an array when it has incompatible metadata
     pr_info() for interesting, very rare events, like an array starting
         or stopping, or resync starting or stopping
     pr_debug() for everything else.

*/

#include <linux/sched/signal.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/badblocks.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/module.h>
#include <linux/reboot.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/slab.h>
#include <linux/percpu-refcount.h>

#include <trace/events/block.h>
#include "md.h"
#include "bitmap.h"
#include "md-cluster.h"

#ifndef MODULE
static void autostart_arrays(int part);
#endif

/* pers_list is a list of registered personalities protected
 * by pers_lock.
 * pers_lock does extra service to protect accesses to
 * mddev->thread when the mutex cannot be held.
 */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

struct md_cluster_operations *md_cluster_ops;
EXPORT_SYMBOL(md_cluster_ops);
struct module *md_cluster_mod;
EXPORT_SYMBOL(md_cluster_mod);

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
static struct workqueue_struct *md_misc_wq;

static int remove_and_add_spares(struct mddev *mddev,
                                 struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);

/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
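
/*
 * Illustrative sketch (not part of md.c proper): how a personality might
 * apply the halving rule described above before comparing the result
 * against MD_DEFAULT_MAX_CORRECTED_READ_ERRORS.  The helper name and both
 * parameters are hypothetical; the real decay logic lives in the RAID
 * personalities, not here.
 */
static inline unsigned int md_decay_read_errors(unsigned int read_errors,
                                                unsigned int hours_since_last)
{
        /* halve the accumulated count once for every elapsed hour */
        while (hours_since_last > 0 && read_errors > 0) {
                read_errors >>= 1;
                hours_since_last--;
        }
        return read_errors;
}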
/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * You can change it via /proc/sys/dev/raid/speed_limit_min and _max,
 * or via /sys/block/mdX/md/sync_speed_{min,max}.
 */

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static inline int speed_min(struct mddev *mddev)
{
        return mddev->sync_speed_min ?
                mddev->sync_speed_min : sysctl_speed_limit_min;
}

static inline int speed_max(struct mddev *mddev)
{
        return mddev->sync_speed_max ?
                mddev->sync_speed_max : sysctl_speed_limit_max;
}

static struct ctl_table_header *raid_table_header;

static struct ctl_table raid_table[] = {
        {
                .procname       = "speed_limit_min",
                .data           = &sysctl_speed_limit_min,
                .maxlen         = sizeof(int),
                .mode           = S_IRUGO|S_IWUSR,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "speed_limit_max",
                .data           = &sysctl_speed_limit_max,
                .maxlen         = sizeof(int),
                .mode           = S_IRUGO|S_IWUSR,
                .proc_handler   = proc_dointvec,
        },
        { }
};

static struct ctl_table raid_dir_table[] = {
        {
                .procname       = "raid",
                .maxlen         = 0,
                .mode           = S_IRUGO|S_IXUGO,
                .child          = raid_table,
        },
        { }
};

static struct ctl_table raid_root_table[] = {
        {
                .procname       = "dev",
                .maxlen         = 0,
                .mode           = 0555,
                .child          = raid_dir_table,
        },
        { }
};

static const struct block_device_operations md_fops;

static int start_readonly;

/*
 * The original mechanism for creating an md device is to create
 * a device node in /dev and to open it.  This causes races with device-close.
 * The preferred method is to write to the "new_array" module parameter.
 * This can avoid races.
 * Setting create_on_open to false disables the original mechanism
 * so all the races disappear.
 */
static bool create_on_open = true;

/* bio_alloc_mddev
 * like bio_alloc_bioset, but with a local bio set
 */

struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
                            struct mddev *mddev)
{
        struct bio *b;

        if (!mddev || !mddev->bio_set)
                return bio_alloc(gfp_mask, nr_iovecs);

        b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set);
        if (!b)
                return NULL;
        return b;
}
EXPORT_SYMBOL_GPL(bio_alloc_mddev);

static struct bio *md_bio_alloc_sync(struct mddev *mddev)
{
        if (!mddev || !mddev->sync_set)
                return bio_alloc(GFP_NOIO, 1);

        return bio_alloc_bioset(GFP_NOIO, 1, mddev->sync_set);
}
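
/*
 * Illustrative sketch (not the md_do_sync() implementation): how the
 * speed_limit_min/speed_limit_max policy described earlier in this file
 * is typically consulted by a resync loop.  The function name and the
 * currently_idle/current_kb_per_sec parameters are hypothetical; only
 * speed_min()/speed_max() come from this file.
 */
static inline bool md_sync_should_throttle(struct mddev *mddev,
                                           bool currently_idle,
                                           int current_kb_per_sec)
{
        /* never exceed the absolute maximum */
        if (current_kb_per_sec > speed_max(mddev))
                return true;
        /* below the guaranteed minimum, keep going regardless of load */
        if (current_kb_per_sec < speed_min(mddev))
                return false;
        /* in between, back off only when other IO is competing */
        return !currently_idle;
}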
/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
void md_new_event(struct mddev *mddev)
{
        atomic_inc(&md_event_count);
        wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);

/*
 * Enables iteration over all existing md arrays.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);

/*
 * iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while owning
 * a reference to the current mddev must mddev_put it.
 */
#define for_each_mddev(_mddev,_tmp)                                     \
                                                                        \
        for (({ spin_lock(&all_mddevs_lock);                            \
                _tmp = all_mddevs.next;                                 \
                _mddev = NULL;});                                       \
             ({ if (_tmp != &all_mddevs)                                \
                        mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
                spin_unlock(&all_mddevs_lock);                          \
                if (_mddev) mddev_put(_mddev);                          \
                _mddev = list_entry(_tmp, struct mddev, all_mddevs);    \
                _tmp != &all_mddevs;});                                 \
             ({ spin_lock(&all_mddevs_lock);                            \
                _tmp = _tmp->next;})                                    \
                )
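
/*
 * Illustrative sketch of using for_each_mddev (this helper does not exist
 * in md.c): walk every array and count the degraded ones.  The macro takes
 * and drops the per-mddev reference for us; a caller that breaks out of
 * the loop early while still using 'mddev' must mddev_put() it, as the
 * comment above the macro requires.
 */
static int md_count_degraded(void)
{
        struct mddev *mddev;
        struct list_head *tmp;
        int cnt = 0;

        for_each_mddev(mddev, tmp)
                if (mddev->degraded)
                        cnt++;
        return cnt;
}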
/* Rather than calling directly into the personality make_request function,
 * IO requests come here first so that we can check if the device is
 * being suspended pending a reconfiguration.
 * We hold a refcount over the call to ->make_request. By the time that
 * call has finished, the bio has been linked into some internal structure
 * and so is visible to ->quiesce(), so we don't need the refcount any more.
 */
static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
{
        const int rw = bio_data_dir(bio);
        struct mddev *mddev = q->queuedata;
        unsigned int sectors;
        int cpu;

        blk_queue_split(q, &bio);

        if (mddev == NULL || mddev->pers == NULL) {
                bio_io_error(bio);
                return BLK_QC_T_NONE;
        }
        if (mddev->ro == 1 && unlikely(rw == WRITE)) {
                if (bio_sectors(bio) != 0)
                        bio->bi_status = BLK_STS_IOERR;
                bio_endio(bio);
                return BLK_QC_T_NONE;
        }
check_suspended:
        rcu_read_lock();
        if (mddev->suspended) {
                DEFINE_WAIT(__wait);
                for (;;) {
                        prepare_to_wait(&mddev->sb_wait, &__wait,
                                        TASK_UNINTERRUPTIBLE);
                        if (!mddev->suspended)
                                break;
                        rcu_read_unlock();
                        schedule();
                        rcu_read_lock();
                }
                finish_wait(&mddev->sb_wait, &__wait);
        }
        atomic_inc(&mddev->active_io);
        rcu_read_unlock();

        /*
         * save the sectors now since our bio can
         * go away inside make_request
         */
        sectors = bio_sectors(bio);
        /* bio could be mergeable after passing to underlayer */
        bio->bi_opf &= ~REQ_NOMERGE;
        if (!mddev->pers->make_request(mddev, bio)) {
                atomic_dec(&mddev->active_io);
                wake_up(&mddev->sb_wait);
                goto check_suspended;
        }

        cpu = part_stat_lock();
        part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
        part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
        part_stat_unlock();

        if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
                wake_up(&mddev->sb_wait);

        return BLK_QC_T_NONE;
}
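
/*
 * Illustrative sketch (not a function in md.c): the reconfiguration
 * pattern that the check_suspended handshake above supports.  A caller
 * quiesces the array, makes its change, then resumes; md_make_request()
 * parks any new IO on sb_wait until mddev->suspended drops back to zero.
 * The function name and the change_some_setting() callback are
 * hypothetical, and the locking requirements of real callers are omitted;
 * mddev_suspend()/mddev_resume() themselves are declared in md.h and
 * defined just below.
 */
static void md_reconfigure_example(struct mddev *mddev,
                                   void (*change_some_setting)(struct mddev *))
{
        mddev_suspend(mddev);           /* waits for active_io to reach zero */
        change_some_setting(mddev);     /* array is quiescent here */
        mddev_resume(mddev);            /* wakes waiters and restarts IO */
}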
/* mddev_suspend makes sure no new requests are submitted
 * to the device, and that any requests that have been submitted
 * are completely handled.
 * Once mddev_detach() is called and completes, the module will be
 * completely unused.
 */
void mddev_suspend(struct mddev *mddev)
{
        WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk);
        if (mddev->suspended++)
                return;
        synchronize_rcu();
        wake_up(&mddev->sb_wait);
        wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
        mddev->pers->quiesce(mddev, 1);

        del_timer_sync(&mddev->safemode_timer);
}
EXPORT_SYMBOL_GPL(mddev_suspend);

void mddev_resume(struct mddev *mddev)
{
        if (--mddev->suspended)
                return;
        wake_up(&mddev->sb_wait);
        mddev->pers->quiesce(mddev, 0);

        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
        md_wakeup_thread(mddev->thread);
        md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
}
EXPORT_SYMBOL_GPL(mddev_resume);

int mddev_congested(struct mddev *mddev, int bits)
{
        struct md_personality *pers = mddev->pers;
        int ret = 0;

        rcu_read_lock();
        if (mddev->suspended)
                ret = 1;
        else if (pers && pers->congested)
                ret = pers->congested(mddev, bits);
        rcu_read_unlock();
        return ret;
}
EXPORT_SYMBOL_GPL(mddev_congested);
static int md_congested(void *data, int bits)
{
        struct mddev *mddev = data;
        return mddev_congested(mddev, bits);
}

/*
 * Generic flush handling for md
 */

static void md_end_flush(struct bio *bio)
{
        struct md_rdev *rdev = bio->bi_private;
        struct mddev *mddev = rdev->mddev;

        rdev_dec_pending(rdev, mddev);

        if (atomic_dec_and_test(&mddev->flush_pending)) {
                /* The pre-request flush has finished */
                queue_work(md_wq, &mddev->flush_work);
        }
        bio_put(bio);
}

static void md_submit_flush_data(struct work_struct *ws);

static void submit_flushes(struct work_struct *ws)
{
        struct mddev *mddev = container_of(ws, struct mddev, flush_work);
        struct md_rdev *rdev;

        INIT_WORK(&mddev->flush_work, md_submit_flush_data);
        atomic_set(&mddev->flush_pending, 1);
        rcu_read_lock();
        rdev_for_each_rcu(rdev, mddev)
                if (rdev->raid_disk >= 0 &&
                    !test_bit(Faulty, &rdev->flags)) {
                        /* Take two references, one is dropped
                         * when request finishes, one after
                         * we reclaim rcu_read_lock
                         */
                        struct bio *bi;
                        atomic_inc(&rdev->nr_pending);
                        atomic_inc(&rdev->nr_pending);
                        rcu_read_unlock();
                        bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
                        bi->bi_end_io = md_end_flush;
                        bi->bi_private = rdev;
                        bio_set_dev(bi, rdev->bdev);
                        bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
                        atomic_inc(&mddev->flush_pending);
                        submit_bio(bi);
                        rcu_read_lock();
                        rdev_dec_pending(rdev, mddev);
                }
        rcu_read_unlock();
        if (atomic_dec_and_test(&mddev->flush_pending))
                queue_work(md_wq, &mddev->flush_work);
}

static void md_submit_flush_data(struct work_struct *ws)
{
        struct mddev *mddev = container_of(ws, struct mddev, flush_work);
        struct bio *bio = mddev->flush_bio;

        if (bio->bi_iter.bi_size == 0)
                /* an empty barrier - all done */
                bio_endio(bio);
        else {
                bio->bi_opf &= ~REQ_PREFLUSH;
                mddev->pers->make_request(mddev, bio);
        }

        mddev->flush_bio = NULL;
        wake_up(&mddev->sb_wait);
}

void md_flush_request(struct mddev *mddev, struct bio *bio)
{
        spin_lock_irq(&mddev->lock);
        wait_event_lock_irq(mddev->sb_wait,
                            !mddev->flush_bio,
                            mddev->lock);
        mddev->flush_bio = bio;
        spin_unlock_irq(&mddev->lock);

        INIT_WORK(&mddev->flush_work, submit_flushes);
queue_work(md_wq, &mddev->flush_work); 465 } 466 EXPORT_SYMBOL(md_flush_request); 467 468 static inline struct mddev *mddev_get(struct mddev *mddev) 469 { 470 atomic_inc(&mddev->active); 471 return mddev; 472 } 473 474 static void mddev_delayed_delete(struct work_struct *ws); 475 476 static void mddev_put(struct mddev *mddev) 477 { 478 struct bio_set *bs = NULL, *sync_bs = NULL; 479 480 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 481 return; 482 if (!mddev->raid_disks && list_empty(&mddev->disks) && 483 mddev->ctime == 0 && !mddev->hold_active) { 484 /* Array is not configured at all, and not held active, 485 * so destroy it */ 486 list_del_init(&mddev->all_mddevs); 487 bs = mddev->bio_set; 488 sync_bs = mddev->sync_set; 489 mddev->bio_set = NULL; 490 mddev->sync_set = NULL; 491 if (mddev->gendisk) { 492 /* We did a probe so need to clean up. Call 493 * queue_work inside the spinlock so that 494 * flush_workqueue() after mddev_find will 495 * succeed in waiting for the work to be done. 496 */ 497 INIT_WORK(&mddev->del_work, mddev_delayed_delete); 498 queue_work(md_misc_wq, &mddev->del_work); 499 } else 500 kfree(mddev); 501 } 502 spin_unlock(&all_mddevs_lock); 503 if (bs) 504 bioset_free(bs); 505 if (sync_bs) 506 bioset_free(sync_bs); 507 } 508 509 static void md_safemode_timeout(unsigned long data); 510 511 void mddev_init(struct mddev *mddev) 512 { 513 mutex_init(&mddev->open_mutex); 514 mutex_init(&mddev->reconfig_mutex); 515 mutex_init(&mddev->bitmap_info.mutex); 516 INIT_LIST_HEAD(&mddev->disks); 517 INIT_LIST_HEAD(&mddev->all_mddevs); 518 setup_timer(&mddev->safemode_timer, md_safemode_timeout, 519 (unsigned long) mddev); 520 atomic_set(&mddev->active, 1); 521 atomic_set(&mddev->openers, 0); 522 atomic_set(&mddev->active_io, 0); 523 spin_lock_init(&mddev->lock); 524 atomic_set(&mddev->flush_pending, 0); 525 init_waitqueue_head(&mddev->sb_wait); 526 init_waitqueue_head(&mddev->recovery_wait); 527 mddev->reshape_position = MaxSector; 528 mddev->reshape_backwards = 0; 529 mddev->last_sync_action = "none"; 530 mddev->resync_min = 0; 531 mddev->resync_max = MaxSector; 532 mddev->level = LEVEL_NONE; 533 } 534 EXPORT_SYMBOL_GPL(mddev_init); 535 536 static struct mddev *mddev_find(dev_t unit) 537 { 538 struct mddev *mddev, *new = NULL; 539 540 if (unit && MAJOR(unit) != MD_MAJOR) 541 unit &= ~((1<<MdpMinorShift)-1); 542 543 retry: 544 spin_lock(&all_mddevs_lock); 545 546 if (unit) { 547 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 548 if (mddev->unit == unit) { 549 mddev_get(mddev); 550 spin_unlock(&all_mddevs_lock); 551 kfree(new); 552 return mddev; 553 } 554 555 if (new) { 556 list_add(&new->all_mddevs, &all_mddevs); 557 spin_unlock(&all_mddevs_lock); 558 new->hold_active = UNTIL_IOCTL; 559 return new; 560 } 561 } else if (new) { 562 /* find an unused unit number */ 563 static int next_minor = 512; 564 int start = next_minor; 565 int is_free = 0; 566 int dev = 0; 567 while (!is_free) { 568 dev = MKDEV(MD_MAJOR, next_minor); 569 next_minor++; 570 if (next_minor > MINORMASK) 571 next_minor = 0; 572 if (next_minor == start) { 573 /* Oh dear, all in use. 
                                 */
                                spin_unlock(&all_mddevs_lock);
                                kfree(new);
                                return NULL;
                        }

                        is_free = 1;
                        list_for_each_entry(mddev, &all_mddevs, all_mddevs)
                                if (mddev->unit == dev) {
                                        is_free = 0;
                                        break;
                                }
                }
                new->unit = dev;
                new->md_minor = MINOR(dev);
                new->hold_active = UNTIL_STOP;
                list_add(&new->all_mddevs, &all_mddevs);
                spin_unlock(&all_mddevs_lock);
                return new;
        }
        spin_unlock(&all_mddevs_lock);

        new = kzalloc(sizeof(*new), GFP_KERNEL);
        if (!new)
                return NULL;

        new->unit = unit;
        if (MAJOR(unit) == MD_MAJOR)
                new->md_minor = MINOR(unit);
        else
                new->md_minor = MINOR(unit) >> MdpMinorShift;

        mddev_init(new);

        goto retry;
}

static struct attribute_group md_redundancy_group;

void mddev_unlock(struct mddev *mddev)
{
        if (mddev->to_remove) {
                /* These cannot be removed under reconfig_mutex as
                 * an access to the files will try to take reconfig_mutex
                 * while holding the file unremovable, which leads to
                 * a deadlock.
                 * So keep sysfs_active set while the remove is happening,
                 * and anything else which might set ->to_remove or may
                 * otherwise change the sysfs namespace will fail with
                 * -EBUSY if sysfs_active is still set.
                 * We set sysfs_active under reconfig_mutex and elsewhere
                 * test it under the same mutex to ensure its correct value
                 * is seen.
                 */
                struct attribute_group *to_remove = mddev->to_remove;
                mddev->to_remove = NULL;
                mddev->sysfs_active = 1;
                mutex_unlock(&mddev->reconfig_mutex);

                if (mddev->kobj.sd) {
                        if (to_remove != &md_redundancy_group)
                                sysfs_remove_group(&mddev->kobj, to_remove);
                        if (mddev->pers == NULL ||
                            mddev->pers->sync_request == NULL) {
                                sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
                                if (mddev->sysfs_action)
                                        sysfs_put(mddev->sysfs_action);
                                mddev->sysfs_action = NULL;
                        }
                }
                mddev->sysfs_active = 0;
        } else
                mutex_unlock(&mddev->reconfig_mutex);

        /* As we've dropped the mutex we need a spinlock to
         * make sure the thread doesn't disappear
         */
        spin_lock(&pers_lock);
        md_wakeup_thread(mddev->thread);
        spin_unlock(&pers_lock);
}
EXPORT_SYMBOL_GPL(mddev_unlock);

struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
{
        struct md_rdev *rdev;

        rdev_for_each_rcu(rdev, mddev)
                if (rdev->desc_nr == nr)
                        return rdev;

        return NULL;
}
EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);

static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
{
        struct md_rdev *rdev;

        rdev_for_each(rdev, mddev)
                if (rdev->bdev->bd_dev == dev)
                        return rdev;

        return NULL;
}

static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev)
{
        struct md_rdev *rdev;

        rdev_for_each_rcu(rdev, mddev)
                if (rdev->bdev->bd_dev == dev)
                        return rdev;

        return NULL;
}

static struct md_personality *find_pers(int level, char *clevel)
{
        struct md_personality *pers;
        list_for_each_entry(pers, &pers_list, list) {
                if (level != LEVEL_NONE && pers->level == level)
                        return pers;
                if (strcmp(pers->name, clevel) == 0)
                        return pers;
        }
        return NULL;
}

/* return the offset of the super block in 512byte sectors */
static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
{
        sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
        return
MD_NEW_SIZE_SECTORS(num_sectors); 707 } 708 709 static int alloc_disk_sb(struct md_rdev *rdev) 710 { 711 rdev->sb_page = alloc_page(GFP_KERNEL); 712 if (!rdev->sb_page) 713 return -ENOMEM; 714 return 0; 715 } 716 717 void md_rdev_clear(struct md_rdev *rdev) 718 { 719 if (rdev->sb_page) { 720 put_page(rdev->sb_page); 721 rdev->sb_loaded = 0; 722 rdev->sb_page = NULL; 723 rdev->sb_start = 0; 724 rdev->sectors = 0; 725 } 726 if (rdev->bb_page) { 727 put_page(rdev->bb_page); 728 rdev->bb_page = NULL; 729 } 730 badblocks_exit(&rdev->badblocks); 731 } 732 EXPORT_SYMBOL_GPL(md_rdev_clear); 733 734 static void super_written(struct bio *bio) 735 { 736 struct md_rdev *rdev = bio->bi_private; 737 struct mddev *mddev = rdev->mddev; 738 739 if (bio->bi_status) { 740 pr_err("md: super_written gets error=%d\n", bio->bi_status); 741 md_error(mddev, rdev); 742 if (!test_bit(Faulty, &rdev->flags) 743 && (bio->bi_opf & MD_FAILFAST)) { 744 set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags); 745 set_bit(LastDev, &rdev->flags); 746 } 747 } else 748 clear_bit(LastDev, &rdev->flags); 749 750 if (atomic_dec_and_test(&mddev->pending_writes)) 751 wake_up(&mddev->sb_wait); 752 rdev_dec_pending(rdev, mddev); 753 bio_put(bio); 754 } 755 756 void md_super_write(struct mddev *mddev, struct md_rdev *rdev, 757 sector_t sector, int size, struct page *page) 758 { 759 /* write first size bytes of page to sector of rdev 760 * Increment mddev->pending_writes before returning 761 * and decrement it on completion, waking up sb_wait 762 * if zero is reached. 763 * If an error occurred, call md_error 764 */ 765 struct bio *bio; 766 int ff = 0; 767 768 if (test_bit(Faulty, &rdev->flags)) 769 return; 770 771 bio = md_bio_alloc_sync(mddev); 772 773 atomic_inc(&rdev->nr_pending); 774 775 bio_set_dev(bio, rdev->meta_bdev ? 
rdev->meta_bdev : rdev->bdev); 776 bio->bi_iter.bi_sector = sector; 777 bio_add_page(bio, page, size, 0); 778 bio->bi_private = rdev; 779 bio->bi_end_io = super_written; 780 781 if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) && 782 test_bit(FailFast, &rdev->flags) && 783 !test_bit(LastDev, &rdev->flags)) 784 ff = MD_FAILFAST; 785 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA | ff; 786 787 atomic_inc(&mddev->pending_writes); 788 submit_bio(bio); 789 } 790 791 int md_super_wait(struct mddev *mddev) 792 { 793 /* wait for all superblock writes that were scheduled to complete */ 794 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); 795 if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags)) 796 return -EAGAIN; 797 return 0; 798 } 799 800 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, 801 struct page *page, int op, int op_flags, bool metadata_op) 802 { 803 struct bio *bio = md_bio_alloc_sync(rdev->mddev); 804 int ret; 805 806 if (metadata_op && rdev->meta_bdev) 807 bio_set_dev(bio, rdev->meta_bdev); 808 else 809 bio_set_dev(bio, rdev->bdev); 810 bio_set_op_attrs(bio, op, op_flags); 811 if (metadata_op) 812 bio->bi_iter.bi_sector = sector + rdev->sb_start; 813 else if (rdev->mddev->reshape_position != MaxSector && 814 (rdev->mddev->reshape_backwards == 815 (sector >= rdev->mddev->reshape_position))) 816 bio->bi_iter.bi_sector = sector + rdev->new_data_offset; 817 else 818 bio->bi_iter.bi_sector = sector + rdev->data_offset; 819 bio_add_page(bio, page, size, 0); 820 821 submit_bio_wait(bio); 822 823 ret = !bio->bi_status; 824 bio_put(bio); 825 return ret; 826 } 827 EXPORT_SYMBOL_GPL(sync_page_io); 828 829 static int read_disk_sb(struct md_rdev *rdev, int size) 830 { 831 char b[BDEVNAME_SIZE]; 832 833 if (rdev->sb_loaded) 834 return 0; 835 836 if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true)) 837 goto fail; 838 rdev->sb_loaded = 1; 839 return 0; 840 841 fail: 842 pr_err("md: disabled device %s, could not read superblock.\n", 843 bdevname(rdev->bdev,b)); 844 return -EINVAL; 845 } 846 847 static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 848 { 849 return sb1->set_uuid0 == sb2->set_uuid0 && 850 sb1->set_uuid1 == sb2->set_uuid1 && 851 sb1->set_uuid2 == sb2->set_uuid2 && 852 sb1->set_uuid3 == sb2->set_uuid3; 853 } 854 855 static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 856 { 857 int ret; 858 mdp_super_t *tmp1, *tmp2; 859 860 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 861 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 862 863 if (!tmp1 || !tmp2) { 864 ret = 0; 865 goto abort; 866 } 867 868 *tmp1 = *sb1; 869 *tmp2 = *sb2; 870 871 /* 872 * nr_disks is not constant 873 */ 874 tmp1->nr_disks = 0; 875 tmp2->nr_disks = 0; 876 877 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); 878 abort: 879 kfree(tmp1); 880 kfree(tmp2); 881 return ret; 882 } 883 884 static u32 md_csum_fold(u32 csum) 885 { 886 csum = (csum & 0xffff) + (csum >> 16); 887 return (csum & 0xffff) + (csum >> 16); 888 } 889 890 static unsigned int calc_sb_csum(mdp_super_t *sb) 891 { 892 u64 newcsum = 0; 893 u32 *sb32 = (u32*)sb; 894 int i; 895 unsigned int disk_csum, csum; 896 897 disk_csum = sb->sb_csum; 898 sb->sb_csum = 0; 899 900 for (i = 0; i < MD_SB_BYTES/4 ; i++) 901 newcsum += sb32[i]; 902 csum = (newcsum & 0xffffffff) + (newcsum>>32); 903 904 #ifdef CONFIG_ALPHA 905 /* This used to use csum_partial, which was wrong for several 906 * reasons including that different results are returned on 907 * different architectures. 
It isn't critical that we get exactly 908 * the same return value as before (we always csum_fold before 909 * testing, and that removes any differences). However as we 910 * know that csum_partial always returned a 16bit value on 911 * alphas, do a fold to maximise conformity to previous behaviour. 912 */ 913 sb->sb_csum = md_csum_fold(disk_csum); 914 #else 915 sb->sb_csum = disk_csum; 916 #endif 917 return csum; 918 } 919 920 /* 921 * Handle superblock details. 922 * We want to be able to handle multiple superblock formats 923 * so we have a common interface to them all, and an array of 924 * different handlers. 925 * We rely on user-space to write the initial superblock, and support 926 * reading and updating of superblocks. 927 * Interface methods are: 928 * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) 929 * loads and validates a superblock on dev. 930 * if refdev != NULL, compare superblocks on both devices 931 * Return: 932 * 0 - dev has a superblock that is compatible with refdev 933 * 1 - dev has a superblock that is compatible and newer than refdev 934 * so dev should be used as the refdev in future 935 * -EINVAL superblock incompatible or invalid 936 * -othererror e.g. -EIO 937 * 938 * int validate_super(struct mddev *mddev, struct md_rdev *dev) 939 * Verify that dev is acceptable into mddev. 940 * The first time, mddev->raid_disks will be 0, and data from 941 * dev should be merged in. Subsequent calls check that dev 942 * is new enough. Return 0 or -EINVAL 943 * 944 * void sync_super(struct mddev *mddev, struct md_rdev *dev) 945 * Update the superblock for rdev with data in mddev 946 * This does not write to disc. 947 * 948 */ 949 950 struct super_type { 951 char *name; 952 struct module *owner; 953 int (*load_super)(struct md_rdev *rdev, 954 struct md_rdev *refdev, 955 int minor_version); 956 int (*validate_super)(struct mddev *mddev, 957 struct md_rdev *rdev); 958 void (*sync_super)(struct mddev *mddev, 959 struct md_rdev *rdev); 960 unsigned long long (*rdev_size_change)(struct md_rdev *rdev, 961 sector_t num_sectors); 962 int (*allow_new_offset)(struct md_rdev *rdev, 963 unsigned long long new_offset); 964 }; 965 966 /* 967 * Check that the given mddev has no bitmap. 968 * 969 * This function is called from the run method of all personalities that do not 970 * support bitmaps. It prints an error message and returns non-zero if mddev 971 * has a bitmap. Otherwise, it returns 0. 972 * 973 */ 974 int md_check_no_bitmap(struct mddev *mddev) 975 { 976 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) 977 return 0; 978 pr_warn("%s: bitmaps are not supported for %s\n", 979 mdname(mddev), mddev->pers->name); 980 return 1; 981 } 982 EXPORT_SYMBOL(md_check_no_bitmap); 983 984 /* 985 * load_super for 0.90.0 986 */ 987 static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 988 { 989 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 990 mdp_super_t *sb; 991 int ret; 992 993 /* 994 * Calculate the position of the superblock (512byte sectors), 995 * it's at the end of the disk. 996 * 997 * It also happens to be a multiple of 4Kb. 
998 */ 999 rdev->sb_start = calc_dev_sboffset(rdev); 1000 1001 ret = read_disk_sb(rdev, MD_SB_BYTES); 1002 if (ret) 1003 return ret; 1004 1005 ret = -EINVAL; 1006 1007 bdevname(rdev->bdev, b); 1008 sb = page_address(rdev->sb_page); 1009 1010 if (sb->md_magic != MD_SB_MAGIC) { 1011 pr_warn("md: invalid raid superblock magic on %s\n", b); 1012 goto abort; 1013 } 1014 1015 if (sb->major_version != 0 || 1016 sb->minor_version < 90 || 1017 sb->minor_version > 91) { 1018 pr_warn("Bad version number %d.%d on %s\n", 1019 sb->major_version, sb->minor_version, b); 1020 goto abort; 1021 } 1022 1023 if (sb->raid_disks <= 0) 1024 goto abort; 1025 1026 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { 1027 pr_warn("md: invalid superblock checksum on %s\n", b); 1028 goto abort; 1029 } 1030 1031 rdev->preferred_minor = sb->md_minor; 1032 rdev->data_offset = 0; 1033 rdev->new_data_offset = 0; 1034 rdev->sb_size = MD_SB_BYTES; 1035 rdev->badblocks.shift = -1; 1036 1037 if (sb->level == LEVEL_MULTIPATH) 1038 rdev->desc_nr = -1; 1039 else 1040 rdev->desc_nr = sb->this_disk.number; 1041 1042 if (!refdev) { 1043 ret = 1; 1044 } else { 1045 __u64 ev1, ev2; 1046 mdp_super_t *refsb = page_address(refdev->sb_page); 1047 if (!md_uuid_equal(refsb, sb)) { 1048 pr_warn("md: %s has different UUID to %s\n", 1049 b, bdevname(refdev->bdev,b2)); 1050 goto abort; 1051 } 1052 if (!md_sb_equal(refsb, sb)) { 1053 pr_warn("md: %s has same UUID but different superblock to %s\n", 1054 b, bdevname(refdev->bdev, b2)); 1055 goto abort; 1056 } 1057 ev1 = md_event(sb); 1058 ev2 = md_event(refsb); 1059 if (ev1 > ev2) 1060 ret = 1; 1061 else 1062 ret = 0; 1063 } 1064 rdev->sectors = rdev->sb_start; 1065 /* Limit to 4TB as metadata cannot record more than that. 1066 * (not needed for Linear and RAID0 as metadata doesn't 1067 * record this size) 1068 */ 1069 if (IS_ENABLED(CONFIG_LBDAF) && (u64)rdev->sectors >= (2ULL << 32) && 1070 sb->level >= 1) 1071 rdev->sectors = (sector_t)(2ULL << 32) - 2; 1072 1073 if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) 1074 /* "this cannot possibly happen" ... 
*/ 1075 ret = -EINVAL; 1076 1077 abort: 1078 return ret; 1079 } 1080 1081 /* 1082 * validate_super for 0.90.0 1083 */ 1084 static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) 1085 { 1086 mdp_disk_t *desc; 1087 mdp_super_t *sb = page_address(rdev->sb_page); 1088 __u64 ev1 = md_event(sb); 1089 1090 rdev->raid_disk = -1; 1091 clear_bit(Faulty, &rdev->flags); 1092 clear_bit(In_sync, &rdev->flags); 1093 clear_bit(Bitmap_sync, &rdev->flags); 1094 clear_bit(WriteMostly, &rdev->flags); 1095 1096 if (mddev->raid_disks == 0) { 1097 mddev->major_version = 0; 1098 mddev->minor_version = sb->minor_version; 1099 mddev->patch_version = sb->patch_version; 1100 mddev->external = 0; 1101 mddev->chunk_sectors = sb->chunk_size >> 9; 1102 mddev->ctime = sb->ctime; 1103 mddev->utime = sb->utime; 1104 mddev->level = sb->level; 1105 mddev->clevel[0] = 0; 1106 mddev->layout = sb->layout; 1107 mddev->raid_disks = sb->raid_disks; 1108 mddev->dev_sectors = ((sector_t)sb->size) * 2; 1109 mddev->events = ev1; 1110 mddev->bitmap_info.offset = 0; 1111 mddev->bitmap_info.space = 0; 1112 /* bitmap can use 60 K after the 4K superblocks */ 1113 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 1114 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 1115 mddev->reshape_backwards = 0; 1116 1117 if (mddev->minor_version >= 91) { 1118 mddev->reshape_position = sb->reshape_position; 1119 mddev->delta_disks = sb->delta_disks; 1120 mddev->new_level = sb->new_level; 1121 mddev->new_layout = sb->new_layout; 1122 mddev->new_chunk_sectors = sb->new_chunk >> 9; 1123 if (mddev->delta_disks < 0) 1124 mddev->reshape_backwards = 1; 1125 } else { 1126 mddev->reshape_position = MaxSector; 1127 mddev->delta_disks = 0; 1128 mddev->new_level = mddev->level; 1129 mddev->new_layout = mddev->layout; 1130 mddev->new_chunk_sectors = mddev->chunk_sectors; 1131 } 1132 1133 if (sb->state & (1<<MD_SB_CLEAN)) 1134 mddev->recovery_cp = MaxSector; 1135 else { 1136 if (sb->events_hi == sb->cp_events_hi && 1137 sb->events_lo == sb->cp_events_lo) { 1138 mddev->recovery_cp = sb->recovery_cp; 1139 } else 1140 mddev->recovery_cp = 0; 1141 } 1142 1143 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 1144 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 1145 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 1146 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 1147 1148 mddev->max_disks = MD_SB_DISKS; 1149 1150 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 1151 mddev->bitmap_info.file == NULL) { 1152 mddev->bitmap_info.offset = 1153 mddev->bitmap_info.default_offset; 1154 mddev->bitmap_info.space = 1155 mddev->bitmap_info.default_space; 1156 } 1157 1158 } else if (mddev->pers == NULL) { 1159 /* Insist on good event counter while assembling, except 1160 * for spares (which don't need an event count) */ 1161 ++ev1; 1162 if (sb->disks[rdev->desc_nr].state & ( 1163 (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) 1164 if (ev1 < mddev->events) 1165 return -EINVAL; 1166 } else if (mddev->bitmap) { 1167 /* if adding to array with a bitmap, then we can accept an 1168 * older device ... but not too old. 
1169 */ 1170 if (ev1 < mddev->bitmap->events_cleared) 1171 return 0; 1172 if (ev1 < mddev->events) 1173 set_bit(Bitmap_sync, &rdev->flags); 1174 } else { 1175 if (ev1 < mddev->events) 1176 /* just a hot-add of a new device, leave raid_disk at -1 */ 1177 return 0; 1178 } 1179 1180 if (mddev->level != LEVEL_MULTIPATH) { 1181 desc = sb->disks + rdev->desc_nr; 1182 1183 if (desc->state & (1<<MD_DISK_FAULTY)) 1184 set_bit(Faulty, &rdev->flags); 1185 else if (desc->state & (1<<MD_DISK_SYNC) /* && 1186 desc->raid_disk < mddev->raid_disks */) { 1187 set_bit(In_sync, &rdev->flags); 1188 rdev->raid_disk = desc->raid_disk; 1189 rdev->saved_raid_disk = desc->raid_disk; 1190 } else if (desc->state & (1<<MD_DISK_ACTIVE)) { 1191 /* active but not in sync implies recovery up to 1192 * reshape position. We don't know exactly where 1193 * that is, so set to zero for now */ 1194 if (mddev->minor_version >= 91) { 1195 rdev->recovery_offset = 0; 1196 rdev->raid_disk = desc->raid_disk; 1197 } 1198 } 1199 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 1200 set_bit(WriteMostly, &rdev->flags); 1201 if (desc->state & (1<<MD_DISK_FAILFAST)) 1202 set_bit(FailFast, &rdev->flags); 1203 } else /* MULTIPATH are always insync */ 1204 set_bit(In_sync, &rdev->flags); 1205 return 0; 1206 } 1207 1208 /* 1209 * sync_super for 0.90.0 1210 */ 1211 static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) 1212 { 1213 mdp_super_t *sb; 1214 struct md_rdev *rdev2; 1215 int next_spare = mddev->raid_disks; 1216 1217 /* make rdev->sb match mddev data.. 1218 * 1219 * 1/ zero out disks 1220 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 1221 * 3/ any empty disks < next_spare become removed 1222 * 1223 * disks[0] gets initialised to REMOVED because 1224 * we cannot be sure from other fields if it has 1225 * been initialised or not. 
1226 */ 1227 int i; 1228 int active=0, working=0,failed=0,spare=0,nr_disks=0; 1229 1230 rdev->sb_size = MD_SB_BYTES; 1231 1232 sb = page_address(rdev->sb_page); 1233 1234 memset(sb, 0, sizeof(*sb)); 1235 1236 sb->md_magic = MD_SB_MAGIC; 1237 sb->major_version = mddev->major_version; 1238 sb->patch_version = mddev->patch_version; 1239 sb->gvalid_words = 0; /* ignored */ 1240 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 1241 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 1242 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 1243 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 1244 1245 sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); 1246 sb->level = mddev->level; 1247 sb->size = mddev->dev_sectors / 2; 1248 sb->raid_disks = mddev->raid_disks; 1249 sb->md_minor = mddev->md_minor; 1250 sb->not_persistent = 0; 1251 sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 1252 sb->state = 0; 1253 sb->events_hi = (mddev->events>>32); 1254 sb->events_lo = (u32)mddev->events; 1255 1256 if (mddev->reshape_position == MaxSector) 1257 sb->minor_version = 90; 1258 else { 1259 sb->minor_version = 91; 1260 sb->reshape_position = mddev->reshape_position; 1261 sb->new_level = mddev->new_level; 1262 sb->delta_disks = mddev->delta_disks; 1263 sb->new_layout = mddev->new_layout; 1264 sb->new_chunk = mddev->new_chunk_sectors << 9; 1265 } 1266 mddev->minor_version = sb->minor_version; 1267 if (mddev->in_sync) 1268 { 1269 sb->recovery_cp = mddev->recovery_cp; 1270 sb->cp_events_hi = (mddev->events>>32); 1271 sb->cp_events_lo = (u32)mddev->events; 1272 if (mddev->recovery_cp == MaxSector) 1273 sb->state = (1<< MD_SB_CLEAN); 1274 } else 1275 sb->recovery_cp = 0; 1276 1277 sb->layout = mddev->layout; 1278 sb->chunk_size = mddev->chunk_sectors << 9; 1279 1280 if (mddev->bitmap && mddev->bitmap_info.file == NULL) 1281 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 1282 1283 sb->disks[0].state = (1<<MD_DISK_REMOVED); 1284 rdev_for_each(rdev2, mddev) { 1285 mdp_disk_t *d; 1286 int desc_nr; 1287 int is_active = test_bit(In_sync, &rdev2->flags); 1288 1289 if (rdev2->raid_disk >= 0 && 1290 sb->minor_version >= 91) 1291 /* we have nowhere to store the recovery_offset, 1292 * but if it is not below the reshape_position, 1293 * we can piggy-back on that. 
1294 */ 1295 is_active = 1; 1296 if (rdev2->raid_disk < 0 || 1297 test_bit(Faulty, &rdev2->flags)) 1298 is_active = 0; 1299 if (is_active) 1300 desc_nr = rdev2->raid_disk; 1301 else 1302 desc_nr = next_spare++; 1303 rdev2->desc_nr = desc_nr; 1304 d = &sb->disks[rdev2->desc_nr]; 1305 nr_disks++; 1306 d->number = rdev2->desc_nr; 1307 d->major = MAJOR(rdev2->bdev->bd_dev); 1308 d->minor = MINOR(rdev2->bdev->bd_dev); 1309 if (is_active) 1310 d->raid_disk = rdev2->raid_disk; 1311 else 1312 d->raid_disk = rdev2->desc_nr; /* compatibility */ 1313 if (test_bit(Faulty, &rdev2->flags)) 1314 d->state = (1<<MD_DISK_FAULTY); 1315 else if (is_active) { 1316 d->state = (1<<MD_DISK_ACTIVE); 1317 if (test_bit(In_sync, &rdev2->flags)) 1318 d->state |= (1<<MD_DISK_SYNC); 1319 active++; 1320 working++; 1321 } else { 1322 d->state = 0; 1323 spare++; 1324 working++; 1325 } 1326 if (test_bit(WriteMostly, &rdev2->flags)) 1327 d->state |= (1<<MD_DISK_WRITEMOSTLY); 1328 if (test_bit(FailFast, &rdev2->flags)) 1329 d->state |= (1<<MD_DISK_FAILFAST); 1330 } 1331 /* now set the "removed" and "faulty" bits on any missing devices */ 1332 for (i=0 ; i < mddev->raid_disks ; i++) { 1333 mdp_disk_t *d = &sb->disks[i]; 1334 if (d->state == 0 && d->number == 0) { 1335 d->number = i; 1336 d->raid_disk = i; 1337 d->state = (1<<MD_DISK_REMOVED); 1338 d->state |= (1<<MD_DISK_FAULTY); 1339 failed++; 1340 } 1341 } 1342 sb->nr_disks = nr_disks; 1343 sb->active_disks = active; 1344 sb->working_disks = working; 1345 sb->failed_disks = failed; 1346 sb->spare_disks = spare; 1347 1348 sb->this_disk = sb->disks[rdev->desc_nr]; 1349 sb->sb_csum = calc_sb_csum(sb); 1350 } 1351 1352 /* 1353 * rdev_size_change for 0.90.0 1354 */ 1355 static unsigned long long 1356 super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 1357 { 1358 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1359 return 0; /* component must fit device */ 1360 if (rdev->mddev->bitmap_info.offset) 1361 return 0; /* can't move bitmap */ 1362 rdev->sb_start = calc_dev_sboffset(rdev); 1363 if (!num_sectors || num_sectors > rdev->sb_start) 1364 num_sectors = rdev->sb_start; 1365 /* Limit to 4TB as metadata cannot record more than that. 1366 * 4TB == 2^32 KB, or 2*2^32 sectors. 
1367 */ 1368 if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) && 1369 rdev->mddev->level >= 1) 1370 num_sectors = (sector_t)(2ULL << 32) - 2; 1371 do { 1372 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1373 rdev->sb_page); 1374 } while (md_super_wait(rdev->mddev) < 0); 1375 return num_sectors; 1376 } 1377 1378 static int 1379 super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) 1380 { 1381 /* non-zero offset changes not possible with v0.90 */ 1382 return new_offset == 0; 1383 } 1384 1385 /* 1386 * version 1 superblock 1387 */ 1388 1389 static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb) 1390 { 1391 __le32 disk_csum; 1392 u32 csum; 1393 unsigned long long newcsum; 1394 int size = 256 + le32_to_cpu(sb->max_dev)*2; 1395 __le32 *isuper = (__le32*)sb; 1396 1397 disk_csum = sb->sb_csum; 1398 sb->sb_csum = 0; 1399 newcsum = 0; 1400 for (; size >= 4; size -= 4) 1401 newcsum += le32_to_cpu(*isuper++); 1402 1403 if (size == 2) 1404 newcsum += le16_to_cpu(*(__le16*) isuper); 1405 1406 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 1407 sb->sb_csum = disk_csum; 1408 return cpu_to_le32(csum); 1409 } 1410 1411 static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 1412 { 1413 struct mdp_superblock_1 *sb; 1414 int ret; 1415 sector_t sb_start; 1416 sector_t sectors; 1417 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1418 int bmask; 1419 1420 /* 1421 * Calculate the position of the superblock in 512byte sectors. 1422 * It is always aligned to a 4K boundary and 1423 * depeding on minor_version, it can be: 1424 * 0: At least 8K, but less than 12K, from end of device 1425 * 1: At start of device 1426 * 2: 4K from start of device. 1427 */ 1428 switch(minor_version) { 1429 case 0: 1430 sb_start = i_size_read(rdev->bdev->bd_inode) >> 9; 1431 sb_start -= 8*2; 1432 sb_start &= ~(sector_t)(4*2-1); 1433 break; 1434 case 1: 1435 sb_start = 0; 1436 break; 1437 case 2: 1438 sb_start = 8; 1439 break; 1440 default: 1441 return -EINVAL; 1442 } 1443 rdev->sb_start = sb_start; 1444 1445 /* superblock is rarely larger than 1K, but it can be larger, 1446 * and it is safe to read 4k, so we do that 1447 */ 1448 ret = read_disk_sb(rdev, 4096); 1449 if (ret) return ret; 1450 1451 sb = page_address(rdev->sb_page); 1452 1453 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1454 sb->major_version != cpu_to_le32(1) || 1455 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1456 le64_to_cpu(sb->super_offset) != rdev->sb_start || 1457 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1458 return -EINVAL; 1459 1460 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1461 pr_warn("md: invalid superblock checksum on %s\n", 1462 bdevname(rdev->bdev,b)); 1463 return -EINVAL; 1464 } 1465 if (le64_to_cpu(sb->data_size) < 10) { 1466 pr_warn("md: data_size too small on %s\n", 1467 bdevname(rdev->bdev,b)); 1468 return -EINVAL; 1469 } 1470 if (sb->pad0 || 1471 sb->pad3[0] || 1472 memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) 1473 /* Some padding is non-zero, might be a new feature */ 1474 return -EINVAL; 1475 1476 rdev->preferred_minor = 0xffff; 1477 rdev->data_offset = le64_to_cpu(sb->data_offset); 1478 rdev->new_data_offset = rdev->data_offset; 1479 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && 1480 (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) 1481 rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset); 1482 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1483 1484 rdev->sb_size = 
le32_to_cpu(sb->max_dev) * 2 + 256; 1485 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 1486 if (rdev->sb_size & bmask) 1487 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1488 1489 if (minor_version 1490 && rdev->data_offset < sb_start + (rdev->sb_size/512)) 1491 return -EINVAL; 1492 if (minor_version 1493 && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) 1494 return -EINVAL; 1495 1496 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) 1497 rdev->desc_nr = -1; 1498 else 1499 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1500 1501 if (!rdev->bb_page) { 1502 rdev->bb_page = alloc_page(GFP_KERNEL); 1503 if (!rdev->bb_page) 1504 return -ENOMEM; 1505 } 1506 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && 1507 rdev->badblocks.count == 0) { 1508 /* need to load the bad block list. 1509 * Currently we limit it to one page. 1510 */ 1511 s32 offset; 1512 sector_t bb_sector; 1513 u64 *bbp; 1514 int i; 1515 int sectors = le16_to_cpu(sb->bblog_size); 1516 if (sectors > (PAGE_SIZE / 512)) 1517 return -EINVAL; 1518 offset = le32_to_cpu(sb->bblog_offset); 1519 if (offset == 0) 1520 return -EINVAL; 1521 bb_sector = (long long)offset; 1522 if (!sync_page_io(rdev, bb_sector, sectors << 9, 1523 rdev->bb_page, REQ_OP_READ, 0, true)) 1524 return -EIO; 1525 bbp = (u64 *)page_address(rdev->bb_page); 1526 rdev->badblocks.shift = sb->bblog_shift; 1527 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { 1528 u64 bb = le64_to_cpu(*bbp); 1529 int count = bb & (0x3ff); 1530 u64 sector = bb >> 10; 1531 sector <<= sb->bblog_shift; 1532 count <<= sb->bblog_shift; 1533 if (bb + 1 == 0) 1534 break; 1535 if (badblocks_set(&rdev->badblocks, sector, count, 1)) 1536 return -EINVAL; 1537 } 1538 } else if (sb->bblog_offset != 0) 1539 rdev->badblocks.shift = 0; 1540 1541 if ((le32_to_cpu(sb->feature_map) & 1542 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) { 1543 rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset); 1544 rdev->ppl.size = le16_to_cpu(sb->ppl.size); 1545 rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset; 1546 } 1547 1548 if (!refdev) { 1549 ret = 1; 1550 } else { 1551 __u64 ev1, ev2; 1552 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); 1553 1554 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1555 sb->level != refsb->level || 1556 sb->layout != refsb->layout || 1557 sb->chunksize != refsb->chunksize) { 1558 pr_warn("md: %s has strangely different superblock to %s\n", 1559 bdevname(rdev->bdev,b), 1560 bdevname(refdev->bdev,b2)); 1561 return -EINVAL; 1562 } 1563 ev1 = le64_to_cpu(sb->events); 1564 ev2 = le64_to_cpu(refsb->events); 1565 1566 if (ev1 > ev2) 1567 ret = 1; 1568 else 1569 ret = 0; 1570 } 1571 if (minor_version) { 1572 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9); 1573 sectors -= rdev->data_offset; 1574 } else 1575 sectors = rdev->sb_start; 1576 if (sectors < le64_to_cpu(sb->data_size)) 1577 return -EINVAL; 1578 rdev->sectors = le64_to_cpu(sb->data_size); 1579 return ret; 1580 } 1581 1582 static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) 1583 { 1584 struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 1585 __u64 ev1 = le64_to_cpu(sb->events); 1586 1587 rdev->raid_disk = -1; 1588 clear_bit(Faulty, &rdev->flags); 1589 clear_bit(In_sync, &rdev->flags); 1590 clear_bit(Bitmap_sync, &rdev->flags); 1591 clear_bit(WriteMostly, &rdev->flags); 1592 1593 if (mddev->raid_disks == 0) { 1594 mddev->major_version = 1; 1595 mddev->patch_version = 0; 1596 mddev->external = 0; 1597 mddev->chunk_sectors = le32_to_cpu(sb->chunksize); 
1598 mddev->ctime = le64_to_cpu(sb->ctime); 1599 mddev->utime = le64_to_cpu(sb->utime); 1600 mddev->level = le32_to_cpu(sb->level); 1601 mddev->clevel[0] = 0; 1602 mddev->layout = le32_to_cpu(sb->layout); 1603 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1604 mddev->dev_sectors = le64_to_cpu(sb->size); 1605 mddev->events = ev1; 1606 mddev->bitmap_info.offset = 0; 1607 mddev->bitmap_info.space = 0; 1608 /* Default location for bitmap is 1K after superblock 1609 * using 3K - total of 4K 1610 */ 1611 mddev->bitmap_info.default_offset = 1024 >> 9; 1612 mddev->bitmap_info.default_space = (4096-1024) >> 9; 1613 mddev->reshape_backwards = 0; 1614 1615 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 1616 memcpy(mddev->uuid, sb->set_uuid, 16); 1617 1618 mddev->max_disks = (4096-256)/2; 1619 1620 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1621 mddev->bitmap_info.file == NULL) { 1622 mddev->bitmap_info.offset = 1623 (__s32)le32_to_cpu(sb->bitmap_offset); 1624 /* Metadata doesn't record how much space is available. 1625 * For 1.0, we assume we can use up to the superblock 1626 * if before, else to 4K beyond superblock. 1627 * For others, assume no change is possible. 1628 */ 1629 if (mddev->minor_version > 0) 1630 mddev->bitmap_info.space = 0; 1631 else if (mddev->bitmap_info.offset > 0) 1632 mddev->bitmap_info.space = 1633 8 - mddev->bitmap_info.offset; 1634 else 1635 mddev->bitmap_info.space = 1636 -mddev->bitmap_info.offset; 1637 } 1638 1639 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 1640 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 1641 mddev->delta_disks = le32_to_cpu(sb->delta_disks); 1642 mddev->new_level = le32_to_cpu(sb->new_level); 1643 mddev->new_layout = le32_to_cpu(sb->new_layout); 1644 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); 1645 if (mddev->delta_disks < 0 || 1646 (mddev->delta_disks == 0 && 1647 (le32_to_cpu(sb->feature_map) 1648 & MD_FEATURE_RESHAPE_BACKWARDS))) 1649 mddev->reshape_backwards = 1; 1650 } else { 1651 mddev->reshape_position = MaxSector; 1652 mddev->delta_disks = 0; 1653 mddev->new_level = mddev->level; 1654 mddev->new_layout = mddev->layout; 1655 mddev->new_chunk_sectors = mddev->chunk_sectors; 1656 } 1657 1658 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) 1659 set_bit(MD_HAS_JOURNAL, &mddev->flags); 1660 1661 if (le32_to_cpu(sb->feature_map) & 1662 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) { 1663 if (le32_to_cpu(sb->feature_map) & 1664 (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL)) 1665 return -EINVAL; 1666 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) && 1667 (le32_to_cpu(sb->feature_map) & 1668 MD_FEATURE_MULTIPLE_PPLS)) 1669 return -EINVAL; 1670 set_bit(MD_HAS_PPL, &mddev->flags); 1671 } 1672 } else if (mddev->pers == NULL) { 1673 /* Insist of good event counter while assembling, except for 1674 * spares (which don't need an event count) */ 1675 ++ev1; 1676 if (rdev->desc_nr >= 0 && 1677 rdev->desc_nr < le32_to_cpu(sb->max_dev) && 1678 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || 1679 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) 1680 if (ev1 < mddev->events) 1681 return -EINVAL; 1682 } else if (mddev->bitmap) { 1683 /* If adding to array with a bitmap, then we can accept an 1684 * older device, but not too old. 
1685 */ 1686 if (ev1 < mddev->bitmap->events_cleared) 1687 return 0; 1688 if (ev1 < mddev->events) 1689 set_bit(Bitmap_sync, &rdev->flags); 1690 } else { 1691 if (ev1 < mddev->events) 1692 /* just a hot-add of a new device, leave raid_disk at -1 */ 1693 return 0; 1694 } 1695 if (mddev->level != LEVEL_MULTIPATH) { 1696 int role; 1697 if (rdev->desc_nr < 0 || 1698 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { 1699 role = MD_DISK_ROLE_SPARE; 1700 rdev->desc_nr = -1; 1701 } else 1702 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1703 switch(role) { 1704 case MD_DISK_ROLE_SPARE: /* spare */ 1705 break; 1706 case MD_DISK_ROLE_FAULTY: /* faulty */ 1707 set_bit(Faulty, &rdev->flags); 1708 break; 1709 case MD_DISK_ROLE_JOURNAL: /* journal device */ 1710 if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { 1711 /* journal device without journal feature */ 1712 pr_warn("md: journal device provided without journal feature, ignoring the device\n"); 1713 return -EINVAL; 1714 } 1715 set_bit(Journal, &rdev->flags); 1716 rdev->journal_tail = le64_to_cpu(sb->journal_tail); 1717 rdev->raid_disk = 0; 1718 break; 1719 default: 1720 rdev->saved_raid_disk = role; 1721 if ((le32_to_cpu(sb->feature_map) & 1722 MD_FEATURE_RECOVERY_OFFSET)) { 1723 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 1724 if (!(le32_to_cpu(sb->feature_map) & 1725 MD_FEATURE_RECOVERY_BITMAP)) 1726 rdev->saved_raid_disk = -1; 1727 } else 1728 set_bit(In_sync, &rdev->flags); 1729 rdev->raid_disk = role; 1730 break; 1731 } 1732 if (sb->devflags & WriteMostly1) 1733 set_bit(WriteMostly, &rdev->flags); 1734 if (sb->devflags & FailFast1) 1735 set_bit(FailFast, &rdev->flags); 1736 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) 1737 set_bit(Replacement, &rdev->flags); 1738 } else /* MULTIPATH are always insync */ 1739 set_bit(In_sync, &rdev->flags); 1740 1741 return 0; 1742 } 1743 1744 static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) 1745 { 1746 struct mdp_superblock_1 *sb; 1747 struct md_rdev *rdev2; 1748 int max_dev, i; 1749 /* make rdev->sb match mddev and rdev data. 
*/ 1750 1751 sb = page_address(rdev->sb_page); 1752 1753 sb->feature_map = 0; 1754 sb->pad0 = 0; 1755 sb->recovery_offset = cpu_to_le64(0); 1756 memset(sb->pad3, 0, sizeof(sb->pad3)); 1757 1758 sb->utime = cpu_to_le64((__u64)mddev->utime); 1759 sb->events = cpu_to_le64(mddev->events); 1760 if (mddev->in_sync) 1761 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 1762 else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) 1763 sb->resync_offset = cpu_to_le64(MaxSector); 1764 else 1765 sb->resync_offset = cpu_to_le64(0); 1766 1767 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); 1768 1769 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 1770 sb->size = cpu_to_le64(mddev->dev_sectors); 1771 sb->chunksize = cpu_to_le32(mddev->chunk_sectors); 1772 sb->level = cpu_to_le32(mddev->level); 1773 sb->layout = cpu_to_le32(mddev->layout); 1774 if (test_bit(FailFast, &rdev->flags)) 1775 sb->devflags |= FailFast1; 1776 else 1777 sb->devflags &= ~FailFast1; 1778 1779 if (test_bit(WriteMostly, &rdev->flags)) 1780 sb->devflags |= WriteMostly1; 1781 else 1782 sb->devflags &= ~WriteMostly1; 1783 sb->data_offset = cpu_to_le64(rdev->data_offset); 1784 sb->data_size = cpu_to_le64(rdev->sectors); 1785 1786 if (mddev->bitmap && mddev->bitmap_info.file == NULL) { 1787 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); 1788 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1789 } 1790 1791 if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && 1792 !test_bit(In_sync, &rdev->flags)) { 1793 sb->feature_map |= 1794 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 1795 sb->recovery_offset = 1796 cpu_to_le64(rdev->recovery_offset); 1797 if (rdev->saved_raid_disk >= 0 && mddev->bitmap) 1798 sb->feature_map |= 1799 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); 1800 } 1801 /* Note: recovery_offset and journal_tail share space */ 1802 if (test_bit(Journal, &rdev->flags)) 1803 sb->journal_tail = cpu_to_le64(rdev->journal_tail); 1804 if (test_bit(Replacement, &rdev->flags)) 1805 sb->feature_map |= 1806 cpu_to_le32(MD_FEATURE_REPLACEMENT); 1807 1808 if (mddev->reshape_position != MaxSector) { 1809 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 1810 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 1811 sb->new_layout = cpu_to_le32(mddev->new_layout); 1812 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 1813 sb->new_level = cpu_to_le32(mddev->new_level); 1814 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 1815 if (mddev->delta_disks == 0 && 1816 mddev->reshape_backwards) 1817 sb->feature_map 1818 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); 1819 if (rdev->new_data_offset != rdev->data_offset) { 1820 sb->feature_map 1821 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); 1822 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset 1823 - rdev->data_offset)); 1824 } 1825 } 1826 1827 if (mddev_is_clustered(mddev)) 1828 sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); 1829 1830 if (rdev->badblocks.count == 0) 1831 /* Nothing to do for bad blocks*/ ; 1832 else if (sb->bblog_offset == 0) 1833 /* Cannot record bad blocks on this device */ 1834 md_error(mddev, rdev); 1835 else { 1836 struct badblocks *bb = &rdev->badblocks; 1837 u64 *bbp = (u64 *)page_address(rdev->bb_page); 1838 u64 *p = bb->page; 1839 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); 1840 if (bb->changed) { 1841 unsigned seq; 1842 1843 retry: 1844 seq = read_seqbegin(&bb->lock); 1845 1846 memset(bbp, 0xff, PAGE_SIZE); 1847 1848 for (i = 0 ; i < bb->count ; i++) { 1849 u64 internal_bb = 
p[i]; 1850 u64 store_bb = ((BB_OFFSET(internal_bb) << 10) 1851 | BB_LEN(internal_bb)); 1852 bbp[i] = cpu_to_le64(store_bb); 1853 } 1854 bb->changed = 0; 1855 if (read_seqretry(&bb->lock, seq)) 1856 goto retry; 1857 1858 bb->sector = (rdev->sb_start + 1859 (int)le32_to_cpu(sb->bblog_offset)); 1860 bb->size = le16_to_cpu(sb->bblog_size); 1861 } 1862 } 1863 1864 max_dev = 0; 1865 rdev_for_each(rdev2, mddev) 1866 if (rdev2->desc_nr+1 > max_dev) 1867 max_dev = rdev2->desc_nr+1; 1868 1869 if (max_dev > le32_to_cpu(sb->max_dev)) { 1870 int bmask; 1871 sb->max_dev = cpu_to_le32(max_dev); 1872 rdev->sb_size = max_dev * 2 + 256; 1873 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 1874 if (rdev->sb_size & bmask) 1875 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1876 } else 1877 max_dev = le32_to_cpu(sb->max_dev); 1878 1879 for (i=0; i<max_dev;i++) 1880 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 1881 1882 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) 1883 sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); 1884 1885 if (test_bit(MD_HAS_PPL, &mddev->flags)) { 1886 if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags)) 1887 sb->feature_map |= 1888 cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS); 1889 else 1890 sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); 1891 sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); 1892 sb->ppl.size = cpu_to_le16(rdev->ppl.size); 1893 } 1894 1895 rdev_for_each(rdev2, mddev) { 1896 i = rdev2->desc_nr; 1897 if (test_bit(Faulty, &rdev2->flags)) 1898 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); 1899 else if (test_bit(In_sync, &rdev2->flags)) 1900 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1901 else if (test_bit(Journal, &rdev2->flags)) 1902 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); 1903 else if (rdev2->raid_disk >= 0) 1904 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1905 else 1906 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 1907 } 1908 1909 sb->sb_csum = calc_sb_1_csum(sb); 1910 } 1911 1912 static unsigned long long 1913 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 1914 { 1915 struct mdp_superblock_1 *sb; 1916 sector_t max_sectors; 1917 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1918 return 0; /* component must fit device */ 1919 if (rdev->data_offset != rdev->new_data_offset) 1920 return 0; /* too confusing */ 1921 if (rdev->sb_start < rdev->data_offset) { 1922 /* minor versions 1 and 2; superblock before data */ 1923 max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9; 1924 max_sectors -= rdev->data_offset; 1925 if (!num_sectors || num_sectors > max_sectors) 1926 num_sectors = max_sectors; 1927 } else if (rdev->mddev->bitmap_info.offset) { 1928 /* minor version 0 with bitmap we can't move */ 1929 return 0; 1930 } else { 1931 /* minor version 0; superblock after data */ 1932 sector_t sb_start; 1933 sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2; 1934 sb_start &= ~(sector_t)(4*2 - 1); 1935 max_sectors = rdev->sectors + sb_start - rdev->sb_start; 1936 if (!num_sectors || num_sectors > max_sectors) 1937 num_sectors = max_sectors; 1938 rdev->sb_start = sb_start; 1939 } 1940 sb = page_address(rdev->sb_page); 1941 sb->data_size = cpu_to_le64(num_sectors); 1942 sb->super_offset = cpu_to_le64(rdev->sb_start); 1943 sb->sb_csum = calc_sb_1_csum(sb); 1944 do { 1945 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1946 rdev->sb_page); 1947 } while (md_super_wait(rdev->mddev) < 0); 1948 return num_sectors; 1949 1950 } 1951 1952 static int 1953 
super_1_allow_new_offset(struct md_rdev *rdev, 1954 unsigned long long new_offset) 1955 { 1956 /* All necessary checks on new >= old have been done */ 1957 struct bitmap *bitmap; 1958 if (new_offset >= rdev->data_offset) 1959 return 1; 1960 1961 /* with 1.0 metadata, there is no metadata to tread on 1962 * so we can always move back */ 1963 if (rdev->mddev->minor_version == 0) 1964 return 1; 1965 1966 /* otherwise we must be sure not to step on 1967 * any metadata, so stay: 1968 * 36K beyond start of superblock 1969 * beyond end of badblocks 1970 * beyond write-intent bitmap 1971 */ 1972 if (rdev->sb_start + (32+4)*2 > new_offset) 1973 return 0; 1974 bitmap = rdev->mddev->bitmap; 1975 if (bitmap && !rdev->mddev->bitmap_info.file && 1976 rdev->sb_start + rdev->mddev->bitmap_info.offset + 1977 bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) 1978 return 0; 1979 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) 1980 return 0; 1981 1982 return 1; 1983 } 1984 1985 static struct super_type super_types[] = { 1986 [0] = { 1987 .name = "0.90.0", 1988 .owner = THIS_MODULE, 1989 .load_super = super_90_load, 1990 .validate_super = super_90_validate, 1991 .sync_super = super_90_sync, 1992 .rdev_size_change = super_90_rdev_size_change, 1993 .allow_new_offset = super_90_allow_new_offset, 1994 }, 1995 [1] = { 1996 .name = "md-1", 1997 .owner = THIS_MODULE, 1998 .load_super = super_1_load, 1999 .validate_super = super_1_validate, 2000 .sync_super = super_1_sync, 2001 .rdev_size_change = super_1_rdev_size_change, 2002 .allow_new_offset = super_1_allow_new_offset, 2003 }, 2004 }; 2005 2006 static void sync_super(struct mddev *mddev, struct md_rdev *rdev) 2007 { 2008 if (mddev->sync_super) { 2009 mddev->sync_super(mddev, rdev); 2010 return; 2011 } 2012 2013 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); 2014 2015 super_types[mddev->major_version].sync_super(mddev, rdev); 2016 } 2017 2018 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) 2019 { 2020 struct md_rdev *rdev, *rdev2; 2021 2022 rcu_read_lock(); 2023 rdev_for_each_rcu(rdev, mddev1) { 2024 if (test_bit(Faulty, &rdev->flags) || 2025 test_bit(Journal, &rdev->flags) || 2026 rdev->raid_disk == -1) 2027 continue; 2028 rdev_for_each_rcu(rdev2, mddev2) { 2029 if (test_bit(Faulty, &rdev2->flags) || 2030 test_bit(Journal, &rdev2->flags) || 2031 rdev2->raid_disk == -1) 2032 continue; 2033 if (rdev->bdev->bd_contains == 2034 rdev2->bdev->bd_contains) { 2035 rcu_read_unlock(); 2036 return 1; 2037 } 2038 } 2039 } 2040 rcu_read_unlock(); 2041 return 0; 2042 } 2043 2044 static LIST_HEAD(pending_raid_disks); 2045 2046 /* 2047 * Try to register data integrity profile for an mddev 2048 * 2049 * This is called when an array is started and after a disk has been kicked 2050 * from the array. It only succeeds if all working and active component devices 2051 * are integrity capable with matching profiles. 
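 * Returns 0 on success or when there is nothing to do; -EINVAL if a component's integrity profile does not match the reference device or if the integrity bio pool cannot be allocated.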
2052 */ 2053 int md_integrity_register(struct mddev *mddev) 2054 { 2055 struct md_rdev *rdev, *reference = NULL; 2056 2057 if (list_empty(&mddev->disks)) 2058 return 0; /* nothing to do */ 2059 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) 2060 return 0; /* shouldn't register, or already is */ 2061 rdev_for_each(rdev, mddev) { 2062 /* skip spares and non-functional disks */ 2063 if (test_bit(Faulty, &rdev->flags)) 2064 continue; 2065 if (rdev->raid_disk < 0) 2066 continue; 2067 if (!reference) { 2068 /* Use the first rdev as the reference */ 2069 reference = rdev; 2070 continue; 2071 } 2072 /* does this rdev's profile match the reference profile? */ 2073 if (blk_integrity_compare(reference->bdev->bd_disk, 2074 rdev->bdev->bd_disk) < 0) 2075 return -EINVAL; 2076 } 2077 if (!reference || !bdev_get_integrity(reference->bdev)) 2078 return 0; 2079 /* 2080 * All component devices are integrity capable and have matching 2081 * profiles, register the common profile for the md device. 2082 */ 2083 blk_integrity_register(mddev->gendisk, 2084 bdev_get_integrity(reference->bdev)); 2085 2086 pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); 2087 if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) { 2088 pr_err("md: failed to create integrity pool for %s\n", 2089 mdname(mddev)); 2090 return -EINVAL; 2091 } 2092 return 0; 2093 } 2094 EXPORT_SYMBOL(md_integrity_register); 2095 2096 /* 2097 * Attempt to add an rdev, but only if it is consistent with the current 2098 * integrity profile 2099 */ 2100 int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) 2101 { 2102 struct blk_integrity *bi_rdev; 2103 struct blk_integrity *bi_mddev; 2104 char name[BDEVNAME_SIZE]; 2105 2106 if (!mddev->gendisk) 2107 return 0; 2108 2109 bi_rdev = bdev_get_integrity(rdev->bdev); 2110 bi_mddev = blk_get_integrity(mddev->gendisk); 2111 2112 if (!bi_mddev) /* nothing to do */ 2113 return 0; 2114 2115 if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { 2116 pr_err("%s: incompatible integrity profile for %s\n", 2117 mdname(mddev), bdevname(rdev->bdev, name)); 2118 return -ENXIO; 2119 } 2120 2121 return 0; 2122 } 2123 EXPORT_SYMBOL(md_integrity_add_rdev); 2124 2125 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) 2126 { 2127 char b[BDEVNAME_SIZE]; 2128 struct kobject *ko; 2129 int err; 2130 2131 /* prevent duplicates */ 2132 if (find_rdev(mddev, rdev->bdev->bd_dev)) 2133 return -EEXIST; 2134 2135 if ((bdev_read_only(rdev->bdev) || bdev_read_only(rdev->meta_bdev)) && 2136 mddev->pers) 2137 return -EROFS; 2138 2139 /* make sure rdev->sectors exceeds mddev->dev_sectors */ 2140 if (!test_bit(Journal, &rdev->flags) && 2141 rdev->sectors && 2142 (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { 2143 if (mddev->pers) { 2144 /* Cannot change size, so fail 2145 * If mddev->level <= 0, then we don't care 2146 * about aligning sizes (e.g. linear) 2147 */ 2148 if (mddev->level > 0) 2149 return -ENOSPC; 2150 } else 2151 mddev->dev_sectors = rdev->sectors; 2152 } 2153 2154 /* Verify rdev->desc_nr is unique. 
2155 * If it is -1, assign a free number, else 2156 * check number is not in use 2157 */ 2158 rcu_read_lock(); 2159 if (rdev->desc_nr < 0) { 2160 int choice = 0; 2161 if (mddev->pers) 2162 choice = mddev->raid_disks; 2163 while (md_find_rdev_nr_rcu(mddev, choice)) 2164 choice++; 2165 rdev->desc_nr = choice; 2166 } else { 2167 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { 2168 rcu_read_unlock(); 2169 return -EBUSY; 2170 } 2171 } 2172 rcu_read_unlock(); 2173 if (!test_bit(Journal, &rdev->flags) && 2174 mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { 2175 pr_warn("md: %s: array is limited to %d devices\n", 2176 mdname(mddev), mddev->max_disks); 2177 return -EBUSY; 2178 } 2179 bdevname(rdev->bdev,b); 2180 strreplace(b, '/', '!'); 2181 2182 rdev->mddev = mddev; 2183 pr_debug("md: bind<%s>\n", b); 2184 2185 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 2186 goto fail; 2187 2188 ko = &part_to_dev(rdev->bdev->bd_part)->kobj; 2189 if (sysfs_create_link(&rdev->kobj, ko, "block")) 2190 /* failure here is OK */; 2191 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); 2192 2193 list_add_rcu(&rdev->same_set, &mddev->disks); 2194 bd_link_disk_holder(rdev->bdev, mddev->gendisk); 2195 2196 /* May as well allow recovery to be retried once */ 2197 mddev->recovery_disabled++; 2198 2199 return 0; 2200 2201 fail: 2202 pr_warn("md: failed to register dev-%s for %s\n", 2203 b, mdname(mddev)); 2204 return err; 2205 } 2206 2207 static void md_delayed_delete(struct work_struct *ws) 2208 { 2209 struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); 2210 kobject_del(&rdev->kobj); 2211 kobject_put(&rdev->kobj); 2212 } 2213 2214 static void unbind_rdev_from_array(struct md_rdev *rdev) 2215 { 2216 char b[BDEVNAME_SIZE]; 2217 2218 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); 2219 list_del_rcu(&rdev->same_set); 2220 pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b)); 2221 rdev->mddev = NULL; 2222 sysfs_remove_link(&rdev->kobj, "block"); 2223 sysfs_put(rdev->sysfs_state); 2224 rdev->sysfs_state = NULL; 2225 rdev->badblocks.count = 0; 2226 /* We need to delay this, otherwise we can deadlock when 2227 * writing to 'remove' to "dev/state". We also need 2228 * to delay it due to rcu usage. 2229 */ 2230 synchronize_rcu(); 2231 INIT_WORK(&rdev->del_work, md_delayed_delete); 2232 kobject_get(&rdev->kobj); 2233 queue_work(md_misc_wq, &rdev->del_work); 2234 } 2235 2236 /* 2237 * prevent the device from being mounted, repartitioned or 2238 * otherwise reused by a RAID array (or any other kernel 2239 * subsystem), by bd_claiming the device. 2240 */ 2241 static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared) 2242 { 2243 int err = 0; 2244 struct block_device *bdev; 2245 char b[BDEVNAME_SIZE]; 2246 2247 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, 2248 shared ? 
(struct md_rdev *)lock_rdev : rdev); 2249 if (IS_ERR(bdev)) { 2250 pr_warn("md: could not open %s.\n", __bdevname(dev, b)); 2251 return PTR_ERR(bdev); 2252 } 2253 rdev->bdev = bdev; 2254 return err; 2255 } 2256 2257 static void unlock_rdev(struct md_rdev *rdev) 2258 { 2259 struct block_device *bdev = rdev->bdev; 2260 rdev->bdev = NULL; 2261 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); 2262 } 2263 2264 void md_autodetect_dev(dev_t dev); 2265 2266 static void export_rdev(struct md_rdev *rdev) 2267 { 2268 char b[BDEVNAME_SIZE]; 2269 2270 pr_debug("md: export_rdev(%s)\n", bdevname(rdev->bdev,b)); 2271 md_rdev_clear(rdev); 2272 #ifndef MODULE 2273 if (test_bit(AutoDetected, &rdev->flags)) 2274 md_autodetect_dev(rdev->bdev->bd_dev); 2275 #endif 2276 unlock_rdev(rdev); 2277 kobject_put(&rdev->kobj); 2278 } 2279 2280 void md_kick_rdev_from_array(struct md_rdev *rdev) 2281 { 2282 unbind_rdev_from_array(rdev); 2283 export_rdev(rdev); 2284 } 2285 EXPORT_SYMBOL_GPL(md_kick_rdev_from_array); 2286 2287 static void export_array(struct mddev *mddev) 2288 { 2289 struct md_rdev *rdev; 2290 2291 while (!list_empty(&mddev->disks)) { 2292 rdev = list_first_entry(&mddev->disks, struct md_rdev, 2293 same_set); 2294 md_kick_rdev_from_array(rdev); 2295 } 2296 mddev->raid_disks = 0; 2297 mddev->major_version = 0; 2298 } 2299 2300 static bool set_in_sync(struct mddev *mddev) 2301 { 2302 WARN_ON_ONCE(NR_CPUS != 1 && !spin_is_locked(&mddev->lock)); 2303 if (!mddev->in_sync) { 2304 mddev->sync_checkers++; 2305 spin_unlock(&mddev->lock); 2306 percpu_ref_switch_to_atomic_sync(&mddev->writes_pending); 2307 spin_lock(&mddev->lock); 2308 if (!mddev->in_sync && 2309 percpu_ref_is_zero(&mddev->writes_pending)) { 2310 mddev->in_sync = 1; 2311 /* 2312 * Ensure ->in_sync is visible before we clear 2313 * ->sync_checkers. 2314 */ 2315 smp_mb(); 2316 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2317 sysfs_notify_dirent_safe(mddev->sysfs_state); 2318 } 2319 if (--mddev->sync_checkers == 0) 2320 percpu_ref_switch_to_percpu(&mddev->writes_pending); 2321 } 2322 if (mddev->safemode == 1) 2323 mddev->safemode = 0; 2324 return mddev->in_sync; 2325 } 2326 2327 static void sync_sbs(struct mddev *mddev, int nospares) 2328 { 2329 /* Update each superblock (in-memory image), but 2330 * if we are allowed to, skip spares which already 2331 * have the right event counter, or have one earlier 2332 * (which would mean they aren't being marked as dirty 2333 * with the rest of the array) 2334 */ 2335 struct md_rdev *rdev; 2336 rdev_for_each(rdev, mddev) { 2337 if (rdev->sb_events == mddev->events || 2338 (nospares && 2339 rdev->raid_disk < 0 && 2340 rdev->sb_events+1 == mddev->events)) { 2341 /* Don't update this superblock */ 2342 rdev->sb_loaded = 2; 2343 } else { 2344 sync_super(mddev, rdev); 2345 rdev->sb_loaded = 1; 2346 } 2347 } 2348 } 2349 2350 static bool does_sb_need_changing(struct mddev *mddev) 2351 { 2352 struct md_rdev *rdev; 2353 struct mdp_superblock_1 *sb; 2354 int role; 2355 2356 /* Find a good rdev */ 2357 rdev_for_each(rdev, mddev) 2358 if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags)) 2359 break; 2360 2361 /* No good device found. */ 2362 if (!rdev) 2363 return false; 2364 2365 sb = page_address(rdev->sb_page); 2366 /* Check if a device has become faulty or a spare become active */ 2367 rdev_for_each(rdev, mddev) { 2368 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 2369 /* Device activated? 
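 * i.e. the superblock still records MD_DISK_ROLE_SPARE (0xffff) but the rdev now holds an active, non-faulty slot.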
*/ 2370 if (role == 0xffff && rdev->raid_disk >=0 && 2371 !test_bit(Faulty, &rdev->flags)) 2372 return true; 2373 /* Device turned faulty? */ 2374 if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd)) 2375 return true; 2376 } 2377 2378 /* Check if any mddev parameters have changed */ 2379 if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || 2380 (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || 2381 (mddev->layout != le32_to_cpu(sb->layout)) || 2382 (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || 2383 (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) 2384 return true; 2385 2386 return false; 2387 } 2388 2389 void md_update_sb(struct mddev *mddev, int force_change) 2390 { 2391 struct md_rdev *rdev; 2392 int sync_req; 2393 int nospares = 0; 2394 int any_badblocks_changed = 0; 2395 int ret = -1; 2396 2397 if (mddev->ro) { 2398 if (force_change) 2399 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2400 return; 2401 } 2402 2403 repeat: 2404 if (mddev_is_clustered(mddev)) { 2405 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2406 force_change = 1; 2407 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 2408 nospares = 1; 2409 ret = md_cluster_ops->metadata_update_start(mddev); 2410 /* Has someone else updated the sb? */ 2411 if (!does_sb_need_changing(mddev)) { 2412 if (ret == 0) 2413 md_cluster_ops->metadata_update_cancel(mddev); 2414 bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2415 BIT(MD_SB_CHANGE_DEVS) | 2416 BIT(MD_SB_CHANGE_CLEAN)); 2417 return; 2418 } 2419 } 2420 2421 /* First make sure individual recovery_offsets are correct */ 2422 rdev_for_each(rdev, mddev) { 2423 if (rdev->raid_disk >= 0 && 2424 mddev->delta_disks >= 0 && 2425 !test_bit(Journal, &rdev->flags) && 2426 !test_bit(In_sync, &rdev->flags) && 2427 mddev->curr_resync_completed > rdev->recovery_offset) 2428 rdev->recovery_offset = mddev->curr_resync_completed; 2429 2430 } 2431 if (!mddev->persistent) { 2432 clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2433 clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2434 if (!mddev->external) { 2435 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 2436 rdev_for_each(rdev, mddev) { 2437 if (rdev->badblocks.changed) { 2438 rdev->badblocks.changed = 0; 2439 ack_all_badblocks(&rdev->badblocks); 2440 md_error(mddev, rdev); 2441 } 2442 clear_bit(Blocked, &rdev->flags); 2443 clear_bit(BlockedBadBlocks, &rdev->flags); 2444 wake_up(&rdev->blocked_wait); 2445 } 2446 } 2447 wake_up(&mddev->sb_wait); 2448 return; 2449 } 2450 2451 spin_lock(&mddev->lock); 2452 2453 mddev->utime = ktime_get_real_seconds(); 2454 2455 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2456 force_change = 1; 2457 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 2458 /* just a clean <-> dirty transition, possibly leave spares alone, 2459 * though if events isn't the right even/odd, we will have to do 2460 * spares after all 2461 */ 2462 nospares = 1; 2463 if (force_change) 2464 nospares = 0; 2465 if (mddev->degraded) 2466 /* If the array is degraded, then skipping spares is both 2467 * dangerous and fairly pointless. 2468 * Dangerous because a device that was removed from the array 2469 * might have an event_count that still looks up-to-date, 2470 * so it can be re-added without a resync. 2471 * Pointless because if there are any spares to skip, 2472 * then a recovery will happen and soon that array won't 2473 * be degraded any more and the spare can go back to sleep then.
2474 */ 2475 nospares = 0; 2476 2477 sync_req = mddev->in_sync; 2478 2479 /* If this is just a dirty<->clean transition, and the array is clean 2480 * and 'events' is odd, we can roll back to the previous clean state */ 2481 if (nospares 2482 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 2483 && mddev->can_decrease_events 2484 && mddev->events != 1) { 2485 mddev->events--; 2486 mddev->can_decrease_events = 0; 2487 } else { 2488 /* otherwise we have to go forward and ... */ 2489 mddev->events ++; 2490 mddev->can_decrease_events = nospares; 2491 } 2492 2493 /* 2494 * This 64-bit counter should never wrap. 2495 * Either we are in around ~1 trillion A.C., assuming 2496 * 1 reboot per second, or we have a bug... 2497 */ 2498 WARN_ON(mddev->events == 0); 2499 2500 rdev_for_each(rdev, mddev) { 2501 if (rdev->badblocks.changed) 2502 any_badblocks_changed++; 2503 if (test_bit(Faulty, &rdev->flags)) 2504 set_bit(FaultRecorded, &rdev->flags); 2505 } 2506 2507 sync_sbs(mddev, nospares); 2508 spin_unlock(&mddev->lock); 2509 2510 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", 2511 mdname(mddev), mddev->in_sync); 2512 2513 if (mddev->queue) 2514 blk_add_trace_msg(mddev->queue, "md md_update_sb"); 2515 rewrite: 2516 bitmap_update_sb(mddev->bitmap); 2517 rdev_for_each(rdev, mddev) { 2518 char b[BDEVNAME_SIZE]; 2519 2520 if (rdev->sb_loaded != 1) 2521 continue; /* no noise on spare devices */ 2522 2523 if (!test_bit(Faulty, &rdev->flags)) { 2524 md_super_write(mddev,rdev, 2525 rdev->sb_start, rdev->sb_size, 2526 rdev->sb_page); 2527 pr_debug("md: (write) %s's sb offset: %llu\n", 2528 bdevname(rdev->bdev, b), 2529 (unsigned long long)rdev->sb_start); 2530 rdev->sb_events = mddev->events; 2531 if (rdev->badblocks.size) { 2532 md_super_write(mddev, rdev, 2533 rdev->badblocks.sector, 2534 rdev->badblocks.size << 9, 2535 rdev->bb_page); 2536 rdev->badblocks.size = 0; 2537 } 2538 2539 } else 2540 pr_debug("md: %s (skipping faulty)\n", 2541 bdevname(rdev->bdev, b)); 2542 2543 if (mddev->level == LEVEL_MULTIPATH) 2544 /* only need to write one superblock... */ 2545 break; 2546 } 2547 if (md_super_wait(mddev) < 0) 2548 goto rewrite; 2549 /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ 2550 2551 if (mddev_is_clustered(mddev) && ret == 0) 2552 md_cluster_ops->metadata_update_finish(mddev); 2553 2554 if (mddev->in_sync != sync_req || 2555 !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2556 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN))) 2557 /* have to write it out again */ 2558 goto repeat; 2559 wake_up(&mddev->sb_wait); 2560 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2561 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 2562 2563 rdev_for_each(rdev, mddev) { 2564 if (test_and_clear_bit(FaultRecorded, &rdev->flags)) 2565 clear_bit(Blocked, &rdev->flags); 2566 2567 if (any_badblocks_changed) 2568 ack_all_badblocks(&rdev->badblocks); 2569 clear_bit(BlockedBadBlocks, &rdev->flags); 2570 wake_up(&rdev->blocked_wait); 2571 } 2572 } 2573 EXPORT_SYMBOL(md_update_sb); 2574 2575 static int add_bound_rdev(struct md_rdev *rdev) 2576 { 2577 struct mddev *mddev = rdev->mddev; 2578 int err = 0; 2579 bool add_journal = test_bit(Journal, &rdev->flags); 2580 2581 if (!mddev->pers->hot_remove_disk || add_journal) { 2582 /* If there is hot_add_disk but no hot_remove_disk 2583 * then added disks for geometry changes, 2584 * and should be added immediately. 2585 */ 2586 super_types[mddev->major_version]. 
2587 validate_super(mddev, rdev); 2588 if (add_journal) 2589 mddev_suspend(mddev); 2590 err = mddev->pers->hot_add_disk(mddev, rdev); 2591 if (add_journal) 2592 mddev_resume(mddev); 2593 if (err) { 2594 md_kick_rdev_from_array(rdev); 2595 return err; 2596 } 2597 } 2598 sysfs_notify_dirent_safe(rdev->sysfs_state); 2599 2600 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2601 if (mddev->degraded) 2602 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 2603 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2604 md_new_event(mddev); 2605 md_wakeup_thread(mddev->thread); 2606 return 0; 2607 } 2608 2609 /* words written to sysfs files may, or may not, be \n terminated. 2610 * We want to accept with case. For this we use cmd_match. 2611 */ 2612 static int cmd_match(const char *cmd, const char *str) 2613 { 2614 /* See if cmd, written into a sysfs file, matches 2615 * str. They must either be the same, or cmd can 2616 * have a trailing newline 2617 */ 2618 while (*cmd && *str && *cmd == *str) { 2619 cmd++; 2620 str++; 2621 } 2622 if (*cmd == '\n') 2623 cmd++; 2624 if (*str || *cmd) 2625 return 0; 2626 return 1; 2627 } 2628 2629 struct rdev_sysfs_entry { 2630 struct attribute attr; 2631 ssize_t (*show)(struct md_rdev *, char *); 2632 ssize_t (*store)(struct md_rdev *, const char *, size_t); 2633 }; 2634 2635 static ssize_t 2636 state_show(struct md_rdev *rdev, char *page) 2637 { 2638 char *sep = ","; 2639 size_t len = 0; 2640 unsigned long flags = ACCESS_ONCE(rdev->flags); 2641 2642 if (test_bit(Faulty, &flags) || 2643 (!test_bit(ExternalBbl, &flags) && 2644 rdev->badblocks.unacked_exist)) 2645 len += sprintf(page+len, "faulty%s", sep); 2646 if (test_bit(In_sync, &flags)) 2647 len += sprintf(page+len, "in_sync%s", sep); 2648 if (test_bit(Journal, &flags)) 2649 len += sprintf(page+len, "journal%s", sep); 2650 if (test_bit(WriteMostly, &flags)) 2651 len += sprintf(page+len, "write_mostly%s", sep); 2652 if (test_bit(Blocked, &flags) || 2653 (rdev->badblocks.unacked_exist 2654 && !test_bit(Faulty, &flags))) 2655 len += sprintf(page+len, "blocked%s", sep); 2656 if (!test_bit(Faulty, &flags) && 2657 !test_bit(Journal, &flags) && 2658 !test_bit(In_sync, &flags)) 2659 len += sprintf(page+len, "spare%s", sep); 2660 if (test_bit(WriteErrorSeen, &flags)) 2661 len += sprintf(page+len, "write_error%s", sep); 2662 if (test_bit(WantReplacement, &flags)) 2663 len += sprintf(page+len, "want_replacement%s", sep); 2664 if (test_bit(Replacement, &flags)) 2665 len += sprintf(page+len, "replacement%s", sep); 2666 if (test_bit(ExternalBbl, &flags)) 2667 len += sprintf(page+len, "external_bbl%s", sep); 2668 if (test_bit(FailFast, &flags)) 2669 len += sprintf(page+len, "failfast%s", sep); 2670 2671 if (len) 2672 len -= strlen(sep); 2673 2674 return len+sprintf(page+len, "\n"); 2675 } 2676 2677 static ssize_t 2678 state_store(struct md_rdev *rdev, const char *buf, size_t len) 2679 { 2680 /* can write 2681 * faulty - simulates an error 2682 * remove - disconnects the device 2683 * writemostly - sets write_mostly 2684 * -writemostly - clears write_mostly 2685 * blocked - sets the Blocked flags 2686 * -blocked - clears the Blocked and possibly simulates an error 2687 * insync - sets Insync providing device isn't active 2688 * -insync - clear Insync for a device with a slot assigned, 2689 * so that it gets rebuilt based on bitmap 2690 * write_error - sets WriteErrorSeen 2691 * -write_error - clears WriteErrorSeen 2692 * {,-}failfast - set/clear FailFast 2693 */ 2694 int err = -EINVAL; 2695 if (cmd_match(buf, "faulty") && 
rdev->mddev->pers) { 2696 md_error(rdev->mddev, rdev); 2697 if (test_bit(Faulty, &rdev->flags)) 2698 err = 0; 2699 else 2700 err = -EBUSY; 2701 } else if (cmd_match(buf, "remove")) { 2702 if (rdev->mddev->pers) { 2703 clear_bit(Blocked, &rdev->flags); 2704 remove_and_add_spares(rdev->mddev, rdev); 2705 } 2706 if (rdev->raid_disk >= 0) 2707 err = -EBUSY; 2708 else { 2709 struct mddev *mddev = rdev->mddev; 2710 err = 0; 2711 if (mddev_is_clustered(mddev)) 2712 err = md_cluster_ops->remove_disk(mddev, rdev); 2713 2714 if (err == 0) { 2715 md_kick_rdev_from_array(rdev); 2716 if (mddev->pers) { 2717 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2718 md_wakeup_thread(mddev->thread); 2719 } 2720 md_new_event(mddev); 2721 } 2722 } 2723 } else if (cmd_match(buf, "writemostly")) { 2724 set_bit(WriteMostly, &rdev->flags); 2725 err = 0; 2726 } else if (cmd_match(buf, "-writemostly")) { 2727 clear_bit(WriteMostly, &rdev->flags); 2728 err = 0; 2729 } else if (cmd_match(buf, "blocked")) { 2730 set_bit(Blocked, &rdev->flags); 2731 err = 0; 2732 } else if (cmd_match(buf, "-blocked")) { 2733 if (!test_bit(Faulty, &rdev->flags) && 2734 !test_bit(ExternalBbl, &rdev->flags) && 2735 rdev->badblocks.unacked_exist) { 2736 /* metadata handler doesn't understand badblocks, 2737 * so we need to fail the device 2738 */ 2739 md_error(rdev->mddev, rdev); 2740 } 2741 clear_bit(Blocked, &rdev->flags); 2742 clear_bit(BlockedBadBlocks, &rdev->flags); 2743 wake_up(&rdev->blocked_wait); 2744 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2745 md_wakeup_thread(rdev->mddev->thread); 2746 2747 err = 0; 2748 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 2749 set_bit(In_sync, &rdev->flags); 2750 err = 0; 2751 } else if (cmd_match(buf, "failfast")) { 2752 set_bit(FailFast, &rdev->flags); 2753 err = 0; 2754 } else if (cmd_match(buf, "-failfast")) { 2755 clear_bit(FailFast, &rdev->flags); 2756 err = 0; 2757 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && 2758 !test_bit(Journal, &rdev->flags)) { 2759 if (rdev->mddev->pers == NULL) { 2760 clear_bit(In_sync, &rdev->flags); 2761 rdev->saved_raid_disk = rdev->raid_disk; 2762 rdev->raid_disk = -1; 2763 err = 0; 2764 } 2765 } else if (cmd_match(buf, "write_error")) { 2766 set_bit(WriteErrorSeen, &rdev->flags); 2767 err = 0; 2768 } else if (cmd_match(buf, "-write_error")) { 2769 clear_bit(WriteErrorSeen, &rdev->flags); 2770 err = 0; 2771 } else if (cmd_match(buf, "want_replacement")) { 2772 /* Any non-spare device that is not a replacement can 2773 * become want_replacement at any time, but we then need to 2774 * check if recovery is needed. 2775 */ 2776 if (rdev->raid_disk >= 0 && 2777 !test_bit(Journal, &rdev->flags) && 2778 !test_bit(Replacement, &rdev->flags)) 2779 set_bit(WantReplacement, &rdev->flags); 2780 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2781 md_wakeup_thread(rdev->mddev->thread); 2782 err = 0; 2783 } else if (cmd_match(buf, "-want_replacement")) { 2784 /* Clearing 'want_replacement' is always allowed. 2785 * Once replacements starts it is too late though. 2786 */ 2787 err = 0; 2788 clear_bit(WantReplacement, &rdev->flags); 2789 } else if (cmd_match(buf, "replacement")) { 2790 /* Can only set a device as a replacement when array has not 2791 * yet been started. Once running, replacement is automatic 2792 * from spares, or by assigning 'slot'. 
2793 */ 2794 if (rdev->mddev->pers) 2795 err = -EBUSY; 2796 else { 2797 set_bit(Replacement, &rdev->flags); 2798 err = 0; 2799 } 2800 } else if (cmd_match(buf, "-replacement")) { 2801 /* Similarly, can only clear Replacement before start */ 2802 if (rdev->mddev->pers) 2803 err = -EBUSY; 2804 else { 2805 clear_bit(Replacement, &rdev->flags); 2806 err = 0; 2807 } 2808 } else if (cmd_match(buf, "re-add")) { 2809 if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1)) { 2810 /* clear_bit is performed _after_ all the devices 2811 * have their local Faulty bit cleared. If any writes 2812 * happen in the meantime in the local node, they 2813 * will land in the local bitmap, which will be synced 2814 * by this node eventually 2815 */ 2816 if (!mddev_is_clustered(rdev->mddev) || 2817 (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { 2818 clear_bit(Faulty, &rdev->flags); 2819 err = add_bound_rdev(rdev); 2820 } 2821 } else 2822 err = -EBUSY; 2823 } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) { 2824 set_bit(ExternalBbl, &rdev->flags); 2825 rdev->badblocks.shift = 0; 2826 err = 0; 2827 } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) { 2828 clear_bit(ExternalBbl, &rdev->flags); 2829 err = 0; 2830 } 2831 if (!err) 2832 sysfs_notify_dirent_safe(rdev->sysfs_state); 2833 return err ? err : len; 2834 } 2835 static struct rdev_sysfs_entry rdev_state = 2836 __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store); 2837 2838 static ssize_t 2839 errors_show(struct md_rdev *rdev, char *page) 2840 { 2841 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 2842 } 2843 2844 static ssize_t 2845 errors_store(struct md_rdev *rdev, const char *buf, size_t len) 2846 { 2847 unsigned int n; 2848 int rv; 2849 2850 rv = kstrtouint(buf, 10, &n); 2851 if (rv < 0) 2852 return rv; 2853 atomic_set(&rdev->corrected_errors, n); 2854 return len; 2855 } 2856 static struct rdev_sysfs_entry rdev_errors = 2857 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 2858 2859 static ssize_t 2860 slot_show(struct md_rdev *rdev, char *page) 2861 { 2862 if (test_bit(Journal, &rdev->flags)) 2863 return sprintf(page, "journal\n"); 2864 else if (rdev->raid_disk < 0) 2865 return sprintf(page, "none\n"); 2866 else 2867 return sprintf(page, "%d\n", rdev->raid_disk); 2868 } 2869 2870 static ssize_t 2871 slot_store(struct md_rdev *rdev, const char *buf, size_t len) 2872 { 2873 int slot; 2874 int err; 2875 2876 if (test_bit(Journal, &rdev->flags)) 2877 return -EBUSY; 2878 if (strncmp(buf, "none", 4)==0) 2879 slot = -1; 2880 else { 2881 err = kstrtouint(buf, 10, (unsigned int *)&slot); 2882 if (err < 0) 2883 return err; 2884 } 2885 if (rdev->mddev->pers && slot == -1) { 2886 /* Setting 'slot' on an active array requires also 2887 * updating the 'rd%d' link, and communicating 2888 * with the personality with ->hot_*_disk. 2889 * For now we only support removing 2890 * failed/spare devices. This normally happens automatically, 2891 * but not when the metadata is externally managed. 2892 */ 2893 if (rdev->raid_disk == -1) 2894 return -EEXIST; 2895 /* personality does all needed checks */ 2896 if (rdev->mddev->pers->hot_remove_disk == NULL) 2897 return -EINVAL; 2898 clear_bit(Blocked, &rdev->flags); 2899 remove_and_add_spares(rdev->mddev, rdev); 2900 if (rdev->raid_disk >= 0) 2901 return -EBUSY; 2902 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2903 md_wakeup_thread(rdev->mddev->thread); 2904 } else if (rdev->mddev->pers) { 2905 /* Activating a spare .. 
or possibly reactivating 2906 * if we ever get bitmaps working here. 2907 */ 2908 int err; 2909 2910 if (rdev->raid_disk != -1) 2911 return -EBUSY; 2912 2913 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) 2914 return -EBUSY; 2915 2916 if (rdev->mddev->pers->hot_add_disk == NULL) 2917 return -EINVAL; 2918 2919 if (slot >= rdev->mddev->raid_disks && 2920 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 2921 return -ENOSPC; 2922 2923 rdev->raid_disk = slot; 2924 if (test_bit(In_sync, &rdev->flags)) 2925 rdev->saved_raid_disk = slot; 2926 else 2927 rdev->saved_raid_disk = -1; 2928 clear_bit(In_sync, &rdev->flags); 2929 clear_bit(Bitmap_sync, &rdev->flags); 2930 err = rdev->mddev->pers-> 2931 hot_add_disk(rdev->mddev, rdev); 2932 if (err) { 2933 rdev->raid_disk = -1; 2934 return err; 2935 } else 2936 sysfs_notify_dirent_safe(rdev->sysfs_state); 2937 if (sysfs_link_rdev(rdev->mddev, rdev)) 2938 /* failure here is OK */; 2939 /* don't wakeup anyone, leave that to userspace. */ 2940 } else { 2941 if (slot >= rdev->mddev->raid_disks && 2942 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 2943 return -ENOSPC; 2944 rdev->raid_disk = slot; 2945 /* assume it is working */ 2946 clear_bit(Faulty, &rdev->flags); 2947 clear_bit(WriteMostly, &rdev->flags); 2948 set_bit(In_sync, &rdev->flags); 2949 sysfs_notify_dirent_safe(rdev->sysfs_state); 2950 } 2951 return len; 2952 } 2953 2954 static struct rdev_sysfs_entry rdev_slot = 2955 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 2956 2957 static ssize_t 2958 offset_show(struct md_rdev *rdev, char *page) 2959 { 2960 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 2961 } 2962 2963 static ssize_t 2964 offset_store(struct md_rdev *rdev, const char *buf, size_t len) 2965 { 2966 unsigned long long offset; 2967 if (kstrtoull(buf, 10, &offset) < 0) 2968 return -EINVAL; 2969 if (rdev->mddev->pers && rdev->raid_disk >= 0) 2970 return -EBUSY; 2971 if (rdev->sectors && rdev->mddev->external) 2972 /* Must set offset before size, so overlap checks 2973 * can be sane */ 2974 return -EBUSY; 2975 rdev->data_offset = offset; 2976 rdev->new_data_offset = offset; 2977 return len; 2978 } 2979 2980 static struct rdev_sysfs_entry rdev_offset = 2981 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 2982 2983 static ssize_t new_offset_show(struct md_rdev *rdev, char *page) 2984 { 2985 return sprintf(page, "%llu\n", 2986 (unsigned long long)rdev->new_data_offset); 2987 } 2988 2989 static ssize_t new_offset_store(struct md_rdev *rdev, 2990 const char *buf, size_t len) 2991 { 2992 unsigned long long new_offset; 2993 struct mddev *mddev = rdev->mddev; 2994 2995 if (kstrtoull(buf, 10, &new_offset) < 0) 2996 return -EINVAL; 2997 2998 if (mddev->sync_thread || 2999 test_bit(MD_RECOVERY_RUNNING,&mddev->recovery)) 3000 return -EBUSY; 3001 if (new_offset == rdev->data_offset) 3002 /* reset is always permitted */ 3003 ; 3004 else if (new_offset > rdev->data_offset) { 3005 /* must not push array size beyond rdev_sectors */ 3006 if (new_offset - rdev->data_offset 3007 + mddev->dev_sectors > rdev->sectors) 3008 return -E2BIG; 3009 } 3010 /* Metadata worries about other space details. */ 3011 3012 /* decreasing the offset is inconsistent with a backwards 3013 * reshape. 3014 */ 3015 if (new_offset < rdev->data_offset && 3016 mddev->reshape_backwards) 3017 return -EINVAL; 3018 /* Increasing offset is inconsistent with forwards 3019 * reshape. reshape_direction should be set to 3020 * 'backwards' first. 
3021 */ 3022 if (new_offset > rdev->data_offset && 3023 !mddev->reshape_backwards) 3024 return -EINVAL; 3025 3026 if (mddev->pers && mddev->persistent && 3027 !super_types[mddev->major_version] 3028 .allow_new_offset(rdev, new_offset)) 3029 return -E2BIG; 3030 rdev->new_data_offset = new_offset; 3031 if (new_offset > rdev->data_offset) 3032 mddev->reshape_backwards = 1; 3033 else if (new_offset < rdev->data_offset) 3034 mddev->reshape_backwards = 0; 3035 3036 return len; 3037 } 3038 static struct rdev_sysfs_entry rdev_new_offset = 3039 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); 3040 3041 static ssize_t 3042 rdev_size_show(struct md_rdev *rdev, char *page) 3043 { 3044 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); 3045 } 3046 3047 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) 3048 { 3049 /* check if two start/length pairs overlap */ 3050 if (s1+l1 <= s2) 3051 return 0; 3052 if (s2+l2 <= s1) 3053 return 0; 3054 return 1; 3055 } 3056 3057 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) 3058 { 3059 unsigned long long blocks; 3060 sector_t new; 3061 3062 if (kstrtoull(buf, 10, &blocks) < 0) 3063 return -EINVAL; 3064 3065 if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) 3066 return -EINVAL; /* sector conversion overflow */ 3067 3068 new = blocks * 2; 3069 if (new != blocks * 2) 3070 return -EINVAL; /* unsigned long long to sector_t overflow */ 3071 3072 *sectors = new; 3073 return 0; 3074 } 3075 3076 static ssize_t 3077 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) 3078 { 3079 struct mddev *my_mddev = rdev->mddev; 3080 sector_t oldsectors = rdev->sectors; 3081 sector_t sectors; 3082 3083 if (test_bit(Journal, &rdev->flags)) 3084 return -EBUSY; 3085 if (strict_blocks_to_sectors(buf, &sectors) < 0) 3086 return -EINVAL; 3087 if (rdev->data_offset != rdev->new_data_offset) 3088 return -EINVAL; /* too confusing */ 3089 if (my_mddev->pers && rdev->raid_disk >= 0) { 3090 if (my_mddev->persistent) { 3091 sectors = super_types[my_mddev->major_version]. 3092 rdev_size_change(rdev, sectors); 3093 if (!sectors) 3094 return -EBUSY; 3095 } else if (!sectors) 3096 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - 3097 rdev->data_offset; 3098 if (!my_mddev->pers->resize) 3099 /* Cannot change size for RAID0 or Linear etc */ 3100 return -EINVAL; 3101 } 3102 if (sectors < my_mddev->dev_sectors) 3103 return -EINVAL; /* component must fit device */ 3104 3105 rdev->sectors = sectors; 3106 if (sectors > oldsectors && my_mddev->external) { 3107 /* Need to check that all other rdevs with the same 3108 * ->bdev do not overlap. 'rcu' is sufficient to walk 3109 * the rdev lists safely. 3110 * This check does not provide a hard guarantee, it 3111 * just helps avoid dangerous mistakes. 3112 */ 3113 struct mddev *mddev; 3114 int overlap = 0; 3115 struct list_head *tmp; 3116 3117 rcu_read_lock(); 3118 for_each_mddev(mddev, tmp) { 3119 struct md_rdev *rdev2; 3120 3121 rdev_for_each(rdev2, mddev) 3122 if (rdev->bdev == rdev2->bdev && 3123 rdev != rdev2 && 3124 overlaps(rdev->data_offset, rdev->sectors, 3125 rdev2->data_offset, 3126 rdev2->sectors)) { 3127 overlap = 1; 3128 break; 3129 } 3130 if (overlap) { 3131 mddev_put(mddev); 3132 break; 3133 } 3134 } 3135 rcu_read_unlock(); 3136 if (overlap) { 3137 /* Someone else could have slipped in a size 3138 * change here, but doing so is just silly.
3139 * We put oldsectors back because we *know* it is 3140 * safe, and trust userspace not to race with 3141 * itself 3142 */ 3143 rdev->sectors = oldsectors; 3144 return -EBUSY; 3145 } 3146 } 3147 return len; 3148 } 3149 3150 static struct rdev_sysfs_entry rdev_size = 3151 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 3152 3153 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) 3154 { 3155 unsigned long long recovery_start = rdev->recovery_offset; 3156 3157 if (test_bit(In_sync, &rdev->flags) || 3158 recovery_start == MaxSector) 3159 return sprintf(page, "none\n"); 3160 3161 return sprintf(page, "%llu\n", recovery_start); 3162 } 3163 3164 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) 3165 { 3166 unsigned long long recovery_start; 3167 3168 if (cmd_match(buf, "none")) 3169 recovery_start = MaxSector; 3170 else if (kstrtoull(buf, 10, &recovery_start)) 3171 return -EINVAL; 3172 3173 if (rdev->mddev->pers && 3174 rdev->raid_disk >= 0) 3175 return -EBUSY; 3176 3177 rdev->recovery_offset = recovery_start; 3178 if (recovery_start == MaxSector) 3179 set_bit(In_sync, &rdev->flags); 3180 else 3181 clear_bit(In_sync, &rdev->flags); 3182 return len; 3183 } 3184 3185 static struct rdev_sysfs_entry rdev_recovery_start = 3186 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); 3187 3188 /* sysfs access to bad-blocks list. 3189 * We present two files. 3190 * 'bad-blocks' lists sector numbers and lengths of ranges that 3191 * are recorded as bad. The list is truncated to fit within 3192 * the one-page limit of sysfs. 3193 * Writing "sector length" to this file adds an acknowledged 3194 * bad block list. 3195 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet 3196 * been acknowledged. Writing to this file adds bad blocks 3197 * without acknowledging them. This is largely for testing. 
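 * For example, writing "4096 8" to 'bad_blocks' records an acknowledged bad range of 8 sectors starting at sector 4096.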
3198 */ 3199 static ssize_t bb_show(struct md_rdev *rdev, char *page) 3200 { 3201 return badblocks_show(&rdev->badblocks, page, 0); 3202 } 3203 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) 3204 { 3205 int rv = badblocks_store(&rdev->badblocks, page, len, 0); 3206 /* Maybe that ack was all we needed */ 3207 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) 3208 wake_up(&rdev->blocked_wait); 3209 return rv; 3210 } 3211 static struct rdev_sysfs_entry rdev_bad_blocks = 3212 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); 3213 3214 static ssize_t ubb_show(struct md_rdev *rdev, char *page) 3215 { 3216 return badblocks_show(&rdev->badblocks, page, 1); 3217 } 3218 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) 3219 { 3220 return badblocks_store(&rdev->badblocks, page, len, 1); 3221 } 3222 static struct rdev_sysfs_entry rdev_unack_bad_blocks = 3223 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); 3224 3225 static ssize_t 3226 ppl_sector_show(struct md_rdev *rdev, char *page) 3227 { 3228 return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector); 3229 } 3230 3231 static ssize_t 3232 ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len) 3233 { 3234 unsigned long long sector; 3235 3236 if (kstrtoull(buf, 10, &sector) < 0) 3237 return -EINVAL; 3238 if (sector != (sector_t)sector) 3239 return -EINVAL; 3240 3241 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && 3242 rdev->raid_disk >= 0) 3243 return -EBUSY; 3244 3245 if (rdev->mddev->persistent) { 3246 if (rdev->mddev->major_version == 0) 3247 return -EINVAL; 3248 if ((sector > rdev->sb_start && 3249 sector - rdev->sb_start > S16_MAX) || 3250 (sector < rdev->sb_start && 3251 rdev->sb_start - sector > -S16_MIN)) 3252 return -EINVAL; 3253 rdev->ppl.offset = sector - rdev->sb_start; 3254 } else if (!rdev->mddev->external) { 3255 return -EBUSY; 3256 } 3257 rdev->ppl.sector = sector; 3258 return len; 3259 } 3260 3261 static struct rdev_sysfs_entry rdev_ppl_sector = 3262 __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store); 3263 3264 static ssize_t 3265 ppl_size_show(struct md_rdev *rdev, char *page) 3266 { 3267 return sprintf(page, "%u\n", rdev->ppl.size); 3268 } 3269 3270 static ssize_t 3271 ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len) 3272 { 3273 unsigned int size; 3274 3275 if (kstrtouint(buf, 10, &size) < 0) 3276 return -EINVAL; 3277 3278 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && 3279 rdev->raid_disk >= 0) 3280 return -EBUSY; 3281 3282 if (rdev->mddev->persistent) { 3283 if (rdev->mddev->major_version == 0) 3284 return -EINVAL; 3285 if (size > U16_MAX) 3286 return -EINVAL; 3287 } else if (!rdev->mddev->external) { 3288 return -EBUSY; 3289 } 3290 rdev->ppl.size = size; 3291 return len; 3292 } 3293 3294 static struct rdev_sysfs_entry rdev_ppl_size = 3295 __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store); 3296 3297 static struct attribute *rdev_default_attrs[] = { 3298 &rdev_state.attr, 3299 &rdev_errors.attr, 3300 &rdev_slot.attr, 3301 &rdev_offset.attr, 3302 &rdev_new_offset.attr, 3303 &rdev_size.attr, 3304 &rdev_recovery_start.attr, 3305 &rdev_bad_blocks.attr, 3306 &rdev_unack_bad_blocks.attr, 3307 &rdev_ppl_sector.attr, 3308 &rdev_ppl_size.attr, 3309 NULL, 3310 }; 3311 static ssize_t 3312 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3313 { 3314 struct rdev_sysfs_entry *entry = container_of(attr, struct
rdev_sysfs_entry, attr); 3315 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3316 3317 if (!entry->show) 3318 return -EIO; 3319 if (!rdev->mddev) 3320 return -EBUSY; 3321 return entry->show(rdev, page); 3322 } 3323 3324 static ssize_t 3325 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 3326 const char *page, size_t length) 3327 { 3328 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3329 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3330 ssize_t rv; 3331 struct mddev *mddev = rdev->mddev; 3332 3333 if (!entry->store) 3334 return -EIO; 3335 if (!capable(CAP_SYS_ADMIN)) 3336 return -EACCES; 3337 rv = mddev ? mddev_lock(mddev): -EBUSY; 3338 if (!rv) { 3339 if (rdev->mddev == NULL) 3340 rv = -EBUSY; 3341 else 3342 rv = entry->store(rdev, page, length); 3343 mddev_unlock(mddev); 3344 } 3345 return rv; 3346 } 3347 3348 static void rdev_free(struct kobject *ko) 3349 { 3350 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); 3351 kfree(rdev); 3352 } 3353 static const struct sysfs_ops rdev_sysfs_ops = { 3354 .show = rdev_attr_show, 3355 .store = rdev_attr_store, 3356 }; 3357 static struct kobj_type rdev_ktype = { 3358 .release = rdev_free, 3359 .sysfs_ops = &rdev_sysfs_ops, 3360 .default_attrs = rdev_default_attrs, 3361 }; 3362 3363 int md_rdev_init(struct md_rdev *rdev) 3364 { 3365 rdev->desc_nr = -1; 3366 rdev->saved_raid_disk = -1; 3367 rdev->raid_disk = -1; 3368 rdev->flags = 0; 3369 rdev->data_offset = 0; 3370 rdev->new_data_offset = 0; 3371 rdev->sb_events = 0; 3372 rdev->last_read_error = 0; 3373 rdev->sb_loaded = 0; 3374 rdev->bb_page = NULL; 3375 atomic_set(&rdev->nr_pending, 0); 3376 atomic_set(&rdev->read_errors, 0); 3377 atomic_set(&rdev->corrected_errors, 0); 3378 3379 INIT_LIST_HEAD(&rdev->same_set); 3380 init_waitqueue_head(&rdev->blocked_wait); 3381 3382 /* Add space to store bad block list. 3383 * This reserves the space even on arrays where it cannot 3384 * be used - I wonder if that matters 3385 */ 3386 return badblocks_init(&rdev->badblocks, 0); 3387 } 3388 EXPORT_SYMBOL_GPL(md_rdev_init); 3389 /* 3390 * Import a device. If 'super_format' >= 0, then sanity check the superblock 3391 * 3392 * mark the device faulty if: 3393 * 3394 * - the device is nonexistent (zero size) 3395 * - the device has no valid superblock 3396 * 3397 * a faulty rdev _never_ has rdev->sb set. 3398 */ 3399 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) 3400 { 3401 char b[BDEVNAME_SIZE]; 3402 int err; 3403 struct md_rdev *rdev; 3404 sector_t size; 3405 3406 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 3407 if (!rdev) 3408 return ERR_PTR(-ENOMEM); 3409 3410 err = md_rdev_init(rdev); 3411 if (err) 3412 goto abort_free; 3413 err = alloc_disk_sb(rdev); 3414 if (err) 3415 goto abort_free; 3416 3417 err = lock_rdev(rdev, newdev, super_format == -2); 3418 if (err) 3419 goto abort_free; 3420 3421 kobject_init(&rdev->kobj, &rdev_ktype); 3422 3423 size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS; 3424 if (!size) { 3425 pr_warn("md: %s has zero or unknown size, marking faulty!\n", 3426 bdevname(rdev->bdev,b)); 3427 err = -EINVAL; 3428 goto abort_free; 3429 } 3430 3431 if (super_format >= 0) { 3432 err = super_types[super_format]. 
load_super(rdev, NULL, super_minor); 3434 if (err == -EINVAL) { 3435 pr_warn("md: %s does not have a valid v%d.%d superblock, not importing!\n", 3436 bdevname(rdev->bdev,b), 3437 super_format, super_minor); 3438 goto abort_free; 3439 } 3440 if (err < 0) { 3441 pr_warn("md: could not read %s's sb, not importing!\n", 3442 bdevname(rdev->bdev,b)); 3443 goto abort_free; 3444 } 3445 } 3446 3447 return rdev; 3448 3449 abort_free: 3450 if (rdev->bdev) 3451 unlock_rdev(rdev); 3452 md_rdev_clear(rdev); 3453 kfree(rdev); 3454 return ERR_PTR(err); 3455 } 3456 3457 /* 3458 * Check a full RAID array for plausibility 3459 */ 3460 3461 static void analyze_sbs(struct mddev *mddev) 3462 { 3463 int i; 3464 struct md_rdev *rdev, *freshest, *tmp; 3465 char b[BDEVNAME_SIZE]; 3466 3467 freshest = NULL; 3468 rdev_for_each_safe(rdev, tmp, mddev) 3469 switch (super_types[mddev->major_version]. 3470 load_super(rdev, freshest, mddev->minor_version)) { 3471 case 1: 3472 freshest = rdev; 3473 break; 3474 case 0: 3475 break; 3476 default: 3477 pr_warn("md: fatal superblock inconsistency in %s -- removing from array\n", 3478 bdevname(rdev->bdev,b)); 3479 md_kick_rdev_from_array(rdev); 3480 } 3481 3482 super_types[mddev->major_version]. 3483 validate_super(mddev, freshest); 3484 3485 i = 0; 3486 rdev_for_each_safe(rdev, tmp, mddev) { 3487 if (mddev->max_disks && 3488 (rdev->desc_nr >= mddev->max_disks || 3489 i > mddev->max_disks)) { 3490 pr_warn("md: %s: %s: only %d devices permitted\n", 3491 mdname(mddev), bdevname(rdev->bdev, b), 3492 mddev->max_disks); 3493 md_kick_rdev_from_array(rdev); 3494 continue; 3495 } 3496 if (rdev != freshest) { 3497 if (super_types[mddev->major_version]. 3498 validate_super(mddev, rdev)) { 3499 pr_warn("md: kicking non-fresh %s from array!\n", 3500 bdevname(rdev->bdev,b)); 3501 md_kick_rdev_from_array(rdev); 3502 continue; 3503 } 3504 } 3505 if (mddev->level == LEVEL_MULTIPATH) { 3506 rdev->desc_nr = i++; 3507 rdev->raid_disk = rdev->desc_nr; 3508 set_bit(In_sync, &rdev->flags); 3509 } else if (rdev->raid_disk >= 3510 (mddev->raid_disks - min(0, mddev->delta_disks)) && 3511 !test_bit(Journal, &rdev->flags)) { 3512 rdev->raid_disk = -1; 3513 clear_bit(In_sync, &rdev->flags); 3514 } 3515 } 3516 } 3517 3518 /* Read a fixed-point number. 3519 * Numbers in sysfs attributes should be in "standard" units where 3520 * possible, so time should be in seconds. 3521 * However we internally use a much smaller unit such as 3522 * milliseconds or jiffies. 3523 * This function takes a decimal number with a possible fractional 3524 * component, and produces an integer which is the result of 3525 * multiplying that number by 10^'scale', 3526 * all without any floating-point arithmetic. For example, with scale==3 the string "5.73" yields 5730. 3527 */ 3528 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) 3529 { 3530 unsigned long result = 0; 3531 long decimals = -1; 3532 while (isdigit(*cp) || (*cp == '.'
&& decimals < 0)) { 3533 if (*cp == '.') 3534 decimals = 0; 3535 else if (decimals < scale) { 3536 unsigned int value; 3537 value = *cp - '0'; 3538 result = result * 10 + value; 3539 if (decimals >= 0) 3540 decimals++; 3541 } 3542 cp++; 3543 } 3544 if (*cp == '\n') 3545 cp++; 3546 if (*cp) 3547 return -EINVAL; 3548 if (decimals < 0) 3549 decimals = 0; 3550 while (decimals < scale) { 3551 result *= 10; 3552 decimals ++; 3553 } 3554 *res = result; 3555 return 0; 3556 } 3557 3558 static ssize_t 3559 safe_delay_show(struct mddev *mddev, char *page) 3560 { 3561 int msec = (mddev->safemode_delay*1000)/HZ; 3562 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); 3563 } 3564 static ssize_t 3565 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) 3566 { 3567 unsigned long msec; 3568 3569 if (mddev_is_clustered(mddev)) { 3570 pr_warn("md: Safemode is disabled for clustered mode\n"); 3571 return -EINVAL; 3572 } 3573 3574 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0) 3575 return -EINVAL; 3576 if (msec == 0) 3577 mddev->safemode_delay = 0; 3578 else { 3579 unsigned long old_delay = mddev->safemode_delay; 3580 unsigned long new_delay = (msec*HZ)/1000; 3581 3582 if (new_delay == 0) 3583 new_delay = 1; 3584 mddev->safemode_delay = new_delay; 3585 if (new_delay < old_delay || old_delay == 0) 3586 mod_timer(&mddev->safemode_timer, jiffies+1); 3587 } 3588 return len; 3589 } 3590 static struct md_sysfs_entry md_safe_delay = 3591 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 3592 3593 static ssize_t 3594 level_show(struct mddev *mddev, char *page) 3595 { 3596 struct md_personality *p; 3597 int ret; 3598 spin_lock(&mddev->lock); 3599 p = mddev->pers; 3600 if (p) 3601 ret = sprintf(page, "%s\n", p->name); 3602 else if (mddev->clevel[0]) 3603 ret = sprintf(page, "%s\n", mddev->clevel); 3604 else if (mddev->level != LEVEL_NONE) 3605 ret = sprintf(page, "%d\n", mddev->level); 3606 else 3607 ret = 0; 3608 spin_unlock(&mddev->lock); 3609 return ret; 3610 } 3611 3612 static ssize_t 3613 level_store(struct mddev *mddev, const char *buf, size_t len) 3614 { 3615 char clevel[16]; 3616 ssize_t rv; 3617 size_t slen = len; 3618 struct md_personality *pers, *oldpers; 3619 long level; 3620 void *priv, *oldpriv; 3621 struct md_rdev *rdev; 3622 3623 if (slen == 0 || slen >= sizeof(clevel)) 3624 return -EINVAL; 3625 3626 rv = mddev_lock(mddev); 3627 if (rv) 3628 return rv; 3629 3630 if (mddev->pers == NULL) { 3631 strncpy(mddev->clevel, buf, slen); 3632 if (mddev->clevel[slen-1] == '\n') 3633 slen--; 3634 mddev->clevel[slen] = 0; 3635 mddev->level = LEVEL_NONE; 3636 rv = len; 3637 goto out_unlock; 3638 } 3639 rv = -EROFS; 3640 if (mddev->ro) 3641 goto out_unlock; 3642 3643 /* request to change the personality. Need to ensure: 3644 * - array is not engaged in resync/recovery/reshape 3645 * - old personality can be suspended 3646 * - new personality will access other array. 
3647 */ 3648 3649 rv = -EBUSY; 3650 if (mddev->sync_thread || 3651 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3652 mddev->reshape_position != MaxSector || 3653 mddev->sysfs_active) 3654 goto out_unlock; 3655 3656 rv = -EINVAL; 3657 if (!mddev->pers->quiesce) { 3658 pr_warn("md: %s: %s does not support online personality change\n", 3659 mdname(mddev), mddev->pers->name); 3660 goto out_unlock; 3661 } 3662 3663 /* Now find the new personality */ 3664 strncpy(clevel, buf, slen); 3665 if (clevel[slen-1] == '\n') 3666 slen--; 3667 clevel[slen] = 0; 3668 if (kstrtol(clevel, 10, &level)) 3669 level = LEVEL_NONE; 3670 3671 if (request_module("md-%s", clevel) != 0) 3672 request_module("md-level-%s", clevel); 3673 spin_lock(&pers_lock); 3674 pers = find_pers(level, clevel); 3675 if (!pers || !try_module_get(pers->owner)) { 3676 spin_unlock(&pers_lock); 3677 pr_warn("md: personality %s not loaded\n", clevel); 3678 rv = -EINVAL; 3679 goto out_unlock; 3680 } 3681 spin_unlock(&pers_lock); 3682 3683 if (pers == mddev->pers) { 3684 /* Nothing to do! */ 3685 module_put(pers->owner); 3686 rv = len; 3687 goto out_unlock; 3688 } 3689 if (!pers->takeover) { 3690 module_put(pers->owner); 3691 pr_warn("md: %s: %s does not support personality takeover\n", 3692 mdname(mddev), clevel); 3693 rv = -EINVAL; 3694 goto out_unlock; 3695 } 3696 3697 rdev_for_each(rdev, mddev) 3698 rdev->new_raid_disk = rdev->raid_disk; 3699 3700 /* ->takeover must set new_* and/or delta_disks 3701 * if it succeeds, and may set them when it fails. 3702 */ 3703 priv = pers->takeover(mddev); 3704 if (IS_ERR(priv)) { 3705 mddev->new_level = mddev->level; 3706 mddev->new_layout = mddev->layout; 3707 mddev->new_chunk_sectors = mddev->chunk_sectors; 3708 mddev->raid_disks -= mddev->delta_disks; 3709 mddev->delta_disks = 0; 3710 mddev->reshape_backwards = 0; 3711 module_put(pers->owner); 3712 pr_warn("md: %s: %s would not accept array\n", 3713 mdname(mddev), clevel); 3714 rv = PTR_ERR(priv); 3715 goto out_unlock; 3716 } 3717 3718 /* Looks like we have a winner */ 3719 mddev_suspend(mddev); 3720 mddev_detach(mddev); 3721 3722 spin_lock(&mddev->lock); 3723 oldpers = mddev->pers; 3724 oldpriv = mddev->private; 3725 mddev->pers = pers; 3726 mddev->private = priv; 3727 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 3728 mddev->level = mddev->new_level; 3729 mddev->layout = mddev->new_layout; 3730 mddev->chunk_sectors = mddev->new_chunk_sectors; 3731 mddev->delta_disks = 0; 3732 mddev->reshape_backwards = 0; 3733 mddev->degraded = 0; 3734 spin_unlock(&mddev->lock); 3735 3736 if (oldpers->sync_request == NULL && 3737 mddev->external) { 3738 /* We are converting from a no-redundancy array 3739 * to a redundancy array and metadata is managed 3740 * externally so we need to be sure that writes 3741 * won't block due to a need to transition 3742 * clean->dirty 3743 * until external management is started. 
3744 */ 3745 mddev->in_sync = 0; 3746 mddev->safemode_delay = 0; 3747 mddev->safemode = 0; 3748 } 3749 3750 oldpers->free(mddev, oldpriv); 3751 3752 if (oldpers->sync_request == NULL && 3753 pers->sync_request != NULL) { 3754 /* need to add the md_redundancy_group */ 3755 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 3756 pr_warn("md: cannot register extra attributes for %s\n", 3757 mdname(mddev)); 3758 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 3759 } 3760 if (oldpers->sync_request != NULL && 3761 pers->sync_request == NULL) { 3762 /* need to remove the md_redundancy_group */ 3763 if (mddev->to_remove == NULL) 3764 mddev->to_remove = &md_redundancy_group; 3765 } 3766 3767 module_put(oldpers->owner); 3768 3769 rdev_for_each(rdev, mddev) { 3770 if (rdev->raid_disk < 0) 3771 continue; 3772 if (rdev->new_raid_disk >= mddev->raid_disks) 3773 rdev->new_raid_disk = -1; 3774 if (rdev->new_raid_disk == rdev->raid_disk) 3775 continue; 3776 sysfs_unlink_rdev(mddev, rdev); 3777 } 3778 rdev_for_each(rdev, mddev) { 3779 if (rdev->raid_disk < 0) 3780 continue; 3781 if (rdev->new_raid_disk == rdev->raid_disk) 3782 continue; 3783 rdev->raid_disk = rdev->new_raid_disk; 3784 if (rdev->raid_disk < 0) 3785 clear_bit(In_sync, &rdev->flags); 3786 else { 3787 if (sysfs_link_rdev(mddev, rdev)) 3788 pr_warn("md: cannot register rd%d for %s after level change\n", 3789 rdev->raid_disk, mdname(mddev)); 3790 } 3791 } 3792 3793 if (pers->sync_request == NULL) { 3794 /* this is now an array without redundancy, so 3795 * it must always be in_sync 3796 */ 3797 mddev->in_sync = 1; 3798 del_timer_sync(&mddev->safemode_timer); 3799 } 3800 blk_set_stacking_limits(&mddev->queue->limits); 3801 pers->run(mddev); 3802 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 3803 mddev_resume(mddev); 3804 if (!mddev->thread) 3805 md_update_sb(mddev, 1); 3806 sysfs_notify(&mddev->kobj, NULL, "level"); 3807 md_new_event(mddev); 3808 rv = len; 3809 out_unlock: 3810 mddev_unlock(mddev); 3811 return rv; 3812 } 3813 3814 static struct md_sysfs_entry md_level = 3815 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 3816 3817 static ssize_t 3818 layout_show(struct mddev *mddev, char *page) 3819 { 3820 /* just a number, not meaningful for all levels */ 3821 if (mddev->reshape_position != MaxSector && 3822 mddev->layout != mddev->new_layout) 3823 return sprintf(page, "%d (%d)\n", 3824 mddev->new_layout, mddev->layout); 3825 return sprintf(page, "%d\n", mddev->layout); 3826 } 3827 3828 static ssize_t 3829 layout_store(struct mddev *mddev, const char *buf, size_t len) 3830 { 3831 unsigned int n; 3832 int err; 3833 3834 err = kstrtouint(buf, 10, &n); 3835 if (err < 0) 3836 return err; 3837 err = mddev_lock(mddev); 3838 if (err) 3839 return err; 3840 3841 if (mddev->pers) { 3842 if (mddev->pers->check_reshape == NULL) 3843 err = -EBUSY; 3844 else if (mddev->ro) 3845 err = -EROFS; 3846 else { 3847 mddev->new_layout = n; 3848 err = mddev->pers->check_reshape(mddev); 3849 if (err) 3850 mddev->new_layout = mddev->layout; 3851 } 3852 } else { 3853 mddev->new_layout = n; 3854 if (mddev->reshape_position == MaxSector) 3855 mddev->layout = n; 3856 } 3857 mddev_unlock(mddev); 3858 return err ?: len; 3859 } 3860 static struct md_sysfs_entry md_layout = 3861 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 3862 3863 static ssize_t 3864 raid_disks_show(struct mddev *mddev, char *page) 3865 { 3866 if (mddev->raid_disks == 0) 3867 return 0; 3868 if (mddev->reshape_position != MaxSector && 3869 
mddev->delta_disks != 0) 3870 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 3871 mddev->raid_disks - mddev->delta_disks); 3872 return sprintf(page, "%d\n", mddev->raid_disks); 3873 } 3874 3875 static int update_raid_disks(struct mddev *mddev, int raid_disks); 3876 3877 static ssize_t 3878 raid_disks_store(struct mddev *mddev, const char *buf, size_t len) 3879 { 3880 unsigned int n; 3881 int err; 3882 3883 err = kstrtouint(buf, 10, &n); 3884 if (err < 0) 3885 return err; 3886 3887 err = mddev_lock(mddev); 3888 if (err) 3889 return err; 3890 if (mddev->pers) 3891 err = update_raid_disks(mddev, n); 3892 else if (mddev->reshape_position != MaxSector) { 3893 struct md_rdev *rdev; 3894 int olddisks = mddev->raid_disks - mddev->delta_disks; 3895 3896 err = -EINVAL; 3897 rdev_for_each(rdev, mddev) { 3898 if (olddisks < n && 3899 rdev->data_offset < rdev->new_data_offset) 3900 goto out_unlock; 3901 if (olddisks > n && 3902 rdev->data_offset > rdev->new_data_offset) 3903 goto out_unlock; 3904 } 3905 err = 0; 3906 mddev->delta_disks = n - olddisks; 3907 mddev->raid_disks = n; 3908 mddev->reshape_backwards = (mddev->delta_disks < 0); 3909 } else 3910 mddev->raid_disks = n; 3911 out_unlock: 3912 mddev_unlock(mddev); 3913 return err ? err : len; 3914 } 3915 static struct md_sysfs_entry md_raid_disks = 3916 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 3917 3918 static ssize_t 3919 chunk_size_show(struct mddev *mddev, char *page) 3920 { 3921 if (mddev->reshape_position != MaxSector && 3922 mddev->chunk_sectors != mddev->new_chunk_sectors) 3923 return sprintf(page, "%d (%d)\n", 3924 mddev->new_chunk_sectors << 9, 3925 mddev->chunk_sectors << 9); 3926 return sprintf(page, "%d\n", mddev->chunk_sectors << 9); 3927 } 3928 3929 static ssize_t 3930 chunk_size_store(struct mddev *mddev, const char *buf, size_t len) 3931 { 3932 unsigned long n; 3933 int err; 3934 3935 err = kstrtoul(buf, 10, &n); 3936 if (err < 0) 3937 return err; 3938 3939 err = mddev_lock(mddev); 3940 if (err) 3941 return err; 3942 if (mddev->pers) { 3943 if (mddev->pers->check_reshape == NULL) 3944 err = -EBUSY; 3945 else if (mddev->ro) 3946 err = -EROFS; 3947 else { 3948 mddev->new_chunk_sectors = n >> 9; 3949 err = mddev->pers->check_reshape(mddev); 3950 if (err) 3951 mddev->new_chunk_sectors = mddev->chunk_sectors; 3952 } 3953 } else { 3954 mddev->new_chunk_sectors = n >> 9; 3955 if (mddev->reshape_position == MaxSector) 3956 mddev->chunk_sectors = n >> 9; 3957 } 3958 mddev_unlock(mddev); 3959 return err ?: len; 3960 } 3961 static struct md_sysfs_entry md_chunk_size = 3962 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 3963 3964 static ssize_t 3965 resync_start_show(struct mddev *mddev, char *page) 3966 { 3967 if (mddev->recovery_cp == MaxSector) 3968 return sprintf(page, "none\n"); 3969 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 3970 } 3971 3972 static ssize_t 3973 resync_start_store(struct mddev *mddev, const char *buf, size_t len) 3974 { 3975 unsigned long long n; 3976 int err; 3977 3978 if (cmd_match(buf, "none")) 3979 n = MaxSector; 3980 else { 3981 err = kstrtoull(buf, 10, &n); 3982 if (err < 0) 3983 return err; 3984 if (n != (sector_t)n) 3985 return -EINVAL; 3986 } 3987 3988 err = mddev_lock(mddev); 3989 if (err) 3990 return err; 3991 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 3992 err = -EBUSY; 3993 3994 if (!err) { 3995 mddev->recovery_cp = n; 3996 if (mddev->pers) 3997 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 3998 
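		/* The checkpoint is only updated in memory here; setting
		 * MD_SB_CHANGE_CLEAN records that the superblock needs to
		 * be rewritten so the new value reaches disk.  (Writing
		 * "none" above maps to MaxSector, i.e. "fully recovered".)
		 */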
} 3999 mddev_unlock(mddev); 4000 return err ?: len; 4001 } 4002 static struct md_sysfs_entry md_resync_start = 4003 __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, 4004 resync_start_show, resync_start_store); 4005 4006 /* 4007 * The array state can be: 4008 * 4009 * clear 4010 * No devices, no size, no level 4011 * Equivalent to STOP_ARRAY ioctl 4012 * inactive 4013 * May have some settings, but array is not active 4014 * all IO results in error 4015 * When written, doesn't tear down array, but just stops it 4016 * suspended (not supported yet) 4017 * All IO requests will block. The array can be reconfigured. 4018 * Writing this, if accepted, will block until array is quiescent 4019 * readonly 4020 * no resync can happen. no superblocks get written. 4021 * write requests fail 4022 * read-auto 4023 * like readonly, but behaves like 'clean' on a write request. 4024 * 4025 * clean - no pending writes, but otherwise active. 4026 * When written to inactive array, starts without resync 4027 * If a write request arrives then 4028 * if metadata is known, mark 'dirty' and switch to 'active'. 4029 * if not known, block and switch to write-pending 4030 * If written to an active array that has pending writes, then fails. 4031 * active 4032 * fully active: IO and resync can be happening. 4033 * When written to inactive array, starts with resync 4034 * 4035 * write-pending 4036 * clean, but writes are blocked waiting for 'active' to be written. 4037 * 4038 * active-idle 4039 * like active, but no writes have been seen for a while (100msec). 4040 * 4041 */ 4042 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 4043 write_pending, active_idle, bad_word}; 4044 static char *array_states[] = { 4045 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 4046 "write-pending", "active-idle", NULL }; 4047 4048 static int match_word(const char *word, char **list) 4049 { 4050 int n; 4051 for (n=0; list[n]; n++) 4052 if (cmd_match(word, list[n])) 4053 break; 4054 return n; 4055 } 4056 4057 static ssize_t 4058 array_state_show(struct mddev *mddev, char *page) 4059 { 4060 enum array_state st = inactive; 4061 4062 if (mddev->pers) 4063 switch(mddev->ro) { 4064 case 1: 4065 st = readonly; 4066 break; 4067 case 2: 4068 st = read_auto; 4069 break; 4070 case 0: 4071 spin_lock(&mddev->lock); 4072 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 4073 st = write_pending; 4074 else if (mddev->in_sync) 4075 st = clean; 4076 else if (mddev->safemode) 4077 st = active_idle; 4078 else 4079 st = active; 4080 spin_unlock(&mddev->lock); 4081 } 4082 else { 4083 if (list_empty(&mddev->disks) && 4084 mddev->raid_disks == 0 && 4085 mddev->dev_sectors == 0) 4086 st = clear; 4087 else 4088 st = inactive; 4089 } 4090 return sprintf(page, "%s\n", array_states[st]); 4091 } 4092 4093 static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev); 4094 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev); 4095 static int do_md_run(struct mddev *mddev); 4096 static int restart_array(struct mddev *mddev); 4097 4098 static ssize_t 4099 array_state_store(struct mddev *mddev, const char *buf, size_t len) 4100 { 4101 int err = 0; 4102 enum array_state st = match_word(buf, array_states); 4103 4104 if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) { 4105 /* don't take reconfig_mutex when toggling between 4106 * clean and active 4107 */ 4108 spin_lock(&mddev->lock); 4109 if (st == active) { 4110 restart_array(mddev); 4111 
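			/* Writing "active" releases any write-pending state:
			 * clearing MD_SB_CHANGE_PENDING and waking sb_wait
			 * below lets writers that are waiting for the
			 * metadata transition continue.
			 */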
clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4112 md_wakeup_thread(mddev->thread); 4113 wake_up(&mddev->sb_wait); 4114 } else /* st == clean */ { 4115 restart_array(mddev); 4116 if (!set_in_sync(mddev)) 4117 err = -EBUSY; 4118 } 4119 if (!err) 4120 sysfs_notify_dirent_safe(mddev->sysfs_state); 4121 spin_unlock(&mddev->lock); 4122 return err ?: len; 4123 } 4124 err = mddev_lock(mddev); 4125 if (err) 4126 return err; 4127 err = -EINVAL; 4128 switch(st) { 4129 case bad_word: 4130 break; 4131 case clear: 4132 /* stopping an active array */ 4133 err = do_md_stop(mddev, 0, NULL); 4134 break; 4135 case inactive: 4136 /* stopping an active array */ 4137 if (mddev->pers) 4138 err = do_md_stop(mddev, 2, NULL); 4139 else 4140 err = 0; /* already inactive */ 4141 break; 4142 case suspended: 4143 break; /* not supported yet */ 4144 case readonly: 4145 if (mddev->pers) 4146 err = md_set_readonly(mddev, NULL); 4147 else { 4148 mddev->ro = 1; 4149 set_disk_ro(mddev->gendisk, 1); 4150 err = do_md_run(mddev); 4151 } 4152 break; 4153 case read_auto: 4154 if (mddev->pers) { 4155 if (mddev->ro == 0) 4156 err = md_set_readonly(mddev, NULL); 4157 else if (mddev->ro == 1) 4158 err = restart_array(mddev); 4159 if (err == 0) { 4160 mddev->ro = 2; 4161 set_disk_ro(mddev->gendisk, 0); 4162 } 4163 } else { 4164 mddev->ro = 2; 4165 err = do_md_run(mddev); 4166 } 4167 break; 4168 case clean: 4169 if (mddev->pers) { 4170 err = restart_array(mddev); 4171 if (err) 4172 break; 4173 spin_lock(&mddev->lock); 4174 if (!set_in_sync(mddev)) 4175 err = -EBUSY; 4176 spin_unlock(&mddev->lock); 4177 } else 4178 err = -EINVAL; 4179 break; 4180 case active: 4181 if (mddev->pers) { 4182 err = restart_array(mddev); 4183 if (err) 4184 break; 4185 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4186 wake_up(&mddev->sb_wait); 4187 err = 0; 4188 } else { 4189 mddev->ro = 0; 4190 set_disk_ro(mddev->gendisk, 0); 4191 err = do_md_run(mddev); 4192 } 4193 break; 4194 case write_pending: 4195 case active_idle: 4196 /* these cannot be set */ 4197 break; 4198 } 4199 4200 if (!err) { 4201 if (mddev->hold_active == UNTIL_IOCTL) 4202 mddev->hold_active = 0; 4203 sysfs_notify_dirent_safe(mddev->sysfs_state); 4204 } 4205 mddev_unlock(mddev); 4206 return err ?: len; 4207 } 4208 static struct md_sysfs_entry md_array_state = 4209 __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 4210 4211 static ssize_t 4212 max_corrected_read_errors_show(struct mddev *mddev, char *page) { 4213 return sprintf(page, "%d\n", 4214 atomic_read(&mddev->max_corr_read_errors)); 4215 } 4216 4217 static ssize_t 4218 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) 4219 { 4220 unsigned int n; 4221 int rv; 4222 4223 rv = kstrtouint(buf, 10, &n); 4224 if (rv < 0) 4225 return rv; 4226 atomic_set(&mddev->max_corr_read_errors, n); 4227 return len; 4228 } 4229 4230 static struct md_sysfs_entry max_corr_read_errors = 4231 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, 4232 max_corrected_read_errors_store); 4233 4234 static ssize_t 4235 null_show(struct mddev *mddev, char *page) 4236 { 4237 return -EINVAL; 4238 } 4239 4240 static ssize_t 4241 new_dev_store(struct mddev *mddev, const char *buf, size_t len) 4242 { 4243 /* buf must be %d:%d\n? giving major and minor numbers */ 4244 /* The new device is added to the array. 4245 * If the array has a persistent superblock, we read the 4246 * superblock to initialise info and check validity. 
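	 * (For example, a write of "8:16" requests that the device with
	 * major 8, minor 16 be added.)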
	 * Otherwise, the only checking done is that in bind_rdev_to_array,
	 * which mainly checks size.
	 */
	char *e;
	int major = simple_strtoul(buf, &e, 10);
	int minor;
	dev_t dev;
	struct md_rdev *rdev;
	int err;

	if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
		return -EINVAL;
	minor = simple_strtoul(e+1, &e, 10);
	if (*e && *e != '\n')
		return -EINVAL;
	dev = MKDEV(major, minor);
	if (major != MAJOR(dev) ||
	    minor != MINOR(dev))
		return -EOVERFLOW;

	flush_workqueue(md_misc_wq);

	err = mddev_lock(mddev);
	if (err)
		return err;
	if (mddev->persistent) {
		rdev = md_import_device(dev, mddev->major_version,
					mddev->minor_version);
		if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
			struct md_rdev *rdev0
				= list_entry(mddev->disks.next,
					     struct md_rdev, same_set);
			err = super_types[mddev->major_version]
				.load_super(rdev, rdev0, mddev->minor_version);
			if (err < 0)
				goto out;
		}
	} else if (mddev->external)
		rdev = md_import_device(dev, -2, -1);
	else
		rdev = md_import_device(dev, -1, -1);

	if (IS_ERR(rdev)) {
		mddev_unlock(mddev);
		return PTR_ERR(rdev);
	}
	err = bind_rdev_to_array(rdev, mddev);
 out:
	if (err)
		export_rdev(rdev);
	mddev_unlock(mddev);
	if (!err)
		md_new_event(mddev);
	return err ? err : len;
}

static struct md_sysfs_entry md_new_device =
__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);

static ssize_t
bitmap_store(struct mddev *mddev, const char *buf, size_t len)
{
	char *end;
	unsigned long chunk, end_chunk;
	int err;

	err = mddev_lock(mddev);
	if (err)
		return err;
	if (!mddev->bitmap)
		goto out;
	/* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
	while (*buf) {
		chunk = end_chunk = simple_strtoul(buf, &end, 0);
		if (buf == end) break;
		if (*end == '-') { /* range */
			buf = end + 1;
			end_chunk = simple_strtoul(buf, &end, 0);
			if (buf == end) break;
		}
		if (*end && !isspace(*end)) break;
		bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
		buf = skip_spaces(end);
	}
	bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
out:
	mddev_unlock(mddev);
	return len;
}

static struct md_sysfs_entry md_bitmap =
__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);

static ssize_t
size_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%llu\n",
		(unsigned long long)mddev->dev_sectors / 2);
}

static int update_size(struct mddev *mddev, sector_t num_sectors);

static ssize_t
size_store(struct mddev *mddev, const char *buf, size_t len)
{
	/* If array is inactive, we can reduce the component size, but
	 * not increase it (except from 0).
	 * If array is active, we can try an on-line resize
	 */
	sector_t sectors;
	int err = strict_blocks_to_sectors(buf, &sectors);

	if (err < 0)
		return err;
	err = mddev_lock(mddev);
	if (err)
		return err;
	if (mddev->pers) {
		err = update_size(mddev, sectors);
		if (err == 0)
			md_update_sb(mddev, 1);
	} else {
		if (mddev->dev_sectors == 0 ||
		    mddev->dev_sectors > sectors)
			mddev->dev_sectors = sectors;
		else
			err = -ENOSPC;
	}
	mddev_unlock(mddev);
	return err ?
err : len; 4377 } 4378 4379 static struct md_sysfs_entry md_size = 4380 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 4381 4382 /* Metadata version. 4383 * This is one of 4384 * 'none' for arrays with no metadata (good luck...) 4385 * 'external' for arrays with externally managed metadata, 4386 * or N.M for internally known formats 4387 */ 4388 static ssize_t 4389 metadata_show(struct mddev *mddev, char *page) 4390 { 4391 if (mddev->persistent) 4392 return sprintf(page, "%d.%d\n", 4393 mddev->major_version, mddev->minor_version); 4394 else if (mddev->external) 4395 return sprintf(page, "external:%s\n", mddev->metadata_type); 4396 else 4397 return sprintf(page, "none\n"); 4398 } 4399 4400 static ssize_t 4401 metadata_store(struct mddev *mddev, const char *buf, size_t len) 4402 { 4403 int major, minor; 4404 char *e; 4405 int err; 4406 /* Changing the details of 'external' metadata is 4407 * always permitted. Otherwise there must be 4408 * no devices attached to the array. 4409 */ 4410 4411 err = mddev_lock(mddev); 4412 if (err) 4413 return err; 4414 err = -EBUSY; 4415 if (mddev->external && strncmp(buf, "external:", 9) == 0) 4416 ; 4417 else if (!list_empty(&mddev->disks)) 4418 goto out_unlock; 4419 4420 err = 0; 4421 if (cmd_match(buf, "none")) { 4422 mddev->persistent = 0; 4423 mddev->external = 0; 4424 mddev->major_version = 0; 4425 mddev->minor_version = 90; 4426 goto out_unlock; 4427 } 4428 if (strncmp(buf, "external:", 9) == 0) { 4429 size_t namelen = len-9; 4430 if (namelen >= sizeof(mddev->metadata_type)) 4431 namelen = sizeof(mddev->metadata_type)-1; 4432 strncpy(mddev->metadata_type, buf+9, namelen); 4433 mddev->metadata_type[namelen] = 0; 4434 if (namelen && mddev->metadata_type[namelen-1] == '\n') 4435 mddev->metadata_type[--namelen] = 0; 4436 mddev->persistent = 0; 4437 mddev->external = 1; 4438 mddev->major_version = 0; 4439 mddev->minor_version = 90; 4440 goto out_unlock; 4441 } 4442 major = simple_strtoul(buf, &e, 10); 4443 err = -EINVAL; 4444 if (e==buf || *e != '.') 4445 goto out_unlock; 4446 buf = e+1; 4447 minor = simple_strtoul(buf, &e, 10); 4448 if (e==buf || (*e && *e != '\n') ) 4449 goto out_unlock; 4450 err = -ENOENT; 4451 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 4452 goto out_unlock; 4453 mddev->major_version = major; 4454 mddev->minor_version = minor; 4455 mddev->persistent = 1; 4456 mddev->external = 0; 4457 err = 0; 4458 out_unlock: 4459 mddev_unlock(mddev); 4460 return err ?: len; 4461 } 4462 4463 static struct md_sysfs_entry md_metadata = 4464 __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 4465 4466 static ssize_t 4467 action_show(struct mddev *mddev, char *page) 4468 { 4469 char *type = "idle"; 4470 unsigned long recovery = mddev->recovery; 4471 if (test_bit(MD_RECOVERY_FROZEN, &recovery)) 4472 type = "frozen"; 4473 else if (test_bit(MD_RECOVERY_RUNNING, &recovery) || 4474 (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) { 4475 if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) 4476 type = "reshape"; 4477 else if (test_bit(MD_RECOVERY_SYNC, &recovery)) { 4478 if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) 4479 type = "resync"; 4480 else if (test_bit(MD_RECOVERY_CHECK, &recovery)) 4481 type = "check"; 4482 else 4483 type = "repair"; 4484 } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) 4485 type = "recover"; 4486 else if (mddev->reshape_position != MaxSector) 4487 type = "reshape"; 4488 } 4489 return sprintf(page, "%s\n", type); 4490 } 4491 4492 static ssize_t 4493 
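/*
 * Values written to "sync_action" mirror what action_show() reports
 * above: e.g. "check" or "repair" request a scrub, "idle" interrupts a
 * running operation, and "frozen" additionally blocks new ones.  A
 * typical (illustrative) use is "echo check > /sys/block/mdX/md/sync_action".
 */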
action_store(struct mddev *mddev, const char *page, size_t len) 4494 { 4495 if (!mddev->pers || !mddev->pers->sync_request) 4496 return -EINVAL; 4497 4498 4499 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { 4500 if (cmd_match(page, "frozen")) 4501 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4502 else 4503 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4504 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 4505 mddev_lock(mddev) == 0) { 4506 flush_workqueue(md_misc_wq); 4507 if (mddev->sync_thread) { 4508 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4509 md_reap_sync_thread(mddev); 4510 } 4511 mddev_unlock(mddev); 4512 } 4513 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4514 return -EBUSY; 4515 else if (cmd_match(page, "resync")) 4516 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4517 else if (cmd_match(page, "recover")) { 4518 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4519 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 4520 } else if (cmd_match(page, "reshape")) { 4521 int err; 4522 if (mddev->pers->start_reshape == NULL) 4523 return -EINVAL; 4524 err = mddev_lock(mddev); 4525 if (!err) { 4526 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4527 err = -EBUSY; 4528 else { 4529 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4530 err = mddev->pers->start_reshape(mddev); 4531 } 4532 mddev_unlock(mddev); 4533 } 4534 if (err) 4535 return err; 4536 sysfs_notify(&mddev->kobj, NULL, "degraded"); 4537 } else { 4538 if (cmd_match(page, "check")) 4539 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4540 else if (!cmd_match(page, "repair")) 4541 return -EINVAL; 4542 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4543 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 4544 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4545 } 4546 if (mddev->ro == 2) { 4547 /* A write to sync_action is enough to justify 4548 * canceling read-auto mode 4549 */ 4550 mddev->ro = 0; 4551 md_wakeup_thread(mddev->sync_thread); 4552 } 4553 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4554 md_wakeup_thread(mddev->thread); 4555 sysfs_notify_dirent_safe(mddev->sysfs_action); 4556 return len; 4557 } 4558 4559 static struct md_sysfs_entry md_scan_mode = 4560 __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 4561 4562 static ssize_t 4563 last_sync_action_show(struct mddev *mddev, char *page) 4564 { 4565 return sprintf(page, "%s\n", mddev->last_sync_action); 4566 } 4567 4568 static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); 4569 4570 static ssize_t 4571 mismatch_cnt_show(struct mddev *mddev, char *page) 4572 { 4573 return sprintf(page, "%llu\n", 4574 (unsigned long long) 4575 atomic64_read(&mddev->resync_mismatches)); 4576 } 4577 4578 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 4579 4580 static ssize_t 4581 sync_min_show(struct mddev *mddev, char *page) 4582 { 4583 return sprintf(page, "%d (%s)\n", speed_min(mddev), 4584 mddev->sync_speed_min ? 
"local": "system"); 4585 } 4586 4587 static ssize_t 4588 sync_min_store(struct mddev *mddev, const char *buf, size_t len) 4589 { 4590 unsigned int min; 4591 int rv; 4592 4593 if (strncmp(buf, "system", 6)==0) { 4594 min = 0; 4595 } else { 4596 rv = kstrtouint(buf, 10, &min); 4597 if (rv < 0) 4598 return rv; 4599 if (min == 0) 4600 return -EINVAL; 4601 } 4602 mddev->sync_speed_min = min; 4603 return len; 4604 } 4605 4606 static struct md_sysfs_entry md_sync_min = 4607 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 4608 4609 static ssize_t 4610 sync_max_show(struct mddev *mddev, char *page) 4611 { 4612 return sprintf(page, "%d (%s)\n", speed_max(mddev), 4613 mddev->sync_speed_max ? "local": "system"); 4614 } 4615 4616 static ssize_t 4617 sync_max_store(struct mddev *mddev, const char *buf, size_t len) 4618 { 4619 unsigned int max; 4620 int rv; 4621 4622 if (strncmp(buf, "system", 6)==0) { 4623 max = 0; 4624 } else { 4625 rv = kstrtouint(buf, 10, &max); 4626 if (rv < 0) 4627 return rv; 4628 if (max == 0) 4629 return -EINVAL; 4630 } 4631 mddev->sync_speed_max = max; 4632 return len; 4633 } 4634 4635 static struct md_sysfs_entry md_sync_max = 4636 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 4637 4638 static ssize_t 4639 degraded_show(struct mddev *mddev, char *page) 4640 { 4641 return sprintf(page, "%d\n", mddev->degraded); 4642 } 4643 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 4644 4645 static ssize_t 4646 sync_force_parallel_show(struct mddev *mddev, char *page) 4647 { 4648 return sprintf(page, "%d\n", mddev->parallel_resync); 4649 } 4650 4651 static ssize_t 4652 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) 4653 { 4654 long n; 4655 4656 if (kstrtol(buf, 10, &n)) 4657 return -EINVAL; 4658 4659 if (n != 0 && n != 1) 4660 return -EINVAL; 4661 4662 mddev->parallel_resync = n; 4663 4664 if (mddev->sync_thread) 4665 wake_up(&resync_wait); 4666 4667 return len; 4668 } 4669 4670 /* force parallel resync, even with shared block devices */ 4671 static struct md_sysfs_entry md_sync_force_parallel = 4672 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 4673 sync_force_parallel_show, sync_force_parallel_store); 4674 4675 static ssize_t 4676 sync_speed_show(struct mddev *mddev, char *page) 4677 { 4678 unsigned long resync, dt, db; 4679 if (mddev->curr_resync == 0) 4680 return sprintf(page, "none\n"); 4681 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 4682 dt = (jiffies - mddev->resync_mark) / HZ; 4683 if (!dt) dt++; 4684 db = resync - mddev->resync_mark_cnt; 4685 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 4686 } 4687 4688 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 4689 4690 static ssize_t 4691 sync_completed_show(struct mddev *mddev, char *page) 4692 { 4693 unsigned long long max_sectors, resync; 4694 4695 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4696 return sprintf(page, "none\n"); 4697 4698 if (mddev->curr_resync == 1 || 4699 mddev->curr_resync == 2) 4700 return sprintf(page, "delayed\n"); 4701 4702 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 4703 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 4704 max_sectors = mddev->resync_max_sectors; 4705 else 4706 max_sectors = mddev->dev_sectors; 4707 4708 resync = mddev->curr_resync_completed; 4709 return sprintf(page, "%llu / %llu\n", resync, max_sectors); 4710 } 4711 4712 static struct md_sysfs_entry md_sync_completed = 4713 __ATTR_PREALLOC(sync_completed, S_IRUGO, 
sync_completed_show, NULL); 4714 4715 static ssize_t 4716 min_sync_show(struct mddev *mddev, char *page) 4717 { 4718 return sprintf(page, "%llu\n", 4719 (unsigned long long)mddev->resync_min); 4720 } 4721 static ssize_t 4722 min_sync_store(struct mddev *mddev, const char *buf, size_t len) 4723 { 4724 unsigned long long min; 4725 int err; 4726 4727 if (kstrtoull(buf, 10, &min)) 4728 return -EINVAL; 4729 4730 spin_lock(&mddev->lock); 4731 err = -EINVAL; 4732 if (min > mddev->resync_max) 4733 goto out_unlock; 4734 4735 err = -EBUSY; 4736 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4737 goto out_unlock; 4738 4739 /* Round down to multiple of 4K for safety */ 4740 mddev->resync_min = round_down(min, 8); 4741 err = 0; 4742 4743 out_unlock: 4744 spin_unlock(&mddev->lock); 4745 return err ?: len; 4746 } 4747 4748 static struct md_sysfs_entry md_min_sync = 4749 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 4750 4751 static ssize_t 4752 max_sync_show(struct mddev *mddev, char *page) 4753 { 4754 if (mddev->resync_max == MaxSector) 4755 return sprintf(page, "max\n"); 4756 else 4757 return sprintf(page, "%llu\n", 4758 (unsigned long long)mddev->resync_max); 4759 } 4760 static ssize_t 4761 max_sync_store(struct mddev *mddev, const char *buf, size_t len) 4762 { 4763 int err; 4764 spin_lock(&mddev->lock); 4765 if (strncmp(buf, "max", 3) == 0) 4766 mddev->resync_max = MaxSector; 4767 else { 4768 unsigned long long max; 4769 int chunk; 4770 4771 err = -EINVAL; 4772 if (kstrtoull(buf, 10, &max)) 4773 goto out_unlock; 4774 if (max < mddev->resync_min) 4775 goto out_unlock; 4776 4777 err = -EBUSY; 4778 if (max < mddev->resync_max && 4779 mddev->ro == 0 && 4780 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4781 goto out_unlock; 4782 4783 /* Must be a multiple of chunk_size */ 4784 chunk = mddev->chunk_sectors; 4785 if (chunk) { 4786 sector_t temp = max; 4787 4788 err = -EINVAL; 4789 if (sector_div(temp, chunk)) 4790 goto out_unlock; 4791 } 4792 mddev->resync_max = max; 4793 } 4794 wake_up(&mddev->recovery_wait); 4795 err = 0; 4796 out_unlock: 4797 spin_unlock(&mddev->lock); 4798 return err ?: len; 4799 } 4800 4801 static struct md_sysfs_entry md_max_sync = 4802 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 4803 4804 static ssize_t 4805 suspend_lo_show(struct mddev *mddev, char *page) 4806 { 4807 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 4808 } 4809 4810 static ssize_t 4811 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) 4812 { 4813 unsigned long long old, new; 4814 int err; 4815 4816 err = kstrtoull(buf, 10, &new); 4817 if (err < 0) 4818 return err; 4819 if (new != (sector_t)new) 4820 return -EINVAL; 4821 4822 err = mddev_lock(mddev); 4823 if (err) 4824 return err; 4825 err = -EINVAL; 4826 if (mddev->pers == NULL || 4827 mddev->pers->quiesce == NULL) 4828 goto unlock; 4829 old = mddev->suspend_lo; 4830 mddev->suspend_lo = new; 4831 if (new >= old) 4832 /* Shrinking suspended region */ 4833 mddev->pers->quiesce(mddev, 2); 4834 else { 4835 /* Expanding suspended region - need to wait */ 4836 mddev->pers->quiesce(mddev, 1); 4837 mddev->pers->quiesce(mddev, 0); 4838 } 4839 err = 0; 4840 unlock: 4841 mddev_unlock(mddev); 4842 return err ?: len; 4843 } 4844 static struct md_sysfs_entry md_suspend_lo = 4845 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 4846 4847 static ssize_t 4848 suspend_hi_show(struct mddev *mddev, char *page) 4849 { 4850 return sprintf(page, "%llu\n", (unsigned long 
long)mddev->suspend_hi); 4851 } 4852 4853 static ssize_t 4854 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) 4855 { 4856 unsigned long long old, new; 4857 int err; 4858 4859 err = kstrtoull(buf, 10, &new); 4860 if (err < 0) 4861 return err; 4862 if (new != (sector_t)new) 4863 return -EINVAL; 4864 4865 err = mddev_lock(mddev); 4866 if (err) 4867 return err; 4868 err = -EINVAL; 4869 if (mddev->pers == NULL || 4870 mddev->pers->quiesce == NULL) 4871 goto unlock; 4872 old = mddev->suspend_hi; 4873 mddev->suspend_hi = new; 4874 if (new <= old) 4875 /* Shrinking suspended region */ 4876 mddev->pers->quiesce(mddev, 2); 4877 else { 4878 /* Expanding suspended region - need to wait */ 4879 mddev->pers->quiesce(mddev, 1); 4880 mddev->pers->quiesce(mddev, 0); 4881 } 4882 err = 0; 4883 unlock: 4884 mddev_unlock(mddev); 4885 return err ?: len; 4886 } 4887 static struct md_sysfs_entry md_suspend_hi = 4888 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 4889 4890 static ssize_t 4891 reshape_position_show(struct mddev *mddev, char *page) 4892 { 4893 if (mddev->reshape_position != MaxSector) 4894 return sprintf(page, "%llu\n", 4895 (unsigned long long)mddev->reshape_position); 4896 strcpy(page, "none\n"); 4897 return 5; 4898 } 4899 4900 static ssize_t 4901 reshape_position_store(struct mddev *mddev, const char *buf, size_t len) 4902 { 4903 struct md_rdev *rdev; 4904 unsigned long long new; 4905 int err; 4906 4907 err = kstrtoull(buf, 10, &new); 4908 if (err < 0) 4909 return err; 4910 if (new != (sector_t)new) 4911 return -EINVAL; 4912 err = mddev_lock(mddev); 4913 if (err) 4914 return err; 4915 err = -EBUSY; 4916 if (mddev->pers) 4917 goto unlock; 4918 mddev->reshape_position = new; 4919 mddev->delta_disks = 0; 4920 mddev->reshape_backwards = 0; 4921 mddev->new_level = mddev->level; 4922 mddev->new_layout = mddev->layout; 4923 mddev->new_chunk_sectors = mddev->chunk_sectors; 4924 rdev_for_each(rdev, mddev) 4925 rdev->new_data_offset = rdev->data_offset; 4926 err = 0; 4927 unlock: 4928 mddev_unlock(mddev); 4929 return err ?: len; 4930 } 4931 4932 static struct md_sysfs_entry md_reshape_position = 4933 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 4934 reshape_position_store); 4935 4936 static ssize_t 4937 reshape_direction_show(struct mddev *mddev, char *page) 4938 { 4939 return sprintf(page, "%s\n", 4940 mddev->reshape_backwards ? 
"backwards" : "forwards"); 4941 } 4942 4943 static ssize_t 4944 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) 4945 { 4946 int backwards = 0; 4947 int err; 4948 4949 if (cmd_match(buf, "forwards")) 4950 backwards = 0; 4951 else if (cmd_match(buf, "backwards")) 4952 backwards = 1; 4953 else 4954 return -EINVAL; 4955 if (mddev->reshape_backwards == backwards) 4956 return len; 4957 4958 err = mddev_lock(mddev); 4959 if (err) 4960 return err; 4961 /* check if we are allowed to change */ 4962 if (mddev->delta_disks) 4963 err = -EBUSY; 4964 else if (mddev->persistent && 4965 mddev->major_version == 0) 4966 err = -EINVAL; 4967 else 4968 mddev->reshape_backwards = backwards; 4969 mddev_unlock(mddev); 4970 return err ?: len; 4971 } 4972 4973 static struct md_sysfs_entry md_reshape_direction = 4974 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, 4975 reshape_direction_store); 4976 4977 static ssize_t 4978 array_size_show(struct mddev *mddev, char *page) 4979 { 4980 if (mddev->external_size) 4981 return sprintf(page, "%llu\n", 4982 (unsigned long long)mddev->array_sectors/2); 4983 else 4984 return sprintf(page, "default\n"); 4985 } 4986 4987 static ssize_t 4988 array_size_store(struct mddev *mddev, const char *buf, size_t len) 4989 { 4990 sector_t sectors; 4991 int err; 4992 4993 err = mddev_lock(mddev); 4994 if (err) 4995 return err; 4996 4997 /* cluster raid doesn't support change array_sectors */ 4998 if (mddev_is_clustered(mddev)) { 4999 mddev_unlock(mddev); 5000 return -EINVAL; 5001 } 5002 5003 if (strncmp(buf, "default", 7) == 0) { 5004 if (mddev->pers) 5005 sectors = mddev->pers->size(mddev, 0, 0); 5006 else 5007 sectors = mddev->array_sectors; 5008 5009 mddev->external_size = 0; 5010 } else { 5011 if (strict_blocks_to_sectors(buf, §ors) < 0) 5012 err = -EINVAL; 5013 else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) 5014 err = -E2BIG; 5015 else 5016 mddev->external_size = 1; 5017 } 5018 5019 if (!err) { 5020 mddev->array_sectors = sectors; 5021 if (mddev->pers) { 5022 set_capacity(mddev->gendisk, mddev->array_sectors); 5023 revalidate_disk(mddev->gendisk); 5024 } 5025 } 5026 mddev_unlock(mddev); 5027 return err ?: len; 5028 } 5029 5030 static struct md_sysfs_entry md_array_size = 5031 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, 5032 array_size_store); 5033 5034 static ssize_t 5035 consistency_policy_show(struct mddev *mddev, char *page) 5036 { 5037 int ret; 5038 5039 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 5040 ret = sprintf(page, "journal\n"); 5041 } else if (test_bit(MD_HAS_PPL, &mddev->flags)) { 5042 ret = sprintf(page, "ppl\n"); 5043 } else if (mddev->bitmap) { 5044 ret = sprintf(page, "bitmap\n"); 5045 } else if (mddev->pers) { 5046 if (mddev->pers->sync_request) 5047 ret = sprintf(page, "resync\n"); 5048 else 5049 ret = sprintf(page, "none\n"); 5050 } else { 5051 ret = sprintf(page, "unknown\n"); 5052 } 5053 5054 return ret; 5055 } 5056 5057 static ssize_t 5058 consistency_policy_store(struct mddev *mddev, const char *buf, size_t len) 5059 { 5060 int err = 0; 5061 5062 if (mddev->pers) { 5063 if (mddev->pers->change_consistency_policy) 5064 err = mddev->pers->change_consistency_policy(mddev, buf); 5065 else 5066 err = -EBUSY; 5067 } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) { 5068 set_bit(MD_HAS_PPL, &mddev->flags); 5069 } else { 5070 err = -EINVAL; 5071 } 5072 5073 return err ? 
err : len; 5074 } 5075 5076 static struct md_sysfs_entry md_consistency_policy = 5077 __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show, 5078 consistency_policy_store); 5079 5080 static struct attribute *md_default_attrs[] = { 5081 &md_level.attr, 5082 &md_layout.attr, 5083 &md_raid_disks.attr, 5084 &md_chunk_size.attr, 5085 &md_size.attr, 5086 &md_resync_start.attr, 5087 &md_metadata.attr, 5088 &md_new_device.attr, 5089 &md_safe_delay.attr, 5090 &md_array_state.attr, 5091 &md_reshape_position.attr, 5092 &md_reshape_direction.attr, 5093 &md_array_size.attr, 5094 &max_corr_read_errors.attr, 5095 &md_consistency_policy.attr, 5096 NULL, 5097 }; 5098 5099 static struct attribute *md_redundancy_attrs[] = { 5100 &md_scan_mode.attr, 5101 &md_last_scan_mode.attr, 5102 &md_mismatches.attr, 5103 &md_sync_min.attr, 5104 &md_sync_max.attr, 5105 &md_sync_speed.attr, 5106 &md_sync_force_parallel.attr, 5107 &md_sync_completed.attr, 5108 &md_min_sync.attr, 5109 &md_max_sync.attr, 5110 &md_suspend_lo.attr, 5111 &md_suspend_hi.attr, 5112 &md_bitmap.attr, 5113 &md_degraded.attr, 5114 NULL, 5115 }; 5116 static struct attribute_group md_redundancy_group = { 5117 .name = NULL, 5118 .attrs = md_redundancy_attrs, 5119 }; 5120 5121 static ssize_t 5122 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 5123 { 5124 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 5125 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 5126 ssize_t rv; 5127 5128 if (!entry->show) 5129 return -EIO; 5130 spin_lock(&all_mddevs_lock); 5131 if (list_empty(&mddev->all_mddevs)) { 5132 spin_unlock(&all_mddevs_lock); 5133 return -EBUSY; 5134 } 5135 mddev_get(mddev); 5136 spin_unlock(&all_mddevs_lock); 5137 5138 rv = entry->show(mddev, page); 5139 mddev_put(mddev); 5140 return rv; 5141 } 5142 5143 static ssize_t 5144 md_attr_store(struct kobject *kobj, struct attribute *attr, 5145 const char *page, size_t length) 5146 { 5147 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 5148 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 5149 ssize_t rv; 5150 5151 if (!entry->store) 5152 return -EIO; 5153 if (!capable(CAP_SYS_ADMIN)) 5154 return -EACCES; 5155 spin_lock(&all_mddevs_lock); 5156 if (list_empty(&mddev->all_mddevs)) { 5157 spin_unlock(&all_mddevs_lock); 5158 return -EBUSY; 5159 } 5160 mddev_get(mddev); 5161 spin_unlock(&all_mddevs_lock); 5162 rv = entry->store(mddev, page, length); 5163 mddev_put(mddev); 5164 return rv; 5165 } 5166 5167 static void md_free(struct kobject *ko) 5168 { 5169 struct mddev *mddev = container_of(ko, struct mddev, kobj); 5170 5171 if (mddev->sysfs_state) 5172 sysfs_put(mddev->sysfs_state); 5173 5174 if (mddev->queue) 5175 blk_cleanup_queue(mddev->queue); 5176 if (mddev->gendisk) { 5177 del_gendisk(mddev->gendisk); 5178 put_disk(mddev->gendisk); 5179 } 5180 percpu_ref_exit(&mddev->writes_pending); 5181 5182 kfree(mddev); 5183 } 5184 5185 static const struct sysfs_ops md_sysfs_ops = { 5186 .show = md_attr_show, 5187 .store = md_attr_store, 5188 }; 5189 static struct kobj_type md_ktype = { 5190 .release = md_free, 5191 .sysfs_ops = &md_sysfs_ops, 5192 .default_attrs = md_default_attrs, 5193 }; 5194 5195 int mdp_major = 0; 5196 5197 static void mddev_delayed_delete(struct work_struct *ws) 5198 { 5199 struct mddev *mddev = container_of(ws, struct mddev, del_work); 5200 5201 sysfs_remove_group(&mddev->kobj, &md_bitmap_group); 5202 kobject_del(&mddev->kobj); 5203 kobject_put(&mddev->kobj); 5204 } 5205 
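/*
 * mddev->writes_pending is a percpu refcount used by the write path to
 * decide when the array can be marked clean again.  It is initialised
 * lazily (below) and deliberately starts at zero: percpu_ref_init()
 * hands back one reference which is dropped immediately, and no_op()
 * is used as the release callback because nothing needs to happen when
 * the count reaches zero.
 */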
5206 static void no_op(struct percpu_ref *r) {} 5207 5208 int mddev_init_writes_pending(struct mddev *mddev) 5209 { 5210 if (mddev->writes_pending.percpu_count_ptr) 5211 return 0; 5212 if (percpu_ref_init(&mddev->writes_pending, no_op, 0, GFP_KERNEL) < 0) 5213 return -ENOMEM; 5214 /* We want to start with the refcount at zero */ 5215 percpu_ref_put(&mddev->writes_pending); 5216 return 0; 5217 } 5218 EXPORT_SYMBOL_GPL(mddev_init_writes_pending); 5219 5220 static int md_alloc(dev_t dev, char *name) 5221 { 5222 /* 5223 * If dev is zero, name is the name of a device to allocate with 5224 * an arbitrary minor number. It will be "md_???" 5225 * If dev is non-zero it must be a device number with a MAJOR of 5226 * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then 5227 * the device is being created by opening a node in /dev. 5228 * If "name" is not NULL, the device is being created by 5229 * writing to /sys/module/md_mod/parameters/new_array. 5230 */ 5231 static DEFINE_MUTEX(disks_mutex); 5232 struct mddev *mddev = mddev_find(dev); 5233 struct gendisk *disk; 5234 int partitioned; 5235 int shift; 5236 int unit; 5237 int error; 5238 5239 if (!mddev) 5240 return -ENODEV; 5241 5242 partitioned = (MAJOR(mddev->unit) != MD_MAJOR); 5243 shift = partitioned ? MdpMinorShift : 0; 5244 unit = MINOR(mddev->unit) >> shift; 5245 5246 /* wait for any previous instance of this device to be 5247 * completely removed (mddev_delayed_delete). 5248 */ 5249 flush_workqueue(md_misc_wq); 5250 5251 mutex_lock(&disks_mutex); 5252 error = -EEXIST; 5253 if (mddev->gendisk) 5254 goto abort; 5255 5256 if (name && !dev) { 5257 /* Need to ensure that 'name' is not a duplicate. 5258 */ 5259 struct mddev *mddev2; 5260 spin_lock(&all_mddevs_lock); 5261 5262 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 5263 if (mddev2->gendisk && 5264 strcmp(mddev2->gendisk->disk_name, name) == 0) { 5265 spin_unlock(&all_mddevs_lock); 5266 goto abort; 5267 } 5268 spin_unlock(&all_mddevs_lock); 5269 } 5270 if (name && dev) 5271 /* 5272 * Creating /dev/mdNNN via "newarray", so adjust hold_active. 5273 */ 5274 mddev->hold_active = UNTIL_STOP; 5275 5276 error = -ENOMEM; 5277 mddev->queue = blk_alloc_queue(GFP_KERNEL); 5278 if (!mddev->queue) 5279 goto abort; 5280 mddev->queue->queuedata = mddev; 5281 5282 blk_queue_make_request(mddev->queue, md_make_request); 5283 blk_set_stacking_limits(&mddev->queue->limits); 5284 5285 disk = alloc_disk(1 << shift); 5286 if (!disk) { 5287 blk_cleanup_queue(mddev->queue); 5288 mddev->queue = NULL; 5289 goto abort; 5290 } 5291 disk->major = MAJOR(mddev->unit); 5292 disk->first_minor = unit << shift; 5293 if (name) 5294 strcpy(disk->disk_name, name); 5295 else if (partitioned) 5296 sprintf(disk->disk_name, "md_d%d", unit); 5297 else 5298 sprintf(disk->disk_name, "md%d", unit); 5299 disk->fops = &md_fops; 5300 disk->private_data = mddev; 5301 disk->queue = mddev->queue; 5302 blk_queue_write_cache(mddev->queue, true, true); 5303 /* Allow extended partitions. This makes the 5304 * 'mdp' device redundant, but we can't really 5305 * remove it now. 
5306 */ 5307 disk->flags |= GENHD_FL_EXT_DEVT; 5308 mddev->gendisk = disk; 5309 /* As soon as we call add_disk(), another thread could get 5310 * through to md_open, so make sure it doesn't get too far 5311 */ 5312 mutex_lock(&mddev->open_mutex); 5313 add_disk(disk); 5314 5315 error = kobject_init_and_add(&mddev->kobj, &md_ktype, 5316 &disk_to_dev(disk)->kobj, "%s", "md"); 5317 if (error) { 5318 /* This isn't possible, but as kobject_init_and_add is marked 5319 * __must_check, we must do something with the result 5320 */ 5321 pr_debug("md: cannot register %s/md - name in use\n", 5322 disk->disk_name); 5323 error = 0; 5324 } 5325 if (mddev->kobj.sd && 5326 sysfs_create_group(&mddev->kobj, &md_bitmap_group)) 5327 pr_debug("pointless warning\n"); 5328 mutex_unlock(&mddev->open_mutex); 5329 abort: 5330 mutex_unlock(&disks_mutex); 5331 if (!error && mddev->kobj.sd) { 5332 kobject_uevent(&mddev->kobj, KOBJ_ADD); 5333 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); 5334 } 5335 mddev_put(mddev); 5336 return error; 5337 } 5338 5339 static struct kobject *md_probe(dev_t dev, int *part, void *data) 5340 { 5341 if (create_on_open) 5342 md_alloc(dev, NULL); 5343 return NULL; 5344 } 5345 5346 static int add_named_array(const char *val, struct kernel_param *kp) 5347 { 5348 /* 5349 * val must be "md_*" or "mdNNN". 5350 * For "md_*" we allocate an array with a large free minor number, and 5351 * set the name to val. val must not already be an active name. 5352 * For "mdNNN" we allocate an array with the minor number NNN 5353 * which must not already be in use. 5354 */ 5355 int len = strlen(val); 5356 char buf[DISK_NAME_LEN]; 5357 unsigned long devnum; 5358 5359 while (len && val[len-1] == '\n') 5360 len--; 5361 if (len >= DISK_NAME_LEN) 5362 return -E2BIG; 5363 strlcpy(buf, val, len+1); 5364 if (strncmp(buf, "md_", 3) == 0) 5365 return md_alloc(0, buf); 5366 if (strncmp(buf, "md", 2) == 0 && 5367 isdigit(buf[2]) && 5368 kstrtoul(buf+2, 10, &devnum) == 0 && 5369 devnum <= MINORMASK) 5370 return md_alloc(MKDEV(MD_MAJOR, devnum), NULL); 5371 5372 return -EINVAL; 5373 } 5374 5375 static void md_safemode_timeout(unsigned long data) 5376 { 5377 struct mddev *mddev = (struct mddev *) data; 5378 5379 mddev->safemode = 1; 5380 if (mddev->external) 5381 sysfs_notify_dirent_safe(mddev->sysfs_state); 5382 5383 md_wakeup_thread(mddev->thread); 5384 } 5385 5386 static int start_dirty_degraded; 5387 5388 int md_run(struct mddev *mddev) 5389 { 5390 int err; 5391 struct md_rdev *rdev; 5392 struct md_personality *pers; 5393 5394 if (list_empty(&mddev->disks)) 5395 /* cannot run an array with no devices.. */ 5396 return -EINVAL; 5397 5398 if (mddev->pers) 5399 return -EBUSY; 5400 /* Cannot run until previous stop completes properly */ 5401 if (mddev->sysfs_active) 5402 return -EBUSY; 5403 5404 /* 5405 * Analyze all RAID superblock(s) 5406 */ 5407 if (!mddev->raid_disks) { 5408 if (!mddev->persistent) 5409 return -EINVAL; 5410 analyze_sbs(mddev); 5411 } 5412 5413 if (mddev->level != LEVEL_NONE) 5414 request_module("md-level-%d", mddev->level); 5415 else if (mddev->clevel[0]) 5416 request_module("md-%s", mddev->clevel); 5417 5418 /* 5419 * Drop all container device buffers, from now on 5420 * the only valid external interface is through the md 5421 * device. 
5422 */ 5423 rdev_for_each(rdev, mddev) { 5424 if (test_bit(Faulty, &rdev->flags)) 5425 continue; 5426 sync_blockdev(rdev->bdev); 5427 invalidate_bdev(rdev->bdev); 5428 if (mddev->ro != 1 && 5429 (bdev_read_only(rdev->bdev) || 5430 bdev_read_only(rdev->meta_bdev))) { 5431 mddev->ro = 1; 5432 if (mddev->gendisk) 5433 set_disk_ro(mddev->gendisk, 1); 5434 } 5435 5436 /* perform some consistency tests on the device. 5437 * We don't want the data to overlap the metadata, 5438 * Internal Bitmap issues have been handled elsewhere. 5439 */ 5440 if (rdev->meta_bdev) { 5441 /* Nothing to check */; 5442 } else if (rdev->data_offset < rdev->sb_start) { 5443 if (mddev->dev_sectors && 5444 rdev->data_offset + mddev->dev_sectors 5445 > rdev->sb_start) { 5446 pr_warn("md: %s: data overlaps metadata\n", 5447 mdname(mddev)); 5448 return -EINVAL; 5449 } 5450 } else { 5451 if (rdev->sb_start + rdev->sb_size/512 5452 > rdev->data_offset) { 5453 pr_warn("md: %s: metadata overlaps data\n", 5454 mdname(mddev)); 5455 return -EINVAL; 5456 } 5457 } 5458 sysfs_notify_dirent_safe(rdev->sysfs_state); 5459 } 5460 5461 if (mddev->bio_set == NULL) { 5462 mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 5463 if (!mddev->bio_set) 5464 return -ENOMEM; 5465 } 5466 if (mddev->sync_set == NULL) { 5467 mddev->sync_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 5468 if (!mddev->sync_set) 5469 return -ENOMEM; 5470 } 5471 5472 spin_lock(&pers_lock); 5473 pers = find_pers(mddev->level, mddev->clevel); 5474 if (!pers || !try_module_get(pers->owner)) { 5475 spin_unlock(&pers_lock); 5476 if (mddev->level != LEVEL_NONE) 5477 pr_warn("md: personality for level %d is not loaded!\n", 5478 mddev->level); 5479 else 5480 pr_warn("md: personality for level %s is not loaded!\n", 5481 mddev->clevel); 5482 return -EINVAL; 5483 } 5484 spin_unlock(&pers_lock); 5485 if (mddev->level != pers->level) { 5486 mddev->level = pers->level; 5487 mddev->new_level = pers->level; 5488 } 5489 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 5490 5491 if (mddev->reshape_position != MaxSector && 5492 pers->start_reshape == NULL) { 5493 /* This personality cannot handle reshaping... */ 5494 module_put(pers->owner); 5495 return -EINVAL; 5496 } 5497 5498 if (pers->sync_request) { 5499 /* Warn if this is a potentially silly 5500 * configuration. 5501 */ 5502 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 5503 struct md_rdev *rdev2; 5504 int warned = 0; 5505 5506 rdev_for_each(rdev, mddev) 5507 rdev_for_each(rdev2, mddev) { 5508 if (rdev < rdev2 && 5509 rdev->bdev->bd_contains == 5510 rdev2->bdev->bd_contains) { 5511 pr_warn("%s: WARNING: %s appears to be on the same physical disk as %s.\n", 5512 mdname(mddev), 5513 bdevname(rdev->bdev,b), 5514 bdevname(rdev2->bdev,b2)); 5515 warned = 1; 5516 } 5517 } 5518 5519 if (warned) 5520 pr_warn("True protection against single-disk failure might be compromised.\n"); 5521 } 5522 5523 mddev->recovery = 0; 5524 /* may be over-ridden by personality */ 5525 mddev->resync_max_sectors = mddev->dev_sectors; 5526 5527 mddev->ok_start_degraded = start_dirty_degraded; 5528 5529 if (start_readonly && mddev->ro == 0) 5530 mddev->ro = 2; /* read-only, but switch on first write */ 5531 5532 /* 5533 * NOTE: some pers->run(), for example r5l_recovery_log(), wakes 5534 * up mddev->thread. It is important to initialize critical 5535 * resources for mddev->thread BEFORE calling pers->run(). 
5536 */ 5537 err = pers->run(mddev); 5538 if (err) 5539 pr_warn("md: pers->run() failed ...\n"); 5540 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { 5541 WARN_ONCE(!mddev->external_size, 5542 "%s: default size too small, but 'external_size' not in effect?\n", 5543 __func__); 5544 pr_warn("md: invalid array_size %llu > default size %llu\n", 5545 (unsigned long long)mddev->array_sectors / 2, 5546 (unsigned long long)pers->size(mddev, 0, 0) / 2); 5547 err = -EINVAL; 5548 } 5549 if (err == 0 && pers->sync_request && 5550 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { 5551 struct bitmap *bitmap; 5552 5553 bitmap = bitmap_create(mddev, -1); 5554 if (IS_ERR(bitmap)) { 5555 err = PTR_ERR(bitmap); 5556 pr_warn("%s: failed to create bitmap (%d)\n", 5557 mdname(mddev), err); 5558 } else 5559 mddev->bitmap = bitmap; 5560 5561 } 5562 if (err) { 5563 mddev_detach(mddev); 5564 if (mddev->private) 5565 pers->free(mddev, mddev->private); 5566 mddev->private = NULL; 5567 module_put(pers->owner); 5568 bitmap_destroy(mddev); 5569 return err; 5570 } 5571 if (mddev->queue) { 5572 bool nonrot = true; 5573 5574 rdev_for_each(rdev, mddev) { 5575 if (rdev->raid_disk >= 0 && 5576 !blk_queue_nonrot(bdev_get_queue(rdev->bdev))) { 5577 nonrot = false; 5578 break; 5579 } 5580 } 5581 if (mddev->degraded) 5582 nonrot = false; 5583 if (nonrot) 5584 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mddev->queue); 5585 else 5586 queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, mddev->queue); 5587 mddev->queue->backing_dev_info->congested_data = mddev; 5588 mddev->queue->backing_dev_info->congested_fn = md_congested; 5589 } 5590 if (pers->sync_request) { 5591 if (mddev->kobj.sd && 5592 sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 5593 pr_warn("md: cannot register extra attributes for %s\n", 5594 mdname(mddev)); 5595 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); 5596 } else if (mddev->ro == 2) /* auto-readonly not meaningful */ 5597 mddev->ro = 0; 5598 5599 atomic_set(&mddev->max_corr_read_errors, 5600 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); 5601 mddev->safemode = 0; 5602 if (mddev_is_clustered(mddev)) 5603 mddev->safemode_delay = 0; 5604 else 5605 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 5606 mddev->in_sync = 1; 5607 smp_wmb(); 5608 spin_lock(&mddev->lock); 5609 mddev->pers = pers; 5610 spin_unlock(&mddev->lock); 5611 rdev_for_each(rdev, mddev) 5612 if (rdev->raid_disk >= 0) 5613 if (sysfs_link_rdev(mddev, rdev)) 5614 /* failure here is OK */; 5615 5616 if (mddev->degraded && !mddev->ro) 5617 /* This ensures that recovering status is reported immediately 5618 * via sysfs - until a lack of spares is confirmed. 
5619 */ 5620 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 5621 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5622 5623 if (mddev->sb_flags) 5624 md_update_sb(mddev, 0); 5625 5626 md_new_event(mddev); 5627 sysfs_notify_dirent_safe(mddev->sysfs_state); 5628 sysfs_notify_dirent_safe(mddev->sysfs_action); 5629 sysfs_notify(&mddev->kobj, NULL, "degraded"); 5630 return 0; 5631 } 5632 EXPORT_SYMBOL_GPL(md_run); 5633 5634 static int do_md_run(struct mddev *mddev) 5635 { 5636 int err; 5637 5638 err = md_run(mddev); 5639 if (err) 5640 goto out; 5641 err = bitmap_load(mddev); 5642 if (err) { 5643 bitmap_destroy(mddev); 5644 goto out; 5645 } 5646 5647 if (mddev_is_clustered(mddev)) 5648 md_allow_write(mddev); 5649 5650 md_wakeup_thread(mddev->thread); 5651 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 5652 5653 set_capacity(mddev->gendisk, mddev->array_sectors); 5654 revalidate_disk(mddev->gendisk); 5655 mddev->changed = 1; 5656 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 5657 out: 5658 return err; 5659 } 5660 5661 static int restart_array(struct mddev *mddev) 5662 { 5663 struct gendisk *disk = mddev->gendisk; 5664 struct md_rdev *rdev; 5665 bool has_journal = false; 5666 bool has_readonly = false; 5667 5668 /* Complain if it has no devices */ 5669 if (list_empty(&mddev->disks)) 5670 return -ENXIO; 5671 if (!mddev->pers) 5672 return -EINVAL; 5673 if (!mddev->ro) 5674 return -EBUSY; 5675 5676 rcu_read_lock(); 5677 rdev_for_each_rcu(rdev, mddev) { 5678 if (test_bit(Journal, &rdev->flags) && 5679 !test_bit(Faulty, &rdev->flags)) 5680 has_journal = true; 5681 if (bdev_read_only(rdev->bdev)) 5682 has_readonly = true; 5683 } 5684 rcu_read_unlock(); 5685 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal) 5686 /* Don't restart rw with journal missing/faulty */ 5687 return -EINVAL; 5688 if (has_readonly) 5689 return -EROFS; 5690 5691 mddev->safemode = 0; 5692 mddev->ro = 0; 5693 set_disk_ro(disk, 0); 5694 pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); 5695 /* Kick recovery or resync if necessary */ 5696 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5697 md_wakeup_thread(mddev->thread); 5698 md_wakeup_thread(mddev->sync_thread); 5699 sysfs_notify_dirent_safe(mddev->sysfs_state); 5700 return 0; 5701 } 5702 5703 static void md_clean(struct mddev *mddev) 5704 { 5705 mddev->array_sectors = 0; 5706 mddev->external_size = 0; 5707 mddev->dev_sectors = 0; 5708 mddev->raid_disks = 0; 5709 mddev->recovery_cp = 0; 5710 mddev->resync_min = 0; 5711 mddev->resync_max = MaxSector; 5712 mddev->reshape_position = MaxSector; 5713 mddev->external = 0; 5714 mddev->persistent = 0; 5715 mddev->level = LEVEL_NONE; 5716 mddev->clevel[0] = 0; 5717 mddev->flags = 0; 5718 mddev->sb_flags = 0; 5719 mddev->ro = 0; 5720 mddev->metadata_type[0] = 0; 5721 mddev->chunk_sectors = 0; 5722 mddev->ctime = mddev->utime = 0; 5723 mddev->layout = 0; 5724 mddev->max_disks = 0; 5725 mddev->events = 0; 5726 mddev->can_decrease_events = 0; 5727 mddev->delta_disks = 0; 5728 mddev->reshape_backwards = 0; 5729 mddev->new_level = LEVEL_NONE; 5730 mddev->new_layout = 0; 5731 mddev->new_chunk_sectors = 0; 5732 mddev->curr_resync = 0; 5733 atomic64_set(&mddev->resync_mismatches, 0); 5734 mddev->suspend_lo = mddev->suspend_hi = 0; 5735 mddev->sync_speed_min = mddev->sync_speed_max = 0; 5736 mddev->recovery = 0; 5737 mddev->in_sync = 0; 5738 mddev->changed = 0; 5739 mddev->degraded = 0; 5740 mddev->safemode = 0; 5741 mddev->private = NULL; 5742 mddev->cluster_info = NULL; 5743 
mddev->bitmap_info.offset = 0; 5744 mddev->bitmap_info.default_offset = 0; 5745 mddev->bitmap_info.default_space = 0; 5746 mddev->bitmap_info.chunksize = 0; 5747 mddev->bitmap_info.daemon_sleep = 0; 5748 mddev->bitmap_info.max_write_behind = 0; 5749 mddev->bitmap_info.nodes = 0; 5750 } 5751 5752 static void __md_stop_writes(struct mddev *mddev) 5753 { 5754 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5755 flush_workqueue(md_misc_wq); 5756 if (mddev->sync_thread) { 5757 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5758 md_reap_sync_thread(mddev); 5759 } 5760 5761 del_timer_sync(&mddev->safemode_timer); 5762 5763 if (mddev->pers && mddev->pers->quiesce) { 5764 mddev->pers->quiesce(mddev, 1); 5765 mddev->pers->quiesce(mddev, 0); 5766 } 5767 bitmap_flush(mddev); 5768 5769 if (mddev->ro == 0 && 5770 ((!mddev->in_sync && !mddev_is_clustered(mddev)) || 5771 mddev->sb_flags)) { 5772 /* mark array as shutdown cleanly */ 5773 if (!mddev_is_clustered(mddev)) 5774 mddev->in_sync = 1; 5775 md_update_sb(mddev, 1); 5776 } 5777 } 5778 5779 void md_stop_writes(struct mddev *mddev) 5780 { 5781 mddev_lock_nointr(mddev); 5782 __md_stop_writes(mddev); 5783 mddev_unlock(mddev); 5784 } 5785 EXPORT_SYMBOL_GPL(md_stop_writes); 5786 5787 static void mddev_detach(struct mddev *mddev) 5788 { 5789 bitmap_wait_behind_writes(mddev); 5790 if (mddev->pers && mddev->pers->quiesce) { 5791 mddev->pers->quiesce(mddev, 1); 5792 mddev->pers->quiesce(mddev, 0); 5793 } 5794 md_unregister_thread(&mddev->thread); 5795 if (mddev->queue) 5796 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 5797 } 5798 5799 static void __md_stop(struct mddev *mddev) 5800 { 5801 struct md_personality *pers = mddev->pers; 5802 bitmap_destroy(mddev); 5803 mddev_detach(mddev); 5804 /* Ensure ->event_work is done */ 5805 flush_workqueue(md_misc_wq); 5806 spin_lock(&mddev->lock); 5807 mddev->pers = NULL; 5808 spin_unlock(&mddev->lock); 5809 pers->free(mddev, mddev->private); 5810 mddev->private = NULL; 5811 if (pers->sync_request && mddev->to_remove == NULL) 5812 mddev->to_remove = &md_redundancy_group; 5813 module_put(pers->owner); 5814 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5815 } 5816 5817 void md_stop(struct mddev *mddev) 5818 { 5819 /* stop the array and free an attached data structures. 
5820 * This is called from dm-raid 5821 */ 5822 __md_stop(mddev); 5823 if (mddev->bio_set) 5824 bioset_free(mddev->bio_set); 5825 } 5826 5827 EXPORT_SYMBOL_GPL(md_stop); 5828 5829 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) 5830 { 5831 int err = 0; 5832 int did_freeze = 0; 5833 5834 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 5835 did_freeze = 1; 5836 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5837 md_wakeup_thread(mddev->thread); 5838 } 5839 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5840 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5841 if (mddev->sync_thread) 5842 /* Thread might be blocked waiting for metadata update 5843 * which will now never happen */ 5844 wake_up_process(mddev->sync_thread->tsk); 5845 5846 if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 5847 return -EBUSY; 5848 mddev_unlock(mddev); 5849 wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, 5850 &mddev->recovery)); 5851 wait_event(mddev->sb_wait, 5852 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 5853 mddev_lock_nointr(mddev); 5854 5855 mutex_lock(&mddev->open_mutex); 5856 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || 5857 mddev->sync_thread || 5858 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 5859 pr_warn("md: %s still in use.\n",mdname(mddev)); 5860 if (did_freeze) { 5861 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5862 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5863 md_wakeup_thread(mddev->thread); 5864 } 5865 err = -EBUSY; 5866 goto out; 5867 } 5868 if (mddev->pers) { 5869 __md_stop_writes(mddev); 5870 5871 err = -ENXIO; 5872 if (mddev->ro==1) 5873 goto out; 5874 mddev->ro = 1; 5875 set_disk_ro(mddev->gendisk, 1); 5876 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5877 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5878 md_wakeup_thread(mddev->thread); 5879 sysfs_notify_dirent_safe(mddev->sysfs_state); 5880 err = 0; 5881 } 5882 out: 5883 mutex_unlock(&mddev->open_mutex); 5884 return err; 5885 } 5886 5887 /* mode: 5888 * 0 - completely stop and dis-assemble array 5889 * 2 - stop but do not disassemble array 5890 */ 5891 static int do_md_stop(struct mddev *mddev, int mode, 5892 struct block_device *bdev) 5893 { 5894 struct gendisk *disk = mddev->gendisk; 5895 struct md_rdev *rdev; 5896 int did_freeze = 0; 5897 5898 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 5899 did_freeze = 1; 5900 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5901 md_wakeup_thread(mddev->thread); 5902 } 5903 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5904 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5905 if (mddev->sync_thread) 5906 /* Thread might be blocked waiting for metadata update 5907 * which will now never happen */ 5908 wake_up_process(mddev->sync_thread->tsk); 5909 5910 mddev_unlock(mddev); 5911 wait_event(resync_wait, (mddev->sync_thread == NULL && 5912 !test_bit(MD_RECOVERY_RUNNING, 5913 &mddev->recovery))); 5914 mddev_lock_nointr(mddev); 5915 5916 mutex_lock(&mddev->open_mutex); 5917 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || 5918 mddev->sysfs_active || 5919 mddev->sync_thread || 5920 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 5921 pr_warn("md: %s still in use.\n",mdname(mddev)); 5922 mutex_unlock(&mddev->open_mutex); 5923 if (did_freeze) { 5924 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5925 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5926 md_wakeup_thread(mddev->thread); 5927 } 5928 return -EBUSY; 5929 } 5930 if (mddev->pers) { 5931 if (mddev->ro) 5932 
set_disk_ro(disk, 0); 5933 5934 __md_stop_writes(mddev); 5935 __md_stop(mddev); 5936 mddev->queue->backing_dev_info->congested_fn = NULL; 5937 5938 /* tell userspace to handle 'inactive' */ 5939 sysfs_notify_dirent_safe(mddev->sysfs_state); 5940 5941 rdev_for_each(rdev, mddev) 5942 if (rdev->raid_disk >= 0) 5943 sysfs_unlink_rdev(mddev, rdev); 5944 5945 set_capacity(disk, 0); 5946 mutex_unlock(&mddev->open_mutex); 5947 mddev->changed = 1; 5948 revalidate_disk(disk); 5949 5950 if (mddev->ro) 5951 mddev->ro = 0; 5952 } else 5953 mutex_unlock(&mddev->open_mutex); 5954 /* 5955 * Free resources if final stop 5956 */ 5957 if (mode == 0) { 5958 pr_info("md: %s stopped.\n", mdname(mddev)); 5959 5960 if (mddev->bitmap_info.file) { 5961 struct file *f = mddev->bitmap_info.file; 5962 spin_lock(&mddev->lock); 5963 mddev->bitmap_info.file = NULL; 5964 spin_unlock(&mddev->lock); 5965 fput(f); 5966 } 5967 mddev->bitmap_info.offset = 0; 5968 5969 export_array(mddev); 5970 5971 md_clean(mddev); 5972 if (mddev->hold_active == UNTIL_STOP) 5973 mddev->hold_active = 0; 5974 } 5975 md_new_event(mddev); 5976 sysfs_notify_dirent_safe(mddev->sysfs_state); 5977 return 0; 5978 } 5979 5980 #ifndef MODULE 5981 static void autorun_array(struct mddev *mddev) 5982 { 5983 struct md_rdev *rdev; 5984 int err; 5985 5986 if (list_empty(&mddev->disks)) 5987 return; 5988 5989 pr_info("md: running: "); 5990 5991 rdev_for_each(rdev, mddev) { 5992 char b[BDEVNAME_SIZE]; 5993 pr_cont("<%s>", bdevname(rdev->bdev,b)); 5994 } 5995 pr_cont("\n"); 5996 5997 err = do_md_run(mddev); 5998 if (err) { 5999 pr_warn("md: do_md_run() returned %d\n", err); 6000 do_md_stop(mddev, 0, NULL); 6001 } 6002 } 6003 6004 /* 6005 * lets try to run arrays based on all disks that have arrived 6006 * until now. (those are in pending_raid_disks) 6007 * 6008 * the method: pick the first pending disk, collect all disks with 6009 * the same UUID, remove all from the pending list and put them into 6010 * the 'same_array' list. Then order this list based on superblock 6011 * update time (freshest comes first), kick out 'old' disks and 6012 * compare superblocks. If everything's fine then run it. 6013 * 6014 * If "unit" is allocated, then bump its reference count 6015 */ 6016 static void autorun_devices(int part) 6017 { 6018 struct md_rdev *rdev0, *rdev, *tmp; 6019 struct mddev *mddev; 6020 char b[BDEVNAME_SIZE]; 6021 6022 pr_info("md: autorun ...\n"); 6023 while (!list_empty(&pending_raid_disks)) { 6024 int unit; 6025 dev_t dev; 6026 LIST_HEAD(candidates); 6027 rdev0 = list_entry(pending_raid_disks.next, 6028 struct md_rdev, same_set); 6029 6030 pr_debug("md: considering %s ...\n", bdevname(rdev0->bdev,b)); 6031 INIT_LIST_HEAD(&candidates); 6032 rdev_for_each_list(rdev, tmp, &pending_raid_disks) 6033 if (super_90_load(rdev, rdev0, 0) >= 0) { 6034 pr_debug("md: adding %s ...\n", 6035 bdevname(rdev->bdev,b)); 6036 list_move(&rdev->same_set, &candidates); 6037 } 6038 /* 6039 * now we have a set of devices, with all of them having 6040 * mostly sane superblocks. It's time to allocate the 6041 * mddev. 
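* For a partitionable (mdp) array the preferred minor is shifted by
* MdpMinorShift below, so each array owns a block of minors for its
* partitions; plain md arrays use the minor as-is.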
6042 */ 6043 if (part) { 6044 dev = MKDEV(mdp_major, 6045 rdev0->preferred_minor << MdpMinorShift); 6046 unit = MINOR(dev) >> MdpMinorShift; 6047 } else { 6048 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 6049 unit = MINOR(dev); 6050 } 6051 if (rdev0->preferred_minor != unit) { 6052 pr_warn("md: unit number in %s is bad: %d\n", 6053 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 6054 break; 6055 } 6056 6057 md_probe(dev, NULL, NULL); 6058 mddev = mddev_find(dev); 6059 if (!mddev || !mddev->gendisk) { 6060 if (mddev) 6061 mddev_put(mddev); 6062 break; 6063 } 6064 if (mddev_lock(mddev)) 6065 pr_warn("md: %s locked, cannot run\n", mdname(mddev)); 6066 else if (mddev->raid_disks || mddev->major_version 6067 || !list_empty(&mddev->disks)) { 6068 pr_warn("md: %s already running, cannot run %s\n", 6069 mdname(mddev), bdevname(rdev0->bdev,b)); 6070 mddev_unlock(mddev); 6071 } else { 6072 pr_debug("md: created %s\n", mdname(mddev)); 6073 mddev->persistent = 1; 6074 rdev_for_each_list(rdev, tmp, &candidates) { 6075 list_del_init(&rdev->same_set); 6076 if (bind_rdev_to_array(rdev, mddev)) 6077 export_rdev(rdev); 6078 } 6079 autorun_array(mddev); 6080 mddev_unlock(mddev); 6081 } 6082 /* on success, candidates will be empty, on error 6083 * it won't... 6084 */ 6085 rdev_for_each_list(rdev, tmp, &candidates) { 6086 list_del_init(&rdev->same_set); 6087 export_rdev(rdev); 6088 } 6089 mddev_put(mddev); 6090 } 6091 pr_info("md: ... autorun DONE.\n"); 6092 } 6093 #endif /* !MODULE */ 6094 6095 static int get_version(void __user *arg) 6096 { 6097 mdu_version_t ver; 6098 6099 ver.major = MD_MAJOR_VERSION; 6100 ver.minor = MD_MINOR_VERSION; 6101 ver.patchlevel = MD_PATCHLEVEL_VERSION; 6102 6103 if (copy_to_user(arg, &ver, sizeof(ver))) 6104 return -EFAULT; 6105 6106 return 0; 6107 } 6108 6109 static int get_array_info(struct mddev *mddev, void __user *arg) 6110 { 6111 mdu_array_info_t info; 6112 int nr,working,insync,failed,spare; 6113 struct md_rdev *rdev; 6114 6115 nr = working = insync = failed = spare = 0; 6116 rcu_read_lock(); 6117 rdev_for_each_rcu(rdev, mddev) { 6118 nr++; 6119 if (test_bit(Faulty, &rdev->flags)) 6120 failed++; 6121 else { 6122 working++; 6123 if (test_bit(In_sync, &rdev->flags)) 6124 insync++; 6125 else if (test_bit(Journal, &rdev->flags)) 6126 /* TODO: add journal count to md_u.h */ 6127 ; 6128 else 6129 spare++; 6130 } 6131 } 6132 rcu_read_unlock(); 6133 6134 info.major_version = mddev->major_version; 6135 info.minor_version = mddev->minor_version; 6136 info.patch_version = MD_PATCHLEVEL_VERSION; 6137 info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); 6138 info.level = mddev->level; 6139 info.size = mddev->dev_sectors / 2; 6140 if (info.size != mddev->dev_sectors / 2) /* overflow */ 6141 info.size = -1; 6142 info.nr_disks = nr; 6143 info.raid_disks = mddev->raid_disks; 6144 info.md_minor = mddev->md_minor; 6145 info.not_persistent= !mddev->persistent; 6146 6147 info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 6148 info.state = 0; 6149 if (mddev->in_sync) 6150 info.state = (1<<MD_SB_CLEAN); 6151 if (mddev->bitmap && mddev->bitmap_info.offset) 6152 info.state |= (1<<MD_SB_BITMAP_PRESENT); 6153 if (mddev_is_clustered(mddev)) 6154 info.state |= (1<<MD_SB_CLUSTERED); 6155 info.active_disks = insync; 6156 info.working_disks = working; 6157 info.failed_disks = failed; 6158 info.spare_disks = spare; 6159 6160 info.layout = mddev->layout; 6161 info.chunk_size = mddev->chunk_sectors << 9; 6162 6163 if (copy_to_user(arg, &info, sizeof(info))) 6164 return -EFAULT; 6165 6166 
return 0; 6167 } 6168 6169 static int get_bitmap_file(struct mddev *mddev, void __user * arg) 6170 { 6171 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 6172 char *ptr; 6173 int err; 6174 6175 file = kzalloc(sizeof(*file), GFP_NOIO); 6176 if (!file) 6177 return -ENOMEM; 6178 6179 err = 0; 6180 spin_lock(&mddev->lock); 6181 /* bitmap enabled */ 6182 if (mddev->bitmap_info.file) { 6183 ptr = file_path(mddev->bitmap_info.file, file->pathname, 6184 sizeof(file->pathname)); 6185 if (IS_ERR(ptr)) 6186 err = PTR_ERR(ptr); 6187 else 6188 memmove(file->pathname, ptr, 6189 sizeof(file->pathname)-(ptr-file->pathname)); 6190 } 6191 spin_unlock(&mddev->lock); 6192 6193 if (err == 0 && 6194 copy_to_user(arg, file, sizeof(*file))) 6195 err = -EFAULT; 6196 6197 kfree(file); 6198 return err; 6199 } 6200 6201 static int get_disk_info(struct mddev *mddev, void __user * arg) 6202 { 6203 mdu_disk_info_t info; 6204 struct md_rdev *rdev; 6205 6206 if (copy_from_user(&info, arg, sizeof(info))) 6207 return -EFAULT; 6208 6209 rcu_read_lock(); 6210 rdev = md_find_rdev_nr_rcu(mddev, info.number); 6211 if (rdev) { 6212 info.major = MAJOR(rdev->bdev->bd_dev); 6213 info.minor = MINOR(rdev->bdev->bd_dev); 6214 info.raid_disk = rdev->raid_disk; 6215 info.state = 0; 6216 if (test_bit(Faulty, &rdev->flags)) 6217 info.state |= (1<<MD_DISK_FAULTY); 6218 else if (test_bit(In_sync, &rdev->flags)) { 6219 info.state |= (1<<MD_DISK_ACTIVE); 6220 info.state |= (1<<MD_DISK_SYNC); 6221 } 6222 if (test_bit(Journal, &rdev->flags)) 6223 info.state |= (1<<MD_DISK_JOURNAL); 6224 if (test_bit(WriteMostly, &rdev->flags)) 6225 info.state |= (1<<MD_DISK_WRITEMOSTLY); 6226 if (test_bit(FailFast, &rdev->flags)) 6227 info.state |= (1<<MD_DISK_FAILFAST); 6228 } else { 6229 info.major = info.minor = 0; 6230 info.raid_disk = -1; 6231 info.state = (1<<MD_DISK_REMOVED); 6232 } 6233 rcu_read_unlock(); 6234 6235 if (copy_to_user(arg, &info, sizeof(info))) 6236 return -EFAULT; 6237 6238 return 0; 6239 } 6240 6241 static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) 6242 { 6243 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 6244 struct md_rdev *rdev; 6245 dev_t dev = MKDEV(info->major,info->minor); 6246 6247 if (mddev_is_clustered(mddev) && 6248 !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { 6249 pr_warn("%s: Cannot add to clustered mddev.\n", 6250 mdname(mddev)); 6251 return -EINVAL; 6252 } 6253 6254 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 6255 return -EOVERFLOW; 6256 6257 if (!mddev->raid_disks) { 6258 int err; 6259 /* expecting a device which has a superblock */ 6260 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 6261 if (IS_ERR(rdev)) { 6262 pr_warn("md: md_import_device returned %ld\n", 6263 PTR_ERR(rdev)); 6264 return PTR_ERR(rdev); 6265 } 6266 if (!list_empty(&mddev->disks)) { 6267 struct md_rdev *rdev0 6268 = list_entry(mddev->disks.next, 6269 struct md_rdev, same_set); 6270 err = super_types[mddev->major_version] 6271 .load_super(rdev, rdev0, mddev->minor_version); 6272 if (err < 0) { 6273 pr_warn("md: %s has different UUID to %s\n", 6274 bdevname(rdev->bdev,b), 6275 bdevname(rdev0->bdev,b2)); 6276 export_rdev(rdev); 6277 return -EINVAL; 6278 } 6279 } 6280 err = bind_rdev_to_array(rdev, mddev); 6281 if (err) 6282 export_rdev(rdev); 6283 return err; 6284 } 6285 6286 /* 6287 * add_new_disk can be used once the array is assembled 6288 * to add "hot spares". 
They must already have a superblock 6289 * written 6290 */ 6291 if (mddev->pers) { 6292 int err; 6293 if (!mddev->pers->hot_add_disk) { 6294 pr_warn("%s: personality does not support diskops!\n", 6295 mdname(mddev)); 6296 return -EINVAL; 6297 } 6298 if (mddev->persistent) 6299 rdev = md_import_device(dev, mddev->major_version, 6300 mddev->minor_version); 6301 else 6302 rdev = md_import_device(dev, -1, -1); 6303 if (IS_ERR(rdev)) { 6304 pr_warn("md: md_import_device returned %ld\n", 6305 PTR_ERR(rdev)); 6306 return PTR_ERR(rdev); 6307 } 6308 /* set saved_raid_disk if appropriate */ 6309 if (!mddev->persistent) { 6310 if (info->state & (1<<MD_DISK_SYNC) && 6311 info->raid_disk < mddev->raid_disks) { 6312 rdev->raid_disk = info->raid_disk; 6313 set_bit(In_sync, &rdev->flags); 6314 clear_bit(Bitmap_sync, &rdev->flags); 6315 } else 6316 rdev->raid_disk = -1; 6317 rdev->saved_raid_disk = rdev->raid_disk; 6318 } else 6319 super_types[mddev->major_version]. 6320 validate_super(mddev, rdev); 6321 if ((info->state & (1<<MD_DISK_SYNC)) && 6322 rdev->raid_disk != info->raid_disk) { 6323 /* This was a hot-add request, but events doesn't 6324 * match, so reject it. 6325 */ 6326 export_rdev(rdev); 6327 return -EINVAL; 6328 } 6329 6330 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 6331 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 6332 set_bit(WriteMostly, &rdev->flags); 6333 else 6334 clear_bit(WriteMostly, &rdev->flags); 6335 if (info->state & (1<<MD_DISK_FAILFAST)) 6336 set_bit(FailFast, &rdev->flags); 6337 else 6338 clear_bit(FailFast, &rdev->flags); 6339 6340 if (info->state & (1<<MD_DISK_JOURNAL)) { 6341 struct md_rdev *rdev2; 6342 bool has_journal = false; 6343 6344 /* make sure no existing journal disk */ 6345 rdev_for_each(rdev2, mddev) { 6346 if (test_bit(Journal, &rdev2->flags)) { 6347 has_journal = true; 6348 break; 6349 } 6350 } 6351 if (has_journal) { 6352 export_rdev(rdev); 6353 return -EBUSY; 6354 } 6355 set_bit(Journal, &rdev->flags); 6356 } 6357 /* 6358 * check whether the device shows up in other nodes 6359 */ 6360 if (mddev_is_clustered(mddev)) { 6361 if (info->state & (1 << MD_DISK_CANDIDATE)) 6362 set_bit(Candidate, &rdev->flags); 6363 else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { 6364 /* --add initiated by this node */ 6365 err = md_cluster_ops->add_new_disk(mddev, rdev); 6366 if (err) { 6367 export_rdev(rdev); 6368 return err; 6369 } 6370 } 6371 } 6372 6373 rdev->raid_disk = -1; 6374 err = bind_rdev_to_array(rdev, mddev); 6375 6376 if (err) 6377 export_rdev(rdev); 6378 6379 if (mddev_is_clustered(mddev)) { 6380 if (info->state & (1 << MD_DISK_CANDIDATE)) { 6381 if (!err) { 6382 err = md_cluster_ops->new_disk_ack(mddev, 6383 err == 0); 6384 if (err) 6385 md_kick_rdev_from_array(rdev); 6386 } 6387 } else { 6388 if (err) 6389 md_cluster_ops->add_new_disk_cancel(mddev); 6390 else 6391 err = add_bound_rdev(rdev); 6392 } 6393 6394 } else if (!err) 6395 err = add_bound_rdev(rdev); 6396 6397 return err; 6398 } 6399 6400 /* otherwise, add_new_disk is only allowed 6401 * for major_version==0 superblocks 6402 */ 6403 if (mddev->major_version != 0) { 6404 pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev)); 6405 return -EINVAL; 6406 } 6407 6408 if (!(info->state & (1<<MD_DISK_FAULTY))) { 6409 int err; 6410 rdev = md_import_device(dev, -1, 0); 6411 if (IS_ERR(rdev)) { 6412 pr_warn("md: error, md_import_device() returned %ld\n", 6413 PTR_ERR(rdev)); 6414 return PTR_ERR(rdev); 6415 } 6416 rdev->desc_nr = info->number; 6417 if (info->raid_disk < mddev->raid_disks) 6418 
rdev->raid_disk = info->raid_disk; 6419 else 6420 rdev->raid_disk = -1; 6421 6422 if (rdev->raid_disk < mddev->raid_disks) 6423 if (info->state & (1<<MD_DISK_SYNC)) 6424 set_bit(In_sync, &rdev->flags); 6425 6426 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 6427 set_bit(WriteMostly, &rdev->flags); 6428 if (info->state & (1<<MD_DISK_FAILFAST)) 6429 set_bit(FailFast, &rdev->flags); 6430 6431 if (!mddev->persistent) { 6432 pr_debug("md: nonpersistent superblock ...\n"); 6433 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; 6434 } else 6435 rdev->sb_start = calc_dev_sboffset(rdev); 6436 rdev->sectors = rdev->sb_start; 6437 6438 err = bind_rdev_to_array(rdev, mddev); 6439 if (err) { 6440 export_rdev(rdev); 6441 return err; 6442 } 6443 } 6444 6445 return 0; 6446 } 6447 6448 static int hot_remove_disk(struct mddev *mddev, dev_t dev) 6449 { 6450 char b[BDEVNAME_SIZE]; 6451 struct md_rdev *rdev; 6452 6453 rdev = find_rdev(mddev, dev); 6454 if (!rdev) 6455 return -ENXIO; 6456 6457 if (rdev->raid_disk < 0) 6458 goto kick_rdev; 6459 6460 clear_bit(Blocked, &rdev->flags); 6461 remove_and_add_spares(mddev, rdev); 6462 6463 if (rdev->raid_disk >= 0) 6464 goto busy; 6465 6466 kick_rdev: 6467 if (mddev_is_clustered(mddev)) 6468 md_cluster_ops->remove_disk(mddev, rdev); 6469 6470 md_kick_rdev_from_array(rdev); 6471 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 6472 if (mddev->thread) 6473 md_wakeup_thread(mddev->thread); 6474 else 6475 md_update_sb(mddev, 1); 6476 md_new_event(mddev); 6477 6478 return 0; 6479 busy: 6480 pr_debug("md: cannot remove active disk %s from %s ...\n", 6481 bdevname(rdev->bdev,b), mdname(mddev)); 6482 return -EBUSY; 6483 } 6484 6485 static int hot_add_disk(struct mddev *mddev, dev_t dev) 6486 { 6487 char b[BDEVNAME_SIZE]; 6488 int err; 6489 struct md_rdev *rdev; 6490 6491 if (!mddev->pers) 6492 return -ENODEV; 6493 6494 if (mddev->major_version != 0) { 6495 pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n", 6496 mdname(mddev)); 6497 return -EINVAL; 6498 } 6499 if (!mddev->pers->hot_add_disk) { 6500 pr_warn("%s: personality does not support diskops!\n", 6501 mdname(mddev)); 6502 return -EINVAL; 6503 } 6504 6505 rdev = md_import_device(dev, -1, 0); 6506 if (IS_ERR(rdev)) { 6507 pr_warn("md: error, md_import_device() returned %ld\n", 6508 PTR_ERR(rdev)); 6509 return -EINVAL; 6510 } 6511 6512 if (mddev->persistent) 6513 rdev->sb_start = calc_dev_sboffset(rdev); 6514 else 6515 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; 6516 6517 rdev->sectors = rdev->sb_start; 6518 6519 if (test_bit(Faulty, &rdev->flags)) { 6520 pr_warn("md: can not hot-add faulty %s disk to %s!\n", 6521 bdevname(rdev->bdev,b), mdname(mddev)); 6522 err = -EINVAL; 6523 goto abort_export; 6524 } 6525 6526 clear_bit(In_sync, &rdev->flags); 6527 rdev->desc_nr = -1; 6528 rdev->saved_raid_disk = -1; 6529 err = bind_rdev_to_array(rdev, mddev); 6530 if (err) 6531 goto abort_export; 6532 6533 /* 6534 * The rest should better be atomic, we can have disk failures 6535 * noticed in interrupt contexts ... 6536 */ 6537 6538 rdev->raid_disk = -1; 6539 6540 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 6541 if (!mddev->thread) 6542 md_update_sb(mddev, 1); 6543 /* 6544 * Kick recovery, maybe this spare has to be added to the 6545 * array immediately. 
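* Setting MD_RECOVERY_NEEDED and waking mddev->thread below lets the
* recovery path (remove_and_add_spares()) pick the new device up.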
6546 */ 6547 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6548 md_wakeup_thread(mddev->thread); 6549 md_new_event(mddev); 6550 return 0; 6551 6552 abort_export: 6553 export_rdev(rdev); 6554 return err; 6555 } 6556 6557 static int set_bitmap_file(struct mddev *mddev, int fd) 6558 { 6559 int err = 0; 6560 6561 if (mddev->pers) { 6562 if (!mddev->pers->quiesce || !mddev->thread) 6563 return -EBUSY; 6564 if (mddev->recovery || mddev->sync_thread) 6565 return -EBUSY; 6566 /* we should be able to change the bitmap.. */ 6567 } 6568 6569 if (fd >= 0) { 6570 struct inode *inode; 6571 struct file *f; 6572 6573 if (mddev->bitmap || mddev->bitmap_info.file) 6574 return -EEXIST; /* cannot add when bitmap is present */ 6575 f = fget(fd); 6576 6577 if (f == NULL) { 6578 pr_warn("%s: error: failed to get bitmap file\n", 6579 mdname(mddev)); 6580 return -EBADF; 6581 } 6582 6583 inode = f->f_mapping->host; 6584 if (!S_ISREG(inode->i_mode)) { 6585 pr_warn("%s: error: bitmap file must be a regular file\n", 6586 mdname(mddev)); 6587 err = -EBADF; 6588 } else if (!(f->f_mode & FMODE_WRITE)) { 6589 pr_warn("%s: error: bitmap file must open for write\n", 6590 mdname(mddev)); 6591 err = -EBADF; 6592 } else if (atomic_read(&inode->i_writecount) != 1) { 6593 pr_warn("%s: error: bitmap file is already in use\n", 6594 mdname(mddev)); 6595 err = -EBUSY; 6596 } 6597 if (err) { 6598 fput(f); 6599 return err; 6600 } 6601 mddev->bitmap_info.file = f; 6602 mddev->bitmap_info.offset = 0; /* file overrides offset */ 6603 } else if (mddev->bitmap == NULL) 6604 return -ENOENT; /* cannot remove what isn't there */ 6605 err = 0; 6606 if (mddev->pers) { 6607 mddev->pers->quiesce(mddev, 1); 6608 if (fd >= 0) { 6609 struct bitmap *bitmap; 6610 6611 bitmap = bitmap_create(mddev, -1); 6612 if (!IS_ERR(bitmap)) { 6613 mddev->bitmap = bitmap; 6614 err = bitmap_load(mddev); 6615 } else 6616 err = PTR_ERR(bitmap); 6617 } 6618 if (fd < 0 || err) { 6619 bitmap_destroy(mddev); 6620 fd = -1; /* make sure to put the file */ 6621 } 6622 mddev->pers->quiesce(mddev, 0); 6623 } 6624 if (fd < 0) { 6625 struct file *f = mddev->bitmap_info.file; 6626 if (f) { 6627 spin_lock(&mddev->lock); 6628 mddev->bitmap_info.file = NULL; 6629 spin_unlock(&mddev->lock); 6630 fput(f); 6631 } 6632 } 6633 6634 return err; 6635 } 6636 6637 /* 6638 * set_array_info is used two different ways 6639 * The original usage is when creating a new array. 6640 * In this usage, raid_disks is > 0 and it together with 6641 * level, size, not_persistent,layout,chunksize determine the 6642 * shape of the array. 6643 * This will always create an array with a type-0.90.0 superblock. 6644 * The newer usage is when assembling an array. 6645 * In this case raid_disks will be 0, and the major_version field is 6646 * use to determine which style super-blocks are to be found on the devices. 6647 * The minor and patch _version numbers are also kept incase the 6648 * super_block handler wishes to interpret them. 6649 */ 6650 static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) 6651 { 6652 6653 if (info->raid_disks == 0) { 6654 /* just setting version number for superblock loading */ 6655 if (info->major_version < 0 || 6656 info->major_version >= ARRAY_SIZE(super_types) || 6657 super_types[info->major_version].name == NULL) { 6658 /* maybe try to auto-load a module? 
*/ 6659 pr_warn("md: superblock version %d not known\n", 6660 info->major_version); 6661 return -EINVAL; 6662 } 6663 mddev->major_version = info->major_version; 6664 mddev->minor_version = info->minor_version; 6665 mddev->patch_version = info->patch_version; 6666 mddev->persistent = !info->not_persistent; 6667 /* ensure mddev_put doesn't delete this now that there 6668 * is some minimal configuration. 6669 */ 6670 mddev->ctime = ktime_get_real_seconds(); 6671 return 0; 6672 } 6673 mddev->major_version = MD_MAJOR_VERSION; 6674 mddev->minor_version = MD_MINOR_VERSION; 6675 mddev->patch_version = MD_PATCHLEVEL_VERSION; 6676 mddev->ctime = ktime_get_real_seconds(); 6677 6678 mddev->level = info->level; 6679 mddev->clevel[0] = 0; 6680 mddev->dev_sectors = 2 * (sector_t)info->size; 6681 mddev->raid_disks = info->raid_disks; 6682 /* don't set md_minor, it is determined by which /dev/md* was 6683 * openned 6684 */ 6685 if (info->state & (1<<MD_SB_CLEAN)) 6686 mddev->recovery_cp = MaxSector; 6687 else 6688 mddev->recovery_cp = 0; 6689 mddev->persistent = ! info->not_persistent; 6690 mddev->external = 0; 6691 6692 mddev->layout = info->layout; 6693 mddev->chunk_sectors = info->chunk_size >> 9; 6694 6695 if (mddev->persistent) { 6696 mddev->max_disks = MD_SB_DISKS; 6697 mddev->flags = 0; 6698 mddev->sb_flags = 0; 6699 } 6700 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 6701 6702 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 6703 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 6704 mddev->bitmap_info.offset = 0; 6705 6706 mddev->reshape_position = MaxSector; 6707 6708 /* 6709 * Generate a 128 bit UUID 6710 */ 6711 get_random_bytes(mddev->uuid, 16); 6712 6713 mddev->new_level = mddev->level; 6714 mddev->new_chunk_sectors = mddev->chunk_sectors; 6715 mddev->new_layout = mddev->layout; 6716 mddev->delta_disks = 0; 6717 mddev->reshape_backwards = 0; 6718 6719 return 0; 6720 } 6721 6722 void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) 6723 { 6724 WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__); 6725 6726 if (mddev->external_size) 6727 return; 6728 6729 mddev->array_sectors = array_sectors; 6730 } 6731 EXPORT_SYMBOL(md_set_array_sectors); 6732 6733 static int update_size(struct mddev *mddev, sector_t num_sectors) 6734 { 6735 struct md_rdev *rdev; 6736 int rv; 6737 int fit = (num_sectors == 0); 6738 sector_t old_dev_sectors = mddev->dev_sectors; 6739 6740 if (mddev->pers->resize == NULL) 6741 return -EINVAL; 6742 /* The "num_sectors" is the number of sectors of each device that 6743 * is used. This can only make sense for arrays with redundancy. 6744 * linear and raid0 always use whatever space is available. We can only 6745 * consider changing this number if no resync or reconstruction is 6746 * happening, and if the new size is acceptable. It must fit before the 6747 * sb_start or, if that is <data_offset, it must fit before the size 6748 * of each device. If num_sectors is zero, we find the largest size 6749 * that fits. 
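* With num_sectors == 0 the loop below therefore settles on the largest
* size supported by every member device; otherwise it only verifies that
* each member has at least num_sectors available.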
6750 */ 6751 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 6752 mddev->sync_thread) 6753 return -EBUSY; 6754 if (mddev->ro) 6755 return -EROFS; 6756 6757 rdev_for_each(rdev, mddev) { 6758 sector_t avail = rdev->sectors; 6759 6760 if (fit && (num_sectors == 0 || num_sectors > avail)) 6761 num_sectors = avail; 6762 if (avail < num_sectors) 6763 return -ENOSPC; 6764 } 6765 rv = mddev->pers->resize(mddev, num_sectors); 6766 if (!rv) { 6767 if (mddev_is_clustered(mddev)) 6768 md_cluster_ops->update_size(mddev, old_dev_sectors); 6769 else if (mddev->queue) { 6770 set_capacity(mddev->gendisk, mddev->array_sectors); 6771 revalidate_disk(mddev->gendisk); 6772 } 6773 } 6774 return rv; 6775 } 6776 6777 static int update_raid_disks(struct mddev *mddev, int raid_disks) 6778 { 6779 int rv; 6780 struct md_rdev *rdev; 6781 /* change the number of raid disks */ 6782 if (mddev->pers->check_reshape == NULL) 6783 return -EINVAL; 6784 if (mddev->ro) 6785 return -EROFS; 6786 if (raid_disks <= 0 || 6787 (mddev->max_disks && raid_disks >= mddev->max_disks)) 6788 return -EINVAL; 6789 if (mddev->sync_thread || 6790 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 6791 mddev->reshape_position != MaxSector) 6792 return -EBUSY; 6793 6794 rdev_for_each(rdev, mddev) { 6795 if (mddev->raid_disks < raid_disks && 6796 rdev->data_offset < rdev->new_data_offset) 6797 return -EINVAL; 6798 if (mddev->raid_disks > raid_disks && 6799 rdev->data_offset > rdev->new_data_offset) 6800 return -EINVAL; 6801 } 6802 6803 mddev->delta_disks = raid_disks - mddev->raid_disks; 6804 if (mddev->delta_disks < 0) 6805 mddev->reshape_backwards = 1; 6806 else if (mddev->delta_disks > 0) 6807 mddev->reshape_backwards = 0; 6808 6809 rv = mddev->pers->check_reshape(mddev); 6810 if (rv < 0) { 6811 mddev->delta_disks = 0; 6812 mddev->reshape_backwards = 0; 6813 } 6814 return rv; 6815 } 6816 6817 /* 6818 * update_array_info is used to change the configuration of an 6819 * on-line array. 6820 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 6821 * fields in the info are checked against the array. 6822 * Any differences that cannot be handled will cause an error. 6823 * Normally, only one change can be managed at a time. 
6824 */ 6825 static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) 6826 { 6827 int rv = 0; 6828 int cnt = 0; 6829 int state = 0; 6830 6831 /* calculate expected state,ignoring low bits */ 6832 if (mddev->bitmap && mddev->bitmap_info.offset) 6833 state |= (1 << MD_SB_BITMAP_PRESENT); 6834 6835 if (mddev->major_version != info->major_version || 6836 mddev->minor_version != info->minor_version || 6837 /* mddev->patch_version != info->patch_version || */ 6838 mddev->ctime != info->ctime || 6839 mddev->level != info->level || 6840 /* mddev->layout != info->layout || */ 6841 mddev->persistent != !info->not_persistent || 6842 mddev->chunk_sectors != info->chunk_size >> 9 || 6843 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 6844 ((state^info->state) & 0xfffffe00) 6845 ) 6846 return -EINVAL; 6847 /* Check there is only one change */ 6848 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 6849 cnt++; 6850 if (mddev->raid_disks != info->raid_disks) 6851 cnt++; 6852 if (mddev->layout != info->layout) 6853 cnt++; 6854 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) 6855 cnt++; 6856 if (cnt == 0) 6857 return 0; 6858 if (cnt > 1) 6859 return -EINVAL; 6860 6861 if (mddev->layout != info->layout) { 6862 /* Change layout 6863 * we don't need to do anything at the md level, the 6864 * personality will take care of it all. 6865 */ 6866 if (mddev->pers->check_reshape == NULL) 6867 return -EINVAL; 6868 else { 6869 mddev->new_layout = info->layout; 6870 rv = mddev->pers->check_reshape(mddev); 6871 if (rv) 6872 mddev->new_layout = mddev->layout; 6873 return rv; 6874 } 6875 } 6876 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 6877 rv = update_size(mddev, (sector_t)info->size * 2); 6878 6879 if (mddev->raid_disks != info->raid_disks) 6880 rv = update_raid_disks(mddev, info->raid_disks); 6881 6882 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 6883 if (mddev->pers->quiesce == NULL || mddev->thread == NULL) { 6884 rv = -EINVAL; 6885 goto err; 6886 } 6887 if (mddev->recovery || mddev->sync_thread) { 6888 rv = -EBUSY; 6889 goto err; 6890 } 6891 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 6892 struct bitmap *bitmap; 6893 /* add the bitmap */ 6894 if (mddev->bitmap) { 6895 rv = -EEXIST; 6896 goto err; 6897 } 6898 if (mddev->bitmap_info.default_offset == 0) { 6899 rv = -EINVAL; 6900 goto err; 6901 } 6902 mddev->bitmap_info.offset = 6903 mddev->bitmap_info.default_offset; 6904 mddev->bitmap_info.space = 6905 mddev->bitmap_info.default_space; 6906 mddev->pers->quiesce(mddev, 1); 6907 bitmap = bitmap_create(mddev, -1); 6908 if (!IS_ERR(bitmap)) { 6909 mddev->bitmap = bitmap; 6910 rv = bitmap_load(mddev); 6911 } else 6912 rv = PTR_ERR(bitmap); 6913 if (rv) 6914 bitmap_destroy(mddev); 6915 mddev->pers->quiesce(mddev, 0); 6916 } else { 6917 /* remove the bitmap */ 6918 if (!mddev->bitmap) { 6919 rv = -ENOENT; 6920 goto err; 6921 } 6922 if (mddev->bitmap->storage.file) { 6923 rv = -EINVAL; 6924 goto err; 6925 } 6926 if (mddev->bitmap_info.nodes) { 6927 /* hold PW on all the bitmap lock */ 6928 if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { 6929 pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n"); 6930 rv = -EPERM; 6931 md_cluster_ops->unlock_all_bitmaps(mddev); 6932 goto err; 6933 } 6934 6935 mddev->bitmap_info.nodes = 0; 6936 md_cluster_ops->leave(mddev); 6937 } 6938 mddev->pers->quiesce(mddev, 1); 6939 bitmap_destroy(mddev); 6940 mddev->pers->quiesce(mddev, 0); 6941 
mddev->bitmap_info.offset = 0; 6942 } 6943 } 6944 md_update_sb(mddev, 1); 6945 return rv; 6946 err: 6947 return rv; 6948 } 6949 6950 static int set_disk_faulty(struct mddev *mddev, dev_t dev) 6951 { 6952 struct md_rdev *rdev; 6953 int err = 0; 6954 6955 if (mddev->pers == NULL) 6956 return -ENODEV; 6957 6958 rcu_read_lock(); 6959 rdev = find_rdev_rcu(mddev, dev); 6960 if (!rdev) 6961 err = -ENODEV; 6962 else { 6963 md_error(mddev, rdev); 6964 if (!test_bit(Faulty, &rdev->flags)) 6965 err = -EBUSY; 6966 } 6967 rcu_read_unlock(); 6968 return err; 6969 } 6970 6971 /* 6972 * We have a problem here : there is no easy way to give a CHS 6973 * virtual geometry. We currently pretend that we have a 2 heads 6974 * 4 sectors (with a BIG number of cylinders...). This drives 6975 * dosfs just mad... ;-) 6976 */ 6977 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 6978 { 6979 struct mddev *mddev = bdev->bd_disk->private_data; 6980 6981 geo->heads = 2; 6982 geo->sectors = 4; 6983 geo->cylinders = mddev->array_sectors / 8; 6984 return 0; 6985 } 6986 6987 static inline bool md_ioctl_valid(unsigned int cmd) 6988 { 6989 switch (cmd) { 6990 case ADD_NEW_DISK: 6991 case BLKROSET: 6992 case GET_ARRAY_INFO: 6993 case GET_BITMAP_FILE: 6994 case GET_DISK_INFO: 6995 case HOT_ADD_DISK: 6996 case HOT_REMOVE_DISK: 6997 case RAID_AUTORUN: 6998 case RAID_VERSION: 6999 case RESTART_ARRAY_RW: 7000 case RUN_ARRAY: 7001 case SET_ARRAY_INFO: 7002 case SET_BITMAP_FILE: 7003 case SET_DISK_FAULTY: 7004 case STOP_ARRAY: 7005 case STOP_ARRAY_RO: 7006 case CLUSTERED_DISK_NACK: 7007 return true; 7008 default: 7009 return false; 7010 } 7011 } 7012 7013 static int md_ioctl(struct block_device *bdev, fmode_t mode, 7014 unsigned int cmd, unsigned long arg) 7015 { 7016 int err = 0; 7017 void __user *argp = (void __user *)arg; 7018 struct mddev *mddev = NULL; 7019 int ro; 7020 bool did_set_md_closing = false; 7021 7022 if (!md_ioctl_valid(cmd)) 7023 return -ENOTTY; 7024 7025 switch (cmd) { 7026 case RAID_VERSION: 7027 case GET_ARRAY_INFO: 7028 case GET_DISK_INFO: 7029 break; 7030 default: 7031 if (!capable(CAP_SYS_ADMIN)) 7032 return -EACCES; 7033 } 7034 7035 /* 7036 * Commands dealing with the RAID driver but not any 7037 * particular array: 7038 */ 7039 switch (cmd) { 7040 case RAID_VERSION: 7041 err = get_version(argp); 7042 goto out; 7043 7044 #ifndef MODULE 7045 case RAID_AUTORUN: 7046 err = 0; 7047 autostart_arrays(arg); 7048 goto out; 7049 #endif 7050 default:; 7051 } 7052 7053 /* 7054 * Commands creating/starting a new array: 7055 */ 7056 7057 mddev = bdev->bd_disk->private_data; 7058 7059 if (!mddev) { 7060 BUG(); 7061 goto out; 7062 } 7063 7064 /* Some actions do not requires the mutex */ 7065 switch (cmd) { 7066 case GET_ARRAY_INFO: 7067 if (!mddev->raid_disks && !mddev->external) 7068 err = -ENODEV; 7069 else 7070 err = get_array_info(mddev, argp); 7071 goto out; 7072 7073 case GET_DISK_INFO: 7074 if (!mddev->raid_disks && !mddev->external) 7075 err = -ENODEV; 7076 else 7077 err = get_disk_info(mddev, argp); 7078 goto out; 7079 7080 case SET_DISK_FAULTY: 7081 err = set_disk_faulty(mddev, new_decode_dev(arg)); 7082 goto out; 7083 7084 case GET_BITMAP_FILE: 7085 err = get_bitmap_file(mddev, argp); 7086 goto out; 7087 7088 } 7089 7090 if (cmd == ADD_NEW_DISK) 7091 /* need to ensure md_delayed_delete() has completed */ 7092 flush_workqueue(md_misc_wq); 7093 7094 if (cmd == HOT_REMOVE_DISK) 7095 /* need to ensure recovery thread has run */ 7096 wait_event_interruptible_timeout(mddev->sb_wait, 7097 
!test_bit(MD_RECOVERY_NEEDED, 7098 &mddev->recovery), 7099 msecs_to_jiffies(5000)); 7100 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { 7101 /* Need to flush page cache, and ensure no-one else opens 7102 * and writes 7103 */ 7104 mutex_lock(&mddev->open_mutex); 7105 if (mddev->pers && atomic_read(&mddev->openers) > 1) { 7106 mutex_unlock(&mddev->open_mutex); 7107 err = -EBUSY; 7108 goto out; 7109 } 7110 WARN_ON_ONCE(test_bit(MD_CLOSING, &mddev->flags)); 7111 set_bit(MD_CLOSING, &mddev->flags); 7112 did_set_md_closing = true; 7113 mutex_unlock(&mddev->open_mutex); 7114 sync_blockdev(bdev); 7115 } 7116 err = mddev_lock(mddev); 7117 if (err) { 7118 pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n", 7119 err, cmd); 7120 goto out; 7121 } 7122 7123 if (cmd == SET_ARRAY_INFO) { 7124 mdu_array_info_t info; 7125 if (!arg) 7126 memset(&info, 0, sizeof(info)); 7127 else if (copy_from_user(&info, argp, sizeof(info))) { 7128 err = -EFAULT; 7129 goto unlock; 7130 } 7131 if (mddev->pers) { 7132 err = update_array_info(mddev, &info); 7133 if (err) { 7134 pr_warn("md: couldn't update array info. %d\n", err); 7135 goto unlock; 7136 } 7137 goto unlock; 7138 } 7139 if (!list_empty(&mddev->disks)) { 7140 pr_warn("md: array %s already has disks!\n", mdname(mddev)); 7141 err = -EBUSY; 7142 goto unlock; 7143 } 7144 if (mddev->raid_disks) { 7145 pr_warn("md: array %s already initialised!\n", mdname(mddev)); 7146 err = -EBUSY; 7147 goto unlock; 7148 } 7149 err = set_array_info(mddev, &info); 7150 if (err) { 7151 pr_warn("md: couldn't set array info. %d\n", err); 7152 goto unlock; 7153 } 7154 goto unlock; 7155 } 7156 7157 /* 7158 * Commands querying/configuring an existing array: 7159 */ 7160 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 7161 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 7162 if ((!mddev->raid_disks && !mddev->external) 7163 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 7164 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 7165 && cmd != GET_BITMAP_FILE) { 7166 err = -ENODEV; 7167 goto unlock; 7168 } 7169 7170 /* 7171 * Commands even a read-only array can execute: 7172 */ 7173 switch (cmd) { 7174 case RESTART_ARRAY_RW: 7175 err = restart_array(mddev); 7176 goto unlock; 7177 7178 case STOP_ARRAY: 7179 err = do_md_stop(mddev, 0, bdev); 7180 goto unlock; 7181 7182 case STOP_ARRAY_RO: 7183 err = md_set_readonly(mddev, bdev); 7184 goto unlock; 7185 7186 case HOT_REMOVE_DISK: 7187 err = hot_remove_disk(mddev, new_decode_dev(arg)); 7188 goto unlock; 7189 7190 case ADD_NEW_DISK: 7191 /* We can support ADD_NEW_DISK on read-only arrays 7192 * only if we are re-adding a preexisting device. 7193 * So require mddev->pers and MD_DISK_SYNC. 7194 */ 7195 if (mddev->pers) { 7196 mdu_disk_info_t info; 7197 if (copy_from_user(&info, argp, sizeof(info))) 7198 err = -EFAULT; 7199 else if (!(info.state & (1<<MD_DISK_SYNC))) 7200 /* Need to clear read-only for this */ 7201 break; 7202 else 7203 err = add_new_disk(mddev, &info); 7204 goto unlock; 7205 } 7206 break; 7207 7208 case BLKROSET: 7209 if (get_user(ro, (int __user *)(arg))) { 7210 err = -EFAULT; 7211 goto unlock; 7212 } 7213 err = -EINVAL; 7214 7215 /* if the bdev is going readonly the value of mddev->ro 7216 * does not matter, no writes are coming 7217 */ 7218 if (ro) 7219 goto unlock; 7220 7221 /* are we are already prepared for writes? 
*/ 7222 if (mddev->ro != 1) 7223 goto unlock; 7224 7225 /* transitioning to readauto need only happen for 7226 * arrays that call md_write_start 7227 */ 7228 if (mddev->pers) { 7229 err = restart_array(mddev); 7230 if (err == 0) { 7231 mddev->ro = 2; 7232 set_disk_ro(mddev->gendisk, 0); 7233 } 7234 } 7235 goto unlock; 7236 } 7237 7238 /* 7239 * The remaining ioctls are changing the state of the 7240 * superblock, so we do not allow them on read-only arrays. 7241 */ 7242 if (mddev->ro && mddev->pers) { 7243 if (mddev->ro == 2) { 7244 mddev->ro = 0; 7245 sysfs_notify_dirent_safe(mddev->sysfs_state); 7246 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7247 /* mddev_unlock will wake thread */ 7248 /* If a device failed while we were read-only, we 7249 * need to make sure the metadata is updated now. 7250 */ 7251 if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) { 7252 mddev_unlock(mddev); 7253 wait_event(mddev->sb_wait, 7254 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) && 7255 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 7256 mddev_lock_nointr(mddev); 7257 } 7258 } else { 7259 err = -EROFS; 7260 goto unlock; 7261 } 7262 } 7263 7264 switch (cmd) { 7265 case ADD_NEW_DISK: 7266 { 7267 mdu_disk_info_t info; 7268 if (copy_from_user(&info, argp, sizeof(info))) 7269 err = -EFAULT; 7270 else 7271 err = add_new_disk(mddev, &info); 7272 goto unlock; 7273 } 7274 7275 case CLUSTERED_DISK_NACK: 7276 if (mddev_is_clustered(mddev)) 7277 md_cluster_ops->new_disk_ack(mddev, false); 7278 else 7279 err = -EINVAL; 7280 goto unlock; 7281 7282 case HOT_ADD_DISK: 7283 err = hot_add_disk(mddev, new_decode_dev(arg)); 7284 goto unlock; 7285 7286 case RUN_ARRAY: 7287 err = do_md_run(mddev); 7288 goto unlock; 7289 7290 case SET_BITMAP_FILE: 7291 err = set_bitmap_file(mddev, (int)arg); 7292 goto unlock; 7293 7294 default: 7295 err = -EINVAL; 7296 goto unlock; 7297 } 7298 7299 unlock: 7300 if (mddev->hold_active == UNTIL_IOCTL && 7301 err != -EINVAL) 7302 mddev->hold_active = 0; 7303 mddev_unlock(mddev); 7304 out: 7305 if(did_set_md_closing) 7306 clear_bit(MD_CLOSING, &mddev->flags); 7307 return err; 7308 } 7309 #ifdef CONFIG_COMPAT 7310 static int md_compat_ioctl(struct block_device *bdev, fmode_t mode, 7311 unsigned int cmd, unsigned long arg) 7312 { 7313 switch (cmd) { 7314 case HOT_REMOVE_DISK: 7315 case HOT_ADD_DISK: 7316 case SET_DISK_FAULTY: 7317 case SET_BITMAP_FILE: 7318 /* These take in integer arg, do not convert */ 7319 break; 7320 default: 7321 arg = (unsigned long)compat_ptr(arg); 7322 break; 7323 } 7324 7325 return md_ioctl(bdev, mode, cmd, arg); 7326 } 7327 #endif /* CONFIG_COMPAT */ 7328 7329 static int md_open(struct block_device *bdev, fmode_t mode) 7330 { 7331 /* 7332 * Succeed if we can lock the mddev, which confirms that 7333 * it isn't being stopped right now. 7334 */ 7335 struct mddev *mddev = mddev_find(bdev->bd_dev); 7336 int err; 7337 7338 if (!mddev) 7339 return -ENODEV; 7340 7341 if (mddev->gendisk != bdev->bd_disk) { 7342 /* we are racing with mddev_put which is discarding this 7343 * bd_disk. 
7344 */ 7345 mddev_put(mddev); 7346 /* Wait until bdev->bd_disk is definitely gone */ 7347 flush_workqueue(md_misc_wq); 7348 /* Then retry the open from the top */ 7349 return -ERESTARTSYS; 7350 } 7351 BUG_ON(mddev != bdev->bd_disk->private_data); 7352 7353 if ((err = mutex_lock_interruptible(&mddev->open_mutex))) 7354 goto out; 7355 7356 if (test_bit(MD_CLOSING, &mddev->flags)) { 7357 mutex_unlock(&mddev->open_mutex); 7358 err = -ENODEV; 7359 goto out; 7360 } 7361 7362 err = 0; 7363 atomic_inc(&mddev->openers); 7364 mutex_unlock(&mddev->open_mutex); 7365 7366 check_disk_change(bdev); 7367 out: 7368 if (err) 7369 mddev_put(mddev); 7370 return err; 7371 } 7372 7373 static void md_release(struct gendisk *disk, fmode_t mode) 7374 { 7375 struct mddev *mddev = disk->private_data; 7376 7377 BUG_ON(!mddev); 7378 atomic_dec(&mddev->openers); 7379 mddev_put(mddev); 7380 } 7381 7382 static int md_media_changed(struct gendisk *disk) 7383 { 7384 struct mddev *mddev = disk->private_data; 7385 7386 return mddev->changed; 7387 } 7388 7389 static int md_revalidate(struct gendisk *disk) 7390 { 7391 struct mddev *mddev = disk->private_data; 7392 7393 mddev->changed = 0; 7394 return 0; 7395 } 7396 static const struct block_device_operations md_fops = 7397 { 7398 .owner = THIS_MODULE, 7399 .open = md_open, 7400 .release = md_release, 7401 .ioctl = md_ioctl, 7402 #ifdef CONFIG_COMPAT 7403 .compat_ioctl = md_compat_ioctl, 7404 #endif 7405 .getgeo = md_getgeo, 7406 .media_changed = md_media_changed, 7407 .revalidate_disk= md_revalidate, 7408 }; 7409 7410 static int md_thread(void *arg) 7411 { 7412 struct md_thread *thread = arg; 7413 7414 /* 7415 * md_thread is a 'system-thread', it's priority should be very 7416 * high. We avoid resource deadlocks individually in each 7417 * raid personality. (RAID5 does preallocation) We also use RR and 7418 * the very same RT priority as kswapd, thus we will never get 7419 * into a priority inversion deadlock. 7420 * 7421 * we definitely have to have equal or higher priority than 7422 * bdflush, otherwise bdflush will deadlock if there are too 7423 * many dirty RAID5 blocks. 7424 */ 7425 7426 allow_signal(SIGKILL); 7427 while (!kthread_should_stop()) { 7428 7429 /* We need to wait INTERRUPTIBLE so that 7430 * we don't add to the load-average. 
7431 * That means we need to be sure no signals are 7432 * pending 7433 */ 7434 if (signal_pending(current)) 7435 flush_signals(current); 7436 7437 wait_event_interruptible_timeout 7438 (thread->wqueue, 7439 test_bit(THREAD_WAKEUP, &thread->flags) 7440 || kthread_should_stop() || kthread_should_park(), 7441 thread->timeout); 7442 7443 clear_bit(THREAD_WAKEUP, &thread->flags); 7444 if (kthread_should_park()) 7445 kthread_parkme(); 7446 if (!kthread_should_stop()) 7447 thread->run(thread); 7448 } 7449 7450 return 0; 7451 } 7452 7453 void md_wakeup_thread(struct md_thread *thread) 7454 { 7455 if (thread) { 7456 pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); 7457 if (!test_and_set_bit(THREAD_WAKEUP, &thread->flags)) 7458 wake_up(&thread->wqueue); 7459 } 7460 } 7461 EXPORT_SYMBOL(md_wakeup_thread); 7462 7463 struct md_thread *md_register_thread(void (*run) (struct md_thread *), 7464 struct mddev *mddev, const char *name) 7465 { 7466 struct md_thread *thread; 7467 7468 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); 7469 if (!thread) 7470 return NULL; 7471 7472 init_waitqueue_head(&thread->wqueue); 7473 7474 thread->run = run; 7475 thread->mddev = mddev; 7476 thread->timeout = MAX_SCHEDULE_TIMEOUT; 7477 thread->tsk = kthread_run(md_thread, thread, 7478 "%s_%s", 7479 mdname(thread->mddev), 7480 name); 7481 if (IS_ERR(thread->tsk)) { 7482 kfree(thread); 7483 return NULL; 7484 } 7485 return thread; 7486 } 7487 EXPORT_SYMBOL(md_register_thread); 7488 7489 void md_unregister_thread(struct md_thread **threadp) 7490 { 7491 struct md_thread *thread = *threadp; 7492 if (!thread) 7493 return; 7494 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 7495 /* Locking ensures that mddev_unlock does not wake_up a 7496 * non-existent thread 7497 */ 7498 spin_lock(&pers_lock); 7499 *threadp = NULL; 7500 spin_unlock(&pers_lock); 7501 7502 kthread_stop(thread->tsk); 7503 kfree(thread); 7504 } 7505 EXPORT_SYMBOL(md_unregister_thread); 7506 7507 void md_error(struct mddev *mddev, struct md_rdev *rdev) 7508 { 7509 if (!rdev || test_bit(Faulty, &rdev->flags)) 7510 return; 7511 7512 if (!mddev->pers || !mddev->pers->error_handler) 7513 return; 7514 mddev->pers->error_handler(mddev,rdev); 7515 if (mddev->degraded) 7516 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 7517 sysfs_notify_dirent_safe(rdev->sysfs_state); 7518 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 7519 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7520 md_wakeup_thread(mddev->thread); 7521 if (mddev->event_work.func) 7522 queue_work(md_misc_wq, &mddev->event_work); 7523 md_new_event(mddev); 7524 } 7525 EXPORT_SYMBOL(md_error); 7526 7527 /* seq_file implementation /proc/mdstat */ 7528 7529 static void status_unused(struct seq_file *seq) 7530 { 7531 int i = 0; 7532 struct md_rdev *rdev; 7533 7534 seq_printf(seq, "unused devices: "); 7535 7536 list_for_each_entry(rdev, &pending_raid_disks, same_set) { 7537 char b[BDEVNAME_SIZE]; 7538 i++; 7539 seq_printf(seq, "%s ", 7540 bdevname(rdev->bdev,b)); 7541 } 7542 if (!i) 7543 seq_printf(seq, "<none>"); 7544 7545 seq_printf(seq, "\n"); 7546 } 7547 7548 static int status_resync(struct seq_file *seq, struct mddev *mddev) 7549 { 7550 sector_t max_sectors, resync, res; 7551 unsigned long dt, db; 7552 sector_t rt; 7553 int scale; 7554 unsigned int per_milli; 7555 7556 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 7557 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 7558 max_sectors = mddev->resync_max_sectors; 7559 else 7560 max_sectors = mddev->dev_sectors; 7561 7562 
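/* curr_resync values of 3 or less are state markers rather than sector
 * counts (see the comment in md_do_sync()); anything larger is real
 * progress, from which the in-flight requests are subtracted below */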
resync = mddev->curr_resync; 7563 if (resync <= 3) { 7564 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 7565 /* Still cleaning up */ 7566 resync = max_sectors; 7567 } else 7568 resync -= atomic_read(&mddev->recovery_active); 7569 7570 if (resync == 0) { 7571 if (mddev->recovery_cp < MaxSector) { 7572 seq_printf(seq, "\tresync=PENDING"); 7573 return 1; 7574 } 7575 return 0; 7576 } 7577 if (resync < 3) { 7578 seq_printf(seq, "\tresync=DELAYED"); 7579 return 1; 7580 } 7581 7582 WARN_ON(max_sectors == 0); 7583 /* Pick 'scale' such that (resync>>scale)*1000 will fit 7584 * in a sector_t, and (max_sectors>>scale) will fit in a 7585 * u32, as those are the requirements for sector_div. 7586 * Thus 'scale' must be at least 10 7587 */ 7588 scale = 10; 7589 if (sizeof(sector_t) > sizeof(unsigned long)) { 7590 while ( max_sectors/2 > (1ULL<<(scale+32))) 7591 scale++; 7592 } 7593 res = (resync>>scale)*1000; 7594 sector_div(res, (u32)((max_sectors>>scale)+1)); 7595 7596 per_milli = res; 7597 { 7598 int i, x = per_milli/50, y = 20-x; 7599 seq_printf(seq, "["); 7600 for (i = 0; i < x; i++) 7601 seq_printf(seq, "="); 7602 seq_printf(seq, ">"); 7603 for (i = 0; i < y; i++) 7604 seq_printf(seq, "."); 7605 seq_printf(seq, "] "); 7606 } 7607 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 7608 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 7609 "reshape" : 7610 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 7611 "check" : 7612 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 7613 "resync" : "recovery"))), 7614 per_milli/10, per_milli % 10, 7615 (unsigned long long) resync/2, 7616 (unsigned long long) max_sectors/2); 7617 7618 /* 7619 * dt: time from mark until now 7620 * db: blocks written from mark until now 7621 * rt: remaining time 7622 * 7623 * rt is a sector_t, so could be 32bit or 64bit. 7624 * So we divide before multiply in case it is 32bit and close 7625 * to the limit. 7626 * We scale the divisor (db) by 32 to avoid losing precision 7627 * near the end of resync when the number of remaining sectors 7628 * is close to 'db'. 7629 * We then divide rt by 32 after multiplying by db to compensate. 7630 * The '+1' avoids division by zero if db is very small. 
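*
* As a worked example (arbitrary numbers, for illustration only): with
* dt = 10 seconds, db = 20480 sectors written since the mark and
* 1,000,000 sectors still to go,
* rt = ((1000000 / (20480/32 + 1)) * 10) >> 5 = 487 seconds,
* close to the exact remaining/rate estimate of ~488 seconds.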
7631 */ 7632 dt = ((jiffies - mddev->resync_mark) / HZ); 7633 if (!dt) dt++; 7634 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) 7635 - mddev->resync_mark_cnt; 7636 7637 rt = max_sectors - resync; /* number of remaining sectors */ 7638 sector_div(rt, db/32+1); 7639 rt *= dt; 7640 rt >>= 5; 7641 7642 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, 7643 ((unsigned long)rt % 60)/6); 7644 7645 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 7646 return 1; 7647 } 7648 7649 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 7650 { 7651 struct list_head *tmp; 7652 loff_t l = *pos; 7653 struct mddev *mddev; 7654 7655 if (l >= 0x10000) 7656 return NULL; 7657 if (!l--) 7658 /* header */ 7659 return (void*)1; 7660 7661 spin_lock(&all_mddevs_lock); 7662 list_for_each(tmp,&all_mddevs) 7663 if (!l--) { 7664 mddev = list_entry(tmp, struct mddev, all_mddevs); 7665 mddev_get(mddev); 7666 spin_unlock(&all_mddevs_lock); 7667 return mddev; 7668 } 7669 spin_unlock(&all_mddevs_lock); 7670 if (!l--) 7671 return (void*)2;/* tail */ 7672 return NULL; 7673 } 7674 7675 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 7676 { 7677 struct list_head *tmp; 7678 struct mddev *next_mddev, *mddev = v; 7679 7680 ++*pos; 7681 if (v == (void*)2) 7682 return NULL; 7683 7684 spin_lock(&all_mddevs_lock); 7685 if (v == (void*)1) 7686 tmp = all_mddevs.next; 7687 else 7688 tmp = mddev->all_mddevs.next; 7689 if (tmp != &all_mddevs) 7690 next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs)); 7691 else { 7692 next_mddev = (void*)2; 7693 *pos = 0x10000; 7694 } 7695 spin_unlock(&all_mddevs_lock); 7696 7697 if (v != (void*)1) 7698 mddev_put(mddev); 7699 return next_mddev; 7700 7701 } 7702 7703 static void md_seq_stop(struct seq_file *seq, void *v) 7704 { 7705 struct mddev *mddev = v; 7706 7707 if (mddev && v != (void*)1 && v != (void*)2) 7708 mddev_put(mddev); 7709 } 7710 7711 static int md_seq_show(struct seq_file *seq, void *v) 7712 { 7713 struct mddev *mddev = v; 7714 sector_t sectors; 7715 struct md_rdev *rdev; 7716 7717 if (v == (void*)1) { 7718 struct md_personality *pers; 7719 seq_printf(seq, "Personalities : "); 7720 spin_lock(&pers_lock); 7721 list_for_each_entry(pers, &pers_list, list) 7722 seq_printf(seq, "[%s] ", pers->name); 7723 7724 spin_unlock(&pers_lock); 7725 seq_printf(seq, "\n"); 7726 seq->poll_event = atomic_read(&md_event_count); 7727 return 0; 7728 } 7729 if (v == (void*)2) { 7730 status_unused(seq); 7731 return 0; 7732 } 7733 7734 spin_lock(&mddev->lock); 7735 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 7736 seq_printf(seq, "%s : %sactive", mdname(mddev), 7737 mddev->pers ? 
"" : "in"); 7738 if (mddev->pers) { 7739 if (mddev->ro==1) 7740 seq_printf(seq, " (read-only)"); 7741 if (mddev->ro==2) 7742 seq_printf(seq, " (auto-read-only)"); 7743 seq_printf(seq, " %s", mddev->pers->name); 7744 } 7745 7746 sectors = 0; 7747 rcu_read_lock(); 7748 rdev_for_each_rcu(rdev, mddev) { 7749 char b[BDEVNAME_SIZE]; 7750 seq_printf(seq, " %s[%d]", 7751 bdevname(rdev->bdev,b), rdev->desc_nr); 7752 if (test_bit(WriteMostly, &rdev->flags)) 7753 seq_printf(seq, "(W)"); 7754 if (test_bit(Journal, &rdev->flags)) 7755 seq_printf(seq, "(J)"); 7756 if (test_bit(Faulty, &rdev->flags)) { 7757 seq_printf(seq, "(F)"); 7758 continue; 7759 } 7760 if (rdev->raid_disk < 0) 7761 seq_printf(seq, "(S)"); /* spare */ 7762 if (test_bit(Replacement, &rdev->flags)) 7763 seq_printf(seq, "(R)"); 7764 sectors += rdev->sectors; 7765 } 7766 rcu_read_unlock(); 7767 7768 if (!list_empty(&mddev->disks)) { 7769 if (mddev->pers) 7770 seq_printf(seq, "\n %llu blocks", 7771 (unsigned long long) 7772 mddev->array_sectors / 2); 7773 else 7774 seq_printf(seq, "\n %llu blocks", 7775 (unsigned long long)sectors / 2); 7776 } 7777 if (mddev->persistent) { 7778 if (mddev->major_version != 0 || 7779 mddev->minor_version != 90) { 7780 seq_printf(seq," super %d.%d", 7781 mddev->major_version, 7782 mddev->minor_version); 7783 } 7784 } else if (mddev->external) 7785 seq_printf(seq, " super external:%s", 7786 mddev->metadata_type); 7787 else 7788 seq_printf(seq, " super non-persistent"); 7789 7790 if (mddev->pers) { 7791 mddev->pers->status(seq, mddev); 7792 seq_printf(seq, "\n "); 7793 if (mddev->pers->sync_request) { 7794 if (status_resync(seq, mddev)) 7795 seq_printf(seq, "\n "); 7796 } 7797 } else 7798 seq_printf(seq, "\n "); 7799 7800 bitmap_status(seq, mddev->bitmap); 7801 7802 seq_printf(seq, "\n"); 7803 } 7804 spin_unlock(&mddev->lock); 7805 7806 return 0; 7807 } 7808 7809 static const struct seq_operations md_seq_ops = { 7810 .start = md_seq_start, 7811 .next = md_seq_next, 7812 .stop = md_seq_stop, 7813 .show = md_seq_show, 7814 }; 7815 7816 static int md_seq_open(struct inode *inode, struct file *file) 7817 { 7818 struct seq_file *seq; 7819 int error; 7820 7821 error = seq_open(file, &md_seq_ops); 7822 if (error) 7823 return error; 7824 7825 seq = file->private_data; 7826 seq->poll_event = atomic_read(&md_event_count); 7827 return error; 7828 } 7829 7830 static int md_unloading; 7831 static unsigned int mdstat_poll(struct file *filp, poll_table *wait) 7832 { 7833 struct seq_file *seq = filp->private_data; 7834 int mask; 7835 7836 if (md_unloading) 7837 return POLLIN|POLLRDNORM|POLLERR|POLLPRI; 7838 poll_wait(filp, &md_event_waiters, wait); 7839 7840 /* always allow read */ 7841 mask = POLLIN | POLLRDNORM; 7842 7843 if (seq->poll_event != atomic_read(&md_event_count)) 7844 mask |= POLLERR | POLLPRI; 7845 return mask; 7846 } 7847 7848 static const struct file_operations md_seq_fops = { 7849 .owner = THIS_MODULE, 7850 .open = md_seq_open, 7851 .read = seq_read, 7852 .llseek = seq_lseek, 7853 .release = seq_release, 7854 .poll = mdstat_poll, 7855 }; 7856 7857 int register_md_personality(struct md_personality *p) 7858 { 7859 pr_debug("md: %s personality registered for level %d\n", 7860 p->name, p->level); 7861 spin_lock(&pers_lock); 7862 list_add_tail(&p->list, &pers_list); 7863 spin_unlock(&pers_lock); 7864 return 0; 7865 } 7866 EXPORT_SYMBOL(register_md_personality); 7867 7868 int unregister_md_personality(struct md_personality *p) 7869 { 7870 pr_debug("md: %s personality unregistered\n", p->name); 7871 
spin_lock(&pers_lock); 7872 list_del_init(&p->list); 7873 spin_unlock(&pers_lock); 7874 return 0; 7875 } 7876 EXPORT_SYMBOL(unregister_md_personality); 7877 7878 int register_md_cluster_operations(struct md_cluster_operations *ops, 7879 struct module *module) 7880 { 7881 int ret = 0; 7882 spin_lock(&pers_lock); 7883 if (md_cluster_ops != NULL) 7884 ret = -EALREADY; 7885 else { 7886 md_cluster_ops = ops; 7887 md_cluster_mod = module; 7888 } 7889 spin_unlock(&pers_lock); 7890 return ret; 7891 } 7892 EXPORT_SYMBOL(register_md_cluster_operations); 7893 7894 int unregister_md_cluster_operations(void) 7895 { 7896 spin_lock(&pers_lock); 7897 md_cluster_ops = NULL; 7898 spin_unlock(&pers_lock); 7899 return 0; 7900 } 7901 EXPORT_SYMBOL(unregister_md_cluster_operations); 7902 7903 int md_setup_cluster(struct mddev *mddev, int nodes) 7904 { 7905 if (!md_cluster_ops) 7906 request_module("md-cluster"); 7907 spin_lock(&pers_lock); 7908 /* ensure module won't be unloaded */ 7909 if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { 7910 pr_warn("can't find md-cluster module or get it's reference.\n"); 7911 spin_unlock(&pers_lock); 7912 return -ENOENT; 7913 } 7914 spin_unlock(&pers_lock); 7915 7916 return md_cluster_ops->join(mddev, nodes); 7917 } 7918 7919 void md_cluster_stop(struct mddev *mddev) 7920 { 7921 if (!md_cluster_ops) 7922 return; 7923 md_cluster_ops->leave(mddev); 7924 module_put(md_cluster_mod); 7925 } 7926 7927 static int is_mddev_idle(struct mddev *mddev, int init) 7928 { 7929 struct md_rdev *rdev; 7930 int idle; 7931 int curr_events; 7932 7933 idle = 1; 7934 rcu_read_lock(); 7935 rdev_for_each_rcu(rdev, mddev) { 7936 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 7937 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + 7938 (int)part_stat_read(&disk->part0, sectors[1]) - 7939 atomic_read(&disk->sync_io); 7940 /* sync IO will cause sync_io to increase before the disk_stats 7941 * as sync_io is counted when a request starts, and 7942 * disk_stats is counted when it completes. 7943 * So resync activity will cause curr_events to be smaller than 7944 * when there was no such activity. 7945 * non-sync IO will cause disk_stat to increase without 7946 * increasing sync_io so curr_events will (eventually) 7947 * be larger than it was before. Once it becomes 7948 * substantially larger, the test below will cause 7949 * the array to appear non-idle, and resync will slow 7950 * down. 7951 * If there is a lot of outstanding resync activity when 7952 * we set last_event to curr_events, then all that activity 7953 * completing might cause the array to appear non-idle 7954 * and resync will be slowed down even though there might 7955 * not have been non-resync activity. This will only 7956 * happen once though. 'last_events' will soon reflect 7957 * the state where there is little or no outstanding 7958 * resync requests, and further resync activity will 7959 * always make curr_events less than last_events. 
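* In short, the check below treats the array as busy whenever
* curr_events runs more than 64 sectors ahead of last_events, and then
* resets the baseline.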
7960 * 7961 */ 7962 if (init || curr_events - rdev->last_events > 64) { 7963 rdev->last_events = curr_events; 7964 idle = 0; 7965 } 7966 } 7967 rcu_read_unlock(); 7968 return idle; 7969 } 7970 7971 void md_done_sync(struct mddev *mddev, int blocks, int ok) 7972 { 7973 /* another "blocks" (512byte) blocks have been synced */ 7974 atomic_sub(blocks, &mddev->recovery_active); 7975 wake_up(&mddev->recovery_wait); 7976 if (!ok) { 7977 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 7978 set_bit(MD_RECOVERY_ERROR, &mddev->recovery); 7979 md_wakeup_thread(mddev->thread); 7980 // stop recovery, signal do_sync .... 7981 } 7982 } 7983 EXPORT_SYMBOL(md_done_sync); 7984 7985 /* md_write_start(mddev, bi) 7986 * If we need to update some array metadata (e.g. 'active' flag 7987 * in superblock) before writing, schedule a superblock update 7988 * and wait for it to complete. 7989 * A return value of 'false' means that the write wasn't recorded 7990 * and cannot proceed as the array is being suspend. 7991 */ 7992 bool md_write_start(struct mddev *mddev, struct bio *bi) 7993 { 7994 int did_change = 0; 7995 if (bio_data_dir(bi) != WRITE) 7996 return true; 7997 7998 BUG_ON(mddev->ro == 1); 7999 if (mddev->ro == 2) { 8000 /* need to switch to read/write */ 8001 mddev->ro = 0; 8002 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8003 md_wakeup_thread(mddev->thread); 8004 md_wakeup_thread(mddev->sync_thread); 8005 did_change = 1; 8006 } 8007 rcu_read_lock(); 8008 percpu_ref_get(&mddev->writes_pending); 8009 smp_mb(); /* Match smp_mb in set_in_sync() */ 8010 if (mddev->safemode == 1) 8011 mddev->safemode = 0; 8012 /* sync_checkers is always 0 when writes_pending is in per-cpu mode */ 8013 if (mddev->in_sync || mddev->sync_checkers) { 8014 spin_lock(&mddev->lock); 8015 if (mddev->in_sync) { 8016 mddev->in_sync = 0; 8017 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 8018 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 8019 md_wakeup_thread(mddev->thread); 8020 did_change = 1; 8021 } 8022 spin_unlock(&mddev->lock); 8023 } 8024 rcu_read_unlock(); 8025 if (did_change) 8026 sysfs_notify_dirent_safe(mddev->sysfs_state); 8027 wait_event(mddev->sb_wait, 8028 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && !mddev->suspended); 8029 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 8030 percpu_ref_put(&mddev->writes_pending); 8031 return false; 8032 } 8033 return true; 8034 } 8035 EXPORT_SYMBOL(md_write_start); 8036 8037 /* md_write_inc can only be called when md_write_start() has 8038 * already been called at least once of the current request. 8039 * It increments the counter and is useful when a single request 8040 * is split into several parts. Each part causes an increment and 8041 * so needs a matching md_write_end(). 8042 * Unlike md_write_start(), it is safe to call md_write_inc() inside 8043 * a spinlocked region. 
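*
* A rough usage sketch (illustrative only, not lifted from a particular
* personality):
*
*	if (!md_write_start(mddev, bio))	// may sleep, call without locks
*		return;
*	md_write_inc(mddev, bio);		// one extra split, any context
*	... submit both parts ...
*	md_write_end(mddev);			// one call per start/inc
*	md_write_end(mddev);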
8044 */ 8045 void md_write_inc(struct mddev *mddev, struct bio *bi) 8046 { 8047 if (bio_data_dir(bi) != WRITE) 8048 return; 8049 WARN_ON_ONCE(mddev->in_sync || mddev->ro); 8050 percpu_ref_get(&mddev->writes_pending); 8051 } 8052 EXPORT_SYMBOL(md_write_inc); 8053 8054 void md_write_end(struct mddev *mddev) 8055 { 8056 percpu_ref_put(&mddev->writes_pending); 8057 8058 if (mddev->safemode == 2) 8059 md_wakeup_thread(mddev->thread); 8060 else if (mddev->safemode_delay) 8061 /* The roundup() ensures this only performs locking once 8062 * every ->safemode_delay jiffies 8063 */ 8064 mod_timer(&mddev->safemode_timer, 8065 roundup(jiffies, mddev->safemode_delay) + 8066 mddev->safemode_delay); 8067 } 8068 8069 EXPORT_SYMBOL(md_write_end); 8070 8071 /* md_allow_write(mddev) 8072 * Calling this ensures that the array is marked 'active' so that writes 8073 * may proceed without blocking. It is important to call this before 8074 * attempting a GFP_KERNEL allocation while holding the mddev lock. 8075 * Must be called with mddev_lock held. 8076 */ 8077 void md_allow_write(struct mddev *mddev) 8078 { 8079 if (!mddev->pers) 8080 return; 8081 if (mddev->ro) 8082 return; 8083 if (!mddev->pers->sync_request) 8084 return; 8085 8086 spin_lock(&mddev->lock); 8087 if (mddev->in_sync) { 8088 mddev->in_sync = 0; 8089 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 8090 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 8091 if (mddev->safemode_delay && 8092 mddev->safemode == 0) 8093 mddev->safemode = 1; 8094 spin_unlock(&mddev->lock); 8095 md_update_sb(mddev, 0); 8096 sysfs_notify_dirent_safe(mddev->sysfs_state); 8097 /* wait for the dirty state to be recorded in the metadata */ 8098 wait_event(mddev->sb_wait, 8099 !test_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags) && 8100 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 8101 } else 8102 spin_unlock(&mddev->lock); 8103 } 8104 EXPORT_SYMBOL_GPL(md_allow_write); 8105 8106 #define SYNC_MARKS 10 8107 #define SYNC_MARK_STEP (3*HZ) 8108 #define UPDATE_FREQUENCY (5*60*HZ) 8109 void md_do_sync(struct md_thread *thread) 8110 { 8111 struct mddev *mddev = thread->mddev; 8112 struct mddev *mddev2; 8113 unsigned int currspeed = 0, 8114 window; 8115 sector_t max_sectors,j, io_sectors, recovery_done; 8116 unsigned long mark[SYNC_MARKS]; 8117 unsigned long update_time; 8118 sector_t mark_cnt[SYNC_MARKS]; 8119 int last_mark,m; 8120 struct list_head *tmp; 8121 sector_t last_check; 8122 int skipped = 0; 8123 struct md_rdev *rdev; 8124 char *desc, *action = NULL; 8125 struct blk_plug plug; 8126 int ret; 8127 8128 /* just incase thread restarts... 
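 * (MD_RECOVERY_DONE is still set when a previous pass has finished but has
 * not yet been reaped, so bailing out here avoids starting a second pass.)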
 */
8129 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
8130 return;
8131 if (mddev->ro) {/* never try to sync a read-only array */
8132 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8133 return;
8134 }
8135
8136 if (mddev_is_clustered(mddev)) {
8137 ret = md_cluster_ops->resync_start(mddev);
8138 if (ret)
8139 goto skip;
8140
8141 set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags);
8142 if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
8143 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
8144 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
8145 && ((unsigned long long)mddev->curr_resync_completed
8146 < (unsigned long long)mddev->resync_max_sectors))
8147 goto skip;
8148 }
8149
8150 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
8151 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
8152 desc = "data-check";
8153 action = "check";
8154 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
8155 desc = "requested-resync";
8156 action = "repair";
8157 } else
8158 desc = "resync";
8159 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
8160 desc = "reshape";
8161 else
8162 desc = "recovery";
8163
8164 mddev->last_sync_action = action ?: desc;
8165
8166 /* we overload curr_resync somewhat here.
8167 * 0 == not engaged in resync at all
8168 * 2 == checking that there is no conflict with another sync
8169 * 1 == like 2, but have yielded to allow conflicting resync to
8170 * commence
8171 * other == active in resync - this many blocks
8172 *
8173 * Before starting a resync we must have set curr_resync to
8174 * 2, and then checked that every "conflicting" array has curr_resync
8175 * less than ours. When we find one that is the same or higher
8176 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync
8177 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
8178 * This will mean we have to start checking from the beginning again.
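 * (In the pointer comparison below, the mddev with the lower address is the
 * one that yields.)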
8179 * 8180 */ 8181 8182 do { 8183 int mddev2_minor = -1; 8184 mddev->curr_resync = 2; 8185 8186 try_again: 8187 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 8188 goto skip; 8189 for_each_mddev(mddev2, tmp) { 8190 if (mddev2 == mddev) 8191 continue; 8192 if (!mddev->parallel_resync 8193 && mddev2->curr_resync 8194 && match_mddev_units(mddev, mddev2)) { 8195 DEFINE_WAIT(wq); 8196 if (mddev < mddev2 && mddev->curr_resync == 2) { 8197 /* arbitrarily yield */ 8198 mddev->curr_resync = 1; 8199 wake_up(&resync_wait); 8200 } 8201 if (mddev > mddev2 && mddev->curr_resync == 1) 8202 /* no need to wait here, we can wait the next 8203 * time 'round when curr_resync == 2 8204 */ 8205 continue; 8206 /* We need to wait 'interruptible' so as not to 8207 * contribute to the load average, and not to 8208 * be caught by 'softlockup' 8209 */ 8210 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 8211 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 8212 mddev2->curr_resync >= mddev->curr_resync) { 8213 if (mddev2_minor != mddev2->md_minor) { 8214 mddev2_minor = mddev2->md_minor; 8215 pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n", 8216 desc, mdname(mddev), 8217 mdname(mddev2)); 8218 } 8219 mddev_put(mddev2); 8220 if (signal_pending(current)) 8221 flush_signals(current); 8222 schedule(); 8223 finish_wait(&resync_wait, &wq); 8224 goto try_again; 8225 } 8226 finish_wait(&resync_wait, &wq); 8227 } 8228 } 8229 } while (mddev->curr_resync < 2); 8230 8231 j = 0; 8232 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 8233 /* resync follows the size requested by the personality, 8234 * which defaults to physical size, but can be virtual size 8235 */ 8236 max_sectors = mddev->resync_max_sectors; 8237 atomic64_set(&mddev->resync_mismatches, 0); 8238 /* we don't use the checkpoint if there's a bitmap */ 8239 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 8240 j = mddev->resync_min; 8241 else if (!mddev->bitmap) 8242 j = mddev->recovery_cp; 8243 8244 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 8245 max_sectors = mddev->resync_max_sectors; 8246 else { 8247 /* recovery follows the physical size of devices */ 8248 max_sectors = mddev->dev_sectors; 8249 j = MaxSector; 8250 rcu_read_lock(); 8251 rdev_for_each_rcu(rdev, mddev) 8252 if (rdev->raid_disk >= 0 && 8253 !test_bit(Journal, &rdev->flags) && 8254 !test_bit(Faulty, &rdev->flags) && 8255 !test_bit(In_sync, &rdev->flags) && 8256 rdev->recovery_offset < j) 8257 j = rdev->recovery_offset; 8258 rcu_read_unlock(); 8259 8260 /* If there is a bitmap, we need to make sure all 8261 * writes that started before we added a spare 8262 * complete before we start doing a recovery. 8263 * Otherwise the write might complete and (via 8264 * bitmap_endwrite) set a bit in the bitmap after the 8265 * recovery has checked that bit and skipped that 8266 * region. 
8267 */ 8268 if (mddev->bitmap) { 8269 mddev->pers->quiesce(mddev, 1); 8270 mddev->pers->quiesce(mddev, 0); 8271 } 8272 } 8273 8274 pr_info("md: %s of RAID array %s\n", desc, mdname(mddev)); 8275 pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev)); 8276 pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n", 8277 speed_max(mddev), desc); 8278 8279 is_mddev_idle(mddev, 1); /* this initializes IO event counters */ 8280 8281 io_sectors = 0; 8282 for (m = 0; m < SYNC_MARKS; m++) { 8283 mark[m] = jiffies; 8284 mark_cnt[m] = io_sectors; 8285 } 8286 last_mark = 0; 8287 mddev->resync_mark = mark[last_mark]; 8288 mddev->resync_mark_cnt = mark_cnt[last_mark]; 8289 8290 /* 8291 * Tune reconstruction: 8292 */ 8293 window = 32*(PAGE_SIZE/512); 8294 pr_debug("md: using %dk window, over a total of %lluk.\n", 8295 window/2, (unsigned long long)max_sectors/2); 8296 8297 atomic_set(&mddev->recovery_active, 0); 8298 last_check = 0; 8299 8300 if (j>2) { 8301 pr_debug("md: resuming %s of %s from checkpoint.\n", 8302 desc, mdname(mddev)); 8303 mddev->curr_resync = j; 8304 } else 8305 mddev->curr_resync = 3; /* no longer delayed */ 8306 mddev->curr_resync_completed = j; 8307 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 8308 md_new_event(mddev); 8309 update_time = jiffies; 8310 8311 blk_start_plug(&plug); 8312 while (j < max_sectors) { 8313 sector_t sectors; 8314 8315 skipped = 0; 8316 8317 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 8318 ((mddev->curr_resync > mddev->curr_resync_completed && 8319 (mddev->curr_resync - mddev->curr_resync_completed) 8320 > (max_sectors >> 4)) || 8321 time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || 8322 (j - mddev->curr_resync_completed)*2 8323 >= mddev->resync_max - mddev->curr_resync_completed || 8324 mddev->curr_resync_completed > mddev->resync_max 8325 )) { 8326 /* time to update curr_resync_completed */ 8327 wait_event(mddev->recovery_wait, 8328 atomic_read(&mddev->recovery_active) == 0); 8329 mddev->curr_resync_completed = j; 8330 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 8331 j > mddev->recovery_cp) 8332 mddev->recovery_cp = j; 8333 update_time = jiffies; 8334 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 8335 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 8336 } 8337 8338 while (j >= mddev->resync_max && 8339 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 8340 /* As this condition is controlled by user-space, 8341 * we can block indefinitely, so use '_interruptible' 8342 * to avoid triggering warnings. 8343 */ 8344 flush_signals(current); /* just in case */ 8345 wait_event_interruptible(mddev->recovery_wait, 8346 mddev->resync_max > j 8347 || test_bit(MD_RECOVERY_INTR, 8348 &mddev->recovery)); 8349 } 8350 8351 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 8352 break; 8353 8354 sectors = mddev->pers->sync_request(mddev, j, &skipped); 8355 if (sectors == 0) { 8356 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8357 break; 8358 } 8359 8360 if (!skipped) { /* actual IO requested */ 8361 io_sectors += sectors; 8362 atomic_add(sectors, &mddev->recovery_active); 8363 } 8364 8365 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 8366 break; 8367 8368 j += sectors; 8369 if (j > max_sectors) 8370 /* when skipping, extra large numbers can be returned. 
*/ 8371 j = max_sectors; 8372 if (j > 2) 8373 mddev->curr_resync = j; 8374 mddev->curr_mark_cnt = io_sectors; 8375 if (last_check == 0) 8376 /* this is the earliest that rebuild will be 8377 * visible in /proc/mdstat 8378 */ 8379 md_new_event(mddev); 8380 8381 if (last_check + window > io_sectors || j == max_sectors) 8382 continue; 8383 8384 last_check = io_sectors; 8385 repeat: 8386 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 8387 /* step marks */ 8388 int next = (last_mark+1) % SYNC_MARKS; 8389 8390 mddev->resync_mark = mark[next]; 8391 mddev->resync_mark_cnt = mark_cnt[next]; 8392 mark[next] = jiffies; 8393 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 8394 last_mark = next; 8395 } 8396 8397 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 8398 break; 8399 8400 /* 8401 * this loop exits only if either when we are slower than 8402 * the 'hard' speed limit, or the system was IO-idle for 8403 * a jiffy. 8404 * the system might be non-idle CPU-wise, but we only care 8405 * about not overloading the IO subsystem. (things like an 8406 * e2fsck being done on the RAID array should execute fast) 8407 */ 8408 cond_resched(); 8409 8410 recovery_done = io_sectors - atomic_read(&mddev->recovery_active); 8411 currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2 8412 /((jiffies-mddev->resync_mark)/HZ +1) +1; 8413 8414 if (currspeed > speed_min(mddev)) { 8415 if (currspeed > speed_max(mddev)) { 8416 msleep(500); 8417 goto repeat; 8418 } 8419 if (!is_mddev_idle(mddev, 0)) { 8420 /* 8421 * Give other IO more of a chance. 8422 * The faster the devices, the less we wait. 8423 */ 8424 wait_event(mddev->recovery_wait, 8425 !atomic_read(&mddev->recovery_active)); 8426 } 8427 } 8428 } 8429 pr_info("md: %s: %s %s.\n",mdname(mddev), desc, 8430 test_bit(MD_RECOVERY_INTR, &mddev->recovery) 8431 ? 
"interrupted" : "done"); 8432 /* 8433 * this also signals 'finished resyncing' to md_stop 8434 */ 8435 blk_finish_plug(&plug); 8436 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 8437 8438 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 8439 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 8440 mddev->curr_resync > 3) { 8441 mddev->curr_resync_completed = mddev->curr_resync; 8442 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 8443 } 8444 mddev->pers->sync_request(mddev, max_sectors, &skipped); 8445 8446 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 8447 mddev->curr_resync > 3) { 8448 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 8449 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 8450 if (mddev->curr_resync >= mddev->recovery_cp) { 8451 pr_debug("md: checkpointing %s of %s.\n", 8452 desc, mdname(mddev)); 8453 if (test_bit(MD_RECOVERY_ERROR, 8454 &mddev->recovery)) 8455 mddev->recovery_cp = 8456 mddev->curr_resync_completed; 8457 else 8458 mddev->recovery_cp = 8459 mddev->curr_resync; 8460 } 8461 } else 8462 mddev->recovery_cp = MaxSector; 8463 } else { 8464 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 8465 mddev->curr_resync = MaxSector; 8466 rcu_read_lock(); 8467 rdev_for_each_rcu(rdev, mddev) 8468 if (rdev->raid_disk >= 0 && 8469 mddev->delta_disks >= 0 && 8470 !test_bit(Journal, &rdev->flags) && 8471 !test_bit(Faulty, &rdev->flags) && 8472 !test_bit(In_sync, &rdev->flags) && 8473 rdev->recovery_offset < mddev->curr_resync) 8474 rdev->recovery_offset = mddev->curr_resync; 8475 rcu_read_unlock(); 8476 } 8477 } 8478 skip: 8479 /* set CHANGE_PENDING here since maybe another update is needed, 8480 * so other nodes are informed. It should be harmless for normal 8481 * raid */ 8482 set_mask_bits(&mddev->sb_flags, 0, 8483 BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); 8484 8485 spin_lock(&mddev->lock); 8486 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 8487 /* We completed so min/max setting can be forgotten if used. */ 8488 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 8489 mddev->resync_min = 0; 8490 mddev->resync_max = MaxSector; 8491 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 8492 mddev->resync_min = mddev->curr_resync_completed; 8493 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 8494 mddev->curr_resync = 0; 8495 spin_unlock(&mddev->lock); 8496 8497 wake_up(&resync_wait); 8498 md_wakeup_thread(mddev->thread); 8499 return; 8500 } 8501 EXPORT_SYMBOL_GPL(md_do_sync); 8502 8503 static int remove_and_add_spares(struct mddev *mddev, 8504 struct md_rdev *this) 8505 { 8506 struct md_rdev *rdev; 8507 int spares = 0; 8508 int removed = 0; 8509 bool remove_some = false; 8510 8511 rdev_for_each(rdev, mddev) { 8512 if ((this == NULL || rdev == this) && 8513 rdev->raid_disk >= 0 && 8514 !test_bit(Blocked, &rdev->flags) && 8515 test_bit(Faulty, &rdev->flags) && 8516 atomic_read(&rdev->nr_pending)==0) { 8517 /* Faulty non-Blocked devices with nr_pending == 0 8518 * never get nr_pending incremented, 8519 * never get Faulty cleared, and never get Blocked set. 
8520 * So we can synchronize_rcu now rather than once per device 8521 */ 8522 remove_some = true; 8523 set_bit(RemoveSynchronized, &rdev->flags); 8524 } 8525 } 8526 8527 if (remove_some) 8528 synchronize_rcu(); 8529 rdev_for_each(rdev, mddev) { 8530 if ((this == NULL || rdev == this) && 8531 rdev->raid_disk >= 0 && 8532 !test_bit(Blocked, &rdev->flags) && 8533 ((test_bit(RemoveSynchronized, &rdev->flags) || 8534 (!test_bit(In_sync, &rdev->flags) && 8535 !test_bit(Journal, &rdev->flags))) && 8536 atomic_read(&rdev->nr_pending)==0)) { 8537 if (mddev->pers->hot_remove_disk( 8538 mddev, rdev) == 0) { 8539 sysfs_unlink_rdev(mddev, rdev); 8540 rdev->raid_disk = -1; 8541 removed++; 8542 } 8543 } 8544 if (remove_some && test_bit(RemoveSynchronized, &rdev->flags)) 8545 clear_bit(RemoveSynchronized, &rdev->flags); 8546 } 8547 8548 if (removed && mddev->kobj.sd) 8549 sysfs_notify(&mddev->kobj, NULL, "degraded"); 8550 8551 if (this && removed) 8552 goto no_add; 8553 8554 rdev_for_each(rdev, mddev) { 8555 if (this && this != rdev) 8556 continue; 8557 if (test_bit(Candidate, &rdev->flags)) 8558 continue; 8559 if (rdev->raid_disk >= 0 && 8560 !test_bit(In_sync, &rdev->flags) && 8561 !test_bit(Journal, &rdev->flags) && 8562 !test_bit(Faulty, &rdev->flags)) 8563 spares++; 8564 if (rdev->raid_disk >= 0) 8565 continue; 8566 if (test_bit(Faulty, &rdev->flags)) 8567 continue; 8568 if (!test_bit(Journal, &rdev->flags)) { 8569 if (mddev->ro && 8570 ! (rdev->saved_raid_disk >= 0 && 8571 !test_bit(Bitmap_sync, &rdev->flags))) 8572 continue; 8573 8574 rdev->recovery_offset = 0; 8575 } 8576 if (mddev->pers-> 8577 hot_add_disk(mddev, rdev) == 0) { 8578 if (sysfs_link_rdev(mddev, rdev)) 8579 /* failure here is OK */; 8580 if (!test_bit(Journal, &rdev->flags)) 8581 spares++; 8582 md_new_event(mddev); 8583 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 8584 } 8585 } 8586 no_add: 8587 if (removed) 8588 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 8589 return spares; 8590 } 8591 8592 static void md_start_sync(struct work_struct *ws) 8593 { 8594 struct mddev *mddev = container_of(ws, struct mddev, del_work); 8595 8596 mddev->sync_thread = md_register_thread(md_do_sync, 8597 mddev, 8598 "resync"); 8599 if (!mddev->sync_thread) { 8600 pr_warn("%s: could not start resync thread...\n", 8601 mdname(mddev)); 8602 /* leave the spares where they are, it shouldn't hurt */ 8603 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 8604 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 8605 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 8606 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 8607 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 8608 wake_up(&resync_wait); 8609 if (test_and_clear_bit(MD_RECOVERY_RECOVER, 8610 &mddev->recovery)) 8611 if (mddev->sysfs_action) 8612 sysfs_notify_dirent_safe(mddev->sysfs_action); 8613 } else 8614 md_wakeup_thread(mddev->sync_thread); 8615 sysfs_notify_dirent_safe(mddev->sysfs_action); 8616 md_new_event(mddev); 8617 } 8618 8619 /* 8620 * This routine is regularly called by all per-raid-array threads to 8621 * deal with generic issues like resync and super-block update. 8622 * Raid personalities that don't have a thread (linear/raid0) do not 8623 * need this as they never do any recovery or update the superblock. 8624 * 8625 * It does not do any resync itself, but rather "forks" off other threads 8626 * to do that as needed. 8627 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in 8628 * "->recovery" and create a thread at ->sync_thread. 
 * When the thread finishes it sets MD_RECOVERY_DONE
8630 * and wakes up this thread which will reap the thread and finish up.
8631 * This thread also removes any faulty devices (with nr_pending == 0).
8632 *
8633 * The overall approach is:
8634 * 1/ if the superblock needs updating, update it.
8635 * 2/ If a recovery thread is running, don't do anything else.
8636 * 3/ If recovery has finished, clean up, possibly marking spares active.
8637 * 4/ If there are any faulty devices, remove them.
8638 * 5/ If array is degraded, try to add spare devices
8639 * 6/ If array has spares or is not in-sync, start a resync thread.
8640 */
8641 void md_check_recovery(struct mddev *mddev)
8642 {
8643 if (mddev->suspended)
8644 return;
8645
8646 if (mddev->bitmap)
8647 bitmap_daemon_work(mddev);
8648
8649 if (signal_pending(current)) {
8650 if (mddev->pers->sync_request && !mddev->external) {
8651 pr_debug("md: %s in immediate safe mode\n",
8652 mdname(mddev));
8653 mddev->safemode = 2;
8654 }
8655 flush_signals(current);
8656 }
8657
8658 if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
8659 return;
8660 if ( ! (
8661 (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
8662 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
8663 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
8664 (mddev->external == 0 && mddev->safemode == 1) ||
8665 (mddev->safemode == 2
8666 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
8667 ))
8668 return;
8669
8670 if (mddev_trylock(mddev)) {
8671 int spares = 0;
8672
8673 if (!mddev->external && mddev->safemode == 1)
8674 mddev->safemode = 0;
8675
8676 if (mddev->ro) {
8677 struct md_rdev *rdev;
8678 if (!mddev->external && mddev->in_sync)
8679 /* 'Blocked' flag not needed as failed devices
8680 * will be recorded if array switched to read/write.
8681 * Leaving it set will prevent the device
8682 * from being removed.
8683 */
8684 rdev_for_each(rdev, mddev)
8685 clear_bit(Blocked, &rdev->flags);
8686 /* On a read-only array we can:
8687 * - remove failed devices
8688 * - add already-in_sync devices if the array itself
8689 * is in-sync.
8690 * As we only add devices that are already in-sync,
8691 * we can activate the spares immediately.
8692 */
8693 remove_and_add_spares(mddev, NULL);
8694 /* There is no thread, but we need to call
8695 * ->spare_active and clear saved_raid_disk
8696 */
8697 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8698 md_reap_sync_thread(mddev);
8699 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8700 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8701 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8702 goto unlock;
8703 }
8704
8705 if (mddev_is_clustered(mddev)) {
8706 struct md_rdev *rdev;
8707 /* kick the device if another node issued a
8708 * remove disk.
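 * (ClusterRemove is set by the md-cluster message handler when another
 * node sends a REMOVE for that device.)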
8709 */ 8710 rdev_for_each(rdev, mddev) { 8711 if (test_and_clear_bit(ClusterRemove, &rdev->flags) && 8712 rdev->raid_disk < 0) 8713 md_kick_rdev_from_array(rdev); 8714 } 8715 } 8716 8717 if (!mddev->external && !mddev->in_sync) { 8718 spin_lock(&mddev->lock); 8719 set_in_sync(mddev); 8720 spin_unlock(&mddev->lock); 8721 } 8722 8723 if (mddev->sb_flags) 8724 md_update_sb(mddev, 0); 8725 8726 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 8727 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 8728 /* resync/recovery still happening */ 8729 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8730 goto unlock; 8731 } 8732 if (mddev->sync_thread) { 8733 md_reap_sync_thread(mddev); 8734 goto unlock; 8735 } 8736 /* Set RUNNING before clearing NEEDED to avoid 8737 * any transients in the value of "sync_action". 8738 */ 8739 mddev->curr_resync_completed = 0; 8740 spin_lock(&mddev->lock); 8741 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 8742 spin_unlock(&mddev->lock); 8743 /* Clear some bits that don't mean anything, but 8744 * might be left set 8745 */ 8746 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 8747 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 8748 8749 if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 8750 test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 8751 goto not_running; 8752 /* no recovery is running. 8753 * remove any failed drives, then 8754 * add spares if possible. 8755 * Spares are also removed and re-added, to allow 8756 * the personality to fail the re-add. 8757 */ 8758 8759 if (mddev->reshape_position != MaxSector) { 8760 if (mddev->pers->check_reshape == NULL || 8761 mddev->pers->check_reshape(mddev) != 0) 8762 /* Cannot proceed */ 8763 goto not_running; 8764 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 8765 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8766 } else if ((spares = remove_and_add_spares(mddev, NULL))) { 8767 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 8768 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 8769 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 8770 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8771 } else if (mddev->recovery_cp < MaxSector) { 8772 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 8773 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8774 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 8775 /* nothing to be done ... */ 8776 goto not_running; 8777 8778 if (mddev->pers->sync_request) { 8779 if (spares) { 8780 /* We are adding a device or devices to an array 8781 * which has the bitmap stored on all devices. 
8782 * So make sure all bitmap pages get written 8783 */ 8784 bitmap_write_all(mddev->bitmap); 8785 } 8786 INIT_WORK(&mddev->del_work, md_start_sync); 8787 queue_work(md_misc_wq, &mddev->del_work); 8788 goto unlock; 8789 } 8790 not_running: 8791 if (!mddev->sync_thread) { 8792 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 8793 wake_up(&resync_wait); 8794 if (test_and_clear_bit(MD_RECOVERY_RECOVER, 8795 &mddev->recovery)) 8796 if (mddev->sysfs_action) 8797 sysfs_notify_dirent_safe(mddev->sysfs_action); 8798 } 8799 unlock: 8800 wake_up(&mddev->sb_wait); 8801 mddev_unlock(mddev); 8802 } 8803 } 8804 EXPORT_SYMBOL(md_check_recovery); 8805 8806 void md_reap_sync_thread(struct mddev *mddev) 8807 { 8808 struct md_rdev *rdev; 8809 8810 /* resync has finished, collect result */ 8811 md_unregister_thread(&mddev->sync_thread); 8812 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 8813 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 8814 /* success...*/ 8815 /* activate any spares */ 8816 if (mddev->pers->spare_active(mddev)) { 8817 sysfs_notify(&mddev->kobj, NULL, 8818 "degraded"); 8819 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 8820 } 8821 } 8822 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 8823 mddev->pers->finish_reshape) 8824 mddev->pers->finish_reshape(mddev); 8825 8826 /* If array is no-longer degraded, then any saved_raid_disk 8827 * information must be scrapped. 8828 */ 8829 if (!mddev->degraded) 8830 rdev_for_each(rdev, mddev) 8831 rdev->saved_raid_disk = -1; 8832 8833 md_update_sb(mddev, 1); 8834 /* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can 8835 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by 8836 * clustered raid */ 8837 if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags)) 8838 md_cluster_ops->resync_finish(mddev); 8839 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 8840 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 8841 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 8842 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 8843 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 8844 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 8845 wake_up(&resync_wait); 8846 /* flag recovery needed just to double check */ 8847 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8848 sysfs_notify_dirent_safe(mddev->sysfs_action); 8849 md_new_event(mddev); 8850 if (mddev->event_work.func) 8851 queue_work(md_misc_wq, &mddev->event_work); 8852 } 8853 EXPORT_SYMBOL(md_reap_sync_thread); 8854 8855 void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) 8856 { 8857 sysfs_notify_dirent_safe(rdev->sysfs_state); 8858 wait_event_timeout(rdev->blocked_wait, 8859 !test_bit(Blocked, &rdev->flags) && 8860 !test_bit(BlockedBadBlocks, &rdev->flags), 8861 msecs_to_jiffies(5000)); 8862 rdev_dec_pending(rdev, mddev); 8863 } 8864 EXPORT_SYMBOL(md_wait_for_blocked_rdev); 8865 8866 void md_finish_reshape(struct mddev *mddev) 8867 { 8868 /* called be personality module when reshape completes. 
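 * It folds each rdev's new_data_offset into data_offset and adjusts
 * rdev->sectors to match.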
*/ 8869 struct md_rdev *rdev; 8870 8871 rdev_for_each(rdev, mddev) { 8872 if (rdev->data_offset > rdev->new_data_offset) 8873 rdev->sectors += rdev->data_offset - rdev->new_data_offset; 8874 else 8875 rdev->sectors -= rdev->new_data_offset - rdev->data_offset; 8876 rdev->data_offset = rdev->new_data_offset; 8877 } 8878 } 8879 EXPORT_SYMBOL(md_finish_reshape); 8880 8881 /* Bad block management */ 8882 8883 /* Returns 1 on success, 0 on failure */ 8884 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 8885 int is_new) 8886 { 8887 struct mddev *mddev = rdev->mddev; 8888 int rv; 8889 if (is_new) 8890 s += rdev->new_data_offset; 8891 else 8892 s += rdev->data_offset; 8893 rv = badblocks_set(&rdev->badblocks, s, sectors, 0); 8894 if (rv == 0) { 8895 /* Make sure they get written out promptly */ 8896 if (test_bit(ExternalBbl, &rdev->flags)) 8897 sysfs_notify(&rdev->kobj, NULL, 8898 "unacknowledged_bad_blocks"); 8899 sysfs_notify_dirent_safe(rdev->sysfs_state); 8900 set_mask_bits(&mddev->sb_flags, 0, 8901 BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); 8902 md_wakeup_thread(rdev->mddev->thread); 8903 return 1; 8904 } else 8905 return 0; 8906 } 8907 EXPORT_SYMBOL_GPL(rdev_set_badblocks); 8908 8909 int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 8910 int is_new) 8911 { 8912 int rv; 8913 if (is_new) 8914 s += rdev->new_data_offset; 8915 else 8916 s += rdev->data_offset; 8917 rv = badblocks_clear(&rdev->badblocks, s, sectors); 8918 if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags)) 8919 sysfs_notify(&rdev->kobj, NULL, "bad_blocks"); 8920 return rv; 8921 } 8922 EXPORT_SYMBOL_GPL(rdev_clear_badblocks); 8923 8924 static int md_notify_reboot(struct notifier_block *this, 8925 unsigned long code, void *x) 8926 { 8927 struct list_head *tmp; 8928 struct mddev *mddev; 8929 int need_delay = 0; 8930 8931 for_each_mddev(mddev, tmp) { 8932 if (mddev_trylock(mddev)) { 8933 if (mddev->pers) 8934 __md_stop_writes(mddev); 8935 if (mddev->persistent) 8936 mddev->safemode = 2; 8937 mddev_unlock(mddev); 8938 } 8939 need_delay = 1; 8940 } 8941 /* 8942 * certain more exotic SCSI devices are known to be 8943 * volatile wrt too early system reboots. While the 8944 * right place to handle this issue is the given 8945 * driver, we do want to have a safe RAID driver ... 
8946 */ 8947 if (need_delay) 8948 mdelay(1000*1); 8949 8950 return NOTIFY_DONE; 8951 } 8952 8953 static struct notifier_block md_notifier = { 8954 .notifier_call = md_notify_reboot, 8955 .next = NULL, 8956 .priority = INT_MAX, /* before any real devices */ 8957 }; 8958 8959 static void md_geninit(void) 8960 { 8961 pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 8962 8963 proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops); 8964 } 8965 8966 static int __init md_init(void) 8967 { 8968 int ret = -ENOMEM; 8969 8970 md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); 8971 if (!md_wq) 8972 goto err_wq; 8973 8974 md_misc_wq = alloc_workqueue("md_misc", 0, 0); 8975 if (!md_misc_wq) 8976 goto err_misc_wq; 8977 8978 if ((ret = register_blkdev(MD_MAJOR, "md")) < 0) 8979 goto err_md; 8980 8981 if ((ret = register_blkdev(0, "mdp")) < 0) 8982 goto err_mdp; 8983 mdp_major = ret; 8984 8985 blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE, 8986 md_probe, NULL, NULL); 8987 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, 8988 md_probe, NULL, NULL); 8989 8990 register_reboot_notifier(&md_notifier); 8991 raid_table_header = register_sysctl_table(raid_root_table); 8992 8993 md_geninit(); 8994 return 0; 8995 8996 err_mdp: 8997 unregister_blkdev(MD_MAJOR, "md"); 8998 err_md: 8999 destroy_workqueue(md_misc_wq); 9000 err_misc_wq: 9001 destroy_workqueue(md_wq); 9002 err_wq: 9003 return ret; 9004 } 9005 9006 static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) 9007 { 9008 struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 9009 struct md_rdev *rdev2; 9010 int role, ret; 9011 char b[BDEVNAME_SIZE]; 9012 9013 /* 9014 * If size is changed in another node then we need to 9015 * do resize as well. 9016 */ 9017 if (mddev->dev_sectors != le64_to_cpu(sb->size)) { 9018 ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size)); 9019 if (ret) 9020 pr_info("md-cluster: resize failed\n"); 9021 else 9022 bitmap_update_sb(mddev->bitmap); 9023 } 9024 9025 /* Check for change of roles in the active devices */ 9026 rdev_for_each(rdev2, mddev) { 9027 if (test_bit(Faulty, &rdev2->flags)) 9028 continue; 9029 9030 /* Check if the roles changed */ 9031 role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); 9032 9033 if (test_bit(Candidate, &rdev2->flags)) { 9034 if (role == 0xfffe) { 9035 pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b)); 9036 md_kick_rdev_from_array(rdev2); 9037 continue; 9038 } 9039 else 9040 clear_bit(Candidate, &rdev2->flags); 9041 } 9042 9043 if (role != rdev2->raid_disk) { 9044 /* got activated */ 9045 if (rdev2->raid_disk == -1 && role != 0xffff) { 9046 rdev2->saved_raid_disk = role; 9047 ret = remove_and_add_spares(mddev, rdev2); 9048 pr_info("Activated spare: %s\n", 9049 bdevname(rdev2->bdev,b)); 9050 /* wakeup mddev->thread here, so array could 9051 * perform resync with the new activated disk */ 9052 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9053 md_wakeup_thread(mddev->thread); 9054 9055 } 9056 /* device faulty 9057 * We just want to do the minimum to mark the disk 9058 * as faulty. The recovery is performed by the 9059 * one who initiated the error. 
9060 */ 9061 if ((role == 0xfffe) || (role == 0xfffd)) { 9062 md_error(mddev, rdev2); 9063 clear_bit(Blocked, &rdev2->flags); 9064 } 9065 } 9066 } 9067 9068 if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) 9069 update_raid_disks(mddev, le32_to_cpu(sb->raid_disks)); 9070 9071 /* Finally set the event to be up to date */ 9072 mddev->events = le64_to_cpu(sb->events); 9073 } 9074 9075 static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) 9076 { 9077 int err; 9078 struct page *swapout = rdev->sb_page; 9079 struct mdp_superblock_1 *sb; 9080 9081 /* Store the sb page of the rdev in the swapout temporary 9082 * variable in case we err in the future 9083 */ 9084 rdev->sb_page = NULL; 9085 err = alloc_disk_sb(rdev); 9086 if (err == 0) { 9087 ClearPageUptodate(rdev->sb_page); 9088 rdev->sb_loaded = 0; 9089 err = super_types[mddev->major_version]. 9090 load_super(rdev, NULL, mddev->minor_version); 9091 } 9092 if (err < 0) { 9093 pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n", 9094 __func__, __LINE__, rdev->desc_nr, err); 9095 if (rdev->sb_page) 9096 put_page(rdev->sb_page); 9097 rdev->sb_page = swapout; 9098 rdev->sb_loaded = 1; 9099 return err; 9100 } 9101 9102 sb = page_address(rdev->sb_page); 9103 /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET 9104 * is not set 9105 */ 9106 9107 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET)) 9108 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 9109 9110 /* The other node finished recovery, call spare_active to set 9111 * device In_sync and mddev->degraded 9112 */ 9113 if (rdev->recovery_offset == MaxSector && 9114 !test_bit(In_sync, &rdev->flags) && 9115 mddev->pers->spare_active(mddev)) 9116 sysfs_notify(&mddev->kobj, NULL, "degraded"); 9117 9118 put_page(swapout); 9119 return 0; 9120 } 9121 9122 void md_reload_sb(struct mddev *mddev, int nr) 9123 { 9124 struct md_rdev *rdev; 9125 int err; 9126 9127 /* Find the rdev */ 9128 rdev_for_each_rcu(rdev, mddev) { 9129 if (rdev->desc_nr == nr) 9130 break; 9131 } 9132 9133 if (!rdev || rdev->desc_nr != nr) { 9134 pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr); 9135 return; 9136 } 9137 9138 err = read_rdev(mddev, rdev); 9139 if (err < 0) 9140 return; 9141 9142 check_sb_changes(mddev, rdev); 9143 9144 /* Read all rdev's to update recovery_offset */ 9145 rdev_for_each_rcu(rdev, mddev) 9146 read_rdev(mddev, rdev); 9147 } 9148 EXPORT_SYMBOL(md_reload_sb); 9149 9150 #ifndef MODULE 9151 9152 /* 9153 * Searches all registered partitions for autorun RAID arrays 9154 * at boot time. 
9155 */ 9156 9157 static DEFINE_MUTEX(detected_devices_mutex); 9158 static LIST_HEAD(all_detected_devices); 9159 struct detected_devices_node { 9160 struct list_head list; 9161 dev_t dev; 9162 }; 9163 9164 void md_autodetect_dev(dev_t dev) 9165 { 9166 struct detected_devices_node *node_detected_dev; 9167 9168 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL); 9169 if (node_detected_dev) { 9170 node_detected_dev->dev = dev; 9171 mutex_lock(&detected_devices_mutex); 9172 list_add_tail(&node_detected_dev->list, &all_detected_devices); 9173 mutex_unlock(&detected_devices_mutex); 9174 } 9175 } 9176 9177 static void autostart_arrays(int part) 9178 { 9179 struct md_rdev *rdev; 9180 struct detected_devices_node *node_detected_dev; 9181 dev_t dev; 9182 int i_scanned, i_passed; 9183 9184 i_scanned = 0; 9185 i_passed = 0; 9186 9187 pr_info("md: Autodetecting RAID arrays.\n"); 9188 9189 mutex_lock(&detected_devices_mutex); 9190 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { 9191 i_scanned++; 9192 node_detected_dev = list_entry(all_detected_devices.next, 9193 struct detected_devices_node, list); 9194 list_del(&node_detected_dev->list); 9195 dev = node_detected_dev->dev; 9196 kfree(node_detected_dev); 9197 mutex_unlock(&detected_devices_mutex); 9198 rdev = md_import_device(dev,0, 90); 9199 mutex_lock(&detected_devices_mutex); 9200 if (IS_ERR(rdev)) 9201 continue; 9202 9203 if (test_bit(Faulty, &rdev->flags)) 9204 continue; 9205 9206 set_bit(AutoDetected, &rdev->flags); 9207 list_add(&rdev->same_set, &pending_raid_disks); 9208 i_passed++; 9209 } 9210 mutex_unlock(&detected_devices_mutex); 9211 9212 pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed); 9213 9214 autorun_devices(part); 9215 } 9216 9217 #endif /* !MODULE */ 9218 9219 static __exit void md_exit(void) 9220 { 9221 struct mddev *mddev; 9222 struct list_head *tmp; 9223 int delay = 1; 9224 9225 blk_unregister_region(MKDEV(MD_MAJOR,0), 512); 9226 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); 9227 9228 unregister_blkdev(MD_MAJOR,"md"); 9229 unregister_blkdev(mdp_major, "mdp"); 9230 unregister_reboot_notifier(&md_notifier); 9231 unregister_sysctl_table(raid_table_header); 9232 9233 /* We cannot unload the modules while some process is 9234 * waiting for us in select() or poll() - wake them up 9235 */ 9236 md_unloading = 1; 9237 while (waitqueue_active(&md_event_waiters)) { 9238 /* not safe to leave yet */ 9239 wake_up(&md_event_waiters); 9240 msleep(delay); 9241 delay += delay; 9242 } 9243 remove_proc_entry("mdstat", NULL); 9244 9245 for_each_mddev(mddev, tmp) { 9246 export_array(mddev); 9247 mddev->ctime = 0; 9248 mddev->hold_active = 0; 9249 /* 9250 * for_each_mddev() will call mddev_put() at the end of each 9251 * iteration. As the mddev is now fully clear, this will 9252 * schedule the mddev for destruction by a workqueue, and the 9253 * destroy_workqueue() below will wait for that to complete. 
9254 */ 9255 } 9256 destroy_workqueue(md_misc_wq); 9257 destroy_workqueue(md_wq); 9258 } 9259 9260 subsys_initcall(md_init); 9261 module_exit(md_exit) 9262 9263 static int get_ro(char *buffer, struct kernel_param *kp) 9264 { 9265 return sprintf(buffer, "%d", start_readonly); 9266 } 9267 static int set_ro(const char *val, struct kernel_param *kp) 9268 { 9269 return kstrtouint(val, 10, (unsigned int *)&start_readonly); 9270 } 9271 9272 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); 9273 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); 9274 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); 9275 module_param(create_on_open, bool, S_IRUSR|S_IWUSR); 9276 9277 MODULE_LICENSE("GPL"); 9278 MODULE_DESCRIPTION("MD RAID framework"); 9279 MODULE_ALIAS("md"); 9280 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); 9281