// SPDX-License-Identifier: GPL-2.0-or-later
/*
   md.c : Multiple Devices driver for Linux
     Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.


   Errors, Warnings, etc.
   Please use:
     pr_crit() for error conditions that risk data loss
     pr_err() for error conditions that are unexpected, like an IO error
         or internal inconsistency
     pr_warn() for error conditions that could have been predicted, like
         adding a device to an array when it has incompatible metadata
     pr_info() for every interesting, very rare event, like an array starting
         or stopping, or resync starting or stopping
     pr_debug() for everything else.

*/

#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/badblocks.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/major.h>
#include <linux/module.h>
#include <linux/reboot.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/raid/detect.h>
#include <linux/slab.h>
#include <linux/percpu-refcount.h>
#include <linux/part_stat.h>

#include "md.h"
#include "md-bitmap.h"
#include "md-cluster.h"

static const char *action_name[NR_SYNC_ACTIONS] = {
	[ACTION_RESYNC]		= "resync",
	[ACTION_RECOVER]	= "recover",
	[ACTION_CHECK]		= "check",
	[ACTION_REPAIR]		= "repair",
	[ACTION_RESHAPE]	= "reshape",
	[ACTION_FROZEN]		= "frozen",
	[ACTION_IDLE]		= "idle",
};

static DEFINE_XARRAY(md_submodule);

static const struct kobj_type md_ktype;

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;

/*
 * This workqueue is used for sync_work to register new sync_thread, and for
 * del_work to remove rdev, and for event_work that is only set by dm-raid.
 *
 * Note that sync_work will grab reconfig_mutex, hence never flush this
 * workqueue with reconfig_mutex grabbed.
 */
static struct workqueue_struct *md_misc_wq;

static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);
static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);
static void md_wakeup_thread_directly(struct md_thread __rcu *thread);

/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array.
We divide the read error 107 * count by 2 for every hour elapsed between read errors. 108 */ 109 #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20 110 /* Default safemode delay: 200 msec */ 111 #define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1) 112 /* 113 * Current RAID-1,4,5,6,10 parallel reconstruction 'guaranteed speed limit' 114 * is sysctl_speed_limit_min, 1000 KB/sec by default, so the extra system load 115 * does not show up that much. Increase it if you want to have more guaranteed 116 * speed. Note that the RAID driver will use the maximum bandwidth 117 * sysctl_speed_limit_max, 200 MB/sec by default, if the IO subsystem is idle. 118 * 119 * Background sync IO speed control: 120 * 121 * - below speed min: 122 * no limit; 123 * - above speed min and below speed max: 124 * a) if mddev is idle, then no limit; 125 * b) if mddev is busy handling normal IO, then limit inflight sync IO 126 * to sync_io_depth; 127 * - above speed max: 128 * sync IO can't be issued; 129 * 130 * Following configurations can be changed via /proc/sys/dev/raid/ for system 131 * or /sys/block/mdX/md/ for one array. 132 */ 133 static int sysctl_speed_limit_min = 1000; 134 static int sysctl_speed_limit_max = 200000; 135 static int sysctl_sync_io_depth = 32; 136 137 static int speed_min(struct mddev *mddev) 138 { 139 return mddev->sync_speed_min ? 140 mddev->sync_speed_min : sysctl_speed_limit_min; 141 } 142 143 static int speed_max(struct mddev *mddev) 144 { 145 return mddev->sync_speed_max ? 146 mddev->sync_speed_max : sysctl_speed_limit_max; 147 } 148 149 static int sync_io_depth(struct mddev *mddev) 150 { 151 return mddev->sync_io_depth ? 152 mddev->sync_io_depth : sysctl_sync_io_depth; 153 } 154 155 static void rdev_uninit_serial(struct md_rdev *rdev) 156 { 157 if (!test_and_clear_bit(CollisionCheck, &rdev->flags)) 158 return; 159 160 kvfree(rdev->serial); 161 rdev->serial = NULL; 162 } 163 164 static void rdevs_uninit_serial(struct mddev *mddev) 165 { 166 struct md_rdev *rdev; 167 168 rdev_for_each(rdev, mddev) 169 rdev_uninit_serial(rdev); 170 } 171 172 static int rdev_init_serial(struct md_rdev *rdev) 173 { 174 /* serial_nums equals with BARRIER_BUCKETS_NR */ 175 int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t)))); 176 struct serial_in_rdev *serial = NULL; 177 178 if (test_bit(CollisionCheck, &rdev->flags)) 179 return 0; 180 181 serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums, 182 GFP_KERNEL); 183 if (!serial) 184 return -ENOMEM; 185 186 for (i = 0; i < serial_nums; i++) { 187 struct serial_in_rdev *serial_tmp = &serial[i]; 188 189 spin_lock_init(&serial_tmp->serial_lock); 190 serial_tmp->serial_rb = RB_ROOT_CACHED; 191 init_waitqueue_head(&serial_tmp->serial_io_wait); 192 } 193 194 rdev->serial = serial; 195 set_bit(CollisionCheck, &rdev->flags); 196 197 return 0; 198 } 199 200 static int rdevs_init_serial(struct mddev *mddev) 201 { 202 struct md_rdev *rdev; 203 int ret = 0; 204 205 rdev_for_each(rdev, mddev) { 206 ret = rdev_init_serial(rdev); 207 if (ret) 208 break; 209 } 210 211 /* Free all resources if pool is not existed */ 212 if (ret && !mddev->serial_info_pool) 213 rdevs_uninit_serial(mddev); 214 215 return ret; 216 } 217 218 /* 219 * rdev needs to enable serial stuffs if it meets the conditions: 220 * 1. it is multi-queue device flaged with writemostly. 221 * 2. the write-behind mode is enabled. 
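 *
 * Illustrative example (not a requirement stated in this file): a RAID1
 * whose bitmap was created with a write-behind limit (e.g. mdadm's
 * --write-behind= option) and whose write-mostly member is a multi-queue
 * NVMe device meets both conditions, so overlapping write-behind writes to
 * that member are tracked in rdev->serial and serialized rather than being
 * reordered across hardware queues.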
222 */ 223 static int rdev_need_serial(struct md_rdev *rdev) 224 { 225 return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 && 226 rdev->bdev->bd_disk->queue->nr_hw_queues != 1 && 227 test_bit(WriteMostly, &rdev->flags)); 228 } 229 230 /* 231 * Init resource for rdev(s), then create serial_info_pool if: 232 * 1. rdev is the first device which return true from rdev_enable_serial. 233 * 2. rdev is NULL, means we want to enable serialization for all rdevs. 234 */ 235 void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev) 236 { 237 int ret = 0; 238 239 if (rdev && !rdev_need_serial(rdev) && 240 !test_bit(CollisionCheck, &rdev->flags)) 241 return; 242 243 if (!rdev) 244 ret = rdevs_init_serial(mddev); 245 else 246 ret = rdev_init_serial(rdev); 247 if (ret) 248 return; 249 250 if (mddev->serial_info_pool == NULL) { 251 /* 252 * already in memalloc noio context by 253 * mddev_suspend() 254 */ 255 mddev->serial_info_pool = 256 mempool_create_kmalloc_pool(NR_SERIAL_INFOS, 257 sizeof(struct serial_info)); 258 if (!mddev->serial_info_pool) { 259 rdevs_uninit_serial(mddev); 260 pr_err("can't alloc memory pool for serialization\n"); 261 } 262 } 263 } 264 265 /* 266 * Free resource from rdev(s), and destroy serial_info_pool under conditions: 267 * 1. rdev is the last device flaged with CollisionCheck. 268 * 2. when bitmap is destroyed while policy is not enabled. 269 * 3. for disable policy, the pool is destroyed only when no rdev needs it. 270 */ 271 void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) 272 { 273 if (rdev && !test_bit(CollisionCheck, &rdev->flags)) 274 return; 275 276 if (mddev->serial_info_pool) { 277 struct md_rdev *temp; 278 int num = 0; /* used to track if other rdevs need the pool */ 279 280 rdev_for_each(temp, mddev) { 281 if (!rdev) { 282 if (!mddev->serialize_policy || 283 !rdev_need_serial(temp)) 284 rdev_uninit_serial(temp); 285 else 286 num++; 287 } else if (temp != rdev && 288 test_bit(CollisionCheck, &temp->flags)) 289 num++; 290 } 291 292 if (rdev) 293 rdev_uninit_serial(rdev); 294 295 if (num) 296 pr_info("The mempool could be used by other devices\n"); 297 else { 298 mempool_destroy(mddev->serial_info_pool); 299 mddev->serial_info_pool = NULL; 300 } 301 } 302 } 303 304 static struct ctl_table_header *raid_table_header; 305 306 static const struct ctl_table raid_table[] = { 307 { 308 .procname = "speed_limit_min", 309 .data = &sysctl_speed_limit_min, 310 .maxlen = sizeof(int), 311 .mode = 0644, 312 .proc_handler = proc_dointvec, 313 }, 314 { 315 .procname = "speed_limit_max", 316 .data = &sysctl_speed_limit_max, 317 .maxlen = sizeof(int), 318 .mode = 0644, 319 .proc_handler = proc_dointvec, 320 }, 321 { 322 .procname = "sync_io_depth", 323 .data = &sysctl_sync_io_depth, 324 .maxlen = sizeof(int), 325 .mode = 0644, 326 .proc_handler = proc_dointvec, 327 }, 328 }; 329 330 static int start_readonly; 331 332 /* 333 * The original mechanism for creating an md device is to create 334 * a device node in /dev and to open it. This causes races with device-close. 335 * The preferred method is to write to the "new_array" module parameter. 336 * This can avoid races. 337 * Setting create_on_open to false disables the original mechanism 338 * so all the races disappear. 
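 *
 * Illustrative usage of the preferred mechanism (assuming the default
 * md_mod parameter paths):
 *
 *	echo md_test > /sys/module/md_mod/parameters/new_array
 *
 * creates the array device up front, so later assembly via ioctl/sysfs
 * does not race with open()/close() of a bare device node in /dev.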
339 */ 340 static bool create_on_open = true; 341 static bool legacy_async_del_gendisk = true; 342 343 /* 344 * We have a system wide 'event count' that is incremented 345 * on any 'interesting' event, and readers of /proc/mdstat 346 * can use 'poll' or 'select' to find out when the event 347 * count increases. 348 * 349 * Events are: 350 * start array, stop array, error, add device, remove device, 351 * start build, activate spare 352 */ 353 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); 354 static atomic_t md_event_count; 355 void md_new_event(void) 356 { 357 atomic_inc(&md_event_count); 358 wake_up(&md_event_waiters); 359 } 360 EXPORT_SYMBOL_GPL(md_new_event); 361 362 /* 363 * Enables to iterate over all existing md arrays 364 * all_mddevs_lock protects this list. 365 */ 366 static LIST_HEAD(all_mddevs); 367 static DEFINE_SPINLOCK(all_mddevs_lock); 368 369 static bool is_md_suspended(struct mddev *mddev) 370 { 371 return percpu_ref_is_dying(&mddev->active_io); 372 } 373 /* Rather than calling directly into the personality make_request function, 374 * IO requests come here first so that we can check if the device is 375 * being suspended pending a reconfiguration. 376 * We hold a refcount over the call to ->make_request. By the time that 377 * call has finished, the bio has been linked into some internal structure 378 * and so is visible to ->quiesce(), so we don't need the refcount any more. 379 */ 380 static bool is_suspended(struct mddev *mddev, struct bio *bio) 381 { 382 if (is_md_suspended(mddev)) 383 return true; 384 if (bio_data_dir(bio) != WRITE) 385 return false; 386 if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi)) 387 return false; 388 if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi)) 389 return false; 390 if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo)) 391 return false; 392 return true; 393 } 394 395 bool md_handle_request(struct mddev *mddev, struct bio *bio) 396 { 397 check_suspended: 398 if (is_suspended(mddev, bio)) { 399 DEFINE_WAIT(__wait); 400 /* Bail out if REQ_NOWAIT is set for the bio */ 401 if (bio->bi_opf & REQ_NOWAIT) { 402 bio_wouldblock_error(bio); 403 return true; 404 } 405 for (;;) { 406 prepare_to_wait(&mddev->sb_wait, &__wait, 407 TASK_UNINTERRUPTIBLE); 408 if (!is_suspended(mddev, bio)) 409 break; 410 schedule(); 411 } 412 finish_wait(&mddev->sb_wait, &__wait); 413 } 414 if (!percpu_ref_tryget_live(&mddev->active_io)) 415 goto check_suspended; 416 417 if (!mddev->pers->make_request(mddev, bio)) { 418 percpu_ref_put(&mddev->active_io); 419 if (!mddev->gendisk && mddev->pers->prepare_suspend) 420 return false; 421 goto check_suspended; 422 } 423 424 percpu_ref_put(&mddev->active_io); 425 return true; 426 } 427 EXPORT_SYMBOL(md_handle_request); 428 429 static void md_submit_bio(struct bio *bio) 430 { 431 const int rw = bio_data_dir(bio); 432 struct mddev *mddev = bio->bi_bdev->bd_disk->private_data; 433 434 if (mddev == NULL || mddev->pers == NULL) { 435 bio_io_error(bio); 436 return; 437 } 438 439 if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) { 440 bio_io_error(bio); 441 return; 442 } 443 444 bio = bio_split_to_limits(bio); 445 if (!bio) 446 return; 447 448 if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) { 449 if (bio_sectors(bio) != 0) 450 bio->bi_status = BLK_STS_IOERR; 451 bio_endio(bio); 452 return; 453 } 454 455 /* bio could be mergeable after passing to underlayer */ 456 bio->bi_opf &= ~REQ_NOMERGE; 457 458 md_handle_request(mddev, bio); 459 } 460 461 /* 462 * Make sure no new requests are 
submitted to the device, and any requests that 463 * have been submitted are completely handled. 464 */ 465 int mddev_suspend(struct mddev *mddev, bool interruptible) 466 { 467 int err = 0; 468 469 /* 470 * hold reconfig_mutex to wait for normal io will deadlock, because 471 * other context can't update super_block, and normal io can rely on 472 * updating super_block. 473 */ 474 lockdep_assert_not_held(&mddev->reconfig_mutex); 475 476 if (interruptible) 477 err = mutex_lock_interruptible(&mddev->suspend_mutex); 478 else 479 mutex_lock(&mddev->suspend_mutex); 480 if (err) 481 return err; 482 483 if (mddev->suspended) { 484 WRITE_ONCE(mddev->suspended, mddev->suspended + 1); 485 mutex_unlock(&mddev->suspend_mutex); 486 return 0; 487 } 488 489 percpu_ref_kill(&mddev->active_io); 490 if (interruptible) 491 err = wait_event_interruptible(mddev->sb_wait, 492 percpu_ref_is_zero(&mddev->active_io)); 493 else 494 wait_event(mddev->sb_wait, 495 percpu_ref_is_zero(&mddev->active_io)); 496 if (err) { 497 percpu_ref_resurrect(&mddev->active_io); 498 mutex_unlock(&mddev->suspend_mutex); 499 return err; 500 } 501 502 /* 503 * For raid456, io might be waiting for reshape to make progress, 504 * allow new reshape to start while waiting for io to be done to 505 * prevent deadlock. 506 */ 507 WRITE_ONCE(mddev->suspended, mddev->suspended + 1); 508 509 /* restrict memory reclaim I/O during raid array is suspend */ 510 mddev->noio_flag = memalloc_noio_save(); 511 512 mutex_unlock(&mddev->suspend_mutex); 513 return 0; 514 } 515 EXPORT_SYMBOL_GPL(mddev_suspend); 516 517 static void __mddev_resume(struct mddev *mddev, bool recovery_needed) 518 { 519 lockdep_assert_not_held(&mddev->reconfig_mutex); 520 521 mutex_lock(&mddev->suspend_mutex); 522 WRITE_ONCE(mddev->suspended, mddev->suspended - 1); 523 if (mddev->suspended) { 524 mutex_unlock(&mddev->suspend_mutex); 525 return; 526 } 527 528 /* entred the memalloc scope from mddev_suspend() */ 529 memalloc_noio_restore(mddev->noio_flag); 530 531 percpu_ref_resurrect(&mddev->active_io); 532 wake_up(&mddev->sb_wait); 533 534 if (recovery_needed) 535 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 536 md_wakeup_thread(mddev->thread); 537 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 538 539 mutex_unlock(&mddev->suspend_mutex); 540 } 541 542 void mddev_resume(struct mddev *mddev) 543 { 544 return __mddev_resume(mddev, true); 545 } 546 EXPORT_SYMBOL_GPL(mddev_resume); 547 548 /* sync bdev before setting device to readonly or stopping raid*/ 549 static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_num) 550 { 551 mutex_lock(&mddev->open_mutex); 552 if (mddev->pers && atomic_read(&mddev->openers) > opener_num) { 553 mutex_unlock(&mddev->open_mutex); 554 return -EBUSY; 555 } 556 if (test_and_set_bit(MD_CLOSING, &mddev->flags)) { 557 mutex_unlock(&mddev->open_mutex); 558 return -EBUSY; 559 } 560 mutex_unlock(&mddev->open_mutex); 561 562 sync_blockdev(mddev->gendisk->part0); 563 return 0; 564 } 565 566 /* 567 * The only difference from bio_chain_endio() is that the current 568 * bi_status of bio does not affect the bi_status of parent. 569 */ 570 static void md_end_flush(struct bio *bio) 571 { 572 struct bio *parent = bio->bi_private; 573 574 /* 575 * If any flush io error before the power failure, 576 * disk data may be lost. 
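 *
 * (A failed PREFLUSH only means that writes the device already acknowledged
 * may still sit in its volatile cache; that is harmful only if power is
 * lost before the cache is written back, so the failure is logged below
 * rather than propagated to the parent's bi_status.)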
577 */ 578 if (bio->bi_status) 579 pr_err("md: %pg flush io error %d\n", bio->bi_bdev, 580 blk_status_to_errno(bio->bi_status)); 581 582 bio_put(bio); 583 bio_endio(parent); 584 } 585 586 bool md_flush_request(struct mddev *mddev, struct bio *bio) 587 { 588 struct md_rdev *rdev; 589 struct bio *new; 590 591 /* 592 * md_flush_reqeust() should be called under md_handle_request() and 593 * 'active_io' is already grabbed. Hence it's safe to get rdev directly 594 * without rcu protection. 595 */ 596 WARN_ON(percpu_ref_is_zero(&mddev->active_io)); 597 598 rdev_for_each(rdev, mddev) { 599 if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) 600 continue; 601 602 new = bio_alloc_bioset(rdev->bdev, 0, 603 REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO, 604 &mddev->bio_set); 605 new->bi_private = bio; 606 new->bi_end_io = md_end_flush; 607 bio_inc_remaining(bio); 608 submit_bio(new); 609 } 610 611 if (bio_sectors(bio) == 0) { 612 bio_endio(bio); 613 return true; 614 } 615 616 bio->bi_opf &= ~REQ_PREFLUSH; 617 return false; 618 } 619 EXPORT_SYMBOL(md_flush_request); 620 621 static inline struct mddev *mddev_get(struct mddev *mddev) 622 { 623 lockdep_assert_held(&all_mddevs_lock); 624 625 if (test_bit(MD_DELETED, &mddev->flags)) 626 return NULL; 627 atomic_inc(&mddev->active); 628 return mddev; 629 } 630 631 static void mddev_delayed_delete(struct work_struct *ws); 632 633 static void __mddev_put(struct mddev *mddev) 634 { 635 if (mddev->raid_disks || !list_empty(&mddev->disks) || 636 mddev->ctime || mddev->hold_active) 637 return; 638 639 /* 640 * If array is freed by stopping array, MD_DELETED is set by 641 * do_md_stop(), MD_DELETED is still set here in case mddev is freed 642 * directly by closing a mddev that is created by create_on_open. 643 */ 644 set_bit(MD_DELETED, &mddev->flags); 645 /* 646 * Call queue_work inside the spinlock so that flush_workqueue() after 647 * mddev_find will succeed in waiting for the work to be done. 
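 *
 * (i.e. the work is already queued by the time all_mddevs_lock is dropped,
 * so a caller that scans all_mddevs under the lock and then flushes
 * md_misc_wq is guaranteed to wait for this deletion work as well.)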
648 */ 649 queue_work(md_misc_wq, &mddev->del_work); 650 } 651 652 static void mddev_put_locked(struct mddev *mddev) 653 { 654 if (atomic_dec_and_test(&mddev->active)) 655 __mddev_put(mddev); 656 } 657 658 void mddev_put(struct mddev *mddev) 659 { 660 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 661 return; 662 663 __mddev_put(mddev); 664 spin_unlock(&all_mddevs_lock); 665 } 666 667 static void md_safemode_timeout(struct timer_list *t); 668 static void md_start_sync(struct work_struct *ws); 669 670 static void active_io_release(struct percpu_ref *ref) 671 { 672 struct mddev *mddev = container_of(ref, struct mddev, active_io); 673 674 wake_up(&mddev->sb_wait); 675 } 676 677 static void no_op(struct percpu_ref *r) {} 678 679 static bool mddev_set_bitmap_ops(struct mddev *mddev) 680 { 681 struct bitmap_operations *old = mddev->bitmap_ops; 682 struct md_submodule_head *head; 683 684 if (mddev->bitmap_id == ID_BITMAP_NONE || 685 (old && old->head.id == mddev->bitmap_id)) 686 return true; 687 688 xa_lock(&md_submodule); 689 head = xa_load(&md_submodule, mddev->bitmap_id); 690 691 if (!head) { 692 pr_warn("md: can't find bitmap id %d\n", mddev->bitmap_id); 693 goto err; 694 } 695 696 if (head->type != MD_BITMAP) { 697 pr_warn("md: invalid bitmap id %d\n", mddev->bitmap_id); 698 goto err; 699 } 700 701 mddev->bitmap_ops = (void *)head; 702 xa_unlock(&md_submodule); 703 704 if (!mddev_is_dm(mddev) && mddev->bitmap_ops->group) { 705 if (sysfs_create_group(&mddev->kobj, mddev->bitmap_ops->group)) 706 pr_warn("md: cannot register extra bitmap attributes for %s\n", 707 mdname(mddev)); 708 else 709 /* 710 * Inform user with KOBJ_CHANGE about new bitmap 711 * attributes. 712 */ 713 kobject_uevent(&mddev->kobj, KOBJ_CHANGE); 714 } 715 return true; 716 717 err: 718 xa_unlock(&md_submodule); 719 return false; 720 } 721 722 static void mddev_clear_bitmap_ops(struct mddev *mddev) 723 { 724 if (!mddev_is_dm(mddev) && mddev->bitmap_ops && 725 mddev->bitmap_ops->group) 726 sysfs_remove_group(&mddev->kobj, mddev->bitmap_ops->group); 727 728 mddev->bitmap_ops = NULL; 729 } 730 731 int mddev_init(struct mddev *mddev) 732 { 733 if (!IS_ENABLED(CONFIG_MD_BITMAP)) 734 mddev->bitmap_id = ID_BITMAP_NONE; 735 else 736 mddev->bitmap_id = ID_BITMAP; 737 738 if (percpu_ref_init(&mddev->active_io, active_io_release, 739 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) 740 return -ENOMEM; 741 742 if (percpu_ref_init(&mddev->writes_pending, no_op, 743 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { 744 percpu_ref_exit(&mddev->active_io); 745 return -ENOMEM; 746 } 747 748 /* We want to start with the refcount at zero */ 749 percpu_ref_put(&mddev->writes_pending); 750 751 mutex_init(&mddev->open_mutex); 752 mutex_init(&mddev->reconfig_mutex); 753 mutex_init(&mddev->suspend_mutex); 754 mutex_init(&mddev->bitmap_info.mutex); 755 INIT_LIST_HEAD(&mddev->disks); 756 INIT_LIST_HEAD(&mddev->all_mddevs); 757 INIT_LIST_HEAD(&mddev->deleting); 758 timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); 759 atomic_set(&mddev->active, 1); 760 atomic_set(&mddev->openers, 0); 761 atomic_set(&mddev->sync_seq, 0); 762 spin_lock_init(&mddev->lock); 763 init_waitqueue_head(&mddev->sb_wait); 764 init_waitqueue_head(&mddev->recovery_wait); 765 mddev->reshape_position = MaxSector; 766 mddev->reshape_backwards = 0; 767 mddev->last_sync_action = ACTION_IDLE; 768 mddev->resync_min = 0; 769 mddev->resync_max = MaxSector; 770 mddev->level = LEVEL_NONE; 771 772 INIT_WORK(&mddev->sync_work, md_start_sync); 773 INIT_WORK(&mddev->del_work, 
mddev_delayed_delete); 774 775 return 0; 776 } 777 EXPORT_SYMBOL_GPL(mddev_init); 778 779 void mddev_destroy(struct mddev *mddev) 780 { 781 percpu_ref_exit(&mddev->active_io); 782 percpu_ref_exit(&mddev->writes_pending); 783 } 784 EXPORT_SYMBOL_GPL(mddev_destroy); 785 786 static struct mddev *mddev_find_locked(dev_t unit) 787 { 788 struct mddev *mddev; 789 790 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 791 if (mddev->unit == unit) 792 return mddev; 793 794 return NULL; 795 } 796 797 /* find an unused unit number */ 798 static dev_t mddev_alloc_unit(void) 799 { 800 static int next_minor = 512; 801 int start = next_minor; 802 bool is_free = 0; 803 dev_t dev = 0; 804 805 while (!is_free) { 806 dev = MKDEV(MD_MAJOR, next_minor); 807 next_minor++; 808 if (next_minor > MINORMASK) 809 next_minor = 0; 810 if (next_minor == start) 811 return 0; /* Oh dear, all in use. */ 812 is_free = !mddev_find_locked(dev); 813 } 814 815 return dev; 816 } 817 818 static struct mddev *mddev_alloc(dev_t unit) 819 { 820 struct mddev *new; 821 int error; 822 823 if (unit && MAJOR(unit) != MD_MAJOR) 824 unit &= ~((1 << MdpMinorShift) - 1); 825 826 new = kzalloc(sizeof(*new), GFP_KERNEL); 827 if (!new) 828 return ERR_PTR(-ENOMEM); 829 830 error = mddev_init(new); 831 if (error) 832 goto out_free_new; 833 834 spin_lock(&all_mddevs_lock); 835 if (unit) { 836 error = -EEXIST; 837 if (mddev_find_locked(unit)) 838 goto out_destroy_new; 839 new->unit = unit; 840 if (MAJOR(unit) == MD_MAJOR) 841 new->md_minor = MINOR(unit); 842 else 843 new->md_minor = MINOR(unit) >> MdpMinorShift; 844 new->hold_active = UNTIL_IOCTL; 845 } else { 846 error = -ENODEV; 847 new->unit = mddev_alloc_unit(); 848 if (!new->unit) 849 goto out_destroy_new; 850 new->md_minor = MINOR(new->unit); 851 new->hold_active = UNTIL_STOP; 852 } 853 854 list_add(&new->all_mddevs, &all_mddevs); 855 spin_unlock(&all_mddevs_lock); 856 return new; 857 858 out_destroy_new: 859 spin_unlock(&all_mddevs_lock); 860 mddev_destroy(new); 861 out_free_new: 862 kfree(new); 863 return ERR_PTR(error); 864 } 865 866 static void mddev_free(struct mddev *mddev) 867 { 868 spin_lock(&all_mddevs_lock); 869 list_del(&mddev->all_mddevs); 870 spin_unlock(&all_mddevs_lock); 871 872 mddev_destroy(mddev); 873 kfree(mddev); 874 } 875 876 static const struct attribute_group md_redundancy_group; 877 878 void mddev_unlock(struct mddev *mddev) 879 { 880 struct md_rdev *rdev; 881 struct md_rdev *tmp; 882 LIST_HEAD(delete); 883 884 if (!list_empty(&mddev->deleting)) 885 list_splice_init(&mddev->deleting, &delete); 886 887 if (mddev->to_remove) { 888 /* These cannot be removed under reconfig_mutex as 889 * an access to the files will try to take reconfig_mutex 890 * while holding the file unremovable, which leads to 891 * a deadlock. 892 * So hold set sysfs_active while the remove in happeing, 893 * and anything else which might set ->to_remove or my 894 * otherwise change the sysfs namespace will fail with 895 * -EBUSY if sysfs_active is still set. 896 * We set sysfs_active under reconfig_mutex and elsewhere 897 * test it under the same mutex to ensure its correct value 898 * is seen. 
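 *
 * For example: a concurrent write to the "level" attribute, which would
 * re-create personality-specific attribute groups, sees sysfs_active set
 * and backs off with -EBUSY instead of racing with the removal below.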
899 */ 900 const struct attribute_group *to_remove = mddev->to_remove; 901 mddev->to_remove = NULL; 902 mddev->sysfs_active = 1; 903 mutex_unlock(&mddev->reconfig_mutex); 904 905 if (mddev->kobj.sd) { 906 if (to_remove != &md_redundancy_group) 907 sysfs_remove_group(&mddev->kobj, to_remove); 908 if (mddev->pers == NULL || 909 mddev->pers->sync_request == NULL) { 910 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 911 if (mddev->sysfs_action) 912 sysfs_put(mddev->sysfs_action); 913 if (mddev->sysfs_completed) 914 sysfs_put(mddev->sysfs_completed); 915 if (mddev->sysfs_degraded) 916 sysfs_put(mddev->sysfs_degraded); 917 mddev->sysfs_action = NULL; 918 mddev->sysfs_completed = NULL; 919 mddev->sysfs_degraded = NULL; 920 } 921 } 922 mddev->sysfs_active = 0; 923 } else 924 mutex_unlock(&mddev->reconfig_mutex); 925 926 md_wakeup_thread(mddev->thread); 927 wake_up(&mddev->sb_wait); 928 929 list_for_each_entry_safe(rdev, tmp, &delete, same_set) { 930 list_del_init(&rdev->same_set); 931 kobject_del(&rdev->kobj); 932 export_rdev(rdev, mddev); 933 } 934 935 if (!legacy_async_del_gendisk) { 936 /* 937 * Call del_gendisk after release reconfig_mutex to avoid 938 * deadlock (e.g. call del_gendisk under the lock and an 939 * access to sysfs files waits the lock) 940 * And MD_DELETED is only used for md raid which is set in 941 * do_md_stop. dm raid only uses md_stop to stop. So dm raid 942 * doesn't need to check MD_DELETED when getting reconfig lock 943 */ 944 if (test_bit(MD_DELETED, &mddev->flags)) 945 del_gendisk(mddev->gendisk); 946 } 947 } 948 EXPORT_SYMBOL_GPL(mddev_unlock); 949 950 struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr) 951 { 952 struct md_rdev *rdev; 953 954 rdev_for_each_rcu(rdev, mddev) 955 if (rdev->desc_nr == nr) 956 return rdev; 957 958 return NULL; 959 } 960 EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu); 961 962 static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) 963 { 964 struct md_rdev *rdev; 965 966 rdev_for_each(rdev, mddev) 967 if (rdev->bdev->bd_dev == dev) 968 return rdev; 969 970 return NULL; 971 } 972 973 struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev) 974 { 975 struct md_rdev *rdev; 976 977 rdev_for_each_rcu(rdev, mddev) 978 if (rdev->bdev->bd_dev == dev) 979 return rdev; 980 981 return NULL; 982 } 983 EXPORT_SYMBOL_GPL(md_find_rdev_rcu); 984 985 static struct md_personality *get_pers(int level, char *clevel) 986 { 987 struct md_personality *ret = NULL; 988 struct md_submodule_head *head; 989 unsigned long i; 990 991 xa_lock(&md_submodule); 992 xa_for_each(&md_submodule, i, head) { 993 if (head->type != MD_PERSONALITY) 994 continue; 995 if ((level != LEVEL_NONE && head->id == level) || 996 !strcmp(head->name, clevel)) { 997 if (try_module_get(head->owner)) 998 ret = (void *)head; 999 break; 1000 } 1001 } 1002 xa_unlock(&md_submodule); 1003 1004 if (!ret) { 1005 if (level != LEVEL_NONE) 1006 pr_warn("md: personality for level %d is not loaded!\n", 1007 level); 1008 else 1009 pr_warn("md: personality for level %s is not loaded!\n", 1010 clevel); 1011 } 1012 1013 return ret; 1014 } 1015 1016 static void put_pers(struct md_personality *pers) 1017 { 1018 module_put(pers->head.owner); 1019 } 1020 1021 /* return the offset of the super block in 512byte sectors */ 1022 static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) 1023 { 1024 return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev)); 1025 } 1026 1027 static int alloc_disk_sb(struct md_rdev *rdev) 1028 { 1029 rdev->sb_page = alloc_page(GFP_KERNEL); 1030 if 
(!rdev->sb_page) 1031 return -ENOMEM; 1032 return 0; 1033 } 1034 1035 void md_rdev_clear(struct md_rdev *rdev) 1036 { 1037 if (rdev->sb_page) { 1038 put_page(rdev->sb_page); 1039 rdev->sb_loaded = 0; 1040 rdev->sb_page = NULL; 1041 rdev->sb_start = 0; 1042 rdev->sectors = 0; 1043 } 1044 if (rdev->bb_page) { 1045 put_page(rdev->bb_page); 1046 rdev->bb_page = NULL; 1047 } 1048 badblocks_exit(&rdev->badblocks); 1049 } 1050 EXPORT_SYMBOL_GPL(md_rdev_clear); 1051 1052 static void super_written(struct bio *bio) 1053 { 1054 struct md_rdev *rdev = bio->bi_private; 1055 struct mddev *mddev = rdev->mddev; 1056 1057 if (bio->bi_status) { 1058 pr_err("md: %s gets error=%d\n", __func__, 1059 blk_status_to_errno(bio->bi_status)); 1060 md_error(mddev, rdev); 1061 if (!test_bit(Faulty, &rdev->flags) 1062 && (bio->bi_opf & MD_FAILFAST)) { 1063 set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags); 1064 set_bit(LastDev, &rdev->flags); 1065 } 1066 } else 1067 clear_bit(LastDev, &rdev->flags); 1068 1069 bio_put(bio); 1070 1071 rdev_dec_pending(rdev, mddev); 1072 1073 if (atomic_dec_and_test(&mddev->pending_writes)) 1074 wake_up(&mddev->sb_wait); 1075 } 1076 1077 /** 1078 * md_write_metadata - write metadata to underlying disk, including 1079 * array superblock, badblocks, bitmap superblock and bitmap bits. 1080 * @mddev: the array to write 1081 * @rdev: the underlying disk to write 1082 * @sector: the offset to @rdev 1083 * @size: the length of the metadata 1084 * @page: the metadata 1085 * @offset: the offset to @page 1086 * 1087 * Write @size bytes of @page start from @offset, to @sector of @rdev, Increment 1088 * mddev->pending_writes before returning, and decrement it on completion, 1089 * waking up sb_wait. Caller must call md_super_wait() after issuing io to all 1090 * rdev. If an error occurred, md_error() will be called, and the @rdev will be 1091 * kicked out from @mddev. 1092 */ 1093 void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev, 1094 sector_t sector, int size, struct page *page, 1095 unsigned int offset) 1096 { 1097 struct bio *bio; 1098 1099 if (!page) 1100 return; 1101 1102 if (test_bit(Faulty, &rdev->flags)) 1103 return; 1104 1105 bio = bio_alloc_bioset(rdev->meta_bdev ? 
rdev->meta_bdev : rdev->bdev, 1106 1, 1107 REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_META 1108 | REQ_PREFLUSH | REQ_FUA, 1109 GFP_NOIO, &mddev->sync_set); 1110 1111 atomic_inc(&rdev->nr_pending); 1112 1113 bio->bi_iter.bi_sector = sector; 1114 __bio_add_page(bio, page, size, offset); 1115 bio->bi_private = rdev; 1116 bio->bi_end_io = super_written; 1117 1118 if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) && 1119 test_bit(FailFast, &rdev->flags) && 1120 !test_bit(LastDev, &rdev->flags)) 1121 bio->bi_opf |= MD_FAILFAST; 1122 1123 atomic_inc(&mddev->pending_writes); 1124 submit_bio(bio); 1125 } 1126 1127 int md_super_wait(struct mddev *mddev) 1128 { 1129 /* wait for all superblock writes that were scheduled to complete */ 1130 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); 1131 if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags)) 1132 return -EAGAIN; 1133 return 0; 1134 } 1135 1136 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, 1137 struct page *page, blk_opf_t opf, bool metadata_op) 1138 { 1139 struct bio bio; 1140 struct bio_vec bvec; 1141 1142 if (metadata_op && rdev->meta_bdev) 1143 bio_init(&bio, rdev->meta_bdev, &bvec, 1, opf); 1144 else 1145 bio_init(&bio, rdev->bdev, &bvec, 1, opf); 1146 1147 if (metadata_op) 1148 bio.bi_iter.bi_sector = sector + rdev->sb_start; 1149 else if (rdev->mddev->reshape_position != MaxSector && 1150 (rdev->mddev->reshape_backwards == 1151 (sector >= rdev->mddev->reshape_position))) 1152 bio.bi_iter.bi_sector = sector + rdev->new_data_offset; 1153 else 1154 bio.bi_iter.bi_sector = sector + rdev->data_offset; 1155 __bio_add_page(&bio, page, size, 0); 1156 1157 submit_bio_wait(&bio); 1158 1159 return !bio.bi_status; 1160 } 1161 EXPORT_SYMBOL_GPL(sync_page_io); 1162 1163 static int read_disk_sb(struct md_rdev *rdev, int size) 1164 { 1165 if (rdev->sb_loaded) 1166 return 0; 1167 1168 if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true)) 1169 goto fail; 1170 rdev->sb_loaded = 1; 1171 return 0; 1172 1173 fail: 1174 pr_err("md: disabled device %pg, could not read superblock.\n", 1175 rdev->bdev); 1176 return -EINVAL; 1177 } 1178 1179 static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 1180 { 1181 return sb1->set_uuid0 == sb2->set_uuid0 && 1182 sb1->set_uuid1 == sb2->set_uuid1 && 1183 sb1->set_uuid2 == sb2->set_uuid2 && 1184 sb1->set_uuid3 == sb2->set_uuid3; 1185 } 1186 1187 static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 1188 { 1189 int ret; 1190 mdp_super_t *tmp1, *tmp2; 1191 1192 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 1193 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 1194 1195 if (!tmp1 || !tmp2) { 1196 ret = 0; 1197 goto abort; 1198 } 1199 1200 *tmp1 = *sb1; 1201 *tmp2 = *sb2; 1202 1203 /* 1204 * nr_disks is not constant 1205 */ 1206 tmp1->nr_disks = 0; 1207 tmp2->nr_disks = 0; 1208 1209 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); 1210 abort: 1211 kfree(tmp1); 1212 kfree(tmp2); 1213 return ret; 1214 } 1215 1216 static u32 md_csum_fold(u32 csum) 1217 { 1218 csum = (csum & 0xffff) + (csum >> 16); 1219 return (csum & 0xffff) + (csum >> 16); 1220 } 1221 1222 static unsigned int calc_sb_csum(mdp_super_t *sb) 1223 { 1224 u64 newcsum = 0; 1225 u32 *sb32 = (u32*)sb; 1226 int i; 1227 unsigned int disk_csum, csum; 1228 1229 disk_csum = sb->sb_csum; 1230 sb->sb_csum = 0; 1231 1232 for (i = 0; i < MD_SB_BYTES/4 ; i++) 1233 newcsum += sb32[i]; 1234 csum = (newcsum & 0xffffffff) + (newcsum>>32); 1235 1236 #ifdef CONFIG_ALPHA 1237 /* This used to use csum_partial, which 
was wrong for several 1238 * reasons including that different results are returned on 1239 * different architectures. It isn't critical that we get exactly 1240 * the same return value as before (we always csum_fold before 1241 * testing, and that removes any differences). However as we 1242 * know that csum_partial always returned a 16bit value on 1243 * alphas, do a fold to maximise conformity to previous behaviour. 1244 */ 1245 sb->sb_csum = md_csum_fold(disk_csum); 1246 #else 1247 sb->sb_csum = disk_csum; 1248 #endif 1249 return csum; 1250 } 1251 1252 /* 1253 * Handle superblock details. 1254 * We want to be able to handle multiple superblock formats 1255 * so we have a common interface to them all, and an array of 1256 * different handlers. 1257 * We rely on user-space to write the initial superblock, and support 1258 * reading and updating of superblocks. 1259 * Interface methods are: 1260 * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) 1261 * loads and validates a superblock on dev. 1262 * if refdev != NULL, compare superblocks on both devices 1263 * Return: 1264 * 0 - dev has a superblock that is compatible with refdev 1265 * 1 - dev has a superblock that is compatible and newer than refdev 1266 * so dev should be used as the refdev in future 1267 * -EINVAL superblock incompatible or invalid 1268 * -othererror e.g. -EIO 1269 * 1270 * int validate_super(struct mddev *mddev, struct md_rdev *dev) 1271 * Verify that dev is acceptable into mddev. 1272 * The first time, mddev->raid_disks will be 0, and data from 1273 * dev should be merged in. Subsequent calls check that dev 1274 * is new enough. Return 0 or -EINVAL 1275 * 1276 * void sync_super(struct mddev *mddev, struct md_rdev *dev) 1277 * Update the superblock for rdev with data in mddev 1278 * This does not write to disc. 1279 * 1280 */ 1281 1282 struct super_type { 1283 char *name; 1284 struct module *owner; 1285 int (*load_super)(struct md_rdev *rdev, 1286 struct md_rdev *refdev, 1287 int minor_version); 1288 int (*validate_super)(struct mddev *mddev, 1289 struct md_rdev *freshest, 1290 struct md_rdev *rdev); 1291 void (*sync_super)(struct mddev *mddev, 1292 struct md_rdev *rdev); 1293 unsigned long long (*rdev_size_change)(struct md_rdev *rdev, 1294 sector_t num_sectors); 1295 int (*allow_new_offset)(struct md_rdev *rdev, 1296 unsigned long long new_offset); 1297 }; 1298 1299 /* 1300 * Check that the given mddev has no bitmap. 1301 * 1302 * This function is called from the run method of all personalities that do not 1303 * support bitmaps. It prints an error message and returns non-zero if mddev 1304 * has a bitmap. Otherwise, it returns 0. 1305 * 1306 */ 1307 int md_check_no_bitmap(struct mddev *mddev) 1308 { 1309 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) 1310 return 0; 1311 pr_warn("%s: bitmaps are not supported for %s\n", 1312 mdname(mddev), mddev->pers->head.name); 1313 return 1; 1314 } 1315 EXPORT_SYMBOL(md_check_no_bitmap); 1316 1317 /* 1318 * load_super for 0.90.0 1319 */ 1320 static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 1321 { 1322 mdp_super_t *sb; 1323 int ret; 1324 bool spare_disk = true; 1325 1326 /* 1327 * Calculate the position of the superblock (512byte sectors), 1328 * it's at the end of the disk. 1329 * 1330 * It also happens to be a multiple of 4Kb. 
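 *
 * Worked example, assuming the usual MD_RESERVED_SECTORS of 128 (64KiB)
 * from md_p.h: for a 1000000-sector component, MD_NEW_SIZE_SECTORS() in
 * calc_dev_sboffset() rounds down to a 64KiB boundary (999936) and steps
 * back one reserved block, placing the superblock at sector 999808, which
 * is 64KiB-aligned and lies between 64KiB and 128KiB from the end of the
 * device.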
1331 */ 1332 rdev->sb_start = calc_dev_sboffset(rdev); 1333 1334 ret = read_disk_sb(rdev, MD_SB_BYTES); 1335 if (ret) 1336 return ret; 1337 1338 ret = -EINVAL; 1339 1340 sb = page_address(rdev->sb_page); 1341 1342 if (sb->md_magic != MD_SB_MAGIC) { 1343 pr_warn("md: invalid raid superblock magic on %pg\n", 1344 rdev->bdev); 1345 goto abort; 1346 } 1347 1348 if (sb->major_version != 0 || 1349 sb->minor_version < 90 || 1350 sb->minor_version > 91) { 1351 pr_warn("Bad version number %d.%d on %pg\n", 1352 sb->major_version, sb->minor_version, rdev->bdev); 1353 goto abort; 1354 } 1355 1356 if (sb->raid_disks <= 0) 1357 goto abort; 1358 1359 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { 1360 pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev); 1361 goto abort; 1362 } 1363 1364 rdev->preferred_minor = sb->md_minor; 1365 rdev->data_offset = 0; 1366 rdev->new_data_offset = 0; 1367 rdev->sb_size = MD_SB_BYTES; 1368 rdev->badblocks.shift = -1; 1369 1370 rdev->desc_nr = sb->this_disk.number; 1371 1372 /* not spare disk */ 1373 if (rdev->desc_nr >= 0 && rdev->desc_nr < MD_SB_DISKS && 1374 sb->disks[rdev->desc_nr].state & ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) 1375 spare_disk = false; 1376 1377 if (!refdev) { 1378 if (!spare_disk) 1379 ret = 1; 1380 else 1381 ret = 0; 1382 } else { 1383 __u64 ev1, ev2; 1384 mdp_super_t *refsb = page_address(refdev->sb_page); 1385 if (!md_uuid_equal(refsb, sb)) { 1386 pr_warn("md: %pg has different UUID to %pg\n", 1387 rdev->bdev, refdev->bdev); 1388 goto abort; 1389 } 1390 if (!md_sb_equal(refsb, sb)) { 1391 pr_warn("md: %pg has same UUID but different superblock to %pg\n", 1392 rdev->bdev, refdev->bdev); 1393 goto abort; 1394 } 1395 ev1 = md_event(sb); 1396 ev2 = md_event(refsb); 1397 1398 if (!spare_disk && ev1 > ev2) 1399 ret = 1; 1400 else 1401 ret = 0; 1402 } 1403 rdev->sectors = rdev->sb_start; 1404 /* Limit to 4TB as metadata cannot record more than that. 1405 * (not needed for Linear and RAID0 as metadata doesn't 1406 * record this size) 1407 */ 1408 if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1) 1409 rdev->sectors = (sector_t)(2ULL << 32) - 2; 1410 1411 if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) 1412 /* "this cannot possibly happen" ... 
*/ 1413 ret = -EINVAL; 1414 1415 abort: 1416 return ret; 1417 } 1418 1419 static u64 md_bitmap_events_cleared(struct mddev *mddev) 1420 { 1421 struct md_bitmap_stats stats; 1422 int err; 1423 1424 if (!md_bitmap_enabled(mddev, false)) 1425 return 0; 1426 1427 err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 1428 if (err) 1429 return 0; 1430 1431 return stats.events_cleared; 1432 } 1433 1434 /* 1435 * validate_super for 0.90.0 1436 * note: we are not using "freshest" for 0.9 superblock 1437 */ 1438 static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) 1439 { 1440 mdp_disk_t *desc; 1441 mdp_super_t *sb = page_address(rdev->sb_page); 1442 __u64 ev1 = md_event(sb); 1443 1444 rdev->raid_disk = -1; 1445 clear_bit(Faulty, &rdev->flags); 1446 clear_bit(In_sync, &rdev->flags); 1447 clear_bit(Bitmap_sync, &rdev->flags); 1448 clear_bit(WriteMostly, &rdev->flags); 1449 1450 if (mddev->raid_disks == 0) { 1451 mddev->major_version = 0; 1452 mddev->minor_version = sb->minor_version; 1453 mddev->patch_version = sb->patch_version; 1454 mddev->external = 0; 1455 mddev->chunk_sectors = sb->chunk_size >> 9; 1456 mddev->ctime = sb->ctime; 1457 mddev->utime = sb->utime; 1458 mddev->level = sb->level; 1459 mddev->clevel[0] = 0; 1460 mddev->layout = sb->layout; 1461 mddev->raid_disks = sb->raid_disks; 1462 mddev->dev_sectors = ((sector_t)sb->size) * 2; 1463 mddev->events = ev1; 1464 mddev->bitmap_info.offset = 0; 1465 mddev->bitmap_info.space = 0; 1466 /* bitmap can use 60 K after the 4K superblocks */ 1467 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 1468 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 1469 mddev->reshape_backwards = 0; 1470 1471 if (mddev->minor_version >= 91) { 1472 mddev->reshape_position = sb->reshape_position; 1473 mddev->delta_disks = sb->delta_disks; 1474 mddev->new_level = sb->new_level; 1475 mddev->new_layout = sb->new_layout; 1476 mddev->new_chunk_sectors = sb->new_chunk >> 9; 1477 if (mddev->delta_disks < 0) 1478 mddev->reshape_backwards = 1; 1479 } else { 1480 mddev->reshape_position = MaxSector; 1481 mddev->delta_disks = 0; 1482 mddev->new_level = mddev->level; 1483 mddev->new_layout = mddev->layout; 1484 mddev->new_chunk_sectors = mddev->chunk_sectors; 1485 } 1486 if (mddev->level == 0) 1487 mddev->layout = -1; 1488 1489 if (sb->state & (1<<MD_SB_CLEAN)) 1490 mddev->resync_offset = MaxSector; 1491 else { 1492 if (sb->events_hi == sb->cp_events_hi && 1493 sb->events_lo == sb->cp_events_lo) { 1494 mddev->resync_offset = sb->recovery_cp; 1495 } else 1496 mddev->resync_offset = 0; 1497 } 1498 1499 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 1500 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 1501 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 1502 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 1503 1504 mddev->max_disks = MD_SB_DISKS; 1505 1506 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 1507 mddev->bitmap_info.file == NULL) { 1508 mddev->bitmap_info.offset = 1509 mddev->bitmap_info.default_offset; 1510 mddev->bitmap_info.space = 1511 mddev->bitmap_info.default_space; 1512 } 1513 1514 } else if (mddev->pers == NULL) { 1515 /* Insist on good event counter while assembling, except 1516 * for spares (which don't need an event count) */ 1517 ++ev1; 1518 if (sb->disks[rdev->desc_nr].state & ( 1519 (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) 1520 if (ev1 < mddev->events) 1521 return -EINVAL; 1522 } else if (mddev->bitmap) { 1523 /* if adding to array with a bitmap, then we can accept an 1524 * older device ... but not too old. 
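 *
 * "Too old" means older than the bitmap's events_cleared: in that case the
 * bitmap no longer covers everything the device missed, so it is treated
 * as a plain hot-add below (raid_disk stays -1, full recovery). If it is
 * merely behind mddev->events, Bitmap_sync is set so only the regions
 * recorded in the bitmap need to be resynced.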
1525 */ 1526 if (ev1 < md_bitmap_events_cleared(mddev)) 1527 return 0; 1528 if (ev1 < mddev->events) 1529 set_bit(Bitmap_sync, &rdev->flags); 1530 } else { 1531 if (ev1 < mddev->events) 1532 /* just a hot-add of a new device, leave raid_disk at -1 */ 1533 return 0; 1534 } 1535 1536 desc = sb->disks + rdev->desc_nr; 1537 1538 if (desc->state & (1<<MD_DISK_FAULTY)) 1539 set_bit(Faulty, &rdev->flags); 1540 else if (desc->state & (1<<MD_DISK_SYNC)) { 1541 set_bit(In_sync, &rdev->flags); 1542 rdev->raid_disk = desc->raid_disk; 1543 rdev->saved_raid_disk = desc->raid_disk; 1544 } else if (desc->state & (1<<MD_DISK_ACTIVE)) { 1545 /* active but not in sync implies recovery up to 1546 * reshape position. We don't know exactly where 1547 * that is, so set to zero for now 1548 */ 1549 if (mddev->minor_version >= 91) { 1550 rdev->recovery_offset = 0; 1551 rdev->raid_disk = desc->raid_disk; 1552 } 1553 } 1554 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 1555 set_bit(WriteMostly, &rdev->flags); 1556 if (desc->state & (1<<MD_DISK_FAILFAST)) 1557 set_bit(FailFast, &rdev->flags); 1558 return 0; 1559 } 1560 1561 /* 1562 * sync_super for 0.90.0 1563 */ 1564 static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) 1565 { 1566 mdp_super_t *sb; 1567 struct md_rdev *rdev2; 1568 int next_spare = mddev->raid_disks; 1569 1570 /* make rdev->sb match mddev data.. 1571 * 1572 * 1/ zero out disks 1573 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 1574 * 3/ any empty disks < next_spare become removed 1575 * 1576 * disks[0] gets initialised to REMOVED because 1577 * we cannot be sure from other fields if it has 1578 * been initialised or not. 1579 */ 1580 int i; 1581 int active=0, working=0,failed=0,spare=0,nr_disks=0; 1582 1583 rdev->sb_size = MD_SB_BYTES; 1584 1585 sb = page_address(rdev->sb_page); 1586 1587 memset(sb, 0, sizeof(*sb)); 1588 1589 sb->md_magic = MD_SB_MAGIC; 1590 sb->major_version = mddev->major_version; 1591 sb->patch_version = mddev->patch_version; 1592 sb->gvalid_words = 0; /* ignored */ 1593 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 1594 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 1595 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 1596 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 1597 1598 sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); 1599 sb->level = mddev->level; 1600 sb->size = mddev->dev_sectors / 2; 1601 sb->raid_disks = mddev->raid_disks; 1602 sb->md_minor = mddev->md_minor; 1603 sb->not_persistent = 0; 1604 sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 1605 sb->state = 0; 1606 sb->events_hi = (mddev->events>>32); 1607 sb->events_lo = (u32)mddev->events; 1608 1609 if (mddev->reshape_position == MaxSector) 1610 sb->minor_version = 90; 1611 else { 1612 sb->minor_version = 91; 1613 sb->reshape_position = mddev->reshape_position; 1614 sb->new_level = mddev->new_level; 1615 sb->delta_disks = mddev->delta_disks; 1616 sb->new_layout = mddev->new_layout; 1617 sb->new_chunk = mddev->new_chunk_sectors << 9; 1618 } 1619 mddev->minor_version = sb->minor_version; 1620 if (mddev->in_sync) 1621 { 1622 sb->recovery_cp = mddev->resync_offset; 1623 sb->cp_events_hi = (mddev->events>>32); 1624 sb->cp_events_lo = (u32)mddev->events; 1625 if (mddev->resync_offset == MaxSector) 1626 sb->state = (1<< MD_SB_CLEAN); 1627 } else 1628 sb->recovery_cp = 0; 1629 1630 sb->layout = mddev->layout; 1631 sb->chunk_size = mddev->chunk_sectors << 9; 1632 1633 if (mddev->bitmap && mddev->bitmap_info.file == NULL) 1634 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 1635 1636 
sb->disks[0].state = (1<<MD_DISK_REMOVED); 1637 rdev_for_each(rdev2, mddev) { 1638 mdp_disk_t *d; 1639 int desc_nr; 1640 int is_active = test_bit(In_sync, &rdev2->flags); 1641 1642 if (rdev2->raid_disk >= 0 && 1643 sb->minor_version >= 91) 1644 /* we have nowhere to store the recovery_offset, 1645 * but if it is not below the reshape_position, 1646 * we can piggy-back on that. 1647 */ 1648 is_active = 1; 1649 if (rdev2->raid_disk < 0 || 1650 test_bit(Faulty, &rdev2->flags)) 1651 is_active = 0; 1652 if (is_active) 1653 desc_nr = rdev2->raid_disk; 1654 else 1655 desc_nr = next_spare++; 1656 rdev2->desc_nr = desc_nr; 1657 d = &sb->disks[rdev2->desc_nr]; 1658 nr_disks++; 1659 d->number = rdev2->desc_nr; 1660 d->major = MAJOR(rdev2->bdev->bd_dev); 1661 d->minor = MINOR(rdev2->bdev->bd_dev); 1662 if (is_active) 1663 d->raid_disk = rdev2->raid_disk; 1664 else 1665 d->raid_disk = rdev2->desc_nr; /* compatibility */ 1666 if (test_bit(Faulty, &rdev2->flags)) 1667 d->state = (1<<MD_DISK_FAULTY); 1668 else if (is_active) { 1669 d->state = (1<<MD_DISK_ACTIVE); 1670 if (test_bit(In_sync, &rdev2->flags)) 1671 d->state |= (1<<MD_DISK_SYNC); 1672 active++; 1673 working++; 1674 } else { 1675 d->state = 0; 1676 spare++; 1677 working++; 1678 } 1679 if (test_bit(WriteMostly, &rdev2->flags)) 1680 d->state |= (1<<MD_DISK_WRITEMOSTLY); 1681 if (test_bit(FailFast, &rdev2->flags)) 1682 d->state |= (1<<MD_DISK_FAILFAST); 1683 } 1684 /* now set the "removed" and "faulty" bits on any missing devices */ 1685 for (i=0 ; i < mddev->raid_disks ; i++) { 1686 mdp_disk_t *d = &sb->disks[i]; 1687 if (d->state == 0 && d->number == 0) { 1688 d->number = i; 1689 d->raid_disk = i; 1690 d->state = (1<<MD_DISK_REMOVED); 1691 d->state |= (1<<MD_DISK_FAULTY); 1692 failed++; 1693 } 1694 } 1695 sb->nr_disks = nr_disks; 1696 sb->active_disks = active; 1697 sb->working_disks = working; 1698 sb->failed_disks = failed; 1699 sb->spare_disks = spare; 1700 1701 sb->this_disk = sb->disks[rdev->desc_nr]; 1702 sb->sb_csum = calc_sb_csum(sb); 1703 } 1704 1705 /* 1706 * rdev_size_change for 0.90.0 1707 */ 1708 static unsigned long long 1709 super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 1710 { 1711 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1712 return 0; /* component must fit device */ 1713 if (rdev->mddev->bitmap_info.offset) 1714 return 0; /* can't move bitmap */ 1715 rdev->sb_start = calc_dev_sboffset(rdev); 1716 if (!num_sectors || num_sectors > rdev->sb_start) 1717 num_sectors = rdev->sb_start; 1718 /* Limit to 4TB as metadata cannot record more than that. 1719 * 4TB == 2^32 KB, or 2*2^32 sectors. 
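 *
 * (sb->size is a 32-bit count of 1KiB blocks, so the largest component the
 * metadata can describe is 2^32 KiB = 4TiB = 2^33 sectors; clamping to
 * (2ULL << 32) - 2 sectors below keeps the recorded size at the largest
 * value that still fits, 2^32 - 1 KiB.)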
1720 */ 1721 if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) 1722 num_sectors = (sector_t)(2ULL << 32) - 2; 1723 do { 1724 md_write_metadata(rdev->mddev, rdev, rdev->sb_start, 1725 rdev->sb_size, rdev->sb_page, 0); 1726 } while (md_super_wait(rdev->mddev) < 0); 1727 return num_sectors; 1728 } 1729 1730 static int 1731 super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) 1732 { 1733 /* non-zero offset changes not possible with v0.90 */ 1734 return new_offset == 0; 1735 } 1736 1737 /* 1738 * version 1 superblock 1739 */ 1740 1741 static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb) 1742 { 1743 __le32 disk_csum; 1744 u32 csum; 1745 unsigned long long newcsum; 1746 int size = 256 + le32_to_cpu(sb->max_dev)*2; 1747 __le32 *isuper = (__le32*)sb; 1748 1749 disk_csum = sb->sb_csum; 1750 sb->sb_csum = 0; 1751 newcsum = 0; 1752 for (; size >= 4; size -= 4) 1753 newcsum += le32_to_cpu(*isuper++); 1754 1755 if (size == 2) 1756 newcsum += le16_to_cpu(*(__le16*) isuper); 1757 1758 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 1759 sb->sb_csum = disk_csum; 1760 return cpu_to_le32(csum); 1761 } 1762 1763 static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 1764 { 1765 struct mdp_superblock_1 *sb; 1766 int ret; 1767 sector_t sb_start; 1768 sector_t sectors; 1769 int bmask; 1770 bool spare_disk = true; 1771 1772 /* 1773 * Calculate the position of the superblock in 512byte sectors. 1774 * It is always aligned to a 4K boundary and 1775 * depeding on minor_version, it can be: 1776 * 0: At least 8K, but less than 12K, from end of device 1777 * 1: At start of device 1778 * 2: 4K from start of device. 1779 */ 1780 switch(minor_version) { 1781 case 0: 1782 sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2; 1783 sb_start &= ~(sector_t)(4*2-1); 1784 break; 1785 case 1: 1786 sb_start = 0; 1787 break; 1788 case 2: 1789 sb_start = 8; 1790 break; 1791 default: 1792 return -EINVAL; 1793 } 1794 rdev->sb_start = sb_start; 1795 1796 /* superblock is rarely larger than 1K, but it can be larger, 1797 * and it is safe to read 4k, so we do that 1798 */ 1799 ret = read_disk_sb(rdev, 4096); 1800 if (ret) return ret; 1801 1802 sb = page_address(rdev->sb_page); 1803 1804 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1805 sb->major_version != cpu_to_le32(1) || 1806 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1807 le64_to_cpu(sb->super_offset) != rdev->sb_start || 1808 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1809 return -EINVAL; 1810 1811 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1812 pr_warn("md: invalid superblock checksum on %pg\n", 1813 rdev->bdev); 1814 return -EINVAL; 1815 } 1816 if (le64_to_cpu(sb->data_size) < 10) { 1817 pr_warn("md: data_size too small on %pg\n", 1818 rdev->bdev); 1819 return -EINVAL; 1820 } 1821 if (sb->pad0 || 1822 sb->pad3[0] || 1823 memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) 1824 /* Some padding is non-zero, might be a new feature */ 1825 return -EINVAL; 1826 1827 rdev->preferred_minor = 0xffff; 1828 rdev->data_offset = le64_to_cpu(sb->data_offset); 1829 rdev->new_data_offset = rdev->data_offset; 1830 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && 1831 (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) 1832 rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset); 1833 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1834 1835 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1836 bmask = 
queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 1837 if (rdev->sb_size & bmask) 1838 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1839 1840 if (minor_version 1841 && rdev->data_offset < sb_start + (rdev->sb_size/512)) 1842 return -EINVAL; 1843 if (minor_version 1844 && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) 1845 return -EINVAL; 1846 1847 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1848 1849 if (!rdev->bb_page) { 1850 rdev->bb_page = alloc_page(GFP_KERNEL); 1851 if (!rdev->bb_page) 1852 return -ENOMEM; 1853 } 1854 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && 1855 rdev->badblocks.count == 0) { 1856 /* need to load the bad block list. 1857 * Currently we limit it to one page. 1858 */ 1859 s32 offset; 1860 sector_t bb_sector; 1861 __le64 *bbp; 1862 int i; 1863 int sectors = le16_to_cpu(sb->bblog_size); 1864 if (sectors > (PAGE_SIZE / 512)) 1865 return -EINVAL; 1866 offset = le32_to_cpu(sb->bblog_offset); 1867 if (offset == 0) 1868 return -EINVAL; 1869 bb_sector = (long long)offset; 1870 if (!sync_page_io(rdev, bb_sector, sectors << 9, 1871 rdev->bb_page, REQ_OP_READ, true)) 1872 return -EIO; 1873 bbp = (__le64 *)page_address(rdev->bb_page); 1874 rdev->badblocks.shift = sb->bblog_shift; 1875 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { 1876 u64 bb = le64_to_cpu(*bbp); 1877 int count = bb & (0x3ff); 1878 u64 sector = bb >> 10; 1879 sector <<= sb->bblog_shift; 1880 count <<= sb->bblog_shift; 1881 if (bb + 1 == 0) 1882 break; 1883 if (!badblocks_set(&rdev->badblocks, sector, count, 1)) 1884 return -EINVAL; 1885 } 1886 } else if (sb->bblog_offset != 0) 1887 rdev->badblocks.shift = 0; 1888 1889 if ((le32_to_cpu(sb->feature_map) & 1890 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) { 1891 rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset); 1892 rdev->ppl.size = le16_to_cpu(sb->ppl.size); 1893 rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset; 1894 } 1895 1896 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) && 1897 sb->level != 0) 1898 return -EINVAL; 1899 1900 /* not spare disk */ 1901 if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) && 1902 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || 1903 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) 1904 spare_disk = false; 1905 1906 if (!refdev) { 1907 if (!spare_disk) 1908 ret = 1; 1909 else 1910 ret = 0; 1911 } else { 1912 __u64 ev1, ev2; 1913 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); 1914 1915 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1916 sb->level != refsb->level || 1917 sb->layout != refsb->layout || 1918 sb->chunksize != refsb->chunksize) { 1919 pr_warn("md: %pg has strangely different superblock to %pg\n", 1920 rdev->bdev, 1921 refdev->bdev); 1922 return -EINVAL; 1923 } 1924 ev1 = le64_to_cpu(sb->events); 1925 ev2 = le64_to_cpu(refsb->events); 1926 1927 if (!spare_disk && ev1 > ev2) 1928 ret = 1; 1929 else 1930 ret = 0; 1931 } 1932 if (minor_version) 1933 sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; 1934 else 1935 sectors = rdev->sb_start; 1936 if (sectors < le64_to_cpu(sb->data_size)) 1937 return -EINVAL; 1938 rdev->sectors = le64_to_cpu(sb->data_size); 1939 return ret; 1940 } 1941 1942 static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) 1943 { 1944 struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 1945 __u64 ev1 = le64_to_cpu(sb->events); 1946 int role; 1947 1948 rdev->raid_disk = -1; 1949 clear_bit(Faulty, &rdev->flags); 1950 
clear_bit(In_sync, &rdev->flags); 1951 clear_bit(Bitmap_sync, &rdev->flags); 1952 clear_bit(WriteMostly, &rdev->flags); 1953 1954 if (mddev->raid_disks == 0) { 1955 mddev->major_version = 1; 1956 mddev->patch_version = 0; 1957 mddev->external = 0; 1958 mddev->chunk_sectors = le32_to_cpu(sb->chunksize); 1959 mddev->ctime = le64_to_cpu(sb->ctime); 1960 mddev->utime = le64_to_cpu(sb->utime); 1961 mddev->level = le32_to_cpu(sb->level); 1962 mddev->clevel[0] = 0; 1963 mddev->layout = le32_to_cpu(sb->layout); 1964 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1965 mddev->dev_sectors = le64_to_cpu(sb->size); 1966 mddev->events = ev1; 1967 mddev->bitmap_info.offset = 0; 1968 mddev->bitmap_info.space = 0; 1969 /* Default location for bitmap is 1K after superblock 1970 * using 3K - total of 4K 1971 */ 1972 mddev->bitmap_info.default_offset = 1024 >> 9; 1973 mddev->bitmap_info.default_space = (4096-1024) >> 9; 1974 mddev->reshape_backwards = 0; 1975 1976 mddev->resync_offset = le64_to_cpu(sb->resync_offset); 1977 memcpy(mddev->uuid, sb->set_uuid, 16); 1978 1979 mddev->max_disks = (4096-256)/2; 1980 1981 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1982 mddev->bitmap_info.file == NULL) { 1983 mddev->bitmap_info.offset = 1984 (__s32)le32_to_cpu(sb->bitmap_offset); 1985 /* Metadata doesn't record how much space is available. 1986 * For 1.0, we assume we can use up to the superblock 1987 * if before, else to 4K beyond superblock. 1988 * For others, assume no change is possible. 1989 */ 1990 if (mddev->minor_version > 0) 1991 mddev->bitmap_info.space = 0; 1992 else if (mddev->bitmap_info.offset > 0) 1993 mddev->bitmap_info.space = 1994 8 - mddev->bitmap_info.offset; 1995 else 1996 mddev->bitmap_info.space = 1997 -mddev->bitmap_info.offset; 1998 } 1999 2000 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 2001 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 2002 mddev->delta_disks = le32_to_cpu(sb->delta_disks); 2003 mddev->new_level = le32_to_cpu(sb->new_level); 2004 mddev->new_layout = le32_to_cpu(sb->new_layout); 2005 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); 2006 if (mddev->delta_disks < 0 || 2007 (mddev->delta_disks == 0 && 2008 (le32_to_cpu(sb->feature_map) 2009 & MD_FEATURE_RESHAPE_BACKWARDS))) 2010 mddev->reshape_backwards = 1; 2011 } else { 2012 mddev->reshape_position = MaxSector; 2013 mddev->delta_disks = 0; 2014 mddev->new_level = mddev->level; 2015 mddev->new_layout = mddev->layout; 2016 mddev->new_chunk_sectors = mddev->chunk_sectors; 2017 } 2018 2019 if (mddev->level == 0 && 2020 !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT)) 2021 mddev->layout = -1; 2022 2023 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) 2024 set_bit(MD_HAS_JOURNAL, &mddev->flags); 2025 2026 if (le32_to_cpu(sb->feature_map) & 2027 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) { 2028 if (le32_to_cpu(sb->feature_map) & 2029 (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL)) 2030 return -EINVAL; 2031 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) && 2032 (le32_to_cpu(sb->feature_map) & 2033 MD_FEATURE_MULTIPLE_PPLS)) 2034 return -EINVAL; 2035 set_bit(MD_HAS_PPL, &mddev->flags); 2036 } 2037 } else if (mddev->pers == NULL) { 2038 /* Insist of good event counter while assembling, except for 2039 * spares (which don't need an event count). 2040 * Similar to mdadm, we allow event counter difference of 1 2041 * from the freshest device. 
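	 *
	 * Illustrative example: with mddev->events at 100, a device that
	 * claims an active role and reports 99 events still passes the
	 * check below (ev1 + 1 == mddev->events), while one reporting 98
	 * is rejected with -EINVAL.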
2042 */ 2043 if (rdev->desc_nr >= 0 && 2044 rdev->desc_nr < le32_to_cpu(sb->max_dev) && 2045 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || 2046 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) 2047 if (ev1 + 1 < mddev->events) 2048 return -EINVAL; 2049 } else if (mddev->bitmap) { 2050 /* If adding to array with a bitmap, then we can accept an 2051 * older device, but not too old. 2052 */ 2053 if (ev1 < md_bitmap_events_cleared(mddev)) 2054 return 0; 2055 if (ev1 < mddev->events) 2056 set_bit(Bitmap_sync, &rdev->flags); 2057 } else { 2058 if (ev1 < mddev->events) 2059 /* just a hot-add of a new device, leave raid_disk at -1 */ 2060 return 0; 2061 } 2062 2063 if (rdev->desc_nr < 0 || 2064 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { 2065 role = MD_DISK_ROLE_SPARE; 2066 rdev->desc_nr = -1; 2067 } else if (mddev->pers == NULL && freshest && ev1 < mddev->events) { 2068 /* 2069 * If we are assembling, and our event counter is smaller than the 2070 * highest event counter, we cannot trust our superblock about the role. 2071 * It could happen that our rdev was marked as Faulty, and all other 2072 * superblocks were updated with +1 event counter. 2073 * Then, before the next superblock update, which typically happens when 2074 * remove_and_add_spares() removes the device from the array, there was 2075 * a crash or reboot. 2076 * If we allow current rdev without consulting the freshest superblock, 2077 * we could cause data corruption. 2078 * Note that in this case our event counter is smaller by 1 than the 2079 * highest, otherwise, this rdev would not be allowed into array; 2080 * both kernel and mdadm allow event counter difference of 1. 2081 */ 2082 struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page); 2083 u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev); 2084 2085 if (rdev->desc_nr >= freshest_max_dev) { 2086 /* this is unexpected, better not proceed */ 2087 pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n", 2088 mdname(mddev), rdev->bdev, rdev->desc_nr, 2089 freshest->bdev, freshest_max_dev); 2090 return -EUCLEAN; 2091 } 2092 2093 role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]); 2094 pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n", 2095 mdname(mddev), rdev->bdev, role, role, freshest->bdev); 2096 } else { 2097 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 2098 } 2099 switch (role) { 2100 case MD_DISK_ROLE_SPARE: /* spare */ 2101 break; 2102 case MD_DISK_ROLE_FAULTY: /* faulty */ 2103 set_bit(Faulty, &rdev->flags); 2104 break; 2105 case MD_DISK_ROLE_JOURNAL: /* journal device */ 2106 if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { 2107 /* journal device without journal feature */ 2108 pr_warn("md: journal device provided without journal feature, ignoring the device\n"); 2109 return -EINVAL; 2110 } 2111 set_bit(Journal, &rdev->flags); 2112 rdev->journal_tail = le64_to_cpu(sb->journal_tail); 2113 rdev->raid_disk = 0; 2114 break; 2115 default: 2116 rdev->saved_raid_disk = role; 2117 if ((le32_to_cpu(sb->feature_map) & 2118 MD_FEATURE_RECOVERY_OFFSET)) { 2119 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 2120 if (!(le32_to_cpu(sb->feature_map) & 2121 MD_FEATURE_RECOVERY_BITMAP)) 2122 rdev->saved_raid_disk = -1; 2123 } else { 2124 /* 2125 * If the array is FROZEN, then the device can't 2126 * be in_sync with rest of array. 
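	 * In that case the device is left with In_sync clear here and is
	 * expected to be brought back by a normal recovery once the array
	 * is unfrozen.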
2127 */ 2128 if (!test_bit(MD_RECOVERY_FROZEN, 2129 &mddev->recovery)) 2130 set_bit(In_sync, &rdev->flags); 2131 } 2132 rdev->raid_disk = role; 2133 break; 2134 } 2135 if (sb->devflags & WriteMostly1) 2136 set_bit(WriteMostly, &rdev->flags); 2137 if (sb->devflags & FailFast1) 2138 set_bit(FailFast, &rdev->flags); 2139 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) 2140 set_bit(Replacement, &rdev->flags); 2141 2142 return 0; 2143 } 2144 2145 static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) 2146 { 2147 struct mdp_superblock_1 *sb; 2148 struct md_rdev *rdev2; 2149 int max_dev, i; 2150 /* make rdev->sb match mddev and rdev data. */ 2151 2152 sb = page_address(rdev->sb_page); 2153 2154 sb->feature_map = 0; 2155 sb->pad0 = 0; 2156 sb->recovery_offset = cpu_to_le64(0); 2157 memset(sb->pad3, 0, sizeof(sb->pad3)); 2158 2159 sb->utime = cpu_to_le64((__u64)mddev->utime); 2160 sb->events = cpu_to_le64(mddev->events); 2161 if (mddev->in_sync) 2162 sb->resync_offset = cpu_to_le64(mddev->resync_offset); 2163 else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) 2164 sb->resync_offset = cpu_to_le64(MaxSector); 2165 else 2166 sb->resync_offset = cpu_to_le64(0); 2167 2168 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); 2169 2170 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 2171 sb->size = cpu_to_le64(mddev->dev_sectors); 2172 sb->chunksize = cpu_to_le32(mddev->chunk_sectors); 2173 sb->level = cpu_to_le32(mddev->level); 2174 sb->layout = cpu_to_le32(mddev->layout); 2175 if (test_bit(FailFast, &rdev->flags)) 2176 sb->devflags |= FailFast1; 2177 else 2178 sb->devflags &= ~FailFast1; 2179 2180 if (test_bit(WriteMostly, &rdev->flags)) 2181 sb->devflags |= WriteMostly1; 2182 else 2183 sb->devflags &= ~WriteMostly1; 2184 sb->data_offset = cpu_to_le64(rdev->data_offset); 2185 sb->data_size = cpu_to_le64(rdev->sectors); 2186 2187 if (mddev->bitmap && mddev->bitmap_info.file == NULL) { 2188 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); 2189 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 2190 } 2191 2192 if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && 2193 !test_bit(In_sync, &rdev->flags)) { 2194 sb->feature_map |= 2195 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 2196 sb->recovery_offset = 2197 cpu_to_le64(rdev->recovery_offset); 2198 if (rdev->saved_raid_disk >= 0 && mddev->bitmap) 2199 sb->feature_map |= 2200 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); 2201 } 2202 /* Note: recovery_offset and journal_tail share space */ 2203 if (test_bit(Journal, &rdev->flags)) 2204 sb->journal_tail = cpu_to_le64(rdev->journal_tail); 2205 if (test_bit(Replacement, &rdev->flags)) 2206 sb->feature_map |= 2207 cpu_to_le32(MD_FEATURE_REPLACEMENT); 2208 2209 if (mddev->reshape_position != MaxSector) { 2210 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 2211 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 2212 sb->new_layout = cpu_to_le32(mddev->new_layout); 2213 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 2214 sb->new_level = cpu_to_le32(mddev->new_level); 2215 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 2216 if (mddev->delta_disks == 0 && 2217 mddev->reshape_backwards) 2218 sb->feature_map 2219 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); 2220 if (rdev->new_data_offset != rdev->data_offset) { 2221 sb->feature_map 2222 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); 2223 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset 2224 - rdev->data_offset)); 2225 } 2226 } 2227 2228 if 
(mddev_is_clustered(mddev)) 2229 sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); 2230 2231 if (rdev->badblocks.count == 0) 2232 /* Nothing to do for bad blocks*/ ; 2233 else if (sb->bblog_offset == 0) 2234 /* Cannot record bad blocks on this device */ 2235 md_error(mddev, rdev); 2236 else { 2237 struct badblocks *bb = &rdev->badblocks; 2238 __le64 *bbp = (__le64 *)page_address(rdev->bb_page); 2239 u64 *p = bb->page; 2240 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); 2241 if (bb->changed) { 2242 unsigned seq; 2243 2244 retry: 2245 seq = read_seqbegin(&bb->lock); 2246 2247 memset(bbp, 0xff, PAGE_SIZE); 2248 2249 for (i = 0 ; i < bb->count ; i++) { 2250 u64 internal_bb = p[i]; 2251 u64 store_bb = ((BB_OFFSET(internal_bb) << 10) 2252 | BB_LEN(internal_bb)); 2253 bbp[i] = cpu_to_le64(store_bb); 2254 } 2255 bb->changed = 0; 2256 if (read_seqretry(&bb->lock, seq)) 2257 goto retry; 2258 2259 bb->sector = (rdev->sb_start + 2260 (int)le32_to_cpu(sb->bblog_offset)); 2261 bb->size = le16_to_cpu(sb->bblog_size); 2262 } 2263 } 2264 2265 max_dev = 0; 2266 rdev_for_each(rdev2, mddev) 2267 if (rdev2->desc_nr+1 > max_dev) 2268 max_dev = rdev2->desc_nr+1; 2269 2270 if (max_dev > le32_to_cpu(sb->max_dev)) { 2271 int bmask; 2272 sb->max_dev = cpu_to_le32(max_dev); 2273 rdev->sb_size = max_dev * 2 + 256; 2274 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 2275 if (rdev->sb_size & bmask) 2276 rdev->sb_size = (rdev->sb_size | bmask) + 1; 2277 } else 2278 max_dev = le32_to_cpu(sb->max_dev); 2279 2280 for (i=0; i<max_dev;i++) 2281 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 2282 2283 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) 2284 sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); 2285 2286 if (test_bit(MD_HAS_PPL, &mddev->flags)) { 2287 if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags)) 2288 sb->feature_map |= 2289 cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS); 2290 else 2291 sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); 2292 sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); 2293 sb->ppl.size = cpu_to_le16(rdev->ppl.size); 2294 } 2295 2296 rdev_for_each(rdev2, mddev) { 2297 i = rdev2->desc_nr; 2298 if (test_bit(Faulty, &rdev2->flags)) 2299 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); 2300 else if (test_bit(In_sync, &rdev2->flags)) 2301 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 2302 else if (test_bit(Journal, &rdev2->flags)) 2303 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); 2304 else if (rdev2->raid_disk >= 0) 2305 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 2306 else 2307 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 2308 } 2309 2310 sb->sb_csum = calc_sb_1_csum(sb); 2311 } 2312 2313 static sector_t super_1_choose_bm_space(sector_t dev_size) 2314 { 2315 sector_t bm_space; 2316 2317 /* if the device is bigger than 8Gig, save 64k for bitmap 2318 * usage, if bigger than 200Gig, save 128k 2319 */ 2320 if (dev_size < 64*2) 2321 bm_space = 0; 2322 else if (dev_size - 64*2 >= 200*1024*1024*2) 2323 bm_space = 128*2; 2324 else if (dev_size - 4*2 > 8*1024*1024*2) 2325 bm_space = 64*2; 2326 else 2327 bm_space = 4*2; 2328 return bm_space; 2329 } 2330 2331 static unsigned long long 2332 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 2333 { 2334 struct mdp_superblock_1 *sb; 2335 sector_t max_sectors; 2336 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 2337 return 0; /* component must fit device */ 2338 if (rdev->data_offset != rdev->new_data_offset) 2339 return 0; /* too confusing */ 2340 if (rdev->sb_start < 
rdev->data_offset) { 2341 /* minor versions 1 and 2; superblock before data */ 2342 max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; 2343 if (!num_sectors || num_sectors > max_sectors) 2344 num_sectors = max_sectors; 2345 } else if (rdev->mddev->bitmap_info.offset) { 2346 /* minor version 0 with bitmap we can't move */ 2347 return 0; 2348 } else { 2349 /* minor version 0; superblock after data */ 2350 sector_t sb_start, bm_space; 2351 sector_t dev_size = bdev_nr_sectors(rdev->bdev); 2352 2353 /* 8K is for superblock */ 2354 sb_start = dev_size - 8*2; 2355 sb_start &= ~(sector_t)(4*2 - 1); 2356 2357 bm_space = super_1_choose_bm_space(dev_size); 2358 2359 /* Space that can be used to store date needs to decrease 2360 * superblock bitmap space and bad block space(4K) 2361 */ 2362 max_sectors = sb_start - bm_space - 4*2; 2363 2364 if (!num_sectors || num_sectors > max_sectors) 2365 num_sectors = max_sectors; 2366 rdev->sb_start = sb_start; 2367 } 2368 sb = page_address(rdev->sb_page); 2369 sb->data_size = cpu_to_le64(num_sectors); 2370 sb->super_offset = cpu_to_le64(rdev->sb_start); 2371 sb->sb_csum = calc_sb_1_csum(sb); 2372 do { 2373 md_write_metadata(rdev->mddev, rdev, rdev->sb_start, 2374 rdev->sb_size, rdev->sb_page, 0); 2375 } while (md_super_wait(rdev->mddev) < 0); 2376 return num_sectors; 2377 2378 } 2379 2380 static int 2381 super_1_allow_new_offset(struct md_rdev *rdev, 2382 unsigned long long new_offset) 2383 { 2384 struct mddev *mddev = rdev->mddev; 2385 2386 /* All necessary checks on new >= old have been done */ 2387 if (new_offset >= rdev->data_offset) 2388 return 1; 2389 2390 /* with 1.0 metadata, there is no metadata to tread on 2391 * so we can always move back */ 2392 if (mddev->minor_version == 0) 2393 return 1; 2394 2395 /* otherwise we must be sure not to step on 2396 * any metadata, so stay: 2397 * 36K beyond start of superblock 2398 * beyond end of badblocks 2399 * beyond write-intent bitmap 2400 */ 2401 if (rdev->sb_start + (32+4)*2 > new_offset) 2402 return 0; 2403 2404 if (md_bitmap_registered(mddev) && !mddev->bitmap_info.file) { 2405 struct md_bitmap_stats stats; 2406 int err; 2407 2408 err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 2409 if (!err && rdev->sb_start + mddev->bitmap_info.offset + 2410 stats.file_pages * (PAGE_SIZE >> 9) > new_offset) 2411 return 0; 2412 } 2413 2414 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) 2415 return 0; 2416 2417 return 1; 2418 } 2419 2420 static struct super_type super_types[] = { 2421 [0] = { 2422 .name = "0.90.0", 2423 .owner = THIS_MODULE, 2424 .load_super = super_90_load, 2425 .validate_super = super_90_validate, 2426 .sync_super = super_90_sync, 2427 .rdev_size_change = super_90_rdev_size_change, 2428 .allow_new_offset = super_90_allow_new_offset, 2429 }, 2430 [1] = { 2431 .name = "md-1", 2432 .owner = THIS_MODULE, 2433 .load_super = super_1_load, 2434 .validate_super = super_1_validate, 2435 .sync_super = super_1_sync, 2436 .rdev_size_change = super_1_rdev_size_change, 2437 .allow_new_offset = super_1_allow_new_offset, 2438 }, 2439 }; 2440 2441 static void sync_super(struct mddev *mddev, struct md_rdev *rdev) 2442 { 2443 if (mddev->sync_super) { 2444 mddev->sync_super(mddev, rdev); 2445 return; 2446 } 2447 2448 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); 2449 2450 super_types[mddev->major_version].sync_super(mddev, rdev); 2451 } 2452 2453 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) 2454 { 2455 struct md_rdev *rdev, *rdev2; 2456 2457 
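	/*
	 * Two arrays are considered to share a unit if any active
	 * (non-faulty, non-journal) member of one sits on the same gendisk
	 * as an active member of the other.  The rdev lists are only
	 * peeked at, so RCU protection is sufficient.
	 */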
rcu_read_lock(); 2458 rdev_for_each_rcu(rdev, mddev1) { 2459 if (test_bit(Faulty, &rdev->flags) || 2460 test_bit(Journal, &rdev->flags) || 2461 rdev->raid_disk == -1) 2462 continue; 2463 rdev_for_each_rcu(rdev2, mddev2) { 2464 if (test_bit(Faulty, &rdev2->flags) || 2465 test_bit(Journal, &rdev2->flags) || 2466 rdev2->raid_disk == -1) 2467 continue; 2468 if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) { 2469 rcu_read_unlock(); 2470 return 1; 2471 } 2472 } 2473 } 2474 rcu_read_unlock(); 2475 return 0; 2476 } 2477 2478 static LIST_HEAD(pending_raid_disks); 2479 2480 /* 2481 * Try to register data integrity profile for an mddev 2482 * 2483 * This is called when an array is started and after a disk has been kicked 2484 * from the array. It only succeeds if all working and active component devices 2485 * are integrity capable with matching profiles. 2486 */ 2487 int md_integrity_register(struct mddev *mddev) 2488 { 2489 if (list_empty(&mddev->disks)) 2490 return 0; /* nothing to do */ 2491 if (mddev_is_dm(mddev) || !blk_get_integrity(mddev->gendisk)) 2492 return 0; /* shouldn't register */ 2493 2494 pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); 2495 return 0; 2496 } 2497 EXPORT_SYMBOL(md_integrity_register); 2498 2499 static bool rdev_read_only(struct md_rdev *rdev) 2500 { 2501 return bdev_read_only(rdev->bdev) || 2502 (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev)); 2503 } 2504 2505 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) 2506 { 2507 char b[BDEVNAME_SIZE]; 2508 int err; 2509 2510 /* prevent duplicates */ 2511 if (find_rdev(mddev, rdev->bdev->bd_dev)) 2512 return -EEXIST; 2513 2514 if (rdev_read_only(rdev) && mddev->pers) 2515 return -EROFS; 2516 2517 /* make sure rdev->sectors exceeds mddev->dev_sectors */ 2518 if (!test_bit(Journal, &rdev->flags) && 2519 rdev->sectors && 2520 (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { 2521 if (mddev->pers) { 2522 /* Cannot change size, so fail 2523 * If mddev->level <= 0, then we don't care 2524 * about aligning sizes (e.g. linear) 2525 */ 2526 if (mddev->level > 0) 2527 return -ENOSPC; 2528 } else 2529 mddev->dev_sectors = rdev->sectors; 2530 } 2531 2532 /* Verify rdev->desc_nr is unique. 
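	 * (desc_nr identifies this device's entry in the array superblock,
	 * so two members of one array must never share it.)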
2533 * If it is -1, assign a free number, else 2534 * check number is not in use 2535 */ 2536 rcu_read_lock(); 2537 if (rdev->desc_nr < 0) { 2538 int choice = 0; 2539 if (mddev->pers) 2540 choice = mddev->raid_disks; 2541 while (md_find_rdev_nr_rcu(mddev, choice)) 2542 choice++; 2543 rdev->desc_nr = choice; 2544 } else { 2545 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { 2546 rcu_read_unlock(); 2547 return -EBUSY; 2548 } 2549 } 2550 rcu_read_unlock(); 2551 if (!test_bit(Journal, &rdev->flags) && 2552 mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { 2553 pr_warn("md: %s: array is limited to %d devices\n", 2554 mdname(mddev), mddev->max_disks); 2555 return -EBUSY; 2556 } 2557 snprintf(b, sizeof(b), "%pg", rdev->bdev); 2558 strreplace(b, '/', '!'); 2559 2560 rdev->mddev = mddev; 2561 pr_debug("md: bind<%s>\n", b); 2562 2563 if (mddev->raid_disks) 2564 mddev_create_serial_pool(mddev, rdev); 2565 2566 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 2567 goto fail; 2568 2569 /* failure here is OK */ 2570 err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block"); 2571 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); 2572 rdev->sysfs_unack_badblocks = 2573 sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks"); 2574 rdev->sysfs_badblocks = 2575 sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks"); 2576 2577 list_add_rcu(&rdev->same_set, &mddev->disks); 2578 bd_link_disk_holder(rdev->bdev, mddev->gendisk); 2579 2580 /* May as well allow recovery to be retried once */ 2581 mddev->recovery_disabled++; 2582 2583 return 0; 2584 2585 fail: 2586 pr_warn("md: failed to register dev-%s for %s\n", 2587 b, mdname(mddev)); 2588 mddev_destroy_serial_pool(mddev, rdev); 2589 return err; 2590 } 2591 2592 void md_autodetect_dev(dev_t dev); 2593 2594 /* just for claiming the bdev */ 2595 static struct md_rdev claim_rdev; 2596 2597 static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) 2598 { 2599 pr_debug("md: export_rdev(%pg)\n", rdev->bdev); 2600 md_rdev_clear(rdev); 2601 #ifndef MODULE 2602 if (test_bit(AutoDetected, &rdev->flags)) 2603 md_autodetect_dev(rdev->bdev->bd_dev); 2604 #endif 2605 fput(rdev->bdev_file); 2606 rdev->bdev = NULL; 2607 kobject_put(&rdev->kobj); 2608 } 2609 2610 static void md_kick_rdev_from_array(struct md_rdev *rdev) 2611 { 2612 struct mddev *mddev = rdev->mddev; 2613 2614 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); 2615 list_del_rcu(&rdev->same_set); 2616 pr_debug("md: unbind<%pg>\n", rdev->bdev); 2617 mddev_destroy_serial_pool(rdev->mddev, rdev); 2618 WRITE_ONCE(rdev->mddev, NULL); 2619 sysfs_remove_link(&rdev->kobj, "block"); 2620 sysfs_put(rdev->sysfs_state); 2621 sysfs_put(rdev->sysfs_unack_badblocks); 2622 sysfs_put(rdev->sysfs_badblocks); 2623 rdev->sysfs_state = NULL; 2624 rdev->sysfs_unack_badblocks = NULL; 2625 rdev->sysfs_badblocks = NULL; 2626 rdev->badblocks.count = 0; 2627 2628 synchronize_rcu(); 2629 2630 /* 2631 * kobject_del() will wait for all in progress writers to be done, where 2632 * reconfig_mutex is held, hence it can't be called under 2633 * reconfig_mutex and it's delayed to mddev_unlock(). 
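	 * Instead the rdev is parked on mddev->deleting below and the
	 * deferred cleanup runs from mddev_unlock() once reconfig_mutex
	 * has been dropped.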
2634 */ 2635 list_add(&rdev->same_set, &mddev->deleting); 2636 } 2637 2638 static void export_array(struct mddev *mddev) 2639 { 2640 struct md_rdev *rdev; 2641 2642 while (!list_empty(&mddev->disks)) { 2643 rdev = list_first_entry(&mddev->disks, struct md_rdev, 2644 same_set); 2645 md_kick_rdev_from_array(rdev); 2646 } 2647 mddev->raid_disks = 0; 2648 mddev->major_version = 0; 2649 } 2650 2651 static bool set_in_sync(struct mddev *mddev) 2652 { 2653 lockdep_assert_held(&mddev->lock); 2654 if (!mddev->in_sync) { 2655 mddev->sync_checkers++; 2656 spin_unlock(&mddev->lock); 2657 percpu_ref_switch_to_atomic_sync(&mddev->writes_pending); 2658 spin_lock(&mddev->lock); 2659 if (!mddev->in_sync && 2660 percpu_ref_is_zero(&mddev->writes_pending)) { 2661 mddev->in_sync = 1; 2662 /* 2663 * Ensure ->in_sync is visible before we clear 2664 * ->sync_checkers. 2665 */ 2666 smp_mb(); 2667 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2668 sysfs_notify_dirent_safe(mddev->sysfs_state); 2669 } 2670 if (--mddev->sync_checkers == 0) 2671 percpu_ref_switch_to_percpu(&mddev->writes_pending); 2672 } 2673 if (mddev->safemode == 1) 2674 mddev->safemode = 0; 2675 return mddev->in_sync; 2676 } 2677 2678 static void sync_sbs(struct mddev *mddev, int nospares) 2679 { 2680 /* Update each superblock (in-memory image), but 2681 * if we are allowed to, skip spares which already 2682 * have the right event counter, or have one earlier 2683 * (which would mean they aren't being marked as dirty 2684 * with the rest of the array) 2685 */ 2686 struct md_rdev *rdev; 2687 rdev_for_each(rdev, mddev) { 2688 if (rdev->sb_events == mddev->events || 2689 (nospares && 2690 rdev->raid_disk < 0 && 2691 rdev->sb_events+1 == mddev->events)) { 2692 /* Don't update this superblock */ 2693 rdev->sb_loaded = 2; 2694 } else { 2695 sync_super(mddev, rdev); 2696 rdev->sb_loaded = 1; 2697 } 2698 } 2699 } 2700 2701 static bool does_sb_need_changing(struct mddev *mddev) 2702 { 2703 struct md_rdev *rdev = NULL, *iter; 2704 struct mdp_superblock_1 *sb; 2705 int role; 2706 2707 /* Find a good rdev */ 2708 rdev_for_each(iter, mddev) 2709 if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) { 2710 rdev = iter; 2711 break; 2712 } 2713 2714 /* No good device found. */ 2715 if (!rdev) 2716 return false; 2717 2718 sb = page_address(rdev->sb_page); 2719 /* Check if a device has become faulty or a spare become active */ 2720 rdev_for_each(rdev, mddev) { 2721 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 2722 /* Device activated? */ 2723 if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 && 2724 !test_bit(Faulty, &rdev->flags)) 2725 return true; 2726 /* Device turned faulty? 
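		 * (i.e. the superblock still records an in-array role for a
		 * device that is now marked Faulty)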
*/ 2727 if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX)) 2728 return true; 2729 } 2730 2731 /* Check if any mddev parameters have changed */ 2732 if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || 2733 (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || 2734 (mddev->layout != le32_to_cpu(sb->layout)) || 2735 (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || 2736 (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) 2737 return true; 2738 2739 return false; 2740 } 2741 2742 void md_update_sb(struct mddev *mddev, int force_change) 2743 { 2744 struct md_rdev *rdev; 2745 int sync_req; 2746 int nospares = 0; 2747 int any_badblocks_changed = 0; 2748 int ret = -1; 2749 2750 if (!md_is_rdwr(mddev)) { 2751 if (force_change) 2752 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2753 return; 2754 } 2755 2756 repeat: 2757 if (mddev_is_clustered(mddev)) { 2758 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2759 force_change = 1; 2760 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 2761 nospares = 1; 2762 ret = mddev->cluster_ops->metadata_update_start(mddev); 2763 /* Has someone else has updated the sb */ 2764 if (!does_sb_need_changing(mddev)) { 2765 if (ret == 0) 2766 mddev->cluster_ops->metadata_update_cancel(mddev); 2767 bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2768 BIT(MD_SB_CHANGE_DEVS) | 2769 BIT(MD_SB_CHANGE_CLEAN)); 2770 return; 2771 } 2772 } 2773 2774 /* 2775 * First make sure individual recovery_offsets are correct 2776 * curr_resync_completed can only be used during recovery. 2777 * During reshape/resync it might use array-addresses rather 2778 * that device addresses. 2779 */ 2780 rdev_for_each(rdev, mddev) { 2781 if (rdev->raid_disk >= 0 && 2782 mddev->delta_disks >= 0 && 2783 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 2784 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && 2785 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 2786 !test_bit(Journal, &rdev->flags) && 2787 !test_bit(In_sync, &rdev->flags) && 2788 mddev->curr_resync_completed > rdev->recovery_offset) 2789 rdev->recovery_offset = mddev->curr_resync_completed; 2790 2791 } 2792 if (!mddev->persistent) { 2793 clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2794 clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2795 if (!mddev->external) { 2796 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 2797 rdev_for_each(rdev, mddev) { 2798 if (rdev->badblocks.changed) { 2799 rdev->badblocks.changed = 0; 2800 ack_all_badblocks(&rdev->badblocks); 2801 md_error(mddev, rdev); 2802 } 2803 clear_bit(Blocked, &rdev->flags); 2804 clear_bit(BlockedBadBlocks, &rdev->flags); 2805 wake_up(&rdev->blocked_wait); 2806 } 2807 } 2808 wake_up(&mddev->sb_wait); 2809 return; 2810 } 2811 2812 spin_lock(&mddev->lock); 2813 2814 mddev->utime = ktime_get_real_seconds(); 2815 2816 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2817 force_change = 1; 2818 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 2819 /* just a clean<-> dirty transition, possibly leave spares alone, 2820 * though if events isn't the right even/odd, we will have to do 2821 * spares after all 2822 */ 2823 nospares = 1; 2824 if (force_change) 2825 nospares = 0; 2826 if (mddev->degraded) 2827 /* If the array is degraded, then skipping spares is both 2828 * dangerous and fairly pointless. 2829 * Dangerous because a device that was removed from the array 2830 * might have a event_count that still looks up-to-date, 2831 * so it can be re-added without a resync. 
2832 * Pointless because if there are any spares to skip, 2833 * then a recovery will happen and soon that array won't 2834 * be degraded any more and the spare can go back to sleep then. 2835 */ 2836 nospares = 0; 2837 2838 sync_req = mddev->in_sync; 2839 2840 /* If this is just a dirty<->clean transition, and the array is clean 2841 * and 'events' is odd, we can roll back to the previous clean state */ 2842 if (nospares 2843 && (mddev->in_sync && mddev->resync_offset == MaxSector) 2844 && mddev->can_decrease_events 2845 && mddev->events != 1) { 2846 mddev->events--; 2847 mddev->can_decrease_events = 0; 2848 } else { 2849 /* otherwise we have to go forward and ... */ 2850 mddev->events ++; 2851 mddev->can_decrease_events = nospares; 2852 } 2853 2854 /* 2855 * This 64-bit counter should never wrap. 2856 * Either we are in around ~1 trillion A.C., assuming 2857 * 1 reboot per second, or we have a bug... 2858 */ 2859 WARN_ON(mddev->events == 0); 2860 2861 rdev_for_each(rdev, mddev) { 2862 if (rdev->badblocks.changed) 2863 any_badblocks_changed++; 2864 if (test_bit(Faulty, &rdev->flags)) 2865 set_bit(FaultRecorded, &rdev->flags); 2866 } 2867 2868 sync_sbs(mddev, nospares); 2869 spin_unlock(&mddev->lock); 2870 2871 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", 2872 mdname(mddev), mddev->in_sync); 2873 2874 mddev_add_trace_msg(mddev, "md md_update_sb"); 2875 rewrite: 2876 if (md_bitmap_enabled(mddev, false)) 2877 mddev->bitmap_ops->update_sb(mddev->bitmap); 2878 rdev_for_each(rdev, mddev) { 2879 if (rdev->sb_loaded != 1) 2880 continue; /* no noise on spare devices */ 2881 2882 if (!test_bit(Faulty, &rdev->flags)) { 2883 md_write_metadata(mddev, rdev, rdev->sb_start, 2884 rdev->sb_size, rdev->sb_page, 0); 2885 pr_debug("md: (write) %pg's sb offset: %llu\n", 2886 rdev->bdev, 2887 (unsigned long long)rdev->sb_start); 2888 rdev->sb_events = mddev->events; 2889 if (rdev->badblocks.size) { 2890 md_write_metadata(mddev, rdev, 2891 rdev->badblocks.sector, 2892 rdev->badblocks.size << 9, 2893 rdev->bb_page, 0); 2894 rdev->badblocks.size = 0; 2895 } 2896 2897 } else 2898 pr_debug("md: %pg (skipping faulty)\n", 2899 rdev->bdev); 2900 } 2901 if (md_super_wait(mddev) < 0) 2902 goto rewrite; 2903 /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ 2904 2905 if (mddev_is_clustered(mddev) && ret == 0) 2906 mddev->cluster_ops->metadata_update_finish(mddev); 2907 2908 if (mddev->in_sync != sync_req || 2909 !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2910 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN))) 2911 /* have to write it out again */ 2912 goto repeat; 2913 wake_up(&mddev->sb_wait); 2914 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2915 sysfs_notify_dirent_safe(mddev->sysfs_completed); 2916 2917 rdev_for_each(rdev, mddev) { 2918 if (test_and_clear_bit(FaultRecorded, &rdev->flags)) 2919 clear_bit(Blocked, &rdev->flags); 2920 2921 if (any_badblocks_changed) 2922 ack_all_badblocks(&rdev->badblocks); 2923 clear_bit(BlockedBadBlocks, &rdev->flags); 2924 wake_up(&rdev->blocked_wait); 2925 } 2926 } 2927 EXPORT_SYMBOL(md_update_sb); 2928 2929 static int add_bound_rdev(struct md_rdev *rdev) 2930 { 2931 struct mddev *mddev = rdev->mddev; 2932 int err = 0; 2933 bool add_journal = test_bit(Journal, &rdev->flags); 2934 2935 if (!mddev->pers->hot_remove_disk || add_journal) { 2936 /* If there is hot_add_disk but no hot_remove_disk 2937 * then added disks for geometry changes, 2938 * and should be added immediately. 
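		 * Journal devices take the same immediate-add path here
		 * (see add_journal above).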
2939 */ 2940 super_types[mddev->major_version]. 2941 validate_super(mddev, NULL/*freshest*/, rdev); 2942 err = mddev->pers->hot_add_disk(mddev, rdev); 2943 if (err) { 2944 md_kick_rdev_from_array(rdev); 2945 return err; 2946 } 2947 } 2948 sysfs_notify_dirent_safe(rdev->sysfs_state); 2949 2950 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2951 if (mddev->degraded) 2952 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 2953 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2954 md_new_event(); 2955 return 0; 2956 } 2957 2958 /* words written to sysfs files may, or may not, be \n terminated. 2959 * We want to accept with case. For this we use cmd_match. 2960 */ 2961 static int cmd_match(const char *cmd, const char *str) 2962 { 2963 /* See if cmd, written into a sysfs file, matches 2964 * str. They must either be the same, or cmd can 2965 * have a trailing newline 2966 */ 2967 while (*cmd && *str && *cmd == *str) { 2968 cmd++; 2969 str++; 2970 } 2971 if (*cmd == '\n') 2972 cmd++; 2973 if (*str || *cmd) 2974 return 0; 2975 return 1; 2976 } 2977 2978 struct rdev_sysfs_entry { 2979 struct attribute attr; 2980 ssize_t (*show)(struct md_rdev *, char *); 2981 ssize_t (*store)(struct md_rdev *, const char *, size_t); 2982 }; 2983 2984 static ssize_t 2985 state_show(struct md_rdev *rdev, char *page) 2986 { 2987 char *sep = ","; 2988 size_t len = 0; 2989 unsigned long flags = READ_ONCE(rdev->flags); 2990 2991 if (test_bit(Faulty, &flags) || 2992 (!test_bit(ExternalBbl, &flags) && 2993 rdev->badblocks.unacked_exist)) 2994 len += sprintf(page+len, "faulty%s", sep); 2995 if (test_bit(In_sync, &flags)) 2996 len += sprintf(page+len, "in_sync%s", sep); 2997 if (test_bit(Journal, &flags)) 2998 len += sprintf(page+len, "journal%s", sep); 2999 if (test_bit(WriteMostly, &flags)) 3000 len += sprintf(page+len, "write_mostly%s", sep); 3001 if (test_bit(Blocked, &flags) || 3002 (rdev->badblocks.unacked_exist 3003 && !test_bit(Faulty, &flags))) 3004 len += sprintf(page+len, "blocked%s", sep); 3005 if (!test_bit(Faulty, &flags) && 3006 !test_bit(Journal, &flags) && 3007 !test_bit(In_sync, &flags)) 3008 len += sprintf(page+len, "spare%s", sep); 3009 if (test_bit(WriteErrorSeen, &flags)) 3010 len += sprintf(page+len, "write_error%s", sep); 3011 if (test_bit(WantReplacement, &flags)) 3012 len += sprintf(page+len, "want_replacement%s", sep); 3013 if (test_bit(Replacement, &flags)) 3014 len += sprintf(page+len, "replacement%s", sep); 3015 if (test_bit(ExternalBbl, &flags)) 3016 len += sprintf(page+len, "external_bbl%s", sep); 3017 if (test_bit(FailFast, &flags)) 3018 len += sprintf(page+len, "failfast%s", sep); 3019 3020 if (len) 3021 len -= strlen(sep); 3022 3023 return len+sprintf(page+len, "\n"); 3024 } 3025 3026 static ssize_t 3027 state_store(struct md_rdev *rdev, const char *buf, size_t len) 3028 { 3029 /* can write 3030 * faulty - simulates an error 3031 * remove - disconnects the device 3032 * writemostly - sets write_mostly 3033 * -writemostly - clears write_mostly 3034 * blocked - sets the Blocked flags 3035 * -blocked - clears the Blocked and possibly simulates an error 3036 * insync - sets Insync providing device isn't active 3037 * -insync - clear Insync for a device with a slot assigned, 3038 * so that it gets rebuilt based on bitmap 3039 * write_error - sets WriteErrorSeen 3040 * -write_error - clears WriteErrorSeen 3041 * {,-}failfast - set/clear FailFast 3042 */ 3043 3044 struct mddev *mddev = rdev->mddev; 3045 int err = -EINVAL; 3046 bool need_update_sb = false; 3047 3048 if (cmd_match(buf, "faulty") && 
rdev->mddev->pers) { 3049 md_error(rdev->mddev, rdev); 3050 3051 if (test_bit(MD_BROKEN, &rdev->mddev->flags)) 3052 err = -EBUSY; 3053 else 3054 err = 0; 3055 } else if (cmd_match(buf, "remove")) { 3056 if (rdev->mddev->pers) { 3057 clear_bit(Blocked, &rdev->flags); 3058 remove_and_add_spares(rdev->mddev, rdev); 3059 } 3060 if (rdev->raid_disk >= 0) 3061 err = -EBUSY; 3062 else { 3063 err = 0; 3064 if (mddev_is_clustered(mddev)) 3065 err = mddev->cluster_ops->remove_disk(mddev, rdev); 3066 3067 if (err == 0) { 3068 md_kick_rdev_from_array(rdev); 3069 if (mddev->pers) 3070 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 3071 md_new_event(); 3072 } 3073 } 3074 } else if (cmd_match(buf, "writemostly")) { 3075 set_bit(WriteMostly, &rdev->flags); 3076 mddev_create_serial_pool(rdev->mddev, rdev); 3077 need_update_sb = true; 3078 err = 0; 3079 } else if (cmd_match(buf, "-writemostly")) { 3080 mddev_destroy_serial_pool(rdev->mddev, rdev); 3081 clear_bit(WriteMostly, &rdev->flags); 3082 need_update_sb = true; 3083 err = 0; 3084 } else if (cmd_match(buf, "blocked")) { 3085 set_bit(Blocked, &rdev->flags); 3086 err = 0; 3087 } else if (cmd_match(buf, "-blocked")) { 3088 if (!test_bit(Faulty, &rdev->flags) && 3089 !test_bit(ExternalBbl, &rdev->flags) && 3090 rdev->badblocks.unacked_exist) { 3091 /* metadata handler doesn't understand badblocks, 3092 * so we need to fail the device 3093 */ 3094 md_error(rdev->mddev, rdev); 3095 } 3096 clear_bit(Blocked, &rdev->flags); 3097 clear_bit(BlockedBadBlocks, &rdev->flags); 3098 wake_up(&rdev->blocked_wait); 3099 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3100 3101 err = 0; 3102 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 3103 set_bit(In_sync, &rdev->flags); 3104 err = 0; 3105 } else if (cmd_match(buf, "failfast")) { 3106 set_bit(FailFast, &rdev->flags); 3107 need_update_sb = true; 3108 err = 0; 3109 } else if (cmd_match(buf, "-failfast")) { 3110 clear_bit(FailFast, &rdev->flags); 3111 need_update_sb = true; 3112 err = 0; 3113 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && 3114 !test_bit(Journal, &rdev->flags)) { 3115 if (rdev->mddev->pers == NULL) { 3116 clear_bit(In_sync, &rdev->flags); 3117 rdev->saved_raid_disk = rdev->raid_disk; 3118 rdev->raid_disk = -1; 3119 err = 0; 3120 } 3121 } else if (cmd_match(buf, "write_error")) { 3122 set_bit(WriteErrorSeen, &rdev->flags); 3123 err = 0; 3124 } else if (cmd_match(buf, "-write_error")) { 3125 clear_bit(WriteErrorSeen, &rdev->flags); 3126 err = 0; 3127 } else if (cmd_match(buf, "want_replacement")) { 3128 /* Any non-spare device that is not a replacement can 3129 * become want_replacement at any time, but we then need to 3130 * check if recovery is needed. 3131 */ 3132 if (rdev->raid_disk >= 0 && 3133 !test_bit(Journal, &rdev->flags) && 3134 !test_bit(Replacement, &rdev->flags)) 3135 set_bit(WantReplacement, &rdev->flags); 3136 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3137 err = 0; 3138 } else if (cmd_match(buf, "-want_replacement")) { 3139 /* Clearing 'want_replacement' is always allowed. 3140 * Once replacements starts it is too late though. 3141 */ 3142 err = 0; 3143 clear_bit(WantReplacement, &rdev->flags); 3144 } else if (cmd_match(buf, "replacement")) { 3145 /* Can only set a device as a replacement when array has not 3146 * yet been started. Once running, replacement is automatic 3147 * from spares, or by assigning 'slot'. 
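	 * (A Replacement device is rebuilt alongside the slot it shadows
	 * and takes over once its recovery completes.)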
3148 */ 3149 if (rdev->mddev->pers) 3150 err = -EBUSY; 3151 else { 3152 set_bit(Replacement, &rdev->flags); 3153 err = 0; 3154 } 3155 } else if (cmd_match(buf, "-replacement")) { 3156 /* Similarly, can only clear Replacement before start */ 3157 if (rdev->mddev->pers) 3158 err = -EBUSY; 3159 else { 3160 clear_bit(Replacement, &rdev->flags); 3161 err = 0; 3162 } 3163 } else if (cmd_match(buf, "re-add")) { 3164 if (!rdev->mddev->pers) 3165 err = -EINVAL; 3166 else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) && 3167 rdev->saved_raid_disk >= 0) { 3168 /* clear_bit is performed _after_ all the devices 3169 * have their local Faulty bit cleared. If any writes 3170 * happen in the meantime in the local node, they 3171 * will land in the local bitmap, which will be synced 3172 * by this node eventually 3173 */ 3174 if (!mddev_is_clustered(rdev->mddev) || 3175 (err = mddev->cluster_ops->gather_bitmaps(rdev)) == 0) { 3176 clear_bit(Faulty, &rdev->flags); 3177 err = add_bound_rdev(rdev); 3178 } 3179 } else 3180 err = -EBUSY; 3181 } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) { 3182 set_bit(ExternalBbl, &rdev->flags); 3183 rdev->badblocks.shift = 0; 3184 err = 0; 3185 } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) { 3186 clear_bit(ExternalBbl, &rdev->flags); 3187 err = 0; 3188 } 3189 if (need_update_sb) 3190 md_update_sb(mddev, 1); 3191 if (!err) 3192 sysfs_notify_dirent_safe(rdev->sysfs_state); 3193 return err ? err : len; 3194 } 3195 static struct rdev_sysfs_entry rdev_state = 3196 __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store); 3197 3198 static ssize_t 3199 errors_show(struct md_rdev *rdev, char *page) 3200 { 3201 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 3202 } 3203 3204 static ssize_t 3205 errors_store(struct md_rdev *rdev, const char *buf, size_t len) 3206 { 3207 unsigned int n; 3208 int rv; 3209 3210 rv = kstrtouint(buf, 10, &n); 3211 if (rv < 0) 3212 return rv; 3213 atomic_set(&rdev->corrected_errors, n); 3214 return len; 3215 } 3216 static struct rdev_sysfs_entry rdev_errors = 3217 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 3218 3219 static ssize_t 3220 slot_show(struct md_rdev *rdev, char *page) 3221 { 3222 if (test_bit(Journal, &rdev->flags)) 3223 return sprintf(page, "journal\n"); 3224 else if (rdev->raid_disk < 0) 3225 return sprintf(page, "none\n"); 3226 else 3227 return sprintf(page, "%d\n", rdev->raid_disk); 3228 } 3229 3230 static ssize_t 3231 slot_store(struct md_rdev *rdev, const char *buf, size_t len) 3232 { 3233 int slot; 3234 int err; 3235 3236 if (test_bit(Journal, &rdev->flags)) 3237 return -EBUSY; 3238 if (strncmp(buf, "none", 4)==0) 3239 slot = -1; 3240 else { 3241 err = kstrtouint(buf, 10, (unsigned int *)&slot); 3242 if (err < 0) 3243 return err; 3244 if (slot < 0) 3245 /* overflow */ 3246 return -ENOSPC; 3247 } 3248 if (rdev->mddev->pers && slot == -1) { 3249 /* Setting 'slot' on an active array requires also 3250 * updating the 'rd%d' link, and communicating 3251 * with the personality with ->hot_*_disk. 3252 * For now we only support removing 3253 * failed/spare devices. This normally happens automatically, 3254 * but not when the metadata is externally managed. 
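		 * Illustrative example (externally managed metadata): user
		 * space writes "none" to /sys/block/mdX/md/dev-<name>/slot
		 * to detach a failed or spare device from its slot before
		 * removing it.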
3255 */ 3256 if (rdev->raid_disk == -1) 3257 return -EEXIST; 3258 /* personality does all needed checks */ 3259 if (rdev->mddev->pers->hot_remove_disk == NULL) 3260 return -EINVAL; 3261 clear_bit(Blocked, &rdev->flags); 3262 remove_and_add_spares(rdev->mddev, rdev); 3263 if (rdev->raid_disk >= 0) 3264 return -EBUSY; 3265 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3266 } else if (rdev->mddev->pers) { 3267 /* Activating a spare .. or possibly reactivating 3268 * if we ever get bitmaps working here. 3269 */ 3270 int err; 3271 3272 if (rdev->raid_disk != -1) 3273 return -EBUSY; 3274 3275 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) 3276 return -EBUSY; 3277 3278 if (rdev->mddev->pers->hot_add_disk == NULL) 3279 return -EINVAL; 3280 3281 if (slot >= rdev->mddev->raid_disks && 3282 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 3283 return -ENOSPC; 3284 3285 rdev->raid_disk = slot; 3286 if (test_bit(In_sync, &rdev->flags)) 3287 rdev->saved_raid_disk = slot; 3288 else 3289 rdev->saved_raid_disk = -1; 3290 clear_bit(In_sync, &rdev->flags); 3291 clear_bit(Bitmap_sync, &rdev->flags); 3292 err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev); 3293 if (err) { 3294 rdev->raid_disk = -1; 3295 return err; 3296 } else 3297 sysfs_notify_dirent_safe(rdev->sysfs_state); 3298 /* failure here is OK */; 3299 sysfs_link_rdev(rdev->mddev, rdev); 3300 /* don't wakeup anyone, leave that to userspace. */ 3301 } else { 3302 if (slot >= rdev->mddev->raid_disks && 3303 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 3304 return -ENOSPC; 3305 rdev->raid_disk = slot; 3306 /* assume it is working */ 3307 clear_bit(Faulty, &rdev->flags); 3308 clear_bit(WriteMostly, &rdev->flags); 3309 set_bit(In_sync, &rdev->flags); 3310 sysfs_notify_dirent_safe(rdev->sysfs_state); 3311 } 3312 return len; 3313 } 3314 3315 static struct rdev_sysfs_entry rdev_slot = 3316 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 3317 3318 static ssize_t 3319 offset_show(struct md_rdev *rdev, char *page) 3320 { 3321 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 3322 } 3323 3324 static ssize_t 3325 offset_store(struct md_rdev *rdev, const char *buf, size_t len) 3326 { 3327 unsigned long long offset; 3328 if (kstrtoull(buf, 10, &offset) < 0) 3329 return -EINVAL; 3330 if (rdev->mddev->pers && rdev->raid_disk >= 0) 3331 return -EBUSY; 3332 if (rdev->sectors && rdev->mddev->external) 3333 /* Must set offset before size, so overlap checks 3334 * can be sane */ 3335 return -EBUSY; 3336 rdev->data_offset = offset; 3337 rdev->new_data_offset = offset; 3338 return len; 3339 } 3340 3341 static struct rdev_sysfs_entry rdev_offset = 3342 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 3343 3344 static ssize_t new_offset_show(struct md_rdev *rdev, char *page) 3345 { 3346 return sprintf(page, "%llu\n", 3347 (unsigned long long)rdev->new_data_offset); 3348 } 3349 3350 static ssize_t new_offset_store(struct md_rdev *rdev, 3351 const char *buf, size_t len) 3352 { 3353 unsigned long long new_offset; 3354 struct mddev *mddev = rdev->mddev; 3355 3356 if (kstrtoull(buf, 10, &new_offset) < 0) 3357 return -EINVAL; 3358 3359 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3360 return -EBUSY; 3361 if (new_offset == rdev->data_offset) 3362 /* reset is always permitted */ 3363 ; 3364 else if (new_offset > rdev->data_offset) { 3365 /* must not push array size beyond rdev_sectors */ 3366 if (new_offset - rdev->data_offset 3367 + mddev->dev_sectors > rdev->sectors) 3368 return -E2BIG; 
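		/*
		 * Worked example (illustrative numbers): with data_offset
		 * 2048, dev_sectors 1000000 and rdev->sectors 1002048, any
		 * new_offset above 4096 no longer leaves room for the data
		 * and is refused by the check above.
		 */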
3369 } 3370 /* Metadata worries about other space details. */ 3371 3372 /* decreasing the offset is inconsistent with a backwards 3373 * reshape. 3374 */ 3375 if (new_offset < rdev->data_offset && 3376 mddev->reshape_backwards) 3377 return -EINVAL; 3378 /* Increasing offset is inconsistent with forwards 3379 * reshape. reshape_direction should be set to 3380 * 'backwards' first. 3381 */ 3382 if (new_offset > rdev->data_offset && 3383 !mddev->reshape_backwards) 3384 return -EINVAL; 3385 3386 if (mddev->pers && mddev->persistent && 3387 !super_types[mddev->major_version] 3388 .allow_new_offset(rdev, new_offset)) 3389 return -E2BIG; 3390 rdev->new_data_offset = new_offset; 3391 if (new_offset > rdev->data_offset) 3392 mddev->reshape_backwards = 1; 3393 else if (new_offset < rdev->data_offset) 3394 mddev->reshape_backwards = 0; 3395 3396 return len; 3397 } 3398 static struct rdev_sysfs_entry rdev_new_offset = 3399 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); 3400 3401 static ssize_t 3402 rdev_size_show(struct md_rdev *rdev, char *page) 3403 { 3404 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); 3405 } 3406 3407 static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b) 3408 { 3409 /* check if two start/length pairs overlap */ 3410 if (a->data_offset + a->sectors <= b->data_offset) 3411 return false; 3412 if (b->data_offset + b->sectors <= a->data_offset) 3413 return false; 3414 return true; 3415 } 3416 3417 static bool md_rdev_overlaps(struct md_rdev *rdev) 3418 { 3419 struct mddev *mddev; 3420 struct md_rdev *rdev2; 3421 3422 spin_lock(&all_mddevs_lock); 3423 list_for_each_entry(mddev, &all_mddevs, all_mddevs) { 3424 if (test_bit(MD_DELETED, &mddev->flags)) 3425 continue; 3426 rdev_for_each(rdev2, mddev) { 3427 if (rdev != rdev2 && rdev->bdev == rdev2->bdev && 3428 md_rdevs_overlap(rdev, rdev2)) { 3429 spin_unlock(&all_mddevs_lock); 3430 return true; 3431 } 3432 } 3433 } 3434 spin_unlock(&all_mddevs_lock); 3435 return false; 3436 } 3437 3438 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) 3439 { 3440 unsigned long long blocks; 3441 sector_t new; 3442 3443 if (kstrtoull(buf, 10, &blocks) < 0) 3444 return -EINVAL; 3445 3446 if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) 3447 return -EINVAL; /* sector conversion overflow */ 3448 3449 new = blocks * 2; 3450 if (new != blocks * 2) 3451 return -EINVAL; /* unsigned long long to sector_t overflow */ 3452 3453 *sectors = new; 3454 return 0; 3455 } 3456 3457 static ssize_t 3458 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) 3459 { 3460 struct mddev *my_mddev = rdev->mddev; 3461 sector_t oldsectors = rdev->sectors; 3462 sector_t sectors; 3463 3464 if (test_bit(Journal, &rdev->flags)) 3465 return -EBUSY; 3466 if (strict_blocks_to_sectors(buf, §ors) < 0) 3467 return -EINVAL; 3468 if (rdev->data_offset != rdev->new_data_offset) 3469 return -EINVAL; /* too confusing */ 3470 if (my_mddev->pers && rdev->raid_disk >= 0) { 3471 if (my_mddev->persistent) { 3472 sectors = super_types[my_mddev->major_version]. 
3473 rdev_size_change(rdev, sectors); 3474 if (!sectors) 3475 return -EBUSY; 3476 } else if (!sectors) 3477 sectors = bdev_nr_sectors(rdev->bdev) - 3478 rdev->data_offset; 3479 if (!my_mddev->pers->resize) 3480 /* Cannot change size for RAID0 or Linear etc */ 3481 return -EINVAL; 3482 } 3483 if (sectors < my_mddev->dev_sectors) 3484 return -EINVAL; /* component must fit device */ 3485 3486 rdev->sectors = sectors; 3487 3488 /* 3489 * Check that all other rdevs with the same bdev do not overlap. This 3490 * check does not provide a hard guarantee, it just helps avoid 3491 * dangerous mistakes. 3492 */ 3493 if (sectors > oldsectors && my_mddev->external && 3494 md_rdev_overlaps(rdev)) { 3495 /* 3496 * Someone else could have slipped in a size change here, but 3497 * doing so is just silly. We put oldsectors back because we 3498 * know it is safe, and trust userspace not to race with itself. 3499 */ 3500 rdev->sectors = oldsectors; 3501 return -EBUSY; 3502 } 3503 return len; 3504 } 3505 3506 static struct rdev_sysfs_entry rdev_size = 3507 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 3508 3509 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) 3510 { 3511 unsigned long long recovery_start = rdev->recovery_offset; 3512 3513 if (test_bit(In_sync, &rdev->flags) || 3514 recovery_start == MaxSector) 3515 return sprintf(page, "none\n"); 3516 3517 return sprintf(page, "%llu\n", recovery_start); 3518 } 3519 3520 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) 3521 { 3522 unsigned long long recovery_start; 3523 3524 if (cmd_match(buf, "none")) 3525 recovery_start = MaxSector; 3526 else if (kstrtoull(buf, 10, &recovery_start)) 3527 return -EINVAL; 3528 3529 if (rdev->mddev->pers && 3530 rdev->raid_disk >= 0) 3531 return -EBUSY; 3532 3533 rdev->recovery_offset = recovery_start; 3534 if (recovery_start == MaxSector) 3535 set_bit(In_sync, &rdev->flags); 3536 else 3537 clear_bit(In_sync, &rdev->flags); 3538 return len; 3539 } 3540 3541 static struct rdev_sysfs_entry rdev_recovery_start = 3542 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); 3543 3544 /* sysfs access to bad-blocks list. 3545 * We present two files. 3546 * 'bad-blocks' lists sector numbers and lengths of ranges that 3547 * are recorded as bad. The list is truncated to fit within 3548 * the one-page limit of sysfs. 3549 * Writing "sector length" to this file adds an acknowledged 3550 * bad block list. 3551 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet 3552 * been acknowledged. Writing to this file adds bad blocks 3553 * without acknowledging them. This is largely for testing. 
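 *
 * Illustrative usage from user space:
 *   echo "2048 8" > /sys/block/mdX/md/dev-<name>/bad_blocks
 * records an acknowledged bad range of 8 sectors starting at sector
 * 2048 on that member device.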
3554 */ 3555 static ssize_t bb_show(struct md_rdev *rdev, char *page) 3556 { 3557 return badblocks_show(&rdev->badblocks, page, 0); 3558 } 3559 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) 3560 { 3561 int rv = badblocks_store(&rdev->badblocks, page, len, 0); 3562 /* Maybe that ack was all we needed */ 3563 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) 3564 wake_up(&rdev->blocked_wait); 3565 return rv; 3566 } 3567 static struct rdev_sysfs_entry rdev_bad_blocks = 3568 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); 3569 3570 static ssize_t ubb_show(struct md_rdev *rdev, char *page) 3571 { 3572 return badblocks_show(&rdev->badblocks, page, 1); 3573 } 3574 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) 3575 { 3576 return badblocks_store(&rdev->badblocks, page, len, 1); 3577 } 3578 static struct rdev_sysfs_entry rdev_unack_bad_blocks = 3579 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); 3580 3581 static ssize_t 3582 ppl_sector_show(struct md_rdev *rdev, char *page) 3583 { 3584 return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector); 3585 } 3586 3587 static ssize_t 3588 ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len) 3589 { 3590 unsigned long long sector; 3591 3592 if (kstrtoull(buf, 10, §or) < 0) 3593 return -EINVAL; 3594 if (sector != (sector_t)sector) 3595 return -EINVAL; 3596 3597 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && 3598 rdev->raid_disk >= 0) 3599 return -EBUSY; 3600 3601 if (rdev->mddev->persistent) { 3602 if (rdev->mddev->major_version == 0) 3603 return -EINVAL; 3604 if ((sector > rdev->sb_start && 3605 sector - rdev->sb_start > S16_MAX) || 3606 (sector < rdev->sb_start && 3607 rdev->sb_start - sector > -S16_MIN)) 3608 return -EINVAL; 3609 rdev->ppl.offset = sector - rdev->sb_start; 3610 } else if (!rdev->mddev->external) { 3611 return -EBUSY; 3612 } 3613 rdev->ppl.sector = sector; 3614 return len; 3615 } 3616 3617 static struct rdev_sysfs_entry rdev_ppl_sector = 3618 __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store); 3619 3620 static ssize_t 3621 ppl_size_show(struct md_rdev *rdev, char *page) 3622 { 3623 return sprintf(page, "%u\n", rdev->ppl.size); 3624 } 3625 3626 static ssize_t 3627 ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len) 3628 { 3629 unsigned int size; 3630 3631 if (kstrtouint(buf, 10, &size) < 0) 3632 return -EINVAL; 3633 3634 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && 3635 rdev->raid_disk >= 0) 3636 return -EBUSY; 3637 3638 if (rdev->mddev->persistent) { 3639 if (rdev->mddev->major_version == 0) 3640 return -EINVAL; 3641 if (size > U16_MAX) 3642 return -EINVAL; 3643 } else if (!rdev->mddev->external) { 3644 return -EBUSY; 3645 } 3646 rdev->ppl.size = size; 3647 return len; 3648 } 3649 3650 static struct rdev_sysfs_entry rdev_ppl_size = 3651 __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store); 3652 3653 static struct attribute *rdev_default_attrs[] = { 3654 &rdev_state.attr, 3655 &rdev_errors.attr, 3656 &rdev_slot.attr, 3657 &rdev_offset.attr, 3658 &rdev_new_offset.attr, 3659 &rdev_size.attr, 3660 &rdev_recovery_start.attr, 3661 &rdev_bad_blocks.attr, 3662 &rdev_unack_bad_blocks.attr, 3663 &rdev_ppl_sector.attr, 3664 &rdev_ppl_size.attr, 3665 NULL, 3666 }; 3667 ATTRIBUTE_GROUPS(rdev_default); 3668 static ssize_t 3669 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3670 { 3671 struct rdev_sysfs_entry 
*entry = container_of(attr, struct rdev_sysfs_entry, attr); 3672 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3673 3674 if (!entry->show) 3675 return -EIO; 3676 if (!rdev->mddev) 3677 return -ENODEV; 3678 return entry->show(rdev, page); 3679 } 3680 3681 static ssize_t 3682 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 3683 const char *page, size_t length) 3684 { 3685 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3686 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3687 struct kernfs_node *kn = NULL; 3688 bool suspend = false; 3689 ssize_t rv; 3690 struct mddev *mddev = READ_ONCE(rdev->mddev); 3691 3692 if (!entry->store) 3693 return -EIO; 3694 if (!capable(CAP_SYS_ADMIN)) 3695 return -EACCES; 3696 if (!mddev) 3697 return -ENODEV; 3698 3699 if (entry->store == state_store) { 3700 if (cmd_match(page, "remove")) 3701 kn = sysfs_break_active_protection(kobj, attr); 3702 if (cmd_match(page, "remove") || cmd_match(page, "re-add") || 3703 cmd_match(page, "writemostly") || 3704 cmd_match(page, "-writemostly")) 3705 suspend = true; 3706 } 3707 3708 rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev); 3709 if (!rv) { 3710 if (rdev->mddev == NULL) 3711 rv = -ENODEV; 3712 else 3713 rv = entry->store(rdev, page, length); 3714 suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); 3715 } 3716 3717 if (kn) 3718 sysfs_unbreak_active_protection(kn); 3719 3720 return rv; 3721 } 3722 3723 static void rdev_free(struct kobject *ko) 3724 { 3725 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); 3726 kfree(rdev); 3727 } 3728 static const struct sysfs_ops rdev_sysfs_ops = { 3729 .show = rdev_attr_show, 3730 .store = rdev_attr_store, 3731 }; 3732 static const struct kobj_type rdev_ktype = { 3733 .release = rdev_free, 3734 .sysfs_ops = &rdev_sysfs_ops, 3735 .default_groups = rdev_default_groups, 3736 }; 3737 3738 int md_rdev_init(struct md_rdev *rdev) 3739 { 3740 rdev->desc_nr = -1; 3741 rdev->saved_raid_disk = -1; 3742 rdev->raid_disk = -1; 3743 rdev->flags = 0; 3744 rdev->data_offset = 0; 3745 rdev->new_data_offset = 0; 3746 rdev->sb_events = 0; 3747 rdev->last_read_error = 0; 3748 rdev->sb_loaded = 0; 3749 rdev->bb_page = NULL; 3750 atomic_set(&rdev->nr_pending, 0); 3751 atomic_set(&rdev->read_errors, 0); 3752 atomic_set(&rdev->corrected_errors, 0); 3753 3754 INIT_LIST_HEAD(&rdev->same_set); 3755 init_waitqueue_head(&rdev->blocked_wait); 3756 3757 /* Add space to store bad block list. 3758 * This reserves the space even on arrays where it cannot 3759 * be used - I wonder if that matters 3760 */ 3761 return badblocks_init(&rdev->badblocks, 0); 3762 } 3763 EXPORT_SYMBOL_GPL(md_rdev_init); 3764 3765 /* 3766 * Import a device. If 'super_format' >= 0, then sanity check the superblock 3767 * 3768 * mark the device faulty if: 3769 * 3770 * - the device is nonexistent (zero size) 3771 * - the device has no valid superblock 3772 * 3773 * a faulty rdev _never_ has rdev->sb set. 
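 *
 * On success a freshly allocated rdev with the device held open is
 * returned; on failure everything is unwound and an ERR_PTR() is
 * returned instead.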
3774 */ 3775 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) 3776 { 3777 struct md_rdev *rdev; 3778 sector_t size; 3779 int err; 3780 3781 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 3782 if (!rdev) 3783 return ERR_PTR(-ENOMEM); 3784 3785 err = md_rdev_init(rdev); 3786 if (err) 3787 goto out_free_rdev; 3788 err = alloc_disk_sb(rdev); 3789 if (err) 3790 goto out_clear_rdev; 3791 3792 rdev->bdev_file = bdev_file_open_by_dev(newdev, 3793 BLK_OPEN_READ | BLK_OPEN_WRITE, 3794 super_format == -2 ? &claim_rdev : rdev, NULL); 3795 if (IS_ERR(rdev->bdev_file)) { 3796 pr_warn("md: could not open device unknown-block(%u,%u).\n", 3797 MAJOR(newdev), MINOR(newdev)); 3798 err = PTR_ERR(rdev->bdev_file); 3799 goto out_clear_rdev; 3800 } 3801 rdev->bdev = file_bdev(rdev->bdev_file); 3802 3803 kobject_init(&rdev->kobj, &rdev_ktype); 3804 3805 size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS; 3806 if (!size) { 3807 pr_warn("md: %pg has zero or unknown size, marking faulty!\n", 3808 rdev->bdev); 3809 err = -EINVAL; 3810 goto out_blkdev_put; 3811 } 3812 3813 if (super_format >= 0) { 3814 err = super_types[super_format]. 3815 load_super(rdev, NULL, super_minor); 3816 if (err == -EINVAL) { 3817 pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n", 3818 rdev->bdev, 3819 super_format, super_minor); 3820 goto out_blkdev_put; 3821 } 3822 if (err < 0) { 3823 pr_warn("md: could not read %pg's sb, not importing!\n", 3824 rdev->bdev); 3825 goto out_blkdev_put; 3826 } 3827 } 3828 3829 return rdev; 3830 3831 out_blkdev_put: 3832 fput(rdev->bdev_file); 3833 out_clear_rdev: 3834 md_rdev_clear(rdev); 3835 out_free_rdev: 3836 kfree(rdev); 3837 return ERR_PTR(err); 3838 } 3839 3840 /* 3841 * Check a full RAID array for plausibility 3842 */ 3843 3844 static int analyze_sbs(struct mddev *mddev) 3845 { 3846 int i; 3847 struct md_rdev *rdev, *freshest, *tmp; 3848 3849 freshest = NULL; 3850 rdev_for_each_safe(rdev, tmp, mddev) 3851 switch (super_types[mddev->major_version]. 3852 load_super(rdev, freshest, mddev->minor_version)) { 3853 case 1: 3854 freshest = rdev; 3855 break; 3856 case 0: 3857 break; 3858 default: 3859 pr_warn("md: fatal superblock inconsistency in %pg -- removing from array\n", 3860 rdev->bdev); 3861 md_kick_rdev_from_array(rdev); 3862 } 3863 3864 /* Cannot find a valid fresh disk */ 3865 if (!freshest) { 3866 pr_warn("md: cannot find a valid disk\n"); 3867 return -EINVAL; 3868 } 3869 3870 super_types[mddev->major_version]. 3871 validate_super(mddev, NULL/*freshest*/, freshest); 3872 3873 i = 0; 3874 rdev_for_each_safe(rdev, tmp, mddev) { 3875 if (mddev->max_disks && 3876 (rdev->desc_nr >= mddev->max_disks || 3877 i > mddev->max_disks)) { 3878 pr_warn("md: %s: %pg: only %d devices permitted\n", 3879 mdname(mddev), rdev->bdev, 3880 mddev->max_disks); 3881 md_kick_rdev_from_array(rdev); 3882 continue; 3883 } 3884 if (rdev != freshest) { 3885 if (super_types[mddev->major_version]. 3886 validate_super(mddev, freshest, rdev)) { 3887 pr_warn("md: kicking non-fresh %pg from array!\n", 3888 rdev->bdev); 3889 md_kick_rdev_from_array(rdev); 3890 continue; 3891 } 3892 } 3893 if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks)) && 3894 !test_bit(Journal, &rdev->flags)) { 3895 rdev->raid_disk = -1; 3896 clear_bit(In_sync, &rdev->flags); 3897 } 3898 } 3899 3900 return 0; 3901 } 3902 3903 /* Read a fixed-point number. 3904 * Numbers in sysfs attributes should be in "standard" units where 3905 * possible, so time should be in seconds. 
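 * (For example, safe_mode_delay below accepts a value such as "0.20", i.e.
 * 0.20 seconds; parsing it with scale == 3 yields 200.)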
3906 * However we internally use a much smaller unit such as 3907 * milliseconds or jiffies. 3908 * This function takes a decimal number with a possible fractional 3909 * component, and produces an integer which is the result of 3910 * multiplying that number by 10^'scale', 3911 * all without any floating-point arithmetic. 3912 */ 3913 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) 3914 { 3915 unsigned long result = 0; 3916 long decimals = -1; 3917 while (isdigit(*cp) || (*cp == '.' && decimals < 0)) { 3918 if (*cp == '.') 3919 decimals = 0; 3920 else if (decimals < scale) { 3921 unsigned int value; 3922 value = *cp - '0'; 3923 result = result * 10 + value; 3924 if (decimals >= 0) 3925 decimals++; 3926 } 3927 cp++; 3928 } 3929 if (*cp == '\n') 3930 cp++; 3931 if (*cp) 3932 return -EINVAL; 3933 if (decimals < 0) 3934 decimals = 0; 3935 *res = result * int_pow(10, scale - decimals); 3936 return 0; 3937 } 3938 3939 static ssize_t 3940 safe_delay_show(struct mddev *mddev, char *page) 3941 { 3942 unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ; 3943 3944 return sprintf(page, "%u.%03u\n", msec/1000, msec%1000); 3945 } 3946 static ssize_t 3947 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) 3948 { 3949 unsigned long msec; 3950 3951 if (mddev_is_clustered(mddev)) { 3952 pr_warn("md: Safemode is disabled for clustered mode\n"); 3953 return -EINVAL; 3954 } 3955 3956 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ) 3957 return -EINVAL; 3958 if (msec == 0) 3959 mddev->safemode_delay = 0; 3960 else { 3961 unsigned long old_delay = mddev->safemode_delay; 3962 unsigned long new_delay = (msec*HZ)/1000; 3963 3964 if (new_delay == 0) 3965 new_delay = 1; 3966 mddev->safemode_delay = new_delay; 3967 if (new_delay < old_delay || old_delay == 0) 3968 mod_timer(&mddev->safemode_timer, jiffies+1); 3969 } 3970 return len; 3971 } 3972 static struct md_sysfs_entry md_safe_delay = 3973 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 3974 3975 static ssize_t 3976 level_show(struct mddev *mddev, char *page) 3977 { 3978 struct md_personality *p; 3979 int ret; 3980 spin_lock(&mddev->lock); 3981 p = mddev->pers; 3982 if (p) 3983 ret = sprintf(page, "%s\n", p->head.name); 3984 else if (mddev->clevel[0]) 3985 ret = sprintf(page, "%s\n", mddev->clevel); 3986 else if (mddev->level != LEVEL_NONE) 3987 ret = sprintf(page, "%d\n", mddev->level); 3988 else 3989 ret = 0; 3990 spin_unlock(&mddev->lock); 3991 return ret; 3992 } 3993 3994 static ssize_t 3995 level_store(struct mddev *mddev, const char *buf, size_t len) 3996 { 3997 char clevel[16]; 3998 ssize_t rv; 3999 size_t slen = len; 4000 struct md_personality *pers, *oldpers; 4001 long level; 4002 void *priv, *oldpriv; 4003 struct md_rdev *rdev; 4004 4005 if (slen == 0 || slen >= sizeof(clevel)) 4006 return -EINVAL; 4007 4008 rv = mddev_suspend_and_lock(mddev); 4009 if (rv) 4010 return rv; 4011 4012 if (mddev->pers == NULL) { 4013 memcpy(mddev->clevel, buf, slen); 4014 if (mddev->clevel[slen-1] == '\n') 4015 slen--; 4016 mddev->clevel[slen] = 0; 4017 mddev->level = LEVEL_NONE; 4018 rv = len; 4019 goto out_unlock; 4020 } 4021 rv = -EROFS; 4022 if (!md_is_rdwr(mddev)) 4023 goto out_unlock; 4024 4025 /* request to change the personality. Need to ensure: 4026 * - array is not engaged in resync/recovery/reshape 4027 * - old personality can be suspended 4028 * - new personality will access other array.
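 * (Such a request arrives when a new personality name, e.g. "raid5", is
 * written to this array's "level" sysfs attribute; the example value is
 * illustrative.)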
4029 */ 4030 4031 rv = -EBUSY; 4032 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 4033 mddev->reshape_position != MaxSector || 4034 mddev->sysfs_active) 4035 goto out_unlock; 4036 4037 rv = -EINVAL; 4038 if (!mddev->pers->quiesce) { 4039 pr_warn("md: %s: %s does not support online personality change\n", 4040 mdname(mddev), mddev->pers->head.name); 4041 goto out_unlock; 4042 } 4043 4044 /* Now find the new personality */ 4045 memcpy(clevel, buf, slen); 4046 if (clevel[slen-1] == '\n') 4047 slen--; 4048 clevel[slen] = 0; 4049 if (kstrtol(clevel, 10, &level)) 4050 level = LEVEL_NONE; 4051 4052 if (request_module("md-%s", clevel) != 0) 4053 request_module("md-level-%s", clevel); 4054 pers = get_pers(level, clevel); 4055 if (!pers) { 4056 rv = -EINVAL; 4057 goto out_unlock; 4058 } 4059 4060 if (pers == mddev->pers) { 4061 /* Nothing to do! */ 4062 put_pers(pers); 4063 rv = len; 4064 goto out_unlock; 4065 } 4066 if (!pers->takeover) { 4067 put_pers(pers); 4068 pr_warn("md: %s: %s does not support personality takeover\n", 4069 mdname(mddev), clevel); 4070 rv = -EINVAL; 4071 goto out_unlock; 4072 } 4073 4074 rdev_for_each(rdev, mddev) 4075 rdev->new_raid_disk = rdev->raid_disk; 4076 4077 /* ->takeover must set new_* and/or delta_disks 4078 * if it succeeds, and may set them when it fails. 4079 */ 4080 priv = pers->takeover(mddev); 4081 if (IS_ERR(priv)) { 4082 mddev->new_level = mddev->level; 4083 mddev->new_layout = mddev->layout; 4084 mddev->new_chunk_sectors = mddev->chunk_sectors; 4085 mddev->raid_disks -= mddev->delta_disks; 4086 mddev->delta_disks = 0; 4087 mddev->reshape_backwards = 0; 4088 put_pers(pers); 4089 pr_warn("md: %s: %s would not accept array\n", 4090 mdname(mddev), clevel); 4091 rv = PTR_ERR(priv); 4092 goto out_unlock; 4093 } 4094 4095 /* Looks like we have a winner */ 4096 mddev_detach(mddev); 4097 4098 spin_lock(&mddev->lock); 4099 oldpers = mddev->pers; 4100 oldpriv = mddev->private; 4101 mddev->pers = pers; 4102 mddev->private = priv; 4103 strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel)); 4104 mddev->level = mddev->new_level; 4105 mddev->layout = mddev->new_layout; 4106 mddev->chunk_sectors = mddev->new_chunk_sectors; 4107 mddev->delta_disks = 0; 4108 mddev->reshape_backwards = 0; 4109 mddev->degraded = 0; 4110 spin_unlock(&mddev->lock); 4111 4112 if (oldpers->sync_request == NULL && 4113 mddev->external) { 4114 /* We are converting from a no-redundancy array 4115 * to a redundancy array and metadata is managed 4116 * externally so we need to be sure that writes 4117 * won't block due to a need to transition 4118 * clean->dirty 4119 * until external management is started. 
4120 */ 4121 mddev->in_sync = 0; 4122 mddev->safemode_delay = 0; 4123 mddev->safemode = 0; 4124 } 4125 4126 oldpers->free(mddev, oldpriv); 4127 4128 if (oldpers->sync_request == NULL && 4129 pers->sync_request != NULL) { 4130 /* need to add the md_redundancy_group */ 4131 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 4132 pr_warn("md: cannot register extra attributes for %s\n", 4133 mdname(mddev)); 4134 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 4135 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); 4136 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); 4137 } 4138 if (oldpers->sync_request != NULL && 4139 pers->sync_request == NULL) { 4140 /* need to remove the md_redundancy_group */ 4141 if (mddev->to_remove == NULL) 4142 mddev->to_remove = &md_redundancy_group; 4143 } 4144 4145 put_pers(oldpers); 4146 4147 rdev_for_each(rdev, mddev) { 4148 if (rdev->raid_disk < 0) 4149 continue; 4150 if (rdev->new_raid_disk >= mddev->raid_disks) 4151 rdev->new_raid_disk = -1; 4152 if (rdev->new_raid_disk == rdev->raid_disk) 4153 continue; 4154 sysfs_unlink_rdev(mddev, rdev); 4155 } 4156 rdev_for_each(rdev, mddev) { 4157 if (rdev->raid_disk < 0) 4158 continue; 4159 if (rdev->new_raid_disk == rdev->raid_disk) 4160 continue; 4161 rdev->raid_disk = rdev->new_raid_disk; 4162 if (rdev->raid_disk < 0) 4163 clear_bit(In_sync, &rdev->flags); 4164 else { 4165 if (sysfs_link_rdev(mddev, rdev)) 4166 pr_warn("md: cannot register rd%d for %s after level change\n", 4167 rdev->raid_disk, mdname(mddev)); 4168 } 4169 } 4170 4171 if (pers->sync_request == NULL) { 4172 /* this is now an array without redundancy, so 4173 * it must always be in_sync 4174 */ 4175 mddev->in_sync = 1; 4176 timer_delete_sync(&mddev->safemode_timer); 4177 } 4178 pers->run(mddev); 4179 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 4180 if (!mddev->thread) 4181 md_update_sb(mddev, 1); 4182 sysfs_notify_dirent_safe(mddev->sysfs_level); 4183 md_new_event(); 4184 rv = len; 4185 out_unlock: 4186 mddev_unlock_and_resume(mddev); 4187 return rv; 4188 } 4189 4190 static struct md_sysfs_entry md_level = 4191 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 4192 4193 static ssize_t 4194 new_level_show(struct mddev *mddev, char *page) 4195 { 4196 return sprintf(page, "%d\n", mddev->new_level); 4197 } 4198 4199 static ssize_t 4200 new_level_store(struct mddev *mddev, const char *buf, size_t len) 4201 { 4202 unsigned int n; 4203 int err; 4204 4205 err = kstrtouint(buf, 10, &n); 4206 if (err < 0) 4207 return err; 4208 err = mddev_lock(mddev); 4209 if (err) 4210 return err; 4211 4212 mddev->new_level = n; 4213 md_update_sb(mddev, 1); 4214 4215 mddev_unlock(mddev); 4216 return len; 4217 } 4218 static struct md_sysfs_entry md_new_level = 4219 __ATTR(new_level, 0664, new_level_show, new_level_store); 4220 4221 static ssize_t 4222 bitmap_type_show(struct mddev *mddev, char *page) 4223 { 4224 struct md_submodule_head *head; 4225 unsigned long i; 4226 ssize_t len = 0; 4227 4228 if (mddev->bitmap_id == ID_BITMAP_NONE) 4229 len += sprintf(page + len, "[none] "); 4230 else 4231 len += sprintf(page + len, "none "); 4232 4233 xa_lock(&md_submodule); 4234 xa_for_each(&md_submodule, i, head) { 4235 if (head->type != MD_BITMAP) 4236 continue; 4237 4238 if (mddev->bitmap_id == head->id) 4239 len += sprintf(page + len, "[%s] ", head->name); 4240 else 4241 len += sprintf(page + len, "%s ", head->name); 4242 } 4243 xa_unlock(&md_submodule); 4244 4245 len += sprintf(page + 
len, "\n"); 4246 return len; 4247 } 4248 4249 static ssize_t 4250 bitmap_type_store(struct mddev *mddev, const char *buf, size_t len) 4251 { 4252 struct md_submodule_head *head; 4253 enum md_submodule_id id; 4254 unsigned long i; 4255 int err = 0; 4256 4257 xa_lock(&md_submodule); 4258 4259 if (mddev->bitmap_ops) { 4260 err = -EBUSY; 4261 goto out; 4262 } 4263 4264 if (cmd_match(buf, "none")) { 4265 mddev->bitmap_id = ID_BITMAP_NONE; 4266 goto out; 4267 } 4268 4269 xa_for_each(&md_submodule, i, head) { 4270 if (head->type == MD_BITMAP && cmd_match(buf, head->name)) { 4271 mddev->bitmap_id = head->id; 4272 goto out; 4273 } 4274 } 4275 4276 err = kstrtoint(buf, 10, &id); 4277 if (err) 4278 goto out; 4279 4280 if (id == ID_BITMAP_NONE) { 4281 mddev->bitmap_id = id; 4282 goto out; 4283 } 4284 4285 head = xa_load(&md_submodule, id); 4286 if (head && head->type == MD_BITMAP) { 4287 mddev->bitmap_id = id; 4288 goto out; 4289 } 4290 4291 err = -ENOENT; 4292 4293 out: 4294 xa_unlock(&md_submodule); 4295 return err ? err : len; 4296 } 4297 4298 static struct md_sysfs_entry md_bitmap_type = 4299 __ATTR(bitmap_type, 0664, bitmap_type_show, bitmap_type_store); 4300 4301 static ssize_t 4302 layout_show(struct mddev *mddev, char *page) 4303 { 4304 /* just a number, not meaningful for all levels */ 4305 if (mddev->reshape_position != MaxSector && 4306 mddev->layout != mddev->new_layout) 4307 return sprintf(page, "%d (%d)\n", 4308 mddev->new_layout, mddev->layout); 4309 return sprintf(page, "%d\n", mddev->layout); 4310 } 4311 4312 static ssize_t 4313 layout_store(struct mddev *mddev, const char *buf, size_t len) 4314 { 4315 unsigned int n; 4316 int err; 4317 4318 err = kstrtouint(buf, 10, &n); 4319 if (err < 0) 4320 return err; 4321 err = mddev_lock(mddev); 4322 if (err) 4323 return err; 4324 4325 if (mddev->pers) { 4326 if (mddev->pers->check_reshape == NULL) 4327 err = -EBUSY; 4328 else if (!md_is_rdwr(mddev)) 4329 err = -EROFS; 4330 else { 4331 mddev->new_layout = n; 4332 err = mddev->pers->check_reshape(mddev); 4333 if (err) 4334 mddev->new_layout = mddev->layout; 4335 } 4336 } else { 4337 mddev->new_layout = n; 4338 if (mddev->reshape_position == MaxSector) 4339 mddev->layout = n; 4340 } 4341 mddev_unlock(mddev); 4342 return err ?: len; 4343 } 4344 static struct md_sysfs_entry md_layout = 4345 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 4346 4347 static ssize_t 4348 raid_disks_show(struct mddev *mddev, char *page) 4349 { 4350 if (mddev->raid_disks == 0) 4351 return 0; 4352 if (mddev->reshape_position != MaxSector && 4353 mddev->delta_disks != 0) 4354 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 4355 mddev->raid_disks - mddev->delta_disks); 4356 return sprintf(page, "%d\n", mddev->raid_disks); 4357 } 4358 4359 static int update_raid_disks(struct mddev *mddev, int raid_disks); 4360 4361 static ssize_t 4362 raid_disks_store(struct mddev *mddev, const char *buf, size_t len) 4363 { 4364 unsigned int n; 4365 int err; 4366 4367 err = kstrtouint(buf, 10, &n); 4368 if (err < 0) 4369 return err; 4370 4371 err = mddev_lock(mddev); 4372 if (err) 4373 return err; 4374 if (mddev->pers) 4375 err = update_raid_disks(mddev, n); 4376 else if (mddev->reshape_position != MaxSector) { 4377 struct md_rdev *rdev; 4378 int olddisks = mddev->raid_disks - mddev->delta_disks; 4379 4380 err = -EINVAL; 4381 rdev_for_each(rdev, mddev) { 4382 if (olddisks < n && 4383 rdev->data_offset < rdev->new_data_offset) 4384 goto out_unlock; 4385 if (olddisks > n && 4386 rdev->data_offset > rdev->new_data_offset) 4387 
goto out_unlock; 4388 } 4389 err = 0; 4390 mddev->delta_disks = n - olddisks; 4391 mddev->raid_disks = n; 4392 mddev->reshape_backwards = (mddev->delta_disks < 0); 4393 } else 4394 mddev->raid_disks = n; 4395 out_unlock: 4396 mddev_unlock(mddev); 4397 return err ? err : len; 4398 } 4399 static struct md_sysfs_entry md_raid_disks = 4400 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 4401 4402 static ssize_t 4403 uuid_show(struct mddev *mddev, char *page) 4404 { 4405 return sprintf(page, "%pU\n", mddev->uuid); 4406 } 4407 static struct md_sysfs_entry md_uuid = 4408 __ATTR(uuid, S_IRUGO, uuid_show, NULL); 4409 4410 static ssize_t 4411 chunk_size_show(struct mddev *mddev, char *page) 4412 { 4413 if (mddev->reshape_position != MaxSector && 4414 mddev->chunk_sectors != mddev->new_chunk_sectors) 4415 return sprintf(page, "%d (%d)\n", 4416 mddev->new_chunk_sectors << 9, 4417 mddev->chunk_sectors << 9); 4418 return sprintf(page, "%d\n", mddev->chunk_sectors << 9); 4419 } 4420 4421 static ssize_t 4422 chunk_size_store(struct mddev *mddev, const char *buf, size_t len) 4423 { 4424 unsigned long n; 4425 int err; 4426 4427 err = kstrtoul(buf, 10, &n); 4428 if (err < 0) 4429 return err; 4430 4431 err = mddev_lock(mddev); 4432 if (err) 4433 return err; 4434 if (mddev->pers) { 4435 if (mddev->pers->check_reshape == NULL) 4436 err = -EBUSY; 4437 else if (!md_is_rdwr(mddev)) 4438 err = -EROFS; 4439 else { 4440 mddev->new_chunk_sectors = n >> 9; 4441 err = mddev->pers->check_reshape(mddev); 4442 if (err) 4443 mddev->new_chunk_sectors = mddev->chunk_sectors; 4444 } 4445 } else { 4446 mddev->new_chunk_sectors = n >> 9; 4447 if (mddev->reshape_position == MaxSector) 4448 mddev->chunk_sectors = n >> 9; 4449 } 4450 mddev_unlock(mddev); 4451 return err ?: len; 4452 } 4453 static struct md_sysfs_entry md_chunk_size = 4454 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 4455 4456 static ssize_t 4457 resync_start_show(struct mddev *mddev, char *page) 4458 { 4459 if (mddev->resync_offset == MaxSector) 4460 return sprintf(page, "none\n"); 4461 return sprintf(page, "%llu\n", (unsigned long long)mddev->resync_offset); 4462 } 4463 4464 static ssize_t 4465 resync_start_store(struct mddev *mddev, const char *buf, size_t len) 4466 { 4467 unsigned long long n; 4468 int err; 4469 4470 if (cmd_match(buf, "none")) 4471 n = MaxSector; 4472 else { 4473 err = kstrtoull(buf, 10, &n); 4474 if (err < 0) 4475 return err; 4476 if (n != (sector_t)n) 4477 return -EINVAL; 4478 } 4479 4480 err = mddev_lock(mddev); 4481 if (err) 4482 return err; 4483 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 4484 err = -EBUSY; 4485 4486 if (!err) { 4487 mddev->resync_offset = n; 4488 if (mddev->pers) 4489 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 4490 } 4491 mddev_unlock(mddev); 4492 return err ?: len; 4493 } 4494 static struct md_sysfs_entry md_resync_start = 4495 __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, 4496 resync_start_show, resync_start_store); 4497 4498 /* 4499 * The array state can be: 4500 * 4501 * clear 4502 * No devices, no size, no level 4503 * Equivalent to STOP_ARRAY ioctl 4504 * inactive 4505 * May have some settings, but array is not active 4506 * all IO results in error 4507 * When written, doesn't tear down array, but just stops it 4508 * suspended (not supported yet) 4509 * All IO requests will block. The array can be reconfigured. 4510 * Writing this, if accepted, will block until array is quiescent 4511 * readonly 4512 * no resync can happen. 
no superblocks get written. 4513 * write requests fail 4514 * read-auto 4515 * like readonly, but behaves like 'clean' on a write request. 4516 * 4517 * clean - no pending writes, but otherwise active. 4518 * When written to inactive array, starts without resync 4519 * If a write request arrives then 4520 * if metadata is known, mark 'dirty' and switch to 'active'. 4521 * if not known, block and switch to write-pending 4522 * If written to an active array that has pending writes, then fails. 4523 * active 4524 * fully active: IO and resync can be happening. 4525 * When written to inactive array, starts with resync 4526 * 4527 * write-pending 4528 * clean, but writes are blocked waiting for 'active' to be written. 4529 * 4530 * active-idle 4531 * like active, but no writes have been seen for a while (100msec). 4532 * 4533 * broken 4534 * Array is failed. It's useful because mounted-arrays aren't stopped 4535 * when array is failed, so this state will at least alert the user that 4536 * something is wrong. 4537 */ 4538 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 4539 write_pending, active_idle, broken, bad_word}; 4540 static char *array_states[] = { 4541 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 4542 "write-pending", "active-idle", "broken", NULL }; 4543 4544 static int match_word(const char *word, char **list) 4545 { 4546 int n; 4547 for (n=0; list[n]; n++) 4548 if (cmd_match(word, list[n])) 4549 break; 4550 return n; 4551 } 4552 4553 static ssize_t 4554 array_state_show(struct mddev *mddev, char *page) 4555 { 4556 enum array_state st = inactive; 4557 4558 if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) { 4559 switch(mddev->ro) { 4560 case MD_RDONLY: 4561 st = readonly; 4562 break; 4563 case MD_AUTO_READ: 4564 st = read_auto; 4565 break; 4566 case MD_RDWR: 4567 spin_lock(&mddev->lock); 4568 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 4569 st = write_pending; 4570 else if (mddev->in_sync) 4571 st = clean; 4572 else if (mddev->safemode) 4573 st = active_idle; 4574 else 4575 st = active; 4576 spin_unlock(&mddev->lock); 4577 } 4578 4579 if (test_bit(MD_BROKEN, &mddev->flags) && st == clean) 4580 st = broken; 4581 } else { 4582 if (list_empty(&mddev->disks) && 4583 mddev->raid_disks == 0 && 4584 mddev->dev_sectors == 0) 4585 st = clear; 4586 else 4587 st = inactive; 4588 } 4589 return sprintf(page, "%s\n", array_states[st]); 4590 } 4591 4592 static int do_md_stop(struct mddev *mddev, int ro); 4593 static int md_set_readonly(struct mddev *mddev); 4594 static int restart_array(struct mddev *mddev); 4595 4596 static ssize_t 4597 array_state_store(struct mddev *mddev, const char *buf, size_t len) 4598 { 4599 int err = 0; 4600 enum array_state st = match_word(buf, array_states); 4601 4602 /* No lock dependent actions */ 4603 switch (st) { 4604 case suspended: /* not supported yet */ 4605 case write_pending: /* cannot be set */ 4606 case active_idle: /* cannot be set */ 4607 case broken: /* cannot be set */ 4608 case bad_word: 4609 return -EINVAL; 4610 case clear: 4611 case readonly: 4612 case inactive: 4613 case read_auto: 4614 if (!mddev->pers || !md_is_rdwr(mddev)) 4615 break; 4616 /* write sysfs will not open mddev and opener should be 0 */ 4617 err = mddev_set_closing_and_sync_blockdev(mddev, 0); 4618 if (err) 4619 return err; 4620 break; 4621 default: 4622 break; 4623 } 4624 4625 if (mddev->pers && (st == active || st == clean) && 4626 mddev->ro != MD_RDONLY) { 4627 /* don't take reconfig_mutex when 
toggling between 4628 * clean and active 4629 */ 4630 spin_lock(&mddev->lock); 4631 if (st == active) { 4632 restart_array(mddev); 4633 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4634 md_wakeup_thread(mddev->thread); 4635 wake_up(&mddev->sb_wait); 4636 } else /* st == clean */ { 4637 restart_array(mddev); 4638 if (!set_in_sync(mddev)) 4639 err = -EBUSY; 4640 } 4641 if (!err) 4642 sysfs_notify_dirent_safe(mddev->sysfs_state); 4643 spin_unlock(&mddev->lock); 4644 return err ?: len; 4645 } 4646 err = mddev_lock(mddev); 4647 if (err) 4648 return err; 4649 4650 switch (st) { 4651 case inactive: 4652 /* stop an active array, return 0 otherwise */ 4653 if (mddev->pers) 4654 err = do_md_stop(mddev, 2); 4655 break; 4656 case clear: 4657 err = do_md_stop(mddev, 0); 4658 break; 4659 case readonly: 4660 if (mddev->pers) 4661 err = md_set_readonly(mddev); 4662 else { 4663 mddev->ro = MD_RDONLY; 4664 set_disk_ro(mddev->gendisk, 1); 4665 err = do_md_run(mddev); 4666 } 4667 break; 4668 case read_auto: 4669 if (mddev->pers) { 4670 if (md_is_rdwr(mddev)) 4671 err = md_set_readonly(mddev); 4672 else if (mddev->ro == MD_RDONLY) 4673 err = restart_array(mddev); 4674 if (err == 0) { 4675 mddev->ro = MD_AUTO_READ; 4676 set_disk_ro(mddev->gendisk, 0); 4677 } 4678 } else { 4679 mddev->ro = MD_AUTO_READ; 4680 err = do_md_run(mddev); 4681 } 4682 break; 4683 case clean: 4684 if (mddev->pers) { 4685 err = restart_array(mddev); 4686 if (err) 4687 break; 4688 spin_lock(&mddev->lock); 4689 if (!set_in_sync(mddev)) 4690 err = -EBUSY; 4691 spin_unlock(&mddev->lock); 4692 } else 4693 err = -EINVAL; 4694 break; 4695 case active: 4696 if (mddev->pers) { 4697 err = restart_array(mddev); 4698 if (err) 4699 break; 4700 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4701 wake_up(&mddev->sb_wait); 4702 err = 0; 4703 } else { 4704 mddev->ro = MD_RDWR; 4705 set_disk_ro(mddev->gendisk, 0); 4706 err = do_md_run(mddev); 4707 } 4708 break; 4709 default: 4710 err = -EINVAL; 4711 break; 4712 } 4713 4714 if (!err) { 4715 if (mddev->hold_active == UNTIL_IOCTL) 4716 mddev->hold_active = 0; 4717 sysfs_notify_dirent_safe(mddev->sysfs_state); 4718 } 4719 mddev_unlock(mddev); 4720 4721 if (st == readonly || st == read_auto || st == inactive || 4722 (err && st == clear)) 4723 clear_bit(MD_CLOSING, &mddev->flags); 4724 4725 return err ?: len; 4726 } 4727 static struct md_sysfs_entry md_array_state = 4728 __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 4729 4730 static ssize_t 4731 max_corrected_read_errors_show(struct mddev *mddev, char *page) { 4732 return sprintf(page, "%d\n", 4733 atomic_read(&mddev->max_corr_read_errors)); 4734 } 4735 4736 static ssize_t 4737 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) 4738 { 4739 unsigned int n; 4740 int rv; 4741 4742 rv = kstrtouint(buf, 10, &n); 4743 if (rv < 0) 4744 return rv; 4745 if (n > INT_MAX) 4746 return -EINVAL; 4747 atomic_set(&mddev->max_corr_read_errors, n); 4748 return len; 4749 } 4750 4751 static struct md_sysfs_entry max_corr_read_errors = 4752 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, 4753 max_corrected_read_errors_store); 4754 4755 static ssize_t 4756 null_show(struct mddev *mddev, char *page) 4757 { 4758 return -EINVAL; 4759 } 4760 4761 static ssize_t 4762 new_dev_store(struct mddev *mddev, const char *buf, size_t len) 4763 { 4764 /* buf must be %d:%d\n? giving major and minor numbers */ 4765 /* The new device is added to the array. 
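 * For example, writing "8:16" (major 8, minor 16, conventionally /dev/sdb)
 * binds that block device to this array; the numbers are illustrative.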
4766 * If the array has a persistent superblock, we read the 4767 * superblock to initialise info and check validity. 4768 * Otherwise, only checking done is that in bind_rdev_to_array, 4769 * which mainly checks size. 4770 */ 4771 char *e; 4772 int major = simple_strtoul(buf, &e, 10); 4773 int minor; 4774 dev_t dev; 4775 struct md_rdev *rdev; 4776 int err; 4777 4778 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 4779 return -EINVAL; 4780 minor = simple_strtoul(e+1, &e, 10); 4781 if (*e && *e != '\n') 4782 return -EINVAL; 4783 dev = MKDEV(major, minor); 4784 if (major != MAJOR(dev) || 4785 minor != MINOR(dev)) 4786 return -EOVERFLOW; 4787 4788 err = mddev_suspend_and_lock(mddev); 4789 if (err) 4790 return err; 4791 if (mddev->persistent) { 4792 rdev = md_import_device(dev, mddev->major_version, 4793 mddev->minor_version); 4794 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 4795 struct md_rdev *rdev0 4796 = list_entry(mddev->disks.next, 4797 struct md_rdev, same_set); 4798 err = super_types[mddev->major_version] 4799 .load_super(rdev, rdev0, mddev->minor_version); 4800 if (err < 0) 4801 goto out; 4802 } 4803 } else if (mddev->external) 4804 rdev = md_import_device(dev, -2, -1); 4805 else 4806 rdev = md_import_device(dev, -1, -1); 4807 4808 if (IS_ERR(rdev)) { 4809 mddev_unlock_and_resume(mddev); 4810 return PTR_ERR(rdev); 4811 } 4812 err = bind_rdev_to_array(rdev, mddev); 4813 out: 4814 if (err) 4815 export_rdev(rdev, mddev); 4816 mddev_unlock_and_resume(mddev); 4817 if (!err) 4818 md_new_event(); 4819 return err ? err : len; 4820 } 4821 4822 static struct md_sysfs_entry md_new_device = 4823 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 4824 4825 static ssize_t 4826 bitmap_store(struct mddev *mddev, const char *buf, size_t len) 4827 { 4828 char *end; 4829 unsigned long chunk, end_chunk; 4830 int err; 4831 4832 if (!md_bitmap_enabled(mddev, false)) 4833 return len; 4834 4835 err = mddev_lock(mddev); 4836 if (err) 4837 return err; 4838 if (!mddev->bitmap) 4839 goto out; 4840 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ 4841 while (*buf) { 4842 chunk = end_chunk = simple_strtoul(buf, &end, 0); 4843 if (buf == end) 4844 break; 4845 4846 if (*end == '-') { /* range */ 4847 buf = end + 1; 4848 end_chunk = simple_strtoul(buf, &end, 0); 4849 if (buf == end) 4850 break; 4851 } 4852 4853 if (*end && !isspace(*end)) 4854 break; 4855 4856 mddev->bitmap_ops->dirty_bits(mddev, chunk, end_chunk); 4857 buf = skip_spaces(end); 4858 } 4859 mddev->bitmap_ops->unplug(mddev, true); /* flush the bits to disk */ 4860 out: 4861 mddev_unlock(mddev); 4862 return len; 4863 } 4864 4865 static struct md_sysfs_entry md_bitmap = 4866 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 4867 4868 static ssize_t 4869 size_show(struct mddev *mddev, char *page) 4870 { 4871 return sprintf(page, "%llu\n", 4872 (unsigned long long)mddev->dev_sectors / 2); 4873 } 4874 4875 static int update_size(struct mddev *mddev, sector_t num_sectors); 4876 4877 static ssize_t 4878 size_store(struct mddev *mddev, const char *buf, size_t len) 4879 { 4880 /* If array is inactive, we can reduce the component size, but 4881 * not increase it (except from 0). 
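 * (The value is in KiB: size_show() above reports dev_sectors / 2, and the
 * string written here is converted back via strict_blocks_to_sectors().)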
4882 * If array is active, we can try an on-line resize 4883 */ 4884 sector_t sectors; 4885 int err = strict_blocks_to_sectors(buf, &sectors); 4886 4887 if (err < 0) 4888 return err; 4889 err = mddev_lock(mddev); 4890 if (err) 4891 return err; 4892 if (mddev->pers) { 4893 err = update_size(mddev, sectors); 4894 if (err == 0) 4895 md_update_sb(mddev, 1); 4896 } else { 4897 if (mddev->dev_sectors == 0 || 4898 mddev->dev_sectors > sectors) 4899 mddev->dev_sectors = sectors; 4900 else 4901 err = -ENOSPC; 4902 } 4903 mddev_unlock(mddev); 4904 return err ? err : len; 4905 } 4906 4907 static struct md_sysfs_entry md_size = 4908 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 4909 4910 /* Metadata version. 4911 * This is one of 4912 * 'none' for arrays with no metadata (good luck...) 4913 * 'external' for arrays with externally managed metadata, 4914 * or N.M for internally known formats 4915 */ 4916 static ssize_t 4917 metadata_show(struct mddev *mddev, char *page) 4918 { 4919 if (mddev->persistent) 4920 return sprintf(page, "%d.%d\n", 4921 mddev->major_version, mddev->minor_version); 4922 else if (mddev->external) 4923 return sprintf(page, "external:%s\n", mddev->metadata_type); 4924 else 4925 return sprintf(page, "none\n"); 4926 } 4927 4928 static ssize_t 4929 metadata_store(struct mddev *mddev, const char *buf, size_t len) 4930 { 4931 int major, minor; 4932 char *e; 4933 int err; 4934 /* Changing the details of 'external' metadata is 4935 * always permitted. Otherwise there must be 4936 * no devices attached to the array. 4937 */ 4938 4939 err = mddev_lock(mddev); 4940 if (err) 4941 return err; 4942 err = -EBUSY; 4943 if (mddev->external && strncmp(buf, "external:", 9) == 0) 4944 ; 4945 else if (!list_empty(&mddev->disks)) 4946 goto out_unlock; 4947 4948 err = 0; 4949 if (cmd_match(buf, "none")) { 4950 mddev->persistent = 0; 4951 mddev->external = 0; 4952 mddev->major_version = 0; 4953 mddev->minor_version = 90; 4954 goto out_unlock; 4955 } 4956 if (strncmp(buf, "external:", 9) == 0) { 4957 size_t namelen = len-9; 4958 if (namelen >= sizeof(mddev->metadata_type)) 4959 namelen = sizeof(mddev->metadata_type)-1; 4960 memcpy(mddev->metadata_type, buf+9, namelen); 4961 mddev->metadata_type[namelen] = 0; 4962 if (namelen && mddev->metadata_type[namelen-1] == '\n') 4963 mddev->metadata_type[--namelen] = 0; 4964 mddev->persistent = 0; 4965 mddev->external = 1; 4966 mddev->major_version = 0; 4967 mddev->minor_version = 90; 4968 goto out_unlock; 4969 } 4970 major = simple_strtoul(buf, &e, 10); 4971 err = -EINVAL; 4972 if (e==buf || *e != '.') 4973 goto out_unlock; 4974 buf = e+1; 4975 minor = simple_strtoul(buf, &e, 10); 4976 if (e==buf || (*e && *e != '\n') ) 4977 goto out_unlock; 4978 err = -ENOENT; 4979 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 4980 goto out_unlock; 4981 mddev->major_version = major; 4982 mddev->minor_version = minor; 4983 mddev->persistent = 1; 4984 mddev->external = 0; 4985 err = 0; 4986 out_unlock: 4987 mddev_unlock(mddev); 4988 return err ?: len; 4989 } 4990 4991 static struct md_sysfs_entry md_metadata = 4992 __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 4993 4994 static bool rdev_needs_recovery(struct md_rdev *rdev, sector_t sectors) 4995 { 4996 return rdev->raid_disk >= 0 && 4997 !test_bit(Journal, &rdev->flags) && 4998 !test_bit(Faulty, &rdev->flags) && 4999 !test_bit(In_sync, &rdev->flags) && 5000 rdev->recovery_offset < sectors; 5001 } 5002 5003 static enum sync_action
md_get_active_sync_action(struct mddev *mddev) 5004 { 5005 struct md_rdev *rdev; 5006 bool is_recover = false; 5007 5008 if (mddev->resync_offset < MaxSector) 5009 return ACTION_RESYNC; 5010 5011 if (mddev->reshape_position != MaxSector) 5012 return ACTION_RESHAPE; 5013 5014 rcu_read_lock(); 5015 rdev_for_each_rcu(rdev, mddev) { 5016 if (rdev_needs_recovery(rdev, MaxSector)) { 5017 is_recover = true; 5018 break; 5019 } 5020 } 5021 rcu_read_unlock(); 5022 5023 return is_recover ? ACTION_RECOVER : ACTION_IDLE; 5024 } 5025 5026 enum sync_action md_sync_action(struct mddev *mddev) 5027 { 5028 unsigned long recovery = mddev->recovery; 5029 enum sync_action active_action; 5030 5031 /* 5032 * frozen has the highest priority, means running sync_thread will be 5033 * stopped immediately, and no new sync_thread can start. 5034 */ 5035 if (test_bit(MD_RECOVERY_FROZEN, &recovery)) 5036 return ACTION_FROZEN; 5037 5038 /* 5039 * read-only array can't register sync_thread, and it can only 5040 * add/remove spares. 5041 */ 5042 if (!md_is_rdwr(mddev)) 5043 return ACTION_IDLE; 5044 5045 /* 5046 * idle means no sync_thread is running, and no new sync_thread is 5047 * requested. 5048 */ 5049 if (!test_bit(MD_RECOVERY_RUNNING, &recovery) && 5050 !test_bit(MD_RECOVERY_NEEDED, &recovery)) 5051 return ACTION_IDLE; 5052 5053 /* 5054 * Check if any sync operation (resync/recover/reshape) is 5055 * currently active. This ensures that only one sync operation 5056 * can run at a time. Returns the type of active operation, or 5057 * ACTION_IDLE if none are active. 5058 */ 5059 active_action = md_get_active_sync_action(mddev); 5060 if (active_action != ACTION_IDLE) 5061 return active_action; 5062 5063 if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) 5064 return ACTION_RESHAPE; 5065 5066 if (test_bit(MD_RECOVERY_RECOVER, &recovery)) 5067 return ACTION_RECOVER; 5068 5069 if (test_bit(MD_RECOVERY_SYNC, &recovery)) { 5070 /* 5071 * MD_RECOVERY_CHECK must be paired with 5072 * MD_RECOVERY_REQUESTED. 5073 */ 5074 if (test_bit(MD_RECOVERY_CHECK, &recovery)) 5075 return ACTION_CHECK; 5076 if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) 5077 return ACTION_REPAIR; 5078 return ACTION_RESYNC; 5079 } 5080 5081 /* 5082 * MD_RECOVERY_NEEDED or MD_RECOVERY_RUNNING is set, however, no 5083 * sync_action is specified. 5084 */ 5085 return ACTION_IDLE; 5086 } 5087 5088 enum sync_action md_sync_action_by_name(const char *page) 5089 { 5090 enum sync_action action; 5091 5092 for (action = 0; action < NR_SYNC_ACTIONS; ++action) { 5093 if (cmd_match(page, action_name[action])) 5094 return action; 5095 } 5096 5097 return NR_SYNC_ACTIONS; 5098 } 5099 5100 const char *md_sync_action_name(enum sync_action action) 5101 { 5102 return action_name[action]; 5103 } 5104 5105 static ssize_t 5106 action_show(struct mddev *mddev, char *page) 5107 { 5108 enum sync_action action = md_sync_action(mddev); 5109 5110 return sprintf(page, "%s\n", md_sync_action_name(action)); 5111 } 5112 5113 /** 5114 * stop_sync_thread() - wait for sync_thread to stop if it's running. 5115 * @mddev: the array. 5116 * @locked: if set, reconfig_mutex will still be held after this function 5117 * return; if not set, reconfig_mutex will be released after this 5118 * function return. 
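 *
 * The caller must hold reconfig_mutex on entry. A minimal usage sketch,
 * mirroring md_frozen_sync_thread() below:
 *
 *	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 *	stop_sync_thread(mddev, true);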
5119 */ 5120 static void stop_sync_thread(struct mddev *mddev, bool locked) 5121 { 5122 int sync_seq = atomic_read(&mddev->sync_seq); 5123 5124 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 5125 if (!locked) 5126 mddev_unlock(mddev); 5127 return; 5128 } 5129 5130 mddev_unlock(mddev); 5131 5132 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5133 /* 5134 * Thread might be blocked waiting for metadata update which will now 5135 * never happen 5136 */ 5137 md_wakeup_thread_directly(mddev->sync_thread); 5138 if (work_pending(&mddev->sync_work)) 5139 flush_work(&mddev->sync_work); 5140 5141 wait_event(resync_wait, 5142 !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 5143 (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery) && 5144 sync_seq != atomic_read(&mddev->sync_seq))); 5145 5146 if (locked) 5147 mddev_lock_nointr(mddev); 5148 } 5149 5150 void md_idle_sync_thread(struct mddev *mddev) 5151 { 5152 lockdep_assert_held(&mddev->reconfig_mutex); 5153 5154 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5155 stop_sync_thread(mddev, true); 5156 } 5157 EXPORT_SYMBOL_GPL(md_idle_sync_thread); 5158 5159 void md_frozen_sync_thread(struct mddev *mddev) 5160 { 5161 lockdep_assert_held(&mddev->reconfig_mutex); 5162 5163 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5164 stop_sync_thread(mddev, true); 5165 } 5166 EXPORT_SYMBOL_GPL(md_frozen_sync_thread); 5167 5168 void md_unfrozen_sync_thread(struct mddev *mddev) 5169 { 5170 lockdep_assert_held(&mddev->reconfig_mutex); 5171 5172 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5173 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5174 md_wakeup_thread(mddev->thread); 5175 sysfs_notify_dirent_safe(mddev->sysfs_action); 5176 } 5177 EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread); 5178 5179 static int mddev_start_reshape(struct mddev *mddev) 5180 { 5181 int ret; 5182 5183 if (mddev->pers->start_reshape == NULL) 5184 return -EINVAL; 5185 5186 if (mddev->reshape_position == MaxSector || 5187 mddev->pers->check_reshape == NULL || 5188 mddev->pers->check_reshape(mddev)) { 5189 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5190 ret = mddev->pers->start_reshape(mddev); 5191 if (ret) 5192 return ret; 5193 } else { 5194 /* 5195 * If reshape is still in progress, and md_check_recovery() can 5196 * continue to reshape, don't restart reshape because data can 5197 * be corrupted for raid456. 5198 */ 5199 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5200 } 5201 5202 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 5203 return 0; 5204 } 5205 5206 static ssize_t 5207 action_store(struct mddev *mddev, const char *page, size_t len) 5208 { 5209 int ret; 5210 enum sync_action action; 5211 5212 if (!mddev->pers || !mddev->pers->sync_request) 5213 return -EINVAL; 5214 5215 retry: 5216 if (work_busy(&mddev->sync_work)) 5217 flush_work(&mddev->sync_work); 5218 5219 ret = mddev_lock(mddev); 5220 if (ret) 5221 return ret; 5222 5223 if (work_busy(&mddev->sync_work)) { 5224 mddev_unlock(mddev); 5225 goto retry; 5226 } 5227 5228 action = md_sync_action_by_name(page); 5229 5230 /* TODO: mdadm rely on "idle" to start sync_thread. 
*/ 5231 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 5232 switch (action) { 5233 case ACTION_FROZEN: 5234 md_frozen_sync_thread(mddev); 5235 ret = len; 5236 goto out; 5237 case ACTION_IDLE: 5238 md_idle_sync_thread(mddev); 5239 break; 5240 case ACTION_RESHAPE: 5241 case ACTION_RECOVER: 5242 case ACTION_CHECK: 5243 case ACTION_REPAIR: 5244 case ACTION_RESYNC: 5245 ret = -EBUSY; 5246 goto out; 5247 default: 5248 ret = -EINVAL; 5249 goto out; 5250 } 5251 } else { 5252 switch (action) { 5253 case ACTION_FROZEN: 5254 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5255 ret = len; 5256 goto out; 5257 case ACTION_RESHAPE: 5258 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5259 ret = mddev_start_reshape(mddev); 5260 if (ret) 5261 goto out; 5262 break; 5263 case ACTION_RECOVER: 5264 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5265 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 5266 break; 5267 case ACTION_CHECK: 5268 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5269 fallthrough; 5270 case ACTION_REPAIR: 5271 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 5272 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5273 fallthrough; 5274 case ACTION_RESYNC: 5275 case ACTION_IDLE: 5276 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5277 break; 5278 default: 5279 ret = -EINVAL; 5280 goto out; 5281 } 5282 } 5283 5284 if (mddev->ro == MD_AUTO_READ) { 5285 /* A write to sync_action is enough to justify 5286 * canceling read-auto mode 5287 */ 5288 mddev->ro = MD_RDWR; 5289 md_wakeup_thread(mddev->sync_thread); 5290 } 5291 5292 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5293 md_wakeup_thread(mddev->thread); 5294 sysfs_notify_dirent_safe(mddev->sysfs_action); 5295 ret = len; 5296 5297 out: 5298 mddev_unlock(mddev); 5299 return ret; 5300 } 5301 5302 static struct md_sysfs_entry md_scan_mode = 5303 __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 5304 5305 static ssize_t 5306 last_sync_action_show(struct mddev *mddev, char *page) 5307 { 5308 return sprintf(page, "%s\n", 5309 md_sync_action_name(mddev->last_sync_action)); 5310 } 5311 5312 static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); 5313 5314 static ssize_t 5315 mismatch_cnt_show(struct mddev *mddev, char *page) 5316 { 5317 return sprintf(page, "%llu\n", 5318 (unsigned long long) 5319 atomic64_read(&mddev->resync_mismatches)); 5320 } 5321 5322 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 5323 5324 static ssize_t 5325 sync_min_show(struct mddev *mddev, char *page) 5326 { 5327 return sprintf(page, "%d (%s)\n", speed_min(mddev), 5328 mddev->sync_speed_min ? "local" : "system"); 5329 } 5330 5331 static ssize_t 5332 sync_min_store(struct mddev *mddev, const char *buf, size_t len) 5333 { 5334 unsigned int min; 5335 int rv; 5336 5337 if (strncmp(buf, "system", 6) == 0) { 5338 min = 0; 5339 } else { 5340 rv = kstrtouint(buf, 10, &min); 5341 if (rv < 0) 5342 return rv; 5343 if (min == 0) 5344 return -EINVAL; 5345 } 5346 mddev->sync_speed_min = min; 5347 return len; 5348 } 5349 5350 static struct md_sysfs_entry md_sync_min = 5351 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 5352 5353 static ssize_t 5354 sync_max_show(struct mddev *mddev, char *page) 5355 { 5356 return sprintf(page, "%d (%s)\n", speed_max(mddev), 5357 mddev->sync_speed_max ? 
"local" : "system"); 5358 } 5359 5360 static ssize_t 5361 sync_max_store(struct mddev *mddev, const char *buf, size_t len) 5362 { 5363 unsigned int max; 5364 int rv; 5365 5366 if (strncmp(buf, "system", 6) == 0) { 5367 max = 0; 5368 } else { 5369 rv = kstrtouint(buf, 10, &max); 5370 if (rv < 0) 5371 return rv; 5372 if (max == 0) 5373 return -EINVAL; 5374 } 5375 mddev->sync_speed_max = max; 5376 return len; 5377 } 5378 5379 static struct md_sysfs_entry md_sync_max = 5380 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 5381 5382 static ssize_t 5383 sync_io_depth_show(struct mddev *mddev, char *page) 5384 { 5385 return sprintf(page, "%d (%s)\n", sync_io_depth(mddev), 5386 mddev->sync_io_depth ? "local" : "system"); 5387 } 5388 5389 static ssize_t 5390 sync_io_depth_store(struct mddev *mddev, const char *buf, size_t len) 5391 { 5392 unsigned int max; 5393 int rv; 5394 5395 if (strncmp(buf, "system", 6) == 0) { 5396 max = 0; 5397 } else { 5398 rv = kstrtouint(buf, 10, &max); 5399 if (rv < 0) 5400 return rv; 5401 if (max == 0) 5402 return -EINVAL; 5403 } 5404 mddev->sync_io_depth = max; 5405 return len; 5406 } 5407 5408 static struct md_sysfs_entry md_sync_io_depth = 5409 __ATTR_RW(sync_io_depth); 5410 5411 static ssize_t 5412 degraded_show(struct mddev *mddev, char *page) 5413 { 5414 return sprintf(page, "%d\n", mddev->degraded); 5415 } 5416 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 5417 5418 static ssize_t 5419 sync_force_parallel_show(struct mddev *mddev, char *page) 5420 { 5421 return sprintf(page, "%d\n", mddev->parallel_resync); 5422 } 5423 5424 static ssize_t 5425 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) 5426 { 5427 long n; 5428 5429 if (kstrtol(buf, 10, &n)) 5430 return -EINVAL; 5431 5432 if (n != 0 && n != 1) 5433 return -EINVAL; 5434 5435 mddev->parallel_resync = n; 5436 5437 if (mddev->sync_thread) 5438 wake_up(&resync_wait); 5439 5440 return len; 5441 } 5442 5443 /* force parallel resync, even with shared block devices */ 5444 static struct md_sysfs_entry md_sync_force_parallel = 5445 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 5446 sync_force_parallel_show, sync_force_parallel_store); 5447 5448 static ssize_t 5449 sync_speed_show(struct mddev *mddev, char *page) 5450 { 5451 unsigned long resync, dt, db; 5452 if (mddev->curr_resync == MD_RESYNC_NONE) 5453 return sprintf(page, "none\n"); 5454 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 5455 dt = (jiffies - mddev->resync_mark) / HZ; 5456 if (!dt) dt++; 5457 db = resync - mddev->resync_mark_cnt; 5458 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 5459 } 5460 5461 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 5462 5463 static ssize_t 5464 sync_completed_show(struct mddev *mddev, char *page) 5465 { 5466 unsigned long long max_sectors, resync; 5467 5468 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5469 return sprintf(page, "none\n"); 5470 5471 if (mddev->curr_resync == MD_RESYNC_YIELDED || 5472 mddev->curr_resync == MD_RESYNC_DELAYED) 5473 return sprintf(page, "delayed\n"); 5474 5475 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 5476 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5477 max_sectors = mddev->resync_max_sectors; 5478 else 5479 max_sectors = mddev->dev_sectors; 5480 5481 resync = mddev->curr_resync_completed; 5482 return sprintf(page, "%llu / %llu\n", resync, max_sectors); 5483 } 5484 5485 static struct md_sysfs_entry md_sync_completed = 5486 __ATTR_PREALLOC(sync_completed, 
S_IRUGO, sync_completed_show, NULL); 5487 5488 static ssize_t 5489 min_sync_show(struct mddev *mddev, char *page) 5490 { 5491 return sprintf(page, "%llu\n", 5492 (unsigned long long)mddev->resync_min); 5493 } 5494 static ssize_t 5495 min_sync_store(struct mddev *mddev, const char *buf, size_t len) 5496 { 5497 unsigned long long min; 5498 int err; 5499 5500 if (kstrtoull(buf, 10, &min)) 5501 return -EINVAL; 5502 5503 spin_lock(&mddev->lock); 5504 err = -EINVAL; 5505 if (min > mddev->resync_max) 5506 goto out_unlock; 5507 5508 err = -EBUSY; 5509 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5510 goto out_unlock; 5511 5512 /* Round down to multiple of 4K for safety */ 5513 mddev->resync_min = round_down(min, 8); 5514 err = 0; 5515 5516 out_unlock: 5517 spin_unlock(&mddev->lock); 5518 return err ?: len; 5519 } 5520 5521 static struct md_sysfs_entry md_min_sync = 5522 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 5523 5524 static ssize_t 5525 max_sync_show(struct mddev *mddev, char *page) 5526 { 5527 if (mddev->resync_max == MaxSector) 5528 return sprintf(page, "max\n"); 5529 else 5530 return sprintf(page, "%llu\n", 5531 (unsigned long long)mddev->resync_max); 5532 } 5533 static ssize_t 5534 max_sync_store(struct mddev *mddev, const char *buf, size_t len) 5535 { 5536 int err; 5537 spin_lock(&mddev->lock); 5538 if (strncmp(buf, "max", 3) == 0) 5539 mddev->resync_max = MaxSector; 5540 else { 5541 unsigned long long max; 5542 int chunk; 5543 5544 err = -EINVAL; 5545 if (kstrtoull(buf, 10, &max)) 5546 goto out_unlock; 5547 if (max < mddev->resync_min) 5548 goto out_unlock; 5549 5550 err = -EBUSY; 5551 if (max < mddev->resync_max && md_is_rdwr(mddev) && 5552 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5553 goto out_unlock; 5554 5555 /* Must be a multiple of chunk_size */ 5556 chunk = mddev->chunk_sectors; 5557 if (chunk) { 5558 sector_t temp = max; 5559 5560 err = -EINVAL; 5561 if (sector_div(temp, chunk)) 5562 goto out_unlock; 5563 } 5564 mddev->resync_max = max; 5565 } 5566 wake_up(&mddev->recovery_wait); 5567 err = 0; 5568 out_unlock: 5569 spin_unlock(&mddev->lock); 5570 return err ?: len; 5571 } 5572 5573 static struct md_sysfs_entry md_max_sync = 5574 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 5575 5576 static ssize_t 5577 suspend_lo_show(struct mddev *mddev, char *page) 5578 { 5579 return sprintf(page, "%llu\n", 5580 (unsigned long long)READ_ONCE(mddev->suspend_lo)); 5581 } 5582 5583 static ssize_t 5584 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) 5585 { 5586 unsigned long long new; 5587 int err; 5588 5589 err = kstrtoull(buf, 10, &new); 5590 if (err < 0) 5591 return err; 5592 if (new != (sector_t)new) 5593 return -EINVAL; 5594 5595 err = mddev_suspend(mddev, true); 5596 if (err) 5597 return err; 5598 5599 WRITE_ONCE(mddev->suspend_lo, new); 5600 mddev_resume(mddev); 5601 5602 return len; 5603 } 5604 static struct md_sysfs_entry md_suspend_lo = 5605 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 5606 5607 static ssize_t 5608 suspend_hi_show(struct mddev *mddev, char *page) 5609 { 5610 return sprintf(page, "%llu\n", 5611 (unsigned long long)READ_ONCE(mddev->suspend_hi)); 5612 } 5613 5614 static ssize_t 5615 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) 5616 { 5617 unsigned long long new; 5618 int err; 5619 5620 err = kstrtoull(buf, 10, &new); 5621 if (err < 0) 5622 return err; 5623 if (new != (sector_t)new) 5624 return -EINVAL; 5625 5626 err = mddev_suspend(mddev, true); 
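	/* If the suspend succeeded, publish the new boundary and resume the array below. */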
5627 if (err) 5628 return err; 5629 5630 WRITE_ONCE(mddev->suspend_hi, new); 5631 mddev_resume(mddev); 5632 5633 return len; 5634 } 5635 static struct md_sysfs_entry md_suspend_hi = 5636 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 5637 5638 static ssize_t 5639 reshape_position_show(struct mddev *mddev, char *page) 5640 { 5641 if (mddev->reshape_position != MaxSector) 5642 return sprintf(page, "%llu\n", 5643 (unsigned long long)mddev->reshape_position); 5644 strcpy(page, "none\n"); 5645 return 5; 5646 } 5647 5648 static ssize_t 5649 reshape_position_store(struct mddev *mddev, const char *buf, size_t len) 5650 { 5651 struct md_rdev *rdev; 5652 unsigned long long new; 5653 int err; 5654 5655 err = kstrtoull(buf, 10, &new); 5656 if (err < 0) 5657 return err; 5658 if (new != (sector_t)new) 5659 return -EINVAL; 5660 err = mddev_lock(mddev); 5661 if (err) 5662 return err; 5663 err = -EBUSY; 5664 if (mddev->pers) 5665 goto unlock; 5666 mddev->reshape_position = new; 5667 mddev->delta_disks = 0; 5668 mddev->reshape_backwards = 0; 5669 mddev->new_level = mddev->level; 5670 mddev->new_layout = mddev->layout; 5671 mddev->new_chunk_sectors = mddev->chunk_sectors; 5672 rdev_for_each(rdev, mddev) 5673 rdev->new_data_offset = rdev->data_offset; 5674 err = 0; 5675 unlock: 5676 mddev_unlock(mddev); 5677 return err ?: len; 5678 } 5679 5680 static struct md_sysfs_entry md_reshape_position = 5681 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 5682 reshape_position_store); 5683 5684 static ssize_t 5685 reshape_direction_show(struct mddev *mddev, char *page) 5686 { 5687 return sprintf(page, "%s\n", 5688 mddev->reshape_backwards ? "backwards" : "forwards"); 5689 } 5690 5691 static ssize_t 5692 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) 5693 { 5694 int backwards = 0; 5695 int err; 5696 5697 if (cmd_match(buf, "forwards")) 5698 backwards = 0; 5699 else if (cmd_match(buf, "backwards")) 5700 backwards = 1; 5701 else 5702 return -EINVAL; 5703 if (mddev->reshape_backwards == backwards) 5704 return len; 5705 5706 err = mddev_lock(mddev); 5707 if (err) 5708 return err; 5709 /* check if we are allowed to change */ 5710 if (mddev->delta_disks) 5711 err = -EBUSY; 5712 else if (mddev->persistent && 5713 mddev->major_version == 0) 5714 err = -EINVAL; 5715 else 5716 mddev->reshape_backwards = backwards; 5717 mddev_unlock(mddev); 5718 return err ?: len; 5719 } 5720 5721 static struct md_sysfs_entry md_reshape_direction = 5722 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, 5723 reshape_direction_store); 5724 5725 static ssize_t 5726 array_size_show(struct mddev *mddev, char *page) 5727 { 5728 if (mddev->external_size) 5729 return sprintf(page, "%llu\n", 5730 (unsigned long long)mddev->array_sectors/2); 5731 else 5732 return sprintf(page, "default\n"); 5733 } 5734 5735 static ssize_t 5736 array_size_store(struct mddev *mddev, const char *buf, size_t len) 5737 { 5738 sector_t sectors; 5739 int err; 5740 5741 err = mddev_lock(mddev); 5742 if (err) 5743 return err; 5744 5745 /* cluster raid doesn't support changing array_sectors */ 5746 if (mddev_is_clustered(mddev)) { 5747 mddev_unlock(mddev); 5748 return -EINVAL; 5749 } 5750 5751 if (strncmp(buf, "default", 7) == 0) { 5752 if (mddev->pers) 5753 sectors = mddev->pers->size(mddev, 0, 0); 5754 else 5755 sectors = mddev->array_sectors; 5756 5757 mddev->external_size = 0; 5758 } else { 5759 if (strict_blocks_to_sectors(buf, &sectors) < 0) 5760 err = -EINVAL; 5761 else if (mddev->pers &&
mddev->pers->size(mddev, 0, 0) < sectors) 5762 err = -E2BIG; 5763 else 5764 mddev->external_size = 1; 5765 } 5766 5767 if (!err) { 5768 mddev->array_sectors = sectors; 5769 if (mddev->pers) 5770 set_capacity_and_notify(mddev->gendisk, 5771 mddev->array_sectors); 5772 } 5773 mddev_unlock(mddev); 5774 return err ?: len; 5775 } 5776 5777 static struct md_sysfs_entry md_array_size = 5778 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, 5779 array_size_store); 5780 5781 static ssize_t 5782 consistency_policy_show(struct mddev *mddev, char *page) 5783 { 5784 int ret; 5785 5786 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 5787 ret = sprintf(page, "journal\n"); 5788 } else if (test_bit(MD_HAS_PPL, &mddev->flags)) { 5789 ret = sprintf(page, "ppl\n"); 5790 } else if (mddev->bitmap) { 5791 ret = sprintf(page, "bitmap\n"); 5792 } else if (mddev->pers) { 5793 if (mddev->pers->sync_request) 5794 ret = sprintf(page, "resync\n"); 5795 else 5796 ret = sprintf(page, "none\n"); 5797 } else { 5798 ret = sprintf(page, "unknown\n"); 5799 } 5800 5801 return ret; 5802 } 5803 5804 static ssize_t 5805 consistency_policy_store(struct mddev *mddev, const char *buf, size_t len) 5806 { 5807 int err = 0; 5808 5809 if (mddev->pers) { 5810 if (mddev->pers->change_consistency_policy) 5811 err = mddev->pers->change_consistency_policy(mddev, buf); 5812 else 5813 err = -EBUSY; 5814 } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) { 5815 set_bit(MD_HAS_PPL, &mddev->flags); 5816 } else { 5817 err = -EINVAL; 5818 } 5819 5820 return err ? err : len; 5821 } 5822 5823 static struct md_sysfs_entry md_consistency_policy = 5824 __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show, 5825 consistency_policy_store); 5826 5827 static ssize_t fail_last_dev_show(struct mddev *mddev, char *page) 5828 { 5829 return sprintf(page, "%d\n", mddev->fail_last_dev); 5830 } 5831 5832 /* 5833 * Setting fail_last_dev to true to allow last device to be forcibly removed 5834 * from RAID1/RAID10. 5835 */ 5836 static ssize_t 5837 fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len) 5838 { 5839 int ret; 5840 bool value; 5841 5842 ret = kstrtobool(buf, &value); 5843 if (ret) 5844 return ret; 5845 5846 if (value != mddev->fail_last_dev) 5847 mddev->fail_last_dev = value; 5848 5849 return len; 5850 } 5851 static struct md_sysfs_entry md_fail_last_dev = 5852 __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, 5853 fail_last_dev_store); 5854 5855 static ssize_t serialize_policy_show(struct mddev *mddev, char *page) 5856 { 5857 if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) 5858 return sprintf(page, "n/a\n"); 5859 else 5860 return sprintf(page, "%d\n", mddev->serialize_policy); 5861 } 5862 5863 /* 5864 * Setting serialize_policy to true to enforce write IO is not reordered 5865 * for raid1. 
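 * (Toggled through the serialize_policy sysfs attribute; the store handler
 * below rejects the change unless the array is running the raid1 personality.)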
5866 */ 5867 static ssize_t 5868 serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) 5869 { 5870 int err; 5871 bool value; 5872 5873 err = kstrtobool(buf, &value); 5874 if (err) 5875 return err; 5876 5877 if (value == mddev->serialize_policy) 5878 return len; 5879 5880 err = mddev_suspend_and_lock(mddev); 5881 if (err) 5882 return err; 5883 if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) { 5884 pr_err("md: serialize_policy is only effective for raid1\n"); 5885 err = -EINVAL; 5886 goto unlock; 5887 } 5888 5889 if (value) 5890 mddev_create_serial_pool(mddev, NULL); 5891 else 5892 mddev_destroy_serial_pool(mddev, NULL); 5893 mddev->serialize_policy = value; 5894 unlock: 5895 mddev_unlock_and_resume(mddev); 5896 return err ?: len; 5897 } 5898 5899 static struct md_sysfs_entry md_serialize_policy = 5900 __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, 5901 serialize_policy_store); 5902 5903 5904 static struct attribute *md_default_attrs[] = { 5905 &md_level.attr, 5906 &md_new_level.attr, 5907 &md_bitmap_type.attr, 5908 &md_layout.attr, 5909 &md_raid_disks.attr, 5910 &md_uuid.attr, 5911 &md_chunk_size.attr, 5912 &md_size.attr, 5913 &md_resync_start.attr, 5914 &md_metadata.attr, 5915 &md_new_device.attr, 5916 &md_safe_delay.attr, 5917 &md_array_state.attr, 5918 &md_reshape_position.attr, 5919 &md_reshape_direction.attr, 5920 &md_array_size.attr, 5921 &max_corr_read_errors.attr, 5922 &md_consistency_policy.attr, 5923 &md_fail_last_dev.attr, 5924 &md_serialize_policy.attr, 5925 NULL, 5926 }; 5927 5928 static const struct attribute_group md_default_group = { 5929 .attrs = md_default_attrs, 5930 }; 5931 5932 static struct attribute *md_redundancy_attrs[] = { 5933 &md_scan_mode.attr, 5934 &md_last_scan_mode.attr, 5935 &md_mismatches.attr, 5936 &md_sync_min.attr, 5937 &md_sync_max.attr, 5938 &md_sync_io_depth.attr, 5939 &md_sync_speed.attr, 5940 &md_sync_force_parallel.attr, 5941 &md_sync_completed.attr, 5942 &md_min_sync.attr, 5943 &md_max_sync.attr, 5944 &md_suspend_lo.attr, 5945 &md_suspend_hi.attr, 5946 &md_bitmap.attr, 5947 &md_degraded.attr, 5948 NULL, 5949 }; 5950 static const struct attribute_group md_redundancy_group = { 5951 .name = NULL, 5952 .attrs = md_redundancy_attrs, 5953 }; 5954 5955 static const struct attribute_group *md_attr_groups[] = { 5956 &md_default_group, 5957 NULL, 5958 }; 5959 5960 static ssize_t 5961 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 5962 { 5963 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 5964 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 5965 ssize_t rv; 5966 5967 if (!entry->show) 5968 return -EIO; 5969 spin_lock(&all_mddevs_lock); 5970 if (!mddev_get(mddev)) { 5971 spin_unlock(&all_mddevs_lock); 5972 return -EBUSY; 5973 } 5974 spin_unlock(&all_mddevs_lock); 5975 5976 rv = entry->show(mddev, page); 5977 mddev_put(mddev); 5978 return rv; 5979 } 5980 5981 static ssize_t 5982 md_attr_store(struct kobject *kobj, struct attribute *attr, 5983 const char *page, size_t length) 5984 { 5985 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 5986 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 5987 ssize_t rv; 5988 struct kernfs_node *kn = NULL; 5989 5990 if (!entry->store) 5991 return -EIO; 5992 if (!capable(CAP_SYS_ADMIN)) 5993 return -EACCES; 5994 5995 if (entry->store == array_state_store && cmd_match(page, "clear")) 5996 kn = sysfs_break_active_protection(kobj, attr); 5997 5998 
spin_lock(&all_mddevs_lock); 5999 if (!mddev_get(mddev)) { 6000 spin_unlock(&all_mddevs_lock); 6001 if (kn) 6002 sysfs_unbreak_active_protection(kn); 6003 return -EBUSY; 6004 } 6005 spin_unlock(&all_mddevs_lock); 6006 rv = entry->store(mddev, page, length); 6007 mddev_put(mddev); 6008 6009 if (kn) 6010 sysfs_unbreak_active_protection(kn); 6011 6012 return rv; 6013 } 6014 6015 static void md_kobj_release(struct kobject *ko) 6016 { 6017 struct mddev *mddev = container_of(ko, struct mddev, kobj); 6018 6019 if (legacy_async_del_gendisk) { 6020 if (mddev->sysfs_state) 6021 sysfs_put(mddev->sysfs_state); 6022 if (mddev->sysfs_level) 6023 sysfs_put(mddev->sysfs_level); 6024 del_gendisk(mddev->gendisk); 6025 } 6026 put_disk(mddev->gendisk); 6027 } 6028 6029 static const struct sysfs_ops md_sysfs_ops = { 6030 .show = md_attr_show, 6031 .store = md_attr_store, 6032 }; 6033 static const struct kobj_type md_ktype = { 6034 .release = md_kobj_release, 6035 .sysfs_ops = &md_sysfs_ops, 6036 .default_groups = md_attr_groups, 6037 }; 6038 6039 int mdp_major = 0; 6040 6041 /* stack the limit for all rdevs into lim */ 6042 int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim, 6043 unsigned int flags) 6044 { 6045 struct md_rdev *rdev; 6046 6047 rdev_for_each(rdev, mddev) { 6048 queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset, 6049 mddev->gendisk->disk_name); 6050 if ((flags & MDDEV_STACK_INTEGRITY) && 6051 !queue_limits_stack_integrity_bdev(lim, rdev->bdev)) 6052 return -EINVAL; 6053 } 6054 6055 return 0; 6056 } 6057 EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits); 6058 6059 /* apply the extra stacking limits from a new rdev into mddev */ 6060 int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev) 6061 { 6062 struct queue_limits lim; 6063 6064 if (mddev_is_dm(mddev)) 6065 return 0; 6066 6067 lim = queue_limits_start_update(mddev->gendisk->queue); 6068 queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset, 6069 mddev->gendisk->disk_name); 6070 6071 if (!queue_limits_stack_integrity_bdev(&lim, rdev->bdev)) { 6072 pr_err("%s: incompatible integrity profile for %pg\n", 6073 mdname(mddev), rdev->bdev); 6074 queue_limits_cancel_update(mddev->gendisk->queue); 6075 return -ENXIO; 6076 } 6077 6078 return queue_limits_commit_update(mddev->gendisk->queue, &lim); 6079 } 6080 EXPORT_SYMBOL_GPL(mddev_stack_new_rdev); 6081 6082 /* update the optimal I/O size after a reshape */ 6083 void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes) 6084 { 6085 struct queue_limits lim; 6086 6087 if (mddev_is_dm(mddev)) 6088 return; 6089 6090 /* don't bother updating io_opt if we can't suspend the array */ 6091 if (mddev_suspend(mddev, false) < 0) 6092 return; 6093 lim = queue_limits_start_update(mddev->gendisk->queue); 6094 lim.io_opt = lim.io_min * nr_stripes; 6095 queue_limits_commit_update(mddev->gendisk->queue, &lim); 6096 mddev_resume(mddev); 6097 } 6098 EXPORT_SYMBOL_GPL(mddev_update_io_opt); 6099 6100 static void mddev_delayed_delete(struct work_struct *ws) 6101 { 6102 struct mddev *mddev = container_of(ws, struct mddev, del_work); 6103 6104 kobject_put(&mddev->kobj); 6105 } 6106 6107 void md_init_stacking_limits(struct queue_limits *lim) 6108 { 6109 blk_set_stacking_limits(lim); 6110 lim->features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | 6111 BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; 6112 } 6113 EXPORT_SYMBOL_GPL(md_init_stacking_limits); 6114 6115 struct mddev *md_alloc(dev_t dev, char *name) 6116 { 6117 /* 6118 * If dev is zero, name is the name of a device to allocate with 
6119 * an arbitrary minor number. It will be "md_???" 6120 * If dev is non-zero it must be a device number with a MAJOR of 6121 * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then 6122 * the device is being created by opening a node in /dev. 6123 * If "name" is not NULL, the device is being created by 6124 * writing to /sys/module/md_mod/parameters/new_array. 6125 */ 6126 static DEFINE_MUTEX(disks_mutex); 6127 struct mddev *mddev; 6128 struct gendisk *disk; 6129 int partitioned; 6130 int shift; 6131 int unit; 6132 int error; 6133 6134 /* 6135 * Wait for any previous instance of this device to be completely 6136 * removed (mddev_delayed_delete). 6137 */ 6138 flush_workqueue(md_misc_wq); 6139 6140 mutex_lock(&disks_mutex); 6141 mddev = mddev_alloc(dev); 6142 if (IS_ERR(mddev)) { 6143 error = PTR_ERR(mddev); 6144 goto out_unlock; 6145 } 6146 6147 partitioned = (MAJOR(mddev->unit) != MD_MAJOR); 6148 shift = partitioned ? MdpMinorShift : 0; 6149 unit = MINOR(mddev->unit) >> shift; 6150 6151 if (name && !dev) { 6152 /* Need to ensure that 'name' is not a duplicate. 6153 */ 6154 struct mddev *mddev2; 6155 spin_lock(&all_mddevs_lock); 6156 6157 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 6158 if (mddev2->gendisk && 6159 strcmp(mddev2->gendisk->disk_name, name) == 0) { 6160 spin_unlock(&all_mddevs_lock); 6161 error = -EEXIST; 6162 goto out_free_mddev; 6163 } 6164 spin_unlock(&all_mddevs_lock); 6165 } 6166 if (name && dev) 6167 /* 6168 * Creating /dev/mdNNN via "newarray", so adjust hold_active. 6169 */ 6170 mddev->hold_active = UNTIL_STOP; 6171 6172 disk = blk_alloc_disk(NULL, NUMA_NO_NODE); 6173 if (IS_ERR(disk)) { 6174 error = PTR_ERR(disk); 6175 goto out_free_mddev; 6176 } 6177 6178 disk->major = MAJOR(mddev->unit); 6179 disk->first_minor = unit << shift; 6180 disk->minors = 1 << shift; 6181 if (name) 6182 strcpy(disk->disk_name, name); 6183 else if (partitioned) 6184 sprintf(disk->disk_name, "md_d%d", unit); 6185 else 6186 sprintf(disk->disk_name, "md%d", unit); 6187 disk->fops = &md_fops; 6188 disk->private_data = mddev; 6189 6190 disk->events |= DISK_EVENT_MEDIA_CHANGE; 6191 mddev->gendisk = disk; 6192 error = add_disk(disk); 6193 if (error) 6194 goto out_put_disk; 6195 6196 kobject_init(&mddev->kobj, &md_ktype); 6197 error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); 6198 if (error) { 6199 /* 6200 * The disk is already live at this point. Clear the hold flag 6201 * and let mddev_put take care of the deletion, as it isn't any 6202 * different from a normal close on last release now. 
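 * (mddev_put() defers the actual teardown to mddev_delayed_delete() on
 * md_misc_wq once the last reference goes away, which is also why
 * md_alloc() flushes that workqueue before reusing a unit.)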
6203 */ 6204 mddev->hold_active = 0; 6205 mutex_unlock(&disks_mutex); 6206 mddev_put(mddev); 6207 return ERR_PTR(error); 6208 } 6209 6210 kobject_uevent(&mddev->kobj, KOBJ_ADD); 6211 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); 6212 mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); 6213 mutex_unlock(&disks_mutex); 6214 return mddev; 6215 6216 out_put_disk: 6217 put_disk(disk); 6218 out_free_mddev: 6219 mddev_free(mddev); 6220 out_unlock: 6221 mutex_unlock(&disks_mutex); 6222 return ERR_PTR(error); 6223 } 6224 6225 static int md_alloc_and_put(dev_t dev, char *name) 6226 { 6227 struct mddev *mddev = md_alloc(dev, name); 6228 6229 if (legacy_async_del_gendisk) 6230 pr_warn("md: async del_gendisk mode will be removed in future, please upgrade to mdadm-4.5+\n"); 6231 6232 if (IS_ERR(mddev)) 6233 return PTR_ERR(mddev); 6234 mddev_put(mddev); 6235 return 0; 6236 } 6237 6238 static void md_probe(dev_t dev) 6239 { 6240 if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512) 6241 return; 6242 if (create_on_open) 6243 md_alloc_and_put(dev, NULL); 6244 } 6245 6246 static int add_named_array(const char *val, const struct kernel_param *kp) 6247 { 6248 /* 6249 * val must be "md_*" or "mdNNN". 6250 * For "md_*" we allocate an array with a large free minor number, and 6251 * set the name to val. val must not already be an active name. 6252 * For "mdNNN" we allocate an array with the minor number NNN 6253 * which must not already be in use. 6254 */ 6255 int len = strlen(val); 6256 char buf[DISK_NAME_LEN]; 6257 unsigned long devnum; 6258 6259 while (len && val[len-1] == '\n') 6260 len--; 6261 if (len >= DISK_NAME_LEN) 6262 return -E2BIG; 6263 strscpy(buf, val, len+1); 6264 if (strncmp(buf, "md_", 3) == 0) 6265 return md_alloc_and_put(0, buf); 6266 if (strncmp(buf, "md", 2) == 0 && 6267 isdigit(buf[2]) && 6268 kstrtoul(buf+2, 10, &devnum) == 0 && 6269 devnum <= MINORMASK) 6270 return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL); 6271 6272 return -EINVAL; 6273 } 6274 6275 static void md_safemode_timeout(struct timer_list *t) 6276 { 6277 struct mddev *mddev = timer_container_of(mddev, t, safemode_timer); 6278 6279 mddev->safemode = 1; 6280 if (mddev->external) 6281 sysfs_notify_dirent_safe(mddev->sysfs_state); 6282 6283 md_wakeup_thread(mddev->thread); 6284 } 6285 6286 static int start_dirty_degraded; 6287 6288 static int md_bitmap_create(struct mddev *mddev) 6289 { 6290 if (mddev->bitmap_id == ID_BITMAP_NONE) 6291 return -EINVAL; 6292 6293 if (!mddev_set_bitmap_ops(mddev)) 6294 return -ENOENT; 6295 6296 return mddev->bitmap_ops->create(mddev); 6297 } 6298 6299 static void md_bitmap_destroy(struct mddev *mddev) 6300 { 6301 if (!md_bitmap_registered(mddev)) 6302 return; 6303 6304 mddev->bitmap_ops->destroy(mddev); 6305 mddev_clear_bitmap_ops(mddev); 6306 } 6307 6308 int md_run(struct mddev *mddev) 6309 { 6310 int err; 6311 struct md_rdev *rdev; 6312 struct md_personality *pers; 6313 bool nowait = true; 6314 6315 if (list_empty(&mddev->disks)) 6316 /* cannot run an array with no devices.. 
*/ 6317 return -EINVAL; 6318 6319 if (mddev->pers) 6320 return -EBUSY; 6321 /* Cannot run until previous stop completes properly */ 6322 if (mddev->sysfs_active) 6323 return -EBUSY; 6324 6325 /* 6326 * Analyze all RAID superblock(s) 6327 */ 6328 if (!mddev->raid_disks) { 6329 if (!mddev->persistent) 6330 return -EINVAL; 6331 err = analyze_sbs(mddev); 6332 if (err) 6333 return -EINVAL; 6334 } 6335 6336 if (mddev->level != LEVEL_NONE) 6337 request_module("md-level-%d", mddev->level); 6338 else if (mddev->clevel[0]) 6339 request_module("md-%s", mddev->clevel); 6340 6341 /* 6342 * Drop all container device buffers, from now on 6343 * the only valid external interface is through the md 6344 * device. 6345 */ 6346 mddev->has_superblocks = false; 6347 rdev_for_each(rdev, mddev) { 6348 if (test_bit(Faulty, &rdev->flags)) 6349 continue; 6350 sync_blockdev(rdev->bdev); 6351 invalidate_bdev(rdev->bdev); 6352 if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) { 6353 mddev->ro = MD_RDONLY; 6354 if (!mddev_is_dm(mddev)) 6355 set_disk_ro(mddev->gendisk, 1); 6356 } 6357 6358 if (rdev->sb_page) 6359 mddev->has_superblocks = true; 6360 6361 /* perform some consistency tests on the device. 6362 * We don't want the data to overlap the metadata, 6363 * Internal Bitmap issues have been handled elsewhere. 6364 */ 6365 if (rdev->meta_bdev) { 6366 /* Nothing to check */; 6367 } else if (rdev->data_offset < rdev->sb_start) { 6368 if (mddev->dev_sectors && 6369 rdev->data_offset + mddev->dev_sectors 6370 > rdev->sb_start) { 6371 pr_warn("md: %s: data overlaps metadata\n", 6372 mdname(mddev)); 6373 return -EINVAL; 6374 } 6375 } else { 6376 if (rdev->sb_start + rdev->sb_size/512 6377 > rdev->data_offset) { 6378 pr_warn("md: %s: metadata overlaps data\n", 6379 mdname(mddev)); 6380 return -EINVAL; 6381 } 6382 } 6383 sysfs_notify_dirent_safe(rdev->sysfs_state); 6384 nowait = nowait && bdev_nowait(rdev->bdev); 6385 } 6386 6387 if (!bioset_initialized(&mddev->bio_set)) { 6388 err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 6389 if (err) 6390 return err; 6391 } 6392 if (!bioset_initialized(&mddev->sync_set)) { 6393 err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 6394 if (err) 6395 goto exit_bio_set; 6396 } 6397 6398 if (!bioset_initialized(&mddev->io_clone_set)) { 6399 err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE, 6400 offsetof(struct md_io_clone, bio_clone), 0); 6401 if (err) 6402 goto exit_sync_set; 6403 } 6404 6405 pers = get_pers(mddev->level, mddev->clevel); 6406 if (!pers) { 6407 err = -EINVAL; 6408 goto abort; 6409 } 6410 if (mddev->level != pers->head.id) { 6411 mddev->level = pers->head.id; 6412 mddev->new_level = pers->head.id; 6413 } 6414 strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel)); 6415 6416 if (mddev->reshape_position != MaxSector && 6417 pers->start_reshape == NULL) { 6418 /* This personality cannot handle reshaping... */ 6419 put_pers(pers); 6420 err = -EINVAL; 6421 goto abort; 6422 } 6423 6424 if (pers->sync_request) { 6425 /* Warn if this is a potentially silly 6426 * configuration. 
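 * e.g. two member rdevs that are partitions of the same physical disk
 * (the bd_disk comparison below), which defeats the redundancy the
 * RAID level is supposed to provide.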
6427 */ 6428 struct md_rdev *rdev2; 6429 int warned = 0; 6430 6431 rdev_for_each(rdev, mddev) 6432 rdev_for_each(rdev2, mddev) { 6433 if (rdev < rdev2 && 6434 rdev->bdev->bd_disk == 6435 rdev2->bdev->bd_disk) { 6436 pr_warn("%s: WARNING: %pg appears to be on the same physical disk as %pg.\n", 6437 mdname(mddev), 6438 rdev->bdev, 6439 rdev2->bdev); 6440 warned = 1; 6441 } 6442 } 6443 6444 if (warned) 6445 pr_warn("True protection against single-disk failure might be compromised.\n"); 6446 } 6447 6448 /* dm-raid expect sync_thread to be frozen until resume */ 6449 if (mddev->gendisk) 6450 mddev->recovery = 0; 6451 6452 /* may be over-ridden by personality */ 6453 mddev->resync_max_sectors = mddev->dev_sectors; 6454 6455 mddev->ok_start_degraded = start_dirty_degraded; 6456 6457 if (start_readonly && md_is_rdwr(mddev)) 6458 mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */ 6459 6460 err = pers->run(mddev); 6461 if (err) 6462 pr_warn("md: pers->run() failed ...\n"); 6463 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { 6464 WARN_ONCE(!mddev->external_size, 6465 "%s: default size too small, but 'external_size' not in effect?\n", 6466 __func__); 6467 pr_warn("md: invalid array_size %llu > default size %llu\n", 6468 (unsigned long long)mddev->array_sectors / 2, 6469 (unsigned long long)pers->size(mddev, 0, 0) / 2); 6470 err = -EINVAL; 6471 } 6472 if (err == 0 && pers->sync_request && 6473 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { 6474 err = md_bitmap_create(mddev); 6475 if (err) 6476 pr_warn("%s: failed to create bitmap (%d)\n", 6477 mdname(mddev), err); 6478 } 6479 if (err) 6480 goto bitmap_abort; 6481 6482 if (mddev->bitmap_info.max_write_behind > 0) { 6483 bool create_pool = false; 6484 6485 rdev_for_each(rdev, mddev) { 6486 if (test_bit(WriteMostly, &rdev->flags) && 6487 rdev_init_serial(rdev)) 6488 create_pool = true; 6489 } 6490 if (create_pool && mddev->serial_info_pool == NULL) { 6491 mddev->serial_info_pool = 6492 mempool_create_kmalloc_pool(NR_SERIAL_INFOS, 6493 sizeof(struct serial_info)); 6494 if (!mddev->serial_info_pool) { 6495 err = -ENOMEM; 6496 goto bitmap_abort; 6497 } 6498 } 6499 } 6500 6501 if (pers->sync_request) { 6502 if (mddev->kobj.sd && 6503 sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 6504 pr_warn("md: cannot register extra attributes for %s\n", 6505 mdname(mddev)); 6506 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); 6507 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); 6508 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); 6509 } else if (mddev->ro == MD_AUTO_READ) 6510 mddev->ro = MD_RDWR; 6511 6512 atomic_set(&mddev->max_corr_read_errors, 6513 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); 6514 mddev->safemode = 0; 6515 if (mddev_is_clustered(mddev)) 6516 mddev->safemode_delay = 0; 6517 else 6518 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; 6519 mddev->in_sync = 1; 6520 smp_wmb(); 6521 spin_lock(&mddev->lock); 6522 mddev->pers = pers; 6523 spin_unlock(&mddev->lock); 6524 rdev_for_each(rdev, mddev) 6525 if (rdev->raid_disk >= 0) 6526 sysfs_link_rdev(mddev, rdev); /* failure here is OK */ 6527 6528 if (mddev->degraded && md_is_rdwr(mddev)) 6529 /* This ensures that recovering status is reported immediately 6530 * via sysfs - until a lack of spares is confirmed. 
6531 */ 6532 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6533 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6534 6535 if (mddev->sb_flags) 6536 md_update_sb(mddev, 0); 6537 6538 md_new_event(); 6539 return 0; 6540 6541 bitmap_abort: 6542 mddev_detach(mddev); 6543 if (mddev->private) 6544 pers->free(mddev, mddev->private); 6545 mddev->private = NULL; 6546 put_pers(pers); 6547 md_bitmap_destroy(mddev); 6548 abort: 6549 bioset_exit(&mddev->io_clone_set); 6550 exit_sync_set: 6551 bioset_exit(&mddev->sync_set); 6552 exit_bio_set: 6553 bioset_exit(&mddev->bio_set); 6554 return err; 6555 } 6556 EXPORT_SYMBOL_GPL(md_run); 6557 6558 int do_md_run(struct mddev *mddev) 6559 { 6560 int err; 6561 6562 set_bit(MD_NOT_READY, &mddev->flags); 6563 err = md_run(mddev); 6564 if (err) 6565 goto out; 6566 6567 if (md_bitmap_registered(mddev)) { 6568 err = mddev->bitmap_ops->load(mddev); 6569 if (err) { 6570 md_bitmap_destroy(mddev); 6571 goto out; 6572 } 6573 } 6574 6575 if (mddev_is_clustered(mddev)) 6576 md_allow_write(mddev); 6577 6578 /* run start up tasks that require md_thread */ 6579 md_start(mddev); 6580 6581 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 6582 6583 set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); 6584 clear_bit(MD_NOT_READY, &mddev->flags); 6585 mddev->changed = 1; 6586 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 6587 sysfs_notify_dirent_safe(mddev->sysfs_state); 6588 sysfs_notify_dirent_safe(mddev->sysfs_action); 6589 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 6590 out: 6591 clear_bit(MD_NOT_READY, &mddev->flags); 6592 return err; 6593 } 6594 6595 int md_start(struct mddev *mddev) 6596 { 6597 int ret = 0; 6598 6599 if (mddev->pers->start) { 6600 set_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6601 ret = mddev->pers->start(mddev); 6602 clear_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6603 md_wakeup_thread(mddev->sync_thread); 6604 } 6605 return ret; 6606 } 6607 EXPORT_SYMBOL_GPL(md_start); 6608 6609 static int restart_array(struct mddev *mddev) 6610 { 6611 struct gendisk *disk = mddev->gendisk; 6612 struct md_rdev *rdev; 6613 bool has_journal = false; 6614 bool has_readonly = false; 6615 6616 /* Complain if it has no devices */ 6617 if (list_empty(&mddev->disks)) 6618 return -ENXIO; 6619 if (!mddev->pers) 6620 return -EINVAL; 6621 if (md_is_rdwr(mddev)) 6622 return -EBUSY; 6623 6624 rcu_read_lock(); 6625 rdev_for_each_rcu(rdev, mddev) { 6626 if (test_bit(Journal, &rdev->flags) && 6627 !test_bit(Faulty, &rdev->flags)) 6628 has_journal = true; 6629 if (rdev_read_only(rdev)) 6630 has_readonly = true; 6631 } 6632 rcu_read_unlock(); 6633 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal) 6634 /* Don't restart rw with journal missing/faulty */ 6635 return -EINVAL; 6636 if (has_readonly) 6637 return -EROFS; 6638 6639 mddev->safemode = 0; 6640 mddev->ro = MD_RDWR; 6641 set_disk_ro(disk, 0); 6642 pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); 6643 /* Kick recovery or resync if necessary */ 6644 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6645 md_wakeup_thread(mddev->sync_thread); 6646 sysfs_notify_dirent_safe(mddev->sysfs_state); 6647 return 0; 6648 } 6649 6650 static void md_clean(struct mddev *mddev) 6651 { 6652 mddev->array_sectors = 0; 6653 mddev->external_size = 0; 6654 mddev->dev_sectors = 0; 6655 mddev->raid_disks = 0; 6656 mddev->resync_offset = 0; 6657 mddev->resync_min = 0; 6658 mddev->resync_max = MaxSector; 6659 mddev->reshape_position = MaxSector; 6660 /* we still need mddev->external 
in export_rdev, do not clear it yet */ 6661 mddev->persistent = 0; 6662 mddev->level = LEVEL_NONE; 6663 mddev->clevel[0] = 0; 6664 6665 /* 6666 * For legacy_async_del_gendisk mode, it can stop the array in the 6667 * middle of assembling it, then it still can access the array. So 6668 * it needs to clear MD_CLOSING. If not legacy_async_del_gendisk, 6669 * it can't open the array again after stopping it. So it doesn't 6670 * clear MD_CLOSING. 6671 */ 6672 if (legacy_async_del_gendisk && mddev->hold_active) { 6673 clear_bit(MD_CLOSING, &mddev->flags); 6674 } else { 6675 /* if UNTIL_STOP is set, it's cleared here */ 6676 mddev->hold_active = 0; 6677 /* Don't clear MD_CLOSING, or mddev can be opened again. */ 6678 mddev->flags &= BIT_ULL_MASK(MD_CLOSING); 6679 } 6680 mddev->sb_flags = 0; 6681 mddev->ro = MD_RDWR; 6682 mddev->metadata_type[0] = 0; 6683 mddev->chunk_sectors = 0; 6684 mddev->ctime = mddev->utime = 0; 6685 mddev->layout = 0; 6686 mddev->max_disks = 0; 6687 mddev->events = 0; 6688 mddev->can_decrease_events = 0; 6689 mddev->delta_disks = 0; 6690 mddev->reshape_backwards = 0; 6691 mddev->new_level = LEVEL_NONE; 6692 mddev->new_layout = 0; 6693 mddev->new_chunk_sectors = 0; 6694 mddev->curr_resync = MD_RESYNC_NONE; 6695 atomic64_set(&mddev->resync_mismatches, 0); 6696 mddev->suspend_lo = mddev->suspend_hi = 0; 6697 mddev->sync_speed_min = mddev->sync_speed_max = 0; 6698 mddev->recovery = 0; 6699 mddev->in_sync = 0; 6700 mddev->changed = 0; 6701 mddev->degraded = 0; 6702 mddev->safemode = 0; 6703 mddev->private = NULL; 6704 mddev->cluster_info = NULL; 6705 mddev->bitmap_info.offset = 0; 6706 mddev->bitmap_info.default_offset = 0; 6707 mddev->bitmap_info.default_space = 0; 6708 mddev->bitmap_info.chunksize = 0; 6709 mddev->bitmap_info.daemon_sleep = 0; 6710 mddev->bitmap_info.max_write_behind = 0; 6711 mddev->bitmap_info.nodes = 0; 6712 } 6713 6714 static void __md_stop_writes(struct mddev *mddev) 6715 { 6716 timer_delete_sync(&mddev->safemode_timer); 6717 6718 if (mddev->pers && mddev->pers->quiesce) { 6719 mddev->pers->quiesce(mddev, 1); 6720 mddev->pers->quiesce(mddev, 0); 6721 } 6722 6723 if (md_bitmap_enabled(mddev, true)) 6724 mddev->bitmap_ops->flush(mddev); 6725 6726 if (md_is_rdwr(mddev) && 6727 ((!mddev->in_sync && !mddev_is_clustered(mddev)) || 6728 mddev->sb_flags)) { 6729 /* mark array as shutdown cleanly */ 6730 if (!mddev_is_clustered(mddev)) 6731 mddev->in_sync = 1; 6732 md_update_sb(mddev, 1); 6733 } 6734 /* disable policy to guarantee rdevs free resources for serialization */ 6735 mddev->serialize_policy = 0; 6736 mddev_destroy_serial_pool(mddev, NULL); 6737 } 6738 6739 void md_stop_writes(struct mddev *mddev) 6740 { 6741 mddev_lock_nointr(mddev); 6742 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6743 stop_sync_thread(mddev, true); 6744 __md_stop_writes(mddev); 6745 mddev_unlock(mddev); 6746 } 6747 EXPORT_SYMBOL_GPL(md_stop_writes); 6748 6749 static void mddev_detach(struct mddev *mddev) 6750 { 6751 if (md_bitmap_enabled(mddev, false)) 6752 mddev->bitmap_ops->wait_behind_writes(mddev); 6753 if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { 6754 mddev->pers->quiesce(mddev, 1); 6755 mddev->pers->quiesce(mddev, 0); 6756 } 6757 md_unregister_thread(mddev, &mddev->thread); 6758 6759 /* the unplug fn references 'conf' */ 6760 if (!mddev_is_dm(mddev)) 6761 blk_sync_queue(mddev->gendisk->queue); 6762 } 6763 6764 static void __md_stop(struct mddev *mddev) 6765 { 6766 struct md_personality *pers = mddev->pers; 6767 6768 md_bitmap_destroy(mddev); 6769 
mddev_detach(mddev); 6770 spin_lock(&mddev->lock); 6771 mddev->pers = NULL; 6772 spin_unlock(&mddev->lock); 6773 if (mddev->private) 6774 pers->free(mddev, mddev->private); 6775 mddev->private = NULL; 6776 put_pers(pers); 6777 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6778 6779 bioset_exit(&mddev->bio_set); 6780 bioset_exit(&mddev->sync_set); 6781 bioset_exit(&mddev->io_clone_set); 6782 } 6783 6784 void md_stop(struct mddev *mddev) 6785 { 6786 lockdep_assert_held(&mddev->reconfig_mutex); 6787 6788 /* stop the array and free an attached data structures. 6789 * This is called from dm-raid 6790 */ 6791 __md_stop_writes(mddev); 6792 __md_stop(mddev); 6793 } 6794 6795 EXPORT_SYMBOL_GPL(md_stop); 6796 6797 /* ensure 'mddev->pers' exist before calling md_set_readonly() */ 6798 static int md_set_readonly(struct mddev *mddev) 6799 { 6800 int err = 0; 6801 int did_freeze = 0; 6802 6803 if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 6804 return -EBUSY; 6805 6806 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 6807 did_freeze = 1; 6808 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6809 } 6810 6811 stop_sync_thread(mddev, false); 6812 wait_event(mddev->sb_wait, 6813 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 6814 mddev_lock_nointr(mddev); 6815 6816 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 6817 pr_warn("md: %s still in use.\n",mdname(mddev)); 6818 err = -EBUSY; 6819 goto out; 6820 } 6821 6822 __md_stop_writes(mddev); 6823 6824 if (mddev->ro == MD_RDONLY) { 6825 err = -ENXIO; 6826 goto out; 6827 } 6828 6829 mddev->ro = MD_RDONLY; 6830 set_disk_ro(mddev->gendisk, 1); 6831 6832 out: 6833 if (!err || did_freeze) { 6834 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6835 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6836 sysfs_notify_dirent_safe(mddev->sysfs_state); 6837 } 6838 6839 return err; 6840 } 6841 6842 /* mode: 6843 * 0 - completely stop and dis-assemble array 6844 * 2 - stop but do not disassemble array 6845 */ 6846 static int do_md_stop(struct mddev *mddev, int mode) 6847 { 6848 struct gendisk *disk = mddev->gendisk; 6849 struct md_rdev *rdev; 6850 int did_freeze = 0; 6851 6852 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 6853 did_freeze = 1; 6854 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6855 } 6856 6857 stop_sync_thread(mddev, true); 6858 6859 if (mddev->sysfs_active || 6860 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 6861 pr_warn("md: %s still in use.\n",mdname(mddev)); 6862 if (did_freeze) { 6863 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6864 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6865 } 6866 return -EBUSY; 6867 } 6868 if (mddev->pers) { 6869 if (!md_is_rdwr(mddev)) 6870 set_disk_ro(disk, 0); 6871 6872 __md_stop_writes(mddev); 6873 __md_stop(mddev); 6874 6875 /* tell userspace to handle 'inactive' */ 6876 sysfs_notify_dirent_safe(mddev->sysfs_state); 6877 6878 rdev_for_each(rdev, mddev) 6879 if (rdev->raid_disk >= 0) 6880 sysfs_unlink_rdev(mddev, rdev); 6881 6882 set_capacity_and_notify(disk, 0); 6883 mddev->changed = 1; 6884 6885 if (!md_is_rdwr(mddev)) 6886 mddev->ro = MD_RDWR; 6887 } 6888 /* 6889 * Free resources if final stop 6890 */ 6891 if (mode == 0) { 6892 pr_info("md: %s stopped.\n", mdname(mddev)); 6893 6894 if (mddev->bitmap_info.file) { 6895 struct file *f = mddev->bitmap_info.file; 6896 spin_lock(&mddev->lock); 6897 mddev->bitmap_info.file = NULL; 6898 spin_unlock(&mddev->lock); 6899 fput(f); 6900 } 6901 mddev->bitmap_info.offset = 0; 6902 6903 export_array(mddev); 6904 
md_clean(mddev); 6905 if (!legacy_async_del_gendisk) 6906 set_bit(MD_DELETED, &mddev->flags); 6907 } 6908 md_new_event(); 6909 sysfs_notify_dirent_safe(mddev->sysfs_state); 6910 return 0; 6911 } 6912 6913 #ifndef MODULE 6914 static void autorun_array(struct mddev *mddev) 6915 { 6916 struct md_rdev *rdev; 6917 int err; 6918 6919 if (list_empty(&mddev->disks)) 6920 return; 6921 6922 pr_info("md: running: "); 6923 6924 rdev_for_each(rdev, mddev) { 6925 pr_cont("<%pg>", rdev->bdev); 6926 } 6927 pr_cont("\n"); 6928 6929 err = do_md_run(mddev); 6930 if (err) { 6931 pr_warn("md: do_md_run() returned %d\n", err); 6932 do_md_stop(mddev, 0); 6933 } 6934 } 6935 6936 /* 6937 * lets try to run arrays based on all disks that have arrived 6938 * until now. (those are in pending_raid_disks) 6939 * 6940 * the method: pick the first pending disk, collect all disks with 6941 * the same UUID, remove all from the pending list and put them into 6942 * the 'same_array' list. Then order this list based on superblock 6943 * update time (freshest comes first), kick out 'old' disks and 6944 * compare superblocks. If everything's fine then run it. 6945 * 6946 * If "unit" is allocated, then bump its reference count 6947 */ 6948 static void autorun_devices(int part) 6949 { 6950 struct md_rdev *rdev0, *rdev, *tmp; 6951 struct mddev *mddev; 6952 6953 pr_info("md: autorun ...\n"); 6954 while (!list_empty(&pending_raid_disks)) { 6955 int unit; 6956 dev_t dev; 6957 LIST_HEAD(candidates); 6958 rdev0 = list_entry(pending_raid_disks.next, 6959 struct md_rdev, same_set); 6960 6961 pr_debug("md: considering %pg ...\n", rdev0->bdev); 6962 INIT_LIST_HEAD(&candidates); 6963 rdev_for_each_list(rdev, tmp, &pending_raid_disks) 6964 if (super_90_load(rdev, rdev0, 0) >= 0) { 6965 pr_debug("md: adding %pg ...\n", 6966 rdev->bdev); 6967 list_move(&rdev->same_set, &candidates); 6968 } 6969 /* 6970 * now we have a set of devices, with all of them having 6971 * mostly sane superblocks. It's time to allocate the 6972 * mddev. 6973 */ 6974 if (part) { 6975 dev = MKDEV(mdp_major, 6976 rdev0->preferred_minor << MdpMinorShift); 6977 unit = MINOR(dev) >> MdpMinorShift; 6978 } else { 6979 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 6980 unit = MINOR(dev); 6981 } 6982 if (rdev0->preferred_minor != unit) { 6983 pr_warn("md: unit number in %pg is bad: %d\n", 6984 rdev0->bdev, rdev0->preferred_minor); 6985 break; 6986 } 6987 6988 mddev = md_alloc(dev, NULL); 6989 if (IS_ERR(mddev)) 6990 break; 6991 6992 if (mddev_suspend_and_lock(mddev)) 6993 pr_warn("md: %s locked, cannot run\n", mdname(mddev)); 6994 else if (mddev->raid_disks || mddev->major_version 6995 || !list_empty(&mddev->disks)) { 6996 pr_warn("md: %s already running, cannot run %pg\n", 6997 mdname(mddev), rdev0->bdev); 6998 mddev_unlock_and_resume(mddev); 6999 } else { 7000 pr_debug("md: created %s\n", mdname(mddev)); 7001 mddev->persistent = 1; 7002 rdev_for_each_list(rdev, tmp, &candidates) { 7003 list_del_init(&rdev->same_set); 7004 if (bind_rdev_to_array(rdev, mddev)) 7005 export_rdev(rdev, mddev); 7006 } 7007 autorun_array(mddev); 7008 mddev_unlock_and_resume(mddev); 7009 } 7010 /* on success, candidates will be empty, on error 7011 * it won't... 7012 */ 7013 rdev_for_each_list(rdev, tmp, &candidates) { 7014 list_del_init(&rdev->same_set); 7015 export_rdev(rdev, mddev); 7016 } 7017 mddev_put(mddev); 7018 } 7019 pr_info("md: ... 
autorun DONE.\n"); 7020 } 7021 #endif /* !MODULE */ 7022 7023 static int get_version(void __user *arg) 7024 { 7025 mdu_version_t ver; 7026 7027 ver.major = MD_MAJOR_VERSION; 7028 ver.minor = MD_MINOR_VERSION; 7029 ver.patchlevel = MD_PATCHLEVEL_VERSION; 7030 7031 if (copy_to_user(arg, &ver, sizeof(ver))) 7032 return -EFAULT; 7033 7034 return 0; 7035 } 7036 7037 static int get_array_info(struct mddev *mddev, void __user *arg) 7038 { 7039 mdu_array_info_t info; 7040 int nr,working,insync,failed,spare; 7041 struct md_rdev *rdev; 7042 7043 nr = working = insync = failed = spare = 0; 7044 rcu_read_lock(); 7045 rdev_for_each_rcu(rdev, mddev) { 7046 nr++; 7047 if (test_bit(Faulty, &rdev->flags)) 7048 failed++; 7049 else { 7050 working++; 7051 if (test_bit(In_sync, &rdev->flags)) 7052 insync++; 7053 else if (test_bit(Journal, &rdev->flags)) 7054 /* TODO: add journal count to md_u.h */ 7055 ; 7056 else 7057 spare++; 7058 } 7059 } 7060 rcu_read_unlock(); 7061 7062 info.major_version = mddev->major_version; 7063 info.minor_version = mddev->minor_version; 7064 info.patch_version = MD_PATCHLEVEL_VERSION; 7065 info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); 7066 info.level = mddev->level; 7067 info.size = mddev->dev_sectors / 2; 7068 if (info.size != mddev->dev_sectors / 2) /* overflow */ 7069 info.size = -1; 7070 info.nr_disks = nr; 7071 info.raid_disks = mddev->raid_disks; 7072 info.md_minor = mddev->md_minor; 7073 info.not_persistent= !mddev->persistent; 7074 7075 info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 7076 info.state = 0; 7077 if (mddev->in_sync) 7078 info.state = (1<<MD_SB_CLEAN); 7079 if (mddev->bitmap && mddev->bitmap_info.offset) 7080 info.state |= (1<<MD_SB_BITMAP_PRESENT); 7081 if (mddev_is_clustered(mddev)) 7082 info.state |= (1<<MD_SB_CLUSTERED); 7083 info.active_disks = insync; 7084 info.working_disks = working; 7085 info.failed_disks = failed; 7086 info.spare_disks = spare; 7087 7088 info.layout = mddev->layout; 7089 info.chunk_size = mddev->chunk_sectors << 9; 7090 7091 if (copy_to_user(arg, &info, sizeof(info))) 7092 return -EFAULT; 7093 7094 return 0; 7095 } 7096 7097 static int get_bitmap_file(struct mddev *mddev, void __user * arg) 7098 { 7099 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 7100 char *ptr; 7101 int err; 7102 7103 file = kzalloc(sizeof(*file), GFP_NOIO); 7104 if (!file) 7105 return -ENOMEM; 7106 7107 err = 0; 7108 spin_lock(&mddev->lock); 7109 /* bitmap enabled */ 7110 if (mddev->bitmap_info.file) { 7111 ptr = file_path(mddev->bitmap_info.file, file->pathname, 7112 sizeof(file->pathname)); 7113 if (IS_ERR(ptr)) 7114 err = PTR_ERR(ptr); 7115 else 7116 memmove(file->pathname, ptr, 7117 sizeof(file->pathname)-(ptr-file->pathname)); 7118 } 7119 spin_unlock(&mddev->lock); 7120 7121 if (err == 0 && 7122 copy_to_user(arg, file, sizeof(*file))) 7123 err = -EFAULT; 7124 7125 kfree(file); 7126 return err; 7127 } 7128 7129 static int get_disk_info(struct mddev *mddev, void __user * arg) 7130 { 7131 mdu_disk_info_t info; 7132 struct md_rdev *rdev; 7133 7134 if (copy_from_user(&info, arg, sizeof(info))) 7135 return -EFAULT; 7136 7137 rcu_read_lock(); 7138 rdev = md_find_rdev_nr_rcu(mddev, info.number); 7139 if (rdev) { 7140 info.major = MAJOR(rdev->bdev->bd_dev); 7141 info.minor = MINOR(rdev->bdev->bd_dev); 7142 info.raid_disk = rdev->raid_disk; 7143 info.state = 0; 7144 if (test_bit(Faulty, &rdev->flags)) 7145 info.state |= (1<<MD_DISK_FAULTY); 7146 else if (test_bit(In_sync, &rdev->flags)) { 7147 info.state |= (1<<MD_DISK_ACTIVE); 
7148 info.state |= (1<<MD_DISK_SYNC); 7149 } 7150 if (test_bit(Journal, &rdev->flags)) 7151 info.state |= (1<<MD_DISK_JOURNAL); 7152 if (test_bit(WriteMostly, &rdev->flags)) 7153 info.state |= (1<<MD_DISK_WRITEMOSTLY); 7154 if (test_bit(FailFast, &rdev->flags)) 7155 info.state |= (1<<MD_DISK_FAILFAST); 7156 } else { 7157 info.major = info.minor = 0; 7158 info.raid_disk = -1; 7159 info.state = (1<<MD_DISK_REMOVED); 7160 } 7161 rcu_read_unlock(); 7162 7163 if (copy_to_user(arg, &info, sizeof(info))) 7164 return -EFAULT; 7165 7166 return 0; 7167 } 7168 7169 int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) 7170 { 7171 struct md_rdev *rdev; 7172 dev_t dev = MKDEV(info->major,info->minor); 7173 7174 if (mddev_is_clustered(mddev) && 7175 !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { 7176 pr_warn("%s: Cannot add to clustered mddev.\n", 7177 mdname(mddev)); 7178 return -EINVAL; 7179 } 7180 7181 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 7182 return -EOVERFLOW; 7183 7184 if (!mddev->raid_disks) { 7185 int err; 7186 /* expecting a device which has a superblock */ 7187 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 7188 if (IS_ERR(rdev)) { 7189 pr_warn("md: md_import_device returned %ld\n", 7190 PTR_ERR(rdev)); 7191 return PTR_ERR(rdev); 7192 } 7193 if (!list_empty(&mddev->disks)) { 7194 struct md_rdev *rdev0 7195 = list_entry(mddev->disks.next, 7196 struct md_rdev, same_set); 7197 err = super_types[mddev->major_version] 7198 .load_super(rdev, rdev0, mddev->minor_version); 7199 if (err < 0) { 7200 pr_warn("md: %pg has different UUID to %pg\n", 7201 rdev->bdev, 7202 rdev0->bdev); 7203 export_rdev(rdev, mddev); 7204 return -EINVAL; 7205 } 7206 } 7207 err = bind_rdev_to_array(rdev, mddev); 7208 if (err) 7209 export_rdev(rdev, mddev); 7210 return err; 7211 } 7212 7213 /* 7214 * md_add_new_disk can be used once the array is assembled 7215 * to add "hot spares". They must already have a superblock 7216 * written 7217 */ 7218 if (mddev->pers) { 7219 int err; 7220 if (!mddev->pers->hot_add_disk) { 7221 pr_warn("%s: personality does not support diskops!\n", 7222 mdname(mddev)); 7223 return -EINVAL; 7224 } 7225 if (mddev->persistent) 7226 rdev = md_import_device(dev, mddev->major_version, 7227 mddev->minor_version); 7228 else 7229 rdev = md_import_device(dev, -1, -1); 7230 if (IS_ERR(rdev)) { 7231 pr_warn("md: md_import_device returned %ld\n", 7232 PTR_ERR(rdev)); 7233 return PTR_ERR(rdev); 7234 } 7235 /* set saved_raid_disk if appropriate */ 7236 if (!mddev->persistent) { 7237 if (info->state & (1<<MD_DISK_SYNC) && 7238 info->raid_disk < mddev->raid_disks) { 7239 rdev->raid_disk = info->raid_disk; 7240 clear_bit(Bitmap_sync, &rdev->flags); 7241 } else 7242 rdev->raid_disk = -1; 7243 rdev->saved_raid_disk = rdev->raid_disk; 7244 } else 7245 super_types[mddev->major_version]. 7246 validate_super(mddev, NULL/*freshest*/, rdev); 7247 if ((info->state & (1<<MD_DISK_SYNC)) && 7248 rdev->raid_disk != info->raid_disk) { 7249 /* This was a hot-add request, but events doesn't 7250 * match, so reject it. 
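 * (i.e. the slot derived from validating the on-disk superblock above
 * differs from the raid_disk that userspace asked for)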
7251 */ 7252 export_rdev(rdev, mddev); 7253 return -EINVAL; 7254 } 7255 7256 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 7257 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 7258 set_bit(WriteMostly, &rdev->flags); 7259 else 7260 clear_bit(WriteMostly, &rdev->flags); 7261 if (info->state & (1<<MD_DISK_FAILFAST)) 7262 set_bit(FailFast, &rdev->flags); 7263 else 7264 clear_bit(FailFast, &rdev->flags); 7265 7266 if (info->state & (1<<MD_DISK_JOURNAL)) { 7267 struct md_rdev *rdev2; 7268 bool has_journal = false; 7269 7270 /* make sure no existing journal disk */ 7271 rdev_for_each(rdev2, mddev) { 7272 if (test_bit(Journal, &rdev2->flags)) { 7273 has_journal = true; 7274 break; 7275 } 7276 } 7277 if (has_journal || mddev->bitmap) { 7278 export_rdev(rdev, mddev); 7279 return -EBUSY; 7280 } 7281 set_bit(Journal, &rdev->flags); 7282 } 7283 /* 7284 * check whether the device shows up in other nodes 7285 */ 7286 if (mddev_is_clustered(mddev)) { 7287 if (info->state & (1 << MD_DISK_CANDIDATE)) 7288 set_bit(Candidate, &rdev->flags); 7289 else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { 7290 /* --add initiated by this node */ 7291 err = mddev->cluster_ops->add_new_disk(mddev, rdev); 7292 if (err) { 7293 export_rdev(rdev, mddev); 7294 return err; 7295 } 7296 } 7297 } 7298 7299 rdev->raid_disk = -1; 7300 err = bind_rdev_to_array(rdev, mddev); 7301 7302 if (err) 7303 export_rdev(rdev, mddev); 7304 7305 if (mddev_is_clustered(mddev)) { 7306 if (info->state & (1 << MD_DISK_CANDIDATE)) { 7307 if (!err) { 7308 err = mddev->cluster_ops->new_disk_ack( 7309 mddev, err == 0); 7310 if (err) 7311 md_kick_rdev_from_array(rdev); 7312 } 7313 } else { 7314 if (err) 7315 mddev->cluster_ops->add_new_disk_cancel(mddev); 7316 else 7317 err = add_bound_rdev(rdev); 7318 } 7319 7320 } else if (!err) 7321 err = add_bound_rdev(rdev); 7322 7323 return err; 7324 } 7325 7326 /* otherwise, md_add_new_disk is only allowed 7327 * for major_version==0 superblocks 7328 */ 7329 if (mddev->major_version != 0) { 7330 pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev)); 7331 return -EINVAL; 7332 } 7333 7334 if (!(info->state & (1<<MD_DISK_FAULTY))) { 7335 int err; 7336 rdev = md_import_device(dev, -1, 0); 7337 if (IS_ERR(rdev)) { 7338 pr_warn("md: error, md_import_device() returned %ld\n", 7339 PTR_ERR(rdev)); 7340 return PTR_ERR(rdev); 7341 } 7342 rdev->desc_nr = info->number; 7343 if (info->raid_disk < mddev->raid_disks) 7344 rdev->raid_disk = info->raid_disk; 7345 else 7346 rdev->raid_disk = -1; 7347 7348 if (rdev->raid_disk < mddev->raid_disks) 7349 if (info->state & (1<<MD_DISK_SYNC)) 7350 set_bit(In_sync, &rdev->flags); 7351 7352 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 7353 set_bit(WriteMostly, &rdev->flags); 7354 if (info->state & (1<<MD_DISK_FAILFAST)) 7355 set_bit(FailFast, &rdev->flags); 7356 7357 if (!mddev->persistent) { 7358 pr_debug("md: nonpersistent superblock ...\n"); 7359 rdev->sb_start = bdev_nr_sectors(rdev->bdev); 7360 } else 7361 rdev->sb_start = calc_dev_sboffset(rdev); 7362 rdev->sectors = rdev->sb_start; 7363 7364 err = bind_rdev_to_array(rdev, mddev); 7365 if (err) { 7366 export_rdev(rdev, mddev); 7367 return err; 7368 } 7369 } 7370 7371 return 0; 7372 } 7373 7374 static int hot_remove_disk(struct mddev *mddev, dev_t dev) 7375 { 7376 struct md_rdev *rdev; 7377 7378 if (!mddev->pers) 7379 return -ENODEV; 7380 7381 rdev = find_rdev(mddev, dev); 7382 if (!rdev) 7383 return -ENXIO; 7384 7385 if (rdev->raid_disk < 0) 7386 goto kick_rdev; 7387 7388 clear_bit(Blocked, &rdev->flags); 7389 
remove_and_add_spares(mddev, rdev); 7390 7391 if (rdev->raid_disk >= 0) 7392 goto busy; 7393 7394 kick_rdev: 7395 if (mddev_is_clustered(mddev) && 7396 mddev->cluster_ops->remove_disk(mddev, rdev)) 7397 goto busy; 7398 7399 md_kick_rdev_from_array(rdev); 7400 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7401 if (!mddev->thread) 7402 md_update_sb(mddev, 1); 7403 md_new_event(); 7404 7405 return 0; 7406 busy: 7407 pr_debug("md: cannot remove active disk %pg from %s ...\n", 7408 rdev->bdev, mdname(mddev)); 7409 return -EBUSY; 7410 } 7411 7412 static int hot_add_disk(struct mddev *mddev, dev_t dev) 7413 { 7414 int err; 7415 struct md_rdev *rdev; 7416 7417 if (!mddev->pers) 7418 return -ENODEV; 7419 7420 if (mddev->major_version != 0) { 7421 pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n", 7422 mdname(mddev)); 7423 return -EINVAL; 7424 } 7425 if (!mddev->pers->hot_add_disk) { 7426 pr_warn("%s: personality does not support diskops!\n", 7427 mdname(mddev)); 7428 return -EINVAL; 7429 } 7430 7431 rdev = md_import_device(dev, -1, 0); 7432 if (IS_ERR(rdev)) { 7433 pr_warn("md: error, md_import_device() returned %ld\n", 7434 PTR_ERR(rdev)); 7435 return -EINVAL; 7436 } 7437 7438 if (mddev->persistent) 7439 rdev->sb_start = calc_dev_sboffset(rdev); 7440 else 7441 rdev->sb_start = bdev_nr_sectors(rdev->bdev); 7442 7443 rdev->sectors = rdev->sb_start; 7444 7445 if (test_bit(Faulty, &rdev->flags)) { 7446 pr_warn("md: can not hot-add faulty %pg disk to %s!\n", 7447 rdev->bdev, mdname(mddev)); 7448 err = -EINVAL; 7449 goto abort_export; 7450 } 7451 7452 clear_bit(In_sync, &rdev->flags); 7453 rdev->desc_nr = -1; 7454 rdev->saved_raid_disk = -1; 7455 err = bind_rdev_to_array(rdev, mddev); 7456 if (err) 7457 goto abort_export; 7458 7459 /* 7460 * The rest should better be atomic, we can have disk failures 7461 * noticed in interrupt contexts ... 7462 */ 7463 7464 rdev->raid_disk = -1; 7465 7466 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7467 if (!mddev->thread) 7468 md_update_sb(mddev, 1); 7469 /* 7470 * Kick recovery, maybe this spare has to be added to the 7471 * array immediately. 7472 */ 7473 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7474 md_new_event(); 7475 return 0; 7476 7477 abort_export: 7478 export_rdev(rdev, mddev); 7479 return err; 7480 } 7481 7482 static int set_bitmap_file(struct mddev *mddev, int fd) 7483 { 7484 int err = 0; 7485 7486 if (!md_bitmap_registered(mddev)) 7487 return -EINVAL; 7488 7489 if (mddev->pers) { 7490 if (!mddev->pers->quiesce || !mddev->thread) 7491 return -EBUSY; 7492 if (mddev->recovery || mddev->sync_thread) 7493 return -EBUSY; 7494 /* we should be able to change the bitmap.. 
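 * Sketch of the expected call: userspace, typically mdadm, issues the
 * SET_BITMAP_FILE ioctl with an open fd of the bitmap file to add a
 * file-backed bitmap, or with a negative fd to remove the one currently
 * in use; both paths are handled below.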
*/ 7495 } 7496 7497 if (fd >= 0) { 7498 struct inode *inode; 7499 struct file *f; 7500 7501 if (mddev->bitmap || mddev->bitmap_info.file) 7502 return -EEXIST; /* cannot add when bitmap is present */ 7503 7504 if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) { 7505 pr_warn("%s: bitmap files not supported by this kernel\n", 7506 mdname(mddev)); 7507 return -EINVAL; 7508 } 7509 pr_warn("%s: using deprecated bitmap file support\n", 7510 mdname(mddev)); 7511 7512 f = fget(fd); 7513 7514 if (f == NULL) { 7515 pr_warn("%s: error: failed to get bitmap file\n", 7516 mdname(mddev)); 7517 return -EBADF; 7518 } 7519 7520 inode = f->f_mapping->host; 7521 if (!S_ISREG(inode->i_mode)) { 7522 pr_warn("%s: error: bitmap file must be a regular file\n", 7523 mdname(mddev)); 7524 err = -EBADF; 7525 } else if (!(f->f_mode & FMODE_WRITE)) { 7526 pr_warn("%s: error: bitmap file must open for write\n", 7527 mdname(mddev)); 7528 err = -EBADF; 7529 } else if (atomic_read(&inode->i_writecount) != 1) { 7530 pr_warn("%s: error: bitmap file is already in use\n", 7531 mdname(mddev)); 7532 err = -EBUSY; 7533 } 7534 if (err) { 7535 fput(f); 7536 return err; 7537 } 7538 mddev->bitmap_info.file = f; 7539 mddev->bitmap_info.offset = 0; /* file overrides offset */ 7540 } else if (mddev->bitmap == NULL) 7541 return -ENOENT; /* cannot remove what isn't there */ 7542 err = 0; 7543 if (mddev->pers) { 7544 if (fd >= 0) { 7545 err = md_bitmap_create(mddev); 7546 if (!err) 7547 err = mddev->bitmap_ops->load(mddev); 7548 7549 if (err) { 7550 md_bitmap_destroy(mddev); 7551 fd = -1; 7552 } 7553 } else if (fd < 0) { 7554 md_bitmap_destroy(mddev); 7555 } 7556 } 7557 7558 if (fd < 0) { 7559 struct file *f = mddev->bitmap_info.file; 7560 if (f) { 7561 spin_lock(&mddev->lock); 7562 mddev->bitmap_info.file = NULL; 7563 spin_unlock(&mddev->lock); 7564 fput(f); 7565 } 7566 } 7567 7568 return err; 7569 } 7570 7571 /* 7572 * md_set_array_info is used two different ways 7573 * The original usage is when creating a new array. 7574 * In this usage, raid_disks is > 0 and it together with 7575 * level, size, not_persistent,layout,chunksize determine the 7576 * shape of the array. 7577 * This will always create an array with a type-0.90.0 superblock. 7578 * The newer usage is when assembling an array. 7579 * In this case raid_disks will be 0, and the major_version field is 7580 * use to determine which style super-blocks are to be found on the devices. 7581 * The minor and patch _version numbers are also kept incase the 7582 * super_block handler wishes to interpret them. 7583 */ 7584 int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info) 7585 { 7586 if (info->raid_disks == 0) { 7587 /* just setting version number for superblock loading */ 7588 if (info->major_version < 0 || 7589 info->major_version >= ARRAY_SIZE(super_types) || 7590 super_types[info->major_version].name == NULL) { 7591 /* maybe try to auto-load a module? */ 7592 pr_warn("md: superblock version %d not known\n", 7593 info->major_version); 7594 return -EINVAL; 7595 } 7596 mddev->major_version = info->major_version; 7597 mddev->minor_version = info->minor_version; 7598 mddev->patch_version = info->patch_version; 7599 mddev->persistent = !info->not_persistent; 7600 /* ensure mddev_put doesn't delete this now that there 7601 * is some minimal configuration. 
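 * (mddev_put() only auto-frees an mddev that still has ctime == 0 and
 * no raid_disks or member devices, so recording ctime here is what
 * keeps this partially configured array alive)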
7602 */ 7603 mddev->ctime = ktime_get_real_seconds(); 7604 return 0; 7605 } 7606 mddev->major_version = MD_MAJOR_VERSION; 7607 mddev->minor_version = MD_MINOR_VERSION; 7608 mddev->patch_version = MD_PATCHLEVEL_VERSION; 7609 mddev->ctime = ktime_get_real_seconds(); 7610 7611 mddev->level = info->level; 7612 mddev->clevel[0] = 0; 7613 mddev->dev_sectors = 2 * (sector_t)info->size; 7614 mddev->raid_disks = info->raid_disks; 7615 /* don't set md_minor, it is determined by which /dev/md* was 7616 * openned 7617 */ 7618 if (info->state & (1<<MD_SB_CLEAN)) 7619 mddev->resync_offset = MaxSector; 7620 else 7621 mddev->resync_offset = 0; 7622 mddev->persistent = ! info->not_persistent; 7623 mddev->external = 0; 7624 7625 mddev->layout = info->layout; 7626 if (mddev->level == 0) 7627 /* Cannot trust RAID0 layout info here */ 7628 mddev->layout = -1; 7629 mddev->chunk_sectors = info->chunk_size >> 9; 7630 7631 if (mddev->persistent) { 7632 mddev->max_disks = MD_SB_DISKS; 7633 mddev->flags = 0; 7634 mddev->sb_flags = 0; 7635 } 7636 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7637 7638 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 7639 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 7640 mddev->bitmap_info.offset = 0; 7641 7642 mddev->reshape_position = MaxSector; 7643 7644 /* 7645 * Generate a 128 bit UUID 7646 */ 7647 get_random_bytes(mddev->uuid, 16); 7648 7649 mddev->new_level = mddev->level; 7650 mddev->new_chunk_sectors = mddev->chunk_sectors; 7651 mddev->new_layout = mddev->layout; 7652 mddev->delta_disks = 0; 7653 mddev->reshape_backwards = 0; 7654 7655 return 0; 7656 } 7657 7658 void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) 7659 { 7660 lockdep_assert_held(&mddev->reconfig_mutex); 7661 7662 if (mddev->external_size) 7663 return; 7664 7665 mddev->array_sectors = array_sectors; 7666 } 7667 EXPORT_SYMBOL(md_set_array_sectors); 7668 7669 static int update_size(struct mddev *mddev, sector_t num_sectors) 7670 { 7671 struct md_rdev *rdev; 7672 int rv; 7673 int fit = (num_sectors == 0); 7674 sector_t old_dev_sectors = mddev->dev_sectors; 7675 7676 if (mddev->pers->resize == NULL) 7677 return -EINVAL; 7678 /* The "num_sectors" is the number of sectors of each device that 7679 * is used. This can only make sense for arrays with redundancy. 7680 * linear and raid0 always use whatever space is available. We can only 7681 * consider changing this number if no resync or reconstruction is 7682 * happening, and if the new size is acceptable. It must fit before the 7683 * sb_start or, if that is <data_offset, it must fit before the size 7684 * of each device. If num_sectors is zero, we find the largest size 7685 * that fits. 
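 * For example (purely illustrative numbers): with members offering
 * 100, 120 and 150 usable sectors, num_sectors == 0 resolves to 100,
 * the largest value every member can still accommodate.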
7686 */ 7687 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 7688 return -EBUSY; 7689 if (!md_is_rdwr(mddev)) 7690 return -EROFS; 7691 7692 rdev_for_each(rdev, mddev) { 7693 sector_t avail = rdev->sectors; 7694 7695 if (fit && (num_sectors == 0 || num_sectors > avail)) 7696 num_sectors = avail; 7697 if (avail < num_sectors) 7698 return -ENOSPC; 7699 } 7700 rv = mddev->pers->resize(mddev, num_sectors); 7701 if (!rv) { 7702 if (mddev_is_clustered(mddev)) 7703 mddev->cluster_ops->update_size(mddev, old_dev_sectors); 7704 else if (!mddev_is_dm(mddev)) 7705 set_capacity_and_notify(mddev->gendisk, 7706 mddev->array_sectors); 7707 } 7708 return rv; 7709 } 7710 7711 static int update_raid_disks(struct mddev *mddev, int raid_disks) 7712 { 7713 int rv; 7714 struct md_rdev *rdev; 7715 /* change the number of raid disks */ 7716 if (mddev->pers->check_reshape == NULL) 7717 return -EINVAL; 7718 if (!md_is_rdwr(mddev)) 7719 return -EROFS; 7720 if (raid_disks <= 0 || 7721 (mddev->max_disks && raid_disks >= mddev->max_disks)) 7722 return -EINVAL; 7723 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 7724 test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) || 7725 mddev->reshape_position != MaxSector) 7726 return -EBUSY; 7727 7728 rdev_for_each(rdev, mddev) { 7729 if (mddev->raid_disks < raid_disks && 7730 rdev->data_offset < rdev->new_data_offset) 7731 return -EINVAL; 7732 if (mddev->raid_disks > raid_disks && 7733 rdev->data_offset > rdev->new_data_offset) 7734 return -EINVAL; 7735 } 7736 7737 mddev->delta_disks = raid_disks - mddev->raid_disks; 7738 if (mddev->delta_disks < 0) 7739 mddev->reshape_backwards = 1; 7740 else if (mddev->delta_disks > 0) 7741 mddev->reshape_backwards = 0; 7742 7743 rv = mddev->pers->check_reshape(mddev); 7744 if (rv < 0) { 7745 mddev->delta_disks = 0; 7746 mddev->reshape_backwards = 0; 7747 } 7748 return rv; 7749 } 7750 7751 static int get_cluster_ops(struct mddev *mddev) 7752 { 7753 xa_lock(&md_submodule); 7754 mddev->cluster_ops = xa_load(&md_submodule, ID_CLUSTER); 7755 if (mddev->cluster_ops && 7756 !try_module_get(mddev->cluster_ops->head.owner)) 7757 mddev->cluster_ops = NULL; 7758 xa_unlock(&md_submodule); 7759 7760 return mddev->cluster_ops == NULL ? -ENOENT : 0; 7761 } 7762 7763 static void put_cluster_ops(struct mddev *mddev) 7764 { 7765 if (!mddev->cluster_ops) 7766 return; 7767 7768 mddev->cluster_ops->leave(mddev); 7769 module_put(mddev->cluster_ops->head.owner); 7770 mddev->cluster_ops = NULL; 7771 } 7772 7773 /* 7774 * update_array_info is used to change the configuration of an 7775 * on-line array. 7776 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 7777 * fields in the info are checked against the array. 7778 * Any differences that cannot be handled will cause an error. 7779 * Normally, only one change can be managed at a time. 
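 * e.g. a single call that tries to change both raid_disks and layout
 * is rejected with -EINVAL below; userspace has to apply such changes
 * one at a time.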
7780 */ 7781 static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) 7782 { 7783 int rv = 0; 7784 int cnt = 0; 7785 int state = 0; 7786 7787 /* calculate expected state,ignoring low bits */ 7788 if (mddev->bitmap && mddev->bitmap_info.offset) 7789 state |= (1 << MD_SB_BITMAP_PRESENT); 7790 7791 if (mddev->major_version != info->major_version || 7792 mddev->minor_version != info->minor_version || 7793 /* mddev->patch_version != info->patch_version || */ 7794 mddev->ctime != info->ctime || 7795 mddev->level != info->level || 7796 /* mddev->layout != info->layout || */ 7797 mddev->persistent != !info->not_persistent || 7798 mddev->chunk_sectors != info->chunk_size >> 9 || 7799 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 7800 ((state^info->state) & 0xfffffe00) 7801 ) 7802 return -EINVAL; 7803 /* Check there is only one change */ 7804 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 7805 cnt++; 7806 if (mddev->raid_disks != info->raid_disks) 7807 cnt++; 7808 if (mddev->layout != info->layout) 7809 cnt++; 7810 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) 7811 cnt++; 7812 if (cnt == 0) 7813 return 0; 7814 if (cnt > 1) 7815 return -EINVAL; 7816 7817 if (mddev->layout != info->layout) { 7818 /* Change layout 7819 * we don't need to do anything at the md level, the 7820 * personality will take care of it all. 7821 */ 7822 if (mddev->pers->check_reshape == NULL) 7823 return -EINVAL; 7824 else { 7825 mddev->new_layout = info->layout; 7826 rv = mddev->pers->check_reshape(mddev); 7827 if (rv) 7828 mddev->new_layout = mddev->layout; 7829 return rv; 7830 } 7831 } 7832 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 7833 rv = update_size(mddev, (sector_t)info->size * 2); 7834 7835 if (mddev->raid_disks != info->raid_disks) 7836 rv = update_raid_disks(mddev, info->raid_disks); 7837 7838 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 7839 if (mddev->pers->quiesce == NULL || mddev->thread == NULL) { 7840 rv = -EINVAL; 7841 goto err; 7842 } 7843 if (mddev->recovery || mddev->sync_thread) { 7844 rv = -EBUSY; 7845 goto err; 7846 } 7847 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 7848 /* add the bitmap */ 7849 if (mddev->bitmap) { 7850 rv = -EEXIST; 7851 goto err; 7852 } 7853 if (mddev->bitmap_info.default_offset == 0) { 7854 rv = -EINVAL; 7855 goto err; 7856 } 7857 mddev->bitmap_info.offset = 7858 mddev->bitmap_info.default_offset; 7859 mddev->bitmap_info.space = 7860 mddev->bitmap_info.default_space; 7861 rv = md_bitmap_create(mddev); 7862 if (!rv) 7863 rv = mddev->bitmap_ops->load(mddev); 7864 7865 if (rv) 7866 md_bitmap_destroy(mddev); 7867 } else { 7868 struct md_bitmap_stats stats; 7869 7870 rv = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 7871 if (rv) 7872 goto err; 7873 7874 if (stats.file) { 7875 rv = -EINVAL; 7876 goto err; 7877 } 7878 7879 if (mddev->bitmap_info.nodes) { 7880 /* hold PW on all the bitmap lock */ 7881 if (mddev->cluster_ops->lock_all_bitmaps(mddev) <= 0) { 7882 pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n"); 7883 rv = -EPERM; 7884 mddev->cluster_ops->unlock_all_bitmaps(mddev); 7885 goto err; 7886 } 7887 7888 mddev->bitmap_info.nodes = 0; 7889 put_cluster_ops(mddev); 7890 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; 7891 } 7892 md_bitmap_destroy(mddev); 7893 mddev->bitmap_info.offset = 0; 7894 } 7895 } 7896 md_update_sb(mddev, 1); 7897 return rv; 7898 err: 7899 return rv; 7900 } 7901 7902 static int set_disk_faulty(struct mddev *mddev, 
dev_t dev) 7903 { 7904 struct md_rdev *rdev; 7905 int err = 0; 7906 7907 if (mddev->pers == NULL) 7908 return -ENODEV; 7909 7910 rcu_read_lock(); 7911 rdev = md_find_rdev_rcu(mddev, dev); 7912 if (!rdev) 7913 err = -ENODEV; 7914 else { 7915 md_error(mddev, rdev); 7916 if (test_bit(MD_BROKEN, &mddev->flags)) 7917 err = -EBUSY; 7918 } 7919 rcu_read_unlock(); 7920 return err; 7921 } 7922 7923 /* 7924 * We have a problem here : there is no easy way to give a CHS 7925 * virtual geometry. We currently pretend that we have a 2 heads 7926 * 4 sectors (with a BIG number of cylinders...). This drives 7927 * dosfs just mad... ;-) 7928 */ 7929 static int md_getgeo(struct gendisk *disk, struct hd_geometry *geo) 7930 { 7931 struct mddev *mddev = disk->private_data; 7932 7933 geo->heads = 2; 7934 geo->sectors = 4; 7935 geo->cylinders = mddev->array_sectors / 8; 7936 return 0; 7937 } 7938 7939 static inline int md_ioctl_valid(unsigned int cmd) 7940 { 7941 switch (cmd) { 7942 case GET_ARRAY_INFO: 7943 case GET_DISK_INFO: 7944 case RAID_VERSION: 7945 return 0; 7946 case ADD_NEW_DISK: 7947 case GET_BITMAP_FILE: 7948 case HOT_ADD_DISK: 7949 case HOT_REMOVE_DISK: 7950 case RESTART_ARRAY_RW: 7951 case RUN_ARRAY: 7952 case SET_ARRAY_INFO: 7953 case SET_BITMAP_FILE: 7954 case SET_DISK_FAULTY: 7955 case STOP_ARRAY: 7956 case STOP_ARRAY_RO: 7957 case CLUSTERED_DISK_NACK: 7958 if (!capable(CAP_SYS_ADMIN)) 7959 return -EACCES; 7960 return 0; 7961 default: 7962 return -ENOTTY; 7963 } 7964 } 7965 7966 static bool md_ioctl_need_suspend(unsigned int cmd) 7967 { 7968 switch (cmd) { 7969 case ADD_NEW_DISK: 7970 case HOT_ADD_DISK: 7971 case HOT_REMOVE_DISK: 7972 case SET_BITMAP_FILE: 7973 case SET_ARRAY_INFO: 7974 return true; 7975 default: 7976 return false; 7977 } 7978 } 7979 7980 static int __md_set_array_info(struct mddev *mddev, void __user *argp) 7981 { 7982 mdu_array_info_t info; 7983 int err; 7984 7985 if (!argp) 7986 memset(&info, 0, sizeof(info)); 7987 else if (copy_from_user(&info, argp, sizeof(info))) 7988 return -EFAULT; 7989 7990 if (mddev->pers) { 7991 err = update_array_info(mddev, &info); 7992 if (err) 7993 pr_warn("md: couldn't update array info. %d\n", err); 7994 return err; 7995 } 7996 7997 if (!list_empty(&mddev->disks)) { 7998 pr_warn("md: array %s already has disks!\n", mdname(mddev)); 7999 return -EBUSY; 8000 } 8001 8002 if (mddev->raid_disks) { 8003 pr_warn("md: array %s already initialised!\n", mdname(mddev)); 8004 return -EBUSY; 8005 } 8006 8007 err = md_set_array_info(mddev, &info); 8008 if (err) 8009 pr_warn("md: couldn't set array info. 
%d\n", err); 8010 8011 return err; 8012 } 8013 8014 static int md_ioctl(struct block_device *bdev, blk_mode_t mode, 8015 unsigned int cmd, unsigned long arg) 8016 { 8017 int err = 0; 8018 void __user *argp = (void __user *)arg; 8019 struct mddev *mddev = NULL; 8020 8021 err = md_ioctl_valid(cmd); 8022 if (err) 8023 return err; 8024 8025 /* 8026 * Commands dealing with the RAID driver but not any 8027 * particular array: 8028 */ 8029 if (cmd == RAID_VERSION) 8030 return get_version(argp); 8031 8032 /* 8033 * Commands creating/starting a new array: 8034 */ 8035 8036 mddev = bdev->bd_disk->private_data; 8037 8038 /* Some actions do not requires the mutex */ 8039 switch (cmd) { 8040 case GET_ARRAY_INFO: 8041 if (!mddev->raid_disks && !mddev->external) 8042 return -ENODEV; 8043 return get_array_info(mddev, argp); 8044 8045 case GET_DISK_INFO: 8046 if (!mddev->raid_disks && !mddev->external) 8047 return -ENODEV; 8048 return get_disk_info(mddev, argp); 8049 8050 case SET_DISK_FAULTY: 8051 return set_disk_faulty(mddev, new_decode_dev(arg)); 8052 8053 case GET_BITMAP_FILE: 8054 return get_bitmap_file(mddev, argp); 8055 } 8056 8057 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { 8058 /* Need to flush page cache, and ensure no-one else opens 8059 * and writes 8060 */ 8061 err = mddev_set_closing_and_sync_blockdev(mddev, 1); 8062 if (err) 8063 return err; 8064 } 8065 8066 if (!md_is_rdwr(mddev)) 8067 flush_work(&mddev->sync_work); 8068 8069 err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) : 8070 mddev_lock(mddev); 8071 if (err) { 8072 pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n", 8073 err, cmd); 8074 goto out; 8075 } 8076 8077 if (cmd == SET_ARRAY_INFO) { 8078 err = __md_set_array_info(mddev, argp); 8079 goto unlock; 8080 } 8081 8082 /* 8083 * Commands querying/configuring an existing array: 8084 */ 8085 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 8086 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 8087 if ((!mddev->raid_disks && !mddev->external) 8088 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 8089 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 8090 && cmd != GET_BITMAP_FILE) { 8091 err = -ENODEV; 8092 goto unlock; 8093 } 8094 8095 /* 8096 * Commands even a read-only array can execute: 8097 */ 8098 switch (cmd) { 8099 case RESTART_ARRAY_RW: 8100 err = restart_array(mddev); 8101 goto unlock; 8102 8103 case STOP_ARRAY: 8104 err = do_md_stop(mddev, 0); 8105 goto unlock; 8106 8107 case STOP_ARRAY_RO: 8108 if (mddev->pers) 8109 err = md_set_readonly(mddev); 8110 goto unlock; 8111 8112 case HOT_REMOVE_DISK: 8113 err = hot_remove_disk(mddev, new_decode_dev(arg)); 8114 goto unlock; 8115 8116 case ADD_NEW_DISK: 8117 /* We can support ADD_NEW_DISK on read-only arrays 8118 * only if we are re-adding a preexisting device. 8119 * So require mddev->pers and MD_DISK_SYNC. 8120 */ 8121 if (mddev->pers) { 8122 mdu_disk_info_t info; 8123 if (copy_from_user(&info, argp, sizeof(info))) 8124 err = -EFAULT; 8125 else if (!(info.state & (1<<MD_DISK_SYNC))) 8126 /* Need to clear read-only for this */ 8127 break; 8128 else 8129 err = md_add_new_disk(mddev, &info); 8130 goto unlock; 8131 } 8132 break; 8133 } 8134 8135 /* 8136 * The remaining ioctls are changing the state of the 8137 * superblock, so we do not allow them on read-only arrays. 
8138 */ 8139 if (!md_is_rdwr(mddev) && mddev->pers) { 8140 if (mddev->ro != MD_AUTO_READ) { 8141 err = -EROFS; 8142 goto unlock; 8143 } 8144 mddev->ro = MD_RDWR; 8145 sysfs_notify_dirent_safe(mddev->sysfs_state); 8146 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8147 /* mddev_unlock will wake thread */ 8148 /* If a device failed while we were read-only, we 8149 * need to make sure the metadata is updated now. 8150 */ 8151 if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) { 8152 mddev_unlock(mddev); 8153 wait_event(mddev->sb_wait, 8154 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) && 8155 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 8156 mddev_lock_nointr(mddev); 8157 } 8158 } 8159 8160 switch (cmd) { 8161 case ADD_NEW_DISK: 8162 { 8163 mdu_disk_info_t info; 8164 if (copy_from_user(&info, argp, sizeof(info))) 8165 err = -EFAULT; 8166 else 8167 err = md_add_new_disk(mddev, &info); 8168 goto unlock; 8169 } 8170 8171 case CLUSTERED_DISK_NACK: 8172 if (mddev_is_clustered(mddev)) 8173 mddev->cluster_ops->new_disk_ack(mddev, false); 8174 else 8175 err = -EINVAL; 8176 goto unlock; 8177 8178 case HOT_ADD_DISK: 8179 err = hot_add_disk(mddev, new_decode_dev(arg)); 8180 goto unlock; 8181 8182 case RUN_ARRAY: 8183 err = do_md_run(mddev); 8184 goto unlock; 8185 8186 case SET_BITMAP_FILE: 8187 err = set_bitmap_file(mddev, (int)arg); 8188 goto unlock; 8189 8190 default: 8191 err = -EINVAL; 8192 goto unlock; 8193 } 8194 8195 unlock: 8196 if (mddev->hold_active == UNTIL_IOCTL && 8197 err != -EINVAL) 8198 mddev->hold_active = 0; 8199 8200 md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) : 8201 mddev_unlock(mddev); 8202 8203 out: 8204 if (cmd == STOP_ARRAY_RO || (err && cmd == STOP_ARRAY)) 8205 clear_bit(MD_CLOSING, &mddev->flags); 8206 return err; 8207 } 8208 #ifdef CONFIG_COMPAT 8209 static int md_compat_ioctl(struct block_device *bdev, blk_mode_t mode, 8210 unsigned int cmd, unsigned long arg) 8211 { 8212 switch (cmd) { 8213 case HOT_REMOVE_DISK: 8214 case HOT_ADD_DISK: 8215 case SET_DISK_FAULTY: 8216 case SET_BITMAP_FILE: 8217 /* These take in integer arg, do not convert */ 8218 break; 8219 default: 8220 arg = (unsigned long)compat_ptr(arg); 8221 break; 8222 } 8223 8224 return md_ioctl(bdev, mode, cmd, arg); 8225 } 8226 #endif /* CONFIG_COMPAT */ 8227 8228 static int md_set_read_only(struct block_device *bdev, bool ro) 8229 { 8230 struct mddev *mddev = bdev->bd_disk->private_data; 8231 int err; 8232 8233 err = mddev_lock(mddev); 8234 if (err) 8235 return err; 8236 8237 if (!mddev->raid_disks && !mddev->external) { 8238 err = -ENODEV; 8239 goto out_unlock; 8240 } 8241 8242 /* 8243 * Transitioning to read-auto need only happen for arrays that call 8244 * md_write_start and which are not ready for writes yet. 
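 * For ro=true there is nothing extra to do here; for ro=false on an array
 * that was set read-only we restart it in auto-read-only mode, so the
 * first write will switch it fully to read-write.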
8245 */ 8246 if (!ro && mddev->ro == MD_RDONLY && mddev->pers) { 8247 err = restart_array(mddev); 8248 if (err) 8249 goto out_unlock; 8250 mddev->ro = MD_AUTO_READ; 8251 } 8252 8253 out_unlock: 8254 mddev_unlock(mddev); 8255 return err; 8256 } 8257 8258 static int md_open(struct gendisk *disk, blk_mode_t mode) 8259 { 8260 struct mddev *mddev; 8261 int err; 8262 8263 spin_lock(&all_mddevs_lock); 8264 mddev = mddev_get(disk->private_data); 8265 spin_unlock(&all_mddevs_lock); 8266 if (!mddev) 8267 return -ENODEV; 8268 8269 err = mutex_lock_interruptible(&mddev->open_mutex); 8270 if (err) 8271 goto out; 8272 8273 err = -ENODEV; 8274 if (test_bit(MD_CLOSING, &mddev->flags)) 8275 goto out_unlock; 8276 8277 atomic_inc(&mddev->openers); 8278 mutex_unlock(&mddev->open_mutex); 8279 8280 disk_check_media_change(disk); 8281 return 0; 8282 8283 out_unlock: 8284 mutex_unlock(&mddev->open_mutex); 8285 out: 8286 mddev_put(mddev); 8287 return err; 8288 } 8289 8290 static void md_release(struct gendisk *disk) 8291 { 8292 struct mddev *mddev = disk->private_data; 8293 8294 BUG_ON(!mddev); 8295 atomic_dec(&mddev->openers); 8296 mddev_put(mddev); 8297 } 8298 8299 static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing) 8300 { 8301 struct mddev *mddev = disk->private_data; 8302 unsigned int ret = 0; 8303 8304 if (mddev->changed) 8305 ret = DISK_EVENT_MEDIA_CHANGE; 8306 mddev->changed = 0; 8307 return ret; 8308 } 8309 8310 static void md_free_disk(struct gendisk *disk) 8311 { 8312 struct mddev *mddev = disk->private_data; 8313 8314 mddev_free(mddev); 8315 } 8316 8317 const struct block_device_operations md_fops = 8318 { 8319 .owner = THIS_MODULE, 8320 .submit_bio = md_submit_bio, 8321 .open = md_open, 8322 .release = md_release, 8323 .ioctl = md_ioctl, 8324 #ifdef CONFIG_COMPAT 8325 .compat_ioctl = md_compat_ioctl, 8326 #endif 8327 .getgeo = md_getgeo, 8328 .check_events = md_check_events, 8329 .set_read_only = md_set_read_only, 8330 .free_disk = md_free_disk, 8331 }; 8332 8333 static int md_thread(void *arg) 8334 { 8335 struct md_thread *thread = arg; 8336 8337 /* 8338 * md_thread is a 'system-thread', it's priority should be very 8339 * high. We avoid resource deadlocks individually in each 8340 * raid personality. (RAID5 does preallocation) We also use RR and 8341 * the very same RT priority as kswapd, thus we will never get 8342 * into a priority inversion deadlock. 8343 * 8344 * we definitely have to have equal or higher priority than 8345 * bdflush, otherwise bdflush will deadlock if there are too 8346 * many dirty RAID5 blocks. 8347 */ 8348 8349 allow_signal(SIGKILL); 8350 while (!kthread_should_stop()) { 8351 8352 /* We need to wait INTERRUPTIBLE so that 8353 * we don't add to the load-average. 
8354 * That means we need to be sure no signals are 8355 * pending 8356 */ 8357 if (signal_pending(current)) 8358 flush_signals(current); 8359 8360 wait_event_interruptible_timeout 8361 (thread->wqueue, 8362 test_bit(THREAD_WAKEUP, &thread->flags) 8363 || kthread_should_stop() || kthread_should_park(), 8364 thread->timeout); 8365 8366 clear_bit(THREAD_WAKEUP, &thread->flags); 8367 if (kthread_should_park()) 8368 kthread_parkme(); 8369 if (!kthread_should_stop()) 8370 thread->run(thread); 8371 } 8372 8373 return 0; 8374 } 8375 8376 static void md_wakeup_thread_directly(struct md_thread __rcu *thread) 8377 { 8378 struct md_thread *t; 8379 8380 rcu_read_lock(); 8381 t = rcu_dereference(thread); 8382 if (t) 8383 wake_up_process(t->tsk); 8384 rcu_read_unlock(); 8385 } 8386 8387 void md_wakeup_thread(struct md_thread __rcu *thread) 8388 { 8389 struct md_thread *t; 8390 8391 rcu_read_lock(); 8392 t = rcu_dereference(thread); 8393 if (t) { 8394 pr_debug("md: waking up MD thread %s.\n", t->tsk->comm); 8395 set_bit(THREAD_WAKEUP, &t->flags); 8396 if (wq_has_sleeper(&t->wqueue)) 8397 wake_up(&t->wqueue); 8398 } 8399 rcu_read_unlock(); 8400 } 8401 EXPORT_SYMBOL(md_wakeup_thread); 8402 8403 struct md_thread *md_register_thread(void (*run) (struct md_thread *), 8404 struct mddev *mddev, const char *name) 8405 { 8406 struct md_thread *thread; 8407 8408 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); 8409 if (!thread) 8410 return NULL; 8411 8412 init_waitqueue_head(&thread->wqueue); 8413 8414 thread->run = run; 8415 thread->mddev = mddev; 8416 thread->timeout = MAX_SCHEDULE_TIMEOUT; 8417 thread->tsk = kthread_run(md_thread, thread, 8418 "%s_%s", 8419 mdname(thread->mddev), 8420 name); 8421 if (IS_ERR(thread->tsk)) { 8422 kfree(thread); 8423 return NULL; 8424 } 8425 return thread; 8426 } 8427 EXPORT_SYMBOL(md_register_thread); 8428 8429 void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp) 8430 { 8431 struct md_thread *thread = rcu_dereference_protected(*threadp, 8432 lockdep_is_held(&mddev->reconfig_mutex)); 8433 8434 if (!thread) 8435 return; 8436 8437 rcu_assign_pointer(*threadp, NULL); 8438 synchronize_rcu(); 8439 8440 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 8441 kthread_stop(thread->tsk); 8442 kfree(thread); 8443 } 8444 EXPORT_SYMBOL(md_unregister_thread); 8445 8446 void md_error(struct mddev *mddev, struct md_rdev *rdev) 8447 { 8448 if (!rdev || test_bit(Faulty, &rdev->flags)) 8449 return; 8450 8451 if (!mddev->pers || !mddev->pers->error_handler) 8452 return; 8453 mddev->pers->error_handler(mddev, rdev); 8454 8455 if (mddev->pers->head.id == ID_RAID0 || 8456 mddev->pers->head.id == ID_LINEAR) 8457 return; 8458 8459 if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) 8460 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8461 sysfs_notify_dirent_safe(rdev->sysfs_state); 8462 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8463 if (!test_bit(MD_BROKEN, &mddev->flags)) { 8464 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8465 md_wakeup_thread(mddev->thread); 8466 } 8467 if (mddev->event_work.func) 8468 queue_work(md_misc_wq, &mddev->event_work); 8469 md_new_event(); 8470 } 8471 EXPORT_SYMBOL(md_error); 8472 8473 /* seq_file implementation /proc/mdstat */ 8474 8475 static void status_unused(struct seq_file *seq) 8476 { 8477 int i = 0; 8478 struct md_rdev *rdev; 8479 8480 seq_printf(seq, "unused devices: "); 8481 8482 list_for_each_entry(rdev, &pending_raid_disks, same_set) { 8483 i++; 8484 seq_printf(seq, "%pg ", rdev->bdev); 8485 } 
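/* If no unused device was printed, emit a placeholder instead. */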
8486 if (!i) 8487 seq_printf(seq, "<none>"); 8488 8489 seq_printf(seq, "\n"); 8490 } 8491 8492 static void status_personalities(struct seq_file *seq) 8493 { 8494 struct md_submodule_head *head; 8495 unsigned long i; 8496 8497 seq_puts(seq, "Personalities : "); 8498 8499 xa_lock(&md_submodule); 8500 xa_for_each(&md_submodule, i, head) 8501 if (head->type == MD_PERSONALITY) 8502 seq_printf(seq, "[%s] ", head->name); 8503 xa_unlock(&md_submodule); 8504 8505 seq_puts(seq, "\n"); 8506 } 8507 8508 static int status_resync(struct seq_file *seq, struct mddev *mddev) 8509 { 8510 sector_t max_sectors, resync, res; 8511 unsigned long dt, db = 0; 8512 sector_t rt, curr_mark_cnt, resync_mark_cnt; 8513 int scale, recovery_active; 8514 unsigned int per_milli; 8515 8516 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 8517 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 8518 max_sectors = mddev->resync_max_sectors; 8519 else 8520 max_sectors = mddev->dev_sectors; 8521 8522 resync = mddev->curr_resync; 8523 if (resync < MD_RESYNC_ACTIVE) { 8524 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 8525 /* Still cleaning up */ 8526 resync = max_sectors; 8527 } else if (resync > max_sectors) { 8528 resync = max_sectors; 8529 } else { 8530 res = atomic_read(&mddev->recovery_active); 8531 /* 8532 * Resync has started, but the subtraction has overflowed or 8533 * yielded one of the special values. Force it to active to 8534 * ensure the status reports an active resync. 8535 */ 8536 if (resync < res || resync - res < MD_RESYNC_ACTIVE) 8537 resync = MD_RESYNC_ACTIVE; 8538 else 8539 resync -= res; 8540 } 8541 8542 if (resync == MD_RESYNC_NONE) { 8543 if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { 8544 struct md_rdev *rdev; 8545 8546 rdev_for_each(rdev, mddev) 8547 if (rdev->raid_disk >= 0 && 8548 !test_bit(Faulty, &rdev->flags) && 8549 rdev->recovery_offset != MaxSector && 8550 rdev->recovery_offset) { 8551 seq_printf(seq, "\trecover=REMOTE"); 8552 return 1; 8553 } 8554 if (mddev->reshape_position != MaxSector) 8555 seq_printf(seq, "\treshape=REMOTE"); 8556 else 8557 seq_printf(seq, "\tresync=REMOTE"); 8558 return 1; 8559 } 8560 if (mddev->resync_offset < MaxSector) { 8561 seq_printf(seq, "\tresync=PENDING"); 8562 return 1; 8563 } 8564 return 0; 8565 } 8566 if (resync < MD_RESYNC_ACTIVE) { 8567 seq_printf(seq, "\tresync=DELAYED"); 8568 return 1; 8569 } 8570 8571 WARN_ON(max_sectors == 0); 8572 /* Pick 'scale' such that (resync>>scale)*1000 will fit 8573 * in a sector_t, and (max_sectors>>scale) will fit in a 8574 * u32, as those are the requirements for sector_div. 8575 * Thus 'scale' must be at least 10 8576 */ 8577 scale = 10; 8578 if (sizeof(sector_t) > sizeof(unsigned long)) { 8579 while ( max_sectors/2 > (1ULL<<(scale+32))) 8580 scale++; 8581 } 8582 res = (resync>>scale)*1000; 8583 sector_div(res, (u32)((max_sectors>>scale)+1)); 8584 8585 per_milli = res; 8586 { 8587 int i, x = per_milli/50, y = 20-x; 8588 seq_printf(seq, "["); 8589 for (i = 0; i < x; i++) 8590 seq_printf(seq, "="); 8591 seq_printf(seq, ">"); 8592 for (i = 0; i < y; i++) 8593 seq_printf(seq, "."); 8594 seq_printf(seq, "] "); 8595 } 8596 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 8597 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 8598 "reshape" : 8599 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 8600 "check" : 8601 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 
8602 "resync" : "recovery"))), 8603 per_milli/10, per_milli % 10, 8604 (unsigned long long) resync/2, 8605 (unsigned long long) max_sectors/2); 8606 8607 /* 8608 * dt: time from mark until now 8609 * db: blocks written from mark until now 8610 * rt: remaining time 8611 * 8612 * rt is a sector_t, which is always 64bit now. We are keeping 8613 * the original algorithm, but it is not really necessary. 8614 * 8615 * Original algorithm: 8616 * So we divide before multiply in case it is 32bit and close 8617 * to the limit. 8618 * We scale the divisor (db) by 32 to avoid losing precision 8619 * near the end of resync when the number of remaining sectors 8620 * is close to 'db'. 8621 * We then divide rt by 32 after multiplying by db to compensate. 8622 * The '+1' avoids division by zero if db is very small. 8623 */ 8624 dt = ((jiffies - mddev->resync_mark) / HZ); 8625 if (!dt) dt++; 8626 8627 curr_mark_cnt = mddev->curr_mark_cnt; 8628 recovery_active = atomic_read(&mddev->recovery_active); 8629 resync_mark_cnt = mddev->resync_mark_cnt; 8630 8631 if (curr_mark_cnt >= (recovery_active + resync_mark_cnt)) 8632 db = curr_mark_cnt - (recovery_active + resync_mark_cnt); 8633 8634 rt = max_sectors - resync; /* number of remaining sectors */ 8635 rt = div64_u64(rt, db/32+1); 8636 rt *= dt; 8637 rt >>= 5; 8638 8639 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, 8640 ((unsigned long)rt % 60)/6); 8641 8642 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 8643 return 1; 8644 } 8645 8646 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 8647 __acquires(&all_mddevs_lock) 8648 { 8649 seq->poll_event = atomic_read(&md_event_count); 8650 spin_lock(&all_mddevs_lock); 8651 8652 return seq_list_start_head(&all_mddevs, *pos); 8653 } 8654 8655 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 8656 { 8657 return seq_list_next(v, &all_mddevs, pos); 8658 } 8659 8660 static void md_seq_stop(struct seq_file *seq, void *v) 8661 __releases(&all_mddevs_lock) 8662 { 8663 spin_unlock(&all_mddevs_lock); 8664 } 8665 8666 static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev) 8667 { 8668 struct md_bitmap_stats stats; 8669 unsigned long used_pages; 8670 unsigned long chunk_kb; 8671 int err; 8672 8673 if (!md_bitmap_enabled(mddev, false)) 8674 return; 8675 8676 err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 8677 if (err) 8678 return; 8679 8680 chunk_kb = mddev->bitmap_info.chunksize >> 10; 8681 used_pages = stats.pages - stats.missing_pages; 8682 8683 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], %lu%s chunk", 8684 used_pages, stats.pages, used_pages << (PAGE_SHIFT - 10), 8685 chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize, 8686 chunk_kb ? 
"KB" : "B"); 8687 8688 if (stats.file) { 8689 seq_puts(seq, ", file: "); 8690 seq_file_path(seq, stats.file, " \t\n"); 8691 } 8692 8693 seq_putc(seq, '\n'); 8694 } 8695 8696 static int md_seq_show(struct seq_file *seq, void *v) 8697 { 8698 struct mddev *mddev; 8699 sector_t sectors; 8700 struct md_rdev *rdev; 8701 8702 if (v == &all_mddevs) { 8703 status_personalities(seq); 8704 if (list_empty(&all_mddevs)) 8705 status_unused(seq); 8706 return 0; 8707 } 8708 8709 mddev = list_entry(v, struct mddev, all_mddevs); 8710 if (!mddev_get(mddev)) 8711 return 0; 8712 8713 spin_unlock(&all_mddevs_lock); 8714 8715 /* prevent bitmap to be freed after checking */ 8716 mutex_lock(&mddev->bitmap_info.mutex); 8717 8718 spin_lock(&mddev->lock); 8719 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 8720 seq_printf(seq, "%s : ", mdname(mddev)); 8721 if (mddev->pers) { 8722 if (test_bit(MD_BROKEN, &mddev->flags)) 8723 seq_printf(seq, "broken"); 8724 else 8725 seq_printf(seq, "active"); 8726 if (mddev->ro == MD_RDONLY) 8727 seq_printf(seq, " (read-only)"); 8728 if (mddev->ro == MD_AUTO_READ) 8729 seq_printf(seq, " (auto-read-only)"); 8730 seq_printf(seq, " %s", mddev->pers->head.name); 8731 } else { 8732 seq_printf(seq, "inactive"); 8733 } 8734 8735 sectors = 0; 8736 rcu_read_lock(); 8737 rdev_for_each_rcu(rdev, mddev) { 8738 seq_printf(seq, " %pg[%d]", rdev->bdev, rdev->desc_nr); 8739 8740 if (test_bit(WriteMostly, &rdev->flags)) 8741 seq_printf(seq, "(W)"); 8742 if (test_bit(Journal, &rdev->flags)) 8743 seq_printf(seq, "(J)"); 8744 if (test_bit(Faulty, &rdev->flags)) { 8745 seq_printf(seq, "(F)"); 8746 continue; 8747 } 8748 if (rdev->raid_disk < 0) 8749 seq_printf(seq, "(S)"); /* spare */ 8750 if (test_bit(Replacement, &rdev->flags)) 8751 seq_printf(seq, "(R)"); 8752 sectors += rdev->sectors; 8753 } 8754 rcu_read_unlock(); 8755 8756 if (!list_empty(&mddev->disks)) { 8757 if (mddev->pers) 8758 seq_printf(seq, "\n %llu blocks", 8759 (unsigned long long) 8760 mddev->array_sectors / 2); 8761 else 8762 seq_printf(seq, "\n %llu blocks", 8763 (unsigned long long)sectors / 2); 8764 } 8765 if (mddev->persistent) { 8766 if (mddev->major_version != 0 || 8767 mddev->minor_version != 90) { 8768 seq_printf(seq," super %d.%d", 8769 mddev->major_version, 8770 mddev->minor_version); 8771 } 8772 } else if (mddev->external) 8773 seq_printf(seq, " super external:%s", 8774 mddev->metadata_type); 8775 else 8776 seq_printf(seq, " super non-persistent"); 8777 8778 if (mddev->pers) { 8779 mddev->pers->status(seq, mddev); 8780 seq_printf(seq, "\n "); 8781 if (mddev->pers->sync_request) { 8782 if (status_resync(seq, mddev)) 8783 seq_printf(seq, "\n "); 8784 } 8785 } else 8786 seq_printf(seq, "\n "); 8787 8788 md_bitmap_status(seq, mddev); 8789 8790 seq_printf(seq, "\n"); 8791 } 8792 spin_unlock(&mddev->lock); 8793 mutex_unlock(&mddev->bitmap_info.mutex); 8794 spin_lock(&all_mddevs_lock); 8795 8796 if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs)) 8797 status_unused(seq); 8798 8799 mddev_put_locked(mddev); 8800 return 0; 8801 } 8802 8803 static const struct seq_operations md_seq_ops = { 8804 .start = md_seq_start, 8805 .next = md_seq_next, 8806 .stop = md_seq_stop, 8807 .show = md_seq_show, 8808 }; 8809 8810 static int md_seq_open(struct inode *inode, struct file *file) 8811 { 8812 struct seq_file *seq; 8813 int error; 8814 8815 error = seq_open(file, &md_seq_ops); 8816 if (error) 8817 return error; 8818 8819 seq = file->private_data; 8820 seq->poll_event = atomic_read(&md_event_count); 8821 
return error; 8822 } 8823 8824 static int md_unloading; 8825 static __poll_t mdstat_poll(struct file *filp, poll_table *wait) 8826 { 8827 struct seq_file *seq = filp->private_data; 8828 __poll_t mask; 8829 8830 if (md_unloading) 8831 return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; 8832 poll_wait(filp, &md_event_waiters, wait); 8833 8834 /* always allow read */ 8835 mask = EPOLLIN | EPOLLRDNORM; 8836 8837 if (seq->poll_event != atomic_read(&md_event_count)) 8838 mask |= EPOLLERR | EPOLLPRI; 8839 return mask; 8840 } 8841 8842 static const struct proc_ops mdstat_proc_ops = { 8843 .proc_open = md_seq_open, 8844 .proc_read = seq_read, 8845 .proc_lseek = seq_lseek, 8846 .proc_release = seq_release, 8847 .proc_poll = mdstat_poll, 8848 }; 8849 8850 int register_md_submodule(struct md_submodule_head *msh) 8851 { 8852 return xa_insert(&md_submodule, msh->id, msh, GFP_KERNEL); 8853 } 8854 EXPORT_SYMBOL_GPL(register_md_submodule); 8855 8856 void unregister_md_submodule(struct md_submodule_head *msh) 8857 { 8858 xa_erase(&md_submodule, msh->id); 8859 } 8860 EXPORT_SYMBOL_GPL(unregister_md_submodule); 8861 8862 int md_setup_cluster(struct mddev *mddev, int nodes) 8863 { 8864 int ret = get_cluster_ops(mddev); 8865 8866 if (ret) { 8867 request_module("md-cluster"); 8868 ret = get_cluster_ops(mddev); 8869 } 8870 8871 /* ensure module won't be unloaded */ 8872 if (ret) { 8873 pr_warn("can't find md-cluster module or get its reference.\n"); 8874 return ret; 8875 } 8876 8877 ret = mddev->cluster_ops->join(mddev, nodes); 8878 if (!ret) 8879 mddev->safemode_delay = 0; 8880 return ret; 8881 } 8882 8883 void md_cluster_stop(struct mddev *mddev) 8884 { 8885 put_cluster_ops(mddev); 8886 } 8887 8888 static bool is_rdev_holder_idle(struct md_rdev *rdev, bool init) 8889 { 8890 unsigned long last_events = rdev->last_events; 8891 8892 if (!bdev_is_partition(rdev->bdev)) 8893 return true; 8894 8895 /* 8896 * If rdev is a partition, and the user doesn't issue IO to the array, the 8897 * array is still not idle if the user issues IO to other partitions. 8898 */ 8899 rdev->last_events = part_stat_read_accum(rdev->bdev->bd_disk->part0, 8900 sectors) - 8901 part_stat_read_accum(rdev->bdev, sectors); 8902 8903 return init || rdev->last_events <= last_events; 8904 } 8905 8906 /* 8907 * mddev is idle if the following conditions are met since the last check: 8908 * 1) mddev doesn't have normal IO completed; 8909 * 2) mddev doesn't have inflight normal IO; 8910 * 3) if any member disk is a partition, the other partitions don't have IO 8911 * completed; 8912 * 8913 * Note that this check relies on IO accounting being enabled. 8914 */ 8915 static bool is_mddev_idle(struct mddev *mddev, int init) 8916 { 8917 unsigned long last_events = mddev->normal_io_events; 8918 struct gendisk *disk; 8919 struct md_rdev *rdev; 8920 bool idle = true; 8921 8922 disk = mddev_is_dm(mddev) ?
mddev->dm_gendisk : mddev->gendisk; 8923 if (!disk) 8924 return true; 8925 8926 mddev->normal_io_events = part_stat_read_accum(disk->part0, sectors); 8927 if (!init && (mddev->normal_io_events > last_events || 8928 bdev_count_inflight(disk->part0))) 8929 idle = false; 8930 8931 rcu_read_lock(); 8932 rdev_for_each_rcu(rdev, mddev) 8933 if (!is_rdev_holder_idle(rdev, init)) 8934 idle = false; 8935 rcu_read_unlock(); 8936 8937 return idle; 8938 } 8939 8940 void md_done_sync(struct mddev *mddev, int blocks, int ok) 8941 { 8942 /* another "blocks" (512-byte) blocks have been synced */ 8943 atomic_sub(blocks, &mddev->recovery_active); 8944 wake_up(&mddev->recovery_wait); 8945 if (!ok) { 8946 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8947 set_bit(MD_RECOVERY_ERROR, &mddev->recovery); 8948 md_wakeup_thread(mddev->thread); 8949 // stop recovery, signal do_sync .... 8950 } 8951 } 8952 EXPORT_SYMBOL(md_done_sync); 8953 8954 /* md_write_start(mddev, bi) 8955 * If we need to update some array metadata (e.g. 'active' flag 8956 * in superblock) before writing, schedule a superblock update 8957 * and wait for it to complete. 8958 * A return value of 'false' means that the write wasn't recorded 8959 * and cannot proceed as the array is being suspended. 8960 */ 8961 void md_write_start(struct mddev *mddev, struct bio *bi) 8962 { 8963 int did_change = 0; 8964 8965 if (bio_data_dir(bi) != WRITE) 8966 return; 8967 8968 BUG_ON(mddev->ro == MD_RDONLY); 8969 if (mddev->ro == MD_AUTO_READ) { 8970 /* need to switch to read/write */ 8971 mddev->ro = MD_RDWR; 8972 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8973 md_wakeup_thread(mddev->thread); 8974 md_wakeup_thread(mddev->sync_thread); 8975 did_change = 1; 8976 } 8977 rcu_read_lock(); 8978 percpu_ref_get(&mddev->writes_pending); 8979 smp_mb(); /* Match smp_mb in set_in_sync() */ 8980 if (mddev->safemode == 1) 8981 mddev->safemode = 0; 8982 /* sync_checkers is always 0 when writes_pending is in per-cpu mode */ 8983 if (mddev->in_sync || mddev->sync_checkers) { 8984 spin_lock(&mddev->lock); 8985 if (mddev->in_sync) { 8986 mddev->in_sync = 0; 8987 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 8988 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 8989 md_wakeup_thread(mddev->thread); 8990 did_change = 1; 8991 } 8992 spin_unlock(&mddev->lock); 8993 } 8994 rcu_read_unlock(); 8995 if (did_change) 8996 sysfs_notify_dirent_safe(mddev->sysfs_state); 8997 if (!mddev->has_superblocks) 8998 return; 8999 wait_event(mddev->sb_wait, 9000 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 9001 } 9002 EXPORT_SYMBOL(md_write_start); 9003 9004 /* md_write_inc can only be called when md_write_start() has 9005 * already been called at least once for the current request. 9006 * It increments the counter and is useful when a single request 9007 * is split into several parts. Each part causes an increment and 9008 * so needs a matching md_write_end(). 9009 * Unlike md_write_start(), it is safe to call md_write_inc() inside 9010 * a spinlocked region.
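 *
 * A purely illustrative pattern for a personality that splits one write
 * into two child requests:
 *
 *	md_write_start(mddev, bio);	// covers the first part
 *	md_write_inc(mddev, bio);	// second part, may hold a spinlock
 *	...
 *	md_write_end(mddev);		// called once per part on completion
 *	md_write_end(mddev);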
9011 */ 9012 void md_write_inc(struct mddev *mddev, struct bio *bi) 9013 { 9014 if (bio_data_dir(bi) != WRITE) 9015 return; 9016 WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev)); 9017 percpu_ref_get(&mddev->writes_pending); 9018 } 9019 EXPORT_SYMBOL(md_write_inc); 9020 9021 void md_write_end(struct mddev *mddev) 9022 { 9023 percpu_ref_put(&mddev->writes_pending); 9024 9025 if (mddev->safemode == 2) 9026 md_wakeup_thread(mddev->thread); 9027 else if (mddev->safemode_delay) 9028 /* The roundup() ensures this only performs locking once 9029 * every ->safemode_delay jiffies 9030 */ 9031 mod_timer(&mddev->safemode_timer, 9032 roundup(jiffies, mddev->safemode_delay) + 9033 mddev->safemode_delay); 9034 } 9035 9036 EXPORT_SYMBOL(md_write_end); 9037 9038 /* This is used by raid0 and raid10 */ 9039 void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, 9040 struct bio *bio, sector_t start, sector_t size) 9041 { 9042 struct bio *discard_bio = NULL; 9043 9044 if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO, 9045 &discard_bio) || !discard_bio) 9046 return; 9047 9048 bio_chain(discard_bio, bio); 9049 bio_clone_blkg_association(discard_bio, bio); 9050 mddev_trace_remap(mddev, discard_bio, bio->bi_iter.bi_sector); 9051 submit_bio_noacct(discard_bio); 9052 } 9053 EXPORT_SYMBOL_GPL(md_submit_discard_bio); 9054 9055 static void md_bitmap_start(struct mddev *mddev, 9056 struct md_io_clone *md_io_clone) 9057 { 9058 md_bitmap_fn *fn = unlikely(md_io_clone->rw == STAT_DISCARD) ? 9059 mddev->bitmap_ops->start_discard : 9060 mddev->bitmap_ops->start_write; 9061 9062 if (mddev->pers->bitmap_sector) 9063 mddev->pers->bitmap_sector(mddev, &md_io_clone->offset, 9064 &md_io_clone->sectors); 9065 9066 fn(mddev, md_io_clone->offset, md_io_clone->sectors); 9067 } 9068 9069 static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone) 9070 { 9071 md_bitmap_fn *fn = unlikely(md_io_clone->rw == STAT_DISCARD) ? 
9072 mddev->bitmap_ops->end_discard : 9073 mddev->bitmap_ops->end_write; 9074 9075 fn(mddev, md_io_clone->offset, md_io_clone->sectors); 9076 } 9077 9078 static void md_end_clone_io(struct bio *bio) 9079 { 9080 struct md_io_clone *md_io_clone = bio->bi_private; 9081 struct bio *orig_bio = md_io_clone->orig_bio; 9082 struct mddev *mddev = md_io_clone->mddev; 9083 9084 if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false)) 9085 md_bitmap_end(mddev, md_io_clone); 9086 9087 if (bio->bi_status && !orig_bio->bi_status) 9088 orig_bio->bi_status = bio->bi_status; 9089 9090 if (md_io_clone->start_time) 9091 bio_end_io_acct(orig_bio, md_io_clone->start_time); 9092 9093 bio_put(bio); 9094 bio_endio(orig_bio); 9095 percpu_ref_put(&mddev->active_io); 9096 } 9097 9098 static void md_clone_bio(struct mddev *mddev, struct bio **bio) 9099 { 9100 struct block_device *bdev = (*bio)->bi_bdev; 9101 struct md_io_clone *md_io_clone; 9102 struct bio *clone = 9103 bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set); 9104 9105 md_io_clone = container_of(clone, struct md_io_clone, bio_clone); 9106 md_io_clone->orig_bio = *bio; 9107 md_io_clone->mddev = mddev; 9108 if (blk_queue_io_stat(bdev->bd_disk->queue)) 9109 md_io_clone->start_time = bio_start_io_acct(*bio); 9110 9111 if (bio_data_dir(*bio) == WRITE && md_bitmap_enabled(mddev, false)) { 9112 md_io_clone->offset = (*bio)->bi_iter.bi_sector; 9113 md_io_clone->sectors = bio_sectors(*bio); 9114 md_io_clone->rw = op_stat_group(bio_op(*bio)); 9115 md_bitmap_start(mddev, md_io_clone); 9116 } 9117 9118 clone->bi_end_io = md_end_clone_io; 9119 clone->bi_private = md_io_clone; 9120 *bio = clone; 9121 } 9122 9123 void md_account_bio(struct mddev *mddev, struct bio **bio) 9124 { 9125 percpu_ref_get(&mddev->active_io); 9126 md_clone_bio(mddev, bio); 9127 } 9128 EXPORT_SYMBOL_GPL(md_account_bio); 9129 9130 void md_free_cloned_bio(struct bio *bio) 9131 { 9132 struct md_io_clone *md_io_clone = bio->bi_private; 9133 struct bio *orig_bio = md_io_clone->orig_bio; 9134 struct mddev *mddev = md_io_clone->mddev; 9135 9136 if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false)) 9137 md_bitmap_end(mddev, md_io_clone); 9138 9139 if (bio->bi_status && !orig_bio->bi_status) 9140 orig_bio->bi_status = bio->bi_status; 9141 9142 if (md_io_clone->start_time) 9143 bio_end_io_acct(orig_bio, md_io_clone->start_time); 9144 9145 bio_put(bio); 9146 percpu_ref_put(&mddev->active_io); 9147 } 9148 EXPORT_SYMBOL_GPL(md_free_cloned_bio); 9149 9150 /* md_allow_write(mddev) 9151 * Calling this ensures that the array is marked 'active' so that writes 9152 * may proceed without blocking. It is important to call this before 9153 * attempting a GFP_KERNEL allocation while holding the mddev lock. 9154 * Must be called with mddev_lock held. 
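 *
 * Illustrative use (names are hypothetical) before a GFP_KERNEL
 * allocation made while holding the mddev lock:
 *
 *	md_allow_write(mddev);
 *	new_conf = kzalloc(sizeof(*new_conf), GFP_KERNEL);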
9155 */ 9156 void md_allow_write(struct mddev *mddev) 9157 { 9158 if (!mddev->pers) 9159 return; 9160 if (!md_is_rdwr(mddev)) 9161 return; 9162 if (!mddev->pers->sync_request) 9163 return; 9164 9165 spin_lock(&mddev->lock); 9166 if (mddev->in_sync) { 9167 mddev->in_sync = 0; 9168 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 9169 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 9170 if (mddev->safemode_delay && 9171 mddev->safemode == 0) 9172 mddev->safemode = 1; 9173 spin_unlock(&mddev->lock); 9174 md_update_sb(mddev, 0); 9175 sysfs_notify_dirent_safe(mddev->sysfs_state); 9176 /* wait for the dirty state to be recorded in the metadata */ 9177 wait_event(mddev->sb_wait, 9178 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 9179 } else 9180 spin_unlock(&mddev->lock); 9181 } 9182 EXPORT_SYMBOL_GPL(md_allow_write); 9183 9184 static sector_t md_sync_max_sectors(struct mddev *mddev, 9185 enum sync_action action) 9186 { 9187 switch (action) { 9188 case ACTION_RESYNC: 9189 case ACTION_CHECK: 9190 case ACTION_REPAIR: 9191 atomic64_set(&mddev->resync_mismatches, 0); 9192 fallthrough; 9193 case ACTION_RESHAPE: 9194 return mddev->resync_max_sectors; 9195 case ACTION_RECOVER: 9196 return mddev->dev_sectors; 9197 default: 9198 return 0; 9199 } 9200 } 9201 9202 /* 9203 * If lazy recovery is requested and all rdevs are in sync, select the rdev with 9204 * the highest index to perform recovery and build the initial xor data; this is 9205 * the same behaviour as with the old bitmap. 9206 */ 9207 static bool mddev_select_lazy_recover_rdev(struct mddev *mddev) 9208 { 9209 struct md_rdev *recover_rdev = NULL; 9210 struct md_rdev *rdev; 9211 bool ret = false; 9212 9213 rcu_read_lock(); 9214 rdev_for_each_rcu(rdev, mddev) { 9215 if (rdev->raid_disk < 0) 9216 continue; 9217 9218 if (test_bit(Faulty, &rdev->flags) || 9219 !test_bit(In_sync, &rdev->flags)) 9220 break; 9221 9222 if (!recover_rdev || recover_rdev->raid_disk < rdev->raid_disk) 9223 recover_rdev = rdev; 9224 } 9225 9226 if (recover_rdev) { 9227 clear_bit(In_sync, &recover_rdev->flags); 9228 ret = true; 9229 } 9230 9231 rcu_read_unlock(); 9232 return ret; 9233 } 9234 9235 static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) 9236 { 9237 sector_t start = 0; 9238 struct md_rdev *rdev; 9239 9240 switch (action) { 9241 case ACTION_CHECK: 9242 case ACTION_REPAIR: 9243 return mddev->resync_min; 9244 case ACTION_RESYNC: 9245 if (!mddev->bitmap) 9246 return mddev->resync_offset; 9247 return 0; 9248 case ACTION_RESHAPE: 9249 /* 9250 * If the original node aborts reshaping then we continue the 9251 * reshaping, so set the position again to avoid restarting the 9252 * reshape from the very beginning. 9253 */ 9254 if (mddev_is_clustered(mddev) && 9255 mddev->reshape_position != MaxSector) 9256 return mddev->reshape_position; 9257 return 0; 9258 case ACTION_RECOVER: 9259 start = MaxSector; 9260 rcu_read_lock(); 9261 rdev_for_each_rcu(rdev, mddev) 9262 if (rdev_needs_recovery(rdev, start)) 9263 start = rdev->recovery_offset; 9264 rcu_read_unlock(); 9265 9266 /* 9267 * If there are no spares and raid456 lazy initial recovery is 9268 * requested, pick an in-sync rdev and recover it from the start. 9269 */ 9270 if (test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery) && 9271 start == MaxSector && mddev_select_lazy_recover_rdev(mddev)) 9272 start = 0; 9273 9274 /* If there is a bitmap, we need to make sure all 9275 * writes that started before we added a spare 9276 * complete before we start doing a recovery.
9277 * Otherwise the write might complete and (via 9278 * bitmap_endwrite) set a bit in the bitmap after the 9279 * recovery has checked that bit and skipped that 9280 * region. 9281 */ 9282 if (mddev->bitmap) { 9283 mddev->pers->quiesce(mddev, 1); 9284 mddev->pers->quiesce(mddev, 0); 9285 } 9286 return start; 9287 default: 9288 return MaxSector; 9289 } 9290 } 9291 9292 static bool sync_io_within_limit(struct mddev *mddev) 9293 { 9294 /* 9295 * For raid456, sync IO is stripe(4k) per IO, for other levels, it's 9296 * RESYNC_PAGES(64k) per IO. 9297 */ 9298 return atomic_read(&mddev->recovery_active) < 9299 (raid_is_456(mddev) ? 8 : 128) * sync_io_depth(mddev); 9300 } 9301 9302 #define SYNC_MARKS 10 9303 #define SYNC_MARK_STEP (3*HZ) 9304 #define UPDATE_FREQUENCY (5*60*HZ) 9305 void md_do_sync(struct md_thread *thread) 9306 { 9307 struct mddev *mddev = thread->mddev; 9308 struct mddev *mddev2; 9309 unsigned int currspeed = 0, window; 9310 sector_t max_sectors,j, io_sectors, recovery_done; 9311 unsigned long mark[SYNC_MARKS]; 9312 unsigned long update_time; 9313 sector_t mark_cnt[SYNC_MARKS]; 9314 int last_mark,m; 9315 sector_t last_check; 9316 int skipped = 0; 9317 struct md_rdev *rdev; 9318 enum sync_action action; 9319 const char *desc; 9320 struct blk_plug plug; 9321 int ret; 9322 9323 /* just incase thread restarts... */ 9324 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 9325 return; 9326 9327 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9328 goto skip; 9329 9330 if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) || 9331 !md_is_rdwr(mddev)) {/* never try to sync a read-only array */ 9332 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9333 goto skip; 9334 } 9335 9336 if (mddev_is_clustered(mddev)) { 9337 ret = mddev->cluster_ops->resync_start(mddev); 9338 if (ret) 9339 goto skip; 9340 9341 set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags); 9342 if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 9343 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || 9344 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) 9345 && ((unsigned long long)mddev->curr_resync_completed 9346 < (unsigned long long)mddev->resync_max_sectors)) 9347 goto skip; 9348 } 9349 9350 action = md_sync_action(mddev); 9351 if (action == ACTION_FROZEN || action == ACTION_IDLE) { 9352 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9353 goto skip; 9354 } 9355 9356 desc = md_sync_action_name(action); 9357 mddev->last_sync_action = action; 9358 9359 /* 9360 * Before starting a resync we must have set curr_resync to 9361 * 2, and then checked that every "conflicting" array has curr_resync 9362 * less than ours. When we find one that is the same or higher 9363 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 9364 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 9365 * This will mean we have to start checking from the beginning again. 
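 * (The literal values 2 and 1 above correspond to MD_RESYNC_DELAYED and
 * MD_RESYNC_YIELDED in the code below.)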
9366 * 9367 */ 9368 if (mddev_is_clustered(mddev)) 9369 mddev->cluster_ops->resync_start_notify(mddev); 9370 do { 9371 int mddev2_minor = -1; 9372 mddev->curr_resync = MD_RESYNC_DELAYED; 9373 9374 try_again: 9375 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9376 goto skip; 9377 spin_lock(&all_mddevs_lock); 9378 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) { 9379 if (test_bit(MD_DELETED, &mddev2->flags)) 9380 continue; 9381 if (mddev2 == mddev) 9382 continue; 9383 if (!mddev->parallel_resync 9384 && mddev2->curr_resync 9385 && match_mddev_units(mddev, mddev2)) { 9386 DEFINE_WAIT(wq); 9387 if (mddev < mddev2 && 9388 mddev->curr_resync == MD_RESYNC_DELAYED) { 9389 /* arbitrarily yield */ 9390 mddev->curr_resync = MD_RESYNC_YIELDED; 9391 wake_up(&resync_wait); 9392 } 9393 if (mddev > mddev2 && 9394 mddev->curr_resync == MD_RESYNC_YIELDED) 9395 /* no need to wait here, we can wait the next 9396 * time 'round when curr_resync == 2 9397 */ 9398 continue; 9399 /* We need to wait 'interruptible' so as not to 9400 * contribute to the load average, and not to 9401 * be caught by 'softlockup' 9402 */ 9403 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 9404 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9405 mddev2->curr_resync >= mddev->curr_resync) { 9406 if (mddev2_minor != mddev2->md_minor) { 9407 mddev2_minor = mddev2->md_minor; 9408 pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n", 9409 desc, mdname(mddev), 9410 mdname(mddev2)); 9411 } 9412 spin_unlock(&all_mddevs_lock); 9413 9414 if (signal_pending(current)) 9415 flush_signals(current); 9416 schedule(); 9417 finish_wait(&resync_wait, &wq); 9418 goto try_again; 9419 } 9420 finish_wait(&resync_wait, &wq); 9421 } 9422 } 9423 spin_unlock(&all_mddevs_lock); 9424 } while (mddev->curr_resync < MD_RESYNC_DELAYED); 9425 9426 max_sectors = md_sync_max_sectors(mddev, action); 9427 j = md_sync_position(mddev, action); 9428 9429 pr_info("md: %s of RAID array %s\n", desc, mdname(mddev)); 9430 pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev)); 9431 pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n", 9432 speed_max(mddev), desc); 9433 9434 is_mddev_idle(mddev, 1); /* this initializes IO event counters */ 9435 9436 io_sectors = 0; 9437 for (m = 0; m < SYNC_MARKS; m++) { 9438 mark[m] = jiffies; 9439 mark_cnt[m] = io_sectors; 9440 } 9441 last_mark = 0; 9442 mddev->resync_mark = mark[last_mark]; 9443 mddev->resync_mark_cnt = mark_cnt[last_mark]; 9444 9445 /* 9446 * Tune reconstruction: 9447 */ 9448 window = 32 * (PAGE_SIZE / 512); 9449 pr_debug("md: using %dk window, over a total of %lluk.\n", 9450 window/2, (unsigned long long)max_sectors/2); 9451 9452 atomic_set(&mddev->recovery_active, 0); 9453 last_check = 0; 9454 9455 if (j >= MD_RESYNC_ACTIVE) { 9456 pr_debug("md: resuming %s of %s from checkpoint.\n", 9457 desc, mdname(mddev)); 9458 mddev->curr_resync = j; 9459 } else 9460 mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */ 9461 mddev->curr_resync_completed = j; 9462 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9463 md_new_event(); 9464 update_time = jiffies; 9465 9466 blk_start_plug(&plug); 9467 while (j < max_sectors) { 9468 sector_t sectors; 9469 9470 skipped = 0; 9471 9472 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9473 ((mddev->curr_resync > mddev->curr_resync_completed && 9474 (mddev->curr_resync - mddev->curr_resync_completed) 9475 > (max_sectors >> 4)) || 9476 
time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || 9477 (j - mddev->curr_resync_completed)*2 9478 >= mddev->resync_max - mddev->curr_resync_completed || 9479 mddev->curr_resync_completed > mddev->resync_max 9480 )) { 9481 /* time to update curr_resync_completed */ 9482 wait_event(mddev->recovery_wait, 9483 atomic_read(&mddev->recovery_active) == 0); 9484 mddev->curr_resync_completed = j; 9485 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 9486 j > mddev->resync_offset) 9487 mddev->resync_offset = j; 9488 update_time = jiffies; 9489 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 9490 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9491 } 9492 9493 while (j >= mddev->resync_max && 9494 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9495 /* As this condition is controlled by user-space, 9496 * we can block indefinitely, so use '_interruptible' 9497 * to avoid triggering warnings. 9498 */ 9499 flush_signals(current); /* just in case */ 9500 wait_event_interruptible(mddev->recovery_wait, 9501 mddev->resync_max > j 9502 || test_bit(MD_RECOVERY_INTR, 9503 &mddev->recovery)); 9504 } 9505 9506 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9507 break; 9508 9509 if (mddev->bitmap_ops && mddev->bitmap_ops->skip_sync_blocks) { 9510 sectors = mddev->bitmap_ops->skip_sync_blocks(mddev, j); 9511 if (sectors) 9512 goto update; 9513 } 9514 9515 sectors = mddev->pers->sync_request(mddev, j, max_sectors, 9516 &skipped); 9517 if (sectors == 0) { 9518 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9519 break; 9520 } 9521 9522 if (!skipped) { /* actual IO requested */ 9523 io_sectors += sectors; 9524 atomic_add(sectors, &mddev->recovery_active); 9525 } 9526 9527 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9528 break; 9529 9530 update: 9531 j += sectors; 9532 if (j > max_sectors) 9533 /* when skipping, extra large numbers can be returned. */ 9534 j = max_sectors; 9535 if (j >= MD_RESYNC_ACTIVE) 9536 mddev->curr_resync = j; 9537 mddev->curr_mark_cnt = io_sectors; 9538 if (last_check == 0) 9539 /* this is the earliest that rebuild will be 9540 * visible in /proc/mdstat 9541 */ 9542 md_new_event(); 9543 9544 if (last_check + window > io_sectors || j == max_sectors) 9545 continue; 9546 9547 last_check = io_sectors; 9548 repeat: 9549 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 9550 /* step marks */ 9551 int next = (last_mark+1) % SYNC_MARKS; 9552 9553 mddev->resync_mark = mark[next]; 9554 mddev->resync_mark_cnt = mark_cnt[next]; 9555 mark[next] = jiffies; 9556 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 9557 last_mark = next; 9558 } 9559 9560 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9561 break; 9562 9563 /* 9564 * this loop exits only if either when we are slower than 9565 * the 'hard' speed limit, or the system was IO-idle for 9566 * a jiffy. 9567 * the system might be non-idle CPU-wise, but we only care 9568 * about not overloading the IO subsystem. (things like an 9569 * e2fsck being done on the RAID array should execute fast) 9570 */ 9571 cond_resched(); 9572 9573 recovery_done = io_sectors - atomic_read(&mddev->recovery_active); 9574 currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2 9575 /((jiffies-mddev->resync_mark)/HZ +1) +1; 9576 9577 if (currspeed > speed_min(mddev)) { 9578 if (currspeed > speed_max(mddev)) { 9579 msleep(500); 9580 goto repeat; 9581 } 9582 if (!sync_io_within_limit(mddev) && 9583 !is_mddev_idle(mddev, 0)) { 9584 /* 9585 * Give other IO more of a chance. 
9586 * The faster the devices, the less we wait. 9587 */ 9588 wait_event(mddev->recovery_wait, 9589 !atomic_read(&mddev->recovery_active)); 9590 } 9591 } 9592 } 9593 pr_info("md: %s: %s %s.\n",mdname(mddev), desc, 9594 test_bit(MD_RECOVERY_INTR, &mddev->recovery) 9595 ? "interrupted" : "done"); 9596 /* 9597 * this also signals 'finished resyncing' to md_stop 9598 */ 9599 blk_finish_plug(&plug); 9600 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 9601 9602 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9603 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9604 mddev->curr_resync >= MD_RESYNC_ACTIVE) { 9605 mddev->curr_resync_completed = mddev->curr_resync; 9606 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9607 } 9608 mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped); 9609 9610 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 9611 mddev->curr_resync > MD_RESYNC_ACTIVE) { 9612 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 9613 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9614 if (mddev->curr_resync >= mddev->resync_offset) { 9615 pr_debug("md: checkpointing %s of %s.\n", 9616 desc, mdname(mddev)); 9617 if (test_bit(MD_RECOVERY_ERROR, 9618 &mddev->recovery)) 9619 mddev->resync_offset = 9620 mddev->curr_resync_completed; 9621 else 9622 mddev->resync_offset = 9623 mddev->curr_resync; 9624 } 9625 } else 9626 mddev->resync_offset = MaxSector; 9627 } else { 9628 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9629 mddev->curr_resync = MaxSector; 9630 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9631 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { 9632 rcu_read_lock(); 9633 rdev_for_each_rcu(rdev, mddev) 9634 if (mddev->delta_disks >= 0 && 9635 rdev_needs_recovery(rdev, mddev->curr_resync)) 9636 rdev->recovery_offset = mddev->curr_resync; 9637 rcu_read_unlock(); 9638 } 9639 } 9640 } 9641 skip: 9642 /* set CHANGE_PENDING here since maybe another update is needed, 9643 * so other nodes are informed. It should be harmless for normal 9644 * raid */ 9645 set_mask_bits(&mddev->sb_flags, 0, 9646 BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); 9647 9648 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9649 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9650 mddev->delta_disks > 0 && 9651 mddev->pers->finish_reshape && 9652 mddev->pers->size && 9653 !mddev_is_dm(mddev)) { 9654 mddev_lock_nointr(mddev); 9655 md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); 9656 mddev_unlock(mddev); 9657 if (!mddev_is_clustered(mddev)) 9658 set_capacity_and_notify(mddev->gendisk, 9659 mddev->array_sectors); 9660 } 9661 9662 spin_lock(&mddev->lock); 9663 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9664 /* We completed so min/max setting can be forgotten if used. */ 9665 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 9666 mddev->resync_min = 0; 9667 mddev->resync_max = MaxSector; 9668 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 9669 mddev->resync_min = mddev->curr_resync_completed; 9670 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 9671 mddev->curr_resync = MD_RESYNC_NONE; 9672 spin_unlock(&mddev->lock); 9673 9674 wake_up(&resync_wait); 9675 md_wakeup_thread(mddev->thread); 9676 return; 9677 } 9678 EXPORT_SYMBOL_GPL(md_do_sync); 9679 9680 static bool rdev_removeable(struct md_rdev *rdev) 9681 { 9682 /* rdev is not used. */ 9683 if (rdev->raid_disk < 0) 9684 return false; 9685 9686 /* There are still inflight io, don't remove this rdev. 
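 * (nr_pending counts outstanding requests that still reference this rdev.)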
*/ 9687 if (atomic_read(&rdev->nr_pending)) 9688 return false; 9689 9690 /* 9691 * An error occurred but has not yet been acknowledged by the metadata 9692 * handler, don't remove this rdev. 9693 */ 9694 if (test_bit(Blocked, &rdev->flags)) 9695 return false; 9696 9697 /* A Faulty rdev is not used, so it's safe to remove it. */ 9698 if (test_bit(Faulty, &rdev->flags)) 9699 return true; 9700 9701 /* Journal disk can only be removed if it's faulty. */ 9702 if (test_bit(Journal, &rdev->flags)) 9703 return false; 9704 9705 /* 9706 * 'In_sync' is cleared while 'raid_disk' is valid, which means 9707 * replacement has just become active from pers->spare_active(), and 9708 * then pers->hot_remove_disk() will replace this rdev with replacement. 9709 */ 9710 if (!test_bit(In_sync, &rdev->flags)) 9711 return true; 9712 9713 return false; 9714 } 9715 9716 static bool rdev_is_spare(struct md_rdev *rdev) 9717 { 9718 return !test_bit(Candidate, &rdev->flags) && rdev->raid_disk >= 0 && 9719 !test_bit(In_sync, &rdev->flags) && 9720 !test_bit(Journal, &rdev->flags) && 9721 !test_bit(Faulty, &rdev->flags); 9722 } 9723 9724 static bool rdev_addable(struct md_rdev *rdev) 9725 { 9726 struct mddev *mddev; 9727 9728 mddev = READ_ONCE(rdev->mddev); 9729 if (!mddev) 9730 return false; 9731 9732 /* rdev is already used, don't add it again. */ 9733 if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 || 9734 test_bit(Faulty, &rdev->flags)) 9735 return false; 9736 9737 /* Allow adding a journal disk. */ 9738 if (test_bit(Journal, &rdev->flags)) 9739 return true; 9740 9741 /* Allow adding if the array is read-write. */ 9742 if (md_is_rdwr(mddev)) 9743 return true; 9744 9745 /* 9746 * For a read-only array, only allow re-adding an rdev. And if a bitmap is 9747 * used, don't allow re-adding an rdev that is too old.
9748 */ 9749 if (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags)) 9750 return true; 9751 9752 return false; 9753 } 9754 9755 static bool md_spares_need_change(struct mddev *mddev) 9756 { 9757 struct md_rdev *rdev; 9758 9759 rcu_read_lock(); 9760 rdev_for_each_rcu(rdev, mddev) { 9761 if (rdev_removeable(rdev) || rdev_addable(rdev)) { 9762 rcu_read_unlock(); 9763 return true; 9764 } 9765 } 9766 rcu_read_unlock(); 9767 return false; 9768 } 9769 9770 static int remove_spares(struct mddev *mddev, struct md_rdev *this) 9771 { 9772 struct md_rdev *rdev; 9773 int removed = 0; 9774 9775 rdev_for_each(rdev, mddev) { 9776 if ((this == NULL || rdev == this) && rdev_removeable(rdev) && 9777 !mddev->pers->hot_remove_disk(mddev, rdev)) { 9778 sysfs_unlink_rdev(mddev, rdev); 9779 rdev->saved_raid_disk = rdev->raid_disk; 9780 rdev->raid_disk = -1; 9781 removed++; 9782 } 9783 } 9784 9785 if (removed && mddev->kobj.sd) 9786 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 9787 9788 return removed; 9789 } 9790 9791 static int remove_and_add_spares(struct mddev *mddev, 9792 struct md_rdev *this) 9793 { 9794 struct md_rdev *rdev; 9795 int spares = 0; 9796 int removed = 0; 9797 9798 if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 9799 /* Mustn't remove devices when resync thread is running */ 9800 return 0; 9801 9802 removed = remove_spares(mddev, this); 9803 if (this && removed) 9804 goto no_add; 9805 9806 rdev_for_each(rdev, mddev) { 9807 if (this && this != rdev) 9808 continue; 9809 if (rdev_is_spare(rdev)) 9810 spares++; 9811 if (!rdev_addable(rdev)) 9812 continue; 9813 if (!test_bit(Journal, &rdev->flags)) 9814 rdev->recovery_offset = 0; 9815 if (mddev->pers->hot_add_disk(mddev, rdev) == 0) { 9816 /* failure here is OK */ 9817 sysfs_link_rdev(mddev, rdev); 9818 if (!test_bit(Journal, &rdev->flags)) 9819 spares++; 9820 md_new_event(); 9821 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9822 } 9823 } 9824 no_add: 9825 if (removed) 9826 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9827 return spares; 9828 } 9829 9830 static bool md_choose_sync_action(struct mddev *mddev, int *spares) 9831 { 9832 /* Check if reshape is in progress first. */ 9833 if (mddev->reshape_position != MaxSector) { 9834 if (mddev->pers->check_reshape == NULL || 9835 mddev->pers->check_reshape(mddev) != 0) 9836 return false; 9837 9838 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 9839 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9840 clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); 9841 return true; 9842 } 9843 9844 /* Check if resync is in progress. */ 9845 if (mddev->resync_offset < MaxSector) { 9846 remove_spares(mddev, NULL); 9847 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9848 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9849 clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); 9850 return true; 9851 } 9852 9853 /* 9854 * Remove any failed drives, then add spares if possible. Spares are 9855 * also removed and re-added, to allow the personality to fail the 9856 * re-add. 9857 */ 9858 *spares = remove_and_add_spares(mddev, NULL); 9859 if (*spares || test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery)) { 9860 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9861 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 9862 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 9863 9864 /* Start new recovery. */ 9865 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9866 return true; 9867 } 9868 9869 /* Delay to choose resync/check/repair in md_do_sync(). 
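 * MD_RECOVERY_SYNC here typically comes from user space writing to the
 * sync_action attribute; md_do_sync() resolves the exact action via
 * md_sync_action().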
*/ 9870 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 9871 return true; 9872 9873 /* Nothing to be done */ 9874 return false; 9875 } 9876 9877 static void md_start_sync(struct work_struct *ws) 9878 { 9879 struct mddev *mddev = container_of(ws, struct mddev, sync_work); 9880 int spares = 0; 9881 bool suspend = false; 9882 char *name; 9883 9884 /* 9885 * If reshape is still in progress, spares won't be added or removed 9886 * from conf until reshape is done. 9887 */ 9888 if (mddev->reshape_position == MaxSector && 9889 md_spares_need_change(mddev)) { 9890 suspend = true; 9891 mddev_suspend(mddev, false); 9892 } 9893 9894 mddev_lock_nointr(mddev); 9895 if (!md_is_rdwr(mddev)) { 9896 /* 9897 * On a read-only array we can: 9898 * - remove failed devices 9899 * - add already-in_sync devices if the array itself is in-sync. 9900 * As we only add devices that are already in-sync, we can 9901 * activate the spares immediately. 9902 */ 9903 remove_and_add_spares(mddev, NULL); 9904 goto not_running; 9905 } 9906 9907 if (!md_choose_sync_action(mddev, &spares)) 9908 goto not_running; 9909 9910 if (!mddev->pers->sync_request) 9911 goto not_running; 9912 9913 /* 9914 * We are adding a device or devices to an array which has the bitmap 9915 * stored on all devices. So make sure all bitmap pages get written. 9916 */ 9917 if (spares && md_bitmap_enabled(mddev, true)) 9918 mddev->bitmap_ops->write_all(mddev); 9919 9920 name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ? 9921 "reshape" : "resync"; 9922 rcu_assign_pointer(mddev->sync_thread, 9923 md_register_thread(md_do_sync, mddev, name)); 9924 if (!mddev->sync_thread) { 9925 pr_warn("%s: could not start resync thread...\n", 9926 mdname(mddev)); 9927 /* leave the spares where they are, it shouldn't hurt */ 9928 goto not_running; 9929 } 9930 9931 mddev_unlock(mddev); 9932 /* 9933 * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should 9934 * not set it again. Otherwise, we may cause issue like this one: 9935 * https://bugzilla.kernel.org/show_bug.cgi?id=218200 9936 * Therefore, use __mddev_resume(mddev, false). 9937 */ 9938 if (suspend) 9939 __mddev_resume(mddev, false); 9940 md_wakeup_thread(mddev->sync_thread); 9941 sysfs_notify_dirent_safe(mddev->sysfs_action); 9942 md_new_event(); 9943 return; 9944 9945 not_running: 9946 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9947 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 9948 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 9949 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 9950 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9951 mddev_unlock(mddev); 9952 /* 9953 * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should 9954 * not set it again. Otherwise, we may cause issue like this one: 9955 * https://bugzilla.kernel.org/show_bug.cgi?id=218200 9956 * Therefore, use __mddev_resume(mddev, false). 
 */
	if (suspend)
		__mddev_resume(mddev, false);

	wake_up(&resync_wait);
	if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
	    mddev->sysfs_action)
		sysfs_notify_dirent_safe(mddev->sysfs_action);
}

static void unregister_sync_thread(struct mddev *mddev)
{
	if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
		/* resync/recovery still happening */
		clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		return;
	}

	if (WARN_ON_ONCE(!mddev->sync_thread))
		return;

	md_reap_sync_thread(mddev);
}

/*
 * This routine is regularly called by all per-raid-array threads to
 * deal with generic issues like resync and super-block update.
 * Raid personalities that don't have a thread (linear/raid0) do not
 * need this as they never do any recovery or update the superblock.
 *
 * It does not do any resync itself, but rather "forks" off other threads
 * to do that as needed.
 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
 * "->recovery" and create a thread at ->sync_thread.
 * When the thread finishes it sets MD_RECOVERY_DONE
 * and wakes up this thread, which will reap the thread and finish up.
 * This thread also removes any faulty devices (with nr_pending == 0).
 *
 * The overall approach is:
 *  1/ if the superblock needs updating, update it.
 *  2/ If a recovery thread is running, don't do anything else.
 *  3/ If recovery has finished, clean up, possibly marking spares active.
 *  4/ If there are any faulty devices, remove them.
 *  5/ If array is degraded, try to add spare devices.
 *  6/ If array has spares or is not in-sync, start a resync thread.
 */
void md_check_recovery(struct mddev *mddev)
{
	if (md_bitmap_enabled(mddev, false) && mddev->bitmap_ops->daemon_work)
		mddev->bitmap_ops->daemon_work(mddev);

	if (signal_pending(current)) {
		if (mddev->pers->sync_request && !mddev->external) {
			pr_debug("md: %s in immediate safe mode\n",
				 mdname(mddev));
			mddev->safemode = 2;
		}
		flush_signals(current);
	}

	if (!md_is_rdwr(mddev) &&
	    !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_DONE, &mddev->recovery))
		return;
	if ( ! (
		(mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
		(mddev->external == 0 && mddev->safemode == 1) ||
		(mddev->safemode == 2
		 && !mddev->in_sync && mddev->resync_offset == MaxSector)
		))
		return;

	if (mddev_trylock(mddev)) {
		bool try_set_sync = mddev->safemode != 0;

		if (!mddev->external && mddev->safemode == 1)
			mddev->safemode = 0;

		if (!md_is_rdwr(mddev)) {
			struct md_rdev *rdev;

			if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
				unregister_sync_thread(mddev);
				goto unlock;
			}

			if (!mddev->external && mddev->in_sync)
				/*
				 * 'Blocked' flag not needed as failed devices
				 * will be recorded if array switched to read/write.
				 * Leaving it set will prevent the device
				 * from being removed.
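				 * (rdev_removeable() refuses to remove an
				 * rdev that still has Blocked set.)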
10051 */ 10052 rdev_for_each(rdev, mddev) 10053 clear_bit(Blocked, &rdev->flags); 10054 10055 /* 10056 * There is no thread, but we need to call 10057 * ->spare_active and clear saved_raid_disk 10058 */ 10059 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 10060 md_reap_sync_thread(mddev); 10061 10062 /* 10063 * Let md_start_sync() to remove and add rdevs to the 10064 * array. 10065 */ 10066 if (md_spares_need_change(mddev)) { 10067 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 10068 queue_work(md_misc_wq, &mddev->sync_work); 10069 } 10070 10071 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 10072 clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); 10073 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 10074 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 10075 10076 goto unlock; 10077 } 10078 10079 if (mddev_is_clustered(mddev)) { 10080 struct md_rdev *rdev, *tmp; 10081 /* kick the device if another node issued a 10082 * remove disk. 10083 */ 10084 rdev_for_each_safe(rdev, tmp, mddev) { 10085 if (rdev->raid_disk < 0 && 10086 test_and_clear_bit(ClusterRemove, &rdev->flags)) 10087 md_kick_rdev_from_array(rdev); 10088 } 10089 } 10090 10091 if (try_set_sync && !mddev->external && !mddev->in_sync) { 10092 spin_lock(&mddev->lock); 10093 set_in_sync(mddev); 10094 spin_unlock(&mddev->lock); 10095 } 10096 10097 if (mddev->sb_flags) 10098 md_update_sb(mddev, 0); 10099 10100 /* 10101 * Never start a new sync thread if MD_RECOVERY_RUNNING is 10102 * still set. 10103 */ 10104 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 10105 unregister_sync_thread(mddev); 10106 goto unlock; 10107 } 10108 10109 /* Set RUNNING before clearing NEEDED to avoid 10110 * any transients in the value of "sync_action". 10111 */ 10112 mddev->curr_resync_completed = 0; 10113 spin_lock(&mddev->lock); 10114 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 10115 spin_unlock(&mddev->lock); 10116 /* Clear some bits that don't mean anything, but 10117 * might be left set 10118 */ 10119 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 10120 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 10121 10122 if (test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) && 10123 !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 10124 queue_work(md_misc_wq, &mddev->sync_work); 10125 } else { 10126 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 10127 wake_up(&resync_wait); 10128 } 10129 10130 unlock: 10131 wake_up(&mddev->sb_wait); 10132 mddev_unlock(mddev); 10133 } 10134 } 10135 EXPORT_SYMBOL(md_check_recovery); 10136 10137 void md_reap_sync_thread(struct mddev *mddev) 10138 { 10139 struct md_rdev *rdev; 10140 sector_t old_dev_sectors = mddev->dev_sectors; 10141 bool is_reshaped = false; 10142 10143 /* resync has finished, collect result */ 10144 md_unregister_thread(mddev, &mddev->sync_thread); 10145 atomic_inc(&mddev->sync_seq); 10146 10147 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 10148 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 10149 mddev->degraded != mddev->raid_disks) { 10150 /* success...*/ 10151 /* activate any spares */ 10152 if (mddev->pers->spare_active(mddev)) { 10153 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 10154 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 10155 } 10156 } 10157 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 10158 mddev->pers->finish_reshape) { 10159 mddev->pers->finish_reshape(mddev); 10160 if (mddev_is_clustered(mddev)) 10161 is_reshaped = true; 10162 } 10163 10164 /* If array is no-longer degraded, then any saved_raid_disk 10165 * information must be 
scrapped.
	 */
	if (!mddev->degraded)
		rdev_for_each(rdev, mddev)
			rdev->saved_raid_disk = -1;

	md_update_sb(mddev, 1);
	/* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can
	 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
	 * clustered raid */
	if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
		mddev->cluster_ops->resync_finish(mddev);
	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
	clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
	/*
	 * We call mddev->cluster_ops->update_size here because sync_size could
	 * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
	 * so it is time to update size across cluster.
	 */
	if (mddev_is_clustered(mddev) && is_reshaped
				      && !test_bit(MD_CLOSING, &mddev->flags))
		mddev->cluster_ops->update_size(mddev, old_dev_sectors);
	/* flag recovery needed just to double check */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	sysfs_notify_dirent_safe(mddev->sysfs_completed);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	md_new_event();
	if (mddev->event_work.func)
		queue_work(md_misc_wq, &mddev->event_work);
	wake_up(&resync_wait);
}
EXPORT_SYMBOL(md_reap_sync_thread);

void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
{
	sysfs_notify_dirent_safe(rdev->sysfs_state);
	wait_event_timeout(rdev->blocked_wait, !rdev_blocked(rdev),
			   msecs_to_jiffies(5000));
	rdev_dec_pending(rdev, mddev);
}
EXPORT_SYMBOL(md_wait_for_blocked_rdev);

void md_finish_reshape(struct mddev *mddev)
{
	/* called by the personality module when reshape completes. */
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev) {
		if (rdev->data_offset > rdev->new_data_offset)
			rdev->sectors += rdev->data_offset - rdev->new_data_offset;
		else
			rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
		rdev->data_offset = rdev->new_data_offset;
	}
}
EXPORT_SYMBOL(md_finish_reshape);

/* Bad block management */

/* Returns true on success, false on failure */
bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
			int is_new)
{
	struct mddev *mddev = rdev->mddev;

	/*
	 * Recording new badblocks for a faulty rdev will force an unnecessary
	 * super block update. This is fragile for external management because
	 * the userspace daemon may be trying to remove this device and a
	 * deadlock may occur. This will probably be solved in mdadm, but it
	 * is safer to avoid it.
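	 * The Faulty check below just reports success without recording anything.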
10241 */ 10242 if (test_bit(Faulty, &rdev->flags)) 10243 return true; 10244 10245 if (is_new) 10246 s += rdev->new_data_offset; 10247 else 10248 s += rdev->data_offset; 10249 10250 if (!badblocks_set(&rdev->badblocks, s, sectors, 0)) 10251 return false; 10252 10253 /* Make sure they get written out promptly */ 10254 if (test_bit(ExternalBbl, &rdev->flags)) 10255 sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); 10256 sysfs_notify_dirent_safe(rdev->sysfs_state); 10257 set_mask_bits(&mddev->sb_flags, 0, 10258 BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); 10259 md_wakeup_thread(rdev->mddev->thread); 10260 return true; 10261 } 10262 EXPORT_SYMBOL_GPL(rdev_set_badblocks); 10263 10264 void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 10265 int is_new) 10266 { 10267 if (is_new) 10268 s += rdev->new_data_offset; 10269 else 10270 s += rdev->data_offset; 10271 10272 if (!badblocks_clear(&rdev->badblocks, s, sectors)) 10273 return; 10274 10275 if (test_bit(ExternalBbl, &rdev->flags)) 10276 sysfs_notify_dirent_safe(rdev->sysfs_badblocks); 10277 } 10278 EXPORT_SYMBOL_GPL(rdev_clear_badblocks); 10279 10280 static int md_notify_reboot(struct notifier_block *this, 10281 unsigned long code, void *x) 10282 { 10283 struct mddev *mddev; 10284 int need_delay = 0; 10285 10286 spin_lock(&all_mddevs_lock); 10287 list_for_each_entry(mddev, &all_mddevs, all_mddevs) { 10288 if (!mddev_get(mddev)) 10289 continue; 10290 spin_unlock(&all_mddevs_lock); 10291 if (mddev_trylock(mddev)) { 10292 if (mddev->pers) 10293 __md_stop_writes(mddev); 10294 if (mddev->persistent) 10295 mddev->safemode = 2; 10296 mddev_unlock(mddev); 10297 } 10298 need_delay = 1; 10299 spin_lock(&all_mddevs_lock); 10300 mddev_put_locked(mddev); 10301 } 10302 spin_unlock(&all_mddevs_lock); 10303 10304 /* 10305 * certain more exotic SCSI devices are known to be 10306 * volatile wrt too early system reboots. While the 10307 * right place to handle this issue is the given 10308 * driver, we do want to have a safe RAID driver ... 
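	 * so give the just-stopped arrays a moment to settle before the reboot
	 * proceeds.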
10309 */ 10310 if (need_delay) 10311 msleep(1000); 10312 10313 return NOTIFY_DONE; 10314 } 10315 10316 static struct notifier_block md_notifier = { 10317 .notifier_call = md_notify_reboot, 10318 .next = NULL, 10319 .priority = INT_MAX, /* before any real devices */ 10320 }; 10321 10322 static void md_geninit(void) 10323 { 10324 pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 10325 10326 proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops); 10327 } 10328 10329 static int __init md_init(void) 10330 { 10331 int ret = md_bitmap_init(); 10332 10333 if (ret) 10334 return ret; 10335 10336 ret = md_llbitmap_init(); 10337 if (ret) 10338 goto err_bitmap; 10339 10340 ret = -ENOMEM; 10341 md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); 10342 if (!md_wq) 10343 goto err_wq; 10344 10345 md_misc_wq = alloc_workqueue("md_misc", 0, 0); 10346 if (!md_misc_wq) 10347 goto err_misc_wq; 10348 10349 ret = __register_blkdev(MD_MAJOR, "md", md_probe); 10350 if (ret < 0) 10351 goto err_md; 10352 10353 ret = __register_blkdev(0, "mdp", md_probe); 10354 if (ret < 0) 10355 goto err_mdp; 10356 mdp_major = ret; 10357 10358 register_reboot_notifier(&md_notifier); 10359 raid_table_header = register_sysctl("dev/raid", raid_table); 10360 10361 md_geninit(); 10362 return 0; 10363 10364 err_mdp: 10365 unregister_blkdev(MD_MAJOR, "md"); 10366 err_md: 10367 destroy_workqueue(md_misc_wq); 10368 err_misc_wq: 10369 destroy_workqueue(md_wq); 10370 err_wq: 10371 md_llbitmap_exit(); 10372 err_bitmap: 10373 md_bitmap_exit(); 10374 return ret; 10375 } 10376 10377 static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) 10378 { 10379 struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 10380 struct md_rdev *rdev2, *tmp; 10381 int role, ret; 10382 10383 /* 10384 * If size is changed in another node then we need to 10385 * do resize as well. 10386 */ 10387 if (mddev->dev_sectors != le64_to_cpu(sb->size)) { 10388 ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size)); 10389 if (ret) 10390 pr_info("md-cluster: resize failed\n"); 10391 else if (md_bitmap_enabled(mddev, false)) 10392 mddev->bitmap_ops->update_sb(mddev->bitmap); 10393 } 10394 10395 /* Check for change of roles in the active devices */ 10396 rdev_for_each_safe(rdev2, tmp, mddev) { 10397 if (test_bit(Faulty, &rdev2->flags)) { 10398 if (test_bit(ClusterRemove, &rdev2->flags)) 10399 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 10400 continue; 10401 } 10402 10403 /* Check if the roles changed */ 10404 role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); 10405 10406 if (test_bit(Candidate, &rdev2->flags)) { 10407 if (role == MD_DISK_ROLE_FAULTY) { 10408 pr_info("md: Removing Candidate device %pg because add failed\n", 10409 rdev2->bdev); 10410 md_kick_rdev_from_array(rdev2); 10411 continue; 10412 } 10413 else 10414 clear_bit(Candidate, &rdev2->flags); 10415 } 10416 10417 if (role != rdev2->raid_disk) { 10418 /* 10419 * got activated except reshape is happening. 10420 */ 10421 if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE && 10422 !(le32_to_cpu(sb->feature_map) & 10423 MD_FEATURE_RESHAPE_ACTIVE) && 10424 !mddev->cluster_ops->resync_status_get(mddev)) { 10425 /* 10426 * -1 to make raid1_add_disk() set conf->fullsync 10427 * to 1. This could avoid skipping sync when the 10428 * remote node is down during resyncing. 
 */
				if ((le32_to_cpu(sb->feature_map)
				     & MD_FEATURE_RECOVERY_OFFSET))
					rdev2->saved_raid_disk = -1;
				else
					rdev2->saved_raid_disk = role;
				ret = remove_and_add_spares(mddev, rdev2);
				pr_info("Activated spare: %pg\n",
					rdev2->bdev);
				/* wake up mddev->thread here, so the array can
				 * perform resync with the newly activated disk */
				set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
				md_wakeup_thread(mddev->thread);
			}
			/* device faulty
			 * We just want to do the minimum to mark the disk
			 * as faulty. The recovery is performed by the
			 * one who initiated the error.
			 */
			if (role == MD_DISK_ROLE_FAULTY ||
			    role == MD_DISK_ROLE_JOURNAL) {
				md_error(mddev, rdev2);
				clear_bit(Blocked, &rdev2->flags);
			}
		}
	}

	if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) {
		ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
		if (ret)
			pr_warn("md: updating array disks failed. %d\n", ret);
	}

	/*
	 * Since mddev->delta_disks has already been updated in
	 * update_raid_disks, it is time to check reshape.
	 */
	if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
		/*
		 * reshape is happening in the remote node, we need to
		 * update reshape_position and call start_reshape.
		 */
		mddev->reshape_position = le64_to_cpu(sb->reshape_position);
		if (mddev->pers->update_reshape_pos)
			mddev->pers->update_reshape_pos(mddev);
		if (mddev->pers->start_reshape)
			mddev->pers->start_reshape(mddev);
	} else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
		   mddev->reshape_position != MaxSector &&
		   !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
		/* reshape is just done in another node. */
		mddev->reshape_position = MaxSector;
		if (mddev->pers->update_reshape_pos)
			mddev->pers->update_reshape_pos(mddev);
	}

	/* Finally set the event to be up to date */
	mddev->events = le64_to_cpu(sb->events);
}

static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
	int err;
	struct page *swapout = rdev->sb_page;
	struct mdp_superblock_1 *sb;

	/* Store the sb page of the rdev in the swapout temporary
	 * variable in case we err in the future
	 */
	rdev->sb_page = NULL;
	err = alloc_disk_sb(rdev);
	if (err == 0) {
		ClearPageUptodate(rdev->sb_page);
		rdev->sb_loaded = 0;
		err = super_types[mddev->major_version].
			load_super(rdev, NULL, mddev->minor_version);
	}
	if (err < 0) {
		pr_warn("%s: %d Could not reload rdev(%d) err: %d.
Restoring old values\n", 10509 __func__, __LINE__, rdev->desc_nr, err); 10510 if (rdev->sb_page) 10511 put_page(rdev->sb_page); 10512 rdev->sb_page = swapout; 10513 rdev->sb_loaded = 1; 10514 return err; 10515 } 10516 10517 sb = page_address(rdev->sb_page); 10518 /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET 10519 * is not set 10520 */ 10521 10522 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET)) 10523 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 10524 10525 /* The other node finished recovery, call spare_active to set 10526 * device In_sync and mddev->degraded 10527 */ 10528 if (rdev->recovery_offset == MaxSector && 10529 !test_bit(In_sync, &rdev->flags) && 10530 mddev->pers->spare_active(mddev)) 10531 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 10532 10533 put_page(swapout); 10534 return 0; 10535 } 10536 10537 void md_reload_sb(struct mddev *mddev, int nr) 10538 { 10539 struct md_rdev *rdev = NULL, *iter; 10540 int err; 10541 10542 /* Find the rdev */ 10543 rdev_for_each_rcu(iter, mddev) { 10544 if (iter->desc_nr == nr) { 10545 rdev = iter; 10546 break; 10547 } 10548 } 10549 10550 if (!rdev) { 10551 pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr); 10552 return; 10553 } 10554 10555 err = read_rdev(mddev, rdev); 10556 if (err < 0) 10557 return; 10558 10559 check_sb_changes(mddev, rdev); 10560 10561 /* Read all rdev's to update recovery_offset */ 10562 rdev_for_each_rcu(rdev, mddev) { 10563 if (!test_bit(Faulty, &rdev->flags)) 10564 read_rdev(mddev, rdev); 10565 } 10566 } 10567 EXPORT_SYMBOL(md_reload_sb); 10568 10569 #ifndef MODULE 10570 10571 /* 10572 * Searches all registered partitions for autorun RAID arrays 10573 * at boot time. 10574 */ 10575 10576 static DEFINE_MUTEX(detected_devices_mutex); 10577 static LIST_HEAD(all_detected_devices); 10578 struct detected_devices_node { 10579 struct list_head list; 10580 dev_t dev; 10581 }; 10582 10583 void md_autodetect_dev(dev_t dev) 10584 { 10585 struct detected_devices_node *node_detected_dev; 10586 10587 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL); 10588 if (node_detected_dev) { 10589 node_detected_dev->dev = dev; 10590 mutex_lock(&detected_devices_mutex); 10591 list_add_tail(&node_detected_dev->list, &all_detected_devices); 10592 mutex_unlock(&detected_devices_mutex); 10593 } 10594 } 10595 10596 void md_autostart_arrays(int part) 10597 { 10598 struct md_rdev *rdev; 10599 struct detected_devices_node *node_detected_dev; 10600 dev_t dev; 10601 int i_scanned, i_passed; 10602 10603 i_scanned = 0; 10604 i_passed = 0; 10605 10606 pr_info("md: Autodetecting RAID arrays.\n"); 10607 10608 mutex_lock(&detected_devices_mutex); 10609 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { 10610 i_scanned++; 10611 node_detected_dev = list_entry(all_detected_devices.next, 10612 struct detected_devices_node, list); 10613 list_del(&node_detected_dev->list); 10614 dev = node_detected_dev->dev; 10615 kfree(node_detected_dev); 10616 mutex_unlock(&detected_devices_mutex); 10617 rdev = md_import_device(dev,0, 90); 10618 mutex_lock(&detected_devices_mutex); 10619 if (IS_ERR(rdev)) 10620 continue; 10621 10622 if (test_bit(Faulty, &rdev->flags)) 10623 continue; 10624 10625 set_bit(AutoDetected, &rdev->flags); 10626 list_add(&rdev->same_set, &pending_raid_disks); 10627 i_passed++; 10628 } 10629 mutex_unlock(&detected_devices_mutex); 10630 10631 pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed); 10632 10633 
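	/*
	 * Try to assemble arrays from the devices collected above on
	 * pending_raid_disks.
	 */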
autorun_devices(part); 10634 } 10635 10636 #endif /* !MODULE */ 10637 10638 static __exit void md_exit(void) 10639 { 10640 struct mddev *mddev; 10641 int delay = 1; 10642 10643 unregister_blkdev(MD_MAJOR,"md"); 10644 unregister_blkdev(mdp_major, "mdp"); 10645 unregister_reboot_notifier(&md_notifier); 10646 unregister_sysctl_table(raid_table_header); 10647 10648 /* We cannot unload the modules while some process is 10649 * waiting for us in select() or poll() - wake them up 10650 */ 10651 md_unloading = 1; 10652 while (waitqueue_active(&md_event_waiters)) { 10653 /* not safe to leave yet */ 10654 wake_up(&md_event_waiters); 10655 msleep(delay); 10656 delay += delay; 10657 } 10658 remove_proc_entry("mdstat", NULL); 10659 10660 spin_lock(&all_mddevs_lock); 10661 list_for_each_entry(mddev, &all_mddevs, all_mddevs) { 10662 if (!mddev_get(mddev)) 10663 continue; 10664 spin_unlock(&all_mddevs_lock); 10665 export_array(mddev); 10666 mddev->ctime = 0; 10667 mddev->hold_active = 0; 10668 /* 10669 * As the mddev is now fully clear, mddev_put will schedule 10670 * the mddev for destruction by a workqueue, and the 10671 * destroy_workqueue() below will wait for that to complete. 10672 */ 10673 spin_lock(&all_mddevs_lock); 10674 mddev_put_locked(mddev); 10675 } 10676 spin_unlock(&all_mddevs_lock); 10677 10678 destroy_workqueue(md_misc_wq); 10679 destroy_workqueue(md_wq); 10680 md_bitmap_exit(); 10681 } 10682 10683 subsys_initcall(md_init); 10684 module_exit(md_exit) 10685 10686 static int get_ro(char *buffer, const struct kernel_param *kp) 10687 { 10688 return sprintf(buffer, "%d\n", start_readonly); 10689 } 10690 static int set_ro(const char *val, const struct kernel_param *kp) 10691 { 10692 return kstrtouint(val, 10, (unsigned int *)&start_readonly); 10693 } 10694 10695 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); 10696 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); 10697 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); 10698 module_param(create_on_open, bool, S_IRUSR|S_IWUSR); 10699 module_param(legacy_async_del_gendisk, bool, 0600); 10700 10701 MODULE_LICENSE("GPL"); 10702 MODULE_DESCRIPTION("MD RAID framework"); 10703 MODULE_ALIAS("md"); 10704 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); 10705