// SPDX-License-Identifier: GPL-2.0-or-later
/*
   md.c : Multiple Devices driver for Linux
     Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.


   Errors, Warnings, etc.
   Please use:
     pr_crit() for error conditions that risk data loss
     pr_err() for error conditions that are unexpected, like an IO error
         or internal inconsistency
     pr_warn() for error conditions that could have been predicted, like
         adding a device to an array when it has incompatible metadata
     pr_info() for interesting, very rare events, like an array starting
         or stopping, or resync starting or stopping
     pr_debug() for everything else.

*/

#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/badblocks.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/major.h>
#include <linux/module.h>
#include <linux/reboot.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/raid/detect.h>
#include <linux/slab.h>
#include <linux/percpu-refcount.h>
#include <linux/part_stat.h>

#include "md.h"
#include "md-bitmap.h"
#include "md-cluster.h"

static const char *action_name[NR_SYNC_ACTIONS] = {
	[ACTION_RESYNC]		= "resync",
	[ACTION_RECOVER]	= "recover",
	[ACTION_CHECK]		= "check",
	[ACTION_REPAIR]		= "repair",
	[ACTION_RESHAPE]	= "reshape",
	[ACTION_FROZEN]		= "frozen",
	[ACTION_IDLE]		= "idle",
};

static DEFINE_XARRAY(md_submodule);

static const struct kobj_type md_ktype;

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;

/*
 * This workqueue is used for sync_work to register new sync_thread, and for
 * del_work to remove rdev, and for event_work that is only set by dm-raid.
 *
 * Note that sync_work will grab reconfig_mutex, hence never flush this
 * workqueue with reconfig_mutex grabbed.
 */
static struct workqueue_struct *md_misc_wq;
struct workqueue_struct *md_bitmap_wq;

static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);
static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);
static void md_wakeup_thread_directly(struct md_thread __rcu *thread);

/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
/* Default safemode delay: 200 msec */
#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
/*
 * Current RAID-1,4,5,6,10 parallel reconstruction 'guaranteed speed limit'
 * is sysctl_speed_limit_min, 1000 KB/sec by default, so the extra system load
 * does not show up that much. Increase it if you want to have more guaranteed
 * speed. Note that the RAID driver will use the maximum bandwidth
 * sysctl_speed_limit_max, 200 MB/sec by default, if the IO subsystem is idle.
 *
 * Background sync IO speed control:
 *
 * - below speed min:
 *   no limit;
 * - above speed min and below speed max:
 *   a) if mddev is idle, then no limit;
 *   b) if mddev is busy handling normal IO, then limit inflight sync IO
 *      to sync_io_depth;
 * - above speed max:
 *   sync IO can't be issued;
 *
 * Following configurations can be changed via /proc/sys/dev/raid/ for the
 * whole system, or via /sys/block/mdX/md/ for one array.
 */
static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static int sysctl_sync_io_depth = 32;

static int speed_min(struct mddev *mddev)
{
	return mddev->sync_speed_min ?
		mddev->sync_speed_min : sysctl_speed_limit_min;
}

static int speed_max(struct mddev *mddev)
{
	return mddev->sync_speed_max ?
		mddev->sync_speed_max : sysctl_speed_limit_max;
}

static int sync_io_depth(struct mddev *mddev)
{
	return mddev->sync_io_depth ?
	       mddev->sync_io_depth : sysctl_sync_io_depth;
}

static void rdev_uninit_serial(struct md_rdev *rdev)
{
	if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
		return;

	kvfree(rdev->serial);
	rdev->serial = NULL;
}

static void rdevs_uninit_serial(struct mddev *mddev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev)
		rdev_uninit_serial(rdev);
}

static int rdev_init_serial(struct md_rdev *rdev)
{
	/* serial_nums equals BARRIER_BUCKETS_NR */
	int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t))));
	struct serial_in_rdev *serial = NULL;

	if (test_bit(CollisionCheck, &rdev->flags))
		return 0;

	serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums,
			  GFP_KERNEL);
	if (!serial)
		return -ENOMEM;

	for (i = 0; i < serial_nums; i++) {
		struct serial_in_rdev *serial_tmp = &serial[i];

		spin_lock_init(&serial_tmp->serial_lock);
		serial_tmp->serial_rb = RB_ROOT_CACHED;
		init_waitqueue_head(&serial_tmp->serial_io_wait);
	}

	rdev->serial = serial;
	set_bit(CollisionCheck, &rdev->flags);

	return 0;
}

static int rdevs_init_serial(struct mddev *mddev)
{
	struct md_rdev *rdev;
	int ret = 0;

	rdev_for_each(rdev, mddev) {
		ret = rdev_init_serial(rdev);
		if (ret)
			break;
	}

	/* Free all resources if the pool does not exist */
	if (ret && !mddev->serial_info_pool)
		rdevs_uninit_serial(mddev);

	return ret;
}

/*
 * rdev needs to enable serial handling if it meets the conditions:
 * 1. it is a multi-queue device flagged with writemostly.
 * 2. the write-behind mode is enabled.
 */
static int rdev_need_serial(struct md_rdev *rdev)
{
	return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 &&
		rdev->bdev->bd_disk->queue->nr_hw_queues != 1 &&
		test_bit(WriteMostly, &rdev->flags));
}

/*
 * Init resources for rdev(s), then create serial_info_pool if:
 * 1. rdev is the first device which returns true from rdev_need_serial.
 * 2. rdev is NULL, meaning we want to enable serialization for all rdevs.
 */
void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
{
	int ret = 0;

	if (rdev && !rdev_need_serial(rdev) &&
	    !test_bit(CollisionCheck, &rdev->flags))
		return;

	if (!rdev)
		ret = rdevs_init_serial(mddev);
	else
		ret = rdev_init_serial(rdev);
	if (ret)
		return;

	if (mddev->serial_info_pool == NULL) {
		/*
		 * already in memalloc noio context by
		 * mddev_suspend()
		 */
		mddev->serial_info_pool =
			mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
						    sizeof(struct serial_info));
		if (!mddev->serial_info_pool) {
			rdevs_uninit_serial(mddev);
			pr_err("can't alloc memory pool for serialization\n");
		}
	}
}

/*
 * Free resources from rdev(s), and destroy serial_info_pool under conditions:
 * 1. rdev is the last device flagged with CollisionCheck.
 * 2. when bitmap is destroyed while policy is not enabled.
 * 3. for disable policy, the pool is destroyed only when no rdev needs it.
 */
void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
{
	if (rdev && !test_bit(CollisionCheck, &rdev->flags))
		return;

	if (mddev->serial_info_pool) {
		struct md_rdev *temp;
		int num = 0; /* used to track if other rdevs need the pool */

		rdev_for_each(temp, mddev) {
			if (!rdev) {
				if (!mddev->serialize_policy ||
				    !rdev_need_serial(temp))
					rdev_uninit_serial(temp);
				else
					num++;
			} else if (temp != rdev &&
				   test_bit(CollisionCheck, &temp->flags))
				num++;
		}

		if (rdev)
			rdev_uninit_serial(rdev);

		if (num)
			pr_info("The mempool could be used by other devices\n");
		else {
			mempool_destroy(mddev->serial_info_pool);
			mddev->serial_info_pool = NULL;
		}
	}
}

static struct ctl_table_header *raid_table_header;

static const struct ctl_table raid_table[] = {
	{
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "sync_io_depth",
		.data		= &sysctl_sync_io_depth,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
};

static int start_readonly;

/*
 * The original mechanism for creating an md device is to create
 * a device node in /dev and to open it. This causes races with device-close.
 * The preferred method is to write to the "new_array" module parameter.
 * This can avoid races.
 * Setting create_on_open to false disables the original mechanism
 * so all the races disappear.
 */
static bool create_on_open = true;

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
void md_new_event(void)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);

/*
 * Enables iteration over all existing md arrays.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);

static bool is_md_suspended(struct mddev *mddev)
{
	return percpu_ref_is_dying(&mddev->active_io);
}

/* Rather than calling directly into the personality make_request function,
 * IO requests come here first so that we can check if the device is
 * being suspended pending a reconfiguration.
 * We hold a refcount over the call to ->make_request. By the time that
 * call has finished, the bio has been linked into some internal structure
 * and so is visible to ->quiesce(), so we don't need the refcount any more.
 */
static bool is_suspended(struct mddev *mddev, struct bio *bio)
{
	if (is_md_suspended(mddev))
		return true;
	if (bio_data_dir(bio) != WRITE)
		return false;
	if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi))
		return false;
	if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi))
		return false;
	if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo))
		return false;
	return true;
}

bool md_handle_request(struct mddev *mddev, struct bio *bio)
{
check_suspended:
	if (is_suspended(mddev, bio)) {
		DEFINE_WAIT(__wait);
		/* Bail out if REQ_NOWAIT is set for the bio */
		if (bio->bi_opf & REQ_NOWAIT) {
			bio_wouldblock_error(bio);
			return true;
		}
		for (;;) {
			prepare_to_wait(&mddev->sb_wait, &__wait,
					TASK_UNINTERRUPTIBLE);
			if (!is_suspended(mddev, bio))
				break;
			schedule();
		}
		finish_wait(&mddev->sb_wait, &__wait);
	}
	if (!percpu_ref_tryget_live(&mddev->active_io))
		goto check_suspended;

	if (!mddev->pers->make_request(mddev, bio)) {
		percpu_ref_put(&mddev->active_io);
		if (!mddev->gendisk && mddev->pers->prepare_suspend)
			return false;
		goto check_suspended;
	}

	percpu_ref_put(&mddev->active_io);
	return true;
}
EXPORT_SYMBOL(md_handle_request);

static void md_submit_bio(struct bio *bio)
{
	const int rw = bio_data_dir(bio);
	struct mddev *mddev = bio->bi_bdev->bd_disk->private_data;

	if (mddev == NULL || mddev->pers == NULL) {
		bio_io_error(bio);
		return;
	}

	if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
		bio_io_error(bio);
		return;
	}

	bio = bio_split_to_limits(bio);
	if (!bio)
		return;

	if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) {
		if (bio_sectors(bio) != 0)
			bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
		return;
	}

	/* bio could be mergeable after passing to underlayer */
	bio->bi_opf &= ~REQ_NOMERGE;

	md_handle_request(mddev, bio);
}

/*
 * Make sure no new requests are submitted to the device, and any requests that
 * have been submitted are completely handled.
 */
int mddev_suspend(struct mddev *mddev, bool interruptible)
{
	int err = 0;

	/*
	 * Holding reconfig_mutex while waiting for normal io to finish would
	 * deadlock, because another context can't update the super_block, and
	 * normal io may rely on updating the super_block.
	 */
	lockdep_assert_not_held(&mddev->reconfig_mutex);

	if (interruptible)
		err = mutex_lock_interruptible(&mddev->suspend_mutex);
	else
		mutex_lock(&mddev->suspend_mutex);
	if (err)
		return err;

	if (mddev->suspended) {
		WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
		mutex_unlock(&mddev->suspend_mutex);
		return 0;
	}

	percpu_ref_kill(&mddev->active_io);
	if (interruptible)
		err = wait_event_interruptible(mddev->sb_wait,
				percpu_ref_is_zero(&mddev->active_io));
	else
		wait_event(mddev->sb_wait,
				percpu_ref_is_zero(&mddev->active_io));
	if (err) {
		percpu_ref_resurrect(&mddev->active_io);
		mutex_unlock(&mddev->suspend_mutex);
		return err;
	}

	/*
	 * For raid456, io might be waiting for reshape to make progress,
	 * allow new reshape to start while waiting for io to be done to
	 * prevent deadlock.
	 */
	WRITE_ONCE(mddev->suspended, mddev->suspended + 1);

	/* restrict memory reclaim I/O while the raid array is suspended */
	mddev->noio_flag = memalloc_noio_save();

	mutex_unlock(&mddev->suspend_mutex);
	return 0;
}
EXPORT_SYMBOL_GPL(mddev_suspend);

static void __mddev_resume(struct mddev *mddev, bool recovery_needed)
{
	lockdep_assert_not_held(&mddev->reconfig_mutex);

	mutex_lock(&mddev->suspend_mutex);
	WRITE_ONCE(mddev->suspended, mddev->suspended - 1);
	if (mddev->suspended) {
		mutex_unlock(&mddev->suspend_mutex);
		return;
	}

	/* entered the memalloc scope from mddev_suspend() */
	memalloc_noio_restore(mddev->noio_flag);

	percpu_ref_resurrect(&mddev->active_io);
	wake_up(&mddev->sb_wait);

	if (recovery_needed)
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */

	mutex_unlock(&mddev->suspend_mutex);
}

void mddev_resume(struct mddev *mddev)
{
	return __mddev_resume(mddev, true);
}
EXPORT_SYMBOL_GPL(mddev_resume);

/* sync bdev before setting device to readonly or stopping raid */
static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_num)
{
	mutex_lock(&mddev->open_mutex);
	if (mddev->pers && atomic_read(&mddev->openers) > opener_num) {
		mutex_unlock(&mddev->open_mutex);
		return -EBUSY;
	}
	if (test_and_set_bit(MD_CLOSING, &mddev->flags)) {
		mutex_unlock(&mddev->open_mutex);
		return -EBUSY;
	}
	mutex_unlock(&mddev->open_mutex);

	sync_blockdev(mddev->gendisk->part0);
	return 0;
}

/*
 * The only difference from bio_chain_endio() is that the current
 * bi_status of the bio does not affect the bi_status of the parent.
 */
static void md_end_flush(struct bio *bio)
{
	struct bio *parent = bio->bi_private;

	/*
	 * If any flush io error before the power failure,
	 * disk data may be lost.
	 */
	if (bio->bi_status)
		pr_err("md: %pg flush io error %d\n", bio->bi_bdev,
		       blk_status_to_errno(bio->bi_status));

	bio_put(bio);
	bio_endio(parent);
}

bool md_flush_request(struct mddev *mddev, struct bio *bio)
{
	struct md_rdev *rdev;
	struct bio *new;

	/*
	 * md_flush_request() should be called under md_handle_request() and
	 * 'active_io' is already grabbed. Hence it's safe to get rdev directly
	 * without rcu protection.
	 */
	WARN_ON(percpu_ref_is_zero(&mddev->active_io));

	rdev_for_each(rdev, mddev) {
		if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
			continue;

		new = bio_alloc_bioset(rdev->bdev, 0,
				       REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO,
				       &mddev->bio_set);
		new->bi_private = bio;
		new->bi_end_io = md_end_flush;
		bio_inc_remaining(bio);
		submit_bio(new);
	}

	if (bio_sectors(bio) == 0) {
		bio_endio(bio);
		return true;
	}

	bio->bi_opf &= ~REQ_PREFLUSH;
	return false;
}
EXPORT_SYMBOL(md_flush_request);

static inline struct mddev *mddev_get(struct mddev *mddev)
{
	lockdep_assert_held(&all_mddevs_lock);

	if (test_bit(MD_DELETED, &mddev->flags))
		return NULL;
	atomic_inc(&mddev->active);
	return mddev;
}

static void mddev_delayed_delete(struct work_struct *ws);

static void __mddev_put(struct mddev *mddev)
{
	if (mddev->raid_disks || !list_empty(&mddev->disks) ||
	    mddev->ctime || mddev->hold_active)
		return;

	/*
	 * If the array is freed by stopping the array, MD_DELETED is set by
	 * do_md_stop(). MD_DELETED is still set here in case mddev is freed
	 * directly by closing a mddev that was created by create_on_open.
	 */
	set_bit(MD_DELETED, &mddev->flags);
	/*
	 * Call queue_work inside the spinlock so that flush_workqueue() after
	 * mddev_find will succeed in waiting for the work to be done.
	 */
	queue_work(md_misc_wq, &mddev->del_work);
}

static void mddev_put_locked(struct mddev *mddev)
{
	if (atomic_dec_and_test(&mddev->active))
		__mddev_put(mddev);
}

void mddev_put(struct mddev *mddev)
{
	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
		return;

	__mddev_put(mddev);
	spin_unlock(&all_mddevs_lock);
}

static void md_safemode_timeout(struct timer_list *t);
static void md_start_sync(struct work_struct *ws);

static void active_io_release(struct percpu_ref *ref)
{
	struct mddev *mddev = container_of(ref, struct mddev, active_io);

	wake_up(&mddev->sb_wait);
}

static void no_op(struct percpu_ref *r) {}

int mddev_init(struct mddev *mddev)
{
	if (percpu_ref_init(&mddev->active_io, active_io_release,
			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
		return -ENOMEM;

	if (percpu_ref_init(&mddev->writes_pending, no_op,
			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
		percpu_ref_exit(&mddev->active_io);
		return -ENOMEM;
	}

	/* We want to start with the refcount at zero */
	percpu_ref_put(&mddev->writes_pending);

	mutex_init(&mddev->open_mutex);
	mutex_init(&mddev->reconfig_mutex);
	mutex_init(&mddev->suspend_mutex);
	mutex_init(&mddev->bitmap_info.mutex);
	INIT_LIST_HEAD(&mddev->disks);
	INIT_LIST_HEAD(&mddev->all_mddevs);
	INIT_LIST_HEAD(&mddev->deleting);
	timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
	atomic_set(&mddev->active, 1);
	atomic_set(&mddev->openers, 0);
	atomic_set(&mddev->sync_seq, 0);
	spin_lock_init(&mddev->lock);
	init_waitqueue_head(&mddev->sb_wait);
	init_waitqueue_head(&mddev->recovery_wait);
	mddev->reshape_position = MaxSector;
	mddev->reshape_backwards = 0;
	mddev->last_sync_action = ACTION_IDLE;
	mddev->resync_min = 0;
	mddev->resync_max = MaxSector;
	mddev->level = LEVEL_NONE;
	mddev_set_bitmap_ops(mddev);

	INIT_WORK(&mddev->sync_work, md_start_sync);
	INIT_WORK(&mddev->del_work, mddev_delayed_delete);

	return 0;
}
EXPORT_SYMBOL_GPL(mddev_init);

void mddev_destroy(struct mddev *mddev)
{
	percpu_ref_exit(&mddev->active_io);
	percpu_ref_exit(&mddev->writes_pending);
}
EXPORT_SYMBOL_GPL(mddev_destroy);

static struct mddev *mddev_find_locked(dev_t unit)
{
	struct mddev *mddev;

	list_for_each_entry(mddev, &all_mddevs, all_mddevs)
		if (mddev->unit == unit)
			return mddev;

	return NULL;
}

/* find an unused unit number */
static dev_t mddev_alloc_unit(void)
{
	static int next_minor = 512;
	int start = next_minor;
	bool is_free = 0;
	dev_t dev = 0;

	while (!is_free) {
		dev = MKDEV(MD_MAJOR, next_minor);
		next_minor++;
		if (next_minor > MINORMASK)
			next_minor = 0;
		if (next_minor == start)
			return 0;	/* Oh dear, all in use. */
		is_free = !mddev_find_locked(dev);
	}

	return dev;
}

static struct mddev *mddev_alloc(dev_t unit)
{
	struct mddev *new;
	int error;

	if (unit && MAJOR(unit) != MD_MAJOR)
		unit &= ~((1 << MdpMinorShift) - 1);

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return ERR_PTR(-ENOMEM);

	error = mddev_init(new);
	if (error)
		goto out_free_new;

	spin_lock(&all_mddevs_lock);
	if (unit) {
		error = -EEXIST;
		if (mddev_find_locked(unit))
			goto out_destroy_new;
		new->unit = unit;
		if (MAJOR(unit) == MD_MAJOR)
			new->md_minor = MINOR(unit);
		else
			new->md_minor = MINOR(unit) >> MdpMinorShift;
		new->hold_active = UNTIL_IOCTL;
	} else {
		error = -ENODEV;
		new->unit = mddev_alloc_unit();
		if (!new->unit)
			goto out_destroy_new;
		new->md_minor = MINOR(new->unit);
		new->hold_active = UNTIL_STOP;
	}

	list_add(&new->all_mddevs, &all_mddevs);
	spin_unlock(&all_mddevs_lock);
	return new;

out_destroy_new:
	spin_unlock(&all_mddevs_lock);
	mddev_destroy(new);
out_free_new:
	kfree(new);
	return ERR_PTR(error);
}

static void mddev_free(struct mddev *mddev)
{
	spin_lock(&all_mddevs_lock);
	list_del(&mddev->all_mddevs);
	spin_unlock(&all_mddevs_lock);

	mddev_destroy(mddev);
	kfree(mddev);
}

static const struct attribute_group md_redundancy_group;

void mddev_unlock(struct mddev *mddev)
{
	struct md_rdev *rdev;
	struct md_rdev *tmp;
	LIST_HEAD(delete);

	if (!list_empty(&mddev->deleting))
		list_splice_init(&mddev->deleting, &delete);

	if (mddev->to_remove) {
		/* These cannot be removed under reconfig_mutex as
		 * an access to the files will try to take reconfig_mutex
		 * while holding the file unremovable, which leads to
		 * a deadlock.
		 * So set sysfs_active while the removal is happening,
		 * and anything else which might set ->to_remove or may
		 * otherwise change the sysfs namespace will fail with
		 * -EBUSY if sysfs_active is still set.
		 * We set sysfs_active under reconfig_mutex and elsewhere
		 * test it under the same mutex to ensure its correct value
		 * is seen.
		 */
		const struct attribute_group *to_remove = mddev->to_remove;
		mddev->to_remove = NULL;
		mddev->sysfs_active = 1;
		mutex_unlock(&mddev->reconfig_mutex);

		if (mddev->kobj.sd) {
			if (to_remove != &md_redundancy_group)
				sysfs_remove_group(&mddev->kobj, to_remove);
			if (mddev->pers == NULL ||
			    mddev->pers->sync_request == NULL) {
				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
				if (mddev->sysfs_action)
					sysfs_put(mddev->sysfs_action);
				if (mddev->sysfs_completed)
					sysfs_put(mddev->sysfs_completed);
				if (mddev->sysfs_degraded)
					sysfs_put(mddev->sysfs_degraded);
				mddev->sysfs_action = NULL;
				mddev->sysfs_completed = NULL;
				mddev->sysfs_degraded = NULL;
			}
		}
		mddev->sysfs_active = 0;
	} else
		mutex_unlock(&mddev->reconfig_mutex);

	md_wakeup_thread(mddev->thread);
	wake_up(&mddev->sb_wait);

	list_for_each_entry_safe(rdev, tmp, &delete, same_set) {
		list_del_init(&rdev->same_set);
		kobject_del(&rdev->kobj);
		export_rdev(rdev, mddev);
	}

	/* Call del_gendisk after releasing reconfig_mutex to avoid a
	 * deadlock (e.g. calling del_gendisk under the lock while an
	 * access to sysfs files waits for the lock).
	 * Also, MD_DELETED is only used for md raid, where it is set in
	 * do_md_stop. dm raid only uses md_stop to stop, so dm raid
	 * doesn't need to check MD_DELETED when getting the reconfig lock.
	 */
	if (test_bit(MD_DELETED, &mddev->flags))
		del_gendisk(mddev->gendisk);
}
EXPORT_SYMBOL_GPL(mddev_unlock);

struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
{
	struct md_rdev *rdev;

	rdev_for_each_rcu(rdev, mddev)
		if (rdev->desc_nr == nr)
			return rdev;

	return NULL;
}
EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);

static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev)
		if (rdev->bdev->bd_dev == dev)
			return rdev;

	return NULL;
}

struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;

	rdev_for_each_rcu(rdev, mddev)
		if (rdev->bdev->bd_dev == dev)
			return rdev;

	return NULL;
}
EXPORT_SYMBOL_GPL(md_find_rdev_rcu);

static struct md_personality *get_pers(int level, char *clevel)
{
	struct md_personality *ret = NULL;
	struct md_submodule_head *head;
	unsigned long i;

	xa_lock(&md_submodule);
	xa_for_each(&md_submodule, i, head) {
		if (head->type != MD_PERSONALITY)
			continue;
		if ((level != LEVEL_NONE && head->id == level) ||
		    !strcmp(head->name, clevel)) {
			if (try_module_get(head->owner))
				ret = (void *)head;
			break;
		}
	}
	xa_unlock(&md_submodule);

	if (!ret) {
		if (level != LEVEL_NONE)
			pr_warn("md: personality for level %d is not loaded!\n",
				level);
		else
			pr_warn("md: personality for level %s is not loaded!\n",
				clevel);
	}

	return ret;
}

static void put_pers(struct md_personality *pers)
{
	module_put(pers->head.owner);
}

/* return the offset of the super block in 512byte sectors */
static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
{
	return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev));
}

static int alloc_disk_sb(struct md_rdev *rdev)
{
	rdev->sb_page = alloc_page(GFP_KERNEL);
	if (!rdev->sb_page)
		return -ENOMEM;
	return 0;
}

void md_rdev_clear(struct md_rdev *rdev)
{
	if (rdev->sb_page) {
		put_page(rdev->sb_page);
		rdev->sb_loaded = 0;
		rdev->sb_page = NULL;
		rdev->sb_start = 0;
		rdev->sectors = 0;
	}
	if (rdev->bb_page) {
		put_page(rdev->bb_page);
		rdev->bb_page = NULL;
	}
	badblocks_exit(&rdev->badblocks);
}
EXPORT_SYMBOL_GPL(md_rdev_clear);

static void super_written(struct bio *bio)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	if (bio->bi_status) {
		pr_err("md: %s gets error=%d\n", __func__,
		       blk_status_to_errno(bio->bi_status));
		md_error(mddev, rdev);
		if (!test_bit(Faulty, &rdev->flags)
		    && (bio->bi_opf & MD_FAILFAST)) {
			set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
			set_bit(LastDev, &rdev->flags);
		}
	} else
		clear_bit(LastDev, &rdev->flags);

	bio_put(bio);

	rdev_dec_pending(rdev, mddev);

	if (atomic_dec_and_test(&mddev->pending_writes))
		wake_up(&mddev->sb_wait);
}

void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
		    sector_t sector, int size, struct page *page)
{
	/* write first size bytes of page to sector of rdev
	 * Increment mddev->pending_writes before returning
	 * and decrement it on completion, waking up sb_wait
	 * if zero is reached.
	 * If an error occurred, call md_error
	 */
	struct bio *bio;

	if (!page)
		return;

	if (test_bit(Faulty, &rdev->flags))
		return;

	bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev,
			       1,
			       REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_META
				   | REQ_PREFLUSH | REQ_FUA,
			       GFP_NOIO, &mddev->sync_set);

	atomic_inc(&rdev->nr_pending);

	bio->bi_iter.bi_sector = sector;
	__bio_add_page(bio, page, size, 0);
	bio->bi_private = rdev;
	bio->bi_end_io = super_written;

	if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
	    test_bit(FailFast, &rdev->flags) &&
	    !test_bit(LastDev, &rdev->flags))
		bio->bi_opf |= MD_FAILFAST;

	atomic_inc(&mddev->pending_writes);
	submit_bio(bio);
}

int md_super_wait(struct mddev *mddev)
{
	/* wait for all superblock writes that were scheduled to complete */
	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes) == 0);
	if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
		return -EAGAIN;
	return 0;
}

int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
		 struct page *page, blk_opf_t opf, bool metadata_op)
{
	struct bio bio;
	struct bio_vec bvec;

	if (metadata_op && rdev->meta_bdev)
		bio_init(&bio, rdev->meta_bdev, &bvec, 1, opf);
	else
		bio_init(&bio, rdev->bdev, &bvec, 1, opf);

	if (metadata_op)
		bio.bi_iter.bi_sector = sector + rdev->sb_start;
	else if (rdev->mddev->reshape_position != MaxSector &&
		 (rdev->mddev->reshape_backwards ==
		  (sector >= rdev->mddev->reshape_position)))
		bio.bi_iter.bi_sector = sector + rdev->new_data_offset;
	else
		bio.bi_iter.bi_sector = sector + rdev->data_offset;
	__bio_add_page(&bio, page, size, 0);

	submit_bio_wait(&bio);

	return !bio.bi_status;
}
EXPORT_SYMBOL_GPL(sync_page_io);

static int read_disk_sb(struct md_rdev *rdev, int size)
{
	if (rdev->sb_loaded)
		return 0;

	if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true))
		goto fail;
	rdev->sb_loaded = 1;
	return 0;

fail:
	pr_err("md: disabled device %pg, could not read superblock.\n",
	       rdev->bdev);
	return -EINVAL;
}

static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	return sb1->set_uuid0 == sb2->set_uuid0 &&
		sb1->set_uuid1 == sb2->set_uuid1 &&
		sb1->set_uuid2 == sb2->set_uuid2 &&
		sb1->set_uuid3 == sb2->set_uuid3;
}

static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	int ret;
	mdp_super_t *tmp1, *tmp2;

	tmp1 = kmalloc(sizeof(*tmp1), GFP_KERNEL);
	tmp2 = kmalloc(sizeof(*tmp2), GFP_KERNEL);

	if (!tmp1 || !tmp2) {
		ret = 0;
		goto abort;
	}

	*tmp1 = *sb1;
	*tmp2 = *sb2;

	/*
	 * nr_disks is not constant
	 */
	tmp1->nr_disks = 0;
	tmp2->nr_disks = 0;

	ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
abort:
	kfree(tmp1);
	kfree(tmp2);
	return ret;
}

static u32 md_csum_fold(u32 csum)
{
	csum = (csum & 0xffff) + (csum >> 16);
	return (csum & 0xffff) + (csum >> 16);
}

static unsigned int calc_sb_csum(mdp_super_t *sb)
{
	u64 newcsum = 0;
	u32 *sb32 = (u32 *)sb;
	int i;
	unsigned int disk_csum, csum;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;

	for (i = 0; i < MD_SB_BYTES/4 ; i++)
		newcsum += sb32[i];
	csum = (newcsum & 0xffffffff) + (newcsum >> 32);

#ifdef CONFIG_ALPHA
	/* This used to use csum_partial, which was wrong for several
	 * reasons including that different results are returned on
	 * different architectures. It isn't critical that we get exactly
	 * the same return value as before (we always csum_fold before
	 * testing, and that removes any differences). However as we
	 * know that csum_partial always returned a 16bit value on
	 * alphas, do a fold to maximise conformity to previous behaviour.
	 */
	sb->sb_csum = md_csum_fold(disk_csum);
#else
	sb->sb_csum = disk_csum;
#endif
	return csum;
}

/*
 * Handle superblock details.
 * We want to be able to handle multiple superblock formats
 * so we have a common interface to them all, and an array of
 * different handlers.
 * We rely on user-space to write the initial superblock, and support
 * reading and updating of superblocks.
 * Interface methods are:
 *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
 *      loads and validates a superblock on dev.
 *      if refdev != NULL, compare superblocks on both devices
 *    Return:
 *      0 - dev has a superblock that is compatible with refdev
 *      1 - dev has a superblock that is compatible and newer than refdev
 *          so dev should be used as the refdev in future
 *     -EINVAL superblock incompatible or invalid
 *     -othererror e.g. -EIO
 *
 *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
 *      Verify that dev is acceptable into mddev.
 *       The first time, mddev->raid_disks will be 0, and data from
 *       dev should be merged in.  Subsequent calls check that dev
 *       is new enough.  Return 0 or -EINVAL
 *
 *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
 *     Update the superblock for rdev with data in mddev
 *     This does not write to disc.
 *
 */

struct super_type  {
	char		    *name;
	struct module	    *owner;
	int		    (*load_super)(struct md_rdev *rdev,
					  struct md_rdev *refdev,
					  int minor_version);
	int		    (*validate_super)(struct mddev *mddev,
					      struct md_rdev *freshest,
					      struct md_rdev *rdev);
	void		    (*sync_super)(struct mddev *mddev,
					  struct md_rdev *rdev);
	unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
						sector_t num_sectors);
	int		    (*allow_new_offset)(struct md_rdev *rdev,
						unsigned long long new_offset);
};

/*
 * Check that the given mddev has no bitmap.
 *
 * This function is called from the run method of all personalities that do not
 * support bitmaps. It prints an error message and returns non-zero if mddev
 * has a bitmap. Otherwise, it returns 0.
 *
 */
int md_check_no_bitmap(struct mddev *mddev)
{
	if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
		return 0;
	pr_warn("%s: bitmaps are not supported for %s\n",
		mdname(mddev), mddev->pers->head.name);
	return 1;
}
EXPORT_SYMBOL(md_check_no_bitmap);

/*
 * load_super for 0.90.0
 */
static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	mdp_super_t *sb;
	int ret;
	bool spare_disk = true;

	/*
	 * Calculate the position of the superblock (512byte sectors),
	 * it's at the end of the disk.
	 *
	 * It also happens to be a multiple of 4Kb.
	 */
	rdev->sb_start = calc_dev_sboffset(rdev);

	ret = read_disk_sb(rdev, MD_SB_BYTES);
	if (ret)
		return ret;

	ret = -EINVAL;

	sb = page_address(rdev->sb_page);

	if (sb->md_magic != MD_SB_MAGIC) {
		pr_warn("md: invalid raid superblock magic on %pg\n",
			rdev->bdev);
		goto abort;
	}

	if (sb->major_version != 0 ||
	    sb->minor_version < 90 ||
	    sb->minor_version > 91) {
		pr_warn("Bad version number %d.%d on %pg\n",
			sb->major_version, sb->minor_version, rdev->bdev);
		goto abort;
	}

	if (sb->raid_disks <= 0)
		goto abort;

	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
		pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev);
		goto abort;
	}

	rdev->preferred_minor = sb->md_minor;
	rdev->data_offset = 0;
	rdev->new_data_offset = 0;
	rdev->sb_size = MD_SB_BYTES;
	rdev->badblocks.shift = -1;

	rdev->desc_nr = sb->this_disk.number;

	/* not spare disk */
	if (rdev->desc_nr >= 0 && rdev->desc_nr < MD_SB_DISKS &&
	    sb->disks[rdev->desc_nr].state & ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
		spare_disk = false;

	if (!refdev) {
		if (!spare_disk)
			ret = 1;
		else
			ret = 0;
	} else {
		__u64 ev1, ev2;
		mdp_super_t *refsb = page_address(refdev->sb_page);

		if (!md_uuid_equal(refsb, sb)) {
			pr_warn("md: %pg has different UUID to %pg\n",
				rdev->bdev, refdev->bdev);
			goto abort;
		}
		if (!md_sb_equal(refsb, sb)) {
			pr_warn("md: %pg has same UUID but different superblock to %pg\n",
				rdev->bdev, refdev->bdev);
			goto abort;
		}
		ev1 = md_event(sb);
		ev2 = md_event(refsb);

		if (!spare_disk && ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	rdev->sectors = rdev->sb_start;
	/* Limit to 4TB as metadata cannot record more than that.
	 * (not needed for Linear and RAID0 as metadata doesn't
	 * record this size)
	 */
	if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1)
		rdev->sectors = (sector_t)(2ULL << 32) - 2;

	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
		/* "this cannot possibly happen" ... */
		ret = -EINVAL;

abort:
	return ret;
}

static u64 md_bitmap_events_cleared(struct mddev *mddev)
{
	struct md_bitmap_stats stats;
	int err;

	err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
	if (err)
		return 0;

	return stats.events_cleared;
}

/*
 * validate_super for 0.90.0
 * note: we are not using "freshest" for 0.9 superblock
 */
static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
{
	mdp_disk_t *desc;
	mdp_super_t *sb = page_address(rdev->sb_page);
	__u64 ev1 = md_event(sb);

	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(Bitmap_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);

	if (mddev->raid_disks == 0) {
		mddev->major_version = 0;
		mddev->minor_version = sb->minor_version;
		mddev->patch_version = sb->patch_version;
		mddev->external = 0;
		mddev->chunk_sectors = sb->chunk_size >> 9;
		mddev->ctime = sb->ctime;
		mddev->utime = sb->utime;
		mddev->level = sb->level;
		mddev->clevel[0] = 0;
		mddev->layout = sb->layout;
		mddev->raid_disks = sb->raid_disks;
		mddev->dev_sectors = ((sector_t)sb->size) * 2;
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
		/* bitmap can use 60 K after the 4K superblocks */
		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
		mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
		mddev->reshape_backwards = 0;

		if (mddev->minor_version >= 91) {
			mddev->reshape_position = sb->reshape_position;
			mddev->delta_disks = sb->delta_disks;
			mddev->new_level = sb->new_level;
			mddev->new_layout = sb->new_layout;
			mddev->new_chunk_sectors = sb->new_chunk >> 9;
			if (mddev->delta_disks < 0)
				mddev->reshape_backwards = 1;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}
		if (mddev->level == 0)
			mddev->layout = -1;

		if (sb->state & (1<<MD_SB_CLEAN))
			mddev->resync_offset = MaxSector;
		else {
			if (sb->events_hi == sb->cp_events_hi &&
			    sb->events_lo == sb->cp_events_lo) {
				mddev->resync_offset = sb->resync_offset;
			} else
				mddev->resync_offset = 0;
		}

		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
		memcpy(mddev->uuid+12, &sb->set_uuid3, 4);

		mddev->max_disks = MD_SB_DISKS;

		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
		    mddev->bitmap_info.file == NULL) {
			mddev->bitmap_info.offset =
				mddev->bitmap_info.default_offset;
			mddev->bitmap_info.space =
				mddev->bitmap_info.default_space;
		}

	} else if (mddev->pers == NULL) {
		/* Insist on good event counter while assembling, except
		 * for spares (which don't need an event count) */
		++ev1;
		if (sb->disks[rdev->desc_nr].state & (
			    (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
			if (ev1 < mddev->events)
				return -EINVAL;
	} else if (mddev->bitmap) {
		/* if adding to array with a bitmap, then we can accept an
		 * older device ... but not too old.
		 */
		if (ev1 < md_bitmap_events_cleared(mddev))
			return 0;
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}

	desc = sb->disks + rdev->desc_nr;

	if (desc->state & (1<<MD_DISK_FAULTY))
		set_bit(Faulty, &rdev->flags);
	else if (desc->state & (1<<MD_DISK_SYNC)) {
		set_bit(In_sync, &rdev->flags);
		rdev->raid_disk = desc->raid_disk;
		rdev->saved_raid_disk = desc->raid_disk;
	} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
		/* active but not in sync implies recovery up to
		 * reshape position.  We don't know exactly where
		 * that is, so set to zero for now
		 */
		if (mddev->minor_version >= 91) {
			rdev->recovery_offset = 0;
			rdev->raid_disk = desc->raid_disk;
		}
	}
	if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
		set_bit(WriteMostly, &rdev->flags);
	if (desc->state & (1<<MD_DISK_FAILFAST))
		set_bit(FailFast, &rdev->flags);
	return 0;
}

/*
 * sync_super for 0.90.0
 */
static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
{
	mdp_super_t *sb;
	struct md_rdev *rdev2;
	int next_spare = mddev->raid_disks;

	/* make rdev->sb match mddev data..
	 *
	 * 1/ zero out disks
	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
	 * 3/ any empty disks < next_spare become removed
	 *
	 * disks[0] gets initialised to REMOVED because
	 * we cannot be sure from other fields if it has
	 * been initialised or not.
	 */
	int i;
	int active = 0, working = 0, failed = 0, spare = 0, nr_disks = 0;

	rdev->sb_size = MD_SB_BYTES;

	sb = page_address(rdev->sb_page);

	memset(sb, 0, sizeof(*sb));

	sb->md_magic = MD_SB_MAGIC;
	sb->major_version = mddev->major_version;
	sb->patch_version = mddev->patch_version;
	sb->gvalid_words  = 0; /* ignored */
	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
	memcpy(&sb->set_uuid3, mddev->uuid+12, 4);

	sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
	sb->level = mddev->level;
	sb->size = mddev->dev_sectors / 2;
	sb->raid_disks = mddev->raid_disks;
	sb->md_minor = mddev->md_minor;
	sb->not_persistent = 0;
	sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
	sb->state = 0;
	sb->events_hi = (mddev->events>>32);
	sb->events_lo = (u32)mddev->events;

	if (mddev->reshape_position == MaxSector)
		sb->minor_version = 90;
	else {
		sb->minor_version = 91;
		sb->reshape_position = mddev->reshape_position;
		sb->new_level = mddev->new_level;
		sb->delta_disks = mddev->delta_disks;
		sb->new_layout = mddev->new_layout;
		sb->new_chunk = mddev->new_chunk_sectors << 9;
	}
	mddev->minor_version = sb->minor_version;
	if (mddev->in_sync) {
		sb->resync_offset = mddev->resync_offset;
		sb->cp_events_hi = (mddev->events>>32);
		sb->cp_events_lo = (u32)mddev->events;
		if (mddev->resync_offset == MaxSector)
			sb->state = (1<< MD_SB_CLEAN);
	} else
		sb->resync_offset = 0;

	sb->layout = mddev->layout;
	sb->chunk_size = mddev->chunk_sectors << 9;

	if (mddev->bitmap && mddev->bitmap_info.file == NULL)
		sb->state |= (1<<MD_SB_BITMAP_PRESENT);

	sb->disks[0].state = (1<<MD_DISK_REMOVED);
	rdev_for_each(rdev2, mddev) {
		mdp_disk_t *d;
		int desc_nr;
		int is_active = test_bit(In_sync, &rdev2->flags);

		if (rdev2->raid_disk >= 0 &&
		    sb->minor_version >= 91)
			/* we have nowhere to store the recovery_offset,
			 * but if it is not below the reshape_position,
			 * we can piggy-back on that.
			 */
			is_active = 1;
		if (rdev2->raid_disk < 0 ||
		    test_bit(Faulty, &rdev2->flags))
			is_active = 0;
		if (is_active)
			desc_nr = rdev2->raid_disk;
		else
			desc_nr = next_spare++;
		rdev2->desc_nr = desc_nr;
		d = &sb->disks[rdev2->desc_nr];
		nr_disks++;
		d->number = rdev2->desc_nr;
		d->major = MAJOR(rdev2->bdev->bd_dev);
		d->minor = MINOR(rdev2->bdev->bd_dev);
		if (is_active)
			d->raid_disk = rdev2->raid_disk;
		else
			d->raid_disk = rdev2->desc_nr; /* compatibility */
		if (test_bit(Faulty, &rdev2->flags))
			d->state = (1<<MD_DISK_FAULTY);
		else if (is_active) {
			d->state = (1<<MD_DISK_ACTIVE);
			if (test_bit(In_sync, &rdev2->flags))
				d->state |= (1<<MD_DISK_SYNC);
			active++;
			working++;
		} else {
			d->state = 0;
			spare++;
			working++;
		}
		if (test_bit(WriteMostly, &rdev2->flags))
			d->state |= (1<<MD_DISK_WRITEMOSTLY);
		if (test_bit(FailFast, &rdev2->flags))
			d->state |= (1<<MD_DISK_FAILFAST);
	}
	/* now set the "removed" and "faulty" bits on any missing devices */
	for (i = 0; i < mddev->raid_disks; i++) {
		mdp_disk_t *d = &sb->disks[i];
		if (d->state == 0 && d->number == 0) {
			d->number = i;
			d->raid_disk = i;
			d->state = (1<<MD_DISK_REMOVED);
			d->state |= (1<<MD_DISK_FAULTY);
			failed++;
		}
	}
	sb->nr_disks = nr_disks;
	sb->active_disks = active;
	sb->working_disks = working;
	sb->failed_disks = failed;
	sb->spare_disks = spare;

	sb->this_disk = sb->disks[rdev->desc_nr];
	sb->sb_csum = calc_sb_csum(sb);
}

/*
 * rdev_size_change for 0.90.0
 */
static unsigned long long
super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
{
	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
		return 0; /* component must fit device */
	if (rdev->mddev->bitmap_info.offset)
		return 0; /* can't move bitmap */
	rdev->sb_start = calc_dev_sboffset(rdev);
	if (!num_sectors || num_sectors > rdev->sb_start)
		num_sectors = rdev->sb_start;
	/* Limit to 4TB as metadata cannot record more than that.
	 * 4TB == 2^32 KB, or 2*2^32 sectors.
	 */
	if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
		num_sectors = (sector_t)(2ULL << 32) - 2;
	do {
		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
			       rdev->sb_page);
	} while (md_super_wait(rdev->mddev) < 0);
	return num_sectors;
}

static int
super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
{
	/* non-zero offset changes not possible with v0.90 */
	return new_offset == 0;
}

/*
 * version 1 superblock
 */

static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
{
	__le32 disk_csum;
	u32 csum;
	unsigned long long newcsum;
	int size = 256 + le32_to_cpu(sb->max_dev)*2;
	__le32 *isuper = (__le32 *)sb;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
	newcsum = 0;
	for (; size >= 4; size -= 4)
		newcsum += le32_to_cpu(*isuper++);

	if (size == 2)
		newcsum += le16_to_cpu(*(__le16 *)isuper);

	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
	sb->sb_csum = disk_csum;
	return cpu_to_le32(csum);
}

static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	struct mdp_superblock_1 *sb;
	int ret;
	sector_t sb_start;
	sector_t sectors;
	int bmask;
	bool spare_disk = true;

	/*
	 * Calculate the position of the superblock in 512byte sectors.
	 * It is always aligned to a 4K boundary and
	 * depending on minor_version, it can be:
	 * 0: At least 8K, but less than 12K, from end of device
	 * 1: At start of device
	 * 2: 4K from start of device.
	 */
	switch (minor_version) {
	case 0:
		sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2;
		sb_start &= ~(sector_t)(4*2-1);
		break;
	case 1:
		sb_start = 0;
		break;
	case 2:
		sb_start = 8;
		break;
	default:
		return -EINVAL;
	}
	rdev->sb_start = sb_start;

	/* superblock is rarely larger than 1K, but it can be larger,
	 * and it is safe to read 4k, so we do that
	 */
	ret = read_disk_sb(rdev, 4096);
	if (ret)
		return ret;

	sb = page_address(rdev->sb_page);

	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
	    sb->major_version != cpu_to_le32(1) ||
	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
		return -EINVAL;

	if (calc_sb_1_csum(sb) != sb->sb_csum) {
		pr_warn("md: invalid superblock checksum on %pg\n",
			rdev->bdev);
		return -EINVAL;
	}
	if (le64_to_cpu(sb->data_size) < 10) {
		pr_warn("md: data_size too small on %pg\n",
			rdev->bdev);
		return -EINVAL;
	}
	if (sb->pad0 ||
	    sb->pad3[0] ||
	    memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
		/* Some padding is non-zero, might be a new feature */
		return -EINVAL;

	rdev->preferred_minor = 0xffff;
	rdev->data_offset = le64_to_cpu(sb->data_offset);
	rdev->new_data_offset = rdev->data_offset;
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
		rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));

	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
	if (rdev->sb_size & bmask)
		rdev->sb_size = (rdev->sb_size | bmask) + 1;

	if (minor_version
	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;
	if (minor_version
	    && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;

	rdev->desc_nr = le32_to_cpu(sb->dev_number);

	if (!rdev->bb_page) {
		rdev->bb_page = alloc_page(GFP_KERNEL);
		if (!rdev->bb_page)
			return -ENOMEM;
	}
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
	    rdev->badblocks.count == 0) {
		/* need to load the bad block list.
		 * Currently we limit it to one page.
		 */
		s32 offset;
		sector_t bb_sector;
		__le64 *bbp;
		int i;
		int sectors = le16_to_cpu(sb->bblog_size);
		if (sectors > (PAGE_SIZE / 512))
			return -EINVAL;
		offset = le32_to_cpu(sb->bblog_offset);
		if (offset == 0)
			return -EINVAL;
		bb_sector = (long long)offset;
		if (!sync_page_io(rdev, bb_sector, sectors << 9,
				  rdev->bb_page, REQ_OP_READ, true))
			return -EIO;
		bbp = (__le64 *)page_address(rdev->bb_page);
		rdev->badblocks.shift = sb->bblog_shift;
		for (i = 0; i < (sectors << (9-3)); i++, bbp++) {
			u64 bb = le64_to_cpu(*bbp);
			int count = bb & (0x3ff);
			u64 sector = bb >> 10;
			sector <<= sb->bblog_shift;
			count <<= sb->bblog_shift;
			if (bb + 1 == 0)
				break;
			if (!badblocks_set(&rdev->badblocks, sector, count, 1))
				return -EINVAL;
		}
	} else if (sb->bblog_offset != 0)
		rdev->badblocks.shift = 0;

	if ((le32_to_cpu(sb->feature_map) &
	    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) {
		rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
		rdev->ppl.size = le16_to_cpu(sb->ppl.size);
		rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
	}

	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) &&
	    sb->level != 0)
		return -EINVAL;

	/* not spare disk */
	if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
	    (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
	     le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
		spare_disk = false;

	if (!refdev) {
		if (!spare_disk)
			ret = 1;
		else
			ret = 0;
	} else {
		__u64 ev1, ev2;
		struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);

		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
		    sb->level != refsb->level ||
		    sb->layout != refsb->layout ||
		    sb->chunksize != refsb->chunksize) {
			pr_warn("md: %pg has strangely different superblock to %pg\n",
				rdev->bdev,
				refdev->bdev);
			return -EINVAL;
		}
		ev1 = le64_to_cpu(sb->events);
		ev2 = le64_to_cpu(refsb->events);

		if (!spare_disk && ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	if (minor_version)
		sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset;
	else
		sectors = rdev->sb_start;
	if (sectors < le64_to_cpu(sb->data_size))
		return -EINVAL;
	rdev->sectors = le64_to_cpu(sb->data_size);
	return ret;
}

static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	__u64 ev1 = le64_to_cpu(sb->events);
	int role;

	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(Bitmap_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);

	if (mddev->raid_disks == 0) {
		mddev->major_version = 1;
		mddev->patch_version = 0;
		mddev->external = 0;
		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
		mddev->ctime = le64_to_cpu(sb->ctime);
		mddev->utime = le64_to_cpu(sb->utime);
		mddev->level = le32_to_cpu(sb->level);
		mddev->clevel[0] = 0;
		mddev->layout = le32_to_cpu(sb->layout);
		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
		mddev->dev_sectors = le64_to_cpu(sb->size);
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
		/* Default location for bitmap is 1K after superblock
		 * using 3K - total of 4K
		 */
		mddev->bitmap_info.default_offset = 1024 >> 9;
		mddev->bitmap_info.default_space = (4096-1024) >> 9;
		mddev->reshape_backwards = 0;

		mddev->resync_offset = le64_to_cpu(sb->resync_offset);
		memcpy(mddev->uuid, sb->set_uuid, 16);

		mddev->max_disks = (4096-256)/2;

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
		    mddev->bitmap_info.file == NULL) {
			mddev->bitmap_info.offset =
				(__s32)le32_to_cpu(sb->bitmap_offset);
			/* Metadata doesn't record how much space is available.
			 * For 1.0, we assume we can use up to the superblock
			 * if before, else to 4K beyond superblock.
			 * For others, assume no change is possible.
			 */
			if (mddev->minor_version > 0)
				mddev->bitmap_info.space = 0;
			else if (mddev->bitmap_info.offset > 0)
				mddev->bitmap_info.space =
					8 - mddev->bitmap_info.offset;
			else
				mddev->bitmap_info.space =
					-mddev->bitmap_info.offset;
		}

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
			mddev->new_level = le32_to_cpu(sb->new_level);
			mddev->new_layout = le32_to_cpu(sb->new_layout);
			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
			if (mddev->delta_disks < 0 ||
			    (mddev->delta_disks == 0 &&
			     (le32_to_cpu(sb->feature_map)
			      & MD_FEATURE_RESHAPE_BACKWARDS)))
				mddev->reshape_backwards = 1;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}

		if (mddev->level == 0 &&
		    !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT))
			mddev->layout = -1;

		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
			set_bit(MD_HAS_JOURNAL, &mddev->flags);

		if (le32_to_cpu(sb->feature_map) &
		    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) {
			if (le32_to_cpu(sb->feature_map) &
			    (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
				return -EINVAL;
			if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
			    (le32_to_cpu(sb->feature_map) &
			     MD_FEATURE_MULTIPLE_PPLS))
				return -EINVAL;
			set_bit(MD_HAS_PPL, &mddev->flags);
		}
	} else if (mddev->pers == NULL) {
		/* Insist on a good event counter while assembling, except for
		 * spares (which don't need an event count).
		 * Similar to mdadm, we allow an event counter difference of 1
		 * from the freshest device.
1970 */ 1971 if (rdev->desc_nr >= 0 && 1972 rdev->desc_nr < le32_to_cpu(sb->max_dev) && 1973 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || 1974 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) 1975 if (ev1 + 1 < mddev->events) 1976 return -EINVAL; 1977 } else if (mddev->bitmap) { 1978 /* If adding to array with a bitmap, then we can accept an 1979 * older device, but not too old. 1980 */ 1981 if (ev1 < md_bitmap_events_cleared(mddev)) 1982 return 0; 1983 if (ev1 < mddev->events) 1984 set_bit(Bitmap_sync, &rdev->flags); 1985 } else { 1986 if (ev1 < mddev->events) 1987 /* just a hot-add of a new device, leave raid_disk at -1 */ 1988 return 0; 1989 } 1990 1991 if (rdev->desc_nr < 0 || 1992 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { 1993 role = MD_DISK_ROLE_SPARE; 1994 rdev->desc_nr = -1; 1995 } else if (mddev->pers == NULL && freshest && ev1 < mddev->events) { 1996 /* 1997 * If we are assembling, and our event counter is smaller than the 1998 * highest event counter, we cannot trust our superblock about the role. 1999 * It could happen that our rdev was marked as Faulty, and all other 2000 * superblocks were updated with +1 event counter. 2001 * Then, before the next superblock update, which typically happens when 2002 * remove_and_add_spares() removes the device from the array, there was 2003 * a crash or reboot. 2004 * If we allow current rdev without consulting the freshest superblock, 2005 * we could cause data corruption. 2006 * Note that in this case our event counter is smaller by 1 than the 2007 * highest, otherwise, this rdev would not be allowed into array; 2008 * both kernel and mdadm allow event counter difference of 1. 2009 */ 2010 struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page); 2011 u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev); 2012 2013 if (rdev->desc_nr >= freshest_max_dev) { 2014 /* this is unexpected, better not proceed */ 2015 pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n", 2016 mdname(mddev), rdev->bdev, rdev->desc_nr, 2017 freshest->bdev, freshest_max_dev); 2018 return -EUCLEAN; 2019 } 2020 2021 role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]); 2022 pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n", 2023 mdname(mddev), rdev->bdev, role, role, freshest->bdev); 2024 } else { 2025 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 2026 } 2027 switch (role) { 2028 case MD_DISK_ROLE_SPARE: /* spare */ 2029 break; 2030 case MD_DISK_ROLE_FAULTY: /* faulty */ 2031 set_bit(Faulty, &rdev->flags); 2032 break; 2033 case MD_DISK_ROLE_JOURNAL: /* journal device */ 2034 if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { 2035 /* journal device without journal feature */ 2036 pr_warn("md: journal device provided without journal feature, ignoring the device\n"); 2037 return -EINVAL; 2038 } 2039 set_bit(Journal, &rdev->flags); 2040 rdev->journal_tail = le64_to_cpu(sb->journal_tail); 2041 rdev->raid_disk = 0; 2042 break; 2043 default: 2044 rdev->saved_raid_disk = role; 2045 if ((le32_to_cpu(sb->feature_map) & 2046 MD_FEATURE_RECOVERY_OFFSET)) { 2047 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 2048 if (!(le32_to_cpu(sb->feature_map) & 2049 MD_FEATURE_RECOVERY_BITMAP)) 2050 rdev->saved_raid_disk = -1; 2051 } else { 2052 /* 2053 * If the array is FROZEN, then the device can't 2054 * be in_sync with rest of array. 
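 * Leave In_sync clear in that case; recovery can bring the device
 * back up to date once the array is unfrozen.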
2055 */ 2056 if (!test_bit(MD_RECOVERY_FROZEN, 2057 &mddev->recovery)) 2058 set_bit(In_sync, &rdev->flags); 2059 } 2060 rdev->raid_disk = role; 2061 break; 2062 } 2063 if (sb->devflags & WriteMostly1) 2064 set_bit(WriteMostly, &rdev->flags); 2065 if (sb->devflags & FailFast1) 2066 set_bit(FailFast, &rdev->flags); 2067 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) 2068 set_bit(Replacement, &rdev->flags); 2069 2070 return 0; 2071 } 2072 2073 static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) 2074 { 2075 struct mdp_superblock_1 *sb; 2076 struct md_rdev *rdev2; 2077 int max_dev, i; 2078 /* make rdev->sb match mddev and rdev data. */ 2079 2080 sb = page_address(rdev->sb_page); 2081 2082 sb->feature_map = 0; 2083 sb->pad0 = 0; 2084 sb->recovery_offset = cpu_to_le64(0); 2085 memset(sb->pad3, 0, sizeof(sb->pad3)); 2086 2087 sb->utime = cpu_to_le64((__u64)mddev->utime); 2088 sb->events = cpu_to_le64(mddev->events); 2089 if (mddev->in_sync) 2090 sb->resync_offset = cpu_to_le64(mddev->resync_offset); 2091 else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) 2092 sb->resync_offset = cpu_to_le64(MaxSector); 2093 else 2094 sb->resync_offset = cpu_to_le64(0); 2095 2096 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); 2097 2098 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 2099 sb->size = cpu_to_le64(mddev->dev_sectors); 2100 sb->chunksize = cpu_to_le32(mddev->chunk_sectors); 2101 sb->level = cpu_to_le32(mddev->level); 2102 sb->layout = cpu_to_le32(mddev->layout); 2103 if (test_bit(FailFast, &rdev->flags)) 2104 sb->devflags |= FailFast1; 2105 else 2106 sb->devflags &= ~FailFast1; 2107 2108 if (test_bit(WriteMostly, &rdev->flags)) 2109 sb->devflags |= WriteMostly1; 2110 else 2111 sb->devflags &= ~WriteMostly1; 2112 sb->data_offset = cpu_to_le64(rdev->data_offset); 2113 sb->data_size = cpu_to_le64(rdev->sectors); 2114 2115 if (mddev->bitmap && mddev->bitmap_info.file == NULL) { 2116 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); 2117 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 2118 } 2119 2120 if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && 2121 !test_bit(In_sync, &rdev->flags)) { 2122 sb->feature_map |= 2123 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 2124 sb->recovery_offset = 2125 cpu_to_le64(rdev->recovery_offset); 2126 if (rdev->saved_raid_disk >= 0 && mddev->bitmap) 2127 sb->feature_map |= 2128 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); 2129 } 2130 /* Note: recovery_offset and journal_tail share space */ 2131 if (test_bit(Journal, &rdev->flags)) 2132 sb->journal_tail = cpu_to_le64(rdev->journal_tail); 2133 if (test_bit(Replacement, &rdev->flags)) 2134 sb->feature_map |= 2135 cpu_to_le32(MD_FEATURE_REPLACEMENT); 2136 2137 if (mddev->reshape_position != MaxSector) { 2138 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 2139 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 2140 sb->new_layout = cpu_to_le32(mddev->new_layout); 2141 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 2142 sb->new_level = cpu_to_le32(mddev->new_level); 2143 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 2144 if (mddev->delta_disks == 0 && 2145 mddev->reshape_backwards) 2146 sb->feature_map 2147 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); 2148 if (rdev->new_data_offset != rdev->data_offset) { 2149 sb->feature_map 2150 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); 2151 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset 2152 - rdev->data_offset)); 2153 } 2154 } 2155 2156 if 
(mddev_is_clustered(mddev)) 2157 sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); 2158 2159 if (rdev->badblocks.count == 0) 2160 /* Nothing to do for bad blocks*/ ; 2161 else if (sb->bblog_offset == 0) 2162 /* Cannot record bad blocks on this device */ 2163 md_error(mddev, rdev); 2164 else { 2165 struct badblocks *bb = &rdev->badblocks; 2166 __le64 *bbp = (__le64 *)page_address(rdev->bb_page); 2167 u64 *p = bb->page; 2168 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); 2169 if (bb->changed) { 2170 unsigned seq; 2171 2172 retry: 2173 seq = read_seqbegin(&bb->lock); 2174 2175 memset(bbp, 0xff, PAGE_SIZE); 2176 2177 for (i = 0 ; i < bb->count ; i++) { 2178 u64 internal_bb = p[i]; 2179 u64 store_bb = ((BB_OFFSET(internal_bb) << 10) 2180 | BB_LEN(internal_bb)); 2181 bbp[i] = cpu_to_le64(store_bb); 2182 } 2183 bb->changed = 0; 2184 if (read_seqretry(&bb->lock, seq)) 2185 goto retry; 2186 2187 bb->sector = (rdev->sb_start + 2188 (int)le32_to_cpu(sb->bblog_offset)); 2189 bb->size = le16_to_cpu(sb->bblog_size); 2190 } 2191 } 2192 2193 max_dev = 0; 2194 rdev_for_each(rdev2, mddev) 2195 if (rdev2->desc_nr+1 > max_dev) 2196 max_dev = rdev2->desc_nr+1; 2197 2198 if (max_dev > le32_to_cpu(sb->max_dev)) { 2199 int bmask; 2200 sb->max_dev = cpu_to_le32(max_dev); 2201 rdev->sb_size = max_dev * 2 + 256; 2202 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 2203 if (rdev->sb_size & bmask) 2204 rdev->sb_size = (rdev->sb_size | bmask) + 1; 2205 } else 2206 max_dev = le32_to_cpu(sb->max_dev); 2207 2208 for (i=0; i<max_dev;i++) 2209 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 2210 2211 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) 2212 sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); 2213 2214 if (test_bit(MD_HAS_PPL, &mddev->flags)) { 2215 if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags)) 2216 sb->feature_map |= 2217 cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS); 2218 else 2219 sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); 2220 sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); 2221 sb->ppl.size = cpu_to_le16(rdev->ppl.size); 2222 } 2223 2224 rdev_for_each(rdev2, mddev) { 2225 i = rdev2->desc_nr; 2226 if (test_bit(Faulty, &rdev2->flags)) 2227 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); 2228 else if (test_bit(In_sync, &rdev2->flags)) 2229 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 2230 else if (test_bit(Journal, &rdev2->flags)) 2231 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); 2232 else if (rdev2->raid_disk >= 0) 2233 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 2234 else 2235 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 2236 } 2237 2238 sb->sb_csum = calc_sb_1_csum(sb); 2239 } 2240 2241 static sector_t super_1_choose_bm_space(sector_t dev_size) 2242 { 2243 sector_t bm_space; 2244 2245 /* if the device is bigger than 8Gig, save 64k for bitmap 2246 * usage, if bigger than 200Gig, save 128k 2247 */ 2248 if (dev_size < 64*2) 2249 bm_space = 0; 2250 else if (dev_size - 64*2 >= 200*1024*1024*2) 2251 bm_space = 128*2; 2252 else if (dev_size - 4*2 > 8*1024*1024*2) 2253 bm_space = 64*2; 2254 else 2255 bm_space = 4*2; 2256 return bm_space; 2257 } 2258 2259 static unsigned long long 2260 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 2261 { 2262 struct mdp_superblock_1 *sb; 2263 sector_t max_sectors; 2264 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 2265 return 0; /* component must fit device */ 2266 if (rdev->data_offset != rdev->new_data_offset) 2267 return 0; /* too confusing */ 2268 if (rdev->sb_start < 
rdev->data_offset) { 2269 /* minor versions 1 and 2; superblock before data */ 2270 max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; 2271 if (!num_sectors || num_sectors > max_sectors) 2272 num_sectors = max_sectors; 2273 } else if (rdev->mddev->bitmap_info.offset) { 2274 /* minor version 0 with bitmap we can't move */ 2275 return 0; 2276 } else { 2277 /* minor version 0; superblock after data */ 2278 sector_t sb_start, bm_space; 2279 sector_t dev_size = bdev_nr_sectors(rdev->bdev); 2280 2281 /* 8K is for superblock */ 2282 sb_start = dev_size - 8*2; 2283 sb_start &= ~(sector_t)(4*2 - 1); 2284 2285 bm_space = super_1_choose_bm_space(dev_size); 2286 2287 /* Space that can be used to store data needs to exclude the 2288 * bitmap space and bad block log space (4K) below the superblock 2289 */ 2290 max_sectors = sb_start - bm_space - 4*2; 2291 2292 if (!num_sectors || num_sectors > max_sectors) 2293 num_sectors = max_sectors; 2294 rdev->sb_start = sb_start; 2295 } 2296 sb = page_address(rdev->sb_page); 2297 sb->data_size = cpu_to_le64(num_sectors); 2298 sb->super_offset = cpu_to_le64(rdev->sb_start); 2299 sb->sb_csum = calc_sb_1_csum(sb); 2300 do { 2301 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 2302 rdev->sb_page); 2303 } while (md_super_wait(rdev->mddev) < 0); 2304 return num_sectors; 2305 2306 } 2307 2308 static int 2309 super_1_allow_new_offset(struct md_rdev *rdev, 2310 unsigned long long new_offset) 2311 { 2312 /* All necessary checks on new >= old have been done */ 2313 if (new_offset >= rdev->data_offset) 2314 return 1; 2315 2316 /* with 1.0 metadata, there is no metadata to tread on 2317 * so we can always move back */ 2318 if (rdev->mddev->minor_version == 0) 2319 return 1; 2320 2321 /* otherwise we must be sure not to step on 2322 * any metadata, so stay: 2323 * 36K beyond start of superblock 2324 * beyond end of badblocks 2325 * beyond write-intent bitmap 2326 */ 2327 if (rdev->sb_start + (32+4)*2 > new_offset) 2328 return 0; 2329 2330 if (!rdev->mddev->bitmap_info.file) { 2331 struct mddev *mddev = rdev->mddev; 2332 struct md_bitmap_stats stats; 2333 int err; 2334 2335 err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 2336 if (!err && rdev->sb_start + mddev->bitmap_info.offset + 2337 stats.file_pages * (PAGE_SIZE >> 9) > new_offset) 2338 return 0; 2339 } 2340 2341 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) 2342 return 0; 2343 2344 return 1; 2345 } 2346 2347 static struct super_type super_types[] = { 2348 [0] = { 2349 .name = "0.90.0", 2350 .owner = THIS_MODULE, 2351 .load_super = super_90_load, 2352 .validate_super = super_90_validate, 2353 .sync_super = super_90_sync, 2354 .rdev_size_change = super_90_rdev_size_change, 2355 .allow_new_offset = super_90_allow_new_offset, 2356 }, 2357 [1] = { 2358 .name = "md-1", 2359 .owner = THIS_MODULE, 2360 .load_super = super_1_load, 2361 .validate_super = super_1_validate, 2362 .sync_super = super_1_sync, 2363 .rdev_size_change = super_1_rdev_size_change, 2364 .allow_new_offset = super_1_allow_new_offset, 2365 }, 2366 }; 2367 2368 static void sync_super(struct mddev *mddev, struct md_rdev *rdev) 2369 { 2370 if (mddev->sync_super) { 2371 mddev->sync_super(mddev, rdev); 2372 return; 2373 } 2374 2375 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); 2376 2377 super_types[mddev->major_version].sync_super(mddev, rdev); 2378 } 2379 2380 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) 2381 { 2382 struct md_rdev *rdev, *rdev2; 2383 2384 rcu_read_lock(); 2385
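	/*
	 * Two arrays are considered to match if any active member of one
	 * sits on the same underlying disk as an active member of the
	 * other (e.g. two md arrays built from partitions of one drive).
	 */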
rdev_for_each_rcu(rdev, mddev1) { 2386 if (test_bit(Faulty, &rdev->flags) || 2387 test_bit(Journal, &rdev->flags) || 2388 rdev->raid_disk == -1) 2389 continue; 2390 rdev_for_each_rcu(rdev2, mddev2) { 2391 if (test_bit(Faulty, &rdev2->flags) || 2392 test_bit(Journal, &rdev2->flags) || 2393 rdev2->raid_disk == -1) 2394 continue; 2395 if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) { 2396 rcu_read_unlock(); 2397 return 1; 2398 } 2399 } 2400 } 2401 rcu_read_unlock(); 2402 return 0; 2403 } 2404 2405 static LIST_HEAD(pending_raid_disks); 2406 2407 /* 2408 * Try to register data integrity profile for an mddev 2409 * 2410 * This is called when an array is started and after a disk has been kicked 2411 * from the array. It only succeeds if all working and active component devices 2412 * are integrity capable with matching profiles. 2413 */ 2414 int md_integrity_register(struct mddev *mddev) 2415 { 2416 if (list_empty(&mddev->disks)) 2417 return 0; /* nothing to do */ 2418 if (mddev_is_dm(mddev) || !blk_get_integrity(mddev->gendisk)) 2419 return 0; /* shouldn't register */ 2420 2421 pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); 2422 return 0; 2423 } 2424 EXPORT_SYMBOL(md_integrity_register); 2425 2426 static bool rdev_read_only(struct md_rdev *rdev) 2427 { 2428 return bdev_read_only(rdev->bdev) || 2429 (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev)); 2430 } 2431 2432 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) 2433 { 2434 char b[BDEVNAME_SIZE]; 2435 int err; 2436 2437 /* prevent duplicates */ 2438 if (find_rdev(mddev, rdev->bdev->bd_dev)) 2439 return -EEXIST; 2440 2441 if (rdev_read_only(rdev) && mddev->pers) 2442 return -EROFS; 2443 2444 /* make sure rdev->sectors exceeds mddev->dev_sectors */ 2445 if (!test_bit(Journal, &rdev->flags) && 2446 rdev->sectors && 2447 (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { 2448 if (mddev->pers) { 2449 /* Cannot change size, so fail 2450 * If mddev->level <= 0, then we don't care 2451 * about aligning sizes (e.g. linear) 2452 */ 2453 if (mddev->level > 0) 2454 return -ENOSPC; 2455 } else 2456 mddev->dev_sectors = rdev->sectors; 2457 } 2458 2459 /* Verify rdev->desc_nr is unique. 
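 * (desc_nr is the device's slot in the array metadata, e.g. the index
 * into a v1.x superblock's dev_roles[] table.)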
2460 * If it is -1, assign a free number, else 2461 * check number is not in use 2462 */ 2463 rcu_read_lock(); 2464 if (rdev->desc_nr < 0) { 2465 int choice = 0; 2466 if (mddev->pers) 2467 choice = mddev->raid_disks; 2468 while (md_find_rdev_nr_rcu(mddev, choice)) 2469 choice++; 2470 rdev->desc_nr = choice; 2471 } else { 2472 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { 2473 rcu_read_unlock(); 2474 return -EBUSY; 2475 } 2476 } 2477 rcu_read_unlock(); 2478 if (!test_bit(Journal, &rdev->flags) && 2479 mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { 2480 pr_warn("md: %s: array is limited to %d devices\n", 2481 mdname(mddev), mddev->max_disks); 2482 return -EBUSY; 2483 } 2484 snprintf(b, sizeof(b), "%pg", rdev->bdev); 2485 strreplace(b, '/', '!'); 2486 2487 rdev->mddev = mddev; 2488 pr_debug("md: bind<%s>\n", b); 2489 2490 if (mddev->raid_disks) 2491 mddev_create_serial_pool(mddev, rdev); 2492 2493 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 2494 goto fail; 2495 2496 /* failure here is OK */ 2497 err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block"); 2498 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); 2499 rdev->sysfs_unack_badblocks = 2500 sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks"); 2501 rdev->sysfs_badblocks = 2502 sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks"); 2503 2504 list_add_rcu(&rdev->same_set, &mddev->disks); 2505 bd_link_disk_holder(rdev->bdev, mddev->gendisk); 2506 2507 /* May as well allow recovery to be retried once */ 2508 mddev->recovery_disabled++; 2509 2510 return 0; 2511 2512 fail: 2513 pr_warn("md: failed to register dev-%s for %s\n", 2514 b, mdname(mddev)); 2515 mddev_destroy_serial_pool(mddev, rdev); 2516 return err; 2517 } 2518 2519 void md_autodetect_dev(dev_t dev); 2520 2521 /* just for claiming the bdev */ 2522 static struct md_rdev claim_rdev; 2523 2524 static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) 2525 { 2526 pr_debug("md: export_rdev(%pg)\n", rdev->bdev); 2527 md_rdev_clear(rdev); 2528 #ifndef MODULE 2529 if (test_bit(AutoDetected, &rdev->flags)) 2530 md_autodetect_dev(rdev->bdev->bd_dev); 2531 #endif 2532 fput(rdev->bdev_file); 2533 rdev->bdev = NULL; 2534 kobject_put(&rdev->kobj); 2535 } 2536 2537 static void md_kick_rdev_from_array(struct md_rdev *rdev) 2538 { 2539 struct mddev *mddev = rdev->mddev; 2540 2541 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); 2542 list_del_rcu(&rdev->same_set); 2543 pr_debug("md: unbind<%pg>\n", rdev->bdev); 2544 mddev_destroy_serial_pool(rdev->mddev, rdev); 2545 WRITE_ONCE(rdev->mddev, NULL); 2546 sysfs_remove_link(&rdev->kobj, "block"); 2547 sysfs_put(rdev->sysfs_state); 2548 sysfs_put(rdev->sysfs_unack_badblocks); 2549 sysfs_put(rdev->sysfs_badblocks); 2550 rdev->sysfs_state = NULL; 2551 rdev->sysfs_unack_badblocks = NULL; 2552 rdev->sysfs_badblocks = NULL; 2553 rdev->badblocks.count = 0; 2554 2555 synchronize_rcu(); 2556 2557 /* 2558 * kobject_del() will wait for all in progress writers to be done, where 2559 * reconfig_mutex is held, hence it can't be called under 2560 * reconfig_mutex and it's delayed to mddev_unlock(). 
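 * Instead the rdev is parked on mddev->deleting just below and reaped
 * once the lock has been dropped.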
2561 */ 2562 list_add(&rdev->same_set, &mddev->deleting); 2563 } 2564 2565 static void export_array(struct mddev *mddev) 2566 { 2567 struct md_rdev *rdev; 2568 2569 while (!list_empty(&mddev->disks)) { 2570 rdev = list_first_entry(&mddev->disks, struct md_rdev, 2571 same_set); 2572 md_kick_rdev_from_array(rdev); 2573 } 2574 mddev->raid_disks = 0; 2575 mddev->major_version = 0; 2576 } 2577 2578 static bool set_in_sync(struct mddev *mddev) 2579 { 2580 lockdep_assert_held(&mddev->lock); 2581 if (!mddev->in_sync) { 2582 mddev->sync_checkers++; 2583 spin_unlock(&mddev->lock); 2584 percpu_ref_switch_to_atomic_sync(&mddev->writes_pending); 2585 spin_lock(&mddev->lock); 2586 if (!mddev->in_sync && 2587 percpu_ref_is_zero(&mddev->writes_pending)) { 2588 mddev->in_sync = 1; 2589 /* 2590 * Ensure ->in_sync is visible before we clear 2591 * ->sync_checkers. 2592 */ 2593 smp_mb(); 2594 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2595 sysfs_notify_dirent_safe(mddev->sysfs_state); 2596 } 2597 if (--mddev->sync_checkers == 0) 2598 percpu_ref_switch_to_percpu(&mddev->writes_pending); 2599 } 2600 if (mddev->safemode == 1) 2601 mddev->safemode = 0; 2602 return mddev->in_sync; 2603 } 2604 2605 static void sync_sbs(struct mddev *mddev, int nospares) 2606 { 2607 /* Update each superblock (in-memory image), but 2608 * if we are allowed to, skip spares which already 2609 * have the right event counter, or have one earlier 2610 * (which would mean they aren't being marked as dirty 2611 * with the rest of the array) 2612 */ 2613 struct md_rdev *rdev; 2614 rdev_for_each(rdev, mddev) { 2615 if (rdev->sb_events == mddev->events || 2616 (nospares && 2617 rdev->raid_disk < 0 && 2618 rdev->sb_events+1 == mddev->events)) { 2619 /* Don't update this superblock */ 2620 rdev->sb_loaded = 2; 2621 } else { 2622 sync_super(mddev, rdev); 2623 rdev->sb_loaded = 1; 2624 } 2625 } 2626 } 2627 2628 static bool does_sb_need_changing(struct mddev *mddev) 2629 { 2630 struct md_rdev *rdev = NULL, *iter; 2631 struct mdp_superblock_1 *sb; 2632 int role; 2633 2634 /* Find a good rdev */ 2635 rdev_for_each(iter, mddev) 2636 if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) { 2637 rdev = iter; 2638 break; 2639 } 2640 2641 /* No good device found. */ 2642 if (!rdev) 2643 return false; 2644 2645 sb = page_address(rdev->sb_page); 2646 /* Check if a device has become faulty or a spare become active */ 2647 rdev_for_each(rdev, mddev) { 2648 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 2649 /* Device activated? */ 2650 if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 && 2651 !test_bit(Faulty, &rdev->flags)) 2652 return true; 2653 /* Device turned faulty? 
*/ 2654 if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX)) 2655 return true; 2656 } 2657 2658 /* Check if any mddev parameters have changed */ 2659 if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || 2660 (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || 2661 (mddev->layout != le32_to_cpu(sb->layout)) || 2662 (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || 2663 (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) 2664 return true; 2665 2666 return false; 2667 } 2668 2669 void md_update_sb(struct mddev *mddev, int force_change) 2670 { 2671 struct md_rdev *rdev; 2672 int sync_req; 2673 int nospares = 0; 2674 int any_badblocks_changed = 0; 2675 int ret = -1; 2676 2677 if (!md_is_rdwr(mddev)) { 2678 if (force_change) 2679 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2680 return; 2681 } 2682 2683 repeat: 2684 if (mddev_is_clustered(mddev)) { 2685 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2686 force_change = 1; 2687 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 2688 nospares = 1; 2689 ret = mddev->cluster_ops->metadata_update_start(mddev); 2690 /* Has someone else updated the sb? */ 2691 if (!does_sb_need_changing(mddev)) { 2692 if (ret == 0) 2693 mddev->cluster_ops->metadata_update_cancel(mddev); 2694 bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2695 BIT(MD_SB_CHANGE_DEVS) | 2696 BIT(MD_SB_CHANGE_CLEAN)); 2697 return; 2698 } 2699 } 2700 2701 /* 2702 * First make sure individual recovery_offsets are correct. 2703 * curr_resync_completed can only be used during recovery. 2704 * During reshape/resync it might use array-addresses rather 2705 * than device addresses. 2706 */ 2707 rdev_for_each(rdev, mddev) { 2708 if (rdev->raid_disk >= 0 && 2709 mddev->delta_disks >= 0 && 2710 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 2711 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && 2712 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 2713 !test_bit(Journal, &rdev->flags) && 2714 !test_bit(In_sync, &rdev->flags) && 2715 mddev->curr_resync_completed > rdev->recovery_offset) 2716 rdev->recovery_offset = mddev->curr_resync_completed; 2717 2718 } 2719 if (!mddev->persistent) { 2720 clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2721 clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2722 if (!mddev->external) { 2723 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 2724 rdev_for_each(rdev, mddev) { 2725 if (rdev->badblocks.changed) { 2726 rdev->badblocks.changed = 0; 2727 ack_all_badblocks(&rdev->badblocks); 2728 md_error(mddev, rdev); 2729 } 2730 clear_bit(Blocked, &rdev->flags); 2731 clear_bit(BlockedBadBlocks, &rdev->flags); 2732 wake_up(&rdev->blocked_wait); 2733 } 2734 } 2735 wake_up(&mddev->sb_wait); 2736 return; 2737 } 2738 2739 spin_lock(&mddev->lock); 2740 2741 mddev->utime = ktime_get_real_seconds(); 2742 2743 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2744 force_change = 1; 2745 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 2746 /* just a clean <-> dirty transition, possibly leave spares alone, 2747 * though if events isn't the right even/odd, we will have to do 2748 * spares after all 2749 */ 2750 nospares = 1; 2751 if (force_change) 2752 nospares = 0; 2753 if (mddev->degraded) 2754 /* If the array is degraded, then skipping spares is both 2755 * dangerous and fairly pointless. 2756 * Dangerous because a device that was removed from the array 2757 * might have an event_count that still looks up-to-date, 2758 * so it can be re-added without a resync.
2759 * Pointless because if there are any spares to skip, 2760 * then a recovery will happen and soon that array won't 2761 * be degraded any more and the spare can go back to sleep then. 2762 */ 2763 nospares = 0; 2764 2765 sync_req = mddev->in_sync; 2766 2767 /* If this is just a dirty<->clean transition, and the array is clean 2768 * and 'events' is odd, we can roll back to the previous clean state */ 2769 if (nospares 2770 && (mddev->in_sync && mddev->resync_offset == MaxSector) 2771 && mddev->can_decrease_events 2772 && mddev->events != 1) { 2773 mddev->events--; 2774 mddev->can_decrease_events = 0; 2775 } else { 2776 /* otherwise we have to go forward and ... */ 2777 mddev->events ++; 2778 mddev->can_decrease_events = nospares; 2779 } 2780 2781 /* 2782 * This 64-bit counter should never wrap. 2783 * Either we are in around ~1 trillion A.C., assuming 2784 * 1 reboot per second, or we have a bug... 2785 */ 2786 WARN_ON(mddev->events == 0); 2787 2788 rdev_for_each(rdev, mddev) { 2789 if (rdev->badblocks.changed) 2790 any_badblocks_changed++; 2791 if (test_bit(Faulty, &rdev->flags)) 2792 set_bit(FaultRecorded, &rdev->flags); 2793 } 2794 2795 sync_sbs(mddev, nospares); 2796 spin_unlock(&mddev->lock); 2797 2798 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", 2799 mdname(mddev), mddev->in_sync); 2800 2801 mddev_add_trace_msg(mddev, "md md_update_sb"); 2802 rewrite: 2803 mddev->bitmap_ops->update_sb(mddev->bitmap); 2804 rdev_for_each(rdev, mddev) { 2805 if (rdev->sb_loaded != 1) 2806 continue; /* no noise on spare devices */ 2807 2808 if (!test_bit(Faulty, &rdev->flags)) { 2809 md_super_write(mddev,rdev, 2810 rdev->sb_start, rdev->sb_size, 2811 rdev->sb_page); 2812 pr_debug("md: (write) %pg's sb offset: %llu\n", 2813 rdev->bdev, 2814 (unsigned long long)rdev->sb_start); 2815 rdev->sb_events = mddev->events; 2816 if (rdev->badblocks.size) { 2817 md_super_write(mddev, rdev, 2818 rdev->badblocks.sector, 2819 rdev->badblocks.size << 9, 2820 rdev->bb_page); 2821 rdev->badblocks.size = 0; 2822 } 2823 2824 } else 2825 pr_debug("md: %pg (skipping faulty)\n", 2826 rdev->bdev); 2827 } 2828 if (md_super_wait(mddev) < 0) 2829 goto rewrite; 2830 /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ 2831 2832 if (mddev_is_clustered(mddev) && ret == 0) 2833 mddev->cluster_ops->metadata_update_finish(mddev); 2834 2835 if (mddev->in_sync != sync_req || 2836 !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2837 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN))) 2838 /* have to write it out again */ 2839 goto repeat; 2840 wake_up(&mddev->sb_wait); 2841 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2842 sysfs_notify_dirent_safe(mddev->sysfs_completed); 2843 2844 rdev_for_each(rdev, mddev) { 2845 if (test_and_clear_bit(FaultRecorded, &rdev->flags)) 2846 clear_bit(Blocked, &rdev->flags); 2847 2848 if (any_badblocks_changed) 2849 ack_all_badblocks(&rdev->badblocks); 2850 clear_bit(BlockedBadBlocks, &rdev->flags); 2851 wake_up(&rdev->blocked_wait); 2852 } 2853 } 2854 EXPORT_SYMBOL(md_update_sb); 2855 2856 static int add_bound_rdev(struct md_rdev *rdev) 2857 { 2858 struct mddev *mddev = rdev->mddev; 2859 int err = 0; 2860 bool add_journal = test_bit(Journal, &rdev->flags); 2861 2862 if (!mddev->pers->hot_remove_disk || add_journal) { 2863 /* If there is hot_add_disk but no hot_remove_disk 2864 * then added disks for geometry changes, 2865 * and should be added immediately. 2866 */ 2867 super_types[mddev->major_version]. 
2868 validate_super(mddev, NULL/*freshest*/, rdev); 2869 err = mddev->pers->hot_add_disk(mddev, rdev); 2870 if (err) { 2871 md_kick_rdev_from_array(rdev); 2872 return err; 2873 } 2874 } 2875 sysfs_notify_dirent_safe(rdev->sysfs_state); 2876 2877 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2878 if (mddev->degraded) 2879 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 2880 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2881 md_new_event(); 2882 return 0; 2883 } 2884 2885 /* words written to sysfs files may, or may not, be \n terminated. 2886 * We want to accept either case. For this we use cmd_match. 2887 */ 2888 static int cmd_match(const char *cmd, const char *str) 2889 { 2890 /* See if cmd, written into a sysfs file, matches 2891 * str. They must either be the same, or cmd can 2892 * have a trailing newline 2893 */ 2894 while (*cmd && *str && *cmd == *str) { 2895 cmd++; 2896 str++; 2897 } 2898 if (*cmd == '\n') 2899 cmd++; 2900 if (*str || *cmd) 2901 return 0; 2902 return 1; 2903 } 2904 2905 struct rdev_sysfs_entry { 2906 struct attribute attr; 2907 ssize_t (*show)(struct md_rdev *, char *); 2908 ssize_t (*store)(struct md_rdev *, const char *, size_t); 2909 }; 2910 2911 static ssize_t 2912 state_show(struct md_rdev *rdev, char *page) 2913 { 2914 char *sep = ","; 2915 size_t len = 0; 2916 unsigned long flags = READ_ONCE(rdev->flags); 2917 2918 if (test_bit(Faulty, &flags) || 2919 (!test_bit(ExternalBbl, &flags) && 2920 rdev->badblocks.unacked_exist)) 2921 len += sprintf(page+len, "faulty%s", sep); 2922 if (test_bit(In_sync, &flags)) 2923 len += sprintf(page+len, "in_sync%s", sep); 2924 if (test_bit(Journal, &flags)) 2925 len += sprintf(page+len, "journal%s", sep); 2926 if (test_bit(WriteMostly, &flags)) 2927 len += sprintf(page+len, "write_mostly%s", sep); 2928 if (test_bit(Blocked, &flags) || 2929 (rdev->badblocks.unacked_exist 2930 && !test_bit(Faulty, &flags))) 2931 len += sprintf(page+len, "blocked%s", sep); 2932 if (!test_bit(Faulty, &flags) && 2933 !test_bit(Journal, &flags) && 2934 !test_bit(In_sync, &flags)) 2935 len += sprintf(page+len, "spare%s", sep); 2936 if (test_bit(WriteErrorSeen, &flags)) 2937 len += sprintf(page+len, "write_error%s", sep); 2938 if (test_bit(WantReplacement, &flags)) 2939 len += sprintf(page+len, "want_replacement%s", sep); 2940 if (test_bit(Replacement, &flags)) 2941 len += sprintf(page+len, "replacement%s", sep); 2942 if (test_bit(ExternalBbl, &flags)) 2943 len += sprintf(page+len, "external_bbl%s", sep); 2944 if (test_bit(FailFast, &flags)) 2945 len += sprintf(page+len, "failfast%s", sep); 2946 2947 if (len) 2948 len -= strlen(sep); 2949 2950 return len+sprintf(page+len, "\n"); 2951 } 2952 2953 static ssize_t 2954 state_store(struct md_rdev *rdev, const char *buf, size_t len) 2955 { 2956 /* can write 2957 * faulty - simulates an error 2958 * remove - disconnects the device 2959 * writemostly - sets write_mostly 2960 * -writemostly - clears write_mostly 2961 * blocked - sets the Blocked flag 2962 * -blocked - clears the Blocked flag and possibly simulates an error 2963 * insync - sets In_sync providing device isn't active 2964 * -insync - clear In_sync for a device with a slot assigned, 2965 * so that it gets rebuilt based on bitmap 2966 * write_error - sets WriteErrorSeen 2967 * -write_error - clears WriteErrorSeen 2968 * {,-}failfast - set/clear FailFast 2969 */ 2970 2971 struct mddev *mddev = rdev->mddev; 2972 int err = -EINVAL; 2973 bool need_update_sb = false; 2974 2975 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 2976 md_error(rdev->mddev, rdev);
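		/*
		 * If md_error() could not mark this device Faulty (for
		 * example because the personality flagged the whole array
		 * MD_BROKEN instead), report -EBUSY below.
		 */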
2977 2978 if (test_bit(MD_BROKEN, &rdev->mddev->flags)) 2979 err = -EBUSY; 2980 else 2981 err = 0; 2982 } else if (cmd_match(buf, "remove")) { 2983 if (rdev->mddev->pers) { 2984 clear_bit(Blocked, &rdev->flags); 2985 remove_and_add_spares(rdev->mddev, rdev); 2986 } 2987 if (rdev->raid_disk >= 0) 2988 err = -EBUSY; 2989 else { 2990 err = 0; 2991 if (mddev_is_clustered(mddev)) 2992 err = mddev->cluster_ops->remove_disk(mddev, rdev); 2993 2994 if (err == 0) { 2995 md_kick_rdev_from_array(rdev); 2996 if (mddev->pers) 2997 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2998 md_new_event(); 2999 } 3000 } 3001 } else if (cmd_match(buf, "writemostly")) { 3002 set_bit(WriteMostly, &rdev->flags); 3003 mddev_create_serial_pool(rdev->mddev, rdev); 3004 need_update_sb = true; 3005 err = 0; 3006 } else if (cmd_match(buf, "-writemostly")) { 3007 mddev_destroy_serial_pool(rdev->mddev, rdev); 3008 clear_bit(WriteMostly, &rdev->flags); 3009 need_update_sb = true; 3010 err = 0; 3011 } else if (cmd_match(buf, "blocked")) { 3012 set_bit(Blocked, &rdev->flags); 3013 err = 0; 3014 } else if (cmd_match(buf, "-blocked")) { 3015 if (!test_bit(Faulty, &rdev->flags) && 3016 !test_bit(ExternalBbl, &rdev->flags) && 3017 rdev->badblocks.unacked_exist) { 3018 /* metadata handler doesn't understand badblocks, 3019 * so we need to fail the device 3020 */ 3021 md_error(rdev->mddev, rdev); 3022 } 3023 clear_bit(Blocked, &rdev->flags); 3024 clear_bit(BlockedBadBlocks, &rdev->flags); 3025 wake_up(&rdev->blocked_wait); 3026 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3027 3028 err = 0; 3029 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 3030 set_bit(In_sync, &rdev->flags); 3031 err = 0; 3032 } else if (cmd_match(buf, "failfast")) { 3033 set_bit(FailFast, &rdev->flags); 3034 need_update_sb = true; 3035 err = 0; 3036 } else if (cmd_match(buf, "-failfast")) { 3037 clear_bit(FailFast, &rdev->flags); 3038 need_update_sb = true; 3039 err = 0; 3040 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && 3041 !test_bit(Journal, &rdev->flags)) { 3042 if (rdev->mddev->pers == NULL) { 3043 clear_bit(In_sync, &rdev->flags); 3044 rdev->saved_raid_disk = rdev->raid_disk; 3045 rdev->raid_disk = -1; 3046 err = 0; 3047 } 3048 } else if (cmd_match(buf, "write_error")) { 3049 set_bit(WriteErrorSeen, &rdev->flags); 3050 err = 0; 3051 } else if (cmd_match(buf, "-write_error")) { 3052 clear_bit(WriteErrorSeen, &rdev->flags); 3053 err = 0; 3054 } else if (cmd_match(buf, "want_replacement")) { 3055 /* Any non-spare device that is not a replacement can 3056 * become want_replacement at any time, but we then need to 3057 * check if recovery is needed. 3058 */ 3059 if (rdev->raid_disk >= 0 && 3060 !test_bit(Journal, &rdev->flags) && 3061 !test_bit(Replacement, &rdev->flags)) 3062 set_bit(WantReplacement, &rdev->flags); 3063 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3064 err = 0; 3065 } else if (cmd_match(buf, "-want_replacement")) { 3066 /* Clearing 'want_replacement' is always allowed. 3067 * Once replacements starts it is too late though. 3068 */ 3069 err = 0; 3070 clear_bit(WantReplacement, &rdev->flags); 3071 } else if (cmd_match(buf, "replacement")) { 3072 /* Can only set a device as a replacement when array has not 3073 * yet been started. Once running, replacement is automatic 3074 * from spares, or by assigning 'slot'. 
3075 */ 3076 if (rdev->mddev->pers) 3077 err = -EBUSY; 3078 else { 3079 set_bit(Replacement, &rdev->flags); 3080 err = 0; 3081 } 3082 } else if (cmd_match(buf, "-replacement")) { 3083 /* Similarly, can only clear Replacement before start */ 3084 if (rdev->mddev->pers) 3085 err = -EBUSY; 3086 else { 3087 clear_bit(Replacement, &rdev->flags); 3088 err = 0; 3089 } 3090 } else if (cmd_match(buf, "re-add")) { 3091 if (!rdev->mddev->pers) 3092 err = -EINVAL; 3093 else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) && 3094 rdev->saved_raid_disk >= 0) { 3095 /* clear_bit is performed _after_ all the devices 3096 * have their local Faulty bit cleared. If any writes 3097 * happen in the meantime in the local node, they 3098 * will land in the local bitmap, which will be synced 3099 * by this node eventually 3100 */ 3101 if (!mddev_is_clustered(rdev->mddev) || 3102 (err = mddev->cluster_ops->gather_bitmaps(rdev)) == 0) { 3103 clear_bit(Faulty, &rdev->flags); 3104 err = add_bound_rdev(rdev); 3105 } 3106 } else 3107 err = -EBUSY; 3108 } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) { 3109 set_bit(ExternalBbl, &rdev->flags); 3110 rdev->badblocks.shift = 0; 3111 err = 0; 3112 } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) { 3113 clear_bit(ExternalBbl, &rdev->flags); 3114 err = 0; 3115 } 3116 if (need_update_sb) 3117 md_update_sb(mddev, 1); 3118 if (!err) 3119 sysfs_notify_dirent_safe(rdev->sysfs_state); 3120 return err ? err : len; 3121 } 3122 static struct rdev_sysfs_entry rdev_state = 3123 __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store); 3124 3125 static ssize_t 3126 errors_show(struct md_rdev *rdev, char *page) 3127 { 3128 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 3129 } 3130 3131 static ssize_t 3132 errors_store(struct md_rdev *rdev, const char *buf, size_t len) 3133 { 3134 unsigned int n; 3135 int rv; 3136 3137 rv = kstrtouint(buf, 10, &n); 3138 if (rv < 0) 3139 return rv; 3140 atomic_set(&rdev->corrected_errors, n); 3141 return len; 3142 } 3143 static struct rdev_sysfs_entry rdev_errors = 3144 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 3145 3146 static ssize_t 3147 slot_show(struct md_rdev *rdev, char *page) 3148 { 3149 if (test_bit(Journal, &rdev->flags)) 3150 return sprintf(page, "journal\n"); 3151 else if (rdev->raid_disk < 0) 3152 return sprintf(page, "none\n"); 3153 else 3154 return sprintf(page, "%d\n", rdev->raid_disk); 3155 } 3156 3157 static ssize_t 3158 slot_store(struct md_rdev *rdev, const char *buf, size_t len) 3159 { 3160 int slot; 3161 int err; 3162 3163 if (test_bit(Journal, &rdev->flags)) 3164 return -EBUSY; 3165 if (strncmp(buf, "none", 4)==0) 3166 slot = -1; 3167 else { 3168 err = kstrtouint(buf, 10, (unsigned int *)&slot); 3169 if (err < 0) 3170 return err; 3171 if (slot < 0) 3172 /* overflow */ 3173 return -ENOSPC; 3174 } 3175 if (rdev->mddev->pers && slot == -1) { 3176 /* Setting 'slot' on an active array requires also 3177 * updating the 'rd%d' link, and communicating 3178 * with the personality with ->hot_*_disk. 3179 * For now we only support removing 3180 * failed/spare devices. This normally happens automatically, 3181 * but not when the metadata is externally managed. 
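 * (e.g. userspace writes "none" to /sys/block/mdX/md/dev-XXX/slot to
 * release a failed or spare device from its slot.)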
3182 */ 3183 if (rdev->raid_disk == -1) 3184 return -EEXIST; 3185 /* personality does all needed checks */ 3186 if (rdev->mddev->pers->hot_remove_disk == NULL) 3187 return -EINVAL; 3188 clear_bit(Blocked, &rdev->flags); 3189 remove_and_add_spares(rdev->mddev, rdev); 3190 if (rdev->raid_disk >= 0) 3191 return -EBUSY; 3192 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3193 } else if (rdev->mddev->pers) { 3194 /* Activating a spare .. or possibly reactivating 3195 * if we ever get bitmaps working here. 3196 */ 3197 int err; 3198 3199 if (rdev->raid_disk != -1) 3200 return -EBUSY; 3201 3202 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) 3203 return -EBUSY; 3204 3205 if (rdev->mddev->pers->hot_add_disk == NULL) 3206 return -EINVAL; 3207 3208 if (slot >= rdev->mddev->raid_disks && 3209 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 3210 return -ENOSPC; 3211 3212 rdev->raid_disk = slot; 3213 if (test_bit(In_sync, &rdev->flags)) 3214 rdev->saved_raid_disk = slot; 3215 else 3216 rdev->saved_raid_disk = -1; 3217 clear_bit(In_sync, &rdev->flags); 3218 clear_bit(Bitmap_sync, &rdev->flags); 3219 err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev); 3220 if (err) { 3221 rdev->raid_disk = -1; 3222 return err; 3223 } else 3224 sysfs_notify_dirent_safe(rdev->sysfs_state); 3225 /* failure here is OK */; 3226 sysfs_link_rdev(rdev->mddev, rdev); 3227 /* don't wakeup anyone, leave that to userspace. */ 3228 } else { 3229 if (slot >= rdev->mddev->raid_disks && 3230 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 3231 return -ENOSPC; 3232 rdev->raid_disk = slot; 3233 /* assume it is working */ 3234 clear_bit(Faulty, &rdev->flags); 3235 clear_bit(WriteMostly, &rdev->flags); 3236 set_bit(In_sync, &rdev->flags); 3237 sysfs_notify_dirent_safe(rdev->sysfs_state); 3238 } 3239 return len; 3240 } 3241 3242 static struct rdev_sysfs_entry rdev_slot = 3243 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 3244 3245 static ssize_t 3246 offset_show(struct md_rdev *rdev, char *page) 3247 { 3248 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 3249 } 3250 3251 static ssize_t 3252 offset_store(struct md_rdev *rdev, const char *buf, size_t len) 3253 { 3254 unsigned long long offset; 3255 if (kstrtoull(buf, 10, &offset) < 0) 3256 return -EINVAL; 3257 if (rdev->mddev->pers && rdev->raid_disk >= 0) 3258 return -EBUSY; 3259 if (rdev->sectors && rdev->mddev->external) 3260 /* Must set offset before size, so overlap checks 3261 * can be sane */ 3262 return -EBUSY; 3263 rdev->data_offset = offset; 3264 rdev->new_data_offset = offset; 3265 return len; 3266 } 3267 3268 static struct rdev_sysfs_entry rdev_offset = 3269 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 3270 3271 static ssize_t new_offset_show(struct md_rdev *rdev, char *page) 3272 { 3273 return sprintf(page, "%llu\n", 3274 (unsigned long long)rdev->new_data_offset); 3275 } 3276 3277 static ssize_t new_offset_store(struct md_rdev *rdev, 3278 const char *buf, size_t len) 3279 { 3280 unsigned long long new_offset; 3281 struct mddev *mddev = rdev->mddev; 3282 3283 if (kstrtoull(buf, 10, &new_offset) < 0) 3284 return -EINVAL; 3285 3286 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3287 return -EBUSY; 3288 if (new_offset == rdev->data_offset) 3289 /* reset is always permitted */ 3290 ; 3291 else if (new_offset > rdev->data_offset) { 3292 /* must not push array size beyond rdev_sectors */ 3293 if (new_offset - rdev->data_offset 3294 + mddev->dev_sectors > rdev->sectors) 3295 return -E2BIG; 
3296 } 3297 /* Metadata worries about other space details. */ 3298 3299 /* decreasing the offset is inconsistent with a backwards 3300 * reshape. 3301 */ 3302 if (new_offset < rdev->data_offset && 3303 mddev->reshape_backwards) 3304 return -EINVAL; 3305 /* Increasing offset is inconsistent with forwards 3306 * reshape. reshape_direction should be set to 3307 * 'backwards' first. 3308 */ 3309 if (new_offset > rdev->data_offset && 3310 !mddev->reshape_backwards) 3311 return -EINVAL; 3312 3313 if (mddev->pers && mddev->persistent && 3314 !super_types[mddev->major_version] 3315 .allow_new_offset(rdev, new_offset)) 3316 return -E2BIG; 3317 rdev->new_data_offset = new_offset; 3318 if (new_offset > rdev->data_offset) 3319 mddev->reshape_backwards = 1; 3320 else if (new_offset < rdev->data_offset) 3321 mddev->reshape_backwards = 0; 3322 3323 return len; 3324 } 3325 static struct rdev_sysfs_entry rdev_new_offset = 3326 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); 3327 3328 static ssize_t 3329 rdev_size_show(struct md_rdev *rdev, char *page) 3330 { 3331 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); 3332 } 3333 3334 static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b) 3335 { 3336 /* check if two start/length pairs overlap */ 3337 if (a->data_offset + a->sectors <= b->data_offset) 3338 return false; 3339 if (b->data_offset + b->sectors <= a->data_offset) 3340 return false; 3341 return true; 3342 } 3343 3344 static bool md_rdev_overlaps(struct md_rdev *rdev) 3345 { 3346 struct mddev *mddev; 3347 struct md_rdev *rdev2; 3348 3349 spin_lock(&all_mddevs_lock); 3350 list_for_each_entry(mddev, &all_mddevs, all_mddevs) { 3351 if (test_bit(MD_DELETED, &mddev->flags)) 3352 continue; 3353 rdev_for_each(rdev2, mddev) { 3354 if (rdev != rdev2 && rdev->bdev == rdev2->bdev && 3355 md_rdevs_overlap(rdev, rdev2)) { 3356 spin_unlock(&all_mddevs_lock); 3357 return true; 3358 } 3359 } 3360 } 3361 spin_unlock(&all_mddevs_lock); 3362 return false; 3363 } 3364 3365 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) 3366 { 3367 unsigned long long blocks; 3368 sector_t new; 3369 3370 if (kstrtoull(buf, 10, &blocks) < 0) 3371 return -EINVAL; 3372 3373 if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) 3374 return -EINVAL; /* sector conversion overflow */ 3375 3376 new = blocks * 2; 3377 if (new != blocks * 2) 3378 return -EINVAL; /* unsigned long long to sector_t overflow */ 3379 3380 *sectors = new; 3381 return 0; 3382 } 3383 3384 static ssize_t 3385 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) 3386 { 3387 struct mddev *my_mddev = rdev->mddev; 3388 sector_t oldsectors = rdev->sectors; 3389 sector_t sectors; 3390 3391 if (test_bit(Journal, &rdev->flags)) 3392 return -EBUSY; 3393 if (strict_blocks_to_sectors(buf, §ors) < 0) 3394 return -EINVAL; 3395 if (rdev->data_offset != rdev->new_data_offset) 3396 return -EINVAL; /* too confusing */ 3397 if (my_mddev->pers && rdev->raid_disk >= 0) { 3398 if (my_mddev->persistent) { 3399 sectors = super_types[my_mddev->major_version]. 
3400 rdev_size_change(rdev, sectors); 3401 if (!sectors) 3402 return -EBUSY; 3403 } else if (!sectors) 3404 sectors = bdev_nr_sectors(rdev->bdev) - 3405 rdev->data_offset; 3406 if (!my_mddev->pers->resize) 3407 /* Cannot change size for RAID0 or Linear etc */ 3408 return -EINVAL; 3409 } 3410 if (sectors < my_mddev->dev_sectors) 3411 return -EINVAL; /* component must fit device */ 3412 3413 rdev->sectors = sectors; 3414 3415 /* 3416 * Check that all other rdevs with the same bdev do not overlap. This 3417 * check does not provide a hard guarantee, it just helps avoid 3418 * dangerous mistakes. 3419 */ 3420 if (sectors > oldsectors && my_mddev->external && 3421 md_rdev_overlaps(rdev)) { 3422 /* 3423 * Someone else could have slipped in a size change here, but 3424 * doing so is just silly. We put oldsectors back because we 3425 * know it is safe, and trust userspace not to race with itself. 3426 */ 3427 rdev->sectors = oldsectors; 3428 return -EBUSY; 3429 } 3430 return len; 3431 } 3432 3433 static struct rdev_sysfs_entry rdev_size = 3434 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 3435 3436 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) 3437 { 3438 unsigned long long recovery_start = rdev->recovery_offset; 3439 3440 if (test_bit(In_sync, &rdev->flags) || 3441 recovery_start == MaxSector) 3442 return sprintf(page, "none\n"); 3443 3444 return sprintf(page, "%llu\n", recovery_start); 3445 } 3446 3447 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) 3448 { 3449 unsigned long long recovery_start; 3450 3451 if (cmd_match(buf, "none")) 3452 recovery_start = MaxSector; 3453 else if (kstrtoull(buf, 10, &recovery_start)) 3454 return -EINVAL; 3455 3456 if (rdev->mddev->pers && 3457 rdev->raid_disk >= 0) 3458 return -EBUSY; 3459 3460 rdev->recovery_offset = recovery_start; 3461 if (recovery_start == MaxSector) 3462 set_bit(In_sync, &rdev->flags); 3463 else 3464 clear_bit(In_sync, &rdev->flags); 3465 return len; 3466 } 3467 3468 static struct rdev_sysfs_entry rdev_recovery_start = 3469 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); 3470 3471 /* sysfs access to bad-blocks list. 3472 * We present two files. 3473 * 'bad-blocks' lists sector numbers and lengths of ranges that 3474 * are recorded as bad. The list is truncated to fit within 3475 * the one-page limit of sysfs. 3476 * Writing "sector length" to this file adds an acknowledged 3477 * bad block list. 3478 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet 3479 * been acknowledged. Writing to this file adds bad blocks 3480 * without acknowledging them. This is largely for testing. 
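 * For example, writing "2345 7" to either file records a bad range of
 * seven sectors starting at sector 2345 of this device.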
3481 */ 3482 static ssize_t bb_show(struct md_rdev *rdev, char *page) 3483 { 3484 return badblocks_show(&rdev->badblocks, page, 0); 3485 } 3486 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) 3487 { 3488 int rv = badblocks_store(&rdev->badblocks, page, len, 0); 3489 /* Maybe that ack was all we needed */ 3490 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) 3491 wake_up(&rdev->blocked_wait); 3492 return rv; 3493 } 3494 static struct rdev_sysfs_entry rdev_bad_blocks = 3495 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); 3496 3497 static ssize_t ubb_show(struct md_rdev *rdev, char *page) 3498 { 3499 return badblocks_show(&rdev->badblocks, page, 1); 3500 } 3501 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) 3502 { 3503 return badblocks_store(&rdev->badblocks, page, len, 1); 3504 } 3505 static struct rdev_sysfs_entry rdev_unack_bad_blocks = 3506 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); 3507 3508 static ssize_t 3509 ppl_sector_show(struct md_rdev *rdev, char *page) 3510 { 3511 return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector); 3512 } 3513 3514 static ssize_t 3515 ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len) 3516 { 3517 unsigned long long sector; 3518 3519 if (kstrtoull(buf, 10, §or) < 0) 3520 return -EINVAL; 3521 if (sector != (sector_t)sector) 3522 return -EINVAL; 3523 3524 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && 3525 rdev->raid_disk >= 0) 3526 return -EBUSY; 3527 3528 if (rdev->mddev->persistent) { 3529 if (rdev->mddev->major_version == 0) 3530 return -EINVAL; 3531 if ((sector > rdev->sb_start && 3532 sector - rdev->sb_start > S16_MAX) || 3533 (sector < rdev->sb_start && 3534 rdev->sb_start - sector > -S16_MIN)) 3535 return -EINVAL; 3536 rdev->ppl.offset = sector - rdev->sb_start; 3537 } else if (!rdev->mddev->external) { 3538 return -EBUSY; 3539 } 3540 rdev->ppl.sector = sector; 3541 return len; 3542 } 3543 3544 static struct rdev_sysfs_entry rdev_ppl_sector = 3545 __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store); 3546 3547 static ssize_t 3548 ppl_size_show(struct md_rdev *rdev, char *page) 3549 { 3550 return sprintf(page, "%u\n", rdev->ppl.size); 3551 } 3552 3553 static ssize_t 3554 ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len) 3555 { 3556 unsigned int size; 3557 3558 if (kstrtouint(buf, 10, &size) < 0) 3559 return -EINVAL; 3560 3561 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && 3562 rdev->raid_disk >= 0) 3563 return -EBUSY; 3564 3565 if (rdev->mddev->persistent) { 3566 if (rdev->mddev->major_version == 0) 3567 return -EINVAL; 3568 if (size > U16_MAX) 3569 return -EINVAL; 3570 } else if (!rdev->mddev->external) { 3571 return -EBUSY; 3572 } 3573 rdev->ppl.size = size; 3574 return len; 3575 } 3576 3577 static struct rdev_sysfs_entry rdev_ppl_size = 3578 __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store); 3579 3580 static struct attribute *rdev_default_attrs[] = { 3581 &rdev_state.attr, 3582 &rdev_errors.attr, 3583 &rdev_slot.attr, 3584 &rdev_offset.attr, 3585 &rdev_new_offset.attr, 3586 &rdev_size.attr, 3587 &rdev_recovery_start.attr, 3588 &rdev_bad_blocks.attr, 3589 &rdev_unack_bad_blocks.attr, 3590 &rdev_ppl_sector.attr, 3591 &rdev_ppl_size.attr, 3592 NULL, 3593 }; 3594 ATTRIBUTE_GROUPS(rdev_default); 3595 static ssize_t 3596 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3597 { 3598 struct rdev_sysfs_entry 
*entry = container_of(attr, struct rdev_sysfs_entry, attr); 3599 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3600 3601 if (!entry->show) 3602 return -EIO; 3603 if (!rdev->mddev) 3604 return -ENODEV; 3605 return entry->show(rdev, page); 3606 } 3607 3608 static ssize_t 3609 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 3610 const char *page, size_t length) 3611 { 3612 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3613 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3614 struct kernfs_node *kn = NULL; 3615 bool suspend = false; 3616 ssize_t rv; 3617 struct mddev *mddev = READ_ONCE(rdev->mddev); 3618 3619 if (!entry->store) 3620 return -EIO; 3621 if (!capable(CAP_SYS_ADMIN)) 3622 return -EACCES; 3623 if (!mddev) 3624 return -ENODEV; 3625 3626 if (entry->store == state_store) { 3627 if (cmd_match(page, "remove")) 3628 kn = sysfs_break_active_protection(kobj, attr); 3629 if (cmd_match(page, "remove") || cmd_match(page, "re-add") || 3630 cmd_match(page, "writemostly") || 3631 cmd_match(page, "-writemostly")) 3632 suspend = true; 3633 } 3634 3635 rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev); 3636 if (!rv) { 3637 if (rdev->mddev == NULL) 3638 rv = -ENODEV; 3639 else 3640 rv = entry->store(rdev, page, length); 3641 suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); 3642 } 3643 3644 if (kn) 3645 sysfs_unbreak_active_protection(kn); 3646 3647 return rv; 3648 } 3649 3650 static void rdev_free(struct kobject *ko) 3651 { 3652 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); 3653 kfree(rdev); 3654 } 3655 static const struct sysfs_ops rdev_sysfs_ops = { 3656 .show = rdev_attr_show, 3657 .store = rdev_attr_store, 3658 }; 3659 static const struct kobj_type rdev_ktype = { 3660 .release = rdev_free, 3661 .sysfs_ops = &rdev_sysfs_ops, 3662 .default_groups = rdev_default_groups, 3663 }; 3664 3665 int md_rdev_init(struct md_rdev *rdev) 3666 { 3667 rdev->desc_nr = -1; 3668 rdev->saved_raid_disk = -1; 3669 rdev->raid_disk = -1; 3670 rdev->flags = 0; 3671 rdev->data_offset = 0; 3672 rdev->new_data_offset = 0; 3673 rdev->sb_events = 0; 3674 rdev->last_read_error = 0; 3675 rdev->sb_loaded = 0; 3676 rdev->bb_page = NULL; 3677 atomic_set(&rdev->nr_pending, 0); 3678 atomic_set(&rdev->read_errors, 0); 3679 atomic_set(&rdev->corrected_errors, 0); 3680 3681 INIT_LIST_HEAD(&rdev->same_set); 3682 init_waitqueue_head(&rdev->blocked_wait); 3683 3684 /* Add space to store bad block list. 3685 * This reserves the space even on arrays where it cannot 3686 * be used - I wonder if that matters 3687 */ 3688 return badblocks_init(&rdev->badblocks, 0); 3689 } 3690 EXPORT_SYMBOL_GPL(md_rdev_init); 3691 3692 /* 3693 * Import a device. If 'super_format' >= 0, then sanity check the superblock 3694 * 3695 * mark the device faulty if: 3696 * 3697 * - the device is nonexistent (zero size) 3698 * - the device has no valid superblock 3699 * 3700 * a faulty rdev _never_ has rdev->sb set. 
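 * On success the returned rdev has its block device open and, when
 * super_format >= 0, its superblock page loaded.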
3701 */ 3702 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) 3703 { 3704 struct md_rdev *rdev; 3705 sector_t size; 3706 int err; 3707 3708 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 3709 if (!rdev) 3710 return ERR_PTR(-ENOMEM); 3711 3712 err = md_rdev_init(rdev); 3713 if (err) 3714 goto out_free_rdev; 3715 err = alloc_disk_sb(rdev); 3716 if (err) 3717 goto out_clear_rdev; 3718 3719 rdev->bdev_file = bdev_file_open_by_dev(newdev, 3720 BLK_OPEN_READ | BLK_OPEN_WRITE, 3721 super_format == -2 ? &claim_rdev : rdev, NULL); 3722 if (IS_ERR(rdev->bdev_file)) { 3723 pr_warn("md: could not open device unknown-block(%u,%u).\n", 3724 MAJOR(newdev), MINOR(newdev)); 3725 err = PTR_ERR(rdev->bdev_file); 3726 goto out_clear_rdev; 3727 } 3728 rdev->bdev = file_bdev(rdev->bdev_file); 3729 3730 kobject_init(&rdev->kobj, &rdev_ktype); 3731 3732 size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS; 3733 if (!size) { 3734 pr_warn("md: %pg has zero or unknown size, marking faulty!\n", 3735 rdev->bdev); 3736 err = -EINVAL; 3737 goto out_blkdev_put; 3738 } 3739 3740 if (super_format >= 0) { 3741 err = super_types[super_format]. 3742 load_super(rdev, NULL, super_minor); 3743 if (err == -EINVAL) { 3744 pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n", 3745 rdev->bdev, 3746 super_format, super_minor); 3747 goto out_blkdev_put; 3748 } 3749 if (err < 0) { 3750 pr_warn("md: could not read %pg's sb, not importing!\n", 3751 rdev->bdev); 3752 goto out_blkdev_put; 3753 } 3754 } 3755 3756 return rdev; 3757 3758 out_blkdev_put: 3759 fput(rdev->bdev_file); 3760 out_clear_rdev: 3761 md_rdev_clear(rdev); 3762 out_free_rdev: 3763 kfree(rdev); 3764 return ERR_PTR(err); 3765 } 3766 3767 /* 3768 * Check a full RAID array for plausibility 3769 */ 3770 3771 static int analyze_sbs(struct mddev *mddev) 3772 { 3773 int i; 3774 struct md_rdev *rdev, *freshest, *tmp; 3775 3776 freshest = NULL; 3777 rdev_for_each_safe(rdev, tmp, mddev) 3778 switch (super_types[mddev->major_version]. 3779 load_super(rdev, freshest, mddev->minor_version)) { 3780 case 1: 3781 freshest = rdev; 3782 break; 3783 case 0: 3784 break; 3785 default: 3786 pr_warn("md: fatal superblock inconsistency in %pg -- removing from array\n", 3787 rdev->bdev); 3788 md_kick_rdev_from_array(rdev); 3789 } 3790 3791 /* Cannot find a valid fresh disk */ 3792 if (!freshest) { 3793 pr_warn("md: cannot find a valid disk\n"); 3794 return -EINVAL; 3795 } 3796 3797 super_types[mddev->major_version]. 3798 validate_super(mddev, NULL/*freshest*/, freshest); 3799 3800 i = 0; 3801 rdev_for_each_safe(rdev, tmp, mddev) { 3802 if (mddev->max_disks && 3803 (rdev->desc_nr >= mddev->max_disks || 3804 i > mddev->max_disks)) { 3805 pr_warn("md: %s: %pg: only %d devices permitted\n", 3806 mdname(mddev), rdev->bdev, 3807 mddev->max_disks); 3808 md_kick_rdev_from_array(rdev); 3809 continue; 3810 } 3811 if (rdev != freshest) { 3812 if (super_types[mddev->major_version]. 3813 validate_super(mddev, freshest, rdev)) { 3814 pr_warn("md: kicking non-fresh %pg from array!\n", 3815 rdev->bdev); 3816 md_kick_rdev_from_array(rdev); 3817 continue; 3818 } 3819 } 3820 if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks)) && 3821 !test_bit(Journal, &rdev->flags)) { 3822 rdev->raid_disk = -1; 3823 clear_bit(In_sync, &rdev->flags); 3824 } 3825 } 3826 3827 return 0; 3828 } 3829 3830 /* Read a fixed-point number. 3831 * Numbers in sysfs attributes should be in "standard" units where 3832 * possible, so time should be in seconds. 
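 * For example, a 0.2 second delay is written as "0.200"; parsed with
 * scale=3 (as safe_delay_store() below does) this yields 200, i.e. a
 * count in milliseconds.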
3833 * However we internally use a much smaller unit such as
3834 * milliseconds or jiffies.
3835 * This function takes a decimal number with a possible fractional
3836 * component, and produces an integer which is the result of
3837 * multiplying that number by 10^'scale'.
3838 * all without any floating-point arithmetic.
 * For example, parsing "0.200" with scale 3 yields 200.
3839 */
3840 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
3841 {
3842 unsigned long result = 0;
3843 long decimals = -1;
3844 while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
3845 if (*cp == '.')
3846 decimals = 0;
3847 else if (decimals < scale) {
3848 unsigned int value;
3849 value = *cp - '0';
3850 result = result * 10 + value;
3851 if (decimals >= 0)
3852 decimals++;
3853 }
3854 cp++;
3855 }
3856 if (*cp == '\n')
3857 cp++;
3858 if (*cp)
3859 return -EINVAL;
3860 if (decimals < 0)
3861 decimals = 0;
3862 *res = result * int_pow(10, scale - decimals);
3863 return 0;
3864 }
3865
3866 static ssize_t
3867 safe_delay_show(struct mddev *mddev, char *page)
3868 {
3869 unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ;
3870
3871 return sprintf(page, "%u.%03u\n", msec/1000, msec%1000);
3872 }
3873 static ssize_t
3874 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3875 {
3876 unsigned long msec;
3877
3878 if (mddev_is_clustered(mddev)) {
3879 pr_warn("md: Safemode is disabled for clustered mode\n");
3880 return -EINVAL;
3881 }
3882
3883 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ)
3884 return -EINVAL;
3885 if (msec == 0)
3886 mddev->safemode_delay = 0;
3887 else {
3888 unsigned long old_delay = mddev->safemode_delay;
3889 unsigned long new_delay = (msec*HZ)/1000;
3890
3891 if (new_delay == 0)
3892 new_delay = 1;
3893 mddev->safemode_delay = new_delay;
3894 if (new_delay < old_delay || old_delay == 0)
3895 mod_timer(&mddev->safemode_timer, jiffies+1);
3896 }
3897 return len;
3898 }
3899 static struct md_sysfs_entry md_safe_delay =
3900 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
3901
3902 static ssize_t
3903 level_show(struct mddev *mddev, char *page)
3904 {
3905 struct md_personality *p;
3906 int ret;
3907 spin_lock(&mddev->lock);
3908 p = mddev->pers;
3909 if (p)
3910 ret = sprintf(page, "%s\n", p->head.name);
3911 else if (mddev->clevel[0])
3912 ret = sprintf(page, "%s\n", mddev->clevel);
3913 else if (mddev->level != LEVEL_NONE)
3914 ret = sprintf(page, "%d\n", mddev->level);
3915 else
3916 ret = 0;
3917 spin_unlock(&mddev->lock);
3918 return ret;
3919 }
3920
3921 static ssize_t
3922 level_store(struct mddev *mddev, const char *buf, size_t len)
3923 {
3924 char clevel[16];
3925 ssize_t rv;
3926 size_t slen = len;
3927 struct md_personality *pers, *oldpers;
3928 long level;
3929 void *priv, *oldpriv;
3930 struct md_rdev *rdev;
3931
3932 if (slen == 0 || slen >= sizeof(clevel))
3933 return -EINVAL;
3934
3935 rv = mddev_suspend_and_lock(mddev);
3936 if (rv)
3937 return rv;
3938
3939 if (mddev->pers == NULL) {
3940 memcpy(mddev->clevel, buf, slen);
3941 if (mddev->clevel[slen-1] == '\n')
3942 slen--;
3943 mddev->clevel[slen] = 0;
3944 mddev->level = LEVEL_NONE;
3945 rv = len;
3946 goto out_unlock;
3947 }
3948 rv = -EROFS;
3949 if (!md_is_rdwr(mddev))
3950 goto out_unlock;
3951
3952 /* request to change the personality. Need to ensure:
3953 * - array is not engaged in resync/recovery/reshape
3954 * - old personality can be suspended
3955 * - new personality will access other array.
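 *
 * This handler backs the "level" sysfs attribute, so an online conversion
 * is typically requested with something like
 * "echo raid5 > /sys/block/mdX/md/level" (mdX being a placeholder array
 * name), and it only succeeds if the target personality's ->takeover
 * accepts the current geometry.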
3956 */ 3957 3958 rv = -EBUSY; 3959 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3960 mddev->reshape_position != MaxSector || 3961 mddev->sysfs_active) 3962 goto out_unlock; 3963 3964 rv = -EINVAL; 3965 if (!mddev->pers->quiesce) { 3966 pr_warn("md: %s: %s does not support online personality change\n", 3967 mdname(mddev), mddev->pers->head.name); 3968 goto out_unlock; 3969 } 3970 3971 /* Now find the new personality */ 3972 memcpy(clevel, buf, slen); 3973 if (clevel[slen-1] == '\n') 3974 slen--; 3975 clevel[slen] = 0; 3976 if (kstrtol(clevel, 10, &level)) 3977 level = LEVEL_NONE; 3978 3979 if (request_module("md-%s", clevel) != 0) 3980 request_module("md-level-%s", clevel); 3981 pers = get_pers(level, clevel); 3982 if (!pers) { 3983 rv = -EINVAL; 3984 goto out_unlock; 3985 } 3986 3987 if (pers == mddev->pers) { 3988 /* Nothing to do! */ 3989 put_pers(pers); 3990 rv = len; 3991 goto out_unlock; 3992 } 3993 if (!pers->takeover) { 3994 put_pers(pers); 3995 pr_warn("md: %s: %s does not support personality takeover\n", 3996 mdname(mddev), clevel); 3997 rv = -EINVAL; 3998 goto out_unlock; 3999 } 4000 4001 rdev_for_each(rdev, mddev) 4002 rdev->new_raid_disk = rdev->raid_disk; 4003 4004 /* ->takeover must set new_* and/or delta_disks 4005 * if it succeeds, and may set them when it fails. 4006 */ 4007 priv = pers->takeover(mddev); 4008 if (IS_ERR(priv)) { 4009 mddev->new_level = mddev->level; 4010 mddev->new_layout = mddev->layout; 4011 mddev->new_chunk_sectors = mddev->chunk_sectors; 4012 mddev->raid_disks -= mddev->delta_disks; 4013 mddev->delta_disks = 0; 4014 mddev->reshape_backwards = 0; 4015 put_pers(pers); 4016 pr_warn("md: %s: %s would not accept array\n", 4017 mdname(mddev), clevel); 4018 rv = PTR_ERR(priv); 4019 goto out_unlock; 4020 } 4021 4022 /* Looks like we have a winner */ 4023 mddev_detach(mddev); 4024 4025 spin_lock(&mddev->lock); 4026 oldpers = mddev->pers; 4027 oldpriv = mddev->private; 4028 mddev->pers = pers; 4029 mddev->private = priv; 4030 strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel)); 4031 mddev->level = mddev->new_level; 4032 mddev->layout = mddev->new_layout; 4033 mddev->chunk_sectors = mddev->new_chunk_sectors; 4034 mddev->delta_disks = 0; 4035 mddev->reshape_backwards = 0; 4036 mddev->degraded = 0; 4037 spin_unlock(&mddev->lock); 4038 4039 if (oldpers->sync_request == NULL && 4040 mddev->external) { 4041 /* We are converting from a no-redundancy array 4042 * to a redundancy array and metadata is managed 4043 * externally so we need to be sure that writes 4044 * won't block due to a need to transition 4045 * clean->dirty 4046 * until external management is started. 
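 * (A raid0 -> raid5 takeover on an array with externally managed metadata
 * is one example of this situation.)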
4047 */ 4048 mddev->in_sync = 0; 4049 mddev->safemode_delay = 0; 4050 mddev->safemode = 0; 4051 } 4052 4053 oldpers->free(mddev, oldpriv); 4054 4055 if (oldpers->sync_request == NULL && 4056 pers->sync_request != NULL) { 4057 /* need to add the md_redundancy_group */ 4058 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 4059 pr_warn("md: cannot register extra attributes for %s\n", 4060 mdname(mddev)); 4061 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 4062 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); 4063 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); 4064 } 4065 if (oldpers->sync_request != NULL && 4066 pers->sync_request == NULL) { 4067 /* need to remove the md_redundancy_group */ 4068 if (mddev->to_remove == NULL) 4069 mddev->to_remove = &md_redundancy_group; 4070 } 4071 4072 put_pers(oldpers); 4073 4074 rdev_for_each(rdev, mddev) { 4075 if (rdev->raid_disk < 0) 4076 continue; 4077 if (rdev->new_raid_disk >= mddev->raid_disks) 4078 rdev->new_raid_disk = -1; 4079 if (rdev->new_raid_disk == rdev->raid_disk) 4080 continue; 4081 sysfs_unlink_rdev(mddev, rdev); 4082 } 4083 rdev_for_each(rdev, mddev) { 4084 if (rdev->raid_disk < 0) 4085 continue; 4086 if (rdev->new_raid_disk == rdev->raid_disk) 4087 continue; 4088 rdev->raid_disk = rdev->new_raid_disk; 4089 if (rdev->raid_disk < 0) 4090 clear_bit(In_sync, &rdev->flags); 4091 else { 4092 if (sysfs_link_rdev(mddev, rdev)) 4093 pr_warn("md: cannot register rd%d for %s after level change\n", 4094 rdev->raid_disk, mdname(mddev)); 4095 } 4096 } 4097 4098 if (pers->sync_request == NULL) { 4099 /* this is now an array without redundancy, so 4100 * it must always be in_sync 4101 */ 4102 mddev->in_sync = 1; 4103 timer_delete_sync(&mddev->safemode_timer); 4104 } 4105 pers->run(mddev); 4106 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 4107 if (!mddev->thread) 4108 md_update_sb(mddev, 1); 4109 sysfs_notify_dirent_safe(mddev->sysfs_level); 4110 md_new_event(); 4111 rv = len; 4112 out_unlock: 4113 mddev_unlock_and_resume(mddev); 4114 return rv; 4115 } 4116 4117 static struct md_sysfs_entry md_level = 4118 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 4119 4120 static ssize_t 4121 new_level_show(struct mddev *mddev, char *page) 4122 { 4123 return sprintf(page, "%d\n", mddev->new_level); 4124 } 4125 4126 static ssize_t 4127 new_level_store(struct mddev *mddev, const char *buf, size_t len) 4128 { 4129 unsigned int n; 4130 int err; 4131 4132 err = kstrtouint(buf, 10, &n); 4133 if (err < 0) 4134 return err; 4135 err = mddev_lock(mddev); 4136 if (err) 4137 return err; 4138 4139 mddev->new_level = n; 4140 md_update_sb(mddev, 1); 4141 4142 mddev_unlock(mddev); 4143 return len; 4144 } 4145 static struct md_sysfs_entry md_new_level = 4146 __ATTR(new_level, 0664, new_level_show, new_level_store); 4147 4148 static ssize_t 4149 layout_show(struct mddev *mddev, char *page) 4150 { 4151 /* just a number, not meaningful for all levels */ 4152 if (mddev->reshape_position != MaxSector && 4153 mddev->layout != mddev->new_layout) 4154 return sprintf(page, "%d (%d)\n", 4155 mddev->new_layout, mddev->layout); 4156 return sprintf(page, "%d\n", mddev->layout); 4157 } 4158 4159 static ssize_t 4160 layout_store(struct mddev *mddev, const char *buf, size_t len) 4161 { 4162 unsigned int n; 4163 int err; 4164 4165 err = kstrtouint(buf, 10, &n); 4166 if (err < 0) 4167 return err; 4168 err = mddev_lock(mddev); 4169 if (err) 4170 return err; 4171 4172 if (mddev->pers) { 4173 if 
(mddev->pers->check_reshape == NULL) 4174 err = -EBUSY; 4175 else if (!md_is_rdwr(mddev)) 4176 err = -EROFS; 4177 else { 4178 mddev->new_layout = n; 4179 err = mddev->pers->check_reshape(mddev); 4180 if (err) 4181 mddev->new_layout = mddev->layout; 4182 } 4183 } else { 4184 mddev->new_layout = n; 4185 if (mddev->reshape_position == MaxSector) 4186 mddev->layout = n; 4187 } 4188 mddev_unlock(mddev); 4189 return err ?: len; 4190 } 4191 static struct md_sysfs_entry md_layout = 4192 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 4193 4194 static ssize_t 4195 raid_disks_show(struct mddev *mddev, char *page) 4196 { 4197 if (mddev->raid_disks == 0) 4198 return 0; 4199 if (mddev->reshape_position != MaxSector && 4200 mddev->delta_disks != 0) 4201 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 4202 mddev->raid_disks - mddev->delta_disks); 4203 return sprintf(page, "%d\n", mddev->raid_disks); 4204 } 4205 4206 static int update_raid_disks(struct mddev *mddev, int raid_disks); 4207 4208 static ssize_t 4209 raid_disks_store(struct mddev *mddev, const char *buf, size_t len) 4210 { 4211 unsigned int n; 4212 int err; 4213 4214 err = kstrtouint(buf, 10, &n); 4215 if (err < 0) 4216 return err; 4217 4218 err = mddev_lock(mddev); 4219 if (err) 4220 return err; 4221 if (mddev->pers) 4222 err = update_raid_disks(mddev, n); 4223 else if (mddev->reshape_position != MaxSector) { 4224 struct md_rdev *rdev; 4225 int olddisks = mddev->raid_disks - mddev->delta_disks; 4226 4227 err = -EINVAL; 4228 rdev_for_each(rdev, mddev) { 4229 if (olddisks < n && 4230 rdev->data_offset < rdev->new_data_offset) 4231 goto out_unlock; 4232 if (olddisks > n && 4233 rdev->data_offset > rdev->new_data_offset) 4234 goto out_unlock; 4235 } 4236 err = 0; 4237 mddev->delta_disks = n - olddisks; 4238 mddev->raid_disks = n; 4239 mddev->reshape_backwards = (mddev->delta_disks < 0); 4240 } else 4241 mddev->raid_disks = n; 4242 out_unlock: 4243 mddev_unlock(mddev); 4244 return err ? 
err : len; 4245 } 4246 static struct md_sysfs_entry md_raid_disks = 4247 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 4248 4249 static ssize_t 4250 uuid_show(struct mddev *mddev, char *page) 4251 { 4252 return sprintf(page, "%pU\n", mddev->uuid); 4253 } 4254 static struct md_sysfs_entry md_uuid = 4255 __ATTR(uuid, S_IRUGO, uuid_show, NULL); 4256 4257 static ssize_t 4258 chunk_size_show(struct mddev *mddev, char *page) 4259 { 4260 if (mddev->reshape_position != MaxSector && 4261 mddev->chunk_sectors != mddev->new_chunk_sectors) 4262 return sprintf(page, "%d (%d)\n", 4263 mddev->new_chunk_sectors << 9, 4264 mddev->chunk_sectors << 9); 4265 return sprintf(page, "%d\n", mddev->chunk_sectors << 9); 4266 } 4267 4268 static ssize_t 4269 chunk_size_store(struct mddev *mddev, const char *buf, size_t len) 4270 { 4271 unsigned long n; 4272 int err; 4273 4274 err = kstrtoul(buf, 10, &n); 4275 if (err < 0) 4276 return err; 4277 4278 err = mddev_lock(mddev); 4279 if (err) 4280 return err; 4281 if (mddev->pers) { 4282 if (mddev->pers->check_reshape == NULL) 4283 err = -EBUSY; 4284 else if (!md_is_rdwr(mddev)) 4285 err = -EROFS; 4286 else { 4287 mddev->new_chunk_sectors = n >> 9; 4288 err = mddev->pers->check_reshape(mddev); 4289 if (err) 4290 mddev->new_chunk_sectors = mddev->chunk_sectors; 4291 } 4292 } else { 4293 mddev->new_chunk_sectors = n >> 9; 4294 if (mddev->reshape_position == MaxSector) 4295 mddev->chunk_sectors = n >> 9; 4296 } 4297 mddev_unlock(mddev); 4298 return err ?: len; 4299 } 4300 static struct md_sysfs_entry md_chunk_size = 4301 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 4302 4303 static ssize_t 4304 resync_start_show(struct mddev *mddev, char *page) 4305 { 4306 if (mddev->resync_offset == MaxSector) 4307 return sprintf(page, "none\n"); 4308 return sprintf(page, "%llu\n", (unsigned long long)mddev->resync_offset); 4309 } 4310 4311 static ssize_t 4312 resync_start_store(struct mddev *mddev, const char *buf, size_t len) 4313 { 4314 unsigned long long n; 4315 int err; 4316 4317 if (cmd_match(buf, "none")) 4318 n = MaxSector; 4319 else { 4320 err = kstrtoull(buf, 10, &n); 4321 if (err < 0) 4322 return err; 4323 if (n != (sector_t)n) 4324 return -EINVAL; 4325 } 4326 4327 err = mddev_lock(mddev); 4328 if (err) 4329 return err; 4330 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 4331 err = -EBUSY; 4332 4333 if (!err) { 4334 mddev->resync_offset = n; 4335 if (mddev->pers) 4336 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 4337 } 4338 mddev_unlock(mddev); 4339 return err ?: len; 4340 } 4341 static struct md_sysfs_entry md_resync_start = 4342 __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, 4343 resync_start_show, resync_start_store); 4344 4345 /* 4346 * The array state can be: 4347 * 4348 * clear 4349 * No devices, no size, no level 4350 * Equivalent to STOP_ARRAY ioctl 4351 * inactive 4352 * May have some settings, but array is not active 4353 * all IO results in error 4354 * When written, doesn't tear down array, but just stops it 4355 * suspended (not supported yet) 4356 * All IO requests will block. The array can be reconfigured. 4357 * Writing this, if accepted, will block until array is quiescent 4358 * readonly 4359 * no resync can happen. no superblocks get written. 4360 * write requests fail 4361 * read-auto 4362 * like readonly, but behaves like 'clean' on a write request. 4363 * 4364 * clean - no pending writes, but otherwise active. 
4365 * When written to inactive array, starts without resync 4366 * If a write request arrives then 4367 * if metadata is known, mark 'dirty' and switch to 'active'. 4368 * if not known, block and switch to write-pending 4369 * If written to an active array that has pending writes, then fails. 4370 * active 4371 * fully active: IO and resync can be happening. 4372 * When written to inactive array, starts with resync 4373 * 4374 * write-pending 4375 * clean, but writes are blocked waiting for 'active' to be written. 4376 * 4377 * active-idle 4378 * like active, but no writes have been seen for a while (100msec). 4379 * 4380 * broken 4381 * Array is failed. It's useful because mounted-arrays aren't stopped 4382 * when array is failed, so this state will at least alert the user that 4383 * something is wrong. 4384 */ 4385 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 4386 write_pending, active_idle, broken, bad_word}; 4387 static char *array_states[] = { 4388 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 4389 "write-pending", "active-idle", "broken", NULL }; 4390 4391 static int match_word(const char *word, char **list) 4392 { 4393 int n; 4394 for (n=0; list[n]; n++) 4395 if (cmd_match(word, list[n])) 4396 break; 4397 return n; 4398 } 4399 4400 static ssize_t 4401 array_state_show(struct mddev *mddev, char *page) 4402 { 4403 enum array_state st = inactive; 4404 4405 if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) { 4406 switch(mddev->ro) { 4407 case MD_RDONLY: 4408 st = readonly; 4409 break; 4410 case MD_AUTO_READ: 4411 st = read_auto; 4412 break; 4413 case MD_RDWR: 4414 spin_lock(&mddev->lock); 4415 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 4416 st = write_pending; 4417 else if (mddev->in_sync) 4418 st = clean; 4419 else if (mddev->safemode) 4420 st = active_idle; 4421 else 4422 st = active; 4423 spin_unlock(&mddev->lock); 4424 } 4425 4426 if (test_bit(MD_BROKEN, &mddev->flags) && st == clean) 4427 st = broken; 4428 } else { 4429 if (list_empty(&mddev->disks) && 4430 mddev->raid_disks == 0 && 4431 mddev->dev_sectors == 0) 4432 st = clear; 4433 else 4434 st = inactive; 4435 } 4436 return sprintf(page, "%s\n", array_states[st]); 4437 } 4438 4439 static int do_md_stop(struct mddev *mddev, int ro); 4440 static int md_set_readonly(struct mddev *mddev); 4441 static int restart_array(struct mddev *mddev); 4442 4443 static ssize_t 4444 array_state_store(struct mddev *mddev, const char *buf, size_t len) 4445 { 4446 int err = 0; 4447 enum array_state st = match_word(buf, array_states); 4448 4449 /* No lock dependent actions */ 4450 switch (st) { 4451 case suspended: /* not supported yet */ 4452 case write_pending: /* cannot be set */ 4453 case active_idle: /* cannot be set */ 4454 case broken: /* cannot be set */ 4455 case bad_word: 4456 return -EINVAL; 4457 case clear: 4458 case readonly: 4459 case inactive: 4460 case read_auto: 4461 if (!mddev->pers || !md_is_rdwr(mddev)) 4462 break; 4463 /* write sysfs will not open mddev and opener should be 0 */ 4464 err = mddev_set_closing_and_sync_blockdev(mddev, 0); 4465 if (err) 4466 return err; 4467 break; 4468 default: 4469 break; 4470 } 4471 4472 if (mddev->pers && (st == active || st == clean) && 4473 mddev->ro != MD_RDONLY) { 4474 /* don't take reconfig_mutex when toggling between 4475 * clean and active 4476 */ 4477 spin_lock(&mddev->lock); 4478 if (st == active) { 4479 restart_array(mddev); 4480 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4481 
md_wakeup_thread(mddev->thread); 4482 wake_up(&mddev->sb_wait); 4483 } else /* st == clean */ { 4484 restart_array(mddev); 4485 if (!set_in_sync(mddev)) 4486 err = -EBUSY; 4487 } 4488 if (!err) 4489 sysfs_notify_dirent_safe(mddev->sysfs_state); 4490 spin_unlock(&mddev->lock); 4491 return err ?: len; 4492 } 4493 err = mddev_lock(mddev); 4494 if (err) 4495 return err; 4496 4497 switch (st) { 4498 case inactive: 4499 /* stop an active array, return 0 otherwise */ 4500 if (mddev->pers) 4501 err = do_md_stop(mddev, 2); 4502 break; 4503 case clear: 4504 err = do_md_stop(mddev, 0); 4505 break; 4506 case readonly: 4507 if (mddev->pers) 4508 err = md_set_readonly(mddev); 4509 else { 4510 mddev->ro = MD_RDONLY; 4511 set_disk_ro(mddev->gendisk, 1); 4512 err = do_md_run(mddev); 4513 } 4514 break; 4515 case read_auto: 4516 if (mddev->pers) { 4517 if (md_is_rdwr(mddev)) 4518 err = md_set_readonly(mddev); 4519 else if (mddev->ro == MD_RDONLY) 4520 err = restart_array(mddev); 4521 if (err == 0) { 4522 mddev->ro = MD_AUTO_READ; 4523 set_disk_ro(mddev->gendisk, 0); 4524 } 4525 } else { 4526 mddev->ro = MD_AUTO_READ; 4527 err = do_md_run(mddev); 4528 } 4529 break; 4530 case clean: 4531 if (mddev->pers) { 4532 err = restart_array(mddev); 4533 if (err) 4534 break; 4535 spin_lock(&mddev->lock); 4536 if (!set_in_sync(mddev)) 4537 err = -EBUSY; 4538 spin_unlock(&mddev->lock); 4539 } else 4540 err = -EINVAL; 4541 break; 4542 case active: 4543 if (mddev->pers) { 4544 err = restart_array(mddev); 4545 if (err) 4546 break; 4547 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4548 wake_up(&mddev->sb_wait); 4549 err = 0; 4550 } else { 4551 mddev->ro = MD_RDWR; 4552 set_disk_ro(mddev->gendisk, 0); 4553 err = do_md_run(mddev); 4554 } 4555 break; 4556 default: 4557 err = -EINVAL; 4558 break; 4559 } 4560 4561 if (!err) { 4562 if (mddev->hold_active == UNTIL_IOCTL) 4563 mddev->hold_active = 0; 4564 sysfs_notify_dirent_safe(mddev->sysfs_state); 4565 } 4566 mddev_unlock(mddev); 4567 4568 if (st == readonly || st == read_auto || st == inactive || 4569 (err && st == clear)) 4570 clear_bit(MD_CLOSING, &mddev->flags); 4571 4572 return err ?: len; 4573 } 4574 static struct md_sysfs_entry md_array_state = 4575 __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 4576 4577 static ssize_t 4578 max_corrected_read_errors_show(struct mddev *mddev, char *page) { 4579 return sprintf(page, "%d\n", 4580 atomic_read(&mddev->max_corr_read_errors)); 4581 } 4582 4583 static ssize_t 4584 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) 4585 { 4586 unsigned int n; 4587 int rv; 4588 4589 rv = kstrtouint(buf, 10, &n); 4590 if (rv < 0) 4591 return rv; 4592 if (n > INT_MAX) 4593 return -EINVAL; 4594 atomic_set(&mddev->max_corr_read_errors, n); 4595 return len; 4596 } 4597 4598 static struct md_sysfs_entry max_corr_read_errors = 4599 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, 4600 max_corrected_read_errors_store); 4601 4602 static ssize_t 4603 null_show(struct mddev *mddev, char *page) 4604 { 4605 return -EINVAL; 4606 } 4607 4608 static ssize_t 4609 new_dev_store(struct mddev *mddev, const char *buf, size_t len) 4610 { 4611 /* buf must be %d:%d\n? giving major and minor numbers */ 4612 /* The new device is added to the array. 4613 * If the array has a persistent superblock, we read the 4614 * superblock to initialise info and check validity. 4615 * Otherwise, only checking done is that in bind_rdev_to_array, 4616 * which mainly checks size. 
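 *
 * For example, assuming /dev/sdb is block device 8:16, it could be added
 * with "echo 8:16 > /sys/block/mdX/md/new_dev" (mdX being a placeholder
 * array name).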
4617 */
4618 char *e;
4619 int major = simple_strtoul(buf, &e, 10);
4620 int minor;
4621 dev_t dev;
4622 struct md_rdev *rdev;
4623 int err;
4624
4625 if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
4626 return -EINVAL;
4627 minor = simple_strtoul(e+1, &e, 10);
4628 if (*e && *e != '\n')
4629 return -EINVAL;
4630 dev = MKDEV(major, minor);
4631 if (major != MAJOR(dev) ||
4632 minor != MINOR(dev))
4633 return -EOVERFLOW;
4634
4635 err = mddev_suspend_and_lock(mddev);
4636 if (err)
4637 return err;
4638 if (mddev->persistent) {
4639 rdev = md_import_device(dev, mddev->major_version,
4640 mddev->minor_version);
4641 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
4642 struct md_rdev *rdev0
4643 = list_entry(mddev->disks.next,
4644 struct md_rdev, same_set);
4645 err = super_types[mddev->major_version]
4646 .load_super(rdev, rdev0, mddev->minor_version);
4647 if (err < 0)
4648 goto out;
4649 }
4650 } else if (mddev->external)
4651 rdev = md_import_device(dev, -2, -1);
4652 else
4653 rdev = md_import_device(dev, -1, -1);
4654
4655 if (IS_ERR(rdev)) {
4656 mddev_unlock_and_resume(mddev);
4657 return PTR_ERR(rdev);
4658 }
4659 err = bind_rdev_to_array(rdev, mddev);
4660 out:
4661 if (err)
4662 export_rdev(rdev, mddev);
4663 mddev_unlock_and_resume(mddev);
4664 if (!err)
4665 md_new_event();
4666 return err ? err : len;
4667 }
4668
4669 static struct md_sysfs_entry md_new_device =
4670 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
4671
4672 static ssize_t
4673 bitmap_store(struct mddev *mddev, const char *buf, size_t len)
4674 {
4675 char *end;
4676 unsigned long chunk, end_chunk;
4677 int err;
4678
4679 err = mddev_lock(mddev);
4680 if (err)
4681 return err;
4682 if (!mddev->bitmap)
4683 goto out;
4684 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
4685 while (*buf) {
4686 chunk = end_chunk = simple_strtoul(buf, &end, 0);
4687 if (buf == end)
4688 break;
4689
4690 if (*end == '-') { /* range */
4691 buf = end + 1;
4692 end_chunk = simple_strtoul(buf, &end, 0);
4693 if (buf == end)
4694 break;
4695 }
4696
4697 if (*end && !isspace(*end))
4698 break;
4699
4700 mddev->bitmap_ops->dirty_bits(mddev, chunk, end_chunk);
4701 buf = skip_spaces(end);
4702 }
4703 mddev->bitmap_ops->unplug(mddev, true); /* flush the bits to disk */
4704 out:
4705 mddev_unlock(mddev);
4706 return len;
4707 }
4708
4709 static struct md_sysfs_entry md_bitmap =
4710 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
4711
4712 static ssize_t
4713 size_show(struct mddev *mddev, char *page)
4714 {
4715 return sprintf(page, "%llu\n",
4716 (unsigned long long)mddev->dev_sectors / 2);
4717 }
4718
4719 static int update_size(struct mddev *mddev, sector_t num_sectors);
4720
4721 static ssize_t
4722 size_store(struct mddev *mddev, const char *buf, size_t len)
4723 {
4724 /* If array is inactive, we can reduce the component size, but
4725 * not increase it (except from 0).
4726 * If array is active, we can try an on-line resize
4727 */
4728 sector_t sectors;
4729 int err = strict_blocks_to_sectors(buf, &sectors);
4730
4731 if (err < 0)
4732 return err;
4733 err = mddev_lock(mddev);
4734 if (err)
4735 return err;
4736 if (mddev->pers) {
4737 err = update_size(mddev, sectors);
4738 if (err == 0)
4739 md_update_sb(mddev, 1);
4740 } else {
4741 if (mddev->dev_sectors == 0 ||
4742 mddev->dev_sectors > sectors)
4743 mddev->dev_sectors = sectors;
4744 else
4745 err = -ENOSPC;
4746 }
4747 mddev_unlock(mddev);
4748 return err ?
err : len; 4749 } 4750 4751 static struct md_sysfs_entry md_size = 4752 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 4753 4754 /* Metadata version. 4755 * This is one of 4756 * 'none' for arrays with no metadata (good luck...) 4757 * 'external' for arrays with externally managed metadata, 4758 * or N.M for internally known formats 4759 */ 4760 static ssize_t 4761 metadata_show(struct mddev *mddev, char *page) 4762 { 4763 if (mddev->persistent) 4764 return sprintf(page, "%d.%d\n", 4765 mddev->major_version, mddev->minor_version); 4766 else if (mddev->external) 4767 return sprintf(page, "external:%s\n", mddev->metadata_type); 4768 else 4769 return sprintf(page, "none\n"); 4770 } 4771 4772 static ssize_t 4773 metadata_store(struct mddev *mddev, const char *buf, size_t len) 4774 { 4775 int major, minor; 4776 char *e; 4777 int err; 4778 /* Changing the details of 'external' metadata is 4779 * always permitted. Otherwise there must be 4780 * no devices attached to the array. 4781 */ 4782 4783 err = mddev_lock(mddev); 4784 if (err) 4785 return err; 4786 err = -EBUSY; 4787 if (mddev->external && strncmp(buf, "external:", 9) == 0) 4788 ; 4789 else if (!list_empty(&mddev->disks)) 4790 goto out_unlock; 4791 4792 err = 0; 4793 if (cmd_match(buf, "none")) { 4794 mddev->persistent = 0; 4795 mddev->external = 0; 4796 mddev->major_version = 0; 4797 mddev->minor_version = 90; 4798 goto out_unlock; 4799 } 4800 if (strncmp(buf, "external:", 9) == 0) { 4801 size_t namelen = len-9; 4802 if (namelen >= sizeof(mddev->metadata_type)) 4803 namelen = sizeof(mddev->metadata_type)-1; 4804 memcpy(mddev->metadata_type, buf+9, namelen); 4805 mddev->metadata_type[namelen] = 0; 4806 if (namelen && mddev->metadata_type[namelen-1] == '\n') 4807 mddev->metadata_type[--namelen] = 0; 4808 mddev->persistent = 0; 4809 mddev->external = 1; 4810 mddev->major_version = 0; 4811 mddev->minor_version = 90; 4812 goto out_unlock; 4813 } 4814 major = simple_strtoul(buf, &e, 10); 4815 err = -EINVAL; 4816 if (e==buf || *e != '.') 4817 goto out_unlock; 4818 buf = e+1; 4819 minor = simple_strtoul(buf, &e, 10); 4820 if (e==buf || (*e && *e != '\n') ) 4821 goto out_unlock; 4822 err = -ENOENT; 4823 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 4824 goto out_unlock; 4825 mddev->major_version = major; 4826 mddev->minor_version = minor; 4827 mddev->persistent = 1; 4828 mddev->external = 0; 4829 err = 0; 4830 out_unlock: 4831 mddev_unlock(mddev); 4832 return err ?: len; 4833 } 4834 4835 static struct md_sysfs_entry md_metadata = 4836 __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 4837 4838 enum sync_action md_sync_action(struct mddev *mddev) 4839 { 4840 unsigned long recovery = mddev->recovery; 4841 4842 /* 4843 * frozen has the highest priority, means running sync_thread will be 4844 * stopped immediately, and no new sync_thread can start. 4845 */ 4846 if (test_bit(MD_RECOVERY_FROZEN, &recovery)) 4847 return ACTION_FROZEN; 4848 4849 /* 4850 * read-only array can't register sync_thread, and it can only 4851 * add/remove spares. 4852 */ 4853 if (!md_is_rdwr(mddev)) 4854 return ACTION_IDLE; 4855 4856 /* 4857 * idle means no sync_thread is running, and no new sync_thread is 4858 * requested. 
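 *
 * The action computed here is also what action_show() reports through the
 * "sync_action" sysfs attribute.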
4859 */ 4860 if (!test_bit(MD_RECOVERY_RUNNING, &recovery) && 4861 !test_bit(MD_RECOVERY_NEEDED, &recovery)) 4862 return ACTION_IDLE; 4863 4864 if (test_bit(MD_RECOVERY_RESHAPE, &recovery) || 4865 mddev->reshape_position != MaxSector) 4866 return ACTION_RESHAPE; 4867 4868 if (test_bit(MD_RECOVERY_RECOVER, &recovery)) 4869 return ACTION_RECOVER; 4870 4871 if (test_bit(MD_RECOVERY_SYNC, &recovery)) { 4872 /* 4873 * MD_RECOVERY_CHECK must be paired with 4874 * MD_RECOVERY_REQUESTED. 4875 */ 4876 if (test_bit(MD_RECOVERY_CHECK, &recovery)) 4877 return ACTION_CHECK; 4878 if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) 4879 return ACTION_REPAIR; 4880 return ACTION_RESYNC; 4881 } 4882 4883 /* 4884 * MD_RECOVERY_NEEDED or MD_RECOVERY_RUNNING is set, however, no 4885 * sync_action is specified. 4886 */ 4887 return ACTION_IDLE; 4888 } 4889 4890 enum sync_action md_sync_action_by_name(const char *page) 4891 { 4892 enum sync_action action; 4893 4894 for (action = 0; action < NR_SYNC_ACTIONS; ++action) { 4895 if (cmd_match(page, action_name[action])) 4896 return action; 4897 } 4898 4899 return NR_SYNC_ACTIONS; 4900 } 4901 4902 const char *md_sync_action_name(enum sync_action action) 4903 { 4904 return action_name[action]; 4905 } 4906 4907 static ssize_t 4908 action_show(struct mddev *mddev, char *page) 4909 { 4910 enum sync_action action = md_sync_action(mddev); 4911 4912 return sprintf(page, "%s\n", md_sync_action_name(action)); 4913 } 4914 4915 /** 4916 * stop_sync_thread() - wait for sync_thread to stop if it's running. 4917 * @mddev: the array. 4918 * @locked: if set, reconfig_mutex will still be held after this function 4919 * return; if not set, reconfig_mutex will be released after this 4920 * function return. 4921 */ 4922 static void stop_sync_thread(struct mddev *mddev, bool locked) 4923 { 4924 int sync_seq = atomic_read(&mddev->sync_seq); 4925 4926 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 4927 if (!locked) 4928 mddev_unlock(mddev); 4929 return; 4930 } 4931 4932 mddev_unlock(mddev); 4933 4934 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4935 /* 4936 * Thread might be blocked waiting for metadata update which will now 4937 * never happen 4938 */ 4939 md_wakeup_thread_directly(mddev->sync_thread); 4940 if (work_pending(&mddev->sync_work)) 4941 flush_work(&mddev->sync_work); 4942 4943 wait_event(resync_wait, 4944 !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 4945 (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery) && 4946 sync_seq != atomic_read(&mddev->sync_seq))); 4947 4948 if (locked) 4949 mddev_lock_nointr(mddev); 4950 } 4951 4952 void md_idle_sync_thread(struct mddev *mddev) 4953 { 4954 lockdep_assert_held(&mddev->reconfig_mutex); 4955 4956 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4957 stop_sync_thread(mddev, true); 4958 } 4959 EXPORT_SYMBOL_GPL(md_idle_sync_thread); 4960 4961 void md_frozen_sync_thread(struct mddev *mddev) 4962 { 4963 lockdep_assert_held(&mddev->reconfig_mutex); 4964 4965 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4966 stop_sync_thread(mddev, true); 4967 } 4968 EXPORT_SYMBOL_GPL(md_frozen_sync_thread); 4969 4970 void md_unfrozen_sync_thread(struct mddev *mddev) 4971 { 4972 lockdep_assert_held(&mddev->reconfig_mutex); 4973 4974 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4975 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4976 md_wakeup_thread(mddev->thread); 4977 sysfs_notify_dirent_safe(mddev->sysfs_action); 4978 } 4979 EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread); 4980 4981 static int mddev_start_reshape(struct mddev *mddev) 4982 { 4983 
int ret; 4984 4985 if (mddev->pers->start_reshape == NULL) 4986 return -EINVAL; 4987 4988 if (mddev->reshape_position == MaxSector || 4989 mddev->pers->check_reshape == NULL || 4990 mddev->pers->check_reshape(mddev)) { 4991 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4992 ret = mddev->pers->start_reshape(mddev); 4993 if (ret) 4994 return ret; 4995 } else { 4996 /* 4997 * If reshape is still in progress, and md_check_recovery() can 4998 * continue to reshape, don't restart reshape because data can 4999 * be corrupted for raid456. 5000 */ 5001 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5002 } 5003 5004 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 5005 return 0; 5006 } 5007 5008 static ssize_t 5009 action_store(struct mddev *mddev, const char *page, size_t len) 5010 { 5011 int ret; 5012 enum sync_action action; 5013 5014 if (!mddev->pers || !mddev->pers->sync_request) 5015 return -EINVAL; 5016 5017 retry: 5018 if (work_busy(&mddev->sync_work)) 5019 flush_work(&mddev->sync_work); 5020 5021 ret = mddev_lock(mddev); 5022 if (ret) 5023 return ret; 5024 5025 if (work_busy(&mddev->sync_work)) { 5026 mddev_unlock(mddev); 5027 goto retry; 5028 } 5029 5030 action = md_sync_action_by_name(page); 5031 5032 /* TODO: mdadm rely on "idle" to start sync_thread. */ 5033 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 5034 switch (action) { 5035 case ACTION_FROZEN: 5036 md_frozen_sync_thread(mddev); 5037 ret = len; 5038 goto out; 5039 case ACTION_IDLE: 5040 md_idle_sync_thread(mddev); 5041 break; 5042 case ACTION_RESHAPE: 5043 case ACTION_RECOVER: 5044 case ACTION_CHECK: 5045 case ACTION_REPAIR: 5046 case ACTION_RESYNC: 5047 ret = -EBUSY; 5048 goto out; 5049 default: 5050 ret = -EINVAL; 5051 goto out; 5052 } 5053 } else { 5054 switch (action) { 5055 case ACTION_FROZEN: 5056 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5057 ret = len; 5058 goto out; 5059 case ACTION_RESHAPE: 5060 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5061 ret = mddev_start_reshape(mddev); 5062 if (ret) 5063 goto out; 5064 break; 5065 case ACTION_RECOVER: 5066 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5067 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 5068 break; 5069 case ACTION_CHECK: 5070 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5071 fallthrough; 5072 case ACTION_REPAIR: 5073 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 5074 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5075 fallthrough; 5076 case ACTION_RESYNC: 5077 case ACTION_IDLE: 5078 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5079 break; 5080 default: 5081 ret = -EINVAL; 5082 goto out; 5083 } 5084 } 5085 5086 if (mddev->ro == MD_AUTO_READ) { 5087 /* A write to sync_action is enough to justify 5088 * canceling read-auto mode 5089 */ 5090 mddev->ro = MD_RDWR; 5091 md_wakeup_thread(mddev->sync_thread); 5092 } 5093 5094 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5095 md_wakeup_thread(mddev->thread); 5096 sysfs_notify_dirent_safe(mddev->sysfs_action); 5097 ret = len; 5098 5099 out: 5100 mddev_unlock(mddev); 5101 return ret; 5102 } 5103 5104 static struct md_sysfs_entry md_scan_mode = 5105 __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 5106 5107 static ssize_t 5108 last_sync_action_show(struct mddev *mddev, char *page) 5109 { 5110 return sprintf(page, "%s\n", 5111 md_sync_action_name(mddev->last_sync_action)); 5112 } 5113 5114 static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); 5115 5116 static ssize_t 5117 mismatch_cnt_show(struct mddev *mddev, char *page) 5118 { 5119 return 
sprintf(page, "%llu\n", 5120 (unsigned long long) 5121 atomic64_read(&mddev->resync_mismatches)); 5122 } 5123 5124 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 5125 5126 static ssize_t 5127 sync_min_show(struct mddev *mddev, char *page) 5128 { 5129 return sprintf(page, "%d (%s)\n", speed_min(mddev), 5130 mddev->sync_speed_min ? "local" : "system"); 5131 } 5132 5133 static ssize_t 5134 sync_min_store(struct mddev *mddev, const char *buf, size_t len) 5135 { 5136 unsigned int min; 5137 int rv; 5138 5139 if (strncmp(buf, "system", 6) == 0) { 5140 min = 0; 5141 } else { 5142 rv = kstrtouint(buf, 10, &min); 5143 if (rv < 0) 5144 return rv; 5145 if (min == 0) 5146 return -EINVAL; 5147 } 5148 mddev->sync_speed_min = min; 5149 return len; 5150 } 5151 5152 static struct md_sysfs_entry md_sync_min = 5153 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 5154 5155 static ssize_t 5156 sync_max_show(struct mddev *mddev, char *page) 5157 { 5158 return sprintf(page, "%d (%s)\n", speed_max(mddev), 5159 mddev->sync_speed_max ? "local" : "system"); 5160 } 5161 5162 static ssize_t 5163 sync_max_store(struct mddev *mddev, const char *buf, size_t len) 5164 { 5165 unsigned int max; 5166 int rv; 5167 5168 if (strncmp(buf, "system", 6) == 0) { 5169 max = 0; 5170 } else { 5171 rv = kstrtouint(buf, 10, &max); 5172 if (rv < 0) 5173 return rv; 5174 if (max == 0) 5175 return -EINVAL; 5176 } 5177 mddev->sync_speed_max = max; 5178 return len; 5179 } 5180 5181 static struct md_sysfs_entry md_sync_max = 5182 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 5183 5184 static ssize_t 5185 sync_io_depth_show(struct mddev *mddev, char *page) 5186 { 5187 return sprintf(page, "%d (%s)\n", sync_io_depth(mddev), 5188 mddev->sync_io_depth ? 
"local" : "system"); 5189 } 5190 5191 static ssize_t 5192 sync_io_depth_store(struct mddev *mddev, const char *buf, size_t len) 5193 { 5194 unsigned int max; 5195 int rv; 5196 5197 if (strncmp(buf, "system", 6) == 0) { 5198 max = 0; 5199 } else { 5200 rv = kstrtouint(buf, 10, &max); 5201 if (rv < 0) 5202 return rv; 5203 if (max == 0) 5204 return -EINVAL; 5205 } 5206 mddev->sync_io_depth = max; 5207 return len; 5208 } 5209 5210 static struct md_sysfs_entry md_sync_io_depth = 5211 __ATTR_RW(sync_io_depth); 5212 5213 static ssize_t 5214 degraded_show(struct mddev *mddev, char *page) 5215 { 5216 return sprintf(page, "%d\n", mddev->degraded); 5217 } 5218 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 5219 5220 static ssize_t 5221 sync_force_parallel_show(struct mddev *mddev, char *page) 5222 { 5223 return sprintf(page, "%d\n", mddev->parallel_resync); 5224 } 5225 5226 static ssize_t 5227 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) 5228 { 5229 long n; 5230 5231 if (kstrtol(buf, 10, &n)) 5232 return -EINVAL; 5233 5234 if (n != 0 && n != 1) 5235 return -EINVAL; 5236 5237 mddev->parallel_resync = n; 5238 5239 if (mddev->sync_thread) 5240 wake_up(&resync_wait); 5241 5242 return len; 5243 } 5244 5245 /* force parallel resync, even with shared block devices */ 5246 static struct md_sysfs_entry md_sync_force_parallel = 5247 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 5248 sync_force_parallel_show, sync_force_parallel_store); 5249 5250 static ssize_t 5251 sync_speed_show(struct mddev *mddev, char *page) 5252 { 5253 unsigned long resync, dt, db; 5254 if (mddev->curr_resync == MD_RESYNC_NONE) 5255 return sprintf(page, "none\n"); 5256 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 5257 dt = (jiffies - mddev->resync_mark) / HZ; 5258 if (!dt) dt++; 5259 db = resync - mddev->resync_mark_cnt; 5260 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 5261 } 5262 5263 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 5264 5265 static ssize_t 5266 sync_completed_show(struct mddev *mddev, char *page) 5267 { 5268 unsigned long long max_sectors, resync; 5269 5270 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5271 return sprintf(page, "none\n"); 5272 5273 if (mddev->curr_resync == MD_RESYNC_YIELDED || 5274 mddev->curr_resync == MD_RESYNC_DELAYED) 5275 return sprintf(page, "delayed\n"); 5276 5277 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 5278 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5279 max_sectors = mddev->resync_max_sectors; 5280 else 5281 max_sectors = mddev->dev_sectors; 5282 5283 resync = mddev->curr_resync_completed; 5284 return sprintf(page, "%llu / %llu\n", resync, max_sectors); 5285 } 5286 5287 static struct md_sysfs_entry md_sync_completed = 5288 __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL); 5289 5290 static ssize_t 5291 min_sync_show(struct mddev *mddev, char *page) 5292 { 5293 return sprintf(page, "%llu\n", 5294 (unsigned long long)mddev->resync_min); 5295 } 5296 static ssize_t 5297 min_sync_store(struct mddev *mddev, const char *buf, size_t len) 5298 { 5299 unsigned long long min; 5300 int err; 5301 5302 if (kstrtoull(buf, 10, &min)) 5303 return -EINVAL; 5304 5305 spin_lock(&mddev->lock); 5306 err = -EINVAL; 5307 if (min > mddev->resync_max) 5308 goto out_unlock; 5309 5310 err = -EBUSY; 5311 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5312 goto out_unlock; 5313 5314 /* Round down to multiple of 4K for safety */ 5315 mddev->resync_min = round_down(min, 8); 
5316 err = 0; 5317 5318 out_unlock: 5319 spin_unlock(&mddev->lock); 5320 return err ?: len; 5321 } 5322 5323 static struct md_sysfs_entry md_min_sync = 5324 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 5325 5326 static ssize_t 5327 max_sync_show(struct mddev *mddev, char *page) 5328 { 5329 if (mddev->resync_max == MaxSector) 5330 return sprintf(page, "max\n"); 5331 else 5332 return sprintf(page, "%llu\n", 5333 (unsigned long long)mddev->resync_max); 5334 } 5335 static ssize_t 5336 max_sync_store(struct mddev *mddev, const char *buf, size_t len) 5337 { 5338 int err; 5339 spin_lock(&mddev->lock); 5340 if (strncmp(buf, "max", 3) == 0) 5341 mddev->resync_max = MaxSector; 5342 else { 5343 unsigned long long max; 5344 int chunk; 5345 5346 err = -EINVAL; 5347 if (kstrtoull(buf, 10, &max)) 5348 goto out_unlock; 5349 if (max < mddev->resync_min) 5350 goto out_unlock; 5351 5352 err = -EBUSY; 5353 if (max < mddev->resync_max && md_is_rdwr(mddev) && 5354 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5355 goto out_unlock; 5356 5357 /* Must be a multiple of chunk_size */ 5358 chunk = mddev->chunk_sectors; 5359 if (chunk) { 5360 sector_t temp = max; 5361 5362 err = -EINVAL; 5363 if (sector_div(temp, chunk)) 5364 goto out_unlock; 5365 } 5366 mddev->resync_max = max; 5367 } 5368 wake_up(&mddev->recovery_wait); 5369 err = 0; 5370 out_unlock: 5371 spin_unlock(&mddev->lock); 5372 return err ?: len; 5373 } 5374 5375 static struct md_sysfs_entry md_max_sync = 5376 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 5377 5378 static ssize_t 5379 suspend_lo_show(struct mddev *mddev, char *page) 5380 { 5381 return sprintf(page, "%llu\n", 5382 (unsigned long long)READ_ONCE(mddev->suspend_lo)); 5383 } 5384 5385 static ssize_t 5386 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) 5387 { 5388 unsigned long long new; 5389 int err; 5390 5391 err = kstrtoull(buf, 10, &new); 5392 if (err < 0) 5393 return err; 5394 if (new != (sector_t)new) 5395 return -EINVAL; 5396 5397 err = mddev_suspend(mddev, true); 5398 if (err) 5399 return err; 5400 5401 WRITE_ONCE(mddev->suspend_lo, new); 5402 mddev_resume(mddev); 5403 5404 return len; 5405 } 5406 static struct md_sysfs_entry md_suspend_lo = 5407 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 5408 5409 static ssize_t 5410 suspend_hi_show(struct mddev *mddev, char *page) 5411 { 5412 return sprintf(page, "%llu\n", 5413 (unsigned long long)READ_ONCE(mddev->suspend_hi)); 5414 } 5415 5416 static ssize_t 5417 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) 5418 { 5419 unsigned long long new; 5420 int err; 5421 5422 err = kstrtoull(buf, 10, &new); 5423 if (err < 0) 5424 return err; 5425 if (new != (sector_t)new) 5426 return -EINVAL; 5427 5428 err = mddev_suspend(mddev, true); 5429 if (err) 5430 return err; 5431 5432 WRITE_ONCE(mddev->suspend_hi, new); 5433 mddev_resume(mddev); 5434 5435 return len; 5436 } 5437 static struct md_sysfs_entry md_suspend_hi = 5438 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 5439 5440 static ssize_t 5441 reshape_position_show(struct mddev *mddev, char *page) 5442 { 5443 if (mddev->reshape_position != MaxSector) 5444 return sprintf(page, "%llu\n", 5445 (unsigned long long)mddev->reshape_position); 5446 strcpy(page, "none\n"); 5447 return 5; 5448 } 5449 5450 static ssize_t 5451 reshape_position_store(struct mddev *mddev, const char *buf, size_t len) 5452 { 5453 struct md_rdev *rdev; 5454 unsigned long long new; 5455 int err; 5456 5457 
err = kstrtoull(buf, 10, &new);
5458 if (err < 0)
5459 return err;
5460 if (new != (sector_t)new)
5461 return -EINVAL;
5462 err = mddev_lock(mddev);
5463 if (err)
5464 return err;
5465 err = -EBUSY;
5466 if (mddev->pers)
5467 goto unlock;
5468 mddev->reshape_position = new;
5469 mddev->delta_disks = 0;
5470 mddev->reshape_backwards = 0;
5471 mddev->new_level = mddev->level;
5472 mddev->new_layout = mddev->layout;
5473 mddev->new_chunk_sectors = mddev->chunk_sectors;
5474 rdev_for_each(rdev, mddev)
5475 rdev->new_data_offset = rdev->data_offset;
5476 err = 0;
5477 unlock:
5478 mddev_unlock(mddev);
5479 return err ?: len;
5480 }
5481
5482 static struct md_sysfs_entry md_reshape_position =
5483 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
5484 reshape_position_store);
5485
5486 static ssize_t
5487 reshape_direction_show(struct mddev *mddev, char *page)
5488 {
5489 return sprintf(page, "%s\n",
5490 mddev->reshape_backwards ? "backwards" : "forwards");
5491 }
5492
5493 static ssize_t
5494 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
5495 {
5496 int backwards = 0;
5497 int err;
5498
5499 if (cmd_match(buf, "forwards"))
5500 backwards = 0;
5501 else if (cmd_match(buf, "backwards"))
5502 backwards = 1;
5503 else
5504 return -EINVAL;
5505 if (mddev->reshape_backwards == backwards)
5506 return len;
5507
5508 err = mddev_lock(mddev);
5509 if (err)
5510 return err;
5511 /* check if we are allowed to change */
5512 if (mddev->delta_disks)
5513 err = -EBUSY;
5514 else if (mddev->persistent &&
5515 mddev->major_version == 0)
5516 err = -EINVAL;
5517 else
5518 mddev->reshape_backwards = backwards;
5519 mddev_unlock(mddev);
5520 return err ?: len;
5521 }
5522
5523 static struct md_sysfs_entry md_reshape_direction =
5524 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
5525 reshape_direction_store);
5526
5527 static ssize_t
5528 array_size_show(struct mddev *mddev, char *page)
5529 {
5530 if (mddev->external_size)
5531 return sprintf(page, "%llu\n",
5532 (unsigned long long)mddev->array_sectors/2);
5533 else
5534 return sprintf(page, "default\n");
5535 }
5536
5537 static ssize_t
5538 array_size_store(struct mddev *mddev, const char *buf, size_t len)
5539 {
5540 sector_t sectors;
5541 int err;
5542
5543 err = mddev_lock(mddev);
5544 if (err)
5545 return err;
5546
5547 /* cluster raid doesn't support change array_sectors */
5548 if (mddev_is_clustered(mddev)) {
5549 mddev_unlock(mddev);
5550 return -EINVAL;
5551 }
5552
5553 if (strncmp(buf, "default", 7) == 0) {
5554 if (mddev->pers)
5555 sectors = mddev->pers->size(mddev, 0, 0);
5556 else
5557 sectors = mddev->array_sectors;
5558
5559 mddev->external_size = 0;
5560 } else {
5561 if (strict_blocks_to_sectors(buf, &sectors) < 0)
5562 err = -EINVAL;
5563 else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
5564 err = -E2BIG;
5565 else
5566 mddev->external_size = 1;
5567 }
5568
5569 if (!err) {
5570 mddev->array_sectors = sectors;
5571 if (mddev->pers)
5572 set_capacity_and_notify(mddev->gendisk,
5573 mddev->array_sectors);
5574 }
5575 mddev_unlock(mddev);
5576 return err ?: len;
5577 }
5578
5579 static struct md_sysfs_entry md_array_size =
5580 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
5581 array_size_store);
5582
5583 static ssize_t
5584 consistency_policy_show(struct mddev *mddev, char *page)
5585 {
5586 int ret;
5587
5588 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
5589 ret = sprintf(page, "journal\n");
5590 } else if (test_bit(MD_HAS_PPL, &mddev->flags)) {
5591 ret = sprintf(page, "ppl\n");
sprintf(page, "ppl\n"); 5592 } else if (mddev->bitmap) { 5593 ret = sprintf(page, "bitmap\n"); 5594 } else if (mddev->pers) { 5595 if (mddev->pers->sync_request) 5596 ret = sprintf(page, "resync\n"); 5597 else 5598 ret = sprintf(page, "none\n"); 5599 } else { 5600 ret = sprintf(page, "unknown\n"); 5601 } 5602 5603 return ret; 5604 } 5605 5606 static ssize_t 5607 consistency_policy_store(struct mddev *mddev, const char *buf, size_t len) 5608 { 5609 int err = 0; 5610 5611 if (mddev->pers) { 5612 if (mddev->pers->change_consistency_policy) 5613 err = mddev->pers->change_consistency_policy(mddev, buf); 5614 else 5615 err = -EBUSY; 5616 } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) { 5617 set_bit(MD_HAS_PPL, &mddev->flags); 5618 } else { 5619 err = -EINVAL; 5620 } 5621 5622 return err ? err : len; 5623 } 5624 5625 static struct md_sysfs_entry md_consistency_policy = 5626 __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show, 5627 consistency_policy_store); 5628 5629 static ssize_t fail_last_dev_show(struct mddev *mddev, char *page) 5630 { 5631 return sprintf(page, "%d\n", mddev->fail_last_dev); 5632 } 5633 5634 /* 5635 * Setting fail_last_dev to true to allow last device to be forcibly removed 5636 * from RAID1/RAID10. 5637 */ 5638 static ssize_t 5639 fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len) 5640 { 5641 int ret; 5642 bool value; 5643 5644 ret = kstrtobool(buf, &value); 5645 if (ret) 5646 return ret; 5647 5648 if (value != mddev->fail_last_dev) 5649 mddev->fail_last_dev = value; 5650 5651 return len; 5652 } 5653 static struct md_sysfs_entry md_fail_last_dev = 5654 __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, 5655 fail_last_dev_store); 5656 5657 static ssize_t serialize_policy_show(struct mddev *mddev, char *page) 5658 { 5659 if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) 5660 return sprintf(page, "n/a\n"); 5661 else 5662 return sprintf(page, "%d\n", mddev->serialize_policy); 5663 } 5664 5665 /* 5666 * Setting serialize_policy to true to enforce write IO is not reordered 5667 * for raid1. 
5668 */ 5669 static ssize_t 5670 serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) 5671 { 5672 int err; 5673 bool value; 5674 5675 err = kstrtobool(buf, &value); 5676 if (err) 5677 return err; 5678 5679 if (value == mddev->serialize_policy) 5680 return len; 5681 5682 err = mddev_suspend_and_lock(mddev); 5683 if (err) 5684 return err; 5685 if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) { 5686 pr_err("md: serialize_policy is only effective for raid1\n"); 5687 err = -EINVAL; 5688 goto unlock; 5689 } 5690 5691 if (value) 5692 mddev_create_serial_pool(mddev, NULL); 5693 else 5694 mddev_destroy_serial_pool(mddev, NULL); 5695 mddev->serialize_policy = value; 5696 unlock: 5697 mddev_unlock_and_resume(mddev); 5698 return err ?: len; 5699 } 5700 5701 static struct md_sysfs_entry md_serialize_policy = 5702 __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, 5703 serialize_policy_store); 5704 5705 5706 static struct attribute *md_default_attrs[] = { 5707 &md_level.attr, 5708 &md_new_level.attr, 5709 &md_layout.attr, 5710 &md_raid_disks.attr, 5711 &md_uuid.attr, 5712 &md_chunk_size.attr, 5713 &md_size.attr, 5714 &md_resync_start.attr, 5715 &md_metadata.attr, 5716 &md_new_device.attr, 5717 &md_safe_delay.attr, 5718 &md_array_state.attr, 5719 &md_reshape_position.attr, 5720 &md_reshape_direction.attr, 5721 &md_array_size.attr, 5722 &max_corr_read_errors.attr, 5723 &md_consistency_policy.attr, 5724 &md_fail_last_dev.attr, 5725 &md_serialize_policy.attr, 5726 NULL, 5727 }; 5728 5729 static const struct attribute_group md_default_group = { 5730 .attrs = md_default_attrs, 5731 }; 5732 5733 static struct attribute *md_redundancy_attrs[] = { 5734 &md_scan_mode.attr, 5735 &md_last_scan_mode.attr, 5736 &md_mismatches.attr, 5737 &md_sync_min.attr, 5738 &md_sync_max.attr, 5739 &md_sync_io_depth.attr, 5740 &md_sync_speed.attr, 5741 &md_sync_force_parallel.attr, 5742 &md_sync_completed.attr, 5743 &md_min_sync.attr, 5744 &md_max_sync.attr, 5745 &md_suspend_lo.attr, 5746 &md_suspend_hi.attr, 5747 &md_bitmap.attr, 5748 &md_degraded.attr, 5749 NULL, 5750 }; 5751 static const struct attribute_group md_redundancy_group = { 5752 .name = NULL, 5753 .attrs = md_redundancy_attrs, 5754 }; 5755 5756 static const struct attribute_group *md_attr_groups[] = { 5757 &md_default_group, 5758 &md_bitmap_group, 5759 NULL, 5760 }; 5761 5762 static ssize_t 5763 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 5764 { 5765 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 5766 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 5767 ssize_t rv; 5768 5769 if (!entry->show) 5770 return -EIO; 5771 spin_lock(&all_mddevs_lock); 5772 if (!mddev_get(mddev)) { 5773 spin_unlock(&all_mddevs_lock); 5774 return -EBUSY; 5775 } 5776 spin_unlock(&all_mddevs_lock); 5777 5778 rv = entry->show(mddev, page); 5779 mddev_put(mddev); 5780 return rv; 5781 } 5782 5783 static ssize_t 5784 md_attr_store(struct kobject *kobj, struct attribute *attr, 5785 const char *page, size_t length) 5786 { 5787 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 5788 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 5789 ssize_t rv; 5790 struct kernfs_node *kn = NULL; 5791 5792 if (!entry->store) 5793 return -EIO; 5794 if (!capable(CAP_SYS_ADMIN)) 5795 return -EACCES; 5796 5797 if (entry->store == array_state_store && cmd_match(page, "clear")) 5798 kn = sysfs_break_active_protection(kobj, attr); 5799 5800 
spin_lock(&all_mddevs_lock); 5801 if (!mddev_get(mddev)) { 5802 spin_unlock(&all_mddevs_lock); 5803 if (kn) 5804 sysfs_unbreak_active_protection(kn); 5805 return -EBUSY; 5806 } 5807 spin_unlock(&all_mddevs_lock); 5808 rv = entry->store(mddev, page, length); 5809 mddev_put(mddev); 5810 5811 if (kn) 5812 sysfs_unbreak_active_protection(kn); 5813 5814 return rv; 5815 } 5816 5817 static void md_kobj_release(struct kobject *ko) 5818 { 5819 struct mddev *mddev = container_of(ko, struct mddev, kobj); 5820 5821 put_disk(mddev->gendisk); 5822 } 5823 5824 static const struct sysfs_ops md_sysfs_ops = { 5825 .show = md_attr_show, 5826 .store = md_attr_store, 5827 }; 5828 static const struct kobj_type md_ktype = { 5829 .release = md_kobj_release, 5830 .sysfs_ops = &md_sysfs_ops, 5831 .default_groups = md_attr_groups, 5832 }; 5833 5834 int mdp_major = 0; 5835 5836 /* stack the limit for all rdevs into lim */ 5837 int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim, 5838 unsigned int flags) 5839 { 5840 struct md_rdev *rdev; 5841 5842 rdev_for_each(rdev, mddev) { 5843 queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset, 5844 mddev->gendisk->disk_name); 5845 if ((flags & MDDEV_STACK_INTEGRITY) && 5846 !queue_limits_stack_integrity_bdev(lim, rdev->bdev)) 5847 return -EINVAL; 5848 } 5849 5850 return 0; 5851 } 5852 EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits); 5853 5854 /* apply the extra stacking limits from a new rdev into mddev */ 5855 int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev) 5856 { 5857 struct queue_limits lim; 5858 5859 if (mddev_is_dm(mddev)) 5860 return 0; 5861 5862 lim = queue_limits_start_update(mddev->gendisk->queue); 5863 queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset, 5864 mddev->gendisk->disk_name); 5865 5866 if (!queue_limits_stack_integrity_bdev(&lim, rdev->bdev)) { 5867 pr_err("%s: incompatible integrity profile for %pg\n", 5868 mdname(mddev), rdev->bdev); 5869 queue_limits_cancel_update(mddev->gendisk->queue); 5870 return -ENXIO; 5871 } 5872 5873 return queue_limits_commit_update(mddev->gendisk->queue, &lim); 5874 } 5875 EXPORT_SYMBOL_GPL(mddev_stack_new_rdev); 5876 5877 /* update the optimal I/O size after a reshape */ 5878 void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes) 5879 { 5880 struct queue_limits lim; 5881 5882 if (mddev_is_dm(mddev)) 5883 return; 5884 5885 /* don't bother updating io_opt if we can't suspend the array */ 5886 if (mddev_suspend(mddev, false) < 0) 5887 return; 5888 lim = queue_limits_start_update(mddev->gendisk->queue); 5889 lim.io_opt = lim.io_min * nr_stripes; 5890 queue_limits_commit_update(mddev->gendisk->queue, &lim); 5891 mddev_resume(mddev); 5892 } 5893 EXPORT_SYMBOL_GPL(mddev_update_io_opt); 5894 5895 static void mddev_delayed_delete(struct work_struct *ws) 5896 { 5897 struct mddev *mddev = container_of(ws, struct mddev, del_work); 5898 5899 kobject_put(&mddev->kobj); 5900 } 5901 5902 void md_init_stacking_limits(struct queue_limits *lim) 5903 { 5904 blk_set_stacking_limits(lim); 5905 lim->features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | 5906 BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; 5907 } 5908 EXPORT_SYMBOL_GPL(md_init_stacking_limits); 5909 5910 struct mddev *md_alloc(dev_t dev, char *name) 5911 { 5912 /* 5913 * If dev is zero, name is the name of a device to allocate with 5914 * an arbitrary minor number. It will be "md_???" 5915 * If dev is non-zero it must be a device number with a MAJOR of 5916 * MD_MAJOR or mdp_major. 
In this case, if "name" is NULL, then 5917 * the device is being created by opening a node in /dev. 5918 * If "name" is not NULL, the device is being created by 5919 * writing to /sys/module/md_mod/parameters/new_array. 5920 */ 5921 static DEFINE_MUTEX(disks_mutex); 5922 struct mddev *mddev; 5923 struct gendisk *disk; 5924 int partitioned; 5925 int shift; 5926 int unit; 5927 int error; 5928 5929 /* 5930 * Wait for any previous instance of this device to be completely 5931 * removed (mddev_delayed_delete). 5932 */ 5933 flush_workqueue(md_misc_wq); 5934 5935 mutex_lock(&disks_mutex); 5936 mddev = mddev_alloc(dev); 5937 if (IS_ERR(mddev)) { 5938 error = PTR_ERR(mddev); 5939 goto out_unlock; 5940 } 5941 5942 partitioned = (MAJOR(mddev->unit) != MD_MAJOR); 5943 shift = partitioned ? MdpMinorShift : 0; 5944 unit = MINOR(mddev->unit) >> shift; 5945 5946 if (name && !dev) { 5947 /* Need to ensure that 'name' is not a duplicate. 5948 */ 5949 struct mddev *mddev2; 5950 spin_lock(&all_mddevs_lock); 5951 5952 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 5953 if (mddev2->gendisk && 5954 strcmp(mddev2->gendisk->disk_name, name) == 0) { 5955 spin_unlock(&all_mddevs_lock); 5956 error = -EEXIST; 5957 goto out_free_mddev; 5958 } 5959 spin_unlock(&all_mddevs_lock); 5960 } 5961 if (name && dev) 5962 /* 5963 * Creating /dev/mdNNN via "newarray", so adjust hold_active. 5964 */ 5965 mddev->hold_active = UNTIL_STOP; 5966 5967 disk = blk_alloc_disk(NULL, NUMA_NO_NODE); 5968 if (IS_ERR(disk)) { 5969 error = PTR_ERR(disk); 5970 goto out_free_mddev; 5971 } 5972 5973 disk->major = MAJOR(mddev->unit); 5974 disk->first_minor = unit << shift; 5975 disk->minors = 1 << shift; 5976 if (name) 5977 strcpy(disk->disk_name, name); 5978 else if (partitioned) 5979 sprintf(disk->disk_name, "md_d%d", unit); 5980 else 5981 sprintf(disk->disk_name, "md%d", unit); 5982 disk->fops = &md_fops; 5983 disk->private_data = mddev; 5984 5985 disk->events |= DISK_EVENT_MEDIA_CHANGE; 5986 mddev->gendisk = disk; 5987 error = add_disk(disk); 5988 if (error) 5989 goto out_put_disk; 5990 5991 kobject_init(&mddev->kobj, &md_ktype); 5992 error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); 5993 if (error) { 5994 /* 5995 * The disk is already live at this point. Clear the hold flag 5996 * and let mddev_put take care of the deletion, as it isn't any 5997 * different from a normal close on last release now. 5998 */ 5999 mddev->hold_active = 0; 6000 mutex_unlock(&disks_mutex); 6001 mddev_put(mddev); 6002 return ERR_PTR(error); 6003 } 6004 6005 kobject_uevent(&mddev->kobj, KOBJ_ADD); 6006 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); 6007 mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); 6008 mutex_unlock(&disks_mutex); 6009 return mddev; 6010 6011 out_put_disk: 6012 put_disk(disk); 6013 out_free_mddev: 6014 mddev_free(mddev); 6015 out_unlock: 6016 mutex_unlock(&disks_mutex); 6017 return ERR_PTR(error); 6018 } 6019 6020 static int md_alloc_and_put(dev_t dev, char *name) 6021 { 6022 struct mddev *mddev = md_alloc(dev, name); 6023 6024 if (IS_ERR(mddev)) 6025 return PTR_ERR(mddev); 6026 mddev_put(mddev); 6027 return 0; 6028 } 6029 6030 static void md_probe(dev_t dev) 6031 { 6032 if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512) 6033 return; 6034 if (create_on_open) 6035 md_alloc_and_put(dev, NULL); 6036 } 6037 6038 static int add_named_array(const char *val, const struct kernel_param *kp) 6039 { 6040 /* 6041 * val must be "md_*" or "mdNNN". 
6042 * For "md_*" we allocate an array with a large free minor number, and 6043 * set the name to val. val must not already be an active name. 6044 * For "mdNNN" we allocate an array with the minor number NNN 6045 * which must not already be in use. 6046 */ 6047 int len = strlen(val); 6048 char buf[DISK_NAME_LEN]; 6049 unsigned long devnum; 6050 6051 while (len && val[len-1] == '\n') 6052 len--; 6053 if (len >= DISK_NAME_LEN) 6054 return -E2BIG; 6055 strscpy(buf, val, len+1); 6056 if (strncmp(buf, "md_", 3) == 0) 6057 return md_alloc_and_put(0, buf); 6058 if (strncmp(buf, "md", 2) == 0 && 6059 isdigit(buf[2]) && 6060 kstrtoul(buf+2, 10, &devnum) == 0 && 6061 devnum <= MINORMASK) 6062 return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL); 6063 6064 return -EINVAL; 6065 } 6066 6067 static void md_safemode_timeout(struct timer_list *t) 6068 { 6069 struct mddev *mddev = timer_container_of(mddev, t, safemode_timer); 6070 6071 mddev->safemode = 1; 6072 if (mddev->external) 6073 sysfs_notify_dirent_safe(mddev->sysfs_state); 6074 6075 md_wakeup_thread(mddev->thread); 6076 } 6077 6078 static int start_dirty_degraded; 6079 6080 int md_run(struct mddev *mddev) 6081 { 6082 int err; 6083 struct md_rdev *rdev; 6084 struct md_personality *pers; 6085 bool nowait = true; 6086 6087 if (list_empty(&mddev->disks)) 6088 /* cannot run an array with no devices.. */ 6089 return -EINVAL; 6090 6091 if (mddev->pers) 6092 return -EBUSY; 6093 /* Cannot run until previous stop completes properly */ 6094 if (mddev->sysfs_active) 6095 return -EBUSY; 6096 6097 /* 6098 * Analyze all RAID superblock(s) 6099 */ 6100 if (!mddev->raid_disks) { 6101 if (!mddev->persistent) 6102 return -EINVAL; 6103 err = analyze_sbs(mddev); 6104 if (err) 6105 return -EINVAL; 6106 } 6107 6108 if (mddev->level != LEVEL_NONE) 6109 request_module("md-level-%d", mddev->level); 6110 else if (mddev->clevel[0]) 6111 request_module("md-%s", mddev->clevel); 6112 6113 /* 6114 * Drop all container device buffers, from now on 6115 * the only valid external interface is through the md 6116 * device. 6117 */ 6118 mddev->has_superblocks = false; 6119 rdev_for_each(rdev, mddev) { 6120 if (test_bit(Faulty, &rdev->flags)) 6121 continue; 6122 sync_blockdev(rdev->bdev); 6123 invalidate_bdev(rdev->bdev); 6124 if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) { 6125 mddev->ro = MD_RDONLY; 6126 if (!mddev_is_dm(mddev)) 6127 set_disk_ro(mddev->gendisk, 1); 6128 } 6129 6130 if (rdev->sb_page) 6131 mddev->has_superblocks = true; 6132 6133 /* perform some consistency tests on the device. 6134 * We don't want the data to overlap the metadata, 6135 * Internal Bitmap issues have been handled elsewhere. 
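 * Two layouts are checked below: if the data area starts before the
 * superblock (data_offset < sb_start), the data must end before sb_start;
 * otherwise the superblock must end before data_offset.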
6136 */ 6137 if (rdev->meta_bdev) { 6138 /* Nothing to check */; 6139 } else if (rdev->data_offset < rdev->sb_start) { 6140 if (mddev->dev_sectors && 6141 rdev->data_offset + mddev->dev_sectors 6142 > rdev->sb_start) { 6143 pr_warn("md: %s: data overlaps metadata\n", 6144 mdname(mddev)); 6145 return -EINVAL; 6146 } 6147 } else { 6148 if (rdev->sb_start + rdev->sb_size/512 6149 > rdev->data_offset) { 6150 pr_warn("md: %s: metadata overlaps data\n", 6151 mdname(mddev)); 6152 return -EINVAL; 6153 } 6154 } 6155 sysfs_notify_dirent_safe(rdev->sysfs_state); 6156 nowait = nowait && bdev_nowait(rdev->bdev); 6157 } 6158 6159 if (!bioset_initialized(&mddev->bio_set)) { 6160 err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 6161 if (err) 6162 return err; 6163 } 6164 if (!bioset_initialized(&mddev->sync_set)) { 6165 err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 6166 if (err) 6167 goto exit_bio_set; 6168 } 6169 6170 if (!bioset_initialized(&mddev->io_clone_set)) { 6171 err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE, 6172 offsetof(struct md_io_clone, bio_clone), 0); 6173 if (err) 6174 goto exit_sync_set; 6175 } 6176 6177 pers = get_pers(mddev->level, mddev->clevel); 6178 if (!pers) { 6179 err = -EINVAL; 6180 goto abort; 6181 } 6182 if (mddev->level != pers->head.id) { 6183 mddev->level = pers->head.id; 6184 mddev->new_level = pers->head.id; 6185 } 6186 strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel)); 6187 6188 if (mddev->reshape_position != MaxSector && 6189 pers->start_reshape == NULL) { 6190 /* This personality cannot handle reshaping... */ 6191 put_pers(pers); 6192 err = -EINVAL; 6193 goto abort; 6194 } 6195 6196 if (pers->sync_request) { 6197 /* Warn if this is a potentially silly 6198 * configuration. 
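 * i.e. two members of the array that live on the same underlying
 * physical disk.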
6199 */ 6200 struct md_rdev *rdev2; 6201 int warned = 0; 6202 6203 rdev_for_each(rdev, mddev) 6204 rdev_for_each(rdev2, mddev) { 6205 if (rdev < rdev2 && 6206 rdev->bdev->bd_disk == 6207 rdev2->bdev->bd_disk) { 6208 pr_warn("%s: WARNING: %pg appears to be on the same physical disk as %pg.\n", 6209 mdname(mddev), 6210 rdev->bdev, 6211 rdev2->bdev); 6212 warned = 1; 6213 } 6214 } 6215 6216 if (warned) 6217 pr_warn("True protection against single-disk failure might be compromised.\n"); 6218 } 6219 6220 /* dm-raid expect sync_thread to be frozen until resume */ 6221 if (mddev->gendisk) 6222 mddev->recovery = 0; 6223 6224 /* may be over-ridden by personality */ 6225 mddev->resync_max_sectors = mddev->dev_sectors; 6226 6227 mddev->ok_start_degraded = start_dirty_degraded; 6228 6229 if (start_readonly && md_is_rdwr(mddev)) 6230 mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */ 6231 6232 err = pers->run(mddev); 6233 if (err) 6234 pr_warn("md: pers->run() failed ...\n"); 6235 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { 6236 WARN_ONCE(!mddev->external_size, 6237 "%s: default size too small, but 'external_size' not in effect?\n", 6238 __func__); 6239 pr_warn("md: invalid array_size %llu > default size %llu\n", 6240 (unsigned long long)mddev->array_sectors / 2, 6241 (unsigned long long)pers->size(mddev, 0, 0) / 2); 6242 err = -EINVAL; 6243 } 6244 if (err == 0 && pers->sync_request && 6245 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { 6246 err = mddev->bitmap_ops->create(mddev); 6247 if (err) 6248 pr_warn("%s: failed to create bitmap (%d)\n", 6249 mdname(mddev), err); 6250 } 6251 if (err) 6252 goto bitmap_abort; 6253 6254 if (mddev->bitmap_info.max_write_behind > 0) { 6255 bool create_pool = false; 6256 6257 rdev_for_each(rdev, mddev) { 6258 if (test_bit(WriteMostly, &rdev->flags) && 6259 rdev_init_serial(rdev)) 6260 create_pool = true; 6261 } 6262 if (create_pool && mddev->serial_info_pool == NULL) { 6263 mddev->serial_info_pool = 6264 mempool_create_kmalloc_pool(NR_SERIAL_INFOS, 6265 sizeof(struct serial_info)); 6266 if (!mddev->serial_info_pool) { 6267 err = -ENOMEM; 6268 goto bitmap_abort; 6269 } 6270 } 6271 } 6272 6273 if (pers->sync_request) { 6274 if (mddev->kobj.sd && 6275 sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 6276 pr_warn("md: cannot register extra attributes for %s\n", 6277 mdname(mddev)); 6278 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); 6279 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); 6280 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); 6281 } else if (mddev->ro == MD_AUTO_READ) 6282 mddev->ro = MD_RDWR; 6283 6284 atomic_set(&mddev->max_corr_read_errors, 6285 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); 6286 mddev->safemode = 0; 6287 if (mddev_is_clustered(mddev)) 6288 mddev->safemode_delay = 0; 6289 else 6290 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; 6291 mddev->in_sync = 1; 6292 smp_wmb(); 6293 spin_lock(&mddev->lock); 6294 mddev->pers = pers; 6295 spin_unlock(&mddev->lock); 6296 rdev_for_each(rdev, mddev) 6297 if (rdev->raid_disk >= 0) 6298 sysfs_link_rdev(mddev, rdev); /* failure here is OK */ 6299 6300 if (mddev->degraded && md_is_rdwr(mddev)) 6301 /* This ensures that recovering status is reported immediately 6302 * via sysfs - until a lack of spares is confirmed. 
6303 */ 6304 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6305 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6306 6307 if (mddev->sb_flags) 6308 md_update_sb(mddev, 0); 6309 6310 md_new_event(); 6311 return 0; 6312 6313 bitmap_abort: 6314 mddev_detach(mddev); 6315 if (mddev->private) 6316 pers->free(mddev, mddev->private); 6317 mddev->private = NULL; 6318 put_pers(pers); 6319 mddev->bitmap_ops->destroy(mddev); 6320 abort: 6321 bioset_exit(&mddev->io_clone_set); 6322 exit_sync_set: 6323 bioset_exit(&mddev->sync_set); 6324 exit_bio_set: 6325 bioset_exit(&mddev->bio_set); 6326 return err; 6327 } 6328 EXPORT_SYMBOL_GPL(md_run); 6329 6330 int do_md_run(struct mddev *mddev) 6331 { 6332 int err; 6333 6334 set_bit(MD_NOT_READY, &mddev->flags); 6335 err = md_run(mddev); 6336 if (err) 6337 goto out; 6338 6339 err = mddev->bitmap_ops->load(mddev); 6340 if (err) { 6341 mddev->bitmap_ops->destroy(mddev); 6342 goto out; 6343 } 6344 6345 if (mddev_is_clustered(mddev)) 6346 md_allow_write(mddev); 6347 6348 /* run start up tasks that require md_thread */ 6349 md_start(mddev); 6350 6351 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 6352 6353 set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); 6354 clear_bit(MD_NOT_READY, &mddev->flags); 6355 mddev->changed = 1; 6356 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 6357 sysfs_notify_dirent_safe(mddev->sysfs_state); 6358 sysfs_notify_dirent_safe(mddev->sysfs_action); 6359 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 6360 out: 6361 clear_bit(MD_NOT_READY, &mddev->flags); 6362 return err; 6363 } 6364 6365 int md_start(struct mddev *mddev) 6366 { 6367 int ret = 0; 6368 6369 if (mddev->pers->start) { 6370 set_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6371 ret = mddev->pers->start(mddev); 6372 clear_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6373 md_wakeup_thread(mddev->sync_thread); 6374 } 6375 return ret; 6376 } 6377 EXPORT_SYMBOL_GPL(md_start); 6378 6379 static int restart_array(struct mddev *mddev) 6380 { 6381 struct gendisk *disk = mddev->gendisk; 6382 struct md_rdev *rdev; 6383 bool has_journal = false; 6384 bool has_readonly = false; 6385 6386 /* Complain if it has no devices */ 6387 if (list_empty(&mddev->disks)) 6388 return -ENXIO; 6389 if (!mddev->pers) 6390 return -EINVAL; 6391 if (md_is_rdwr(mddev)) 6392 return -EBUSY; 6393 6394 rcu_read_lock(); 6395 rdev_for_each_rcu(rdev, mddev) { 6396 if (test_bit(Journal, &rdev->flags) && 6397 !test_bit(Faulty, &rdev->flags)) 6398 has_journal = true; 6399 if (rdev_read_only(rdev)) 6400 has_readonly = true; 6401 } 6402 rcu_read_unlock(); 6403 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal) 6404 /* Don't restart rw with journal missing/faulty */ 6405 return -EINVAL; 6406 if (has_readonly) 6407 return -EROFS; 6408 6409 mddev->safemode = 0; 6410 mddev->ro = MD_RDWR; 6411 set_disk_ro(disk, 0); 6412 pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); 6413 /* Kick recovery or resync if necessary */ 6414 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6415 md_wakeup_thread(mddev->sync_thread); 6416 sysfs_notify_dirent_safe(mddev->sysfs_state); 6417 return 0; 6418 } 6419 6420 static void md_clean(struct mddev *mddev) 6421 { 6422 mddev->array_sectors = 0; 6423 mddev->external_size = 0; 6424 mddev->dev_sectors = 0; 6425 mddev->raid_disks = 0; 6426 mddev->resync_offset = 0; 6427 mddev->resync_min = 0; 6428 mddev->resync_max = MaxSector; 6429 mddev->reshape_position = MaxSector; 6430 /* we still need mddev->external in export_rdev, do not clear 
it yet */ 6431 mddev->persistent = 0; 6432 mddev->level = LEVEL_NONE; 6433 mddev->clevel[0] = 0; 6434 /* if UNTIL_STOP is set, it's cleared here */ 6435 mddev->hold_active = 0; 6436 /* Don't clear MD_CLOSING, or mddev can be opened again. */ 6437 mddev->flags &= BIT_ULL_MASK(MD_CLOSING); 6438 mddev->sb_flags = 0; 6439 mddev->ro = MD_RDWR; 6440 mddev->metadata_type[0] = 0; 6441 mddev->chunk_sectors = 0; 6442 mddev->ctime = mddev->utime = 0; 6443 mddev->layout = 0; 6444 mddev->max_disks = 0; 6445 mddev->events = 0; 6446 mddev->can_decrease_events = 0; 6447 mddev->delta_disks = 0; 6448 mddev->reshape_backwards = 0; 6449 mddev->new_level = LEVEL_NONE; 6450 mddev->new_layout = 0; 6451 mddev->new_chunk_sectors = 0; 6452 mddev->curr_resync = MD_RESYNC_NONE; 6453 atomic64_set(&mddev->resync_mismatches, 0); 6454 mddev->suspend_lo = mddev->suspend_hi = 0; 6455 mddev->sync_speed_min = mddev->sync_speed_max = 0; 6456 mddev->recovery = 0; 6457 mddev->in_sync = 0; 6458 mddev->changed = 0; 6459 mddev->degraded = 0; 6460 mddev->safemode = 0; 6461 mddev->private = NULL; 6462 mddev->cluster_info = NULL; 6463 mddev->bitmap_info.offset = 0; 6464 mddev->bitmap_info.default_offset = 0; 6465 mddev->bitmap_info.default_space = 0; 6466 mddev->bitmap_info.chunksize = 0; 6467 mddev->bitmap_info.daemon_sleep = 0; 6468 mddev->bitmap_info.max_write_behind = 0; 6469 mddev->bitmap_info.nodes = 0; 6470 } 6471 6472 static void __md_stop_writes(struct mddev *mddev) 6473 { 6474 timer_delete_sync(&mddev->safemode_timer); 6475 6476 if (mddev->pers && mddev->pers->quiesce) { 6477 mddev->pers->quiesce(mddev, 1); 6478 mddev->pers->quiesce(mddev, 0); 6479 } 6480 6481 mddev->bitmap_ops->flush(mddev); 6482 6483 if (md_is_rdwr(mddev) && 6484 ((!mddev->in_sync && !mddev_is_clustered(mddev)) || 6485 mddev->sb_flags)) { 6486 /* mark array as shutdown cleanly */ 6487 if (!mddev_is_clustered(mddev)) 6488 mddev->in_sync = 1; 6489 md_update_sb(mddev, 1); 6490 } 6491 /* disable policy to guarantee rdevs free resources for serialization */ 6492 mddev->serialize_policy = 0; 6493 mddev_destroy_serial_pool(mddev, NULL); 6494 } 6495 6496 void md_stop_writes(struct mddev *mddev) 6497 { 6498 mddev_lock_nointr(mddev); 6499 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6500 stop_sync_thread(mddev, true); 6501 __md_stop_writes(mddev); 6502 mddev_unlock(mddev); 6503 } 6504 EXPORT_SYMBOL_GPL(md_stop_writes); 6505 6506 static void mddev_detach(struct mddev *mddev) 6507 { 6508 mddev->bitmap_ops->wait_behind_writes(mddev); 6509 if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { 6510 mddev->pers->quiesce(mddev, 1); 6511 mddev->pers->quiesce(mddev, 0); 6512 } 6513 md_unregister_thread(mddev, &mddev->thread); 6514 6515 /* the unplug fn references 'conf' */ 6516 if (!mddev_is_dm(mddev)) 6517 blk_sync_queue(mddev->gendisk->queue); 6518 } 6519 6520 static void __md_stop(struct mddev *mddev) 6521 { 6522 struct md_personality *pers = mddev->pers; 6523 6524 mddev->bitmap_ops->destroy(mddev); 6525 mddev_detach(mddev); 6526 spin_lock(&mddev->lock); 6527 mddev->pers = NULL; 6528 spin_unlock(&mddev->lock); 6529 if (mddev->private) 6530 pers->free(mddev, mddev->private); 6531 mddev->private = NULL; 6532 put_pers(pers); 6533 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6534 6535 bioset_exit(&mddev->bio_set); 6536 bioset_exit(&mddev->sync_set); 6537 bioset_exit(&mddev->io_clone_set); 6538 } 6539 6540 void md_stop(struct mddev *mddev) 6541 { 6542 lockdep_assert_held(&mddev->reconfig_mutex); 6543 6544 /* stop the array and free an attached data 
structures. 6545 * This is called from dm-raid 6546 */ 6547 __md_stop_writes(mddev); 6548 __md_stop(mddev); 6549 } 6550 6551 EXPORT_SYMBOL_GPL(md_stop); 6552 6553 /* ensure 'mddev->pers' exist before calling md_set_readonly() */ 6554 static int md_set_readonly(struct mddev *mddev) 6555 { 6556 int err = 0; 6557 int did_freeze = 0; 6558 6559 if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 6560 return -EBUSY; 6561 6562 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 6563 did_freeze = 1; 6564 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6565 } 6566 6567 stop_sync_thread(mddev, false); 6568 wait_event(mddev->sb_wait, 6569 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 6570 mddev_lock_nointr(mddev); 6571 6572 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 6573 pr_warn("md: %s still in use.\n",mdname(mddev)); 6574 err = -EBUSY; 6575 goto out; 6576 } 6577 6578 __md_stop_writes(mddev); 6579 6580 if (mddev->ro == MD_RDONLY) { 6581 err = -ENXIO; 6582 goto out; 6583 } 6584 6585 mddev->ro = MD_RDONLY; 6586 set_disk_ro(mddev->gendisk, 1); 6587 6588 out: 6589 if (!err || did_freeze) { 6590 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6591 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6592 sysfs_notify_dirent_safe(mddev->sysfs_state); 6593 } 6594 6595 return err; 6596 } 6597 6598 /* mode: 6599 * 0 - completely stop and dis-assemble array 6600 * 2 - stop but do not disassemble array 6601 */ 6602 static int do_md_stop(struct mddev *mddev, int mode) 6603 { 6604 struct gendisk *disk = mddev->gendisk; 6605 struct md_rdev *rdev; 6606 int did_freeze = 0; 6607 6608 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 6609 did_freeze = 1; 6610 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6611 } 6612 6613 stop_sync_thread(mddev, true); 6614 6615 if (mddev->sysfs_active || 6616 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 6617 pr_warn("md: %s still in use.\n",mdname(mddev)); 6618 if (did_freeze) { 6619 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6620 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6621 } 6622 return -EBUSY; 6623 } 6624 if (mddev->pers) { 6625 if (!md_is_rdwr(mddev)) 6626 set_disk_ro(disk, 0); 6627 6628 __md_stop_writes(mddev); 6629 __md_stop(mddev); 6630 6631 /* tell userspace to handle 'inactive' */ 6632 sysfs_notify_dirent_safe(mddev->sysfs_state); 6633 6634 rdev_for_each(rdev, mddev) 6635 if (rdev->raid_disk >= 0) 6636 sysfs_unlink_rdev(mddev, rdev); 6637 6638 set_capacity_and_notify(disk, 0); 6639 mddev->changed = 1; 6640 6641 if (!md_is_rdwr(mddev)) 6642 mddev->ro = MD_RDWR; 6643 } 6644 /* 6645 * Free resources if final stop 6646 */ 6647 if (mode == 0) { 6648 pr_info("md: %s stopped.\n", mdname(mddev)); 6649 6650 if (mddev->bitmap_info.file) { 6651 struct file *f = mddev->bitmap_info.file; 6652 spin_lock(&mddev->lock); 6653 mddev->bitmap_info.file = NULL; 6654 spin_unlock(&mddev->lock); 6655 fput(f); 6656 } 6657 mddev->bitmap_info.offset = 0; 6658 6659 export_array(mddev); 6660 md_clean(mddev); 6661 set_bit(MD_DELETED, &mddev->flags); 6662 } 6663 md_new_event(); 6664 sysfs_notify_dirent_safe(mddev->sysfs_state); 6665 return 0; 6666 } 6667 6668 #ifndef MODULE 6669 static void autorun_array(struct mddev *mddev) 6670 { 6671 struct md_rdev *rdev; 6672 int err; 6673 6674 if (list_empty(&mddev->disks)) 6675 return; 6676 6677 pr_info("md: running: "); 6678 6679 rdev_for_each(rdev, mddev) { 6680 pr_cont("<%pg>", rdev->bdev); 6681 } 6682 pr_cont("\n"); 6683 6684 err = do_md_run(mddev); 6685 if (err) { 6686 pr_warn("md: do_md_run() 
returned %d\n", err); 6687 do_md_stop(mddev, 0); 6688 } 6689 } 6690 6691 /* 6692 * lets try to run arrays based on all disks that have arrived 6693 * until now. (those are in pending_raid_disks) 6694 * 6695 * the method: pick the first pending disk, collect all disks with 6696 * the same UUID, remove all from the pending list and put them into 6697 * the 'same_array' list. Then order this list based on superblock 6698 * update time (freshest comes first), kick out 'old' disks and 6699 * compare superblocks. If everything's fine then run it. 6700 * 6701 * If "unit" is allocated, then bump its reference count 6702 */ 6703 static void autorun_devices(int part) 6704 { 6705 struct md_rdev *rdev0, *rdev, *tmp; 6706 struct mddev *mddev; 6707 6708 pr_info("md: autorun ...\n"); 6709 while (!list_empty(&pending_raid_disks)) { 6710 int unit; 6711 dev_t dev; 6712 LIST_HEAD(candidates); 6713 rdev0 = list_entry(pending_raid_disks.next, 6714 struct md_rdev, same_set); 6715 6716 pr_debug("md: considering %pg ...\n", rdev0->bdev); 6717 INIT_LIST_HEAD(&candidates); 6718 rdev_for_each_list(rdev, tmp, &pending_raid_disks) 6719 if (super_90_load(rdev, rdev0, 0) >= 0) { 6720 pr_debug("md: adding %pg ...\n", 6721 rdev->bdev); 6722 list_move(&rdev->same_set, &candidates); 6723 } 6724 /* 6725 * now we have a set of devices, with all of them having 6726 * mostly sane superblocks. It's time to allocate the 6727 * mddev. 6728 */ 6729 if (part) { 6730 dev = MKDEV(mdp_major, 6731 rdev0->preferred_minor << MdpMinorShift); 6732 unit = MINOR(dev) >> MdpMinorShift; 6733 } else { 6734 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 6735 unit = MINOR(dev); 6736 } 6737 if (rdev0->preferred_minor != unit) { 6738 pr_warn("md: unit number in %pg is bad: %d\n", 6739 rdev0->bdev, rdev0->preferred_minor); 6740 break; 6741 } 6742 6743 mddev = md_alloc(dev, NULL); 6744 if (IS_ERR(mddev)) 6745 break; 6746 6747 if (mddev_suspend_and_lock(mddev)) 6748 pr_warn("md: %s locked, cannot run\n", mdname(mddev)); 6749 else if (mddev->raid_disks || mddev->major_version 6750 || !list_empty(&mddev->disks)) { 6751 pr_warn("md: %s already running, cannot run %pg\n", 6752 mdname(mddev), rdev0->bdev); 6753 mddev_unlock_and_resume(mddev); 6754 } else { 6755 pr_debug("md: created %s\n", mdname(mddev)); 6756 mddev->persistent = 1; 6757 rdev_for_each_list(rdev, tmp, &candidates) { 6758 list_del_init(&rdev->same_set); 6759 if (bind_rdev_to_array(rdev, mddev)) 6760 export_rdev(rdev, mddev); 6761 } 6762 autorun_array(mddev); 6763 mddev_unlock_and_resume(mddev); 6764 } 6765 /* on success, candidates will be empty, on error 6766 * it won't... 6767 */ 6768 rdev_for_each_list(rdev, tmp, &candidates) { 6769 list_del_init(&rdev->same_set); 6770 export_rdev(rdev, mddev); 6771 } 6772 mddev_put(mddev); 6773 } 6774 pr_info("md: ... 
autorun DONE.\n"); 6775 } 6776 #endif /* !MODULE */ 6777 6778 static int get_version(void __user *arg) 6779 { 6780 mdu_version_t ver; 6781 6782 ver.major = MD_MAJOR_VERSION; 6783 ver.minor = MD_MINOR_VERSION; 6784 ver.patchlevel = MD_PATCHLEVEL_VERSION; 6785 6786 if (copy_to_user(arg, &ver, sizeof(ver))) 6787 return -EFAULT; 6788 6789 return 0; 6790 } 6791 6792 static int get_array_info(struct mddev *mddev, void __user *arg) 6793 { 6794 mdu_array_info_t info; 6795 int nr,working,insync,failed,spare; 6796 struct md_rdev *rdev; 6797 6798 nr = working = insync = failed = spare = 0; 6799 rcu_read_lock(); 6800 rdev_for_each_rcu(rdev, mddev) { 6801 nr++; 6802 if (test_bit(Faulty, &rdev->flags)) 6803 failed++; 6804 else { 6805 working++; 6806 if (test_bit(In_sync, &rdev->flags)) 6807 insync++; 6808 else if (test_bit(Journal, &rdev->flags)) 6809 /* TODO: add journal count to md_u.h */ 6810 ; 6811 else 6812 spare++; 6813 } 6814 } 6815 rcu_read_unlock(); 6816 6817 info.major_version = mddev->major_version; 6818 info.minor_version = mddev->minor_version; 6819 info.patch_version = MD_PATCHLEVEL_VERSION; 6820 info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); 6821 info.level = mddev->level; 6822 info.size = mddev->dev_sectors / 2; 6823 if (info.size != mddev->dev_sectors / 2) /* overflow */ 6824 info.size = -1; 6825 info.nr_disks = nr; 6826 info.raid_disks = mddev->raid_disks; 6827 info.md_minor = mddev->md_minor; 6828 info.not_persistent= !mddev->persistent; 6829 6830 info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 6831 info.state = 0; 6832 if (mddev->in_sync) 6833 info.state = (1<<MD_SB_CLEAN); 6834 if (mddev->bitmap && mddev->bitmap_info.offset) 6835 info.state |= (1<<MD_SB_BITMAP_PRESENT); 6836 if (mddev_is_clustered(mddev)) 6837 info.state |= (1<<MD_SB_CLUSTERED); 6838 info.active_disks = insync; 6839 info.working_disks = working; 6840 info.failed_disks = failed; 6841 info.spare_disks = spare; 6842 6843 info.layout = mddev->layout; 6844 info.chunk_size = mddev->chunk_sectors << 9; 6845 6846 if (copy_to_user(arg, &info, sizeof(info))) 6847 return -EFAULT; 6848 6849 return 0; 6850 } 6851 6852 static int get_bitmap_file(struct mddev *mddev, void __user * arg) 6853 { 6854 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 6855 char *ptr; 6856 int err; 6857 6858 file = kzalloc(sizeof(*file), GFP_NOIO); 6859 if (!file) 6860 return -ENOMEM; 6861 6862 err = 0; 6863 spin_lock(&mddev->lock); 6864 /* bitmap enabled */ 6865 if (mddev->bitmap_info.file) { 6866 ptr = file_path(mddev->bitmap_info.file, file->pathname, 6867 sizeof(file->pathname)); 6868 if (IS_ERR(ptr)) 6869 err = PTR_ERR(ptr); 6870 else 6871 memmove(file->pathname, ptr, 6872 sizeof(file->pathname)-(ptr-file->pathname)); 6873 } 6874 spin_unlock(&mddev->lock); 6875 6876 if (err == 0 && 6877 copy_to_user(arg, file, sizeof(*file))) 6878 err = -EFAULT; 6879 6880 kfree(file); 6881 return err; 6882 } 6883 6884 static int get_disk_info(struct mddev *mddev, void __user * arg) 6885 { 6886 mdu_disk_info_t info; 6887 struct md_rdev *rdev; 6888 6889 if (copy_from_user(&info, arg, sizeof(info))) 6890 return -EFAULT; 6891 6892 rcu_read_lock(); 6893 rdev = md_find_rdev_nr_rcu(mddev, info.number); 6894 if (rdev) { 6895 info.major = MAJOR(rdev->bdev->bd_dev); 6896 info.minor = MINOR(rdev->bdev->bd_dev); 6897 info.raid_disk = rdev->raid_disk; 6898 info.state = 0; 6899 if (test_bit(Faulty, &rdev->flags)) 6900 info.state |= (1<<MD_DISK_FAULTY); 6901 else if (test_bit(In_sync, &rdev->flags)) { 6902 info.state |= (1<<MD_DISK_ACTIVE); 
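/* an In_sync device is reported as in sync as well as active */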
6903 info.state |= (1<<MD_DISK_SYNC); 6904 } 6905 if (test_bit(Journal, &rdev->flags)) 6906 info.state |= (1<<MD_DISK_JOURNAL); 6907 if (test_bit(WriteMostly, &rdev->flags)) 6908 info.state |= (1<<MD_DISK_WRITEMOSTLY); 6909 if (test_bit(FailFast, &rdev->flags)) 6910 info.state |= (1<<MD_DISK_FAILFAST); 6911 } else { 6912 info.major = info.minor = 0; 6913 info.raid_disk = -1; 6914 info.state = (1<<MD_DISK_REMOVED); 6915 } 6916 rcu_read_unlock(); 6917 6918 if (copy_to_user(arg, &info, sizeof(info))) 6919 return -EFAULT; 6920 6921 return 0; 6922 } 6923 6924 int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) 6925 { 6926 struct md_rdev *rdev; 6927 dev_t dev = MKDEV(info->major,info->minor); 6928 6929 if (mddev_is_clustered(mddev) && 6930 !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { 6931 pr_warn("%s: Cannot add to clustered mddev.\n", 6932 mdname(mddev)); 6933 return -EINVAL; 6934 } 6935 6936 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 6937 return -EOVERFLOW; 6938 6939 if (!mddev->raid_disks) { 6940 int err; 6941 /* expecting a device which has a superblock */ 6942 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 6943 if (IS_ERR(rdev)) { 6944 pr_warn("md: md_import_device returned %ld\n", 6945 PTR_ERR(rdev)); 6946 return PTR_ERR(rdev); 6947 } 6948 if (!list_empty(&mddev->disks)) { 6949 struct md_rdev *rdev0 6950 = list_entry(mddev->disks.next, 6951 struct md_rdev, same_set); 6952 err = super_types[mddev->major_version] 6953 .load_super(rdev, rdev0, mddev->minor_version); 6954 if (err < 0) { 6955 pr_warn("md: %pg has different UUID to %pg\n", 6956 rdev->bdev, 6957 rdev0->bdev); 6958 export_rdev(rdev, mddev); 6959 return -EINVAL; 6960 } 6961 } 6962 err = bind_rdev_to_array(rdev, mddev); 6963 if (err) 6964 export_rdev(rdev, mddev); 6965 return err; 6966 } 6967 6968 /* 6969 * md_add_new_disk can be used once the array is assembled 6970 * to add "hot spares". They must already have a superblock 6971 * written 6972 */ 6973 if (mddev->pers) { 6974 int err; 6975 if (!mddev->pers->hot_add_disk) { 6976 pr_warn("%s: personality does not support diskops!\n", 6977 mdname(mddev)); 6978 return -EINVAL; 6979 } 6980 if (mddev->persistent) 6981 rdev = md_import_device(dev, mddev->major_version, 6982 mddev->minor_version); 6983 else 6984 rdev = md_import_device(dev, -1, -1); 6985 if (IS_ERR(rdev)) { 6986 pr_warn("md: md_import_device returned %ld\n", 6987 PTR_ERR(rdev)); 6988 return PTR_ERR(rdev); 6989 } 6990 /* set saved_raid_disk if appropriate */ 6991 if (!mddev->persistent) { 6992 if (info->state & (1<<MD_DISK_SYNC) && 6993 info->raid_disk < mddev->raid_disks) { 6994 rdev->raid_disk = info->raid_disk; 6995 clear_bit(Bitmap_sync, &rdev->flags); 6996 } else 6997 rdev->raid_disk = -1; 6998 rdev->saved_raid_disk = rdev->raid_disk; 6999 } else 7000 super_types[mddev->major_version]. 7001 validate_super(mddev, NULL/*freshest*/, rdev); 7002 if ((info->state & (1<<MD_DISK_SYNC)) && 7003 rdev->raid_disk != info->raid_disk) { 7004 /* This was a hot-add request, but events doesn't 7005 * match, so reject it. 
7006 */ 7007 export_rdev(rdev, mddev); 7008 return -EINVAL; 7009 } 7010 7011 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 7012 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 7013 set_bit(WriteMostly, &rdev->flags); 7014 else 7015 clear_bit(WriteMostly, &rdev->flags); 7016 if (info->state & (1<<MD_DISK_FAILFAST)) 7017 set_bit(FailFast, &rdev->flags); 7018 else 7019 clear_bit(FailFast, &rdev->flags); 7020 7021 if (info->state & (1<<MD_DISK_JOURNAL)) { 7022 struct md_rdev *rdev2; 7023 bool has_journal = false; 7024 7025 /* make sure no existing journal disk */ 7026 rdev_for_each(rdev2, mddev) { 7027 if (test_bit(Journal, &rdev2->flags)) { 7028 has_journal = true; 7029 break; 7030 } 7031 } 7032 if (has_journal || mddev->bitmap) { 7033 export_rdev(rdev, mddev); 7034 return -EBUSY; 7035 } 7036 set_bit(Journal, &rdev->flags); 7037 } 7038 /* 7039 * check whether the device shows up in other nodes 7040 */ 7041 if (mddev_is_clustered(mddev)) { 7042 if (info->state & (1 << MD_DISK_CANDIDATE)) 7043 set_bit(Candidate, &rdev->flags); 7044 else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { 7045 /* --add initiated by this node */ 7046 err = mddev->cluster_ops->add_new_disk(mddev, rdev); 7047 if (err) { 7048 export_rdev(rdev, mddev); 7049 return err; 7050 } 7051 } 7052 } 7053 7054 rdev->raid_disk = -1; 7055 err = bind_rdev_to_array(rdev, mddev); 7056 7057 if (err) 7058 export_rdev(rdev, mddev); 7059 7060 if (mddev_is_clustered(mddev)) { 7061 if (info->state & (1 << MD_DISK_CANDIDATE)) { 7062 if (!err) { 7063 err = mddev->cluster_ops->new_disk_ack( 7064 mddev, err == 0); 7065 if (err) 7066 md_kick_rdev_from_array(rdev); 7067 } 7068 } else { 7069 if (err) 7070 mddev->cluster_ops->add_new_disk_cancel(mddev); 7071 else 7072 err = add_bound_rdev(rdev); 7073 } 7074 7075 } else if (!err) 7076 err = add_bound_rdev(rdev); 7077 7078 return err; 7079 } 7080 7081 /* otherwise, md_add_new_disk is only allowed 7082 * for major_version==0 superblocks 7083 */ 7084 if (mddev->major_version != 0) { 7085 pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev)); 7086 return -EINVAL; 7087 } 7088 7089 if (!(info->state & (1<<MD_DISK_FAULTY))) { 7090 int err; 7091 rdev = md_import_device(dev, -1, 0); 7092 if (IS_ERR(rdev)) { 7093 pr_warn("md: error, md_import_device() returned %ld\n", 7094 PTR_ERR(rdev)); 7095 return PTR_ERR(rdev); 7096 } 7097 rdev->desc_nr = info->number; 7098 if (info->raid_disk < mddev->raid_disks) 7099 rdev->raid_disk = info->raid_disk; 7100 else 7101 rdev->raid_disk = -1; 7102 7103 if (rdev->raid_disk < mddev->raid_disks) 7104 if (info->state & (1<<MD_DISK_SYNC)) 7105 set_bit(In_sync, &rdev->flags); 7106 7107 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 7108 set_bit(WriteMostly, &rdev->flags); 7109 if (info->state & (1<<MD_DISK_FAILFAST)) 7110 set_bit(FailFast, &rdev->flags); 7111 7112 if (!mddev->persistent) { 7113 pr_debug("md: nonpersistent superblock ...\n"); 7114 rdev->sb_start = bdev_nr_sectors(rdev->bdev); 7115 } else 7116 rdev->sb_start = calc_dev_sboffset(rdev); 7117 rdev->sectors = rdev->sb_start; 7118 7119 err = bind_rdev_to_array(rdev, mddev); 7120 if (err) { 7121 export_rdev(rdev, mddev); 7122 return err; 7123 } 7124 } 7125 7126 return 0; 7127 } 7128 7129 static int hot_remove_disk(struct mddev *mddev, dev_t dev) 7130 { 7131 struct md_rdev *rdev; 7132 7133 if (!mddev->pers) 7134 return -ENODEV; 7135 7136 rdev = find_rdev(mddev, dev); 7137 if (!rdev) 7138 return -ENXIO; 7139 7140 if (rdev->raid_disk < 0) 7141 goto kick_rdev; 7142 7143 clear_bit(Blocked, &rdev->flags); 7144 
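/* Blocked is cleared so the device becomes eligible for removal; remove_and_add_spares() then tries to detach it, and if it is still in use raid_disk stays >= 0 and -EBUSY is returned below. */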
remove_and_add_spares(mddev, rdev); 7145 7146 if (rdev->raid_disk >= 0) 7147 goto busy; 7148 7149 kick_rdev: 7150 if (mddev_is_clustered(mddev) && 7151 mddev->cluster_ops->remove_disk(mddev, rdev)) 7152 goto busy; 7153 7154 md_kick_rdev_from_array(rdev); 7155 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7156 if (!mddev->thread) 7157 md_update_sb(mddev, 1); 7158 md_new_event(); 7159 7160 return 0; 7161 busy: 7162 pr_debug("md: cannot remove active disk %pg from %s ...\n", 7163 rdev->bdev, mdname(mddev)); 7164 return -EBUSY; 7165 } 7166 7167 static int hot_add_disk(struct mddev *mddev, dev_t dev) 7168 { 7169 int err; 7170 struct md_rdev *rdev; 7171 7172 if (!mddev->pers) 7173 return -ENODEV; 7174 7175 if (mddev->major_version != 0) { 7176 pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n", 7177 mdname(mddev)); 7178 return -EINVAL; 7179 } 7180 if (!mddev->pers->hot_add_disk) { 7181 pr_warn("%s: personality does not support diskops!\n", 7182 mdname(mddev)); 7183 return -EINVAL; 7184 } 7185 7186 rdev = md_import_device(dev, -1, 0); 7187 if (IS_ERR(rdev)) { 7188 pr_warn("md: error, md_import_device() returned %ld\n", 7189 PTR_ERR(rdev)); 7190 return -EINVAL; 7191 } 7192 7193 if (mddev->persistent) 7194 rdev->sb_start = calc_dev_sboffset(rdev); 7195 else 7196 rdev->sb_start = bdev_nr_sectors(rdev->bdev); 7197 7198 rdev->sectors = rdev->sb_start; 7199 7200 if (test_bit(Faulty, &rdev->flags)) { 7201 pr_warn("md: can not hot-add faulty %pg disk to %s!\n", 7202 rdev->bdev, mdname(mddev)); 7203 err = -EINVAL; 7204 goto abort_export; 7205 } 7206 7207 clear_bit(In_sync, &rdev->flags); 7208 rdev->desc_nr = -1; 7209 rdev->saved_raid_disk = -1; 7210 err = bind_rdev_to_array(rdev, mddev); 7211 if (err) 7212 goto abort_export; 7213 7214 /* 7215 * The rest should better be atomic, we can have disk failures 7216 * noticed in interrupt contexts ... 7217 */ 7218 7219 rdev->raid_disk = -1; 7220 7221 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7222 if (!mddev->thread) 7223 md_update_sb(mddev, 1); 7224 /* 7225 * Kick recovery, maybe this spare has to be added to the 7226 * array immediately. 7227 */ 7228 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7229 md_new_event(); 7230 return 0; 7231 7232 abort_export: 7233 export_rdev(rdev, mddev); 7234 return err; 7235 } 7236 7237 static int set_bitmap_file(struct mddev *mddev, int fd) 7238 { 7239 int err = 0; 7240 7241 if (mddev->pers) { 7242 if (!mddev->pers->quiesce || !mddev->thread) 7243 return -EBUSY; 7244 if (mddev->recovery || mddev->sync_thread) 7245 return -EBUSY; 7246 /* we should be able to change the bitmap.. 
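(fd >= 0 attaches the given file as the new bitmap, fd < 0 removes the current one; a running resync or recovery was rejected just above)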
*/ 7247 } 7248 7249 if (fd >= 0) { 7250 struct inode *inode; 7251 struct file *f; 7252 7253 if (mddev->bitmap || mddev->bitmap_info.file) 7254 return -EEXIST; /* cannot add when bitmap is present */ 7255 7256 if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) { 7257 pr_warn("%s: bitmap files not supported by this kernel\n", 7258 mdname(mddev)); 7259 return -EINVAL; 7260 } 7261 pr_warn("%s: using deprecated bitmap file support\n", 7262 mdname(mddev)); 7263 7264 f = fget(fd); 7265 7266 if (f == NULL) { 7267 pr_warn("%s: error: failed to get bitmap file\n", 7268 mdname(mddev)); 7269 return -EBADF; 7270 } 7271 7272 inode = f->f_mapping->host; 7273 if (!S_ISREG(inode->i_mode)) { 7274 pr_warn("%s: error: bitmap file must be a regular file\n", 7275 mdname(mddev)); 7276 err = -EBADF; 7277 } else if (!(f->f_mode & FMODE_WRITE)) { 7278 pr_warn("%s: error: bitmap file must be opened for write\n", 7279 mdname(mddev)); 7280 err = -EBADF; 7281 } else if (atomic_read(&inode->i_writecount) != 1) { 7282 pr_warn("%s: error: bitmap file is already in use\n", 7283 mdname(mddev)); 7284 err = -EBUSY; 7285 } 7286 if (err) { 7287 fput(f); 7288 return err; 7289 } 7290 mddev->bitmap_info.file = f; 7291 mddev->bitmap_info.offset = 0; /* file overrides offset */ 7292 } else if (mddev->bitmap == NULL) 7293 return -ENOENT; /* cannot remove what isn't there */ 7294 err = 0; 7295 if (mddev->pers) { 7296 if (fd >= 0) { 7297 err = mddev->bitmap_ops->create(mddev); 7298 if (!err) 7299 err = mddev->bitmap_ops->load(mddev); 7300 7301 if (err) { 7302 mddev->bitmap_ops->destroy(mddev); 7303 fd = -1; 7304 } 7305 } else if (fd < 0) { 7306 mddev->bitmap_ops->destroy(mddev); 7307 } 7308 } 7309 7310 if (fd < 0) { 7311 struct file *f = mddev->bitmap_info.file; 7312 if (f) { 7313 spin_lock(&mddev->lock); 7314 mddev->bitmap_info.file = NULL; 7315 spin_unlock(&mddev->lock); 7316 fput(f); 7317 } 7318 } 7319 7320 return err; 7321 } 7322 7323 /* 7324 * md_set_array_info is used in two different ways. 7325 * The original usage is when creating a new array. 7326 * In this usage, raid_disks is > 0 and it, together with 7327 * level, size, not_persistent, layout and chunk_size, determines the 7328 * shape of the array. 7329 * This will always create an array with a type-0.90.0 superblock. 7330 * The newer usage is when assembling an array. 7331 * In this case raid_disks will be 0, and the major_version field is 7332 * used to determine which style super-blocks are to be found on the devices. 7333 * The minor and patch _version numbers are also kept in case the 7334 * super_block handler wishes to interpret them. 7335 */ 7336 int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info) 7337 { 7338 if (info->raid_disks == 0) { 7339 /* just setting version number for superblock loading */ 7340 if (info->major_version < 0 || 7341 info->major_version >= ARRAY_SIZE(super_types) || 7342 super_types[info->major_version].name == NULL) { 7343 /* maybe try to auto-load a module? */ 7344 pr_warn("md: superblock version %d not known\n", 7345 info->major_version); 7346 return -EINVAL; 7347 } 7348 mddev->major_version = info->major_version; 7349 mddev->minor_version = info->minor_version; 7350 mddev->patch_version = info->patch_version; 7351 mddev->persistent = !info->not_persistent; 7352 /* ensure mddev_put doesn't delete this now that there 7353 * is some minimal configuration.
7354 */ 7355 mddev->ctime = ktime_get_real_seconds(); 7356 return 0; 7357 } 7358 mddev->major_version = MD_MAJOR_VERSION; 7359 mddev->minor_version = MD_MINOR_VERSION; 7360 mddev->patch_version = MD_PATCHLEVEL_VERSION; 7361 mddev->ctime = ktime_get_real_seconds(); 7362 7363 mddev->level = info->level; 7364 mddev->clevel[0] = 0; 7365 mddev->dev_sectors = 2 * (sector_t)info->size; 7366 mddev->raid_disks = info->raid_disks; 7367 /* don't set md_minor, it is determined by which /dev/md* was 7368 * opened 7369 */ 7370 if (info->state & (1<<MD_SB_CLEAN)) 7371 mddev->resync_offset = MaxSector; 7372 else 7373 mddev->resync_offset = 0; 7374 mddev->persistent = ! info->not_persistent; 7375 mddev->external = 0; 7376 7377 mddev->layout = info->layout; 7378 if (mddev->level == 0) 7379 /* Cannot trust RAID0 layout info here */ 7380 mddev->layout = -1; 7381 mddev->chunk_sectors = info->chunk_size >> 9; 7382 7383 if (mddev->persistent) { 7384 mddev->max_disks = MD_SB_DISKS; 7385 mddev->flags = 0; 7386 mddev->sb_flags = 0; 7387 } 7388 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7389 7390 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 7391 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 7392 mddev->bitmap_info.offset = 0; 7393 7394 mddev->reshape_position = MaxSector; 7395 7396 /* 7397 * Generate a 128 bit UUID 7398 */ 7399 get_random_bytes(mddev->uuid, 16); 7400 7401 mddev->new_level = mddev->level; 7402 mddev->new_chunk_sectors = mddev->chunk_sectors; 7403 mddev->new_layout = mddev->layout; 7404 mddev->delta_disks = 0; 7405 mddev->reshape_backwards = 0; 7406 7407 return 0; 7408 } 7409 7410 void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) 7411 { 7412 lockdep_assert_held(&mddev->reconfig_mutex); 7413 7414 if (mddev->external_size) 7415 return; 7416 7417 mddev->array_sectors = array_sectors; 7418 } 7419 EXPORT_SYMBOL(md_set_array_sectors); 7420 7421 static int update_size(struct mddev *mddev, sector_t num_sectors) 7422 { 7423 struct md_rdev *rdev; 7424 int rv; 7425 int fit = (num_sectors == 0); 7426 sector_t old_dev_sectors = mddev->dev_sectors; 7427 7428 if (mddev->pers->resize == NULL) 7429 return -EINVAL; 7430 /* The "num_sectors" is the number of sectors of each device that 7431 * is used. This can only make sense for arrays with redundancy. 7432 * linear and raid0 always use whatever space is available. We can only 7433 * consider changing this number if no resync or reconstruction is 7434 * happening, and if the new size is acceptable. It must fit before the 7435 * sb_start or, if that is <data_offset, it must fit before the size 7436 * of each device. If num_sectors is zero, we find the largest size 7437 * that fits.
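 * Passing num_sectors == 0 therefore selects the largest size that every
 * member device can still provide.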
7438 */ 7439 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 7440 return -EBUSY; 7441 if (!md_is_rdwr(mddev)) 7442 return -EROFS; 7443 7444 rdev_for_each(rdev, mddev) { 7445 sector_t avail = rdev->sectors; 7446 7447 if (fit && (num_sectors == 0 || num_sectors > avail)) 7448 num_sectors = avail; 7449 if (avail < num_sectors) 7450 return -ENOSPC; 7451 } 7452 rv = mddev->pers->resize(mddev, num_sectors); 7453 if (!rv) { 7454 if (mddev_is_clustered(mddev)) 7455 mddev->cluster_ops->update_size(mddev, old_dev_sectors); 7456 else if (!mddev_is_dm(mddev)) 7457 set_capacity_and_notify(mddev->gendisk, 7458 mddev->array_sectors); 7459 } 7460 return rv; 7461 } 7462 7463 static int update_raid_disks(struct mddev *mddev, int raid_disks) 7464 { 7465 int rv; 7466 struct md_rdev *rdev; 7467 /* change the number of raid disks */ 7468 if (mddev->pers->check_reshape == NULL) 7469 return -EINVAL; 7470 if (!md_is_rdwr(mddev)) 7471 return -EROFS; 7472 if (raid_disks <= 0 || 7473 (mddev->max_disks && raid_disks >= mddev->max_disks)) 7474 return -EINVAL; 7475 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 7476 test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) || 7477 mddev->reshape_position != MaxSector) 7478 return -EBUSY; 7479 7480 rdev_for_each(rdev, mddev) { 7481 if (mddev->raid_disks < raid_disks && 7482 rdev->data_offset < rdev->new_data_offset) 7483 return -EINVAL; 7484 if (mddev->raid_disks > raid_disks && 7485 rdev->data_offset > rdev->new_data_offset) 7486 return -EINVAL; 7487 } 7488 7489 mddev->delta_disks = raid_disks - mddev->raid_disks; 7490 if (mddev->delta_disks < 0) 7491 mddev->reshape_backwards = 1; 7492 else if (mddev->delta_disks > 0) 7493 mddev->reshape_backwards = 0; 7494 7495 rv = mddev->pers->check_reshape(mddev); 7496 if (rv < 0) { 7497 mddev->delta_disks = 0; 7498 mddev->reshape_backwards = 0; 7499 } 7500 return rv; 7501 } 7502 7503 static int get_cluster_ops(struct mddev *mddev) 7504 { 7505 xa_lock(&md_submodule); 7506 mddev->cluster_ops = xa_load(&md_submodule, ID_CLUSTER); 7507 if (mddev->cluster_ops && 7508 !try_module_get(mddev->cluster_ops->head.owner)) 7509 mddev->cluster_ops = NULL; 7510 xa_unlock(&md_submodule); 7511 7512 return mddev->cluster_ops == NULL ? -ENOENT : 0; 7513 } 7514 7515 static void put_cluster_ops(struct mddev *mddev) 7516 { 7517 if (!mddev->cluster_ops) 7518 return; 7519 7520 mddev->cluster_ops->leave(mddev); 7521 module_put(mddev->cluster_ops->head.owner); 7522 mddev->cluster_ops = NULL; 7523 } 7524 7525 /* 7526 * update_array_info is used to change the configuration of an 7527 * on-line array. 7528 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 7529 * fields in the info are checked against the array. 7530 * Any differences that cannot be handled will cause an error. 7531 * Normally, only one change can be managed at a time. 
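 * (the code below counts the requested changes and rejects an ioctl that
 * asks for more than one)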
7532 */ 7533 static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) 7534 { 7535 int rv = 0; 7536 int cnt = 0; 7537 int state = 0; 7538 7539 /* calculate expected state,ignoring low bits */ 7540 if (mddev->bitmap && mddev->bitmap_info.offset) 7541 state |= (1 << MD_SB_BITMAP_PRESENT); 7542 7543 if (mddev->major_version != info->major_version || 7544 mddev->minor_version != info->minor_version || 7545 /* mddev->patch_version != info->patch_version || */ 7546 mddev->ctime != info->ctime || 7547 mddev->level != info->level || 7548 /* mddev->layout != info->layout || */ 7549 mddev->persistent != !info->not_persistent || 7550 mddev->chunk_sectors != info->chunk_size >> 9 || 7551 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 7552 ((state^info->state) & 0xfffffe00) 7553 ) 7554 return -EINVAL; 7555 /* Check there is only one change */ 7556 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 7557 cnt++; 7558 if (mddev->raid_disks != info->raid_disks) 7559 cnt++; 7560 if (mddev->layout != info->layout) 7561 cnt++; 7562 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) 7563 cnt++; 7564 if (cnt == 0) 7565 return 0; 7566 if (cnt > 1) 7567 return -EINVAL; 7568 7569 if (mddev->layout != info->layout) { 7570 /* Change layout 7571 * we don't need to do anything at the md level, the 7572 * personality will take care of it all. 7573 */ 7574 if (mddev->pers->check_reshape == NULL) 7575 return -EINVAL; 7576 else { 7577 mddev->new_layout = info->layout; 7578 rv = mddev->pers->check_reshape(mddev); 7579 if (rv) 7580 mddev->new_layout = mddev->layout; 7581 return rv; 7582 } 7583 } 7584 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 7585 rv = update_size(mddev, (sector_t)info->size * 2); 7586 7587 if (mddev->raid_disks != info->raid_disks) 7588 rv = update_raid_disks(mddev, info->raid_disks); 7589 7590 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 7591 if (mddev->pers->quiesce == NULL || mddev->thread == NULL) { 7592 rv = -EINVAL; 7593 goto err; 7594 } 7595 if (mddev->recovery || mddev->sync_thread) { 7596 rv = -EBUSY; 7597 goto err; 7598 } 7599 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 7600 /* add the bitmap */ 7601 if (mddev->bitmap) { 7602 rv = -EEXIST; 7603 goto err; 7604 } 7605 if (mddev->bitmap_info.default_offset == 0) { 7606 rv = -EINVAL; 7607 goto err; 7608 } 7609 mddev->bitmap_info.offset = 7610 mddev->bitmap_info.default_offset; 7611 mddev->bitmap_info.space = 7612 mddev->bitmap_info.default_space; 7613 rv = mddev->bitmap_ops->create(mddev); 7614 if (!rv) 7615 rv = mddev->bitmap_ops->load(mddev); 7616 7617 if (rv) 7618 mddev->bitmap_ops->destroy(mddev); 7619 } else { 7620 struct md_bitmap_stats stats; 7621 7622 rv = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 7623 if (rv) 7624 goto err; 7625 7626 if (stats.file) { 7627 rv = -EINVAL; 7628 goto err; 7629 } 7630 7631 if (mddev->bitmap_info.nodes) { 7632 /* hold PW on all the bitmap lock */ 7633 if (mddev->cluster_ops->lock_all_bitmaps(mddev) <= 0) { 7634 pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n"); 7635 rv = -EPERM; 7636 mddev->cluster_ops->unlock_all_bitmaps(mddev); 7637 goto err; 7638 } 7639 7640 mddev->bitmap_info.nodes = 0; 7641 put_cluster_ops(mddev); 7642 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; 7643 } 7644 mddev->bitmap_ops->destroy(mddev); 7645 mddev->bitmap_info.offset = 0; 7646 } 7647 } 7648 md_update_sb(mddev, 1); 7649 return rv; 7650 err: 7651 return rv; 7652 } 7653 7654 static int 
set_disk_faulty(struct mddev *mddev, dev_t dev) 7655 { 7656 struct md_rdev *rdev; 7657 int err = 0; 7658 7659 if (mddev->pers == NULL) 7660 return -ENODEV; 7661 7662 rcu_read_lock(); 7663 rdev = md_find_rdev_rcu(mddev, dev); 7664 if (!rdev) 7665 err = -ENODEV; 7666 else { 7667 md_error(mddev, rdev); 7668 if (test_bit(MD_BROKEN, &mddev->flags)) 7669 err = -EBUSY; 7670 } 7671 rcu_read_unlock(); 7672 return err; 7673 } 7674 7675 /* 7676 * We have a problem here : there is no easy way to give a CHS 7677 * virtual geometry. We currently pretend that we have a 2 heads 7678 * 4 sectors (with a BIG number of cylinders...). This drives 7679 * dosfs just mad... ;-) 7680 */ 7681 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 7682 { 7683 struct mddev *mddev = bdev->bd_disk->private_data; 7684 7685 geo->heads = 2; 7686 geo->sectors = 4; 7687 geo->cylinders = mddev->array_sectors / 8; 7688 return 0; 7689 } 7690 7691 static inline int md_ioctl_valid(unsigned int cmd) 7692 { 7693 switch (cmd) { 7694 case GET_ARRAY_INFO: 7695 case GET_DISK_INFO: 7696 case RAID_VERSION: 7697 return 0; 7698 case ADD_NEW_DISK: 7699 case GET_BITMAP_FILE: 7700 case HOT_ADD_DISK: 7701 case HOT_REMOVE_DISK: 7702 case RESTART_ARRAY_RW: 7703 case RUN_ARRAY: 7704 case SET_ARRAY_INFO: 7705 case SET_BITMAP_FILE: 7706 case SET_DISK_FAULTY: 7707 case STOP_ARRAY: 7708 case STOP_ARRAY_RO: 7709 case CLUSTERED_DISK_NACK: 7710 if (!capable(CAP_SYS_ADMIN)) 7711 return -EACCES; 7712 return 0; 7713 default: 7714 return -ENOTTY; 7715 } 7716 } 7717 7718 static bool md_ioctl_need_suspend(unsigned int cmd) 7719 { 7720 switch (cmd) { 7721 case ADD_NEW_DISK: 7722 case HOT_ADD_DISK: 7723 case HOT_REMOVE_DISK: 7724 case SET_BITMAP_FILE: 7725 case SET_ARRAY_INFO: 7726 return true; 7727 default: 7728 return false; 7729 } 7730 } 7731 7732 static int __md_set_array_info(struct mddev *mddev, void __user *argp) 7733 { 7734 mdu_array_info_t info; 7735 int err; 7736 7737 if (!argp) 7738 memset(&info, 0, sizeof(info)); 7739 else if (copy_from_user(&info, argp, sizeof(info))) 7740 return -EFAULT; 7741 7742 if (mddev->pers) { 7743 err = update_array_info(mddev, &info); 7744 if (err) 7745 pr_warn("md: couldn't update array info. %d\n", err); 7746 return err; 7747 } 7748 7749 if (!list_empty(&mddev->disks)) { 7750 pr_warn("md: array %s already has disks!\n", mdname(mddev)); 7751 return -EBUSY; 7752 } 7753 7754 if (mddev->raid_disks) { 7755 pr_warn("md: array %s already initialised!\n", mdname(mddev)); 7756 return -EBUSY; 7757 } 7758 7759 err = md_set_array_info(mddev, &info); 7760 if (err) 7761 pr_warn("md: couldn't set array info. 
%d\n", err); 7762 7763 return err; 7764 } 7765 7766 static int md_ioctl(struct block_device *bdev, blk_mode_t mode, 7767 unsigned int cmd, unsigned long arg) 7768 { 7769 int err = 0; 7770 void __user *argp = (void __user *)arg; 7771 struct mddev *mddev = NULL; 7772 7773 err = md_ioctl_valid(cmd); 7774 if (err) 7775 return err; 7776 7777 /* 7778 * Commands dealing with the RAID driver but not any 7779 * particular array: 7780 */ 7781 if (cmd == RAID_VERSION) 7782 return get_version(argp); 7783 7784 /* 7785 * Commands creating/starting a new array: 7786 */ 7787 7788 mddev = bdev->bd_disk->private_data; 7789 7790 /* Some actions do not requires the mutex */ 7791 switch (cmd) { 7792 case GET_ARRAY_INFO: 7793 if (!mddev->raid_disks && !mddev->external) 7794 return -ENODEV; 7795 return get_array_info(mddev, argp); 7796 7797 case GET_DISK_INFO: 7798 if (!mddev->raid_disks && !mddev->external) 7799 return -ENODEV; 7800 return get_disk_info(mddev, argp); 7801 7802 case SET_DISK_FAULTY: 7803 return set_disk_faulty(mddev, new_decode_dev(arg)); 7804 7805 case GET_BITMAP_FILE: 7806 return get_bitmap_file(mddev, argp); 7807 } 7808 7809 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { 7810 /* Need to flush page cache, and ensure no-one else opens 7811 * and writes 7812 */ 7813 err = mddev_set_closing_and_sync_blockdev(mddev, 1); 7814 if (err) 7815 return err; 7816 } 7817 7818 if (!md_is_rdwr(mddev)) 7819 flush_work(&mddev->sync_work); 7820 7821 err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) : 7822 mddev_lock(mddev); 7823 if (err) { 7824 pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n", 7825 err, cmd); 7826 goto out; 7827 } 7828 7829 if (cmd == SET_ARRAY_INFO) { 7830 err = __md_set_array_info(mddev, argp); 7831 goto unlock; 7832 } 7833 7834 /* 7835 * Commands querying/configuring an existing array: 7836 */ 7837 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 7838 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 7839 if ((!mddev->raid_disks && !mddev->external) 7840 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 7841 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 7842 && cmd != GET_BITMAP_FILE) { 7843 err = -ENODEV; 7844 goto unlock; 7845 } 7846 7847 /* 7848 * Commands even a read-only array can execute: 7849 */ 7850 switch (cmd) { 7851 case RESTART_ARRAY_RW: 7852 err = restart_array(mddev); 7853 goto unlock; 7854 7855 case STOP_ARRAY: 7856 err = do_md_stop(mddev, 0); 7857 goto unlock; 7858 7859 case STOP_ARRAY_RO: 7860 if (mddev->pers) 7861 err = md_set_readonly(mddev); 7862 goto unlock; 7863 7864 case HOT_REMOVE_DISK: 7865 err = hot_remove_disk(mddev, new_decode_dev(arg)); 7866 goto unlock; 7867 7868 case ADD_NEW_DISK: 7869 /* We can support ADD_NEW_DISK on read-only arrays 7870 * only if we are re-adding a preexisting device. 7871 * So require mddev->pers and MD_DISK_SYNC. 7872 */ 7873 if (mddev->pers) { 7874 mdu_disk_info_t info; 7875 if (copy_from_user(&info, argp, sizeof(info))) 7876 err = -EFAULT; 7877 else if (!(info.state & (1<<MD_DISK_SYNC))) 7878 /* Need to clear read-only for this */ 7879 break; 7880 else 7881 err = md_add_new_disk(mddev, &info); 7882 goto unlock; 7883 } 7884 break; 7885 } 7886 7887 /* 7888 * The remaining ioctls are changing the state of the 7889 * superblock, so we do not allow them on read-only arrays. 
7890 */ 7891 if (!md_is_rdwr(mddev) && mddev->pers) { 7892 if (mddev->ro != MD_AUTO_READ) { 7893 err = -EROFS; 7894 goto unlock; 7895 } 7896 mddev->ro = MD_RDWR; 7897 sysfs_notify_dirent_safe(mddev->sysfs_state); 7898 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7899 /* mddev_unlock will wake thread */ 7900 /* If a device failed while we were read-only, we 7901 * need to make sure the metadata is updated now. 7902 */ 7903 if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) { 7904 mddev_unlock(mddev); 7905 wait_event(mddev->sb_wait, 7906 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) && 7907 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 7908 mddev_lock_nointr(mddev); 7909 } 7910 } 7911 7912 switch (cmd) { 7913 case ADD_NEW_DISK: 7914 { 7915 mdu_disk_info_t info; 7916 if (copy_from_user(&info, argp, sizeof(info))) 7917 err = -EFAULT; 7918 else 7919 err = md_add_new_disk(mddev, &info); 7920 goto unlock; 7921 } 7922 7923 case CLUSTERED_DISK_NACK: 7924 if (mddev_is_clustered(mddev)) 7925 mddev->cluster_ops->new_disk_ack(mddev, false); 7926 else 7927 err = -EINVAL; 7928 goto unlock; 7929 7930 case HOT_ADD_DISK: 7931 err = hot_add_disk(mddev, new_decode_dev(arg)); 7932 goto unlock; 7933 7934 case RUN_ARRAY: 7935 err = do_md_run(mddev); 7936 goto unlock; 7937 7938 case SET_BITMAP_FILE: 7939 err = set_bitmap_file(mddev, (int)arg); 7940 goto unlock; 7941 7942 default: 7943 err = -EINVAL; 7944 goto unlock; 7945 } 7946 7947 unlock: 7948 if (mddev->hold_active == UNTIL_IOCTL && 7949 err != -EINVAL) 7950 mddev->hold_active = 0; 7951 7952 md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) : 7953 mddev_unlock(mddev); 7954 7955 out: 7956 if (cmd == STOP_ARRAY_RO || (err && cmd == STOP_ARRAY)) 7957 clear_bit(MD_CLOSING, &mddev->flags); 7958 return err; 7959 } 7960 #ifdef CONFIG_COMPAT 7961 static int md_compat_ioctl(struct block_device *bdev, blk_mode_t mode, 7962 unsigned int cmd, unsigned long arg) 7963 { 7964 switch (cmd) { 7965 case HOT_REMOVE_DISK: 7966 case HOT_ADD_DISK: 7967 case SET_DISK_FAULTY: 7968 case SET_BITMAP_FILE: 7969 /* These take in integer arg, do not convert */ 7970 break; 7971 default: 7972 arg = (unsigned long)compat_ptr(arg); 7973 break; 7974 } 7975 7976 return md_ioctl(bdev, mode, cmd, arg); 7977 } 7978 #endif /* CONFIG_COMPAT */ 7979 7980 static int md_set_read_only(struct block_device *bdev, bool ro) 7981 { 7982 struct mddev *mddev = bdev->bd_disk->private_data; 7983 int err; 7984 7985 err = mddev_lock(mddev); 7986 if (err) 7987 return err; 7988 7989 if (!mddev->raid_disks && !mddev->external) { 7990 err = -ENODEV; 7991 goto out_unlock; 7992 } 7993 7994 /* 7995 * Transitioning to read-auto need only happen for arrays that call 7996 * md_write_start and which are not ready for writes yet. 
7997 */ 7998 if (!ro && mddev->ro == MD_RDONLY && mddev->pers) { 7999 err = restart_array(mddev); 8000 if (err) 8001 goto out_unlock; 8002 mddev->ro = MD_AUTO_READ; 8003 } 8004 8005 out_unlock: 8006 mddev_unlock(mddev); 8007 return err; 8008 } 8009 8010 static int md_open(struct gendisk *disk, blk_mode_t mode) 8011 { 8012 struct mddev *mddev; 8013 int err; 8014 8015 spin_lock(&all_mddevs_lock); 8016 mddev = mddev_get(disk->private_data); 8017 spin_unlock(&all_mddevs_lock); 8018 if (!mddev) 8019 return -ENODEV; 8020 8021 err = mutex_lock_interruptible(&mddev->open_mutex); 8022 if (err) 8023 goto out; 8024 8025 err = -ENODEV; 8026 if (test_bit(MD_CLOSING, &mddev->flags)) 8027 goto out_unlock; 8028 8029 atomic_inc(&mddev->openers); 8030 mutex_unlock(&mddev->open_mutex); 8031 8032 disk_check_media_change(disk); 8033 return 0; 8034 8035 out_unlock: 8036 mutex_unlock(&mddev->open_mutex); 8037 out: 8038 mddev_put(mddev); 8039 return err; 8040 } 8041 8042 static void md_release(struct gendisk *disk) 8043 { 8044 struct mddev *mddev = disk->private_data; 8045 8046 BUG_ON(!mddev); 8047 atomic_dec(&mddev->openers); 8048 mddev_put(mddev); 8049 } 8050 8051 static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing) 8052 { 8053 struct mddev *mddev = disk->private_data; 8054 unsigned int ret = 0; 8055 8056 if (mddev->changed) 8057 ret = DISK_EVENT_MEDIA_CHANGE; 8058 mddev->changed = 0; 8059 return ret; 8060 } 8061 8062 static void md_free_disk(struct gendisk *disk) 8063 { 8064 struct mddev *mddev = disk->private_data; 8065 8066 mddev_free(mddev); 8067 } 8068 8069 const struct block_device_operations md_fops = 8070 { 8071 .owner = THIS_MODULE, 8072 .submit_bio = md_submit_bio, 8073 .open = md_open, 8074 .release = md_release, 8075 .ioctl = md_ioctl, 8076 #ifdef CONFIG_COMPAT 8077 .compat_ioctl = md_compat_ioctl, 8078 #endif 8079 .getgeo = md_getgeo, 8080 .check_events = md_check_events, 8081 .set_read_only = md_set_read_only, 8082 .free_disk = md_free_disk, 8083 }; 8084 8085 static int md_thread(void *arg) 8086 { 8087 struct md_thread *thread = arg; 8088 8089 /* 8090 * md_thread is a 'system-thread', it's priority should be very 8091 * high. We avoid resource deadlocks individually in each 8092 * raid personality. (RAID5 does preallocation) We also use RR and 8093 * the very same RT priority as kswapd, thus we will never get 8094 * into a priority inversion deadlock. 8095 * 8096 * we definitely have to have equal or higher priority than 8097 * bdflush, otherwise bdflush will deadlock if there are too 8098 * many dirty RAID5 blocks. 8099 */ 8100 8101 allow_signal(SIGKILL); 8102 while (!kthread_should_stop()) { 8103 8104 /* We need to wait INTERRUPTIBLE so that 8105 * we don't add to the load-average. 
8106 * That means we need to be sure no signals are 8107 * pending 8108 */ 8109 if (signal_pending(current)) 8110 flush_signals(current); 8111 8112 wait_event_interruptible_timeout 8113 (thread->wqueue, 8114 test_bit(THREAD_WAKEUP, &thread->flags) 8115 || kthread_should_stop() || kthread_should_park(), 8116 thread->timeout); 8117 8118 clear_bit(THREAD_WAKEUP, &thread->flags); 8119 if (kthread_should_park()) 8120 kthread_parkme(); 8121 if (!kthread_should_stop()) 8122 thread->run(thread); 8123 } 8124 8125 return 0; 8126 } 8127 8128 static void md_wakeup_thread_directly(struct md_thread __rcu *thread) 8129 { 8130 struct md_thread *t; 8131 8132 rcu_read_lock(); 8133 t = rcu_dereference(thread); 8134 if (t) 8135 wake_up_process(t->tsk); 8136 rcu_read_unlock(); 8137 } 8138 8139 void md_wakeup_thread(struct md_thread __rcu *thread) 8140 { 8141 struct md_thread *t; 8142 8143 rcu_read_lock(); 8144 t = rcu_dereference(thread); 8145 if (t) { 8146 pr_debug("md: waking up MD thread %s.\n", t->tsk->comm); 8147 set_bit(THREAD_WAKEUP, &t->flags); 8148 if (wq_has_sleeper(&t->wqueue)) 8149 wake_up(&t->wqueue); 8150 } 8151 rcu_read_unlock(); 8152 } 8153 EXPORT_SYMBOL(md_wakeup_thread); 8154 8155 struct md_thread *md_register_thread(void (*run) (struct md_thread *), 8156 struct mddev *mddev, const char *name) 8157 { 8158 struct md_thread *thread; 8159 8160 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); 8161 if (!thread) 8162 return NULL; 8163 8164 init_waitqueue_head(&thread->wqueue); 8165 8166 thread->run = run; 8167 thread->mddev = mddev; 8168 thread->timeout = MAX_SCHEDULE_TIMEOUT; 8169 thread->tsk = kthread_run(md_thread, thread, 8170 "%s_%s", 8171 mdname(thread->mddev), 8172 name); 8173 if (IS_ERR(thread->tsk)) { 8174 kfree(thread); 8175 return NULL; 8176 } 8177 return thread; 8178 } 8179 EXPORT_SYMBOL(md_register_thread); 8180 8181 void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp) 8182 { 8183 struct md_thread *thread = rcu_dereference_protected(*threadp, 8184 lockdep_is_held(&mddev->reconfig_mutex)); 8185 8186 if (!thread) 8187 return; 8188 8189 rcu_assign_pointer(*threadp, NULL); 8190 synchronize_rcu(); 8191 8192 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 8193 kthread_stop(thread->tsk); 8194 kfree(thread); 8195 } 8196 EXPORT_SYMBOL(md_unregister_thread); 8197 8198 void md_error(struct mddev *mddev, struct md_rdev *rdev) 8199 { 8200 if (!rdev || test_bit(Faulty, &rdev->flags)) 8201 return; 8202 8203 if (!mddev->pers || !mddev->pers->error_handler) 8204 return; 8205 mddev->pers->error_handler(mddev, rdev); 8206 8207 if (mddev->pers->head.id == ID_RAID0 || 8208 mddev->pers->head.id == ID_LINEAR) 8209 return; 8210 8211 if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) 8212 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8213 sysfs_notify_dirent_safe(rdev->sysfs_state); 8214 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8215 if (!test_bit(MD_BROKEN, &mddev->flags)) { 8216 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8217 md_wakeup_thread(mddev->thread); 8218 } 8219 if (mddev->event_work.func) 8220 queue_work(md_misc_wq, &mddev->event_work); 8221 md_new_event(); 8222 } 8223 EXPORT_SYMBOL(md_error); 8224 8225 /* seq_file implementation /proc/mdstat */ 8226 8227 static void status_unused(struct seq_file *seq) 8228 { 8229 int i = 0; 8230 struct md_rdev *rdev; 8231 8232 seq_printf(seq, "unused devices: "); 8233 8234 list_for_each_entry(rdev, &pending_raid_disks, same_set) { 8235 i++; 8236 seq_printf(seq, "%pg ", rdev->bdev); 8237 } 
8238 if (!i) 8239 seq_printf(seq, "<none>"); 8240 8241 seq_printf(seq, "\n"); 8242 } 8243 8244 static void status_personalities(struct seq_file *seq) 8245 { 8246 struct md_submodule_head *head; 8247 unsigned long i; 8248 8249 seq_puts(seq, "Personalities : "); 8250 8251 xa_lock(&md_submodule); 8252 xa_for_each(&md_submodule, i, head) 8253 if (head->type == MD_PERSONALITY) 8254 seq_printf(seq, "[%s] ", head->name); 8255 xa_unlock(&md_submodule); 8256 8257 seq_puts(seq, "\n"); 8258 } 8259 8260 static int status_resync(struct seq_file *seq, struct mddev *mddev) 8261 { 8262 sector_t max_sectors, resync, res; 8263 unsigned long dt, db = 0; 8264 sector_t rt, curr_mark_cnt, resync_mark_cnt; 8265 int scale, recovery_active; 8266 unsigned int per_milli; 8267 8268 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 8269 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 8270 max_sectors = mddev->resync_max_sectors; 8271 else 8272 max_sectors = mddev->dev_sectors; 8273 8274 resync = mddev->curr_resync; 8275 if (resync < MD_RESYNC_ACTIVE) { 8276 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 8277 /* Still cleaning up */ 8278 resync = max_sectors; 8279 } else if (resync > max_sectors) { 8280 resync = max_sectors; 8281 } else { 8282 res = atomic_read(&mddev->recovery_active); 8283 /* 8284 * Resync has started, but the subtraction has overflowed or 8285 * yielded one of the special values. Force it to active to 8286 * ensure the status reports an active resync. 8287 */ 8288 if (resync < res || resync - res < MD_RESYNC_ACTIVE) 8289 resync = MD_RESYNC_ACTIVE; 8290 else 8291 resync -= res; 8292 } 8293 8294 if (resync == MD_RESYNC_NONE) { 8295 if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { 8296 struct md_rdev *rdev; 8297 8298 rdev_for_each(rdev, mddev) 8299 if (rdev->raid_disk >= 0 && 8300 !test_bit(Faulty, &rdev->flags) && 8301 rdev->recovery_offset != MaxSector && 8302 rdev->recovery_offset) { 8303 seq_printf(seq, "\trecover=REMOTE"); 8304 return 1; 8305 } 8306 if (mddev->reshape_position != MaxSector) 8307 seq_printf(seq, "\treshape=REMOTE"); 8308 else 8309 seq_printf(seq, "\tresync=REMOTE"); 8310 return 1; 8311 } 8312 if (mddev->resync_offset < MaxSector) { 8313 seq_printf(seq, "\tresync=PENDING"); 8314 return 1; 8315 } 8316 return 0; 8317 } 8318 if (resync < MD_RESYNC_ACTIVE) { 8319 seq_printf(seq, "\tresync=DELAYED"); 8320 return 1; 8321 } 8322 8323 WARN_ON(max_sectors == 0); 8324 /* Pick 'scale' such that (resync>>scale)*1000 will fit 8325 * in a sector_t, and (max_sectors>>scale) will fit in a 8326 * u32, as those are the requirements for sector_div. 8327 * Thus 'scale' must be at least 10 8328 */ 8329 scale = 10; 8330 if (sizeof(sector_t) > sizeof(unsigned long)) { 8331 while ( max_sectors/2 > (1ULL<<(scale+32))) 8332 scale++; 8333 } 8334 res = (resync>>scale)*1000; 8335 sector_div(res, (u32)((max_sectors>>scale)+1)); 8336 8337 per_milli = res; 8338 { 8339 int i, x = per_milli/50, y = 20-x; 8340 seq_printf(seq, "["); 8341 for (i = 0; i < x; i++) 8342 seq_printf(seq, "="); 8343 seq_printf(seq, ">"); 8344 for (i = 0; i < y; i++) 8345 seq_printf(seq, "."); 8346 seq_printf(seq, "] "); 8347 } 8348 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 8349 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 8350 "reshape" : 8351 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 8352 "check" : 8353 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 
8354 "resync" : "recovery"))), 8355 per_milli/10, per_milli % 10, 8356 (unsigned long long) resync/2, 8357 (unsigned long long) max_sectors/2); 8358 8359 /* 8360 * dt: time from mark until now 8361 * db: blocks written from mark until now 8362 * rt: remaining time 8363 * 8364 * rt is a sector_t, which is always 64bit now. We are keeping 8365 * the original algorithm, but it is not really necessary. 8366 * 8367 * Original algorithm: 8368 * So we divide before multiply in case it is 32bit and close 8369 * to the limit. 8370 * We scale the divisor (db) by 32 to avoid losing precision 8371 * near the end of resync when the number of remaining sectors 8372 * is close to 'db'. 8373 * We then divide rt by 32 after multiplying by db to compensate. 8374 * The '+1' avoids division by zero if db is very small. 8375 */ 8376 dt = ((jiffies - mddev->resync_mark) / HZ); 8377 if (!dt) dt++; 8378 8379 curr_mark_cnt = mddev->curr_mark_cnt; 8380 recovery_active = atomic_read(&mddev->recovery_active); 8381 resync_mark_cnt = mddev->resync_mark_cnt; 8382 8383 if (curr_mark_cnt >= (recovery_active + resync_mark_cnt)) 8384 db = curr_mark_cnt - (recovery_active + resync_mark_cnt); 8385 8386 rt = max_sectors - resync; /* number of remaining sectors */ 8387 rt = div64_u64(rt, db/32+1); 8388 rt *= dt; 8389 rt >>= 5; 8390 8391 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, 8392 ((unsigned long)rt % 60)/6); 8393 8394 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 8395 return 1; 8396 } 8397 8398 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 8399 __acquires(&all_mddevs_lock) 8400 { 8401 seq->poll_event = atomic_read(&md_event_count); 8402 spin_lock(&all_mddevs_lock); 8403 8404 return seq_list_start_head(&all_mddevs, *pos); 8405 } 8406 8407 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 8408 { 8409 return seq_list_next(v, &all_mddevs, pos); 8410 } 8411 8412 static void md_seq_stop(struct seq_file *seq, void *v) 8413 __releases(&all_mddevs_lock) 8414 { 8415 spin_unlock(&all_mddevs_lock); 8416 } 8417 8418 static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev) 8419 { 8420 struct md_bitmap_stats stats; 8421 unsigned long used_pages; 8422 unsigned long chunk_kb; 8423 int err; 8424 8425 err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 8426 if (err) 8427 return; 8428 8429 chunk_kb = mddev->bitmap_info.chunksize >> 10; 8430 used_pages = stats.pages - stats.missing_pages; 8431 8432 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], %lu%s chunk", 8433 used_pages, stats.pages, used_pages << (PAGE_SHIFT - 10), 8434 chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize, 8435 chunk_kb ? 
"KB" : "B"); 8436 8437 if (stats.file) { 8438 seq_puts(seq, ", file: "); 8439 seq_file_path(seq, stats.file, " \t\n"); 8440 } 8441 8442 seq_putc(seq, '\n'); 8443 } 8444 8445 static int md_seq_show(struct seq_file *seq, void *v) 8446 { 8447 struct mddev *mddev; 8448 sector_t sectors; 8449 struct md_rdev *rdev; 8450 8451 if (v == &all_mddevs) { 8452 status_personalities(seq); 8453 if (list_empty(&all_mddevs)) 8454 status_unused(seq); 8455 return 0; 8456 } 8457 8458 mddev = list_entry(v, struct mddev, all_mddevs); 8459 if (!mddev_get(mddev)) 8460 return 0; 8461 8462 spin_unlock(&all_mddevs_lock); 8463 8464 /* prevent bitmap to be freed after checking */ 8465 mutex_lock(&mddev->bitmap_info.mutex); 8466 8467 spin_lock(&mddev->lock); 8468 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 8469 seq_printf(seq, "%s : ", mdname(mddev)); 8470 if (mddev->pers) { 8471 if (test_bit(MD_BROKEN, &mddev->flags)) 8472 seq_printf(seq, "broken"); 8473 else 8474 seq_printf(seq, "active"); 8475 if (mddev->ro == MD_RDONLY) 8476 seq_printf(seq, " (read-only)"); 8477 if (mddev->ro == MD_AUTO_READ) 8478 seq_printf(seq, " (auto-read-only)"); 8479 seq_printf(seq, " %s", mddev->pers->head.name); 8480 } else { 8481 seq_printf(seq, "inactive"); 8482 } 8483 8484 sectors = 0; 8485 rcu_read_lock(); 8486 rdev_for_each_rcu(rdev, mddev) { 8487 seq_printf(seq, " %pg[%d]", rdev->bdev, rdev->desc_nr); 8488 8489 if (test_bit(WriteMostly, &rdev->flags)) 8490 seq_printf(seq, "(W)"); 8491 if (test_bit(Journal, &rdev->flags)) 8492 seq_printf(seq, "(J)"); 8493 if (test_bit(Faulty, &rdev->flags)) { 8494 seq_printf(seq, "(F)"); 8495 continue; 8496 } 8497 if (rdev->raid_disk < 0) 8498 seq_printf(seq, "(S)"); /* spare */ 8499 if (test_bit(Replacement, &rdev->flags)) 8500 seq_printf(seq, "(R)"); 8501 sectors += rdev->sectors; 8502 } 8503 rcu_read_unlock(); 8504 8505 if (!list_empty(&mddev->disks)) { 8506 if (mddev->pers) 8507 seq_printf(seq, "\n %llu blocks", 8508 (unsigned long long) 8509 mddev->array_sectors / 2); 8510 else 8511 seq_printf(seq, "\n %llu blocks", 8512 (unsigned long long)sectors / 2); 8513 } 8514 if (mddev->persistent) { 8515 if (mddev->major_version != 0 || 8516 mddev->minor_version != 90) { 8517 seq_printf(seq," super %d.%d", 8518 mddev->major_version, 8519 mddev->minor_version); 8520 } 8521 } else if (mddev->external) 8522 seq_printf(seq, " super external:%s", 8523 mddev->metadata_type); 8524 else 8525 seq_printf(seq, " super non-persistent"); 8526 8527 if (mddev->pers) { 8528 mddev->pers->status(seq, mddev); 8529 seq_printf(seq, "\n "); 8530 if (mddev->pers->sync_request) { 8531 if (status_resync(seq, mddev)) 8532 seq_printf(seq, "\n "); 8533 } 8534 } else 8535 seq_printf(seq, "\n "); 8536 8537 md_bitmap_status(seq, mddev); 8538 8539 seq_printf(seq, "\n"); 8540 } 8541 spin_unlock(&mddev->lock); 8542 mutex_unlock(&mddev->bitmap_info.mutex); 8543 spin_lock(&all_mddevs_lock); 8544 8545 if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs)) 8546 status_unused(seq); 8547 8548 mddev_put_locked(mddev); 8549 return 0; 8550 } 8551 8552 static const struct seq_operations md_seq_ops = { 8553 .start = md_seq_start, 8554 .next = md_seq_next, 8555 .stop = md_seq_stop, 8556 .show = md_seq_show, 8557 }; 8558 8559 static int md_seq_open(struct inode *inode, struct file *file) 8560 { 8561 struct seq_file *seq; 8562 int error; 8563 8564 error = seq_open(file, &md_seq_ops); 8565 if (error) 8566 return error; 8567 8568 seq = file->private_data; 8569 seq->poll_event = atomic_read(&md_event_count); 8570 
return error; 8571 } 8572 8573 static int md_unloading; 8574 static __poll_t mdstat_poll(struct file *filp, poll_table *wait) 8575 { 8576 struct seq_file *seq = filp->private_data; 8577 __poll_t mask; 8578 8579 if (md_unloading) 8580 return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; 8581 poll_wait(filp, &md_event_waiters, wait); 8582 8583 /* always allow read */ 8584 mask = EPOLLIN | EPOLLRDNORM; 8585 8586 if (seq->poll_event != atomic_read(&md_event_count)) 8587 mask |= EPOLLERR | EPOLLPRI; 8588 return mask; 8589 } 8590 8591 static const struct proc_ops mdstat_proc_ops = { 8592 .proc_open = md_seq_open, 8593 .proc_read = seq_read, 8594 .proc_lseek = seq_lseek, 8595 .proc_release = seq_release, 8596 .proc_poll = mdstat_poll, 8597 }; 8598 8599 int register_md_submodule(struct md_submodule_head *msh) 8600 { 8601 return xa_insert(&md_submodule, msh->id, msh, GFP_KERNEL); 8602 } 8603 EXPORT_SYMBOL_GPL(register_md_submodule); 8604 8605 void unregister_md_submodule(struct md_submodule_head *msh) 8606 { 8607 xa_erase(&md_submodule, msh->id); 8608 } 8609 EXPORT_SYMBOL_GPL(unregister_md_submodule); 8610 8611 int md_setup_cluster(struct mddev *mddev, int nodes) 8612 { 8613 int ret = get_cluster_ops(mddev); 8614 8615 if (ret) { 8616 request_module("md-cluster"); 8617 ret = get_cluster_ops(mddev); 8618 } 8619 8620 /* ensure module won't be unloaded */ 8621 if (ret) { 8622 pr_warn("can't find md-cluster module or get its reference.\n"); 8623 return ret; 8624 } 8625 8626 ret = mddev->cluster_ops->join(mddev, nodes); 8627 if (!ret) 8628 mddev->safemode_delay = 0; 8629 return ret; 8630 } 8631 8632 void md_cluster_stop(struct mddev *mddev) 8633 { 8634 put_cluster_ops(mddev); 8635 } 8636 8637 static bool is_rdev_holder_idle(struct md_rdev *rdev, bool init) 8638 { 8639 unsigned long last_events = rdev->last_events; 8640 8641 if (!bdev_is_partition(rdev->bdev)) 8642 return true; 8643 8644 /* 8645 * If rdev is partition, and user doesn't issue IO to the array, the 8646 * array is still not idle if user issues IO to other partitions. 8647 */ 8648 rdev->last_events = part_stat_read_accum(rdev->bdev->bd_disk->part0, 8649 sectors) - 8650 part_stat_read_accum(rdev->bdev, sectors); 8651 8652 return init || rdev->last_events <= last_events; 8653 } 8654 8655 /* 8656 * mddev is idle if following conditions are matched since last check: 8657 * 1) mddev doesn't have normal IO completed; 8658 * 2) mddev doesn't have inflight normal IO; 8659 * 3) if any member disk is partition, and other partitions don't have IO 8660 * completed; 8661 * 8662 * Noted this checking rely on IO accounting is enabled. 8663 */ 8664 static bool is_mddev_idle(struct mddev *mddev, int init) 8665 { 8666 unsigned long last_events = mddev->normal_io_events; 8667 struct gendisk *disk; 8668 struct md_rdev *rdev; 8669 bool idle = true; 8670 8671 disk = mddev_is_dm(mddev) ? 
mddev->dm_gendisk : mddev->gendisk; 8672 if (!disk) 8673 return true; 8674 8675 mddev->normal_io_events = part_stat_read_accum(disk->part0, sectors); 8676 if (!init && (mddev->normal_io_events > last_events || 8677 bdev_count_inflight(disk->part0))) 8678 idle = false; 8679 8680 rcu_read_lock(); 8681 rdev_for_each_rcu(rdev, mddev) 8682 if (!is_rdev_holder_idle(rdev, init)) 8683 idle = false; 8684 rcu_read_unlock(); 8685 8686 return idle; 8687 } 8688 8689 void md_done_sync(struct mddev *mddev, int blocks, int ok) 8690 { 8691 /* another "blocks" (512byte) blocks have been synced */ 8692 atomic_sub(blocks, &mddev->recovery_active); 8693 wake_up(&mddev->recovery_wait); 8694 if (!ok) { 8695 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8696 set_bit(MD_RECOVERY_ERROR, &mddev->recovery); 8697 md_wakeup_thread(mddev->thread); 8698 // stop recovery, signal do_sync .... 8699 } 8700 } 8701 EXPORT_SYMBOL(md_done_sync); 8702 8703 /* md_write_start(mddev, bi) 8704 * If we need to update some array metadata (e.g. 'active' flag 8705 * in superblock) before writing, schedule a superblock update 8706 * and wait for it to complete. 8707 * A return value of 'false' means that the write wasn't recorded 8708 * and cannot proceed as the array is being suspend. 8709 */ 8710 void md_write_start(struct mddev *mddev, struct bio *bi) 8711 { 8712 int did_change = 0; 8713 8714 if (bio_data_dir(bi) != WRITE) 8715 return; 8716 8717 BUG_ON(mddev->ro == MD_RDONLY); 8718 if (mddev->ro == MD_AUTO_READ) { 8719 /* need to switch to read/write */ 8720 mddev->ro = MD_RDWR; 8721 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8722 md_wakeup_thread(mddev->thread); 8723 md_wakeup_thread(mddev->sync_thread); 8724 did_change = 1; 8725 } 8726 rcu_read_lock(); 8727 percpu_ref_get(&mddev->writes_pending); 8728 smp_mb(); /* Match smp_mb in set_in_sync() */ 8729 if (mddev->safemode == 1) 8730 mddev->safemode = 0; 8731 /* sync_checkers is always 0 when writes_pending is in per-cpu mode */ 8732 if (mddev->in_sync || mddev->sync_checkers) { 8733 spin_lock(&mddev->lock); 8734 if (mddev->in_sync) { 8735 mddev->in_sync = 0; 8736 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 8737 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 8738 md_wakeup_thread(mddev->thread); 8739 did_change = 1; 8740 } 8741 spin_unlock(&mddev->lock); 8742 } 8743 rcu_read_unlock(); 8744 if (did_change) 8745 sysfs_notify_dirent_safe(mddev->sysfs_state); 8746 if (!mddev->has_superblocks) 8747 return; 8748 wait_event(mddev->sb_wait, 8749 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 8750 } 8751 EXPORT_SYMBOL(md_write_start); 8752 8753 /* md_write_inc can only be called when md_write_start() has 8754 * already been called at least once of the current request. 8755 * It increments the counter and is useful when a single request 8756 * is split into several parts. Each part causes an increment and 8757 * so needs a matching md_write_end(). 8758 * Unlike md_write_start(), it is safe to call md_write_inc() inside 8759 * a spinlocked region. 
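 *
 * An illustrative pattern (not taken from any particular personality):
 *
 *	md_write_start(mddev, bio);	// once per incoming write bio, may sleep
 *	md_write_inc(mddev, split);	// one extra ref per additional split,
 *					// safe under a spinlock
 *	md_write_end(mddev);		// once for every start/inc above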
8760 */ 8761 void md_write_inc(struct mddev *mddev, struct bio *bi) 8762 { 8763 if (bio_data_dir(bi) != WRITE) 8764 return; 8765 WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev)); 8766 percpu_ref_get(&mddev->writes_pending); 8767 } 8768 EXPORT_SYMBOL(md_write_inc); 8769 8770 void md_write_end(struct mddev *mddev) 8771 { 8772 percpu_ref_put(&mddev->writes_pending); 8773 8774 if (mddev->safemode == 2) 8775 md_wakeup_thread(mddev->thread); 8776 else if (mddev->safemode_delay) 8777 /* The roundup() ensures this only performs locking once 8778 * every ->safemode_delay jiffies 8779 */ 8780 mod_timer(&mddev->safemode_timer, 8781 roundup(jiffies, mddev->safemode_delay) + 8782 mddev->safemode_delay); 8783 } 8784 8785 EXPORT_SYMBOL(md_write_end); 8786 8787 /* This is used by raid0 and raid10 */ 8788 void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, 8789 struct bio *bio, sector_t start, sector_t size) 8790 { 8791 struct bio *discard_bio = NULL; 8792 8793 if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO, 8794 &discard_bio) || !discard_bio) 8795 return; 8796 8797 bio_chain(discard_bio, bio); 8798 bio_clone_blkg_association(discard_bio, bio); 8799 mddev_trace_remap(mddev, discard_bio, bio->bi_iter.bi_sector); 8800 submit_bio_noacct(discard_bio); 8801 } 8802 EXPORT_SYMBOL_GPL(md_submit_discard_bio); 8803 8804 static void md_bitmap_start(struct mddev *mddev, 8805 struct md_io_clone *md_io_clone) 8806 { 8807 if (mddev->pers->bitmap_sector) 8808 mddev->pers->bitmap_sector(mddev, &md_io_clone->offset, 8809 &md_io_clone->sectors); 8810 8811 mddev->bitmap_ops->start_write(mddev, md_io_clone->offset, 8812 md_io_clone->sectors); 8813 } 8814 8815 static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone) 8816 { 8817 mddev->bitmap_ops->end_write(mddev, md_io_clone->offset, 8818 md_io_clone->sectors); 8819 } 8820 8821 static void md_end_clone_io(struct bio *bio) 8822 { 8823 struct md_io_clone *md_io_clone = bio->bi_private; 8824 struct bio *orig_bio = md_io_clone->orig_bio; 8825 struct mddev *mddev = md_io_clone->mddev; 8826 8827 if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap) 8828 md_bitmap_end(mddev, md_io_clone); 8829 8830 if (bio->bi_status && !orig_bio->bi_status) 8831 orig_bio->bi_status = bio->bi_status; 8832 8833 if (md_io_clone->start_time) 8834 bio_end_io_acct(orig_bio, md_io_clone->start_time); 8835 8836 bio_put(bio); 8837 bio_endio(orig_bio); 8838 percpu_ref_put(&mddev->active_io); 8839 } 8840 8841 static void md_clone_bio(struct mddev *mddev, struct bio **bio) 8842 { 8843 struct block_device *bdev = (*bio)->bi_bdev; 8844 struct md_io_clone *md_io_clone; 8845 struct bio *clone = 8846 bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set); 8847 8848 md_io_clone = container_of(clone, struct md_io_clone, bio_clone); 8849 md_io_clone->orig_bio = *bio; 8850 md_io_clone->mddev = mddev; 8851 if (blk_queue_io_stat(bdev->bd_disk->queue)) 8852 md_io_clone->start_time = bio_start_io_acct(*bio); 8853 8854 if (bio_data_dir(*bio) == WRITE && mddev->bitmap) { 8855 md_io_clone->offset = (*bio)->bi_iter.bi_sector; 8856 md_io_clone->sectors = bio_sectors(*bio); 8857 md_bitmap_start(mddev, md_io_clone); 8858 } 8859 8860 clone->bi_end_io = md_end_clone_io; 8861 clone->bi_private = md_io_clone; 8862 *bio = clone; 8863 } 8864 8865 void md_account_bio(struct mddev *mddev, struct bio **bio) 8866 { 8867 percpu_ref_get(&mddev->active_io); 8868 md_clone_bio(mddev, bio); 8869 } 8870 EXPORT_SYMBOL_GPL(md_account_bio); 8871 8872 void md_free_cloned_bio(struct bio *bio) 
8873 { 8874 struct md_io_clone *md_io_clone = bio->bi_private; 8875 struct bio *orig_bio = md_io_clone->orig_bio; 8876 struct mddev *mddev = md_io_clone->mddev; 8877 8878 if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap) 8879 md_bitmap_end(mddev, md_io_clone); 8880 8881 if (bio->bi_status && !orig_bio->bi_status) 8882 orig_bio->bi_status = bio->bi_status; 8883 8884 if (md_io_clone->start_time) 8885 bio_end_io_acct(orig_bio, md_io_clone->start_time); 8886 8887 bio_put(bio); 8888 percpu_ref_put(&mddev->active_io); 8889 } 8890 EXPORT_SYMBOL_GPL(md_free_cloned_bio); 8891 8892 /* md_allow_write(mddev) 8893 * Calling this ensures that the array is marked 'active' so that writes 8894 * may proceed without blocking. It is important to call this before 8895 * attempting a GFP_KERNEL allocation while holding the mddev lock. 8896 * Must be called with mddev_lock held. 8897 */ 8898 void md_allow_write(struct mddev *mddev) 8899 { 8900 if (!mddev->pers) 8901 return; 8902 if (!md_is_rdwr(mddev)) 8903 return; 8904 if (!mddev->pers->sync_request) 8905 return; 8906 8907 spin_lock(&mddev->lock); 8908 if (mddev->in_sync) { 8909 mddev->in_sync = 0; 8910 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 8911 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 8912 if (mddev->safemode_delay && 8913 mddev->safemode == 0) 8914 mddev->safemode = 1; 8915 spin_unlock(&mddev->lock); 8916 md_update_sb(mddev, 0); 8917 sysfs_notify_dirent_safe(mddev->sysfs_state); 8918 /* wait for the dirty state to be recorded in the metadata */ 8919 wait_event(mddev->sb_wait, 8920 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 8921 } else 8922 spin_unlock(&mddev->lock); 8923 } 8924 EXPORT_SYMBOL_GPL(md_allow_write); 8925 8926 static sector_t md_sync_max_sectors(struct mddev *mddev, 8927 enum sync_action action) 8928 { 8929 switch (action) { 8930 case ACTION_RESYNC: 8931 case ACTION_CHECK: 8932 case ACTION_REPAIR: 8933 atomic64_set(&mddev->resync_mismatches, 0); 8934 fallthrough; 8935 case ACTION_RESHAPE: 8936 return mddev->resync_max_sectors; 8937 case ACTION_RECOVER: 8938 return mddev->dev_sectors; 8939 default: 8940 return 0; 8941 } 8942 } 8943 8944 static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) 8945 { 8946 sector_t start = 0; 8947 struct md_rdev *rdev; 8948 8949 switch (action) { 8950 case ACTION_CHECK: 8951 case ACTION_REPAIR: 8952 return mddev->resync_min; 8953 case ACTION_RESYNC: 8954 if (!mddev->bitmap) 8955 return mddev->resync_offset; 8956 return 0; 8957 case ACTION_RESHAPE: 8958 /* 8959 * If the original node aborts reshaping then we continue the 8960 * reshaping, so set again to avoid restart reshape from the 8961 * first beginning 8962 */ 8963 if (mddev_is_clustered(mddev) && 8964 mddev->reshape_position != MaxSector) 8965 return mddev->reshape_position; 8966 return 0; 8967 case ACTION_RECOVER: 8968 start = MaxSector; 8969 rcu_read_lock(); 8970 rdev_for_each_rcu(rdev, mddev) 8971 if (rdev->raid_disk >= 0 && 8972 !test_bit(Journal, &rdev->flags) && 8973 !test_bit(Faulty, &rdev->flags) && 8974 !test_bit(In_sync, &rdev->flags) && 8975 rdev->recovery_offset < start) 8976 start = rdev->recovery_offset; 8977 rcu_read_unlock(); 8978 8979 /* If there is a bitmap, we need to make sure all 8980 * writes that started before we added a spare 8981 * complete before we start doing a recovery. 8982 * Otherwise the write might complete and (via 8983 * bitmap_endwrite) set a bit in the bitmap after the 8984 * recovery has checked that bit and skipped that 8985 * region. 
8986 */ 8987 if (mddev->bitmap) { 8988 mddev->pers->quiesce(mddev, 1); 8989 mddev->pers->quiesce(mddev, 0); 8990 } 8991 return start; 8992 default: 8993 return MaxSector; 8994 } 8995 } 8996 8997 static bool sync_io_within_limit(struct mddev *mddev) 8998 { 8999 int io_sectors; 9000 9001 /* 9002 * For raid456, sync IO is stripe(4k) per IO, for other levels, it's 9003 * RESYNC_PAGES(64k) per IO. 9004 */ 9005 if (mddev->level == 4 || mddev->level == 5 || mddev->level == 6) 9006 io_sectors = 8; 9007 else 9008 io_sectors = 128; 9009 9010 return atomic_read(&mddev->recovery_active) < 9011 io_sectors * sync_io_depth(mddev); 9012 } 9013 9014 #define SYNC_MARKS 10 9015 #define SYNC_MARK_STEP (3*HZ) 9016 #define UPDATE_FREQUENCY (5*60*HZ) 9017 void md_do_sync(struct md_thread *thread) 9018 { 9019 struct mddev *mddev = thread->mddev; 9020 struct mddev *mddev2; 9021 unsigned int currspeed = 0, window; 9022 sector_t max_sectors,j, io_sectors, recovery_done; 9023 unsigned long mark[SYNC_MARKS]; 9024 unsigned long update_time; 9025 sector_t mark_cnt[SYNC_MARKS]; 9026 int last_mark,m; 9027 sector_t last_check; 9028 int skipped = 0; 9029 struct md_rdev *rdev; 9030 enum sync_action action; 9031 const char *desc; 9032 struct blk_plug plug; 9033 int ret; 9034 9035 /* just incase thread restarts... */ 9036 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 9037 return; 9038 9039 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9040 goto skip; 9041 9042 if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) || 9043 !md_is_rdwr(mddev)) {/* never try to sync a read-only array */ 9044 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9045 goto skip; 9046 } 9047 9048 if (mddev_is_clustered(mddev)) { 9049 ret = mddev->cluster_ops->resync_start(mddev); 9050 if (ret) 9051 goto skip; 9052 9053 set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags); 9054 if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 9055 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || 9056 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) 9057 && ((unsigned long long)mddev->curr_resync_completed 9058 < (unsigned long long)mddev->resync_max_sectors)) 9059 goto skip; 9060 } 9061 9062 action = md_sync_action(mddev); 9063 desc = md_sync_action_name(action); 9064 mddev->last_sync_action = action; 9065 9066 /* 9067 * Before starting a resync we must have set curr_resync to 9068 * 2, and then checked that every "conflicting" array has curr_resync 9069 * less than ours. When we find one that is the same or higher 9070 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 9071 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 9072 * This will mean we have to start checking from the beginning again. 
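 * In the code below the value 2 is spelled MD_RESYNC_DELAYED and the
 * yielded value 1 is MD_RESYNC_YIELDED.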
9073 * 9074 */ 9075 if (mddev_is_clustered(mddev)) 9076 mddev->cluster_ops->resync_start_notify(mddev); 9077 do { 9078 int mddev2_minor = -1; 9079 mddev->curr_resync = MD_RESYNC_DELAYED; 9080 9081 try_again: 9082 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9083 goto skip; 9084 spin_lock(&all_mddevs_lock); 9085 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) { 9086 if (test_bit(MD_DELETED, &mddev2->flags)) 9087 continue; 9088 if (mddev2 == mddev) 9089 continue; 9090 if (!mddev->parallel_resync 9091 && mddev2->curr_resync 9092 && match_mddev_units(mddev, mddev2)) { 9093 DEFINE_WAIT(wq); 9094 if (mddev < mddev2 && 9095 mddev->curr_resync == MD_RESYNC_DELAYED) { 9096 /* arbitrarily yield */ 9097 mddev->curr_resync = MD_RESYNC_YIELDED; 9098 wake_up(&resync_wait); 9099 } 9100 if (mddev > mddev2 && 9101 mddev->curr_resync == MD_RESYNC_YIELDED) 9102 /* no need to wait here, we can wait the next 9103 * time 'round when curr_resync == 2 9104 */ 9105 continue; 9106 /* We need to wait 'interruptible' so as not to 9107 * contribute to the load average, and not to 9108 * be caught by 'softlockup' 9109 */ 9110 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 9111 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9112 mddev2->curr_resync >= mddev->curr_resync) { 9113 if (mddev2_minor != mddev2->md_minor) { 9114 mddev2_minor = mddev2->md_minor; 9115 pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n", 9116 desc, mdname(mddev), 9117 mdname(mddev2)); 9118 } 9119 spin_unlock(&all_mddevs_lock); 9120 9121 if (signal_pending(current)) 9122 flush_signals(current); 9123 schedule(); 9124 finish_wait(&resync_wait, &wq); 9125 goto try_again; 9126 } 9127 finish_wait(&resync_wait, &wq); 9128 } 9129 } 9130 spin_unlock(&all_mddevs_lock); 9131 } while (mddev->curr_resync < MD_RESYNC_DELAYED); 9132 9133 max_sectors = md_sync_max_sectors(mddev, action); 9134 j = md_sync_position(mddev, action); 9135 9136 pr_info("md: %s of RAID array %s\n", desc, mdname(mddev)); 9137 pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev)); 9138 pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n", 9139 speed_max(mddev), desc); 9140 9141 is_mddev_idle(mddev, 1); /* this initializes IO event counters */ 9142 9143 io_sectors = 0; 9144 for (m = 0; m < SYNC_MARKS; m++) { 9145 mark[m] = jiffies; 9146 mark_cnt[m] = io_sectors; 9147 } 9148 last_mark = 0; 9149 mddev->resync_mark = mark[last_mark]; 9150 mddev->resync_mark_cnt = mark_cnt[last_mark]; 9151 9152 /* 9153 * Tune reconstruction: 9154 */ 9155 window = 32 * (PAGE_SIZE / 512); 9156 pr_debug("md: using %dk window, over a total of %lluk.\n", 9157 window/2, (unsigned long long)max_sectors/2); 9158 9159 atomic_set(&mddev->recovery_active, 0); 9160 last_check = 0; 9161 9162 if (j >= MD_RESYNC_ACTIVE) { 9163 pr_debug("md: resuming %s of %s from checkpoint.\n", 9164 desc, mdname(mddev)); 9165 mddev->curr_resync = j; 9166 } else 9167 mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */ 9168 mddev->curr_resync_completed = j; 9169 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9170 md_new_event(); 9171 update_time = jiffies; 9172 9173 blk_start_plug(&plug); 9174 while (j < max_sectors) { 9175 sector_t sectors; 9176 9177 skipped = 0; 9178 9179 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9180 ((mddev->curr_resync > mddev->curr_resync_completed && 9181 (mddev->curr_resync - mddev->curr_resync_completed) 9182 > (max_sectors >> 4)) || 9183 
time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || 9184 (j - mddev->curr_resync_completed)*2 9185 >= mddev->resync_max - mddev->curr_resync_completed || 9186 mddev->curr_resync_completed > mddev->resync_max 9187 )) { 9188 /* time to update curr_resync_completed */ 9189 wait_event(mddev->recovery_wait, 9190 atomic_read(&mddev->recovery_active) == 0); 9191 mddev->curr_resync_completed = j; 9192 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 9193 j > mddev->resync_offset) 9194 mddev->resync_offset = j; 9195 update_time = jiffies; 9196 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 9197 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9198 } 9199 9200 while (j >= mddev->resync_max && 9201 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9202 /* As this condition is controlled by user-space, 9203 * we can block indefinitely, so use '_interruptible' 9204 * to avoid triggering warnings. 9205 */ 9206 flush_signals(current); /* just in case */ 9207 wait_event_interruptible(mddev->recovery_wait, 9208 mddev->resync_max > j 9209 || test_bit(MD_RECOVERY_INTR, 9210 &mddev->recovery)); 9211 } 9212 9213 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9214 break; 9215 9216 sectors = mddev->pers->sync_request(mddev, j, max_sectors, 9217 &skipped); 9218 if (sectors == 0) { 9219 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9220 break; 9221 } 9222 9223 if (!skipped) { /* actual IO requested */ 9224 io_sectors += sectors; 9225 atomic_add(sectors, &mddev->recovery_active); 9226 } 9227 9228 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9229 break; 9230 9231 j += sectors; 9232 if (j > max_sectors) 9233 /* when skipping, extra large numbers can be returned. */ 9234 j = max_sectors; 9235 if (j >= MD_RESYNC_ACTIVE) 9236 mddev->curr_resync = j; 9237 mddev->curr_mark_cnt = io_sectors; 9238 if (last_check == 0) 9239 /* this is the earliest that rebuild will be 9240 * visible in /proc/mdstat 9241 */ 9242 md_new_event(); 9243 9244 if (last_check + window > io_sectors || j == max_sectors) 9245 continue; 9246 9247 last_check = io_sectors; 9248 repeat: 9249 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 9250 /* step marks */ 9251 int next = (last_mark+1) % SYNC_MARKS; 9252 9253 mddev->resync_mark = mark[next]; 9254 mddev->resync_mark_cnt = mark_cnt[next]; 9255 mark[next] = jiffies; 9256 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 9257 last_mark = next; 9258 } 9259 9260 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9261 break; 9262 9263 /* 9264 * this loop exits only if either when we are slower than 9265 * the 'hard' speed limit, or the system was IO-idle for 9266 * a jiffy. 9267 * the system might be non-idle CPU-wise, but we only care 9268 * about not overloading the IO subsystem. (things like an 9269 * e2fsck being done on the RAID array should execute fast) 9270 */ 9271 cond_resched(); 9272 9273 recovery_done = io_sectors - atomic_read(&mddev->recovery_active); 9274 currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2 9275 /((jiffies-mddev->resync_mark)/HZ +1) +1; 9276 9277 if (currspeed > speed_min(mddev)) { 9278 if (currspeed > speed_max(mddev)) { 9279 msleep(500); 9280 goto repeat; 9281 } 9282 if (!sync_io_within_limit(mddev) && 9283 !is_mddev_idle(mddev, 0)) { 9284 /* 9285 * Give other IO more of a chance. 9286 * The faster the devices, the less we wait. 
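 * Concretely, block until all sync IO currently in flight has
 * completed (recovery_active drops back to zero).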
9287 */ 9288 wait_event(mddev->recovery_wait, 9289 !atomic_read(&mddev->recovery_active)); 9290 } 9291 } 9292 } 9293 pr_info("md: %s: %s %s.\n",mdname(mddev), desc, 9294 test_bit(MD_RECOVERY_INTR, &mddev->recovery) 9295 ? "interrupted" : "done"); 9296 /* 9297 * this also signals 'finished resyncing' to md_stop 9298 */ 9299 blk_finish_plug(&plug); 9300 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 9301 9302 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9303 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9304 mddev->curr_resync >= MD_RESYNC_ACTIVE) { 9305 mddev->curr_resync_completed = mddev->curr_resync; 9306 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9307 } 9308 mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped); 9309 9310 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 9311 mddev->curr_resync > MD_RESYNC_ACTIVE) { 9312 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 9313 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9314 if (mddev->curr_resync >= mddev->resync_offset) { 9315 pr_debug("md: checkpointing %s of %s.\n", 9316 desc, mdname(mddev)); 9317 if (test_bit(MD_RECOVERY_ERROR, 9318 &mddev->recovery)) 9319 mddev->resync_offset = 9320 mddev->curr_resync_completed; 9321 else 9322 mddev->resync_offset = 9323 mddev->curr_resync; 9324 } 9325 } else 9326 mddev->resync_offset = MaxSector; 9327 } else { 9328 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9329 mddev->curr_resync = MaxSector; 9330 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9331 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { 9332 rcu_read_lock(); 9333 rdev_for_each_rcu(rdev, mddev) 9334 if (rdev->raid_disk >= 0 && 9335 mddev->delta_disks >= 0 && 9336 !test_bit(Journal, &rdev->flags) && 9337 !test_bit(Faulty, &rdev->flags) && 9338 !test_bit(In_sync, &rdev->flags) && 9339 rdev->recovery_offset < mddev->curr_resync) 9340 rdev->recovery_offset = mddev->curr_resync; 9341 rcu_read_unlock(); 9342 } 9343 } 9344 } 9345 skip: 9346 /* set CHANGE_PENDING here since maybe another update is needed, 9347 * so other nodes are informed. It should be harmless for normal 9348 * raid */ 9349 set_mask_bits(&mddev->sb_flags, 0, 9350 BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); 9351 9352 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9353 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9354 mddev->delta_disks > 0 && 9355 mddev->pers->finish_reshape && 9356 mddev->pers->size && 9357 !mddev_is_dm(mddev)) { 9358 mddev_lock_nointr(mddev); 9359 md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); 9360 mddev_unlock(mddev); 9361 if (!mddev_is_clustered(mddev)) 9362 set_capacity_and_notify(mddev->gendisk, 9363 mddev->array_sectors); 9364 } 9365 9366 spin_lock(&mddev->lock); 9367 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9368 /* We completed so min/max setting can be forgotten if used. */ 9369 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 9370 mddev->resync_min = 0; 9371 mddev->resync_max = MaxSector; 9372 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 9373 mddev->resync_min = mddev->curr_resync_completed; 9374 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 9375 mddev->curr_resync = MD_RESYNC_NONE; 9376 spin_unlock(&mddev->lock); 9377 9378 wake_up(&resync_wait); 9379 md_wakeup_thread(mddev->thread); 9380 return; 9381 } 9382 EXPORT_SYMBOL_GPL(md_do_sync); 9383 9384 static bool rdev_removeable(struct md_rdev *rdev) 9385 { 9386 /* rdev is not used. 
*/ 9387 if (rdev->raid_disk < 0) 9388 return false; 9389 9390 /* There are still inflight io, don't remove this rdev. */ 9391 if (atomic_read(&rdev->nr_pending)) 9392 return false; 9393 9394 /* 9395 * An error occurred but has not yet been acknowledged by the metadata 9396 * handler, don't remove this rdev. 9397 */ 9398 if (test_bit(Blocked, &rdev->flags)) 9399 return false; 9400 9401 /* Fautly rdev is not used, it's safe to remove it. */ 9402 if (test_bit(Faulty, &rdev->flags)) 9403 return true; 9404 9405 /* Journal disk can only be removed if it's faulty. */ 9406 if (test_bit(Journal, &rdev->flags)) 9407 return false; 9408 9409 /* 9410 * 'In_sync' is cleared while 'raid_disk' is valid, which means 9411 * replacement has just become active from pers->spare_active(), and 9412 * then pers->hot_remove_disk() will replace this rdev with replacement. 9413 */ 9414 if (!test_bit(In_sync, &rdev->flags)) 9415 return true; 9416 9417 return false; 9418 } 9419 9420 static bool rdev_is_spare(struct md_rdev *rdev) 9421 { 9422 return !test_bit(Candidate, &rdev->flags) && rdev->raid_disk >= 0 && 9423 !test_bit(In_sync, &rdev->flags) && 9424 !test_bit(Journal, &rdev->flags) && 9425 !test_bit(Faulty, &rdev->flags); 9426 } 9427 9428 static bool rdev_addable(struct md_rdev *rdev) 9429 { 9430 struct mddev *mddev; 9431 9432 mddev = READ_ONCE(rdev->mddev); 9433 if (!mddev) 9434 return false; 9435 9436 /* rdev is already used, don't add it again. */ 9437 if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 || 9438 test_bit(Faulty, &rdev->flags)) 9439 return false; 9440 9441 /* Allow to add journal disk. */ 9442 if (test_bit(Journal, &rdev->flags)) 9443 return true; 9444 9445 /* Allow to add if array is read-write. */ 9446 if (md_is_rdwr(mddev)) 9447 return true; 9448 9449 /* 9450 * For read-only array, only allow to readd a rdev. And if bitmap is 9451 * used, don't allow to readd a rdev that is too old. 
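 * A re-add candidate still carries a valid saved_raid_disk; Bitmap_sync
 * marks a device that would first need a bitmap-based recovery, which a
 * read-only array cannot run.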
9452 */ 9453 if (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags)) 9454 return true; 9455 9456 return false; 9457 } 9458 9459 static bool md_spares_need_change(struct mddev *mddev) 9460 { 9461 struct md_rdev *rdev; 9462 9463 rcu_read_lock(); 9464 rdev_for_each_rcu(rdev, mddev) { 9465 if (rdev_removeable(rdev) || rdev_addable(rdev)) { 9466 rcu_read_unlock(); 9467 return true; 9468 } 9469 } 9470 rcu_read_unlock(); 9471 return false; 9472 } 9473 9474 static int remove_spares(struct mddev *mddev, struct md_rdev *this) 9475 { 9476 struct md_rdev *rdev; 9477 int removed = 0; 9478 9479 rdev_for_each(rdev, mddev) { 9480 if ((this == NULL || rdev == this) && rdev_removeable(rdev) && 9481 !mddev->pers->hot_remove_disk(mddev, rdev)) { 9482 sysfs_unlink_rdev(mddev, rdev); 9483 rdev->saved_raid_disk = rdev->raid_disk; 9484 rdev->raid_disk = -1; 9485 removed++; 9486 } 9487 } 9488 9489 if (removed && mddev->kobj.sd) 9490 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 9491 9492 return removed; 9493 } 9494 9495 static int remove_and_add_spares(struct mddev *mddev, 9496 struct md_rdev *this) 9497 { 9498 struct md_rdev *rdev; 9499 int spares = 0; 9500 int removed = 0; 9501 9502 if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 9503 /* Mustn't remove devices when resync thread is running */ 9504 return 0; 9505 9506 removed = remove_spares(mddev, this); 9507 if (this && removed) 9508 goto no_add; 9509 9510 rdev_for_each(rdev, mddev) { 9511 if (this && this != rdev) 9512 continue; 9513 if (rdev_is_spare(rdev)) 9514 spares++; 9515 if (!rdev_addable(rdev)) 9516 continue; 9517 if (!test_bit(Journal, &rdev->flags)) 9518 rdev->recovery_offset = 0; 9519 if (mddev->pers->hot_add_disk(mddev, rdev) == 0) { 9520 /* failure here is OK */ 9521 sysfs_link_rdev(mddev, rdev); 9522 if (!test_bit(Journal, &rdev->flags)) 9523 spares++; 9524 md_new_event(); 9525 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9526 } 9527 } 9528 no_add: 9529 if (removed) 9530 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9531 return spares; 9532 } 9533 9534 static bool md_choose_sync_action(struct mddev *mddev, int *spares) 9535 { 9536 /* Check if reshape is in progress first. */ 9537 if (mddev->reshape_position != MaxSector) { 9538 if (mddev->pers->check_reshape == NULL || 9539 mddev->pers->check_reshape(mddev) != 0) 9540 return false; 9541 9542 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 9543 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9544 return true; 9545 } 9546 9547 /* Check if resync is in progress. */ 9548 if (mddev->resync_offset < MaxSector) { 9549 remove_spares(mddev, NULL); 9550 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9551 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9552 return true; 9553 } 9554 9555 /* 9556 * Remove any failed drives, then add spares if possible. Spares are 9557 * also removed and re-added, to allow the personality to fail the 9558 * re-add. 9559 */ 9560 *spares = remove_and_add_spares(mddev, NULL); 9561 if (*spares) { 9562 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9563 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 9564 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 9565 9566 /* Start new recovery. */ 9567 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9568 return true; 9569 } 9570 9571 /* Delay to choose resync/check/repair in md_do_sync(). 
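 * MD_RECOVERY_SYNC only records that some kind of sync was requested;
 * md_sync_action() decides later, in md_do_sync(), whether it runs as a
 * resync, check or repair.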
*/ 9572 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 9573 return true; 9574 9575 /* Nothing to be done */ 9576 return false; 9577 } 9578 9579 static void md_start_sync(struct work_struct *ws) 9580 { 9581 struct mddev *mddev = container_of(ws, struct mddev, sync_work); 9582 int spares = 0; 9583 bool suspend = false; 9584 char *name; 9585 9586 /* 9587 * If reshape is still in progress, spares won't be added or removed 9588 * from conf until reshape is done. 9589 */ 9590 if (mddev->reshape_position == MaxSector && 9591 md_spares_need_change(mddev)) { 9592 suspend = true; 9593 mddev_suspend(mddev, false); 9594 } 9595 9596 mddev_lock_nointr(mddev); 9597 if (!md_is_rdwr(mddev)) { 9598 /* 9599 * On a read-only array we can: 9600 * - remove failed devices 9601 * - add already-in_sync devices if the array itself is in-sync. 9602 * As we only add devices that are already in-sync, we can 9603 * activate the spares immediately. 9604 */ 9605 remove_and_add_spares(mddev, NULL); 9606 goto not_running; 9607 } 9608 9609 if (!md_choose_sync_action(mddev, &spares)) 9610 goto not_running; 9611 9612 if (!mddev->pers->sync_request) 9613 goto not_running; 9614 9615 /* 9616 * We are adding a device or devices to an array which has the bitmap 9617 * stored on all devices. So make sure all bitmap pages get written. 9618 */ 9619 if (spares) 9620 mddev->bitmap_ops->write_all(mddev); 9621 9622 name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ? 9623 "reshape" : "resync"; 9624 rcu_assign_pointer(mddev->sync_thread, 9625 md_register_thread(md_do_sync, mddev, name)); 9626 if (!mddev->sync_thread) { 9627 pr_warn("%s: could not start resync thread...\n", 9628 mdname(mddev)); 9629 /* leave the spares where they are, it shouldn't hurt */ 9630 goto not_running; 9631 } 9632 9633 mddev_unlock(mddev); 9634 /* 9635 * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should 9636 * not set it again. Otherwise, we may cause issue like this one: 9637 * https://bugzilla.kernel.org/show_bug.cgi?id=218200 9638 * Therefore, use __mddev_resume(mddev, false). 9639 */ 9640 if (suspend) 9641 __mddev_resume(mddev, false); 9642 md_wakeup_thread(mddev->sync_thread); 9643 sysfs_notify_dirent_safe(mddev->sysfs_action); 9644 md_new_event(); 9645 return; 9646 9647 not_running: 9648 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9649 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 9650 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 9651 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 9652 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9653 mddev_unlock(mddev); 9654 /* 9655 * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should 9656 * not set it again. Otherwise, we may cause issue like this one: 9657 * https://bugzilla.kernel.org/show_bug.cgi?id=218200 9658 * Therefore, use __mddev_resume(mddev, false). 
9659 */ 9660 if (suspend) 9661 __mddev_resume(mddev, false); 9662 9663 wake_up(&resync_wait); 9664 if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && 9665 mddev->sysfs_action) 9666 sysfs_notify_dirent_safe(mddev->sysfs_action); 9667 } 9668 9669 static void unregister_sync_thread(struct mddev *mddev) 9670 { 9671 if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 9672 /* resync/recovery still happening */ 9673 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9674 return; 9675 } 9676 9677 if (WARN_ON_ONCE(!mddev->sync_thread)) 9678 return; 9679 9680 md_reap_sync_thread(mddev); 9681 } 9682 9683 /* 9684 * This routine is regularly called by all per-raid-array threads to 9685 * deal with generic issues like resync and super-block update. 9686 * Raid personalities that don't have a thread (linear/raid0) do not 9687 * need this as they never do any recovery or update the superblock. 9688 * 9689 * It does not do any resync itself, but rather "forks" off other threads 9690 * to do that as needed. 9691 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in 9692 * "->recovery" and create a thread at ->sync_thread. 9693 * When the thread finishes it sets MD_RECOVERY_DONE 9694 * and wakeups up this thread which will reap the thread and finish up. 9695 * This thread also removes any faulty devices (with nr_pending == 0). 9696 * 9697 * The overall approach is: 9698 * 1/ if the superblock needs updating, update it. 9699 * 2/ If a recovery thread is running, don't do anything else. 9700 * 3/ If recovery has finished, clean up, possibly marking spares active. 9701 * 4/ If there are any faulty devices, remove them. 9702 * 5/ If array is degraded, try to add spares devices 9703 * 6/ If array has spares or is not in-sync, start a resync thread. 9704 */ 9705 void md_check_recovery(struct mddev *mddev) 9706 { 9707 if (mddev->bitmap) 9708 mddev->bitmap_ops->daemon_work(mddev); 9709 9710 if (signal_pending(current)) { 9711 if (mddev->pers->sync_request && !mddev->external) { 9712 pr_debug("md: %s in immediate safe mode\n", 9713 mdname(mddev)); 9714 mddev->safemode = 2; 9715 } 9716 flush_signals(current); 9717 } 9718 9719 if (!md_is_rdwr(mddev) && 9720 !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) && 9721 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 9722 return; 9723 if ( ! ( 9724 (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) || 9725 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 9726 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 9727 (mddev->external == 0 && mddev->safemode == 1) || 9728 (mddev->safemode == 2 9729 && !mddev->in_sync && mddev->resync_offset == MaxSector) 9730 )) 9731 return; 9732 9733 if (mddev_trylock(mddev)) { 9734 bool try_set_sync = mddev->safemode != 0; 9735 9736 if (!mddev->external && mddev->safemode == 1) 9737 mddev->safemode = 0; 9738 9739 if (!md_is_rdwr(mddev)) { 9740 struct md_rdev *rdev; 9741 9742 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 9743 unregister_sync_thread(mddev); 9744 goto unlock; 9745 } 9746 9747 if (!mddev->external && mddev->in_sync) 9748 /* 9749 * 'Blocked' flag not needed as failed devices 9750 * will be recorded if array switched to read/write. 9751 * Leaving it set will prevent the device 9752 * from being removed. 
9753 */ 9754 rdev_for_each(rdev, mddev) 9755 clear_bit(Blocked, &rdev->flags); 9756 9757 /* 9758 * There is no thread, but we need to call 9759 * ->spare_active and clear saved_raid_disk 9760 */ 9761 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9762 md_reap_sync_thread(mddev); 9763 9764 /* 9765 * Let md_start_sync() to remove and add rdevs to the 9766 * array. 9767 */ 9768 if (md_spares_need_change(mddev)) { 9769 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9770 queue_work(md_misc_wq, &mddev->sync_work); 9771 } 9772 9773 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9774 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9775 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 9776 9777 goto unlock; 9778 } 9779 9780 if (mddev_is_clustered(mddev)) { 9781 struct md_rdev *rdev, *tmp; 9782 /* kick the device if another node issued a 9783 * remove disk. 9784 */ 9785 rdev_for_each_safe(rdev, tmp, mddev) { 9786 if (rdev->raid_disk < 0 && 9787 test_and_clear_bit(ClusterRemove, &rdev->flags)) 9788 md_kick_rdev_from_array(rdev); 9789 } 9790 } 9791 9792 if (try_set_sync && !mddev->external && !mddev->in_sync) { 9793 spin_lock(&mddev->lock); 9794 set_in_sync(mddev); 9795 spin_unlock(&mddev->lock); 9796 } 9797 9798 if (mddev->sb_flags) 9799 md_update_sb(mddev, 0); 9800 9801 /* 9802 * Never start a new sync thread if MD_RECOVERY_RUNNING is 9803 * still set. 9804 */ 9805 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 9806 unregister_sync_thread(mddev); 9807 goto unlock; 9808 } 9809 9810 /* Set RUNNING before clearing NEEDED to avoid 9811 * any transients in the value of "sync_action". 9812 */ 9813 mddev->curr_resync_completed = 0; 9814 spin_lock(&mddev->lock); 9815 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9816 spin_unlock(&mddev->lock); 9817 /* Clear some bits that don't mean anything, but 9818 * might be left set 9819 */ 9820 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 9821 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 9822 9823 if (test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) && 9824 !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 9825 queue_work(md_misc_wq, &mddev->sync_work); 9826 } else { 9827 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9828 wake_up(&resync_wait); 9829 } 9830 9831 unlock: 9832 wake_up(&mddev->sb_wait); 9833 mddev_unlock(mddev); 9834 } 9835 } 9836 EXPORT_SYMBOL(md_check_recovery); 9837 9838 void md_reap_sync_thread(struct mddev *mddev) 9839 { 9840 struct md_rdev *rdev; 9841 sector_t old_dev_sectors = mddev->dev_sectors; 9842 bool is_reshaped = false; 9843 9844 /* resync has finished, collect result */ 9845 md_unregister_thread(mddev, &mddev->sync_thread); 9846 atomic_inc(&mddev->sync_seq); 9847 9848 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9849 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 9850 mddev->degraded != mddev->raid_disks) { 9851 /* success...*/ 9852 /* activate any spares */ 9853 if (mddev->pers->spare_active(mddev)) { 9854 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 9855 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9856 } 9857 } 9858 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9859 mddev->pers->finish_reshape) { 9860 mddev->pers->finish_reshape(mddev); 9861 if (mddev_is_clustered(mddev)) 9862 is_reshaped = true; 9863 } 9864 9865 /* If array is no-longer degraded, then any saved_raid_disk 9866 * information must be scrapped. 
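 * (saved_raid_disk is only useful while a former member could still be
 * re-added into its old slot; with no degraded slots left it is stale.)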
9867 */ 9868 if (!mddev->degraded) 9869 rdev_for_each(rdev, mddev) 9870 rdev->saved_raid_disk = -1; 9871 9872 md_update_sb(mddev, 1); 9873 /* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can 9874 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by 9875 * clustered raid */ 9876 if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags)) 9877 mddev->cluster_ops->resync_finish(mddev); 9878 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9879 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 9880 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9881 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 9882 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 9883 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 9884 /* 9885 * We call mddev->cluster_ops->update_size here because sync_size could 9886 * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, 9887 * so it is time to update size across cluster. 9888 */ 9889 if (mddev_is_clustered(mddev) && is_reshaped 9890 && !test_bit(MD_CLOSING, &mddev->flags)) 9891 mddev->cluster_ops->update_size(mddev, old_dev_sectors); 9892 /* flag recovery needed just to double check */ 9893 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9894 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9895 sysfs_notify_dirent_safe(mddev->sysfs_action); 9896 md_new_event(); 9897 if (mddev->event_work.func) 9898 queue_work(md_misc_wq, &mddev->event_work); 9899 wake_up(&resync_wait); 9900 } 9901 EXPORT_SYMBOL(md_reap_sync_thread); 9902 9903 void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) 9904 { 9905 sysfs_notify_dirent_safe(rdev->sysfs_state); 9906 wait_event_timeout(rdev->blocked_wait, !rdev_blocked(rdev), 9907 msecs_to_jiffies(5000)); 9908 rdev_dec_pending(rdev, mddev); 9909 } 9910 EXPORT_SYMBOL(md_wait_for_blocked_rdev); 9911 9912 void md_finish_reshape(struct mddev *mddev) 9913 { 9914 /* called be personality module when reshape completes. */ 9915 struct md_rdev *rdev; 9916 9917 rdev_for_each(rdev, mddev) { 9918 if (rdev->data_offset > rdev->new_data_offset) 9919 rdev->sectors += rdev->data_offset - rdev->new_data_offset; 9920 else 9921 rdev->sectors -= rdev->new_data_offset - rdev->data_offset; 9922 rdev->data_offset = rdev->new_data_offset; 9923 } 9924 } 9925 EXPORT_SYMBOL(md_finish_reshape); 9926 9927 /* Bad block management */ 9928 9929 /* Returns true on success, false on failure */ 9930 bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 9931 int is_new) 9932 { 9933 struct mddev *mddev = rdev->mddev; 9934 9935 /* 9936 * Recording new badblocks for faulty rdev will force unnecessary 9937 * super block updating. This is fragile for external management because 9938 * userspace daemon may trying to remove this device and deadlock may 9939 * occur. This will be probably solved in the mdadm, but it is safer to 9940 * avoid it. 
/* Bad block management */

/* Returns true on success, false on failure */
bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
			int is_new)
{
	struct mddev *mddev = rdev->mddev;

	/*
	 * Recording new badblocks for a faulty rdev would force an
	 * unnecessary superblock update. This is fragile for externally
	 * managed arrays because a userspace daemon may be trying to remove
	 * this device and a deadlock may occur. This will probably be solved
	 * in mdadm, but it is safer to avoid it here.
	 */
	if (test_bit(Faulty, &rdev->flags))
		return true;

	if (is_new)
		s += rdev->new_data_offset;
	else
		s += rdev->data_offset;

	if (!badblocks_set(&rdev->badblocks, s, sectors, 0))
		return false;

	/* Make sure they get written out promptly */
	if (test_bit(ExternalBbl, &rdev->flags))
		sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
	sysfs_notify_dirent_safe(rdev->sysfs_state);
	set_mask_bits(&mddev->sb_flags, 0,
		      BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
	md_wakeup_thread(rdev->mddev->thread);
	return true;
}
EXPORT_SYMBOL_GPL(rdev_set_badblocks);

void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
			  int is_new)
{
	if (is_new)
		s += rdev->new_data_offset;
	else
		s += rdev->data_offset;

	if (!badblocks_clear(&rdev->badblocks, s, sectors))
		return;

	if (test_bit(ExternalBbl, &rdev->flags))
		sysfs_notify_dirent_safe(rdev->sysfs_badblocks);
}
EXPORT_SYMBOL_GPL(rdev_clear_badblocks);

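/*
 * Illustrative caller sketch (the real callers live in the RAID
 * personalities, e.g. the raid1/raid10 write-error paths): when a write
 * to 'rdev' fails for 'sectors' sectors starting at the array-relative
 * 'sector', a personality typically records the range and only fails
 * the whole device if the bad block table cannot take it:
 *
 *	if (!rdev_set_badblocks(rdev, sector, sectors, 0))
 *		md_error(rdev->mddev, rdev);
 */
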
static int md_notify_reboot(struct notifier_block *this,
			    unsigned long code, void *x)
{
	struct mddev *mddev;
	int need_delay = 0;

	spin_lock(&all_mddevs_lock);
	list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
		if (!mddev_get(mddev))
			continue;
		spin_unlock(&all_mddevs_lock);
		if (mddev_trylock(mddev)) {
			if (mddev->pers)
				__md_stop_writes(mddev);
			if (mddev->persistent)
				mddev->safemode = 2;
			mddev_unlock(mddev);
		}
		need_delay = 1;
		spin_lock(&all_mddevs_lock);
		mddev_put_locked(mddev);
	}
	spin_unlock(&all_mddevs_lock);

	/*
	 * Certain more exotic SCSI devices are known to be
	 * volatile wrt too-early system reboots. While the
	 * right place to handle this issue is the driver in
	 * question, we do want to have a safe RAID driver ...
	 */
	if (need_delay)
		msleep(1000);

	return NOTIFY_DONE;
}

static struct notifier_block md_notifier = {
	.notifier_call	= md_notify_reboot,
	.next		= NULL,
	.priority	= INT_MAX, /* before any real devices */
};

static void md_geninit(void)
{
	pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));

	proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops);
}

static int __init md_init(void)
{
	int ret = -ENOMEM;

	md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
	if (!md_wq)
		goto err_wq;

	md_misc_wq = alloc_workqueue("md_misc", 0, 0);
	if (!md_misc_wq)
		goto err_misc_wq;

	md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND,
				       0);
	if (!md_bitmap_wq)
		goto err_bitmap_wq;

	ret = __register_blkdev(MD_MAJOR, "md", md_probe);
	if (ret < 0)
		goto err_md;

	ret = __register_blkdev(0, "mdp", md_probe);
	if (ret < 0)
		goto err_mdp;
	mdp_major = ret;

	register_reboot_notifier(&md_notifier);
	raid_table_header = register_sysctl("dev/raid", raid_table);

	md_geninit();
	return 0;

err_mdp:
	unregister_blkdev(MD_MAJOR, "md");
err_md:
	destroy_workqueue(md_bitmap_wq);
err_bitmap_wq:
	destroy_workqueue(md_misc_wq);
err_misc_wq:
	destroy_workqueue(md_wq);
err_wq:
	return ret;
}

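/*
 * Called (via md_reload_sb()) after another cluster node updated the
 * on-disk metadata: resize the array if the size changed, pick up role
 * changes for each device (kick failed Candidates, activate spares,
 * mark remotely failed devices Faulty), adopt a changed raid_disks
 * count, follow a reshape that is running or has just finished on the
 * remote node, and finally bring the event count up to date.
 */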
static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	struct md_rdev *rdev2, *tmp;
	int role, ret;

	/*
	 * If the size was changed on another node then we need to
	 * resize here as well.
	 */
	if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
		ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
		if (ret)
			pr_info("md-cluster: resize failed\n");
		else
			mddev->bitmap_ops->update_sb(mddev->bitmap);
	}

	/* Check for change of roles in the active devices */
	rdev_for_each_safe(rdev2, tmp, mddev) {
		if (test_bit(Faulty, &rdev2->flags)) {
			if (test_bit(ClusterRemove, &rdev2->flags))
				set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			continue;
		}

		/* Check if the roles changed */
		role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);

		if (test_bit(Candidate, &rdev2->flags)) {
			if (role == MD_DISK_ROLE_FAULTY) {
				pr_info("md: Removing Candidate device %pg because add failed\n",
					rdev2->bdev);
				md_kick_rdev_from_array(rdev2);
				continue;
			} else
				clear_bit(Candidate, &rdev2->flags);
		}

		if (role != rdev2->raid_disk) {
			/*
			 * The device got activated, unless a reshape is
			 * happening.
			 */
			if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE &&
			    !(le32_to_cpu(sb->feature_map) &
			      MD_FEATURE_RESHAPE_ACTIVE) &&
			    !mddev->cluster_ops->resync_status_get(mddev)) {
				/*
				 * Use -1 to make raid1_add_disk() set
				 * conf->fullsync to 1. This avoids skipping
				 * the sync when the remote node goes down
				 * during the resync.
				 */
				if ((le32_to_cpu(sb->feature_map)
				    & MD_FEATURE_RECOVERY_OFFSET))
					rdev2->saved_raid_disk = -1;
				else
					rdev2->saved_raid_disk = role;
				ret = remove_and_add_spares(mddev, rdev2);
				pr_info("Activated spare: %pg\n",
					rdev2->bdev);
				/* Wake up mddev->thread here so the array can
				 * resync with the newly activated disk */
				set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
				md_wakeup_thread(mddev->thread);
			}
			/* Device is faulty:
			 * we just want to do the minimum needed to mark the
			 * disk as faulty. The recovery is performed by the
			 * node that initiated the error.
			 */
			if (role == MD_DISK_ROLE_FAULTY ||
			    role == MD_DISK_ROLE_JOURNAL) {
				md_error(mddev, rdev2);
				clear_bit(Blocked, &rdev2->flags);
			}
		}
	}

	if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) {
		ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
		if (ret)
			pr_warn("md: updating array disks failed. %d\n", ret);
	}

	/*
	 * Since mddev->delta_disks has already been updated in
	 * update_raid_disks(), it is time to check for a reshape.
	 */
	if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
		/*
		 * A reshape is happening on the remote node; we need to
		 * update reshape_position and call start_reshape().
		 */
		mddev->reshape_position = le64_to_cpu(sb->reshape_position);
		if (mddev->pers->update_reshape_pos)
			mddev->pers->update_reshape_pos(mddev);
		if (mddev->pers->start_reshape)
			mddev->pers->start_reshape(mddev);
	} else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
		   mddev->reshape_position != MaxSector &&
		   !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
		/* The reshape has just finished on another node. */
		mddev->reshape_position = MaxSector;
		if (mddev->pers->update_reshape_pos)
			mddev->pers->update_reshape_pos(mddev);
	}

	/* Finally bring the event count up to date */
	mddev->events = le64_to_cpu(sb->events);
}

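/*
 * Re-read this rdev's superblock from disk after another cluster node
 * changed it.  On failure the previous sb_page is put back so the
 * in-memory state is left untouched.
 */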
static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
	int err;
	struct page *swapout = rdev->sb_page;
	struct mdp_superblock_1 *sb;

	/* Store the rdev's sb page in the 'swapout' temporary variable
	 * so it can be restored if the reload fails below.
	 */
	rdev->sb_page = NULL;
	err = alloc_disk_sb(rdev);
	if (err == 0) {
		ClearPageUptodate(rdev->sb_page);
		rdev->sb_loaded = 0;
		err = super_types[mddev->major_version].
			load_super(rdev, NULL, mddev->minor_version);
	}
	if (err < 0) {
		pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
			__func__, __LINE__, rdev->desc_nr, err);
		if (rdev->sb_page)
			put_page(rdev->sb_page);
		rdev->sb_page = swapout;
		rdev->sb_loaded = 1;
		return err;
	}

	sb = page_address(rdev->sb_page);
	/* Only pick up recovery_offset if MD_FEATURE_RECOVERY_OFFSET is set
	 * in the reloaded superblock; otherwise keep the previous value.
	 */
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
		rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);

	/* The other node finished recovery; call spare_active() to mark the
	 * device In_sync and update mddev->degraded.
	 */
	if (rdev->recovery_offset == MaxSector &&
	    !test_bit(In_sync, &rdev->flags) &&
	    mddev->pers->spare_active(mddev))
		sysfs_notify_dirent_safe(mddev->sysfs_degraded);

	put_page(swapout);
	return 0;
}

void md_reload_sb(struct mddev *mddev, int nr)
{
	struct md_rdev *rdev = NULL, *iter;
	int err;

	/* Find the rdev */
	rdev_for_each_rcu(iter, mddev) {
		if (iter->desc_nr == nr) {
			rdev = iter;
			break;
		}
	}

	if (!rdev) {
		pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
		return;
	}

	err = read_rdev(mddev, rdev);
	if (err < 0)
		return;

	check_sb_changes(mddev, rdev);

	/* Re-read all rdevs to update recovery_offset */
	rdev_for_each_rcu(rdev, mddev) {
		if (!test_bit(Faulty, &rdev->flags))
			read_rdev(mddev, rdev);
	}
}
EXPORT_SYMBOL(md_reload_sb);

#ifndef MODULE

/*
 * Searches all registered partitions for autorun RAID arrays
 * at boot time.
 */

static DEFINE_MUTEX(detected_devices_mutex);
static LIST_HEAD(all_detected_devices);
struct detected_devices_node {
	struct list_head list;
	dev_t dev;
};

void md_autodetect_dev(dev_t dev)
{
	struct detected_devices_node *node_detected_dev;

	node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
	if (node_detected_dev) {
		node_detected_dev->dev = dev;
		mutex_lock(&detected_devices_mutex);
		list_add_tail(&node_detected_dev->list, &all_detected_devices);
		mutex_unlock(&detected_devices_mutex);
	}
}

void md_autostart_arrays(int part)
{
	struct md_rdev *rdev;
	struct detected_devices_node *node_detected_dev;
	dev_t dev;
	int i_scanned, i_passed;

	i_scanned = 0;
	i_passed = 0;

	pr_info("md: Autodetecting RAID arrays.\n");

	mutex_lock(&detected_devices_mutex);
	while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
		i_scanned++;
		node_detected_dev = list_entry(all_detected_devices.next,
					struct detected_devices_node, list);
		list_del(&node_detected_dev->list);
		dev = node_detected_dev->dev;
		kfree(node_detected_dev);
		mutex_unlock(&detected_devices_mutex);
		rdev = md_import_device(dev, 0, 90);
		mutex_lock(&detected_devices_mutex);
		if (IS_ERR(rdev))
			continue;

		if (test_bit(Faulty, &rdev->flags))
			continue;

		set_bit(AutoDetected, &rdev->flags);
		list_add(&rdev->same_set, &pending_raid_disks);
		i_passed++;
	}
	mutex_unlock(&detected_devices_mutex);

	pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);

	autorun_devices(part);
}

#endif /* !MODULE */

static __exit void md_exit(void)
{
	struct mddev *mddev;
	int delay = 1;

	unregister_blkdev(MD_MAJOR, "md");
	unregister_blkdev(mdp_major, "mdp");
	unregister_reboot_notifier(&md_notifier);
	unregister_sysctl_table(raid_table_header);

	/* We cannot unload the modules while some process is
	 * waiting for us in select() or poll() - wake them up
	 */
	md_unloading = 1;
	while (waitqueue_active(&md_event_waiters)) {
		/* not safe to leave yet */
		wake_up(&md_event_waiters);
		msleep(delay);
		delay += delay;
	}
	remove_proc_entry("mdstat", NULL);

	spin_lock(&all_mddevs_lock);
	list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
		if (!mddev_get(mddev))
			continue;
		spin_unlock(&all_mddevs_lock);
		export_array(mddev);
		mddev->ctime = 0;
		mddev->hold_active = 0;
		/*
		 * As the mddev is now fully clear, mddev_put will schedule
		 * the mddev for destruction by a workqueue, and the
		 * destroy_workqueue() below will wait for that to complete.
		 */
		spin_lock(&all_mddevs_lock);
		mddev_put_locked(mddev);
	}
	spin_unlock(&all_mddevs_lock);

	destroy_workqueue(md_misc_wq);
	destroy_workqueue(md_bitmap_wq);
	destroy_workqueue(md_wq);
}

subsys_initcall(md_init);
module_exit(md_exit)

static int get_ro(char *buffer, const struct kernel_param *kp)
{
	return sprintf(buffer, "%d\n", start_readonly);
}
static int set_ro(const char *val, const struct kernel_param *kp)
{
	return kstrtouint(val, 10, (unsigned int *)&start_readonly);
}

module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
module_param(create_on_open, bool, S_IRUSR|S_IWUSR);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MD RAID framework");
MODULE_ALIAS("md");
MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);

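/*
 * Module parameter usage sketch (values are only illustrative):
 *
 *	modprobe md-mod start_ro=1 start_dirty_degraded=0
 *	echo md_test > /sys/module/md_mod/parameters/new_array
 *
 * start_ro=1 makes newly assembled arrays come up auto-read-only until
 * the first write, start_dirty_degraded allows starting arrays that are
 * both dirty and degraded, and writing a name ("md_*" or "mdNNN") to
 * new_array pre-creates that array device for tools such as mdadm.
 */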