1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 md.c : Multiple Devices driver for Linux 4 Copyright (C) 1998, 1999, 2000 Ingo Molnar 5 6 completely rewritten, based on the MD driver code from Marc Zyngier 7 8 Changes: 9 10 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar 11 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> 12 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> 13 - kerneld support by Boris Tobotras <boris@xtalk.msk.su> 14 - kmod support by: Cyrus Durgin 15 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> 16 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> 17 18 - lots of fixes and improvements to the RAID1/RAID5 and generic 19 RAID code (such as request based resynchronization): 20 21 Neil Brown <neilb@cse.unsw.edu.au>. 22 23 - persistent bitmap code 24 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. 25 26 27 Errors, Warnings, etc. 28 Please use: 29 pr_crit() for error conditions that risk data loss 30 pr_err() for error conditions that are unexpected, like an IO error 31 or internal inconsistency 32 pr_warn() for error conditions that could have been predicated, like 33 adding a device to an array when it has incompatible metadata 34 pr_info() for every interesting, very rare events, like an array starting 35 or stopping, or resync starting or stopping 36 pr_debug() for everything else. 37 38 */ 39 40 #include <linux/sched/mm.h> 41 #include <linux/sched/signal.h> 42 #include <linux/kthread.h> 43 #include <linux/blkdev.h> 44 #include <linux/blk-integrity.h> 45 #include <linux/badblocks.h> 46 #include <linux/sysctl.h> 47 #include <linux/seq_file.h> 48 #include <linux/fs.h> 49 #include <linux/poll.h> 50 #include <linux/ctype.h> 51 #include <linux/string.h> 52 #include <linux/hdreg.h> 53 #include <linux/proc_fs.h> 54 #include <linux/random.h> 55 #include <linux/major.h> 56 #include <linux/module.h> 57 #include <linux/reboot.h> 58 #include <linux/file.h> 59 #include <linux/compat.h> 60 #include <linux/delay.h> 61 #include <linux/raid/md_p.h> 62 #include <linux/raid/md_u.h> 63 #include <linux/raid/detect.h> 64 #include <linux/slab.h> 65 #include <linux/percpu-refcount.h> 66 #include <linux/part_stat.h> 67 68 #include "md.h" 69 #include "md-bitmap.h" 70 #include "md-cluster.h" 71 72 static const char *action_name[NR_SYNC_ACTIONS] = { 73 [ACTION_RESYNC] = "resync", 74 [ACTION_RECOVER] = "recover", 75 [ACTION_CHECK] = "check", 76 [ACTION_REPAIR] = "repair", 77 [ACTION_RESHAPE] = "reshape", 78 [ACTION_FROZEN] = "frozen", 79 [ACTION_IDLE] = "idle", 80 }; 81 82 static DEFINE_XARRAY(md_submodule); 83 84 static const struct kobj_type md_ktype; 85 86 static DECLARE_WAIT_QUEUE_HEAD(resync_wait); 87 static struct workqueue_struct *md_wq; 88 89 /* 90 * This workqueue is used for sync_work to register new sync_thread, and for 91 * del_work to remove rdev, and for event_work that is only set by dm-raid. 92 * 93 * Noted that sync_work will grab reconfig_mutex, hence never flush this 94 * workqueue whith reconfig_mutex grabbed. 
95 */ 96 static struct workqueue_struct *md_misc_wq; 97 struct workqueue_struct *md_bitmap_wq; 98 99 static int remove_and_add_spares(struct mddev *mddev, 100 struct md_rdev *this); 101 static void mddev_detach(struct mddev *mddev); 102 static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); 103 static void md_wakeup_thread_directly(struct md_thread __rcu *thread); 104 105 /* 106 * Default number of read corrections we'll attempt on an rdev 107 * before ejecting it from the array. We divide the read error 108 * count by 2 for every hour elapsed between read errors. 109 */ 110 #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20 111 /* Default safemode delay: 200 msec */ 112 #define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1) 113 /* 114 * Current RAID-1,4,5,6,10 parallel reconstruction 'guaranteed speed limit' 115 * is sysctl_speed_limit_min, 1000 KB/sec by default, so the extra system load 116 * does not show up that much. Increase it if you want to have more guaranteed 117 * speed. Note that the RAID driver will use the maximum bandwidth 118 * sysctl_speed_limit_max, 200 MB/sec by default, if the IO subsystem is idle. 119 * 120 * Background sync IO speed control: 121 * 122 * - below speed min: 123 * no limit; 124 * - above speed min and below speed max: 125 * a) if mddev is idle, then no limit; 126 * b) if mddev is busy handling normal IO, then limit inflight sync IO 127 * to sync_io_depth; 128 * - above speed max: 129 * sync IO can't be issued; 130 * 131 * Following configurations can be changed via /proc/sys/dev/raid/ for system 132 * or /sys/block/mdX/md/ for one array. 133 */ 134 static int sysctl_speed_limit_min = 1000; 135 static int sysctl_speed_limit_max = 200000; 136 static int sysctl_sync_io_depth = 32; 137 138 static int speed_min(struct mddev *mddev) 139 { 140 return mddev->sync_speed_min ? 141 mddev->sync_speed_min : sysctl_speed_limit_min; 142 } 143 144 static int speed_max(struct mddev *mddev) 145 { 146 return mddev->sync_speed_max ? 147 mddev->sync_speed_max : sysctl_speed_limit_max; 148 } 149 150 static int sync_io_depth(struct mddev *mddev) 151 { 152 return mddev->sync_io_depth ? 
153 mddev->sync_io_depth : sysctl_sync_io_depth; 154 } 155 156 static void rdev_uninit_serial(struct md_rdev *rdev) 157 { 158 if (!test_and_clear_bit(CollisionCheck, &rdev->flags)) 159 return; 160 161 kvfree(rdev->serial); 162 rdev->serial = NULL; 163 } 164 165 static void rdevs_uninit_serial(struct mddev *mddev) 166 { 167 struct md_rdev *rdev; 168 169 rdev_for_each(rdev, mddev) 170 rdev_uninit_serial(rdev); 171 } 172 173 static int rdev_init_serial(struct md_rdev *rdev) 174 { 175 /* serial_nums equals with BARRIER_BUCKETS_NR */ 176 int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t)))); 177 struct serial_in_rdev *serial = NULL; 178 179 if (test_bit(CollisionCheck, &rdev->flags)) 180 return 0; 181 182 serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums, 183 GFP_KERNEL); 184 if (!serial) 185 return -ENOMEM; 186 187 for (i = 0; i < serial_nums; i++) { 188 struct serial_in_rdev *serial_tmp = &serial[i]; 189 190 spin_lock_init(&serial_tmp->serial_lock); 191 serial_tmp->serial_rb = RB_ROOT_CACHED; 192 init_waitqueue_head(&serial_tmp->serial_io_wait); 193 } 194 195 rdev->serial = serial; 196 set_bit(CollisionCheck, &rdev->flags); 197 198 return 0; 199 } 200 201 static int rdevs_init_serial(struct mddev *mddev) 202 { 203 struct md_rdev *rdev; 204 int ret = 0; 205 206 rdev_for_each(rdev, mddev) { 207 ret = rdev_init_serial(rdev); 208 if (ret) 209 break; 210 } 211 212 /* Free all resources if pool is not existed */ 213 if (ret && !mddev->serial_info_pool) 214 rdevs_uninit_serial(mddev); 215 216 return ret; 217 } 218 219 /* 220 * rdev needs to enable serial stuffs if it meets the conditions: 221 * 1. it is multi-queue device flaged with writemostly. 222 * 2. the write-behind mode is enabled. 223 */ 224 static int rdev_need_serial(struct md_rdev *rdev) 225 { 226 return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 && 227 rdev->bdev->bd_disk->queue->nr_hw_queues != 1 && 228 test_bit(WriteMostly, &rdev->flags)); 229 } 230 231 /* 232 * Init resource for rdev(s), then create serial_info_pool if: 233 * 1. rdev is the first device which return true from rdev_enable_serial. 234 * 2. rdev is NULL, means we want to enable serialization for all rdevs. 235 */ 236 void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev) 237 { 238 int ret = 0; 239 240 if (rdev && !rdev_need_serial(rdev) && 241 !test_bit(CollisionCheck, &rdev->flags)) 242 return; 243 244 if (!rdev) 245 ret = rdevs_init_serial(mddev); 246 else 247 ret = rdev_init_serial(rdev); 248 if (ret) 249 return; 250 251 if (mddev->serial_info_pool == NULL) { 252 /* 253 * already in memalloc noio context by 254 * mddev_suspend() 255 */ 256 mddev->serial_info_pool = 257 mempool_create_kmalloc_pool(NR_SERIAL_INFOS, 258 sizeof(struct serial_info)); 259 if (!mddev->serial_info_pool) { 260 rdevs_uninit_serial(mddev); 261 pr_err("can't alloc memory pool for serialization\n"); 262 } 263 } 264 } 265 266 /* 267 * Free resource from rdev(s), and destroy serial_info_pool under conditions: 268 * 1. rdev is the last device flaged with CollisionCheck. 269 * 2. when bitmap is destroyed while policy is not enabled. 270 * 3. for disable policy, the pool is destroyed only when no rdev needs it. 
271 */ 272 void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) 273 { 274 if (rdev && !test_bit(CollisionCheck, &rdev->flags)) 275 return; 276 277 if (mddev->serial_info_pool) { 278 struct md_rdev *temp; 279 int num = 0; /* used to track if other rdevs need the pool */ 280 281 rdev_for_each(temp, mddev) { 282 if (!rdev) { 283 if (!mddev->serialize_policy || 284 !rdev_need_serial(temp)) 285 rdev_uninit_serial(temp); 286 else 287 num++; 288 } else if (temp != rdev && 289 test_bit(CollisionCheck, &temp->flags)) 290 num++; 291 } 292 293 if (rdev) 294 rdev_uninit_serial(rdev); 295 296 if (num) 297 pr_info("The mempool could be used by other devices\n"); 298 else { 299 mempool_destroy(mddev->serial_info_pool); 300 mddev->serial_info_pool = NULL; 301 } 302 } 303 } 304 305 static struct ctl_table_header *raid_table_header; 306 307 static const struct ctl_table raid_table[] = { 308 { 309 .procname = "speed_limit_min", 310 .data = &sysctl_speed_limit_min, 311 .maxlen = sizeof(int), 312 .mode = 0644, 313 .proc_handler = proc_dointvec, 314 }, 315 { 316 .procname = "speed_limit_max", 317 .data = &sysctl_speed_limit_max, 318 .maxlen = sizeof(int), 319 .mode = 0644, 320 .proc_handler = proc_dointvec, 321 }, 322 { 323 .procname = "sync_io_depth", 324 .data = &sysctl_sync_io_depth, 325 .maxlen = sizeof(int), 326 .mode = 0644, 327 .proc_handler = proc_dointvec, 328 }, 329 }; 330 331 static int start_readonly; 332 333 /* 334 * The original mechanism for creating an md device is to create 335 * a device node in /dev and to open it. This causes races with device-close. 336 * The preferred method is to write to the "new_array" module parameter. 337 * This can avoid races. 338 * Setting create_on_open to false disables the original mechanism 339 * so all the races disappear. 340 */ 341 static bool create_on_open = true; 342 static bool legacy_async_del_gendisk = true; 343 344 /* 345 * We have a system wide 'event count' that is incremented 346 * on any 'interesting' event, and readers of /proc/mdstat 347 * can use 'poll' or 'select' to find out when the event 348 * count increases. 349 * 350 * Events are: 351 * start array, stop array, error, add device, remove device, 352 * start build, activate spare 353 */ 354 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); 355 static atomic_t md_event_count; 356 void md_new_event(void) 357 { 358 atomic_inc(&md_event_count); 359 wake_up(&md_event_waiters); 360 } 361 EXPORT_SYMBOL_GPL(md_new_event); 362 363 /* 364 * Enables to iterate over all existing md arrays 365 * all_mddevs_lock protects this list. 366 */ 367 static LIST_HEAD(all_mddevs); 368 static DEFINE_SPINLOCK(all_mddevs_lock); 369 370 static bool is_md_suspended(struct mddev *mddev) 371 { 372 return percpu_ref_is_dying(&mddev->active_io); 373 } 374 /* Rather than calling directly into the personality make_request function, 375 * IO requests come here first so that we can check if the device is 376 * being suspended pending a reconfiguration. 377 * We hold a refcount over the call to ->make_request. By the time that 378 * call has finished, the bio has been linked into some internal structure 379 * and so is visible to ->quiesce(), so we don't need the refcount any more. 
380 */ 381 static bool is_suspended(struct mddev *mddev, struct bio *bio) 382 { 383 if (is_md_suspended(mddev)) 384 return true; 385 if (bio_data_dir(bio) != WRITE) 386 return false; 387 if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi)) 388 return false; 389 if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi)) 390 return false; 391 if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo)) 392 return false; 393 return true; 394 } 395 396 bool md_handle_request(struct mddev *mddev, struct bio *bio) 397 { 398 check_suspended: 399 if (is_suspended(mddev, bio)) { 400 DEFINE_WAIT(__wait); 401 /* Bail out if REQ_NOWAIT is set for the bio */ 402 if (bio->bi_opf & REQ_NOWAIT) { 403 bio_wouldblock_error(bio); 404 return true; 405 } 406 for (;;) { 407 prepare_to_wait(&mddev->sb_wait, &__wait, 408 TASK_UNINTERRUPTIBLE); 409 if (!is_suspended(mddev, bio)) 410 break; 411 schedule(); 412 } 413 finish_wait(&mddev->sb_wait, &__wait); 414 } 415 if (!percpu_ref_tryget_live(&mddev->active_io)) 416 goto check_suspended; 417 418 if (!mddev->pers->make_request(mddev, bio)) { 419 percpu_ref_put(&mddev->active_io); 420 if (!mddev->gendisk && mddev->pers->prepare_suspend) 421 return false; 422 goto check_suspended; 423 } 424 425 percpu_ref_put(&mddev->active_io); 426 return true; 427 } 428 EXPORT_SYMBOL(md_handle_request); 429 430 static void md_submit_bio(struct bio *bio) 431 { 432 const int rw = bio_data_dir(bio); 433 struct mddev *mddev = bio->bi_bdev->bd_disk->private_data; 434 435 if (mddev == NULL || mddev->pers == NULL) { 436 bio_io_error(bio); 437 return; 438 } 439 440 if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) { 441 bio_io_error(bio); 442 return; 443 } 444 445 bio = bio_split_to_limits(bio); 446 if (!bio) 447 return; 448 449 if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) { 450 if (bio_sectors(bio) != 0) 451 bio->bi_status = BLK_STS_IOERR; 452 bio_endio(bio); 453 return; 454 } 455 456 /* bio could be mergeable after passing to underlayer */ 457 bio->bi_opf &= ~REQ_NOMERGE; 458 459 md_handle_request(mddev, bio); 460 } 461 462 /* 463 * Make sure no new requests are submitted to the device, and any requests that 464 * have been submitted are completely handled. 465 */ 466 int mddev_suspend(struct mddev *mddev, bool interruptible) 467 { 468 int err = 0; 469 470 /* 471 * hold reconfig_mutex to wait for normal io will deadlock, because 472 * other context can't update super_block, and normal io can rely on 473 * updating super_block. 474 */ 475 lockdep_assert_not_held(&mddev->reconfig_mutex); 476 477 if (interruptible) 478 err = mutex_lock_interruptible(&mddev->suspend_mutex); 479 else 480 mutex_lock(&mddev->suspend_mutex); 481 if (err) 482 return err; 483 484 if (mddev->suspended) { 485 WRITE_ONCE(mddev->suspended, mddev->suspended + 1); 486 mutex_unlock(&mddev->suspend_mutex); 487 return 0; 488 } 489 490 percpu_ref_kill(&mddev->active_io); 491 if (interruptible) 492 err = wait_event_interruptible(mddev->sb_wait, 493 percpu_ref_is_zero(&mddev->active_io)); 494 else 495 wait_event(mddev->sb_wait, 496 percpu_ref_is_zero(&mddev->active_io)); 497 if (err) { 498 percpu_ref_resurrect(&mddev->active_io); 499 mutex_unlock(&mddev->suspend_mutex); 500 return err; 501 } 502 503 /* 504 * For raid456, io might be waiting for reshape to make progress, 505 * allow new reshape to start while waiting for io to be done to 506 * prevent deadlock. 
507 */ 508 WRITE_ONCE(mddev->suspended, mddev->suspended + 1); 509 510 /* restrict memory reclaim I/O during raid array is suspend */ 511 mddev->noio_flag = memalloc_noio_save(); 512 513 mutex_unlock(&mddev->suspend_mutex); 514 return 0; 515 } 516 EXPORT_SYMBOL_GPL(mddev_suspend); 517 518 static void __mddev_resume(struct mddev *mddev, bool recovery_needed) 519 { 520 lockdep_assert_not_held(&mddev->reconfig_mutex); 521 522 mutex_lock(&mddev->suspend_mutex); 523 WRITE_ONCE(mddev->suspended, mddev->suspended - 1); 524 if (mddev->suspended) { 525 mutex_unlock(&mddev->suspend_mutex); 526 return; 527 } 528 529 /* entred the memalloc scope from mddev_suspend() */ 530 memalloc_noio_restore(mddev->noio_flag); 531 532 percpu_ref_resurrect(&mddev->active_io); 533 wake_up(&mddev->sb_wait); 534 535 if (recovery_needed) 536 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 537 md_wakeup_thread(mddev->thread); 538 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 539 540 mutex_unlock(&mddev->suspend_mutex); 541 } 542 543 void mddev_resume(struct mddev *mddev) 544 { 545 return __mddev_resume(mddev, true); 546 } 547 EXPORT_SYMBOL_GPL(mddev_resume); 548 549 /* sync bdev before setting device to readonly or stopping raid*/ 550 static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_num) 551 { 552 mutex_lock(&mddev->open_mutex); 553 if (mddev->pers && atomic_read(&mddev->openers) > opener_num) { 554 mutex_unlock(&mddev->open_mutex); 555 return -EBUSY; 556 } 557 if (test_and_set_bit(MD_CLOSING, &mddev->flags)) { 558 mutex_unlock(&mddev->open_mutex); 559 return -EBUSY; 560 } 561 mutex_unlock(&mddev->open_mutex); 562 563 sync_blockdev(mddev->gendisk->part0); 564 return 0; 565 } 566 567 /* 568 * The only difference from bio_chain_endio() is that the current 569 * bi_status of bio does not affect the bi_status of parent. 570 */ 571 static void md_end_flush(struct bio *bio) 572 { 573 struct bio *parent = bio->bi_private; 574 575 /* 576 * If any flush io error before the power failure, 577 * disk data may be lost. 578 */ 579 if (bio->bi_status) 580 pr_err("md: %pg flush io error %d\n", bio->bi_bdev, 581 blk_status_to_errno(bio->bi_status)); 582 583 bio_put(bio); 584 bio_endio(parent); 585 } 586 587 bool md_flush_request(struct mddev *mddev, struct bio *bio) 588 { 589 struct md_rdev *rdev; 590 struct bio *new; 591 592 /* 593 * md_flush_reqeust() should be called under md_handle_request() and 594 * 'active_io' is already grabbed. Hence it's safe to get rdev directly 595 * without rcu protection. 
596 */ 597 WARN_ON(percpu_ref_is_zero(&mddev->active_io)); 598 599 rdev_for_each(rdev, mddev) { 600 if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) 601 continue; 602 603 new = bio_alloc_bioset(rdev->bdev, 0, 604 REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO, 605 &mddev->bio_set); 606 new->bi_private = bio; 607 new->bi_end_io = md_end_flush; 608 bio_inc_remaining(bio); 609 submit_bio(new); 610 } 611 612 if (bio_sectors(bio) == 0) { 613 bio_endio(bio); 614 return true; 615 } 616 617 bio->bi_opf &= ~REQ_PREFLUSH; 618 return false; 619 } 620 EXPORT_SYMBOL(md_flush_request); 621 622 static inline struct mddev *mddev_get(struct mddev *mddev) 623 { 624 lockdep_assert_held(&all_mddevs_lock); 625 626 if (test_bit(MD_DELETED, &mddev->flags)) 627 return NULL; 628 atomic_inc(&mddev->active); 629 return mddev; 630 } 631 632 static void mddev_delayed_delete(struct work_struct *ws); 633 634 static void __mddev_put(struct mddev *mddev) 635 { 636 if (mddev->raid_disks || !list_empty(&mddev->disks) || 637 mddev->ctime || mddev->hold_active) 638 return; 639 640 /* 641 * If array is freed by stopping array, MD_DELETED is set by 642 * do_md_stop(), MD_DELETED is still set here in case mddev is freed 643 * directly by closing a mddev that is created by create_on_open. 644 */ 645 set_bit(MD_DELETED, &mddev->flags); 646 /* 647 * Call queue_work inside the spinlock so that flush_workqueue() after 648 * mddev_find will succeed in waiting for the work to be done. 649 */ 650 queue_work(md_misc_wq, &mddev->del_work); 651 } 652 653 static void mddev_put_locked(struct mddev *mddev) 654 { 655 if (atomic_dec_and_test(&mddev->active)) 656 __mddev_put(mddev); 657 } 658 659 void mddev_put(struct mddev *mddev) 660 { 661 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 662 return; 663 664 __mddev_put(mddev); 665 spin_unlock(&all_mddevs_lock); 666 } 667 668 static void md_safemode_timeout(struct timer_list *t); 669 static void md_start_sync(struct work_struct *ws); 670 671 static void active_io_release(struct percpu_ref *ref) 672 { 673 struct mddev *mddev = container_of(ref, struct mddev, active_io); 674 675 wake_up(&mddev->sb_wait); 676 } 677 678 static void no_op(struct percpu_ref *r) {} 679 680 int mddev_init(struct mddev *mddev) 681 { 682 683 if (percpu_ref_init(&mddev->active_io, active_io_release, 684 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) 685 return -ENOMEM; 686 687 if (percpu_ref_init(&mddev->writes_pending, no_op, 688 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { 689 percpu_ref_exit(&mddev->active_io); 690 return -ENOMEM; 691 } 692 693 /* We want to start with the refcount at zero */ 694 percpu_ref_put(&mddev->writes_pending); 695 696 mutex_init(&mddev->open_mutex); 697 mutex_init(&mddev->reconfig_mutex); 698 mutex_init(&mddev->suspend_mutex); 699 mutex_init(&mddev->bitmap_info.mutex); 700 INIT_LIST_HEAD(&mddev->disks); 701 INIT_LIST_HEAD(&mddev->all_mddevs); 702 INIT_LIST_HEAD(&mddev->deleting); 703 timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); 704 atomic_set(&mddev->active, 1); 705 atomic_set(&mddev->openers, 0); 706 atomic_set(&mddev->sync_seq, 0); 707 spin_lock_init(&mddev->lock); 708 init_waitqueue_head(&mddev->sb_wait); 709 init_waitqueue_head(&mddev->recovery_wait); 710 mddev->reshape_position = MaxSector; 711 mddev->reshape_backwards = 0; 712 mddev->last_sync_action = ACTION_IDLE; 713 mddev->resync_min = 0; 714 mddev->resync_max = MaxSector; 715 mddev->level = LEVEL_NONE; 716 mddev_set_bitmap_ops(mddev); 717 718 INIT_WORK(&mddev->sync_work, md_start_sync); 719 INIT_WORK(&mddev->del_work, 
mddev_delayed_delete); 720 721 return 0; 722 } 723 EXPORT_SYMBOL_GPL(mddev_init); 724 725 void mddev_destroy(struct mddev *mddev) 726 { 727 percpu_ref_exit(&mddev->active_io); 728 percpu_ref_exit(&mddev->writes_pending); 729 } 730 EXPORT_SYMBOL_GPL(mddev_destroy); 731 732 static struct mddev *mddev_find_locked(dev_t unit) 733 { 734 struct mddev *mddev; 735 736 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 737 if (mddev->unit == unit) 738 return mddev; 739 740 return NULL; 741 } 742 743 /* find an unused unit number */ 744 static dev_t mddev_alloc_unit(void) 745 { 746 static int next_minor = 512; 747 int start = next_minor; 748 bool is_free = 0; 749 dev_t dev = 0; 750 751 while (!is_free) { 752 dev = MKDEV(MD_MAJOR, next_minor); 753 next_minor++; 754 if (next_minor > MINORMASK) 755 next_minor = 0; 756 if (next_minor == start) 757 return 0; /* Oh dear, all in use. */ 758 is_free = !mddev_find_locked(dev); 759 } 760 761 return dev; 762 } 763 764 static struct mddev *mddev_alloc(dev_t unit) 765 { 766 struct mddev *new; 767 int error; 768 769 if (unit && MAJOR(unit) != MD_MAJOR) 770 unit &= ~((1 << MdpMinorShift) - 1); 771 772 new = kzalloc(sizeof(*new), GFP_KERNEL); 773 if (!new) 774 return ERR_PTR(-ENOMEM); 775 776 error = mddev_init(new); 777 if (error) 778 goto out_free_new; 779 780 spin_lock(&all_mddevs_lock); 781 if (unit) { 782 error = -EEXIST; 783 if (mddev_find_locked(unit)) 784 goto out_destroy_new; 785 new->unit = unit; 786 if (MAJOR(unit) == MD_MAJOR) 787 new->md_minor = MINOR(unit); 788 else 789 new->md_minor = MINOR(unit) >> MdpMinorShift; 790 new->hold_active = UNTIL_IOCTL; 791 } else { 792 error = -ENODEV; 793 new->unit = mddev_alloc_unit(); 794 if (!new->unit) 795 goto out_destroy_new; 796 new->md_minor = MINOR(new->unit); 797 new->hold_active = UNTIL_STOP; 798 } 799 800 list_add(&new->all_mddevs, &all_mddevs); 801 spin_unlock(&all_mddevs_lock); 802 return new; 803 804 out_destroy_new: 805 spin_unlock(&all_mddevs_lock); 806 mddev_destroy(new); 807 out_free_new: 808 kfree(new); 809 return ERR_PTR(error); 810 } 811 812 static void mddev_free(struct mddev *mddev) 813 { 814 spin_lock(&all_mddevs_lock); 815 list_del(&mddev->all_mddevs); 816 spin_unlock(&all_mddevs_lock); 817 818 mddev_destroy(mddev); 819 kfree(mddev); 820 } 821 822 static const struct attribute_group md_redundancy_group; 823 824 void mddev_unlock(struct mddev *mddev) 825 { 826 struct md_rdev *rdev; 827 struct md_rdev *tmp; 828 LIST_HEAD(delete); 829 830 if (!list_empty(&mddev->deleting)) 831 list_splice_init(&mddev->deleting, &delete); 832 833 if (mddev->to_remove) { 834 /* These cannot be removed under reconfig_mutex as 835 * an access to the files will try to take reconfig_mutex 836 * while holding the file unremovable, which leads to 837 * a deadlock. 838 * So hold set sysfs_active while the remove in happeing, 839 * and anything else which might set ->to_remove or my 840 * otherwise change the sysfs namespace will fail with 841 * -EBUSY if sysfs_active is still set. 842 * We set sysfs_active under reconfig_mutex and elsewhere 843 * test it under the same mutex to ensure its correct value 844 * is seen. 
845 */ 846 const struct attribute_group *to_remove = mddev->to_remove; 847 mddev->to_remove = NULL; 848 mddev->sysfs_active = 1; 849 mutex_unlock(&mddev->reconfig_mutex); 850 851 if (mddev->kobj.sd) { 852 if (to_remove != &md_redundancy_group) 853 sysfs_remove_group(&mddev->kobj, to_remove); 854 if (mddev->pers == NULL || 855 mddev->pers->sync_request == NULL) { 856 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 857 if (mddev->sysfs_action) 858 sysfs_put(mddev->sysfs_action); 859 if (mddev->sysfs_completed) 860 sysfs_put(mddev->sysfs_completed); 861 if (mddev->sysfs_degraded) 862 sysfs_put(mddev->sysfs_degraded); 863 mddev->sysfs_action = NULL; 864 mddev->sysfs_completed = NULL; 865 mddev->sysfs_degraded = NULL; 866 } 867 } 868 mddev->sysfs_active = 0; 869 } else 870 mutex_unlock(&mddev->reconfig_mutex); 871 872 md_wakeup_thread(mddev->thread); 873 wake_up(&mddev->sb_wait); 874 875 list_for_each_entry_safe(rdev, tmp, &delete, same_set) { 876 list_del_init(&rdev->same_set); 877 kobject_del(&rdev->kobj); 878 export_rdev(rdev, mddev); 879 } 880 881 if (!legacy_async_del_gendisk) { 882 /* 883 * Call del_gendisk after release reconfig_mutex to avoid 884 * deadlock (e.g. call del_gendisk under the lock and an 885 * access to sysfs files waits the lock) 886 * And MD_DELETED is only used for md raid which is set in 887 * do_md_stop. dm raid only uses md_stop to stop. So dm raid 888 * doesn't need to check MD_DELETED when getting reconfig lock 889 */ 890 if (test_bit(MD_DELETED, &mddev->flags)) 891 del_gendisk(mddev->gendisk); 892 } 893 } 894 EXPORT_SYMBOL_GPL(mddev_unlock); 895 896 struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr) 897 { 898 struct md_rdev *rdev; 899 900 rdev_for_each_rcu(rdev, mddev) 901 if (rdev->desc_nr == nr) 902 return rdev; 903 904 return NULL; 905 } 906 EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu); 907 908 static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) 909 { 910 struct md_rdev *rdev; 911 912 rdev_for_each(rdev, mddev) 913 if (rdev->bdev->bd_dev == dev) 914 return rdev; 915 916 return NULL; 917 } 918 919 struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev) 920 { 921 struct md_rdev *rdev; 922 923 rdev_for_each_rcu(rdev, mddev) 924 if (rdev->bdev->bd_dev == dev) 925 return rdev; 926 927 return NULL; 928 } 929 EXPORT_SYMBOL_GPL(md_find_rdev_rcu); 930 931 static struct md_personality *get_pers(int level, char *clevel) 932 { 933 struct md_personality *ret = NULL; 934 struct md_submodule_head *head; 935 unsigned long i; 936 937 xa_lock(&md_submodule); 938 xa_for_each(&md_submodule, i, head) { 939 if (head->type != MD_PERSONALITY) 940 continue; 941 if ((level != LEVEL_NONE && head->id == level) || 942 !strcmp(head->name, clevel)) { 943 if (try_module_get(head->owner)) 944 ret = (void *)head; 945 break; 946 } 947 } 948 xa_unlock(&md_submodule); 949 950 if (!ret) { 951 if (level != LEVEL_NONE) 952 pr_warn("md: personality for level %d is not loaded!\n", 953 level); 954 else 955 pr_warn("md: personality for level %s is not loaded!\n", 956 clevel); 957 } 958 959 return ret; 960 } 961 962 static void put_pers(struct md_personality *pers) 963 { 964 module_put(pers->head.owner); 965 } 966 967 /* return the offset of the super block in 512byte sectors */ 968 static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) 969 { 970 return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev)); 971 } 972 973 static int alloc_disk_sb(struct md_rdev *rdev) 974 { 975 rdev->sb_page = alloc_page(GFP_KERNEL); 976 if (!rdev->sb_page) 977 return -ENOMEM; 978 
return 0; 979 } 980 981 void md_rdev_clear(struct md_rdev *rdev) 982 { 983 if (rdev->sb_page) { 984 put_page(rdev->sb_page); 985 rdev->sb_loaded = 0; 986 rdev->sb_page = NULL; 987 rdev->sb_start = 0; 988 rdev->sectors = 0; 989 } 990 if (rdev->bb_page) { 991 put_page(rdev->bb_page); 992 rdev->bb_page = NULL; 993 } 994 badblocks_exit(&rdev->badblocks); 995 } 996 EXPORT_SYMBOL_GPL(md_rdev_clear); 997 998 static void super_written(struct bio *bio) 999 { 1000 struct md_rdev *rdev = bio->bi_private; 1001 struct mddev *mddev = rdev->mddev; 1002 1003 if (bio->bi_status) { 1004 pr_err("md: %s gets error=%d\n", __func__, 1005 blk_status_to_errno(bio->bi_status)); 1006 md_error(mddev, rdev); 1007 if (!test_bit(Faulty, &rdev->flags) 1008 && (bio->bi_opf & MD_FAILFAST)) { 1009 set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags); 1010 set_bit(LastDev, &rdev->flags); 1011 } 1012 } else 1013 clear_bit(LastDev, &rdev->flags); 1014 1015 bio_put(bio); 1016 1017 rdev_dec_pending(rdev, mddev); 1018 1019 if (atomic_dec_and_test(&mddev->pending_writes)) 1020 wake_up(&mddev->sb_wait); 1021 } 1022 1023 void md_super_write(struct mddev *mddev, struct md_rdev *rdev, 1024 sector_t sector, int size, struct page *page) 1025 { 1026 /* write first size bytes of page to sector of rdev 1027 * Increment mddev->pending_writes before returning 1028 * and decrement it on completion, waking up sb_wait 1029 * if zero is reached. 1030 * If an error occurred, call md_error 1031 */ 1032 struct bio *bio; 1033 1034 if (!page) 1035 return; 1036 1037 if (test_bit(Faulty, &rdev->flags)) 1038 return; 1039 1040 bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev, 1041 1, 1042 REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_META 1043 | REQ_PREFLUSH | REQ_FUA, 1044 GFP_NOIO, &mddev->sync_set); 1045 1046 atomic_inc(&rdev->nr_pending); 1047 1048 bio->bi_iter.bi_sector = sector; 1049 __bio_add_page(bio, page, size, 0); 1050 bio->bi_private = rdev; 1051 bio->bi_end_io = super_written; 1052 1053 if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) && 1054 test_bit(FailFast, &rdev->flags) && 1055 !test_bit(LastDev, &rdev->flags)) 1056 bio->bi_opf |= MD_FAILFAST; 1057 1058 atomic_inc(&mddev->pending_writes); 1059 submit_bio(bio); 1060 } 1061 1062 int md_super_wait(struct mddev *mddev) 1063 { 1064 /* wait for all superblock writes that were scheduled to complete */ 1065 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); 1066 if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags)) 1067 return -EAGAIN; 1068 return 0; 1069 } 1070 1071 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, 1072 struct page *page, blk_opf_t opf, bool metadata_op) 1073 { 1074 struct bio bio; 1075 struct bio_vec bvec; 1076 1077 if (metadata_op && rdev->meta_bdev) 1078 bio_init(&bio, rdev->meta_bdev, &bvec, 1, opf); 1079 else 1080 bio_init(&bio, rdev->bdev, &bvec, 1, opf); 1081 1082 if (metadata_op) 1083 bio.bi_iter.bi_sector = sector + rdev->sb_start; 1084 else if (rdev->mddev->reshape_position != MaxSector && 1085 (rdev->mddev->reshape_backwards == 1086 (sector >= rdev->mddev->reshape_position))) 1087 bio.bi_iter.bi_sector = sector + rdev->new_data_offset; 1088 else 1089 bio.bi_iter.bi_sector = sector + rdev->data_offset; 1090 __bio_add_page(&bio, page, size, 0); 1091 1092 submit_bio_wait(&bio); 1093 1094 return !bio.bi_status; 1095 } 1096 EXPORT_SYMBOL_GPL(sync_page_io); 1097 1098 static int read_disk_sb(struct md_rdev *rdev, int size) 1099 { 1100 if (rdev->sb_loaded) 1101 return 0; 1102 1103 if (!sync_page_io(rdev, 0, size, 
rdev->sb_page, REQ_OP_READ, true)) 1104 goto fail; 1105 rdev->sb_loaded = 1; 1106 return 0; 1107 1108 fail: 1109 pr_err("md: disabled device %pg, could not read superblock.\n", 1110 rdev->bdev); 1111 return -EINVAL; 1112 } 1113 1114 static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 1115 { 1116 return sb1->set_uuid0 == sb2->set_uuid0 && 1117 sb1->set_uuid1 == sb2->set_uuid1 && 1118 sb1->set_uuid2 == sb2->set_uuid2 && 1119 sb1->set_uuid3 == sb2->set_uuid3; 1120 } 1121 1122 static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 1123 { 1124 int ret; 1125 mdp_super_t *tmp1, *tmp2; 1126 1127 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 1128 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 1129 1130 if (!tmp1 || !tmp2) { 1131 ret = 0; 1132 goto abort; 1133 } 1134 1135 *tmp1 = *sb1; 1136 *tmp2 = *sb2; 1137 1138 /* 1139 * nr_disks is not constant 1140 */ 1141 tmp1->nr_disks = 0; 1142 tmp2->nr_disks = 0; 1143 1144 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); 1145 abort: 1146 kfree(tmp1); 1147 kfree(tmp2); 1148 return ret; 1149 } 1150 1151 static u32 md_csum_fold(u32 csum) 1152 { 1153 csum = (csum & 0xffff) + (csum >> 16); 1154 return (csum & 0xffff) + (csum >> 16); 1155 } 1156 1157 static unsigned int calc_sb_csum(mdp_super_t *sb) 1158 { 1159 u64 newcsum = 0; 1160 u32 *sb32 = (u32*)sb; 1161 int i; 1162 unsigned int disk_csum, csum; 1163 1164 disk_csum = sb->sb_csum; 1165 sb->sb_csum = 0; 1166 1167 for (i = 0; i < MD_SB_BYTES/4 ; i++) 1168 newcsum += sb32[i]; 1169 csum = (newcsum & 0xffffffff) + (newcsum>>32); 1170 1171 #ifdef CONFIG_ALPHA 1172 /* This used to use csum_partial, which was wrong for several 1173 * reasons including that different results are returned on 1174 * different architectures. It isn't critical that we get exactly 1175 * the same return value as before (we always csum_fold before 1176 * testing, and that removes any differences). However as we 1177 * know that csum_partial always returned a 16bit value on 1178 * alphas, do a fold to maximise conformity to previous behaviour. 1179 */ 1180 sb->sb_csum = md_csum_fold(disk_csum); 1181 #else 1182 sb->sb_csum = disk_csum; 1183 #endif 1184 return csum; 1185 } 1186 1187 /* 1188 * Handle superblock details. 1189 * We want to be able to handle multiple superblock formats 1190 * so we have a common interface to them all, and an array of 1191 * different handlers. 1192 * We rely on user-space to write the initial superblock, and support 1193 * reading and updating of superblocks. 1194 * Interface methods are: 1195 * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) 1196 * loads and validates a superblock on dev. 1197 * if refdev != NULL, compare superblocks on both devices 1198 * Return: 1199 * 0 - dev has a superblock that is compatible with refdev 1200 * 1 - dev has a superblock that is compatible and newer than refdev 1201 * so dev should be used as the refdev in future 1202 * -EINVAL superblock incompatible or invalid 1203 * -othererror e.g. -EIO 1204 * 1205 * int validate_super(struct mddev *mddev, struct md_rdev *dev) 1206 * Verify that dev is acceptable into mddev. 1207 * The first time, mddev->raid_disks will be 0, and data from 1208 * dev should be merged in. Subsequent calls check that dev 1209 * is new enough. Return 0 or -EINVAL 1210 * 1211 * void sync_super(struct mddev *mddev, struct md_rdev *dev) 1212 * Update the superblock for rdev with data in mddev 1213 * This does not write to disc. 
1214 * 1215 */ 1216 1217 struct super_type { 1218 char *name; 1219 struct module *owner; 1220 int (*load_super)(struct md_rdev *rdev, 1221 struct md_rdev *refdev, 1222 int minor_version); 1223 int (*validate_super)(struct mddev *mddev, 1224 struct md_rdev *freshest, 1225 struct md_rdev *rdev); 1226 void (*sync_super)(struct mddev *mddev, 1227 struct md_rdev *rdev); 1228 unsigned long long (*rdev_size_change)(struct md_rdev *rdev, 1229 sector_t num_sectors); 1230 int (*allow_new_offset)(struct md_rdev *rdev, 1231 unsigned long long new_offset); 1232 }; 1233 1234 /* 1235 * Check that the given mddev has no bitmap. 1236 * 1237 * This function is called from the run method of all personalities that do not 1238 * support bitmaps. It prints an error message and returns non-zero if mddev 1239 * has a bitmap. Otherwise, it returns 0. 1240 * 1241 */ 1242 int md_check_no_bitmap(struct mddev *mddev) 1243 { 1244 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) 1245 return 0; 1246 pr_warn("%s: bitmaps are not supported for %s\n", 1247 mdname(mddev), mddev->pers->head.name); 1248 return 1; 1249 } 1250 EXPORT_SYMBOL(md_check_no_bitmap); 1251 1252 /* 1253 * load_super for 0.90.0 1254 */ 1255 static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 1256 { 1257 mdp_super_t *sb; 1258 int ret; 1259 bool spare_disk = true; 1260 1261 /* 1262 * Calculate the position of the superblock (512byte sectors), 1263 * it's at the end of the disk. 1264 * 1265 * It also happens to be a multiple of 4Kb. 1266 */ 1267 rdev->sb_start = calc_dev_sboffset(rdev); 1268 1269 ret = read_disk_sb(rdev, MD_SB_BYTES); 1270 if (ret) 1271 return ret; 1272 1273 ret = -EINVAL; 1274 1275 sb = page_address(rdev->sb_page); 1276 1277 if (sb->md_magic != MD_SB_MAGIC) { 1278 pr_warn("md: invalid raid superblock magic on %pg\n", 1279 rdev->bdev); 1280 goto abort; 1281 } 1282 1283 if (sb->major_version != 0 || 1284 sb->minor_version < 90 || 1285 sb->minor_version > 91) { 1286 pr_warn("Bad version number %d.%d on %pg\n", 1287 sb->major_version, sb->minor_version, rdev->bdev); 1288 goto abort; 1289 } 1290 1291 if (sb->raid_disks <= 0) 1292 goto abort; 1293 1294 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { 1295 pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev); 1296 goto abort; 1297 } 1298 1299 rdev->preferred_minor = sb->md_minor; 1300 rdev->data_offset = 0; 1301 rdev->new_data_offset = 0; 1302 rdev->sb_size = MD_SB_BYTES; 1303 rdev->badblocks.shift = -1; 1304 1305 rdev->desc_nr = sb->this_disk.number; 1306 1307 /* not spare disk */ 1308 if (rdev->desc_nr >= 0 && rdev->desc_nr < MD_SB_DISKS && 1309 sb->disks[rdev->desc_nr].state & ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) 1310 spare_disk = false; 1311 1312 if (!refdev) { 1313 if (!spare_disk) 1314 ret = 1; 1315 else 1316 ret = 0; 1317 } else { 1318 __u64 ev1, ev2; 1319 mdp_super_t *refsb = page_address(refdev->sb_page); 1320 if (!md_uuid_equal(refsb, sb)) { 1321 pr_warn("md: %pg has different UUID to %pg\n", 1322 rdev->bdev, refdev->bdev); 1323 goto abort; 1324 } 1325 if (!md_sb_equal(refsb, sb)) { 1326 pr_warn("md: %pg has same UUID but different superblock to %pg\n", 1327 rdev->bdev, refdev->bdev); 1328 goto abort; 1329 } 1330 ev1 = md_event(sb); 1331 ev2 = md_event(refsb); 1332 1333 if (!spare_disk && ev1 > ev2) 1334 ret = 1; 1335 else 1336 ret = 0; 1337 } 1338 rdev->sectors = rdev->sb_start; 1339 /* Limit to 4TB as metadata cannot record more than that. 
1340 * (not needed for Linear and RAID0 as metadata doesn't 1341 * record this size) 1342 */ 1343 if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1) 1344 rdev->sectors = (sector_t)(2ULL << 32) - 2; 1345 1346 if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) 1347 /* "this cannot possibly happen" ... */ 1348 ret = -EINVAL; 1349 1350 abort: 1351 return ret; 1352 } 1353 1354 static u64 md_bitmap_events_cleared(struct mddev *mddev) 1355 { 1356 struct md_bitmap_stats stats; 1357 int err; 1358 1359 err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 1360 if (err) 1361 return 0; 1362 1363 return stats.events_cleared; 1364 } 1365 1366 /* 1367 * validate_super for 0.90.0 1368 * note: we are not using "freshest" for 0.9 superblock 1369 */ 1370 static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) 1371 { 1372 mdp_disk_t *desc; 1373 mdp_super_t *sb = page_address(rdev->sb_page); 1374 __u64 ev1 = md_event(sb); 1375 1376 rdev->raid_disk = -1; 1377 clear_bit(Faulty, &rdev->flags); 1378 clear_bit(In_sync, &rdev->flags); 1379 clear_bit(Bitmap_sync, &rdev->flags); 1380 clear_bit(WriteMostly, &rdev->flags); 1381 1382 if (mddev->raid_disks == 0) { 1383 mddev->major_version = 0; 1384 mddev->minor_version = sb->minor_version; 1385 mddev->patch_version = sb->patch_version; 1386 mddev->external = 0; 1387 mddev->chunk_sectors = sb->chunk_size >> 9; 1388 mddev->ctime = sb->ctime; 1389 mddev->utime = sb->utime; 1390 mddev->level = sb->level; 1391 mddev->clevel[0] = 0; 1392 mddev->layout = sb->layout; 1393 mddev->raid_disks = sb->raid_disks; 1394 mddev->dev_sectors = ((sector_t)sb->size) * 2; 1395 mddev->events = ev1; 1396 mddev->bitmap_info.offset = 0; 1397 mddev->bitmap_info.space = 0; 1398 /* bitmap can use 60 K after the 4K superblocks */ 1399 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 1400 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 1401 mddev->reshape_backwards = 0; 1402 1403 if (mddev->minor_version >= 91) { 1404 mddev->reshape_position = sb->reshape_position; 1405 mddev->delta_disks = sb->delta_disks; 1406 mddev->new_level = sb->new_level; 1407 mddev->new_layout = sb->new_layout; 1408 mddev->new_chunk_sectors = sb->new_chunk >> 9; 1409 if (mddev->delta_disks < 0) 1410 mddev->reshape_backwards = 1; 1411 } else { 1412 mddev->reshape_position = MaxSector; 1413 mddev->delta_disks = 0; 1414 mddev->new_level = mddev->level; 1415 mddev->new_layout = mddev->layout; 1416 mddev->new_chunk_sectors = mddev->chunk_sectors; 1417 } 1418 if (mddev->level == 0) 1419 mddev->layout = -1; 1420 1421 if (sb->state & (1<<MD_SB_CLEAN)) 1422 mddev->resync_offset = MaxSector; 1423 else { 1424 if (sb->events_hi == sb->cp_events_hi && 1425 sb->events_lo == sb->cp_events_lo) { 1426 mddev->resync_offset = sb->recovery_cp; 1427 } else 1428 mddev->resync_offset = 0; 1429 } 1430 1431 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 1432 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 1433 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 1434 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 1435 1436 mddev->max_disks = MD_SB_DISKS; 1437 1438 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 1439 mddev->bitmap_info.file == NULL) { 1440 mddev->bitmap_info.offset = 1441 mddev->bitmap_info.default_offset; 1442 mddev->bitmap_info.space = 1443 mddev->bitmap_info.default_space; 1444 } 1445 1446 } else if (mddev->pers == NULL) { 1447 /* Insist on good event counter while assembling, except 1448 * for spares (which don't need an event count) */ 1449 ++ev1; 1450 if 
(sb->disks[rdev->desc_nr].state & ( 1451 (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) 1452 if (ev1 < mddev->events) 1453 return -EINVAL; 1454 } else if (mddev->bitmap) { 1455 /* if adding to array with a bitmap, then we can accept an 1456 * older device ... but not too old. 1457 */ 1458 if (ev1 < md_bitmap_events_cleared(mddev)) 1459 return 0; 1460 if (ev1 < mddev->events) 1461 set_bit(Bitmap_sync, &rdev->flags); 1462 } else { 1463 if (ev1 < mddev->events) 1464 /* just a hot-add of a new device, leave raid_disk at -1 */ 1465 return 0; 1466 } 1467 1468 desc = sb->disks + rdev->desc_nr; 1469 1470 if (desc->state & (1<<MD_DISK_FAULTY)) 1471 set_bit(Faulty, &rdev->flags); 1472 else if (desc->state & (1<<MD_DISK_SYNC)) { 1473 set_bit(In_sync, &rdev->flags); 1474 rdev->raid_disk = desc->raid_disk; 1475 rdev->saved_raid_disk = desc->raid_disk; 1476 } else if (desc->state & (1<<MD_DISK_ACTIVE)) { 1477 /* active but not in sync implies recovery up to 1478 * reshape position. We don't know exactly where 1479 * that is, so set to zero for now 1480 */ 1481 if (mddev->minor_version >= 91) { 1482 rdev->recovery_offset = 0; 1483 rdev->raid_disk = desc->raid_disk; 1484 } 1485 } 1486 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 1487 set_bit(WriteMostly, &rdev->flags); 1488 if (desc->state & (1<<MD_DISK_FAILFAST)) 1489 set_bit(FailFast, &rdev->flags); 1490 return 0; 1491 } 1492 1493 /* 1494 * sync_super for 0.90.0 1495 */ 1496 static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) 1497 { 1498 mdp_super_t *sb; 1499 struct md_rdev *rdev2; 1500 int next_spare = mddev->raid_disks; 1501 1502 /* make rdev->sb match mddev data.. 1503 * 1504 * 1/ zero out disks 1505 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 1506 * 3/ any empty disks < next_spare become removed 1507 * 1508 * disks[0] gets initialised to REMOVED because 1509 * we cannot be sure from other fields if it has 1510 * been initialised or not. 
1511 */ 1512 int i; 1513 int active=0, working=0,failed=0,spare=0,nr_disks=0; 1514 1515 rdev->sb_size = MD_SB_BYTES; 1516 1517 sb = page_address(rdev->sb_page); 1518 1519 memset(sb, 0, sizeof(*sb)); 1520 1521 sb->md_magic = MD_SB_MAGIC; 1522 sb->major_version = mddev->major_version; 1523 sb->patch_version = mddev->patch_version; 1524 sb->gvalid_words = 0; /* ignored */ 1525 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 1526 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 1527 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 1528 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 1529 1530 sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); 1531 sb->level = mddev->level; 1532 sb->size = mddev->dev_sectors / 2; 1533 sb->raid_disks = mddev->raid_disks; 1534 sb->md_minor = mddev->md_minor; 1535 sb->not_persistent = 0; 1536 sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 1537 sb->state = 0; 1538 sb->events_hi = (mddev->events>>32); 1539 sb->events_lo = (u32)mddev->events; 1540 1541 if (mddev->reshape_position == MaxSector) 1542 sb->minor_version = 90; 1543 else { 1544 sb->minor_version = 91; 1545 sb->reshape_position = mddev->reshape_position; 1546 sb->new_level = mddev->new_level; 1547 sb->delta_disks = mddev->delta_disks; 1548 sb->new_layout = mddev->new_layout; 1549 sb->new_chunk = mddev->new_chunk_sectors << 9; 1550 } 1551 mddev->minor_version = sb->minor_version; 1552 if (mddev->in_sync) 1553 { 1554 sb->recovery_cp = mddev->resync_offset; 1555 sb->cp_events_hi = (mddev->events>>32); 1556 sb->cp_events_lo = (u32)mddev->events; 1557 if (mddev->resync_offset == MaxSector) 1558 sb->state = (1<< MD_SB_CLEAN); 1559 } else 1560 sb->recovery_cp = 0; 1561 1562 sb->layout = mddev->layout; 1563 sb->chunk_size = mddev->chunk_sectors << 9; 1564 1565 if (mddev->bitmap && mddev->bitmap_info.file == NULL) 1566 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 1567 1568 sb->disks[0].state = (1<<MD_DISK_REMOVED); 1569 rdev_for_each(rdev2, mddev) { 1570 mdp_disk_t *d; 1571 int desc_nr; 1572 int is_active = test_bit(In_sync, &rdev2->flags); 1573 1574 if (rdev2->raid_disk >= 0 && 1575 sb->minor_version >= 91) 1576 /* we have nowhere to store the recovery_offset, 1577 * but if it is not below the reshape_position, 1578 * we can piggy-back on that. 
1579 */ 1580 is_active = 1; 1581 if (rdev2->raid_disk < 0 || 1582 test_bit(Faulty, &rdev2->flags)) 1583 is_active = 0; 1584 if (is_active) 1585 desc_nr = rdev2->raid_disk; 1586 else 1587 desc_nr = next_spare++; 1588 rdev2->desc_nr = desc_nr; 1589 d = &sb->disks[rdev2->desc_nr]; 1590 nr_disks++; 1591 d->number = rdev2->desc_nr; 1592 d->major = MAJOR(rdev2->bdev->bd_dev); 1593 d->minor = MINOR(rdev2->bdev->bd_dev); 1594 if (is_active) 1595 d->raid_disk = rdev2->raid_disk; 1596 else 1597 d->raid_disk = rdev2->desc_nr; /* compatibility */ 1598 if (test_bit(Faulty, &rdev2->flags)) 1599 d->state = (1<<MD_DISK_FAULTY); 1600 else if (is_active) { 1601 d->state = (1<<MD_DISK_ACTIVE); 1602 if (test_bit(In_sync, &rdev2->flags)) 1603 d->state |= (1<<MD_DISK_SYNC); 1604 active++; 1605 working++; 1606 } else { 1607 d->state = 0; 1608 spare++; 1609 working++; 1610 } 1611 if (test_bit(WriteMostly, &rdev2->flags)) 1612 d->state |= (1<<MD_DISK_WRITEMOSTLY); 1613 if (test_bit(FailFast, &rdev2->flags)) 1614 d->state |= (1<<MD_DISK_FAILFAST); 1615 } 1616 /* now set the "removed" and "faulty" bits on any missing devices */ 1617 for (i=0 ; i < mddev->raid_disks ; i++) { 1618 mdp_disk_t *d = &sb->disks[i]; 1619 if (d->state == 0 && d->number == 0) { 1620 d->number = i; 1621 d->raid_disk = i; 1622 d->state = (1<<MD_DISK_REMOVED); 1623 d->state |= (1<<MD_DISK_FAULTY); 1624 failed++; 1625 } 1626 } 1627 sb->nr_disks = nr_disks; 1628 sb->active_disks = active; 1629 sb->working_disks = working; 1630 sb->failed_disks = failed; 1631 sb->spare_disks = spare; 1632 1633 sb->this_disk = sb->disks[rdev->desc_nr]; 1634 sb->sb_csum = calc_sb_csum(sb); 1635 } 1636 1637 /* 1638 * rdev_size_change for 0.90.0 1639 */ 1640 static unsigned long long 1641 super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 1642 { 1643 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1644 return 0; /* component must fit device */ 1645 if (rdev->mddev->bitmap_info.offset) 1646 return 0; /* can't move bitmap */ 1647 rdev->sb_start = calc_dev_sboffset(rdev); 1648 if (!num_sectors || num_sectors > rdev->sb_start) 1649 num_sectors = rdev->sb_start; 1650 /* Limit to 4TB as metadata cannot record more than that. 1651 * 4TB == 2^32 KB, or 2*2^32 sectors. 
1652 */ 1653 if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) 1654 num_sectors = (sector_t)(2ULL << 32) - 2; 1655 do { 1656 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1657 rdev->sb_page); 1658 } while (md_super_wait(rdev->mddev) < 0); 1659 return num_sectors; 1660 } 1661 1662 static int 1663 super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) 1664 { 1665 /* non-zero offset changes not possible with v0.90 */ 1666 return new_offset == 0; 1667 } 1668 1669 /* 1670 * version 1 superblock 1671 */ 1672 1673 static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb) 1674 { 1675 __le32 disk_csum; 1676 u32 csum; 1677 unsigned long long newcsum; 1678 int size = 256 + le32_to_cpu(sb->max_dev)*2; 1679 __le32 *isuper = (__le32*)sb; 1680 1681 disk_csum = sb->sb_csum; 1682 sb->sb_csum = 0; 1683 newcsum = 0; 1684 for (; size >= 4; size -= 4) 1685 newcsum += le32_to_cpu(*isuper++); 1686 1687 if (size == 2) 1688 newcsum += le16_to_cpu(*(__le16*) isuper); 1689 1690 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 1691 sb->sb_csum = disk_csum; 1692 return cpu_to_le32(csum); 1693 } 1694 1695 static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 1696 { 1697 struct mdp_superblock_1 *sb; 1698 int ret; 1699 sector_t sb_start; 1700 sector_t sectors; 1701 int bmask; 1702 bool spare_disk = true; 1703 1704 /* 1705 * Calculate the position of the superblock in 512byte sectors. 1706 * It is always aligned to a 4K boundary and 1707 * depeding on minor_version, it can be: 1708 * 0: At least 8K, but less than 12K, from end of device 1709 * 1: At start of device 1710 * 2: 4K from start of device. 1711 */ 1712 switch(minor_version) { 1713 case 0: 1714 sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2; 1715 sb_start &= ~(sector_t)(4*2-1); 1716 break; 1717 case 1: 1718 sb_start = 0; 1719 break; 1720 case 2: 1721 sb_start = 8; 1722 break; 1723 default: 1724 return -EINVAL; 1725 } 1726 rdev->sb_start = sb_start; 1727 1728 /* superblock is rarely larger than 1K, but it can be larger, 1729 * and it is safe to read 4k, so we do that 1730 */ 1731 ret = read_disk_sb(rdev, 4096); 1732 if (ret) return ret; 1733 1734 sb = page_address(rdev->sb_page); 1735 1736 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1737 sb->major_version != cpu_to_le32(1) || 1738 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1739 le64_to_cpu(sb->super_offset) != rdev->sb_start || 1740 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1741 return -EINVAL; 1742 1743 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1744 pr_warn("md: invalid superblock checksum on %pg\n", 1745 rdev->bdev); 1746 return -EINVAL; 1747 } 1748 if (le64_to_cpu(sb->data_size) < 10) { 1749 pr_warn("md: data_size too small on %pg\n", 1750 rdev->bdev); 1751 return -EINVAL; 1752 } 1753 if (sb->pad0 || 1754 sb->pad3[0] || 1755 memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) 1756 /* Some padding is non-zero, might be a new feature */ 1757 return -EINVAL; 1758 1759 rdev->preferred_minor = 0xffff; 1760 rdev->data_offset = le64_to_cpu(sb->data_offset); 1761 rdev->new_data_offset = rdev->data_offset; 1762 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && 1763 (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) 1764 rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset); 1765 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1766 1767 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1768 bmask = 
queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 1769 if (rdev->sb_size & bmask) 1770 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1771 1772 if (minor_version 1773 && rdev->data_offset < sb_start + (rdev->sb_size/512)) 1774 return -EINVAL; 1775 if (minor_version 1776 && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) 1777 return -EINVAL; 1778 1779 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1780 1781 if (!rdev->bb_page) { 1782 rdev->bb_page = alloc_page(GFP_KERNEL); 1783 if (!rdev->bb_page) 1784 return -ENOMEM; 1785 } 1786 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && 1787 rdev->badblocks.count == 0) { 1788 /* need to load the bad block list. 1789 * Currently we limit it to one page. 1790 */ 1791 s32 offset; 1792 sector_t bb_sector; 1793 __le64 *bbp; 1794 int i; 1795 int sectors = le16_to_cpu(sb->bblog_size); 1796 if (sectors > (PAGE_SIZE / 512)) 1797 return -EINVAL; 1798 offset = le32_to_cpu(sb->bblog_offset); 1799 if (offset == 0) 1800 return -EINVAL; 1801 bb_sector = (long long)offset; 1802 if (!sync_page_io(rdev, bb_sector, sectors << 9, 1803 rdev->bb_page, REQ_OP_READ, true)) 1804 return -EIO; 1805 bbp = (__le64 *)page_address(rdev->bb_page); 1806 rdev->badblocks.shift = sb->bblog_shift; 1807 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { 1808 u64 bb = le64_to_cpu(*bbp); 1809 int count = bb & (0x3ff); 1810 u64 sector = bb >> 10; 1811 sector <<= sb->bblog_shift; 1812 count <<= sb->bblog_shift; 1813 if (bb + 1 == 0) 1814 break; 1815 if (!badblocks_set(&rdev->badblocks, sector, count, 1)) 1816 return -EINVAL; 1817 } 1818 } else if (sb->bblog_offset != 0) 1819 rdev->badblocks.shift = 0; 1820 1821 if ((le32_to_cpu(sb->feature_map) & 1822 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) { 1823 rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset); 1824 rdev->ppl.size = le16_to_cpu(sb->ppl.size); 1825 rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset; 1826 } 1827 1828 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) && 1829 sb->level != 0) 1830 return -EINVAL; 1831 1832 /* not spare disk */ 1833 if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) && 1834 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || 1835 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) 1836 spare_disk = false; 1837 1838 if (!refdev) { 1839 if (!spare_disk) 1840 ret = 1; 1841 else 1842 ret = 0; 1843 } else { 1844 __u64 ev1, ev2; 1845 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); 1846 1847 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1848 sb->level != refsb->level || 1849 sb->layout != refsb->layout || 1850 sb->chunksize != refsb->chunksize) { 1851 pr_warn("md: %pg has strangely different superblock to %pg\n", 1852 rdev->bdev, 1853 refdev->bdev); 1854 return -EINVAL; 1855 } 1856 ev1 = le64_to_cpu(sb->events); 1857 ev2 = le64_to_cpu(refsb->events); 1858 1859 if (!spare_disk && ev1 > ev2) 1860 ret = 1; 1861 else 1862 ret = 0; 1863 } 1864 if (minor_version) 1865 sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; 1866 else 1867 sectors = rdev->sb_start; 1868 if (sectors < le64_to_cpu(sb->data_size)) 1869 return -EINVAL; 1870 rdev->sectors = le64_to_cpu(sb->data_size); 1871 return ret; 1872 } 1873 1874 static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) 1875 { 1876 struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 1877 __u64 ev1 = le64_to_cpu(sb->events); 1878 int role; 1879 1880 rdev->raid_disk = -1; 1881 clear_bit(Faulty, &rdev->flags); 1882 
clear_bit(In_sync, &rdev->flags); 1883 clear_bit(Bitmap_sync, &rdev->flags); 1884 clear_bit(WriteMostly, &rdev->flags); 1885 1886 if (mddev->raid_disks == 0) { 1887 mddev->major_version = 1; 1888 mddev->patch_version = 0; 1889 mddev->external = 0; 1890 mddev->chunk_sectors = le32_to_cpu(sb->chunksize); 1891 mddev->ctime = le64_to_cpu(sb->ctime); 1892 mddev->utime = le64_to_cpu(sb->utime); 1893 mddev->level = le32_to_cpu(sb->level); 1894 mddev->clevel[0] = 0; 1895 mddev->layout = le32_to_cpu(sb->layout); 1896 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1897 mddev->dev_sectors = le64_to_cpu(sb->size); 1898 mddev->events = ev1; 1899 mddev->bitmap_info.offset = 0; 1900 mddev->bitmap_info.space = 0; 1901 /* Default location for bitmap is 1K after superblock 1902 * using 3K - total of 4K 1903 */ 1904 mddev->bitmap_info.default_offset = 1024 >> 9; 1905 mddev->bitmap_info.default_space = (4096-1024) >> 9; 1906 mddev->reshape_backwards = 0; 1907 1908 mddev->resync_offset = le64_to_cpu(sb->resync_offset); 1909 memcpy(mddev->uuid, sb->set_uuid, 16); 1910 1911 mddev->max_disks = (4096-256)/2; 1912 1913 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1914 mddev->bitmap_info.file == NULL) { 1915 mddev->bitmap_info.offset = 1916 (__s32)le32_to_cpu(sb->bitmap_offset); 1917 /* Metadata doesn't record how much space is available. 1918 * For 1.0, we assume we can use up to the superblock 1919 * if before, else to 4K beyond superblock. 1920 * For others, assume no change is possible. 1921 */ 1922 if (mddev->minor_version > 0) 1923 mddev->bitmap_info.space = 0; 1924 else if (mddev->bitmap_info.offset > 0) 1925 mddev->bitmap_info.space = 1926 8 - mddev->bitmap_info.offset; 1927 else 1928 mddev->bitmap_info.space = 1929 -mddev->bitmap_info.offset; 1930 } 1931 1932 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 1933 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 1934 mddev->delta_disks = le32_to_cpu(sb->delta_disks); 1935 mddev->new_level = le32_to_cpu(sb->new_level); 1936 mddev->new_layout = le32_to_cpu(sb->new_layout); 1937 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); 1938 if (mddev->delta_disks < 0 || 1939 (mddev->delta_disks == 0 && 1940 (le32_to_cpu(sb->feature_map) 1941 & MD_FEATURE_RESHAPE_BACKWARDS))) 1942 mddev->reshape_backwards = 1; 1943 } else { 1944 mddev->reshape_position = MaxSector; 1945 mddev->delta_disks = 0; 1946 mddev->new_level = mddev->level; 1947 mddev->new_layout = mddev->layout; 1948 mddev->new_chunk_sectors = mddev->chunk_sectors; 1949 } 1950 1951 if (mddev->level == 0 && 1952 !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT)) 1953 mddev->layout = -1; 1954 1955 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) 1956 set_bit(MD_HAS_JOURNAL, &mddev->flags); 1957 1958 if (le32_to_cpu(sb->feature_map) & 1959 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) { 1960 if (le32_to_cpu(sb->feature_map) & 1961 (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL)) 1962 return -EINVAL; 1963 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) && 1964 (le32_to_cpu(sb->feature_map) & 1965 MD_FEATURE_MULTIPLE_PPLS)) 1966 return -EINVAL; 1967 set_bit(MD_HAS_PPL, &mddev->flags); 1968 } 1969 } else if (mddev->pers == NULL) { 1970 /* Insist of good event counter while assembling, except for 1971 * spares (which don't need an event count). 1972 * Similar to mdadm, we allow event counter difference of 1 1973 * from the freshest device. 
1974 */ 1975 if (rdev->desc_nr >= 0 && 1976 rdev->desc_nr < le32_to_cpu(sb->max_dev) && 1977 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || 1978 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) 1979 if (ev1 + 1 < mddev->events) 1980 return -EINVAL; 1981 } else if (mddev->bitmap) { 1982 /* If adding to array with a bitmap, then we can accept an 1983 * older device, but not too old. 1984 */ 1985 if (ev1 < md_bitmap_events_cleared(mddev)) 1986 return 0; 1987 if (ev1 < mddev->events) 1988 set_bit(Bitmap_sync, &rdev->flags); 1989 } else { 1990 if (ev1 < mddev->events) 1991 /* just a hot-add of a new device, leave raid_disk at -1 */ 1992 return 0; 1993 } 1994 1995 if (rdev->desc_nr < 0 || 1996 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { 1997 role = MD_DISK_ROLE_SPARE; 1998 rdev->desc_nr = -1; 1999 } else if (mddev->pers == NULL && freshest && ev1 < mddev->events) { 2000 /* 2001 * If we are assembling, and our event counter is smaller than the 2002 * highest event counter, we cannot trust our superblock about the role. 2003 * It could happen that our rdev was marked as Faulty, and all other 2004 * superblocks were updated with +1 event counter. 2005 * Then, before the next superblock update, which typically happens when 2006 * remove_and_add_spares() removes the device from the array, there was 2007 * a crash or reboot. 2008 * If we allow current rdev without consulting the freshest superblock, 2009 * we could cause data corruption. 2010 * Note that in this case our event counter is smaller by 1 than the 2011 * highest, otherwise, this rdev would not be allowed into array; 2012 * both kernel and mdadm allow event counter difference of 1. 2013 */ 2014 struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page); 2015 u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev); 2016 2017 if (rdev->desc_nr >= freshest_max_dev) { 2018 /* this is unexpected, better not proceed */ 2019 pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n", 2020 mdname(mddev), rdev->bdev, rdev->desc_nr, 2021 freshest->bdev, freshest_max_dev); 2022 return -EUCLEAN; 2023 } 2024 2025 role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]); 2026 pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n", 2027 mdname(mddev), rdev->bdev, role, role, freshest->bdev); 2028 } else { 2029 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 2030 } 2031 switch (role) { 2032 case MD_DISK_ROLE_SPARE: /* spare */ 2033 break; 2034 case MD_DISK_ROLE_FAULTY: /* faulty */ 2035 set_bit(Faulty, &rdev->flags); 2036 break; 2037 case MD_DISK_ROLE_JOURNAL: /* journal device */ 2038 if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { 2039 /* journal device without journal feature */ 2040 pr_warn("md: journal device provided without journal feature, ignoring the device\n"); 2041 return -EINVAL; 2042 } 2043 set_bit(Journal, &rdev->flags); 2044 rdev->journal_tail = le64_to_cpu(sb->journal_tail); 2045 rdev->raid_disk = 0; 2046 break; 2047 default: 2048 rdev->saved_raid_disk = role; 2049 if ((le32_to_cpu(sb->feature_map) & 2050 MD_FEATURE_RECOVERY_OFFSET)) { 2051 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 2052 if (!(le32_to_cpu(sb->feature_map) & 2053 MD_FEATURE_RECOVERY_BITMAP)) 2054 rdev->saved_raid_disk = -1; 2055 } else { 2056 /* 2057 * If the array is FROZEN, then the device can't 2058 * be in_sync with rest of array. 
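* Leave In_sync clear in that case; recovery brings the device back
* up to date once the array is unfrozen.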
2059 */ 2060 if (!test_bit(MD_RECOVERY_FROZEN, 2061 &mddev->recovery)) 2062 set_bit(In_sync, &rdev->flags); 2063 } 2064 rdev->raid_disk = role; 2065 break; 2066 } 2067 if (sb->devflags & WriteMostly1) 2068 set_bit(WriteMostly, &rdev->flags); 2069 if (sb->devflags & FailFast1) 2070 set_bit(FailFast, &rdev->flags); 2071 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) 2072 set_bit(Replacement, &rdev->flags); 2073 2074 return 0; 2075 } 2076 2077 static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) 2078 { 2079 struct mdp_superblock_1 *sb; 2080 struct md_rdev *rdev2; 2081 int max_dev, i; 2082 /* make rdev->sb match mddev and rdev data. */ 2083 2084 sb = page_address(rdev->sb_page); 2085 2086 sb->feature_map = 0; 2087 sb->pad0 = 0; 2088 sb->recovery_offset = cpu_to_le64(0); 2089 memset(sb->pad3, 0, sizeof(sb->pad3)); 2090 2091 sb->utime = cpu_to_le64((__u64)mddev->utime); 2092 sb->events = cpu_to_le64(mddev->events); 2093 if (mddev->in_sync) 2094 sb->resync_offset = cpu_to_le64(mddev->resync_offset); 2095 else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) 2096 sb->resync_offset = cpu_to_le64(MaxSector); 2097 else 2098 sb->resync_offset = cpu_to_le64(0); 2099 2100 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); 2101 2102 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 2103 sb->size = cpu_to_le64(mddev->dev_sectors); 2104 sb->chunksize = cpu_to_le32(mddev->chunk_sectors); 2105 sb->level = cpu_to_le32(mddev->level); 2106 sb->layout = cpu_to_le32(mddev->layout); 2107 if (test_bit(FailFast, &rdev->flags)) 2108 sb->devflags |= FailFast1; 2109 else 2110 sb->devflags &= ~FailFast1; 2111 2112 if (test_bit(WriteMostly, &rdev->flags)) 2113 sb->devflags |= WriteMostly1; 2114 else 2115 sb->devflags &= ~WriteMostly1; 2116 sb->data_offset = cpu_to_le64(rdev->data_offset); 2117 sb->data_size = cpu_to_le64(rdev->sectors); 2118 2119 if (mddev->bitmap && mddev->bitmap_info.file == NULL) { 2120 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); 2121 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 2122 } 2123 2124 if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && 2125 !test_bit(In_sync, &rdev->flags)) { 2126 sb->feature_map |= 2127 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 2128 sb->recovery_offset = 2129 cpu_to_le64(rdev->recovery_offset); 2130 if (rdev->saved_raid_disk >= 0 && mddev->bitmap) 2131 sb->feature_map |= 2132 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); 2133 } 2134 /* Note: recovery_offset and journal_tail share space */ 2135 if (test_bit(Journal, &rdev->flags)) 2136 sb->journal_tail = cpu_to_le64(rdev->journal_tail); 2137 if (test_bit(Replacement, &rdev->flags)) 2138 sb->feature_map |= 2139 cpu_to_le32(MD_FEATURE_REPLACEMENT); 2140 2141 if (mddev->reshape_position != MaxSector) { 2142 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 2143 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 2144 sb->new_layout = cpu_to_le32(mddev->new_layout); 2145 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 2146 sb->new_level = cpu_to_le32(mddev->new_level); 2147 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 2148 if (mddev->delta_disks == 0 && 2149 mddev->reshape_backwards) 2150 sb->feature_map 2151 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); 2152 if (rdev->new_data_offset != rdev->data_offset) { 2153 sb->feature_map 2154 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); 2155 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset 2156 - rdev->data_offset)); 2157 } 2158 } 2159 2160 if 
(mddev_is_clustered(mddev)) 2161 sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); 2162 2163 if (rdev->badblocks.count == 0) 2164 /* Nothing to do for bad blocks*/ ; 2165 else if (sb->bblog_offset == 0) 2166 /* Cannot record bad blocks on this device */ 2167 md_error(mddev, rdev); 2168 else { 2169 struct badblocks *bb = &rdev->badblocks; 2170 __le64 *bbp = (__le64 *)page_address(rdev->bb_page); 2171 u64 *p = bb->page; 2172 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); 2173 if (bb->changed) { 2174 unsigned seq; 2175 2176 retry: 2177 seq = read_seqbegin(&bb->lock); 2178 2179 memset(bbp, 0xff, PAGE_SIZE); 2180 2181 for (i = 0 ; i < bb->count ; i++) { 2182 u64 internal_bb = p[i]; 2183 u64 store_bb = ((BB_OFFSET(internal_bb) << 10) 2184 | BB_LEN(internal_bb)); 2185 bbp[i] = cpu_to_le64(store_bb); 2186 } 2187 bb->changed = 0; 2188 if (read_seqretry(&bb->lock, seq)) 2189 goto retry; 2190 2191 bb->sector = (rdev->sb_start + 2192 (int)le32_to_cpu(sb->bblog_offset)); 2193 bb->size = le16_to_cpu(sb->bblog_size); 2194 } 2195 } 2196 2197 max_dev = 0; 2198 rdev_for_each(rdev2, mddev) 2199 if (rdev2->desc_nr+1 > max_dev) 2200 max_dev = rdev2->desc_nr+1; 2201 2202 if (max_dev > le32_to_cpu(sb->max_dev)) { 2203 int bmask; 2204 sb->max_dev = cpu_to_le32(max_dev); 2205 rdev->sb_size = max_dev * 2 + 256; 2206 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 2207 if (rdev->sb_size & bmask) 2208 rdev->sb_size = (rdev->sb_size | bmask) + 1; 2209 } else 2210 max_dev = le32_to_cpu(sb->max_dev); 2211 2212 for (i=0; i<max_dev;i++) 2213 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 2214 2215 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) 2216 sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); 2217 2218 if (test_bit(MD_HAS_PPL, &mddev->flags)) { 2219 if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags)) 2220 sb->feature_map |= 2221 cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS); 2222 else 2223 sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); 2224 sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); 2225 sb->ppl.size = cpu_to_le16(rdev->ppl.size); 2226 } 2227 2228 rdev_for_each(rdev2, mddev) { 2229 i = rdev2->desc_nr; 2230 if (test_bit(Faulty, &rdev2->flags)) 2231 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); 2232 else if (test_bit(In_sync, &rdev2->flags)) 2233 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 2234 else if (test_bit(Journal, &rdev2->flags)) 2235 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); 2236 else if (rdev2->raid_disk >= 0) 2237 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 2238 else 2239 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 2240 } 2241 2242 sb->sb_csum = calc_sb_1_csum(sb); 2243 } 2244 2245 static sector_t super_1_choose_bm_space(sector_t dev_size) 2246 { 2247 sector_t bm_space; 2248 2249 /* if the device is bigger than 8Gig, save 64k for bitmap 2250 * usage, if bigger than 200Gig, save 128k 2251 */ 2252 if (dev_size < 64*2) 2253 bm_space = 0; 2254 else if (dev_size - 64*2 >= 200*1024*1024*2) 2255 bm_space = 128*2; 2256 else if (dev_size - 4*2 > 8*1024*1024*2) 2257 bm_space = 64*2; 2258 else 2259 bm_space = 4*2; 2260 return bm_space; 2261 } 2262 2263 static unsigned long long 2264 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 2265 { 2266 struct mdp_superblock_1 *sb; 2267 sector_t max_sectors; 2268 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 2269 return 0; /* component must fit device */ 2270 if (rdev->data_offset != rdev->new_data_offset) 2271 return 0; /* too confusing */ 2272 if (rdev->sb_start < 
rdev->data_offset) { 2273 /* minor versions 1 and 2; superblock before data */ 2274 max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; 2275 if (!num_sectors || num_sectors > max_sectors) 2276 num_sectors = max_sectors; 2277 } else if (rdev->mddev->bitmap_info.offset) { 2278 /* minor version 0 with bitmap we can't move */ 2279 return 0; 2280 } else { 2281 /* minor version 0; superblock after data */ 2282 sector_t sb_start, bm_space; 2283 sector_t dev_size = bdev_nr_sectors(rdev->bdev); 2284 2285 /* 8K is for superblock */ 2286 sb_start = dev_size - 8*2; 2287 sb_start &= ~(sector_t)(4*2 - 1); 2288 2289 bm_space = super_1_choose_bm_space(dev_size); 2290 2291 /* Space that can be used to store date needs to decrease 2292 * superblock bitmap space and bad block space(4K) 2293 */ 2294 max_sectors = sb_start - bm_space - 4*2; 2295 2296 if (!num_sectors || num_sectors > max_sectors) 2297 num_sectors = max_sectors; 2298 rdev->sb_start = sb_start; 2299 } 2300 sb = page_address(rdev->sb_page); 2301 sb->data_size = cpu_to_le64(num_sectors); 2302 sb->super_offset = cpu_to_le64(rdev->sb_start); 2303 sb->sb_csum = calc_sb_1_csum(sb); 2304 do { 2305 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 2306 rdev->sb_page); 2307 } while (md_super_wait(rdev->mddev) < 0); 2308 return num_sectors; 2309 2310 } 2311 2312 static int 2313 super_1_allow_new_offset(struct md_rdev *rdev, 2314 unsigned long long new_offset) 2315 { 2316 /* All necessary checks on new >= old have been done */ 2317 if (new_offset >= rdev->data_offset) 2318 return 1; 2319 2320 /* with 1.0 metadata, there is no metadata to tread on 2321 * so we can always move back */ 2322 if (rdev->mddev->minor_version == 0) 2323 return 1; 2324 2325 /* otherwise we must be sure not to step on 2326 * any metadata, so stay: 2327 * 36K beyond start of superblock 2328 * beyond end of badblocks 2329 * beyond write-intent bitmap 2330 */ 2331 if (rdev->sb_start + (32+4)*2 > new_offset) 2332 return 0; 2333 2334 if (!rdev->mddev->bitmap_info.file) { 2335 struct mddev *mddev = rdev->mddev; 2336 struct md_bitmap_stats stats; 2337 int err; 2338 2339 err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 2340 if (!err && rdev->sb_start + mddev->bitmap_info.offset + 2341 stats.file_pages * (PAGE_SIZE >> 9) > new_offset) 2342 return 0; 2343 } 2344 2345 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) 2346 return 0; 2347 2348 return 1; 2349 } 2350 2351 static struct super_type super_types[] = { 2352 [0] = { 2353 .name = "0.90.0", 2354 .owner = THIS_MODULE, 2355 .load_super = super_90_load, 2356 .validate_super = super_90_validate, 2357 .sync_super = super_90_sync, 2358 .rdev_size_change = super_90_rdev_size_change, 2359 .allow_new_offset = super_90_allow_new_offset, 2360 }, 2361 [1] = { 2362 .name = "md-1", 2363 .owner = THIS_MODULE, 2364 .load_super = super_1_load, 2365 .validate_super = super_1_validate, 2366 .sync_super = super_1_sync, 2367 .rdev_size_change = super_1_rdev_size_change, 2368 .allow_new_offset = super_1_allow_new_offset, 2369 }, 2370 }; 2371 2372 static void sync_super(struct mddev *mddev, struct md_rdev *rdev) 2373 { 2374 if (mddev->sync_super) { 2375 mddev->sync_super(mddev, rdev); 2376 return; 2377 } 2378 2379 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); 2380 2381 super_types[mddev->major_version].sync_super(mddev, rdev); 2382 } 2383 2384 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) 2385 { 2386 struct md_rdev *rdev, *rdev2; 2387 2388 rcu_read_lock(); 2389 
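/* Descriptive note: two arrays are considered to share a unit if any
 * active (non-faulty, non-journal) member of one sits on the same
 * gendisk as an active member of the other.
 */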
rdev_for_each_rcu(rdev, mddev1) { 2390 if (test_bit(Faulty, &rdev->flags) || 2391 test_bit(Journal, &rdev->flags) || 2392 rdev->raid_disk == -1) 2393 continue; 2394 rdev_for_each_rcu(rdev2, mddev2) { 2395 if (test_bit(Faulty, &rdev2->flags) || 2396 test_bit(Journal, &rdev2->flags) || 2397 rdev2->raid_disk == -1) 2398 continue; 2399 if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) { 2400 rcu_read_unlock(); 2401 return 1; 2402 } 2403 } 2404 } 2405 rcu_read_unlock(); 2406 return 0; 2407 } 2408 2409 static LIST_HEAD(pending_raid_disks); 2410 2411 /* 2412 * Try to register data integrity profile for an mddev 2413 * 2414 * This is called when an array is started and after a disk has been kicked 2415 * from the array. It only succeeds if all working and active component devices 2416 * are integrity capable with matching profiles. 2417 */ 2418 int md_integrity_register(struct mddev *mddev) 2419 { 2420 if (list_empty(&mddev->disks)) 2421 return 0; /* nothing to do */ 2422 if (mddev_is_dm(mddev) || !blk_get_integrity(mddev->gendisk)) 2423 return 0; /* shouldn't register */ 2424 2425 pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); 2426 return 0; 2427 } 2428 EXPORT_SYMBOL(md_integrity_register); 2429 2430 static bool rdev_read_only(struct md_rdev *rdev) 2431 { 2432 return bdev_read_only(rdev->bdev) || 2433 (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev)); 2434 } 2435 2436 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) 2437 { 2438 char b[BDEVNAME_SIZE]; 2439 int err; 2440 2441 /* prevent duplicates */ 2442 if (find_rdev(mddev, rdev->bdev->bd_dev)) 2443 return -EEXIST; 2444 2445 if (rdev_read_only(rdev) && mddev->pers) 2446 return -EROFS; 2447 2448 /* make sure rdev->sectors exceeds mddev->dev_sectors */ 2449 if (!test_bit(Journal, &rdev->flags) && 2450 rdev->sectors && 2451 (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { 2452 if (mddev->pers) { 2453 /* Cannot change size, so fail 2454 * If mddev->level <= 0, then we don't care 2455 * about aligning sizes (e.g. linear) 2456 */ 2457 if (mddev->level > 0) 2458 return -ENOSPC; 2459 } else 2460 mddev->dev_sectors = rdev->sectors; 2461 } 2462 2463 /* Verify rdev->desc_nr is unique. 
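* (desc_nr is the index used for sb->dev_roles[] in v1.x metadata.)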
2464 * If it is -1, assign a free number, else 2465 * check number is not in use 2466 */ 2467 rcu_read_lock(); 2468 if (rdev->desc_nr < 0) { 2469 int choice = 0; 2470 if (mddev->pers) 2471 choice = mddev->raid_disks; 2472 while (md_find_rdev_nr_rcu(mddev, choice)) 2473 choice++; 2474 rdev->desc_nr = choice; 2475 } else { 2476 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { 2477 rcu_read_unlock(); 2478 return -EBUSY; 2479 } 2480 } 2481 rcu_read_unlock(); 2482 if (!test_bit(Journal, &rdev->flags) && 2483 mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { 2484 pr_warn("md: %s: array is limited to %d devices\n", 2485 mdname(mddev), mddev->max_disks); 2486 return -EBUSY; 2487 } 2488 snprintf(b, sizeof(b), "%pg", rdev->bdev); 2489 strreplace(b, '/', '!'); 2490 2491 rdev->mddev = mddev; 2492 pr_debug("md: bind<%s>\n", b); 2493 2494 if (mddev->raid_disks) 2495 mddev_create_serial_pool(mddev, rdev); 2496 2497 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 2498 goto fail; 2499 2500 /* failure here is OK */ 2501 err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block"); 2502 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); 2503 rdev->sysfs_unack_badblocks = 2504 sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks"); 2505 rdev->sysfs_badblocks = 2506 sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks"); 2507 2508 list_add_rcu(&rdev->same_set, &mddev->disks); 2509 bd_link_disk_holder(rdev->bdev, mddev->gendisk); 2510 2511 /* May as well allow recovery to be retried once */ 2512 mddev->recovery_disabled++; 2513 2514 return 0; 2515 2516 fail: 2517 pr_warn("md: failed to register dev-%s for %s\n", 2518 b, mdname(mddev)); 2519 mddev_destroy_serial_pool(mddev, rdev); 2520 return err; 2521 } 2522 2523 void md_autodetect_dev(dev_t dev); 2524 2525 /* just for claiming the bdev */ 2526 static struct md_rdev claim_rdev; 2527 2528 static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) 2529 { 2530 pr_debug("md: export_rdev(%pg)\n", rdev->bdev); 2531 md_rdev_clear(rdev); 2532 #ifndef MODULE 2533 if (test_bit(AutoDetected, &rdev->flags)) 2534 md_autodetect_dev(rdev->bdev->bd_dev); 2535 #endif 2536 fput(rdev->bdev_file); 2537 rdev->bdev = NULL; 2538 kobject_put(&rdev->kobj); 2539 } 2540 2541 static void md_kick_rdev_from_array(struct md_rdev *rdev) 2542 { 2543 struct mddev *mddev = rdev->mddev; 2544 2545 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); 2546 list_del_rcu(&rdev->same_set); 2547 pr_debug("md: unbind<%pg>\n", rdev->bdev); 2548 mddev_destroy_serial_pool(rdev->mddev, rdev); 2549 WRITE_ONCE(rdev->mddev, NULL); 2550 sysfs_remove_link(&rdev->kobj, "block"); 2551 sysfs_put(rdev->sysfs_state); 2552 sysfs_put(rdev->sysfs_unack_badblocks); 2553 sysfs_put(rdev->sysfs_badblocks); 2554 rdev->sysfs_state = NULL; 2555 rdev->sysfs_unack_badblocks = NULL; 2556 rdev->sysfs_badblocks = NULL; 2557 rdev->badblocks.count = 0; 2558 2559 synchronize_rcu(); 2560 2561 /* 2562 * kobject_del() will wait for all in progress writers to be done, where 2563 * reconfig_mutex is held, hence it can't be called under 2564 * reconfig_mutex and it's delayed to mddev_unlock(). 
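* (Until then the rdev is parked on the mddev->deleting list below.)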
2565 */ 2566 list_add(&rdev->same_set, &mddev->deleting); 2567 } 2568 2569 static void export_array(struct mddev *mddev) 2570 { 2571 struct md_rdev *rdev; 2572 2573 while (!list_empty(&mddev->disks)) { 2574 rdev = list_first_entry(&mddev->disks, struct md_rdev, 2575 same_set); 2576 md_kick_rdev_from_array(rdev); 2577 } 2578 mddev->raid_disks = 0; 2579 mddev->major_version = 0; 2580 } 2581 2582 static bool set_in_sync(struct mddev *mddev) 2583 { 2584 lockdep_assert_held(&mddev->lock); 2585 if (!mddev->in_sync) { 2586 mddev->sync_checkers++; 2587 spin_unlock(&mddev->lock); 2588 percpu_ref_switch_to_atomic_sync(&mddev->writes_pending); 2589 spin_lock(&mddev->lock); 2590 if (!mddev->in_sync && 2591 percpu_ref_is_zero(&mddev->writes_pending)) { 2592 mddev->in_sync = 1; 2593 /* 2594 * Ensure ->in_sync is visible before we clear 2595 * ->sync_checkers. 2596 */ 2597 smp_mb(); 2598 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2599 sysfs_notify_dirent_safe(mddev->sysfs_state); 2600 } 2601 if (--mddev->sync_checkers == 0) 2602 percpu_ref_switch_to_percpu(&mddev->writes_pending); 2603 } 2604 if (mddev->safemode == 1) 2605 mddev->safemode = 0; 2606 return mddev->in_sync; 2607 } 2608 2609 static void sync_sbs(struct mddev *mddev, int nospares) 2610 { 2611 /* Update each superblock (in-memory image), but 2612 * if we are allowed to, skip spares which already 2613 * have the right event counter, or have one earlier 2614 * (which would mean they aren't being marked as dirty 2615 * with the rest of the array) 2616 */ 2617 struct md_rdev *rdev; 2618 rdev_for_each(rdev, mddev) { 2619 if (rdev->sb_events == mddev->events || 2620 (nospares && 2621 rdev->raid_disk < 0 && 2622 rdev->sb_events+1 == mddev->events)) { 2623 /* Don't update this superblock */ 2624 rdev->sb_loaded = 2; 2625 } else { 2626 sync_super(mddev, rdev); 2627 rdev->sb_loaded = 1; 2628 } 2629 } 2630 } 2631 2632 static bool does_sb_need_changing(struct mddev *mddev) 2633 { 2634 struct md_rdev *rdev = NULL, *iter; 2635 struct mdp_superblock_1 *sb; 2636 int role; 2637 2638 /* Find a good rdev */ 2639 rdev_for_each(iter, mddev) 2640 if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) { 2641 rdev = iter; 2642 break; 2643 } 2644 2645 /* No good device found. */ 2646 if (!rdev) 2647 return false; 2648 2649 sb = page_address(rdev->sb_page); 2650 /* Check if a device has become faulty or a spare become active */ 2651 rdev_for_each(rdev, mddev) { 2652 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 2653 /* Device activated? */ 2654 if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 && 2655 !test_bit(Faulty, &rdev->flags)) 2656 return true; 2657 /* Device turned faulty? 
*/ 2658 if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX)) 2659 return true; 2660 } 2661 2662 /* Check if any mddev parameters have changed */ 2663 if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || 2664 (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || 2665 (mddev->layout != le32_to_cpu(sb->layout)) || 2666 (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || 2667 (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) 2668 return true; 2669 2670 return false; 2671 } 2672 2673 void md_update_sb(struct mddev *mddev, int force_change) 2674 { 2675 struct md_rdev *rdev; 2676 int sync_req; 2677 int nospares = 0; 2678 int any_badblocks_changed = 0; 2679 int ret = -1; 2680 2681 if (!md_is_rdwr(mddev)) { 2682 if (force_change) 2683 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2684 return; 2685 } 2686 2687 repeat: 2688 if (mddev_is_clustered(mddev)) { 2689 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2690 force_change = 1; 2691 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 2692 nospares = 1; 2693 ret = mddev->cluster_ops->metadata_update_start(mddev); 2694 /* Has someone else has updated the sb */ 2695 if (!does_sb_need_changing(mddev)) { 2696 if (ret == 0) 2697 mddev->cluster_ops->metadata_update_cancel(mddev); 2698 bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2699 BIT(MD_SB_CHANGE_DEVS) | 2700 BIT(MD_SB_CHANGE_CLEAN)); 2701 return; 2702 } 2703 } 2704 2705 /* 2706 * First make sure individual recovery_offsets are correct 2707 * curr_resync_completed can only be used during recovery. 2708 * During reshape/resync it might use array-addresses rather 2709 * that device addresses. 2710 */ 2711 rdev_for_each(rdev, mddev) { 2712 if (rdev->raid_disk >= 0 && 2713 mddev->delta_disks >= 0 && 2714 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 2715 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && 2716 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 2717 !test_bit(Journal, &rdev->flags) && 2718 !test_bit(In_sync, &rdev->flags) && 2719 mddev->curr_resync_completed > rdev->recovery_offset) 2720 rdev->recovery_offset = mddev->curr_resync_completed; 2721 2722 } 2723 if (!mddev->persistent) { 2724 clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2725 clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2726 if (!mddev->external) { 2727 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 2728 rdev_for_each(rdev, mddev) { 2729 if (rdev->badblocks.changed) { 2730 rdev->badblocks.changed = 0; 2731 ack_all_badblocks(&rdev->badblocks); 2732 md_error(mddev, rdev); 2733 } 2734 clear_bit(Blocked, &rdev->flags); 2735 clear_bit(BlockedBadBlocks, &rdev->flags); 2736 wake_up(&rdev->blocked_wait); 2737 } 2738 } 2739 wake_up(&mddev->sb_wait); 2740 return; 2741 } 2742 2743 spin_lock(&mddev->lock); 2744 2745 mddev->utime = ktime_get_real_seconds(); 2746 2747 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2748 force_change = 1; 2749 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 2750 /* just a clean<-> dirty transition, possibly leave spares alone, 2751 * though if events isn't the right even/odd, we will have to do 2752 * spares after all 2753 */ 2754 nospares = 1; 2755 if (force_change) 2756 nospares = 0; 2757 if (mddev->degraded) 2758 /* If the array is degraded, then skipping spares is both 2759 * dangerous and fairly pointless. 2760 * Dangerous because a device that was removed from the array 2761 * might have a event_count that still looks up-to-date, 2762 * so it can be re-added without a resync. 
2763 * Pointless because if there are any spares to skip, 2764 * then a recovery will happen and soon that array won't 2765 * be degraded any more and the spare can go back to sleep then. 2766 */ 2767 nospares = 0; 2768 2769 sync_req = mddev->in_sync; 2770 2771 /* If this is just a dirty<->clean transition, and the array is clean 2772 * and 'events' is odd, we can roll back to the previous clean state */ 2773 if (nospares 2774 && (mddev->in_sync && mddev->resync_offset == MaxSector) 2775 && mddev->can_decrease_events 2776 && mddev->events != 1) { 2777 mddev->events--; 2778 mddev->can_decrease_events = 0; 2779 } else { 2780 /* otherwise we have to go forward and ... */ 2781 mddev->events ++; 2782 mddev->can_decrease_events = nospares; 2783 } 2784 2785 /* 2786 * This 64-bit counter should never wrap. 2787 * Either we are in around ~1 trillion A.C., assuming 2788 * 1 reboot per second, or we have a bug... 2789 */ 2790 WARN_ON(mddev->events == 0); 2791 2792 rdev_for_each(rdev, mddev) { 2793 if (rdev->badblocks.changed) 2794 any_badblocks_changed++; 2795 if (test_bit(Faulty, &rdev->flags)) 2796 set_bit(FaultRecorded, &rdev->flags); 2797 } 2798 2799 sync_sbs(mddev, nospares); 2800 spin_unlock(&mddev->lock); 2801 2802 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", 2803 mdname(mddev), mddev->in_sync); 2804 2805 mddev_add_trace_msg(mddev, "md md_update_sb"); 2806 rewrite: 2807 mddev->bitmap_ops->update_sb(mddev->bitmap); 2808 rdev_for_each(rdev, mddev) { 2809 if (rdev->sb_loaded != 1) 2810 continue; /* no noise on spare devices */ 2811 2812 if (!test_bit(Faulty, &rdev->flags)) { 2813 md_super_write(mddev,rdev, 2814 rdev->sb_start, rdev->sb_size, 2815 rdev->sb_page); 2816 pr_debug("md: (write) %pg's sb offset: %llu\n", 2817 rdev->bdev, 2818 (unsigned long long)rdev->sb_start); 2819 rdev->sb_events = mddev->events; 2820 if (rdev->badblocks.size) { 2821 md_super_write(mddev, rdev, 2822 rdev->badblocks.sector, 2823 rdev->badblocks.size << 9, 2824 rdev->bb_page); 2825 rdev->badblocks.size = 0; 2826 } 2827 2828 } else 2829 pr_debug("md: %pg (skipping faulty)\n", 2830 rdev->bdev); 2831 } 2832 if (md_super_wait(mddev) < 0) 2833 goto rewrite; 2834 /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ 2835 2836 if (mddev_is_clustered(mddev) && ret == 0) 2837 mddev->cluster_ops->metadata_update_finish(mddev); 2838 2839 if (mddev->in_sync != sync_req || 2840 !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2841 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN))) 2842 /* have to write it out again */ 2843 goto repeat; 2844 wake_up(&mddev->sb_wait); 2845 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2846 sysfs_notify_dirent_safe(mddev->sysfs_completed); 2847 2848 rdev_for_each(rdev, mddev) { 2849 if (test_and_clear_bit(FaultRecorded, &rdev->flags)) 2850 clear_bit(Blocked, &rdev->flags); 2851 2852 if (any_badblocks_changed) 2853 ack_all_badblocks(&rdev->badblocks); 2854 clear_bit(BlockedBadBlocks, &rdev->flags); 2855 wake_up(&rdev->blocked_wait); 2856 } 2857 } 2858 EXPORT_SYMBOL(md_update_sb); 2859 2860 static int add_bound_rdev(struct md_rdev *rdev) 2861 { 2862 struct mddev *mddev = rdev->mddev; 2863 int err = 0; 2864 bool add_journal = test_bit(Journal, &rdev->flags); 2865 2866 if (!mddev->pers->hot_remove_disk || add_journal) { 2867 /* If there is hot_add_disk but no hot_remove_disk 2868 * then added disks for geometry changes, 2869 * and should be added immediately. 2870 */ 2871 super_types[mddev->major_version]. 
2872 validate_super(mddev, NULL/*freshest*/, rdev); 2873 err = mddev->pers->hot_add_disk(mddev, rdev); 2874 if (err) { 2875 md_kick_rdev_from_array(rdev); 2876 return err; 2877 } 2878 } 2879 sysfs_notify_dirent_safe(rdev->sysfs_state); 2880 2881 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2882 if (mddev->degraded) 2883 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 2884 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2885 md_new_event(); 2886 return 0; 2887 } 2888 2889 /* words written to sysfs files may, or may not, be \n terminated. 2890 * We want to accept with case. For this we use cmd_match. 2891 */ 2892 static int cmd_match(const char *cmd, const char *str) 2893 { 2894 /* See if cmd, written into a sysfs file, matches 2895 * str. They must either be the same, or cmd can 2896 * have a trailing newline 2897 */ 2898 while (*cmd && *str && *cmd == *str) { 2899 cmd++; 2900 str++; 2901 } 2902 if (*cmd == '\n') 2903 cmd++; 2904 if (*str || *cmd) 2905 return 0; 2906 return 1; 2907 } 2908 2909 struct rdev_sysfs_entry { 2910 struct attribute attr; 2911 ssize_t (*show)(struct md_rdev *, char *); 2912 ssize_t (*store)(struct md_rdev *, const char *, size_t); 2913 }; 2914 2915 static ssize_t 2916 state_show(struct md_rdev *rdev, char *page) 2917 { 2918 char *sep = ","; 2919 size_t len = 0; 2920 unsigned long flags = READ_ONCE(rdev->flags); 2921 2922 if (test_bit(Faulty, &flags) || 2923 (!test_bit(ExternalBbl, &flags) && 2924 rdev->badblocks.unacked_exist)) 2925 len += sprintf(page+len, "faulty%s", sep); 2926 if (test_bit(In_sync, &flags)) 2927 len += sprintf(page+len, "in_sync%s", sep); 2928 if (test_bit(Journal, &flags)) 2929 len += sprintf(page+len, "journal%s", sep); 2930 if (test_bit(WriteMostly, &flags)) 2931 len += sprintf(page+len, "write_mostly%s", sep); 2932 if (test_bit(Blocked, &flags) || 2933 (rdev->badblocks.unacked_exist 2934 && !test_bit(Faulty, &flags))) 2935 len += sprintf(page+len, "blocked%s", sep); 2936 if (!test_bit(Faulty, &flags) && 2937 !test_bit(Journal, &flags) && 2938 !test_bit(In_sync, &flags)) 2939 len += sprintf(page+len, "spare%s", sep); 2940 if (test_bit(WriteErrorSeen, &flags)) 2941 len += sprintf(page+len, "write_error%s", sep); 2942 if (test_bit(WantReplacement, &flags)) 2943 len += sprintf(page+len, "want_replacement%s", sep); 2944 if (test_bit(Replacement, &flags)) 2945 len += sprintf(page+len, "replacement%s", sep); 2946 if (test_bit(ExternalBbl, &flags)) 2947 len += sprintf(page+len, "external_bbl%s", sep); 2948 if (test_bit(FailFast, &flags)) 2949 len += sprintf(page+len, "failfast%s", sep); 2950 2951 if (len) 2952 len -= strlen(sep); 2953 2954 return len+sprintf(page+len, "\n"); 2955 } 2956 2957 static ssize_t 2958 state_store(struct md_rdev *rdev, const char *buf, size_t len) 2959 { 2960 /* can write 2961 * faulty - simulates an error 2962 * remove - disconnects the device 2963 * writemostly - sets write_mostly 2964 * -writemostly - clears write_mostly 2965 * blocked - sets the Blocked flags 2966 * -blocked - clears the Blocked and possibly simulates an error 2967 * insync - sets Insync providing device isn't active 2968 * -insync - clear Insync for a device with a slot assigned, 2969 * so that it gets rebuilt based on bitmap 2970 * write_error - sets WriteErrorSeen 2971 * -write_error - clears WriteErrorSeen 2972 * {,-}failfast - set/clear FailFast 2973 */ 2974 2975 struct mddev *mddev = rdev->mddev; 2976 int err = -EINVAL; 2977 bool need_update_sb = false; 2978 2979 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 2980 md_error(rdev->mddev, rdev); 
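/* Descriptive note: md_error() may mark the whole array broken
 * (MD_BROKEN), e.g. when the personality cannot survive losing this
 * device; report -EBUSY in that case.
 */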
2981 2982 if (test_bit(MD_BROKEN, &rdev->mddev->flags)) 2983 err = -EBUSY; 2984 else 2985 err = 0; 2986 } else if (cmd_match(buf, "remove")) { 2987 if (rdev->mddev->pers) { 2988 clear_bit(Blocked, &rdev->flags); 2989 remove_and_add_spares(rdev->mddev, rdev); 2990 } 2991 if (rdev->raid_disk >= 0) 2992 err = -EBUSY; 2993 else { 2994 err = 0; 2995 if (mddev_is_clustered(mddev)) 2996 err = mddev->cluster_ops->remove_disk(mddev, rdev); 2997 2998 if (err == 0) { 2999 md_kick_rdev_from_array(rdev); 3000 if (mddev->pers) 3001 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 3002 md_new_event(); 3003 } 3004 } 3005 } else if (cmd_match(buf, "writemostly")) { 3006 set_bit(WriteMostly, &rdev->flags); 3007 mddev_create_serial_pool(rdev->mddev, rdev); 3008 need_update_sb = true; 3009 err = 0; 3010 } else if (cmd_match(buf, "-writemostly")) { 3011 mddev_destroy_serial_pool(rdev->mddev, rdev); 3012 clear_bit(WriteMostly, &rdev->flags); 3013 need_update_sb = true; 3014 err = 0; 3015 } else if (cmd_match(buf, "blocked")) { 3016 set_bit(Blocked, &rdev->flags); 3017 err = 0; 3018 } else if (cmd_match(buf, "-blocked")) { 3019 if (!test_bit(Faulty, &rdev->flags) && 3020 !test_bit(ExternalBbl, &rdev->flags) && 3021 rdev->badblocks.unacked_exist) { 3022 /* metadata handler doesn't understand badblocks, 3023 * so we need to fail the device 3024 */ 3025 md_error(rdev->mddev, rdev); 3026 } 3027 clear_bit(Blocked, &rdev->flags); 3028 clear_bit(BlockedBadBlocks, &rdev->flags); 3029 wake_up(&rdev->blocked_wait); 3030 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3031 3032 err = 0; 3033 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 3034 set_bit(In_sync, &rdev->flags); 3035 err = 0; 3036 } else if (cmd_match(buf, "failfast")) { 3037 set_bit(FailFast, &rdev->flags); 3038 need_update_sb = true; 3039 err = 0; 3040 } else if (cmd_match(buf, "-failfast")) { 3041 clear_bit(FailFast, &rdev->flags); 3042 need_update_sb = true; 3043 err = 0; 3044 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && 3045 !test_bit(Journal, &rdev->flags)) { 3046 if (rdev->mddev->pers == NULL) { 3047 clear_bit(In_sync, &rdev->flags); 3048 rdev->saved_raid_disk = rdev->raid_disk; 3049 rdev->raid_disk = -1; 3050 err = 0; 3051 } 3052 } else if (cmd_match(buf, "write_error")) { 3053 set_bit(WriteErrorSeen, &rdev->flags); 3054 err = 0; 3055 } else if (cmd_match(buf, "-write_error")) { 3056 clear_bit(WriteErrorSeen, &rdev->flags); 3057 err = 0; 3058 } else if (cmd_match(buf, "want_replacement")) { 3059 /* Any non-spare device that is not a replacement can 3060 * become want_replacement at any time, but we then need to 3061 * check if recovery is needed. 3062 */ 3063 if (rdev->raid_disk >= 0 && 3064 !test_bit(Journal, &rdev->flags) && 3065 !test_bit(Replacement, &rdev->flags)) 3066 set_bit(WantReplacement, &rdev->flags); 3067 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3068 err = 0; 3069 } else if (cmd_match(buf, "-want_replacement")) { 3070 /* Clearing 'want_replacement' is always allowed. 3071 * Once replacements starts it is too late though. 3072 */ 3073 err = 0; 3074 clear_bit(WantReplacement, &rdev->flags); 3075 } else if (cmd_match(buf, "replacement")) { 3076 /* Can only set a device as a replacement when array has not 3077 * yet been started. Once running, replacement is automatic 3078 * from spares, or by assigning 'slot'. 
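* For example (illustrative device names):
*   echo replacement > /sys/block/md0/md/dev-sdb1/state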
3079 */ 3080 if (rdev->mddev->pers) 3081 err = -EBUSY; 3082 else { 3083 set_bit(Replacement, &rdev->flags); 3084 err = 0; 3085 } 3086 } else if (cmd_match(buf, "-replacement")) { 3087 /* Similarly, can only clear Replacement before start */ 3088 if (rdev->mddev->pers) 3089 err = -EBUSY; 3090 else { 3091 clear_bit(Replacement, &rdev->flags); 3092 err = 0; 3093 } 3094 } else if (cmd_match(buf, "re-add")) { 3095 if (!rdev->mddev->pers) 3096 err = -EINVAL; 3097 else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) && 3098 rdev->saved_raid_disk >= 0) { 3099 /* clear_bit is performed _after_ all the devices 3100 * have their local Faulty bit cleared. If any writes 3101 * happen in the meantime in the local node, they 3102 * will land in the local bitmap, which will be synced 3103 * by this node eventually 3104 */ 3105 if (!mddev_is_clustered(rdev->mddev) || 3106 (err = mddev->cluster_ops->gather_bitmaps(rdev)) == 0) { 3107 clear_bit(Faulty, &rdev->flags); 3108 err = add_bound_rdev(rdev); 3109 } 3110 } else 3111 err = -EBUSY; 3112 } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) { 3113 set_bit(ExternalBbl, &rdev->flags); 3114 rdev->badblocks.shift = 0; 3115 err = 0; 3116 } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) { 3117 clear_bit(ExternalBbl, &rdev->flags); 3118 err = 0; 3119 } 3120 if (need_update_sb) 3121 md_update_sb(mddev, 1); 3122 if (!err) 3123 sysfs_notify_dirent_safe(rdev->sysfs_state); 3124 return err ? err : len; 3125 } 3126 static struct rdev_sysfs_entry rdev_state = 3127 __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store); 3128 3129 static ssize_t 3130 errors_show(struct md_rdev *rdev, char *page) 3131 { 3132 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 3133 } 3134 3135 static ssize_t 3136 errors_store(struct md_rdev *rdev, const char *buf, size_t len) 3137 { 3138 unsigned int n; 3139 int rv; 3140 3141 rv = kstrtouint(buf, 10, &n); 3142 if (rv < 0) 3143 return rv; 3144 atomic_set(&rdev->corrected_errors, n); 3145 return len; 3146 } 3147 static struct rdev_sysfs_entry rdev_errors = 3148 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 3149 3150 static ssize_t 3151 slot_show(struct md_rdev *rdev, char *page) 3152 { 3153 if (test_bit(Journal, &rdev->flags)) 3154 return sprintf(page, "journal\n"); 3155 else if (rdev->raid_disk < 0) 3156 return sprintf(page, "none\n"); 3157 else 3158 return sprintf(page, "%d\n", rdev->raid_disk); 3159 } 3160 3161 static ssize_t 3162 slot_store(struct md_rdev *rdev, const char *buf, size_t len) 3163 { 3164 int slot; 3165 int err; 3166 3167 if (test_bit(Journal, &rdev->flags)) 3168 return -EBUSY; 3169 if (strncmp(buf, "none", 4)==0) 3170 slot = -1; 3171 else { 3172 err = kstrtouint(buf, 10, (unsigned int *)&slot); 3173 if (err < 0) 3174 return err; 3175 if (slot < 0) 3176 /* overflow */ 3177 return -ENOSPC; 3178 } 3179 if (rdev->mddev->pers && slot == -1) { 3180 /* Setting 'slot' on an active array requires also 3181 * updating the 'rd%d' link, and communicating 3182 * with the personality with ->hot_*_disk. 3183 * For now we only support removing 3184 * failed/spare devices. This normally happens automatically, 3185 * but not when the metadata is externally managed. 
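* For example (illustrative device names), such a removal is requested
* with:  echo none > /sys/block/md0/md/dev-sdc1/slot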
3186 */ 3187 if (rdev->raid_disk == -1) 3188 return -EEXIST; 3189 /* personality does all needed checks */ 3190 if (rdev->mddev->pers->hot_remove_disk == NULL) 3191 return -EINVAL; 3192 clear_bit(Blocked, &rdev->flags); 3193 remove_and_add_spares(rdev->mddev, rdev); 3194 if (rdev->raid_disk >= 0) 3195 return -EBUSY; 3196 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3197 } else if (rdev->mddev->pers) { 3198 /* Activating a spare .. or possibly reactivating 3199 * if we ever get bitmaps working here. 3200 */ 3201 int err; 3202 3203 if (rdev->raid_disk != -1) 3204 return -EBUSY; 3205 3206 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) 3207 return -EBUSY; 3208 3209 if (rdev->mddev->pers->hot_add_disk == NULL) 3210 return -EINVAL; 3211 3212 if (slot >= rdev->mddev->raid_disks && 3213 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 3214 return -ENOSPC; 3215 3216 rdev->raid_disk = slot; 3217 if (test_bit(In_sync, &rdev->flags)) 3218 rdev->saved_raid_disk = slot; 3219 else 3220 rdev->saved_raid_disk = -1; 3221 clear_bit(In_sync, &rdev->flags); 3222 clear_bit(Bitmap_sync, &rdev->flags); 3223 err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev); 3224 if (err) { 3225 rdev->raid_disk = -1; 3226 return err; 3227 } else 3228 sysfs_notify_dirent_safe(rdev->sysfs_state); 3229 /* failure here is OK */; 3230 sysfs_link_rdev(rdev->mddev, rdev); 3231 /* don't wakeup anyone, leave that to userspace. */ 3232 } else { 3233 if (slot >= rdev->mddev->raid_disks && 3234 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 3235 return -ENOSPC; 3236 rdev->raid_disk = slot; 3237 /* assume it is working */ 3238 clear_bit(Faulty, &rdev->flags); 3239 clear_bit(WriteMostly, &rdev->flags); 3240 set_bit(In_sync, &rdev->flags); 3241 sysfs_notify_dirent_safe(rdev->sysfs_state); 3242 } 3243 return len; 3244 } 3245 3246 static struct rdev_sysfs_entry rdev_slot = 3247 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 3248 3249 static ssize_t 3250 offset_show(struct md_rdev *rdev, char *page) 3251 { 3252 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 3253 } 3254 3255 static ssize_t 3256 offset_store(struct md_rdev *rdev, const char *buf, size_t len) 3257 { 3258 unsigned long long offset; 3259 if (kstrtoull(buf, 10, &offset) < 0) 3260 return -EINVAL; 3261 if (rdev->mddev->pers && rdev->raid_disk >= 0) 3262 return -EBUSY; 3263 if (rdev->sectors && rdev->mddev->external) 3264 /* Must set offset before size, so overlap checks 3265 * can be sane */ 3266 return -EBUSY; 3267 rdev->data_offset = offset; 3268 rdev->new_data_offset = offset; 3269 return len; 3270 } 3271 3272 static struct rdev_sysfs_entry rdev_offset = 3273 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 3274 3275 static ssize_t new_offset_show(struct md_rdev *rdev, char *page) 3276 { 3277 return sprintf(page, "%llu\n", 3278 (unsigned long long)rdev->new_data_offset); 3279 } 3280 3281 static ssize_t new_offset_store(struct md_rdev *rdev, 3282 const char *buf, size_t len) 3283 { 3284 unsigned long long new_offset; 3285 struct mddev *mddev = rdev->mddev; 3286 3287 if (kstrtoull(buf, 10, &new_offset) < 0) 3288 return -EINVAL; 3289 3290 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3291 return -EBUSY; 3292 if (new_offset == rdev->data_offset) 3293 /* reset is always permitted */ 3294 ; 3295 else if (new_offset > rdev->data_offset) { 3296 /* must not push array size beyond rdev_sectors */ 3297 if (new_offset - rdev->data_offset 3298 + mddev->dev_sectors > rdev->sectors) 3299 return -E2BIG; 
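/* i.e. after moving the data start forward, the device must still hold
 * mddev->dev_sectors of data within rdev->sectors.
 */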
3300 }
3301 /* Metadata worries about other space details. */
3302
3303 /* decreasing the offset is inconsistent with a backwards
3304 * reshape.
3305 */
3306 if (new_offset < rdev->data_offset &&
3307 mddev->reshape_backwards)
3308 return -EINVAL;
3309 /* Increasing offset is inconsistent with forwards
3310 * reshape. reshape_direction should be set to
3311 * 'backwards' first.
3312 */
3313 if (new_offset > rdev->data_offset &&
3314 !mddev->reshape_backwards)
3315 return -EINVAL;
3316
3317 if (mddev->pers && mddev->persistent &&
3318 !super_types[mddev->major_version]
3319 .allow_new_offset(rdev, new_offset))
3320 return -E2BIG;
3321 rdev->new_data_offset = new_offset;
3322 if (new_offset > rdev->data_offset)
3323 mddev->reshape_backwards = 1;
3324 else if (new_offset < rdev->data_offset)
3325 mddev->reshape_backwards = 0;
3326
3327 return len;
3328 }
3329 static struct rdev_sysfs_entry rdev_new_offset =
3330 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
3331
3332 static ssize_t
3333 rdev_size_show(struct md_rdev *rdev, char *page)
3334 {
3335 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
3336 }
3337
3338 static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b)
3339 {
3340 /* check if two start/length pairs overlap */
3341 if (a->data_offset + a->sectors <= b->data_offset)
3342 return false;
3343 if (b->data_offset + b->sectors <= a->data_offset)
3344 return false;
3345 return true;
3346 }
3347
3348 static bool md_rdev_overlaps(struct md_rdev *rdev)
3349 {
3350 struct mddev *mddev;
3351 struct md_rdev *rdev2;
3352
3353 spin_lock(&all_mddevs_lock);
3354 list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
3355 if (test_bit(MD_DELETED, &mddev->flags))
3356 continue;
3357 rdev_for_each(rdev2, mddev) {
3358 if (rdev != rdev2 && rdev->bdev == rdev2->bdev &&
3359 md_rdevs_overlap(rdev, rdev2)) {
3360 spin_unlock(&all_mddevs_lock);
3361 return true;
3362 }
3363 }
3364 }
3365 spin_unlock(&all_mddevs_lock);
3366 return false;
3367 }
3368
3369 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
3370 {
3371 unsigned long long blocks;
3372 sector_t new;
3373
3374 if (kstrtoull(buf, 10, &blocks) < 0)
3375 return -EINVAL;
3376
3377 if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
3378 return -EINVAL; /* sector conversion overflow */
3379
3380 new = blocks * 2;
3381 if (new != blocks * 2)
3382 return -EINVAL; /* unsigned long long to sector_t overflow */
3383
3384 *sectors = new;
3385 return 0;
3386 }
3387
3388 static ssize_t
3389 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3390 {
3391 struct mddev *my_mddev = rdev->mddev;
3392 sector_t oldsectors = rdev->sectors;
3393 sector_t sectors;
3394
3395 if (test_bit(Journal, &rdev->flags))
3396 return -EBUSY;
3397 if (strict_blocks_to_sectors(buf, &sectors) < 0)
3398 return -EINVAL;
3399 if (rdev->data_offset != rdev->new_data_offset)
3400 return -EINVAL; /* too confusing */
3401 if (my_mddev->pers && rdev->raid_disk >= 0) {
3402 if (my_mddev->persistent) {
3403 sectors = super_types[my_mddev->major_version].
3404 rdev_size_change(rdev, sectors); 3405 if (!sectors) 3406 return -EBUSY; 3407 } else if (!sectors) 3408 sectors = bdev_nr_sectors(rdev->bdev) - 3409 rdev->data_offset; 3410 if (!my_mddev->pers->resize) 3411 /* Cannot change size for RAID0 or Linear etc */ 3412 return -EINVAL; 3413 } 3414 if (sectors < my_mddev->dev_sectors) 3415 return -EINVAL; /* component must fit device */ 3416 3417 rdev->sectors = sectors; 3418 3419 /* 3420 * Check that all other rdevs with the same bdev do not overlap. This 3421 * check does not provide a hard guarantee, it just helps avoid 3422 * dangerous mistakes. 3423 */ 3424 if (sectors > oldsectors && my_mddev->external && 3425 md_rdev_overlaps(rdev)) { 3426 /* 3427 * Someone else could have slipped in a size change here, but 3428 * doing so is just silly. We put oldsectors back because we 3429 * know it is safe, and trust userspace not to race with itself. 3430 */ 3431 rdev->sectors = oldsectors; 3432 return -EBUSY; 3433 } 3434 return len; 3435 } 3436 3437 static struct rdev_sysfs_entry rdev_size = 3438 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 3439 3440 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) 3441 { 3442 unsigned long long recovery_start = rdev->recovery_offset; 3443 3444 if (test_bit(In_sync, &rdev->flags) || 3445 recovery_start == MaxSector) 3446 return sprintf(page, "none\n"); 3447 3448 return sprintf(page, "%llu\n", recovery_start); 3449 } 3450 3451 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) 3452 { 3453 unsigned long long recovery_start; 3454 3455 if (cmd_match(buf, "none")) 3456 recovery_start = MaxSector; 3457 else if (kstrtoull(buf, 10, &recovery_start)) 3458 return -EINVAL; 3459 3460 if (rdev->mddev->pers && 3461 rdev->raid_disk >= 0) 3462 return -EBUSY; 3463 3464 rdev->recovery_offset = recovery_start; 3465 if (recovery_start == MaxSector) 3466 set_bit(In_sync, &rdev->flags); 3467 else 3468 clear_bit(In_sync, &rdev->flags); 3469 return len; 3470 } 3471 3472 static struct rdev_sysfs_entry rdev_recovery_start = 3473 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); 3474 3475 /* sysfs access to bad-blocks list. 3476 * We present two files. 3477 * 'bad-blocks' lists sector numbers and lengths of ranges that 3478 * are recorded as bad. The list is truncated to fit within 3479 * the one-page limit of sysfs. 3480 * Writing "sector length" to this file adds an acknowledged 3481 * bad block list. 3482 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet 3483 * been acknowledged. Writing to this file adds bad blocks 3484 * without acknowledging them. This is largely for testing. 
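* For example (illustrative values), 'echo "1020 8" > bad_blocks'
* records an acknowledged 8-sector bad range starting at sector 1020.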
3485 */
3486 static ssize_t bb_show(struct md_rdev *rdev, char *page)
3487 {
3488 return badblocks_show(&rdev->badblocks, page, 0);
3489 }
3490 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
3491 {
3492 int rv = badblocks_store(&rdev->badblocks, page, len, 0);
3493 /* Maybe that ack was all we needed */
3494 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
3495 wake_up(&rdev->blocked_wait);
3496 return rv;
3497 }
3498 static struct rdev_sysfs_entry rdev_bad_blocks =
3499 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
3500
3501 static ssize_t ubb_show(struct md_rdev *rdev, char *page)
3502 {
3503 return badblocks_show(&rdev->badblocks, page, 1);
3504 }
3505 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
3506 {
3507 return badblocks_store(&rdev->badblocks, page, len, 1);
3508 }
3509 static struct rdev_sysfs_entry rdev_unack_bad_blocks =
3510 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
3511
3512 static ssize_t
3513 ppl_sector_show(struct md_rdev *rdev, char *page)
3514 {
3515 return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector);
3516 }
3517
3518 static ssize_t
3519 ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len)
3520 {
3521 unsigned long long sector;
3522
3523 if (kstrtoull(buf, 10, &sector) < 0)
3524 return -EINVAL;
3525 if (sector != (sector_t)sector)
3526 return -EINVAL;
3527
3528 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3529 rdev->raid_disk >= 0)
3530 return -EBUSY;
3531
3532 if (rdev->mddev->persistent) {
3533 if (rdev->mddev->major_version == 0)
3534 return -EINVAL;
3535 if ((sector > rdev->sb_start &&
3536 sector - rdev->sb_start > S16_MAX) ||
3537 (sector < rdev->sb_start &&
3538 rdev->sb_start - sector > -S16_MIN))
3539 return -EINVAL;
3540 rdev->ppl.offset = sector - rdev->sb_start;
3541 } else if (!rdev->mddev->external) {
3542 return -EBUSY;
3543 }
3544 rdev->ppl.sector = sector;
3545 return len;
3546 }
3547
3548 static struct rdev_sysfs_entry rdev_ppl_sector =
3549 __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store);
3550
3551 static ssize_t
3552 ppl_size_show(struct md_rdev *rdev, char *page)
3553 {
3554 return sprintf(page, "%u\n", rdev->ppl.size);
3555 }
3556
3557 static ssize_t
3558 ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3559 {
3560 unsigned int size;
3561
3562 if (kstrtouint(buf, 10, &size) < 0)
3563 return -EINVAL;
3564
3565 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3566 rdev->raid_disk >= 0)
3567 return -EBUSY;
3568
3569 if (rdev->mddev->persistent) {
3570 if (rdev->mddev->major_version == 0)
3571 return -EINVAL;
3572 if (size > U16_MAX)
3573 return -EINVAL;
3574 } else if (!rdev->mddev->external) {
3575 return -EBUSY;
3576 }
3577 rdev->ppl.size = size;
3578 return len;
3579 }
3580
3581 static struct rdev_sysfs_entry rdev_ppl_size =
3582 __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store);
3583
3584 static struct attribute *rdev_default_attrs[] = {
3585 &rdev_state.attr,
3586 &rdev_errors.attr,
3587 &rdev_slot.attr,
3588 &rdev_offset.attr,
3589 &rdev_new_offset.attr,
3590 &rdev_size.attr,
3591 &rdev_recovery_start.attr,
3592 &rdev_bad_blocks.attr,
3593 &rdev_unack_bad_blocks.attr,
3594 &rdev_ppl_sector.attr,
3595 &rdev_ppl_size.attr,
3596 NULL,
3597 };
3598 ATTRIBUTE_GROUPS(rdev_default);
3599 static ssize_t
3600 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3601 {
3602 struct rdev_sysfs_entry
*entry = container_of(attr, struct rdev_sysfs_entry, attr); 3603 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3604 3605 if (!entry->show) 3606 return -EIO; 3607 if (!rdev->mddev) 3608 return -ENODEV; 3609 return entry->show(rdev, page); 3610 } 3611 3612 static ssize_t 3613 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 3614 const char *page, size_t length) 3615 { 3616 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3617 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3618 struct kernfs_node *kn = NULL; 3619 bool suspend = false; 3620 ssize_t rv; 3621 struct mddev *mddev = READ_ONCE(rdev->mddev); 3622 3623 if (!entry->store) 3624 return -EIO; 3625 if (!capable(CAP_SYS_ADMIN)) 3626 return -EACCES; 3627 if (!mddev) 3628 return -ENODEV; 3629 3630 if (entry->store == state_store) { 3631 if (cmd_match(page, "remove")) 3632 kn = sysfs_break_active_protection(kobj, attr); 3633 if (cmd_match(page, "remove") || cmd_match(page, "re-add") || 3634 cmd_match(page, "writemostly") || 3635 cmd_match(page, "-writemostly")) 3636 suspend = true; 3637 } 3638 3639 rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev); 3640 if (!rv) { 3641 if (rdev->mddev == NULL) 3642 rv = -ENODEV; 3643 else 3644 rv = entry->store(rdev, page, length); 3645 suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); 3646 } 3647 3648 if (kn) 3649 sysfs_unbreak_active_protection(kn); 3650 3651 return rv; 3652 } 3653 3654 static void rdev_free(struct kobject *ko) 3655 { 3656 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); 3657 kfree(rdev); 3658 } 3659 static const struct sysfs_ops rdev_sysfs_ops = { 3660 .show = rdev_attr_show, 3661 .store = rdev_attr_store, 3662 }; 3663 static const struct kobj_type rdev_ktype = { 3664 .release = rdev_free, 3665 .sysfs_ops = &rdev_sysfs_ops, 3666 .default_groups = rdev_default_groups, 3667 }; 3668 3669 int md_rdev_init(struct md_rdev *rdev) 3670 { 3671 rdev->desc_nr = -1; 3672 rdev->saved_raid_disk = -1; 3673 rdev->raid_disk = -1; 3674 rdev->flags = 0; 3675 rdev->data_offset = 0; 3676 rdev->new_data_offset = 0; 3677 rdev->sb_events = 0; 3678 rdev->last_read_error = 0; 3679 rdev->sb_loaded = 0; 3680 rdev->bb_page = NULL; 3681 atomic_set(&rdev->nr_pending, 0); 3682 atomic_set(&rdev->read_errors, 0); 3683 atomic_set(&rdev->corrected_errors, 0); 3684 3685 INIT_LIST_HEAD(&rdev->same_set); 3686 init_waitqueue_head(&rdev->blocked_wait); 3687 3688 /* Add space to store bad block list. 3689 * This reserves the space even on arrays where it cannot 3690 * be used - I wonder if that matters 3691 */ 3692 return badblocks_init(&rdev->badblocks, 0); 3693 } 3694 EXPORT_SYMBOL_GPL(md_rdev_init); 3695 3696 /* 3697 * Import a device. If 'super_format' >= 0, then sanity check the superblock 3698 * 3699 * mark the device faulty if: 3700 * 3701 * - the device is nonexistent (zero size) 3702 * - the device has no valid superblock 3703 * 3704 * a faulty rdev _never_ has rdev->sb set. 
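* The returned rdev is not yet bound to any array; callers typically
* hand it to bind_rdev_to_array(), or export_rdev() it again on failure.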
3705 */ 3706 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) 3707 { 3708 struct md_rdev *rdev; 3709 sector_t size; 3710 int err; 3711 3712 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 3713 if (!rdev) 3714 return ERR_PTR(-ENOMEM); 3715 3716 err = md_rdev_init(rdev); 3717 if (err) 3718 goto out_free_rdev; 3719 err = alloc_disk_sb(rdev); 3720 if (err) 3721 goto out_clear_rdev; 3722 3723 rdev->bdev_file = bdev_file_open_by_dev(newdev, 3724 BLK_OPEN_READ | BLK_OPEN_WRITE, 3725 super_format == -2 ? &claim_rdev : rdev, NULL); 3726 if (IS_ERR(rdev->bdev_file)) { 3727 pr_warn("md: could not open device unknown-block(%u,%u).\n", 3728 MAJOR(newdev), MINOR(newdev)); 3729 err = PTR_ERR(rdev->bdev_file); 3730 goto out_clear_rdev; 3731 } 3732 rdev->bdev = file_bdev(rdev->bdev_file); 3733 3734 kobject_init(&rdev->kobj, &rdev_ktype); 3735 3736 size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS; 3737 if (!size) { 3738 pr_warn("md: %pg has zero or unknown size, marking faulty!\n", 3739 rdev->bdev); 3740 err = -EINVAL; 3741 goto out_blkdev_put; 3742 } 3743 3744 if (super_format >= 0) { 3745 err = super_types[super_format]. 3746 load_super(rdev, NULL, super_minor); 3747 if (err == -EINVAL) { 3748 pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n", 3749 rdev->bdev, 3750 super_format, super_minor); 3751 goto out_blkdev_put; 3752 } 3753 if (err < 0) { 3754 pr_warn("md: could not read %pg's sb, not importing!\n", 3755 rdev->bdev); 3756 goto out_blkdev_put; 3757 } 3758 } 3759 3760 return rdev; 3761 3762 out_blkdev_put: 3763 fput(rdev->bdev_file); 3764 out_clear_rdev: 3765 md_rdev_clear(rdev); 3766 out_free_rdev: 3767 kfree(rdev); 3768 return ERR_PTR(err); 3769 } 3770 3771 /* 3772 * Check a full RAID array for plausibility 3773 */ 3774 3775 static int analyze_sbs(struct mddev *mddev) 3776 { 3777 int i; 3778 struct md_rdev *rdev, *freshest, *tmp; 3779 3780 freshest = NULL; 3781 rdev_for_each_safe(rdev, tmp, mddev) 3782 switch (super_types[mddev->major_version]. 3783 load_super(rdev, freshest, mddev->minor_version)) { 3784 case 1: 3785 freshest = rdev; 3786 break; 3787 case 0: 3788 break; 3789 default: 3790 pr_warn("md: fatal superblock inconsistency in %pg -- removing from array\n", 3791 rdev->bdev); 3792 md_kick_rdev_from_array(rdev); 3793 } 3794 3795 /* Cannot find a valid fresh disk */ 3796 if (!freshest) { 3797 pr_warn("md: cannot find a valid disk\n"); 3798 return -EINVAL; 3799 } 3800 3801 super_types[mddev->major_version]. 3802 validate_super(mddev, NULL/*freshest*/, freshest); 3803 3804 i = 0; 3805 rdev_for_each_safe(rdev, tmp, mddev) { 3806 if (mddev->max_disks && 3807 (rdev->desc_nr >= mddev->max_disks || 3808 i > mddev->max_disks)) { 3809 pr_warn("md: %s: %pg: only %d devices permitted\n", 3810 mdname(mddev), rdev->bdev, 3811 mddev->max_disks); 3812 md_kick_rdev_from_array(rdev); 3813 continue; 3814 } 3815 if (rdev != freshest) { 3816 if (super_types[mddev->major_version]. 3817 validate_super(mddev, freshest, rdev)) { 3818 pr_warn("md: kicking non-fresh %pg from array!\n", 3819 rdev->bdev); 3820 md_kick_rdev_from_array(rdev); 3821 continue; 3822 } 3823 } 3824 if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks)) && 3825 !test_bit(Journal, &rdev->flags)) { 3826 rdev->raid_disk = -1; 3827 clear_bit(In_sync, &rdev->flags); 3828 } 3829 } 3830 3831 return 0; 3832 } 3833 3834 /* Read a fixed-point number. 3835 * Numbers in sysfs attributes should be in "standard" units where 3836 * possible, so time should be in seconds. 
3837 * However we internally use a a much smaller unit such as 3838 * milliseconds or jiffies. 3839 * This function takes a decimal number with a possible fractional 3840 * component, and produces an integer which is the result of 3841 * multiplying that number by 10^'scale'. 3842 * all without any floating-point arithmetic. 3843 */ 3844 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) 3845 { 3846 unsigned long result = 0; 3847 long decimals = -1; 3848 while (isdigit(*cp) || (*cp == '.' && decimals < 0)) { 3849 if (*cp == '.') 3850 decimals = 0; 3851 else if (decimals < scale) { 3852 unsigned int value; 3853 value = *cp - '0'; 3854 result = result * 10 + value; 3855 if (decimals >= 0) 3856 decimals++; 3857 } 3858 cp++; 3859 } 3860 if (*cp == '\n') 3861 cp++; 3862 if (*cp) 3863 return -EINVAL; 3864 if (decimals < 0) 3865 decimals = 0; 3866 *res = result * int_pow(10, scale - decimals); 3867 return 0; 3868 } 3869 3870 static ssize_t 3871 safe_delay_show(struct mddev *mddev, char *page) 3872 { 3873 unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ; 3874 3875 return sprintf(page, "%u.%03u\n", msec/1000, msec%1000); 3876 } 3877 static ssize_t 3878 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) 3879 { 3880 unsigned long msec; 3881 3882 if (mddev_is_clustered(mddev)) { 3883 pr_warn("md: Safemode is disabled for clustered mode\n"); 3884 return -EINVAL; 3885 } 3886 3887 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ) 3888 return -EINVAL; 3889 if (msec == 0) 3890 mddev->safemode_delay = 0; 3891 else { 3892 unsigned long old_delay = mddev->safemode_delay; 3893 unsigned long new_delay = (msec*HZ)/1000; 3894 3895 if (new_delay == 0) 3896 new_delay = 1; 3897 mddev->safemode_delay = new_delay; 3898 if (new_delay < old_delay || old_delay == 0) 3899 mod_timer(&mddev->safemode_timer, jiffies+1); 3900 } 3901 return len; 3902 } 3903 static struct md_sysfs_entry md_safe_delay = 3904 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 3905 3906 static ssize_t 3907 level_show(struct mddev *mddev, char *page) 3908 { 3909 struct md_personality *p; 3910 int ret; 3911 spin_lock(&mddev->lock); 3912 p = mddev->pers; 3913 if (p) 3914 ret = sprintf(page, "%s\n", p->head.name); 3915 else if (mddev->clevel[0]) 3916 ret = sprintf(page, "%s\n", mddev->clevel); 3917 else if (mddev->level != LEVEL_NONE) 3918 ret = sprintf(page, "%d\n", mddev->level); 3919 else 3920 ret = 0; 3921 spin_unlock(&mddev->lock); 3922 return ret; 3923 } 3924 3925 static ssize_t 3926 level_store(struct mddev *mddev, const char *buf, size_t len) 3927 { 3928 char clevel[16]; 3929 ssize_t rv; 3930 size_t slen = len; 3931 struct md_personality *pers, *oldpers; 3932 long level; 3933 void *priv, *oldpriv; 3934 struct md_rdev *rdev; 3935 3936 if (slen == 0 || slen >= sizeof(clevel)) 3937 return -EINVAL; 3938 3939 rv = mddev_suspend_and_lock(mddev); 3940 if (rv) 3941 return rv; 3942 3943 if (mddev->pers == NULL) { 3944 memcpy(mddev->clevel, buf, slen); 3945 if (mddev->clevel[slen-1] == '\n') 3946 slen--; 3947 mddev->clevel[slen] = 0; 3948 mddev->level = LEVEL_NONE; 3949 rv = len; 3950 goto out_unlock; 3951 } 3952 rv = -EROFS; 3953 if (!md_is_rdwr(mddev)) 3954 goto out_unlock; 3955 3956 /* request to change the personality. Need to ensure: 3957 * - array is not engaged in resync/recovery/reshape 3958 * - old personality can be suspended 3959 * - new personality will access other array. 
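 * The conversion itself is carried out by the new personality's
 * ->takeover() method further down.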
3960 */ 3961 3962 rv = -EBUSY; 3963 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3964 mddev->reshape_position != MaxSector || 3965 mddev->sysfs_active) 3966 goto out_unlock; 3967 3968 rv = -EINVAL; 3969 if (!mddev->pers->quiesce) { 3970 pr_warn("md: %s: %s does not support online personality change\n", 3971 mdname(mddev), mddev->pers->head.name); 3972 goto out_unlock; 3973 } 3974 3975 /* Now find the new personality */ 3976 memcpy(clevel, buf, slen); 3977 if (clevel[slen-1] == '\n') 3978 slen--; 3979 clevel[slen] = 0; 3980 if (kstrtol(clevel, 10, &level)) 3981 level = LEVEL_NONE; 3982 3983 if (request_module("md-%s", clevel) != 0) 3984 request_module("md-level-%s", clevel); 3985 pers = get_pers(level, clevel); 3986 if (!pers) { 3987 rv = -EINVAL; 3988 goto out_unlock; 3989 } 3990 3991 if (pers == mddev->pers) { 3992 /* Nothing to do! */ 3993 put_pers(pers); 3994 rv = len; 3995 goto out_unlock; 3996 } 3997 if (!pers->takeover) { 3998 put_pers(pers); 3999 pr_warn("md: %s: %s does not support personality takeover\n", 4000 mdname(mddev), clevel); 4001 rv = -EINVAL; 4002 goto out_unlock; 4003 } 4004 4005 rdev_for_each(rdev, mddev) 4006 rdev->new_raid_disk = rdev->raid_disk; 4007 4008 /* ->takeover must set new_* and/or delta_disks 4009 * if it succeeds, and may set them when it fails. 4010 */ 4011 priv = pers->takeover(mddev); 4012 if (IS_ERR(priv)) { 4013 mddev->new_level = mddev->level; 4014 mddev->new_layout = mddev->layout; 4015 mddev->new_chunk_sectors = mddev->chunk_sectors; 4016 mddev->raid_disks -= mddev->delta_disks; 4017 mddev->delta_disks = 0; 4018 mddev->reshape_backwards = 0; 4019 put_pers(pers); 4020 pr_warn("md: %s: %s would not accept array\n", 4021 mdname(mddev), clevel); 4022 rv = PTR_ERR(priv); 4023 goto out_unlock; 4024 } 4025 4026 /* Looks like we have a winner */ 4027 mddev_detach(mddev); 4028 4029 spin_lock(&mddev->lock); 4030 oldpers = mddev->pers; 4031 oldpriv = mddev->private; 4032 mddev->pers = pers; 4033 mddev->private = priv; 4034 strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel)); 4035 mddev->level = mddev->new_level; 4036 mddev->layout = mddev->new_layout; 4037 mddev->chunk_sectors = mddev->new_chunk_sectors; 4038 mddev->delta_disks = 0; 4039 mddev->reshape_backwards = 0; 4040 mddev->degraded = 0; 4041 spin_unlock(&mddev->lock); 4042 4043 if (oldpers->sync_request == NULL && 4044 mddev->external) { 4045 /* We are converting from a no-redundancy array 4046 * to a redundancy array and metadata is managed 4047 * externally so we need to be sure that writes 4048 * won't block due to a need to transition 4049 * clean->dirty 4050 * until external management is started. 
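 * (That is why ->in_sync, ->safemode and ->safemode_delay are cleared
 * immediately below.)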
4051 */ 4052 mddev->in_sync = 0; 4053 mddev->safemode_delay = 0; 4054 mddev->safemode = 0; 4055 } 4056 4057 oldpers->free(mddev, oldpriv); 4058 4059 if (oldpers->sync_request == NULL && 4060 pers->sync_request != NULL) { 4061 /* need to add the md_redundancy_group */ 4062 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 4063 pr_warn("md: cannot register extra attributes for %s\n", 4064 mdname(mddev)); 4065 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 4066 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); 4067 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); 4068 } 4069 if (oldpers->sync_request != NULL && 4070 pers->sync_request == NULL) { 4071 /* need to remove the md_redundancy_group */ 4072 if (mddev->to_remove == NULL) 4073 mddev->to_remove = &md_redundancy_group; 4074 } 4075 4076 put_pers(oldpers); 4077 4078 rdev_for_each(rdev, mddev) { 4079 if (rdev->raid_disk < 0) 4080 continue; 4081 if (rdev->new_raid_disk >= mddev->raid_disks) 4082 rdev->new_raid_disk = -1; 4083 if (rdev->new_raid_disk == rdev->raid_disk) 4084 continue; 4085 sysfs_unlink_rdev(mddev, rdev); 4086 } 4087 rdev_for_each(rdev, mddev) { 4088 if (rdev->raid_disk < 0) 4089 continue; 4090 if (rdev->new_raid_disk == rdev->raid_disk) 4091 continue; 4092 rdev->raid_disk = rdev->new_raid_disk; 4093 if (rdev->raid_disk < 0) 4094 clear_bit(In_sync, &rdev->flags); 4095 else { 4096 if (sysfs_link_rdev(mddev, rdev)) 4097 pr_warn("md: cannot register rd%d for %s after level change\n", 4098 rdev->raid_disk, mdname(mddev)); 4099 } 4100 } 4101 4102 if (pers->sync_request == NULL) { 4103 /* this is now an array without redundancy, so 4104 * it must always be in_sync 4105 */ 4106 mddev->in_sync = 1; 4107 timer_delete_sync(&mddev->safemode_timer); 4108 } 4109 pers->run(mddev); 4110 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 4111 if (!mddev->thread) 4112 md_update_sb(mddev, 1); 4113 sysfs_notify_dirent_safe(mddev->sysfs_level); 4114 md_new_event(); 4115 rv = len; 4116 out_unlock: 4117 mddev_unlock_and_resume(mddev); 4118 return rv; 4119 } 4120 4121 static struct md_sysfs_entry md_level = 4122 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 4123 4124 static ssize_t 4125 new_level_show(struct mddev *mddev, char *page) 4126 { 4127 return sprintf(page, "%d\n", mddev->new_level); 4128 } 4129 4130 static ssize_t 4131 new_level_store(struct mddev *mddev, const char *buf, size_t len) 4132 { 4133 unsigned int n; 4134 int err; 4135 4136 err = kstrtouint(buf, 10, &n); 4137 if (err < 0) 4138 return err; 4139 err = mddev_lock(mddev); 4140 if (err) 4141 return err; 4142 4143 mddev->new_level = n; 4144 md_update_sb(mddev, 1); 4145 4146 mddev_unlock(mddev); 4147 return len; 4148 } 4149 static struct md_sysfs_entry md_new_level = 4150 __ATTR(new_level, 0664, new_level_show, new_level_store); 4151 4152 static ssize_t 4153 layout_show(struct mddev *mddev, char *page) 4154 { 4155 /* just a number, not meaningful for all levels */ 4156 if (mddev->reshape_position != MaxSector && 4157 mddev->layout != mddev->new_layout) 4158 return sprintf(page, "%d (%d)\n", 4159 mddev->new_layout, mddev->layout); 4160 return sprintf(page, "%d\n", mddev->layout); 4161 } 4162 4163 static ssize_t 4164 layout_store(struct mddev *mddev, const char *buf, size_t len) 4165 { 4166 unsigned int n; 4167 int err; 4168 4169 err = kstrtouint(buf, 10, &n); 4170 if (err < 0) 4171 return err; 4172 err = mddev_lock(mddev); 4173 if (err) 4174 return err; 4175 4176 if (mddev->pers) { 4177 if 
(mddev->pers->check_reshape == NULL) 4178 err = -EBUSY; 4179 else if (!md_is_rdwr(mddev)) 4180 err = -EROFS; 4181 else { 4182 mddev->new_layout = n; 4183 err = mddev->pers->check_reshape(mddev); 4184 if (err) 4185 mddev->new_layout = mddev->layout; 4186 } 4187 } else { 4188 mddev->new_layout = n; 4189 if (mddev->reshape_position == MaxSector) 4190 mddev->layout = n; 4191 } 4192 mddev_unlock(mddev); 4193 return err ?: len; 4194 } 4195 static struct md_sysfs_entry md_layout = 4196 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 4197 4198 static ssize_t 4199 raid_disks_show(struct mddev *mddev, char *page) 4200 { 4201 if (mddev->raid_disks == 0) 4202 return 0; 4203 if (mddev->reshape_position != MaxSector && 4204 mddev->delta_disks != 0) 4205 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 4206 mddev->raid_disks - mddev->delta_disks); 4207 return sprintf(page, "%d\n", mddev->raid_disks); 4208 } 4209 4210 static int update_raid_disks(struct mddev *mddev, int raid_disks); 4211 4212 static ssize_t 4213 raid_disks_store(struct mddev *mddev, const char *buf, size_t len) 4214 { 4215 unsigned int n; 4216 int err; 4217 4218 err = kstrtouint(buf, 10, &n); 4219 if (err < 0) 4220 return err; 4221 4222 err = mddev_lock(mddev); 4223 if (err) 4224 return err; 4225 if (mddev->pers) 4226 err = update_raid_disks(mddev, n); 4227 else if (mddev->reshape_position != MaxSector) { 4228 struct md_rdev *rdev; 4229 int olddisks = mddev->raid_disks - mddev->delta_disks; 4230 4231 err = -EINVAL; 4232 rdev_for_each(rdev, mddev) { 4233 if (olddisks < n && 4234 rdev->data_offset < rdev->new_data_offset) 4235 goto out_unlock; 4236 if (olddisks > n && 4237 rdev->data_offset > rdev->new_data_offset) 4238 goto out_unlock; 4239 } 4240 err = 0; 4241 mddev->delta_disks = n - olddisks; 4242 mddev->raid_disks = n; 4243 mddev->reshape_backwards = (mddev->delta_disks < 0); 4244 } else 4245 mddev->raid_disks = n; 4246 out_unlock: 4247 mddev_unlock(mddev); 4248 return err ? 
err : len; 4249 } 4250 static struct md_sysfs_entry md_raid_disks = 4251 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 4252 4253 static ssize_t 4254 uuid_show(struct mddev *mddev, char *page) 4255 { 4256 return sprintf(page, "%pU\n", mddev->uuid); 4257 } 4258 static struct md_sysfs_entry md_uuid = 4259 __ATTR(uuid, S_IRUGO, uuid_show, NULL); 4260 4261 static ssize_t 4262 chunk_size_show(struct mddev *mddev, char *page) 4263 { 4264 if (mddev->reshape_position != MaxSector && 4265 mddev->chunk_sectors != mddev->new_chunk_sectors) 4266 return sprintf(page, "%d (%d)\n", 4267 mddev->new_chunk_sectors << 9, 4268 mddev->chunk_sectors << 9); 4269 return sprintf(page, "%d\n", mddev->chunk_sectors << 9); 4270 } 4271 4272 static ssize_t 4273 chunk_size_store(struct mddev *mddev, const char *buf, size_t len) 4274 { 4275 unsigned long n; 4276 int err; 4277 4278 err = kstrtoul(buf, 10, &n); 4279 if (err < 0) 4280 return err; 4281 4282 err = mddev_lock(mddev); 4283 if (err) 4284 return err; 4285 if (mddev->pers) { 4286 if (mddev->pers->check_reshape == NULL) 4287 err = -EBUSY; 4288 else if (!md_is_rdwr(mddev)) 4289 err = -EROFS; 4290 else { 4291 mddev->new_chunk_sectors = n >> 9; 4292 err = mddev->pers->check_reshape(mddev); 4293 if (err) 4294 mddev->new_chunk_sectors = mddev->chunk_sectors; 4295 } 4296 } else { 4297 mddev->new_chunk_sectors = n >> 9; 4298 if (mddev->reshape_position == MaxSector) 4299 mddev->chunk_sectors = n >> 9; 4300 } 4301 mddev_unlock(mddev); 4302 return err ?: len; 4303 } 4304 static struct md_sysfs_entry md_chunk_size = 4305 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 4306 4307 static ssize_t 4308 resync_start_show(struct mddev *mddev, char *page) 4309 { 4310 if (mddev->resync_offset == MaxSector) 4311 return sprintf(page, "none\n"); 4312 return sprintf(page, "%llu\n", (unsigned long long)mddev->resync_offset); 4313 } 4314 4315 static ssize_t 4316 resync_start_store(struct mddev *mddev, const char *buf, size_t len) 4317 { 4318 unsigned long long n; 4319 int err; 4320 4321 if (cmd_match(buf, "none")) 4322 n = MaxSector; 4323 else { 4324 err = kstrtoull(buf, 10, &n); 4325 if (err < 0) 4326 return err; 4327 if (n != (sector_t)n) 4328 return -EINVAL; 4329 } 4330 4331 err = mddev_lock(mddev); 4332 if (err) 4333 return err; 4334 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 4335 err = -EBUSY; 4336 4337 if (!err) { 4338 mddev->resync_offset = n; 4339 if (mddev->pers) 4340 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 4341 } 4342 mddev_unlock(mddev); 4343 return err ?: len; 4344 } 4345 static struct md_sysfs_entry md_resync_start = 4346 __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, 4347 resync_start_show, resync_start_store); 4348 4349 /* 4350 * The array state can be: 4351 * 4352 * clear 4353 * No devices, no size, no level 4354 * Equivalent to STOP_ARRAY ioctl 4355 * inactive 4356 * May have some settings, but array is not active 4357 * all IO results in error 4358 * When written, doesn't tear down array, but just stops it 4359 * suspended (not supported yet) 4360 * All IO requests will block. The array can be reconfigured. 4361 * Writing this, if accepted, will block until array is quiescent 4362 * readonly 4363 * no resync can happen. no superblocks get written. 4364 * write requests fail 4365 * read-auto 4366 * like readonly, but behaves like 'clean' on a write request. 4367 * 4368 * clean - no pending writes, but otherwise active. 
4369 * When written to inactive array, starts without resync 4370 * If a write request arrives then 4371 * if metadata is known, mark 'dirty' and switch to 'active'. 4372 * if not known, block and switch to write-pending 4373 * If written to an active array that has pending writes, then fails. 4374 * active 4375 * fully active: IO and resync can be happening. 4376 * When written to inactive array, starts with resync 4377 * 4378 * write-pending 4379 * clean, but writes are blocked waiting for 'active' to be written. 4380 * 4381 * active-idle 4382 * like active, but no writes have been seen for a while (100msec). 4383 * 4384 * broken 4385 * Array is failed. It's useful because mounted-arrays aren't stopped 4386 * when array is failed, so this state will at least alert the user that 4387 * something is wrong. 4388 */ 4389 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 4390 write_pending, active_idle, broken, bad_word}; 4391 static char *array_states[] = { 4392 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 4393 "write-pending", "active-idle", "broken", NULL }; 4394 4395 static int match_word(const char *word, char **list) 4396 { 4397 int n; 4398 for (n=0; list[n]; n++) 4399 if (cmd_match(word, list[n])) 4400 break; 4401 return n; 4402 } 4403 4404 static ssize_t 4405 array_state_show(struct mddev *mddev, char *page) 4406 { 4407 enum array_state st = inactive; 4408 4409 if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) { 4410 switch(mddev->ro) { 4411 case MD_RDONLY: 4412 st = readonly; 4413 break; 4414 case MD_AUTO_READ: 4415 st = read_auto; 4416 break; 4417 case MD_RDWR: 4418 spin_lock(&mddev->lock); 4419 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 4420 st = write_pending; 4421 else if (mddev->in_sync) 4422 st = clean; 4423 else if (mddev->safemode) 4424 st = active_idle; 4425 else 4426 st = active; 4427 spin_unlock(&mddev->lock); 4428 } 4429 4430 if (test_bit(MD_BROKEN, &mddev->flags) && st == clean) 4431 st = broken; 4432 } else { 4433 if (list_empty(&mddev->disks) && 4434 mddev->raid_disks == 0 && 4435 mddev->dev_sectors == 0) 4436 st = clear; 4437 else 4438 st = inactive; 4439 } 4440 return sprintf(page, "%s\n", array_states[st]); 4441 } 4442 4443 static int do_md_stop(struct mddev *mddev, int ro); 4444 static int md_set_readonly(struct mddev *mddev); 4445 static int restart_array(struct mddev *mddev); 4446 4447 static ssize_t 4448 array_state_store(struct mddev *mddev, const char *buf, size_t len) 4449 { 4450 int err = 0; 4451 enum array_state st = match_word(buf, array_states); 4452 4453 /* No lock dependent actions */ 4454 switch (st) { 4455 case suspended: /* not supported yet */ 4456 case write_pending: /* cannot be set */ 4457 case active_idle: /* cannot be set */ 4458 case broken: /* cannot be set */ 4459 case bad_word: 4460 return -EINVAL; 4461 case clear: 4462 case readonly: 4463 case inactive: 4464 case read_auto: 4465 if (!mddev->pers || !md_is_rdwr(mddev)) 4466 break; 4467 /* write sysfs will not open mddev and opener should be 0 */ 4468 err = mddev_set_closing_and_sync_blockdev(mddev, 0); 4469 if (err) 4470 return err; 4471 break; 4472 default: 4473 break; 4474 } 4475 4476 if (mddev->pers && (st == active || st == clean) && 4477 mddev->ro != MD_RDONLY) { 4478 /* don't take reconfig_mutex when toggling between 4479 * clean and active 4480 */ 4481 spin_lock(&mddev->lock); 4482 if (st == active) { 4483 restart_array(mddev); 4484 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4485 
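			/*
			 * MD_SB_CHANGE_PENDING has just been cleared: kick the
			 * md thread and wake anyone sleeping on sb_wait (e.g.
			 * writers waiting for the pending flag to go away).
			 */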
md_wakeup_thread(mddev->thread); 4486 wake_up(&mddev->sb_wait); 4487 } else /* st == clean */ { 4488 restart_array(mddev); 4489 if (!set_in_sync(mddev)) 4490 err = -EBUSY; 4491 } 4492 if (!err) 4493 sysfs_notify_dirent_safe(mddev->sysfs_state); 4494 spin_unlock(&mddev->lock); 4495 return err ?: len; 4496 } 4497 err = mddev_lock(mddev); 4498 if (err) 4499 return err; 4500 4501 switch (st) { 4502 case inactive: 4503 /* stop an active array, return 0 otherwise */ 4504 if (mddev->pers) 4505 err = do_md_stop(mddev, 2); 4506 break; 4507 case clear: 4508 err = do_md_stop(mddev, 0); 4509 break; 4510 case readonly: 4511 if (mddev->pers) 4512 err = md_set_readonly(mddev); 4513 else { 4514 mddev->ro = MD_RDONLY; 4515 set_disk_ro(mddev->gendisk, 1); 4516 err = do_md_run(mddev); 4517 } 4518 break; 4519 case read_auto: 4520 if (mddev->pers) { 4521 if (md_is_rdwr(mddev)) 4522 err = md_set_readonly(mddev); 4523 else if (mddev->ro == MD_RDONLY) 4524 err = restart_array(mddev); 4525 if (err == 0) { 4526 mddev->ro = MD_AUTO_READ; 4527 set_disk_ro(mddev->gendisk, 0); 4528 } 4529 } else { 4530 mddev->ro = MD_AUTO_READ; 4531 err = do_md_run(mddev); 4532 } 4533 break; 4534 case clean: 4535 if (mddev->pers) { 4536 err = restart_array(mddev); 4537 if (err) 4538 break; 4539 spin_lock(&mddev->lock); 4540 if (!set_in_sync(mddev)) 4541 err = -EBUSY; 4542 spin_unlock(&mddev->lock); 4543 } else 4544 err = -EINVAL; 4545 break; 4546 case active: 4547 if (mddev->pers) { 4548 err = restart_array(mddev); 4549 if (err) 4550 break; 4551 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4552 wake_up(&mddev->sb_wait); 4553 err = 0; 4554 } else { 4555 mddev->ro = MD_RDWR; 4556 set_disk_ro(mddev->gendisk, 0); 4557 err = do_md_run(mddev); 4558 } 4559 break; 4560 default: 4561 err = -EINVAL; 4562 break; 4563 } 4564 4565 if (!err) { 4566 if (mddev->hold_active == UNTIL_IOCTL) 4567 mddev->hold_active = 0; 4568 sysfs_notify_dirent_safe(mddev->sysfs_state); 4569 } 4570 mddev_unlock(mddev); 4571 4572 if (st == readonly || st == read_auto || st == inactive || 4573 (err && st == clear)) 4574 clear_bit(MD_CLOSING, &mddev->flags); 4575 4576 return err ?: len; 4577 } 4578 static struct md_sysfs_entry md_array_state = 4579 __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 4580 4581 static ssize_t 4582 max_corrected_read_errors_show(struct mddev *mddev, char *page) { 4583 return sprintf(page, "%d\n", 4584 atomic_read(&mddev->max_corr_read_errors)); 4585 } 4586 4587 static ssize_t 4588 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) 4589 { 4590 unsigned int n; 4591 int rv; 4592 4593 rv = kstrtouint(buf, 10, &n); 4594 if (rv < 0) 4595 return rv; 4596 if (n > INT_MAX) 4597 return -EINVAL; 4598 atomic_set(&mddev->max_corr_read_errors, n); 4599 return len; 4600 } 4601 4602 static struct md_sysfs_entry max_corr_read_errors = 4603 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, 4604 max_corrected_read_errors_store); 4605 4606 static ssize_t 4607 null_show(struct mddev *mddev, char *page) 4608 { 4609 return -EINVAL; 4610 } 4611 4612 static ssize_t 4613 new_dev_store(struct mddev *mddev, const char *buf, size_t len) 4614 { 4615 /* buf must be %d:%d\n? giving major and minor numbers */ 4616 /* The new device is added to the array. 4617 * If the array has a persistent superblock, we read the 4618 * superblock to initialise info and check validity. 4619 * Otherwise, only checking done is that in bind_rdev_to_array, 4620 * which mainly checks size. 
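 * For example, "echo 8:16 > /sys/block/md0/md/new_dev" (md0 being just an
 * example array) adds the device with major 8, minor 16.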
4621 */ 4622 char *e; 4623 int major = simple_strtoul(buf, &e, 10); 4624 int minor; 4625 dev_t dev; 4626 struct md_rdev *rdev; 4627 int err; 4628 4629 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 4630 return -EINVAL; 4631 minor = simple_strtoul(e+1, &e, 10); 4632 if (*e && *e != '\n') 4633 return -EINVAL; 4634 dev = MKDEV(major, minor); 4635 if (major != MAJOR(dev) || 4636 minor != MINOR(dev)) 4637 return -EOVERFLOW; 4638 4639 err = mddev_suspend_and_lock(mddev); 4640 if (err) 4641 return err; 4642 if (mddev->persistent) { 4643 rdev = md_import_device(dev, mddev->major_version, 4644 mddev->minor_version); 4645 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 4646 struct md_rdev *rdev0 4647 = list_entry(mddev->disks.next, 4648 struct md_rdev, same_set); 4649 err = super_types[mddev->major_version] 4650 .load_super(rdev, rdev0, mddev->minor_version); 4651 if (err < 0) 4652 goto out; 4653 } 4654 } else if (mddev->external) 4655 rdev = md_import_device(dev, -2, -1); 4656 else 4657 rdev = md_import_device(dev, -1, -1); 4658 4659 if (IS_ERR(rdev)) { 4660 mddev_unlock_and_resume(mddev); 4661 return PTR_ERR(rdev); 4662 } 4663 err = bind_rdev_to_array(rdev, mddev); 4664 out: 4665 if (err) 4666 export_rdev(rdev, mddev); 4667 mddev_unlock_and_resume(mddev); 4668 if (!err) 4669 md_new_event(); 4670 return err ? err : len; 4671 } 4672 4673 static struct md_sysfs_entry md_new_device = 4674 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 4675 4676 static ssize_t 4677 bitmap_store(struct mddev *mddev, const char *buf, size_t len) 4678 { 4679 char *end; 4680 unsigned long chunk, end_chunk; 4681 int err; 4682 4683 err = mddev_lock(mddev); 4684 if (err) 4685 return err; 4686 if (!mddev->bitmap) 4687 goto out; 4688 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ 4689 while (*buf) { 4690 chunk = end_chunk = simple_strtoul(buf, &end, 0); 4691 if (buf == end) 4692 break; 4693 4694 if (*end == '-') { /* range */ 4695 buf = end + 1; 4696 end_chunk = simple_strtoul(buf, &end, 0); 4697 if (buf == end) 4698 break; 4699 } 4700 4701 if (*end && !isspace(*end)) 4702 break; 4703 4704 mddev->bitmap_ops->dirty_bits(mddev, chunk, end_chunk); 4705 buf = skip_spaces(end); 4706 } 4707 mddev->bitmap_ops->unplug(mddev, true); /* flush the bits to disk */ 4708 out: 4709 mddev_unlock(mddev); 4710 return len; 4711 } 4712 4713 static struct md_sysfs_entry md_bitmap = 4714 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 4715 4716 static ssize_t 4717 size_show(struct mddev *mddev, char *page) 4718 { 4719 return sprintf(page, "%llu\n", 4720 (unsigned long long)mddev->dev_sectors / 2); 4721 } 4722 4723 static int update_size(struct mddev *mddev, sector_t num_sectors); 4724 4725 static ssize_t 4726 size_store(struct mddev *mddev, const char *buf, size_t len) 4727 { 4728 /* If array is inactive, we can reduce the component size, but 4729 * not increase it (except from 0). 4730 * If array is active, we can try an on-line resize 4731 */ 4732 sector_t sectors; 4733 int err = strict_blocks_to_sectors(buf, §ors); 4734 4735 if (err < 0) 4736 return err; 4737 err = mddev_lock(mddev); 4738 if (err) 4739 return err; 4740 if (mddev->pers) { 4741 err = update_size(mddev, sectors); 4742 if (err == 0) 4743 md_update_sb(mddev, 1); 4744 } else { 4745 if (mddev->dev_sectors == 0 || 4746 mddev->dev_sectors > sectors) 4747 mddev->dev_sectors = sectors; 4748 else 4749 err = -ENOSPC; 4750 } 4751 mddev_unlock(mddev); 4752 return err ? 
err : len; 4753 } 4754 4755 static struct md_sysfs_entry md_size = 4756 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 4757 4758 /* Metadata version. 4759 * This is one of 4760 * 'none' for arrays with no metadata (good luck...) 4761 * 'external' for arrays with externally managed metadata, 4762 * or N.M for internally known formats 4763 */ 4764 static ssize_t 4765 metadata_show(struct mddev *mddev, char *page) 4766 { 4767 if (mddev->persistent) 4768 return sprintf(page, "%d.%d\n", 4769 mddev->major_version, mddev->minor_version); 4770 else if (mddev->external) 4771 return sprintf(page, "external:%s\n", mddev->metadata_type); 4772 else 4773 return sprintf(page, "none\n"); 4774 } 4775 4776 static ssize_t 4777 metadata_store(struct mddev *mddev, const char *buf, size_t len) 4778 { 4779 int major, minor; 4780 char *e; 4781 int err; 4782 /* Changing the details of 'external' metadata is 4783 * always permitted. Otherwise there must be 4784 * no devices attached to the array. 4785 */ 4786 4787 err = mddev_lock(mddev); 4788 if (err) 4789 return err; 4790 err = -EBUSY; 4791 if (mddev->external && strncmp(buf, "external:", 9) == 0) 4792 ; 4793 else if (!list_empty(&mddev->disks)) 4794 goto out_unlock; 4795 4796 err = 0; 4797 if (cmd_match(buf, "none")) { 4798 mddev->persistent = 0; 4799 mddev->external = 0; 4800 mddev->major_version = 0; 4801 mddev->minor_version = 90; 4802 goto out_unlock; 4803 } 4804 if (strncmp(buf, "external:", 9) == 0) { 4805 size_t namelen = len-9; 4806 if (namelen >= sizeof(mddev->metadata_type)) 4807 namelen = sizeof(mddev->metadata_type)-1; 4808 memcpy(mddev->metadata_type, buf+9, namelen); 4809 mddev->metadata_type[namelen] = 0; 4810 if (namelen && mddev->metadata_type[namelen-1] == '\n') 4811 mddev->metadata_type[--namelen] = 0; 4812 mddev->persistent = 0; 4813 mddev->external = 1; 4814 mddev->major_version = 0; 4815 mddev->minor_version = 90; 4816 goto out_unlock; 4817 } 4818 major = simple_strtoul(buf, &e, 10); 4819 err = -EINVAL; 4820 if (e==buf || *e != '.') 4821 goto out_unlock; 4822 buf = e+1; 4823 minor = simple_strtoul(buf, &e, 10); 4824 if (e==buf || (*e && *e != '\n') ) 4825 goto out_unlock; 4826 err = -ENOENT; 4827 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 4828 goto out_unlock; 4829 mddev->major_version = major; 4830 mddev->minor_version = minor; 4831 mddev->persistent = 1; 4832 mddev->external = 0; 4833 err = 0; 4834 out_unlock: 4835 mddev_unlock(mddev); 4836 return err ?: len; 4837 } 4838 4839 static struct md_sysfs_entry md_metadata = 4840 __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 4841 4842 static bool rdev_needs_recovery(struct md_rdev *rdev, sector_t sectors) 4843 { 4844 return rdev->raid_disk >= 0 && 4845 !test_bit(Journal, &rdev->flags) && 4846 !test_bit(Faulty, &rdev->flags) && 4847 !test_bit(In_sync, &rdev->flags) && 4848 rdev->recovery_offset < sectors; 4849 } 4850 4851 static enum sync_action md_get_active_sync_action(struct mddev *mddev) 4852 { 4853 struct md_rdev *rdev; 4854 bool is_recover = false; 4855 4856 if (mddev->resync_offset < MaxSector) 4857 return ACTION_RESYNC; 4858 4859 if (mddev->reshape_position != MaxSector) 4860 return ACTION_RESHAPE; 4861 4862 rcu_read_lock(); 4863 rdev_for_each_rcu(rdev, mddev) { 4864 if (rdev_needs_recovery(rdev, MaxSector)) { 4865 is_recover = true; 4866 break; 4867 } 4868 } 4869 rcu_read_unlock(); 4870 4871 return is_recover ? 
ACTION_RECOVER : ACTION_IDLE; 4872 } 4873 4874 enum sync_action md_sync_action(struct mddev *mddev) 4875 { 4876 unsigned long recovery = mddev->recovery; 4877 enum sync_action active_action; 4878 4879 /* 4880 * frozen has the highest priority, means running sync_thread will be 4881 * stopped immediately, and no new sync_thread can start. 4882 */ 4883 if (test_bit(MD_RECOVERY_FROZEN, &recovery)) 4884 return ACTION_FROZEN; 4885 4886 /* 4887 * read-only array can't register sync_thread, and it can only 4888 * add/remove spares. 4889 */ 4890 if (!md_is_rdwr(mddev)) 4891 return ACTION_IDLE; 4892 4893 /* 4894 * idle means no sync_thread is running, and no new sync_thread is 4895 * requested. 4896 */ 4897 if (!test_bit(MD_RECOVERY_RUNNING, &recovery) && 4898 !test_bit(MD_RECOVERY_NEEDED, &recovery)) 4899 return ACTION_IDLE; 4900 4901 /* 4902 * Check if any sync operation (resync/recover/reshape) is 4903 * currently active. This ensures that only one sync operation 4904 * can run at a time. Returns the type of active operation, or 4905 * ACTION_IDLE if none are active. 4906 */ 4907 active_action = md_get_active_sync_action(mddev); 4908 if (active_action != ACTION_IDLE) 4909 return active_action; 4910 4911 if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) 4912 return ACTION_RESHAPE; 4913 4914 if (test_bit(MD_RECOVERY_RECOVER, &recovery)) 4915 return ACTION_RECOVER; 4916 4917 if (test_bit(MD_RECOVERY_SYNC, &recovery)) { 4918 /* 4919 * MD_RECOVERY_CHECK must be paired with 4920 * MD_RECOVERY_REQUESTED. 4921 */ 4922 if (test_bit(MD_RECOVERY_CHECK, &recovery)) 4923 return ACTION_CHECK; 4924 if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) 4925 return ACTION_REPAIR; 4926 return ACTION_RESYNC; 4927 } 4928 4929 /* 4930 * MD_RECOVERY_NEEDED or MD_RECOVERY_RUNNING is set, however, no 4931 * sync_action is specified. 4932 */ 4933 return ACTION_IDLE; 4934 } 4935 4936 enum sync_action md_sync_action_by_name(const char *page) 4937 { 4938 enum sync_action action; 4939 4940 for (action = 0; action < NR_SYNC_ACTIONS; ++action) { 4941 if (cmd_match(page, action_name[action])) 4942 return action; 4943 } 4944 4945 return NR_SYNC_ACTIONS; 4946 } 4947 4948 const char *md_sync_action_name(enum sync_action action) 4949 { 4950 return action_name[action]; 4951 } 4952 4953 static ssize_t 4954 action_show(struct mddev *mddev, char *page) 4955 { 4956 enum sync_action action = md_sync_action(mddev); 4957 4958 return sprintf(page, "%s\n", md_sync_action_name(action)); 4959 } 4960 4961 /** 4962 * stop_sync_thread() - wait for sync_thread to stop if it's running. 4963 * @mddev: the array. 4964 * @locked: if set, reconfig_mutex will still be held after this function 4965 * return; if not set, reconfig_mutex will be released after this 4966 * function return. 
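 *
 * Must be called with reconfig_mutex held; the mutex is dropped while
 * waiting for a running sync_thread to exit.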
4967 */ 4968 static void stop_sync_thread(struct mddev *mddev, bool locked) 4969 { 4970 int sync_seq = atomic_read(&mddev->sync_seq); 4971 4972 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 4973 if (!locked) 4974 mddev_unlock(mddev); 4975 return; 4976 } 4977 4978 mddev_unlock(mddev); 4979 4980 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4981 /* 4982 * Thread might be blocked waiting for metadata update which will now 4983 * never happen 4984 */ 4985 md_wakeup_thread_directly(mddev->sync_thread); 4986 if (work_pending(&mddev->sync_work)) 4987 flush_work(&mddev->sync_work); 4988 4989 wait_event(resync_wait, 4990 !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 4991 (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery) && 4992 sync_seq != atomic_read(&mddev->sync_seq))); 4993 4994 if (locked) 4995 mddev_lock_nointr(mddev); 4996 } 4997 4998 void md_idle_sync_thread(struct mddev *mddev) 4999 { 5000 lockdep_assert_held(&mddev->reconfig_mutex); 5001 5002 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5003 stop_sync_thread(mddev, true); 5004 } 5005 EXPORT_SYMBOL_GPL(md_idle_sync_thread); 5006 5007 void md_frozen_sync_thread(struct mddev *mddev) 5008 { 5009 lockdep_assert_held(&mddev->reconfig_mutex); 5010 5011 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5012 stop_sync_thread(mddev, true); 5013 } 5014 EXPORT_SYMBOL_GPL(md_frozen_sync_thread); 5015 5016 void md_unfrozen_sync_thread(struct mddev *mddev) 5017 { 5018 lockdep_assert_held(&mddev->reconfig_mutex); 5019 5020 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5021 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5022 md_wakeup_thread(mddev->thread); 5023 sysfs_notify_dirent_safe(mddev->sysfs_action); 5024 } 5025 EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread); 5026 5027 static int mddev_start_reshape(struct mddev *mddev) 5028 { 5029 int ret; 5030 5031 if (mddev->pers->start_reshape == NULL) 5032 return -EINVAL; 5033 5034 if (mddev->reshape_position == MaxSector || 5035 mddev->pers->check_reshape == NULL || 5036 mddev->pers->check_reshape(mddev)) { 5037 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5038 ret = mddev->pers->start_reshape(mddev); 5039 if (ret) 5040 return ret; 5041 } else { 5042 /* 5043 * If reshape is still in progress, and md_check_recovery() can 5044 * continue to reshape, don't restart reshape because data can 5045 * be corrupted for raid456. 5046 */ 5047 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5048 } 5049 5050 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 5051 return 0; 5052 } 5053 5054 static ssize_t 5055 action_store(struct mddev *mddev, const char *page, size_t len) 5056 { 5057 int ret; 5058 enum sync_action action; 5059 5060 if (!mddev->pers || !mddev->pers->sync_request) 5061 return -EINVAL; 5062 5063 retry: 5064 if (work_busy(&mddev->sync_work)) 5065 flush_work(&mddev->sync_work); 5066 5067 ret = mddev_lock(mddev); 5068 if (ret) 5069 return ret; 5070 5071 if (work_busy(&mddev->sync_work)) { 5072 mddev_unlock(mddev); 5073 goto retry; 5074 } 5075 5076 action = md_sync_action_by_name(page); 5077 5078 /* TODO: mdadm rely on "idle" to start sync_thread. 
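	 * Writing "idle" therefore has to work in both branches below: it
	 * interrupts a running sync_thread, or merely clears
	 * MD_RECOVERY_FROZEN so that a new one can be scheduled.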
*/ 5079 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 5080 switch (action) { 5081 case ACTION_FROZEN: 5082 md_frozen_sync_thread(mddev); 5083 ret = len; 5084 goto out; 5085 case ACTION_IDLE: 5086 md_idle_sync_thread(mddev); 5087 break; 5088 case ACTION_RESHAPE: 5089 case ACTION_RECOVER: 5090 case ACTION_CHECK: 5091 case ACTION_REPAIR: 5092 case ACTION_RESYNC: 5093 ret = -EBUSY; 5094 goto out; 5095 default: 5096 ret = -EINVAL; 5097 goto out; 5098 } 5099 } else { 5100 switch (action) { 5101 case ACTION_FROZEN: 5102 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5103 ret = len; 5104 goto out; 5105 case ACTION_RESHAPE: 5106 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5107 ret = mddev_start_reshape(mddev); 5108 if (ret) 5109 goto out; 5110 break; 5111 case ACTION_RECOVER: 5112 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5113 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 5114 break; 5115 case ACTION_CHECK: 5116 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5117 fallthrough; 5118 case ACTION_REPAIR: 5119 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 5120 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5121 fallthrough; 5122 case ACTION_RESYNC: 5123 case ACTION_IDLE: 5124 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5125 break; 5126 default: 5127 ret = -EINVAL; 5128 goto out; 5129 } 5130 } 5131 5132 if (mddev->ro == MD_AUTO_READ) { 5133 /* A write to sync_action is enough to justify 5134 * canceling read-auto mode 5135 */ 5136 mddev->ro = MD_RDWR; 5137 md_wakeup_thread(mddev->sync_thread); 5138 } 5139 5140 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5141 md_wakeup_thread(mddev->thread); 5142 sysfs_notify_dirent_safe(mddev->sysfs_action); 5143 ret = len; 5144 5145 out: 5146 mddev_unlock(mddev); 5147 return ret; 5148 } 5149 5150 static struct md_sysfs_entry md_scan_mode = 5151 __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 5152 5153 static ssize_t 5154 last_sync_action_show(struct mddev *mddev, char *page) 5155 { 5156 return sprintf(page, "%s\n", 5157 md_sync_action_name(mddev->last_sync_action)); 5158 } 5159 5160 static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); 5161 5162 static ssize_t 5163 mismatch_cnt_show(struct mddev *mddev, char *page) 5164 { 5165 return sprintf(page, "%llu\n", 5166 (unsigned long long) 5167 atomic64_read(&mddev->resync_mismatches)); 5168 } 5169 5170 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 5171 5172 static ssize_t 5173 sync_min_show(struct mddev *mddev, char *page) 5174 { 5175 return sprintf(page, "%d (%s)\n", speed_min(mddev), 5176 mddev->sync_speed_min ? "local" : "system"); 5177 } 5178 5179 static ssize_t 5180 sync_min_store(struct mddev *mddev, const char *buf, size_t len) 5181 { 5182 unsigned int min; 5183 int rv; 5184 5185 if (strncmp(buf, "system", 6) == 0) { 5186 min = 0; 5187 } else { 5188 rv = kstrtouint(buf, 10, &min); 5189 if (rv < 0) 5190 return rv; 5191 if (min == 0) 5192 return -EINVAL; 5193 } 5194 mddev->sync_speed_min = min; 5195 return len; 5196 } 5197 5198 static struct md_sysfs_entry md_sync_min = 5199 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 5200 5201 static ssize_t 5202 sync_max_show(struct mddev *mddev, char *page) 5203 { 5204 return sprintf(page, "%d (%s)\n", speed_max(mddev), 5205 mddev->sync_speed_max ? 
"local" : "system"); 5206 } 5207 5208 static ssize_t 5209 sync_max_store(struct mddev *mddev, const char *buf, size_t len) 5210 { 5211 unsigned int max; 5212 int rv; 5213 5214 if (strncmp(buf, "system", 6) == 0) { 5215 max = 0; 5216 } else { 5217 rv = kstrtouint(buf, 10, &max); 5218 if (rv < 0) 5219 return rv; 5220 if (max == 0) 5221 return -EINVAL; 5222 } 5223 mddev->sync_speed_max = max; 5224 return len; 5225 } 5226 5227 static struct md_sysfs_entry md_sync_max = 5228 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 5229 5230 static ssize_t 5231 sync_io_depth_show(struct mddev *mddev, char *page) 5232 { 5233 return sprintf(page, "%d (%s)\n", sync_io_depth(mddev), 5234 mddev->sync_io_depth ? "local" : "system"); 5235 } 5236 5237 static ssize_t 5238 sync_io_depth_store(struct mddev *mddev, const char *buf, size_t len) 5239 { 5240 unsigned int max; 5241 int rv; 5242 5243 if (strncmp(buf, "system", 6) == 0) { 5244 max = 0; 5245 } else { 5246 rv = kstrtouint(buf, 10, &max); 5247 if (rv < 0) 5248 return rv; 5249 if (max == 0) 5250 return -EINVAL; 5251 } 5252 mddev->sync_io_depth = max; 5253 return len; 5254 } 5255 5256 static struct md_sysfs_entry md_sync_io_depth = 5257 __ATTR_RW(sync_io_depth); 5258 5259 static ssize_t 5260 degraded_show(struct mddev *mddev, char *page) 5261 { 5262 return sprintf(page, "%d\n", mddev->degraded); 5263 } 5264 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 5265 5266 static ssize_t 5267 sync_force_parallel_show(struct mddev *mddev, char *page) 5268 { 5269 return sprintf(page, "%d\n", mddev->parallel_resync); 5270 } 5271 5272 static ssize_t 5273 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) 5274 { 5275 long n; 5276 5277 if (kstrtol(buf, 10, &n)) 5278 return -EINVAL; 5279 5280 if (n != 0 && n != 1) 5281 return -EINVAL; 5282 5283 mddev->parallel_resync = n; 5284 5285 if (mddev->sync_thread) 5286 wake_up(&resync_wait); 5287 5288 return len; 5289 } 5290 5291 /* force parallel resync, even with shared block devices */ 5292 static struct md_sysfs_entry md_sync_force_parallel = 5293 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 5294 sync_force_parallel_show, sync_force_parallel_store); 5295 5296 static ssize_t 5297 sync_speed_show(struct mddev *mddev, char *page) 5298 { 5299 unsigned long resync, dt, db; 5300 if (mddev->curr_resync == MD_RESYNC_NONE) 5301 return sprintf(page, "none\n"); 5302 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 5303 dt = (jiffies - mddev->resync_mark) / HZ; 5304 if (!dt) dt++; 5305 db = resync - mddev->resync_mark_cnt; 5306 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 5307 } 5308 5309 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 5310 5311 static ssize_t 5312 sync_completed_show(struct mddev *mddev, char *page) 5313 { 5314 unsigned long long max_sectors, resync; 5315 5316 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5317 return sprintf(page, "none\n"); 5318 5319 if (mddev->curr_resync == MD_RESYNC_YIELDED || 5320 mddev->curr_resync == MD_RESYNC_DELAYED) 5321 return sprintf(page, "delayed\n"); 5322 5323 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 5324 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5325 max_sectors = mddev->resync_max_sectors; 5326 else 5327 max_sectors = mddev->dev_sectors; 5328 5329 resync = mddev->curr_resync_completed; 5330 return sprintf(page, "%llu / %llu\n", resync, max_sectors); 5331 } 5332 5333 static struct md_sysfs_entry md_sync_completed = 5334 __ATTR_PREALLOC(sync_completed, 
S_IRUGO, sync_completed_show, NULL); 5335 5336 static ssize_t 5337 min_sync_show(struct mddev *mddev, char *page) 5338 { 5339 return sprintf(page, "%llu\n", 5340 (unsigned long long)mddev->resync_min); 5341 } 5342 static ssize_t 5343 min_sync_store(struct mddev *mddev, const char *buf, size_t len) 5344 { 5345 unsigned long long min; 5346 int err; 5347 5348 if (kstrtoull(buf, 10, &min)) 5349 return -EINVAL; 5350 5351 spin_lock(&mddev->lock); 5352 err = -EINVAL; 5353 if (min > mddev->resync_max) 5354 goto out_unlock; 5355 5356 err = -EBUSY; 5357 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5358 goto out_unlock; 5359 5360 /* Round down to multiple of 4K for safety */ 5361 mddev->resync_min = round_down(min, 8); 5362 err = 0; 5363 5364 out_unlock: 5365 spin_unlock(&mddev->lock); 5366 return err ?: len; 5367 } 5368 5369 static struct md_sysfs_entry md_min_sync = 5370 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 5371 5372 static ssize_t 5373 max_sync_show(struct mddev *mddev, char *page) 5374 { 5375 if (mddev->resync_max == MaxSector) 5376 return sprintf(page, "max\n"); 5377 else 5378 return sprintf(page, "%llu\n", 5379 (unsigned long long)mddev->resync_max); 5380 } 5381 static ssize_t 5382 max_sync_store(struct mddev *mddev, const char *buf, size_t len) 5383 { 5384 int err; 5385 spin_lock(&mddev->lock); 5386 if (strncmp(buf, "max", 3) == 0) 5387 mddev->resync_max = MaxSector; 5388 else { 5389 unsigned long long max; 5390 int chunk; 5391 5392 err = -EINVAL; 5393 if (kstrtoull(buf, 10, &max)) 5394 goto out_unlock; 5395 if (max < mddev->resync_min) 5396 goto out_unlock; 5397 5398 err = -EBUSY; 5399 if (max < mddev->resync_max && md_is_rdwr(mddev) && 5400 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5401 goto out_unlock; 5402 5403 /* Must be a multiple of chunk_size */ 5404 chunk = mddev->chunk_sectors; 5405 if (chunk) { 5406 sector_t temp = max; 5407 5408 err = -EINVAL; 5409 if (sector_div(temp, chunk)) 5410 goto out_unlock; 5411 } 5412 mddev->resync_max = max; 5413 } 5414 wake_up(&mddev->recovery_wait); 5415 err = 0; 5416 out_unlock: 5417 spin_unlock(&mddev->lock); 5418 return err ?: len; 5419 } 5420 5421 static struct md_sysfs_entry md_max_sync = 5422 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 5423 5424 static ssize_t 5425 suspend_lo_show(struct mddev *mddev, char *page) 5426 { 5427 return sprintf(page, "%llu\n", 5428 (unsigned long long)READ_ONCE(mddev->suspend_lo)); 5429 } 5430 5431 static ssize_t 5432 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) 5433 { 5434 unsigned long long new; 5435 int err; 5436 5437 err = kstrtoull(buf, 10, &new); 5438 if (err < 0) 5439 return err; 5440 if (new != (sector_t)new) 5441 return -EINVAL; 5442 5443 err = mddev_suspend(mddev, true); 5444 if (err) 5445 return err; 5446 5447 WRITE_ONCE(mddev->suspend_lo, new); 5448 mddev_resume(mddev); 5449 5450 return len; 5451 } 5452 static struct md_sysfs_entry md_suspend_lo = 5453 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 5454 5455 static ssize_t 5456 suspend_hi_show(struct mddev *mddev, char *page) 5457 { 5458 return sprintf(page, "%llu\n", 5459 (unsigned long long)READ_ONCE(mddev->suspend_hi)); 5460 } 5461 5462 static ssize_t 5463 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) 5464 { 5465 unsigned long long new; 5466 int err; 5467 5468 err = kstrtoull(buf, 10, &new); 5469 if (err < 0) 5470 return err; 5471 if (new != (sector_t)new) 5472 return -EINVAL; 5473 5474 err = mddev_suspend(mddev, true); 
5475 if (err) 5476 return err; 5477 5478 WRITE_ONCE(mddev->suspend_hi, new); 5479 mddev_resume(mddev); 5480 5481 return len; 5482 } 5483 static struct md_sysfs_entry md_suspend_hi = 5484 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 5485 5486 static ssize_t 5487 reshape_position_show(struct mddev *mddev, char *page) 5488 { 5489 if (mddev->reshape_position != MaxSector) 5490 return sprintf(page, "%llu\n", 5491 (unsigned long long)mddev->reshape_position); 5492 strcpy(page, "none\n"); 5493 return 5; 5494 } 5495 5496 static ssize_t 5497 reshape_position_store(struct mddev *mddev, const char *buf, size_t len) 5498 { 5499 struct md_rdev *rdev; 5500 unsigned long long new; 5501 int err; 5502 5503 err = kstrtoull(buf, 10, &new); 5504 if (err < 0) 5505 return err; 5506 if (new != (sector_t)new) 5507 return -EINVAL; 5508 err = mddev_lock(mddev); 5509 if (err) 5510 return err; 5511 err = -EBUSY; 5512 if (mddev->pers) 5513 goto unlock; 5514 mddev->reshape_position = new; 5515 mddev->delta_disks = 0; 5516 mddev->reshape_backwards = 0; 5517 mddev->new_level = mddev->level; 5518 mddev->new_layout = mddev->layout; 5519 mddev->new_chunk_sectors = mddev->chunk_sectors; 5520 rdev_for_each(rdev, mddev) 5521 rdev->new_data_offset = rdev->data_offset; 5522 err = 0; 5523 unlock: 5524 mddev_unlock(mddev); 5525 return err ?: len; 5526 } 5527 5528 static struct md_sysfs_entry md_reshape_position = 5529 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 5530 reshape_position_store); 5531 5532 static ssize_t 5533 reshape_direction_show(struct mddev *mddev, char *page) 5534 { 5535 return sprintf(page, "%s\n", 5536 mddev->reshape_backwards ? "backwards" : "forwards"); 5537 } 5538 5539 static ssize_t 5540 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) 5541 { 5542 int backwards = 0; 5543 int err; 5544 5545 if (cmd_match(buf, "forwards")) 5546 backwards = 0; 5547 else if (cmd_match(buf, "backwards")) 5548 backwards = 1; 5549 else 5550 return -EINVAL; 5551 if (mddev->reshape_backwards == backwards) 5552 return len; 5553 5554 err = mddev_lock(mddev); 5555 if (err) 5556 return err; 5557 /* check if we are allowed to change */ 5558 if (mddev->delta_disks) 5559 err = -EBUSY; 5560 else if (mddev->persistent && 5561 mddev->major_version == 0) 5562 err = -EINVAL; 5563 else 5564 mddev->reshape_backwards = backwards; 5565 mddev_unlock(mddev); 5566 return err ?: len; 5567 } 5568 5569 static struct md_sysfs_entry md_reshape_direction = 5570 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, 5571 reshape_direction_store); 5572 5573 static ssize_t 5574 array_size_show(struct mddev *mddev, char *page) 5575 { 5576 if (mddev->external_size) 5577 return sprintf(page, "%llu\n", 5578 (unsigned long long)mddev->array_sectors/2); 5579 else 5580 return sprintf(page, "default\n"); 5581 } 5582 5583 static ssize_t 5584 array_size_store(struct mddev *mddev, const char *buf, size_t len) 5585 { 5586 sector_t sectors; 5587 int err; 5588 5589 err = mddev_lock(mddev); 5590 if (err) 5591 return err; 5592 5593 /* cluster raid doesn't support change array_sectors */ 5594 if (mddev_is_clustered(mddev)) { 5595 mddev_unlock(mddev); 5596 return -EINVAL; 5597 } 5598 5599 if (strncmp(buf, "default", 7) == 0) { 5600 if (mddev->pers) 5601 sectors = mddev->pers->size(mddev, 0, 0); 5602 else 5603 sectors = mddev->array_sectors; 5604 5605 mddev->external_size = 0; 5606 } else { 5607 if (strict_blocks_to_sectors(buf, §ors) < 0) 5608 err = -EINVAL; 5609 else if (mddev->pers && 
mddev->pers->size(mddev, 0, 0) < sectors) 5610 err = -E2BIG; 5611 else 5612 mddev->external_size = 1; 5613 } 5614 5615 if (!err) { 5616 mddev->array_sectors = sectors; 5617 if (mddev->pers) 5618 set_capacity_and_notify(mddev->gendisk, 5619 mddev->array_sectors); 5620 } 5621 mddev_unlock(mddev); 5622 return err ?: len; 5623 } 5624 5625 static struct md_sysfs_entry md_array_size = 5626 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, 5627 array_size_store); 5628 5629 static ssize_t 5630 consistency_policy_show(struct mddev *mddev, char *page) 5631 { 5632 int ret; 5633 5634 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 5635 ret = sprintf(page, "journal\n"); 5636 } else if (test_bit(MD_HAS_PPL, &mddev->flags)) { 5637 ret = sprintf(page, "ppl\n"); 5638 } else if (mddev->bitmap) { 5639 ret = sprintf(page, "bitmap\n"); 5640 } else if (mddev->pers) { 5641 if (mddev->pers->sync_request) 5642 ret = sprintf(page, "resync\n"); 5643 else 5644 ret = sprintf(page, "none\n"); 5645 } else { 5646 ret = sprintf(page, "unknown\n"); 5647 } 5648 5649 return ret; 5650 } 5651 5652 static ssize_t 5653 consistency_policy_store(struct mddev *mddev, const char *buf, size_t len) 5654 { 5655 int err = 0; 5656 5657 if (mddev->pers) { 5658 if (mddev->pers->change_consistency_policy) 5659 err = mddev->pers->change_consistency_policy(mddev, buf); 5660 else 5661 err = -EBUSY; 5662 } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) { 5663 set_bit(MD_HAS_PPL, &mddev->flags); 5664 } else { 5665 err = -EINVAL; 5666 } 5667 5668 return err ? err : len; 5669 } 5670 5671 static struct md_sysfs_entry md_consistency_policy = 5672 __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show, 5673 consistency_policy_store); 5674 5675 static ssize_t fail_last_dev_show(struct mddev *mddev, char *page) 5676 { 5677 return sprintf(page, "%d\n", mddev->fail_last_dev); 5678 } 5679 5680 /* 5681 * Setting fail_last_dev to true to allow last device to be forcibly removed 5682 * from RAID1/RAID10. 5683 */ 5684 static ssize_t 5685 fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len) 5686 { 5687 int ret; 5688 bool value; 5689 5690 ret = kstrtobool(buf, &value); 5691 if (ret) 5692 return ret; 5693 5694 if (value != mddev->fail_last_dev) 5695 mddev->fail_last_dev = value; 5696 5697 return len; 5698 } 5699 static struct md_sysfs_entry md_fail_last_dev = 5700 __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, 5701 fail_last_dev_store); 5702 5703 static ssize_t serialize_policy_show(struct mddev *mddev, char *page) 5704 { 5705 if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) 5706 return sprintf(page, "n/a\n"); 5707 else 5708 return sprintf(page, "%d\n", mddev->serialize_policy); 5709 } 5710 5711 /* 5712 * Setting serialize_policy to true to enforce write IO is not reordered 5713 * for raid1. 
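 * Changing the value suspends the array and creates or destroys the rdev
 * serial pools via mddev_create_serial_pool()/mddev_destroy_serial_pool().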
5714 */ 5715 static ssize_t 5716 serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) 5717 { 5718 int err; 5719 bool value; 5720 5721 err = kstrtobool(buf, &value); 5722 if (err) 5723 return err; 5724 5725 if (value == mddev->serialize_policy) 5726 return len; 5727 5728 err = mddev_suspend_and_lock(mddev); 5729 if (err) 5730 return err; 5731 if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) { 5732 pr_err("md: serialize_policy is only effective for raid1\n"); 5733 err = -EINVAL; 5734 goto unlock; 5735 } 5736 5737 if (value) 5738 mddev_create_serial_pool(mddev, NULL); 5739 else 5740 mddev_destroy_serial_pool(mddev, NULL); 5741 mddev->serialize_policy = value; 5742 unlock: 5743 mddev_unlock_and_resume(mddev); 5744 return err ?: len; 5745 } 5746 5747 static struct md_sysfs_entry md_serialize_policy = 5748 __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, 5749 serialize_policy_store); 5750 5751 5752 static struct attribute *md_default_attrs[] = { 5753 &md_level.attr, 5754 &md_new_level.attr, 5755 &md_layout.attr, 5756 &md_raid_disks.attr, 5757 &md_uuid.attr, 5758 &md_chunk_size.attr, 5759 &md_size.attr, 5760 &md_resync_start.attr, 5761 &md_metadata.attr, 5762 &md_new_device.attr, 5763 &md_safe_delay.attr, 5764 &md_array_state.attr, 5765 &md_reshape_position.attr, 5766 &md_reshape_direction.attr, 5767 &md_array_size.attr, 5768 &max_corr_read_errors.attr, 5769 &md_consistency_policy.attr, 5770 &md_fail_last_dev.attr, 5771 &md_serialize_policy.attr, 5772 NULL, 5773 }; 5774 5775 static const struct attribute_group md_default_group = { 5776 .attrs = md_default_attrs, 5777 }; 5778 5779 static struct attribute *md_redundancy_attrs[] = { 5780 &md_scan_mode.attr, 5781 &md_last_scan_mode.attr, 5782 &md_mismatches.attr, 5783 &md_sync_min.attr, 5784 &md_sync_max.attr, 5785 &md_sync_io_depth.attr, 5786 &md_sync_speed.attr, 5787 &md_sync_force_parallel.attr, 5788 &md_sync_completed.attr, 5789 &md_min_sync.attr, 5790 &md_max_sync.attr, 5791 &md_suspend_lo.attr, 5792 &md_suspend_hi.attr, 5793 &md_bitmap.attr, 5794 &md_degraded.attr, 5795 NULL, 5796 }; 5797 static const struct attribute_group md_redundancy_group = { 5798 .name = NULL, 5799 .attrs = md_redundancy_attrs, 5800 }; 5801 5802 static const struct attribute_group *md_attr_groups[] = { 5803 &md_default_group, 5804 &md_bitmap_group, 5805 NULL, 5806 }; 5807 5808 static ssize_t 5809 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 5810 { 5811 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 5812 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 5813 ssize_t rv; 5814 5815 if (!entry->show) 5816 return -EIO; 5817 spin_lock(&all_mddevs_lock); 5818 if (!mddev_get(mddev)) { 5819 spin_unlock(&all_mddevs_lock); 5820 return -EBUSY; 5821 } 5822 spin_unlock(&all_mddevs_lock); 5823 5824 rv = entry->show(mddev, page); 5825 mddev_put(mddev); 5826 return rv; 5827 } 5828 5829 static ssize_t 5830 md_attr_store(struct kobject *kobj, struct attribute *attr, 5831 const char *page, size_t length) 5832 { 5833 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 5834 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 5835 ssize_t rv; 5836 struct kernfs_node *kn = NULL; 5837 5838 if (!entry->store) 5839 return -EIO; 5840 if (!capable(CAP_SYS_ADMIN)) 5841 return -EACCES; 5842 5843 if (entry->store == array_state_store && cmd_match(page, "clear")) 5844 kn = sysfs_break_active_protection(kobj, attr); 5845 5846 
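	/*
	 * Pin the mddev so it cannot go away while the store method runs;
	 * if it is already being torn down, mddev_get() fails and we
	 * return -EBUSY.
	 */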
spin_lock(&all_mddevs_lock); 5847 if (!mddev_get(mddev)) { 5848 spin_unlock(&all_mddevs_lock); 5849 if (kn) 5850 sysfs_unbreak_active_protection(kn); 5851 return -EBUSY; 5852 } 5853 spin_unlock(&all_mddevs_lock); 5854 rv = entry->store(mddev, page, length); 5855 mddev_put(mddev); 5856 5857 if (kn) 5858 sysfs_unbreak_active_protection(kn); 5859 5860 return rv; 5861 } 5862 5863 static void md_kobj_release(struct kobject *ko) 5864 { 5865 struct mddev *mddev = container_of(ko, struct mddev, kobj); 5866 5867 if (legacy_async_del_gendisk) { 5868 if (mddev->sysfs_state) 5869 sysfs_put(mddev->sysfs_state); 5870 if (mddev->sysfs_level) 5871 sysfs_put(mddev->sysfs_level); 5872 del_gendisk(mddev->gendisk); 5873 } 5874 put_disk(mddev->gendisk); 5875 } 5876 5877 static const struct sysfs_ops md_sysfs_ops = { 5878 .show = md_attr_show, 5879 .store = md_attr_store, 5880 }; 5881 static const struct kobj_type md_ktype = { 5882 .release = md_kobj_release, 5883 .sysfs_ops = &md_sysfs_ops, 5884 .default_groups = md_attr_groups, 5885 }; 5886 5887 int mdp_major = 0; 5888 5889 /* stack the limit for all rdevs into lim */ 5890 int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim, 5891 unsigned int flags) 5892 { 5893 struct md_rdev *rdev; 5894 5895 rdev_for_each(rdev, mddev) { 5896 queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset, 5897 mddev->gendisk->disk_name); 5898 if ((flags & MDDEV_STACK_INTEGRITY) && 5899 !queue_limits_stack_integrity_bdev(lim, rdev->bdev)) 5900 return -EINVAL; 5901 } 5902 5903 return 0; 5904 } 5905 EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits); 5906 5907 /* apply the extra stacking limits from a new rdev into mddev */ 5908 int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev) 5909 { 5910 struct queue_limits lim; 5911 5912 if (mddev_is_dm(mddev)) 5913 return 0; 5914 5915 lim = queue_limits_start_update(mddev->gendisk->queue); 5916 queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset, 5917 mddev->gendisk->disk_name); 5918 5919 if (!queue_limits_stack_integrity_bdev(&lim, rdev->bdev)) { 5920 pr_err("%s: incompatible integrity profile for %pg\n", 5921 mdname(mddev), rdev->bdev); 5922 queue_limits_cancel_update(mddev->gendisk->queue); 5923 return -ENXIO; 5924 } 5925 5926 return queue_limits_commit_update(mddev->gendisk->queue, &lim); 5927 } 5928 EXPORT_SYMBOL_GPL(mddev_stack_new_rdev); 5929 5930 /* update the optimal I/O size after a reshape */ 5931 void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes) 5932 { 5933 struct queue_limits lim; 5934 5935 if (mddev_is_dm(mddev)) 5936 return; 5937 5938 /* don't bother updating io_opt if we can't suspend the array */ 5939 if (mddev_suspend(mddev, false) < 0) 5940 return; 5941 lim = queue_limits_start_update(mddev->gendisk->queue); 5942 lim.io_opt = lim.io_min * nr_stripes; 5943 queue_limits_commit_update(mddev->gendisk->queue, &lim); 5944 mddev_resume(mddev); 5945 } 5946 EXPORT_SYMBOL_GPL(mddev_update_io_opt); 5947 5948 static void mddev_delayed_delete(struct work_struct *ws) 5949 { 5950 struct mddev *mddev = container_of(ws, struct mddev, del_work); 5951 5952 kobject_put(&mddev->kobj); 5953 } 5954 5955 void md_init_stacking_limits(struct queue_limits *lim) 5956 { 5957 blk_set_stacking_limits(lim); 5958 lim->features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | 5959 BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; 5960 } 5961 EXPORT_SYMBOL_GPL(md_init_stacking_limits); 5962 5963 struct mddev *md_alloc(dev_t dev, char *name) 5964 { 5965 /* 5966 * If dev is zero, name is the name of a device to allocate with 
5967 * an arbitrary minor number. It will be "md_???" 5968 * If dev is non-zero it must be a device number with a MAJOR of 5969 * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then 5970 * the device is being created by opening a node in /dev. 5971 * If "name" is not NULL, the device is being created by 5972 * writing to /sys/module/md_mod/parameters/new_array. 5973 */ 5974 static DEFINE_MUTEX(disks_mutex); 5975 struct mddev *mddev; 5976 struct gendisk *disk; 5977 int partitioned; 5978 int shift; 5979 int unit; 5980 int error; 5981 5982 /* 5983 * Wait for any previous instance of this device to be completely 5984 * removed (mddev_delayed_delete). 5985 */ 5986 flush_workqueue(md_misc_wq); 5987 5988 mutex_lock(&disks_mutex); 5989 mddev = mddev_alloc(dev); 5990 if (IS_ERR(mddev)) { 5991 error = PTR_ERR(mddev); 5992 goto out_unlock; 5993 } 5994 5995 partitioned = (MAJOR(mddev->unit) != MD_MAJOR); 5996 shift = partitioned ? MdpMinorShift : 0; 5997 unit = MINOR(mddev->unit) >> shift; 5998 5999 if (name && !dev) { 6000 /* Need to ensure that 'name' is not a duplicate. 6001 */ 6002 struct mddev *mddev2; 6003 spin_lock(&all_mddevs_lock); 6004 6005 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 6006 if (mddev2->gendisk && 6007 strcmp(mddev2->gendisk->disk_name, name) == 0) { 6008 spin_unlock(&all_mddevs_lock); 6009 error = -EEXIST; 6010 goto out_free_mddev; 6011 } 6012 spin_unlock(&all_mddevs_lock); 6013 } 6014 if (name && dev) 6015 /* 6016 * Creating /dev/mdNNN via "newarray", so adjust hold_active. 6017 */ 6018 mddev->hold_active = UNTIL_STOP; 6019 6020 disk = blk_alloc_disk(NULL, NUMA_NO_NODE); 6021 if (IS_ERR(disk)) { 6022 error = PTR_ERR(disk); 6023 goto out_free_mddev; 6024 } 6025 6026 disk->major = MAJOR(mddev->unit); 6027 disk->first_minor = unit << shift; 6028 disk->minors = 1 << shift; 6029 if (name) 6030 strcpy(disk->disk_name, name); 6031 else if (partitioned) 6032 sprintf(disk->disk_name, "md_d%d", unit); 6033 else 6034 sprintf(disk->disk_name, "md%d", unit); 6035 disk->fops = &md_fops; 6036 disk->private_data = mddev; 6037 6038 disk->events |= DISK_EVENT_MEDIA_CHANGE; 6039 mddev->gendisk = disk; 6040 error = add_disk(disk); 6041 if (error) 6042 goto out_put_disk; 6043 6044 kobject_init(&mddev->kobj, &md_ktype); 6045 error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); 6046 if (error) { 6047 /* 6048 * The disk is already live at this point. Clear the hold flag 6049 * and let mddev_put take care of the deletion, as it isn't any 6050 * different from a normal close on last release now. 
6051 */ 6052 mddev->hold_active = 0; 6053 mutex_unlock(&disks_mutex); 6054 mddev_put(mddev); 6055 return ERR_PTR(error); 6056 } 6057 6058 kobject_uevent(&mddev->kobj, KOBJ_ADD); 6059 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); 6060 mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); 6061 mutex_unlock(&disks_mutex); 6062 return mddev; 6063 6064 out_put_disk: 6065 put_disk(disk); 6066 out_free_mddev: 6067 mddev_free(mddev); 6068 out_unlock: 6069 mutex_unlock(&disks_mutex); 6070 return ERR_PTR(error); 6071 } 6072 6073 static int md_alloc_and_put(dev_t dev, char *name) 6074 { 6075 struct mddev *mddev = md_alloc(dev, name); 6076 6077 if (legacy_async_del_gendisk) 6078 pr_warn("md: async del_gendisk mode will be removed in future, please upgrade to mdadm-4.5+\n"); 6079 6080 if (IS_ERR(mddev)) 6081 return PTR_ERR(mddev); 6082 mddev_put(mddev); 6083 return 0; 6084 } 6085 6086 static void md_probe(dev_t dev) 6087 { 6088 if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512) 6089 return; 6090 if (create_on_open) 6091 md_alloc_and_put(dev, NULL); 6092 } 6093 6094 static int add_named_array(const char *val, const struct kernel_param *kp) 6095 { 6096 /* 6097 * val must be "md_*" or "mdNNN". 6098 * For "md_*" we allocate an array with a large free minor number, and 6099 * set the name to val. val must not already be an active name. 6100 * For "mdNNN" we allocate an array with the minor number NNN 6101 * which must not already be in use. 6102 */ 6103 int len = strlen(val); 6104 char buf[DISK_NAME_LEN]; 6105 unsigned long devnum; 6106 6107 while (len && val[len-1] == '\n') 6108 len--; 6109 if (len >= DISK_NAME_LEN) 6110 return -E2BIG; 6111 strscpy(buf, val, len+1); 6112 if (strncmp(buf, "md_", 3) == 0) 6113 return md_alloc_and_put(0, buf); 6114 if (strncmp(buf, "md", 2) == 0 && 6115 isdigit(buf[2]) && 6116 kstrtoul(buf+2, 10, &devnum) == 0 && 6117 devnum <= MINORMASK) 6118 return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL); 6119 6120 return -EINVAL; 6121 } 6122 6123 static void md_safemode_timeout(struct timer_list *t) 6124 { 6125 struct mddev *mddev = timer_container_of(mddev, t, safemode_timer); 6126 6127 mddev->safemode = 1; 6128 if (mddev->external) 6129 sysfs_notify_dirent_safe(mddev->sysfs_state); 6130 6131 md_wakeup_thread(mddev->thread); 6132 } 6133 6134 static int start_dirty_degraded; 6135 6136 int md_run(struct mddev *mddev) 6137 { 6138 int err; 6139 struct md_rdev *rdev; 6140 struct md_personality *pers; 6141 bool nowait = true; 6142 6143 if (list_empty(&mddev->disks)) 6144 /* cannot run an array with no devices.. */ 6145 return -EINVAL; 6146 6147 if (mddev->pers) 6148 return -EBUSY; 6149 /* Cannot run until previous stop completes properly */ 6150 if (mddev->sysfs_active) 6151 return -EBUSY; 6152 6153 /* 6154 * Analyze all RAID superblock(s) 6155 */ 6156 if (!mddev->raid_disks) { 6157 if (!mddev->persistent) 6158 return -EINVAL; 6159 err = analyze_sbs(mddev); 6160 if (err) 6161 return -EINVAL; 6162 } 6163 6164 if (mddev->level != LEVEL_NONE) 6165 request_module("md-level-%d", mddev->level); 6166 else if (mddev->clevel[0]) 6167 request_module("md-%s", mddev->clevel); 6168 6169 /* 6170 * Drop all container device buffers, from now on 6171 * the only valid external interface is through the md 6172 * device. 
6173 */ 6174 mddev->has_superblocks = false; 6175 rdev_for_each(rdev, mddev) { 6176 if (test_bit(Faulty, &rdev->flags)) 6177 continue; 6178 sync_blockdev(rdev->bdev); 6179 invalidate_bdev(rdev->bdev); 6180 if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) { 6181 mddev->ro = MD_RDONLY; 6182 if (!mddev_is_dm(mddev)) 6183 set_disk_ro(mddev->gendisk, 1); 6184 } 6185 6186 if (rdev->sb_page) 6187 mddev->has_superblocks = true; 6188 6189 /* perform some consistency tests on the device. 6190 * We don't want the data to overlap the metadata, 6191 * Internal Bitmap issues have been handled elsewhere. 6192 */ 6193 if (rdev->meta_bdev) { 6194 /* Nothing to check */; 6195 } else if (rdev->data_offset < rdev->sb_start) { 6196 if (mddev->dev_sectors && 6197 rdev->data_offset + mddev->dev_sectors 6198 > rdev->sb_start) { 6199 pr_warn("md: %s: data overlaps metadata\n", 6200 mdname(mddev)); 6201 return -EINVAL; 6202 } 6203 } else { 6204 if (rdev->sb_start + rdev->sb_size/512 6205 > rdev->data_offset) { 6206 pr_warn("md: %s: metadata overlaps data\n", 6207 mdname(mddev)); 6208 return -EINVAL; 6209 } 6210 } 6211 sysfs_notify_dirent_safe(rdev->sysfs_state); 6212 nowait = nowait && bdev_nowait(rdev->bdev); 6213 } 6214 6215 if (!bioset_initialized(&mddev->bio_set)) { 6216 err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 6217 if (err) 6218 return err; 6219 } 6220 if (!bioset_initialized(&mddev->sync_set)) { 6221 err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 6222 if (err) 6223 goto exit_bio_set; 6224 } 6225 6226 if (!bioset_initialized(&mddev->io_clone_set)) { 6227 err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE, 6228 offsetof(struct md_io_clone, bio_clone), 0); 6229 if (err) 6230 goto exit_sync_set; 6231 } 6232 6233 pers = get_pers(mddev->level, mddev->clevel); 6234 if (!pers) { 6235 err = -EINVAL; 6236 goto abort; 6237 } 6238 if (mddev->level != pers->head.id) { 6239 mddev->level = pers->head.id; 6240 mddev->new_level = pers->head.id; 6241 } 6242 strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel)); 6243 6244 if (mddev->reshape_position != MaxSector && 6245 pers->start_reshape == NULL) { 6246 /* This personality cannot handle reshaping... */ 6247 put_pers(pers); 6248 err = -EINVAL; 6249 goto abort; 6250 } 6251 6252 if (pers->sync_request) { 6253 /* Warn if this is a potentially silly 6254 * configuration. 
6255 */ 6256 struct md_rdev *rdev2; 6257 int warned = 0; 6258 6259 rdev_for_each(rdev, mddev) 6260 rdev_for_each(rdev2, mddev) { 6261 if (rdev < rdev2 && 6262 rdev->bdev->bd_disk == 6263 rdev2->bdev->bd_disk) { 6264 pr_warn("%s: WARNING: %pg appears to be on the same physical disk as %pg.\n", 6265 mdname(mddev), 6266 rdev->bdev, 6267 rdev2->bdev); 6268 warned = 1; 6269 } 6270 } 6271 6272 if (warned) 6273 pr_warn("True protection against single-disk failure might be compromised.\n"); 6274 } 6275 6276 /* dm-raid expect sync_thread to be frozen until resume */ 6277 if (mddev->gendisk) 6278 mddev->recovery = 0; 6279 6280 /* may be over-ridden by personality */ 6281 mddev->resync_max_sectors = mddev->dev_sectors; 6282 6283 mddev->ok_start_degraded = start_dirty_degraded; 6284 6285 if (start_readonly && md_is_rdwr(mddev)) 6286 mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */ 6287 6288 err = pers->run(mddev); 6289 if (err) 6290 pr_warn("md: pers->run() failed ...\n"); 6291 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { 6292 WARN_ONCE(!mddev->external_size, 6293 "%s: default size too small, but 'external_size' not in effect?\n", 6294 __func__); 6295 pr_warn("md: invalid array_size %llu > default size %llu\n", 6296 (unsigned long long)mddev->array_sectors / 2, 6297 (unsigned long long)pers->size(mddev, 0, 0) / 2); 6298 err = -EINVAL; 6299 } 6300 if (err == 0 && pers->sync_request && 6301 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { 6302 err = mddev->bitmap_ops->create(mddev); 6303 if (err) 6304 pr_warn("%s: failed to create bitmap (%d)\n", 6305 mdname(mddev), err); 6306 } 6307 if (err) 6308 goto bitmap_abort; 6309 6310 if (mddev->bitmap_info.max_write_behind > 0) { 6311 bool create_pool = false; 6312 6313 rdev_for_each(rdev, mddev) { 6314 if (test_bit(WriteMostly, &rdev->flags) && 6315 rdev_init_serial(rdev)) 6316 create_pool = true; 6317 } 6318 if (create_pool && mddev->serial_info_pool == NULL) { 6319 mddev->serial_info_pool = 6320 mempool_create_kmalloc_pool(NR_SERIAL_INFOS, 6321 sizeof(struct serial_info)); 6322 if (!mddev->serial_info_pool) { 6323 err = -ENOMEM; 6324 goto bitmap_abort; 6325 } 6326 } 6327 } 6328 6329 if (pers->sync_request) { 6330 if (mddev->kobj.sd && 6331 sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 6332 pr_warn("md: cannot register extra attributes for %s\n", 6333 mdname(mddev)); 6334 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); 6335 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); 6336 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); 6337 } else if (mddev->ro == MD_AUTO_READ) 6338 mddev->ro = MD_RDWR; 6339 6340 atomic_set(&mddev->max_corr_read_errors, 6341 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); 6342 mddev->safemode = 0; 6343 if (mddev_is_clustered(mddev)) 6344 mddev->safemode_delay = 0; 6345 else 6346 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; 6347 mddev->in_sync = 1; 6348 smp_wmb(); 6349 spin_lock(&mddev->lock); 6350 mddev->pers = pers; 6351 spin_unlock(&mddev->lock); 6352 rdev_for_each(rdev, mddev) 6353 if (rdev->raid_disk >= 0) 6354 sysfs_link_rdev(mddev, rdev); /* failure here is OK */ 6355 6356 if (mddev->degraded && md_is_rdwr(mddev)) 6357 /* This ensures that recovering status is reported immediately 6358 * via sysfs - until a lack of spares is confirmed. 
6359 */ 6360 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6361 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6362 6363 if (mddev->sb_flags) 6364 md_update_sb(mddev, 0); 6365 6366 md_new_event(); 6367 return 0; 6368 6369 bitmap_abort: 6370 mddev_detach(mddev); 6371 if (mddev->private) 6372 pers->free(mddev, mddev->private); 6373 mddev->private = NULL; 6374 put_pers(pers); 6375 mddev->bitmap_ops->destroy(mddev); 6376 abort: 6377 bioset_exit(&mddev->io_clone_set); 6378 exit_sync_set: 6379 bioset_exit(&mddev->sync_set); 6380 exit_bio_set: 6381 bioset_exit(&mddev->bio_set); 6382 return err; 6383 } 6384 EXPORT_SYMBOL_GPL(md_run); 6385 6386 int do_md_run(struct mddev *mddev) 6387 { 6388 int err; 6389 6390 set_bit(MD_NOT_READY, &mddev->flags); 6391 err = md_run(mddev); 6392 if (err) 6393 goto out; 6394 6395 err = mddev->bitmap_ops->load(mddev); 6396 if (err) { 6397 mddev->bitmap_ops->destroy(mddev); 6398 goto out; 6399 } 6400 6401 if (mddev_is_clustered(mddev)) 6402 md_allow_write(mddev); 6403 6404 /* run start up tasks that require md_thread */ 6405 md_start(mddev); 6406 6407 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 6408 6409 set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); 6410 clear_bit(MD_NOT_READY, &mddev->flags); 6411 mddev->changed = 1; 6412 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 6413 sysfs_notify_dirent_safe(mddev->sysfs_state); 6414 sysfs_notify_dirent_safe(mddev->sysfs_action); 6415 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 6416 out: 6417 clear_bit(MD_NOT_READY, &mddev->flags); 6418 return err; 6419 } 6420 6421 int md_start(struct mddev *mddev) 6422 { 6423 int ret = 0; 6424 6425 if (mddev->pers->start) { 6426 set_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6427 ret = mddev->pers->start(mddev); 6428 clear_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6429 md_wakeup_thread(mddev->sync_thread); 6430 } 6431 return ret; 6432 } 6433 EXPORT_SYMBOL_GPL(md_start); 6434 6435 static int restart_array(struct mddev *mddev) 6436 { 6437 struct gendisk *disk = mddev->gendisk; 6438 struct md_rdev *rdev; 6439 bool has_journal = false; 6440 bool has_readonly = false; 6441 6442 /* Complain if it has no devices */ 6443 if (list_empty(&mddev->disks)) 6444 return -ENXIO; 6445 if (!mddev->pers) 6446 return -EINVAL; 6447 if (md_is_rdwr(mddev)) 6448 return -EBUSY; 6449 6450 rcu_read_lock(); 6451 rdev_for_each_rcu(rdev, mddev) { 6452 if (test_bit(Journal, &rdev->flags) && 6453 !test_bit(Faulty, &rdev->flags)) 6454 has_journal = true; 6455 if (rdev_read_only(rdev)) 6456 has_readonly = true; 6457 } 6458 rcu_read_unlock(); 6459 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal) 6460 /* Don't restart rw with journal missing/faulty */ 6461 return -EINVAL; 6462 if (has_readonly) 6463 return -EROFS; 6464 6465 mddev->safemode = 0; 6466 mddev->ro = MD_RDWR; 6467 set_disk_ro(disk, 0); 6468 pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); 6469 /* Kick recovery or resync if necessary */ 6470 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6471 md_wakeup_thread(mddev->sync_thread); 6472 sysfs_notify_dirent_safe(mddev->sysfs_state); 6473 return 0; 6474 } 6475 6476 static void md_clean(struct mddev *mddev) 6477 { 6478 mddev->array_sectors = 0; 6479 mddev->external_size = 0; 6480 mddev->dev_sectors = 0; 6481 mddev->raid_disks = 0; 6482 mddev->resync_offset = 0; 6483 mddev->resync_min = 0; 6484 mddev->resync_max = MaxSector; 6485 mddev->reshape_position = MaxSector; 6486 /* we still need mddev->external in export_rdev, do not clear 
it yet */ 6487 mddev->persistent = 0; 6488 mddev->level = LEVEL_NONE; 6489 mddev->clevel[0] = 0; 6490 6491 /* 6492 * For legacy_async_del_gendisk mode, it can stop the array in the 6493 * middle of assembling it, then it still can access the array. So 6494 * it needs to clear MD_CLOSING. If not legacy_async_del_gendisk, 6495 * it can't open the array again after stopping it. So it doesn't 6496 * clear MD_CLOSING. 6497 */ 6498 if (legacy_async_del_gendisk && mddev->hold_active) { 6499 clear_bit(MD_CLOSING, &mddev->flags); 6500 } else { 6501 /* if UNTIL_STOP is set, it's cleared here */ 6502 mddev->hold_active = 0; 6503 /* Don't clear MD_CLOSING, or mddev can be opened again. */ 6504 mddev->flags &= BIT_ULL_MASK(MD_CLOSING); 6505 } 6506 mddev->sb_flags = 0; 6507 mddev->ro = MD_RDWR; 6508 mddev->metadata_type[0] = 0; 6509 mddev->chunk_sectors = 0; 6510 mddev->ctime = mddev->utime = 0; 6511 mddev->layout = 0; 6512 mddev->max_disks = 0; 6513 mddev->events = 0; 6514 mddev->can_decrease_events = 0; 6515 mddev->delta_disks = 0; 6516 mddev->reshape_backwards = 0; 6517 mddev->new_level = LEVEL_NONE; 6518 mddev->new_layout = 0; 6519 mddev->new_chunk_sectors = 0; 6520 mddev->curr_resync = MD_RESYNC_NONE; 6521 atomic64_set(&mddev->resync_mismatches, 0); 6522 mddev->suspend_lo = mddev->suspend_hi = 0; 6523 mddev->sync_speed_min = mddev->sync_speed_max = 0; 6524 mddev->recovery = 0; 6525 mddev->in_sync = 0; 6526 mddev->changed = 0; 6527 mddev->degraded = 0; 6528 mddev->safemode = 0; 6529 mddev->private = NULL; 6530 mddev->cluster_info = NULL; 6531 mddev->bitmap_info.offset = 0; 6532 mddev->bitmap_info.default_offset = 0; 6533 mddev->bitmap_info.default_space = 0; 6534 mddev->bitmap_info.chunksize = 0; 6535 mddev->bitmap_info.daemon_sleep = 0; 6536 mddev->bitmap_info.max_write_behind = 0; 6537 mddev->bitmap_info.nodes = 0; 6538 } 6539 6540 static void __md_stop_writes(struct mddev *mddev) 6541 { 6542 timer_delete_sync(&mddev->safemode_timer); 6543 6544 if (mddev->pers && mddev->pers->quiesce) { 6545 mddev->pers->quiesce(mddev, 1); 6546 mddev->pers->quiesce(mddev, 0); 6547 } 6548 6549 mddev->bitmap_ops->flush(mddev); 6550 6551 if (md_is_rdwr(mddev) && 6552 ((!mddev->in_sync && !mddev_is_clustered(mddev)) || 6553 mddev->sb_flags)) { 6554 /* mark array as shutdown cleanly */ 6555 if (!mddev_is_clustered(mddev)) 6556 mddev->in_sync = 1; 6557 md_update_sb(mddev, 1); 6558 } 6559 /* disable policy to guarantee rdevs free resources for serialization */ 6560 mddev->serialize_policy = 0; 6561 mddev_destroy_serial_pool(mddev, NULL); 6562 } 6563 6564 void md_stop_writes(struct mddev *mddev) 6565 { 6566 mddev_lock_nointr(mddev); 6567 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6568 stop_sync_thread(mddev, true); 6569 __md_stop_writes(mddev); 6570 mddev_unlock(mddev); 6571 } 6572 EXPORT_SYMBOL_GPL(md_stop_writes); 6573 6574 static void mddev_detach(struct mddev *mddev) 6575 { 6576 mddev->bitmap_ops->wait_behind_writes(mddev); 6577 if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { 6578 mddev->pers->quiesce(mddev, 1); 6579 mddev->pers->quiesce(mddev, 0); 6580 } 6581 md_unregister_thread(mddev, &mddev->thread); 6582 6583 /* the unplug fn references 'conf' */ 6584 if (!mddev_is_dm(mddev)) 6585 blk_sync_queue(mddev->gendisk->queue); 6586 } 6587 6588 static void __md_stop(struct mddev *mddev) 6589 { 6590 struct md_personality *pers = mddev->pers; 6591 6592 mddev->bitmap_ops->destroy(mddev); 6593 mddev_detach(mddev); 6594 spin_lock(&mddev->lock); 6595 mddev->pers = NULL; 6596 spin_unlock(&mddev->lock); 
6597 if (mddev->private) 6598 pers->free(mddev, mddev->private); 6599 mddev->private = NULL; 6600 put_pers(pers); 6601 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6602 6603 bioset_exit(&mddev->bio_set); 6604 bioset_exit(&mddev->sync_set); 6605 bioset_exit(&mddev->io_clone_set); 6606 } 6607 6608 void md_stop(struct mddev *mddev) 6609 { 6610 lockdep_assert_held(&mddev->reconfig_mutex); 6611 6612 /* stop the array and free an attached data structures. 6613 * This is called from dm-raid 6614 */ 6615 __md_stop_writes(mddev); 6616 __md_stop(mddev); 6617 } 6618 6619 EXPORT_SYMBOL_GPL(md_stop); 6620 6621 /* ensure 'mddev->pers' exist before calling md_set_readonly() */ 6622 static int md_set_readonly(struct mddev *mddev) 6623 { 6624 int err = 0; 6625 int did_freeze = 0; 6626 6627 if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 6628 return -EBUSY; 6629 6630 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 6631 did_freeze = 1; 6632 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6633 } 6634 6635 stop_sync_thread(mddev, false); 6636 wait_event(mddev->sb_wait, 6637 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 6638 mddev_lock_nointr(mddev); 6639 6640 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 6641 pr_warn("md: %s still in use.\n",mdname(mddev)); 6642 err = -EBUSY; 6643 goto out; 6644 } 6645 6646 __md_stop_writes(mddev); 6647 6648 if (mddev->ro == MD_RDONLY) { 6649 err = -ENXIO; 6650 goto out; 6651 } 6652 6653 mddev->ro = MD_RDONLY; 6654 set_disk_ro(mddev->gendisk, 1); 6655 6656 out: 6657 if (!err || did_freeze) { 6658 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6659 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6660 sysfs_notify_dirent_safe(mddev->sysfs_state); 6661 } 6662 6663 return err; 6664 } 6665 6666 /* mode: 6667 * 0 - completely stop and dis-assemble array 6668 * 2 - stop but do not disassemble array 6669 */ 6670 static int do_md_stop(struct mddev *mddev, int mode) 6671 { 6672 struct gendisk *disk = mddev->gendisk; 6673 struct md_rdev *rdev; 6674 int did_freeze = 0; 6675 6676 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 6677 did_freeze = 1; 6678 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6679 } 6680 6681 stop_sync_thread(mddev, true); 6682 6683 if (mddev->sysfs_active || 6684 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 6685 pr_warn("md: %s still in use.\n",mdname(mddev)); 6686 if (did_freeze) { 6687 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6688 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6689 } 6690 return -EBUSY; 6691 } 6692 if (mddev->pers) { 6693 if (!md_is_rdwr(mddev)) 6694 set_disk_ro(disk, 0); 6695 6696 __md_stop_writes(mddev); 6697 __md_stop(mddev); 6698 6699 /* tell userspace to handle 'inactive' */ 6700 sysfs_notify_dirent_safe(mddev->sysfs_state); 6701 6702 rdev_for_each(rdev, mddev) 6703 if (rdev->raid_disk >= 0) 6704 sysfs_unlink_rdev(mddev, rdev); 6705 6706 set_capacity_and_notify(disk, 0); 6707 mddev->changed = 1; 6708 6709 if (!md_is_rdwr(mddev)) 6710 mddev->ro = MD_RDWR; 6711 } 6712 /* 6713 * Free resources if final stop 6714 */ 6715 if (mode == 0) { 6716 pr_info("md: %s stopped.\n", mdname(mddev)); 6717 6718 if (mddev->bitmap_info.file) { 6719 struct file *f = mddev->bitmap_info.file; 6720 spin_lock(&mddev->lock); 6721 mddev->bitmap_info.file = NULL; 6722 spin_unlock(&mddev->lock); 6723 fput(f); 6724 } 6725 mddev->bitmap_info.offset = 0; 6726 6727 export_array(mddev); 6728 md_clean(mddev); 6729 if (!legacy_async_del_gendisk) 6730 set_bit(MD_DELETED, &mddev->flags); 6731 } 6732 
md_new_event(); 6733 sysfs_notify_dirent_safe(mddev->sysfs_state); 6734 return 0; 6735 } 6736 6737 #ifndef MODULE 6738 static void autorun_array(struct mddev *mddev) 6739 { 6740 struct md_rdev *rdev; 6741 int err; 6742 6743 if (list_empty(&mddev->disks)) 6744 return; 6745 6746 pr_info("md: running: "); 6747 6748 rdev_for_each(rdev, mddev) { 6749 pr_cont("<%pg>", rdev->bdev); 6750 } 6751 pr_cont("\n"); 6752 6753 err = do_md_run(mddev); 6754 if (err) { 6755 pr_warn("md: do_md_run() returned %d\n", err); 6756 do_md_stop(mddev, 0); 6757 } 6758 } 6759 6760 /* 6761 * lets try to run arrays based on all disks that have arrived 6762 * until now. (those are in pending_raid_disks) 6763 * 6764 * the method: pick the first pending disk, collect all disks with 6765 * the same UUID, remove all from the pending list and put them into 6766 * the 'same_array' list. Then order this list based on superblock 6767 * update time (freshest comes first), kick out 'old' disks and 6768 * compare superblocks. If everything's fine then run it. 6769 * 6770 * If "unit" is allocated, then bump its reference count 6771 */ 6772 static void autorun_devices(int part) 6773 { 6774 struct md_rdev *rdev0, *rdev, *tmp; 6775 struct mddev *mddev; 6776 6777 pr_info("md: autorun ...\n"); 6778 while (!list_empty(&pending_raid_disks)) { 6779 int unit; 6780 dev_t dev; 6781 LIST_HEAD(candidates); 6782 rdev0 = list_entry(pending_raid_disks.next, 6783 struct md_rdev, same_set); 6784 6785 pr_debug("md: considering %pg ...\n", rdev0->bdev); 6786 INIT_LIST_HEAD(&candidates); 6787 rdev_for_each_list(rdev, tmp, &pending_raid_disks) 6788 if (super_90_load(rdev, rdev0, 0) >= 0) { 6789 pr_debug("md: adding %pg ...\n", 6790 rdev->bdev); 6791 list_move(&rdev->same_set, &candidates); 6792 } 6793 /* 6794 * now we have a set of devices, with all of them having 6795 * mostly sane superblocks. It's time to allocate the 6796 * mddev. 6797 */ 6798 if (part) { 6799 dev = MKDEV(mdp_major, 6800 rdev0->preferred_minor << MdpMinorShift); 6801 unit = MINOR(dev) >> MdpMinorShift; 6802 } else { 6803 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 6804 unit = MINOR(dev); 6805 } 6806 if (rdev0->preferred_minor != unit) { 6807 pr_warn("md: unit number in %pg is bad: %d\n", 6808 rdev0->bdev, rdev0->preferred_minor); 6809 break; 6810 } 6811 6812 mddev = md_alloc(dev, NULL); 6813 if (IS_ERR(mddev)) 6814 break; 6815 6816 if (mddev_suspend_and_lock(mddev)) 6817 pr_warn("md: %s locked, cannot run\n", mdname(mddev)); 6818 else if (mddev->raid_disks || mddev->major_version 6819 || !list_empty(&mddev->disks)) { 6820 pr_warn("md: %s already running, cannot run %pg\n", 6821 mdname(mddev), rdev0->bdev); 6822 mddev_unlock_and_resume(mddev); 6823 } else { 6824 pr_debug("md: created %s\n", mdname(mddev)); 6825 mddev->persistent = 1; 6826 rdev_for_each_list(rdev, tmp, &candidates) { 6827 list_del_init(&rdev->same_set); 6828 if (bind_rdev_to_array(rdev, mddev)) 6829 export_rdev(rdev, mddev); 6830 } 6831 autorun_array(mddev); 6832 mddev_unlock_and_resume(mddev); 6833 } 6834 /* on success, candidates will be empty, on error 6835 * it won't... 6836 */ 6837 rdev_for_each_list(rdev, tmp, &candidates) { 6838 list_del_init(&rdev->same_set); 6839 export_rdev(rdev, mddev); 6840 } 6841 mddev_put(mddev); 6842 } 6843 pr_info("md: ... 
autorun DONE.\n"); 6844 } 6845 #endif /* !MODULE */ 6846 6847 static int get_version(void __user *arg) 6848 { 6849 mdu_version_t ver; 6850 6851 ver.major = MD_MAJOR_VERSION; 6852 ver.minor = MD_MINOR_VERSION; 6853 ver.patchlevel = MD_PATCHLEVEL_VERSION; 6854 6855 if (copy_to_user(arg, &ver, sizeof(ver))) 6856 return -EFAULT; 6857 6858 return 0; 6859 } 6860 6861 static int get_array_info(struct mddev *mddev, void __user *arg) 6862 { 6863 mdu_array_info_t info; 6864 int nr,working,insync,failed,spare; 6865 struct md_rdev *rdev; 6866 6867 nr = working = insync = failed = spare = 0; 6868 rcu_read_lock(); 6869 rdev_for_each_rcu(rdev, mddev) { 6870 nr++; 6871 if (test_bit(Faulty, &rdev->flags)) 6872 failed++; 6873 else { 6874 working++; 6875 if (test_bit(In_sync, &rdev->flags)) 6876 insync++; 6877 else if (test_bit(Journal, &rdev->flags)) 6878 /* TODO: add journal count to md_u.h */ 6879 ; 6880 else 6881 spare++; 6882 } 6883 } 6884 rcu_read_unlock(); 6885 6886 info.major_version = mddev->major_version; 6887 info.minor_version = mddev->minor_version; 6888 info.patch_version = MD_PATCHLEVEL_VERSION; 6889 info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); 6890 info.level = mddev->level; 6891 info.size = mddev->dev_sectors / 2; 6892 if (info.size != mddev->dev_sectors / 2) /* overflow */ 6893 info.size = -1; 6894 info.nr_disks = nr; 6895 info.raid_disks = mddev->raid_disks; 6896 info.md_minor = mddev->md_minor; 6897 info.not_persistent= !mddev->persistent; 6898 6899 info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 6900 info.state = 0; 6901 if (mddev->in_sync) 6902 info.state = (1<<MD_SB_CLEAN); 6903 if (mddev->bitmap && mddev->bitmap_info.offset) 6904 info.state |= (1<<MD_SB_BITMAP_PRESENT); 6905 if (mddev_is_clustered(mddev)) 6906 info.state |= (1<<MD_SB_CLUSTERED); 6907 info.active_disks = insync; 6908 info.working_disks = working; 6909 info.failed_disks = failed; 6910 info.spare_disks = spare; 6911 6912 info.layout = mddev->layout; 6913 info.chunk_size = mddev->chunk_sectors << 9; 6914 6915 if (copy_to_user(arg, &info, sizeof(info))) 6916 return -EFAULT; 6917 6918 return 0; 6919 } 6920 6921 static int get_bitmap_file(struct mddev *mddev, void __user * arg) 6922 { 6923 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 6924 char *ptr; 6925 int err; 6926 6927 file = kzalloc(sizeof(*file), GFP_NOIO); 6928 if (!file) 6929 return -ENOMEM; 6930 6931 err = 0; 6932 spin_lock(&mddev->lock); 6933 /* bitmap enabled */ 6934 if (mddev->bitmap_info.file) { 6935 ptr = file_path(mddev->bitmap_info.file, file->pathname, 6936 sizeof(file->pathname)); 6937 if (IS_ERR(ptr)) 6938 err = PTR_ERR(ptr); 6939 else 6940 memmove(file->pathname, ptr, 6941 sizeof(file->pathname)-(ptr-file->pathname)); 6942 } 6943 spin_unlock(&mddev->lock); 6944 6945 if (err == 0 && 6946 copy_to_user(arg, file, sizeof(*file))) 6947 err = -EFAULT; 6948 6949 kfree(file); 6950 return err; 6951 } 6952 6953 static int get_disk_info(struct mddev *mddev, void __user * arg) 6954 { 6955 mdu_disk_info_t info; 6956 struct md_rdev *rdev; 6957 6958 if (copy_from_user(&info, arg, sizeof(info))) 6959 return -EFAULT; 6960 6961 rcu_read_lock(); 6962 rdev = md_find_rdev_nr_rcu(mddev, info.number); 6963 if (rdev) { 6964 info.major = MAJOR(rdev->bdev->bd_dev); 6965 info.minor = MINOR(rdev->bdev->bd_dev); 6966 info.raid_disk = rdev->raid_disk; 6967 info.state = 0; 6968 if (test_bit(Faulty, &rdev->flags)) 6969 info.state |= (1<<MD_DISK_FAULTY); 6970 else if (test_bit(In_sync, &rdev->flags)) { 6971 info.state |= (1<<MD_DISK_ACTIVE); 
6972 info.state |= (1<<MD_DISK_SYNC); 6973 } 6974 if (test_bit(Journal, &rdev->flags)) 6975 info.state |= (1<<MD_DISK_JOURNAL); 6976 if (test_bit(WriteMostly, &rdev->flags)) 6977 info.state |= (1<<MD_DISK_WRITEMOSTLY); 6978 if (test_bit(FailFast, &rdev->flags)) 6979 info.state |= (1<<MD_DISK_FAILFAST); 6980 } else { 6981 info.major = info.minor = 0; 6982 info.raid_disk = -1; 6983 info.state = (1<<MD_DISK_REMOVED); 6984 } 6985 rcu_read_unlock(); 6986 6987 if (copy_to_user(arg, &info, sizeof(info))) 6988 return -EFAULT; 6989 6990 return 0; 6991 } 6992 6993 int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) 6994 { 6995 struct md_rdev *rdev; 6996 dev_t dev = MKDEV(info->major,info->minor); 6997 6998 if (mddev_is_clustered(mddev) && 6999 !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { 7000 pr_warn("%s: Cannot add to clustered mddev.\n", 7001 mdname(mddev)); 7002 return -EINVAL; 7003 } 7004 7005 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 7006 return -EOVERFLOW; 7007 7008 if (!mddev->raid_disks) { 7009 int err; 7010 /* expecting a device which has a superblock */ 7011 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 7012 if (IS_ERR(rdev)) { 7013 pr_warn("md: md_import_device returned %ld\n", 7014 PTR_ERR(rdev)); 7015 return PTR_ERR(rdev); 7016 } 7017 if (!list_empty(&mddev->disks)) { 7018 struct md_rdev *rdev0 7019 = list_entry(mddev->disks.next, 7020 struct md_rdev, same_set); 7021 err = super_types[mddev->major_version] 7022 .load_super(rdev, rdev0, mddev->minor_version); 7023 if (err < 0) { 7024 pr_warn("md: %pg has different UUID to %pg\n", 7025 rdev->bdev, 7026 rdev0->bdev); 7027 export_rdev(rdev, mddev); 7028 return -EINVAL; 7029 } 7030 } 7031 err = bind_rdev_to_array(rdev, mddev); 7032 if (err) 7033 export_rdev(rdev, mddev); 7034 return err; 7035 } 7036 7037 /* 7038 * md_add_new_disk can be used once the array is assembled 7039 * to add "hot spares". They must already have a superblock 7040 * written 7041 */ 7042 if (mddev->pers) { 7043 int err; 7044 if (!mddev->pers->hot_add_disk) { 7045 pr_warn("%s: personality does not support diskops!\n", 7046 mdname(mddev)); 7047 return -EINVAL; 7048 } 7049 if (mddev->persistent) 7050 rdev = md_import_device(dev, mddev->major_version, 7051 mddev->minor_version); 7052 else 7053 rdev = md_import_device(dev, -1, -1); 7054 if (IS_ERR(rdev)) { 7055 pr_warn("md: md_import_device returned %ld\n", 7056 PTR_ERR(rdev)); 7057 return PTR_ERR(rdev); 7058 } 7059 /* set saved_raid_disk if appropriate */ 7060 if (!mddev->persistent) { 7061 if (info->state & (1<<MD_DISK_SYNC) && 7062 info->raid_disk < mddev->raid_disks) { 7063 rdev->raid_disk = info->raid_disk; 7064 clear_bit(Bitmap_sync, &rdev->flags); 7065 } else 7066 rdev->raid_disk = -1; 7067 rdev->saved_raid_disk = rdev->raid_disk; 7068 } else 7069 super_types[mddev->major_version]. 7070 validate_super(mddev, NULL/*freshest*/, rdev); 7071 if ((info->state & (1<<MD_DISK_SYNC)) && 7072 rdev->raid_disk != info->raid_disk) { 7073 /* This was a hot-add request, but events doesn't 7074 * match, so reject it. 
7075 */ 7076 export_rdev(rdev, mddev); 7077 return -EINVAL; 7078 } 7079 7080 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 7081 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 7082 set_bit(WriteMostly, &rdev->flags); 7083 else 7084 clear_bit(WriteMostly, &rdev->flags); 7085 if (info->state & (1<<MD_DISK_FAILFAST)) 7086 set_bit(FailFast, &rdev->flags); 7087 else 7088 clear_bit(FailFast, &rdev->flags); 7089 7090 if (info->state & (1<<MD_DISK_JOURNAL)) { 7091 struct md_rdev *rdev2; 7092 bool has_journal = false; 7093 7094 /* make sure no existing journal disk */ 7095 rdev_for_each(rdev2, mddev) { 7096 if (test_bit(Journal, &rdev2->flags)) { 7097 has_journal = true; 7098 break; 7099 } 7100 } 7101 if (has_journal || mddev->bitmap) { 7102 export_rdev(rdev, mddev); 7103 return -EBUSY; 7104 } 7105 set_bit(Journal, &rdev->flags); 7106 } 7107 /* 7108 * check whether the device shows up in other nodes 7109 */ 7110 if (mddev_is_clustered(mddev)) { 7111 if (info->state & (1 << MD_DISK_CANDIDATE)) 7112 set_bit(Candidate, &rdev->flags); 7113 else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { 7114 /* --add initiated by this node */ 7115 err = mddev->cluster_ops->add_new_disk(mddev, rdev); 7116 if (err) { 7117 export_rdev(rdev, mddev); 7118 return err; 7119 } 7120 } 7121 } 7122 7123 rdev->raid_disk = -1; 7124 err = bind_rdev_to_array(rdev, mddev); 7125 7126 if (err) 7127 export_rdev(rdev, mddev); 7128 7129 if (mddev_is_clustered(mddev)) { 7130 if (info->state & (1 << MD_DISK_CANDIDATE)) { 7131 if (!err) { 7132 err = mddev->cluster_ops->new_disk_ack( 7133 mddev, err == 0); 7134 if (err) 7135 md_kick_rdev_from_array(rdev); 7136 } 7137 } else { 7138 if (err) 7139 mddev->cluster_ops->add_new_disk_cancel(mddev); 7140 else 7141 err = add_bound_rdev(rdev); 7142 } 7143 7144 } else if (!err) 7145 err = add_bound_rdev(rdev); 7146 7147 return err; 7148 } 7149 7150 /* otherwise, md_add_new_disk is only allowed 7151 * for major_version==0 superblocks 7152 */ 7153 if (mddev->major_version != 0) { 7154 pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev)); 7155 return -EINVAL; 7156 } 7157 7158 if (!(info->state & (1<<MD_DISK_FAULTY))) { 7159 int err; 7160 rdev = md_import_device(dev, -1, 0); 7161 if (IS_ERR(rdev)) { 7162 pr_warn("md: error, md_import_device() returned %ld\n", 7163 PTR_ERR(rdev)); 7164 return PTR_ERR(rdev); 7165 } 7166 rdev->desc_nr = info->number; 7167 if (info->raid_disk < mddev->raid_disks) 7168 rdev->raid_disk = info->raid_disk; 7169 else 7170 rdev->raid_disk = -1; 7171 7172 if (rdev->raid_disk < mddev->raid_disks) 7173 if (info->state & (1<<MD_DISK_SYNC)) 7174 set_bit(In_sync, &rdev->flags); 7175 7176 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 7177 set_bit(WriteMostly, &rdev->flags); 7178 if (info->state & (1<<MD_DISK_FAILFAST)) 7179 set_bit(FailFast, &rdev->flags); 7180 7181 if (!mddev->persistent) { 7182 pr_debug("md: nonpersistent superblock ...\n"); 7183 rdev->sb_start = bdev_nr_sectors(rdev->bdev); 7184 } else 7185 rdev->sb_start = calc_dev_sboffset(rdev); 7186 rdev->sectors = rdev->sb_start; 7187 7188 err = bind_rdev_to_array(rdev, mddev); 7189 if (err) { 7190 export_rdev(rdev, mddev); 7191 return err; 7192 } 7193 } 7194 7195 return 0; 7196 } 7197 7198 static int hot_remove_disk(struct mddev *mddev, dev_t dev) 7199 { 7200 struct md_rdev *rdev; 7201 7202 if (!mddev->pers) 7203 return -ENODEV; 7204 7205 rdev = find_rdev(mddev, dev); 7206 if (!rdev) 7207 return -ENXIO; 7208 7209 if (rdev->raid_disk < 0) 7210 goto kick_rdev; 7211 7212 clear_bit(Blocked, &rdev->flags); 7213 
remove_and_add_spares(mddev, rdev); 7214 7215 if (rdev->raid_disk >= 0) 7216 goto busy; 7217 7218 kick_rdev: 7219 if (mddev_is_clustered(mddev) && 7220 mddev->cluster_ops->remove_disk(mddev, rdev)) 7221 goto busy; 7222 7223 md_kick_rdev_from_array(rdev); 7224 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7225 if (!mddev->thread) 7226 md_update_sb(mddev, 1); 7227 md_new_event(); 7228 7229 return 0; 7230 busy: 7231 pr_debug("md: cannot remove active disk %pg from %s ...\n", 7232 rdev->bdev, mdname(mddev)); 7233 return -EBUSY; 7234 } 7235 7236 static int hot_add_disk(struct mddev *mddev, dev_t dev) 7237 { 7238 int err; 7239 struct md_rdev *rdev; 7240 7241 if (!mddev->pers) 7242 return -ENODEV; 7243 7244 if (mddev->major_version != 0) { 7245 pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n", 7246 mdname(mddev)); 7247 return -EINVAL; 7248 } 7249 if (!mddev->pers->hot_add_disk) { 7250 pr_warn("%s: personality does not support diskops!\n", 7251 mdname(mddev)); 7252 return -EINVAL; 7253 } 7254 7255 rdev = md_import_device(dev, -1, 0); 7256 if (IS_ERR(rdev)) { 7257 pr_warn("md: error, md_import_device() returned %ld\n", 7258 PTR_ERR(rdev)); 7259 return -EINVAL; 7260 } 7261 7262 if (mddev->persistent) 7263 rdev->sb_start = calc_dev_sboffset(rdev); 7264 else 7265 rdev->sb_start = bdev_nr_sectors(rdev->bdev); 7266 7267 rdev->sectors = rdev->sb_start; 7268 7269 if (test_bit(Faulty, &rdev->flags)) { 7270 pr_warn("md: can not hot-add faulty %pg disk to %s!\n", 7271 rdev->bdev, mdname(mddev)); 7272 err = -EINVAL; 7273 goto abort_export; 7274 } 7275 7276 clear_bit(In_sync, &rdev->flags); 7277 rdev->desc_nr = -1; 7278 rdev->saved_raid_disk = -1; 7279 err = bind_rdev_to_array(rdev, mddev); 7280 if (err) 7281 goto abort_export; 7282 7283 /* 7284 * The rest should better be atomic, we can have disk failures 7285 * noticed in interrupt contexts ... 7286 */ 7287 7288 rdev->raid_disk = -1; 7289 7290 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7291 if (!mddev->thread) 7292 md_update_sb(mddev, 1); 7293 /* 7294 * Kick recovery, maybe this spare has to be added to the 7295 * array immediately. 7296 */ 7297 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7298 md_new_event(); 7299 return 0; 7300 7301 abort_export: 7302 export_rdev(rdev, mddev); 7303 return err; 7304 } 7305 7306 static int set_bitmap_file(struct mddev *mddev, int fd) 7307 { 7308 int err = 0; 7309 7310 if (mddev->pers) { 7311 if (!mddev->pers->quiesce || !mddev->thread) 7312 return -EBUSY; 7313 if (mddev->recovery || mddev->sync_thread) 7314 return -EBUSY; 7315 /* we should be able to change the bitmap.. 
 */
7316 	}
7317 
7318 	if (fd >= 0) {
7319 		struct inode *inode;
7320 		struct file *f;
7321 
7322 		if (mddev->bitmap || mddev->bitmap_info.file)
7323 			return -EEXIST; /* cannot add when bitmap is present */
7324 
7325 		if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) {
7326 			pr_warn("%s: bitmap files not supported by this kernel\n",
7327 				mdname(mddev));
7328 			return -EINVAL;
7329 		}
7330 		pr_warn("%s: using deprecated bitmap file support\n",
7331 			mdname(mddev));
7332 
7333 		f = fget(fd);
7334 
7335 		if (f == NULL) {
7336 			pr_warn("%s: error: failed to get bitmap file\n",
7337 				mdname(mddev));
7338 			return -EBADF;
7339 		}
7340 
7341 		inode = f->f_mapping->host;
7342 		if (!S_ISREG(inode->i_mode)) {
7343 			pr_warn("%s: error: bitmap file must be a regular file\n",
7344 				mdname(mddev));
7345 			err = -EBADF;
7346 		} else if (!(f->f_mode & FMODE_WRITE)) {
7347 			pr_warn("%s: error: bitmap file must be opened for write\n",
7348 				mdname(mddev));
7349 			err = -EBADF;
7350 		} else if (atomic_read(&inode->i_writecount) != 1) {
7351 			pr_warn("%s: error: bitmap file is already in use\n",
7352 				mdname(mddev));
7353 			err = -EBUSY;
7354 		}
7355 		if (err) {
7356 			fput(f);
7357 			return err;
7358 		}
7359 		mddev->bitmap_info.file = f;
7360 		mddev->bitmap_info.offset = 0; /* file overrides offset */
7361 	} else if (mddev->bitmap == NULL)
7362 		return -ENOENT; /* cannot remove what isn't there */
7363 	err = 0;
7364 	if (mddev->pers) {
7365 		if (fd >= 0) {
7366 			err = mddev->bitmap_ops->create(mddev);
7367 			if (!err)
7368 				err = mddev->bitmap_ops->load(mddev);
7369 
7370 			if (err) {
7371 				mddev->bitmap_ops->destroy(mddev);
7372 				fd = -1;
7373 			}
7374 		} else if (fd < 0) {
7375 			mddev->bitmap_ops->destroy(mddev);
7376 		}
7377 	}
7378 
7379 	if (fd < 0) {
7380 		struct file *f = mddev->bitmap_info.file;
7381 		if (f) {
7382 			spin_lock(&mddev->lock);
7383 			mddev->bitmap_info.file = NULL;
7384 			spin_unlock(&mddev->lock);
7385 			fput(f);
7386 		}
7387 	}
7388 
7389 	return err;
7390 }
7391 
7392 /*
7393  * md_set_array_info is used in two different ways.
7394  * The original usage is when creating a new array.
7395  * In this usage, raid_disks is > 0 and it together with
7396  *  level, size, not_persistent, layout, chunksize determine the
7397  *  shape of the array.
7398  *  This will always create an array with a type-0.90.0 superblock.
7399  * The newer usage is when assembling an array.
7400  * In this case raid_disks will be 0, and the major_version field is
7401  * used to determine which style super-blocks are to be found on the devices.
7402  * The minor and patch _version numbers are also kept in case the
7403  * super_block handler wishes to interpret them.
7404  */
7405 int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info)
7406 {
7407 	if (info->raid_disks == 0) {
7408 		/* just setting version number for superblock loading */
7409 		if (info->major_version < 0 ||
7410 		    info->major_version >= ARRAY_SIZE(super_types) ||
7411 		    super_types[info->major_version].name == NULL) {
7412 			/* maybe try to auto-load a module? */
7413 			pr_warn("md: superblock version %d not known\n",
7414 				info->major_version);
7415 			return -EINVAL;
7416 		}
7417 		mddev->major_version = info->major_version;
7418 		mddev->minor_version = info->minor_version;
7419 		mddev->patch_version = info->patch_version;
7420 		mddev->persistent = !info->not_persistent;
7421 		/* ensure mddev_put doesn't delete this now that there
7422 		 * is some minimal configuration.
7423 		 */
7424 		mddev->ctime = ktime_get_real_seconds();
7425 		return 0;
7426 	}
7427 	mddev->major_version = MD_MAJOR_VERSION;
7428 	mddev->minor_version = MD_MINOR_VERSION;
7429 	mddev->patch_version = MD_PATCHLEVEL_VERSION;
7430 	mddev->ctime = ktime_get_real_seconds();
7431 
7432 	mddev->level = info->level;
7433 	mddev->clevel[0] = 0;
7434 	mddev->dev_sectors = 2 * (sector_t)info->size;
7435 	mddev->raid_disks = info->raid_disks;
7436 	/* don't set md_minor, it is determined by which /dev/md* was
7437 	 * opened
7438 	 */
7439 	if (info->state & (1<<MD_SB_CLEAN))
7440 		mddev->resync_offset = MaxSector;
7441 	else
7442 		mddev->resync_offset = 0;
7443 	mddev->persistent = !info->not_persistent;
7444 	mddev->external = 0;
7445 
7446 	mddev->layout = info->layout;
7447 	if (mddev->level == 0)
7448 		/* Cannot trust RAID0 layout info here */
7449 		mddev->layout = -1;
7450 	mddev->chunk_sectors = info->chunk_size >> 9;
7451 
7452 	if (mddev->persistent) {
7453 		mddev->max_disks = MD_SB_DISKS;
7454 		mddev->flags = 0;
7455 		mddev->sb_flags = 0;
7456 	}
7457 	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7458 
7459 	mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
7460 	mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
7461 	mddev->bitmap_info.offset = 0;
7462 
7463 	mddev->reshape_position = MaxSector;
7464 
7465 	/*
7466 	 * Generate a 128 bit UUID
7467 	 */
7468 	get_random_bytes(mddev->uuid, 16);
7469 
7470 	mddev->new_level = mddev->level;
7471 	mddev->new_chunk_sectors = mddev->chunk_sectors;
7472 	mddev->new_layout = mddev->layout;
7473 	mddev->delta_disks = 0;
7474 	mddev->reshape_backwards = 0;
7475 
7476 	return 0;
7477 }
7478 
7479 void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
7480 {
7481 	lockdep_assert_held(&mddev->reconfig_mutex);
7482 
7483 	if (mddev->external_size)
7484 		return;
7485 
7486 	mddev->array_sectors = array_sectors;
7487 }
7488 EXPORT_SYMBOL(md_set_array_sectors);
7489 
7490 static int update_size(struct mddev *mddev, sector_t num_sectors)
7491 {
7492 	struct md_rdev *rdev;
7493 	int rv;
7494 	int fit = (num_sectors == 0);
7495 	sector_t old_dev_sectors = mddev->dev_sectors;
7496 
7497 	if (mddev->pers->resize == NULL)
7498 		return -EINVAL;
7499 	/* The "num_sectors" is the number of sectors of each device that
7500 	 * is used. This can only make sense for arrays with redundancy.
7501 	 * linear and raid0 always use whatever space is available. We can only
7502 	 * consider changing this number if no resync or reconstruction is
7503 	 * happening, and if the new size is acceptable. It must fit before the
7504 	 * sb_start or, if that is <data_offset, it must fit before the size
7505 	 * of each device. If num_sectors is zero, we find the largest size
7506 	 * that fits.
7507 */ 7508 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 7509 return -EBUSY; 7510 if (!md_is_rdwr(mddev)) 7511 return -EROFS; 7512 7513 rdev_for_each(rdev, mddev) { 7514 sector_t avail = rdev->sectors; 7515 7516 if (fit && (num_sectors == 0 || num_sectors > avail)) 7517 num_sectors = avail; 7518 if (avail < num_sectors) 7519 return -ENOSPC; 7520 } 7521 rv = mddev->pers->resize(mddev, num_sectors); 7522 if (!rv) { 7523 if (mddev_is_clustered(mddev)) 7524 mddev->cluster_ops->update_size(mddev, old_dev_sectors); 7525 else if (!mddev_is_dm(mddev)) 7526 set_capacity_and_notify(mddev->gendisk, 7527 mddev->array_sectors); 7528 } 7529 return rv; 7530 } 7531 7532 static int update_raid_disks(struct mddev *mddev, int raid_disks) 7533 { 7534 int rv; 7535 struct md_rdev *rdev; 7536 /* change the number of raid disks */ 7537 if (mddev->pers->check_reshape == NULL) 7538 return -EINVAL; 7539 if (!md_is_rdwr(mddev)) 7540 return -EROFS; 7541 if (raid_disks <= 0 || 7542 (mddev->max_disks && raid_disks >= mddev->max_disks)) 7543 return -EINVAL; 7544 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 7545 test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) || 7546 mddev->reshape_position != MaxSector) 7547 return -EBUSY; 7548 7549 rdev_for_each(rdev, mddev) { 7550 if (mddev->raid_disks < raid_disks && 7551 rdev->data_offset < rdev->new_data_offset) 7552 return -EINVAL; 7553 if (mddev->raid_disks > raid_disks && 7554 rdev->data_offset > rdev->new_data_offset) 7555 return -EINVAL; 7556 } 7557 7558 mddev->delta_disks = raid_disks - mddev->raid_disks; 7559 if (mddev->delta_disks < 0) 7560 mddev->reshape_backwards = 1; 7561 else if (mddev->delta_disks > 0) 7562 mddev->reshape_backwards = 0; 7563 7564 rv = mddev->pers->check_reshape(mddev); 7565 if (rv < 0) { 7566 mddev->delta_disks = 0; 7567 mddev->reshape_backwards = 0; 7568 } 7569 return rv; 7570 } 7571 7572 static int get_cluster_ops(struct mddev *mddev) 7573 { 7574 xa_lock(&md_submodule); 7575 mddev->cluster_ops = xa_load(&md_submodule, ID_CLUSTER); 7576 if (mddev->cluster_ops && 7577 !try_module_get(mddev->cluster_ops->head.owner)) 7578 mddev->cluster_ops = NULL; 7579 xa_unlock(&md_submodule); 7580 7581 return mddev->cluster_ops == NULL ? -ENOENT : 0; 7582 } 7583 7584 static void put_cluster_ops(struct mddev *mddev) 7585 { 7586 if (!mddev->cluster_ops) 7587 return; 7588 7589 mddev->cluster_ops->leave(mddev); 7590 module_put(mddev->cluster_ops->head.owner); 7591 mddev->cluster_ops = NULL; 7592 } 7593 7594 /* 7595 * update_array_info is used to change the configuration of an 7596 * on-line array. 7597 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 7598 * fields in the info are checked against the array. 7599 * Any differences that cannot be handled will cause an error. 7600 * Normally, only one change can be managed at a time. 
7601 */ 7602 static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) 7603 { 7604 int rv = 0; 7605 int cnt = 0; 7606 int state = 0; 7607 7608 /* calculate expected state,ignoring low bits */ 7609 if (mddev->bitmap && mddev->bitmap_info.offset) 7610 state |= (1 << MD_SB_BITMAP_PRESENT); 7611 7612 if (mddev->major_version != info->major_version || 7613 mddev->minor_version != info->minor_version || 7614 /* mddev->patch_version != info->patch_version || */ 7615 mddev->ctime != info->ctime || 7616 mddev->level != info->level || 7617 /* mddev->layout != info->layout || */ 7618 mddev->persistent != !info->not_persistent || 7619 mddev->chunk_sectors != info->chunk_size >> 9 || 7620 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 7621 ((state^info->state) & 0xfffffe00) 7622 ) 7623 return -EINVAL; 7624 /* Check there is only one change */ 7625 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 7626 cnt++; 7627 if (mddev->raid_disks != info->raid_disks) 7628 cnt++; 7629 if (mddev->layout != info->layout) 7630 cnt++; 7631 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) 7632 cnt++; 7633 if (cnt == 0) 7634 return 0; 7635 if (cnt > 1) 7636 return -EINVAL; 7637 7638 if (mddev->layout != info->layout) { 7639 /* Change layout 7640 * we don't need to do anything at the md level, the 7641 * personality will take care of it all. 7642 */ 7643 if (mddev->pers->check_reshape == NULL) 7644 return -EINVAL; 7645 else { 7646 mddev->new_layout = info->layout; 7647 rv = mddev->pers->check_reshape(mddev); 7648 if (rv) 7649 mddev->new_layout = mddev->layout; 7650 return rv; 7651 } 7652 } 7653 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 7654 rv = update_size(mddev, (sector_t)info->size * 2); 7655 7656 if (mddev->raid_disks != info->raid_disks) 7657 rv = update_raid_disks(mddev, info->raid_disks); 7658 7659 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 7660 if (mddev->pers->quiesce == NULL || mddev->thread == NULL) { 7661 rv = -EINVAL; 7662 goto err; 7663 } 7664 if (mddev->recovery || mddev->sync_thread) { 7665 rv = -EBUSY; 7666 goto err; 7667 } 7668 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 7669 /* add the bitmap */ 7670 if (mddev->bitmap) { 7671 rv = -EEXIST; 7672 goto err; 7673 } 7674 if (mddev->bitmap_info.default_offset == 0) { 7675 rv = -EINVAL; 7676 goto err; 7677 } 7678 mddev->bitmap_info.offset = 7679 mddev->bitmap_info.default_offset; 7680 mddev->bitmap_info.space = 7681 mddev->bitmap_info.default_space; 7682 rv = mddev->bitmap_ops->create(mddev); 7683 if (!rv) 7684 rv = mddev->bitmap_ops->load(mddev); 7685 7686 if (rv) 7687 mddev->bitmap_ops->destroy(mddev); 7688 } else { 7689 struct md_bitmap_stats stats; 7690 7691 rv = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 7692 if (rv) 7693 goto err; 7694 7695 if (stats.file) { 7696 rv = -EINVAL; 7697 goto err; 7698 } 7699 7700 if (mddev->bitmap_info.nodes) { 7701 /* hold PW on all the bitmap lock */ 7702 if (mddev->cluster_ops->lock_all_bitmaps(mddev) <= 0) { 7703 pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n"); 7704 rv = -EPERM; 7705 mddev->cluster_ops->unlock_all_bitmaps(mddev); 7706 goto err; 7707 } 7708 7709 mddev->bitmap_info.nodes = 0; 7710 put_cluster_ops(mddev); 7711 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; 7712 } 7713 mddev->bitmap_ops->destroy(mddev); 7714 mddev->bitmap_info.offset = 0; 7715 } 7716 } 7717 md_update_sb(mddev, 1); 7718 return rv; 7719 err: 7720 return rv; 7721 } 7722 7723 static int 
set_disk_faulty(struct mddev *mddev, dev_t dev) 7724 { 7725 struct md_rdev *rdev; 7726 int err = 0; 7727 7728 if (mddev->pers == NULL) 7729 return -ENODEV; 7730 7731 rcu_read_lock(); 7732 rdev = md_find_rdev_rcu(mddev, dev); 7733 if (!rdev) 7734 err = -ENODEV; 7735 else { 7736 md_error(mddev, rdev); 7737 if (test_bit(MD_BROKEN, &mddev->flags)) 7738 err = -EBUSY; 7739 } 7740 rcu_read_unlock(); 7741 return err; 7742 } 7743 7744 /* 7745 * We have a problem here : there is no easy way to give a CHS 7746 * virtual geometry. We currently pretend that we have a 2 heads 7747 * 4 sectors (with a BIG number of cylinders...). This drives 7748 * dosfs just mad... ;-) 7749 */ 7750 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 7751 { 7752 struct mddev *mddev = bdev->bd_disk->private_data; 7753 7754 geo->heads = 2; 7755 geo->sectors = 4; 7756 geo->cylinders = mddev->array_sectors / 8; 7757 return 0; 7758 } 7759 7760 static inline int md_ioctl_valid(unsigned int cmd) 7761 { 7762 switch (cmd) { 7763 case GET_ARRAY_INFO: 7764 case GET_DISK_INFO: 7765 case RAID_VERSION: 7766 return 0; 7767 case ADD_NEW_DISK: 7768 case GET_BITMAP_FILE: 7769 case HOT_ADD_DISK: 7770 case HOT_REMOVE_DISK: 7771 case RESTART_ARRAY_RW: 7772 case RUN_ARRAY: 7773 case SET_ARRAY_INFO: 7774 case SET_BITMAP_FILE: 7775 case SET_DISK_FAULTY: 7776 case STOP_ARRAY: 7777 case STOP_ARRAY_RO: 7778 case CLUSTERED_DISK_NACK: 7779 if (!capable(CAP_SYS_ADMIN)) 7780 return -EACCES; 7781 return 0; 7782 default: 7783 return -ENOTTY; 7784 } 7785 } 7786 7787 static bool md_ioctl_need_suspend(unsigned int cmd) 7788 { 7789 switch (cmd) { 7790 case ADD_NEW_DISK: 7791 case HOT_ADD_DISK: 7792 case HOT_REMOVE_DISK: 7793 case SET_BITMAP_FILE: 7794 case SET_ARRAY_INFO: 7795 return true; 7796 default: 7797 return false; 7798 } 7799 } 7800 7801 static int __md_set_array_info(struct mddev *mddev, void __user *argp) 7802 { 7803 mdu_array_info_t info; 7804 int err; 7805 7806 if (!argp) 7807 memset(&info, 0, sizeof(info)); 7808 else if (copy_from_user(&info, argp, sizeof(info))) 7809 return -EFAULT; 7810 7811 if (mddev->pers) { 7812 err = update_array_info(mddev, &info); 7813 if (err) 7814 pr_warn("md: couldn't update array info. %d\n", err); 7815 return err; 7816 } 7817 7818 if (!list_empty(&mddev->disks)) { 7819 pr_warn("md: array %s already has disks!\n", mdname(mddev)); 7820 return -EBUSY; 7821 } 7822 7823 if (mddev->raid_disks) { 7824 pr_warn("md: array %s already initialised!\n", mdname(mddev)); 7825 return -EBUSY; 7826 } 7827 7828 err = md_set_array_info(mddev, &info); 7829 if (err) 7830 pr_warn("md: couldn't set array info. 
%d\n", err); 7831 7832 return err; 7833 } 7834 7835 static int md_ioctl(struct block_device *bdev, blk_mode_t mode, 7836 unsigned int cmd, unsigned long arg) 7837 { 7838 int err = 0; 7839 void __user *argp = (void __user *)arg; 7840 struct mddev *mddev = NULL; 7841 7842 err = md_ioctl_valid(cmd); 7843 if (err) 7844 return err; 7845 7846 /* 7847 * Commands dealing with the RAID driver but not any 7848 * particular array: 7849 */ 7850 if (cmd == RAID_VERSION) 7851 return get_version(argp); 7852 7853 /* 7854 * Commands creating/starting a new array: 7855 */ 7856 7857 mddev = bdev->bd_disk->private_data; 7858 7859 /* Some actions do not requires the mutex */ 7860 switch (cmd) { 7861 case GET_ARRAY_INFO: 7862 if (!mddev->raid_disks && !mddev->external) 7863 return -ENODEV; 7864 return get_array_info(mddev, argp); 7865 7866 case GET_DISK_INFO: 7867 if (!mddev->raid_disks && !mddev->external) 7868 return -ENODEV; 7869 return get_disk_info(mddev, argp); 7870 7871 case SET_DISK_FAULTY: 7872 return set_disk_faulty(mddev, new_decode_dev(arg)); 7873 7874 case GET_BITMAP_FILE: 7875 return get_bitmap_file(mddev, argp); 7876 } 7877 7878 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { 7879 /* Need to flush page cache, and ensure no-one else opens 7880 * and writes 7881 */ 7882 err = mddev_set_closing_and_sync_blockdev(mddev, 1); 7883 if (err) 7884 return err; 7885 } 7886 7887 if (!md_is_rdwr(mddev)) 7888 flush_work(&mddev->sync_work); 7889 7890 err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) : 7891 mddev_lock(mddev); 7892 if (err) { 7893 pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n", 7894 err, cmd); 7895 goto out; 7896 } 7897 7898 if (cmd == SET_ARRAY_INFO) { 7899 err = __md_set_array_info(mddev, argp); 7900 goto unlock; 7901 } 7902 7903 /* 7904 * Commands querying/configuring an existing array: 7905 */ 7906 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 7907 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 7908 if ((!mddev->raid_disks && !mddev->external) 7909 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 7910 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 7911 && cmd != GET_BITMAP_FILE) { 7912 err = -ENODEV; 7913 goto unlock; 7914 } 7915 7916 /* 7917 * Commands even a read-only array can execute: 7918 */ 7919 switch (cmd) { 7920 case RESTART_ARRAY_RW: 7921 err = restart_array(mddev); 7922 goto unlock; 7923 7924 case STOP_ARRAY: 7925 err = do_md_stop(mddev, 0); 7926 goto unlock; 7927 7928 case STOP_ARRAY_RO: 7929 if (mddev->pers) 7930 err = md_set_readonly(mddev); 7931 goto unlock; 7932 7933 case HOT_REMOVE_DISK: 7934 err = hot_remove_disk(mddev, new_decode_dev(arg)); 7935 goto unlock; 7936 7937 case ADD_NEW_DISK: 7938 /* We can support ADD_NEW_DISK on read-only arrays 7939 * only if we are re-adding a preexisting device. 7940 * So require mddev->pers and MD_DISK_SYNC. 7941 */ 7942 if (mddev->pers) { 7943 mdu_disk_info_t info; 7944 if (copy_from_user(&info, argp, sizeof(info))) 7945 err = -EFAULT; 7946 else if (!(info.state & (1<<MD_DISK_SYNC))) 7947 /* Need to clear read-only for this */ 7948 break; 7949 else 7950 err = md_add_new_disk(mddev, &info); 7951 goto unlock; 7952 } 7953 break; 7954 } 7955 7956 /* 7957 * The remaining ioctls are changing the state of the 7958 * superblock, so we do not allow them on read-only arrays. 
7959 */ 7960 if (!md_is_rdwr(mddev) && mddev->pers) { 7961 if (mddev->ro != MD_AUTO_READ) { 7962 err = -EROFS; 7963 goto unlock; 7964 } 7965 mddev->ro = MD_RDWR; 7966 sysfs_notify_dirent_safe(mddev->sysfs_state); 7967 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7968 /* mddev_unlock will wake thread */ 7969 /* If a device failed while we were read-only, we 7970 * need to make sure the metadata is updated now. 7971 */ 7972 if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) { 7973 mddev_unlock(mddev); 7974 wait_event(mddev->sb_wait, 7975 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) && 7976 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 7977 mddev_lock_nointr(mddev); 7978 } 7979 } 7980 7981 switch (cmd) { 7982 case ADD_NEW_DISK: 7983 { 7984 mdu_disk_info_t info; 7985 if (copy_from_user(&info, argp, sizeof(info))) 7986 err = -EFAULT; 7987 else 7988 err = md_add_new_disk(mddev, &info); 7989 goto unlock; 7990 } 7991 7992 case CLUSTERED_DISK_NACK: 7993 if (mddev_is_clustered(mddev)) 7994 mddev->cluster_ops->new_disk_ack(mddev, false); 7995 else 7996 err = -EINVAL; 7997 goto unlock; 7998 7999 case HOT_ADD_DISK: 8000 err = hot_add_disk(mddev, new_decode_dev(arg)); 8001 goto unlock; 8002 8003 case RUN_ARRAY: 8004 err = do_md_run(mddev); 8005 goto unlock; 8006 8007 case SET_BITMAP_FILE: 8008 err = set_bitmap_file(mddev, (int)arg); 8009 goto unlock; 8010 8011 default: 8012 err = -EINVAL; 8013 goto unlock; 8014 } 8015 8016 unlock: 8017 if (mddev->hold_active == UNTIL_IOCTL && 8018 err != -EINVAL) 8019 mddev->hold_active = 0; 8020 8021 md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) : 8022 mddev_unlock(mddev); 8023 8024 out: 8025 if (cmd == STOP_ARRAY_RO || (err && cmd == STOP_ARRAY)) 8026 clear_bit(MD_CLOSING, &mddev->flags); 8027 return err; 8028 } 8029 #ifdef CONFIG_COMPAT 8030 static int md_compat_ioctl(struct block_device *bdev, blk_mode_t mode, 8031 unsigned int cmd, unsigned long arg) 8032 { 8033 switch (cmd) { 8034 case HOT_REMOVE_DISK: 8035 case HOT_ADD_DISK: 8036 case SET_DISK_FAULTY: 8037 case SET_BITMAP_FILE: 8038 /* These take in integer arg, do not convert */ 8039 break; 8040 default: 8041 arg = (unsigned long)compat_ptr(arg); 8042 break; 8043 } 8044 8045 return md_ioctl(bdev, mode, cmd, arg); 8046 } 8047 #endif /* CONFIG_COMPAT */ 8048 8049 static int md_set_read_only(struct block_device *bdev, bool ro) 8050 { 8051 struct mddev *mddev = bdev->bd_disk->private_data; 8052 int err; 8053 8054 err = mddev_lock(mddev); 8055 if (err) 8056 return err; 8057 8058 if (!mddev->raid_disks && !mddev->external) { 8059 err = -ENODEV; 8060 goto out_unlock; 8061 } 8062 8063 /* 8064 * Transitioning to read-auto need only happen for arrays that call 8065 * md_write_start and which are not ready for writes yet. 
8066 */ 8067 if (!ro && mddev->ro == MD_RDONLY && mddev->pers) { 8068 err = restart_array(mddev); 8069 if (err) 8070 goto out_unlock; 8071 mddev->ro = MD_AUTO_READ; 8072 } 8073 8074 out_unlock: 8075 mddev_unlock(mddev); 8076 return err; 8077 } 8078 8079 static int md_open(struct gendisk *disk, blk_mode_t mode) 8080 { 8081 struct mddev *mddev; 8082 int err; 8083 8084 spin_lock(&all_mddevs_lock); 8085 mddev = mddev_get(disk->private_data); 8086 spin_unlock(&all_mddevs_lock); 8087 if (!mddev) 8088 return -ENODEV; 8089 8090 err = mutex_lock_interruptible(&mddev->open_mutex); 8091 if (err) 8092 goto out; 8093 8094 err = -ENODEV; 8095 if (test_bit(MD_CLOSING, &mddev->flags)) 8096 goto out_unlock; 8097 8098 atomic_inc(&mddev->openers); 8099 mutex_unlock(&mddev->open_mutex); 8100 8101 disk_check_media_change(disk); 8102 return 0; 8103 8104 out_unlock: 8105 mutex_unlock(&mddev->open_mutex); 8106 out: 8107 mddev_put(mddev); 8108 return err; 8109 } 8110 8111 static void md_release(struct gendisk *disk) 8112 { 8113 struct mddev *mddev = disk->private_data; 8114 8115 BUG_ON(!mddev); 8116 atomic_dec(&mddev->openers); 8117 mddev_put(mddev); 8118 } 8119 8120 static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing) 8121 { 8122 struct mddev *mddev = disk->private_data; 8123 unsigned int ret = 0; 8124 8125 if (mddev->changed) 8126 ret = DISK_EVENT_MEDIA_CHANGE; 8127 mddev->changed = 0; 8128 return ret; 8129 } 8130 8131 static void md_free_disk(struct gendisk *disk) 8132 { 8133 struct mddev *mddev = disk->private_data; 8134 8135 mddev_free(mddev); 8136 } 8137 8138 const struct block_device_operations md_fops = 8139 { 8140 .owner = THIS_MODULE, 8141 .submit_bio = md_submit_bio, 8142 .open = md_open, 8143 .release = md_release, 8144 .ioctl = md_ioctl, 8145 #ifdef CONFIG_COMPAT 8146 .compat_ioctl = md_compat_ioctl, 8147 #endif 8148 .getgeo = md_getgeo, 8149 .check_events = md_check_events, 8150 .set_read_only = md_set_read_only, 8151 .free_disk = md_free_disk, 8152 }; 8153 8154 static int md_thread(void *arg) 8155 { 8156 struct md_thread *thread = arg; 8157 8158 /* 8159 * md_thread is a 'system-thread', it's priority should be very 8160 * high. We avoid resource deadlocks individually in each 8161 * raid personality. (RAID5 does preallocation) We also use RR and 8162 * the very same RT priority as kswapd, thus we will never get 8163 * into a priority inversion deadlock. 8164 * 8165 * we definitely have to have equal or higher priority than 8166 * bdflush, otherwise bdflush will deadlock if there are too 8167 * many dirty RAID5 blocks. 8168 */ 8169 8170 allow_signal(SIGKILL); 8171 while (!kthread_should_stop()) { 8172 8173 /* We need to wait INTERRUPTIBLE so that 8174 * we don't add to the load-average. 
8175 * That means we need to be sure no signals are 8176 * pending 8177 */ 8178 if (signal_pending(current)) 8179 flush_signals(current); 8180 8181 wait_event_interruptible_timeout 8182 (thread->wqueue, 8183 test_bit(THREAD_WAKEUP, &thread->flags) 8184 || kthread_should_stop() || kthread_should_park(), 8185 thread->timeout); 8186 8187 clear_bit(THREAD_WAKEUP, &thread->flags); 8188 if (kthread_should_park()) 8189 kthread_parkme(); 8190 if (!kthread_should_stop()) 8191 thread->run(thread); 8192 } 8193 8194 return 0; 8195 } 8196 8197 static void md_wakeup_thread_directly(struct md_thread __rcu *thread) 8198 { 8199 struct md_thread *t; 8200 8201 rcu_read_lock(); 8202 t = rcu_dereference(thread); 8203 if (t) 8204 wake_up_process(t->tsk); 8205 rcu_read_unlock(); 8206 } 8207 8208 void md_wakeup_thread(struct md_thread __rcu *thread) 8209 { 8210 struct md_thread *t; 8211 8212 rcu_read_lock(); 8213 t = rcu_dereference(thread); 8214 if (t) { 8215 pr_debug("md: waking up MD thread %s.\n", t->tsk->comm); 8216 set_bit(THREAD_WAKEUP, &t->flags); 8217 if (wq_has_sleeper(&t->wqueue)) 8218 wake_up(&t->wqueue); 8219 } 8220 rcu_read_unlock(); 8221 } 8222 EXPORT_SYMBOL(md_wakeup_thread); 8223 8224 struct md_thread *md_register_thread(void (*run) (struct md_thread *), 8225 struct mddev *mddev, const char *name) 8226 { 8227 struct md_thread *thread; 8228 8229 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); 8230 if (!thread) 8231 return NULL; 8232 8233 init_waitqueue_head(&thread->wqueue); 8234 8235 thread->run = run; 8236 thread->mddev = mddev; 8237 thread->timeout = MAX_SCHEDULE_TIMEOUT; 8238 thread->tsk = kthread_run(md_thread, thread, 8239 "%s_%s", 8240 mdname(thread->mddev), 8241 name); 8242 if (IS_ERR(thread->tsk)) { 8243 kfree(thread); 8244 return NULL; 8245 } 8246 return thread; 8247 } 8248 EXPORT_SYMBOL(md_register_thread); 8249 8250 void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp) 8251 { 8252 struct md_thread *thread = rcu_dereference_protected(*threadp, 8253 lockdep_is_held(&mddev->reconfig_mutex)); 8254 8255 if (!thread) 8256 return; 8257 8258 rcu_assign_pointer(*threadp, NULL); 8259 synchronize_rcu(); 8260 8261 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 8262 kthread_stop(thread->tsk); 8263 kfree(thread); 8264 } 8265 EXPORT_SYMBOL(md_unregister_thread); 8266 8267 void md_error(struct mddev *mddev, struct md_rdev *rdev) 8268 { 8269 if (!rdev || test_bit(Faulty, &rdev->flags)) 8270 return; 8271 8272 if (!mddev->pers || !mddev->pers->error_handler) 8273 return; 8274 mddev->pers->error_handler(mddev, rdev); 8275 8276 if (mddev->pers->head.id == ID_RAID0 || 8277 mddev->pers->head.id == ID_LINEAR) 8278 return; 8279 8280 if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) 8281 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8282 sysfs_notify_dirent_safe(rdev->sysfs_state); 8283 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8284 if (!test_bit(MD_BROKEN, &mddev->flags)) { 8285 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8286 md_wakeup_thread(mddev->thread); 8287 } 8288 if (mddev->event_work.func) 8289 queue_work(md_misc_wq, &mddev->event_work); 8290 md_new_event(); 8291 } 8292 EXPORT_SYMBOL(md_error); 8293 8294 /* seq_file implementation /proc/mdstat */ 8295 8296 static void status_unused(struct seq_file *seq) 8297 { 8298 int i = 0; 8299 struct md_rdev *rdev; 8300 8301 seq_printf(seq, "unused devices: "); 8302 8303 list_for_each_entry(rdev, &pending_raid_disks, same_set) { 8304 i++; 8305 seq_printf(seq, "%pg ", rdev->bdev); 8306 } 
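	/* No unused devices were listed above, so print a placeholder. */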
8307 if (!i) 8308 seq_printf(seq, "<none>"); 8309 8310 seq_printf(seq, "\n"); 8311 } 8312 8313 static void status_personalities(struct seq_file *seq) 8314 { 8315 struct md_submodule_head *head; 8316 unsigned long i; 8317 8318 seq_puts(seq, "Personalities : "); 8319 8320 xa_lock(&md_submodule); 8321 xa_for_each(&md_submodule, i, head) 8322 if (head->type == MD_PERSONALITY) 8323 seq_printf(seq, "[%s] ", head->name); 8324 xa_unlock(&md_submodule); 8325 8326 seq_puts(seq, "\n"); 8327 } 8328 8329 static int status_resync(struct seq_file *seq, struct mddev *mddev) 8330 { 8331 sector_t max_sectors, resync, res; 8332 unsigned long dt, db = 0; 8333 sector_t rt, curr_mark_cnt, resync_mark_cnt; 8334 int scale, recovery_active; 8335 unsigned int per_milli; 8336 8337 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 8338 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 8339 max_sectors = mddev->resync_max_sectors; 8340 else 8341 max_sectors = mddev->dev_sectors; 8342 8343 resync = mddev->curr_resync; 8344 if (resync < MD_RESYNC_ACTIVE) { 8345 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 8346 /* Still cleaning up */ 8347 resync = max_sectors; 8348 } else if (resync > max_sectors) { 8349 resync = max_sectors; 8350 } else { 8351 res = atomic_read(&mddev->recovery_active); 8352 /* 8353 * Resync has started, but the subtraction has overflowed or 8354 * yielded one of the special values. Force it to active to 8355 * ensure the status reports an active resync. 8356 */ 8357 if (resync < res || resync - res < MD_RESYNC_ACTIVE) 8358 resync = MD_RESYNC_ACTIVE; 8359 else 8360 resync -= res; 8361 } 8362 8363 if (resync == MD_RESYNC_NONE) { 8364 if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { 8365 struct md_rdev *rdev; 8366 8367 rdev_for_each(rdev, mddev) 8368 if (rdev->raid_disk >= 0 && 8369 !test_bit(Faulty, &rdev->flags) && 8370 rdev->recovery_offset != MaxSector && 8371 rdev->recovery_offset) { 8372 seq_printf(seq, "\trecover=REMOTE"); 8373 return 1; 8374 } 8375 if (mddev->reshape_position != MaxSector) 8376 seq_printf(seq, "\treshape=REMOTE"); 8377 else 8378 seq_printf(seq, "\tresync=REMOTE"); 8379 return 1; 8380 } 8381 if (mddev->resync_offset < MaxSector) { 8382 seq_printf(seq, "\tresync=PENDING"); 8383 return 1; 8384 } 8385 return 0; 8386 } 8387 if (resync < MD_RESYNC_ACTIVE) { 8388 seq_printf(seq, "\tresync=DELAYED"); 8389 return 1; 8390 } 8391 8392 WARN_ON(max_sectors == 0); 8393 /* Pick 'scale' such that (resync>>scale)*1000 will fit 8394 * in a sector_t, and (max_sectors>>scale) will fit in a 8395 * u32, as those are the requirements for sector_div. 8396 * Thus 'scale' must be at least 10 8397 */ 8398 scale = 10; 8399 if (sizeof(sector_t) > sizeof(unsigned long)) { 8400 while ( max_sectors/2 > (1ULL<<(scale+32))) 8401 scale++; 8402 } 8403 res = (resync>>scale)*1000; 8404 sector_div(res, (u32)((max_sectors>>scale)+1)); 8405 8406 per_milli = res; 8407 { 8408 int i, x = per_milli/50, y = 20-x; 8409 seq_printf(seq, "["); 8410 for (i = 0; i < x; i++) 8411 seq_printf(seq, "="); 8412 seq_printf(seq, ">"); 8413 for (i = 0; i < y; i++) 8414 seq_printf(seq, "."); 8415 seq_printf(seq, "] "); 8416 } 8417 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 8418 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 8419 "reshape" : 8420 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 8421 "check" : 8422 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 
8423 "resync" : "recovery"))), 8424 per_milli/10, per_milli % 10, 8425 (unsigned long long) resync/2, 8426 (unsigned long long) max_sectors/2); 8427 8428 /* 8429 * dt: time from mark until now 8430 * db: blocks written from mark until now 8431 * rt: remaining time 8432 * 8433 * rt is a sector_t, which is always 64bit now. We are keeping 8434 * the original algorithm, but it is not really necessary. 8435 * 8436 * Original algorithm: 8437 * So we divide before multiply in case it is 32bit and close 8438 * to the limit. 8439 * We scale the divisor (db) by 32 to avoid losing precision 8440 * near the end of resync when the number of remaining sectors 8441 * is close to 'db'. 8442 * We then divide rt by 32 after multiplying by db to compensate. 8443 * The '+1' avoids division by zero if db is very small. 8444 */ 8445 dt = ((jiffies - mddev->resync_mark) / HZ); 8446 if (!dt) dt++; 8447 8448 curr_mark_cnt = mddev->curr_mark_cnt; 8449 recovery_active = atomic_read(&mddev->recovery_active); 8450 resync_mark_cnt = mddev->resync_mark_cnt; 8451 8452 if (curr_mark_cnt >= (recovery_active + resync_mark_cnt)) 8453 db = curr_mark_cnt - (recovery_active + resync_mark_cnt); 8454 8455 rt = max_sectors - resync; /* number of remaining sectors */ 8456 rt = div64_u64(rt, db/32+1); 8457 rt *= dt; 8458 rt >>= 5; 8459 8460 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, 8461 ((unsigned long)rt % 60)/6); 8462 8463 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 8464 return 1; 8465 } 8466 8467 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 8468 __acquires(&all_mddevs_lock) 8469 { 8470 seq->poll_event = atomic_read(&md_event_count); 8471 spin_lock(&all_mddevs_lock); 8472 8473 return seq_list_start_head(&all_mddevs, *pos); 8474 } 8475 8476 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 8477 { 8478 return seq_list_next(v, &all_mddevs, pos); 8479 } 8480 8481 static void md_seq_stop(struct seq_file *seq, void *v) 8482 __releases(&all_mddevs_lock) 8483 { 8484 spin_unlock(&all_mddevs_lock); 8485 } 8486 8487 static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev) 8488 { 8489 struct md_bitmap_stats stats; 8490 unsigned long used_pages; 8491 unsigned long chunk_kb; 8492 int err; 8493 8494 err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 8495 if (err) 8496 return; 8497 8498 chunk_kb = mddev->bitmap_info.chunksize >> 10; 8499 used_pages = stats.pages - stats.missing_pages; 8500 8501 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], %lu%s chunk", 8502 used_pages, stats.pages, used_pages << (PAGE_SHIFT - 10), 8503 chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize, 8504 chunk_kb ? 
"KB" : "B"); 8505 8506 if (stats.file) { 8507 seq_puts(seq, ", file: "); 8508 seq_file_path(seq, stats.file, " \t\n"); 8509 } 8510 8511 seq_putc(seq, '\n'); 8512 } 8513 8514 static int md_seq_show(struct seq_file *seq, void *v) 8515 { 8516 struct mddev *mddev; 8517 sector_t sectors; 8518 struct md_rdev *rdev; 8519 8520 if (v == &all_mddevs) { 8521 status_personalities(seq); 8522 if (list_empty(&all_mddevs)) 8523 status_unused(seq); 8524 return 0; 8525 } 8526 8527 mddev = list_entry(v, struct mddev, all_mddevs); 8528 if (!mddev_get(mddev)) 8529 return 0; 8530 8531 spin_unlock(&all_mddevs_lock); 8532 8533 /* prevent bitmap to be freed after checking */ 8534 mutex_lock(&mddev->bitmap_info.mutex); 8535 8536 spin_lock(&mddev->lock); 8537 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 8538 seq_printf(seq, "%s : ", mdname(mddev)); 8539 if (mddev->pers) { 8540 if (test_bit(MD_BROKEN, &mddev->flags)) 8541 seq_printf(seq, "broken"); 8542 else 8543 seq_printf(seq, "active"); 8544 if (mddev->ro == MD_RDONLY) 8545 seq_printf(seq, " (read-only)"); 8546 if (mddev->ro == MD_AUTO_READ) 8547 seq_printf(seq, " (auto-read-only)"); 8548 seq_printf(seq, " %s", mddev->pers->head.name); 8549 } else { 8550 seq_printf(seq, "inactive"); 8551 } 8552 8553 sectors = 0; 8554 rcu_read_lock(); 8555 rdev_for_each_rcu(rdev, mddev) { 8556 seq_printf(seq, " %pg[%d]", rdev->bdev, rdev->desc_nr); 8557 8558 if (test_bit(WriteMostly, &rdev->flags)) 8559 seq_printf(seq, "(W)"); 8560 if (test_bit(Journal, &rdev->flags)) 8561 seq_printf(seq, "(J)"); 8562 if (test_bit(Faulty, &rdev->flags)) { 8563 seq_printf(seq, "(F)"); 8564 continue; 8565 } 8566 if (rdev->raid_disk < 0) 8567 seq_printf(seq, "(S)"); /* spare */ 8568 if (test_bit(Replacement, &rdev->flags)) 8569 seq_printf(seq, "(R)"); 8570 sectors += rdev->sectors; 8571 } 8572 rcu_read_unlock(); 8573 8574 if (!list_empty(&mddev->disks)) { 8575 if (mddev->pers) 8576 seq_printf(seq, "\n %llu blocks", 8577 (unsigned long long) 8578 mddev->array_sectors / 2); 8579 else 8580 seq_printf(seq, "\n %llu blocks", 8581 (unsigned long long)sectors / 2); 8582 } 8583 if (mddev->persistent) { 8584 if (mddev->major_version != 0 || 8585 mddev->minor_version != 90) { 8586 seq_printf(seq," super %d.%d", 8587 mddev->major_version, 8588 mddev->minor_version); 8589 } 8590 } else if (mddev->external) 8591 seq_printf(seq, " super external:%s", 8592 mddev->metadata_type); 8593 else 8594 seq_printf(seq, " super non-persistent"); 8595 8596 if (mddev->pers) { 8597 mddev->pers->status(seq, mddev); 8598 seq_printf(seq, "\n "); 8599 if (mddev->pers->sync_request) { 8600 if (status_resync(seq, mddev)) 8601 seq_printf(seq, "\n "); 8602 } 8603 } else 8604 seq_printf(seq, "\n "); 8605 8606 md_bitmap_status(seq, mddev); 8607 8608 seq_printf(seq, "\n"); 8609 } 8610 spin_unlock(&mddev->lock); 8611 mutex_unlock(&mddev->bitmap_info.mutex); 8612 spin_lock(&all_mddevs_lock); 8613 8614 if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs)) 8615 status_unused(seq); 8616 8617 mddev_put_locked(mddev); 8618 return 0; 8619 } 8620 8621 static const struct seq_operations md_seq_ops = { 8622 .start = md_seq_start, 8623 .next = md_seq_next, 8624 .stop = md_seq_stop, 8625 .show = md_seq_show, 8626 }; 8627 8628 static int md_seq_open(struct inode *inode, struct file *file) 8629 { 8630 struct seq_file *seq; 8631 int error; 8632 8633 error = seq_open(file, &md_seq_ops); 8634 if (error) 8635 return error; 8636 8637 seq = file->private_data; 8638 seq->poll_event = atomic_read(&md_event_count); 8639 
	return error;
}

static int md_unloading;
static __poll_t mdstat_poll(struct file *filp, poll_table *wait)
{
	struct seq_file *seq = filp->private_data;
	__poll_t mask;

	if (md_unloading)
		return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
	poll_wait(filp, &md_event_waiters, wait);

	/* always allow read */
	mask = EPOLLIN | EPOLLRDNORM;

	if (seq->poll_event != atomic_read(&md_event_count))
		mask |= EPOLLERR | EPOLLPRI;
	return mask;
}

static const struct proc_ops mdstat_proc_ops = {
	.proc_open	= md_seq_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= seq_release,
	.proc_poll	= mdstat_poll,
};

int register_md_submodule(struct md_submodule_head *msh)
{
	return xa_insert(&md_submodule, msh->id, msh, GFP_KERNEL);
}
EXPORT_SYMBOL_GPL(register_md_submodule);

void unregister_md_submodule(struct md_submodule_head *msh)
{
	xa_erase(&md_submodule, msh->id);
}
EXPORT_SYMBOL_GPL(unregister_md_submodule);

int md_setup_cluster(struct mddev *mddev, int nodes)
{
	int ret = get_cluster_ops(mddev);

	if (ret) {
		request_module("md-cluster");
		ret = get_cluster_ops(mddev);
	}

	/* ensure module won't be unloaded */
	if (ret) {
		pr_warn("can't find md-cluster module or get its reference.\n");
		return ret;
	}

	ret = mddev->cluster_ops->join(mddev, nodes);
	if (!ret)
		mddev->safemode_delay = 0;
	return ret;
}

void md_cluster_stop(struct mddev *mddev)
{
	put_cluster_ops(mddev);
}

static bool is_rdev_holder_idle(struct md_rdev *rdev, bool init)
{
	unsigned long last_events = rdev->last_events;

	if (!bdev_is_partition(rdev->bdev))
		return true;

	/*
	 * If the rdev is a partition, the array is still not idle when IO is
	 * issued to other partitions of the same disk, even if no IO is
	 * issued to the array itself.
	 */
	rdev->last_events = part_stat_read_accum(rdev->bdev->bd_disk->part0,
						 sectors) -
			    part_stat_read_accum(rdev->bdev, sectors);

	return init || rdev->last_events <= last_events;
}

/*
 * The mddev is idle if all of the following hold since the last check:
 * 1) no normal IO has completed on the mddev;
 * 2) the mddev has no inflight normal IO;
 * 3) if any member disk is a partition, no IO has completed on the other
 *    partitions of that disk.
 *
 * Note that this check relies on IO accounting being enabled.
 */
static bool is_mddev_idle(struct mddev *mddev, int init)
{
	unsigned long last_events = mddev->normal_io_events;
	struct gendisk *disk;
	struct md_rdev *rdev;
	bool idle = true;

	disk = mddev_is_dm(mddev) ? mddev->dm_gendisk : mddev->gendisk;
	if (!disk)
		return true;

	mddev->normal_io_events = part_stat_read_accum(disk->part0, sectors);
	if (!init && (mddev->normal_io_events > last_events ||
		      bdev_count_inflight(disk->part0)))
		idle = false;

	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev)
		if (!is_rdev_holder_idle(rdev, init))
			idle = false;
	rcu_read_unlock();

	return idle;
}

void md_done_sync(struct mddev *mddev, int blocks, int ok)
{
	/* another "blocks" (512byte) blocks have been synced */
	atomic_sub(blocks, &mddev->recovery_active);
	wake_up(&mddev->recovery_wait);
	if (!ok) {
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		// stop recovery, signal do_sync ....
	}
}
EXPORT_SYMBOL(md_done_sync);

/* md_write_start(mddev, bi)
 * If we need to update some array metadata (e.g. 'active' flag
 * in superblock) before writing, schedule a superblock update
 * and wait for it to complete.
 * The write is not allowed to proceed until that metadata update
 * has completed.
 */
void md_write_start(struct mddev *mddev, struct bio *bi)
{
	int did_change = 0;

	if (bio_data_dir(bi) != WRITE)
		return;

	BUG_ON(mddev->ro == MD_RDONLY);
	if (mddev->ro == MD_AUTO_READ) {
		/* need to switch to read/write */
		mddev->ro = MD_RDWR;
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		md_wakeup_thread(mddev->sync_thread);
		did_change = 1;
	}
	rcu_read_lock();
	percpu_ref_get(&mddev->writes_pending);
	smp_mb(); /* Match smp_mb in set_in_sync() */
	if (mddev->safemode == 1)
		mddev->safemode = 0;
	/* sync_checkers is always 0 when writes_pending is in per-cpu mode */
	if (mddev->in_sync || mddev->sync_checkers) {
		spin_lock(&mddev->lock);
		if (mddev->in_sync) {
			mddev->in_sync = 0;
			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
			set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
			md_wakeup_thread(mddev->thread);
			did_change = 1;
		}
		spin_unlock(&mddev->lock);
	}
	rcu_read_unlock();
	if (did_change)
		sysfs_notify_dirent_safe(mddev->sysfs_state);
	if (!mddev->has_superblocks)
		return;
	wait_event(mddev->sb_wait,
		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
}
EXPORT_SYMBOL(md_write_start);

/* md_write_inc can only be called when md_write_start() has
 * already been called at least once for the current request.
 * It increments the counter and is useful when a single request
 * is split into several parts. Each part causes an increment and
 * so needs a matching md_write_end().
 * Unlike md_write_start(), it is safe to call md_write_inc() inside
 * a spinlocked region.
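 * (md_write_start() may sleep in wait_event() until the superblock
 * update completes, which is why it must not be called under a spinlock.)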
8829 */ 8830 void md_write_inc(struct mddev *mddev, struct bio *bi) 8831 { 8832 if (bio_data_dir(bi) != WRITE) 8833 return; 8834 WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev)); 8835 percpu_ref_get(&mddev->writes_pending); 8836 } 8837 EXPORT_SYMBOL(md_write_inc); 8838 8839 void md_write_end(struct mddev *mddev) 8840 { 8841 percpu_ref_put(&mddev->writes_pending); 8842 8843 if (mddev->safemode == 2) 8844 md_wakeup_thread(mddev->thread); 8845 else if (mddev->safemode_delay) 8846 /* The roundup() ensures this only performs locking once 8847 * every ->safemode_delay jiffies 8848 */ 8849 mod_timer(&mddev->safemode_timer, 8850 roundup(jiffies, mddev->safemode_delay) + 8851 mddev->safemode_delay); 8852 } 8853 8854 EXPORT_SYMBOL(md_write_end); 8855 8856 /* This is used by raid0 and raid10 */ 8857 void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, 8858 struct bio *bio, sector_t start, sector_t size) 8859 { 8860 struct bio *discard_bio = NULL; 8861 8862 if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO, 8863 &discard_bio) || !discard_bio) 8864 return; 8865 8866 bio_chain(discard_bio, bio); 8867 bio_clone_blkg_association(discard_bio, bio); 8868 mddev_trace_remap(mddev, discard_bio, bio->bi_iter.bi_sector); 8869 submit_bio_noacct(discard_bio); 8870 } 8871 EXPORT_SYMBOL_GPL(md_submit_discard_bio); 8872 8873 static void md_bitmap_start(struct mddev *mddev, 8874 struct md_io_clone *md_io_clone) 8875 { 8876 if (mddev->pers->bitmap_sector) 8877 mddev->pers->bitmap_sector(mddev, &md_io_clone->offset, 8878 &md_io_clone->sectors); 8879 8880 mddev->bitmap_ops->start_write(mddev, md_io_clone->offset, 8881 md_io_clone->sectors); 8882 } 8883 8884 static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone) 8885 { 8886 mddev->bitmap_ops->end_write(mddev, md_io_clone->offset, 8887 md_io_clone->sectors); 8888 } 8889 8890 static void md_end_clone_io(struct bio *bio) 8891 { 8892 struct md_io_clone *md_io_clone = bio->bi_private; 8893 struct bio *orig_bio = md_io_clone->orig_bio; 8894 struct mddev *mddev = md_io_clone->mddev; 8895 8896 if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap) 8897 md_bitmap_end(mddev, md_io_clone); 8898 8899 if (bio->bi_status && !orig_bio->bi_status) 8900 orig_bio->bi_status = bio->bi_status; 8901 8902 if (md_io_clone->start_time) 8903 bio_end_io_acct(orig_bio, md_io_clone->start_time); 8904 8905 bio_put(bio); 8906 bio_endio(orig_bio); 8907 percpu_ref_put(&mddev->active_io); 8908 } 8909 8910 static void md_clone_bio(struct mddev *mddev, struct bio **bio) 8911 { 8912 struct block_device *bdev = (*bio)->bi_bdev; 8913 struct md_io_clone *md_io_clone; 8914 struct bio *clone = 8915 bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set); 8916 8917 md_io_clone = container_of(clone, struct md_io_clone, bio_clone); 8918 md_io_clone->orig_bio = *bio; 8919 md_io_clone->mddev = mddev; 8920 if (blk_queue_io_stat(bdev->bd_disk->queue)) 8921 md_io_clone->start_time = bio_start_io_acct(*bio); 8922 8923 if (bio_data_dir(*bio) == WRITE && mddev->bitmap) { 8924 md_io_clone->offset = (*bio)->bi_iter.bi_sector; 8925 md_io_clone->sectors = bio_sectors(*bio); 8926 md_bitmap_start(mddev, md_io_clone); 8927 } 8928 8929 clone->bi_end_io = md_end_clone_io; 8930 clone->bi_private = md_io_clone; 8931 *bio = clone; 8932 } 8933 8934 void md_account_bio(struct mddev *mddev, struct bio **bio) 8935 { 8936 percpu_ref_get(&mddev->active_io); 8937 md_clone_bio(mddev, bio); 8938 } 8939 EXPORT_SYMBOL_GPL(md_account_bio); 8940 8941 void md_free_cloned_bio(struct bio *bio) 
8942 { 8943 struct md_io_clone *md_io_clone = bio->bi_private; 8944 struct bio *orig_bio = md_io_clone->orig_bio; 8945 struct mddev *mddev = md_io_clone->mddev; 8946 8947 if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap) 8948 md_bitmap_end(mddev, md_io_clone); 8949 8950 if (bio->bi_status && !orig_bio->bi_status) 8951 orig_bio->bi_status = bio->bi_status; 8952 8953 if (md_io_clone->start_time) 8954 bio_end_io_acct(orig_bio, md_io_clone->start_time); 8955 8956 bio_put(bio); 8957 percpu_ref_put(&mddev->active_io); 8958 } 8959 EXPORT_SYMBOL_GPL(md_free_cloned_bio); 8960 8961 /* md_allow_write(mddev) 8962 * Calling this ensures that the array is marked 'active' so that writes 8963 * may proceed without blocking. It is important to call this before 8964 * attempting a GFP_KERNEL allocation while holding the mddev lock. 8965 * Must be called with mddev_lock held. 8966 */ 8967 void md_allow_write(struct mddev *mddev) 8968 { 8969 if (!mddev->pers) 8970 return; 8971 if (!md_is_rdwr(mddev)) 8972 return; 8973 if (!mddev->pers->sync_request) 8974 return; 8975 8976 spin_lock(&mddev->lock); 8977 if (mddev->in_sync) { 8978 mddev->in_sync = 0; 8979 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 8980 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 8981 if (mddev->safemode_delay && 8982 mddev->safemode == 0) 8983 mddev->safemode = 1; 8984 spin_unlock(&mddev->lock); 8985 md_update_sb(mddev, 0); 8986 sysfs_notify_dirent_safe(mddev->sysfs_state); 8987 /* wait for the dirty state to be recorded in the metadata */ 8988 wait_event(mddev->sb_wait, 8989 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 8990 } else 8991 spin_unlock(&mddev->lock); 8992 } 8993 EXPORT_SYMBOL_GPL(md_allow_write); 8994 8995 static sector_t md_sync_max_sectors(struct mddev *mddev, 8996 enum sync_action action) 8997 { 8998 switch (action) { 8999 case ACTION_RESYNC: 9000 case ACTION_CHECK: 9001 case ACTION_REPAIR: 9002 atomic64_set(&mddev->resync_mismatches, 0); 9003 fallthrough; 9004 case ACTION_RESHAPE: 9005 return mddev->resync_max_sectors; 9006 case ACTION_RECOVER: 9007 return mddev->dev_sectors; 9008 default: 9009 return 0; 9010 } 9011 } 9012 9013 static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) 9014 { 9015 sector_t start = 0; 9016 struct md_rdev *rdev; 9017 9018 switch (action) { 9019 case ACTION_CHECK: 9020 case ACTION_REPAIR: 9021 return mddev->resync_min; 9022 case ACTION_RESYNC: 9023 if (!mddev->bitmap) 9024 return mddev->resync_offset; 9025 return 0; 9026 case ACTION_RESHAPE: 9027 /* 9028 * If the original node aborts reshaping then we continue the 9029 * reshaping, so set again to avoid restart reshape from the 9030 * first beginning 9031 */ 9032 if (mddev_is_clustered(mddev) && 9033 mddev->reshape_position != MaxSector) 9034 return mddev->reshape_position; 9035 return 0; 9036 case ACTION_RECOVER: 9037 start = MaxSector; 9038 rcu_read_lock(); 9039 rdev_for_each_rcu(rdev, mddev) 9040 if (rdev_needs_recovery(rdev, start)) 9041 start = rdev->recovery_offset; 9042 rcu_read_unlock(); 9043 9044 /* If there is a bitmap, we need to make sure all 9045 * writes that started before we added a spare 9046 * complete before we start doing a recovery. 9047 * Otherwise the write might complete and (via 9048 * bitmap_endwrite) set a bit in the bitmap after the 9049 * recovery has checked that bit and skipped that 9050 * region. 
		 */
		if (mddev->bitmap) {
			mddev->pers->quiesce(mddev, 1);
			mddev->pers->quiesce(mddev, 0);
		}
		return start;
	default:
		return MaxSector;
	}
}

static bool sync_io_within_limit(struct mddev *mddev)
{
	int io_sectors;

	/*
	 * For raid456, sync IO is one stripe (4k) per IO; for other levels,
	 * it's RESYNC_PAGES (64k) per IO.
	 */
	if (mddev->level == 4 || mddev->level == 5 || mddev->level == 6)
		io_sectors = 8;
	else
		io_sectors = 128;

	return atomic_read(&mddev->recovery_active) <
		io_sectors * sync_io_depth(mddev);
}

#define SYNC_MARKS	10
#define	SYNC_MARK_STEP	(3*HZ)
#define UPDATE_FREQUENCY (5*60*HZ)
void md_do_sync(struct md_thread *thread)
{
	struct mddev *mddev = thread->mddev;
	struct mddev *mddev2;
	unsigned int currspeed = 0, window;
	sector_t max_sectors, j, io_sectors, recovery_done;
	unsigned long mark[SYNC_MARKS];
	unsigned long update_time;
	sector_t mark_cnt[SYNC_MARKS];
	int last_mark, m;
	sector_t last_check;
	int skipped = 0;
	struct md_rdev *rdev;
	enum sync_action action;
	const char *desc;
	struct blk_plug plug;
	int ret;

	/* just in case the thread restarts... */
	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
		return;

	if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
		goto skip;

	if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) ||
	    !md_is_rdwr(mddev)) {/* never try to sync a read-only array */
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		goto skip;
	}

	if (mddev_is_clustered(mddev)) {
		ret = mddev->cluster_ops->resync_start(mddev);
		if (ret)
			goto skip;

		set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags);
		if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
			test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
			test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
		    && ((unsigned long long)mddev->curr_resync_completed
			< (unsigned long long)mddev->resync_max_sectors))
			goto skip;
	}

	action = md_sync_action(mddev);
	desc = md_sync_action_name(action);
	mddev->last_sync_action = action;

	/*
	 * Before starting a resync we must have set curr_resync to
	 * MD_RESYNC_DELAYED (2), and then checked that every "conflicting"
	 * array has curr_resync less than ours. When we find one that is the
	 * same or higher we wait on resync_wait. To avoid deadlock, we reduce
	 * curr_resync to MD_RESYNC_YIELDED (1) if we choose to yield (based
	 * arbitrarily on the address of the mddev structure).
	 * This will mean we have to start checking from the beginning again.
9138 * 9139 */ 9140 if (mddev_is_clustered(mddev)) 9141 mddev->cluster_ops->resync_start_notify(mddev); 9142 do { 9143 int mddev2_minor = -1; 9144 mddev->curr_resync = MD_RESYNC_DELAYED; 9145 9146 try_again: 9147 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9148 goto skip; 9149 spin_lock(&all_mddevs_lock); 9150 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) { 9151 if (test_bit(MD_DELETED, &mddev2->flags)) 9152 continue; 9153 if (mddev2 == mddev) 9154 continue; 9155 if (!mddev->parallel_resync 9156 && mddev2->curr_resync 9157 && match_mddev_units(mddev, mddev2)) { 9158 DEFINE_WAIT(wq); 9159 if (mddev < mddev2 && 9160 mddev->curr_resync == MD_RESYNC_DELAYED) { 9161 /* arbitrarily yield */ 9162 mddev->curr_resync = MD_RESYNC_YIELDED; 9163 wake_up(&resync_wait); 9164 } 9165 if (mddev > mddev2 && 9166 mddev->curr_resync == MD_RESYNC_YIELDED) 9167 /* no need to wait here, we can wait the next 9168 * time 'round when curr_resync == 2 9169 */ 9170 continue; 9171 /* We need to wait 'interruptible' so as not to 9172 * contribute to the load average, and not to 9173 * be caught by 'softlockup' 9174 */ 9175 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 9176 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9177 mddev2->curr_resync >= mddev->curr_resync) { 9178 if (mddev2_minor != mddev2->md_minor) { 9179 mddev2_minor = mddev2->md_minor; 9180 pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n", 9181 desc, mdname(mddev), 9182 mdname(mddev2)); 9183 } 9184 spin_unlock(&all_mddevs_lock); 9185 9186 if (signal_pending(current)) 9187 flush_signals(current); 9188 schedule(); 9189 finish_wait(&resync_wait, &wq); 9190 goto try_again; 9191 } 9192 finish_wait(&resync_wait, &wq); 9193 } 9194 } 9195 spin_unlock(&all_mddevs_lock); 9196 } while (mddev->curr_resync < MD_RESYNC_DELAYED); 9197 9198 max_sectors = md_sync_max_sectors(mddev, action); 9199 j = md_sync_position(mddev, action); 9200 9201 pr_info("md: %s of RAID array %s\n", desc, mdname(mddev)); 9202 pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev)); 9203 pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n", 9204 speed_max(mddev), desc); 9205 9206 is_mddev_idle(mddev, 1); /* this initializes IO event counters */ 9207 9208 io_sectors = 0; 9209 for (m = 0; m < SYNC_MARKS; m++) { 9210 mark[m] = jiffies; 9211 mark_cnt[m] = io_sectors; 9212 } 9213 last_mark = 0; 9214 mddev->resync_mark = mark[last_mark]; 9215 mddev->resync_mark_cnt = mark_cnt[last_mark]; 9216 9217 /* 9218 * Tune reconstruction: 9219 */ 9220 window = 32 * (PAGE_SIZE / 512); 9221 pr_debug("md: using %dk window, over a total of %lluk.\n", 9222 window/2, (unsigned long long)max_sectors/2); 9223 9224 atomic_set(&mddev->recovery_active, 0); 9225 last_check = 0; 9226 9227 if (j >= MD_RESYNC_ACTIVE) { 9228 pr_debug("md: resuming %s of %s from checkpoint.\n", 9229 desc, mdname(mddev)); 9230 mddev->curr_resync = j; 9231 } else 9232 mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */ 9233 mddev->curr_resync_completed = j; 9234 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9235 md_new_event(); 9236 update_time = jiffies; 9237 9238 blk_start_plug(&plug); 9239 while (j < max_sectors) { 9240 sector_t sectors; 9241 9242 skipped = 0; 9243 9244 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9245 ((mddev->curr_resync > mddev->curr_resync_completed && 9246 (mddev->curr_resync - mddev->curr_resync_completed) 9247 > (max_sectors >> 4)) || 9248 
time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || 9249 (j - mddev->curr_resync_completed)*2 9250 >= mddev->resync_max - mddev->curr_resync_completed || 9251 mddev->curr_resync_completed > mddev->resync_max 9252 )) { 9253 /* time to update curr_resync_completed */ 9254 wait_event(mddev->recovery_wait, 9255 atomic_read(&mddev->recovery_active) == 0); 9256 mddev->curr_resync_completed = j; 9257 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 9258 j > mddev->resync_offset) 9259 mddev->resync_offset = j; 9260 update_time = jiffies; 9261 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 9262 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9263 } 9264 9265 while (j >= mddev->resync_max && 9266 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9267 /* As this condition is controlled by user-space, 9268 * we can block indefinitely, so use '_interruptible' 9269 * to avoid triggering warnings. 9270 */ 9271 flush_signals(current); /* just in case */ 9272 wait_event_interruptible(mddev->recovery_wait, 9273 mddev->resync_max > j 9274 || test_bit(MD_RECOVERY_INTR, 9275 &mddev->recovery)); 9276 } 9277 9278 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9279 break; 9280 9281 sectors = mddev->pers->sync_request(mddev, j, max_sectors, 9282 &skipped); 9283 if (sectors == 0) { 9284 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9285 break; 9286 } 9287 9288 if (!skipped) { /* actual IO requested */ 9289 io_sectors += sectors; 9290 atomic_add(sectors, &mddev->recovery_active); 9291 } 9292 9293 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9294 break; 9295 9296 j += sectors; 9297 if (j > max_sectors) 9298 /* when skipping, extra large numbers can be returned. */ 9299 j = max_sectors; 9300 if (j >= MD_RESYNC_ACTIVE) 9301 mddev->curr_resync = j; 9302 mddev->curr_mark_cnt = io_sectors; 9303 if (last_check == 0) 9304 /* this is the earliest that rebuild will be 9305 * visible in /proc/mdstat 9306 */ 9307 md_new_event(); 9308 9309 if (last_check + window > io_sectors || j == max_sectors) 9310 continue; 9311 9312 last_check = io_sectors; 9313 repeat: 9314 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 9315 /* step marks */ 9316 int next = (last_mark+1) % SYNC_MARKS; 9317 9318 mddev->resync_mark = mark[next]; 9319 mddev->resync_mark_cnt = mark_cnt[next]; 9320 mark[next] = jiffies; 9321 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 9322 last_mark = next; 9323 } 9324 9325 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9326 break; 9327 9328 /* 9329 * this loop exits only if either when we are slower than 9330 * the 'hard' speed limit, or the system was IO-idle for 9331 * a jiffy. 9332 * the system might be non-idle CPU-wise, but we only care 9333 * about not overloading the IO subsystem. (things like an 9334 * e2fsck being done on the RAID array should execute fast) 9335 */ 9336 cond_resched(); 9337 9338 recovery_done = io_sectors - atomic_read(&mddev->recovery_active); 9339 currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2 9340 /((jiffies-mddev->resync_mark)/HZ +1) +1; 9341 9342 if (currspeed > speed_min(mddev)) { 9343 if (currspeed > speed_max(mddev)) { 9344 msleep(500); 9345 goto repeat; 9346 } 9347 if (!sync_io_within_limit(mddev) && 9348 !is_mddev_idle(mddev, 0)) { 9349 /* 9350 * Give other IO more of a chance. 9351 * The faster the devices, the less we wait. 
9352 */ 9353 wait_event(mddev->recovery_wait, 9354 !atomic_read(&mddev->recovery_active)); 9355 } 9356 } 9357 } 9358 pr_info("md: %s: %s %s.\n",mdname(mddev), desc, 9359 test_bit(MD_RECOVERY_INTR, &mddev->recovery) 9360 ? "interrupted" : "done"); 9361 /* 9362 * this also signals 'finished resyncing' to md_stop 9363 */ 9364 blk_finish_plug(&plug); 9365 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 9366 9367 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9368 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9369 mddev->curr_resync >= MD_RESYNC_ACTIVE) { 9370 mddev->curr_resync_completed = mddev->curr_resync; 9371 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9372 } 9373 mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped); 9374 9375 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 9376 mddev->curr_resync > MD_RESYNC_ACTIVE) { 9377 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 9378 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9379 if (mddev->curr_resync >= mddev->resync_offset) { 9380 pr_debug("md: checkpointing %s of %s.\n", 9381 desc, mdname(mddev)); 9382 if (test_bit(MD_RECOVERY_ERROR, 9383 &mddev->recovery)) 9384 mddev->resync_offset = 9385 mddev->curr_resync_completed; 9386 else 9387 mddev->resync_offset = 9388 mddev->curr_resync; 9389 } 9390 } else 9391 mddev->resync_offset = MaxSector; 9392 } else { 9393 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9394 mddev->curr_resync = MaxSector; 9395 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9396 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { 9397 rcu_read_lock(); 9398 rdev_for_each_rcu(rdev, mddev) 9399 if (mddev->delta_disks >= 0 && 9400 rdev_needs_recovery(rdev, mddev->curr_resync)) 9401 rdev->recovery_offset = mddev->curr_resync; 9402 rcu_read_unlock(); 9403 } 9404 } 9405 } 9406 skip: 9407 /* set CHANGE_PENDING here since maybe another update is needed, 9408 * so other nodes are informed. It should be harmless for normal 9409 * raid */ 9410 set_mask_bits(&mddev->sb_flags, 0, 9411 BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); 9412 9413 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9414 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9415 mddev->delta_disks > 0 && 9416 mddev->pers->finish_reshape && 9417 mddev->pers->size && 9418 !mddev_is_dm(mddev)) { 9419 mddev_lock_nointr(mddev); 9420 md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); 9421 mddev_unlock(mddev); 9422 if (!mddev_is_clustered(mddev)) 9423 set_capacity_and_notify(mddev->gendisk, 9424 mddev->array_sectors); 9425 } 9426 9427 spin_lock(&mddev->lock); 9428 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9429 /* We completed so min/max setting can be forgotten if used. */ 9430 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 9431 mddev->resync_min = 0; 9432 mddev->resync_max = MaxSector; 9433 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 9434 mddev->resync_min = mddev->curr_resync_completed; 9435 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 9436 mddev->curr_resync = MD_RESYNC_NONE; 9437 spin_unlock(&mddev->lock); 9438 9439 wake_up(&resync_wait); 9440 md_wakeup_thread(mddev->thread); 9441 return; 9442 } 9443 EXPORT_SYMBOL_GPL(md_do_sync); 9444 9445 static bool rdev_removeable(struct md_rdev *rdev) 9446 { 9447 /* rdev is not used. */ 9448 if (rdev->raid_disk < 0) 9449 return false; 9450 9451 /* There are still inflight io, don't remove this rdev. 
	 */
	if (atomic_read(&rdev->nr_pending))
		return false;

	/*
	 * An error occurred but has not yet been acknowledged by the metadata
	 * handler; don't remove this rdev.
	 */
	if (test_bit(Blocked, &rdev->flags))
		return false;

	/* A Faulty rdev is not used, so it's safe to remove it. */
	if (test_bit(Faulty, &rdev->flags))
		return true;

	/* A journal disk can only be removed if it's faulty. */
	if (test_bit(Journal, &rdev->flags))
		return false;

	/*
	 * 'In_sync' is cleared while 'raid_disk' is valid, which means the
	 * replacement has just become active from pers->spare_active(), and
	 * then pers->hot_remove_disk() will replace this rdev with the
	 * replacement.
	 */
	if (!test_bit(In_sync, &rdev->flags))
		return true;

	return false;
}

static bool rdev_is_spare(struct md_rdev *rdev)
{
	return !test_bit(Candidate, &rdev->flags) && rdev->raid_disk >= 0 &&
	       !test_bit(In_sync, &rdev->flags) &&
	       !test_bit(Journal, &rdev->flags) &&
	       !test_bit(Faulty, &rdev->flags);
}

static bool rdev_addable(struct md_rdev *rdev)
{
	struct mddev *mddev;

	mddev = READ_ONCE(rdev->mddev);
	if (!mddev)
		return false;

	/* rdev is already used, don't add it again. */
	if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 ||
	    test_bit(Faulty, &rdev->flags))
		return false;

	/* Adding a journal disk is allowed. */
	if (test_bit(Journal, &rdev->flags))
		return true;

	/* Adding is allowed if the array is read-write. */
	if (md_is_rdwr(mddev))
		return true;

	/*
	 * For a read-only array, only allow re-adding an rdev. If a bitmap is
	 * in use, don't allow re-adding an rdev that is too old.
9513 */ 9514 if (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags)) 9515 return true; 9516 9517 return false; 9518 } 9519 9520 static bool md_spares_need_change(struct mddev *mddev) 9521 { 9522 struct md_rdev *rdev; 9523 9524 rcu_read_lock(); 9525 rdev_for_each_rcu(rdev, mddev) { 9526 if (rdev_removeable(rdev) || rdev_addable(rdev)) { 9527 rcu_read_unlock(); 9528 return true; 9529 } 9530 } 9531 rcu_read_unlock(); 9532 return false; 9533 } 9534 9535 static int remove_spares(struct mddev *mddev, struct md_rdev *this) 9536 { 9537 struct md_rdev *rdev; 9538 int removed = 0; 9539 9540 rdev_for_each(rdev, mddev) { 9541 if ((this == NULL || rdev == this) && rdev_removeable(rdev) && 9542 !mddev->pers->hot_remove_disk(mddev, rdev)) { 9543 sysfs_unlink_rdev(mddev, rdev); 9544 rdev->saved_raid_disk = rdev->raid_disk; 9545 rdev->raid_disk = -1; 9546 removed++; 9547 } 9548 } 9549 9550 if (removed && mddev->kobj.sd) 9551 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 9552 9553 return removed; 9554 } 9555 9556 static int remove_and_add_spares(struct mddev *mddev, 9557 struct md_rdev *this) 9558 { 9559 struct md_rdev *rdev; 9560 int spares = 0; 9561 int removed = 0; 9562 9563 if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 9564 /* Mustn't remove devices when resync thread is running */ 9565 return 0; 9566 9567 removed = remove_spares(mddev, this); 9568 if (this && removed) 9569 goto no_add; 9570 9571 rdev_for_each(rdev, mddev) { 9572 if (this && this != rdev) 9573 continue; 9574 if (rdev_is_spare(rdev)) 9575 spares++; 9576 if (!rdev_addable(rdev)) 9577 continue; 9578 if (!test_bit(Journal, &rdev->flags)) 9579 rdev->recovery_offset = 0; 9580 if (mddev->pers->hot_add_disk(mddev, rdev) == 0) { 9581 /* failure here is OK */ 9582 sysfs_link_rdev(mddev, rdev); 9583 if (!test_bit(Journal, &rdev->flags)) 9584 spares++; 9585 md_new_event(); 9586 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9587 } 9588 } 9589 no_add: 9590 if (removed) 9591 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9592 return spares; 9593 } 9594 9595 static bool md_choose_sync_action(struct mddev *mddev, int *spares) 9596 { 9597 /* Check if reshape is in progress first. */ 9598 if (mddev->reshape_position != MaxSector) { 9599 if (mddev->pers->check_reshape == NULL || 9600 mddev->pers->check_reshape(mddev) != 0) 9601 return false; 9602 9603 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 9604 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9605 return true; 9606 } 9607 9608 /* Check if resync is in progress. */ 9609 if (mddev->resync_offset < MaxSector) { 9610 remove_spares(mddev, NULL); 9611 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9612 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9613 return true; 9614 } 9615 9616 /* 9617 * Remove any failed drives, then add spares if possible. Spares are 9618 * also removed and re-added, to allow the personality to fail the 9619 * re-add. 9620 */ 9621 *spares = remove_and_add_spares(mddev, NULL); 9622 if (*spares) { 9623 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9624 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 9625 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 9626 9627 /* Start new recovery. */ 9628 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9629 return true; 9630 } 9631 9632 /* Delay to choose resync/check/repair in md_do_sync(). 
*/ 9633 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 9634 return true; 9635 9636 /* Nothing to be done */ 9637 return false; 9638 } 9639 9640 static void md_start_sync(struct work_struct *ws) 9641 { 9642 struct mddev *mddev = container_of(ws, struct mddev, sync_work); 9643 int spares = 0; 9644 bool suspend = false; 9645 char *name; 9646 9647 /* 9648 * If reshape is still in progress, spares won't be added or removed 9649 * from conf until reshape is done. 9650 */ 9651 if (mddev->reshape_position == MaxSector && 9652 md_spares_need_change(mddev)) { 9653 suspend = true; 9654 mddev_suspend(mddev, false); 9655 } 9656 9657 mddev_lock_nointr(mddev); 9658 if (!md_is_rdwr(mddev)) { 9659 /* 9660 * On a read-only array we can: 9661 * - remove failed devices 9662 * - add already-in_sync devices if the array itself is in-sync. 9663 * As we only add devices that are already in-sync, we can 9664 * activate the spares immediately. 9665 */ 9666 remove_and_add_spares(mddev, NULL); 9667 goto not_running; 9668 } 9669 9670 if (!md_choose_sync_action(mddev, &spares)) 9671 goto not_running; 9672 9673 if (!mddev->pers->sync_request) 9674 goto not_running; 9675 9676 /* 9677 * We are adding a device or devices to an array which has the bitmap 9678 * stored on all devices. So make sure all bitmap pages get written. 9679 */ 9680 if (spares) 9681 mddev->bitmap_ops->write_all(mddev); 9682 9683 name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ? 9684 "reshape" : "resync"; 9685 rcu_assign_pointer(mddev->sync_thread, 9686 md_register_thread(md_do_sync, mddev, name)); 9687 if (!mddev->sync_thread) { 9688 pr_warn("%s: could not start resync thread...\n", 9689 mdname(mddev)); 9690 /* leave the spares where they are, it shouldn't hurt */ 9691 goto not_running; 9692 } 9693 9694 mddev_unlock(mddev); 9695 /* 9696 * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should 9697 * not set it again. Otherwise, we may cause issue like this one: 9698 * https://bugzilla.kernel.org/show_bug.cgi?id=218200 9699 * Therefore, use __mddev_resume(mddev, false). 9700 */ 9701 if (suspend) 9702 __mddev_resume(mddev, false); 9703 md_wakeup_thread(mddev->sync_thread); 9704 sysfs_notify_dirent_safe(mddev->sysfs_action); 9705 md_new_event(); 9706 return; 9707 9708 not_running: 9709 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9710 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 9711 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 9712 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 9713 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9714 mddev_unlock(mddev); 9715 /* 9716 * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should 9717 * not set it again. Otherwise, we may cause issue like this one: 9718 * https://bugzilla.kernel.org/show_bug.cgi?id=218200 9719 * Therefore, use __mddev_resume(mddev, false). 
	 */
	if (suspend)
		__mddev_resume(mddev, false);

	wake_up(&resync_wait);
	if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
	    mddev->sysfs_action)
		sysfs_notify_dirent_safe(mddev->sysfs_action);
}

static void unregister_sync_thread(struct mddev *mddev)
{
	if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
		/* resync/recovery still happening */
		clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		return;
	}

	if (WARN_ON_ONCE(!mddev->sync_thread))
		return;

	md_reap_sync_thread(mddev);
}

/*
 * This routine is regularly called by all per-raid-array threads to
 * deal with generic issues like resync and super-block update.
 * Raid personalities that don't have a thread (linear/raid0) do not
 * need this as they never do any recovery or update the superblock.
 *
 * It does not do any resync itself, but rather "forks" off other threads
 * to do that as needed.
 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
 * "->recovery" and create a thread at ->sync_thread.
 * When the thread finishes it sets MD_RECOVERY_DONE
 * and wakes up this thread, which will reap the thread and finish up.
 * This thread also removes any faulty devices (with nr_pending == 0).
 *
 * The overall approach is:
 * 1/ if the superblock needs updating, update it.
 * 2/ If a recovery thread is running, don't do anything else.
 * 3/ If recovery has finished, clean up, possibly marking spares active.
 * 4/ If there are any faulty devices, remove them.
 * 5/ If the array is degraded, try to add spare devices.
 * 6/ If the array has spares or is not in-sync, start a resync thread.
 */
void md_check_recovery(struct mddev *mddev)
{
	if (mddev->bitmap)
		mddev->bitmap_ops->daemon_work(mddev);

	if (signal_pending(current)) {
		if (mddev->pers->sync_request && !mddev->external) {
			pr_debug("md: %s in immediate safe mode\n",
				 mdname(mddev));
			mddev->safemode = 2;
		}
		flush_signals(current);
	}

	if (!md_is_rdwr(mddev) &&
	    !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_DONE, &mddev->recovery))
		return;
	if ( ! (
		(mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
		(mddev->external == 0 && mddev->safemode == 1) ||
		(mddev->safemode == 2
			&& !mddev->in_sync && mddev->resync_offset == MaxSector)
		))
		return;

	if (mddev_trylock(mddev)) {
		bool try_set_sync = mddev->safemode != 0;

		if (!mddev->external && mddev->safemode == 1)
			mddev->safemode = 0;

		if (!md_is_rdwr(mddev)) {
			struct md_rdev *rdev;

			if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
				unregister_sync_thread(mddev);
				goto unlock;
			}

			if (!mddev->external && mddev->in_sync)
				/*
				 * 'Blocked' flag not needed as failed devices
				 * will be recorded if array switched to read/write.
				 * Leaving it set will prevent the device
				 * from being removed.
9814 */ 9815 rdev_for_each(rdev, mddev) 9816 clear_bit(Blocked, &rdev->flags); 9817 9818 /* 9819 * There is no thread, but we need to call 9820 * ->spare_active and clear saved_raid_disk 9821 */ 9822 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9823 md_reap_sync_thread(mddev); 9824 9825 /* 9826 * Let md_start_sync() to remove and add rdevs to the 9827 * array. 9828 */ 9829 if (md_spares_need_change(mddev)) { 9830 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9831 queue_work(md_misc_wq, &mddev->sync_work); 9832 } 9833 9834 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9835 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9836 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 9837 9838 goto unlock; 9839 } 9840 9841 if (mddev_is_clustered(mddev)) { 9842 struct md_rdev *rdev, *tmp; 9843 /* kick the device if another node issued a 9844 * remove disk. 9845 */ 9846 rdev_for_each_safe(rdev, tmp, mddev) { 9847 if (rdev->raid_disk < 0 && 9848 test_and_clear_bit(ClusterRemove, &rdev->flags)) 9849 md_kick_rdev_from_array(rdev); 9850 } 9851 } 9852 9853 if (try_set_sync && !mddev->external && !mddev->in_sync) { 9854 spin_lock(&mddev->lock); 9855 set_in_sync(mddev); 9856 spin_unlock(&mddev->lock); 9857 } 9858 9859 if (mddev->sb_flags) 9860 md_update_sb(mddev, 0); 9861 9862 /* 9863 * Never start a new sync thread if MD_RECOVERY_RUNNING is 9864 * still set. 9865 */ 9866 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 9867 unregister_sync_thread(mddev); 9868 goto unlock; 9869 } 9870 9871 /* Set RUNNING before clearing NEEDED to avoid 9872 * any transients in the value of "sync_action". 9873 */ 9874 mddev->curr_resync_completed = 0; 9875 spin_lock(&mddev->lock); 9876 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9877 spin_unlock(&mddev->lock); 9878 /* Clear some bits that don't mean anything, but 9879 * might be left set 9880 */ 9881 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 9882 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 9883 9884 if (test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) && 9885 !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 9886 queue_work(md_misc_wq, &mddev->sync_work); 9887 } else { 9888 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9889 wake_up(&resync_wait); 9890 } 9891 9892 unlock: 9893 wake_up(&mddev->sb_wait); 9894 mddev_unlock(mddev); 9895 } 9896 } 9897 EXPORT_SYMBOL(md_check_recovery); 9898 9899 void md_reap_sync_thread(struct mddev *mddev) 9900 { 9901 struct md_rdev *rdev; 9902 sector_t old_dev_sectors = mddev->dev_sectors; 9903 bool is_reshaped = false; 9904 9905 /* resync has finished, collect result */ 9906 md_unregister_thread(mddev, &mddev->sync_thread); 9907 atomic_inc(&mddev->sync_seq); 9908 9909 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9910 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 9911 mddev->degraded != mddev->raid_disks) { 9912 /* success...*/ 9913 /* activate any spares */ 9914 if (mddev->pers->spare_active(mddev)) { 9915 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 9916 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9917 } 9918 } 9919 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9920 mddev->pers->finish_reshape) { 9921 mddev->pers->finish_reshape(mddev); 9922 if (mddev_is_clustered(mddev)) 9923 is_reshaped = true; 9924 } 9925 9926 /* If array is no-longer degraded, then any saved_raid_disk 9927 * information must be scrapped. 
	 */
	if (!mddev->degraded)
		rdev_for_each(rdev, mddev)
			rdev->saved_raid_disk = -1;

	md_update_sb(mddev, 1);
	/* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can
	 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
	 * clustered raid */
	if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
		mddev->cluster_ops->resync_finish(mddev);
	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
	/*
	 * We call mddev->cluster_ops->update_size here because sync_size could
	 * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
	 * so it is time to update the size across the cluster.
	 */
	if (mddev_is_clustered(mddev) && is_reshaped
				      && !test_bit(MD_CLOSING, &mddev->flags))
		mddev->cluster_ops->update_size(mddev, old_dev_sectors);
	/* flag recovery needed just to double check */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	sysfs_notify_dirent_safe(mddev->sysfs_completed);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	md_new_event();
	if (mddev->event_work.func)
		queue_work(md_misc_wq, &mddev->event_work);
	wake_up(&resync_wait);
}
EXPORT_SYMBOL(md_reap_sync_thread);

void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
{
	sysfs_notify_dirent_safe(rdev->sysfs_state);
	wait_event_timeout(rdev->blocked_wait, !rdev_blocked(rdev),
			   msecs_to_jiffies(5000));
	rdev_dec_pending(rdev, mddev);
}
EXPORT_SYMBOL(md_wait_for_blocked_rdev);

void md_finish_reshape(struct mddev *mddev)
{
	/* called by the personality module when reshape completes. */
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev) {
		if (rdev->data_offset > rdev->new_data_offset)
			rdev->sectors += rdev->data_offset - rdev->new_data_offset;
		else
			rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
		rdev->data_offset = rdev->new_data_offset;
	}
}
EXPORT_SYMBOL(md_finish_reshape);

/* Bad block management */

/* Returns true on success, false on failure */
bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
			int is_new)
{
	struct mddev *mddev = rdev->mddev;

	/*
	 * Recording new badblocks for a faulty rdev will force unnecessary
	 * superblock updating. This is fragile for external management because
	 * a userspace daemon may be trying to remove this device and a
	 * deadlock may occur. This will probably be solved in mdadm, but it
	 * is safer to avoid it here.

/* Bad block management */

/* Returns true on success, false on failure */
bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
			int is_new)
{
	struct mddev *mddev = rdev->mddev;

	/*
	 * Recording new badblocks for a faulty rdev will force unnecessary
	 * superblock updates. This is fragile for externally managed arrays
	 * because a userspace daemon may be trying to remove this device and
	 * a deadlock may occur. This will probably be solved in mdadm, but it
	 * is safer to avoid it here.
	 */
	if (test_bit(Faulty, &rdev->flags))
		return true;

	if (is_new)
		s += rdev->new_data_offset;
	else
		s += rdev->data_offset;

	if (!badblocks_set(&rdev->badblocks, s, sectors, 0))
		return false;

	/* Make sure they get written out promptly */
	if (test_bit(ExternalBbl, &rdev->flags))
		sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
	sysfs_notify_dirent_safe(rdev->sysfs_state);
	set_mask_bits(&mddev->sb_flags, 0,
		      BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
	md_wakeup_thread(rdev->mddev->thread);
	return true;
}
EXPORT_SYMBOL_GPL(rdev_set_badblocks);

void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
			  int is_new)
{
	if (is_new)
		s += rdev->new_data_offset;
	else
		s += rdev->data_offset;

	if (!badblocks_clear(&rdev->badblocks, s, sectors))
		return;

	if (test_bit(ExternalBbl, &rdev->flags))
		sysfs_notify_dirent_safe(rdev->sysfs_badblocks);
}
EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
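
/*
 * Illustrative sketch (comment only, not compiled) of how a personality might
 * use the two helpers above when a write to one member fails for a narrow
 * range: record the range as bad and carry on, and only fail the whole device
 * if the bad-block table cannot take the new entry.  The function name is
 * hypothetical; raid1/raid10 implement this pattern with their own
 * bookkeeping around it:
 *
 *	static void example_handle_write_error(struct md_rdev *rdev,
 *					       sector_t sector, int sectors)
 *	{
 *		if (!rdev_set_badblocks(rdev, sector, sectors, 0))
 *			md_error(rdev->mddev, rdev);
 *	}
 */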

static int md_notify_reboot(struct notifier_block *this,
			    unsigned long code, void *x)
{
	struct mddev *mddev;
	int need_delay = 0;

	spin_lock(&all_mddevs_lock);
	list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
		if (!mddev_get(mddev))
			continue;
		spin_unlock(&all_mddevs_lock);
		if (mddev_trylock(mddev)) {
			if (mddev->pers)
				__md_stop_writes(mddev);
			if (mddev->persistent)
				mddev->safemode = 2;
			mddev_unlock(mddev);
		}
		need_delay = 1;
		spin_lock(&all_mddevs_lock);
		mddev_put_locked(mddev);
	}
	spin_unlock(&all_mddevs_lock);

	/*
	 * Certain more exotic SCSI devices are known to be
	 * volatile w.r.t. too-early system reboots. While the
	 * right place to handle this issue is the individual
	 * driver, we do want to have a safe RAID driver ...
	 */
	if (need_delay)
		msleep(1000);

	return NOTIFY_DONE;
}

static struct notifier_block md_notifier = {
	.notifier_call	= md_notify_reboot,
	.next		= NULL,
	.priority	= INT_MAX, /* before any real devices */
};

static void md_geninit(void)
{
	pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));

	proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops);
}

static int __init md_init(void)
{
	int ret = -ENOMEM;

	md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
	if (!md_wq)
		goto err_wq;

	md_misc_wq = alloc_workqueue("md_misc", 0, 0);
	if (!md_misc_wq)
		goto err_misc_wq;

	md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND,
				       0);
	if (!md_bitmap_wq)
		goto err_bitmap_wq;

	ret = __register_blkdev(MD_MAJOR, "md", md_probe);
	if (ret < 0)
		goto err_md;

	ret = __register_blkdev(0, "mdp", md_probe);
	if (ret < 0)
		goto err_mdp;
	mdp_major = ret;

	register_reboot_notifier(&md_notifier);
	raid_table_header = register_sysctl("dev/raid", raid_table);

	md_geninit();
	return 0;

err_mdp:
	unregister_blkdev(MD_MAJOR, "md");
err_md:
	destroy_workqueue(md_bitmap_wq);
err_bitmap_wq:
	destroy_workqueue(md_misc_wq);
err_misc_wq:
	destroy_workqueue(md_wq);
err_wq:
	return ret;
}
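
/*
 * check_sb_changes() below is only reached via md_reload_sb(), which
 * md-cluster calls after another node announces that it has written an
 * updated superblock.  A rough, hypothetical sketch of such a caller (the
 * real handler lives in md-cluster.c and takes the device number from the
 * cluster message); 'slot' is the desc_nr of the rdev whose superblock
 * changed on the other node:
 *
 *	static void example_on_metadata_updated(struct mddev *mddev, int slot)
 *	{
 *		md_reload_sb(mddev, slot);
 *	}
 */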

static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	struct md_rdev *rdev2, *tmp;
	int role, ret;

	/*
	 * If the size was changed on another node, we need to
	 * resize here as well.
	 */
	if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
		ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
		if (ret)
			pr_info("md-cluster: resize failed\n");
		else
			mddev->bitmap_ops->update_sb(mddev->bitmap);
	}

	/* Check for change of roles in the active devices */
	rdev_for_each_safe(rdev2, tmp, mddev) {
		if (test_bit(Faulty, &rdev2->flags)) {
			if (test_bit(ClusterRemove, &rdev2->flags))
				set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			continue;
		}

		/* Check if the roles changed */
		role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);

		if (test_bit(Candidate, &rdev2->flags)) {
			if (role == MD_DISK_ROLE_FAULTY) {
				pr_info("md: Removing Candidate device %pg because add failed\n",
					rdev2->bdev);
				md_kick_rdev_from_array(rdev2);
				continue;
			} else
				clear_bit(Candidate, &rdev2->flags);
		}

		if (role != rdev2->raid_disk) {
			/*
			 * The device got activated, unless a reshape is
			 * happening.
			 */
			if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE &&
			    !(le32_to_cpu(sb->feature_map) &
			      MD_FEATURE_RESHAPE_ACTIVE) &&
			    !mddev->cluster_ops->resync_status_get(mddev)) {
				/*
				 * -1 to make raid1_add_disk() set conf->fullsync
				 * to 1. This avoids skipping the sync when the
				 * remote node went down during resync.
				 */
				if ((le32_to_cpu(sb->feature_map)
				     & MD_FEATURE_RECOVERY_OFFSET))
					rdev2->saved_raid_disk = -1;
				else
					rdev2->saved_raid_disk = role;
				ret = remove_and_add_spares(mddev, rdev2);
				pr_info("Activated spare: %pg\n",
					rdev2->bdev);
				/* Wake up mddev->thread here, so the array can
				 * perform resync with the newly activated disk */
				set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
				md_wakeup_thread(mddev->thread);
			}
			/* Device is faulty.
			 * We just want to do the minimum to mark the disk
			 * as faulty. The recovery is performed by the
			 * node that initiated the error.
			 */
			if (role == MD_DISK_ROLE_FAULTY ||
			    role == MD_DISK_ROLE_JOURNAL) {
				md_error(mddev, rdev2);
				clear_bit(Blocked, &rdev2->flags);
			}
		}
	}

	if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) {
		ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
		if (ret)
			pr_warn("md: updating array disks failed. %d\n", ret);
	}

	/*
	 * mddev->delta_disks has already been updated in update_raid_disks(),
	 * so it is time to check for a reshape.
	 */
	if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
		/*
		 * A reshape is happening on the remote node; we need to
		 * update reshape_position and call start_reshape.
		 */
		mddev->reshape_position = le64_to_cpu(sb->reshape_position);
		if (mddev->pers->update_reshape_pos)
			mddev->pers->update_reshape_pos(mddev);
		if (mddev->pers->start_reshape)
			mddev->pers->start_reshape(mddev);
	} else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
		   mddev->reshape_position != MaxSector &&
		   !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
		/* The reshape has just finished on another node. */
		mddev->reshape_position = MaxSector;
		if (mddev->pers->update_reshape_pos)
			mddev->pers->update_reshape_pos(mddev);
	}

	/* Finally set the event count to be up to date */
	mddev->events = le64_to_cpu(sb->events);
}

static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
	int err;
	struct page *swapout = rdev->sb_page;
	struct mdp_superblock_1 *sb;

	/* Store the sb page of the rdev in the swapout temporary
	 * variable in case we fail later on.
	 */
	rdev->sb_page = NULL;
	err = alloc_disk_sb(rdev);
	if (err == 0) {
		ClearPageUptodate(rdev->sb_page);
		rdev->sb_loaded = 0;
		err = super_types[mddev->major_version].
			load_super(rdev, NULL, mddev->minor_version);
	}
	if (err < 0) {
		pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
			__func__, __LINE__, rdev->desc_nr, err);
		if (rdev->sb_page)
			put_page(rdev->sb_page);
		rdev->sb_page = swapout;
		rdev->sb_loaded = 1;
		return err;
	}

	sb = page_address(rdev->sb_page);
	/* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET
	 * is not set
	 */

	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
		rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);

	/* The other node finished recovery, call spare_active to set
	 * the device In_sync and update mddev->degraded
	 */
	if (rdev->recovery_offset == MaxSector &&
	    !test_bit(In_sync, &rdev->flags) &&
	    mddev->pers->spare_active(mddev))
		sysfs_notify_dirent_safe(mddev->sysfs_degraded);

	put_page(swapout);
	return 0;
}

void md_reload_sb(struct mddev *mddev, int nr)
{
	struct md_rdev *rdev = NULL, *iter;
	int err;

	/* Find the rdev */
	rdev_for_each_rcu(iter, mddev) {
		if (iter->desc_nr == nr) {
			rdev = iter;
			break;
		}
	}

	if (!rdev) {
		pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
		return;
	}

	err = read_rdev(mddev, rdev);
	if (err < 0)
		return;

	check_sb_changes(mddev, rdev);

	/* Read all rdevs to update recovery_offset */
	rdev_for_each_rcu(rdev, mddev) {
		if (!test_bit(Faulty, &rdev->flags))
			read_rdev(mddev, rdev);
	}
}
EXPORT_SYMBOL(md_reload_sb);

#ifndef MODULE

/*
 * Searches all registered partitions for autorun RAID arrays
 * at boot time.
 */

static DEFINE_MUTEX(detected_devices_mutex);
static LIST_HEAD(all_detected_devices);
struct detected_devices_node {
	struct list_head list;
	dev_t dev;
};

void md_autodetect_dev(dev_t dev)
{
	struct detected_devices_node *node_detected_dev;

	node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
	if (node_detected_dev) {
		node_detected_dev->dev = dev;
		mutex_lock(&detected_devices_mutex);
		list_add_tail(&node_detected_dev->list, &all_detected_devices);
		mutex_unlock(&detected_devices_mutex);
	}
}
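
/*
 * Illustrative sketch (comment only, not compiled): md_autodetect_dev() above
 * is intended to be called from the boot-time partition scanning code for
 * each partition whose type marks it for RAID autodetect (type 0xfd on MSDOS
 * partition tables).  A hypothetical caller needs nothing but the partition's
 * dev_t:
 *
 *	static void example_queue_raid_partition(struct block_device *part)
 *	{
 *		md_autodetect_dev(part->bd_dev);
 *	}
 *
 * md_autostart_arrays() below later drains this list, imports each device
 * with md_import_device() and hands the result to autorun_devices().
 */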

void md_autostart_arrays(int part)
{
	struct md_rdev *rdev;
	struct detected_devices_node *node_detected_dev;
	dev_t dev;
	int i_scanned, i_passed;

	i_scanned = 0;
	i_passed = 0;

	pr_info("md: Autodetecting RAID arrays.\n");

	mutex_lock(&detected_devices_mutex);
	while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
		i_scanned++;
		node_detected_dev = list_entry(all_detected_devices.next,
					struct detected_devices_node, list);
		list_del(&node_detected_dev->list);
		dev = node_detected_dev->dev;
		kfree(node_detected_dev);
		mutex_unlock(&detected_devices_mutex);
		rdev = md_import_device(dev, 0, 90);
		mutex_lock(&detected_devices_mutex);
		if (IS_ERR(rdev))
			continue;

		if (test_bit(Faulty, &rdev->flags))
			continue;

		set_bit(AutoDetected, &rdev->flags);
		list_add(&rdev->same_set, &pending_raid_disks);
		i_passed++;
	}
	mutex_unlock(&detected_devices_mutex);

	pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);

	autorun_devices(part);
}

#endif /* !MODULE */

static __exit void md_exit(void)
{
	struct mddev *mddev;
	int delay = 1;

	unregister_blkdev(MD_MAJOR, "md");
	unregister_blkdev(mdp_major, "mdp");
	unregister_reboot_notifier(&md_notifier);
	unregister_sysctl_table(raid_table_header);

	/* We cannot unload the modules while some process is
	 * waiting for us in select() or poll() - wake them up
	 */
	md_unloading = 1;
	while (waitqueue_active(&md_event_waiters)) {
		/* not safe to leave yet */
		wake_up(&md_event_waiters);
		msleep(delay);
		delay += delay;
	}
	remove_proc_entry("mdstat", NULL);

	spin_lock(&all_mddevs_lock);
	list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
		if (!mddev_get(mddev))
			continue;
		spin_unlock(&all_mddevs_lock);
		export_array(mddev);
		mddev->ctime = 0;
		mddev->hold_active = 0;
		/*
		 * As the mddev is now fully clear, mddev_put will schedule
		 * the mddev for destruction by a workqueue, and the
		 * destroy_workqueue() below will wait for that to complete.
		 */
		spin_lock(&all_mddevs_lock);
		mddev_put_locked(mddev);
	}
	spin_unlock(&all_mddevs_lock);

	destroy_workqueue(md_misc_wq);
	destroy_workqueue(md_bitmap_wq);
	destroy_workqueue(md_wq);
}

subsys_initcall(md_init);
module_exit(md_exit)

static int get_ro(char *buffer, const struct kernel_param *kp)
{
	return sprintf(buffer, "%d\n", start_readonly);
}

static int set_ro(const char *val, const struct kernel_param *kp)
{
	return kstrtouint(val, 10, (unsigned int *)&start_readonly);
}

module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
module_param(create_on_open, bool, S_IRUSR|S_IWUSR);
module_param(legacy_async_del_gendisk, bool, 0600);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MD RAID framework");
MODULE_ALIAS("md");
MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
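
/*
 * Usage note (assuming the usual module name md_mod; adjust if the build
 * differs): the parameters above are visible under
 * /sys/module/md_mod/parameters/.  For example, setting start_ro to 1, either
 * at boot with "md_mod.start_ro=1" or by writing to the sysfs file, makes
 * newly assembled arrays come up in auto-read-only mode until the first
 * write arrives.
 */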