// SPDX-License-Identifier: GPL-2.0-or-later
/*
   md.c : Multiple Devices driver for Linux
     Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.


   Errors, Warnings, etc.
   Please use:
     pr_crit() for error conditions that risk data loss
     pr_err() for error conditions that are unexpected, like an IO error
         or internal inconsistency
     pr_warn() for error conditions that could have been predicted, like
         adding a device to an array when it has incompatible metadata
     pr_info() for every interesting, very rare event, like an array starting
         or stopping, or resync starting or stopping
     pr_debug() for everything else.

*/

#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/badblocks.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/major.h>
#include <linux/module.h>
#include <linux/reboot.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/raid/detect.h>
#include <linux/slab.h>
#include <linux/percpu-refcount.h>
#include <linux/part_stat.h>

#include "md.h"
#include "md-bitmap.h"
#include "md-cluster.h"

static const char *action_name[NR_SYNC_ACTIONS] = {
	[ACTION_RESYNC]		= "resync",
	[ACTION_RECOVER]	= "recover",
	[ACTION_CHECK]		= "check",
	[ACTION_REPAIR]		= "repair",
	[ACTION_RESHAPE]	= "reshape",
	[ACTION_FROZEN]		= "frozen",
	[ACTION_IDLE]		= "idle",
};

static DEFINE_XARRAY(md_submodule);

static const struct kobj_type md_ktype;

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;

/*
 * This workqueue is used for sync_work to register new sync_thread, and for
 * del_work to remove rdev, and for event_work that is only set by dm-raid.
 *
 * Note that sync_work will grab reconfig_mutex, hence never flush this
 * workqueue with reconfig_mutex grabbed.
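 *
 * For example, a minimal sketch (illustrative only, not taken from an actual
 * call site) of the pattern this rule forbids:
 *
 *	mddev_lock(mddev);		// takes reconfig_mutex
 *	flush_workqueue(md_misc_wq);	// may wait for sync_work, which in
 *					// turn waits for reconfig_mutex
 *	mddev_unlock(mddev);		// never reached: deadlock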
95 */ 96 static struct workqueue_struct *md_misc_wq; 97 struct workqueue_struct *md_bitmap_wq; 98 99 static int remove_and_add_spares(struct mddev *mddev, 100 struct md_rdev *this); 101 static void mddev_detach(struct mddev *mddev); 102 static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); 103 static void md_wakeup_thread_directly(struct md_thread __rcu *thread); 104 105 /* 106 * Default number of read corrections we'll attempt on an rdev 107 * before ejecting it from the array. We divide the read error 108 * count by 2 for every hour elapsed between read errors. 109 */ 110 #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20 111 /* Default safemode delay: 200 msec */ 112 #define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1) 113 /* 114 * Current RAID-1,4,5,6,10 parallel reconstruction 'guaranteed speed limit' 115 * is sysctl_speed_limit_min, 1000 KB/sec by default, so the extra system load 116 * does not show up that much. Increase it if you want to have more guaranteed 117 * speed. Note that the RAID driver will use the maximum bandwidth 118 * sysctl_speed_limit_max, 200 MB/sec by default, if the IO subsystem is idle. 119 * 120 * Background sync IO speed control: 121 * 122 * - below speed min: 123 * no limit; 124 * - above speed min and below speed max: 125 * a) if mddev is idle, then no limit; 126 * b) if mddev is busy handling normal IO, then limit inflight sync IO 127 * to sync_io_depth; 128 * - above speed max: 129 * sync IO can't be issued; 130 * 131 * Following configurations can be changed via /proc/sys/dev/raid/ for system 132 * or /sys/block/mdX/md/ for one array. 133 */ 134 static int sysctl_speed_limit_min = 1000; 135 static int sysctl_speed_limit_max = 200000; 136 static int sysctl_sync_io_depth = 32; 137 138 static int speed_min(struct mddev *mddev) 139 { 140 return mddev->sync_speed_min ? 141 mddev->sync_speed_min : sysctl_speed_limit_min; 142 } 143 144 static int speed_max(struct mddev *mddev) 145 { 146 return mddev->sync_speed_max ? 147 mddev->sync_speed_max : sysctl_speed_limit_max; 148 } 149 150 static int sync_io_depth(struct mddev *mddev) 151 { 152 return mddev->sync_io_depth ? 
		mddev->sync_io_depth : sysctl_sync_io_depth;
}

static void rdev_uninit_serial(struct md_rdev *rdev)
{
	if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
		return;

	kvfree(rdev->serial);
	rdev->serial = NULL;
}

static void rdevs_uninit_serial(struct mddev *mddev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev)
		rdev_uninit_serial(rdev);
}

static int rdev_init_serial(struct md_rdev *rdev)
{
	/* serial_nums equals BARRIER_BUCKETS_NR */
	int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t))));
	struct serial_in_rdev *serial = NULL;

	if (test_bit(CollisionCheck, &rdev->flags))
		return 0;

	serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums,
			  GFP_KERNEL);
	if (!serial)
		return -ENOMEM;

	for (i = 0; i < serial_nums; i++) {
		struct serial_in_rdev *serial_tmp = &serial[i];

		spin_lock_init(&serial_tmp->serial_lock);
		serial_tmp->serial_rb = RB_ROOT_CACHED;
		init_waitqueue_head(&serial_tmp->serial_io_wait);
	}

	rdev->serial = serial;
	set_bit(CollisionCheck, &rdev->flags);

	return 0;
}

static int rdevs_init_serial(struct mddev *mddev)
{
	struct md_rdev *rdev;
	int ret = 0;

	rdev_for_each(rdev, mddev) {
		ret = rdev_init_serial(rdev);
		if (ret)
			break;
	}

	/* Free all resources if the pool does not exist */
	if (ret && !mddev->serial_info_pool)
		rdevs_uninit_serial(mddev);

	return ret;
}

/*
 * rdev needs to enable the serial machinery if it meets the conditions:
 * 1. it is a multi-queue device flagged with writemostly.
 * 2. the write-behind mode is enabled.
 */
static int rdev_need_serial(struct md_rdev *rdev)
{
	return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 &&
		rdev->bdev->bd_disk->queue->nr_hw_queues != 1 &&
		test_bit(WriteMostly, &rdev->flags));
}

/*
 * Init resource for rdev(s), then create serial_info_pool if:
 * 1. rdev is the first device that returns true from rdev_need_serial.
 * 2. rdev is NULL, meaning we want to enable serialization for all rdevs.
 */
void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
{
	int ret = 0;

	if (rdev && !rdev_need_serial(rdev) &&
	    !test_bit(CollisionCheck, &rdev->flags))
		return;

	if (!rdev)
		ret = rdevs_init_serial(mddev);
	else
		ret = rdev_init_serial(rdev);
	if (ret)
		return;

	if (mddev->serial_info_pool == NULL) {
		/*
		 * already in memalloc noio context by
		 * mddev_suspend()
		 */
		mddev->serial_info_pool =
			mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
						sizeof(struct serial_info));
		if (!mddev->serial_info_pool) {
			rdevs_uninit_serial(mddev);
			pr_err("can't alloc memory pool for serialization\n");
		}
	}
}

/*
 * Free resource from rdev(s), and destroy serial_info_pool under conditions:
 * 1. rdev is the last device flagged with CollisionCheck.
 * 2. when bitmap is destroyed while policy is not enabled.
 * 3. for disable policy, the pool is destroyed only when no rdev needs it.
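 *
 * A typical caller (illustrative sketch, not a quote of a real call site)
 * quiesces the array around the teardown:
 *
 *	mddev_suspend(mddev, false);
 *	mddev_destroy_serial_pool(mddev, rdev);
 *	mddev_resume(mddev);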
271 */ 272 void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) 273 { 274 if (rdev && !test_bit(CollisionCheck, &rdev->flags)) 275 return; 276 277 if (mddev->serial_info_pool) { 278 struct md_rdev *temp; 279 int num = 0; /* used to track if other rdevs need the pool */ 280 281 rdev_for_each(temp, mddev) { 282 if (!rdev) { 283 if (!mddev->serialize_policy || 284 !rdev_need_serial(temp)) 285 rdev_uninit_serial(temp); 286 else 287 num++; 288 } else if (temp != rdev && 289 test_bit(CollisionCheck, &temp->flags)) 290 num++; 291 } 292 293 if (rdev) 294 rdev_uninit_serial(rdev); 295 296 if (num) 297 pr_info("The mempool could be used by other devices\n"); 298 else { 299 mempool_destroy(mddev->serial_info_pool); 300 mddev->serial_info_pool = NULL; 301 } 302 } 303 } 304 305 static struct ctl_table_header *raid_table_header; 306 307 static const struct ctl_table raid_table[] = { 308 { 309 .procname = "speed_limit_min", 310 .data = &sysctl_speed_limit_min, 311 .maxlen = sizeof(int), 312 .mode = 0644, 313 .proc_handler = proc_dointvec, 314 }, 315 { 316 .procname = "speed_limit_max", 317 .data = &sysctl_speed_limit_max, 318 .maxlen = sizeof(int), 319 .mode = 0644, 320 .proc_handler = proc_dointvec, 321 }, 322 { 323 .procname = "sync_io_depth", 324 .data = &sysctl_sync_io_depth, 325 .maxlen = sizeof(int), 326 .mode = 0644, 327 .proc_handler = proc_dointvec, 328 }, 329 }; 330 331 static int start_readonly; 332 333 /* 334 * The original mechanism for creating an md device is to create 335 * a device node in /dev and to open it. This causes races with device-close. 336 * The preferred method is to write to the "new_array" module parameter. 337 * This can avoid races. 338 * Setting create_on_open to false disables the original mechanism 339 * so all the races disappear. 340 */ 341 static bool create_on_open = true; 342 343 /* 344 * We have a system wide 'event count' that is incremented 345 * on any 'interesting' event, and readers of /proc/mdstat 346 * can use 'poll' or 'select' to find out when the event 347 * count increases. 348 * 349 * Events are: 350 * start array, stop array, error, add device, remove device, 351 * start build, activate spare 352 */ 353 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); 354 static atomic_t md_event_count; 355 void md_new_event(void) 356 { 357 atomic_inc(&md_event_count); 358 wake_up(&md_event_waiters); 359 } 360 EXPORT_SYMBOL_GPL(md_new_event); 361 362 /* 363 * Enables to iterate over all existing md arrays 364 * all_mddevs_lock protects this list. 365 */ 366 static LIST_HEAD(all_mddevs); 367 static DEFINE_SPINLOCK(all_mddevs_lock); 368 369 static bool is_md_suspended(struct mddev *mddev) 370 { 371 return percpu_ref_is_dying(&mddev->active_io); 372 } 373 /* Rather than calling directly into the personality make_request function, 374 * IO requests come here first so that we can check if the device is 375 * being suspended pending a reconfiguration. 376 * We hold a refcount over the call to ->make_request. By the time that 377 * call has finished, the bio has been linked into some internal structure 378 * and so is visible to ->quiesce(), so we don't need the refcount any more. 
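 *
 * Worked example of the suspend window check (illustrative values): with
 * suspend_lo = 0 and suspend_hi = 1024, a WRITE starting at sector 2048 is
 * passed straight to ->make_request, while a WRITE touching sector 512 is
 * parked on sb_wait until the array is resumed or the window is cleared.
 * READs are never held back by the suspend window, only by a fully
 * suspended array.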
 */
static bool is_suspended(struct mddev *mddev, struct bio *bio)
{
	if (is_md_suspended(mddev))
		return true;
	if (bio_data_dir(bio) != WRITE)
		return false;
	if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi))
		return false;
	if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi))
		return false;
	if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo))
		return false;
	return true;
}

bool md_handle_request(struct mddev *mddev, struct bio *bio)
{
check_suspended:
	if (is_suspended(mddev, bio)) {
		DEFINE_WAIT(__wait);
		/* Bail out if REQ_NOWAIT is set for the bio */
		if (bio->bi_opf & REQ_NOWAIT) {
			bio_wouldblock_error(bio);
			return true;
		}
		for (;;) {
			prepare_to_wait(&mddev->sb_wait, &__wait,
					TASK_UNINTERRUPTIBLE);
			if (!is_suspended(mddev, bio))
				break;
			schedule();
		}
		finish_wait(&mddev->sb_wait, &__wait);
	}
	if (!percpu_ref_tryget_live(&mddev->active_io))
		goto check_suspended;

	if (!mddev->pers->make_request(mddev, bio)) {
		percpu_ref_put(&mddev->active_io);
		if (!mddev->gendisk && mddev->pers->prepare_suspend)
			return false;
		goto check_suspended;
	}

	percpu_ref_put(&mddev->active_io);
	return true;
}
EXPORT_SYMBOL(md_handle_request);

static void md_submit_bio(struct bio *bio)
{
	const int rw = bio_data_dir(bio);
	struct mddev *mddev = bio->bi_bdev->bd_disk->private_data;

	if (mddev == NULL || mddev->pers == NULL) {
		bio_io_error(bio);
		return;
	}

	if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
		bio_io_error(bio);
		return;
	}

	bio = bio_split_to_limits(bio);
	if (!bio)
		return;

	if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) {
		if (bio_sectors(bio) != 0)
			bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
		return;
	}

	/* bio could be mergeable after passing to underlayer */
	bio->bi_opf &= ~REQ_NOMERGE;

	md_handle_request(mddev, bio);
}

/*
 * Make sure no new requests are submitted to the device, and any requests that
 * have been submitted are completely handled.
 */
int mddev_suspend(struct mddev *mddev, bool interruptible)
{
	int err = 0;

	/*
	 * Holding reconfig_mutex while waiting for normal IO would deadlock,
	 * because other contexts can't update the super_block, and normal IO
	 * can rely on the super_block being updated.
	 */
	lockdep_assert_not_held(&mddev->reconfig_mutex);

	if (interruptible)
		err = mutex_lock_interruptible(&mddev->suspend_mutex);
	else
		mutex_lock(&mddev->suspend_mutex);
	if (err)
		return err;

	if (mddev->suspended) {
		WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
		mutex_unlock(&mddev->suspend_mutex);
		return 0;
	}

	percpu_ref_kill(&mddev->active_io);
	if (interruptible)
		err = wait_event_interruptible(mddev->sb_wait,
				percpu_ref_is_zero(&mddev->active_io));
	else
		wait_event(mddev->sb_wait,
				percpu_ref_is_zero(&mddev->active_io));
	if (err) {
		percpu_ref_resurrect(&mddev->active_io);
		mutex_unlock(&mddev->suspend_mutex);
		return err;
	}

	/*
	 * For raid456, io might be waiting for reshape to make progress,
	 * allow new reshape to start while waiting for io to be done to
	 * prevent deadlock.
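	 *
	 * The increment below also implements nesting: only the first caller
	 * actually drains IO and only the matching final resume restarts it,
	 * e.g. (illustrative):
	 *
	 *	mddev_suspend(mddev, false);	// 0 -> 1, active_io drained
	 *	mddev_suspend(mddev, false);	// 1 -> 2, returns early above
	 *	mddev_resume(mddev);		// 2 -> 1, still suspended
	 *	mddev_resume(mddev);		// 1 -> 0, active_io resurrected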
	 */
	WRITE_ONCE(mddev->suspended, mddev->suspended + 1);

	/* restrict memory reclaim I/O while the raid array is suspended */
	mddev->noio_flag = memalloc_noio_save();

	mutex_unlock(&mddev->suspend_mutex);
	return 0;
}
EXPORT_SYMBOL_GPL(mddev_suspend);

static void __mddev_resume(struct mddev *mddev, bool recovery_needed)
{
	lockdep_assert_not_held(&mddev->reconfig_mutex);

	mutex_lock(&mddev->suspend_mutex);
	WRITE_ONCE(mddev->suspended, mddev->suspended - 1);
	if (mddev->suspended) {
		mutex_unlock(&mddev->suspend_mutex);
		return;
	}

	/* entered the memalloc scope from mddev_suspend() */
	memalloc_noio_restore(mddev->noio_flag);

	percpu_ref_resurrect(&mddev->active_io);
	wake_up(&mddev->sb_wait);

	if (recovery_needed)
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */

	mutex_unlock(&mddev->suspend_mutex);
}

void mddev_resume(struct mddev *mddev)
{
	return __mddev_resume(mddev, true);
}
EXPORT_SYMBOL_GPL(mddev_resume);

/* sync bdev before setting device to readonly or stopping raid */
static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_num)
{
	mutex_lock(&mddev->open_mutex);
	if (mddev->pers && atomic_read(&mddev->openers) > opener_num) {
		mutex_unlock(&mddev->open_mutex);
		return -EBUSY;
	}
	if (test_and_set_bit(MD_CLOSING, &mddev->flags)) {
		mutex_unlock(&mddev->open_mutex);
		return -EBUSY;
	}
	mutex_unlock(&mddev->open_mutex);

	sync_blockdev(mddev->gendisk->part0);
	return 0;
}

/*
 * The only difference from bio_chain_endio() is that the current
 * bi_status of bio does not affect the bi_status of parent.
 */
static void md_end_flush(struct bio *bio)
{
	struct bio *parent = bio->bi_private;

	/*
	 * If any flush io error before the power failure,
	 * disk data may be lost.
	 */
	if (bio->bi_status)
		pr_err("md: %pg flush io error %d\n", bio->bi_bdev,
		       blk_status_to_errno(bio->bi_status));

	bio_put(bio);
	bio_endio(parent);
}

bool md_flush_request(struct mddev *mddev, struct bio *bio)
{
	struct md_rdev *rdev;
	struct bio *new;

	/*
	 * md_flush_request() should be called under md_handle_request() and
	 * 'active_io' is already grabbed. Hence it's safe to get rdev directly
	 * without rcu protection.
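	 *
	 * A personality's ->make_request typically uses it like this
	 * (illustrative sketch; raidX_make_request is a hypothetical name):
	 *
	 *	static bool raidX_make_request(struct mddev *mddev, struct bio *bio)
	 *	{
	 *		if (unlikely(bio->bi_opf & REQ_PREFLUSH)
	 *		    && md_flush_request(mddev, bio))
	 *			return true;	// empty flush already completed
	 *		...			// handle the data portion;
	 *					// PREFLUSH has been stripped below
	 *	}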
595 */ 596 WARN_ON(percpu_ref_is_zero(&mddev->active_io)); 597 598 rdev_for_each(rdev, mddev) { 599 if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) 600 continue; 601 602 new = bio_alloc_bioset(rdev->bdev, 0, 603 REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO, 604 &mddev->bio_set); 605 new->bi_private = bio; 606 new->bi_end_io = md_end_flush; 607 bio_inc_remaining(bio); 608 submit_bio(new); 609 } 610 611 if (bio_sectors(bio) == 0) { 612 bio_endio(bio); 613 return true; 614 } 615 616 bio->bi_opf &= ~REQ_PREFLUSH; 617 return false; 618 } 619 EXPORT_SYMBOL(md_flush_request); 620 621 static inline struct mddev *mddev_get(struct mddev *mddev) 622 { 623 lockdep_assert_held(&all_mddevs_lock); 624 625 if (test_bit(MD_DELETED, &mddev->flags)) 626 return NULL; 627 atomic_inc(&mddev->active); 628 return mddev; 629 } 630 631 static void mddev_delayed_delete(struct work_struct *ws); 632 633 static void __mddev_put(struct mddev *mddev) 634 { 635 if (mddev->raid_disks || !list_empty(&mddev->disks) || 636 mddev->ctime || mddev->hold_active) 637 return; 638 639 /* 640 * Call queue_work inside the spinlock so that flush_workqueue() after 641 * mddev_find will succeed in waiting for the work to be done. 642 */ 643 queue_work(md_misc_wq, &mddev->del_work); 644 } 645 646 static void mddev_put_locked(struct mddev *mddev) 647 { 648 if (atomic_dec_and_test(&mddev->active)) 649 __mddev_put(mddev); 650 } 651 652 void mddev_put(struct mddev *mddev) 653 { 654 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 655 return; 656 657 __mddev_put(mddev); 658 spin_unlock(&all_mddevs_lock); 659 } 660 661 static void md_safemode_timeout(struct timer_list *t); 662 static void md_start_sync(struct work_struct *ws); 663 664 static void active_io_release(struct percpu_ref *ref) 665 { 666 struct mddev *mddev = container_of(ref, struct mddev, active_io); 667 668 wake_up(&mddev->sb_wait); 669 } 670 671 static void no_op(struct percpu_ref *r) {} 672 673 int mddev_init(struct mddev *mddev) 674 { 675 676 if (percpu_ref_init(&mddev->active_io, active_io_release, 677 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) 678 return -ENOMEM; 679 680 if (percpu_ref_init(&mddev->writes_pending, no_op, 681 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { 682 percpu_ref_exit(&mddev->active_io); 683 return -ENOMEM; 684 } 685 686 /* We want to start with the refcount at zero */ 687 percpu_ref_put(&mddev->writes_pending); 688 689 mutex_init(&mddev->open_mutex); 690 mutex_init(&mddev->reconfig_mutex); 691 mutex_init(&mddev->suspend_mutex); 692 mutex_init(&mddev->bitmap_info.mutex); 693 INIT_LIST_HEAD(&mddev->disks); 694 INIT_LIST_HEAD(&mddev->all_mddevs); 695 INIT_LIST_HEAD(&mddev->deleting); 696 timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); 697 atomic_set(&mddev->active, 1); 698 atomic_set(&mddev->openers, 0); 699 atomic_set(&mddev->sync_seq, 0); 700 spin_lock_init(&mddev->lock); 701 init_waitqueue_head(&mddev->sb_wait); 702 init_waitqueue_head(&mddev->recovery_wait); 703 mddev->reshape_position = MaxSector; 704 mddev->reshape_backwards = 0; 705 mddev->last_sync_action = ACTION_IDLE; 706 mddev->resync_min = 0; 707 mddev->resync_max = MaxSector; 708 mddev->level = LEVEL_NONE; 709 mddev_set_bitmap_ops(mddev); 710 711 INIT_WORK(&mddev->sync_work, md_start_sync); 712 INIT_WORK(&mddev->del_work, mddev_delayed_delete); 713 714 return 0; 715 } 716 EXPORT_SYMBOL_GPL(mddev_init); 717 718 void mddev_destroy(struct mddev *mddev) 719 { 720 percpu_ref_exit(&mddev->active_io); 721 percpu_ref_exit(&mddev->writes_pending); 722 } 723 
EXPORT_SYMBOL_GPL(mddev_destroy);

static struct mddev *mddev_find_locked(dev_t unit)
{
	struct mddev *mddev;

	list_for_each_entry(mddev, &all_mddevs, all_mddevs)
		if (mddev->unit == unit)
			return mddev;

	return NULL;
}

/* find an unused unit number */
static dev_t mddev_alloc_unit(void)
{
	static int next_minor = 512;
	int start = next_minor;
	bool is_free = 0;
	dev_t dev = 0;

	while (!is_free) {
		dev = MKDEV(MD_MAJOR, next_minor);
		next_minor++;
		if (next_minor > MINORMASK)
			next_minor = 0;
		if (next_minor == start)
			return 0;		/* Oh dear, all in use. */
		is_free = !mddev_find_locked(dev);
	}

	return dev;
}

static struct mddev *mddev_alloc(dev_t unit)
{
	struct mddev *new;
	int error;

	if (unit && MAJOR(unit) != MD_MAJOR)
		unit &= ~((1 << MdpMinorShift) - 1);

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return ERR_PTR(-ENOMEM);

	error = mddev_init(new);
	if (error)
		goto out_free_new;

	spin_lock(&all_mddevs_lock);
	if (unit) {
		error = -EEXIST;
		if (mddev_find_locked(unit))
			goto out_destroy_new;
		new->unit = unit;
		if (MAJOR(unit) == MD_MAJOR)
			new->md_minor = MINOR(unit);
		else
			new->md_minor = MINOR(unit) >> MdpMinorShift;
		new->hold_active = UNTIL_IOCTL;
	} else {
		error = -ENODEV;
		new->unit = mddev_alloc_unit();
		if (!new->unit)
			goto out_destroy_new;
		new->md_minor = MINOR(new->unit);
		new->hold_active = UNTIL_STOP;
	}

	list_add(&new->all_mddevs, &all_mddevs);
	spin_unlock(&all_mddevs_lock);
	return new;

out_destroy_new:
	spin_unlock(&all_mddevs_lock);
	mddev_destroy(new);
out_free_new:
	kfree(new);
	return ERR_PTR(error);
}

static void mddev_free(struct mddev *mddev)
{
	spin_lock(&all_mddevs_lock);
	list_del(&mddev->all_mddevs);
	spin_unlock(&all_mddevs_lock);

	mddev_destroy(mddev);
	kfree(mddev);
}

static const struct attribute_group md_redundancy_group;

void mddev_unlock(struct mddev *mddev)
{
	struct md_rdev *rdev;
	struct md_rdev *tmp;
	LIST_HEAD(delete);

	if (!list_empty(&mddev->deleting))
		list_splice_init(&mddev->deleting, &delete);

	if (mddev->to_remove) {
		/* These cannot be removed under reconfig_mutex as
		 * an access to the files will try to take reconfig_mutex
		 * while holding the file unremovable, which leads to
		 * a deadlock.
		 * So set sysfs_active while the remove is happening,
		 * and anything else which might set ->to_remove or may
		 * otherwise change the sysfs namespace will fail with
		 * -EBUSY if sysfs_active is still set.
		 * We set sysfs_active under reconfig_mutex and elsewhere
		 * test it under the same mutex to ensure its correct value
		 * is seen.
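		 *
		 * Concretely (illustrative scenario): a sysfs ->store()
		 * handler that calls mddev_lock() would block on
		 * reconfig_mutex, while this path, still holding
		 * reconfig_mutex, would block in sysfs_remove_group()
		 * waiting for that handler to return; so the group removal
		 * is done after dropping the mutex, guarded by sysfs_active.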
838 */ 839 const struct attribute_group *to_remove = mddev->to_remove; 840 mddev->to_remove = NULL; 841 mddev->sysfs_active = 1; 842 mutex_unlock(&mddev->reconfig_mutex); 843 844 if (mddev->kobj.sd) { 845 if (to_remove != &md_redundancy_group) 846 sysfs_remove_group(&mddev->kobj, to_remove); 847 if (mddev->pers == NULL || 848 mddev->pers->sync_request == NULL) { 849 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 850 if (mddev->sysfs_action) 851 sysfs_put(mddev->sysfs_action); 852 if (mddev->sysfs_completed) 853 sysfs_put(mddev->sysfs_completed); 854 if (mddev->sysfs_degraded) 855 sysfs_put(mddev->sysfs_degraded); 856 mddev->sysfs_action = NULL; 857 mddev->sysfs_completed = NULL; 858 mddev->sysfs_degraded = NULL; 859 } 860 } 861 mddev->sysfs_active = 0; 862 } else 863 mutex_unlock(&mddev->reconfig_mutex); 864 865 md_wakeup_thread(mddev->thread); 866 wake_up(&mddev->sb_wait); 867 868 list_for_each_entry_safe(rdev, tmp, &delete, same_set) { 869 list_del_init(&rdev->same_set); 870 kobject_del(&rdev->kobj); 871 export_rdev(rdev, mddev); 872 } 873 874 /* Call del_gendisk after release reconfig_mutex to avoid 875 * deadlock (e.g. call del_gendisk under the lock and an 876 * access to sysfs files waits the lock) 877 * And MD_DELETED is only used for md raid which is set in 878 * do_md_stop. dm raid only uses md_stop to stop. So dm raid 879 * doesn't need to check MD_DELETED when getting reconfig lock 880 */ 881 if (test_bit(MD_DELETED, &mddev->flags)) 882 del_gendisk(mddev->gendisk); 883 } 884 EXPORT_SYMBOL_GPL(mddev_unlock); 885 886 struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr) 887 { 888 struct md_rdev *rdev; 889 890 rdev_for_each_rcu(rdev, mddev) 891 if (rdev->desc_nr == nr) 892 return rdev; 893 894 return NULL; 895 } 896 EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu); 897 898 static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) 899 { 900 struct md_rdev *rdev; 901 902 rdev_for_each(rdev, mddev) 903 if (rdev->bdev->bd_dev == dev) 904 return rdev; 905 906 return NULL; 907 } 908 909 struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev) 910 { 911 struct md_rdev *rdev; 912 913 rdev_for_each_rcu(rdev, mddev) 914 if (rdev->bdev->bd_dev == dev) 915 return rdev; 916 917 return NULL; 918 } 919 EXPORT_SYMBOL_GPL(md_find_rdev_rcu); 920 921 static struct md_personality *get_pers(int level, char *clevel) 922 { 923 struct md_personality *ret = NULL; 924 struct md_submodule_head *head; 925 unsigned long i; 926 927 xa_lock(&md_submodule); 928 xa_for_each(&md_submodule, i, head) { 929 if (head->type != MD_PERSONALITY) 930 continue; 931 if ((level != LEVEL_NONE && head->id == level) || 932 !strcmp(head->name, clevel)) { 933 if (try_module_get(head->owner)) 934 ret = (void *)head; 935 break; 936 } 937 } 938 xa_unlock(&md_submodule); 939 940 if (!ret) { 941 if (level != LEVEL_NONE) 942 pr_warn("md: personality for level %d is not loaded!\n", 943 level); 944 else 945 pr_warn("md: personality for level %s is not loaded!\n", 946 clevel); 947 } 948 949 return ret; 950 } 951 952 static void put_pers(struct md_personality *pers) 953 { 954 module_put(pers->head.owner); 955 } 956 957 /* return the offset of the super block in 512byte sectors */ 958 static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) 959 { 960 return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev)); 961 } 962 963 static int alloc_disk_sb(struct md_rdev *rdev) 964 { 965 rdev->sb_page = alloc_page(GFP_KERNEL); 966 if (!rdev->sb_page) 967 return -ENOMEM; 968 return 0; 969 } 970 971 void md_rdev_clear(struct 
md_rdev *rdev) 972 { 973 if (rdev->sb_page) { 974 put_page(rdev->sb_page); 975 rdev->sb_loaded = 0; 976 rdev->sb_page = NULL; 977 rdev->sb_start = 0; 978 rdev->sectors = 0; 979 } 980 if (rdev->bb_page) { 981 put_page(rdev->bb_page); 982 rdev->bb_page = NULL; 983 } 984 badblocks_exit(&rdev->badblocks); 985 } 986 EXPORT_SYMBOL_GPL(md_rdev_clear); 987 988 static void super_written(struct bio *bio) 989 { 990 struct md_rdev *rdev = bio->bi_private; 991 struct mddev *mddev = rdev->mddev; 992 993 if (bio->bi_status) { 994 pr_err("md: %s gets error=%d\n", __func__, 995 blk_status_to_errno(bio->bi_status)); 996 md_error(mddev, rdev); 997 if (!test_bit(Faulty, &rdev->flags) 998 && (bio->bi_opf & MD_FAILFAST)) { 999 set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags); 1000 set_bit(LastDev, &rdev->flags); 1001 } 1002 } else 1003 clear_bit(LastDev, &rdev->flags); 1004 1005 bio_put(bio); 1006 1007 rdev_dec_pending(rdev, mddev); 1008 1009 if (atomic_dec_and_test(&mddev->pending_writes)) 1010 wake_up(&mddev->sb_wait); 1011 } 1012 1013 void md_super_write(struct mddev *mddev, struct md_rdev *rdev, 1014 sector_t sector, int size, struct page *page) 1015 { 1016 /* write first size bytes of page to sector of rdev 1017 * Increment mddev->pending_writes before returning 1018 * and decrement it on completion, waking up sb_wait 1019 * if zero is reached. 1020 * If an error occurred, call md_error 1021 */ 1022 struct bio *bio; 1023 1024 if (!page) 1025 return; 1026 1027 if (test_bit(Faulty, &rdev->flags)) 1028 return; 1029 1030 bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev, 1031 1, 1032 REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_META 1033 | REQ_PREFLUSH | REQ_FUA, 1034 GFP_NOIO, &mddev->sync_set); 1035 1036 atomic_inc(&rdev->nr_pending); 1037 1038 bio->bi_iter.bi_sector = sector; 1039 __bio_add_page(bio, page, size, 0); 1040 bio->bi_private = rdev; 1041 bio->bi_end_io = super_written; 1042 1043 if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) && 1044 test_bit(FailFast, &rdev->flags) && 1045 !test_bit(LastDev, &rdev->flags)) 1046 bio->bi_opf |= MD_FAILFAST; 1047 1048 atomic_inc(&mddev->pending_writes); 1049 submit_bio(bio); 1050 } 1051 1052 int md_super_wait(struct mddev *mddev) 1053 { 1054 /* wait for all superblock writes that were scheduled to complete */ 1055 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); 1056 if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags)) 1057 return -EAGAIN; 1058 return 0; 1059 } 1060 1061 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, 1062 struct page *page, blk_opf_t opf, bool metadata_op) 1063 { 1064 struct bio bio; 1065 struct bio_vec bvec; 1066 1067 if (metadata_op && rdev->meta_bdev) 1068 bio_init(&bio, rdev->meta_bdev, &bvec, 1, opf); 1069 else 1070 bio_init(&bio, rdev->bdev, &bvec, 1, opf); 1071 1072 if (metadata_op) 1073 bio.bi_iter.bi_sector = sector + rdev->sb_start; 1074 else if (rdev->mddev->reshape_position != MaxSector && 1075 (rdev->mddev->reshape_backwards == 1076 (sector >= rdev->mddev->reshape_position))) 1077 bio.bi_iter.bi_sector = sector + rdev->new_data_offset; 1078 else 1079 bio.bi_iter.bi_sector = sector + rdev->data_offset; 1080 __bio_add_page(&bio, page, size, 0); 1081 1082 submit_bio_wait(&bio); 1083 1084 return !bio.bi_status; 1085 } 1086 EXPORT_SYMBOL_GPL(sync_page_io); 1087 1088 static int read_disk_sb(struct md_rdev *rdev, int size) 1089 { 1090 if (rdev->sb_loaded) 1091 return 0; 1092 1093 if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true)) 1094 goto fail; 1095 
rdev->sb_loaded = 1; 1096 return 0; 1097 1098 fail: 1099 pr_err("md: disabled device %pg, could not read superblock.\n", 1100 rdev->bdev); 1101 return -EINVAL; 1102 } 1103 1104 static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 1105 { 1106 return sb1->set_uuid0 == sb2->set_uuid0 && 1107 sb1->set_uuid1 == sb2->set_uuid1 && 1108 sb1->set_uuid2 == sb2->set_uuid2 && 1109 sb1->set_uuid3 == sb2->set_uuid3; 1110 } 1111 1112 static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 1113 { 1114 int ret; 1115 mdp_super_t *tmp1, *tmp2; 1116 1117 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 1118 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 1119 1120 if (!tmp1 || !tmp2) { 1121 ret = 0; 1122 goto abort; 1123 } 1124 1125 *tmp1 = *sb1; 1126 *tmp2 = *sb2; 1127 1128 /* 1129 * nr_disks is not constant 1130 */ 1131 tmp1->nr_disks = 0; 1132 tmp2->nr_disks = 0; 1133 1134 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); 1135 abort: 1136 kfree(tmp1); 1137 kfree(tmp2); 1138 return ret; 1139 } 1140 1141 static u32 md_csum_fold(u32 csum) 1142 { 1143 csum = (csum & 0xffff) + (csum >> 16); 1144 return (csum & 0xffff) + (csum >> 16); 1145 } 1146 1147 static unsigned int calc_sb_csum(mdp_super_t *sb) 1148 { 1149 u64 newcsum = 0; 1150 u32 *sb32 = (u32*)sb; 1151 int i; 1152 unsigned int disk_csum, csum; 1153 1154 disk_csum = sb->sb_csum; 1155 sb->sb_csum = 0; 1156 1157 for (i = 0; i < MD_SB_BYTES/4 ; i++) 1158 newcsum += sb32[i]; 1159 csum = (newcsum & 0xffffffff) + (newcsum>>32); 1160 1161 #ifdef CONFIG_ALPHA 1162 /* This used to use csum_partial, which was wrong for several 1163 * reasons including that different results are returned on 1164 * different architectures. It isn't critical that we get exactly 1165 * the same return value as before (we always csum_fold before 1166 * testing, and that removes any differences). However as we 1167 * know that csum_partial always returned a 16bit value on 1168 * alphas, do a fold to maximise conformity to previous behaviour. 1169 */ 1170 sb->sb_csum = md_csum_fold(disk_csum); 1171 #else 1172 sb->sb_csum = disk_csum; 1173 #endif 1174 return csum; 1175 } 1176 1177 /* 1178 * Handle superblock details. 1179 * We want to be able to handle multiple superblock formats 1180 * so we have a common interface to them all, and an array of 1181 * different handlers. 1182 * We rely on user-space to write the initial superblock, and support 1183 * reading and updating of superblocks. 1184 * Interface methods are: 1185 * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) 1186 * loads and validates a superblock on dev. 1187 * if refdev != NULL, compare superblocks on both devices 1188 * Return: 1189 * 0 - dev has a superblock that is compatible with refdev 1190 * 1 - dev has a superblock that is compatible and newer than refdev 1191 * so dev should be used as the refdev in future 1192 * -EINVAL superblock incompatible or invalid 1193 * -othererror e.g. -EIO 1194 * 1195 * int validate_super(struct mddev *mddev, struct md_rdev *dev) 1196 * Verify that dev is acceptable into mddev. 1197 * The first time, mddev->raid_disks will be 0, and data from 1198 * dev should be merged in. Subsequent calls check that dev 1199 * is new enough. Return 0 or -EINVAL 1200 * 1201 * void sync_super(struct mddev *mddev, struct md_rdev *dev) 1202 * Update the superblock for rdev with data in mddev 1203 * This does not write to disc. 
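 *
 * A caller-side sketch (illustrative only; error handling trimmed and names
 * illustrative) of how these methods are dispatched through the handler
 * table defined below:
 *
 *	struct super_type *st = &super_types[mddev->major_version];
 *
 *	err = st->load_super(rdev, refdev, mddev->minor_version);
 *	if (err >= 0)
 *		err = st->validate_super(mddev, freshest, rdev);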
1204 * 1205 */ 1206 1207 struct super_type { 1208 char *name; 1209 struct module *owner; 1210 int (*load_super)(struct md_rdev *rdev, 1211 struct md_rdev *refdev, 1212 int minor_version); 1213 int (*validate_super)(struct mddev *mddev, 1214 struct md_rdev *freshest, 1215 struct md_rdev *rdev); 1216 void (*sync_super)(struct mddev *mddev, 1217 struct md_rdev *rdev); 1218 unsigned long long (*rdev_size_change)(struct md_rdev *rdev, 1219 sector_t num_sectors); 1220 int (*allow_new_offset)(struct md_rdev *rdev, 1221 unsigned long long new_offset); 1222 }; 1223 1224 /* 1225 * Check that the given mddev has no bitmap. 1226 * 1227 * This function is called from the run method of all personalities that do not 1228 * support bitmaps. It prints an error message and returns non-zero if mddev 1229 * has a bitmap. Otherwise, it returns 0. 1230 * 1231 */ 1232 int md_check_no_bitmap(struct mddev *mddev) 1233 { 1234 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) 1235 return 0; 1236 pr_warn("%s: bitmaps are not supported for %s\n", 1237 mdname(mddev), mddev->pers->head.name); 1238 return 1; 1239 } 1240 EXPORT_SYMBOL(md_check_no_bitmap); 1241 1242 /* 1243 * load_super for 0.90.0 1244 */ 1245 static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 1246 { 1247 mdp_super_t *sb; 1248 int ret; 1249 bool spare_disk = true; 1250 1251 /* 1252 * Calculate the position of the superblock (512byte sectors), 1253 * it's at the end of the disk. 1254 * 1255 * It also happens to be a multiple of 4Kb. 1256 */ 1257 rdev->sb_start = calc_dev_sboffset(rdev); 1258 1259 ret = read_disk_sb(rdev, MD_SB_BYTES); 1260 if (ret) 1261 return ret; 1262 1263 ret = -EINVAL; 1264 1265 sb = page_address(rdev->sb_page); 1266 1267 if (sb->md_magic != MD_SB_MAGIC) { 1268 pr_warn("md: invalid raid superblock magic on %pg\n", 1269 rdev->bdev); 1270 goto abort; 1271 } 1272 1273 if (sb->major_version != 0 || 1274 sb->minor_version < 90 || 1275 sb->minor_version > 91) { 1276 pr_warn("Bad version number %d.%d on %pg\n", 1277 sb->major_version, sb->minor_version, rdev->bdev); 1278 goto abort; 1279 } 1280 1281 if (sb->raid_disks <= 0) 1282 goto abort; 1283 1284 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { 1285 pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev); 1286 goto abort; 1287 } 1288 1289 rdev->preferred_minor = sb->md_minor; 1290 rdev->data_offset = 0; 1291 rdev->new_data_offset = 0; 1292 rdev->sb_size = MD_SB_BYTES; 1293 rdev->badblocks.shift = -1; 1294 1295 rdev->desc_nr = sb->this_disk.number; 1296 1297 /* not spare disk */ 1298 if (rdev->desc_nr >= 0 && rdev->desc_nr < MD_SB_DISKS && 1299 sb->disks[rdev->desc_nr].state & ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) 1300 spare_disk = false; 1301 1302 if (!refdev) { 1303 if (!spare_disk) 1304 ret = 1; 1305 else 1306 ret = 0; 1307 } else { 1308 __u64 ev1, ev2; 1309 mdp_super_t *refsb = page_address(refdev->sb_page); 1310 if (!md_uuid_equal(refsb, sb)) { 1311 pr_warn("md: %pg has different UUID to %pg\n", 1312 rdev->bdev, refdev->bdev); 1313 goto abort; 1314 } 1315 if (!md_sb_equal(refsb, sb)) { 1316 pr_warn("md: %pg has same UUID but different superblock to %pg\n", 1317 rdev->bdev, refdev->bdev); 1318 goto abort; 1319 } 1320 ev1 = md_event(sb); 1321 ev2 = md_event(refsb); 1322 1323 if (!spare_disk && ev1 > ev2) 1324 ret = 1; 1325 else 1326 ret = 0; 1327 } 1328 rdev->sectors = rdev->sb_start; 1329 /* Limit to 4TB as metadata cannot record more than that. 
1330 * (not needed for Linear and RAID0 as metadata doesn't 1331 * record this size) 1332 */ 1333 if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1) 1334 rdev->sectors = (sector_t)(2ULL << 32) - 2; 1335 1336 if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) 1337 /* "this cannot possibly happen" ... */ 1338 ret = -EINVAL; 1339 1340 abort: 1341 return ret; 1342 } 1343 1344 static u64 md_bitmap_events_cleared(struct mddev *mddev) 1345 { 1346 struct md_bitmap_stats stats; 1347 int err; 1348 1349 err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 1350 if (err) 1351 return 0; 1352 1353 return stats.events_cleared; 1354 } 1355 1356 /* 1357 * validate_super for 0.90.0 1358 * note: we are not using "freshest" for 0.9 superblock 1359 */ 1360 static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) 1361 { 1362 mdp_disk_t *desc; 1363 mdp_super_t *sb = page_address(rdev->sb_page); 1364 __u64 ev1 = md_event(sb); 1365 1366 rdev->raid_disk = -1; 1367 clear_bit(Faulty, &rdev->flags); 1368 clear_bit(In_sync, &rdev->flags); 1369 clear_bit(Bitmap_sync, &rdev->flags); 1370 clear_bit(WriteMostly, &rdev->flags); 1371 1372 if (mddev->raid_disks == 0) { 1373 mddev->major_version = 0; 1374 mddev->minor_version = sb->minor_version; 1375 mddev->patch_version = sb->patch_version; 1376 mddev->external = 0; 1377 mddev->chunk_sectors = sb->chunk_size >> 9; 1378 mddev->ctime = sb->ctime; 1379 mddev->utime = sb->utime; 1380 mddev->level = sb->level; 1381 mddev->clevel[0] = 0; 1382 mddev->layout = sb->layout; 1383 mddev->raid_disks = sb->raid_disks; 1384 mddev->dev_sectors = ((sector_t)sb->size) * 2; 1385 mddev->events = ev1; 1386 mddev->bitmap_info.offset = 0; 1387 mddev->bitmap_info.space = 0; 1388 /* bitmap can use 60 K after the 4K superblocks */ 1389 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 1390 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 1391 mddev->reshape_backwards = 0; 1392 1393 if (mddev->minor_version >= 91) { 1394 mddev->reshape_position = sb->reshape_position; 1395 mddev->delta_disks = sb->delta_disks; 1396 mddev->new_level = sb->new_level; 1397 mddev->new_layout = sb->new_layout; 1398 mddev->new_chunk_sectors = sb->new_chunk >> 9; 1399 if (mddev->delta_disks < 0) 1400 mddev->reshape_backwards = 1; 1401 } else { 1402 mddev->reshape_position = MaxSector; 1403 mddev->delta_disks = 0; 1404 mddev->new_level = mddev->level; 1405 mddev->new_layout = mddev->layout; 1406 mddev->new_chunk_sectors = mddev->chunk_sectors; 1407 } 1408 if (mddev->level == 0) 1409 mddev->layout = -1; 1410 1411 if (sb->state & (1<<MD_SB_CLEAN)) 1412 mddev->recovery_cp = MaxSector; 1413 else { 1414 if (sb->events_hi == sb->cp_events_hi && 1415 sb->events_lo == sb->cp_events_lo) { 1416 mddev->recovery_cp = sb->recovery_cp; 1417 } else 1418 mddev->recovery_cp = 0; 1419 } 1420 1421 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 1422 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 1423 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 1424 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 1425 1426 mddev->max_disks = MD_SB_DISKS; 1427 1428 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 1429 mddev->bitmap_info.file == NULL) { 1430 mddev->bitmap_info.offset = 1431 mddev->bitmap_info.default_offset; 1432 mddev->bitmap_info.space = 1433 mddev->bitmap_info.default_space; 1434 } 1435 1436 } else if (mddev->pers == NULL) { 1437 /* Insist on good event counter while assembling, except 1438 * for spares (which don't need an event count) */ 1439 ++ev1; 1440 if 
(sb->disks[rdev->desc_nr].state & ( 1441 (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) 1442 if (ev1 < mddev->events) 1443 return -EINVAL; 1444 } else if (mddev->bitmap) { 1445 /* if adding to array with a bitmap, then we can accept an 1446 * older device ... but not too old. 1447 */ 1448 if (ev1 < md_bitmap_events_cleared(mddev)) 1449 return 0; 1450 if (ev1 < mddev->events) 1451 set_bit(Bitmap_sync, &rdev->flags); 1452 } else { 1453 if (ev1 < mddev->events) 1454 /* just a hot-add of a new device, leave raid_disk at -1 */ 1455 return 0; 1456 } 1457 1458 desc = sb->disks + rdev->desc_nr; 1459 1460 if (desc->state & (1<<MD_DISK_FAULTY)) 1461 set_bit(Faulty, &rdev->flags); 1462 else if (desc->state & (1<<MD_DISK_SYNC)) { 1463 set_bit(In_sync, &rdev->flags); 1464 rdev->raid_disk = desc->raid_disk; 1465 rdev->saved_raid_disk = desc->raid_disk; 1466 } else if (desc->state & (1<<MD_DISK_ACTIVE)) { 1467 /* active but not in sync implies recovery up to 1468 * reshape position. We don't know exactly where 1469 * that is, so set to zero for now 1470 */ 1471 if (mddev->minor_version >= 91) { 1472 rdev->recovery_offset = 0; 1473 rdev->raid_disk = desc->raid_disk; 1474 } 1475 } 1476 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 1477 set_bit(WriteMostly, &rdev->flags); 1478 if (desc->state & (1<<MD_DISK_FAILFAST)) 1479 set_bit(FailFast, &rdev->flags); 1480 return 0; 1481 } 1482 1483 /* 1484 * sync_super for 0.90.0 1485 */ 1486 static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) 1487 { 1488 mdp_super_t *sb; 1489 struct md_rdev *rdev2; 1490 int next_spare = mddev->raid_disks; 1491 1492 /* make rdev->sb match mddev data.. 1493 * 1494 * 1/ zero out disks 1495 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 1496 * 3/ any empty disks < next_spare become removed 1497 * 1498 * disks[0] gets initialised to REMOVED because 1499 * we cannot be sure from other fields if it has 1500 * been initialised or not. 
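	 *
	 * Illustrative example (not from the original comment): for a 4-disk
	 * RAID5 whose failed member is still present plus one hot spare, the
	 * loops below end up with nr_disks = 5, active_disks = 3,
	 * working_disks = 4, failed_disks = 1 and spare_disks = 1.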
1501 */ 1502 int i; 1503 int active=0, working=0,failed=0,spare=0,nr_disks=0; 1504 1505 rdev->sb_size = MD_SB_BYTES; 1506 1507 sb = page_address(rdev->sb_page); 1508 1509 memset(sb, 0, sizeof(*sb)); 1510 1511 sb->md_magic = MD_SB_MAGIC; 1512 sb->major_version = mddev->major_version; 1513 sb->patch_version = mddev->patch_version; 1514 sb->gvalid_words = 0; /* ignored */ 1515 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 1516 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 1517 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 1518 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 1519 1520 sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); 1521 sb->level = mddev->level; 1522 sb->size = mddev->dev_sectors / 2; 1523 sb->raid_disks = mddev->raid_disks; 1524 sb->md_minor = mddev->md_minor; 1525 sb->not_persistent = 0; 1526 sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 1527 sb->state = 0; 1528 sb->events_hi = (mddev->events>>32); 1529 sb->events_lo = (u32)mddev->events; 1530 1531 if (mddev->reshape_position == MaxSector) 1532 sb->minor_version = 90; 1533 else { 1534 sb->minor_version = 91; 1535 sb->reshape_position = mddev->reshape_position; 1536 sb->new_level = mddev->new_level; 1537 sb->delta_disks = mddev->delta_disks; 1538 sb->new_layout = mddev->new_layout; 1539 sb->new_chunk = mddev->new_chunk_sectors << 9; 1540 } 1541 mddev->minor_version = sb->minor_version; 1542 if (mddev->in_sync) 1543 { 1544 sb->recovery_cp = mddev->recovery_cp; 1545 sb->cp_events_hi = (mddev->events>>32); 1546 sb->cp_events_lo = (u32)mddev->events; 1547 if (mddev->recovery_cp == MaxSector) 1548 sb->state = (1<< MD_SB_CLEAN); 1549 } else 1550 sb->recovery_cp = 0; 1551 1552 sb->layout = mddev->layout; 1553 sb->chunk_size = mddev->chunk_sectors << 9; 1554 1555 if (mddev->bitmap && mddev->bitmap_info.file == NULL) 1556 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 1557 1558 sb->disks[0].state = (1<<MD_DISK_REMOVED); 1559 rdev_for_each(rdev2, mddev) { 1560 mdp_disk_t *d; 1561 int desc_nr; 1562 int is_active = test_bit(In_sync, &rdev2->flags); 1563 1564 if (rdev2->raid_disk >= 0 && 1565 sb->minor_version >= 91) 1566 /* we have nowhere to store the recovery_offset, 1567 * but if it is not below the reshape_position, 1568 * we can piggy-back on that. 
1569 */ 1570 is_active = 1; 1571 if (rdev2->raid_disk < 0 || 1572 test_bit(Faulty, &rdev2->flags)) 1573 is_active = 0; 1574 if (is_active) 1575 desc_nr = rdev2->raid_disk; 1576 else 1577 desc_nr = next_spare++; 1578 rdev2->desc_nr = desc_nr; 1579 d = &sb->disks[rdev2->desc_nr]; 1580 nr_disks++; 1581 d->number = rdev2->desc_nr; 1582 d->major = MAJOR(rdev2->bdev->bd_dev); 1583 d->minor = MINOR(rdev2->bdev->bd_dev); 1584 if (is_active) 1585 d->raid_disk = rdev2->raid_disk; 1586 else 1587 d->raid_disk = rdev2->desc_nr; /* compatibility */ 1588 if (test_bit(Faulty, &rdev2->flags)) 1589 d->state = (1<<MD_DISK_FAULTY); 1590 else if (is_active) { 1591 d->state = (1<<MD_DISK_ACTIVE); 1592 if (test_bit(In_sync, &rdev2->flags)) 1593 d->state |= (1<<MD_DISK_SYNC); 1594 active++; 1595 working++; 1596 } else { 1597 d->state = 0; 1598 spare++; 1599 working++; 1600 } 1601 if (test_bit(WriteMostly, &rdev2->flags)) 1602 d->state |= (1<<MD_DISK_WRITEMOSTLY); 1603 if (test_bit(FailFast, &rdev2->flags)) 1604 d->state |= (1<<MD_DISK_FAILFAST); 1605 } 1606 /* now set the "removed" and "faulty" bits on any missing devices */ 1607 for (i=0 ; i < mddev->raid_disks ; i++) { 1608 mdp_disk_t *d = &sb->disks[i]; 1609 if (d->state == 0 && d->number == 0) { 1610 d->number = i; 1611 d->raid_disk = i; 1612 d->state = (1<<MD_DISK_REMOVED); 1613 d->state |= (1<<MD_DISK_FAULTY); 1614 failed++; 1615 } 1616 } 1617 sb->nr_disks = nr_disks; 1618 sb->active_disks = active; 1619 sb->working_disks = working; 1620 sb->failed_disks = failed; 1621 sb->spare_disks = spare; 1622 1623 sb->this_disk = sb->disks[rdev->desc_nr]; 1624 sb->sb_csum = calc_sb_csum(sb); 1625 } 1626 1627 /* 1628 * rdev_size_change for 0.90.0 1629 */ 1630 static unsigned long long 1631 super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 1632 { 1633 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1634 return 0; /* component must fit device */ 1635 if (rdev->mddev->bitmap_info.offset) 1636 return 0; /* can't move bitmap */ 1637 rdev->sb_start = calc_dev_sboffset(rdev); 1638 if (!num_sectors || num_sectors > rdev->sb_start) 1639 num_sectors = rdev->sb_start; 1640 /* Limit to 4TB as metadata cannot record more than that. 1641 * 4TB == 2^32 KB, or 2*2^32 sectors. 
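	 *
	 * Worked numbers (illustrative): 2^32 KB = 2^42 bytes = 4 TiB, which
	 * is 2^33 = (2ULL << 32) 512-byte sectors; the clamp below subtracts
	 * 2 sectors so the result, (2^32 - 1) KB, still fits in the 32-bit
	 * size field.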
1642 */ 1643 if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) 1644 num_sectors = (sector_t)(2ULL << 32) - 2; 1645 do { 1646 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1647 rdev->sb_page); 1648 } while (md_super_wait(rdev->mddev) < 0); 1649 return num_sectors; 1650 } 1651 1652 static int 1653 super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) 1654 { 1655 /* non-zero offset changes not possible with v0.90 */ 1656 return new_offset == 0; 1657 } 1658 1659 /* 1660 * version 1 superblock 1661 */ 1662 1663 static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb) 1664 { 1665 __le32 disk_csum; 1666 u32 csum; 1667 unsigned long long newcsum; 1668 int size = 256 + le32_to_cpu(sb->max_dev)*2; 1669 __le32 *isuper = (__le32*)sb; 1670 1671 disk_csum = sb->sb_csum; 1672 sb->sb_csum = 0; 1673 newcsum = 0; 1674 for (; size >= 4; size -= 4) 1675 newcsum += le32_to_cpu(*isuper++); 1676 1677 if (size == 2) 1678 newcsum += le16_to_cpu(*(__le16*) isuper); 1679 1680 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 1681 sb->sb_csum = disk_csum; 1682 return cpu_to_le32(csum); 1683 } 1684 1685 static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 1686 { 1687 struct mdp_superblock_1 *sb; 1688 int ret; 1689 sector_t sb_start; 1690 sector_t sectors; 1691 int bmask; 1692 bool spare_disk = true; 1693 1694 /* 1695 * Calculate the position of the superblock in 512byte sectors. 1696 * It is always aligned to a 4K boundary and 1697 * depeding on minor_version, it can be: 1698 * 0: At least 8K, but less than 12K, from end of device 1699 * 1: At start of device 1700 * 2: 4K from start of device. 1701 */ 1702 switch(minor_version) { 1703 case 0: 1704 sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2; 1705 sb_start &= ~(sector_t)(4*2-1); 1706 break; 1707 case 1: 1708 sb_start = 0; 1709 break; 1710 case 2: 1711 sb_start = 8; 1712 break; 1713 default: 1714 return -EINVAL; 1715 } 1716 rdev->sb_start = sb_start; 1717 1718 /* superblock is rarely larger than 1K, but it can be larger, 1719 * and it is safe to read 4k, so we do that 1720 */ 1721 ret = read_disk_sb(rdev, 4096); 1722 if (ret) return ret; 1723 1724 sb = page_address(rdev->sb_page); 1725 1726 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1727 sb->major_version != cpu_to_le32(1) || 1728 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1729 le64_to_cpu(sb->super_offset) != rdev->sb_start || 1730 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1731 return -EINVAL; 1732 1733 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1734 pr_warn("md: invalid superblock checksum on %pg\n", 1735 rdev->bdev); 1736 return -EINVAL; 1737 } 1738 if (le64_to_cpu(sb->data_size) < 10) { 1739 pr_warn("md: data_size too small on %pg\n", 1740 rdev->bdev); 1741 return -EINVAL; 1742 } 1743 if (sb->pad0 || 1744 sb->pad3[0] || 1745 memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) 1746 /* Some padding is non-zero, might be a new feature */ 1747 return -EINVAL; 1748 1749 rdev->preferred_minor = 0xffff; 1750 rdev->data_offset = le64_to_cpu(sb->data_offset); 1751 rdev->new_data_offset = rdev->data_offset; 1752 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && 1753 (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) 1754 rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset); 1755 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1756 1757 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1758 bmask = 
queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 1759 if (rdev->sb_size & bmask) 1760 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1761 1762 if (minor_version 1763 && rdev->data_offset < sb_start + (rdev->sb_size/512)) 1764 return -EINVAL; 1765 if (minor_version 1766 && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) 1767 return -EINVAL; 1768 1769 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1770 1771 if (!rdev->bb_page) { 1772 rdev->bb_page = alloc_page(GFP_KERNEL); 1773 if (!rdev->bb_page) 1774 return -ENOMEM; 1775 } 1776 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && 1777 rdev->badblocks.count == 0) { 1778 /* need to load the bad block list. 1779 * Currently we limit it to one page. 1780 */ 1781 s32 offset; 1782 sector_t bb_sector; 1783 __le64 *bbp; 1784 int i; 1785 int sectors = le16_to_cpu(sb->bblog_size); 1786 if (sectors > (PAGE_SIZE / 512)) 1787 return -EINVAL; 1788 offset = le32_to_cpu(sb->bblog_offset); 1789 if (offset == 0) 1790 return -EINVAL; 1791 bb_sector = (long long)offset; 1792 if (!sync_page_io(rdev, bb_sector, sectors << 9, 1793 rdev->bb_page, REQ_OP_READ, true)) 1794 return -EIO; 1795 bbp = (__le64 *)page_address(rdev->bb_page); 1796 rdev->badblocks.shift = sb->bblog_shift; 1797 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { 1798 u64 bb = le64_to_cpu(*bbp); 1799 int count = bb & (0x3ff); 1800 u64 sector = bb >> 10; 1801 sector <<= sb->bblog_shift; 1802 count <<= sb->bblog_shift; 1803 if (bb + 1 == 0) 1804 break; 1805 if (!badblocks_set(&rdev->badblocks, sector, count, 1)) 1806 return -EINVAL; 1807 } 1808 } else if (sb->bblog_offset != 0) 1809 rdev->badblocks.shift = 0; 1810 1811 if ((le32_to_cpu(sb->feature_map) & 1812 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) { 1813 rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset); 1814 rdev->ppl.size = le16_to_cpu(sb->ppl.size); 1815 rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset; 1816 } 1817 1818 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) && 1819 sb->level != 0) 1820 return -EINVAL; 1821 1822 /* not spare disk */ 1823 if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) && 1824 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || 1825 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) 1826 spare_disk = false; 1827 1828 if (!refdev) { 1829 if (!spare_disk) 1830 ret = 1; 1831 else 1832 ret = 0; 1833 } else { 1834 __u64 ev1, ev2; 1835 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); 1836 1837 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1838 sb->level != refsb->level || 1839 sb->layout != refsb->layout || 1840 sb->chunksize != refsb->chunksize) { 1841 pr_warn("md: %pg has strangely different superblock to %pg\n", 1842 rdev->bdev, 1843 refdev->bdev); 1844 return -EINVAL; 1845 } 1846 ev1 = le64_to_cpu(sb->events); 1847 ev2 = le64_to_cpu(refsb->events); 1848 1849 if (!spare_disk && ev1 > ev2) 1850 ret = 1; 1851 else 1852 ret = 0; 1853 } 1854 if (minor_version) 1855 sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; 1856 else 1857 sectors = rdev->sb_start; 1858 if (sectors < le64_to_cpu(sb->data_size)) 1859 return -EINVAL; 1860 rdev->sectors = le64_to_cpu(sb->data_size); 1861 return ret; 1862 } 1863 1864 static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) 1865 { 1866 struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 1867 __u64 ev1 = le64_to_cpu(sb->events); 1868 int role; 1869 1870 rdev->raid_disk = -1; 1871 clear_bit(Faulty, &rdev->flags); 1872 
clear_bit(In_sync, &rdev->flags); 1873 clear_bit(Bitmap_sync, &rdev->flags); 1874 clear_bit(WriteMostly, &rdev->flags); 1875 1876 if (mddev->raid_disks == 0) { 1877 mddev->major_version = 1; 1878 mddev->patch_version = 0; 1879 mddev->external = 0; 1880 mddev->chunk_sectors = le32_to_cpu(sb->chunksize); 1881 mddev->ctime = le64_to_cpu(sb->ctime); 1882 mddev->utime = le64_to_cpu(sb->utime); 1883 mddev->level = le32_to_cpu(sb->level); 1884 mddev->clevel[0] = 0; 1885 mddev->layout = le32_to_cpu(sb->layout); 1886 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1887 mddev->dev_sectors = le64_to_cpu(sb->size); 1888 mddev->events = ev1; 1889 mddev->bitmap_info.offset = 0; 1890 mddev->bitmap_info.space = 0; 1891 /* Default location for bitmap is 1K after superblock 1892 * using 3K - total of 4K 1893 */ 1894 mddev->bitmap_info.default_offset = 1024 >> 9; 1895 mddev->bitmap_info.default_space = (4096-1024) >> 9; 1896 mddev->reshape_backwards = 0; 1897 1898 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 1899 memcpy(mddev->uuid, sb->set_uuid, 16); 1900 1901 mddev->max_disks = (4096-256)/2; 1902 1903 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1904 mddev->bitmap_info.file == NULL) { 1905 mddev->bitmap_info.offset = 1906 (__s32)le32_to_cpu(sb->bitmap_offset); 1907 /* Metadata doesn't record how much space is available. 1908 * For 1.0, we assume we can use up to the superblock 1909 * if before, else to 4K beyond superblock. 1910 * For others, assume no change is possible. 1911 */ 1912 if (mddev->minor_version > 0) 1913 mddev->bitmap_info.space = 0; 1914 else if (mddev->bitmap_info.offset > 0) 1915 mddev->bitmap_info.space = 1916 8 - mddev->bitmap_info.offset; 1917 else 1918 mddev->bitmap_info.space = 1919 -mddev->bitmap_info.offset; 1920 } 1921 1922 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 1923 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 1924 mddev->delta_disks = le32_to_cpu(sb->delta_disks); 1925 mddev->new_level = le32_to_cpu(sb->new_level); 1926 mddev->new_layout = le32_to_cpu(sb->new_layout); 1927 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); 1928 if (mddev->delta_disks < 0 || 1929 (mddev->delta_disks == 0 && 1930 (le32_to_cpu(sb->feature_map) 1931 & MD_FEATURE_RESHAPE_BACKWARDS))) 1932 mddev->reshape_backwards = 1; 1933 } else { 1934 mddev->reshape_position = MaxSector; 1935 mddev->delta_disks = 0; 1936 mddev->new_level = mddev->level; 1937 mddev->new_layout = mddev->layout; 1938 mddev->new_chunk_sectors = mddev->chunk_sectors; 1939 } 1940 1941 if (mddev->level == 0 && 1942 !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT)) 1943 mddev->layout = -1; 1944 1945 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) 1946 set_bit(MD_HAS_JOURNAL, &mddev->flags); 1947 1948 if (le32_to_cpu(sb->feature_map) & 1949 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) { 1950 if (le32_to_cpu(sb->feature_map) & 1951 (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL)) 1952 return -EINVAL; 1953 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) && 1954 (le32_to_cpu(sb->feature_map) & 1955 MD_FEATURE_MULTIPLE_PPLS)) 1956 return -EINVAL; 1957 set_bit(MD_HAS_PPL, &mddev->flags); 1958 } 1959 } else if (mddev->pers == NULL) { 1960 /* Insist of good event counter while assembling, except for 1961 * spares (which don't need an event count). 1962 * Similar to mdadm, we allow event counter difference of 1 1963 * from the freshest device. 
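		 *
		 * Example (illustrative): if the freshest member was
		 * assembled with mddev->events == 100, a device whose
		 * superblock records events == 99 is still accepted by the
		 * check below, while one recording events == 98 fails it and
		 * is rejected with -EINVAL.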
1964 */ 1965 if (rdev->desc_nr >= 0 && 1966 rdev->desc_nr < le32_to_cpu(sb->max_dev) && 1967 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || 1968 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) 1969 if (ev1 + 1 < mddev->events) 1970 return -EINVAL; 1971 } else if (mddev->bitmap) { 1972 /* If adding to array with a bitmap, then we can accept an 1973 * older device, but not too old. 1974 */ 1975 if (ev1 < md_bitmap_events_cleared(mddev)) 1976 return 0; 1977 if (ev1 < mddev->events) 1978 set_bit(Bitmap_sync, &rdev->flags); 1979 } else { 1980 if (ev1 < mddev->events) 1981 /* just a hot-add of a new device, leave raid_disk at -1 */ 1982 return 0; 1983 } 1984 1985 if (rdev->desc_nr < 0 || 1986 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { 1987 role = MD_DISK_ROLE_SPARE; 1988 rdev->desc_nr = -1; 1989 } else if (mddev->pers == NULL && freshest && ev1 < mddev->events) { 1990 /* 1991 * If we are assembling, and our event counter is smaller than the 1992 * highest event counter, we cannot trust our superblock about the role. 1993 * It could happen that our rdev was marked as Faulty, and all other 1994 * superblocks were updated with +1 event counter. 1995 * Then, before the next superblock update, which typically happens when 1996 * remove_and_add_spares() removes the device from the array, there was 1997 * a crash or reboot. 1998 * If we allow current rdev without consulting the freshest superblock, 1999 * we could cause data corruption. 2000 * Note that in this case our event counter is smaller by 1 than the 2001 * highest, otherwise, this rdev would not be allowed into array; 2002 * both kernel and mdadm allow event counter difference of 1. 2003 */ 2004 struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page); 2005 u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev); 2006 2007 if (rdev->desc_nr >= freshest_max_dev) { 2008 /* this is unexpected, better not proceed */ 2009 pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n", 2010 mdname(mddev), rdev->bdev, rdev->desc_nr, 2011 freshest->bdev, freshest_max_dev); 2012 return -EUCLEAN; 2013 } 2014 2015 role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]); 2016 pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n", 2017 mdname(mddev), rdev->bdev, role, role, freshest->bdev); 2018 } else { 2019 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 2020 } 2021 switch (role) { 2022 case MD_DISK_ROLE_SPARE: /* spare */ 2023 break; 2024 case MD_DISK_ROLE_FAULTY: /* faulty */ 2025 set_bit(Faulty, &rdev->flags); 2026 break; 2027 case MD_DISK_ROLE_JOURNAL: /* journal device */ 2028 if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { 2029 /* journal device without journal feature */ 2030 pr_warn("md: journal device provided without journal feature, ignoring the device\n"); 2031 return -EINVAL; 2032 } 2033 set_bit(Journal, &rdev->flags); 2034 rdev->journal_tail = le64_to_cpu(sb->journal_tail); 2035 rdev->raid_disk = 0; 2036 break; 2037 default: 2038 rdev->saved_raid_disk = role; 2039 if ((le32_to_cpu(sb->feature_map) & 2040 MD_FEATURE_RECOVERY_OFFSET)) { 2041 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 2042 if (!(le32_to_cpu(sb->feature_map) & 2043 MD_FEATURE_RECOVERY_BITMAP)) 2044 rdev->saved_raid_disk = -1; 2045 } else { 2046 /* 2047 * If the array is FROZEN, then the device can't 2048 * be in_sync with rest of array. 
2049 */ 2050 if (!test_bit(MD_RECOVERY_FROZEN, 2051 &mddev->recovery)) 2052 set_bit(In_sync, &rdev->flags); 2053 } 2054 rdev->raid_disk = role; 2055 break; 2056 } 2057 if (sb->devflags & WriteMostly1) 2058 set_bit(WriteMostly, &rdev->flags); 2059 if (sb->devflags & FailFast1) 2060 set_bit(FailFast, &rdev->flags); 2061 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) 2062 set_bit(Replacement, &rdev->flags); 2063 2064 return 0; 2065 } 2066 2067 static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) 2068 { 2069 struct mdp_superblock_1 *sb; 2070 struct md_rdev *rdev2; 2071 int max_dev, i; 2072 /* make rdev->sb match mddev and rdev data. */ 2073 2074 sb = page_address(rdev->sb_page); 2075 2076 sb->feature_map = 0; 2077 sb->pad0 = 0; 2078 sb->recovery_offset = cpu_to_le64(0); 2079 memset(sb->pad3, 0, sizeof(sb->pad3)); 2080 2081 sb->utime = cpu_to_le64((__u64)mddev->utime); 2082 sb->events = cpu_to_le64(mddev->events); 2083 if (mddev->in_sync) 2084 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 2085 else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) 2086 sb->resync_offset = cpu_to_le64(MaxSector); 2087 else 2088 sb->resync_offset = cpu_to_le64(0); 2089 2090 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); 2091 2092 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 2093 sb->size = cpu_to_le64(mddev->dev_sectors); 2094 sb->chunksize = cpu_to_le32(mddev->chunk_sectors); 2095 sb->level = cpu_to_le32(mddev->level); 2096 sb->layout = cpu_to_le32(mddev->layout); 2097 if (test_bit(FailFast, &rdev->flags)) 2098 sb->devflags |= FailFast1; 2099 else 2100 sb->devflags &= ~FailFast1; 2101 2102 if (test_bit(WriteMostly, &rdev->flags)) 2103 sb->devflags |= WriteMostly1; 2104 else 2105 sb->devflags &= ~WriteMostly1; 2106 sb->data_offset = cpu_to_le64(rdev->data_offset); 2107 sb->data_size = cpu_to_le64(rdev->sectors); 2108 2109 if (mddev->bitmap && mddev->bitmap_info.file == NULL) { 2110 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); 2111 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 2112 } 2113 2114 if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && 2115 !test_bit(In_sync, &rdev->flags)) { 2116 sb->feature_map |= 2117 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 2118 sb->recovery_offset = 2119 cpu_to_le64(rdev->recovery_offset); 2120 if (rdev->saved_raid_disk >= 0 && mddev->bitmap) 2121 sb->feature_map |= 2122 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); 2123 } 2124 /* Note: recovery_offset and journal_tail share space */ 2125 if (test_bit(Journal, &rdev->flags)) 2126 sb->journal_tail = cpu_to_le64(rdev->journal_tail); 2127 if (test_bit(Replacement, &rdev->flags)) 2128 sb->feature_map |= 2129 cpu_to_le32(MD_FEATURE_REPLACEMENT); 2130 2131 if (mddev->reshape_position != MaxSector) { 2132 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 2133 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 2134 sb->new_layout = cpu_to_le32(mddev->new_layout); 2135 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 2136 sb->new_level = cpu_to_le32(mddev->new_level); 2137 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 2138 if (mddev->delta_disks == 0 && 2139 mddev->reshape_backwards) 2140 sb->feature_map 2141 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); 2142 if (rdev->new_data_offset != rdev->data_offset) { 2143 sb->feature_map 2144 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); 2145 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset 2146 - rdev->data_offset)); 2147 } 2148 } 2149 2150 if 
(mddev_is_clustered(mddev)) 2151 sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); 2152 2153 if (rdev->badblocks.count == 0) 2154 /* Nothing to do for bad blocks*/ ; 2155 else if (sb->bblog_offset == 0) 2156 /* Cannot record bad blocks on this device */ 2157 md_error(mddev, rdev); 2158 else { 2159 struct badblocks *bb = &rdev->badblocks; 2160 __le64 *bbp = (__le64 *)page_address(rdev->bb_page); 2161 u64 *p = bb->page; 2162 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); 2163 if (bb->changed) { 2164 unsigned seq; 2165 2166 retry: 2167 seq = read_seqbegin(&bb->lock); 2168 2169 memset(bbp, 0xff, PAGE_SIZE); 2170 2171 for (i = 0 ; i < bb->count ; i++) { 2172 u64 internal_bb = p[i]; 2173 u64 store_bb = ((BB_OFFSET(internal_bb) << 10) 2174 | BB_LEN(internal_bb)); 2175 bbp[i] = cpu_to_le64(store_bb); 2176 } 2177 bb->changed = 0; 2178 if (read_seqretry(&bb->lock, seq)) 2179 goto retry; 2180 2181 bb->sector = (rdev->sb_start + 2182 (int)le32_to_cpu(sb->bblog_offset)); 2183 bb->size = le16_to_cpu(sb->bblog_size); 2184 } 2185 } 2186 2187 max_dev = 0; 2188 rdev_for_each(rdev2, mddev) 2189 if (rdev2->desc_nr+1 > max_dev) 2190 max_dev = rdev2->desc_nr+1; 2191 2192 if (max_dev > le32_to_cpu(sb->max_dev)) { 2193 int bmask; 2194 sb->max_dev = cpu_to_le32(max_dev); 2195 rdev->sb_size = max_dev * 2 + 256; 2196 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 2197 if (rdev->sb_size & bmask) 2198 rdev->sb_size = (rdev->sb_size | bmask) + 1; 2199 } else 2200 max_dev = le32_to_cpu(sb->max_dev); 2201 2202 for (i=0; i<max_dev;i++) 2203 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 2204 2205 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) 2206 sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); 2207 2208 if (test_bit(MD_HAS_PPL, &mddev->flags)) { 2209 if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags)) 2210 sb->feature_map |= 2211 cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS); 2212 else 2213 sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); 2214 sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); 2215 sb->ppl.size = cpu_to_le16(rdev->ppl.size); 2216 } 2217 2218 rdev_for_each(rdev2, mddev) { 2219 i = rdev2->desc_nr; 2220 if (test_bit(Faulty, &rdev2->flags)) 2221 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); 2222 else if (test_bit(In_sync, &rdev2->flags)) 2223 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 2224 else if (test_bit(Journal, &rdev2->flags)) 2225 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); 2226 else if (rdev2->raid_disk >= 0) 2227 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 2228 else 2229 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 2230 } 2231 2232 sb->sb_csum = calc_sb_1_csum(sb); 2233 } 2234 2235 static sector_t super_1_choose_bm_space(sector_t dev_size) 2236 { 2237 sector_t bm_space; 2238 2239 /* if the device is bigger than 8Gig, save 64k for bitmap 2240 * usage, if bigger than 200Gig, save 128k 2241 */ 2242 if (dev_size < 64*2) 2243 bm_space = 0; 2244 else if (dev_size - 64*2 >= 200*1024*1024*2) 2245 bm_space = 128*2; 2246 else if (dev_size - 4*2 > 8*1024*1024*2) 2247 bm_space = 64*2; 2248 else 2249 bm_space = 4*2; 2250 return bm_space; 2251 } 2252 2253 static unsigned long long 2254 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 2255 { 2256 struct mdp_superblock_1 *sb; 2257 sector_t max_sectors; 2258 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 2259 return 0; /* component must fit device */ 2260 if (rdev->data_offset != rdev->new_data_offset) 2261 return 0; /* too confusing */ 2262 if (rdev->sb_start < 
rdev->data_offset) { 2263 /* minor versions 1 and 2; superblock before data */ 2264 max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; 2265 if (!num_sectors || num_sectors > max_sectors) 2266 num_sectors = max_sectors; 2267 } else if (rdev->mddev->bitmap_info.offset) { 2268 /* minor version 0 with bitmap we can't move */ 2269 return 0; 2270 } else { 2271 /* minor version 0; superblock after data */ 2272 sector_t sb_start, bm_space; 2273 sector_t dev_size = bdev_nr_sectors(rdev->bdev); 2274 2275 /* 8K is for superblock */ 2276 sb_start = dev_size - 8*2; 2277 sb_start &= ~(sector_t)(4*2 - 1); 2278 2279 bm_space = super_1_choose_bm_space(dev_size); 2280 2281 /* Space that can be used to store data needs to exclude the 2282 * superblock, bitmap space and bad block space (4K) 2283 */ 2284 max_sectors = sb_start - bm_space - 4*2; 2285 2286 if (!num_sectors || num_sectors > max_sectors) 2287 num_sectors = max_sectors; 2288 rdev->sb_start = sb_start; 2289 } 2290 sb = page_address(rdev->sb_page); 2291 sb->data_size = cpu_to_le64(num_sectors); 2292 sb->super_offset = cpu_to_le64(rdev->sb_start); 2293 sb->sb_csum = calc_sb_1_csum(sb); 2294 do { 2295 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 2296 rdev->sb_page); 2297 } while (md_super_wait(rdev->mddev) < 0); 2298 return num_sectors; 2299 2300 } 2301 2302 static int 2303 super_1_allow_new_offset(struct md_rdev *rdev, 2304 unsigned long long new_offset) 2305 { 2306 /* All necessary checks on new >= old have been done */ 2307 if (new_offset >= rdev->data_offset) 2308 return 1; 2309 2310 /* with 1.0 metadata, there is no metadata to tread on 2311 * so we can always move back */ 2312 if (rdev->mddev->minor_version == 0) 2313 return 1; 2314 2315 /* otherwise we must be sure not to step on 2316 * any metadata, so stay: 2317 * 36K beyond start of superblock 2318 * beyond end of badblocks 2319 * beyond write-intent bitmap 2320 */ 2321 if (rdev->sb_start + (32+4)*2 > new_offset) 2322 return 0; 2323 2324 if (!rdev->mddev->bitmap_info.file) { 2325 struct mddev *mddev = rdev->mddev; 2326 struct md_bitmap_stats stats; 2327 int err; 2328 2329 err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 2330 if (!err && rdev->sb_start + mddev->bitmap_info.offset + 2331 stats.file_pages * (PAGE_SIZE >> 9) > new_offset) 2332 return 0; 2333 } 2334 2335 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) 2336 return 0; 2337 2338 return 1; 2339 } 2340 2341 static struct super_type super_types[] = { 2342 [0] = { 2343 .name = "0.90.0", 2344 .owner = THIS_MODULE, 2345 .load_super = super_90_load, 2346 .validate_super = super_90_validate, 2347 .sync_super = super_90_sync, 2348 .rdev_size_change = super_90_rdev_size_change, 2349 .allow_new_offset = super_90_allow_new_offset, 2350 }, 2351 [1] = { 2352 .name = "md-1", 2353 .owner = THIS_MODULE, 2354 .load_super = super_1_load, 2355 .validate_super = super_1_validate, 2356 .sync_super = super_1_sync, 2357 .rdev_size_change = super_1_rdev_size_change, 2358 .allow_new_offset = super_1_allow_new_offset, 2359 }, 2360 }; 2361 2362 static void sync_super(struct mddev *mddev, struct md_rdev *rdev) 2363 { 2364 if (mddev->sync_super) { 2365 mddev->sync_super(mddev, rdev); 2366 return; 2367 } 2368 2369 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); 2370 2371 super_types[mddev->major_version].sync_super(mddev, rdev); 2372 } 2373 2374 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) 2375 { 2376 struct md_rdev *rdev, *rdev2; 2377 2378 rcu_read_lock(); 2379
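/* Two arrays match if any of their active (non-faulty, non-journal) members share the same gendisk. */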
rdev_for_each_rcu(rdev, mddev1) { 2380 if (test_bit(Faulty, &rdev->flags) || 2381 test_bit(Journal, &rdev->flags) || 2382 rdev->raid_disk == -1) 2383 continue; 2384 rdev_for_each_rcu(rdev2, mddev2) { 2385 if (test_bit(Faulty, &rdev2->flags) || 2386 test_bit(Journal, &rdev2->flags) || 2387 rdev2->raid_disk == -1) 2388 continue; 2389 if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) { 2390 rcu_read_unlock(); 2391 return 1; 2392 } 2393 } 2394 } 2395 rcu_read_unlock(); 2396 return 0; 2397 } 2398 2399 static LIST_HEAD(pending_raid_disks); 2400 2401 /* 2402 * Try to register data integrity profile for an mddev 2403 * 2404 * This is called when an array is started and after a disk has been kicked 2405 * from the array. It only succeeds if all working and active component devices 2406 * are integrity capable with matching profiles. 2407 */ 2408 int md_integrity_register(struct mddev *mddev) 2409 { 2410 if (list_empty(&mddev->disks)) 2411 return 0; /* nothing to do */ 2412 if (mddev_is_dm(mddev) || !blk_get_integrity(mddev->gendisk)) 2413 return 0; /* shouldn't register */ 2414 2415 pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); 2416 return 0; 2417 } 2418 EXPORT_SYMBOL(md_integrity_register); 2419 2420 static bool rdev_read_only(struct md_rdev *rdev) 2421 { 2422 return bdev_read_only(rdev->bdev) || 2423 (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev)); 2424 } 2425 2426 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) 2427 { 2428 char b[BDEVNAME_SIZE]; 2429 int err; 2430 2431 /* prevent duplicates */ 2432 if (find_rdev(mddev, rdev->bdev->bd_dev)) 2433 return -EEXIST; 2434 2435 if (rdev_read_only(rdev) && mddev->pers) 2436 return -EROFS; 2437 2438 /* make sure rdev->sectors exceeds mddev->dev_sectors */ 2439 if (!test_bit(Journal, &rdev->flags) && 2440 rdev->sectors && 2441 (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { 2442 if (mddev->pers) { 2443 /* Cannot change size, so fail 2444 * If mddev->level <= 0, then we don't care 2445 * about aligning sizes (e.g. linear) 2446 */ 2447 if (mddev->level > 0) 2448 return -ENOSPC; 2449 } else 2450 mddev->dev_sectors = rdev->sectors; 2451 } 2452 2453 /* Verify rdev->desc_nr is unique. 
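 * (for v1.x metadata, desc_nr is also the index into the superblock's dev_roles[] table.)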
2454 * If it is -1, assign a free number, else 2455 * check number is not in use 2456 */ 2457 rcu_read_lock(); 2458 if (rdev->desc_nr < 0) { 2459 int choice = 0; 2460 if (mddev->pers) 2461 choice = mddev->raid_disks; 2462 while (md_find_rdev_nr_rcu(mddev, choice)) 2463 choice++; 2464 rdev->desc_nr = choice; 2465 } else { 2466 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { 2467 rcu_read_unlock(); 2468 return -EBUSY; 2469 } 2470 } 2471 rcu_read_unlock(); 2472 if (!test_bit(Journal, &rdev->flags) && 2473 mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { 2474 pr_warn("md: %s: array is limited to %d devices\n", 2475 mdname(mddev), mddev->max_disks); 2476 return -EBUSY; 2477 } 2478 snprintf(b, sizeof(b), "%pg", rdev->bdev); 2479 strreplace(b, '/', '!'); 2480 2481 rdev->mddev = mddev; 2482 pr_debug("md: bind<%s>\n", b); 2483 2484 if (mddev->raid_disks) 2485 mddev_create_serial_pool(mddev, rdev); 2486 2487 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 2488 goto fail; 2489 2490 /* failure here is OK */ 2491 err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block"); 2492 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); 2493 rdev->sysfs_unack_badblocks = 2494 sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks"); 2495 rdev->sysfs_badblocks = 2496 sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks"); 2497 2498 list_add_rcu(&rdev->same_set, &mddev->disks); 2499 bd_link_disk_holder(rdev->bdev, mddev->gendisk); 2500 2501 /* May as well allow recovery to be retried once */ 2502 mddev->recovery_disabled++; 2503 2504 return 0; 2505 2506 fail: 2507 pr_warn("md: failed to register dev-%s for %s\n", 2508 b, mdname(mddev)); 2509 mddev_destroy_serial_pool(mddev, rdev); 2510 return err; 2511 } 2512 2513 void md_autodetect_dev(dev_t dev); 2514 2515 /* just for claiming the bdev */ 2516 static struct md_rdev claim_rdev; 2517 2518 static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) 2519 { 2520 pr_debug("md: export_rdev(%pg)\n", rdev->bdev); 2521 md_rdev_clear(rdev); 2522 #ifndef MODULE 2523 if (test_bit(AutoDetected, &rdev->flags)) 2524 md_autodetect_dev(rdev->bdev->bd_dev); 2525 #endif 2526 fput(rdev->bdev_file); 2527 rdev->bdev = NULL; 2528 kobject_put(&rdev->kobj); 2529 } 2530 2531 static void md_kick_rdev_from_array(struct md_rdev *rdev) 2532 { 2533 struct mddev *mddev = rdev->mddev; 2534 2535 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); 2536 list_del_rcu(&rdev->same_set); 2537 pr_debug("md: unbind<%pg>\n", rdev->bdev); 2538 mddev_destroy_serial_pool(rdev->mddev, rdev); 2539 WRITE_ONCE(rdev->mddev, NULL); 2540 sysfs_remove_link(&rdev->kobj, "block"); 2541 sysfs_put(rdev->sysfs_state); 2542 sysfs_put(rdev->sysfs_unack_badblocks); 2543 sysfs_put(rdev->sysfs_badblocks); 2544 rdev->sysfs_state = NULL; 2545 rdev->sysfs_unack_badblocks = NULL; 2546 rdev->sysfs_badblocks = NULL; 2547 rdev->badblocks.count = 0; 2548 2549 synchronize_rcu(); 2550 2551 /* 2552 * kobject_del() will wait for all in progress writers to be done, where 2553 * reconfig_mutex is held, hence it can't be called under 2554 * reconfig_mutex and it's delayed to mddev_unlock(). 
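 * Until then the rdev is parked on the mddev->deleting list.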
2555 */ 2556 list_add(&rdev->same_set, &mddev->deleting); 2557 } 2558 2559 static void export_array(struct mddev *mddev) 2560 { 2561 struct md_rdev *rdev; 2562 2563 while (!list_empty(&mddev->disks)) { 2564 rdev = list_first_entry(&mddev->disks, struct md_rdev, 2565 same_set); 2566 md_kick_rdev_from_array(rdev); 2567 } 2568 mddev->raid_disks = 0; 2569 mddev->major_version = 0; 2570 } 2571 2572 static bool set_in_sync(struct mddev *mddev) 2573 { 2574 lockdep_assert_held(&mddev->lock); 2575 if (!mddev->in_sync) { 2576 mddev->sync_checkers++; 2577 spin_unlock(&mddev->lock); 2578 percpu_ref_switch_to_atomic_sync(&mddev->writes_pending); 2579 spin_lock(&mddev->lock); 2580 if (!mddev->in_sync && 2581 percpu_ref_is_zero(&mddev->writes_pending)) { 2582 mddev->in_sync = 1; 2583 /* 2584 * Ensure ->in_sync is visible before we clear 2585 * ->sync_checkers. 2586 */ 2587 smp_mb(); 2588 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2589 sysfs_notify_dirent_safe(mddev->sysfs_state); 2590 } 2591 if (--mddev->sync_checkers == 0) 2592 percpu_ref_switch_to_percpu(&mddev->writes_pending); 2593 } 2594 if (mddev->safemode == 1) 2595 mddev->safemode = 0; 2596 return mddev->in_sync; 2597 } 2598 2599 static void sync_sbs(struct mddev *mddev, int nospares) 2600 { 2601 /* Update each superblock (in-memory image), but 2602 * if we are allowed to, skip spares which already 2603 * have the right event counter, or have one earlier 2604 * (which would mean they aren't being marked as dirty 2605 * with the rest of the array) 2606 */ 2607 struct md_rdev *rdev; 2608 rdev_for_each(rdev, mddev) { 2609 if (rdev->sb_events == mddev->events || 2610 (nospares && 2611 rdev->raid_disk < 0 && 2612 rdev->sb_events+1 == mddev->events)) { 2613 /* Don't update this superblock */ 2614 rdev->sb_loaded = 2; 2615 } else { 2616 sync_super(mddev, rdev); 2617 rdev->sb_loaded = 1; 2618 } 2619 } 2620 } 2621 2622 static bool does_sb_need_changing(struct mddev *mddev) 2623 { 2624 struct md_rdev *rdev = NULL, *iter; 2625 struct mdp_superblock_1 *sb; 2626 int role; 2627 2628 /* Find a good rdev */ 2629 rdev_for_each(iter, mddev) 2630 if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) { 2631 rdev = iter; 2632 break; 2633 } 2634 2635 /* No good device found. */ 2636 if (!rdev) 2637 return false; 2638 2639 sb = page_address(rdev->sb_page); 2640 /* Check if a device has become faulty or a spare become active */ 2641 rdev_for_each(rdev, mddev) { 2642 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 2643 /* Device activated? */ 2644 if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 && 2645 !test_bit(Faulty, &rdev->flags)) 2646 return true; 2647 /* Device turned faulty? 
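 * (the rdev is now Faulty but its recorded role in the superblock still looks active)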
*/ 2648 if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX)) 2649 return true; 2650 } 2651 2652 /* Check if any mddev parameters have changed */ 2653 if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || 2654 (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || 2655 (mddev->layout != le32_to_cpu(sb->layout)) || 2656 (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || 2657 (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) 2658 return true; 2659 2660 return false; 2661 } 2662 2663 void md_update_sb(struct mddev *mddev, int force_change) 2664 { 2665 struct md_rdev *rdev; 2666 int sync_req; 2667 int nospares = 0; 2668 int any_badblocks_changed = 0; 2669 int ret = -1; 2670 2671 if (!md_is_rdwr(mddev)) { 2672 if (force_change) 2673 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2674 return; 2675 } 2676 2677 repeat: 2678 if (mddev_is_clustered(mddev)) { 2679 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2680 force_change = 1; 2681 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 2682 nospares = 1; 2683 ret = mddev->cluster_ops->metadata_update_start(mddev); 2684 /* Has someone else updated the sb? */ 2685 if (!does_sb_need_changing(mddev)) { 2686 if (ret == 0) 2687 mddev->cluster_ops->metadata_update_cancel(mddev); 2688 bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2689 BIT(MD_SB_CHANGE_DEVS) | 2690 BIT(MD_SB_CHANGE_CLEAN)); 2691 return; 2692 } 2693 } 2694 2695 /* 2696 * First make sure individual recovery_offsets are correct. 2697 * curr_resync_completed can only be used during recovery. 2698 * During reshape/resync it might use array-addresses rather 2699 * than device addresses. 2700 */ 2701 rdev_for_each(rdev, mddev) { 2702 if (rdev->raid_disk >= 0 && 2703 mddev->delta_disks >= 0 && 2704 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 2705 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && 2706 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 2707 !test_bit(Journal, &rdev->flags) && 2708 !test_bit(In_sync, &rdev->flags) && 2709 mddev->curr_resync_completed > rdev->recovery_offset) 2710 rdev->recovery_offset = mddev->curr_resync_completed; 2711 2712 } 2713 if (!mddev->persistent) { 2714 clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2715 clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2716 if (!mddev->external) { 2717 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 2718 rdev_for_each(rdev, mddev) { 2719 if (rdev->badblocks.changed) { 2720 rdev->badblocks.changed = 0; 2721 ack_all_badblocks(&rdev->badblocks); 2722 md_error(mddev, rdev); 2723 } 2724 clear_bit(Blocked, &rdev->flags); 2725 clear_bit(BlockedBadBlocks, &rdev->flags); 2726 wake_up(&rdev->blocked_wait); 2727 } 2728 } 2729 wake_up(&mddev->sb_wait); 2730 return; 2731 } 2732 2733 spin_lock(&mddev->lock); 2734 2735 mddev->utime = ktime_get_real_seconds(); 2736 2737 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2738 force_change = 1; 2739 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 2740 /* just a clean <-> dirty transition, possibly leave spares alone, 2741 * though if events isn't the right even/odd, we will have to do 2742 * spares after all 2743 */ 2744 nospares = 1; 2745 if (force_change) 2746 nospares = 0; 2747 if (mddev->degraded) 2748 /* If the array is degraded, then skipping spares is both 2749 * dangerous and fairly pointless. 2750 * Dangerous because a device that was removed from the array 2751 * might have an event_count that still looks up-to-date, 2752 * so it can be re-added without a resync. 
2753 * Pointless because if there are any spares to skip, 2754 * then a recovery will happen and soon that array won't 2755 * be degraded any more and the spare can go back to sleep then. 2756 */ 2757 nospares = 0; 2758 2759 sync_req = mddev->in_sync; 2760 2761 /* If this is just a dirty<->clean transition, and the array is clean 2762 * and 'events' is odd, we can roll back to the previous clean state */ 2763 if (nospares 2764 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 2765 && mddev->can_decrease_events 2766 && mddev->events != 1) { 2767 mddev->events--; 2768 mddev->can_decrease_events = 0; 2769 } else { 2770 /* otherwise we have to go forward and ... */ 2771 mddev->events ++; 2772 mddev->can_decrease_events = nospares; 2773 } 2774 2775 /* 2776 * This 64-bit counter should never wrap. 2777 * Either we are in around ~1 trillion A.C., assuming 2778 * 1 reboot per second, or we have a bug... 2779 */ 2780 WARN_ON(mddev->events == 0); 2781 2782 rdev_for_each(rdev, mddev) { 2783 if (rdev->badblocks.changed) 2784 any_badblocks_changed++; 2785 if (test_bit(Faulty, &rdev->flags)) 2786 set_bit(FaultRecorded, &rdev->flags); 2787 } 2788 2789 sync_sbs(mddev, nospares); 2790 spin_unlock(&mddev->lock); 2791 2792 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", 2793 mdname(mddev), mddev->in_sync); 2794 2795 mddev_add_trace_msg(mddev, "md md_update_sb"); 2796 rewrite: 2797 mddev->bitmap_ops->update_sb(mddev->bitmap); 2798 rdev_for_each(rdev, mddev) { 2799 if (rdev->sb_loaded != 1) 2800 continue; /* no noise on spare devices */ 2801 2802 if (!test_bit(Faulty, &rdev->flags)) { 2803 md_super_write(mddev,rdev, 2804 rdev->sb_start, rdev->sb_size, 2805 rdev->sb_page); 2806 pr_debug("md: (write) %pg's sb offset: %llu\n", 2807 rdev->bdev, 2808 (unsigned long long)rdev->sb_start); 2809 rdev->sb_events = mddev->events; 2810 if (rdev->badblocks.size) { 2811 md_super_write(mddev, rdev, 2812 rdev->badblocks.sector, 2813 rdev->badblocks.size << 9, 2814 rdev->bb_page); 2815 rdev->badblocks.size = 0; 2816 } 2817 2818 } else 2819 pr_debug("md: %pg (skipping faulty)\n", 2820 rdev->bdev); 2821 } 2822 if (md_super_wait(mddev) < 0) 2823 goto rewrite; 2824 /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ 2825 2826 if (mddev_is_clustered(mddev) && ret == 0) 2827 mddev->cluster_ops->metadata_update_finish(mddev); 2828 2829 if (mddev->in_sync != sync_req || 2830 !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2831 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN))) 2832 /* have to write it out again */ 2833 goto repeat; 2834 wake_up(&mddev->sb_wait); 2835 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2836 sysfs_notify_dirent_safe(mddev->sysfs_completed); 2837 2838 rdev_for_each(rdev, mddev) { 2839 if (test_and_clear_bit(FaultRecorded, &rdev->flags)) 2840 clear_bit(Blocked, &rdev->flags); 2841 2842 if (any_badblocks_changed) 2843 ack_all_badblocks(&rdev->badblocks); 2844 clear_bit(BlockedBadBlocks, &rdev->flags); 2845 wake_up(&rdev->blocked_wait); 2846 } 2847 } 2848 EXPORT_SYMBOL(md_update_sb); 2849 2850 static int add_bound_rdev(struct md_rdev *rdev) 2851 { 2852 struct mddev *mddev = rdev->mddev; 2853 int err = 0; 2854 bool add_journal = test_bit(Journal, &rdev->flags); 2855 2856 if (!mddev->pers->hot_remove_disk || add_journal) { 2857 /* If there is hot_add_disk but no hot_remove_disk 2858 * then added disks for geometry changes, 2859 * and should be added immediately. 2860 */ 2861 super_types[mddev->major_version]. 
2862 validate_super(mddev, NULL/*freshest*/, rdev); 2863 err = mddev->pers->hot_add_disk(mddev, rdev); 2864 if (err) { 2865 md_kick_rdev_from_array(rdev); 2866 return err; 2867 } 2868 } 2869 sysfs_notify_dirent_safe(rdev->sysfs_state); 2870 2871 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2872 if (mddev->degraded) 2873 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 2874 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2875 md_new_event(); 2876 return 0; 2877 } 2878 2879 /* words written to sysfs files may, or may not, be \n terminated. 2880 * We want to accept with case. For this we use cmd_match. 2881 */ 2882 static int cmd_match(const char *cmd, const char *str) 2883 { 2884 /* See if cmd, written into a sysfs file, matches 2885 * str. They must either be the same, or cmd can 2886 * have a trailing newline 2887 */ 2888 while (*cmd && *str && *cmd == *str) { 2889 cmd++; 2890 str++; 2891 } 2892 if (*cmd == '\n') 2893 cmd++; 2894 if (*str || *cmd) 2895 return 0; 2896 return 1; 2897 } 2898 2899 struct rdev_sysfs_entry { 2900 struct attribute attr; 2901 ssize_t (*show)(struct md_rdev *, char *); 2902 ssize_t (*store)(struct md_rdev *, const char *, size_t); 2903 }; 2904 2905 static ssize_t 2906 state_show(struct md_rdev *rdev, char *page) 2907 { 2908 char *sep = ","; 2909 size_t len = 0; 2910 unsigned long flags = READ_ONCE(rdev->flags); 2911 2912 if (test_bit(Faulty, &flags) || 2913 (!test_bit(ExternalBbl, &flags) && 2914 rdev->badblocks.unacked_exist)) 2915 len += sprintf(page+len, "faulty%s", sep); 2916 if (test_bit(In_sync, &flags)) 2917 len += sprintf(page+len, "in_sync%s", sep); 2918 if (test_bit(Journal, &flags)) 2919 len += sprintf(page+len, "journal%s", sep); 2920 if (test_bit(WriteMostly, &flags)) 2921 len += sprintf(page+len, "write_mostly%s", sep); 2922 if (test_bit(Blocked, &flags) || 2923 (rdev->badblocks.unacked_exist 2924 && !test_bit(Faulty, &flags))) 2925 len += sprintf(page+len, "blocked%s", sep); 2926 if (!test_bit(Faulty, &flags) && 2927 !test_bit(Journal, &flags) && 2928 !test_bit(In_sync, &flags)) 2929 len += sprintf(page+len, "spare%s", sep); 2930 if (test_bit(WriteErrorSeen, &flags)) 2931 len += sprintf(page+len, "write_error%s", sep); 2932 if (test_bit(WantReplacement, &flags)) 2933 len += sprintf(page+len, "want_replacement%s", sep); 2934 if (test_bit(Replacement, &flags)) 2935 len += sprintf(page+len, "replacement%s", sep); 2936 if (test_bit(ExternalBbl, &flags)) 2937 len += sprintf(page+len, "external_bbl%s", sep); 2938 if (test_bit(FailFast, &flags)) 2939 len += sprintf(page+len, "failfast%s", sep); 2940 2941 if (len) 2942 len -= strlen(sep); 2943 2944 return len+sprintf(page+len, "\n"); 2945 } 2946 2947 static ssize_t 2948 state_store(struct md_rdev *rdev, const char *buf, size_t len) 2949 { 2950 /* can write 2951 * faulty - simulates an error 2952 * remove - disconnects the device 2953 * writemostly - sets write_mostly 2954 * -writemostly - clears write_mostly 2955 * blocked - sets the Blocked flags 2956 * -blocked - clears the Blocked and possibly simulates an error 2957 * insync - sets Insync providing device isn't active 2958 * -insync - clear Insync for a device with a slot assigned, 2959 * so that it gets rebuilt based on bitmap 2960 * write_error - sets WriteErrorSeen 2961 * -write_error - clears WriteErrorSeen 2962 * {,-}failfast - set/clear FailFast 2963 */ 2964 2965 struct mddev *mddev = rdev->mddev; 2966 int err = -EINVAL; 2967 bool need_update_sb = false; 2968 2969 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 2970 md_error(rdev->mddev, rdev); 
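/* md_error() asks the personality to fail this device; if the array cannot tolerate losing it, MD_BROKEN is set on the mddev and we report -EBUSY below. */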
2971 2972 if (test_bit(MD_BROKEN, &rdev->mddev->flags)) 2973 err = -EBUSY; 2974 else 2975 err = 0; 2976 } else if (cmd_match(buf, "remove")) { 2977 if (rdev->mddev->pers) { 2978 clear_bit(Blocked, &rdev->flags); 2979 remove_and_add_spares(rdev->mddev, rdev); 2980 } 2981 if (rdev->raid_disk >= 0) 2982 err = -EBUSY; 2983 else { 2984 err = 0; 2985 if (mddev_is_clustered(mddev)) 2986 err = mddev->cluster_ops->remove_disk(mddev, rdev); 2987 2988 if (err == 0) { 2989 md_kick_rdev_from_array(rdev); 2990 if (mddev->pers) 2991 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2992 md_new_event(); 2993 } 2994 } 2995 } else if (cmd_match(buf, "writemostly")) { 2996 set_bit(WriteMostly, &rdev->flags); 2997 mddev_create_serial_pool(rdev->mddev, rdev); 2998 need_update_sb = true; 2999 err = 0; 3000 } else if (cmd_match(buf, "-writemostly")) { 3001 mddev_destroy_serial_pool(rdev->mddev, rdev); 3002 clear_bit(WriteMostly, &rdev->flags); 3003 need_update_sb = true; 3004 err = 0; 3005 } else if (cmd_match(buf, "blocked")) { 3006 set_bit(Blocked, &rdev->flags); 3007 err = 0; 3008 } else if (cmd_match(buf, "-blocked")) { 3009 if (!test_bit(Faulty, &rdev->flags) && 3010 !test_bit(ExternalBbl, &rdev->flags) && 3011 rdev->badblocks.unacked_exist) { 3012 /* metadata handler doesn't understand badblocks, 3013 * so we need to fail the device 3014 */ 3015 md_error(rdev->mddev, rdev); 3016 } 3017 clear_bit(Blocked, &rdev->flags); 3018 clear_bit(BlockedBadBlocks, &rdev->flags); 3019 wake_up(&rdev->blocked_wait); 3020 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3021 3022 err = 0; 3023 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 3024 set_bit(In_sync, &rdev->flags); 3025 err = 0; 3026 } else if (cmd_match(buf, "failfast")) { 3027 set_bit(FailFast, &rdev->flags); 3028 need_update_sb = true; 3029 err = 0; 3030 } else if (cmd_match(buf, "-failfast")) { 3031 clear_bit(FailFast, &rdev->flags); 3032 need_update_sb = true; 3033 err = 0; 3034 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && 3035 !test_bit(Journal, &rdev->flags)) { 3036 if (rdev->mddev->pers == NULL) { 3037 clear_bit(In_sync, &rdev->flags); 3038 rdev->saved_raid_disk = rdev->raid_disk; 3039 rdev->raid_disk = -1; 3040 err = 0; 3041 } 3042 } else if (cmd_match(buf, "write_error")) { 3043 set_bit(WriteErrorSeen, &rdev->flags); 3044 err = 0; 3045 } else if (cmd_match(buf, "-write_error")) { 3046 clear_bit(WriteErrorSeen, &rdev->flags); 3047 err = 0; 3048 } else if (cmd_match(buf, "want_replacement")) { 3049 /* Any non-spare device that is not a replacement can 3050 * become want_replacement at any time, but we then need to 3051 * check if recovery is needed. 3052 */ 3053 if (rdev->raid_disk >= 0 && 3054 !test_bit(Journal, &rdev->flags) && 3055 !test_bit(Replacement, &rdev->flags)) 3056 set_bit(WantReplacement, &rdev->flags); 3057 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3058 err = 0; 3059 } else if (cmd_match(buf, "-want_replacement")) { 3060 /* Clearing 'want_replacement' is always allowed. 3061 * Once replacements starts it is too late though. 3062 */ 3063 err = 0; 3064 clear_bit(WantReplacement, &rdev->flags); 3065 } else if (cmd_match(buf, "replacement")) { 3066 /* Can only set a device as a replacement when array has not 3067 * yet been started. Once running, replacement is automatic 3068 * from spares, or by assigning 'slot'. 
3069 */ 3070 if (rdev->mddev->pers) 3071 err = -EBUSY; 3072 else { 3073 set_bit(Replacement, &rdev->flags); 3074 err = 0; 3075 } 3076 } else if (cmd_match(buf, "-replacement")) { 3077 /* Similarly, can only clear Replacement before start */ 3078 if (rdev->mddev->pers) 3079 err = -EBUSY; 3080 else { 3081 clear_bit(Replacement, &rdev->flags); 3082 err = 0; 3083 } 3084 } else if (cmd_match(buf, "re-add")) { 3085 if (!rdev->mddev->pers) 3086 err = -EINVAL; 3087 else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) && 3088 rdev->saved_raid_disk >= 0) { 3089 /* clear_bit is performed _after_ all the devices 3090 * have their local Faulty bit cleared. If any writes 3091 * happen in the meantime in the local node, they 3092 * will land in the local bitmap, which will be synced 3093 * by this node eventually 3094 */ 3095 if (!mddev_is_clustered(rdev->mddev) || 3096 (err = mddev->cluster_ops->gather_bitmaps(rdev)) == 0) { 3097 clear_bit(Faulty, &rdev->flags); 3098 err = add_bound_rdev(rdev); 3099 } 3100 } else 3101 err = -EBUSY; 3102 } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) { 3103 set_bit(ExternalBbl, &rdev->flags); 3104 rdev->badblocks.shift = 0; 3105 err = 0; 3106 } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) { 3107 clear_bit(ExternalBbl, &rdev->flags); 3108 err = 0; 3109 } 3110 if (need_update_sb) 3111 md_update_sb(mddev, 1); 3112 if (!err) 3113 sysfs_notify_dirent_safe(rdev->sysfs_state); 3114 return err ? err : len; 3115 } 3116 static struct rdev_sysfs_entry rdev_state = 3117 __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store); 3118 3119 static ssize_t 3120 errors_show(struct md_rdev *rdev, char *page) 3121 { 3122 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 3123 } 3124 3125 static ssize_t 3126 errors_store(struct md_rdev *rdev, const char *buf, size_t len) 3127 { 3128 unsigned int n; 3129 int rv; 3130 3131 rv = kstrtouint(buf, 10, &n); 3132 if (rv < 0) 3133 return rv; 3134 atomic_set(&rdev->corrected_errors, n); 3135 return len; 3136 } 3137 static struct rdev_sysfs_entry rdev_errors = 3138 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 3139 3140 static ssize_t 3141 slot_show(struct md_rdev *rdev, char *page) 3142 { 3143 if (test_bit(Journal, &rdev->flags)) 3144 return sprintf(page, "journal\n"); 3145 else if (rdev->raid_disk < 0) 3146 return sprintf(page, "none\n"); 3147 else 3148 return sprintf(page, "%d\n", rdev->raid_disk); 3149 } 3150 3151 static ssize_t 3152 slot_store(struct md_rdev *rdev, const char *buf, size_t len) 3153 { 3154 int slot; 3155 int err; 3156 3157 if (test_bit(Journal, &rdev->flags)) 3158 return -EBUSY; 3159 if (strncmp(buf, "none", 4)==0) 3160 slot = -1; 3161 else { 3162 err = kstrtouint(buf, 10, (unsigned int *)&slot); 3163 if (err < 0) 3164 return err; 3165 if (slot < 0) 3166 /* overflow */ 3167 return -ENOSPC; 3168 } 3169 if (rdev->mddev->pers && slot == -1) { 3170 /* Setting 'slot' on an active array requires also 3171 * updating the 'rd%d' link, and communicating 3172 * with the personality with ->hot_*_disk. 3173 * For now we only support removing 3174 * failed/spare devices. This normally happens automatically, 3175 * but not when the metadata is externally managed. 
3176 */ 3177 if (rdev->raid_disk == -1) 3178 return -EEXIST; 3179 /* personality does all needed checks */ 3180 if (rdev->mddev->pers->hot_remove_disk == NULL) 3181 return -EINVAL; 3182 clear_bit(Blocked, &rdev->flags); 3183 remove_and_add_spares(rdev->mddev, rdev); 3184 if (rdev->raid_disk >= 0) 3185 return -EBUSY; 3186 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3187 } else if (rdev->mddev->pers) { 3188 /* Activating a spare .. or possibly reactivating 3189 * if we ever get bitmaps working here. 3190 */ 3191 int err; 3192 3193 if (rdev->raid_disk != -1) 3194 return -EBUSY; 3195 3196 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) 3197 return -EBUSY; 3198 3199 if (rdev->mddev->pers->hot_add_disk == NULL) 3200 return -EINVAL; 3201 3202 if (slot >= rdev->mddev->raid_disks && 3203 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 3204 return -ENOSPC; 3205 3206 rdev->raid_disk = slot; 3207 if (test_bit(In_sync, &rdev->flags)) 3208 rdev->saved_raid_disk = slot; 3209 else 3210 rdev->saved_raid_disk = -1; 3211 clear_bit(In_sync, &rdev->flags); 3212 clear_bit(Bitmap_sync, &rdev->flags); 3213 err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev); 3214 if (err) { 3215 rdev->raid_disk = -1; 3216 return err; 3217 } else 3218 sysfs_notify_dirent_safe(rdev->sysfs_state); 3219 /* failure here is OK */; 3220 sysfs_link_rdev(rdev->mddev, rdev); 3221 /* don't wakeup anyone, leave that to userspace. */ 3222 } else { 3223 if (slot >= rdev->mddev->raid_disks && 3224 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 3225 return -ENOSPC; 3226 rdev->raid_disk = slot; 3227 /* assume it is working */ 3228 clear_bit(Faulty, &rdev->flags); 3229 clear_bit(WriteMostly, &rdev->flags); 3230 set_bit(In_sync, &rdev->flags); 3231 sysfs_notify_dirent_safe(rdev->sysfs_state); 3232 } 3233 return len; 3234 } 3235 3236 static struct rdev_sysfs_entry rdev_slot = 3237 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 3238 3239 static ssize_t 3240 offset_show(struct md_rdev *rdev, char *page) 3241 { 3242 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 3243 } 3244 3245 static ssize_t 3246 offset_store(struct md_rdev *rdev, const char *buf, size_t len) 3247 { 3248 unsigned long long offset; 3249 if (kstrtoull(buf, 10, &offset) < 0) 3250 return -EINVAL; 3251 if (rdev->mddev->pers && rdev->raid_disk >= 0) 3252 return -EBUSY; 3253 if (rdev->sectors && rdev->mddev->external) 3254 /* Must set offset before size, so overlap checks 3255 * can be sane */ 3256 return -EBUSY; 3257 rdev->data_offset = offset; 3258 rdev->new_data_offset = offset; 3259 return len; 3260 } 3261 3262 static struct rdev_sysfs_entry rdev_offset = 3263 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 3264 3265 static ssize_t new_offset_show(struct md_rdev *rdev, char *page) 3266 { 3267 return sprintf(page, "%llu\n", 3268 (unsigned long long)rdev->new_data_offset); 3269 } 3270 3271 static ssize_t new_offset_store(struct md_rdev *rdev, 3272 const char *buf, size_t len) 3273 { 3274 unsigned long long new_offset; 3275 struct mddev *mddev = rdev->mddev; 3276 3277 if (kstrtoull(buf, 10, &new_offset) < 0) 3278 return -EINVAL; 3279 3280 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3281 return -EBUSY; 3282 if (new_offset == rdev->data_offset) 3283 /* reset is always permitted */ 3284 ; 3285 else if (new_offset > rdev->data_offset) { 3286 /* must not push array size beyond rdev_sectors */ 3287 if (new_offset - rdev->data_offset 3288 + mddev->dev_sectors > rdev->sectors) 3289 return -E2BIG; 
3290 } 3291 /* Metadata worries about other space details. */ 3292 3293 /* decreasing the offset is inconsistent with a backwards 3294 * reshape. 3295 */ 3296 if (new_offset < rdev->data_offset && 3297 mddev->reshape_backwards) 3298 return -EINVAL; 3299 /* Increasing offset is inconsistent with forwards 3300 * reshape. reshape_direction should be set to 3301 * 'backwards' first. 3302 */ 3303 if (new_offset > rdev->data_offset && 3304 !mddev->reshape_backwards) 3305 return -EINVAL; 3306 3307 if (mddev->pers && mddev->persistent && 3308 !super_types[mddev->major_version] 3309 .allow_new_offset(rdev, new_offset)) 3310 return -E2BIG; 3311 rdev->new_data_offset = new_offset; 3312 if (new_offset > rdev->data_offset) 3313 mddev->reshape_backwards = 1; 3314 else if (new_offset < rdev->data_offset) 3315 mddev->reshape_backwards = 0; 3316 3317 return len; 3318 } 3319 static struct rdev_sysfs_entry rdev_new_offset = 3320 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); 3321 3322 static ssize_t 3323 rdev_size_show(struct md_rdev *rdev, char *page) 3324 { 3325 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); 3326 } 3327 3328 static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b) 3329 { 3330 /* check if two start/length pairs overlap */ 3331 if (a->data_offset + a->sectors <= b->data_offset) 3332 return false; 3333 if (b->data_offset + b->sectors <= a->data_offset) 3334 return false; 3335 return true; 3336 } 3337 3338 static bool md_rdev_overlaps(struct md_rdev *rdev) 3339 { 3340 struct mddev *mddev; 3341 struct md_rdev *rdev2; 3342 3343 spin_lock(&all_mddevs_lock); 3344 list_for_each_entry(mddev, &all_mddevs, all_mddevs) { 3345 if (test_bit(MD_DELETED, &mddev->flags)) 3346 continue; 3347 rdev_for_each(rdev2, mddev) { 3348 if (rdev != rdev2 && rdev->bdev == rdev2->bdev && 3349 md_rdevs_overlap(rdev, rdev2)) { 3350 spin_unlock(&all_mddevs_lock); 3351 return true; 3352 } 3353 } 3354 } 3355 spin_unlock(&all_mddevs_lock); 3356 return false; 3357 } 3358 3359 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) 3360 { 3361 unsigned long long blocks; 3362 sector_t new; 3363 3364 if (kstrtoull(buf, 10, &blocks) < 0) 3365 return -EINVAL; 3366 3367 if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) 3368 return -EINVAL; /* sector conversion overflow */ 3369 3370 new = blocks * 2; 3371 if (new != blocks * 2) 3372 return -EINVAL; /* unsigned long long to sector_t overflow */ 3373 3374 *sectors = new; 3375 return 0; 3376 } 3377 3378 static ssize_t 3379 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) 3380 { 3381 struct mddev *my_mddev = rdev->mddev; 3382 sector_t oldsectors = rdev->sectors; 3383 sector_t sectors; 3384 3385 if (test_bit(Journal, &rdev->flags)) 3386 return -EBUSY; 3387 if (strict_blocks_to_sectors(buf, §ors) < 0) 3388 return -EINVAL; 3389 if (rdev->data_offset != rdev->new_data_offset) 3390 return -EINVAL; /* too confusing */ 3391 if (my_mddev->pers && rdev->raid_disk >= 0) { 3392 if (my_mddev->persistent) { 3393 sectors = super_types[my_mddev->major_version]. 
3394 rdev_size_change(rdev, sectors); 3395 if (!sectors) 3396 return -EBUSY; 3397 } else if (!sectors) 3398 sectors = bdev_nr_sectors(rdev->bdev) - 3399 rdev->data_offset; 3400 if (!my_mddev->pers->resize) 3401 /* Cannot change size for RAID0 or Linear etc */ 3402 return -EINVAL; 3403 } 3404 if (sectors < my_mddev->dev_sectors) 3405 return -EINVAL; /* component must fit device */ 3406 3407 rdev->sectors = sectors; 3408 3409 /* 3410 * Check that all other rdevs with the same bdev do not overlap. This 3411 * check does not provide a hard guarantee, it just helps avoid 3412 * dangerous mistakes. 3413 */ 3414 if (sectors > oldsectors && my_mddev->external && 3415 md_rdev_overlaps(rdev)) { 3416 /* 3417 * Someone else could have slipped in a size change here, but 3418 * doing so is just silly. We put oldsectors back because we 3419 * know it is safe, and trust userspace not to race with itself. 3420 */ 3421 rdev->sectors = oldsectors; 3422 return -EBUSY; 3423 } 3424 return len; 3425 } 3426 3427 static struct rdev_sysfs_entry rdev_size = 3428 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 3429 3430 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) 3431 { 3432 unsigned long long recovery_start = rdev->recovery_offset; 3433 3434 if (test_bit(In_sync, &rdev->flags) || 3435 recovery_start == MaxSector) 3436 return sprintf(page, "none\n"); 3437 3438 return sprintf(page, "%llu\n", recovery_start); 3439 } 3440 3441 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) 3442 { 3443 unsigned long long recovery_start; 3444 3445 if (cmd_match(buf, "none")) 3446 recovery_start = MaxSector; 3447 else if (kstrtoull(buf, 10, &recovery_start)) 3448 return -EINVAL; 3449 3450 if (rdev->mddev->pers && 3451 rdev->raid_disk >= 0) 3452 return -EBUSY; 3453 3454 rdev->recovery_offset = recovery_start; 3455 if (recovery_start == MaxSector) 3456 set_bit(In_sync, &rdev->flags); 3457 else 3458 clear_bit(In_sync, &rdev->flags); 3459 return len; 3460 } 3461 3462 static struct rdev_sysfs_entry rdev_recovery_start = 3463 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); 3464 3465 /* sysfs access to bad-blocks list. 3466 * We present two files. 3467 * 'bad-blocks' lists sector numbers and lengths of ranges that 3468 * are recorded as bad. The list is truncated to fit within 3469 * the one-page limit of sysfs. 3470 * Writing "sector length" to this file adds an acknowledged 3471 * bad block list. 3472 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet 3473 * been acknowledged. Writing to this file adds bad blocks 3474 * without acknowledging them. This is largely for testing. 
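 * For example, writing "2048 8" to 'bad_blocks' records an acknowledged 8-sector bad range starting at sector 2048.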
3475 */ 3476 static ssize_t bb_show(struct md_rdev *rdev, char *page) 3477 { 3478 return badblocks_show(&rdev->badblocks, page, 0); 3479 } 3480 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) 3481 { 3482 int rv = badblocks_store(&rdev->badblocks, page, len, 0); 3483 /* Maybe that ack was all we needed */ 3484 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) 3485 wake_up(&rdev->blocked_wait); 3486 return rv; 3487 } 3488 static struct rdev_sysfs_entry rdev_bad_blocks = 3489 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); 3490 3491 static ssize_t ubb_show(struct md_rdev *rdev, char *page) 3492 { 3493 return badblocks_show(&rdev->badblocks, page, 1); 3494 } 3495 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) 3496 { 3497 return badblocks_store(&rdev->badblocks, page, len, 1); 3498 } 3499 static struct rdev_sysfs_entry rdev_unack_bad_blocks = 3500 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); 3501 3502 static ssize_t 3503 ppl_sector_show(struct md_rdev *rdev, char *page) 3504 { 3505 return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector); 3506 } 3507 3508 static ssize_t 3509 ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len) 3510 { 3511 unsigned long long sector; 3512 3513 if (kstrtoull(buf, 10, §or) < 0) 3514 return -EINVAL; 3515 if (sector != (sector_t)sector) 3516 return -EINVAL; 3517 3518 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && 3519 rdev->raid_disk >= 0) 3520 return -EBUSY; 3521 3522 if (rdev->mddev->persistent) { 3523 if (rdev->mddev->major_version == 0) 3524 return -EINVAL; 3525 if ((sector > rdev->sb_start && 3526 sector - rdev->sb_start > S16_MAX) || 3527 (sector < rdev->sb_start && 3528 rdev->sb_start - sector > -S16_MIN)) 3529 return -EINVAL; 3530 rdev->ppl.offset = sector - rdev->sb_start; 3531 } else if (!rdev->mddev->external) { 3532 return -EBUSY; 3533 } 3534 rdev->ppl.sector = sector; 3535 return len; 3536 } 3537 3538 static struct rdev_sysfs_entry rdev_ppl_sector = 3539 __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store); 3540 3541 static ssize_t 3542 ppl_size_show(struct md_rdev *rdev, char *page) 3543 { 3544 return sprintf(page, "%u\n", rdev->ppl.size); 3545 } 3546 3547 static ssize_t 3548 ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len) 3549 { 3550 unsigned int size; 3551 3552 if (kstrtouint(buf, 10, &size) < 0) 3553 return -EINVAL; 3554 3555 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && 3556 rdev->raid_disk >= 0) 3557 return -EBUSY; 3558 3559 if (rdev->mddev->persistent) { 3560 if (rdev->mddev->major_version == 0) 3561 return -EINVAL; 3562 if (size > U16_MAX) 3563 return -EINVAL; 3564 } else if (!rdev->mddev->external) { 3565 return -EBUSY; 3566 } 3567 rdev->ppl.size = size; 3568 return len; 3569 } 3570 3571 static struct rdev_sysfs_entry rdev_ppl_size = 3572 __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store); 3573 3574 static struct attribute *rdev_default_attrs[] = { 3575 &rdev_state.attr, 3576 &rdev_errors.attr, 3577 &rdev_slot.attr, 3578 &rdev_offset.attr, 3579 &rdev_new_offset.attr, 3580 &rdev_size.attr, 3581 &rdev_recovery_start.attr, 3582 &rdev_bad_blocks.attr, 3583 &rdev_unack_bad_blocks.attr, 3584 &rdev_ppl_sector.attr, 3585 &rdev_ppl_size.attr, 3586 NULL, 3587 }; 3588 ATTRIBUTE_GROUPS(rdev_default); 3589 static ssize_t 3590 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3591 { 3592 struct rdev_sysfs_entry 
*entry = container_of(attr, struct rdev_sysfs_entry, attr); 3593 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3594 3595 if (!entry->show) 3596 return -EIO; 3597 if (!rdev->mddev) 3598 return -ENODEV; 3599 return entry->show(rdev, page); 3600 } 3601 3602 static ssize_t 3603 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 3604 const char *page, size_t length) 3605 { 3606 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3607 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3608 struct kernfs_node *kn = NULL; 3609 bool suspend = false; 3610 ssize_t rv; 3611 struct mddev *mddev = READ_ONCE(rdev->mddev); 3612 3613 if (!entry->store) 3614 return -EIO; 3615 if (!capable(CAP_SYS_ADMIN)) 3616 return -EACCES; 3617 if (!mddev) 3618 return -ENODEV; 3619 3620 if (entry->store == state_store) { 3621 if (cmd_match(page, "remove")) 3622 kn = sysfs_break_active_protection(kobj, attr); 3623 if (cmd_match(page, "remove") || cmd_match(page, "re-add") || 3624 cmd_match(page, "writemostly") || 3625 cmd_match(page, "-writemostly")) 3626 suspend = true; 3627 } 3628 3629 rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev); 3630 if (!rv) { 3631 if (rdev->mddev == NULL) 3632 rv = -ENODEV; 3633 else 3634 rv = entry->store(rdev, page, length); 3635 suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); 3636 } 3637 3638 if (kn) 3639 sysfs_unbreak_active_protection(kn); 3640 3641 return rv; 3642 } 3643 3644 static void rdev_free(struct kobject *ko) 3645 { 3646 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); 3647 kfree(rdev); 3648 } 3649 static const struct sysfs_ops rdev_sysfs_ops = { 3650 .show = rdev_attr_show, 3651 .store = rdev_attr_store, 3652 }; 3653 static const struct kobj_type rdev_ktype = { 3654 .release = rdev_free, 3655 .sysfs_ops = &rdev_sysfs_ops, 3656 .default_groups = rdev_default_groups, 3657 }; 3658 3659 int md_rdev_init(struct md_rdev *rdev) 3660 { 3661 rdev->desc_nr = -1; 3662 rdev->saved_raid_disk = -1; 3663 rdev->raid_disk = -1; 3664 rdev->flags = 0; 3665 rdev->data_offset = 0; 3666 rdev->new_data_offset = 0; 3667 rdev->sb_events = 0; 3668 rdev->last_read_error = 0; 3669 rdev->sb_loaded = 0; 3670 rdev->bb_page = NULL; 3671 atomic_set(&rdev->nr_pending, 0); 3672 atomic_set(&rdev->read_errors, 0); 3673 atomic_set(&rdev->corrected_errors, 0); 3674 3675 INIT_LIST_HEAD(&rdev->same_set); 3676 init_waitqueue_head(&rdev->blocked_wait); 3677 3678 /* Add space to store bad block list. 3679 * This reserves the space even on arrays where it cannot 3680 * be used - I wonder if that matters 3681 */ 3682 return badblocks_init(&rdev->badblocks, 0); 3683 } 3684 EXPORT_SYMBOL_GPL(md_rdev_init); 3685 3686 /* 3687 * Import a device. If 'super_format' >= 0, then sanity check the superblock 3688 * 3689 * mark the device faulty if: 3690 * 3691 * - the device is nonexistent (zero size) 3692 * - the device has no valid superblock 3693 * 3694 * a faulty rdev _never_ has rdev->sb set. 
3695 */ 3696 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) 3697 { 3698 struct md_rdev *rdev; 3699 sector_t size; 3700 int err; 3701 3702 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 3703 if (!rdev) 3704 return ERR_PTR(-ENOMEM); 3705 3706 err = md_rdev_init(rdev); 3707 if (err) 3708 goto out_free_rdev; 3709 err = alloc_disk_sb(rdev); 3710 if (err) 3711 goto out_clear_rdev; 3712 3713 rdev->bdev_file = bdev_file_open_by_dev(newdev, 3714 BLK_OPEN_READ | BLK_OPEN_WRITE, 3715 super_format == -2 ? &claim_rdev : rdev, NULL); 3716 if (IS_ERR(rdev->bdev_file)) { 3717 pr_warn("md: could not open device unknown-block(%u,%u).\n", 3718 MAJOR(newdev), MINOR(newdev)); 3719 err = PTR_ERR(rdev->bdev_file); 3720 goto out_clear_rdev; 3721 } 3722 rdev->bdev = file_bdev(rdev->bdev_file); 3723 3724 kobject_init(&rdev->kobj, &rdev_ktype); 3725 3726 size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS; 3727 if (!size) { 3728 pr_warn("md: %pg has zero or unknown size, marking faulty!\n", 3729 rdev->bdev); 3730 err = -EINVAL; 3731 goto out_blkdev_put; 3732 } 3733 3734 if (super_format >= 0) { 3735 err = super_types[super_format]. 3736 load_super(rdev, NULL, super_minor); 3737 if (err == -EINVAL) { 3738 pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n", 3739 rdev->bdev, 3740 super_format, super_minor); 3741 goto out_blkdev_put; 3742 } 3743 if (err < 0) { 3744 pr_warn("md: could not read %pg's sb, not importing!\n", 3745 rdev->bdev); 3746 goto out_blkdev_put; 3747 } 3748 } 3749 3750 return rdev; 3751 3752 out_blkdev_put: 3753 fput(rdev->bdev_file); 3754 out_clear_rdev: 3755 md_rdev_clear(rdev); 3756 out_free_rdev: 3757 kfree(rdev); 3758 return ERR_PTR(err); 3759 } 3760 3761 /* 3762 * Check a full RAID array for plausibility 3763 */ 3764 3765 static int analyze_sbs(struct mddev *mddev) 3766 { 3767 int i; 3768 struct md_rdev *rdev, *freshest, *tmp; 3769 3770 freshest = NULL; 3771 rdev_for_each_safe(rdev, tmp, mddev) 3772 switch (super_types[mddev->major_version]. 3773 load_super(rdev, freshest, mddev->minor_version)) { 3774 case 1: 3775 freshest = rdev; 3776 break; 3777 case 0: 3778 break; 3779 default: 3780 pr_warn("md: fatal superblock inconsistency in %pg -- removing from array\n", 3781 rdev->bdev); 3782 md_kick_rdev_from_array(rdev); 3783 } 3784 3785 /* Cannot find a valid fresh disk */ 3786 if (!freshest) { 3787 pr_warn("md: cannot find a valid disk\n"); 3788 return -EINVAL; 3789 } 3790 3791 super_types[mddev->major_version]. 3792 validate_super(mddev, NULL/*freshest*/, freshest); 3793 3794 i = 0; 3795 rdev_for_each_safe(rdev, tmp, mddev) { 3796 if (mddev->max_disks && 3797 (rdev->desc_nr >= mddev->max_disks || 3798 i > mddev->max_disks)) { 3799 pr_warn("md: %s: %pg: only %d devices permitted\n", 3800 mdname(mddev), rdev->bdev, 3801 mddev->max_disks); 3802 md_kick_rdev_from_array(rdev); 3803 continue; 3804 } 3805 if (rdev != freshest) { 3806 if (super_types[mddev->major_version]. 3807 validate_super(mddev, freshest, rdev)) { 3808 pr_warn("md: kicking non-fresh %pg from array!\n", 3809 rdev->bdev); 3810 md_kick_rdev_from_array(rdev); 3811 continue; 3812 } 3813 } 3814 if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks)) && 3815 !test_bit(Journal, &rdev->flags)) { 3816 rdev->raid_disk = -1; 3817 clear_bit(In_sync, &rdev->flags); 3818 } 3819 } 3820 3821 return 0; 3822 } 3823 3824 /* Read a fixed-point number. 3825 * Numbers in sysfs attributes should be in "standard" units where 3826 * possible, so time should be in seconds. 
3827 * However we internally use a a much smaller unit such as 3828 * milliseconds or jiffies. 3829 * This function takes a decimal number with a possible fractional 3830 * component, and produces an integer which is the result of 3831 * multiplying that number by 10^'scale'. 3832 * all without any floating-point arithmetic. 3833 */ 3834 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) 3835 { 3836 unsigned long result = 0; 3837 long decimals = -1; 3838 while (isdigit(*cp) || (*cp == '.' && decimals < 0)) { 3839 if (*cp == '.') 3840 decimals = 0; 3841 else if (decimals < scale) { 3842 unsigned int value; 3843 value = *cp - '0'; 3844 result = result * 10 + value; 3845 if (decimals >= 0) 3846 decimals++; 3847 } 3848 cp++; 3849 } 3850 if (*cp == '\n') 3851 cp++; 3852 if (*cp) 3853 return -EINVAL; 3854 if (decimals < 0) 3855 decimals = 0; 3856 *res = result * int_pow(10, scale - decimals); 3857 return 0; 3858 } 3859 3860 static ssize_t 3861 safe_delay_show(struct mddev *mddev, char *page) 3862 { 3863 unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ; 3864 3865 return sprintf(page, "%u.%03u\n", msec/1000, msec%1000); 3866 } 3867 static ssize_t 3868 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) 3869 { 3870 unsigned long msec; 3871 3872 if (mddev_is_clustered(mddev)) { 3873 pr_warn("md: Safemode is disabled for clustered mode\n"); 3874 return -EINVAL; 3875 } 3876 3877 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ) 3878 return -EINVAL; 3879 if (msec == 0) 3880 mddev->safemode_delay = 0; 3881 else { 3882 unsigned long old_delay = mddev->safemode_delay; 3883 unsigned long new_delay = (msec*HZ)/1000; 3884 3885 if (new_delay == 0) 3886 new_delay = 1; 3887 mddev->safemode_delay = new_delay; 3888 if (new_delay < old_delay || old_delay == 0) 3889 mod_timer(&mddev->safemode_timer, jiffies+1); 3890 } 3891 return len; 3892 } 3893 static struct md_sysfs_entry md_safe_delay = 3894 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 3895 3896 static ssize_t 3897 level_show(struct mddev *mddev, char *page) 3898 { 3899 struct md_personality *p; 3900 int ret; 3901 spin_lock(&mddev->lock); 3902 p = mddev->pers; 3903 if (p) 3904 ret = sprintf(page, "%s\n", p->head.name); 3905 else if (mddev->clevel[0]) 3906 ret = sprintf(page, "%s\n", mddev->clevel); 3907 else if (mddev->level != LEVEL_NONE) 3908 ret = sprintf(page, "%d\n", mddev->level); 3909 else 3910 ret = 0; 3911 spin_unlock(&mddev->lock); 3912 return ret; 3913 } 3914 3915 static ssize_t 3916 level_store(struct mddev *mddev, const char *buf, size_t len) 3917 { 3918 char clevel[16]; 3919 ssize_t rv; 3920 size_t slen = len; 3921 struct md_personality *pers, *oldpers; 3922 long level; 3923 void *priv, *oldpriv; 3924 struct md_rdev *rdev; 3925 3926 if (slen == 0 || slen >= sizeof(clevel)) 3927 return -EINVAL; 3928 3929 rv = mddev_suspend_and_lock(mddev); 3930 if (rv) 3931 return rv; 3932 3933 if (mddev->pers == NULL) { 3934 memcpy(mddev->clevel, buf, slen); 3935 if (mddev->clevel[slen-1] == '\n') 3936 slen--; 3937 mddev->clevel[slen] = 0; 3938 mddev->level = LEVEL_NONE; 3939 rv = len; 3940 goto out_unlock; 3941 } 3942 rv = -EROFS; 3943 if (!md_is_rdwr(mddev)) 3944 goto out_unlock; 3945 3946 /* request to change the personality. Need to ensure: 3947 * - array is not engaged in resync/recovery/reshape 3948 * - old personality can be suspended 3949 * - new personality will access other array. 
3950 */ 3951 3952 rv = -EBUSY; 3953 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3954 mddev->reshape_position != MaxSector || 3955 mddev->sysfs_active) 3956 goto out_unlock; 3957 3958 rv = -EINVAL; 3959 if (!mddev->pers->quiesce) { 3960 pr_warn("md: %s: %s does not support online personality change\n", 3961 mdname(mddev), mddev->pers->head.name); 3962 goto out_unlock; 3963 } 3964 3965 /* Now find the new personality */ 3966 memcpy(clevel, buf, slen); 3967 if (clevel[slen-1] == '\n') 3968 slen--; 3969 clevel[slen] = 0; 3970 if (kstrtol(clevel, 10, &level)) 3971 level = LEVEL_NONE; 3972 3973 if (request_module("md-%s", clevel) != 0) 3974 request_module("md-level-%s", clevel); 3975 pers = get_pers(level, clevel); 3976 if (!pers) { 3977 rv = -EINVAL; 3978 goto out_unlock; 3979 } 3980 3981 if (pers == mddev->pers) { 3982 /* Nothing to do! */ 3983 put_pers(pers); 3984 rv = len; 3985 goto out_unlock; 3986 } 3987 if (!pers->takeover) { 3988 put_pers(pers); 3989 pr_warn("md: %s: %s does not support personality takeover\n", 3990 mdname(mddev), clevel); 3991 rv = -EINVAL; 3992 goto out_unlock; 3993 } 3994 3995 rdev_for_each(rdev, mddev) 3996 rdev->new_raid_disk = rdev->raid_disk; 3997 3998 /* ->takeover must set new_* and/or delta_disks 3999 * if it succeeds, and may set them when it fails. 4000 */ 4001 priv = pers->takeover(mddev); 4002 if (IS_ERR(priv)) { 4003 mddev->new_level = mddev->level; 4004 mddev->new_layout = mddev->layout; 4005 mddev->new_chunk_sectors = mddev->chunk_sectors; 4006 mddev->raid_disks -= mddev->delta_disks; 4007 mddev->delta_disks = 0; 4008 mddev->reshape_backwards = 0; 4009 put_pers(pers); 4010 pr_warn("md: %s: %s would not accept array\n", 4011 mdname(mddev), clevel); 4012 rv = PTR_ERR(priv); 4013 goto out_unlock; 4014 } 4015 4016 /* Looks like we have a winner */ 4017 mddev_detach(mddev); 4018 4019 spin_lock(&mddev->lock); 4020 oldpers = mddev->pers; 4021 oldpriv = mddev->private; 4022 mddev->pers = pers; 4023 mddev->private = priv; 4024 strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel)); 4025 mddev->level = mddev->new_level; 4026 mddev->layout = mddev->new_layout; 4027 mddev->chunk_sectors = mddev->new_chunk_sectors; 4028 mddev->delta_disks = 0; 4029 mddev->reshape_backwards = 0; 4030 mddev->degraded = 0; 4031 spin_unlock(&mddev->lock); 4032 4033 if (oldpers->sync_request == NULL && 4034 mddev->external) { 4035 /* We are converting from a no-redundancy array 4036 * to a redundancy array and metadata is managed 4037 * externally so we need to be sure that writes 4038 * won't block due to a need to transition 4039 * clean->dirty 4040 * until external management is started. 
4041 */ 4042 mddev->in_sync = 0; 4043 mddev->safemode_delay = 0; 4044 mddev->safemode = 0; 4045 } 4046 4047 oldpers->free(mddev, oldpriv); 4048 4049 if (oldpers->sync_request == NULL && 4050 pers->sync_request != NULL) { 4051 /* need to add the md_redundancy_group */ 4052 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 4053 pr_warn("md: cannot register extra attributes for %s\n", 4054 mdname(mddev)); 4055 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 4056 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); 4057 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); 4058 } 4059 if (oldpers->sync_request != NULL && 4060 pers->sync_request == NULL) { 4061 /* need to remove the md_redundancy_group */ 4062 if (mddev->to_remove == NULL) 4063 mddev->to_remove = &md_redundancy_group; 4064 } 4065 4066 put_pers(oldpers); 4067 4068 rdev_for_each(rdev, mddev) { 4069 if (rdev->raid_disk < 0) 4070 continue; 4071 if (rdev->new_raid_disk >= mddev->raid_disks) 4072 rdev->new_raid_disk = -1; 4073 if (rdev->new_raid_disk == rdev->raid_disk) 4074 continue; 4075 sysfs_unlink_rdev(mddev, rdev); 4076 } 4077 rdev_for_each(rdev, mddev) { 4078 if (rdev->raid_disk < 0) 4079 continue; 4080 if (rdev->new_raid_disk == rdev->raid_disk) 4081 continue; 4082 rdev->raid_disk = rdev->new_raid_disk; 4083 if (rdev->raid_disk < 0) 4084 clear_bit(In_sync, &rdev->flags); 4085 else { 4086 if (sysfs_link_rdev(mddev, rdev)) 4087 pr_warn("md: cannot register rd%d for %s after level change\n", 4088 rdev->raid_disk, mdname(mddev)); 4089 } 4090 } 4091 4092 if (pers->sync_request == NULL) { 4093 /* this is now an array without redundancy, so 4094 * it must always be in_sync 4095 */ 4096 mddev->in_sync = 1; 4097 timer_delete_sync(&mddev->safemode_timer); 4098 } 4099 pers->run(mddev); 4100 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 4101 if (!mddev->thread) 4102 md_update_sb(mddev, 1); 4103 sysfs_notify_dirent_safe(mddev->sysfs_level); 4104 md_new_event(); 4105 rv = len; 4106 out_unlock: 4107 mddev_unlock_and_resume(mddev); 4108 return rv; 4109 } 4110 4111 static struct md_sysfs_entry md_level = 4112 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 4113 4114 static ssize_t 4115 new_level_show(struct mddev *mddev, char *page) 4116 { 4117 return sprintf(page, "%d\n", mddev->new_level); 4118 } 4119 4120 static ssize_t 4121 new_level_store(struct mddev *mddev, const char *buf, size_t len) 4122 { 4123 unsigned int n; 4124 int err; 4125 4126 err = kstrtouint(buf, 10, &n); 4127 if (err < 0) 4128 return err; 4129 err = mddev_lock(mddev); 4130 if (err) 4131 return err; 4132 4133 mddev->new_level = n; 4134 md_update_sb(mddev, 1); 4135 4136 mddev_unlock(mddev); 4137 return len; 4138 } 4139 static struct md_sysfs_entry md_new_level = 4140 __ATTR(new_level, 0664, new_level_show, new_level_store); 4141 4142 static ssize_t 4143 layout_show(struct mddev *mddev, char *page) 4144 { 4145 /* just a number, not meaningful for all levels */ 4146 if (mddev->reshape_position != MaxSector && 4147 mddev->layout != mddev->new_layout) 4148 return sprintf(page, "%d (%d)\n", 4149 mddev->new_layout, mddev->layout); 4150 return sprintf(page, "%d\n", mddev->layout); 4151 } 4152 4153 static ssize_t 4154 layout_store(struct mddev *mddev, const char *buf, size_t len) 4155 { 4156 unsigned int n; 4157 int err; 4158 4159 err = kstrtouint(buf, 10, &n); 4160 if (err < 0) 4161 return err; 4162 err = mddev_lock(mddev); 4163 if (err) 4164 return err; 4165 4166 if (mddev->pers) { 4167 if 
(mddev->pers->check_reshape == NULL) 4168 err = -EBUSY; 4169 else if (!md_is_rdwr(mddev)) 4170 err = -EROFS; 4171 else { 4172 mddev->new_layout = n; 4173 err = mddev->pers->check_reshape(mddev); 4174 if (err) 4175 mddev->new_layout = mddev->layout; 4176 } 4177 } else { 4178 mddev->new_layout = n; 4179 if (mddev->reshape_position == MaxSector) 4180 mddev->layout = n; 4181 } 4182 mddev_unlock(mddev); 4183 return err ?: len; 4184 } 4185 static struct md_sysfs_entry md_layout = 4186 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 4187 4188 static ssize_t 4189 raid_disks_show(struct mddev *mddev, char *page) 4190 { 4191 if (mddev->raid_disks == 0) 4192 return 0; 4193 if (mddev->reshape_position != MaxSector && 4194 mddev->delta_disks != 0) 4195 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 4196 mddev->raid_disks - mddev->delta_disks); 4197 return sprintf(page, "%d\n", mddev->raid_disks); 4198 } 4199 4200 static int update_raid_disks(struct mddev *mddev, int raid_disks); 4201 4202 static ssize_t 4203 raid_disks_store(struct mddev *mddev, const char *buf, size_t len) 4204 { 4205 unsigned int n; 4206 int err; 4207 4208 err = kstrtouint(buf, 10, &n); 4209 if (err < 0) 4210 return err; 4211 4212 err = mddev_lock(mddev); 4213 if (err) 4214 return err; 4215 if (mddev->pers) 4216 err = update_raid_disks(mddev, n); 4217 else if (mddev->reshape_position != MaxSector) { 4218 struct md_rdev *rdev; 4219 int olddisks = mddev->raid_disks - mddev->delta_disks; 4220 4221 err = -EINVAL; 4222 rdev_for_each(rdev, mddev) { 4223 if (olddisks < n && 4224 rdev->data_offset < rdev->new_data_offset) 4225 goto out_unlock; 4226 if (olddisks > n && 4227 rdev->data_offset > rdev->new_data_offset) 4228 goto out_unlock; 4229 } 4230 err = 0; 4231 mddev->delta_disks = n - olddisks; 4232 mddev->raid_disks = n; 4233 mddev->reshape_backwards = (mddev->delta_disks < 0); 4234 } else 4235 mddev->raid_disks = n; 4236 out_unlock: 4237 mddev_unlock(mddev); 4238 return err ? 
err : len; 4239 } 4240 static struct md_sysfs_entry md_raid_disks = 4241 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 4242 4243 static ssize_t 4244 uuid_show(struct mddev *mddev, char *page) 4245 { 4246 return sprintf(page, "%pU\n", mddev->uuid); 4247 } 4248 static struct md_sysfs_entry md_uuid = 4249 __ATTR(uuid, S_IRUGO, uuid_show, NULL); 4250 4251 static ssize_t 4252 chunk_size_show(struct mddev *mddev, char *page) 4253 { 4254 if (mddev->reshape_position != MaxSector && 4255 mddev->chunk_sectors != mddev->new_chunk_sectors) 4256 return sprintf(page, "%d (%d)\n", 4257 mddev->new_chunk_sectors << 9, 4258 mddev->chunk_sectors << 9); 4259 return sprintf(page, "%d\n", mddev->chunk_sectors << 9); 4260 } 4261 4262 static ssize_t 4263 chunk_size_store(struct mddev *mddev, const char *buf, size_t len) 4264 { 4265 unsigned long n; 4266 int err; 4267 4268 err = kstrtoul(buf, 10, &n); 4269 if (err < 0) 4270 return err; 4271 4272 err = mddev_lock(mddev); 4273 if (err) 4274 return err; 4275 if (mddev->pers) { 4276 if (mddev->pers->check_reshape == NULL) 4277 err = -EBUSY; 4278 else if (!md_is_rdwr(mddev)) 4279 err = -EROFS; 4280 else { 4281 mddev->new_chunk_sectors = n >> 9; 4282 err = mddev->pers->check_reshape(mddev); 4283 if (err) 4284 mddev->new_chunk_sectors = mddev->chunk_sectors; 4285 } 4286 } else { 4287 mddev->new_chunk_sectors = n >> 9; 4288 if (mddev->reshape_position == MaxSector) 4289 mddev->chunk_sectors = n >> 9; 4290 } 4291 mddev_unlock(mddev); 4292 return err ?: len; 4293 } 4294 static struct md_sysfs_entry md_chunk_size = 4295 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 4296 4297 static ssize_t 4298 resync_start_show(struct mddev *mddev, char *page) 4299 { 4300 if (mddev->recovery_cp == MaxSector) 4301 return sprintf(page, "none\n"); 4302 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 4303 } 4304 4305 static ssize_t 4306 resync_start_store(struct mddev *mddev, const char *buf, size_t len) 4307 { 4308 unsigned long long n; 4309 int err; 4310 4311 if (cmd_match(buf, "none")) 4312 n = MaxSector; 4313 else { 4314 err = kstrtoull(buf, 10, &n); 4315 if (err < 0) 4316 return err; 4317 if (n != (sector_t)n) 4318 return -EINVAL; 4319 } 4320 4321 err = mddev_lock(mddev); 4322 if (err) 4323 return err; 4324 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 4325 err = -EBUSY; 4326 4327 if (!err) { 4328 mddev->recovery_cp = n; 4329 if (mddev->pers) 4330 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 4331 } 4332 mddev_unlock(mddev); 4333 return err ?: len; 4334 } 4335 static struct md_sysfs_entry md_resync_start = 4336 __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, 4337 resync_start_show, resync_start_store); 4338 4339 /* 4340 * The array state can be: 4341 * 4342 * clear 4343 * No devices, no size, no level 4344 * Equivalent to STOP_ARRAY ioctl 4345 * inactive 4346 * May have some settings, but array is not active 4347 * all IO results in error 4348 * When written, doesn't tear down array, but just stops it 4349 * suspended (not supported yet) 4350 * All IO requests will block. The array can be reconfigured. 4351 * Writing this, if accepted, will block until array is quiescent 4352 * readonly 4353 * no resync can happen. no superblocks get written. 4354 * write requests fail 4355 * read-auto 4356 * like readonly, but behaves like 'clean' on a write request. 4357 * 4358 * clean - no pending writes, but otherwise active. 
4359 * When written to inactive array, starts without resync 4360 * If a write request arrives then 4361 * if metadata is known, mark 'dirty' and switch to 'active'. 4362 * if not known, block and switch to write-pending 4363 * If written to an active array that has pending writes, then fails. 4364 * active 4365 * fully active: IO and resync can be happening. 4366 * When written to inactive array, starts with resync 4367 * 4368 * write-pending 4369 * clean, but writes are blocked waiting for 'active' to be written. 4370 * 4371 * active-idle 4372 * like active, but no writes have been seen for a while (100msec). 4373 * 4374 * broken 4375 * Array is failed. It's useful because mounted-arrays aren't stopped 4376 * when array is failed, so this state will at least alert the user that 4377 * something is wrong. 4378 */ 4379 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 4380 write_pending, active_idle, broken, bad_word}; 4381 static char *array_states[] = { 4382 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 4383 "write-pending", "active-idle", "broken", NULL }; 4384 4385 static int match_word(const char *word, char **list) 4386 { 4387 int n; 4388 for (n=0; list[n]; n++) 4389 if (cmd_match(word, list[n])) 4390 break; 4391 return n; 4392 } 4393 4394 static ssize_t 4395 array_state_show(struct mddev *mddev, char *page) 4396 { 4397 enum array_state st = inactive; 4398 4399 if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) { 4400 switch(mddev->ro) { 4401 case MD_RDONLY: 4402 st = readonly; 4403 break; 4404 case MD_AUTO_READ: 4405 st = read_auto; 4406 break; 4407 case MD_RDWR: 4408 spin_lock(&mddev->lock); 4409 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 4410 st = write_pending; 4411 else if (mddev->in_sync) 4412 st = clean; 4413 else if (mddev->safemode) 4414 st = active_idle; 4415 else 4416 st = active; 4417 spin_unlock(&mddev->lock); 4418 } 4419 4420 if (test_bit(MD_BROKEN, &mddev->flags) && st == clean) 4421 st = broken; 4422 } else { 4423 if (list_empty(&mddev->disks) && 4424 mddev->raid_disks == 0 && 4425 mddev->dev_sectors == 0) 4426 st = clear; 4427 else 4428 st = inactive; 4429 } 4430 return sprintf(page, "%s\n", array_states[st]); 4431 } 4432 4433 static int do_md_stop(struct mddev *mddev, int ro); 4434 static int md_set_readonly(struct mddev *mddev); 4435 static int restart_array(struct mddev *mddev); 4436 4437 static ssize_t 4438 array_state_store(struct mddev *mddev, const char *buf, size_t len) 4439 { 4440 int err = 0; 4441 enum array_state st = match_word(buf, array_states); 4442 4443 /* No lock dependent actions */ 4444 switch (st) { 4445 case suspended: /* not supported yet */ 4446 case write_pending: /* cannot be set */ 4447 case active_idle: /* cannot be set */ 4448 case broken: /* cannot be set */ 4449 case bad_word: 4450 return -EINVAL; 4451 case clear: 4452 case readonly: 4453 case inactive: 4454 case read_auto: 4455 if (!mddev->pers || !md_is_rdwr(mddev)) 4456 break; 4457 /* write sysfs will not open mddev and opener should be 0 */ 4458 err = mddev_set_closing_and_sync_blockdev(mddev, 0); 4459 if (err) 4460 return err; 4461 break; 4462 default: 4463 break; 4464 } 4465 4466 if (mddev->pers && (st == active || st == clean) && 4467 mddev->ro != MD_RDONLY) { 4468 /* don't take reconfig_mutex when toggling between 4469 * clean and active 4470 */ 4471 spin_lock(&mddev->lock); 4472 if (st == active) { 4473 restart_array(mddev); 4474 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4475 
md_wakeup_thread(mddev->thread); 4476 wake_up(&mddev->sb_wait); 4477 } else /* st == clean */ { 4478 restart_array(mddev); 4479 if (!set_in_sync(mddev)) 4480 err = -EBUSY; 4481 } 4482 if (!err) 4483 sysfs_notify_dirent_safe(mddev->sysfs_state); 4484 spin_unlock(&mddev->lock); 4485 return err ?: len; 4486 } 4487 err = mddev_lock(mddev); 4488 if (err) 4489 return err; 4490 4491 switch (st) { 4492 case inactive: 4493 /* stop an active array, return 0 otherwise */ 4494 if (mddev->pers) 4495 err = do_md_stop(mddev, 2); 4496 break; 4497 case clear: 4498 err = do_md_stop(mddev, 0); 4499 break; 4500 case readonly: 4501 if (mddev->pers) 4502 err = md_set_readonly(mddev); 4503 else { 4504 mddev->ro = MD_RDONLY; 4505 set_disk_ro(mddev->gendisk, 1); 4506 err = do_md_run(mddev); 4507 } 4508 break; 4509 case read_auto: 4510 if (mddev->pers) { 4511 if (md_is_rdwr(mddev)) 4512 err = md_set_readonly(mddev); 4513 else if (mddev->ro == MD_RDONLY) 4514 err = restart_array(mddev); 4515 if (err == 0) { 4516 mddev->ro = MD_AUTO_READ; 4517 set_disk_ro(mddev->gendisk, 0); 4518 } 4519 } else { 4520 mddev->ro = MD_AUTO_READ; 4521 err = do_md_run(mddev); 4522 } 4523 break; 4524 case clean: 4525 if (mddev->pers) { 4526 err = restart_array(mddev); 4527 if (err) 4528 break; 4529 spin_lock(&mddev->lock); 4530 if (!set_in_sync(mddev)) 4531 err = -EBUSY; 4532 spin_unlock(&mddev->lock); 4533 } else 4534 err = -EINVAL; 4535 break; 4536 case active: 4537 if (mddev->pers) { 4538 err = restart_array(mddev); 4539 if (err) 4540 break; 4541 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4542 wake_up(&mddev->sb_wait); 4543 err = 0; 4544 } else { 4545 mddev->ro = MD_RDWR; 4546 set_disk_ro(mddev->gendisk, 0); 4547 err = do_md_run(mddev); 4548 } 4549 break; 4550 default: 4551 err = -EINVAL; 4552 break; 4553 } 4554 4555 if (!err) { 4556 if (mddev->hold_active == UNTIL_IOCTL) 4557 mddev->hold_active = 0; 4558 sysfs_notify_dirent_safe(mddev->sysfs_state); 4559 } 4560 mddev_unlock(mddev); 4561 4562 if (st == readonly || st == read_auto || st == inactive || 4563 (err && st == clear)) 4564 clear_bit(MD_CLOSING, &mddev->flags); 4565 4566 return err ?: len; 4567 } 4568 static struct md_sysfs_entry md_array_state = 4569 __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 4570 4571 static ssize_t 4572 max_corrected_read_errors_show(struct mddev *mddev, char *page) { 4573 return sprintf(page, "%d\n", 4574 atomic_read(&mddev->max_corr_read_errors)); 4575 } 4576 4577 static ssize_t 4578 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) 4579 { 4580 unsigned int n; 4581 int rv; 4582 4583 rv = kstrtouint(buf, 10, &n); 4584 if (rv < 0) 4585 return rv; 4586 if (n > INT_MAX) 4587 return -EINVAL; 4588 atomic_set(&mddev->max_corr_read_errors, n); 4589 return len; 4590 } 4591 4592 static struct md_sysfs_entry max_corr_read_errors = 4593 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, 4594 max_corrected_read_errors_store); 4595 4596 static ssize_t 4597 null_show(struct mddev *mddev, char *page) 4598 { 4599 return -EINVAL; 4600 } 4601 4602 static ssize_t 4603 new_dev_store(struct mddev *mddev, const char *buf, size_t len) 4604 { 4605 /* buf must be %d:%d\n? giving major and minor numbers */ 4606 /* The new device is added to the array. 4607 * If the array has a persistent superblock, we read the 4608 * superblock to initialise info and check validity. 4609 * Otherwise, only checking done is that in bind_rdev_to_array, 4610 * which mainly checks size. 
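 * For example (values here are illustrative only), writing the string "8:16"
 * -- the major:minor pair of /dev/sdb on a typical system -- imports that
 * device and, if the superblock checks pass, binds it to this array.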
4611 */ 4612 char *e; 4613 int major = simple_strtoul(buf, &e, 10); 4614 int minor; 4615 dev_t dev; 4616 struct md_rdev *rdev; 4617 int err; 4618 4619 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 4620 return -EINVAL; 4621 minor = simple_strtoul(e+1, &e, 10); 4622 if (*e && *e != '\n') 4623 return -EINVAL; 4624 dev = MKDEV(major, minor); 4625 if (major != MAJOR(dev) || 4626 minor != MINOR(dev)) 4627 return -EOVERFLOW; 4628 4629 err = mddev_suspend_and_lock(mddev); 4630 if (err) 4631 return err; 4632 if (mddev->persistent) { 4633 rdev = md_import_device(dev, mddev->major_version, 4634 mddev->minor_version); 4635 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 4636 struct md_rdev *rdev0 4637 = list_entry(mddev->disks.next, 4638 struct md_rdev, same_set); 4639 err = super_types[mddev->major_version] 4640 .load_super(rdev, rdev0, mddev->minor_version); 4641 if (err < 0) 4642 goto out; 4643 } 4644 } else if (mddev->external) 4645 rdev = md_import_device(dev, -2, -1); 4646 else 4647 rdev = md_import_device(dev, -1, -1); 4648 4649 if (IS_ERR(rdev)) { 4650 mddev_unlock_and_resume(mddev); 4651 return PTR_ERR(rdev); 4652 } 4653 err = bind_rdev_to_array(rdev, mddev); 4654 out: 4655 if (err) 4656 export_rdev(rdev, mddev); 4657 mddev_unlock_and_resume(mddev); 4658 if (!err) 4659 md_new_event(); 4660 return err ? err : len; 4661 } 4662 4663 static struct md_sysfs_entry md_new_device = 4664 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 4665 4666 static ssize_t 4667 bitmap_store(struct mddev *mddev, const char *buf, size_t len) 4668 { 4669 char *end; 4670 unsigned long chunk, end_chunk; 4671 int err; 4672 4673 err = mddev_lock(mddev); 4674 if (err) 4675 return err; 4676 if (!mddev->bitmap) 4677 goto out; 4678 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ 4679 while (*buf) { 4680 chunk = end_chunk = simple_strtoul(buf, &end, 0); 4681 if (buf == end) 4682 break; 4683 4684 if (*end == '-') { /* range */ 4685 buf = end + 1; 4686 end_chunk = simple_strtoul(buf, &end, 0); 4687 if (buf == end) 4688 break; 4689 } 4690 4691 if (*end && !isspace(*end)) 4692 break; 4693 4694 mddev->bitmap_ops->dirty_bits(mddev, chunk, end_chunk); 4695 buf = skip_spaces(end); 4696 } 4697 mddev->bitmap_ops->unplug(mddev, true); /* flush the bits to disk */ 4698 out: 4699 mddev_unlock(mddev); 4700 return len; 4701 } 4702 4703 static struct md_sysfs_entry md_bitmap = 4704 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 4705 4706 static ssize_t 4707 size_show(struct mddev *mddev, char *page) 4708 { 4709 return sprintf(page, "%llu\n", 4710 (unsigned long long)mddev->dev_sectors / 2); 4711 } 4712 4713 static int update_size(struct mddev *mddev, sector_t num_sectors); 4714 4715 static ssize_t 4716 size_store(struct mddev *mddev, const char *buf, size_t len) 4717 { 4718 /* If array is inactive, we can reduce the component size, but 4719 * not increase it (except from 0). 4720 * If array is active, we can try an on-line resize 4721 */ 4722 sector_t sectors; 4723 int err = strict_blocks_to_sectors(buf, §ors); 4724 4725 if (err < 0) 4726 return err; 4727 err = mddev_lock(mddev); 4728 if (err) 4729 return err; 4730 if (mddev->pers) { 4731 err = update_size(mddev, sectors); 4732 if (err == 0) 4733 md_update_sb(mddev, 1); 4734 } else { 4735 if (mddev->dev_sectors == 0 || 4736 mddev->dev_sectors > sectors) 4737 mddev->dev_sectors = sectors; 4738 else 4739 err = -ENOSPC; 4740 } 4741 mddev_unlock(mddev); 4742 return err ? 
err : len; 4743 } 4744 4745 static struct md_sysfs_entry md_size = 4746 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 4747 4748 /* Metadata version. 4749 * This is one of 4750 * 'none' for arrays with no metadata (good luck...) 4751 * 'external' for arrays with externally managed metadata, 4752 * or N.M for internally known formats 4753 */ 4754 static ssize_t 4755 metadata_show(struct mddev *mddev, char *page) 4756 { 4757 if (mddev->persistent) 4758 return sprintf(page, "%d.%d\n", 4759 mddev->major_version, mddev->minor_version); 4760 else if (mddev->external) 4761 return sprintf(page, "external:%s\n", mddev->metadata_type); 4762 else 4763 return sprintf(page, "none\n"); 4764 } 4765 4766 static ssize_t 4767 metadata_store(struct mddev *mddev, const char *buf, size_t len) 4768 { 4769 int major, minor; 4770 char *e; 4771 int err; 4772 /* Changing the details of 'external' metadata is 4773 * always permitted. Otherwise there must be 4774 * no devices attached to the array. 4775 */ 4776 4777 err = mddev_lock(mddev); 4778 if (err) 4779 return err; 4780 err = -EBUSY; 4781 if (mddev->external && strncmp(buf, "external:", 9) == 0) 4782 ; 4783 else if (!list_empty(&mddev->disks)) 4784 goto out_unlock; 4785 4786 err = 0; 4787 if (cmd_match(buf, "none")) { 4788 mddev->persistent = 0; 4789 mddev->external = 0; 4790 mddev->major_version = 0; 4791 mddev->minor_version = 90; 4792 goto out_unlock; 4793 } 4794 if (strncmp(buf, "external:", 9) == 0) { 4795 size_t namelen = len-9; 4796 if (namelen >= sizeof(mddev->metadata_type)) 4797 namelen = sizeof(mddev->metadata_type)-1; 4798 memcpy(mddev->metadata_type, buf+9, namelen); 4799 mddev->metadata_type[namelen] = 0; 4800 if (namelen && mddev->metadata_type[namelen-1] == '\n') 4801 mddev->metadata_type[--namelen] = 0; 4802 mddev->persistent = 0; 4803 mddev->external = 1; 4804 mddev->major_version = 0; 4805 mddev->minor_version = 90; 4806 goto out_unlock; 4807 } 4808 major = simple_strtoul(buf, &e, 10); 4809 err = -EINVAL; 4810 if (e==buf || *e != '.') 4811 goto out_unlock; 4812 buf = e+1; 4813 minor = simple_strtoul(buf, &e, 10); 4814 if (e==buf || (*e && *e != '\n') ) 4815 goto out_unlock; 4816 err = -ENOENT; 4817 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 4818 goto out_unlock; 4819 mddev->major_version = major; 4820 mddev->minor_version = minor; 4821 mddev->persistent = 1; 4822 mddev->external = 0; 4823 err = 0; 4824 out_unlock: 4825 mddev_unlock(mddev); 4826 return err ?: len; 4827 } 4828 4829 static struct md_sysfs_entry md_metadata = 4830 __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 4831 4832 enum sync_action md_sync_action(struct mddev *mddev) 4833 { 4834 unsigned long recovery = mddev->recovery; 4835 4836 /* 4837 * frozen has the highest priority, means running sync_thread will be 4838 * stopped immediately, and no new sync_thread can start. 4839 */ 4840 if (test_bit(MD_RECOVERY_FROZEN, &recovery)) 4841 return ACTION_FROZEN; 4842 4843 /* 4844 * read-only array can't register sync_thread, and it can only 4845 * add/remove spares. 4846 */ 4847 if (!md_is_rdwr(mddev)) 4848 return ACTION_IDLE; 4849 4850 /* 4851 * idle means no sync_thread is running, and no new sync_thread is 4852 * requested. 
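 * (i.e. neither MD_RECOVERY_RUNNING nor MD_RECOVERY_NEEDED is set, which is
 * exactly what the test just below checks)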
4853 */ 4854 if (!test_bit(MD_RECOVERY_RUNNING, &recovery) && 4855 !test_bit(MD_RECOVERY_NEEDED, &recovery)) 4856 return ACTION_IDLE; 4857 4858 if (test_bit(MD_RECOVERY_RESHAPE, &recovery) || 4859 mddev->reshape_position != MaxSector) 4860 return ACTION_RESHAPE; 4861 4862 if (test_bit(MD_RECOVERY_RECOVER, &recovery)) 4863 return ACTION_RECOVER; 4864 4865 if (test_bit(MD_RECOVERY_SYNC, &recovery)) { 4866 /* 4867 * MD_RECOVERY_CHECK must be paired with 4868 * MD_RECOVERY_REQUESTED. 4869 */ 4870 if (test_bit(MD_RECOVERY_CHECK, &recovery)) 4871 return ACTION_CHECK; 4872 if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) 4873 return ACTION_REPAIR; 4874 return ACTION_RESYNC; 4875 } 4876 4877 /* 4878 * MD_RECOVERY_NEEDED or MD_RECOVERY_RUNNING is set, however, no 4879 * sync_action is specified. 4880 */ 4881 return ACTION_IDLE; 4882 } 4883 4884 enum sync_action md_sync_action_by_name(const char *page) 4885 { 4886 enum sync_action action; 4887 4888 for (action = 0; action < NR_SYNC_ACTIONS; ++action) { 4889 if (cmd_match(page, action_name[action])) 4890 return action; 4891 } 4892 4893 return NR_SYNC_ACTIONS; 4894 } 4895 4896 const char *md_sync_action_name(enum sync_action action) 4897 { 4898 return action_name[action]; 4899 } 4900 4901 static ssize_t 4902 action_show(struct mddev *mddev, char *page) 4903 { 4904 enum sync_action action = md_sync_action(mddev); 4905 4906 return sprintf(page, "%s\n", md_sync_action_name(action)); 4907 } 4908 4909 /** 4910 * stop_sync_thread() - wait for sync_thread to stop if it's running. 4911 * @mddev: the array. 4912 * @locked: if set, reconfig_mutex will still be held after this function 4913 * return; if not set, reconfig_mutex will be released after this 4914 * function return. 4915 */ 4916 static void stop_sync_thread(struct mddev *mddev, bool locked) 4917 { 4918 int sync_seq = atomic_read(&mddev->sync_seq); 4919 4920 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 4921 if (!locked) 4922 mddev_unlock(mddev); 4923 return; 4924 } 4925 4926 mddev_unlock(mddev); 4927 4928 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4929 /* 4930 * Thread might be blocked waiting for metadata update which will now 4931 * never happen 4932 */ 4933 md_wakeup_thread_directly(mddev->sync_thread); 4934 if (work_pending(&mddev->sync_work)) 4935 flush_work(&mddev->sync_work); 4936 4937 wait_event(resync_wait, 4938 !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 4939 (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery) && 4940 sync_seq != atomic_read(&mddev->sync_seq))); 4941 4942 if (locked) 4943 mddev_lock_nointr(mddev); 4944 } 4945 4946 void md_idle_sync_thread(struct mddev *mddev) 4947 { 4948 lockdep_assert_held(&mddev->reconfig_mutex); 4949 4950 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4951 stop_sync_thread(mddev, true); 4952 } 4953 EXPORT_SYMBOL_GPL(md_idle_sync_thread); 4954 4955 void md_frozen_sync_thread(struct mddev *mddev) 4956 { 4957 lockdep_assert_held(&mddev->reconfig_mutex); 4958 4959 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4960 stop_sync_thread(mddev, true); 4961 } 4962 EXPORT_SYMBOL_GPL(md_frozen_sync_thread); 4963 4964 void md_unfrozen_sync_thread(struct mddev *mddev) 4965 { 4966 lockdep_assert_held(&mddev->reconfig_mutex); 4967 4968 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4969 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4970 md_wakeup_thread(mddev->thread); 4971 sysfs_notify_dirent_safe(mddev->sysfs_action); 4972 } 4973 EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread); 4974 4975 static int mddev_start_reshape(struct mddev *mddev) 4976 { 4977 
int ret; 4978 4979 if (mddev->pers->start_reshape == NULL) 4980 return -EINVAL; 4981 4982 if (mddev->reshape_position == MaxSector || 4983 mddev->pers->check_reshape == NULL || 4984 mddev->pers->check_reshape(mddev)) { 4985 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4986 ret = mddev->pers->start_reshape(mddev); 4987 if (ret) 4988 return ret; 4989 } else { 4990 /* 4991 * If reshape is still in progress, and md_check_recovery() can 4992 * continue to reshape, don't restart reshape because data can 4993 * be corrupted for raid456. 4994 */ 4995 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4996 } 4997 4998 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 4999 return 0; 5000 } 5001 5002 static ssize_t 5003 action_store(struct mddev *mddev, const char *page, size_t len) 5004 { 5005 int ret; 5006 enum sync_action action; 5007 5008 if (!mddev->pers || !mddev->pers->sync_request) 5009 return -EINVAL; 5010 5011 retry: 5012 if (work_busy(&mddev->sync_work)) 5013 flush_work(&mddev->sync_work); 5014 5015 ret = mddev_lock(mddev); 5016 if (ret) 5017 return ret; 5018 5019 if (work_busy(&mddev->sync_work)) { 5020 mddev_unlock(mddev); 5021 goto retry; 5022 } 5023 5024 action = md_sync_action_by_name(page); 5025 5026 /* TODO: mdadm rely on "idle" to start sync_thread. */ 5027 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 5028 switch (action) { 5029 case ACTION_FROZEN: 5030 md_frozen_sync_thread(mddev); 5031 ret = len; 5032 goto out; 5033 case ACTION_IDLE: 5034 md_idle_sync_thread(mddev); 5035 break; 5036 case ACTION_RESHAPE: 5037 case ACTION_RECOVER: 5038 case ACTION_CHECK: 5039 case ACTION_REPAIR: 5040 case ACTION_RESYNC: 5041 ret = -EBUSY; 5042 goto out; 5043 default: 5044 ret = -EINVAL; 5045 goto out; 5046 } 5047 } else { 5048 switch (action) { 5049 case ACTION_FROZEN: 5050 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5051 ret = len; 5052 goto out; 5053 case ACTION_RESHAPE: 5054 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5055 ret = mddev_start_reshape(mddev); 5056 if (ret) 5057 goto out; 5058 break; 5059 case ACTION_RECOVER: 5060 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5061 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 5062 break; 5063 case ACTION_CHECK: 5064 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5065 fallthrough; 5066 case ACTION_REPAIR: 5067 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 5068 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5069 fallthrough; 5070 case ACTION_RESYNC: 5071 case ACTION_IDLE: 5072 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5073 break; 5074 default: 5075 ret = -EINVAL; 5076 goto out; 5077 } 5078 } 5079 5080 if (mddev->ro == MD_AUTO_READ) { 5081 /* A write to sync_action is enough to justify 5082 * canceling read-auto mode 5083 */ 5084 mddev->ro = MD_RDWR; 5085 md_wakeup_thread(mddev->sync_thread); 5086 } 5087 5088 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5089 md_wakeup_thread(mddev->thread); 5090 sysfs_notify_dirent_safe(mddev->sysfs_action); 5091 ret = len; 5092 5093 out: 5094 mddev_unlock(mddev); 5095 return ret; 5096 } 5097 5098 static struct md_sysfs_entry md_scan_mode = 5099 __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 5100 5101 static ssize_t 5102 last_sync_action_show(struct mddev *mddev, char *page) 5103 { 5104 return sprintf(page, "%s\n", 5105 md_sync_action_name(mddev->last_sync_action)); 5106 } 5107 5108 static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); 5109 5110 static ssize_t 5111 mismatch_cnt_show(struct mddev *mddev, char *page) 5112 { 5113 return 
sprintf(page, "%llu\n", 5114 (unsigned long long) 5115 atomic64_read(&mddev->resync_mismatches)); 5116 } 5117 5118 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 5119 5120 static ssize_t 5121 sync_min_show(struct mddev *mddev, char *page) 5122 { 5123 return sprintf(page, "%d (%s)\n", speed_min(mddev), 5124 mddev->sync_speed_min ? "local" : "system"); 5125 } 5126 5127 static ssize_t 5128 sync_min_store(struct mddev *mddev, const char *buf, size_t len) 5129 { 5130 unsigned int min; 5131 int rv; 5132 5133 if (strncmp(buf, "system", 6) == 0) { 5134 min = 0; 5135 } else { 5136 rv = kstrtouint(buf, 10, &min); 5137 if (rv < 0) 5138 return rv; 5139 if (min == 0) 5140 return -EINVAL; 5141 } 5142 mddev->sync_speed_min = min; 5143 return len; 5144 } 5145 5146 static struct md_sysfs_entry md_sync_min = 5147 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 5148 5149 static ssize_t 5150 sync_max_show(struct mddev *mddev, char *page) 5151 { 5152 return sprintf(page, "%d (%s)\n", speed_max(mddev), 5153 mddev->sync_speed_max ? "local" : "system"); 5154 } 5155 5156 static ssize_t 5157 sync_max_store(struct mddev *mddev, const char *buf, size_t len) 5158 { 5159 unsigned int max; 5160 int rv; 5161 5162 if (strncmp(buf, "system", 6) == 0) { 5163 max = 0; 5164 } else { 5165 rv = kstrtouint(buf, 10, &max); 5166 if (rv < 0) 5167 return rv; 5168 if (max == 0) 5169 return -EINVAL; 5170 } 5171 mddev->sync_speed_max = max; 5172 return len; 5173 } 5174 5175 static struct md_sysfs_entry md_sync_max = 5176 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 5177 5178 static ssize_t 5179 sync_io_depth_show(struct mddev *mddev, char *page) 5180 { 5181 return sprintf(page, "%d (%s)\n", sync_io_depth(mddev), 5182 mddev->sync_io_depth ? 
"local" : "system"); 5183 } 5184 5185 static ssize_t 5186 sync_io_depth_store(struct mddev *mddev, const char *buf, size_t len) 5187 { 5188 unsigned int max; 5189 int rv; 5190 5191 if (strncmp(buf, "system", 6) == 0) { 5192 max = 0; 5193 } else { 5194 rv = kstrtouint(buf, 10, &max); 5195 if (rv < 0) 5196 return rv; 5197 if (max == 0) 5198 return -EINVAL; 5199 } 5200 mddev->sync_io_depth = max; 5201 return len; 5202 } 5203 5204 static struct md_sysfs_entry md_sync_io_depth = 5205 __ATTR_RW(sync_io_depth); 5206 5207 static ssize_t 5208 degraded_show(struct mddev *mddev, char *page) 5209 { 5210 return sprintf(page, "%d\n", mddev->degraded); 5211 } 5212 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 5213 5214 static ssize_t 5215 sync_force_parallel_show(struct mddev *mddev, char *page) 5216 { 5217 return sprintf(page, "%d\n", mddev->parallel_resync); 5218 } 5219 5220 static ssize_t 5221 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) 5222 { 5223 long n; 5224 5225 if (kstrtol(buf, 10, &n)) 5226 return -EINVAL; 5227 5228 if (n != 0 && n != 1) 5229 return -EINVAL; 5230 5231 mddev->parallel_resync = n; 5232 5233 if (mddev->sync_thread) 5234 wake_up(&resync_wait); 5235 5236 return len; 5237 } 5238 5239 /* force parallel resync, even with shared block devices */ 5240 static struct md_sysfs_entry md_sync_force_parallel = 5241 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 5242 sync_force_parallel_show, sync_force_parallel_store); 5243 5244 static ssize_t 5245 sync_speed_show(struct mddev *mddev, char *page) 5246 { 5247 unsigned long resync, dt, db; 5248 if (mddev->curr_resync == MD_RESYNC_NONE) 5249 return sprintf(page, "none\n"); 5250 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 5251 dt = (jiffies - mddev->resync_mark) / HZ; 5252 if (!dt) dt++; 5253 db = resync - mddev->resync_mark_cnt; 5254 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 5255 } 5256 5257 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 5258 5259 static ssize_t 5260 sync_completed_show(struct mddev *mddev, char *page) 5261 { 5262 unsigned long long max_sectors, resync; 5263 5264 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5265 return sprintf(page, "none\n"); 5266 5267 if (mddev->curr_resync == MD_RESYNC_YIELDED || 5268 mddev->curr_resync == MD_RESYNC_DELAYED) 5269 return sprintf(page, "delayed\n"); 5270 5271 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 5272 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5273 max_sectors = mddev->resync_max_sectors; 5274 else 5275 max_sectors = mddev->dev_sectors; 5276 5277 resync = mddev->curr_resync_completed; 5278 return sprintf(page, "%llu / %llu\n", resync, max_sectors); 5279 } 5280 5281 static struct md_sysfs_entry md_sync_completed = 5282 __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL); 5283 5284 static ssize_t 5285 min_sync_show(struct mddev *mddev, char *page) 5286 { 5287 return sprintf(page, "%llu\n", 5288 (unsigned long long)mddev->resync_min); 5289 } 5290 static ssize_t 5291 min_sync_store(struct mddev *mddev, const char *buf, size_t len) 5292 { 5293 unsigned long long min; 5294 int err; 5295 5296 if (kstrtoull(buf, 10, &min)) 5297 return -EINVAL; 5298 5299 spin_lock(&mddev->lock); 5300 err = -EINVAL; 5301 if (min > mddev->resync_max) 5302 goto out_unlock; 5303 5304 err = -EBUSY; 5305 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5306 goto out_unlock; 5307 5308 /* Round down to multiple of 4K for safety */ 5309 mddev->resync_min = round_down(min, 8); 
5310 err = 0; 5311 5312 out_unlock: 5313 spin_unlock(&mddev->lock); 5314 return err ?: len; 5315 } 5316 5317 static struct md_sysfs_entry md_min_sync = 5318 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 5319 5320 static ssize_t 5321 max_sync_show(struct mddev *mddev, char *page) 5322 { 5323 if (mddev->resync_max == MaxSector) 5324 return sprintf(page, "max\n"); 5325 else 5326 return sprintf(page, "%llu\n", 5327 (unsigned long long)mddev->resync_max); 5328 } 5329 static ssize_t 5330 max_sync_store(struct mddev *mddev, const char *buf, size_t len) 5331 { 5332 int err; 5333 spin_lock(&mddev->lock); 5334 if (strncmp(buf, "max", 3) == 0) 5335 mddev->resync_max = MaxSector; 5336 else { 5337 unsigned long long max; 5338 int chunk; 5339 5340 err = -EINVAL; 5341 if (kstrtoull(buf, 10, &max)) 5342 goto out_unlock; 5343 if (max < mddev->resync_min) 5344 goto out_unlock; 5345 5346 err = -EBUSY; 5347 if (max < mddev->resync_max && md_is_rdwr(mddev) && 5348 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5349 goto out_unlock; 5350 5351 /* Must be a multiple of chunk_size */ 5352 chunk = mddev->chunk_sectors; 5353 if (chunk) { 5354 sector_t temp = max; 5355 5356 err = -EINVAL; 5357 if (sector_div(temp, chunk)) 5358 goto out_unlock; 5359 } 5360 mddev->resync_max = max; 5361 } 5362 wake_up(&mddev->recovery_wait); 5363 err = 0; 5364 out_unlock: 5365 spin_unlock(&mddev->lock); 5366 return err ?: len; 5367 } 5368 5369 static struct md_sysfs_entry md_max_sync = 5370 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 5371 5372 static ssize_t 5373 suspend_lo_show(struct mddev *mddev, char *page) 5374 { 5375 return sprintf(page, "%llu\n", 5376 (unsigned long long)READ_ONCE(mddev->suspend_lo)); 5377 } 5378 5379 static ssize_t 5380 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) 5381 { 5382 unsigned long long new; 5383 int err; 5384 5385 err = kstrtoull(buf, 10, &new); 5386 if (err < 0) 5387 return err; 5388 if (new != (sector_t)new) 5389 return -EINVAL; 5390 5391 err = mddev_suspend(mddev, true); 5392 if (err) 5393 return err; 5394 5395 WRITE_ONCE(mddev->suspend_lo, new); 5396 mddev_resume(mddev); 5397 5398 return len; 5399 } 5400 static struct md_sysfs_entry md_suspend_lo = 5401 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 5402 5403 static ssize_t 5404 suspend_hi_show(struct mddev *mddev, char *page) 5405 { 5406 return sprintf(page, "%llu\n", 5407 (unsigned long long)READ_ONCE(mddev->suspend_hi)); 5408 } 5409 5410 static ssize_t 5411 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) 5412 { 5413 unsigned long long new; 5414 int err; 5415 5416 err = kstrtoull(buf, 10, &new); 5417 if (err < 0) 5418 return err; 5419 if (new != (sector_t)new) 5420 return -EINVAL; 5421 5422 err = mddev_suspend(mddev, true); 5423 if (err) 5424 return err; 5425 5426 WRITE_ONCE(mddev->suspend_hi, new); 5427 mddev_resume(mddev); 5428 5429 return len; 5430 } 5431 static struct md_sysfs_entry md_suspend_hi = 5432 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 5433 5434 static ssize_t 5435 reshape_position_show(struct mddev *mddev, char *page) 5436 { 5437 if (mddev->reshape_position != MaxSector) 5438 return sprintf(page, "%llu\n", 5439 (unsigned long long)mddev->reshape_position); 5440 strcpy(page, "none\n"); 5441 return 5; 5442 } 5443 5444 static ssize_t 5445 reshape_position_store(struct mddev *mddev, const char *buf, size_t len) 5446 { 5447 struct md_rdev *rdev; 5448 unsigned long long new; 5449 int err; 5450 5451 
err = kstrtoull(buf, 10, &new); 5452 if (err < 0) 5453 return err; 5454 if (new != (sector_t)new) 5455 return -EINVAL; 5456 err = mddev_lock(mddev); 5457 if (err) 5458 return err; 5459 err = -EBUSY; 5460 if (mddev->pers) 5461 goto unlock; 5462 mddev->reshape_position = new; 5463 mddev->delta_disks = 0; 5464 mddev->reshape_backwards = 0; 5465 mddev->new_level = mddev->level; 5466 mddev->new_layout = mddev->layout; 5467 mddev->new_chunk_sectors = mddev->chunk_sectors; 5468 rdev_for_each(rdev, mddev) 5469 rdev->new_data_offset = rdev->data_offset; 5470 err = 0; 5471 unlock: 5472 mddev_unlock(mddev); 5473 return err ?: len; 5474 } 5475 5476 static struct md_sysfs_entry md_reshape_position = 5477 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 5478 reshape_position_store); 5479 5480 static ssize_t 5481 reshape_direction_show(struct mddev *mddev, char *page) 5482 { 5483 return sprintf(page, "%s\n", 5484 mddev->reshape_backwards ? "backwards" : "forwards"); 5485 } 5486 5487 static ssize_t 5488 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) 5489 { 5490 int backwards = 0; 5491 int err; 5492 5493 if (cmd_match(buf, "forwards")) 5494 backwards = 0; 5495 else if (cmd_match(buf, "backwards")) 5496 backwards = 1; 5497 else 5498 return -EINVAL; 5499 if (mddev->reshape_backwards == backwards) 5500 return len; 5501 5502 err = mddev_lock(mddev); 5503 if (err) 5504 return err; 5505 /* check if we are allowed to change */ 5506 if (mddev->delta_disks) 5507 err = -EBUSY; 5508 else if (mddev->persistent && 5509 mddev->major_version == 0) 5510 err = -EINVAL; 5511 else 5512 mddev->reshape_backwards = backwards; 5513 mddev_unlock(mddev); 5514 return err ?: len; 5515 } 5516 5517 static struct md_sysfs_entry md_reshape_direction = 5518 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, 5519 reshape_direction_store); 5520 5521 static ssize_t 5522 array_size_show(struct mddev *mddev, char *page) 5523 { 5524 if (mddev->external_size) 5525 return sprintf(page, "%llu\n", 5526 (unsigned long long)mddev->array_sectors/2); 5527 else 5528 return sprintf(page, "default\n"); 5529 } 5530 5531 static ssize_t 5532 array_size_store(struct mddev *mddev, const char *buf, size_t len) 5533 { 5534 sector_t sectors; 5535 int err; 5536 5537 err = mddev_lock(mddev); 5538 if (err) 5539 return err; 5540 5541 /* cluster raid doesn't support changing array_sectors */ 5542 if (mddev_is_clustered(mddev)) { 5543 mddev_unlock(mddev); 5544 return -EINVAL; 5545 } 5546 5547 if (strncmp(buf, "default", 7) == 0) { 5548 if (mddev->pers) 5549 sectors = mddev->pers->size(mddev, 0, 0); 5550 else 5551 sectors = mddev->array_sectors; 5552 5553 mddev->external_size = 0; 5554 } else { 5555 if (strict_blocks_to_sectors(buf, &sectors) < 0) 5556 err = -EINVAL; 5557 else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) 5558 err = -E2BIG; 5559 else 5560 mddev->external_size = 1; 5561 } 5562 5563 if (!err) { 5564 mddev->array_sectors = sectors; 5565 if (mddev->pers) 5566 set_capacity_and_notify(mddev->gendisk, 5567 mddev->array_sectors); 5568 } 5569 mddev_unlock(mddev); 5570 return err ?: len; 5571 } 5572 5573 static struct md_sysfs_entry md_array_size = 5574 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, 5575 array_size_store); 5576 5577 static ssize_t 5578 consistency_policy_show(struct mddev *mddev, char *page) 5579 { 5580 int ret; 5581 5582 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 5583 ret = sprintf(page, "journal\n"); 5584 } else if (test_bit(MD_HAS_PPL, &mddev->flags)) { 5585 ret = 
sprintf(page, "ppl\n"); 5586 } else if (mddev->bitmap) { 5587 ret = sprintf(page, "bitmap\n"); 5588 } else if (mddev->pers) { 5589 if (mddev->pers->sync_request) 5590 ret = sprintf(page, "resync\n"); 5591 else 5592 ret = sprintf(page, "none\n"); 5593 } else { 5594 ret = sprintf(page, "unknown\n"); 5595 } 5596 5597 return ret; 5598 } 5599 5600 static ssize_t 5601 consistency_policy_store(struct mddev *mddev, const char *buf, size_t len) 5602 { 5603 int err = 0; 5604 5605 if (mddev->pers) { 5606 if (mddev->pers->change_consistency_policy) 5607 err = mddev->pers->change_consistency_policy(mddev, buf); 5608 else 5609 err = -EBUSY; 5610 } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) { 5611 set_bit(MD_HAS_PPL, &mddev->flags); 5612 } else { 5613 err = -EINVAL; 5614 } 5615 5616 return err ? err : len; 5617 } 5618 5619 static struct md_sysfs_entry md_consistency_policy = 5620 __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show, 5621 consistency_policy_store); 5622 5623 static ssize_t fail_last_dev_show(struct mddev *mddev, char *page) 5624 { 5625 return sprintf(page, "%d\n", mddev->fail_last_dev); 5626 } 5627 5628 /* 5629 * Set fail_last_dev to true to allow the last device to be forcibly removed 5630 * from RAID1/RAID10. 5631 */ 5632 static ssize_t 5633 fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len) 5634 { 5635 int ret; 5636 bool value; 5637 5638 ret = kstrtobool(buf, &value); 5639 if (ret) 5640 return ret; 5641 5642 if (value != mddev->fail_last_dev) 5643 mddev->fail_last_dev = value; 5644 5645 return len; 5646 } 5647 static struct md_sysfs_entry md_fail_last_dev = 5648 __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, 5649 fail_last_dev_store); 5650 5651 static ssize_t serialize_policy_show(struct mddev *mddev, char *page) 5652 { 5653 if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) 5654 return sprintf(page, "n/a\n"); 5655 else 5656 return sprintf(page, "%d\n", mddev->serialize_policy); 5657 } 5658 5659 /* 5660 * Set serialize_policy to true to ensure that write IO is not reordered 5661 * for raid1. 
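 * Illustration only: the value is parsed with kstrtobool(), so writing "1" or
 * "y" enables the policy and "0" or "n" disables it; enabling it sets up the
 * per-rdev serial pool via mddev_create_serial_pool() below.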
5662 */ 5663 static ssize_t 5664 serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) 5665 { 5666 int err; 5667 bool value; 5668 5669 err = kstrtobool(buf, &value); 5670 if (err) 5671 return err; 5672 5673 if (value == mddev->serialize_policy) 5674 return len; 5675 5676 err = mddev_suspend_and_lock(mddev); 5677 if (err) 5678 return err; 5679 if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) { 5680 pr_err("md: serialize_policy is only effective for raid1\n"); 5681 err = -EINVAL; 5682 goto unlock; 5683 } 5684 5685 if (value) 5686 mddev_create_serial_pool(mddev, NULL); 5687 else 5688 mddev_destroy_serial_pool(mddev, NULL); 5689 mddev->serialize_policy = value; 5690 unlock: 5691 mddev_unlock_and_resume(mddev); 5692 return err ?: len; 5693 } 5694 5695 static struct md_sysfs_entry md_serialize_policy = 5696 __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, 5697 serialize_policy_store); 5698 5699 5700 static struct attribute *md_default_attrs[] = { 5701 &md_level.attr, 5702 &md_new_level.attr, 5703 &md_layout.attr, 5704 &md_raid_disks.attr, 5705 &md_uuid.attr, 5706 &md_chunk_size.attr, 5707 &md_size.attr, 5708 &md_resync_start.attr, 5709 &md_metadata.attr, 5710 &md_new_device.attr, 5711 &md_safe_delay.attr, 5712 &md_array_state.attr, 5713 &md_reshape_position.attr, 5714 &md_reshape_direction.attr, 5715 &md_array_size.attr, 5716 &max_corr_read_errors.attr, 5717 &md_consistency_policy.attr, 5718 &md_fail_last_dev.attr, 5719 &md_serialize_policy.attr, 5720 NULL, 5721 }; 5722 5723 static const struct attribute_group md_default_group = { 5724 .attrs = md_default_attrs, 5725 }; 5726 5727 static struct attribute *md_redundancy_attrs[] = { 5728 &md_scan_mode.attr, 5729 &md_last_scan_mode.attr, 5730 &md_mismatches.attr, 5731 &md_sync_min.attr, 5732 &md_sync_max.attr, 5733 &md_sync_io_depth.attr, 5734 &md_sync_speed.attr, 5735 &md_sync_force_parallel.attr, 5736 &md_sync_completed.attr, 5737 &md_min_sync.attr, 5738 &md_max_sync.attr, 5739 &md_suspend_lo.attr, 5740 &md_suspend_hi.attr, 5741 &md_bitmap.attr, 5742 &md_degraded.attr, 5743 NULL, 5744 }; 5745 static const struct attribute_group md_redundancy_group = { 5746 .name = NULL, 5747 .attrs = md_redundancy_attrs, 5748 }; 5749 5750 static const struct attribute_group *md_attr_groups[] = { 5751 &md_default_group, 5752 &md_bitmap_group, 5753 NULL, 5754 }; 5755 5756 static ssize_t 5757 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 5758 { 5759 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 5760 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 5761 ssize_t rv; 5762 5763 if (!entry->show) 5764 return -EIO; 5765 spin_lock(&all_mddevs_lock); 5766 if (!mddev_get(mddev)) { 5767 spin_unlock(&all_mddevs_lock); 5768 return -EBUSY; 5769 } 5770 spin_unlock(&all_mddevs_lock); 5771 5772 rv = entry->show(mddev, page); 5773 mddev_put(mddev); 5774 return rv; 5775 } 5776 5777 static ssize_t 5778 md_attr_store(struct kobject *kobj, struct attribute *attr, 5779 const char *page, size_t length) 5780 { 5781 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 5782 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 5783 ssize_t rv; 5784 struct kernfs_node *kn = NULL; 5785 5786 if (!entry->store) 5787 return -EIO; 5788 if (!capable(CAP_SYS_ADMIN)) 5789 return -EACCES; 5790 5791 if (entry->store == array_state_store && cmd_match(page, "clear")) 5792 kn = sysfs_break_active_protection(kobj, attr); 5793 5794 
spin_lock(&all_mddevs_lock); 5795 if (!mddev_get(mddev)) { 5796 spin_unlock(&all_mddevs_lock); 5797 if (kn) 5798 sysfs_unbreak_active_protection(kn); 5799 return -EBUSY; 5800 } 5801 spin_unlock(&all_mddevs_lock); 5802 rv = entry->store(mddev, page, length); 5803 mddev_put(mddev); 5804 5805 if (kn) 5806 sysfs_unbreak_active_protection(kn); 5807 5808 return rv; 5809 } 5810 5811 static void md_kobj_release(struct kobject *ko) 5812 { 5813 struct mddev *mddev = container_of(ko, struct mddev, kobj); 5814 5815 put_disk(mddev->gendisk); 5816 } 5817 5818 static const struct sysfs_ops md_sysfs_ops = { 5819 .show = md_attr_show, 5820 .store = md_attr_store, 5821 }; 5822 static const struct kobj_type md_ktype = { 5823 .release = md_kobj_release, 5824 .sysfs_ops = &md_sysfs_ops, 5825 .default_groups = md_attr_groups, 5826 }; 5827 5828 int mdp_major = 0; 5829 5830 /* stack the limit for all rdevs into lim */ 5831 int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim, 5832 unsigned int flags) 5833 { 5834 struct md_rdev *rdev; 5835 5836 rdev_for_each(rdev, mddev) { 5837 queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset, 5838 mddev->gendisk->disk_name); 5839 if ((flags & MDDEV_STACK_INTEGRITY) && 5840 !queue_limits_stack_integrity_bdev(lim, rdev->bdev)) 5841 return -EINVAL; 5842 } 5843 5844 return 0; 5845 } 5846 EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits); 5847 5848 /* apply the extra stacking limits from a new rdev into mddev */ 5849 int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev) 5850 { 5851 struct queue_limits lim; 5852 5853 if (mddev_is_dm(mddev)) 5854 return 0; 5855 5856 lim = queue_limits_start_update(mddev->gendisk->queue); 5857 queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset, 5858 mddev->gendisk->disk_name); 5859 5860 if (!queue_limits_stack_integrity_bdev(&lim, rdev->bdev)) { 5861 pr_err("%s: incompatible integrity profile for %pg\n", 5862 mdname(mddev), rdev->bdev); 5863 queue_limits_cancel_update(mddev->gendisk->queue); 5864 return -ENXIO; 5865 } 5866 5867 return queue_limits_commit_update(mddev->gendisk->queue, &lim); 5868 } 5869 EXPORT_SYMBOL_GPL(mddev_stack_new_rdev); 5870 5871 /* update the optimal I/O size after a reshape */ 5872 void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes) 5873 { 5874 struct queue_limits lim; 5875 5876 if (mddev_is_dm(mddev)) 5877 return; 5878 5879 /* don't bother updating io_opt if we can't suspend the array */ 5880 if (mddev_suspend(mddev, false) < 0) 5881 return; 5882 lim = queue_limits_start_update(mddev->gendisk->queue); 5883 lim.io_opt = lim.io_min * nr_stripes; 5884 queue_limits_commit_update(mddev->gendisk->queue, &lim); 5885 mddev_resume(mddev); 5886 } 5887 EXPORT_SYMBOL_GPL(mddev_update_io_opt); 5888 5889 static void mddev_delayed_delete(struct work_struct *ws) 5890 { 5891 struct mddev *mddev = container_of(ws, struct mddev, del_work); 5892 5893 kobject_put(&mddev->kobj); 5894 } 5895 5896 void md_init_stacking_limits(struct queue_limits *lim) 5897 { 5898 blk_set_stacking_limits(lim); 5899 lim->features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | 5900 BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; 5901 } 5902 EXPORT_SYMBOL_GPL(md_init_stacking_limits); 5903 5904 struct mddev *md_alloc(dev_t dev, char *name) 5905 { 5906 /* 5907 * If dev is zero, name is the name of a device to allocate with 5908 * an arbitrary minor number. It will be "md_???" 5909 * If dev is non-zero it must be a device number with a MAJOR of 5910 * MD_MAJOR or mdp_major. 
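 * (e.g. MKDEV(MD_MAJOR, 0) for /dev/md0, or an mdp_major based number for a
 * partitionable "md_dNNN" device; examples chosen for illustration only.)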
In this case, if "name" is NULL, then 5911 * the device is being created by opening a node in /dev. 5912 * If "name" is not NULL, the device is being created by 5913 * writing to /sys/module/md_mod/parameters/new_array. 5914 */ 5915 static DEFINE_MUTEX(disks_mutex); 5916 struct mddev *mddev; 5917 struct gendisk *disk; 5918 int partitioned; 5919 int shift; 5920 int unit; 5921 int error; 5922 5923 /* 5924 * Wait for any previous instance of this device to be completely 5925 * removed (mddev_delayed_delete). 5926 */ 5927 flush_workqueue(md_misc_wq); 5928 5929 mutex_lock(&disks_mutex); 5930 mddev = mddev_alloc(dev); 5931 if (IS_ERR(mddev)) { 5932 error = PTR_ERR(mddev); 5933 goto out_unlock; 5934 } 5935 5936 partitioned = (MAJOR(mddev->unit) != MD_MAJOR); 5937 shift = partitioned ? MdpMinorShift : 0; 5938 unit = MINOR(mddev->unit) >> shift; 5939 5940 if (name && !dev) { 5941 /* Need to ensure that 'name' is not a duplicate. 5942 */ 5943 struct mddev *mddev2; 5944 spin_lock(&all_mddevs_lock); 5945 5946 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 5947 if (mddev2->gendisk && 5948 strcmp(mddev2->gendisk->disk_name, name) == 0) { 5949 spin_unlock(&all_mddevs_lock); 5950 error = -EEXIST; 5951 goto out_free_mddev; 5952 } 5953 spin_unlock(&all_mddevs_lock); 5954 } 5955 if (name && dev) 5956 /* 5957 * Creating /dev/mdNNN via "newarray", so adjust hold_active. 5958 */ 5959 mddev->hold_active = UNTIL_STOP; 5960 5961 disk = blk_alloc_disk(NULL, NUMA_NO_NODE); 5962 if (IS_ERR(disk)) { 5963 error = PTR_ERR(disk); 5964 goto out_free_mddev; 5965 } 5966 5967 disk->major = MAJOR(mddev->unit); 5968 disk->first_minor = unit << shift; 5969 disk->minors = 1 << shift; 5970 if (name) 5971 strcpy(disk->disk_name, name); 5972 else if (partitioned) 5973 sprintf(disk->disk_name, "md_d%d", unit); 5974 else 5975 sprintf(disk->disk_name, "md%d", unit); 5976 disk->fops = &md_fops; 5977 disk->private_data = mddev; 5978 5979 disk->events |= DISK_EVENT_MEDIA_CHANGE; 5980 mddev->gendisk = disk; 5981 error = add_disk(disk); 5982 if (error) 5983 goto out_put_disk; 5984 5985 kobject_init(&mddev->kobj, &md_ktype); 5986 error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); 5987 if (error) { 5988 /* 5989 * The disk is already live at this point. Clear the hold flag 5990 * and let mddev_put take care of the deletion, as it isn't any 5991 * different from a normal close on last release now. 5992 */ 5993 mddev->hold_active = 0; 5994 mutex_unlock(&disks_mutex); 5995 mddev_put(mddev); 5996 return ERR_PTR(error); 5997 } 5998 5999 kobject_uevent(&mddev->kobj, KOBJ_ADD); 6000 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); 6001 mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); 6002 mutex_unlock(&disks_mutex); 6003 return mddev; 6004 6005 out_put_disk: 6006 put_disk(disk); 6007 out_free_mddev: 6008 mddev_free(mddev); 6009 out_unlock: 6010 mutex_unlock(&disks_mutex); 6011 return ERR_PTR(error); 6012 } 6013 6014 static int md_alloc_and_put(dev_t dev, char *name) 6015 { 6016 struct mddev *mddev = md_alloc(dev, name); 6017 6018 if (IS_ERR(mddev)) 6019 return PTR_ERR(mddev); 6020 mddev_put(mddev); 6021 return 0; 6022 } 6023 6024 static void md_probe(dev_t dev) 6025 { 6026 if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512) 6027 return; 6028 if (create_on_open) 6029 md_alloc_and_put(dev, NULL); 6030 } 6031 6032 static int add_named_array(const char *val, const struct kernel_param *kp) 6033 { 6034 /* 6035 * val must be "md_*" or "mdNNN". 
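	 * For example (illustrative only), either form can be written to the
	 * module parameter:
	 *   echo md_home > /sys/module/md_mod/parameters/new_array
	 *   echo md127   > /sys/module/md_mod/parameters/new_array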
6036 * For "md_*" we allocate an array with a large free minor number, and 6037 * set the name to val. val must not already be an active name. 6038 * For "mdNNN" we allocate an array with the minor number NNN 6039 * which must not already be in use. 6040 */ 6041 int len = strlen(val); 6042 char buf[DISK_NAME_LEN]; 6043 unsigned long devnum; 6044 6045 while (len && val[len-1] == '\n') 6046 len--; 6047 if (len >= DISK_NAME_LEN) 6048 return -E2BIG; 6049 strscpy(buf, val, len+1); 6050 if (strncmp(buf, "md_", 3) == 0) 6051 return md_alloc_and_put(0, buf); 6052 if (strncmp(buf, "md", 2) == 0 && 6053 isdigit(buf[2]) && 6054 kstrtoul(buf+2, 10, &devnum) == 0 && 6055 devnum <= MINORMASK) 6056 return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL); 6057 6058 return -EINVAL; 6059 } 6060 6061 static void md_safemode_timeout(struct timer_list *t) 6062 { 6063 struct mddev *mddev = timer_container_of(mddev, t, safemode_timer); 6064 6065 mddev->safemode = 1; 6066 if (mddev->external) 6067 sysfs_notify_dirent_safe(mddev->sysfs_state); 6068 6069 md_wakeup_thread(mddev->thread); 6070 } 6071 6072 static int start_dirty_degraded; 6073 6074 int md_run(struct mddev *mddev) 6075 { 6076 int err; 6077 struct md_rdev *rdev; 6078 struct md_personality *pers; 6079 bool nowait = true; 6080 6081 if (list_empty(&mddev->disks)) 6082 /* cannot run an array with no devices.. */ 6083 return -EINVAL; 6084 6085 if (mddev->pers) 6086 return -EBUSY; 6087 /* Cannot run until previous stop completes properly */ 6088 if (mddev->sysfs_active) 6089 return -EBUSY; 6090 6091 /* 6092 * Analyze all RAID superblock(s) 6093 */ 6094 if (!mddev->raid_disks) { 6095 if (!mddev->persistent) 6096 return -EINVAL; 6097 err = analyze_sbs(mddev); 6098 if (err) 6099 return -EINVAL; 6100 } 6101 6102 if (mddev->level != LEVEL_NONE) 6103 request_module("md-level-%d", mddev->level); 6104 else if (mddev->clevel[0]) 6105 request_module("md-%s", mddev->clevel); 6106 6107 /* 6108 * Drop all container device buffers, from now on 6109 * the only valid external interface is through the md 6110 * device. 6111 */ 6112 mddev->has_superblocks = false; 6113 rdev_for_each(rdev, mddev) { 6114 if (test_bit(Faulty, &rdev->flags)) 6115 continue; 6116 sync_blockdev(rdev->bdev); 6117 invalidate_bdev(rdev->bdev); 6118 if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) { 6119 mddev->ro = MD_RDONLY; 6120 if (!mddev_is_dm(mddev)) 6121 set_disk_ro(mddev->gendisk, 1); 6122 } 6123 6124 if (rdev->sb_page) 6125 mddev->has_superblocks = true; 6126 6127 /* perform some consistency tests on the device. 6128 * We don't want the data to overlap the metadata, 6129 * Internal Bitmap issues have been handled elsewhere. 
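		 * As a rough illustration of the checks below: with the
		 * superblock at the end of the device (0.90/1.0 style),
		 * data_offset + dev_sectors must not run into sb_start;
		 * with the superblock near the start (1.1/1.2 style),
		 * sb_start + sb_size must end before data_offset.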
6130 */ 6131 if (rdev->meta_bdev) { 6132 /* Nothing to check */; 6133 } else if (rdev->data_offset < rdev->sb_start) { 6134 if (mddev->dev_sectors && 6135 rdev->data_offset + mddev->dev_sectors 6136 > rdev->sb_start) { 6137 pr_warn("md: %s: data overlaps metadata\n", 6138 mdname(mddev)); 6139 return -EINVAL; 6140 } 6141 } else { 6142 if (rdev->sb_start + rdev->sb_size/512 6143 > rdev->data_offset) { 6144 pr_warn("md: %s: metadata overlaps data\n", 6145 mdname(mddev)); 6146 return -EINVAL; 6147 } 6148 } 6149 sysfs_notify_dirent_safe(rdev->sysfs_state); 6150 nowait = nowait && bdev_nowait(rdev->bdev); 6151 } 6152 6153 if (!bioset_initialized(&mddev->bio_set)) { 6154 err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 6155 if (err) 6156 return err; 6157 } 6158 if (!bioset_initialized(&mddev->sync_set)) { 6159 err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 6160 if (err) 6161 goto exit_bio_set; 6162 } 6163 6164 if (!bioset_initialized(&mddev->io_clone_set)) { 6165 err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE, 6166 offsetof(struct md_io_clone, bio_clone), 0); 6167 if (err) 6168 goto exit_sync_set; 6169 } 6170 6171 pers = get_pers(mddev->level, mddev->clevel); 6172 if (!pers) { 6173 err = -EINVAL; 6174 goto abort; 6175 } 6176 if (mddev->level != pers->head.id) { 6177 mddev->level = pers->head.id; 6178 mddev->new_level = pers->head.id; 6179 } 6180 strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel)); 6181 6182 if (mddev->reshape_position != MaxSector && 6183 pers->start_reshape == NULL) { 6184 /* This personality cannot handle reshaping... */ 6185 put_pers(pers); 6186 err = -EINVAL; 6187 goto abort; 6188 } 6189 6190 if (pers->sync_request) { 6191 /* Warn if this is a potentially silly 6192 * configuration. 
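		 * (for example, two array members that are partitions of the
		 * same underlying physical disk)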
6193		 */
6194		struct md_rdev *rdev2;
6195		int warned = 0;
6196
6197		rdev_for_each(rdev, mddev)
6198			rdev_for_each(rdev2, mddev) {
6199				if (rdev < rdev2 &&
6200				    rdev->bdev->bd_disk ==
6201				    rdev2->bdev->bd_disk) {
6202					pr_warn("%s: WARNING: %pg appears to be on the same physical disk as %pg.\n",
6203						mdname(mddev),
6204						rdev->bdev,
6205						rdev2->bdev);
6206					warned = 1;
6207				}
6208			}
6209
6210		if (warned)
6211			pr_warn("True protection against single-disk failure might be compromised.\n");
6212	}
6213
6214	/* dm-raid expects sync_thread to be frozen until resume */
6215	if (mddev->gendisk)
6216		mddev->recovery = 0;
6217
6218	/* may be overridden by personality */
6219	mddev->resync_max_sectors = mddev->dev_sectors;
6220
6221	mddev->ok_start_degraded = start_dirty_degraded;
6222
6223	if (start_readonly && md_is_rdwr(mddev))
6224		mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */
6225
6226	err = pers->run(mddev);
6227	if (err)
6228		pr_warn("md: pers->run() failed ...\n");
6229	else if (pers->size(mddev, 0, 0) < mddev->array_sectors) {
6230		WARN_ONCE(!mddev->external_size,
6231			  "%s: default size too small, but 'external_size' not in effect?\n",
6232			  __func__);
6233		pr_warn("md: invalid array_size %llu > default size %llu\n",
6234			(unsigned long long)mddev->array_sectors / 2,
6235			(unsigned long long)pers->size(mddev, 0, 0) / 2);
6236		err = -EINVAL;
6237	}
6238	if (err == 0 && pers->sync_request &&
6239	    (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
6240		err = mddev->bitmap_ops->create(mddev);
6241		if (err)
6242			pr_warn("%s: failed to create bitmap (%d)\n",
6243				mdname(mddev), err);
6244	}
6245	if (err)
6246		goto bitmap_abort;
6247
6248	if (mddev->bitmap_info.max_write_behind > 0) {
6249		bool create_pool = false;
6250
6251		rdev_for_each(rdev, mddev) {
6252			if (test_bit(WriteMostly, &rdev->flags) &&
6253			    rdev_init_serial(rdev))
6254				create_pool = true;
6255		}
6256		if (create_pool && mddev->serial_info_pool == NULL) {
6257			mddev->serial_info_pool =
6258				mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
6259							    sizeof(struct serial_info));
6260			if (!mddev->serial_info_pool) {
6261				err = -ENOMEM;
6262				goto bitmap_abort;
6263			}
6264		}
6265	}
6266
6267	if (pers->sync_request) {
6268		if (mddev->kobj.sd &&
6269		    sysfs_create_group(&mddev->kobj, &md_redundancy_group))
6270			pr_warn("md: cannot register extra attributes for %s\n",
6271				mdname(mddev));
6272		mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
6273		mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
6274		mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
6275	} else if (mddev->ro == MD_AUTO_READ)
6276		mddev->ro = MD_RDWR;
6277
6278	atomic_set(&mddev->max_corr_read_errors,
6279		   MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
6280	mddev->safemode = 0;
6281	if (mddev_is_clustered(mddev))
6282		mddev->safemode_delay = 0;
6283	else
6284		mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
6285	mddev->in_sync = 1;
6286	smp_wmb();
6287	spin_lock(&mddev->lock);
6288	mddev->pers = pers;
6289	spin_unlock(&mddev->lock);
6290	rdev_for_each(rdev, mddev)
6291		if (rdev->raid_disk >= 0)
6292			sysfs_link_rdev(mddev, rdev); /* failure here is OK */
6293
6294	if (mddev->degraded && md_is_rdwr(mddev))
6295		/* This ensures that recovering status is reported immediately
6296		 * via sysfs - until a lack of spares is confirmed.
6297 */ 6298 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6299 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6300 6301 if (mddev->sb_flags) 6302 md_update_sb(mddev, 0); 6303 6304 md_new_event(); 6305 return 0; 6306 6307 bitmap_abort: 6308 mddev_detach(mddev); 6309 if (mddev->private) 6310 pers->free(mddev, mddev->private); 6311 mddev->private = NULL; 6312 put_pers(pers); 6313 mddev->bitmap_ops->destroy(mddev); 6314 abort: 6315 bioset_exit(&mddev->io_clone_set); 6316 exit_sync_set: 6317 bioset_exit(&mddev->sync_set); 6318 exit_bio_set: 6319 bioset_exit(&mddev->bio_set); 6320 return err; 6321 } 6322 EXPORT_SYMBOL_GPL(md_run); 6323 6324 int do_md_run(struct mddev *mddev) 6325 { 6326 int err; 6327 6328 set_bit(MD_NOT_READY, &mddev->flags); 6329 err = md_run(mddev); 6330 if (err) 6331 goto out; 6332 6333 err = mddev->bitmap_ops->load(mddev); 6334 if (err) { 6335 mddev->bitmap_ops->destroy(mddev); 6336 goto out; 6337 } 6338 6339 if (mddev_is_clustered(mddev)) 6340 md_allow_write(mddev); 6341 6342 /* run start up tasks that require md_thread */ 6343 md_start(mddev); 6344 6345 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 6346 6347 set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); 6348 clear_bit(MD_NOT_READY, &mddev->flags); 6349 mddev->changed = 1; 6350 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 6351 sysfs_notify_dirent_safe(mddev->sysfs_state); 6352 sysfs_notify_dirent_safe(mddev->sysfs_action); 6353 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 6354 out: 6355 clear_bit(MD_NOT_READY, &mddev->flags); 6356 return err; 6357 } 6358 6359 int md_start(struct mddev *mddev) 6360 { 6361 int ret = 0; 6362 6363 if (mddev->pers->start) { 6364 set_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6365 ret = mddev->pers->start(mddev); 6366 clear_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6367 md_wakeup_thread(mddev->sync_thread); 6368 } 6369 return ret; 6370 } 6371 EXPORT_SYMBOL_GPL(md_start); 6372 6373 static int restart_array(struct mddev *mddev) 6374 { 6375 struct gendisk *disk = mddev->gendisk; 6376 struct md_rdev *rdev; 6377 bool has_journal = false; 6378 bool has_readonly = false; 6379 6380 /* Complain if it has no devices */ 6381 if (list_empty(&mddev->disks)) 6382 return -ENXIO; 6383 if (!mddev->pers) 6384 return -EINVAL; 6385 if (md_is_rdwr(mddev)) 6386 return -EBUSY; 6387 6388 rcu_read_lock(); 6389 rdev_for_each_rcu(rdev, mddev) { 6390 if (test_bit(Journal, &rdev->flags) && 6391 !test_bit(Faulty, &rdev->flags)) 6392 has_journal = true; 6393 if (rdev_read_only(rdev)) 6394 has_readonly = true; 6395 } 6396 rcu_read_unlock(); 6397 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal) 6398 /* Don't restart rw with journal missing/faulty */ 6399 return -EINVAL; 6400 if (has_readonly) 6401 return -EROFS; 6402 6403 mddev->safemode = 0; 6404 mddev->ro = MD_RDWR; 6405 set_disk_ro(disk, 0); 6406 pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); 6407 /* Kick recovery or resync if necessary */ 6408 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6409 md_wakeup_thread(mddev->sync_thread); 6410 sysfs_notify_dirent_safe(mddev->sysfs_state); 6411 return 0; 6412 } 6413 6414 static void md_clean(struct mddev *mddev) 6415 { 6416 mddev->array_sectors = 0; 6417 mddev->external_size = 0; 6418 mddev->dev_sectors = 0; 6419 mddev->raid_disks = 0; 6420 mddev->recovery_cp = 0; 6421 mddev->resync_min = 0; 6422 mddev->resync_max = MaxSector; 6423 mddev->reshape_position = MaxSector; 6424 /* we still need mddev->external in export_rdev, do not clear it 
yet */ 6425 mddev->persistent = 0; 6426 mddev->level = LEVEL_NONE; 6427 mddev->clevel[0] = 0; 6428 /* if UNTIL_STOP is set, it's cleared here */ 6429 mddev->hold_active = 0; 6430 /* Don't clear MD_CLOSING, or mddev can be opened again. */ 6431 mddev->flags &= BIT_ULL_MASK(MD_CLOSING); 6432 mddev->sb_flags = 0; 6433 mddev->ro = MD_RDWR; 6434 mddev->metadata_type[0] = 0; 6435 mddev->chunk_sectors = 0; 6436 mddev->ctime = mddev->utime = 0; 6437 mddev->layout = 0; 6438 mddev->max_disks = 0; 6439 mddev->events = 0; 6440 mddev->can_decrease_events = 0; 6441 mddev->delta_disks = 0; 6442 mddev->reshape_backwards = 0; 6443 mddev->new_level = LEVEL_NONE; 6444 mddev->new_layout = 0; 6445 mddev->new_chunk_sectors = 0; 6446 mddev->curr_resync = MD_RESYNC_NONE; 6447 atomic64_set(&mddev->resync_mismatches, 0); 6448 mddev->suspend_lo = mddev->suspend_hi = 0; 6449 mddev->sync_speed_min = mddev->sync_speed_max = 0; 6450 mddev->recovery = 0; 6451 mddev->in_sync = 0; 6452 mddev->changed = 0; 6453 mddev->degraded = 0; 6454 mddev->safemode = 0; 6455 mddev->private = NULL; 6456 mddev->cluster_info = NULL; 6457 mddev->bitmap_info.offset = 0; 6458 mddev->bitmap_info.default_offset = 0; 6459 mddev->bitmap_info.default_space = 0; 6460 mddev->bitmap_info.chunksize = 0; 6461 mddev->bitmap_info.daemon_sleep = 0; 6462 mddev->bitmap_info.max_write_behind = 0; 6463 mddev->bitmap_info.nodes = 0; 6464 } 6465 6466 static void __md_stop_writes(struct mddev *mddev) 6467 { 6468 timer_delete_sync(&mddev->safemode_timer); 6469 6470 if (mddev->pers && mddev->pers->quiesce) { 6471 mddev->pers->quiesce(mddev, 1); 6472 mddev->pers->quiesce(mddev, 0); 6473 } 6474 6475 mddev->bitmap_ops->flush(mddev); 6476 6477 if (md_is_rdwr(mddev) && 6478 ((!mddev->in_sync && !mddev_is_clustered(mddev)) || 6479 mddev->sb_flags)) { 6480 /* mark array as shutdown cleanly */ 6481 if (!mddev_is_clustered(mddev)) 6482 mddev->in_sync = 1; 6483 md_update_sb(mddev, 1); 6484 } 6485 /* disable policy to guarantee rdevs free resources for serialization */ 6486 mddev->serialize_policy = 0; 6487 mddev_destroy_serial_pool(mddev, NULL); 6488 } 6489 6490 void md_stop_writes(struct mddev *mddev) 6491 { 6492 mddev_lock_nointr(mddev); 6493 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6494 stop_sync_thread(mddev, true); 6495 __md_stop_writes(mddev); 6496 mddev_unlock(mddev); 6497 } 6498 EXPORT_SYMBOL_GPL(md_stop_writes); 6499 6500 static void mddev_detach(struct mddev *mddev) 6501 { 6502 mddev->bitmap_ops->wait_behind_writes(mddev); 6503 if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { 6504 mddev->pers->quiesce(mddev, 1); 6505 mddev->pers->quiesce(mddev, 0); 6506 } 6507 md_unregister_thread(mddev, &mddev->thread); 6508 6509 /* the unplug fn references 'conf' */ 6510 if (!mddev_is_dm(mddev)) 6511 blk_sync_queue(mddev->gendisk->queue); 6512 } 6513 6514 static void __md_stop(struct mddev *mddev) 6515 { 6516 struct md_personality *pers = mddev->pers; 6517 6518 mddev->bitmap_ops->destroy(mddev); 6519 mddev_detach(mddev); 6520 spin_lock(&mddev->lock); 6521 mddev->pers = NULL; 6522 spin_unlock(&mddev->lock); 6523 if (mddev->private) 6524 pers->free(mddev, mddev->private); 6525 mddev->private = NULL; 6526 put_pers(pers); 6527 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6528 6529 bioset_exit(&mddev->bio_set); 6530 bioset_exit(&mddev->sync_set); 6531 bioset_exit(&mddev->io_clone_set); 6532 } 6533 6534 void md_stop(struct mddev *mddev) 6535 { 6536 lockdep_assert_held(&mddev->reconfig_mutex); 6537 6538 /* stop the array and free an attached data 
structures. 6539 * This is called from dm-raid 6540 */ 6541 __md_stop_writes(mddev); 6542 __md_stop(mddev); 6543 } 6544 6545 EXPORT_SYMBOL_GPL(md_stop); 6546 6547 /* ensure 'mddev->pers' exist before calling md_set_readonly() */ 6548 static int md_set_readonly(struct mddev *mddev) 6549 { 6550 int err = 0; 6551 int did_freeze = 0; 6552 6553 if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 6554 return -EBUSY; 6555 6556 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 6557 did_freeze = 1; 6558 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6559 } 6560 6561 stop_sync_thread(mddev, false); 6562 wait_event(mddev->sb_wait, 6563 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 6564 mddev_lock_nointr(mddev); 6565 6566 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 6567 pr_warn("md: %s still in use.\n",mdname(mddev)); 6568 err = -EBUSY; 6569 goto out; 6570 } 6571 6572 __md_stop_writes(mddev); 6573 6574 if (mddev->ro == MD_RDONLY) { 6575 err = -ENXIO; 6576 goto out; 6577 } 6578 6579 mddev->ro = MD_RDONLY; 6580 set_disk_ro(mddev->gendisk, 1); 6581 6582 out: 6583 if (!err || did_freeze) { 6584 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6585 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6586 sysfs_notify_dirent_safe(mddev->sysfs_state); 6587 } 6588 6589 return err; 6590 } 6591 6592 /* mode: 6593 * 0 - completely stop and dis-assemble array 6594 * 2 - stop but do not disassemble array 6595 */ 6596 static int do_md_stop(struct mddev *mddev, int mode) 6597 { 6598 struct gendisk *disk = mddev->gendisk; 6599 struct md_rdev *rdev; 6600 int did_freeze = 0; 6601 6602 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 6603 did_freeze = 1; 6604 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6605 } 6606 6607 stop_sync_thread(mddev, true); 6608 6609 if (mddev->sysfs_active || 6610 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 6611 pr_warn("md: %s still in use.\n",mdname(mddev)); 6612 if (did_freeze) { 6613 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6614 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6615 } 6616 return -EBUSY; 6617 } 6618 if (mddev->pers) { 6619 if (!md_is_rdwr(mddev)) 6620 set_disk_ro(disk, 0); 6621 6622 __md_stop_writes(mddev); 6623 __md_stop(mddev); 6624 6625 /* tell userspace to handle 'inactive' */ 6626 sysfs_notify_dirent_safe(mddev->sysfs_state); 6627 6628 rdev_for_each(rdev, mddev) 6629 if (rdev->raid_disk >= 0) 6630 sysfs_unlink_rdev(mddev, rdev); 6631 6632 set_capacity_and_notify(disk, 0); 6633 mddev->changed = 1; 6634 6635 if (!md_is_rdwr(mddev)) 6636 mddev->ro = MD_RDWR; 6637 } 6638 /* 6639 * Free resources if final stop 6640 */ 6641 if (mode == 0) { 6642 pr_info("md: %s stopped.\n", mdname(mddev)); 6643 6644 if (mddev->bitmap_info.file) { 6645 struct file *f = mddev->bitmap_info.file; 6646 spin_lock(&mddev->lock); 6647 mddev->bitmap_info.file = NULL; 6648 spin_unlock(&mddev->lock); 6649 fput(f); 6650 } 6651 mddev->bitmap_info.offset = 0; 6652 6653 export_array(mddev); 6654 md_clean(mddev); 6655 set_bit(MD_DELETED, &mddev->flags); 6656 } 6657 md_new_event(); 6658 sysfs_notify_dirent_safe(mddev->sysfs_state); 6659 return 0; 6660 } 6661 6662 #ifndef MODULE 6663 static void autorun_array(struct mddev *mddev) 6664 { 6665 struct md_rdev *rdev; 6666 int err; 6667 6668 if (list_empty(&mddev->disks)) 6669 return; 6670 6671 pr_info("md: running: "); 6672 6673 rdev_for_each(rdev, mddev) { 6674 pr_cont("<%pg>", rdev->bdev); 6675 } 6676 pr_cont("\n"); 6677 6678 err = do_md_run(mddev); 6679 if (err) { 6680 pr_warn("md: do_md_run() 
returned %d\n", err); 6681 do_md_stop(mddev, 0); 6682 } 6683 } 6684 6685 /* 6686 * lets try to run arrays based on all disks that have arrived 6687 * until now. (those are in pending_raid_disks) 6688 * 6689 * the method: pick the first pending disk, collect all disks with 6690 * the same UUID, remove all from the pending list and put them into 6691 * the 'same_array' list. Then order this list based on superblock 6692 * update time (freshest comes first), kick out 'old' disks and 6693 * compare superblocks. If everything's fine then run it. 6694 * 6695 * If "unit" is allocated, then bump its reference count 6696 */ 6697 static void autorun_devices(int part) 6698 { 6699 struct md_rdev *rdev0, *rdev, *tmp; 6700 struct mddev *mddev; 6701 6702 pr_info("md: autorun ...\n"); 6703 while (!list_empty(&pending_raid_disks)) { 6704 int unit; 6705 dev_t dev; 6706 LIST_HEAD(candidates); 6707 rdev0 = list_entry(pending_raid_disks.next, 6708 struct md_rdev, same_set); 6709 6710 pr_debug("md: considering %pg ...\n", rdev0->bdev); 6711 INIT_LIST_HEAD(&candidates); 6712 rdev_for_each_list(rdev, tmp, &pending_raid_disks) 6713 if (super_90_load(rdev, rdev0, 0) >= 0) { 6714 pr_debug("md: adding %pg ...\n", 6715 rdev->bdev); 6716 list_move(&rdev->same_set, &candidates); 6717 } 6718 /* 6719 * now we have a set of devices, with all of them having 6720 * mostly sane superblocks. It's time to allocate the 6721 * mddev. 6722 */ 6723 if (part) { 6724 dev = MKDEV(mdp_major, 6725 rdev0->preferred_minor << MdpMinorShift); 6726 unit = MINOR(dev) >> MdpMinorShift; 6727 } else { 6728 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 6729 unit = MINOR(dev); 6730 } 6731 if (rdev0->preferred_minor != unit) { 6732 pr_warn("md: unit number in %pg is bad: %d\n", 6733 rdev0->bdev, rdev0->preferred_minor); 6734 break; 6735 } 6736 6737 mddev = md_alloc(dev, NULL); 6738 if (IS_ERR(mddev)) 6739 break; 6740 6741 if (mddev_suspend_and_lock(mddev)) 6742 pr_warn("md: %s locked, cannot run\n", mdname(mddev)); 6743 else if (mddev->raid_disks || mddev->major_version 6744 || !list_empty(&mddev->disks)) { 6745 pr_warn("md: %s already running, cannot run %pg\n", 6746 mdname(mddev), rdev0->bdev); 6747 mddev_unlock_and_resume(mddev); 6748 } else { 6749 pr_debug("md: created %s\n", mdname(mddev)); 6750 mddev->persistent = 1; 6751 rdev_for_each_list(rdev, tmp, &candidates) { 6752 list_del_init(&rdev->same_set); 6753 if (bind_rdev_to_array(rdev, mddev)) 6754 export_rdev(rdev, mddev); 6755 } 6756 autorun_array(mddev); 6757 mddev_unlock_and_resume(mddev); 6758 } 6759 /* on success, candidates will be empty, on error 6760 * it won't... 6761 */ 6762 rdev_for_each_list(rdev, tmp, &candidates) { 6763 list_del_init(&rdev->same_set); 6764 export_rdev(rdev, mddev); 6765 } 6766 mddev_put(mddev); 6767 } 6768 pr_info("md: ... 
autorun DONE.\n"); 6769 } 6770 #endif /* !MODULE */ 6771 6772 static int get_version(void __user *arg) 6773 { 6774 mdu_version_t ver; 6775 6776 ver.major = MD_MAJOR_VERSION; 6777 ver.minor = MD_MINOR_VERSION; 6778 ver.patchlevel = MD_PATCHLEVEL_VERSION; 6779 6780 if (copy_to_user(arg, &ver, sizeof(ver))) 6781 return -EFAULT; 6782 6783 return 0; 6784 } 6785 6786 static int get_array_info(struct mddev *mddev, void __user *arg) 6787 { 6788 mdu_array_info_t info; 6789 int nr,working,insync,failed,spare; 6790 struct md_rdev *rdev; 6791 6792 nr = working = insync = failed = spare = 0; 6793 rcu_read_lock(); 6794 rdev_for_each_rcu(rdev, mddev) { 6795 nr++; 6796 if (test_bit(Faulty, &rdev->flags)) 6797 failed++; 6798 else { 6799 working++; 6800 if (test_bit(In_sync, &rdev->flags)) 6801 insync++; 6802 else if (test_bit(Journal, &rdev->flags)) 6803 /* TODO: add journal count to md_u.h */ 6804 ; 6805 else 6806 spare++; 6807 } 6808 } 6809 rcu_read_unlock(); 6810 6811 info.major_version = mddev->major_version; 6812 info.minor_version = mddev->minor_version; 6813 info.patch_version = MD_PATCHLEVEL_VERSION; 6814 info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); 6815 info.level = mddev->level; 6816 info.size = mddev->dev_sectors / 2; 6817 if (info.size != mddev->dev_sectors / 2) /* overflow */ 6818 info.size = -1; 6819 info.nr_disks = nr; 6820 info.raid_disks = mddev->raid_disks; 6821 info.md_minor = mddev->md_minor; 6822 info.not_persistent= !mddev->persistent; 6823 6824 info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 6825 info.state = 0; 6826 if (mddev->in_sync) 6827 info.state = (1<<MD_SB_CLEAN); 6828 if (mddev->bitmap && mddev->bitmap_info.offset) 6829 info.state |= (1<<MD_SB_BITMAP_PRESENT); 6830 if (mddev_is_clustered(mddev)) 6831 info.state |= (1<<MD_SB_CLUSTERED); 6832 info.active_disks = insync; 6833 info.working_disks = working; 6834 info.failed_disks = failed; 6835 info.spare_disks = spare; 6836 6837 info.layout = mddev->layout; 6838 info.chunk_size = mddev->chunk_sectors << 9; 6839 6840 if (copy_to_user(arg, &info, sizeof(info))) 6841 return -EFAULT; 6842 6843 return 0; 6844 } 6845 6846 static int get_bitmap_file(struct mddev *mddev, void __user * arg) 6847 { 6848 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 6849 char *ptr; 6850 int err; 6851 6852 file = kzalloc(sizeof(*file), GFP_NOIO); 6853 if (!file) 6854 return -ENOMEM; 6855 6856 err = 0; 6857 spin_lock(&mddev->lock); 6858 /* bitmap enabled */ 6859 if (mddev->bitmap_info.file) { 6860 ptr = file_path(mddev->bitmap_info.file, file->pathname, 6861 sizeof(file->pathname)); 6862 if (IS_ERR(ptr)) 6863 err = PTR_ERR(ptr); 6864 else 6865 memmove(file->pathname, ptr, 6866 sizeof(file->pathname)-(ptr-file->pathname)); 6867 } 6868 spin_unlock(&mddev->lock); 6869 6870 if (err == 0 && 6871 copy_to_user(arg, file, sizeof(*file))) 6872 err = -EFAULT; 6873 6874 kfree(file); 6875 return err; 6876 } 6877 6878 static int get_disk_info(struct mddev *mddev, void __user * arg) 6879 { 6880 mdu_disk_info_t info; 6881 struct md_rdev *rdev; 6882 6883 if (copy_from_user(&info, arg, sizeof(info))) 6884 return -EFAULT; 6885 6886 rcu_read_lock(); 6887 rdev = md_find_rdev_nr_rcu(mddev, info.number); 6888 if (rdev) { 6889 info.major = MAJOR(rdev->bdev->bd_dev); 6890 info.minor = MINOR(rdev->bdev->bd_dev); 6891 info.raid_disk = rdev->raid_disk; 6892 info.state = 0; 6893 if (test_bit(Faulty, &rdev->flags)) 6894 info.state |= (1<<MD_DISK_FAULTY); 6895 else if (test_bit(In_sync, &rdev->flags)) { 6896 info.state |= (1<<MD_DISK_ACTIVE); 
6897 info.state |= (1<<MD_DISK_SYNC); 6898 } 6899 if (test_bit(Journal, &rdev->flags)) 6900 info.state |= (1<<MD_DISK_JOURNAL); 6901 if (test_bit(WriteMostly, &rdev->flags)) 6902 info.state |= (1<<MD_DISK_WRITEMOSTLY); 6903 if (test_bit(FailFast, &rdev->flags)) 6904 info.state |= (1<<MD_DISK_FAILFAST); 6905 } else { 6906 info.major = info.minor = 0; 6907 info.raid_disk = -1; 6908 info.state = (1<<MD_DISK_REMOVED); 6909 } 6910 rcu_read_unlock(); 6911 6912 if (copy_to_user(arg, &info, sizeof(info))) 6913 return -EFAULT; 6914 6915 return 0; 6916 } 6917 6918 int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) 6919 { 6920 struct md_rdev *rdev; 6921 dev_t dev = MKDEV(info->major,info->minor); 6922 6923 if (mddev_is_clustered(mddev) && 6924 !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { 6925 pr_warn("%s: Cannot add to clustered mddev.\n", 6926 mdname(mddev)); 6927 return -EINVAL; 6928 } 6929 6930 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 6931 return -EOVERFLOW; 6932 6933 if (!mddev->raid_disks) { 6934 int err; 6935 /* expecting a device which has a superblock */ 6936 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 6937 if (IS_ERR(rdev)) { 6938 pr_warn("md: md_import_device returned %ld\n", 6939 PTR_ERR(rdev)); 6940 return PTR_ERR(rdev); 6941 } 6942 if (!list_empty(&mddev->disks)) { 6943 struct md_rdev *rdev0 6944 = list_entry(mddev->disks.next, 6945 struct md_rdev, same_set); 6946 err = super_types[mddev->major_version] 6947 .load_super(rdev, rdev0, mddev->minor_version); 6948 if (err < 0) { 6949 pr_warn("md: %pg has different UUID to %pg\n", 6950 rdev->bdev, 6951 rdev0->bdev); 6952 export_rdev(rdev, mddev); 6953 return -EINVAL; 6954 } 6955 } 6956 err = bind_rdev_to_array(rdev, mddev); 6957 if (err) 6958 export_rdev(rdev, mddev); 6959 return err; 6960 } 6961 6962 /* 6963 * md_add_new_disk can be used once the array is assembled 6964 * to add "hot spares". They must already have a superblock 6965 * written 6966 */ 6967 if (mddev->pers) { 6968 int err; 6969 if (!mddev->pers->hot_add_disk) { 6970 pr_warn("%s: personality does not support diskops!\n", 6971 mdname(mddev)); 6972 return -EINVAL; 6973 } 6974 if (mddev->persistent) 6975 rdev = md_import_device(dev, mddev->major_version, 6976 mddev->minor_version); 6977 else 6978 rdev = md_import_device(dev, -1, -1); 6979 if (IS_ERR(rdev)) { 6980 pr_warn("md: md_import_device returned %ld\n", 6981 PTR_ERR(rdev)); 6982 return PTR_ERR(rdev); 6983 } 6984 /* set saved_raid_disk if appropriate */ 6985 if (!mddev->persistent) { 6986 if (info->state & (1<<MD_DISK_SYNC) && 6987 info->raid_disk < mddev->raid_disks) { 6988 rdev->raid_disk = info->raid_disk; 6989 clear_bit(Bitmap_sync, &rdev->flags); 6990 } else 6991 rdev->raid_disk = -1; 6992 rdev->saved_raid_disk = rdev->raid_disk; 6993 } else 6994 super_types[mddev->major_version]. 6995 validate_super(mddev, NULL/*freshest*/, rdev); 6996 if ((info->state & (1<<MD_DISK_SYNC)) && 6997 rdev->raid_disk != info->raid_disk) { 6998 /* This was a hot-add request, but events doesn't 6999 * match, so reject it. 
7000 */ 7001 export_rdev(rdev, mddev); 7002 return -EINVAL; 7003 } 7004 7005 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 7006 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 7007 set_bit(WriteMostly, &rdev->flags); 7008 else 7009 clear_bit(WriteMostly, &rdev->flags); 7010 if (info->state & (1<<MD_DISK_FAILFAST)) 7011 set_bit(FailFast, &rdev->flags); 7012 else 7013 clear_bit(FailFast, &rdev->flags); 7014 7015 if (info->state & (1<<MD_DISK_JOURNAL)) { 7016 struct md_rdev *rdev2; 7017 bool has_journal = false; 7018 7019 /* make sure no existing journal disk */ 7020 rdev_for_each(rdev2, mddev) { 7021 if (test_bit(Journal, &rdev2->flags)) { 7022 has_journal = true; 7023 break; 7024 } 7025 } 7026 if (has_journal || mddev->bitmap) { 7027 export_rdev(rdev, mddev); 7028 return -EBUSY; 7029 } 7030 set_bit(Journal, &rdev->flags); 7031 } 7032 /* 7033 * check whether the device shows up in other nodes 7034 */ 7035 if (mddev_is_clustered(mddev)) { 7036 if (info->state & (1 << MD_DISK_CANDIDATE)) 7037 set_bit(Candidate, &rdev->flags); 7038 else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { 7039 /* --add initiated by this node */ 7040 err = mddev->cluster_ops->add_new_disk(mddev, rdev); 7041 if (err) { 7042 export_rdev(rdev, mddev); 7043 return err; 7044 } 7045 } 7046 } 7047 7048 rdev->raid_disk = -1; 7049 err = bind_rdev_to_array(rdev, mddev); 7050 7051 if (err) 7052 export_rdev(rdev, mddev); 7053 7054 if (mddev_is_clustered(mddev)) { 7055 if (info->state & (1 << MD_DISK_CANDIDATE)) { 7056 if (!err) { 7057 err = mddev->cluster_ops->new_disk_ack( 7058 mddev, err == 0); 7059 if (err) 7060 md_kick_rdev_from_array(rdev); 7061 } 7062 } else { 7063 if (err) 7064 mddev->cluster_ops->add_new_disk_cancel(mddev); 7065 else 7066 err = add_bound_rdev(rdev); 7067 } 7068 7069 } else if (!err) 7070 err = add_bound_rdev(rdev); 7071 7072 return err; 7073 } 7074 7075 /* otherwise, md_add_new_disk is only allowed 7076 * for major_version==0 superblocks 7077 */ 7078 if (mddev->major_version != 0) { 7079 pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev)); 7080 return -EINVAL; 7081 } 7082 7083 if (!(info->state & (1<<MD_DISK_FAULTY))) { 7084 int err; 7085 rdev = md_import_device(dev, -1, 0); 7086 if (IS_ERR(rdev)) { 7087 pr_warn("md: error, md_import_device() returned %ld\n", 7088 PTR_ERR(rdev)); 7089 return PTR_ERR(rdev); 7090 } 7091 rdev->desc_nr = info->number; 7092 if (info->raid_disk < mddev->raid_disks) 7093 rdev->raid_disk = info->raid_disk; 7094 else 7095 rdev->raid_disk = -1; 7096 7097 if (rdev->raid_disk < mddev->raid_disks) 7098 if (info->state & (1<<MD_DISK_SYNC)) 7099 set_bit(In_sync, &rdev->flags); 7100 7101 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 7102 set_bit(WriteMostly, &rdev->flags); 7103 if (info->state & (1<<MD_DISK_FAILFAST)) 7104 set_bit(FailFast, &rdev->flags); 7105 7106 if (!mddev->persistent) { 7107 pr_debug("md: nonpersistent superblock ...\n"); 7108 rdev->sb_start = bdev_nr_sectors(rdev->bdev); 7109 } else 7110 rdev->sb_start = calc_dev_sboffset(rdev); 7111 rdev->sectors = rdev->sb_start; 7112 7113 err = bind_rdev_to_array(rdev, mddev); 7114 if (err) { 7115 export_rdev(rdev, mddev); 7116 return err; 7117 } 7118 } 7119 7120 return 0; 7121 } 7122 7123 static int hot_remove_disk(struct mddev *mddev, dev_t dev) 7124 { 7125 struct md_rdev *rdev; 7126 7127 if (!mddev->pers) 7128 return -ENODEV; 7129 7130 rdev = find_rdev(mddev, dev); 7131 if (!rdev) 7132 return -ENXIO; 7133 7134 if (rdev->raid_disk < 0) 7135 goto kick_rdev; 7136 7137 clear_bit(Blocked, &rdev->flags); 7138 
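	/*
	 * Ask the personality to retire this device from its slot; if it is
	 * still active afterwards (raid_disk >= 0) the disk is busy and
	 * cannot be removed.
	 */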
remove_and_add_spares(mddev, rdev); 7139 7140 if (rdev->raid_disk >= 0) 7141 goto busy; 7142 7143 kick_rdev: 7144 if (mddev_is_clustered(mddev) && 7145 mddev->cluster_ops->remove_disk(mddev, rdev)) 7146 goto busy; 7147 7148 md_kick_rdev_from_array(rdev); 7149 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7150 if (!mddev->thread) 7151 md_update_sb(mddev, 1); 7152 md_new_event(); 7153 7154 return 0; 7155 busy: 7156 pr_debug("md: cannot remove active disk %pg from %s ...\n", 7157 rdev->bdev, mdname(mddev)); 7158 return -EBUSY; 7159 } 7160 7161 static int hot_add_disk(struct mddev *mddev, dev_t dev) 7162 { 7163 int err; 7164 struct md_rdev *rdev; 7165 7166 if (!mddev->pers) 7167 return -ENODEV; 7168 7169 if (mddev->major_version != 0) { 7170 pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n", 7171 mdname(mddev)); 7172 return -EINVAL; 7173 } 7174 if (!mddev->pers->hot_add_disk) { 7175 pr_warn("%s: personality does not support diskops!\n", 7176 mdname(mddev)); 7177 return -EINVAL; 7178 } 7179 7180 rdev = md_import_device(dev, -1, 0); 7181 if (IS_ERR(rdev)) { 7182 pr_warn("md: error, md_import_device() returned %ld\n", 7183 PTR_ERR(rdev)); 7184 return -EINVAL; 7185 } 7186 7187 if (mddev->persistent) 7188 rdev->sb_start = calc_dev_sboffset(rdev); 7189 else 7190 rdev->sb_start = bdev_nr_sectors(rdev->bdev); 7191 7192 rdev->sectors = rdev->sb_start; 7193 7194 if (test_bit(Faulty, &rdev->flags)) { 7195 pr_warn("md: can not hot-add faulty %pg disk to %s!\n", 7196 rdev->bdev, mdname(mddev)); 7197 err = -EINVAL; 7198 goto abort_export; 7199 } 7200 7201 clear_bit(In_sync, &rdev->flags); 7202 rdev->desc_nr = -1; 7203 rdev->saved_raid_disk = -1; 7204 err = bind_rdev_to_array(rdev, mddev); 7205 if (err) 7206 goto abort_export; 7207 7208 /* 7209 * The rest should better be atomic, we can have disk failures 7210 * noticed in interrupt contexts ... 7211 */ 7212 7213 rdev->raid_disk = -1; 7214 7215 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7216 if (!mddev->thread) 7217 md_update_sb(mddev, 1); 7218 /* 7219 * Kick recovery, maybe this spare has to be added to the 7220 * array immediately. 7221 */ 7222 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7223 md_new_event(); 7224 return 0; 7225 7226 abort_export: 7227 export_rdev(rdev, mddev); 7228 return err; 7229 } 7230 7231 static int set_bitmap_file(struct mddev *mddev, int fd) 7232 { 7233 int err = 0; 7234 7235 if (mddev->pers) { 7236 if (!mddev->pers->quiesce || !mddev->thread) 7237 return -EBUSY; 7238 if (mddev->recovery || mddev->sync_thread) 7239 return -EBUSY; 7240 /* we should be able to change the bitmap.. 
	 */
7241	}
7242
7243	if (fd >= 0) {
7244		struct inode *inode;
7245		struct file *f;
7246
7247		if (mddev->bitmap || mddev->bitmap_info.file)
7248			return -EEXIST; /* cannot add when bitmap is present */
7249
7250		if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) {
7251			pr_warn("%s: bitmap files not supported by this kernel\n",
7252				mdname(mddev));
7253			return -EINVAL;
7254		}
7255		pr_warn("%s: using deprecated bitmap file support\n",
7256			mdname(mddev));
7257
7258		f = fget(fd);
7259
7260		if (f == NULL) {
7261			pr_warn("%s: error: failed to get bitmap file\n",
7262				mdname(mddev));
7263			return -EBADF;
7264		}
7265
7266		inode = f->f_mapping->host;
7267		if (!S_ISREG(inode->i_mode)) {
7268			pr_warn("%s: error: bitmap file must be a regular file\n",
7269				mdname(mddev));
7270			err = -EBADF;
7271		} else if (!(f->f_mode & FMODE_WRITE)) {
7272			pr_warn("%s: error: bitmap file must be open for write\n",
7273				mdname(mddev));
7274			err = -EBADF;
7275		} else if (atomic_read(&inode->i_writecount) != 1) {
7276			pr_warn("%s: error: bitmap file is already in use\n",
7277				mdname(mddev));
7278			err = -EBUSY;
7279		}
7280		if (err) {
7281			fput(f);
7282			return err;
7283		}
7284		mddev->bitmap_info.file = f;
7285		mddev->bitmap_info.offset = 0; /* file overrides offset */
7286	} else if (mddev->bitmap == NULL)
7287		return -ENOENT; /* cannot remove what isn't there */
7288	err = 0;
7289	if (mddev->pers) {
7290		if (fd >= 0) {
7291			err = mddev->bitmap_ops->create(mddev);
7292			if (!err)
7293				err = mddev->bitmap_ops->load(mddev);
7294
7295			if (err) {
7296				mddev->bitmap_ops->destroy(mddev);
7297				fd = -1;
7298			}
7299		} else if (fd < 0) {
7300			mddev->bitmap_ops->destroy(mddev);
7301		}
7302	}
7303
7304	if (fd < 0) {
7305		struct file *f = mddev->bitmap_info.file;
7306		if (f) {
7307			spin_lock(&mddev->lock);
7308			mddev->bitmap_info.file = NULL;
7309			spin_unlock(&mddev->lock);
7310			fput(f);
7311		}
7312	}
7313
7314	return err;
7315 }
7316
7317 /*
7318  * md_set_array_info is used in two different ways.
7319  * The original usage is when creating a new array.
7320  * In this usage, raid_disks is > 0 and it together with
7321  * level, size, not_persistent, layout, chunksize determine the
7322  * shape of the array.
7323  * This will always create an array with a type-0.90.0 superblock.
7324  * The newer usage is when assembling an array.
7325  * In this case raid_disks will be 0, and the major_version field is
7326  * used to determine which style super-blocks are to be found on the devices.
7327  * The minor and patch _version numbers are also kept in case the
7328  * super_block handler wishes to interpret them.
7329  */
7330 int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info)
7331 {
7332	if (info->raid_disks == 0) {
7333		/* just setting version number for superblock loading */
7334		if (info->major_version < 0 ||
7335		    info->major_version >= ARRAY_SIZE(super_types) ||
7336		    super_types[info->major_version].name == NULL) {
7337			/* maybe try to auto-load a module? */
7338			pr_warn("md: superblock version %d not known\n",
7339				info->major_version);
7340			return -EINVAL;
7341		}
7342		mddev->major_version = info->major_version;
7343		mddev->minor_version = info->minor_version;
7344		mddev->patch_version = info->patch_version;
7345		mddev->persistent = !info->not_persistent;
7346		/* ensure mddev_put doesn't delete this now that there
7347		 * is some minimal configuration.
7348 */ 7349 mddev->ctime = ktime_get_real_seconds(); 7350 return 0; 7351 } 7352 mddev->major_version = MD_MAJOR_VERSION; 7353 mddev->minor_version = MD_MINOR_VERSION; 7354 mddev->patch_version = MD_PATCHLEVEL_VERSION; 7355 mddev->ctime = ktime_get_real_seconds(); 7356 7357 mddev->level = info->level; 7358 mddev->clevel[0] = 0; 7359 mddev->dev_sectors = 2 * (sector_t)info->size; 7360 mddev->raid_disks = info->raid_disks; 7361 /* don't set md_minor, it is determined by which /dev/md* was 7362 * openned 7363 */ 7364 if (info->state & (1<<MD_SB_CLEAN)) 7365 mddev->recovery_cp = MaxSector; 7366 else 7367 mddev->recovery_cp = 0; 7368 mddev->persistent = ! info->not_persistent; 7369 mddev->external = 0; 7370 7371 mddev->layout = info->layout; 7372 if (mddev->level == 0) 7373 /* Cannot trust RAID0 layout info here */ 7374 mddev->layout = -1; 7375 mddev->chunk_sectors = info->chunk_size >> 9; 7376 7377 if (mddev->persistent) { 7378 mddev->max_disks = MD_SB_DISKS; 7379 mddev->flags = 0; 7380 mddev->sb_flags = 0; 7381 } 7382 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7383 7384 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 7385 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 7386 mddev->bitmap_info.offset = 0; 7387 7388 mddev->reshape_position = MaxSector; 7389 7390 /* 7391 * Generate a 128 bit UUID 7392 */ 7393 get_random_bytes(mddev->uuid, 16); 7394 7395 mddev->new_level = mddev->level; 7396 mddev->new_chunk_sectors = mddev->chunk_sectors; 7397 mddev->new_layout = mddev->layout; 7398 mddev->delta_disks = 0; 7399 mddev->reshape_backwards = 0; 7400 7401 return 0; 7402 } 7403 7404 void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) 7405 { 7406 lockdep_assert_held(&mddev->reconfig_mutex); 7407 7408 if (mddev->external_size) 7409 return; 7410 7411 mddev->array_sectors = array_sectors; 7412 } 7413 EXPORT_SYMBOL(md_set_array_sectors); 7414 7415 static int update_size(struct mddev *mddev, sector_t num_sectors) 7416 { 7417 struct md_rdev *rdev; 7418 int rv; 7419 int fit = (num_sectors == 0); 7420 sector_t old_dev_sectors = mddev->dev_sectors; 7421 7422 if (mddev->pers->resize == NULL) 7423 return -EINVAL; 7424 /* The "num_sectors" is the number of sectors of each device that 7425 * is used. This can only make sense for arrays with redundancy. 7426 * linear and raid0 always use whatever space is available. We can only 7427 * consider changing this number if no resync or reconstruction is 7428 * happening, and if the new size is acceptable. It must fit before the 7429 * sb_start or, if that is <data_offset, it must fit before the size 7430 * of each device. If num_sectors is zero, we find the largest size 7431 * that fits. 
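	 * (For instance, growing an array with "mdadm --grow --size=max"
	 * typically reaches this point with num_sectors == 0.)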
7432 */ 7433 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 7434 return -EBUSY; 7435 if (!md_is_rdwr(mddev)) 7436 return -EROFS; 7437 7438 rdev_for_each(rdev, mddev) { 7439 sector_t avail = rdev->sectors; 7440 7441 if (fit && (num_sectors == 0 || num_sectors > avail)) 7442 num_sectors = avail; 7443 if (avail < num_sectors) 7444 return -ENOSPC; 7445 } 7446 rv = mddev->pers->resize(mddev, num_sectors); 7447 if (!rv) { 7448 if (mddev_is_clustered(mddev)) 7449 mddev->cluster_ops->update_size(mddev, old_dev_sectors); 7450 else if (!mddev_is_dm(mddev)) 7451 set_capacity_and_notify(mddev->gendisk, 7452 mddev->array_sectors); 7453 } 7454 return rv; 7455 } 7456 7457 static int update_raid_disks(struct mddev *mddev, int raid_disks) 7458 { 7459 int rv; 7460 struct md_rdev *rdev; 7461 /* change the number of raid disks */ 7462 if (mddev->pers->check_reshape == NULL) 7463 return -EINVAL; 7464 if (!md_is_rdwr(mddev)) 7465 return -EROFS; 7466 if (raid_disks <= 0 || 7467 (mddev->max_disks && raid_disks >= mddev->max_disks)) 7468 return -EINVAL; 7469 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 7470 test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) || 7471 mddev->reshape_position != MaxSector) 7472 return -EBUSY; 7473 7474 rdev_for_each(rdev, mddev) { 7475 if (mddev->raid_disks < raid_disks && 7476 rdev->data_offset < rdev->new_data_offset) 7477 return -EINVAL; 7478 if (mddev->raid_disks > raid_disks && 7479 rdev->data_offset > rdev->new_data_offset) 7480 return -EINVAL; 7481 } 7482 7483 mddev->delta_disks = raid_disks - mddev->raid_disks; 7484 if (mddev->delta_disks < 0) 7485 mddev->reshape_backwards = 1; 7486 else if (mddev->delta_disks > 0) 7487 mddev->reshape_backwards = 0; 7488 7489 rv = mddev->pers->check_reshape(mddev); 7490 if (rv < 0) { 7491 mddev->delta_disks = 0; 7492 mddev->reshape_backwards = 0; 7493 } 7494 return rv; 7495 } 7496 7497 static int get_cluster_ops(struct mddev *mddev) 7498 { 7499 xa_lock(&md_submodule); 7500 mddev->cluster_ops = xa_load(&md_submodule, ID_CLUSTER); 7501 if (mddev->cluster_ops && 7502 !try_module_get(mddev->cluster_ops->head.owner)) 7503 mddev->cluster_ops = NULL; 7504 xa_unlock(&md_submodule); 7505 7506 return mddev->cluster_ops == NULL ? -ENOENT : 0; 7507 } 7508 7509 static void put_cluster_ops(struct mddev *mddev) 7510 { 7511 if (!mddev->cluster_ops) 7512 return; 7513 7514 mddev->cluster_ops->leave(mddev); 7515 module_put(mddev->cluster_ops->head.owner); 7516 mddev->cluster_ops = NULL; 7517 } 7518 7519 /* 7520 * update_array_info is used to change the configuration of an 7521 * on-line array. 7522 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 7523 * fields in the info are checked against the array. 7524 * Any differences that cannot be handled will cause an error. 7525 * Normally, only one change can be managed at a time. 
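 * The single changes handled below are: device size, number of raid
 * disks, layout, and adding or removing an internal bitmap.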
7526 */ 7527 static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) 7528 { 7529 int rv = 0; 7530 int cnt = 0; 7531 int state = 0; 7532 7533 /* calculate expected state,ignoring low bits */ 7534 if (mddev->bitmap && mddev->bitmap_info.offset) 7535 state |= (1 << MD_SB_BITMAP_PRESENT); 7536 7537 if (mddev->major_version != info->major_version || 7538 mddev->minor_version != info->minor_version || 7539 /* mddev->patch_version != info->patch_version || */ 7540 mddev->ctime != info->ctime || 7541 mddev->level != info->level || 7542 /* mddev->layout != info->layout || */ 7543 mddev->persistent != !info->not_persistent || 7544 mddev->chunk_sectors != info->chunk_size >> 9 || 7545 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 7546 ((state^info->state) & 0xfffffe00) 7547 ) 7548 return -EINVAL; 7549 /* Check there is only one change */ 7550 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 7551 cnt++; 7552 if (mddev->raid_disks != info->raid_disks) 7553 cnt++; 7554 if (mddev->layout != info->layout) 7555 cnt++; 7556 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) 7557 cnt++; 7558 if (cnt == 0) 7559 return 0; 7560 if (cnt > 1) 7561 return -EINVAL; 7562 7563 if (mddev->layout != info->layout) { 7564 /* Change layout 7565 * we don't need to do anything at the md level, the 7566 * personality will take care of it all. 7567 */ 7568 if (mddev->pers->check_reshape == NULL) 7569 return -EINVAL; 7570 else { 7571 mddev->new_layout = info->layout; 7572 rv = mddev->pers->check_reshape(mddev); 7573 if (rv) 7574 mddev->new_layout = mddev->layout; 7575 return rv; 7576 } 7577 } 7578 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 7579 rv = update_size(mddev, (sector_t)info->size * 2); 7580 7581 if (mddev->raid_disks != info->raid_disks) 7582 rv = update_raid_disks(mddev, info->raid_disks); 7583 7584 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 7585 if (mddev->pers->quiesce == NULL || mddev->thread == NULL) { 7586 rv = -EINVAL; 7587 goto err; 7588 } 7589 if (mddev->recovery || mddev->sync_thread) { 7590 rv = -EBUSY; 7591 goto err; 7592 } 7593 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 7594 /* add the bitmap */ 7595 if (mddev->bitmap) { 7596 rv = -EEXIST; 7597 goto err; 7598 } 7599 if (mddev->bitmap_info.default_offset == 0) { 7600 rv = -EINVAL; 7601 goto err; 7602 } 7603 mddev->bitmap_info.offset = 7604 mddev->bitmap_info.default_offset; 7605 mddev->bitmap_info.space = 7606 mddev->bitmap_info.default_space; 7607 rv = mddev->bitmap_ops->create(mddev); 7608 if (!rv) 7609 rv = mddev->bitmap_ops->load(mddev); 7610 7611 if (rv) 7612 mddev->bitmap_ops->destroy(mddev); 7613 } else { 7614 struct md_bitmap_stats stats; 7615 7616 rv = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 7617 if (rv) 7618 goto err; 7619 7620 if (stats.file) { 7621 rv = -EINVAL; 7622 goto err; 7623 } 7624 7625 if (mddev->bitmap_info.nodes) { 7626 /* hold PW on all the bitmap lock */ 7627 if (mddev->cluster_ops->lock_all_bitmaps(mddev) <= 0) { 7628 pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n"); 7629 rv = -EPERM; 7630 mddev->cluster_ops->unlock_all_bitmaps(mddev); 7631 goto err; 7632 } 7633 7634 mddev->bitmap_info.nodes = 0; 7635 put_cluster_ops(mddev); 7636 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; 7637 } 7638 mddev->bitmap_ops->destroy(mddev); 7639 mddev->bitmap_info.offset = 0; 7640 } 7641 } 7642 md_update_sb(mddev, 1); 7643 return rv; 7644 err: 7645 return rv; 7646 } 7647 7648 static int 
set_disk_faulty(struct mddev *mddev, dev_t dev) 7649 { 7650 struct md_rdev *rdev; 7651 int err = 0; 7652 7653 if (mddev->pers == NULL) 7654 return -ENODEV; 7655 7656 rcu_read_lock(); 7657 rdev = md_find_rdev_rcu(mddev, dev); 7658 if (!rdev) 7659 err = -ENODEV; 7660 else { 7661 md_error(mddev, rdev); 7662 if (test_bit(MD_BROKEN, &mddev->flags)) 7663 err = -EBUSY; 7664 } 7665 rcu_read_unlock(); 7666 return err; 7667 } 7668 7669 /* 7670 * We have a problem here : there is no easy way to give a CHS 7671 * virtual geometry. We currently pretend that we have a 2 heads 7672 * 4 sectors (with a BIG number of cylinders...). This drives 7673 * dosfs just mad... ;-) 7674 */ 7675 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 7676 { 7677 struct mddev *mddev = bdev->bd_disk->private_data; 7678 7679 geo->heads = 2; 7680 geo->sectors = 4; 7681 geo->cylinders = mddev->array_sectors / 8; 7682 return 0; 7683 } 7684 7685 static inline int md_ioctl_valid(unsigned int cmd) 7686 { 7687 switch (cmd) { 7688 case GET_ARRAY_INFO: 7689 case GET_DISK_INFO: 7690 case RAID_VERSION: 7691 return 0; 7692 case ADD_NEW_DISK: 7693 case GET_BITMAP_FILE: 7694 case HOT_ADD_DISK: 7695 case HOT_REMOVE_DISK: 7696 case RESTART_ARRAY_RW: 7697 case RUN_ARRAY: 7698 case SET_ARRAY_INFO: 7699 case SET_BITMAP_FILE: 7700 case SET_DISK_FAULTY: 7701 case STOP_ARRAY: 7702 case STOP_ARRAY_RO: 7703 case CLUSTERED_DISK_NACK: 7704 if (!capable(CAP_SYS_ADMIN)) 7705 return -EACCES; 7706 return 0; 7707 default: 7708 return -ENOTTY; 7709 } 7710 } 7711 7712 static bool md_ioctl_need_suspend(unsigned int cmd) 7713 { 7714 switch (cmd) { 7715 case ADD_NEW_DISK: 7716 case HOT_ADD_DISK: 7717 case HOT_REMOVE_DISK: 7718 case SET_BITMAP_FILE: 7719 case SET_ARRAY_INFO: 7720 return true; 7721 default: 7722 return false; 7723 } 7724 } 7725 7726 static int __md_set_array_info(struct mddev *mddev, void __user *argp) 7727 { 7728 mdu_array_info_t info; 7729 int err; 7730 7731 if (!argp) 7732 memset(&info, 0, sizeof(info)); 7733 else if (copy_from_user(&info, argp, sizeof(info))) 7734 return -EFAULT; 7735 7736 if (mddev->pers) { 7737 err = update_array_info(mddev, &info); 7738 if (err) 7739 pr_warn("md: couldn't update array info. %d\n", err); 7740 return err; 7741 } 7742 7743 if (!list_empty(&mddev->disks)) { 7744 pr_warn("md: array %s already has disks!\n", mdname(mddev)); 7745 return -EBUSY; 7746 } 7747 7748 if (mddev->raid_disks) { 7749 pr_warn("md: array %s already initialised!\n", mdname(mddev)); 7750 return -EBUSY; 7751 } 7752 7753 err = md_set_array_info(mddev, &info); 7754 if (err) 7755 pr_warn("md: couldn't set array info. 
%d\n", err); 7756 7757 return err; 7758 } 7759 7760 static int md_ioctl(struct block_device *bdev, blk_mode_t mode, 7761 unsigned int cmd, unsigned long arg) 7762 { 7763 int err = 0; 7764 void __user *argp = (void __user *)arg; 7765 struct mddev *mddev = NULL; 7766 7767 err = md_ioctl_valid(cmd); 7768 if (err) 7769 return err; 7770 7771 /* 7772 * Commands dealing with the RAID driver but not any 7773 * particular array: 7774 */ 7775 if (cmd == RAID_VERSION) 7776 return get_version(argp); 7777 7778 /* 7779 * Commands creating/starting a new array: 7780 */ 7781 7782 mddev = bdev->bd_disk->private_data; 7783 7784 /* Some actions do not requires the mutex */ 7785 switch (cmd) { 7786 case GET_ARRAY_INFO: 7787 if (!mddev->raid_disks && !mddev->external) 7788 return -ENODEV; 7789 return get_array_info(mddev, argp); 7790 7791 case GET_DISK_INFO: 7792 if (!mddev->raid_disks && !mddev->external) 7793 return -ENODEV; 7794 return get_disk_info(mddev, argp); 7795 7796 case SET_DISK_FAULTY: 7797 return set_disk_faulty(mddev, new_decode_dev(arg)); 7798 7799 case GET_BITMAP_FILE: 7800 return get_bitmap_file(mddev, argp); 7801 } 7802 7803 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { 7804 /* Need to flush page cache, and ensure no-one else opens 7805 * and writes 7806 */ 7807 err = mddev_set_closing_and_sync_blockdev(mddev, 1); 7808 if (err) 7809 return err; 7810 } 7811 7812 if (!md_is_rdwr(mddev)) 7813 flush_work(&mddev->sync_work); 7814 7815 err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) : 7816 mddev_lock(mddev); 7817 if (err) { 7818 pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n", 7819 err, cmd); 7820 goto out; 7821 } 7822 7823 if (cmd == SET_ARRAY_INFO) { 7824 err = __md_set_array_info(mddev, argp); 7825 goto unlock; 7826 } 7827 7828 /* 7829 * Commands querying/configuring an existing array: 7830 */ 7831 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 7832 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 7833 if ((!mddev->raid_disks && !mddev->external) 7834 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 7835 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 7836 && cmd != GET_BITMAP_FILE) { 7837 err = -ENODEV; 7838 goto unlock; 7839 } 7840 7841 /* 7842 * Commands even a read-only array can execute: 7843 */ 7844 switch (cmd) { 7845 case RESTART_ARRAY_RW: 7846 err = restart_array(mddev); 7847 goto unlock; 7848 7849 case STOP_ARRAY: 7850 err = do_md_stop(mddev, 0); 7851 goto unlock; 7852 7853 case STOP_ARRAY_RO: 7854 if (mddev->pers) 7855 err = md_set_readonly(mddev); 7856 goto unlock; 7857 7858 case HOT_REMOVE_DISK: 7859 err = hot_remove_disk(mddev, new_decode_dev(arg)); 7860 goto unlock; 7861 7862 case ADD_NEW_DISK: 7863 /* We can support ADD_NEW_DISK on read-only arrays 7864 * only if we are re-adding a preexisting device. 7865 * So require mddev->pers and MD_DISK_SYNC. 7866 */ 7867 if (mddev->pers) { 7868 mdu_disk_info_t info; 7869 if (copy_from_user(&info, argp, sizeof(info))) 7870 err = -EFAULT; 7871 else if (!(info.state & (1<<MD_DISK_SYNC))) 7872 /* Need to clear read-only for this */ 7873 break; 7874 else 7875 err = md_add_new_disk(mddev, &info); 7876 goto unlock; 7877 } 7878 break; 7879 } 7880 7881 /* 7882 * The remaining ioctls are changing the state of the 7883 * superblock, so we do not allow them on read-only arrays. 
7884 */ 7885 if (!md_is_rdwr(mddev) && mddev->pers) { 7886 if (mddev->ro != MD_AUTO_READ) { 7887 err = -EROFS; 7888 goto unlock; 7889 } 7890 mddev->ro = MD_RDWR; 7891 sysfs_notify_dirent_safe(mddev->sysfs_state); 7892 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7893 /* mddev_unlock will wake thread */ 7894 /* If a device failed while we were read-only, we 7895 * need to make sure the metadata is updated now. 7896 */ 7897 if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) { 7898 mddev_unlock(mddev); 7899 wait_event(mddev->sb_wait, 7900 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) && 7901 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 7902 mddev_lock_nointr(mddev); 7903 } 7904 } 7905 7906 switch (cmd) { 7907 case ADD_NEW_DISK: 7908 { 7909 mdu_disk_info_t info; 7910 if (copy_from_user(&info, argp, sizeof(info))) 7911 err = -EFAULT; 7912 else 7913 err = md_add_new_disk(mddev, &info); 7914 goto unlock; 7915 } 7916 7917 case CLUSTERED_DISK_NACK: 7918 if (mddev_is_clustered(mddev)) 7919 mddev->cluster_ops->new_disk_ack(mddev, false); 7920 else 7921 err = -EINVAL; 7922 goto unlock; 7923 7924 case HOT_ADD_DISK: 7925 err = hot_add_disk(mddev, new_decode_dev(arg)); 7926 goto unlock; 7927 7928 case RUN_ARRAY: 7929 err = do_md_run(mddev); 7930 goto unlock; 7931 7932 case SET_BITMAP_FILE: 7933 err = set_bitmap_file(mddev, (int)arg); 7934 goto unlock; 7935 7936 default: 7937 err = -EINVAL; 7938 goto unlock; 7939 } 7940 7941 unlock: 7942 if (mddev->hold_active == UNTIL_IOCTL && 7943 err != -EINVAL) 7944 mddev->hold_active = 0; 7945 7946 md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) : 7947 mddev_unlock(mddev); 7948 7949 out: 7950 if (cmd == STOP_ARRAY_RO || (err && cmd == STOP_ARRAY)) 7951 clear_bit(MD_CLOSING, &mddev->flags); 7952 return err; 7953 } 7954 #ifdef CONFIG_COMPAT 7955 static int md_compat_ioctl(struct block_device *bdev, blk_mode_t mode, 7956 unsigned int cmd, unsigned long arg) 7957 { 7958 switch (cmd) { 7959 case HOT_REMOVE_DISK: 7960 case HOT_ADD_DISK: 7961 case SET_DISK_FAULTY: 7962 case SET_BITMAP_FILE: 7963 /* These take in integer arg, do not convert */ 7964 break; 7965 default: 7966 arg = (unsigned long)compat_ptr(arg); 7967 break; 7968 } 7969 7970 return md_ioctl(bdev, mode, cmd, arg); 7971 } 7972 #endif /* CONFIG_COMPAT */ 7973 7974 static int md_set_read_only(struct block_device *bdev, bool ro) 7975 { 7976 struct mddev *mddev = bdev->bd_disk->private_data; 7977 int err; 7978 7979 err = mddev_lock(mddev); 7980 if (err) 7981 return err; 7982 7983 if (!mddev->raid_disks && !mddev->external) { 7984 err = -ENODEV; 7985 goto out_unlock; 7986 } 7987 7988 /* 7989 * Transitioning to read-auto need only happen for arrays that call 7990 * md_write_start and which are not ready for writes yet. 
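	 * (This path is normally reached via the BLKROSET ioctl, e.g.
	 * "blockdev --setro /dev/md0" or "blockdev --setrw /dev/md0".)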
7991 */ 7992 if (!ro && mddev->ro == MD_RDONLY && mddev->pers) { 7993 err = restart_array(mddev); 7994 if (err) 7995 goto out_unlock; 7996 mddev->ro = MD_AUTO_READ; 7997 } 7998 7999 out_unlock: 8000 mddev_unlock(mddev); 8001 return err; 8002 } 8003 8004 static int md_open(struct gendisk *disk, blk_mode_t mode) 8005 { 8006 struct mddev *mddev; 8007 int err; 8008 8009 spin_lock(&all_mddevs_lock); 8010 mddev = mddev_get(disk->private_data); 8011 spin_unlock(&all_mddevs_lock); 8012 if (!mddev) 8013 return -ENODEV; 8014 8015 err = mutex_lock_interruptible(&mddev->open_mutex); 8016 if (err) 8017 goto out; 8018 8019 err = -ENODEV; 8020 if (test_bit(MD_CLOSING, &mddev->flags)) 8021 goto out_unlock; 8022 8023 atomic_inc(&mddev->openers); 8024 mutex_unlock(&mddev->open_mutex); 8025 8026 disk_check_media_change(disk); 8027 return 0; 8028 8029 out_unlock: 8030 mutex_unlock(&mddev->open_mutex); 8031 out: 8032 mddev_put(mddev); 8033 return err; 8034 } 8035 8036 static void md_release(struct gendisk *disk) 8037 { 8038 struct mddev *mddev = disk->private_data; 8039 8040 BUG_ON(!mddev); 8041 atomic_dec(&mddev->openers); 8042 mddev_put(mddev); 8043 } 8044 8045 static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing) 8046 { 8047 struct mddev *mddev = disk->private_data; 8048 unsigned int ret = 0; 8049 8050 if (mddev->changed) 8051 ret = DISK_EVENT_MEDIA_CHANGE; 8052 mddev->changed = 0; 8053 return ret; 8054 } 8055 8056 static void md_free_disk(struct gendisk *disk) 8057 { 8058 struct mddev *mddev = disk->private_data; 8059 8060 mddev_free(mddev); 8061 } 8062 8063 const struct block_device_operations md_fops = 8064 { 8065 .owner = THIS_MODULE, 8066 .submit_bio = md_submit_bio, 8067 .open = md_open, 8068 .release = md_release, 8069 .ioctl = md_ioctl, 8070 #ifdef CONFIG_COMPAT 8071 .compat_ioctl = md_compat_ioctl, 8072 #endif 8073 .getgeo = md_getgeo, 8074 .check_events = md_check_events, 8075 .set_read_only = md_set_read_only, 8076 .free_disk = md_free_disk, 8077 }; 8078 8079 static int md_thread(void *arg) 8080 { 8081 struct md_thread *thread = arg; 8082 8083 /* 8084 * md_thread is a 'system-thread', it's priority should be very 8085 * high. We avoid resource deadlocks individually in each 8086 * raid personality. (RAID5 does preallocation) We also use RR and 8087 * the very same RT priority as kswapd, thus we will never get 8088 * into a priority inversion deadlock. 8089 * 8090 * we definitely have to have equal or higher priority than 8091 * bdflush, otherwise bdflush will deadlock if there are too 8092 * many dirty RAID5 blocks. 8093 */ 8094 8095 allow_signal(SIGKILL); 8096 while (!kthread_should_stop()) { 8097 8098 /* We need to wait INTERRUPTIBLE so that 8099 * we don't add to the load-average. 
8100 * That means we need to be sure no signals are 8101 * pending 8102 */ 8103 if (signal_pending(current)) 8104 flush_signals(current); 8105 8106 wait_event_interruptible_timeout 8107 (thread->wqueue, 8108 test_bit(THREAD_WAKEUP, &thread->flags) 8109 || kthread_should_stop() || kthread_should_park(), 8110 thread->timeout); 8111 8112 clear_bit(THREAD_WAKEUP, &thread->flags); 8113 if (kthread_should_park()) 8114 kthread_parkme(); 8115 if (!kthread_should_stop()) 8116 thread->run(thread); 8117 } 8118 8119 return 0; 8120 } 8121 8122 static void md_wakeup_thread_directly(struct md_thread __rcu *thread) 8123 { 8124 struct md_thread *t; 8125 8126 rcu_read_lock(); 8127 t = rcu_dereference(thread); 8128 if (t) 8129 wake_up_process(t->tsk); 8130 rcu_read_unlock(); 8131 } 8132 8133 void md_wakeup_thread(struct md_thread __rcu *thread) 8134 { 8135 struct md_thread *t; 8136 8137 rcu_read_lock(); 8138 t = rcu_dereference(thread); 8139 if (t) { 8140 pr_debug("md: waking up MD thread %s.\n", t->tsk->comm); 8141 set_bit(THREAD_WAKEUP, &t->flags); 8142 if (wq_has_sleeper(&t->wqueue)) 8143 wake_up(&t->wqueue); 8144 } 8145 rcu_read_unlock(); 8146 } 8147 EXPORT_SYMBOL(md_wakeup_thread); 8148 8149 struct md_thread *md_register_thread(void (*run) (struct md_thread *), 8150 struct mddev *mddev, const char *name) 8151 { 8152 struct md_thread *thread; 8153 8154 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); 8155 if (!thread) 8156 return NULL; 8157 8158 init_waitqueue_head(&thread->wqueue); 8159 8160 thread->run = run; 8161 thread->mddev = mddev; 8162 thread->timeout = MAX_SCHEDULE_TIMEOUT; 8163 thread->tsk = kthread_run(md_thread, thread, 8164 "%s_%s", 8165 mdname(thread->mddev), 8166 name); 8167 if (IS_ERR(thread->tsk)) { 8168 kfree(thread); 8169 return NULL; 8170 } 8171 return thread; 8172 } 8173 EXPORT_SYMBOL(md_register_thread); 8174 8175 void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp) 8176 { 8177 struct md_thread *thread = rcu_dereference_protected(*threadp, 8178 lockdep_is_held(&mddev->reconfig_mutex)); 8179 8180 if (!thread) 8181 return; 8182 8183 rcu_assign_pointer(*threadp, NULL); 8184 synchronize_rcu(); 8185 8186 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 8187 kthread_stop(thread->tsk); 8188 kfree(thread); 8189 } 8190 EXPORT_SYMBOL(md_unregister_thread); 8191 8192 void md_error(struct mddev *mddev, struct md_rdev *rdev) 8193 { 8194 if (!rdev || test_bit(Faulty, &rdev->flags)) 8195 return; 8196 8197 if (!mddev->pers || !mddev->pers->error_handler) 8198 return; 8199 mddev->pers->error_handler(mddev, rdev); 8200 8201 if (mddev->pers->head.id == ID_RAID0 || 8202 mddev->pers->head.id == ID_LINEAR) 8203 return; 8204 8205 if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) 8206 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8207 sysfs_notify_dirent_safe(rdev->sysfs_state); 8208 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8209 if (!test_bit(MD_BROKEN, &mddev->flags)) { 8210 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8211 md_wakeup_thread(mddev->thread); 8212 } 8213 if (mddev->event_work.func) 8214 queue_work(md_misc_wq, &mddev->event_work); 8215 md_new_event(); 8216 } 8217 EXPORT_SYMBOL(md_error); 8218 8219 /* seq_file implementation /proc/mdstat */ 8220 8221 static void status_unused(struct seq_file *seq) 8222 { 8223 int i = 0; 8224 struct md_rdev *rdev; 8225 8226 seq_printf(seq, "unused devices: "); 8227 8228 list_for_each_entry(rdev, &pending_raid_disks, same_set) { 8229 i++; 8230 seq_printf(seq, "%pg ", rdev->bdev); 8231 } 
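	/*
	 * If nothing was listed, print "<none>" so the /proc/mdstat
	 * "unused devices:" line is never left empty.
	 */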
8232 if (!i) 8233 seq_printf(seq, "<none>"); 8234 8235 seq_printf(seq, "\n"); 8236 } 8237 8238 static void status_personalities(struct seq_file *seq) 8239 { 8240 struct md_submodule_head *head; 8241 unsigned long i; 8242 8243 seq_puts(seq, "Personalities : "); 8244 8245 xa_lock(&md_submodule); 8246 xa_for_each(&md_submodule, i, head) 8247 if (head->type == MD_PERSONALITY) 8248 seq_printf(seq, "[%s] ", head->name); 8249 xa_unlock(&md_submodule); 8250 8251 seq_puts(seq, "\n"); 8252 } 8253 8254 static int status_resync(struct seq_file *seq, struct mddev *mddev) 8255 { 8256 sector_t max_sectors, resync, res; 8257 unsigned long dt, db = 0; 8258 sector_t rt, curr_mark_cnt, resync_mark_cnt; 8259 int scale, recovery_active; 8260 unsigned int per_milli; 8261 8262 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 8263 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 8264 max_sectors = mddev->resync_max_sectors; 8265 else 8266 max_sectors = mddev->dev_sectors; 8267 8268 resync = mddev->curr_resync; 8269 if (resync < MD_RESYNC_ACTIVE) { 8270 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 8271 /* Still cleaning up */ 8272 resync = max_sectors; 8273 } else if (resync > max_sectors) { 8274 resync = max_sectors; 8275 } else { 8276 res = atomic_read(&mddev->recovery_active); 8277 /* 8278 * Resync has started, but the subtraction has overflowed or 8279 * yielded one of the special values. Force it to active to 8280 * ensure the status reports an active resync. 8281 */ 8282 if (resync < res || resync - res < MD_RESYNC_ACTIVE) 8283 resync = MD_RESYNC_ACTIVE; 8284 else 8285 resync -= res; 8286 } 8287 8288 if (resync == MD_RESYNC_NONE) { 8289 if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { 8290 struct md_rdev *rdev; 8291 8292 rdev_for_each(rdev, mddev) 8293 if (rdev->raid_disk >= 0 && 8294 !test_bit(Faulty, &rdev->flags) && 8295 rdev->recovery_offset != MaxSector && 8296 rdev->recovery_offset) { 8297 seq_printf(seq, "\trecover=REMOTE"); 8298 return 1; 8299 } 8300 if (mddev->reshape_position != MaxSector) 8301 seq_printf(seq, "\treshape=REMOTE"); 8302 else 8303 seq_printf(seq, "\tresync=REMOTE"); 8304 return 1; 8305 } 8306 if (mddev->recovery_cp < MaxSector) { 8307 seq_printf(seq, "\tresync=PENDING"); 8308 return 1; 8309 } 8310 return 0; 8311 } 8312 if (resync < MD_RESYNC_ACTIVE) { 8313 seq_printf(seq, "\tresync=DELAYED"); 8314 return 1; 8315 } 8316 8317 WARN_ON(max_sectors == 0); 8318 /* Pick 'scale' such that (resync>>scale)*1000 will fit 8319 * in a sector_t, and (max_sectors>>scale) will fit in a 8320 * u32, as those are the requirements for sector_div. 8321 * Thus 'scale' must be at least 10 8322 */ 8323 scale = 10; 8324 if (sizeof(sector_t) > sizeof(unsigned long)) { 8325 while ( max_sectors/2 > (1ULL<<(scale+32))) 8326 scale++; 8327 } 8328 res = (resync>>scale)*1000; 8329 sector_div(res, (u32)((max_sectors>>scale)+1)); 8330 8331 per_milli = res; 8332 { 8333 int i, x = per_milli/50, y = 20-x; 8334 seq_printf(seq, "["); 8335 for (i = 0; i < x; i++) 8336 seq_printf(seq, "="); 8337 seq_printf(seq, ">"); 8338 for (i = 0; i < y; i++) 8339 seq_printf(seq, "."); 8340 seq_printf(seq, "] "); 8341 } 8342 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 8343 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 8344 "reshape" : 8345 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 8346 "check" : 8347 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 
8348 "resync" : "recovery"))), 8349 per_milli/10, per_milli % 10, 8350 (unsigned long long) resync/2, 8351 (unsigned long long) max_sectors/2); 8352 8353 /* 8354 * dt: time from mark until now 8355 * db: blocks written from mark until now 8356 * rt: remaining time 8357 * 8358 * rt is a sector_t, which is always 64bit now. We are keeping 8359 * the original algorithm, but it is not really necessary. 8360 * 8361 * Original algorithm: 8362 * So we divide before multiply in case it is 32bit and close 8363 * to the limit. 8364 * We scale the divisor (db) by 32 to avoid losing precision 8365 * near the end of resync when the number of remaining sectors 8366 * is close to 'db'. 8367 * We then divide rt by 32 after multiplying by db to compensate. 8368 * The '+1' avoids division by zero if db is very small. 8369 */ 8370 dt = ((jiffies - mddev->resync_mark) / HZ); 8371 if (!dt) dt++; 8372 8373 curr_mark_cnt = mddev->curr_mark_cnt; 8374 recovery_active = atomic_read(&mddev->recovery_active); 8375 resync_mark_cnt = mddev->resync_mark_cnt; 8376 8377 if (curr_mark_cnt >= (recovery_active + resync_mark_cnt)) 8378 db = curr_mark_cnt - (recovery_active + resync_mark_cnt); 8379 8380 rt = max_sectors - resync; /* number of remaining sectors */ 8381 rt = div64_u64(rt, db/32+1); 8382 rt *= dt; 8383 rt >>= 5; 8384 8385 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, 8386 ((unsigned long)rt % 60)/6); 8387 8388 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 8389 return 1; 8390 } 8391 8392 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 8393 __acquires(&all_mddevs_lock) 8394 { 8395 seq->poll_event = atomic_read(&md_event_count); 8396 spin_lock(&all_mddevs_lock); 8397 8398 return seq_list_start_head(&all_mddevs, *pos); 8399 } 8400 8401 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 8402 { 8403 return seq_list_next(v, &all_mddevs, pos); 8404 } 8405 8406 static void md_seq_stop(struct seq_file *seq, void *v) 8407 __releases(&all_mddevs_lock) 8408 { 8409 spin_unlock(&all_mddevs_lock); 8410 } 8411 8412 static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev) 8413 { 8414 struct md_bitmap_stats stats; 8415 unsigned long used_pages; 8416 unsigned long chunk_kb; 8417 int err; 8418 8419 err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 8420 if (err) 8421 return; 8422 8423 chunk_kb = mddev->bitmap_info.chunksize >> 10; 8424 used_pages = stats.pages - stats.missing_pages; 8425 8426 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], %lu%s chunk", 8427 used_pages, stats.pages, used_pages << (PAGE_SHIFT - 10), 8428 chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize, 8429 chunk_kb ? 
"KB" : "B"); 8430 8431 if (stats.file) { 8432 seq_puts(seq, ", file: "); 8433 seq_file_path(seq, stats.file, " \t\n"); 8434 } 8435 8436 seq_putc(seq, '\n'); 8437 } 8438 8439 static int md_seq_show(struct seq_file *seq, void *v) 8440 { 8441 struct mddev *mddev; 8442 sector_t sectors; 8443 struct md_rdev *rdev; 8444 8445 if (v == &all_mddevs) { 8446 status_personalities(seq); 8447 if (list_empty(&all_mddevs)) 8448 status_unused(seq); 8449 return 0; 8450 } 8451 8452 mddev = list_entry(v, struct mddev, all_mddevs); 8453 if (!mddev_get(mddev)) 8454 return 0; 8455 8456 spin_unlock(&all_mddevs_lock); 8457 8458 /* prevent bitmap to be freed after checking */ 8459 mutex_lock(&mddev->bitmap_info.mutex); 8460 8461 spin_lock(&mddev->lock); 8462 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 8463 seq_printf(seq, "%s : ", mdname(mddev)); 8464 if (mddev->pers) { 8465 if (test_bit(MD_BROKEN, &mddev->flags)) 8466 seq_printf(seq, "broken"); 8467 else 8468 seq_printf(seq, "active"); 8469 if (mddev->ro == MD_RDONLY) 8470 seq_printf(seq, " (read-only)"); 8471 if (mddev->ro == MD_AUTO_READ) 8472 seq_printf(seq, " (auto-read-only)"); 8473 seq_printf(seq, " %s", mddev->pers->head.name); 8474 } else { 8475 seq_printf(seq, "inactive"); 8476 } 8477 8478 sectors = 0; 8479 rcu_read_lock(); 8480 rdev_for_each_rcu(rdev, mddev) { 8481 seq_printf(seq, " %pg[%d]", rdev->bdev, rdev->desc_nr); 8482 8483 if (test_bit(WriteMostly, &rdev->flags)) 8484 seq_printf(seq, "(W)"); 8485 if (test_bit(Journal, &rdev->flags)) 8486 seq_printf(seq, "(J)"); 8487 if (test_bit(Faulty, &rdev->flags)) { 8488 seq_printf(seq, "(F)"); 8489 continue; 8490 } 8491 if (rdev->raid_disk < 0) 8492 seq_printf(seq, "(S)"); /* spare */ 8493 if (test_bit(Replacement, &rdev->flags)) 8494 seq_printf(seq, "(R)"); 8495 sectors += rdev->sectors; 8496 } 8497 rcu_read_unlock(); 8498 8499 if (!list_empty(&mddev->disks)) { 8500 if (mddev->pers) 8501 seq_printf(seq, "\n %llu blocks", 8502 (unsigned long long) 8503 mddev->array_sectors / 2); 8504 else 8505 seq_printf(seq, "\n %llu blocks", 8506 (unsigned long long)sectors / 2); 8507 } 8508 if (mddev->persistent) { 8509 if (mddev->major_version != 0 || 8510 mddev->minor_version != 90) { 8511 seq_printf(seq," super %d.%d", 8512 mddev->major_version, 8513 mddev->minor_version); 8514 } 8515 } else if (mddev->external) 8516 seq_printf(seq, " super external:%s", 8517 mddev->metadata_type); 8518 else 8519 seq_printf(seq, " super non-persistent"); 8520 8521 if (mddev->pers) { 8522 mddev->pers->status(seq, mddev); 8523 seq_printf(seq, "\n "); 8524 if (mddev->pers->sync_request) { 8525 if (status_resync(seq, mddev)) 8526 seq_printf(seq, "\n "); 8527 } 8528 } else 8529 seq_printf(seq, "\n "); 8530 8531 md_bitmap_status(seq, mddev); 8532 8533 seq_printf(seq, "\n"); 8534 } 8535 spin_unlock(&mddev->lock); 8536 mutex_unlock(&mddev->bitmap_info.mutex); 8537 spin_lock(&all_mddevs_lock); 8538 8539 if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs)) 8540 status_unused(seq); 8541 8542 mddev_put_locked(mddev); 8543 return 0; 8544 } 8545 8546 static const struct seq_operations md_seq_ops = { 8547 .start = md_seq_start, 8548 .next = md_seq_next, 8549 .stop = md_seq_stop, 8550 .show = md_seq_show, 8551 }; 8552 8553 static int md_seq_open(struct inode *inode, struct file *file) 8554 { 8555 struct seq_file *seq; 8556 int error; 8557 8558 error = seq_open(file, &md_seq_ops); 8559 if (error) 8560 return error; 8561 8562 seq = file->private_data; 8563 seq->poll_event = atomic_read(&md_event_count); 8564 
return error; 8565 } 8566 8567 static int md_unloading; 8568 static __poll_t mdstat_poll(struct file *filp, poll_table *wait) 8569 { 8570 struct seq_file *seq = filp->private_data; 8571 __poll_t mask; 8572 8573 if (md_unloading) 8574 return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; 8575 poll_wait(filp, &md_event_waiters, wait); 8576 8577 /* always allow read */ 8578 mask = EPOLLIN | EPOLLRDNORM; 8579 8580 if (seq->poll_event != atomic_read(&md_event_count)) 8581 mask |= EPOLLERR | EPOLLPRI; 8582 return mask; 8583 } 8584 8585 static const struct proc_ops mdstat_proc_ops = { 8586 .proc_open = md_seq_open, 8587 .proc_read = seq_read, 8588 .proc_lseek = seq_lseek, 8589 .proc_release = seq_release, 8590 .proc_poll = mdstat_poll, 8591 }; 8592 8593 int register_md_submodule(struct md_submodule_head *msh) 8594 { 8595 return xa_insert(&md_submodule, msh->id, msh, GFP_KERNEL); 8596 } 8597 EXPORT_SYMBOL_GPL(register_md_submodule); 8598 8599 void unregister_md_submodule(struct md_submodule_head *msh) 8600 { 8601 xa_erase(&md_submodule, msh->id); 8602 } 8603 EXPORT_SYMBOL_GPL(unregister_md_submodule); 8604 8605 int md_setup_cluster(struct mddev *mddev, int nodes) 8606 { 8607 int ret = get_cluster_ops(mddev); 8608 8609 if (ret) { 8610 request_module("md-cluster"); 8611 ret = get_cluster_ops(mddev); 8612 } 8613 8614 /* ensure module won't be unloaded */ 8615 if (ret) { 8616 pr_warn("can't find md-cluster module or get its reference.\n"); 8617 return ret; 8618 } 8619 8620 ret = mddev->cluster_ops->join(mddev, nodes); 8621 if (!ret) 8622 mddev->safemode_delay = 0; 8623 return ret; 8624 } 8625 8626 void md_cluster_stop(struct mddev *mddev) 8627 { 8628 put_cluster_ops(mddev); 8629 } 8630 8631 static bool is_rdev_holder_idle(struct md_rdev *rdev, bool init) 8632 { 8633 unsigned long last_events = rdev->last_events; 8634 8635 if (!bdev_is_partition(rdev->bdev)) 8636 return true; 8637 8638 /* 8639 * If rdev is partition, and user doesn't issue IO to the array, the 8640 * array is still not idle if user issues IO to other partitions. 8641 */ 8642 rdev->last_events = part_stat_read_accum(rdev->bdev->bd_disk->part0, 8643 sectors) - 8644 part_stat_read_accum(rdev->bdev, sectors); 8645 8646 return init || rdev->last_events <= last_events; 8647 } 8648 8649 /* 8650 * mddev is idle if following conditions are matched since last check: 8651 * 1) mddev doesn't have normal IO completed; 8652 * 2) mddev doesn't have inflight normal IO; 8653 * 3) if any member disk is partition, and other partitions don't have IO 8654 * completed; 8655 * 8656 * Noted this checking rely on IO accounting is enabled. 8657 */ 8658 static bool is_mddev_idle(struct mddev *mddev, int init) 8659 { 8660 unsigned long last_events = mddev->normal_io_events; 8661 struct gendisk *disk; 8662 struct md_rdev *rdev; 8663 bool idle = true; 8664 8665 disk = mddev_is_dm(mddev) ? 
mddev->dm_gendisk : mddev->gendisk; 8666 if (!disk) 8667 return true; 8668 8669 mddev->normal_io_events = part_stat_read_accum(disk->part0, sectors); 8670 if (!init && (mddev->normal_io_events > last_events || 8671 bdev_count_inflight(disk->part0))) 8672 idle = false; 8673 8674 rcu_read_lock(); 8675 rdev_for_each_rcu(rdev, mddev) 8676 if (!is_rdev_holder_idle(rdev, init)) 8677 idle = false; 8678 rcu_read_unlock(); 8679 8680 return idle; 8681 } 8682 8683 void md_done_sync(struct mddev *mddev, int blocks, int ok) 8684 { 8685 /* another "blocks" (512byte) blocks have been synced */ 8686 atomic_sub(blocks, &mddev->recovery_active); 8687 wake_up(&mddev->recovery_wait); 8688 if (!ok) { 8689 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8690 set_bit(MD_RECOVERY_ERROR, &mddev->recovery); 8691 md_wakeup_thread(mddev->thread); 8692 // stop recovery, signal do_sync .... 8693 } 8694 } 8695 EXPORT_SYMBOL(md_done_sync); 8696 8697 /* md_write_start(mddev, bi) 8698 * If we need to update some array metadata (e.g. 'active' flag 8699 * in superblock) before writing, schedule a superblock update 8700 * and wait for it to complete. 8701 * A return value of 'false' means that the write wasn't recorded 8702 * and cannot proceed as the array is being suspend. 8703 */ 8704 void md_write_start(struct mddev *mddev, struct bio *bi) 8705 { 8706 int did_change = 0; 8707 8708 if (bio_data_dir(bi) != WRITE) 8709 return; 8710 8711 BUG_ON(mddev->ro == MD_RDONLY); 8712 if (mddev->ro == MD_AUTO_READ) { 8713 /* need to switch to read/write */ 8714 mddev->ro = MD_RDWR; 8715 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8716 md_wakeup_thread(mddev->thread); 8717 md_wakeup_thread(mddev->sync_thread); 8718 did_change = 1; 8719 } 8720 rcu_read_lock(); 8721 percpu_ref_get(&mddev->writes_pending); 8722 smp_mb(); /* Match smp_mb in set_in_sync() */ 8723 if (mddev->safemode == 1) 8724 mddev->safemode = 0; 8725 /* sync_checkers is always 0 when writes_pending is in per-cpu mode */ 8726 if (mddev->in_sync || mddev->sync_checkers) { 8727 spin_lock(&mddev->lock); 8728 if (mddev->in_sync) { 8729 mddev->in_sync = 0; 8730 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 8731 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 8732 md_wakeup_thread(mddev->thread); 8733 did_change = 1; 8734 } 8735 spin_unlock(&mddev->lock); 8736 } 8737 rcu_read_unlock(); 8738 if (did_change) 8739 sysfs_notify_dirent_safe(mddev->sysfs_state); 8740 if (!mddev->has_superblocks) 8741 return; 8742 wait_event(mddev->sb_wait, 8743 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 8744 } 8745 EXPORT_SYMBOL(md_write_start); 8746 8747 /* md_write_inc can only be called when md_write_start() has 8748 * already been called at least once of the current request. 8749 * It increments the counter and is useful when a single request 8750 * is split into several parts. Each part causes an increment and 8751 * so needs a matching md_write_end(). 8752 * Unlike md_write_start(), it is safe to call md_write_inc() inside 8753 * a spinlocked region. 
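 *
 * Illustrative sketch only (not lifted from any particular personality):
 *
 *	md_write_start(mddev, bio);	// the original request
 *	md_write_inc(mddev, split);	// once per additional split part
 *	...
 *	md_write_end(mddev);		// one matching call per start/inc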
8754 */ 8755 void md_write_inc(struct mddev *mddev, struct bio *bi) 8756 { 8757 if (bio_data_dir(bi) != WRITE) 8758 return; 8759 WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev)); 8760 percpu_ref_get(&mddev->writes_pending); 8761 } 8762 EXPORT_SYMBOL(md_write_inc); 8763 8764 void md_write_end(struct mddev *mddev) 8765 { 8766 percpu_ref_put(&mddev->writes_pending); 8767 8768 if (mddev->safemode == 2) 8769 md_wakeup_thread(mddev->thread); 8770 else if (mddev->safemode_delay) 8771 /* The roundup() ensures this only performs locking once 8772 * every ->safemode_delay jiffies 8773 */ 8774 mod_timer(&mddev->safemode_timer, 8775 roundup(jiffies, mddev->safemode_delay) + 8776 mddev->safemode_delay); 8777 } 8778 8779 EXPORT_SYMBOL(md_write_end); 8780 8781 /* This is used by raid0 and raid10 */ 8782 void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, 8783 struct bio *bio, sector_t start, sector_t size) 8784 { 8785 struct bio *discard_bio = NULL; 8786 8787 if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO, 8788 &discard_bio) || !discard_bio) 8789 return; 8790 8791 bio_chain(discard_bio, bio); 8792 bio_clone_blkg_association(discard_bio, bio); 8793 mddev_trace_remap(mddev, discard_bio, bio->bi_iter.bi_sector); 8794 submit_bio_noacct(discard_bio); 8795 } 8796 EXPORT_SYMBOL_GPL(md_submit_discard_bio); 8797 8798 static void md_bitmap_start(struct mddev *mddev, 8799 struct md_io_clone *md_io_clone) 8800 { 8801 if (mddev->pers->bitmap_sector) 8802 mddev->pers->bitmap_sector(mddev, &md_io_clone->offset, 8803 &md_io_clone->sectors); 8804 8805 mddev->bitmap_ops->start_write(mddev, md_io_clone->offset, 8806 md_io_clone->sectors); 8807 } 8808 8809 static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone) 8810 { 8811 mddev->bitmap_ops->end_write(mddev, md_io_clone->offset, 8812 md_io_clone->sectors); 8813 } 8814 8815 static void md_end_clone_io(struct bio *bio) 8816 { 8817 struct md_io_clone *md_io_clone = bio->bi_private; 8818 struct bio *orig_bio = md_io_clone->orig_bio; 8819 struct mddev *mddev = md_io_clone->mddev; 8820 8821 if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap) 8822 md_bitmap_end(mddev, md_io_clone); 8823 8824 if (bio->bi_status && !orig_bio->bi_status) 8825 orig_bio->bi_status = bio->bi_status; 8826 8827 if (md_io_clone->start_time) 8828 bio_end_io_acct(orig_bio, md_io_clone->start_time); 8829 8830 bio_put(bio); 8831 bio_endio(orig_bio); 8832 percpu_ref_put(&mddev->active_io); 8833 } 8834 8835 static void md_clone_bio(struct mddev *mddev, struct bio **bio) 8836 { 8837 struct block_device *bdev = (*bio)->bi_bdev; 8838 struct md_io_clone *md_io_clone; 8839 struct bio *clone = 8840 bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set); 8841 8842 md_io_clone = container_of(clone, struct md_io_clone, bio_clone); 8843 md_io_clone->orig_bio = *bio; 8844 md_io_clone->mddev = mddev; 8845 if (blk_queue_io_stat(bdev->bd_disk->queue)) 8846 md_io_clone->start_time = bio_start_io_acct(*bio); 8847 8848 if (bio_data_dir(*bio) == WRITE && mddev->bitmap) { 8849 md_io_clone->offset = (*bio)->bi_iter.bi_sector; 8850 md_io_clone->sectors = bio_sectors(*bio); 8851 md_bitmap_start(mddev, md_io_clone); 8852 } 8853 8854 clone->bi_end_io = md_end_clone_io; 8855 clone->bi_private = md_io_clone; 8856 *bio = clone; 8857 } 8858 8859 void md_account_bio(struct mddev *mddev, struct bio **bio) 8860 { 8861 percpu_ref_get(&mddev->active_io); 8862 md_clone_bio(mddev, bio); 8863 } 8864 EXPORT_SYMBOL_GPL(md_account_bio); 8865 8866 void md_free_cloned_bio(struct bio *bio) 
8867 { 8868 struct md_io_clone *md_io_clone = bio->bi_private; 8869 struct bio *orig_bio = md_io_clone->orig_bio; 8870 struct mddev *mddev = md_io_clone->mddev; 8871 8872 if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap) 8873 md_bitmap_end(mddev, md_io_clone); 8874 8875 if (bio->bi_status && !orig_bio->bi_status) 8876 orig_bio->bi_status = bio->bi_status; 8877 8878 if (md_io_clone->start_time) 8879 bio_end_io_acct(orig_bio, md_io_clone->start_time); 8880 8881 bio_put(bio); 8882 percpu_ref_put(&mddev->active_io); 8883 } 8884 EXPORT_SYMBOL_GPL(md_free_cloned_bio); 8885 8886 /* md_allow_write(mddev) 8887 * Calling this ensures that the array is marked 'active' so that writes 8888 * may proceed without blocking. It is important to call this before 8889 * attempting a GFP_KERNEL allocation while holding the mddev lock. 8890 * Must be called with mddev_lock held. 8891 */ 8892 void md_allow_write(struct mddev *mddev) 8893 { 8894 if (!mddev->pers) 8895 return; 8896 if (!md_is_rdwr(mddev)) 8897 return; 8898 if (!mddev->pers->sync_request) 8899 return; 8900 8901 spin_lock(&mddev->lock); 8902 if (mddev->in_sync) { 8903 mddev->in_sync = 0; 8904 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 8905 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 8906 if (mddev->safemode_delay && 8907 mddev->safemode == 0) 8908 mddev->safemode = 1; 8909 spin_unlock(&mddev->lock); 8910 md_update_sb(mddev, 0); 8911 sysfs_notify_dirent_safe(mddev->sysfs_state); 8912 /* wait for the dirty state to be recorded in the metadata */ 8913 wait_event(mddev->sb_wait, 8914 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 8915 } else 8916 spin_unlock(&mddev->lock); 8917 } 8918 EXPORT_SYMBOL_GPL(md_allow_write); 8919 8920 static sector_t md_sync_max_sectors(struct mddev *mddev, 8921 enum sync_action action) 8922 { 8923 switch (action) { 8924 case ACTION_RESYNC: 8925 case ACTION_CHECK: 8926 case ACTION_REPAIR: 8927 atomic64_set(&mddev->resync_mismatches, 0); 8928 fallthrough; 8929 case ACTION_RESHAPE: 8930 return mddev->resync_max_sectors; 8931 case ACTION_RECOVER: 8932 return mddev->dev_sectors; 8933 default: 8934 return 0; 8935 } 8936 } 8937 8938 static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) 8939 { 8940 sector_t start = 0; 8941 struct md_rdev *rdev; 8942 8943 switch (action) { 8944 case ACTION_CHECK: 8945 case ACTION_REPAIR: 8946 return mddev->resync_min; 8947 case ACTION_RESYNC: 8948 if (!mddev->bitmap) 8949 return mddev->recovery_cp; 8950 return 0; 8951 case ACTION_RESHAPE: 8952 /* 8953 * If the original node aborts reshaping then we continue the 8954 * reshaping, so set again to avoid restart reshape from the 8955 * first beginning 8956 */ 8957 if (mddev_is_clustered(mddev) && 8958 mddev->reshape_position != MaxSector) 8959 return mddev->reshape_position; 8960 return 0; 8961 case ACTION_RECOVER: 8962 start = MaxSector; 8963 rcu_read_lock(); 8964 rdev_for_each_rcu(rdev, mddev) 8965 if (rdev->raid_disk >= 0 && 8966 !test_bit(Journal, &rdev->flags) && 8967 !test_bit(Faulty, &rdev->flags) && 8968 !test_bit(In_sync, &rdev->flags) && 8969 rdev->recovery_offset < start) 8970 start = rdev->recovery_offset; 8971 rcu_read_unlock(); 8972 8973 /* If there is a bitmap, we need to make sure all 8974 * writes that started before we added a spare 8975 * complete before we start doing a recovery. 8976 * Otherwise the write might complete and (via 8977 * bitmap_endwrite) set a bit in the bitmap after the 8978 * recovery has checked that bit and skipped that 8979 * region. 
8980 */ 8981 if (mddev->bitmap) { 8982 mddev->pers->quiesce(mddev, 1); 8983 mddev->pers->quiesce(mddev, 0); 8984 } 8985 return start; 8986 default: 8987 return MaxSector; 8988 } 8989 } 8990 8991 static bool sync_io_within_limit(struct mddev *mddev) 8992 { 8993 int io_sectors; 8994 8995 /* 8996 * For raid456, sync IO is stripe(4k) per IO, for other levels, it's 8997 * RESYNC_PAGES(64k) per IO. 8998 */ 8999 if (mddev->level == 4 || mddev->level == 5 || mddev->level == 6) 9000 io_sectors = 8; 9001 else 9002 io_sectors = 128; 9003 9004 return atomic_read(&mddev->recovery_active) < 9005 io_sectors * sync_io_depth(mddev); 9006 } 9007 9008 #define SYNC_MARKS 10 9009 #define SYNC_MARK_STEP (3*HZ) 9010 #define UPDATE_FREQUENCY (5*60*HZ) 9011 void md_do_sync(struct md_thread *thread) 9012 { 9013 struct mddev *mddev = thread->mddev; 9014 struct mddev *mddev2; 9015 unsigned int currspeed = 0, window; 9016 sector_t max_sectors,j, io_sectors, recovery_done; 9017 unsigned long mark[SYNC_MARKS]; 9018 unsigned long update_time; 9019 sector_t mark_cnt[SYNC_MARKS]; 9020 int last_mark,m; 9021 sector_t last_check; 9022 int skipped = 0; 9023 struct md_rdev *rdev; 9024 enum sync_action action; 9025 const char *desc; 9026 struct blk_plug plug; 9027 int ret; 9028 9029 /* just incase thread restarts... */ 9030 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 9031 return; 9032 9033 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9034 goto skip; 9035 9036 if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) || 9037 !md_is_rdwr(mddev)) {/* never try to sync a read-only array */ 9038 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9039 goto skip; 9040 } 9041 9042 if (mddev_is_clustered(mddev)) { 9043 ret = mddev->cluster_ops->resync_start(mddev); 9044 if (ret) 9045 goto skip; 9046 9047 set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags); 9048 if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 9049 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || 9050 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) 9051 && ((unsigned long long)mddev->curr_resync_completed 9052 < (unsigned long long)mddev->resync_max_sectors)) 9053 goto skip; 9054 } 9055 9056 action = md_sync_action(mddev); 9057 desc = md_sync_action_name(action); 9058 mddev->last_sync_action = action; 9059 9060 /* 9061 * Before starting a resync we must have set curr_resync to 9062 * 2, and then checked that every "conflicting" array has curr_resync 9063 * less than ours. When we find one that is the same or higher 9064 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 9065 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 9066 * This will mean we have to start checking from the beginning again. 
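	 * In the code below these "2" and "1" states are the MD_RESYNC_DELAYED
	 * and MD_RESYNC_YIELDED values of curr_resync.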
9067 * 9068 */ 9069 if (mddev_is_clustered(mddev)) 9070 mddev->cluster_ops->resync_start_notify(mddev); 9071 do { 9072 int mddev2_minor = -1; 9073 mddev->curr_resync = MD_RESYNC_DELAYED; 9074 9075 try_again: 9076 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9077 goto skip; 9078 spin_lock(&all_mddevs_lock); 9079 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) { 9080 if (test_bit(MD_DELETED, &mddev2->flags)) 9081 continue; 9082 if (mddev2 == mddev) 9083 continue; 9084 if (!mddev->parallel_resync 9085 && mddev2->curr_resync 9086 && match_mddev_units(mddev, mddev2)) { 9087 DEFINE_WAIT(wq); 9088 if (mddev < mddev2 && 9089 mddev->curr_resync == MD_RESYNC_DELAYED) { 9090 /* arbitrarily yield */ 9091 mddev->curr_resync = MD_RESYNC_YIELDED; 9092 wake_up(&resync_wait); 9093 } 9094 if (mddev > mddev2 && 9095 mddev->curr_resync == MD_RESYNC_YIELDED) 9096 /* no need to wait here, we can wait the next 9097 * time 'round when curr_resync == 2 9098 */ 9099 continue; 9100 /* We need to wait 'interruptible' so as not to 9101 * contribute to the load average, and not to 9102 * be caught by 'softlockup' 9103 */ 9104 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 9105 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9106 mddev2->curr_resync >= mddev->curr_resync) { 9107 if (mddev2_minor != mddev2->md_minor) { 9108 mddev2_minor = mddev2->md_minor; 9109 pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n", 9110 desc, mdname(mddev), 9111 mdname(mddev2)); 9112 } 9113 spin_unlock(&all_mddevs_lock); 9114 9115 if (signal_pending(current)) 9116 flush_signals(current); 9117 schedule(); 9118 finish_wait(&resync_wait, &wq); 9119 goto try_again; 9120 } 9121 finish_wait(&resync_wait, &wq); 9122 } 9123 } 9124 spin_unlock(&all_mddevs_lock); 9125 } while (mddev->curr_resync < MD_RESYNC_DELAYED); 9126 9127 max_sectors = md_sync_max_sectors(mddev, action); 9128 j = md_sync_position(mddev, action); 9129 9130 pr_info("md: %s of RAID array %s\n", desc, mdname(mddev)); 9131 pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev)); 9132 pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n", 9133 speed_max(mddev), desc); 9134 9135 is_mddev_idle(mddev, 1); /* this initializes IO event counters */ 9136 9137 io_sectors = 0; 9138 for (m = 0; m < SYNC_MARKS; m++) { 9139 mark[m] = jiffies; 9140 mark_cnt[m] = io_sectors; 9141 } 9142 last_mark = 0; 9143 mddev->resync_mark = mark[last_mark]; 9144 mddev->resync_mark_cnt = mark_cnt[last_mark]; 9145 9146 /* 9147 * Tune reconstruction: 9148 */ 9149 window = 32 * (PAGE_SIZE / 512); 9150 pr_debug("md: using %dk window, over a total of %lluk.\n", 9151 window/2, (unsigned long long)max_sectors/2); 9152 9153 atomic_set(&mddev->recovery_active, 0); 9154 last_check = 0; 9155 9156 if (j >= MD_RESYNC_ACTIVE) { 9157 pr_debug("md: resuming %s of %s from checkpoint.\n", 9158 desc, mdname(mddev)); 9159 mddev->curr_resync = j; 9160 } else 9161 mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */ 9162 mddev->curr_resync_completed = j; 9163 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9164 md_new_event(); 9165 update_time = jiffies; 9166 9167 blk_start_plug(&plug); 9168 while (j < max_sectors) { 9169 sector_t sectors; 9170 9171 skipped = 0; 9172 9173 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9174 ((mddev->curr_resync > mddev->curr_resync_completed && 9175 (mddev->curr_resync - mddev->curr_resync_completed) 9176 > (max_sectors >> 4)) || 9177 
time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || 9178 (j - mddev->curr_resync_completed)*2 9179 >= mddev->resync_max - mddev->curr_resync_completed || 9180 mddev->curr_resync_completed > mddev->resync_max 9181 )) { 9182 /* time to update curr_resync_completed */ 9183 wait_event(mddev->recovery_wait, 9184 atomic_read(&mddev->recovery_active) == 0); 9185 mddev->curr_resync_completed = j; 9186 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 9187 j > mddev->recovery_cp) 9188 mddev->recovery_cp = j; 9189 update_time = jiffies; 9190 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 9191 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9192 } 9193 9194 while (j >= mddev->resync_max && 9195 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9196 /* As this condition is controlled by user-space, 9197 * we can block indefinitely, so use '_interruptible' 9198 * to avoid triggering warnings. 9199 */ 9200 flush_signals(current); /* just in case */ 9201 wait_event_interruptible(mddev->recovery_wait, 9202 mddev->resync_max > j 9203 || test_bit(MD_RECOVERY_INTR, 9204 &mddev->recovery)); 9205 } 9206 9207 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9208 break; 9209 9210 sectors = mddev->pers->sync_request(mddev, j, max_sectors, 9211 &skipped); 9212 if (sectors == 0) { 9213 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9214 break; 9215 } 9216 9217 if (!skipped) { /* actual IO requested */ 9218 io_sectors += sectors; 9219 atomic_add(sectors, &mddev->recovery_active); 9220 } 9221 9222 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9223 break; 9224 9225 j += sectors; 9226 if (j > max_sectors) 9227 /* when skipping, extra large numbers can be returned. */ 9228 j = max_sectors; 9229 if (j >= MD_RESYNC_ACTIVE) 9230 mddev->curr_resync = j; 9231 mddev->curr_mark_cnt = io_sectors; 9232 if (last_check == 0) 9233 /* this is the earliest that rebuild will be 9234 * visible in /proc/mdstat 9235 */ 9236 md_new_event(); 9237 9238 if (last_check + window > io_sectors || j == max_sectors) 9239 continue; 9240 9241 last_check = io_sectors; 9242 repeat: 9243 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 9244 /* step marks */ 9245 int next = (last_mark+1) % SYNC_MARKS; 9246 9247 mddev->resync_mark = mark[next]; 9248 mddev->resync_mark_cnt = mark_cnt[next]; 9249 mark[next] = jiffies; 9250 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 9251 last_mark = next; 9252 } 9253 9254 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9255 break; 9256 9257 /* 9258 * this loop exits only if either when we are slower than 9259 * the 'hard' speed limit, or the system was IO-idle for 9260 * a jiffy. 9261 * the system might be non-idle CPU-wise, but we only care 9262 * about not overloading the IO subsystem. (things like an 9263 * e2fsck being done on the RAID array should execute fast) 9264 */ 9265 cond_resched(); 9266 9267 recovery_done = io_sectors - atomic_read(&mddev->recovery_active); 9268 currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2 9269 /((jiffies-mddev->resync_mark)/HZ +1) +1; 9270 9271 if (currspeed > speed_min(mddev)) { 9272 if (currspeed > speed_max(mddev)) { 9273 msleep(500); 9274 goto repeat; 9275 } 9276 if (!sync_io_within_limit(mddev) && 9277 !is_mddev_idle(mddev, 0)) { 9278 /* 9279 * Give other IO more of a chance. 9280 * The faster the devices, the less we wait. 
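				 * We do that by waiting for the in-flight
				 * sync IO (recovery_active) to drain rather
				 * than sleeping for a fixed time.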
9281 */ 9282 wait_event(mddev->recovery_wait, 9283 !atomic_read(&mddev->recovery_active)); 9284 } 9285 } 9286 } 9287 pr_info("md: %s: %s %s.\n",mdname(mddev), desc, 9288 test_bit(MD_RECOVERY_INTR, &mddev->recovery) 9289 ? "interrupted" : "done"); 9290 /* 9291 * this also signals 'finished resyncing' to md_stop 9292 */ 9293 blk_finish_plug(&plug); 9294 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 9295 9296 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9297 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9298 mddev->curr_resync >= MD_RESYNC_ACTIVE) { 9299 mddev->curr_resync_completed = mddev->curr_resync; 9300 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9301 } 9302 mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped); 9303 9304 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 9305 mddev->curr_resync > MD_RESYNC_ACTIVE) { 9306 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 9307 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9308 if (mddev->curr_resync >= mddev->recovery_cp) { 9309 pr_debug("md: checkpointing %s of %s.\n", 9310 desc, mdname(mddev)); 9311 if (test_bit(MD_RECOVERY_ERROR, 9312 &mddev->recovery)) 9313 mddev->recovery_cp = 9314 mddev->curr_resync_completed; 9315 else 9316 mddev->recovery_cp = 9317 mddev->curr_resync; 9318 } 9319 } else 9320 mddev->recovery_cp = MaxSector; 9321 } else { 9322 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9323 mddev->curr_resync = MaxSector; 9324 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9325 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { 9326 rcu_read_lock(); 9327 rdev_for_each_rcu(rdev, mddev) 9328 if (rdev->raid_disk >= 0 && 9329 mddev->delta_disks >= 0 && 9330 !test_bit(Journal, &rdev->flags) && 9331 !test_bit(Faulty, &rdev->flags) && 9332 !test_bit(In_sync, &rdev->flags) && 9333 rdev->recovery_offset < mddev->curr_resync) 9334 rdev->recovery_offset = mddev->curr_resync; 9335 rcu_read_unlock(); 9336 } 9337 } 9338 } 9339 skip: 9340 /* set CHANGE_PENDING here since maybe another update is needed, 9341 * so other nodes are informed. It should be harmless for normal 9342 * raid */ 9343 set_mask_bits(&mddev->sb_flags, 0, 9344 BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); 9345 9346 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9347 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9348 mddev->delta_disks > 0 && 9349 mddev->pers->finish_reshape && 9350 mddev->pers->size && 9351 !mddev_is_dm(mddev)) { 9352 mddev_lock_nointr(mddev); 9353 md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); 9354 mddev_unlock(mddev); 9355 if (!mddev_is_clustered(mddev)) 9356 set_capacity_and_notify(mddev->gendisk, 9357 mddev->array_sectors); 9358 } 9359 9360 spin_lock(&mddev->lock); 9361 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9362 /* We completed so min/max setting can be forgotten if used. */ 9363 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 9364 mddev->resync_min = 0; 9365 mddev->resync_max = MaxSector; 9366 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 9367 mddev->resync_min = mddev->curr_resync_completed; 9368 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 9369 mddev->curr_resync = MD_RESYNC_NONE; 9370 spin_unlock(&mddev->lock); 9371 9372 wake_up(&resync_wait); 9373 md_wakeup_thread(mddev->thread); 9374 return; 9375 } 9376 EXPORT_SYMBOL_GPL(md_do_sync); 9377 9378 static bool rdev_removeable(struct md_rdev *rdev) 9379 { 9380 /* rdev is not used. 
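	 * (a negative raid_disk means the device is not an active member
	 * of the array)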
 */
	if (rdev->raid_disk < 0)
		return false;

	/* There is still in-flight IO, don't remove this rdev. */
	if (atomic_read(&rdev->nr_pending))
		return false;

	/*
	 * An error occurred but has not yet been acknowledged by the metadata
	 * handler, don't remove this rdev.
	 */
	if (test_bit(Blocked, &rdev->flags))
		return false;

	/* A Faulty rdev is not used, so it's safe to remove it. */
	if (test_bit(Faulty, &rdev->flags))
		return true;

	/* A journal disk can only be removed if it's faulty. */
	if (test_bit(Journal, &rdev->flags))
		return false;

	/*
	 * 'In_sync' is cleared while 'raid_disk' is valid, which means the
	 * replacement has just become active from pers->spare_active(), and
	 * then pers->hot_remove_disk() will replace this rdev with the
	 * replacement.
	 */
	if (!test_bit(In_sync, &rdev->flags))
		return true;

	return false;
}

static bool rdev_is_spare(struct md_rdev *rdev)
{
	return !test_bit(Candidate, &rdev->flags) && rdev->raid_disk >= 0 &&
	       !test_bit(In_sync, &rdev->flags) &&
	       !test_bit(Journal, &rdev->flags) &&
	       !test_bit(Faulty, &rdev->flags);
}

static bool rdev_addable(struct md_rdev *rdev)
{
	/* rdev is already in use, don't add it again. */
	if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 ||
	    test_bit(Faulty, &rdev->flags))
		return false;

	/* Allow adding a journal disk. */
	if (test_bit(Journal, &rdev->flags))
		return true;

	/* Allow adding if the array is read-write. */
	if (md_is_rdwr(rdev->mddev))
		return true;

	/*
	 * For a read-only array, only allow re-adding an rdev. And if a
	 * bitmap is in use, don't allow re-adding an rdev that is too old.
9440 */ 9441 if (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags)) 9442 return true; 9443 9444 return false; 9445 } 9446 9447 static bool md_spares_need_change(struct mddev *mddev) 9448 { 9449 struct md_rdev *rdev; 9450 9451 rcu_read_lock(); 9452 rdev_for_each_rcu(rdev, mddev) { 9453 if (rdev_removeable(rdev) || rdev_addable(rdev)) { 9454 rcu_read_unlock(); 9455 return true; 9456 } 9457 } 9458 rcu_read_unlock(); 9459 return false; 9460 } 9461 9462 static int remove_spares(struct mddev *mddev, struct md_rdev *this) 9463 { 9464 struct md_rdev *rdev; 9465 int removed = 0; 9466 9467 rdev_for_each(rdev, mddev) { 9468 if ((this == NULL || rdev == this) && rdev_removeable(rdev) && 9469 !mddev->pers->hot_remove_disk(mddev, rdev)) { 9470 sysfs_unlink_rdev(mddev, rdev); 9471 rdev->saved_raid_disk = rdev->raid_disk; 9472 rdev->raid_disk = -1; 9473 removed++; 9474 } 9475 } 9476 9477 if (removed && mddev->kobj.sd) 9478 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 9479 9480 return removed; 9481 } 9482 9483 static int remove_and_add_spares(struct mddev *mddev, 9484 struct md_rdev *this) 9485 { 9486 struct md_rdev *rdev; 9487 int spares = 0; 9488 int removed = 0; 9489 9490 if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 9491 /* Mustn't remove devices when resync thread is running */ 9492 return 0; 9493 9494 removed = remove_spares(mddev, this); 9495 if (this && removed) 9496 goto no_add; 9497 9498 rdev_for_each(rdev, mddev) { 9499 if (this && this != rdev) 9500 continue; 9501 if (rdev_is_spare(rdev)) 9502 spares++; 9503 if (!rdev_addable(rdev)) 9504 continue; 9505 if (!test_bit(Journal, &rdev->flags)) 9506 rdev->recovery_offset = 0; 9507 if (mddev->pers->hot_add_disk(mddev, rdev) == 0) { 9508 /* failure here is OK */ 9509 sysfs_link_rdev(mddev, rdev); 9510 if (!test_bit(Journal, &rdev->flags)) 9511 spares++; 9512 md_new_event(); 9513 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9514 } 9515 } 9516 no_add: 9517 if (removed) 9518 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9519 return spares; 9520 } 9521 9522 static bool md_choose_sync_action(struct mddev *mddev, int *spares) 9523 { 9524 /* Check if reshape is in progress first. */ 9525 if (mddev->reshape_position != MaxSector) { 9526 if (mddev->pers->check_reshape == NULL || 9527 mddev->pers->check_reshape(mddev) != 0) 9528 return false; 9529 9530 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 9531 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9532 return true; 9533 } 9534 9535 /* Check if resync is in progress. */ 9536 if (mddev->recovery_cp < MaxSector) { 9537 remove_spares(mddev, NULL); 9538 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9539 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9540 return true; 9541 } 9542 9543 /* 9544 * Remove any failed drives, then add spares if possible. Spares are 9545 * also removed and re-added, to allow the personality to fail the 9546 * re-add. 9547 */ 9548 *spares = remove_and_add_spares(mddev, NULL); 9549 if (*spares) { 9550 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9551 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 9552 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 9553 9554 /* Start new recovery. */ 9555 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9556 return true; 9557 } 9558 9559 /* Delay to choose resync/check/repair in md_do_sync(). 
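	 * MD_RECOVERY_SYNC is typically set by an explicit request, e.g.
	 * writing "check" or "repair" to the sync_action sysfs attribute.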
*/ 9560 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 9561 return true; 9562 9563 /* Nothing to be done */ 9564 return false; 9565 } 9566 9567 static void md_start_sync(struct work_struct *ws) 9568 { 9569 struct mddev *mddev = container_of(ws, struct mddev, sync_work); 9570 int spares = 0; 9571 bool suspend = false; 9572 char *name; 9573 9574 /* 9575 * If reshape is still in progress, spares won't be added or removed 9576 * from conf until reshape is done. 9577 */ 9578 if (mddev->reshape_position == MaxSector && 9579 md_spares_need_change(mddev)) { 9580 suspend = true; 9581 mddev_suspend(mddev, false); 9582 } 9583 9584 mddev_lock_nointr(mddev); 9585 if (!md_is_rdwr(mddev)) { 9586 /* 9587 * On a read-only array we can: 9588 * - remove failed devices 9589 * - add already-in_sync devices if the array itself is in-sync. 9590 * As we only add devices that are already in-sync, we can 9591 * activate the spares immediately. 9592 */ 9593 remove_and_add_spares(mddev, NULL); 9594 goto not_running; 9595 } 9596 9597 if (!md_choose_sync_action(mddev, &spares)) 9598 goto not_running; 9599 9600 if (!mddev->pers->sync_request) 9601 goto not_running; 9602 9603 /* 9604 * We are adding a device or devices to an array which has the bitmap 9605 * stored on all devices. So make sure all bitmap pages get written. 9606 */ 9607 if (spares) 9608 mddev->bitmap_ops->write_all(mddev); 9609 9610 name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ? 9611 "reshape" : "resync"; 9612 rcu_assign_pointer(mddev->sync_thread, 9613 md_register_thread(md_do_sync, mddev, name)); 9614 if (!mddev->sync_thread) { 9615 pr_warn("%s: could not start resync thread...\n", 9616 mdname(mddev)); 9617 /* leave the spares where they are, it shouldn't hurt */ 9618 goto not_running; 9619 } 9620 9621 mddev_unlock(mddev); 9622 /* 9623 * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should 9624 * not set it again. Otherwise, we may cause issue like this one: 9625 * https://bugzilla.kernel.org/show_bug.cgi?id=218200 9626 * Therefore, use __mddev_resume(mddev, false). 9627 */ 9628 if (suspend) 9629 __mddev_resume(mddev, false); 9630 md_wakeup_thread(mddev->sync_thread); 9631 sysfs_notify_dirent_safe(mddev->sysfs_action); 9632 md_new_event(); 9633 return; 9634 9635 not_running: 9636 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9637 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 9638 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 9639 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 9640 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9641 mddev_unlock(mddev); 9642 /* 9643 * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should 9644 * not set it again. Otherwise, we may cause issue like this one: 9645 * https://bugzilla.kernel.org/show_bug.cgi?id=218200 9646 * Therefore, use __mddev_resume(mddev, false). 
9647 */ 9648 if (suspend) 9649 __mddev_resume(mddev, false); 9650 9651 wake_up(&resync_wait); 9652 if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && 9653 mddev->sysfs_action) 9654 sysfs_notify_dirent_safe(mddev->sysfs_action); 9655 } 9656 9657 static void unregister_sync_thread(struct mddev *mddev) 9658 { 9659 if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 9660 /* resync/recovery still happening */ 9661 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9662 return; 9663 } 9664 9665 if (WARN_ON_ONCE(!mddev->sync_thread)) 9666 return; 9667 9668 md_reap_sync_thread(mddev); 9669 } 9670 9671 /* 9672 * This routine is regularly called by all per-raid-array threads to 9673 * deal with generic issues like resync and super-block update. 9674 * Raid personalities that don't have a thread (linear/raid0) do not 9675 * need this as they never do any recovery or update the superblock. 9676 * 9677 * It does not do any resync itself, but rather "forks" off other threads 9678 * to do that as needed. 9679 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in 9680 * "->recovery" and create a thread at ->sync_thread. 9681 * When the thread finishes it sets MD_RECOVERY_DONE 9682 * and wakeups up this thread which will reap the thread and finish up. 9683 * This thread also removes any faulty devices (with nr_pending == 0). 9684 * 9685 * The overall approach is: 9686 * 1/ if the superblock needs updating, update it. 9687 * 2/ If a recovery thread is running, don't do anything else. 9688 * 3/ If recovery has finished, clean up, possibly marking spares active. 9689 * 4/ If there are any faulty devices, remove them. 9690 * 5/ If array is degraded, try to add spares devices 9691 * 6/ If array has spares or is not in-sync, start a resync thread. 9692 */ 9693 void md_check_recovery(struct mddev *mddev) 9694 { 9695 if (mddev->bitmap) 9696 mddev->bitmap_ops->daemon_work(mddev); 9697 9698 if (signal_pending(current)) { 9699 if (mddev->pers->sync_request && !mddev->external) { 9700 pr_debug("md: %s in immediate safe mode\n", 9701 mdname(mddev)); 9702 mddev->safemode = 2; 9703 } 9704 flush_signals(current); 9705 } 9706 9707 if (!md_is_rdwr(mddev) && 9708 !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) && 9709 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 9710 return; 9711 if ( ! ( 9712 (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) || 9713 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 9714 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 9715 (mddev->external == 0 && mddev->safemode == 1) || 9716 (mddev->safemode == 2 9717 && !mddev->in_sync && mddev->recovery_cp == MaxSector) 9718 )) 9719 return; 9720 9721 if (mddev_trylock(mddev)) { 9722 bool try_set_sync = mddev->safemode != 0; 9723 9724 if (!mddev->external && mddev->safemode == 1) 9725 mddev->safemode = 0; 9726 9727 if (!md_is_rdwr(mddev)) { 9728 struct md_rdev *rdev; 9729 9730 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 9731 unregister_sync_thread(mddev); 9732 goto unlock; 9733 } 9734 9735 if (!mddev->external && mddev->in_sync) 9736 /* 9737 * 'Blocked' flag not needed as failed devices 9738 * will be recorded if array switched to read/write. 9739 * Leaving it set will prevent the device 9740 * from being removed. 
9741 */ 9742 rdev_for_each(rdev, mddev) 9743 clear_bit(Blocked, &rdev->flags); 9744 9745 /* 9746 * There is no thread, but we need to call 9747 * ->spare_active and clear saved_raid_disk 9748 */ 9749 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9750 md_reap_sync_thread(mddev); 9751 9752 /* 9753 * Let md_start_sync() to remove and add rdevs to the 9754 * array. 9755 */ 9756 if (md_spares_need_change(mddev)) { 9757 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9758 queue_work(md_misc_wq, &mddev->sync_work); 9759 } 9760 9761 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9762 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9763 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 9764 9765 goto unlock; 9766 } 9767 9768 if (mddev_is_clustered(mddev)) { 9769 struct md_rdev *rdev, *tmp; 9770 /* kick the device if another node issued a 9771 * remove disk. 9772 */ 9773 rdev_for_each_safe(rdev, tmp, mddev) { 9774 if (test_and_clear_bit(ClusterRemove, &rdev->flags) && 9775 rdev->raid_disk < 0) 9776 md_kick_rdev_from_array(rdev); 9777 } 9778 } 9779 9780 if (try_set_sync && !mddev->external && !mddev->in_sync) { 9781 spin_lock(&mddev->lock); 9782 set_in_sync(mddev); 9783 spin_unlock(&mddev->lock); 9784 } 9785 9786 if (mddev->sb_flags) 9787 md_update_sb(mddev, 0); 9788 9789 /* 9790 * Never start a new sync thread if MD_RECOVERY_RUNNING is 9791 * still set. 9792 */ 9793 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 9794 unregister_sync_thread(mddev); 9795 goto unlock; 9796 } 9797 9798 /* Set RUNNING before clearing NEEDED to avoid 9799 * any transients in the value of "sync_action". 9800 */ 9801 mddev->curr_resync_completed = 0; 9802 spin_lock(&mddev->lock); 9803 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9804 spin_unlock(&mddev->lock); 9805 /* Clear some bits that don't mean anything, but 9806 * might be left set 9807 */ 9808 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 9809 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 9810 9811 if (test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) && 9812 !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 9813 queue_work(md_misc_wq, &mddev->sync_work); 9814 } else { 9815 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9816 wake_up(&resync_wait); 9817 } 9818 9819 unlock: 9820 wake_up(&mddev->sb_wait); 9821 mddev_unlock(mddev); 9822 } 9823 } 9824 EXPORT_SYMBOL(md_check_recovery); 9825 9826 void md_reap_sync_thread(struct mddev *mddev) 9827 { 9828 struct md_rdev *rdev; 9829 sector_t old_dev_sectors = mddev->dev_sectors; 9830 bool is_reshaped = false; 9831 9832 /* resync has finished, collect result */ 9833 md_unregister_thread(mddev, &mddev->sync_thread); 9834 atomic_inc(&mddev->sync_seq); 9835 9836 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9837 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 9838 mddev->degraded != mddev->raid_disks) { 9839 /* success...*/ 9840 /* activate any spares */ 9841 if (mddev->pers->spare_active(mddev)) { 9842 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 9843 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9844 } 9845 } 9846 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9847 mddev->pers->finish_reshape) { 9848 mddev->pers->finish_reshape(mddev); 9849 if (mddev_is_clustered(mddev)) 9850 is_reshaped = true; 9851 } 9852 9853 /* If array is no-longer degraded, then any saved_raid_disk 9854 * information must be scrapped. 
 */
	if (!mddev->degraded)
		rdev_for_each(rdev, mddev)
			rdev->saved_raid_disk = -1;

	md_update_sb(mddev, 1);
	/*
	 * MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can
	 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
	 * clustered raid.
	 */
	if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
		mddev->cluster_ops->resync_finish(mddev);
	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
	/*
	 * We call mddev->cluster_ops->update_size here because sync_size could
	 * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
	 * so it is time to update the size across the cluster.
	 */
	if (mddev_is_clustered(mddev) && is_reshaped
				      && !test_bit(MD_CLOSING, &mddev->flags))
		mddev->cluster_ops->update_size(mddev, old_dev_sectors);
	/* flag recovery needed just to double check */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	sysfs_notify_dirent_safe(mddev->sysfs_completed);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	md_new_event();
	if (mddev->event_work.func)
		queue_work(md_misc_wq, &mddev->event_work);
	wake_up(&resync_wait);
}
EXPORT_SYMBOL(md_reap_sync_thread);

void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
{
	sysfs_notify_dirent_safe(rdev->sysfs_state);
	wait_event_timeout(rdev->blocked_wait, !rdev_blocked(rdev),
			   msecs_to_jiffies(5000));
	rdev_dec_pending(rdev, mddev);
}
EXPORT_SYMBOL(md_wait_for_blocked_rdev);

void md_finish_reshape(struct mddev *mddev)
{
	/* called by a personality module when reshape completes. */
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev) {
		if (rdev->data_offset > rdev->new_data_offset)
			rdev->sectors += rdev->data_offset - rdev->new_data_offset;
		else
			rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
		rdev->data_offset = rdev->new_data_offset;
	}
}
EXPORT_SYMBOL(md_finish_reshape);

/* Bad block management */

/* Returns true on success, false on failure */
bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
			int is_new)
{
	struct mddev *mddev = rdev->mddev;

	/*
	 * Recording new badblocks for a faulty rdev forces an unnecessary
	 * superblock update. This is fragile for external management because
	 * a userspace daemon may be trying to remove the device at the same
	 * time, and a deadlock may occur. This will probably be solved in
	 * mdadm, but it is safer to avoid it.
/* Returns true on success, false on failure */
bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
			int is_new)
{
	struct mddev *mddev = rdev->mddev;

	/*
	 * Recording new badblocks for a faulty rdev will force an unnecessary
	 * superblock update. This is fragile for externally managed arrays
	 * because a userspace daemon may be trying to remove this device and
	 * a deadlock may occur. This will probably be solved in mdadm, but
	 * it is safer to avoid it here.
	 */
	if (test_bit(Faulty, &rdev->flags))
		return true;

	if (is_new)
		s += rdev->new_data_offset;
	else
		s += rdev->data_offset;

	if (!badblocks_set(&rdev->badblocks, s, sectors, 0))
		return false;

	/* Make sure they get written out promptly */
	if (test_bit(ExternalBbl, &rdev->flags))
		sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
	sysfs_notify_dirent_safe(rdev->sysfs_state);
	set_mask_bits(&mddev->sb_flags, 0,
		      BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
	md_wakeup_thread(rdev->mddev->thread);
	return true;
}
EXPORT_SYMBOL_GPL(rdev_set_badblocks);

void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
			  int is_new)
{
	if (is_new)
		s += rdev->new_data_offset;
	else
		s += rdev->data_offset;

	if (!badblocks_clear(&rdev->badblocks, s, sectors))
		return;

	if (test_bit(ExternalBbl, &rdev->flags))
		sysfs_notify_dirent_safe(rdev->sysfs_badblocks);
}
EXPORT_SYMBOL_GPL(rdev_clear_badblocks);

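/*
 * Reboot/shutdown notifier: try to quiesce every array so that it is
 * marked clean before the machine goes down. If any array is present we
 * also pause briefly (see need_delay below) to give outstanding
 * superblock writes a chance to reach the devices.
 */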
static int md_notify_reboot(struct notifier_block *this,
			    unsigned long code, void *x)
{
	struct mddev *mddev;
	int need_delay = 0;

	spin_lock(&all_mddevs_lock);
	list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
		if (!mddev_get(mddev))
			continue;
		spin_unlock(&all_mddevs_lock);
		if (mddev_trylock(mddev)) {
			if (mddev->pers)
				__md_stop_writes(mddev);
			if (mddev->persistent)
				mddev->safemode = 2;
			mddev_unlock(mddev);
		}
		need_delay = 1;
		spin_lock(&all_mddevs_lock);
		mddev_put_locked(mddev);
	}
	spin_unlock(&all_mddevs_lock);

	/*
	 * Certain more exotic SCSI devices are known to be volatile with
	 * respect to too-early system reboots. While the right place to
	 * handle this issue is the individual driver, we do want to have
	 * a safe RAID driver ...
	 */
	if (need_delay)
		msleep(1000);

	return NOTIFY_DONE;
}

static struct notifier_block md_notifier = {
	.notifier_call = md_notify_reboot,
	.next = NULL,
	.priority = INT_MAX, /* before any real devices */
};

static void md_geninit(void)
{
	pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));

	proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops);
}

static int __init md_init(void)
{
	int ret = -ENOMEM;

	md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
	if (!md_wq)
		goto err_wq;

	md_misc_wq = alloc_workqueue("md_misc", 0, 0);
	if (!md_misc_wq)
		goto err_misc_wq;

	md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND,
				       0);
	if (!md_bitmap_wq)
		goto err_bitmap_wq;

	ret = __register_blkdev(MD_MAJOR, "md", md_probe);
	if (ret < 0)
		goto err_md;

	ret = __register_blkdev(0, "mdp", md_probe);
	if (ret < 0)
		goto err_mdp;
	mdp_major = ret;

	register_reboot_notifier(&md_notifier);
	raid_table_header = register_sysctl("dev/raid", raid_table);

	md_geninit();
	return 0;

err_mdp:
	unregister_blkdev(MD_MAJOR, "md");
err_md:
	destroy_workqueue(md_bitmap_wq);
err_bitmap_wq:
	destroy_workqueue(md_misc_wq);
err_misc_wq:
	destroy_workqueue(md_wq);
err_wq:
	return ret;
}

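/*
 * Called on a clustered array when another node has updated the
 * superblock: pick up a size change, role changes (spare activation,
 * device failure, removal of a failed Candidate), a change in the
 * number of raid disks, and remote reshape progress, then bring
 * mddev->events up to date.
 */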
static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	struct md_rdev *rdev2, *tmp;
	int role, ret;

	/*
	 * If the size was changed on another node then we need to
	 * do the resize here as well.
	 */
	if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
		ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
		if (ret)
			pr_info("md-cluster: resize failed\n");
		else
			mddev->bitmap_ops->update_sb(mddev->bitmap);
	}

	/* Check for change of roles in the active devices */
	rdev_for_each_safe(rdev2, tmp, mddev) {
		if (test_bit(Faulty, &rdev2->flags))
			continue;

		/* Check if the roles changed */
		role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);

		if (test_bit(Candidate, &rdev2->flags)) {
			if (role == MD_DISK_ROLE_FAULTY) {
				pr_info("md: Removing Candidate device %pg because add failed\n",
					rdev2->bdev);
				md_kick_rdev_from_array(rdev2);
				continue;
			} else
				clear_bit(Candidate, &rdev2->flags);
		}

		if (role != rdev2->raid_disk) {
			/*
			 * The device got activated, unless a reshape is
			 * happening.
			 */
			if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE &&
			    !(le32_to_cpu(sb->feature_map) &
			      MD_FEATURE_RESHAPE_ACTIVE) &&
			    !mddev->cluster_ops->resync_status_get(mddev)) {
				/*
				 * -1 makes raid1_add_disk() set conf->fullsync
				 * to 1. This avoids skipping the sync when the
				 * remote node goes down during resync.
				 */
				if ((le32_to_cpu(sb->feature_map)
				     & MD_FEATURE_RECOVERY_OFFSET))
					rdev2->saved_raid_disk = -1;
				else
					rdev2->saved_raid_disk = role;
				ret = remove_and_add_spares(mddev, rdev2);
				pr_info("Activated spare: %pg\n",
					rdev2->bdev);
				/* Wake up mddev->thread here, so the array can
				 * perform a resync with the newly activated
				 * disk.
				 */
				set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
				md_wakeup_thread(mddev->thread);
			}
			/* Device faulty:
			 * we just want to do the minimum to mark the disk
			 * as faulty. The recovery is performed by the
			 * node that initiated the error.
			 */
			if (role == MD_DISK_ROLE_FAULTY ||
			    role == MD_DISK_ROLE_JOURNAL) {
				md_error(mddev, rdev2);
				clear_bit(Blocked, &rdev2->flags);
			}
		}
	}

	if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) {
		ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
		if (ret)
			pr_warn("md: updating array disks failed. %d\n", ret);
	}

	/*
	 * Since mddev->delta_disks has already been updated in
	 * update_raid_disks, it is time to check for a reshape.
	 */
	if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
		/*
		 * A reshape is happening on the remote node, so we need to
		 * update reshape_position and call start_reshape.
		 */
		mddev->reshape_position = le64_to_cpu(sb->reshape_position);
		if (mddev->pers->update_reshape_pos)
			mddev->pers->update_reshape_pos(mddev);
		if (mddev->pers->start_reshape)
			mddev->pers->start_reshape(mddev);
	} else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
		   mddev->reshape_position != MaxSector &&
		   !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
		/* The reshape has just finished on another node. */
		mddev->reshape_position = MaxSector;
		if (mddev->pers->update_reshape_pos)
			mddev->pers->update_reshape_pos(mddev);
	}

	/* Finally set the event count to be up to date */
	mddev->events = le64_to_cpu(sb->events);
}

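/*
 * Re-read the superblock of a single member device after a cluster
 * event. On failure the previously loaded superblock page is restored
 * so the rdev is left in a usable state.
 */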
static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
	int err;
	struct page *swapout = rdev->sb_page;
	struct mdp_superblock_1 *sb;

	/* Store the sb page of the rdev in the swapout temporary
	 * variable in case we fail further down.
	 */
	rdev->sb_page = NULL;
	err = alloc_disk_sb(rdev);
	if (err == 0) {
		ClearPageUptodate(rdev->sb_page);
		rdev->sb_loaded = 0;
		err = super_types[mddev->major_version].
			load_super(rdev, NULL, mddev->minor_version);
	}
	if (err < 0) {
		pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
			__func__, __LINE__, rdev->desc_nr, err);
		if (rdev->sb_page)
			put_page(rdev->sb_page);
		rdev->sb_page = swapout;
		rdev->sb_loaded = 1;
		return err;
	}

	sb = page_address(rdev->sb_page);
	/* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET
	 * is not set
	 */

	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
		rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);

	/* The other node finished recovery, call spare_active to set
	 * the device In_sync and update mddev->degraded
	 */
	if (rdev->recovery_offset == MaxSector &&
	    !test_bit(In_sync, &rdev->flags) &&
	    mddev->pers->spare_active(mddev))
		sysfs_notify_dirent_safe(mddev->sysfs_degraded);

	put_page(swapout);
	return 0;
}

void md_reload_sb(struct mddev *mddev, int nr)
{
	struct md_rdev *rdev = NULL, *iter;
	int err;

	/* Find the rdev */
	rdev_for_each_rcu(iter, mddev) {
		if (iter->desc_nr == nr) {
			rdev = iter;
			break;
		}
	}

	if (!rdev) {
		pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
		return;
	}

	err = read_rdev(mddev, rdev);
	if (err < 0)
		return;

	check_sb_changes(mddev, rdev);

	/* Read all rdevs to update recovery_offset */
	rdev_for_each_rcu(rdev, mddev) {
		if (!test_bit(Faulty, &rdev->flags))
			read_rdev(mddev, rdev);
	}
}
EXPORT_SYMBOL(md_reload_sb);

#ifndef MODULE

/*
 * Searches all registered partitions for autorun RAID arrays
 * at boot time.
 */

static DEFINE_MUTEX(detected_devices_mutex);
static LIST_HEAD(all_detected_devices);
struct detected_devices_node {
	struct list_head list;
	dev_t dev;
};

void md_autodetect_dev(dev_t dev)
{
	struct detected_devices_node *node_detected_dev;

	node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
	if (node_detected_dev) {
		node_detected_dev->dev = dev;
		mutex_lock(&detected_devices_mutex);
		list_add_tail(&node_detected_dev->list, &all_detected_devices);
		mutex_unlock(&detected_devices_mutex);
	}
}

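/*
 * Called at boot to import every device that md_autodetect_dev() queued
 * up (typically partitions of type 0xfd, LINUX_RAID_PARTITION, noted by
 * the partition scanner when md is built in) and hand them to
 * autorun_devices() so that complete arrays can be assembled and
 * started.
 */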
void md_autostart_arrays(int part)
{
	struct md_rdev *rdev;
	struct detected_devices_node *node_detected_dev;
	dev_t dev;
	int i_scanned, i_passed;

	i_scanned = 0;
	i_passed = 0;

	pr_info("md: Autodetecting RAID arrays.\n");

	mutex_lock(&detected_devices_mutex);
	while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
		i_scanned++;
		node_detected_dev = list_entry(all_detected_devices.next,
					       struct detected_devices_node, list);
		list_del(&node_detected_dev->list);
		dev = node_detected_dev->dev;
		kfree(node_detected_dev);
		mutex_unlock(&detected_devices_mutex);
		rdev = md_import_device(dev, 0, 90);
		mutex_lock(&detected_devices_mutex);
		if (IS_ERR(rdev))
			continue;

		if (test_bit(Faulty, &rdev->flags))
			continue;

		set_bit(AutoDetected, &rdev->flags);
		list_add(&rdev->same_set, &pending_raid_disks);
		i_passed++;
	}
	mutex_unlock(&detected_devices_mutex);

	pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);

	autorun_devices(part);
}

#endif /* !MODULE */

static __exit void md_exit(void)
{
	struct mddev *mddev;
	int delay = 1;

	unregister_blkdev(MD_MAJOR, "md");
	unregister_blkdev(mdp_major, "mdp");
	unregister_reboot_notifier(&md_notifier);
	unregister_sysctl_table(raid_table_header);

	/* We cannot unload the modules while some process is
	 * waiting for us in select() or poll() - wake them up
	 */
	md_unloading = 1;
	while (waitqueue_active(&md_event_waiters)) {
		/* not safe to leave yet */
		wake_up(&md_event_waiters);
		msleep(delay);
		delay += delay;
	}
	remove_proc_entry("mdstat", NULL);

	spin_lock(&all_mddevs_lock);
	list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
		if (!mddev_get(mddev))
			continue;
		spin_unlock(&all_mddevs_lock);
		export_array(mddev);
		mddev->ctime = 0;
		mddev->hold_active = 0;
		/*
		 * As the mddev is now fully clear, mddev_put will schedule
		 * the mddev for destruction by a workqueue, and the
		 * destroy_workqueue() below will wait for that to complete.
		 */
		spin_lock(&all_mddevs_lock);
		mddev_put_locked(mddev);
	}
	spin_unlock(&all_mddevs_lock);

	destroy_workqueue(md_misc_wq);
	destroy_workqueue(md_bitmap_wq);
	destroy_workqueue(md_wq);
}

subsys_initcall(md_init);
module_exit(md_exit)

static int get_ro(char *buffer, const struct kernel_param *kp)
{
	return sprintf(buffer, "%d\n", start_readonly);
}

static int set_ro(const char *val, const struct kernel_param *kp)
{
	return kstrtouint(val, 10, (unsigned int *)&start_readonly);
}

module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
module_param(create_on_open, bool, S_IRUSR|S_IWUSR);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MD RAID framework");
MODULE_ALIAS("md");
MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);