// SPDX-License-Identifier: GPL-2.0-or-later
/*
   md.c : Multiple Devices driver for Linux
     Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.


   Errors, Warnings, etc.
   Please use:
     pr_crit() for error conditions that risk data loss
     pr_err() for error conditions that are unexpected, like an IO error
         or internal inconsistency
     pr_warn() for error conditions that could have been predicted, like
         adding a device to an array when it has incompatible metadata
     pr_info() for interesting, very rare events, like an array starting
         or stopping, or resync starting or stopping
     pr_debug() for everything else.

*/

#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/badblocks.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/major.h>
#include <linux/module.h>
#include <linux/reboot.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/raid/detect.h>
#include <linux/slab.h>
#include <linux/percpu-refcount.h>
#include <linux/part_stat.h>

#include "md.h"
#include "md-bitmap.h"
#include "md-cluster.h"

static const char *action_name[NR_SYNC_ACTIONS] = {
	[ACTION_RESYNC]		= "resync",
	[ACTION_RECOVER]	= "recover",
	[ACTION_CHECK]		= "check",
	[ACTION_REPAIR]		= "repair",
	[ACTION_RESHAPE]	= "reshape",
	[ACTION_FROZEN]		= "frozen",
	[ACTION_IDLE]		= "idle",
};

/* pers_list is a list of registered personalities protected by pers_lock. */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

static const struct kobj_type md_ktype;

const struct md_cluster_operations *md_cluster_ops;
EXPORT_SYMBOL(md_cluster_ops);
static struct module *md_cluster_mod;

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;

/*
 * This workqueue is used for sync_work to register new sync_thread, and for
 * del_work to remove rdev, and for event_work that is only set by dm-raid.
 *
 * Note that sync_work will grab reconfig_mutex, hence never flush this
 * workqueue with reconfig_mutex held.
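 *
 * Flushing a workqueue waits for all queued work items to finish; sync_work
 * itself takes reconfig_mutex, so a caller that flushes md_misc_wq while
 * already holding reconfig_mutex would wait forever for work that is in
 * turn waiting for the mutex.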
 */
static struct workqueue_struct *md_misc_wq;
struct workqueue_struct *md_bitmap_wq;

static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);
static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);
static void md_wakeup_thread_directly(struct md_thread __rcu *thread);

/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
/* Default safemode delay: 200 msec */
#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * You can change it via /proc/sys/dev/raid/speed_limit_min and _max,
 * or via /sys/block/mdX/md/sync_speed_{min,max}.
 */

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static inline int speed_min(struct mddev *mddev)
{
	return mddev->sync_speed_min ?
		mddev->sync_speed_min : sysctl_speed_limit_min;
}

static inline int speed_max(struct mddev *mddev)
{
	return mddev->sync_speed_max ?
		mddev->sync_speed_max : sysctl_speed_limit_max;
}

static void rdev_uninit_serial(struct md_rdev *rdev)
{
	if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
		return;

	kvfree(rdev->serial);
	rdev->serial = NULL;
}

static void rdevs_uninit_serial(struct mddev *mddev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev)
		rdev_uninit_serial(rdev);
}

static int rdev_init_serial(struct md_rdev *rdev)
{
	/* serial_nums equals BARRIER_BUCKETS_NR */
	int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t))));
	struct serial_in_rdev *serial = NULL;

	if (test_bit(CollisionCheck, &rdev->flags))
		return 0;

	serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums,
			  GFP_KERNEL);
	if (!serial)
		return -ENOMEM;

	for (i = 0; i < serial_nums; i++) {
		struct serial_in_rdev *serial_tmp = &serial[i];

		spin_lock_init(&serial_tmp->serial_lock);
		serial_tmp->serial_rb = RB_ROOT_CACHED;
		init_waitqueue_head(&serial_tmp->serial_io_wait);
	}

	rdev->serial = serial;
	set_bit(CollisionCheck, &rdev->flags);

	return 0;
}

static int rdevs_init_serial(struct mddev *mddev)
{
	struct md_rdev *rdev;
	int ret = 0;

	rdev_for_each(rdev, mddev) {
		ret = rdev_init_serial(rdev);
		if (ret)
			break;
	}

	/* Free all resources if the pool does not exist */
	if (ret && !mddev->serial_info_pool)
		rdevs_uninit_serial(mddev);

	return ret;
}

/*
 * rdev needs to enable serialization if it meets both conditions:
 * 1. it is a multi-queue device flagged with writemostly.
 * 2. the write-behind mode is enabled.
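 *
 * With multiple hardware queues, write-behind writes to a write-mostly
 * device can complete out of order; the per-bucket rbtrees and wait queues
 * set up in rdev_init_serial() track in-flight write ranges so that
 * overlapping ("colliding") writes are serialized instead of racing.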
 */
static int rdev_need_serial(struct md_rdev *rdev)
{
	return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 &&
		rdev->bdev->bd_disk->queue->nr_hw_queues != 1 &&
		test_bit(WriteMostly, &rdev->flags));
}

/*
 * Init resource for rdev(s), then create serial_info_pool if:
 * 1. rdev is the first device that returns true from rdev_need_serial().
 * 2. rdev is NULL, which means we want to enable serialization for all rdevs.
 */
void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
{
	int ret = 0;

	if (rdev && !rdev_need_serial(rdev) &&
	    !test_bit(CollisionCheck, &rdev->flags))
		return;

	if (!rdev)
		ret = rdevs_init_serial(mddev);
	else
		ret = rdev_init_serial(rdev);
	if (ret)
		return;

	if (mddev->serial_info_pool == NULL) {
		/*
		 * already in memalloc noio context by
		 * mddev_suspend()
		 */
		mddev->serial_info_pool =
			mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
						sizeof(struct serial_info));
		if (!mddev->serial_info_pool) {
			rdevs_uninit_serial(mddev);
			pr_err("can't alloc memory pool for serialization\n");
		}
	}
}

/*
 * Free resource from rdev(s), and destroy serial_info_pool under these
 * conditions:
 * 1. rdev is the last device flagged with CollisionCheck.
 * 2. the bitmap is destroyed while the policy is not enabled.
 * 3. when the policy is being disabled, the pool is destroyed only if no
 *    rdev needs it any more.
 */
void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
{
	if (rdev && !test_bit(CollisionCheck, &rdev->flags))
		return;

	if (mddev->serial_info_pool) {
		struct md_rdev *temp;
		int num = 0; /* used to track if other rdevs need the pool */

		rdev_for_each(temp, mddev) {
			if (!rdev) {
				if (!mddev->serialize_policy ||
				    !rdev_need_serial(temp))
					rdev_uninit_serial(temp);
				else
					num++;
			} else if (temp != rdev &&
				   test_bit(CollisionCheck, &temp->flags))
				num++;
		}

		if (rdev)
			rdev_uninit_serial(rdev);

		if (num)
			pr_info("The mempool could be used by other devices\n");
		else {
			mempool_destroy(mddev->serial_info_pool);
			mddev->serial_info_pool = NULL;
		}
	}
}

static struct ctl_table_header *raid_table_header;

static const struct ctl_table raid_table[] = {
	{
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
};

static int start_readonly;

/*
 * The original mechanism for creating an md device is to create
 * a device node in /dev and to open it. This causes races with device-close.
 * The preferred method is to write to the "new_array" module parameter.
 * This can avoid races.
 * Setting create_on_open to false disables the original mechanism
 * so all the races disappear.
 */
static bool create_on_open = true;

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
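 *
 * A monitor (mdadm-style tooling, for example) typically opens /proc/mdstat,
 * reads it once, and then sleeps in poll()/select() on the fd; md_new_event()
 * below bumps md_event_count and wakes md_event_waiters, which makes that
 * poll return so the reader can re-read the file instead of busy-polling.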
 *
 * Events are:
 *	start array, stop array, error, add device, remove device,
 *	start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
void md_new_event(void)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);

/*
 * Enables iterating over all existing md arrays.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);

static bool is_md_suspended(struct mddev *mddev)
{
	return percpu_ref_is_dying(&mddev->active_io);
}
/* Rather than calling directly into the personality make_request function,
 * IO requests come here first so that we can check if the device is
 * being suspended pending a reconfiguration.
 * We hold a refcount over the call to ->make_request. By the time that
 * call has finished, the bio has been linked into some internal structure
 * and so is visible to ->quiesce(), so we don't need the refcount any more.
 */
static bool is_suspended(struct mddev *mddev, struct bio *bio)
{
	if (is_md_suspended(mddev))
		return true;
	if (bio_data_dir(bio) != WRITE)
		return false;
	if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi))
		return false;
	if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi))
		return false;
	if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo))
		return false;
	return true;
}

bool md_handle_request(struct mddev *mddev, struct bio *bio)
{
check_suspended:
	if (is_suspended(mddev, bio)) {
		DEFINE_WAIT(__wait);
		/* Bail out if REQ_NOWAIT is set for the bio */
		if (bio->bi_opf & REQ_NOWAIT) {
			bio_wouldblock_error(bio);
			return true;
		}
		for (;;) {
			prepare_to_wait(&mddev->sb_wait, &__wait,
					TASK_UNINTERRUPTIBLE);
			if (!is_suspended(mddev, bio))
				break;
			schedule();
		}
		finish_wait(&mddev->sb_wait, &__wait);
	}
	if (!percpu_ref_tryget_live(&mddev->active_io))
		goto check_suspended;

	if (!mddev->pers->make_request(mddev, bio)) {
		percpu_ref_put(&mddev->active_io);
		if (!mddev->gendisk && mddev->pers->prepare_suspend)
			return false;
		goto check_suspended;
	}

	percpu_ref_put(&mddev->active_io);
	return true;
}
EXPORT_SYMBOL(md_handle_request);

static void md_submit_bio(struct bio *bio)
{
	const int rw = bio_data_dir(bio);
	struct mddev *mddev = bio->bi_bdev->bd_disk->private_data;

	if (mddev == NULL || mddev->pers == NULL) {
		bio_io_error(bio);
		return;
	}

	if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
		bio_io_error(bio);
		return;
	}

	bio = bio_split_to_limits(bio);
	if (!bio)
		return;

	if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) {
		if (bio_sectors(bio) != 0)
			bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
		return;
	}

	/* bio could be mergeable after passing to underlayer */
	bio->bi_opf &= ~REQ_NOMERGE;

	md_handle_request(mddev, bio);
}

/*
 * Make sure no new requests are submitted to the device, and any requests that
 * have been submitted are completely handled.
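 *
 * Concretely, mddev_suspend() below bumps the nested mddev->suspended count,
 * kills the active_io percpu refcount so md_handle_request() can no longer
 * take a reference and instead parks in its is_suspended() wait loop, and
 * then sleeps on sb_wait until every in-flight active_io reference is gone.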
 */
int mddev_suspend(struct mddev *mddev, bool interruptible)
{
	int err = 0;

	/*
	 * Holding reconfig_mutex while waiting for normal io to finish can
	 * deadlock, because other contexts then can't update the super_block,
	 * and normal io can rely on the super_block being updated.
	 */
	lockdep_assert_not_held(&mddev->reconfig_mutex);

	if (interruptible)
		err = mutex_lock_interruptible(&mddev->suspend_mutex);
	else
		mutex_lock(&mddev->suspend_mutex);
	if (err)
		return err;

	if (mddev->suspended) {
		WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
		mutex_unlock(&mddev->suspend_mutex);
		return 0;
	}

	percpu_ref_kill(&mddev->active_io);
	if (interruptible)
		err = wait_event_interruptible(mddev->sb_wait,
				percpu_ref_is_zero(&mddev->active_io));
	else
		wait_event(mddev->sb_wait,
				percpu_ref_is_zero(&mddev->active_io));
	if (err) {
		percpu_ref_resurrect(&mddev->active_io);
		mutex_unlock(&mddev->suspend_mutex);
		return err;
	}

	/*
	 * For raid456, io might be waiting for reshape to make progress;
	 * allow a new reshape to start while waiting for io to be done, to
	 * prevent a deadlock.
	 */
	WRITE_ONCE(mddev->suspended, mddev->suspended + 1);

	/* restrict memory reclaim I/O while the raid array is suspended */
	mddev->noio_flag = memalloc_noio_save();

	mutex_unlock(&mddev->suspend_mutex);
	return 0;
}
EXPORT_SYMBOL_GPL(mddev_suspend);

static void __mddev_resume(struct mddev *mddev, bool recovery_needed)
{
	lockdep_assert_not_held(&mddev->reconfig_mutex);

	mutex_lock(&mddev->suspend_mutex);
	WRITE_ONCE(mddev->suspended, mddev->suspended - 1);
	if (mddev->suspended) {
		mutex_unlock(&mddev->suspend_mutex);
		return;
	}

	/* entered the memalloc scope from mddev_suspend() */
	memalloc_noio_restore(mddev->noio_flag);

	percpu_ref_resurrect(&mddev->active_io);
	wake_up(&mddev->sb_wait);

	if (recovery_needed)
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */

	mutex_unlock(&mddev->suspend_mutex);
}

void mddev_resume(struct mddev *mddev)
{
	return __mddev_resume(mddev, true);
}
EXPORT_SYMBOL_GPL(mddev_resume);

/* sync bdev before setting device to readonly or stopping raid */
static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_num)
{
	mutex_lock(&mddev->open_mutex);
	if (mddev->pers && atomic_read(&mddev->openers) > opener_num) {
		mutex_unlock(&mddev->open_mutex);
		return -EBUSY;
	}
	if (test_and_set_bit(MD_CLOSING, &mddev->flags)) {
		mutex_unlock(&mddev->open_mutex);
		return -EBUSY;
	}
	mutex_unlock(&mddev->open_mutex);

	sync_blockdev(mddev->gendisk->part0);
	return 0;
}

/*
 * The only difference from bio_chain_endio() is that the current
 * bi_status of bio does not affect the bi_status of parent.
 */
static void md_end_flush(struct bio *bio)
{
	struct bio *parent = bio->bi_private;

	/*
	 * If any flush io fails before a power failure,
	 * disk data may be lost.
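	 * A failed PREFLUSH means the device may not have written its
	 * volatile cache back to stable media, so data that upper layers
	 * already saw acknowledged could vanish on power loss; all this
	 * handler can do is report it loudly.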
	 */
	if (bio->bi_status)
		pr_err("md: %pg flush io error %d\n", bio->bi_bdev,
		       blk_status_to_errno(bio->bi_status));

	bio_put(bio);
	bio_endio(parent);
}

bool md_flush_request(struct mddev *mddev, struct bio *bio)
{
	struct md_rdev *rdev;
	struct bio *new;

	/*
	 * md_flush_request() should be called under md_handle_request() and
	 * 'active_io' is already grabbed. Hence it's safe to get rdev directly
	 * without rcu protection.
	 */
	WARN_ON(percpu_ref_is_zero(&mddev->active_io));

	rdev_for_each(rdev, mddev) {
		if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
			continue;

		new = bio_alloc_bioset(rdev->bdev, 0,
				       REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO,
				       &mddev->bio_set);
		new->bi_private = bio;
		new->bi_end_io = md_end_flush;
		bio_inc_remaining(bio);
		submit_bio(new);
	}

	if (bio_sectors(bio) == 0) {
		bio_endio(bio);
		return true;
	}

	bio->bi_opf &= ~REQ_PREFLUSH;
	return false;
}
EXPORT_SYMBOL(md_flush_request);

static inline struct mddev *mddev_get(struct mddev *mddev)
{
	lockdep_assert_held(&all_mddevs_lock);

	if (test_bit(MD_DELETED, &mddev->flags))
		return NULL;
	atomic_inc(&mddev->active);
	return mddev;
}

static void mddev_delayed_delete(struct work_struct *ws);

static void __mddev_put(struct mddev *mddev)
{
	if (mddev->raid_disks || !list_empty(&mddev->disks) ||
	    mddev->ctime || mddev->hold_active)
		return;

	/* Array is not configured at all, and not held active, so destroy it */
	set_bit(MD_DELETED, &mddev->flags);

	/*
	 * Call queue_work inside the spinlock so that flush_workqueue() after
	 * mddev_find will succeed in waiting for the work to be done.
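	 * Both MD_DELETED and the queued del_work become visible while
	 * all_mddevs_lock is held, so anyone taking the lock afterwards
	 * either sees the mddev as deleted (mddev_get() returns NULL) or can
	 * flush md_misc_wq knowing the delayed delete is already queued.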
628 */ 629 queue_work(md_misc_wq, &mddev->del_work); 630 } 631 632 void mddev_put(struct mddev *mddev) 633 { 634 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 635 return; 636 637 __mddev_put(mddev); 638 spin_unlock(&all_mddevs_lock); 639 } 640 641 static void md_safemode_timeout(struct timer_list *t); 642 static void md_start_sync(struct work_struct *ws); 643 644 static void active_io_release(struct percpu_ref *ref) 645 { 646 struct mddev *mddev = container_of(ref, struct mddev, active_io); 647 648 wake_up(&mddev->sb_wait); 649 } 650 651 static void no_op(struct percpu_ref *r) {} 652 653 int mddev_init(struct mddev *mddev) 654 { 655 656 if (percpu_ref_init(&mddev->active_io, active_io_release, 657 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) 658 return -ENOMEM; 659 660 if (percpu_ref_init(&mddev->writes_pending, no_op, 661 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { 662 percpu_ref_exit(&mddev->active_io); 663 return -ENOMEM; 664 } 665 666 /* We want to start with the refcount at zero */ 667 percpu_ref_put(&mddev->writes_pending); 668 669 mutex_init(&mddev->open_mutex); 670 mutex_init(&mddev->reconfig_mutex); 671 mutex_init(&mddev->suspend_mutex); 672 mutex_init(&mddev->bitmap_info.mutex); 673 INIT_LIST_HEAD(&mddev->disks); 674 INIT_LIST_HEAD(&mddev->all_mddevs); 675 INIT_LIST_HEAD(&mddev->deleting); 676 timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); 677 atomic_set(&mddev->active, 1); 678 atomic_set(&mddev->openers, 0); 679 atomic_set(&mddev->sync_seq, 0); 680 spin_lock_init(&mddev->lock); 681 init_waitqueue_head(&mddev->sb_wait); 682 init_waitqueue_head(&mddev->recovery_wait); 683 mddev->reshape_position = MaxSector; 684 mddev->reshape_backwards = 0; 685 mddev->last_sync_action = ACTION_IDLE; 686 mddev->resync_min = 0; 687 mddev->resync_max = MaxSector; 688 mddev->level = LEVEL_NONE; 689 mddev_set_bitmap_ops(mddev); 690 691 INIT_WORK(&mddev->sync_work, md_start_sync); 692 INIT_WORK(&mddev->del_work, mddev_delayed_delete); 693 694 return 0; 695 } 696 EXPORT_SYMBOL_GPL(mddev_init); 697 698 void mddev_destroy(struct mddev *mddev) 699 { 700 percpu_ref_exit(&mddev->active_io); 701 percpu_ref_exit(&mddev->writes_pending); 702 } 703 EXPORT_SYMBOL_GPL(mddev_destroy); 704 705 static struct mddev *mddev_find_locked(dev_t unit) 706 { 707 struct mddev *mddev; 708 709 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 710 if (mddev->unit == unit) 711 return mddev; 712 713 return NULL; 714 } 715 716 /* find an unused unit number */ 717 static dev_t mddev_alloc_unit(void) 718 { 719 static int next_minor = 512; 720 int start = next_minor; 721 bool is_free = 0; 722 dev_t dev = 0; 723 724 while (!is_free) { 725 dev = MKDEV(MD_MAJOR, next_minor); 726 next_minor++; 727 if (next_minor > MINORMASK) 728 next_minor = 0; 729 if (next_minor == start) 730 return 0; /* Oh dear, all in use. 
*/ 731 is_free = !mddev_find_locked(dev); 732 } 733 734 return dev; 735 } 736 737 static struct mddev *mddev_alloc(dev_t unit) 738 { 739 struct mddev *new; 740 int error; 741 742 if (unit && MAJOR(unit) != MD_MAJOR) 743 unit &= ~((1 << MdpMinorShift) - 1); 744 745 new = kzalloc(sizeof(*new), GFP_KERNEL); 746 if (!new) 747 return ERR_PTR(-ENOMEM); 748 749 error = mddev_init(new); 750 if (error) 751 goto out_free_new; 752 753 spin_lock(&all_mddevs_lock); 754 if (unit) { 755 error = -EEXIST; 756 if (mddev_find_locked(unit)) 757 goto out_destroy_new; 758 new->unit = unit; 759 if (MAJOR(unit) == MD_MAJOR) 760 new->md_minor = MINOR(unit); 761 else 762 new->md_minor = MINOR(unit) >> MdpMinorShift; 763 new->hold_active = UNTIL_IOCTL; 764 } else { 765 error = -ENODEV; 766 new->unit = mddev_alloc_unit(); 767 if (!new->unit) 768 goto out_destroy_new; 769 new->md_minor = MINOR(new->unit); 770 new->hold_active = UNTIL_STOP; 771 } 772 773 list_add(&new->all_mddevs, &all_mddevs); 774 spin_unlock(&all_mddevs_lock); 775 return new; 776 777 out_destroy_new: 778 spin_unlock(&all_mddevs_lock); 779 mddev_destroy(new); 780 out_free_new: 781 kfree(new); 782 return ERR_PTR(error); 783 } 784 785 static void mddev_free(struct mddev *mddev) 786 { 787 spin_lock(&all_mddevs_lock); 788 list_del(&mddev->all_mddevs); 789 spin_unlock(&all_mddevs_lock); 790 791 mddev_destroy(mddev); 792 kfree(mddev); 793 } 794 795 static const struct attribute_group md_redundancy_group; 796 797 void mddev_unlock(struct mddev *mddev) 798 { 799 struct md_rdev *rdev; 800 struct md_rdev *tmp; 801 LIST_HEAD(delete); 802 803 if (!list_empty(&mddev->deleting)) 804 list_splice_init(&mddev->deleting, &delete); 805 806 if (mddev->to_remove) { 807 /* These cannot be removed under reconfig_mutex as 808 * an access to the files will try to take reconfig_mutex 809 * while holding the file unremovable, which leads to 810 * a deadlock. 811 * So hold set sysfs_active while the remove in happeing, 812 * and anything else which might set ->to_remove or my 813 * otherwise change the sysfs namespace will fail with 814 * -EBUSY if sysfs_active is still set. 815 * We set sysfs_active under reconfig_mutex and elsewhere 816 * test it under the same mutex to ensure its correct value 817 * is seen. 
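 * (The underlying problem: sysfs attribute handlers for these groups take
 * reconfig_mutex, while sysfs_remove_group() waits for running handlers to
 * drain, so removing the groups while still holding reconfig_mutex could
 * deadlock. The groups are therefore removed only after the mutex is
 * dropped, with sysfs_active acting as the guard in the meantime.)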
818 */ 819 const struct attribute_group *to_remove = mddev->to_remove; 820 mddev->to_remove = NULL; 821 mddev->sysfs_active = 1; 822 mutex_unlock(&mddev->reconfig_mutex); 823 824 if (mddev->kobj.sd) { 825 if (to_remove != &md_redundancy_group) 826 sysfs_remove_group(&mddev->kobj, to_remove); 827 if (mddev->pers == NULL || 828 mddev->pers->sync_request == NULL) { 829 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 830 if (mddev->sysfs_action) 831 sysfs_put(mddev->sysfs_action); 832 if (mddev->sysfs_completed) 833 sysfs_put(mddev->sysfs_completed); 834 if (mddev->sysfs_degraded) 835 sysfs_put(mddev->sysfs_degraded); 836 mddev->sysfs_action = NULL; 837 mddev->sysfs_completed = NULL; 838 mddev->sysfs_degraded = NULL; 839 } 840 } 841 mddev->sysfs_active = 0; 842 } else 843 mutex_unlock(&mddev->reconfig_mutex); 844 845 md_wakeup_thread(mddev->thread); 846 wake_up(&mddev->sb_wait); 847 848 list_for_each_entry_safe(rdev, tmp, &delete, same_set) { 849 list_del_init(&rdev->same_set); 850 kobject_del(&rdev->kobj); 851 export_rdev(rdev, mddev); 852 } 853 } 854 EXPORT_SYMBOL_GPL(mddev_unlock); 855 856 struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr) 857 { 858 struct md_rdev *rdev; 859 860 rdev_for_each_rcu(rdev, mddev) 861 if (rdev->desc_nr == nr) 862 return rdev; 863 864 return NULL; 865 } 866 EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu); 867 868 static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) 869 { 870 struct md_rdev *rdev; 871 872 rdev_for_each(rdev, mddev) 873 if (rdev->bdev->bd_dev == dev) 874 return rdev; 875 876 return NULL; 877 } 878 879 struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev) 880 { 881 struct md_rdev *rdev; 882 883 rdev_for_each_rcu(rdev, mddev) 884 if (rdev->bdev->bd_dev == dev) 885 return rdev; 886 887 return NULL; 888 } 889 EXPORT_SYMBOL_GPL(md_find_rdev_rcu); 890 891 static struct md_personality *find_pers(int level, char *clevel) 892 { 893 struct md_personality *pers; 894 list_for_each_entry(pers, &pers_list, list) { 895 if (level != LEVEL_NONE && pers->level == level) 896 return pers; 897 if (strcmp(pers->name, clevel)==0) 898 return pers; 899 } 900 return NULL; 901 } 902 903 /* return the offset of the super block in 512byte sectors */ 904 static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) 905 { 906 return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev)); 907 } 908 909 static int alloc_disk_sb(struct md_rdev *rdev) 910 { 911 rdev->sb_page = alloc_page(GFP_KERNEL); 912 if (!rdev->sb_page) 913 return -ENOMEM; 914 return 0; 915 } 916 917 void md_rdev_clear(struct md_rdev *rdev) 918 { 919 if (rdev->sb_page) { 920 put_page(rdev->sb_page); 921 rdev->sb_loaded = 0; 922 rdev->sb_page = NULL; 923 rdev->sb_start = 0; 924 rdev->sectors = 0; 925 } 926 if (rdev->bb_page) { 927 put_page(rdev->bb_page); 928 rdev->bb_page = NULL; 929 } 930 badblocks_exit(&rdev->badblocks); 931 } 932 EXPORT_SYMBOL_GPL(md_rdev_clear); 933 934 static void super_written(struct bio *bio) 935 { 936 struct md_rdev *rdev = bio->bi_private; 937 struct mddev *mddev = rdev->mddev; 938 939 if (bio->bi_status) { 940 pr_err("md: %s gets error=%d\n", __func__, 941 blk_status_to_errno(bio->bi_status)); 942 md_error(mddev, rdev); 943 if (!test_bit(Faulty, &rdev->flags) 944 && (bio->bi_opf & MD_FAILFAST)) { 945 set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags); 946 set_bit(LastDev, &rdev->flags); 947 } 948 } else 949 clear_bit(LastDev, &rdev->flags); 950 951 bio_put(bio); 952 953 rdev_dec_pending(rdev, mddev); 954 955 if (atomic_dec_and_test(&mddev->pending_writes)) 956 
wake_up(&mddev->sb_wait); 957 } 958 959 void md_super_write(struct mddev *mddev, struct md_rdev *rdev, 960 sector_t sector, int size, struct page *page) 961 { 962 /* write first size bytes of page to sector of rdev 963 * Increment mddev->pending_writes before returning 964 * and decrement it on completion, waking up sb_wait 965 * if zero is reached. 966 * If an error occurred, call md_error 967 */ 968 struct bio *bio; 969 970 if (!page) 971 return; 972 973 if (test_bit(Faulty, &rdev->flags)) 974 return; 975 976 bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev, 977 1, 978 REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_META 979 | REQ_PREFLUSH | REQ_FUA, 980 GFP_NOIO, &mddev->sync_set); 981 982 atomic_inc(&rdev->nr_pending); 983 984 bio->bi_iter.bi_sector = sector; 985 __bio_add_page(bio, page, size, 0); 986 bio->bi_private = rdev; 987 bio->bi_end_io = super_written; 988 989 if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) && 990 test_bit(FailFast, &rdev->flags) && 991 !test_bit(LastDev, &rdev->flags)) 992 bio->bi_opf |= MD_FAILFAST; 993 994 atomic_inc(&mddev->pending_writes); 995 submit_bio(bio); 996 } 997 998 int md_super_wait(struct mddev *mddev) 999 { 1000 /* wait for all superblock writes that were scheduled to complete */ 1001 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); 1002 if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags)) 1003 return -EAGAIN; 1004 return 0; 1005 } 1006 1007 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, 1008 struct page *page, blk_opf_t opf, bool metadata_op) 1009 { 1010 struct bio bio; 1011 struct bio_vec bvec; 1012 1013 if (metadata_op && rdev->meta_bdev) 1014 bio_init(&bio, rdev->meta_bdev, &bvec, 1, opf); 1015 else 1016 bio_init(&bio, rdev->bdev, &bvec, 1, opf); 1017 1018 if (metadata_op) 1019 bio.bi_iter.bi_sector = sector + rdev->sb_start; 1020 else if (rdev->mddev->reshape_position != MaxSector && 1021 (rdev->mddev->reshape_backwards == 1022 (sector >= rdev->mddev->reshape_position))) 1023 bio.bi_iter.bi_sector = sector + rdev->new_data_offset; 1024 else 1025 bio.bi_iter.bi_sector = sector + rdev->data_offset; 1026 __bio_add_page(&bio, page, size, 0); 1027 1028 submit_bio_wait(&bio); 1029 1030 return !bio.bi_status; 1031 } 1032 EXPORT_SYMBOL_GPL(sync_page_io); 1033 1034 static int read_disk_sb(struct md_rdev *rdev, int size) 1035 { 1036 if (rdev->sb_loaded) 1037 return 0; 1038 1039 if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true)) 1040 goto fail; 1041 rdev->sb_loaded = 1; 1042 return 0; 1043 1044 fail: 1045 pr_err("md: disabled device %pg, could not read superblock.\n", 1046 rdev->bdev); 1047 return -EINVAL; 1048 } 1049 1050 static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 1051 { 1052 return sb1->set_uuid0 == sb2->set_uuid0 && 1053 sb1->set_uuid1 == sb2->set_uuid1 && 1054 sb1->set_uuid2 == sb2->set_uuid2 && 1055 sb1->set_uuid3 == sb2->set_uuid3; 1056 } 1057 1058 static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 1059 { 1060 int ret; 1061 mdp_super_t *tmp1, *tmp2; 1062 1063 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 1064 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 1065 1066 if (!tmp1 || !tmp2) { 1067 ret = 0; 1068 goto abort; 1069 } 1070 1071 *tmp1 = *sb1; 1072 *tmp2 = *sb2; 1073 1074 /* 1075 * nr_disks is not constant 1076 */ 1077 tmp1->nr_disks = 0; 1078 tmp2->nr_disks = 0; 1079 1080 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); 1081 abort: 1082 kfree(tmp1); 1083 kfree(tmp2); 1084 return ret; 1085 } 1086 1087 static u32 
md_csum_fold(u32 csum) 1088 { 1089 csum = (csum & 0xffff) + (csum >> 16); 1090 return (csum & 0xffff) + (csum >> 16); 1091 } 1092 1093 static unsigned int calc_sb_csum(mdp_super_t *sb) 1094 { 1095 u64 newcsum = 0; 1096 u32 *sb32 = (u32*)sb; 1097 int i; 1098 unsigned int disk_csum, csum; 1099 1100 disk_csum = sb->sb_csum; 1101 sb->sb_csum = 0; 1102 1103 for (i = 0; i < MD_SB_BYTES/4 ; i++) 1104 newcsum += sb32[i]; 1105 csum = (newcsum & 0xffffffff) + (newcsum>>32); 1106 1107 #ifdef CONFIG_ALPHA 1108 /* This used to use csum_partial, which was wrong for several 1109 * reasons including that different results are returned on 1110 * different architectures. It isn't critical that we get exactly 1111 * the same return value as before (we always csum_fold before 1112 * testing, and that removes any differences). However as we 1113 * know that csum_partial always returned a 16bit value on 1114 * alphas, do a fold to maximise conformity to previous behaviour. 1115 */ 1116 sb->sb_csum = md_csum_fold(disk_csum); 1117 #else 1118 sb->sb_csum = disk_csum; 1119 #endif 1120 return csum; 1121 } 1122 1123 /* 1124 * Handle superblock details. 1125 * We want to be able to handle multiple superblock formats 1126 * so we have a common interface to them all, and an array of 1127 * different handlers. 1128 * We rely on user-space to write the initial superblock, and support 1129 * reading and updating of superblocks. 1130 * Interface methods are: 1131 * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) 1132 * loads and validates a superblock on dev. 1133 * if refdev != NULL, compare superblocks on both devices 1134 * Return: 1135 * 0 - dev has a superblock that is compatible with refdev 1136 * 1 - dev has a superblock that is compatible and newer than refdev 1137 * so dev should be used as the refdev in future 1138 * -EINVAL superblock incompatible or invalid 1139 * -othererror e.g. -EIO 1140 * 1141 * int validate_super(struct mddev *mddev, struct md_rdev *dev) 1142 * Verify that dev is acceptable into mddev. 1143 * The first time, mddev->raid_disks will be 0, and data from 1144 * dev should be merged in. Subsequent calls check that dev 1145 * is new enough. Return 0 or -EINVAL 1146 * 1147 * void sync_super(struct mddev *mddev, struct md_rdev *dev) 1148 * Update the superblock for rdev with data in mddev 1149 * This does not write to disc. 1150 * 1151 */ 1152 1153 struct super_type { 1154 char *name; 1155 struct module *owner; 1156 int (*load_super)(struct md_rdev *rdev, 1157 struct md_rdev *refdev, 1158 int minor_version); 1159 int (*validate_super)(struct mddev *mddev, 1160 struct md_rdev *freshest, 1161 struct md_rdev *rdev); 1162 void (*sync_super)(struct mddev *mddev, 1163 struct md_rdev *rdev); 1164 unsigned long long (*rdev_size_change)(struct md_rdev *rdev, 1165 sector_t num_sectors); 1166 int (*allow_new_offset)(struct md_rdev *rdev, 1167 unsigned long long new_offset); 1168 }; 1169 1170 /* 1171 * Check that the given mddev has no bitmap. 1172 * 1173 * This function is called from the run method of all personalities that do not 1174 * support bitmaps. It prints an error message and returns non-zero if mddev 1175 * has a bitmap. Otherwise, it returns 0. 
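 *
 * Typical use from a personality's run() method (illustrative sketch):
 *
 *	if (md_check_no_bitmap(mddev))
 *		return -EINVAL;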
1176 * 1177 */ 1178 int md_check_no_bitmap(struct mddev *mddev) 1179 { 1180 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) 1181 return 0; 1182 pr_warn("%s: bitmaps are not supported for %s\n", 1183 mdname(mddev), mddev->pers->name); 1184 return 1; 1185 } 1186 EXPORT_SYMBOL(md_check_no_bitmap); 1187 1188 /* 1189 * load_super for 0.90.0 1190 */ 1191 static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 1192 { 1193 mdp_super_t *sb; 1194 int ret; 1195 bool spare_disk = true; 1196 1197 /* 1198 * Calculate the position of the superblock (512byte sectors), 1199 * it's at the end of the disk. 1200 * 1201 * It also happens to be a multiple of 4Kb. 1202 */ 1203 rdev->sb_start = calc_dev_sboffset(rdev); 1204 1205 ret = read_disk_sb(rdev, MD_SB_BYTES); 1206 if (ret) 1207 return ret; 1208 1209 ret = -EINVAL; 1210 1211 sb = page_address(rdev->sb_page); 1212 1213 if (sb->md_magic != MD_SB_MAGIC) { 1214 pr_warn("md: invalid raid superblock magic on %pg\n", 1215 rdev->bdev); 1216 goto abort; 1217 } 1218 1219 if (sb->major_version != 0 || 1220 sb->minor_version < 90 || 1221 sb->minor_version > 91) { 1222 pr_warn("Bad version number %d.%d on %pg\n", 1223 sb->major_version, sb->minor_version, rdev->bdev); 1224 goto abort; 1225 } 1226 1227 if (sb->raid_disks <= 0) 1228 goto abort; 1229 1230 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { 1231 pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev); 1232 goto abort; 1233 } 1234 1235 rdev->preferred_minor = sb->md_minor; 1236 rdev->data_offset = 0; 1237 rdev->new_data_offset = 0; 1238 rdev->sb_size = MD_SB_BYTES; 1239 rdev->badblocks.shift = -1; 1240 1241 rdev->desc_nr = sb->this_disk.number; 1242 1243 /* not spare disk */ 1244 if (rdev->desc_nr >= 0 && rdev->desc_nr < MD_SB_DISKS && 1245 sb->disks[rdev->desc_nr].state & ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) 1246 spare_disk = false; 1247 1248 if (!refdev) { 1249 if (!spare_disk) 1250 ret = 1; 1251 else 1252 ret = 0; 1253 } else { 1254 __u64 ev1, ev2; 1255 mdp_super_t *refsb = page_address(refdev->sb_page); 1256 if (!md_uuid_equal(refsb, sb)) { 1257 pr_warn("md: %pg has different UUID to %pg\n", 1258 rdev->bdev, refdev->bdev); 1259 goto abort; 1260 } 1261 if (!md_sb_equal(refsb, sb)) { 1262 pr_warn("md: %pg has same UUID but different superblock to %pg\n", 1263 rdev->bdev, refdev->bdev); 1264 goto abort; 1265 } 1266 ev1 = md_event(sb); 1267 ev2 = md_event(refsb); 1268 1269 if (!spare_disk && ev1 > ev2) 1270 ret = 1; 1271 else 1272 ret = 0; 1273 } 1274 rdev->sectors = rdev->sb_start; 1275 /* Limit to 4TB as metadata cannot record more than that. 1276 * (not needed for Linear and RAID0 as metadata doesn't 1277 * record this size) 1278 */ 1279 if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1) 1280 rdev->sectors = (sector_t)(2ULL << 32) - 2; 1281 1282 if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) 1283 /* "this cannot possibly happen" ... 
*/ 1284 ret = -EINVAL; 1285 1286 abort: 1287 return ret; 1288 } 1289 1290 static u64 md_bitmap_events_cleared(struct mddev *mddev) 1291 { 1292 struct md_bitmap_stats stats; 1293 int err; 1294 1295 err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 1296 if (err) 1297 return 0; 1298 1299 return stats.events_cleared; 1300 } 1301 1302 /* 1303 * validate_super for 0.90.0 1304 * note: we are not using "freshest" for 0.9 superblock 1305 */ 1306 static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) 1307 { 1308 mdp_disk_t *desc; 1309 mdp_super_t *sb = page_address(rdev->sb_page); 1310 __u64 ev1 = md_event(sb); 1311 1312 rdev->raid_disk = -1; 1313 clear_bit(Faulty, &rdev->flags); 1314 clear_bit(In_sync, &rdev->flags); 1315 clear_bit(Bitmap_sync, &rdev->flags); 1316 clear_bit(WriteMostly, &rdev->flags); 1317 1318 if (mddev->raid_disks == 0) { 1319 mddev->major_version = 0; 1320 mddev->minor_version = sb->minor_version; 1321 mddev->patch_version = sb->patch_version; 1322 mddev->external = 0; 1323 mddev->chunk_sectors = sb->chunk_size >> 9; 1324 mddev->ctime = sb->ctime; 1325 mddev->utime = sb->utime; 1326 mddev->level = sb->level; 1327 mddev->clevel[0] = 0; 1328 mddev->layout = sb->layout; 1329 mddev->raid_disks = sb->raid_disks; 1330 mddev->dev_sectors = ((sector_t)sb->size) * 2; 1331 mddev->events = ev1; 1332 mddev->bitmap_info.offset = 0; 1333 mddev->bitmap_info.space = 0; 1334 /* bitmap can use 60 K after the 4K superblocks */ 1335 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 1336 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 1337 mddev->reshape_backwards = 0; 1338 1339 if (mddev->minor_version >= 91) { 1340 mddev->reshape_position = sb->reshape_position; 1341 mddev->delta_disks = sb->delta_disks; 1342 mddev->new_level = sb->new_level; 1343 mddev->new_layout = sb->new_layout; 1344 mddev->new_chunk_sectors = sb->new_chunk >> 9; 1345 if (mddev->delta_disks < 0) 1346 mddev->reshape_backwards = 1; 1347 } else { 1348 mddev->reshape_position = MaxSector; 1349 mddev->delta_disks = 0; 1350 mddev->new_level = mddev->level; 1351 mddev->new_layout = mddev->layout; 1352 mddev->new_chunk_sectors = mddev->chunk_sectors; 1353 } 1354 if (mddev->level == 0) 1355 mddev->layout = -1; 1356 1357 if (sb->state & (1<<MD_SB_CLEAN)) 1358 mddev->recovery_cp = MaxSector; 1359 else { 1360 if (sb->events_hi == sb->cp_events_hi && 1361 sb->events_lo == sb->cp_events_lo) { 1362 mddev->recovery_cp = sb->recovery_cp; 1363 } else 1364 mddev->recovery_cp = 0; 1365 } 1366 1367 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 1368 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 1369 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 1370 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 1371 1372 mddev->max_disks = MD_SB_DISKS; 1373 1374 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 1375 mddev->bitmap_info.file == NULL) { 1376 mddev->bitmap_info.offset = 1377 mddev->bitmap_info.default_offset; 1378 mddev->bitmap_info.space = 1379 mddev->bitmap_info.default_space; 1380 } 1381 1382 } else if (mddev->pers == NULL) { 1383 /* Insist on good event counter while assembling, except 1384 * for spares (which don't need an event count) */ 1385 ++ev1; 1386 if (sb->disks[rdev->desc_nr].state & ( 1387 (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) 1388 if (ev1 < mddev->events) 1389 return -EINVAL; 1390 } else if (mddev->bitmap) { 1391 /* if adding to array with a bitmap, then we can accept an 1392 * older device ... but not too old. 
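 * "Too old" means the device's event count is below the bitmap's
 * events_cleared: the bitmap then no longer records everything the device
 * missed, so it cannot be caught up with a bitmap-based (Bitmap_sync)
 * resync and is simply treated as a fresh device instead.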
1393 */ 1394 if (ev1 < md_bitmap_events_cleared(mddev)) 1395 return 0; 1396 if (ev1 < mddev->events) 1397 set_bit(Bitmap_sync, &rdev->flags); 1398 } else { 1399 if (ev1 < mddev->events) 1400 /* just a hot-add of a new device, leave raid_disk at -1 */ 1401 return 0; 1402 } 1403 1404 desc = sb->disks + rdev->desc_nr; 1405 1406 if (desc->state & (1<<MD_DISK_FAULTY)) 1407 set_bit(Faulty, &rdev->flags); 1408 else if (desc->state & (1<<MD_DISK_SYNC)) { 1409 set_bit(In_sync, &rdev->flags); 1410 rdev->raid_disk = desc->raid_disk; 1411 rdev->saved_raid_disk = desc->raid_disk; 1412 } else if (desc->state & (1<<MD_DISK_ACTIVE)) { 1413 /* active but not in sync implies recovery up to 1414 * reshape position. We don't know exactly where 1415 * that is, so set to zero for now 1416 */ 1417 if (mddev->minor_version >= 91) { 1418 rdev->recovery_offset = 0; 1419 rdev->raid_disk = desc->raid_disk; 1420 } 1421 } 1422 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 1423 set_bit(WriteMostly, &rdev->flags); 1424 if (desc->state & (1<<MD_DISK_FAILFAST)) 1425 set_bit(FailFast, &rdev->flags); 1426 return 0; 1427 } 1428 1429 /* 1430 * sync_super for 0.90.0 1431 */ 1432 static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) 1433 { 1434 mdp_super_t *sb; 1435 struct md_rdev *rdev2; 1436 int next_spare = mddev->raid_disks; 1437 1438 /* make rdev->sb match mddev data.. 1439 * 1440 * 1/ zero out disks 1441 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 1442 * 3/ any empty disks < next_spare become removed 1443 * 1444 * disks[0] gets initialised to REMOVED because 1445 * we cannot be sure from other fields if it has 1446 * been initialised or not. 1447 */ 1448 int i; 1449 int active=0, working=0,failed=0,spare=0,nr_disks=0; 1450 1451 rdev->sb_size = MD_SB_BYTES; 1452 1453 sb = page_address(rdev->sb_page); 1454 1455 memset(sb, 0, sizeof(*sb)); 1456 1457 sb->md_magic = MD_SB_MAGIC; 1458 sb->major_version = mddev->major_version; 1459 sb->patch_version = mddev->patch_version; 1460 sb->gvalid_words = 0; /* ignored */ 1461 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 1462 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 1463 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 1464 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 1465 1466 sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); 1467 sb->level = mddev->level; 1468 sb->size = mddev->dev_sectors / 2; 1469 sb->raid_disks = mddev->raid_disks; 1470 sb->md_minor = mddev->md_minor; 1471 sb->not_persistent = 0; 1472 sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 1473 sb->state = 0; 1474 sb->events_hi = (mddev->events>>32); 1475 sb->events_lo = (u32)mddev->events; 1476 1477 if (mddev->reshape_position == MaxSector) 1478 sb->minor_version = 90; 1479 else { 1480 sb->minor_version = 91; 1481 sb->reshape_position = mddev->reshape_position; 1482 sb->new_level = mddev->new_level; 1483 sb->delta_disks = mddev->delta_disks; 1484 sb->new_layout = mddev->new_layout; 1485 sb->new_chunk = mddev->new_chunk_sectors << 9; 1486 } 1487 mddev->minor_version = sb->minor_version; 1488 if (mddev->in_sync) 1489 { 1490 sb->recovery_cp = mddev->recovery_cp; 1491 sb->cp_events_hi = (mddev->events>>32); 1492 sb->cp_events_lo = (u32)mddev->events; 1493 if (mddev->recovery_cp == MaxSector) 1494 sb->state = (1<< MD_SB_CLEAN); 1495 } else 1496 sb->recovery_cp = 0; 1497 1498 sb->layout = mddev->layout; 1499 sb->chunk_size = mddev->chunk_sectors << 9; 1500 1501 if (mddev->bitmap && mddev->bitmap_info.file == NULL) 1502 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 1503 1504 
sb->disks[0].state = (1<<MD_DISK_REMOVED); 1505 rdev_for_each(rdev2, mddev) { 1506 mdp_disk_t *d; 1507 int desc_nr; 1508 int is_active = test_bit(In_sync, &rdev2->flags); 1509 1510 if (rdev2->raid_disk >= 0 && 1511 sb->minor_version >= 91) 1512 /* we have nowhere to store the recovery_offset, 1513 * but if it is not below the reshape_position, 1514 * we can piggy-back on that. 1515 */ 1516 is_active = 1; 1517 if (rdev2->raid_disk < 0 || 1518 test_bit(Faulty, &rdev2->flags)) 1519 is_active = 0; 1520 if (is_active) 1521 desc_nr = rdev2->raid_disk; 1522 else 1523 desc_nr = next_spare++; 1524 rdev2->desc_nr = desc_nr; 1525 d = &sb->disks[rdev2->desc_nr]; 1526 nr_disks++; 1527 d->number = rdev2->desc_nr; 1528 d->major = MAJOR(rdev2->bdev->bd_dev); 1529 d->minor = MINOR(rdev2->bdev->bd_dev); 1530 if (is_active) 1531 d->raid_disk = rdev2->raid_disk; 1532 else 1533 d->raid_disk = rdev2->desc_nr; /* compatibility */ 1534 if (test_bit(Faulty, &rdev2->flags)) 1535 d->state = (1<<MD_DISK_FAULTY); 1536 else if (is_active) { 1537 d->state = (1<<MD_DISK_ACTIVE); 1538 if (test_bit(In_sync, &rdev2->flags)) 1539 d->state |= (1<<MD_DISK_SYNC); 1540 active++; 1541 working++; 1542 } else { 1543 d->state = 0; 1544 spare++; 1545 working++; 1546 } 1547 if (test_bit(WriteMostly, &rdev2->flags)) 1548 d->state |= (1<<MD_DISK_WRITEMOSTLY); 1549 if (test_bit(FailFast, &rdev2->flags)) 1550 d->state |= (1<<MD_DISK_FAILFAST); 1551 } 1552 /* now set the "removed" and "faulty" bits on any missing devices */ 1553 for (i=0 ; i < mddev->raid_disks ; i++) { 1554 mdp_disk_t *d = &sb->disks[i]; 1555 if (d->state == 0 && d->number == 0) { 1556 d->number = i; 1557 d->raid_disk = i; 1558 d->state = (1<<MD_DISK_REMOVED); 1559 d->state |= (1<<MD_DISK_FAULTY); 1560 failed++; 1561 } 1562 } 1563 sb->nr_disks = nr_disks; 1564 sb->active_disks = active; 1565 sb->working_disks = working; 1566 sb->failed_disks = failed; 1567 sb->spare_disks = spare; 1568 1569 sb->this_disk = sb->disks[rdev->desc_nr]; 1570 sb->sb_csum = calc_sb_csum(sb); 1571 } 1572 1573 /* 1574 * rdev_size_change for 0.90.0 1575 */ 1576 static unsigned long long 1577 super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 1578 { 1579 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1580 return 0; /* component must fit device */ 1581 if (rdev->mddev->bitmap_info.offset) 1582 return 0; /* can't move bitmap */ 1583 rdev->sb_start = calc_dev_sboffset(rdev); 1584 if (!num_sectors || num_sectors > rdev->sb_start) 1585 num_sectors = rdev->sb_start; 1586 /* Limit to 4TB as metadata cannot record more than that. 1587 * 4TB == 2^32 KB, or 2*2^32 sectors. 
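 * The cap below, (2ULL << 32) - 2 sectors, keeps the component size
 * representable in the superblock's 32-bit KiB "size" field:
 * 2^32 KiB == 2^33 512-byte sectors == 4TiB, and the code stays just
 * under that.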
1588 */ 1589 if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) 1590 num_sectors = (sector_t)(2ULL << 32) - 2; 1591 do { 1592 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1593 rdev->sb_page); 1594 } while (md_super_wait(rdev->mddev) < 0); 1595 return num_sectors; 1596 } 1597 1598 static int 1599 super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) 1600 { 1601 /* non-zero offset changes not possible with v0.90 */ 1602 return new_offset == 0; 1603 } 1604 1605 /* 1606 * version 1 superblock 1607 */ 1608 1609 static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb) 1610 { 1611 __le32 disk_csum; 1612 u32 csum; 1613 unsigned long long newcsum; 1614 int size = 256 + le32_to_cpu(sb->max_dev)*2; 1615 __le32 *isuper = (__le32*)sb; 1616 1617 disk_csum = sb->sb_csum; 1618 sb->sb_csum = 0; 1619 newcsum = 0; 1620 for (; size >= 4; size -= 4) 1621 newcsum += le32_to_cpu(*isuper++); 1622 1623 if (size == 2) 1624 newcsum += le16_to_cpu(*(__le16*) isuper); 1625 1626 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 1627 sb->sb_csum = disk_csum; 1628 return cpu_to_le32(csum); 1629 } 1630 1631 static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 1632 { 1633 struct mdp_superblock_1 *sb; 1634 int ret; 1635 sector_t sb_start; 1636 sector_t sectors; 1637 int bmask; 1638 bool spare_disk = true; 1639 1640 /* 1641 * Calculate the position of the superblock in 512byte sectors. 1642 * It is always aligned to a 4K boundary and 1643 * depeding on minor_version, it can be: 1644 * 0: At least 8K, but less than 12K, from end of device 1645 * 1: At start of device 1646 * 2: 4K from start of device. 1647 */ 1648 switch(minor_version) { 1649 case 0: 1650 sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2; 1651 sb_start &= ~(sector_t)(4*2-1); 1652 break; 1653 case 1: 1654 sb_start = 0; 1655 break; 1656 case 2: 1657 sb_start = 8; 1658 break; 1659 default: 1660 return -EINVAL; 1661 } 1662 rdev->sb_start = sb_start; 1663 1664 /* superblock is rarely larger than 1K, but it can be larger, 1665 * and it is safe to read 4k, so we do that 1666 */ 1667 ret = read_disk_sb(rdev, 4096); 1668 if (ret) return ret; 1669 1670 sb = page_address(rdev->sb_page); 1671 1672 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1673 sb->major_version != cpu_to_le32(1) || 1674 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1675 le64_to_cpu(sb->super_offset) != rdev->sb_start || 1676 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1677 return -EINVAL; 1678 1679 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1680 pr_warn("md: invalid superblock checksum on %pg\n", 1681 rdev->bdev); 1682 return -EINVAL; 1683 } 1684 if (le64_to_cpu(sb->data_size) < 10) { 1685 pr_warn("md: data_size too small on %pg\n", 1686 rdev->bdev); 1687 return -EINVAL; 1688 } 1689 if (sb->pad0 || 1690 sb->pad3[0] || 1691 memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) 1692 /* Some padding is non-zero, might be a new feature */ 1693 return -EINVAL; 1694 1695 rdev->preferred_minor = 0xffff; 1696 rdev->data_offset = le64_to_cpu(sb->data_offset); 1697 rdev->new_data_offset = rdev->data_offset; 1698 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && 1699 (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) 1700 rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset); 1701 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1702 1703 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1704 bmask = 
queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 1705 if (rdev->sb_size & bmask) 1706 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1707 1708 if (minor_version 1709 && rdev->data_offset < sb_start + (rdev->sb_size/512)) 1710 return -EINVAL; 1711 if (minor_version 1712 && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) 1713 return -EINVAL; 1714 1715 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1716 1717 if (!rdev->bb_page) { 1718 rdev->bb_page = alloc_page(GFP_KERNEL); 1719 if (!rdev->bb_page) 1720 return -ENOMEM; 1721 } 1722 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && 1723 rdev->badblocks.count == 0) { 1724 /* need to load the bad block list. 1725 * Currently we limit it to one page. 1726 */ 1727 s32 offset; 1728 sector_t bb_sector; 1729 __le64 *bbp; 1730 int i; 1731 int sectors = le16_to_cpu(sb->bblog_size); 1732 if (sectors > (PAGE_SIZE / 512)) 1733 return -EINVAL; 1734 offset = le32_to_cpu(sb->bblog_offset); 1735 if (offset == 0) 1736 return -EINVAL; 1737 bb_sector = (long long)offset; 1738 if (!sync_page_io(rdev, bb_sector, sectors << 9, 1739 rdev->bb_page, REQ_OP_READ, true)) 1740 return -EIO; 1741 bbp = (__le64 *)page_address(rdev->bb_page); 1742 rdev->badblocks.shift = sb->bblog_shift; 1743 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { 1744 u64 bb = le64_to_cpu(*bbp); 1745 int count = bb & (0x3ff); 1746 u64 sector = bb >> 10; 1747 sector <<= sb->bblog_shift; 1748 count <<= sb->bblog_shift; 1749 if (bb + 1 == 0) 1750 break; 1751 if (!badblocks_set(&rdev->badblocks, sector, count, 1)) 1752 return -EINVAL; 1753 } 1754 } else if (sb->bblog_offset != 0) 1755 rdev->badblocks.shift = 0; 1756 1757 if ((le32_to_cpu(sb->feature_map) & 1758 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) { 1759 rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset); 1760 rdev->ppl.size = le16_to_cpu(sb->ppl.size); 1761 rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset; 1762 } 1763 1764 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) && 1765 sb->level != 0) 1766 return -EINVAL; 1767 1768 /* not spare disk */ 1769 if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) && 1770 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || 1771 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) 1772 spare_disk = false; 1773 1774 if (!refdev) { 1775 if (!spare_disk) 1776 ret = 1; 1777 else 1778 ret = 0; 1779 } else { 1780 __u64 ev1, ev2; 1781 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); 1782 1783 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1784 sb->level != refsb->level || 1785 sb->layout != refsb->layout || 1786 sb->chunksize != refsb->chunksize) { 1787 pr_warn("md: %pg has strangely different superblock to %pg\n", 1788 rdev->bdev, 1789 refdev->bdev); 1790 return -EINVAL; 1791 } 1792 ev1 = le64_to_cpu(sb->events); 1793 ev2 = le64_to_cpu(refsb->events); 1794 1795 if (!spare_disk && ev1 > ev2) 1796 ret = 1; 1797 else 1798 ret = 0; 1799 } 1800 if (minor_version) 1801 sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; 1802 else 1803 sectors = rdev->sb_start; 1804 if (sectors < le64_to_cpu(sb->data_size)) 1805 return -EINVAL; 1806 rdev->sectors = le64_to_cpu(sb->data_size); 1807 return ret; 1808 } 1809 1810 static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) 1811 { 1812 struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 1813 __u64 ev1 = le64_to_cpu(sb->events); 1814 int role; 1815 1816 rdev->raid_disk = -1; 1817 clear_bit(Faulty, &rdev->flags); 1818 
clear_bit(In_sync, &rdev->flags); 1819 clear_bit(Bitmap_sync, &rdev->flags); 1820 clear_bit(WriteMostly, &rdev->flags); 1821 1822 if (mddev->raid_disks == 0) { 1823 mddev->major_version = 1; 1824 mddev->patch_version = 0; 1825 mddev->external = 0; 1826 mddev->chunk_sectors = le32_to_cpu(sb->chunksize); 1827 mddev->ctime = le64_to_cpu(sb->ctime); 1828 mddev->utime = le64_to_cpu(sb->utime); 1829 mddev->level = le32_to_cpu(sb->level); 1830 mddev->clevel[0] = 0; 1831 mddev->layout = le32_to_cpu(sb->layout); 1832 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1833 mddev->dev_sectors = le64_to_cpu(sb->size); 1834 mddev->events = ev1; 1835 mddev->bitmap_info.offset = 0; 1836 mddev->bitmap_info.space = 0; 1837 /* Default location for bitmap is 1K after superblock 1838 * using 3K - total of 4K 1839 */ 1840 mddev->bitmap_info.default_offset = 1024 >> 9; 1841 mddev->bitmap_info.default_space = (4096-1024) >> 9; 1842 mddev->reshape_backwards = 0; 1843 1844 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 1845 memcpy(mddev->uuid, sb->set_uuid, 16); 1846 1847 mddev->max_disks = (4096-256)/2; 1848 1849 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1850 mddev->bitmap_info.file == NULL) { 1851 mddev->bitmap_info.offset = 1852 (__s32)le32_to_cpu(sb->bitmap_offset); 1853 /* Metadata doesn't record how much space is available. 1854 * For 1.0, we assume we can use up to the superblock 1855 * if before, else to 4K beyond superblock. 1856 * For others, assume no change is possible. 1857 */ 1858 if (mddev->minor_version > 0) 1859 mddev->bitmap_info.space = 0; 1860 else if (mddev->bitmap_info.offset > 0) 1861 mddev->bitmap_info.space = 1862 8 - mddev->bitmap_info.offset; 1863 else 1864 mddev->bitmap_info.space = 1865 -mddev->bitmap_info.offset; 1866 } 1867 1868 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 1869 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 1870 mddev->delta_disks = le32_to_cpu(sb->delta_disks); 1871 mddev->new_level = le32_to_cpu(sb->new_level); 1872 mddev->new_layout = le32_to_cpu(sb->new_layout); 1873 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); 1874 if (mddev->delta_disks < 0 || 1875 (mddev->delta_disks == 0 && 1876 (le32_to_cpu(sb->feature_map) 1877 & MD_FEATURE_RESHAPE_BACKWARDS))) 1878 mddev->reshape_backwards = 1; 1879 } else { 1880 mddev->reshape_position = MaxSector; 1881 mddev->delta_disks = 0; 1882 mddev->new_level = mddev->level; 1883 mddev->new_layout = mddev->layout; 1884 mddev->new_chunk_sectors = mddev->chunk_sectors; 1885 } 1886 1887 if (mddev->level == 0 && 1888 !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT)) 1889 mddev->layout = -1; 1890 1891 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) 1892 set_bit(MD_HAS_JOURNAL, &mddev->flags); 1893 1894 if (le32_to_cpu(sb->feature_map) & 1895 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) { 1896 if (le32_to_cpu(sb->feature_map) & 1897 (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL)) 1898 return -EINVAL; 1899 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) && 1900 (le32_to_cpu(sb->feature_map) & 1901 MD_FEATURE_MULTIPLE_PPLS)) 1902 return -EINVAL; 1903 set_bit(MD_HAS_PPL, &mddev->flags); 1904 } 1905 } else if (mddev->pers == NULL) { 1906 /* Insist of good event counter while assembling, except for 1907 * spares (which don't need an event count). 1908 * Similar to mdadm, we allow event counter difference of 1 1909 * from the freshest device. 
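 * For example, if the freshest superblock records events == 100, a device
 * whose counter is 99 is still accepted (the test below only rejects when
 * ev1 + 1 < mddev->events), while a device at 98 or lower is refused.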
1910 */ 1911 if (rdev->desc_nr >= 0 && 1912 rdev->desc_nr < le32_to_cpu(sb->max_dev) && 1913 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || 1914 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) 1915 if (ev1 + 1 < mddev->events) 1916 return -EINVAL; 1917 } else if (mddev->bitmap) { 1918 /* If adding to array with a bitmap, then we can accept an 1919 * older device, but not too old. 1920 */ 1921 if (ev1 < md_bitmap_events_cleared(mddev)) 1922 return 0; 1923 if (ev1 < mddev->events) 1924 set_bit(Bitmap_sync, &rdev->flags); 1925 } else { 1926 if (ev1 < mddev->events) 1927 /* just a hot-add of a new device, leave raid_disk at -1 */ 1928 return 0; 1929 } 1930 1931 if (rdev->desc_nr < 0 || 1932 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { 1933 role = MD_DISK_ROLE_SPARE; 1934 rdev->desc_nr = -1; 1935 } else if (mddev->pers == NULL && freshest && ev1 < mddev->events) { 1936 /* 1937 * If we are assembling, and our event counter is smaller than the 1938 * highest event counter, we cannot trust our superblock about the role. 1939 * It could happen that our rdev was marked as Faulty, and all other 1940 * superblocks were updated with +1 event counter. 1941 * Then, before the next superblock update, which typically happens when 1942 * remove_and_add_spares() removes the device from the array, there was 1943 * a crash or reboot. 1944 * If we allow current rdev without consulting the freshest superblock, 1945 * we could cause data corruption. 1946 * Note that in this case our event counter is smaller by 1 than the 1947 * highest, otherwise, this rdev would not be allowed into array; 1948 * both kernel and mdadm allow event counter difference of 1. 1949 */ 1950 struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page); 1951 u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev); 1952 1953 if (rdev->desc_nr >= freshest_max_dev) { 1954 /* this is unexpected, better not proceed */ 1955 pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n", 1956 mdname(mddev), rdev->bdev, rdev->desc_nr, 1957 freshest->bdev, freshest_max_dev); 1958 return -EUCLEAN; 1959 } 1960 1961 role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]); 1962 pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n", 1963 mdname(mddev), rdev->bdev, role, role, freshest->bdev); 1964 } else { 1965 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1966 } 1967 switch (role) { 1968 case MD_DISK_ROLE_SPARE: /* spare */ 1969 break; 1970 case MD_DISK_ROLE_FAULTY: /* faulty */ 1971 set_bit(Faulty, &rdev->flags); 1972 break; 1973 case MD_DISK_ROLE_JOURNAL: /* journal device */ 1974 if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { 1975 /* journal device without journal feature */ 1976 pr_warn("md: journal device provided without journal feature, ignoring the device\n"); 1977 return -EINVAL; 1978 } 1979 set_bit(Journal, &rdev->flags); 1980 rdev->journal_tail = le64_to_cpu(sb->journal_tail); 1981 rdev->raid_disk = 0; 1982 break; 1983 default: 1984 rdev->saved_raid_disk = role; 1985 if ((le32_to_cpu(sb->feature_map) & 1986 MD_FEATURE_RECOVERY_OFFSET)) { 1987 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 1988 if (!(le32_to_cpu(sb->feature_map) & 1989 MD_FEATURE_RECOVERY_BITMAP)) 1990 rdev->saved_raid_disk = -1; 1991 } else { 1992 /* 1993 * If the array is FROZEN, then the device can't 1994 * be in_sync with rest of array. 
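			 * (MD_RECOVERY_FROZEN is set, for example, while the
			 * array's sync_action is "frozen")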
1995 */ 1996 if (!test_bit(MD_RECOVERY_FROZEN, 1997 &mddev->recovery)) 1998 set_bit(In_sync, &rdev->flags); 1999 } 2000 rdev->raid_disk = role; 2001 break; 2002 } 2003 if (sb->devflags & WriteMostly1) 2004 set_bit(WriteMostly, &rdev->flags); 2005 if (sb->devflags & FailFast1) 2006 set_bit(FailFast, &rdev->flags); 2007 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) 2008 set_bit(Replacement, &rdev->flags); 2009 2010 return 0; 2011 } 2012 2013 static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) 2014 { 2015 struct mdp_superblock_1 *sb; 2016 struct md_rdev *rdev2; 2017 int max_dev, i; 2018 /* make rdev->sb match mddev and rdev data. */ 2019 2020 sb = page_address(rdev->sb_page); 2021 2022 sb->feature_map = 0; 2023 sb->pad0 = 0; 2024 sb->recovery_offset = cpu_to_le64(0); 2025 memset(sb->pad3, 0, sizeof(sb->pad3)); 2026 2027 sb->utime = cpu_to_le64((__u64)mddev->utime); 2028 sb->events = cpu_to_le64(mddev->events); 2029 if (mddev->in_sync) 2030 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 2031 else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) 2032 sb->resync_offset = cpu_to_le64(MaxSector); 2033 else 2034 sb->resync_offset = cpu_to_le64(0); 2035 2036 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); 2037 2038 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 2039 sb->size = cpu_to_le64(mddev->dev_sectors); 2040 sb->chunksize = cpu_to_le32(mddev->chunk_sectors); 2041 sb->level = cpu_to_le32(mddev->level); 2042 sb->layout = cpu_to_le32(mddev->layout); 2043 if (test_bit(FailFast, &rdev->flags)) 2044 sb->devflags |= FailFast1; 2045 else 2046 sb->devflags &= ~FailFast1; 2047 2048 if (test_bit(WriteMostly, &rdev->flags)) 2049 sb->devflags |= WriteMostly1; 2050 else 2051 sb->devflags &= ~WriteMostly1; 2052 sb->data_offset = cpu_to_le64(rdev->data_offset); 2053 sb->data_size = cpu_to_le64(rdev->sectors); 2054 2055 if (mddev->bitmap && mddev->bitmap_info.file == NULL) { 2056 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); 2057 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 2058 } 2059 2060 if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && 2061 !test_bit(In_sync, &rdev->flags)) { 2062 sb->feature_map |= 2063 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 2064 sb->recovery_offset = 2065 cpu_to_le64(rdev->recovery_offset); 2066 if (rdev->saved_raid_disk >= 0 && mddev->bitmap) 2067 sb->feature_map |= 2068 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); 2069 } 2070 /* Note: recovery_offset and journal_tail share space */ 2071 if (test_bit(Journal, &rdev->flags)) 2072 sb->journal_tail = cpu_to_le64(rdev->journal_tail); 2073 if (test_bit(Replacement, &rdev->flags)) 2074 sb->feature_map |= 2075 cpu_to_le32(MD_FEATURE_REPLACEMENT); 2076 2077 if (mddev->reshape_position != MaxSector) { 2078 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 2079 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 2080 sb->new_layout = cpu_to_le32(mddev->new_layout); 2081 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 2082 sb->new_level = cpu_to_le32(mddev->new_level); 2083 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 2084 if (mddev->delta_disks == 0 && 2085 mddev->reshape_backwards) 2086 sb->feature_map 2087 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); 2088 if (rdev->new_data_offset != rdev->data_offset) { 2089 sb->feature_map 2090 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); 2091 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset 2092 - rdev->data_offset)); 2093 } 2094 } 2095 2096 if 
(mddev_is_clustered(mddev)) 2097 sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); 2098 2099 if (rdev->badblocks.count == 0) 2100 /* Nothing to do for bad blocks*/ ; 2101 else if (sb->bblog_offset == 0) 2102 /* Cannot record bad blocks on this device */ 2103 md_error(mddev, rdev); 2104 else { 2105 struct badblocks *bb = &rdev->badblocks; 2106 __le64 *bbp = (__le64 *)page_address(rdev->bb_page); 2107 u64 *p = bb->page; 2108 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); 2109 if (bb->changed) { 2110 unsigned seq; 2111 2112 retry: 2113 seq = read_seqbegin(&bb->lock); 2114 2115 memset(bbp, 0xff, PAGE_SIZE); 2116 2117 for (i = 0 ; i < bb->count ; i++) { 2118 u64 internal_bb = p[i]; 2119 u64 store_bb = ((BB_OFFSET(internal_bb) << 10) 2120 | BB_LEN(internal_bb)); 2121 bbp[i] = cpu_to_le64(store_bb); 2122 } 2123 bb->changed = 0; 2124 if (read_seqretry(&bb->lock, seq)) 2125 goto retry; 2126 2127 bb->sector = (rdev->sb_start + 2128 (int)le32_to_cpu(sb->bblog_offset)); 2129 bb->size = le16_to_cpu(sb->bblog_size); 2130 } 2131 } 2132 2133 max_dev = 0; 2134 rdev_for_each(rdev2, mddev) 2135 if (rdev2->desc_nr+1 > max_dev) 2136 max_dev = rdev2->desc_nr+1; 2137 2138 if (max_dev > le32_to_cpu(sb->max_dev)) { 2139 int bmask; 2140 sb->max_dev = cpu_to_le32(max_dev); 2141 rdev->sb_size = max_dev * 2 + 256; 2142 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 2143 if (rdev->sb_size & bmask) 2144 rdev->sb_size = (rdev->sb_size | bmask) + 1; 2145 } else 2146 max_dev = le32_to_cpu(sb->max_dev); 2147 2148 for (i=0; i<max_dev;i++) 2149 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 2150 2151 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) 2152 sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); 2153 2154 if (test_bit(MD_HAS_PPL, &mddev->flags)) { 2155 if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags)) 2156 sb->feature_map |= 2157 cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS); 2158 else 2159 sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); 2160 sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); 2161 sb->ppl.size = cpu_to_le16(rdev->ppl.size); 2162 } 2163 2164 rdev_for_each(rdev2, mddev) { 2165 i = rdev2->desc_nr; 2166 if (test_bit(Faulty, &rdev2->flags)) 2167 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); 2168 else if (test_bit(In_sync, &rdev2->flags)) 2169 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 2170 else if (test_bit(Journal, &rdev2->flags)) 2171 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); 2172 else if (rdev2->raid_disk >= 0) 2173 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 2174 else 2175 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 2176 } 2177 2178 sb->sb_csum = calc_sb_1_csum(sb); 2179 } 2180 2181 static sector_t super_1_choose_bm_space(sector_t dev_size) 2182 { 2183 sector_t bm_space; 2184 2185 /* if the device is bigger than 8Gig, save 64k for bitmap 2186 * usage, if bigger than 200Gig, save 128k 2187 */ 2188 if (dev_size < 64*2) 2189 bm_space = 0; 2190 else if (dev_size - 64*2 >= 200*1024*1024*2) 2191 bm_space = 128*2; 2192 else if (dev_size - 4*2 > 8*1024*1024*2) 2193 bm_space = 64*2; 2194 else 2195 bm_space = 4*2; 2196 return bm_space; 2197 } 2198 2199 static unsigned long long 2200 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 2201 { 2202 struct mdp_superblock_1 *sb; 2203 sector_t max_sectors; 2204 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 2205 return 0; /* component must fit device */ 2206 if (rdev->data_offset != rdev->new_data_offset) 2207 return 0; /* too confusing */ 2208 if (rdev->sb_start < 
rdev->data_offset) { 2209 /* minor versions 1 and 2; superblock before data */ 2210 max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; 2211 if (!num_sectors || num_sectors > max_sectors) 2212 num_sectors = max_sectors; 2213 } else if (rdev->mddev->bitmap_info.offset) { 2214 /* minor version 0 with bitmap we can't move */ 2215 return 0; 2216 } else { 2217 /* minor version 0; superblock after data */ 2218 sector_t sb_start, bm_space; 2219 sector_t dev_size = bdev_nr_sectors(rdev->bdev); 2220 2221 /* 8K is for superblock */ 2222 sb_start = dev_size - 8*2; 2223 sb_start &= ~(sector_t)(4*2 - 1); 2224 2225 bm_space = super_1_choose_bm_space(dev_size); 2226 2227 /* Space that can be used to store date needs to decrease 2228 * superblock bitmap space and bad block space(4K) 2229 */ 2230 max_sectors = sb_start - bm_space - 4*2; 2231 2232 if (!num_sectors || num_sectors > max_sectors) 2233 num_sectors = max_sectors; 2234 rdev->sb_start = sb_start; 2235 } 2236 sb = page_address(rdev->sb_page); 2237 sb->data_size = cpu_to_le64(num_sectors); 2238 sb->super_offset = cpu_to_le64(rdev->sb_start); 2239 sb->sb_csum = calc_sb_1_csum(sb); 2240 do { 2241 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 2242 rdev->sb_page); 2243 } while (md_super_wait(rdev->mddev) < 0); 2244 return num_sectors; 2245 2246 } 2247 2248 static int 2249 super_1_allow_new_offset(struct md_rdev *rdev, 2250 unsigned long long new_offset) 2251 { 2252 /* All necessary checks on new >= old have been done */ 2253 if (new_offset >= rdev->data_offset) 2254 return 1; 2255 2256 /* with 1.0 metadata, there is no metadata to tread on 2257 * so we can always move back */ 2258 if (rdev->mddev->minor_version == 0) 2259 return 1; 2260 2261 /* otherwise we must be sure not to step on 2262 * any metadata, so stay: 2263 * 36K beyond start of superblock 2264 * beyond end of badblocks 2265 * beyond write-intent bitmap 2266 */ 2267 if (rdev->sb_start + (32+4)*2 > new_offset) 2268 return 0; 2269 2270 if (!rdev->mddev->bitmap_info.file) { 2271 struct mddev *mddev = rdev->mddev; 2272 struct md_bitmap_stats stats; 2273 int err; 2274 2275 err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 2276 if (!err && rdev->sb_start + mddev->bitmap_info.offset + 2277 stats.file_pages * (PAGE_SIZE >> 9) > new_offset) 2278 return 0; 2279 } 2280 2281 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) 2282 return 0; 2283 2284 return 1; 2285 } 2286 2287 static struct super_type super_types[] = { 2288 [0] = { 2289 .name = "0.90.0", 2290 .owner = THIS_MODULE, 2291 .load_super = super_90_load, 2292 .validate_super = super_90_validate, 2293 .sync_super = super_90_sync, 2294 .rdev_size_change = super_90_rdev_size_change, 2295 .allow_new_offset = super_90_allow_new_offset, 2296 }, 2297 [1] = { 2298 .name = "md-1", 2299 .owner = THIS_MODULE, 2300 .load_super = super_1_load, 2301 .validate_super = super_1_validate, 2302 .sync_super = super_1_sync, 2303 .rdev_size_change = super_1_rdev_size_change, 2304 .allow_new_offset = super_1_allow_new_offset, 2305 }, 2306 }; 2307 2308 static void sync_super(struct mddev *mddev, struct md_rdev *rdev) 2309 { 2310 if (mddev->sync_super) { 2311 mddev->sync_super(mddev, rdev); 2312 return; 2313 } 2314 2315 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); 2316 2317 super_types[mddev->major_version].sync_super(mddev, rdev); 2318 } 2319 2320 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) 2321 { 2322 struct md_rdev *rdev, *rdev2; 2323 2324 rcu_read_lock(); 2325 
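	/* Two arrays "match" if any in-use (non-faulty, non-journal) member
	 * of one sits on the same gendisk as an in-use member of the other.
	 */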
rdev_for_each_rcu(rdev, mddev1) { 2326 if (test_bit(Faulty, &rdev->flags) || 2327 test_bit(Journal, &rdev->flags) || 2328 rdev->raid_disk == -1) 2329 continue; 2330 rdev_for_each_rcu(rdev2, mddev2) { 2331 if (test_bit(Faulty, &rdev2->flags) || 2332 test_bit(Journal, &rdev2->flags) || 2333 rdev2->raid_disk == -1) 2334 continue; 2335 if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) { 2336 rcu_read_unlock(); 2337 return 1; 2338 } 2339 } 2340 } 2341 rcu_read_unlock(); 2342 return 0; 2343 } 2344 2345 static LIST_HEAD(pending_raid_disks); 2346 2347 /* 2348 * Try to register data integrity profile for an mddev 2349 * 2350 * This is called when an array is started and after a disk has been kicked 2351 * from the array. It only succeeds if all working and active component devices 2352 * are integrity capable with matching profiles. 2353 */ 2354 int md_integrity_register(struct mddev *mddev) 2355 { 2356 if (list_empty(&mddev->disks)) 2357 return 0; /* nothing to do */ 2358 if (mddev_is_dm(mddev) || !blk_get_integrity(mddev->gendisk)) 2359 return 0; /* shouldn't register */ 2360 2361 pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); 2362 return 0; 2363 } 2364 EXPORT_SYMBOL(md_integrity_register); 2365 2366 static bool rdev_read_only(struct md_rdev *rdev) 2367 { 2368 return bdev_read_only(rdev->bdev) || 2369 (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev)); 2370 } 2371 2372 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) 2373 { 2374 char b[BDEVNAME_SIZE]; 2375 int err; 2376 2377 /* prevent duplicates */ 2378 if (find_rdev(mddev, rdev->bdev->bd_dev)) 2379 return -EEXIST; 2380 2381 if (rdev_read_only(rdev) && mddev->pers) 2382 return -EROFS; 2383 2384 /* make sure rdev->sectors exceeds mddev->dev_sectors */ 2385 if (!test_bit(Journal, &rdev->flags) && 2386 rdev->sectors && 2387 (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { 2388 if (mddev->pers) { 2389 /* Cannot change size, so fail 2390 * If mddev->level <= 0, then we don't care 2391 * about aligning sizes (e.g. linear) 2392 */ 2393 if (mddev->level > 0) 2394 return -ENOSPC; 2395 } else 2396 mddev->dev_sectors = rdev->sectors; 2397 } 2398 2399 /* Verify rdev->desc_nr is unique. 
2400 * If it is -1, assign a free number, else 2401 * check number is not in use 2402 */ 2403 rcu_read_lock(); 2404 if (rdev->desc_nr < 0) { 2405 int choice = 0; 2406 if (mddev->pers) 2407 choice = mddev->raid_disks; 2408 while (md_find_rdev_nr_rcu(mddev, choice)) 2409 choice++; 2410 rdev->desc_nr = choice; 2411 } else { 2412 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { 2413 rcu_read_unlock(); 2414 return -EBUSY; 2415 } 2416 } 2417 rcu_read_unlock(); 2418 if (!test_bit(Journal, &rdev->flags) && 2419 mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { 2420 pr_warn("md: %s: array is limited to %d devices\n", 2421 mdname(mddev), mddev->max_disks); 2422 return -EBUSY; 2423 } 2424 snprintf(b, sizeof(b), "%pg", rdev->bdev); 2425 strreplace(b, '/', '!'); 2426 2427 rdev->mddev = mddev; 2428 pr_debug("md: bind<%s>\n", b); 2429 2430 if (mddev->raid_disks) 2431 mddev_create_serial_pool(mddev, rdev); 2432 2433 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 2434 goto fail; 2435 2436 /* failure here is OK */ 2437 err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block"); 2438 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); 2439 rdev->sysfs_unack_badblocks = 2440 sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks"); 2441 rdev->sysfs_badblocks = 2442 sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks"); 2443 2444 list_add_rcu(&rdev->same_set, &mddev->disks); 2445 bd_link_disk_holder(rdev->bdev, mddev->gendisk); 2446 2447 /* May as well allow recovery to be retried once */ 2448 mddev->recovery_disabled++; 2449 2450 return 0; 2451 2452 fail: 2453 pr_warn("md: failed to register dev-%s for %s\n", 2454 b, mdname(mddev)); 2455 mddev_destroy_serial_pool(mddev, rdev); 2456 return err; 2457 } 2458 2459 void md_autodetect_dev(dev_t dev); 2460 2461 /* just for claiming the bdev */ 2462 static struct md_rdev claim_rdev; 2463 2464 static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) 2465 { 2466 pr_debug("md: export_rdev(%pg)\n", rdev->bdev); 2467 md_rdev_clear(rdev); 2468 #ifndef MODULE 2469 if (test_bit(AutoDetected, &rdev->flags)) 2470 md_autodetect_dev(rdev->bdev->bd_dev); 2471 #endif 2472 fput(rdev->bdev_file); 2473 rdev->bdev = NULL; 2474 kobject_put(&rdev->kobj); 2475 } 2476 2477 static void md_kick_rdev_from_array(struct md_rdev *rdev) 2478 { 2479 struct mddev *mddev = rdev->mddev; 2480 2481 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); 2482 list_del_rcu(&rdev->same_set); 2483 pr_debug("md: unbind<%pg>\n", rdev->bdev); 2484 mddev_destroy_serial_pool(rdev->mddev, rdev); 2485 WRITE_ONCE(rdev->mddev, NULL); 2486 sysfs_remove_link(&rdev->kobj, "block"); 2487 sysfs_put(rdev->sysfs_state); 2488 sysfs_put(rdev->sysfs_unack_badblocks); 2489 sysfs_put(rdev->sysfs_badblocks); 2490 rdev->sysfs_state = NULL; 2491 rdev->sysfs_unack_badblocks = NULL; 2492 rdev->sysfs_badblocks = NULL; 2493 rdev->badblocks.count = 0; 2494 2495 synchronize_rcu(); 2496 2497 /* 2498 * kobject_del() will wait for all in progress writers to be done, where 2499 * reconfig_mutex is held, hence it can't be called under 2500 * reconfig_mutex and it's delayed to mddev_unlock(). 
2501 */ 2502 list_add(&rdev->same_set, &mddev->deleting); 2503 } 2504 2505 static void export_array(struct mddev *mddev) 2506 { 2507 struct md_rdev *rdev; 2508 2509 while (!list_empty(&mddev->disks)) { 2510 rdev = list_first_entry(&mddev->disks, struct md_rdev, 2511 same_set); 2512 md_kick_rdev_from_array(rdev); 2513 } 2514 mddev->raid_disks = 0; 2515 mddev->major_version = 0; 2516 } 2517 2518 static bool set_in_sync(struct mddev *mddev) 2519 { 2520 lockdep_assert_held(&mddev->lock); 2521 if (!mddev->in_sync) { 2522 mddev->sync_checkers++; 2523 spin_unlock(&mddev->lock); 2524 percpu_ref_switch_to_atomic_sync(&mddev->writes_pending); 2525 spin_lock(&mddev->lock); 2526 if (!mddev->in_sync && 2527 percpu_ref_is_zero(&mddev->writes_pending)) { 2528 mddev->in_sync = 1; 2529 /* 2530 * Ensure ->in_sync is visible before we clear 2531 * ->sync_checkers. 2532 */ 2533 smp_mb(); 2534 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2535 sysfs_notify_dirent_safe(mddev->sysfs_state); 2536 } 2537 if (--mddev->sync_checkers == 0) 2538 percpu_ref_switch_to_percpu(&mddev->writes_pending); 2539 } 2540 if (mddev->safemode == 1) 2541 mddev->safemode = 0; 2542 return mddev->in_sync; 2543 } 2544 2545 static void sync_sbs(struct mddev *mddev, int nospares) 2546 { 2547 /* Update each superblock (in-memory image), but 2548 * if we are allowed to, skip spares which already 2549 * have the right event counter, or have one earlier 2550 * (which would mean they aren't being marked as dirty 2551 * with the rest of the array) 2552 */ 2553 struct md_rdev *rdev; 2554 rdev_for_each(rdev, mddev) { 2555 if (rdev->sb_events == mddev->events || 2556 (nospares && 2557 rdev->raid_disk < 0 && 2558 rdev->sb_events+1 == mddev->events)) { 2559 /* Don't update this superblock */ 2560 rdev->sb_loaded = 2; 2561 } else { 2562 sync_super(mddev, rdev); 2563 rdev->sb_loaded = 1; 2564 } 2565 } 2566 } 2567 2568 static bool does_sb_need_changing(struct mddev *mddev) 2569 { 2570 struct md_rdev *rdev = NULL, *iter; 2571 struct mdp_superblock_1 *sb; 2572 int role; 2573 2574 /* Find a good rdev */ 2575 rdev_for_each(iter, mddev) 2576 if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) { 2577 rdev = iter; 2578 break; 2579 } 2580 2581 /* No good device found. */ 2582 if (!rdev) 2583 return false; 2584 2585 sb = page_address(rdev->sb_page); 2586 /* Check if a device has become faulty or a spare become active */ 2587 rdev_for_each(rdev, mddev) { 2588 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 2589 /* Device activated? */ 2590 if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 && 2591 !test_bit(Faulty, &rdev->flags)) 2592 return true; 2593 /* Device turned faulty? 
		 */
		if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX))
			return true;
	}

	/* Check if any mddev parameters have changed */
	if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
	    (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
	    (mddev->layout != le32_to_cpu(sb->layout)) ||
	    (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
	    (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
		return true;

	return false;
}

void md_update_sb(struct mddev *mddev, int force_change)
{
	struct md_rdev *rdev;
	int sync_req;
	int nospares = 0;
	int any_badblocks_changed = 0;
	int ret = -1;

	if (!md_is_rdwr(mddev)) {
		if (force_change)
			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		return;
	}

repeat:
	if (mddev_is_clustered(mddev)) {
		if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
			force_change = 1;
		if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
			nospares = 1;
		ret = md_cluster_ops->metadata_update_start(mddev);
		/* Has someone else updated the sb? */
		if (!does_sb_need_changing(mddev)) {
			if (ret == 0)
				md_cluster_ops->metadata_update_cancel(mddev);
			bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
							 BIT(MD_SB_CHANGE_DEVS) |
							 BIT(MD_SB_CHANGE_CLEAN));
			return;
		}
	}

	/*
	 * First make sure individual recovery_offsets are correct.
	 * curr_resync_completed can only be used during recovery.
	 * During reshape/resync it might use array-addresses rather
	 * than device addresses.
	 */
	rdev_for_each(rdev, mddev) {
		if (rdev->raid_disk >= 0 &&
		    mddev->delta_disks >= 0 &&
		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
		    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
		    !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
		    !test_bit(Journal, &rdev->flags) &&
		    !test_bit(In_sync, &rdev->flags) &&
		    mddev->curr_resync_completed > rdev->recovery_offset)
			rdev->recovery_offset = mddev->curr_resync_completed;

	}
	if (!mddev->persistent) {
		clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
		clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		if (!mddev->external) {
			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
			rdev_for_each(rdev, mddev) {
				if (rdev->badblocks.changed) {
					rdev->badblocks.changed = 0;
					ack_all_badblocks(&rdev->badblocks);
					md_error(mddev, rdev);
				}
				clear_bit(Blocked, &rdev->flags);
				clear_bit(BlockedBadBlocks, &rdev->flags);
				wake_up(&rdev->blocked_wait);
			}
		}
		wake_up(&mddev->sb_wait);
		return;
	}

	spin_lock(&mddev->lock);

	mddev->utime = ktime_get_real_seconds();

	if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
		force_change = 1;
	if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
		/* just a clean <-> dirty transition, possibly leave spares alone,
		 * though if events isn't the right even/odd, we will have to do
		 * spares after all
		 */
		nospares = 1;
	if (force_change)
		nospares = 0;
	if (mddev->degraded)
		/* If the array is degraded, then skipping spares is both
		 * dangerous and fairly pointless.
		 * Dangerous because a device that was removed from the array
		 * might have an event_count that still looks up-to-date,
		 * so it can be re-added without a resync.
2699 * Pointless because if there are any spares to skip, 2700 * then a recovery will happen and soon that array won't 2701 * be degraded any more and the spare can go back to sleep then. 2702 */ 2703 nospares = 0; 2704 2705 sync_req = mddev->in_sync; 2706 2707 /* If this is just a dirty<->clean transition, and the array is clean 2708 * and 'events' is odd, we can roll back to the previous clean state */ 2709 if (nospares 2710 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 2711 && mddev->can_decrease_events 2712 && mddev->events != 1) { 2713 mddev->events--; 2714 mddev->can_decrease_events = 0; 2715 } else { 2716 /* otherwise we have to go forward and ... */ 2717 mddev->events ++; 2718 mddev->can_decrease_events = nospares; 2719 } 2720 2721 /* 2722 * This 64-bit counter should never wrap. 2723 * Either we are in around ~1 trillion A.C., assuming 2724 * 1 reboot per second, or we have a bug... 2725 */ 2726 WARN_ON(mddev->events == 0); 2727 2728 rdev_for_each(rdev, mddev) { 2729 if (rdev->badblocks.changed) 2730 any_badblocks_changed++; 2731 if (test_bit(Faulty, &rdev->flags)) 2732 set_bit(FaultRecorded, &rdev->flags); 2733 } 2734 2735 sync_sbs(mddev, nospares); 2736 spin_unlock(&mddev->lock); 2737 2738 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", 2739 mdname(mddev), mddev->in_sync); 2740 2741 mddev_add_trace_msg(mddev, "md md_update_sb"); 2742 rewrite: 2743 mddev->bitmap_ops->update_sb(mddev->bitmap); 2744 rdev_for_each(rdev, mddev) { 2745 if (rdev->sb_loaded != 1) 2746 continue; /* no noise on spare devices */ 2747 2748 if (!test_bit(Faulty, &rdev->flags)) { 2749 md_super_write(mddev,rdev, 2750 rdev->sb_start, rdev->sb_size, 2751 rdev->sb_page); 2752 pr_debug("md: (write) %pg's sb offset: %llu\n", 2753 rdev->bdev, 2754 (unsigned long long)rdev->sb_start); 2755 rdev->sb_events = mddev->events; 2756 if (rdev->badblocks.size) { 2757 md_super_write(mddev, rdev, 2758 rdev->badblocks.sector, 2759 rdev->badblocks.size << 9, 2760 rdev->bb_page); 2761 rdev->badblocks.size = 0; 2762 } 2763 2764 } else 2765 pr_debug("md: %pg (skipping faulty)\n", 2766 rdev->bdev); 2767 } 2768 if (md_super_wait(mddev) < 0) 2769 goto rewrite; 2770 /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ 2771 2772 if (mddev_is_clustered(mddev) && ret == 0) 2773 md_cluster_ops->metadata_update_finish(mddev); 2774 2775 if (mddev->in_sync != sync_req || 2776 !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2777 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN))) 2778 /* have to write it out again */ 2779 goto repeat; 2780 wake_up(&mddev->sb_wait); 2781 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2782 sysfs_notify_dirent_safe(mddev->sysfs_completed); 2783 2784 rdev_for_each(rdev, mddev) { 2785 if (test_and_clear_bit(FaultRecorded, &rdev->flags)) 2786 clear_bit(Blocked, &rdev->flags); 2787 2788 if (any_badblocks_changed) 2789 ack_all_badblocks(&rdev->badblocks); 2790 clear_bit(BlockedBadBlocks, &rdev->flags); 2791 wake_up(&rdev->blocked_wait); 2792 } 2793 } 2794 EXPORT_SYMBOL(md_update_sb); 2795 2796 static int add_bound_rdev(struct md_rdev *rdev) 2797 { 2798 struct mddev *mddev = rdev->mddev; 2799 int err = 0; 2800 bool add_journal = test_bit(Journal, &rdev->flags); 2801 2802 if (!mddev->pers->hot_remove_disk || add_journal) { 2803 /* If there is hot_add_disk but no hot_remove_disk 2804 * then added disks for geometry changes, 2805 * and should be added immediately. 2806 */ 2807 super_types[mddev->major_version]. 
			validate_super(mddev, NULL/*freshest*/, rdev);
		err = mddev->pers->hot_add_disk(mddev, rdev);
		if (err) {
			md_kick_rdev_from_array(rdev);
			return err;
		}
	}
	sysfs_notify_dirent_safe(rdev->sysfs_state);

	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
	if (mddev->degraded)
		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_new_event();
	return 0;
}

/* words written to sysfs files may, or may not, be \n terminated.
 * We want to accept either case. For this we use cmd_match.
 */
static int cmd_match(const char *cmd, const char *str)
{
	/* See if cmd, written into a sysfs file, matches
	 * str. They must either be the same, or cmd can
	 * have a trailing newline
	 */
	while (*cmd && *str && *cmd == *str) {
		cmd++;
		str++;
	}
	if (*cmd == '\n')
		cmd++;
	if (*str || *cmd)
		return 0;
	return 1;
}

struct rdev_sysfs_entry {
	struct attribute attr;
	ssize_t (*show)(struct md_rdev *, char *);
	ssize_t (*store)(struct md_rdev *, const char *, size_t);
};

static ssize_t
state_show(struct md_rdev *rdev, char *page)
{
	char *sep = ",";
	size_t len = 0;
	unsigned long flags = READ_ONCE(rdev->flags);

	if (test_bit(Faulty, &flags) ||
	    (!test_bit(ExternalBbl, &flags) &&
	     rdev->badblocks.unacked_exist))
		len += sprintf(page+len, "faulty%s", sep);
	if (test_bit(In_sync, &flags))
		len += sprintf(page+len, "in_sync%s", sep);
	if (test_bit(Journal, &flags))
		len += sprintf(page+len, "journal%s", sep);
	if (test_bit(WriteMostly, &flags))
		len += sprintf(page+len, "write_mostly%s", sep);
	if (test_bit(Blocked, &flags) ||
	    (rdev->badblocks.unacked_exist
	     && !test_bit(Faulty, &flags)))
		len += sprintf(page+len, "blocked%s", sep);
	if (!test_bit(Faulty, &flags) &&
	    !test_bit(Journal, &flags) &&
	    !test_bit(In_sync, &flags))
		len += sprintf(page+len, "spare%s", sep);
	if (test_bit(WriteErrorSeen, &flags))
		len += sprintf(page+len, "write_error%s", sep);
	if (test_bit(WantReplacement, &flags))
		len += sprintf(page+len, "want_replacement%s", sep);
	if (test_bit(Replacement, &flags))
		len += sprintf(page+len, "replacement%s", sep);
	if (test_bit(ExternalBbl, &flags))
		len += sprintf(page+len, "external_bbl%s", sep);
	if (test_bit(FailFast, &flags))
		len += sprintf(page+len, "failfast%s", sep);

	if (len)
		len -= strlen(sep);

	return len+sprintf(page+len, "\n");
}

static ssize_t
state_store(struct md_rdev *rdev, const char *buf, size_t len)
{
	/* can write
	 *  faulty - simulates an error
	 *  remove - disconnects the device
	 *  writemostly - sets write_mostly
	 *  -writemostly - clears write_mostly
	 *  blocked - sets the Blocked flag
	 *  -blocked - clears the Blocked flag and possibly simulates an error
	 *  insync - sets Insync providing device isn't active
	 *  -insync - clear Insync for a device with a slot assigned,
	 *            so that it gets rebuilt based on bitmap
	 *  write_error - sets WriteErrorSeen
	 *  -write_error - clears WriteErrorSeen
	 *  {,-}failfast - set/clear FailFast
	 */

	struct mddev *mddev = rdev->mddev;
	int err = -EINVAL;
	bool need_update_sb = false;

	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
		md_error(rdev->mddev, rdev);
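		/* md_error() may mark the whole array MD_BROKEN if it cannot
		 * survive losing this device; report that back as -EBUSY.
		 */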
2917 2918 if (test_bit(MD_BROKEN, &rdev->mddev->flags)) 2919 err = -EBUSY; 2920 else 2921 err = 0; 2922 } else if (cmd_match(buf, "remove")) { 2923 if (rdev->mddev->pers) { 2924 clear_bit(Blocked, &rdev->flags); 2925 remove_and_add_spares(rdev->mddev, rdev); 2926 } 2927 if (rdev->raid_disk >= 0) 2928 err = -EBUSY; 2929 else { 2930 err = 0; 2931 if (mddev_is_clustered(mddev)) 2932 err = md_cluster_ops->remove_disk(mddev, rdev); 2933 2934 if (err == 0) { 2935 md_kick_rdev_from_array(rdev); 2936 if (mddev->pers) 2937 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2938 md_new_event(); 2939 } 2940 } 2941 } else if (cmd_match(buf, "writemostly")) { 2942 set_bit(WriteMostly, &rdev->flags); 2943 mddev_create_serial_pool(rdev->mddev, rdev); 2944 need_update_sb = true; 2945 err = 0; 2946 } else if (cmd_match(buf, "-writemostly")) { 2947 mddev_destroy_serial_pool(rdev->mddev, rdev); 2948 clear_bit(WriteMostly, &rdev->flags); 2949 need_update_sb = true; 2950 err = 0; 2951 } else if (cmd_match(buf, "blocked")) { 2952 set_bit(Blocked, &rdev->flags); 2953 err = 0; 2954 } else if (cmd_match(buf, "-blocked")) { 2955 if (!test_bit(Faulty, &rdev->flags) && 2956 !test_bit(ExternalBbl, &rdev->flags) && 2957 rdev->badblocks.unacked_exist) { 2958 /* metadata handler doesn't understand badblocks, 2959 * so we need to fail the device 2960 */ 2961 md_error(rdev->mddev, rdev); 2962 } 2963 clear_bit(Blocked, &rdev->flags); 2964 clear_bit(BlockedBadBlocks, &rdev->flags); 2965 wake_up(&rdev->blocked_wait); 2966 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2967 2968 err = 0; 2969 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 2970 set_bit(In_sync, &rdev->flags); 2971 err = 0; 2972 } else if (cmd_match(buf, "failfast")) { 2973 set_bit(FailFast, &rdev->flags); 2974 need_update_sb = true; 2975 err = 0; 2976 } else if (cmd_match(buf, "-failfast")) { 2977 clear_bit(FailFast, &rdev->flags); 2978 need_update_sb = true; 2979 err = 0; 2980 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && 2981 !test_bit(Journal, &rdev->flags)) { 2982 if (rdev->mddev->pers == NULL) { 2983 clear_bit(In_sync, &rdev->flags); 2984 rdev->saved_raid_disk = rdev->raid_disk; 2985 rdev->raid_disk = -1; 2986 err = 0; 2987 } 2988 } else if (cmd_match(buf, "write_error")) { 2989 set_bit(WriteErrorSeen, &rdev->flags); 2990 err = 0; 2991 } else if (cmd_match(buf, "-write_error")) { 2992 clear_bit(WriteErrorSeen, &rdev->flags); 2993 err = 0; 2994 } else if (cmd_match(buf, "want_replacement")) { 2995 /* Any non-spare device that is not a replacement can 2996 * become want_replacement at any time, but we then need to 2997 * check if recovery is needed. 2998 */ 2999 if (rdev->raid_disk >= 0 && 3000 !test_bit(Journal, &rdev->flags) && 3001 !test_bit(Replacement, &rdev->flags)) 3002 set_bit(WantReplacement, &rdev->flags); 3003 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3004 err = 0; 3005 } else if (cmd_match(buf, "-want_replacement")) { 3006 /* Clearing 'want_replacement' is always allowed. 3007 * Once replacements starts it is too late though. 3008 */ 3009 err = 0; 3010 clear_bit(WantReplacement, &rdev->flags); 3011 } else if (cmd_match(buf, "replacement")) { 3012 /* Can only set a device as a replacement when array has not 3013 * yet been started. Once running, replacement is automatic 3014 * from spares, or by assigning 'slot'. 
3015 */ 3016 if (rdev->mddev->pers) 3017 err = -EBUSY; 3018 else { 3019 set_bit(Replacement, &rdev->flags); 3020 err = 0; 3021 } 3022 } else if (cmd_match(buf, "-replacement")) { 3023 /* Similarly, can only clear Replacement before start */ 3024 if (rdev->mddev->pers) 3025 err = -EBUSY; 3026 else { 3027 clear_bit(Replacement, &rdev->flags); 3028 err = 0; 3029 } 3030 } else if (cmd_match(buf, "re-add")) { 3031 if (!rdev->mddev->pers) 3032 err = -EINVAL; 3033 else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) && 3034 rdev->saved_raid_disk >= 0) { 3035 /* clear_bit is performed _after_ all the devices 3036 * have their local Faulty bit cleared. If any writes 3037 * happen in the meantime in the local node, they 3038 * will land in the local bitmap, which will be synced 3039 * by this node eventually 3040 */ 3041 if (!mddev_is_clustered(rdev->mddev) || 3042 (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { 3043 clear_bit(Faulty, &rdev->flags); 3044 err = add_bound_rdev(rdev); 3045 } 3046 } else 3047 err = -EBUSY; 3048 } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) { 3049 set_bit(ExternalBbl, &rdev->flags); 3050 rdev->badblocks.shift = 0; 3051 err = 0; 3052 } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) { 3053 clear_bit(ExternalBbl, &rdev->flags); 3054 err = 0; 3055 } 3056 if (need_update_sb) 3057 md_update_sb(mddev, 1); 3058 if (!err) 3059 sysfs_notify_dirent_safe(rdev->sysfs_state); 3060 return err ? err : len; 3061 } 3062 static struct rdev_sysfs_entry rdev_state = 3063 __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store); 3064 3065 static ssize_t 3066 errors_show(struct md_rdev *rdev, char *page) 3067 { 3068 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 3069 } 3070 3071 static ssize_t 3072 errors_store(struct md_rdev *rdev, const char *buf, size_t len) 3073 { 3074 unsigned int n; 3075 int rv; 3076 3077 rv = kstrtouint(buf, 10, &n); 3078 if (rv < 0) 3079 return rv; 3080 atomic_set(&rdev->corrected_errors, n); 3081 return len; 3082 } 3083 static struct rdev_sysfs_entry rdev_errors = 3084 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 3085 3086 static ssize_t 3087 slot_show(struct md_rdev *rdev, char *page) 3088 { 3089 if (test_bit(Journal, &rdev->flags)) 3090 return sprintf(page, "journal\n"); 3091 else if (rdev->raid_disk < 0) 3092 return sprintf(page, "none\n"); 3093 else 3094 return sprintf(page, "%d\n", rdev->raid_disk); 3095 } 3096 3097 static ssize_t 3098 slot_store(struct md_rdev *rdev, const char *buf, size_t len) 3099 { 3100 int slot; 3101 int err; 3102 3103 if (test_bit(Journal, &rdev->flags)) 3104 return -EBUSY; 3105 if (strncmp(buf, "none", 4)==0) 3106 slot = -1; 3107 else { 3108 err = kstrtouint(buf, 10, (unsigned int *)&slot); 3109 if (err < 0) 3110 return err; 3111 if (slot < 0) 3112 /* overflow */ 3113 return -ENOSPC; 3114 } 3115 if (rdev->mddev->pers && slot == -1) { 3116 /* Setting 'slot' on an active array requires also 3117 * updating the 'rd%d' link, and communicating 3118 * with the personality with ->hot_*_disk. 3119 * For now we only support removing 3120 * failed/spare devices. This normally happens automatically, 3121 * but not when the metadata is externally managed. 
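		 * (writing "none" to this attribute is how userspace requests
		 *  such a removal)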
3122 */ 3123 if (rdev->raid_disk == -1) 3124 return -EEXIST; 3125 /* personality does all needed checks */ 3126 if (rdev->mddev->pers->hot_remove_disk == NULL) 3127 return -EINVAL; 3128 clear_bit(Blocked, &rdev->flags); 3129 remove_and_add_spares(rdev->mddev, rdev); 3130 if (rdev->raid_disk >= 0) 3131 return -EBUSY; 3132 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3133 } else if (rdev->mddev->pers) { 3134 /* Activating a spare .. or possibly reactivating 3135 * if we ever get bitmaps working here. 3136 */ 3137 int err; 3138 3139 if (rdev->raid_disk != -1) 3140 return -EBUSY; 3141 3142 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) 3143 return -EBUSY; 3144 3145 if (rdev->mddev->pers->hot_add_disk == NULL) 3146 return -EINVAL; 3147 3148 if (slot >= rdev->mddev->raid_disks && 3149 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 3150 return -ENOSPC; 3151 3152 rdev->raid_disk = slot; 3153 if (test_bit(In_sync, &rdev->flags)) 3154 rdev->saved_raid_disk = slot; 3155 else 3156 rdev->saved_raid_disk = -1; 3157 clear_bit(In_sync, &rdev->flags); 3158 clear_bit(Bitmap_sync, &rdev->flags); 3159 err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev); 3160 if (err) { 3161 rdev->raid_disk = -1; 3162 return err; 3163 } else 3164 sysfs_notify_dirent_safe(rdev->sysfs_state); 3165 /* failure here is OK */; 3166 sysfs_link_rdev(rdev->mddev, rdev); 3167 /* don't wakeup anyone, leave that to userspace. */ 3168 } else { 3169 if (slot >= rdev->mddev->raid_disks && 3170 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 3171 return -ENOSPC; 3172 rdev->raid_disk = slot; 3173 /* assume it is working */ 3174 clear_bit(Faulty, &rdev->flags); 3175 clear_bit(WriteMostly, &rdev->flags); 3176 set_bit(In_sync, &rdev->flags); 3177 sysfs_notify_dirent_safe(rdev->sysfs_state); 3178 } 3179 return len; 3180 } 3181 3182 static struct rdev_sysfs_entry rdev_slot = 3183 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 3184 3185 static ssize_t 3186 offset_show(struct md_rdev *rdev, char *page) 3187 { 3188 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 3189 } 3190 3191 static ssize_t 3192 offset_store(struct md_rdev *rdev, const char *buf, size_t len) 3193 { 3194 unsigned long long offset; 3195 if (kstrtoull(buf, 10, &offset) < 0) 3196 return -EINVAL; 3197 if (rdev->mddev->pers && rdev->raid_disk >= 0) 3198 return -EBUSY; 3199 if (rdev->sectors && rdev->mddev->external) 3200 /* Must set offset before size, so overlap checks 3201 * can be sane */ 3202 return -EBUSY; 3203 rdev->data_offset = offset; 3204 rdev->new_data_offset = offset; 3205 return len; 3206 } 3207 3208 static struct rdev_sysfs_entry rdev_offset = 3209 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 3210 3211 static ssize_t new_offset_show(struct md_rdev *rdev, char *page) 3212 { 3213 return sprintf(page, "%llu\n", 3214 (unsigned long long)rdev->new_data_offset); 3215 } 3216 3217 static ssize_t new_offset_store(struct md_rdev *rdev, 3218 const char *buf, size_t len) 3219 { 3220 unsigned long long new_offset; 3221 struct mddev *mddev = rdev->mddev; 3222 3223 if (kstrtoull(buf, 10, &new_offset) < 0) 3224 return -EINVAL; 3225 3226 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3227 return -EBUSY; 3228 if (new_offset == rdev->data_offset) 3229 /* reset is always permitted */ 3230 ; 3231 else if (new_offset > rdev->data_offset) { 3232 /* must not push array size beyond rdev_sectors */ 3233 if (new_offset - rdev->data_offset 3234 + mddev->dev_sectors > rdev->sectors) 3235 return -E2BIG; 
3236 } 3237 /* Metadata worries about other space details. */ 3238 3239 /* decreasing the offset is inconsistent with a backwards 3240 * reshape. 3241 */ 3242 if (new_offset < rdev->data_offset && 3243 mddev->reshape_backwards) 3244 return -EINVAL; 3245 /* Increasing offset is inconsistent with forwards 3246 * reshape. reshape_direction should be set to 3247 * 'backwards' first. 3248 */ 3249 if (new_offset > rdev->data_offset && 3250 !mddev->reshape_backwards) 3251 return -EINVAL; 3252 3253 if (mddev->pers && mddev->persistent && 3254 !super_types[mddev->major_version] 3255 .allow_new_offset(rdev, new_offset)) 3256 return -E2BIG; 3257 rdev->new_data_offset = new_offset; 3258 if (new_offset > rdev->data_offset) 3259 mddev->reshape_backwards = 1; 3260 else if (new_offset < rdev->data_offset) 3261 mddev->reshape_backwards = 0; 3262 3263 return len; 3264 } 3265 static struct rdev_sysfs_entry rdev_new_offset = 3266 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); 3267 3268 static ssize_t 3269 rdev_size_show(struct md_rdev *rdev, char *page) 3270 { 3271 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); 3272 } 3273 3274 static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b) 3275 { 3276 /* check if two start/length pairs overlap */ 3277 if (a->data_offset + a->sectors <= b->data_offset) 3278 return false; 3279 if (b->data_offset + b->sectors <= a->data_offset) 3280 return false; 3281 return true; 3282 } 3283 3284 static bool md_rdev_overlaps(struct md_rdev *rdev) 3285 { 3286 struct mddev *mddev; 3287 struct md_rdev *rdev2; 3288 3289 spin_lock(&all_mddevs_lock); 3290 list_for_each_entry(mddev, &all_mddevs, all_mddevs) { 3291 if (test_bit(MD_DELETED, &mddev->flags)) 3292 continue; 3293 rdev_for_each(rdev2, mddev) { 3294 if (rdev != rdev2 && rdev->bdev == rdev2->bdev && 3295 md_rdevs_overlap(rdev, rdev2)) { 3296 spin_unlock(&all_mddevs_lock); 3297 return true; 3298 } 3299 } 3300 } 3301 spin_unlock(&all_mddevs_lock); 3302 return false; 3303 } 3304 3305 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) 3306 { 3307 unsigned long long blocks; 3308 sector_t new; 3309 3310 if (kstrtoull(buf, 10, &blocks) < 0) 3311 return -EINVAL; 3312 3313 if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) 3314 return -EINVAL; /* sector conversion overflow */ 3315 3316 new = blocks * 2; 3317 if (new != blocks * 2) 3318 return -EINVAL; /* unsigned long long to sector_t overflow */ 3319 3320 *sectors = new; 3321 return 0; 3322 } 3323 3324 static ssize_t 3325 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) 3326 { 3327 struct mddev *my_mddev = rdev->mddev; 3328 sector_t oldsectors = rdev->sectors; 3329 sector_t sectors; 3330 3331 if (test_bit(Journal, &rdev->flags)) 3332 return -EBUSY; 3333 if (strict_blocks_to_sectors(buf, §ors) < 0) 3334 return -EINVAL; 3335 if (rdev->data_offset != rdev->new_data_offset) 3336 return -EINVAL; /* too confusing */ 3337 if (my_mddev->pers && rdev->raid_disk >= 0) { 3338 if (my_mddev->persistent) { 3339 sectors = super_types[my_mddev->major_version]. 
3340 rdev_size_change(rdev, sectors); 3341 if (!sectors) 3342 return -EBUSY; 3343 } else if (!sectors) 3344 sectors = bdev_nr_sectors(rdev->bdev) - 3345 rdev->data_offset; 3346 if (!my_mddev->pers->resize) 3347 /* Cannot change size for RAID0 or Linear etc */ 3348 return -EINVAL; 3349 } 3350 if (sectors < my_mddev->dev_sectors) 3351 return -EINVAL; /* component must fit device */ 3352 3353 rdev->sectors = sectors; 3354 3355 /* 3356 * Check that all other rdevs with the same bdev do not overlap. This 3357 * check does not provide a hard guarantee, it just helps avoid 3358 * dangerous mistakes. 3359 */ 3360 if (sectors > oldsectors && my_mddev->external && 3361 md_rdev_overlaps(rdev)) { 3362 /* 3363 * Someone else could have slipped in a size change here, but 3364 * doing so is just silly. We put oldsectors back because we 3365 * know it is safe, and trust userspace not to race with itself. 3366 */ 3367 rdev->sectors = oldsectors; 3368 return -EBUSY; 3369 } 3370 return len; 3371 } 3372 3373 static struct rdev_sysfs_entry rdev_size = 3374 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 3375 3376 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) 3377 { 3378 unsigned long long recovery_start = rdev->recovery_offset; 3379 3380 if (test_bit(In_sync, &rdev->flags) || 3381 recovery_start == MaxSector) 3382 return sprintf(page, "none\n"); 3383 3384 return sprintf(page, "%llu\n", recovery_start); 3385 } 3386 3387 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) 3388 { 3389 unsigned long long recovery_start; 3390 3391 if (cmd_match(buf, "none")) 3392 recovery_start = MaxSector; 3393 else if (kstrtoull(buf, 10, &recovery_start)) 3394 return -EINVAL; 3395 3396 if (rdev->mddev->pers && 3397 rdev->raid_disk >= 0) 3398 return -EBUSY; 3399 3400 rdev->recovery_offset = recovery_start; 3401 if (recovery_start == MaxSector) 3402 set_bit(In_sync, &rdev->flags); 3403 else 3404 clear_bit(In_sync, &rdev->flags); 3405 return len; 3406 } 3407 3408 static struct rdev_sysfs_entry rdev_recovery_start = 3409 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); 3410 3411 /* sysfs access to bad-blocks list. 3412 * We present two files. 3413 * 'bad-blocks' lists sector numbers and lengths of ranges that 3414 * are recorded as bad. The list is truncated to fit within 3415 * the one-page limit of sysfs. 3416 * Writing "sector length" to this file adds an acknowledged 3417 * bad block list. 3418 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet 3419 * been acknowledged. Writing to this file adds bad blocks 3420 * without acknowledging them. This is largely for testing. 
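 * For example, writing "1000 8" to 'bad_blocks' should record an
 * acknowledged 8-sector bad range starting at sector 1000.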
3421 */ 3422 static ssize_t bb_show(struct md_rdev *rdev, char *page) 3423 { 3424 return badblocks_show(&rdev->badblocks, page, 0); 3425 } 3426 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) 3427 { 3428 int rv = badblocks_store(&rdev->badblocks, page, len, 0); 3429 /* Maybe that ack was all we needed */ 3430 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) 3431 wake_up(&rdev->blocked_wait); 3432 return rv; 3433 } 3434 static struct rdev_sysfs_entry rdev_bad_blocks = 3435 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); 3436 3437 static ssize_t ubb_show(struct md_rdev *rdev, char *page) 3438 { 3439 return badblocks_show(&rdev->badblocks, page, 1); 3440 } 3441 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) 3442 { 3443 return badblocks_store(&rdev->badblocks, page, len, 1); 3444 } 3445 static struct rdev_sysfs_entry rdev_unack_bad_blocks = 3446 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); 3447 3448 static ssize_t 3449 ppl_sector_show(struct md_rdev *rdev, char *page) 3450 { 3451 return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector); 3452 } 3453 3454 static ssize_t 3455 ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len) 3456 { 3457 unsigned long long sector; 3458 3459 if (kstrtoull(buf, 10, §or) < 0) 3460 return -EINVAL; 3461 if (sector != (sector_t)sector) 3462 return -EINVAL; 3463 3464 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && 3465 rdev->raid_disk >= 0) 3466 return -EBUSY; 3467 3468 if (rdev->mddev->persistent) { 3469 if (rdev->mddev->major_version == 0) 3470 return -EINVAL; 3471 if ((sector > rdev->sb_start && 3472 sector - rdev->sb_start > S16_MAX) || 3473 (sector < rdev->sb_start && 3474 rdev->sb_start - sector > -S16_MIN)) 3475 return -EINVAL; 3476 rdev->ppl.offset = sector - rdev->sb_start; 3477 } else if (!rdev->mddev->external) { 3478 return -EBUSY; 3479 } 3480 rdev->ppl.sector = sector; 3481 return len; 3482 } 3483 3484 static struct rdev_sysfs_entry rdev_ppl_sector = 3485 __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store); 3486 3487 static ssize_t 3488 ppl_size_show(struct md_rdev *rdev, char *page) 3489 { 3490 return sprintf(page, "%u\n", rdev->ppl.size); 3491 } 3492 3493 static ssize_t 3494 ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len) 3495 { 3496 unsigned int size; 3497 3498 if (kstrtouint(buf, 10, &size) < 0) 3499 return -EINVAL; 3500 3501 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && 3502 rdev->raid_disk >= 0) 3503 return -EBUSY; 3504 3505 if (rdev->mddev->persistent) { 3506 if (rdev->mddev->major_version == 0) 3507 return -EINVAL; 3508 if (size > U16_MAX) 3509 return -EINVAL; 3510 } else if (!rdev->mddev->external) { 3511 return -EBUSY; 3512 } 3513 rdev->ppl.size = size; 3514 return len; 3515 } 3516 3517 static struct rdev_sysfs_entry rdev_ppl_size = 3518 __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store); 3519 3520 static struct attribute *rdev_default_attrs[] = { 3521 &rdev_state.attr, 3522 &rdev_errors.attr, 3523 &rdev_slot.attr, 3524 &rdev_offset.attr, 3525 &rdev_new_offset.attr, 3526 &rdev_size.attr, 3527 &rdev_recovery_start.attr, 3528 &rdev_bad_blocks.attr, 3529 &rdev_unack_bad_blocks.attr, 3530 &rdev_ppl_sector.attr, 3531 &rdev_ppl_size.attr, 3532 NULL, 3533 }; 3534 ATTRIBUTE_GROUPS(rdev_default); 3535 static ssize_t 3536 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3537 { 3538 struct rdev_sysfs_entry 
*entry = container_of(attr, struct rdev_sysfs_entry, attr); 3539 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3540 3541 if (!entry->show) 3542 return -EIO; 3543 if (!rdev->mddev) 3544 return -ENODEV; 3545 return entry->show(rdev, page); 3546 } 3547 3548 static ssize_t 3549 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 3550 const char *page, size_t length) 3551 { 3552 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3553 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3554 struct kernfs_node *kn = NULL; 3555 bool suspend = false; 3556 ssize_t rv; 3557 struct mddev *mddev = READ_ONCE(rdev->mddev); 3558 3559 if (!entry->store) 3560 return -EIO; 3561 if (!capable(CAP_SYS_ADMIN)) 3562 return -EACCES; 3563 if (!mddev) 3564 return -ENODEV; 3565 3566 if (entry->store == state_store) { 3567 if (cmd_match(page, "remove")) 3568 kn = sysfs_break_active_protection(kobj, attr); 3569 if (cmd_match(page, "remove") || cmd_match(page, "re-add") || 3570 cmd_match(page, "writemostly") || 3571 cmd_match(page, "-writemostly")) 3572 suspend = true; 3573 } 3574 3575 rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev); 3576 if (!rv) { 3577 if (rdev->mddev == NULL) 3578 rv = -ENODEV; 3579 else 3580 rv = entry->store(rdev, page, length); 3581 suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); 3582 } 3583 3584 if (kn) 3585 sysfs_unbreak_active_protection(kn); 3586 3587 return rv; 3588 } 3589 3590 static void rdev_free(struct kobject *ko) 3591 { 3592 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); 3593 kfree(rdev); 3594 } 3595 static const struct sysfs_ops rdev_sysfs_ops = { 3596 .show = rdev_attr_show, 3597 .store = rdev_attr_store, 3598 }; 3599 static const struct kobj_type rdev_ktype = { 3600 .release = rdev_free, 3601 .sysfs_ops = &rdev_sysfs_ops, 3602 .default_groups = rdev_default_groups, 3603 }; 3604 3605 int md_rdev_init(struct md_rdev *rdev) 3606 { 3607 rdev->desc_nr = -1; 3608 rdev->saved_raid_disk = -1; 3609 rdev->raid_disk = -1; 3610 rdev->flags = 0; 3611 rdev->data_offset = 0; 3612 rdev->new_data_offset = 0; 3613 rdev->sb_events = 0; 3614 rdev->last_read_error = 0; 3615 rdev->sb_loaded = 0; 3616 rdev->bb_page = NULL; 3617 atomic_set(&rdev->nr_pending, 0); 3618 atomic_set(&rdev->read_errors, 0); 3619 atomic_set(&rdev->corrected_errors, 0); 3620 3621 INIT_LIST_HEAD(&rdev->same_set); 3622 init_waitqueue_head(&rdev->blocked_wait); 3623 3624 /* Add space to store bad block list. 3625 * This reserves the space even on arrays where it cannot 3626 * be used - I wonder if that matters 3627 */ 3628 return badblocks_init(&rdev->badblocks, 0); 3629 } 3630 EXPORT_SYMBOL_GPL(md_rdev_init); 3631 3632 /* 3633 * Import a device. If 'super_format' >= 0, then sanity check the superblock 3634 * 3635 * mark the device faulty if: 3636 * 3637 * - the device is nonexistent (zero size) 3638 * - the device has no valid superblock 3639 * 3640 * a faulty rdev _never_ has rdev->sb set. 
3641 */ 3642 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) 3643 { 3644 struct md_rdev *rdev; 3645 sector_t size; 3646 int err; 3647 3648 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 3649 if (!rdev) 3650 return ERR_PTR(-ENOMEM); 3651 3652 err = md_rdev_init(rdev); 3653 if (err) 3654 goto out_free_rdev; 3655 err = alloc_disk_sb(rdev); 3656 if (err) 3657 goto out_clear_rdev; 3658 3659 rdev->bdev_file = bdev_file_open_by_dev(newdev, 3660 BLK_OPEN_READ | BLK_OPEN_WRITE, 3661 super_format == -2 ? &claim_rdev : rdev, NULL); 3662 if (IS_ERR(rdev->bdev_file)) { 3663 pr_warn("md: could not open device unknown-block(%u,%u).\n", 3664 MAJOR(newdev), MINOR(newdev)); 3665 err = PTR_ERR(rdev->bdev_file); 3666 goto out_clear_rdev; 3667 } 3668 rdev->bdev = file_bdev(rdev->bdev_file); 3669 3670 kobject_init(&rdev->kobj, &rdev_ktype); 3671 3672 size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS; 3673 if (!size) { 3674 pr_warn("md: %pg has zero or unknown size, marking faulty!\n", 3675 rdev->bdev); 3676 err = -EINVAL; 3677 goto out_blkdev_put; 3678 } 3679 3680 if (super_format >= 0) { 3681 err = super_types[super_format]. 3682 load_super(rdev, NULL, super_minor); 3683 if (err == -EINVAL) { 3684 pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n", 3685 rdev->bdev, 3686 super_format, super_minor); 3687 goto out_blkdev_put; 3688 } 3689 if (err < 0) { 3690 pr_warn("md: could not read %pg's sb, not importing!\n", 3691 rdev->bdev); 3692 goto out_blkdev_put; 3693 } 3694 } 3695 3696 return rdev; 3697 3698 out_blkdev_put: 3699 fput(rdev->bdev_file); 3700 out_clear_rdev: 3701 md_rdev_clear(rdev); 3702 out_free_rdev: 3703 kfree(rdev); 3704 return ERR_PTR(err); 3705 } 3706 3707 /* 3708 * Check a full RAID array for plausibility 3709 */ 3710 3711 static int analyze_sbs(struct mddev *mddev) 3712 { 3713 int i; 3714 struct md_rdev *rdev, *freshest, *tmp; 3715 3716 freshest = NULL; 3717 rdev_for_each_safe(rdev, tmp, mddev) 3718 switch (super_types[mddev->major_version]. 3719 load_super(rdev, freshest, mddev->minor_version)) { 3720 case 1: 3721 freshest = rdev; 3722 break; 3723 case 0: 3724 break; 3725 default: 3726 pr_warn("md: fatal superblock inconsistency in %pg -- removing from array\n", 3727 rdev->bdev); 3728 md_kick_rdev_from_array(rdev); 3729 } 3730 3731 /* Cannot find a valid fresh disk */ 3732 if (!freshest) { 3733 pr_warn("md: cannot find a valid disk\n"); 3734 return -EINVAL; 3735 } 3736 3737 super_types[mddev->major_version]. 3738 validate_super(mddev, NULL/*freshest*/, freshest); 3739 3740 i = 0; 3741 rdev_for_each_safe(rdev, tmp, mddev) { 3742 if (mddev->max_disks && 3743 (rdev->desc_nr >= mddev->max_disks || 3744 i > mddev->max_disks)) { 3745 pr_warn("md: %s: %pg: only %d devices permitted\n", 3746 mdname(mddev), rdev->bdev, 3747 mddev->max_disks); 3748 md_kick_rdev_from_array(rdev); 3749 continue; 3750 } 3751 if (rdev != freshest) { 3752 if (super_types[mddev->major_version]. 3753 validate_super(mddev, freshest, rdev)) { 3754 pr_warn("md: kicking non-fresh %pg from array!\n", 3755 rdev->bdev); 3756 md_kick_rdev_from_array(rdev); 3757 continue; 3758 } 3759 } 3760 if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks)) && 3761 !test_bit(Journal, &rdev->flags)) { 3762 rdev->raid_disk = -1; 3763 clear_bit(In_sync, &rdev->flags); 3764 } 3765 } 3766 3767 return 0; 3768 } 3769 3770 /* Read a fixed-point number. 3771 * Numbers in sysfs attributes should be in "standard" units where 3772 * possible, so time should be in seconds. 
* However we internally use a much smaller unit such as
3774 * milliseconds or jiffies.
3775 * This function takes a decimal number with a possible fractional
3776 * component, and produces an integer which is the result of
3777 * multiplying that number by 10^'scale',
3778 * all without any floating-point arithmetic. With scale 3, for instance, "0.2" parses to 200.
3779 */
3780 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
3781 {
3782 unsigned long result = 0;
3783 long decimals = -1;
3784 while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
3785 if (*cp == '.')
3786 decimals = 0;
3787 else if (decimals < scale) {
3788 unsigned int value;
3789 value = *cp - '0';
3790 result = result * 10 + value;
3791 if (decimals >= 0)
3792 decimals++;
3793 }
3794 cp++;
3795 }
3796 if (*cp == '\n')
3797 cp++;
3798 if (*cp)
3799 return -EINVAL;
3800 if (decimals < 0)
3801 decimals = 0;
3802 *res = result * int_pow(10, scale - decimals);
3803 return 0;
3804 }
3805
3806 static ssize_t
3807 safe_delay_show(struct mddev *mddev, char *page)
3808 {
3809 unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ;
3810
3811 return sprintf(page, "%u.%03u\n", msec/1000, msec%1000);
3812 }
3813 static ssize_t
3814 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3815 {
3816 unsigned long msec;
3817
3818 if (mddev_is_clustered(mddev)) {
3819 pr_warn("md: Safemode is disabled for clustered mode\n");
3820 return -EINVAL;
3821 }
3822
3823 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ)
3824 return -EINVAL;
3825 if (msec == 0)
3826 mddev->safemode_delay = 0;
3827 else {
3828 unsigned long old_delay = mddev->safemode_delay;
3829 unsigned long new_delay = (msec*HZ)/1000;
3830
3831 if (new_delay == 0)
3832 new_delay = 1;
3833 mddev->safemode_delay = new_delay;
3834 if (new_delay < old_delay || old_delay == 0)
3835 mod_timer(&mddev->safemode_timer, jiffies+1);
3836 }
3837 return len;
3838 }
3839 static struct md_sysfs_entry md_safe_delay =
3840 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
3841
3842 static ssize_t
3843 level_show(struct mddev *mddev, char *page)
3844 {
3845 struct md_personality *p;
3846 int ret;
3847 spin_lock(&mddev->lock);
3848 p = mddev->pers;
3849 if (p)
3850 ret = sprintf(page, "%s\n", p->name);
3851 else if (mddev->clevel[0])
3852 ret = sprintf(page, "%s\n", mddev->clevel);
3853 else if (mddev->level != LEVEL_NONE)
3854 ret = sprintf(page, "%d\n", mddev->level);
3855 else
3856 ret = 0;
3857 spin_unlock(&mddev->lock);
3858 return ret;
3859 }
3860
3861 static ssize_t
3862 level_store(struct mddev *mddev, const char *buf, size_t len)
3863 {
3864 char clevel[16];
3865 ssize_t rv;
3866 size_t slen = len;
3867 struct md_personality *pers, *oldpers;
3868 long level;
3869 void *priv, *oldpriv;
3870 struct md_rdev *rdev;
3871
3872 if (slen == 0 || slen >= sizeof(clevel))
3873 return -EINVAL;
3874
3875 rv = mddev_suspend_and_lock(mddev);
3876 if (rv)
3877 return rv;
3878
3879 if (mddev->pers == NULL) {
3880 memcpy(mddev->clevel, buf, slen);
3881 if (mddev->clevel[slen-1] == '\n')
3882 slen--;
3883 mddev->clevel[slen] = 0;
3884 mddev->level = LEVEL_NONE;
3885 rv = len;
3886 goto out_unlock;
3887 }
3888 rv = -EROFS;
3889 if (!md_is_rdwr(mddev))
3890 goto out_unlock;
3891
3892 /* request to change the personality. Need to ensure:
3893 * - array is not engaged in resync/recovery/reshape
3894 * - old personality can be suspended
3895 * - new personality will access other array.
3896 */ 3897 3898 rv = -EBUSY; 3899 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3900 mddev->reshape_position != MaxSector || 3901 mddev->sysfs_active) 3902 goto out_unlock; 3903 3904 rv = -EINVAL; 3905 if (!mddev->pers->quiesce) { 3906 pr_warn("md: %s: %s does not support online personality change\n", 3907 mdname(mddev), mddev->pers->name); 3908 goto out_unlock; 3909 } 3910 3911 /* Now find the new personality */ 3912 memcpy(clevel, buf, slen); 3913 if (clevel[slen-1] == '\n') 3914 slen--; 3915 clevel[slen] = 0; 3916 if (kstrtol(clevel, 10, &level)) 3917 level = LEVEL_NONE; 3918 3919 if (request_module("md-%s", clevel) != 0) 3920 request_module("md-level-%s", clevel); 3921 spin_lock(&pers_lock); 3922 pers = find_pers(level, clevel); 3923 if (!pers || !try_module_get(pers->owner)) { 3924 spin_unlock(&pers_lock); 3925 pr_warn("md: personality %s not loaded\n", clevel); 3926 rv = -EINVAL; 3927 goto out_unlock; 3928 } 3929 spin_unlock(&pers_lock); 3930 3931 if (pers == mddev->pers) { 3932 /* Nothing to do! */ 3933 module_put(pers->owner); 3934 rv = len; 3935 goto out_unlock; 3936 } 3937 if (!pers->takeover) { 3938 module_put(pers->owner); 3939 pr_warn("md: %s: %s does not support personality takeover\n", 3940 mdname(mddev), clevel); 3941 rv = -EINVAL; 3942 goto out_unlock; 3943 } 3944 3945 rdev_for_each(rdev, mddev) 3946 rdev->new_raid_disk = rdev->raid_disk; 3947 3948 /* ->takeover must set new_* and/or delta_disks 3949 * if it succeeds, and may set them when it fails. 3950 */ 3951 priv = pers->takeover(mddev); 3952 if (IS_ERR(priv)) { 3953 mddev->new_level = mddev->level; 3954 mddev->new_layout = mddev->layout; 3955 mddev->new_chunk_sectors = mddev->chunk_sectors; 3956 mddev->raid_disks -= mddev->delta_disks; 3957 mddev->delta_disks = 0; 3958 mddev->reshape_backwards = 0; 3959 module_put(pers->owner); 3960 pr_warn("md: %s: %s would not accept array\n", 3961 mdname(mddev), clevel); 3962 rv = PTR_ERR(priv); 3963 goto out_unlock; 3964 } 3965 3966 /* Looks like we have a winner */ 3967 mddev_detach(mddev); 3968 3969 spin_lock(&mddev->lock); 3970 oldpers = mddev->pers; 3971 oldpriv = mddev->private; 3972 mddev->pers = pers; 3973 mddev->private = priv; 3974 strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 3975 mddev->level = mddev->new_level; 3976 mddev->layout = mddev->new_layout; 3977 mddev->chunk_sectors = mddev->new_chunk_sectors; 3978 mddev->delta_disks = 0; 3979 mddev->reshape_backwards = 0; 3980 mddev->degraded = 0; 3981 spin_unlock(&mddev->lock); 3982 3983 if (oldpers->sync_request == NULL && 3984 mddev->external) { 3985 /* We are converting from a no-redundancy array 3986 * to a redundancy array and metadata is managed 3987 * externally so we need to be sure that writes 3988 * won't block due to a need to transition 3989 * clean->dirty 3990 * until external management is started. 
3991 */ 3992 mddev->in_sync = 0; 3993 mddev->safemode_delay = 0; 3994 mddev->safemode = 0; 3995 } 3996 3997 oldpers->free(mddev, oldpriv); 3998 3999 if (oldpers->sync_request == NULL && 4000 pers->sync_request != NULL) { 4001 /* need to add the md_redundancy_group */ 4002 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 4003 pr_warn("md: cannot register extra attributes for %s\n", 4004 mdname(mddev)); 4005 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 4006 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); 4007 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); 4008 } 4009 if (oldpers->sync_request != NULL && 4010 pers->sync_request == NULL) { 4011 /* need to remove the md_redundancy_group */ 4012 if (mddev->to_remove == NULL) 4013 mddev->to_remove = &md_redundancy_group; 4014 } 4015 4016 module_put(oldpers->owner); 4017 4018 rdev_for_each(rdev, mddev) { 4019 if (rdev->raid_disk < 0) 4020 continue; 4021 if (rdev->new_raid_disk >= mddev->raid_disks) 4022 rdev->new_raid_disk = -1; 4023 if (rdev->new_raid_disk == rdev->raid_disk) 4024 continue; 4025 sysfs_unlink_rdev(mddev, rdev); 4026 } 4027 rdev_for_each(rdev, mddev) { 4028 if (rdev->raid_disk < 0) 4029 continue; 4030 if (rdev->new_raid_disk == rdev->raid_disk) 4031 continue; 4032 rdev->raid_disk = rdev->new_raid_disk; 4033 if (rdev->raid_disk < 0) 4034 clear_bit(In_sync, &rdev->flags); 4035 else { 4036 if (sysfs_link_rdev(mddev, rdev)) 4037 pr_warn("md: cannot register rd%d for %s after level change\n", 4038 rdev->raid_disk, mdname(mddev)); 4039 } 4040 } 4041 4042 if (pers->sync_request == NULL) { 4043 /* this is now an array without redundancy, so 4044 * it must always be in_sync 4045 */ 4046 mddev->in_sync = 1; 4047 del_timer_sync(&mddev->safemode_timer); 4048 } 4049 pers->run(mddev); 4050 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 4051 if (!mddev->thread) 4052 md_update_sb(mddev, 1); 4053 sysfs_notify_dirent_safe(mddev->sysfs_level); 4054 md_new_event(); 4055 rv = len; 4056 out_unlock: 4057 mddev_unlock_and_resume(mddev); 4058 return rv; 4059 } 4060 4061 static struct md_sysfs_entry md_level = 4062 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 4063 4064 static ssize_t 4065 new_level_show(struct mddev *mddev, char *page) 4066 { 4067 return sprintf(page, "%d\n", mddev->new_level); 4068 } 4069 4070 static ssize_t 4071 new_level_store(struct mddev *mddev, const char *buf, size_t len) 4072 { 4073 unsigned int n; 4074 int err; 4075 4076 err = kstrtouint(buf, 10, &n); 4077 if (err < 0) 4078 return err; 4079 err = mddev_lock(mddev); 4080 if (err) 4081 return err; 4082 4083 mddev->new_level = n; 4084 md_update_sb(mddev, 1); 4085 4086 mddev_unlock(mddev); 4087 return len; 4088 } 4089 static struct md_sysfs_entry md_new_level = 4090 __ATTR(new_level, 0664, new_level_show, new_level_store); 4091 4092 static ssize_t 4093 layout_show(struct mddev *mddev, char *page) 4094 { 4095 /* just a number, not meaningful for all levels */ 4096 if (mddev->reshape_position != MaxSector && 4097 mddev->layout != mddev->new_layout) 4098 return sprintf(page, "%d (%d)\n", 4099 mddev->new_layout, mddev->layout); 4100 return sprintf(page, "%d\n", mddev->layout); 4101 } 4102 4103 static ssize_t 4104 layout_store(struct mddev *mddev, const char *buf, size_t len) 4105 { 4106 unsigned int n; 4107 int err; 4108 4109 err = kstrtouint(buf, 10, &n); 4110 if (err < 0) 4111 return err; 4112 err = mddev_lock(mddev); 4113 if (err) 4114 return err; 4115 4116 if (mddev->pers) { 
4117 if (mddev->pers->check_reshape == NULL) 4118 err = -EBUSY; 4119 else if (!md_is_rdwr(mddev)) 4120 err = -EROFS; 4121 else { 4122 mddev->new_layout = n; 4123 err = mddev->pers->check_reshape(mddev); 4124 if (err) 4125 mddev->new_layout = mddev->layout; 4126 } 4127 } else { 4128 mddev->new_layout = n; 4129 if (mddev->reshape_position == MaxSector) 4130 mddev->layout = n; 4131 } 4132 mddev_unlock(mddev); 4133 return err ?: len; 4134 } 4135 static struct md_sysfs_entry md_layout = 4136 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 4137 4138 static ssize_t 4139 raid_disks_show(struct mddev *mddev, char *page) 4140 { 4141 if (mddev->raid_disks == 0) 4142 return 0; 4143 if (mddev->reshape_position != MaxSector && 4144 mddev->delta_disks != 0) 4145 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 4146 mddev->raid_disks - mddev->delta_disks); 4147 return sprintf(page, "%d\n", mddev->raid_disks); 4148 } 4149 4150 static int update_raid_disks(struct mddev *mddev, int raid_disks); 4151 4152 static ssize_t 4153 raid_disks_store(struct mddev *mddev, const char *buf, size_t len) 4154 { 4155 unsigned int n; 4156 int err; 4157 4158 err = kstrtouint(buf, 10, &n); 4159 if (err < 0) 4160 return err; 4161 4162 err = mddev_lock(mddev); 4163 if (err) 4164 return err; 4165 if (mddev->pers) 4166 err = update_raid_disks(mddev, n); 4167 else if (mddev->reshape_position != MaxSector) { 4168 struct md_rdev *rdev; 4169 int olddisks = mddev->raid_disks - mddev->delta_disks; 4170 4171 err = -EINVAL; 4172 rdev_for_each(rdev, mddev) { 4173 if (olddisks < n && 4174 rdev->data_offset < rdev->new_data_offset) 4175 goto out_unlock; 4176 if (olddisks > n && 4177 rdev->data_offset > rdev->new_data_offset) 4178 goto out_unlock; 4179 } 4180 err = 0; 4181 mddev->delta_disks = n - olddisks; 4182 mddev->raid_disks = n; 4183 mddev->reshape_backwards = (mddev->delta_disks < 0); 4184 } else 4185 mddev->raid_disks = n; 4186 out_unlock: 4187 mddev_unlock(mddev); 4188 return err ? 
err : len; 4189 } 4190 static struct md_sysfs_entry md_raid_disks = 4191 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 4192 4193 static ssize_t 4194 uuid_show(struct mddev *mddev, char *page) 4195 { 4196 return sprintf(page, "%pU\n", mddev->uuid); 4197 } 4198 static struct md_sysfs_entry md_uuid = 4199 __ATTR(uuid, S_IRUGO, uuid_show, NULL); 4200 4201 static ssize_t 4202 chunk_size_show(struct mddev *mddev, char *page) 4203 { 4204 if (mddev->reshape_position != MaxSector && 4205 mddev->chunk_sectors != mddev->new_chunk_sectors) 4206 return sprintf(page, "%d (%d)\n", 4207 mddev->new_chunk_sectors << 9, 4208 mddev->chunk_sectors << 9); 4209 return sprintf(page, "%d\n", mddev->chunk_sectors << 9); 4210 } 4211 4212 static ssize_t 4213 chunk_size_store(struct mddev *mddev, const char *buf, size_t len) 4214 { 4215 unsigned long n; 4216 int err; 4217 4218 err = kstrtoul(buf, 10, &n); 4219 if (err < 0) 4220 return err; 4221 4222 err = mddev_lock(mddev); 4223 if (err) 4224 return err; 4225 if (mddev->pers) { 4226 if (mddev->pers->check_reshape == NULL) 4227 err = -EBUSY; 4228 else if (!md_is_rdwr(mddev)) 4229 err = -EROFS; 4230 else { 4231 mddev->new_chunk_sectors = n >> 9; 4232 err = mddev->pers->check_reshape(mddev); 4233 if (err) 4234 mddev->new_chunk_sectors = mddev->chunk_sectors; 4235 } 4236 } else { 4237 mddev->new_chunk_sectors = n >> 9; 4238 if (mddev->reshape_position == MaxSector) 4239 mddev->chunk_sectors = n >> 9; 4240 } 4241 mddev_unlock(mddev); 4242 return err ?: len; 4243 } 4244 static struct md_sysfs_entry md_chunk_size = 4245 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 4246 4247 static ssize_t 4248 resync_start_show(struct mddev *mddev, char *page) 4249 { 4250 if (mddev->recovery_cp == MaxSector) 4251 return sprintf(page, "none\n"); 4252 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 4253 } 4254 4255 static ssize_t 4256 resync_start_store(struct mddev *mddev, const char *buf, size_t len) 4257 { 4258 unsigned long long n; 4259 int err; 4260 4261 if (cmd_match(buf, "none")) 4262 n = MaxSector; 4263 else { 4264 err = kstrtoull(buf, 10, &n); 4265 if (err < 0) 4266 return err; 4267 if (n != (sector_t)n) 4268 return -EINVAL; 4269 } 4270 4271 err = mddev_lock(mddev); 4272 if (err) 4273 return err; 4274 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 4275 err = -EBUSY; 4276 4277 if (!err) { 4278 mddev->recovery_cp = n; 4279 if (mddev->pers) 4280 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 4281 } 4282 mddev_unlock(mddev); 4283 return err ?: len; 4284 } 4285 static struct md_sysfs_entry md_resync_start = 4286 __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, 4287 resync_start_show, resync_start_store); 4288 4289 /* 4290 * The array state can be: 4291 * 4292 * clear 4293 * No devices, no size, no level 4294 * Equivalent to STOP_ARRAY ioctl 4295 * inactive 4296 * May have some settings, but array is not active 4297 * all IO results in error 4298 * When written, doesn't tear down array, but just stops it 4299 * suspended (not supported yet) 4300 * All IO requests will block. The array can be reconfigured. 4301 * Writing this, if accepted, will block until array is quiescent 4302 * readonly 4303 * no resync can happen. no superblocks get written. 4304 * write requests fail 4305 * read-auto 4306 * like readonly, but behaves like 'clean' on a write request. 4307 * 4308 * clean - no pending writes, but otherwise active. 
4309 * When written to inactive array, starts without resync 4310 * If a write request arrives then 4311 * if metadata is known, mark 'dirty' and switch to 'active'. 4312 * if not known, block and switch to write-pending 4313 * If written to an active array that has pending writes, then fails. 4314 * active 4315 * fully active: IO and resync can be happening. 4316 * When written to inactive array, starts with resync 4317 * 4318 * write-pending 4319 * clean, but writes are blocked waiting for 'active' to be written. 4320 * 4321 * active-idle 4322 * like active, but no writes have been seen for a while (100msec). 4323 * 4324 * broken 4325 * Array is failed. It's useful because mounted-arrays aren't stopped 4326 * when array is failed, so this state will at least alert the user that 4327 * something is wrong. 4328 */ 4329 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 4330 write_pending, active_idle, broken, bad_word}; 4331 static char *array_states[] = { 4332 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 4333 "write-pending", "active-idle", "broken", NULL }; 4334 4335 static int match_word(const char *word, char **list) 4336 { 4337 int n; 4338 for (n=0; list[n]; n++) 4339 if (cmd_match(word, list[n])) 4340 break; 4341 return n; 4342 } 4343 4344 static ssize_t 4345 array_state_show(struct mddev *mddev, char *page) 4346 { 4347 enum array_state st = inactive; 4348 4349 if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) { 4350 switch(mddev->ro) { 4351 case MD_RDONLY: 4352 st = readonly; 4353 break; 4354 case MD_AUTO_READ: 4355 st = read_auto; 4356 break; 4357 case MD_RDWR: 4358 spin_lock(&mddev->lock); 4359 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 4360 st = write_pending; 4361 else if (mddev->in_sync) 4362 st = clean; 4363 else if (mddev->safemode) 4364 st = active_idle; 4365 else 4366 st = active; 4367 spin_unlock(&mddev->lock); 4368 } 4369 4370 if (test_bit(MD_BROKEN, &mddev->flags) && st == clean) 4371 st = broken; 4372 } else { 4373 if (list_empty(&mddev->disks) && 4374 mddev->raid_disks == 0 && 4375 mddev->dev_sectors == 0) 4376 st = clear; 4377 else 4378 st = inactive; 4379 } 4380 return sprintf(page, "%s\n", array_states[st]); 4381 } 4382 4383 static int do_md_stop(struct mddev *mddev, int ro); 4384 static int md_set_readonly(struct mddev *mddev); 4385 static int restart_array(struct mddev *mddev); 4386 4387 static ssize_t 4388 array_state_store(struct mddev *mddev, const char *buf, size_t len) 4389 { 4390 int err = 0; 4391 enum array_state st = match_word(buf, array_states); 4392 4393 /* No lock dependent actions */ 4394 switch (st) { 4395 case suspended: /* not supported yet */ 4396 case write_pending: /* cannot be set */ 4397 case active_idle: /* cannot be set */ 4398 case broken: /* cannot be set */ 4399 case bad_word: 4400 return -EINVAL; 4401 case clear: 4402 case readonly: 4403 case inactive: 4404 case read_auto: 4405 if (!mddev->pers || !md_is_rdwr(mddev)) 4406 break; 4407 /* write sysfs will not open mddev and opener should be 0 */ 4408 err = mddev_set_closing_and_sync_blockdev(mddev, 0); 4409 if (err) 4410 return err; 4411 break; 4412 default: 4413 break; 4414 } 4415 4416 if (mddev->pers && (st == active || st == clean) && 4417 mddev->ro != MD_RDONLY) { 4418 /* don't take reconfig_mutex when toggling between 4419 * clean and active 4420 */ 4421 spin_lock(&mddev->lock); 4422 if (st == active) { 4423 restart_array(mddev); 4424 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4425 
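/*
 * Clearing MD_SB_CHANGE_PENDING and the wake-ups below release any
 * writer parked in the write-pending state described above, i.e.
 * waiting for externally managed metadata to be marked 'active'.
 */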
md_wakeup_thread(mddev->thread); 4426 wake_up(&mddev->sb_wait); 4427 } else /* st == clean */ { 4428 restart_array(mddev); 4429 if (!set_in_sync(mddev)) 4430 err = -EBUSY; 4431 } 4432 if (!err) 4433 sysfs_notify_dirent_safe(mddev->sysfs_state); 4434 spin_unlock(&mddev->lock); 4435 return err ?: len; 4436 } 4437 err = mddev_lock(mddev); 4438 if (err) 4439 return err; 4440 4441 switch (st) { 4442 case inactive: 4443 /* stop an active array, return 0 otherwise */ 4444 if (mddev->pers) 4445 err = do_md_stop(mddev, 2); 4446 break; 4447 case clear: 4448 err = do_md_stop(mddev, 0); 4449 break; 4450 case readonly: 4451 if (mddev->pers) 4452 err = md_set_readonly(mddev); 4453 else { 4454 mddev->ro = MD_RDONLY; 4455 set_disk_ro(mddev->gendisk, 1); 4456 err = do_md_run(mddev); 4457 } 4458 break; 4459 case read_auto: 4460 if (mddev->pers) { 4461 if (md_is_rdwr(mddev)) 4462 err = md_set_readonly(mddev); 4463 else if (mddev->ro == MD_RDONLY) 4464 err = restart_array(mddev); 4465 if (err == 0) { 4466 mddev->ro = MD_AUTO_READ; 4467 set_disk_ro(mddev->gendisk, 0); 4468 } 4469 } else { 4470 mddev->ro = MD_AUTO_READ; 4471 err = do_md_run(mddev); 4472 } 4473 break; 4474 case clean: 4475 if (mddev->pers) { 4476 err = restart_array(mddev); 4477 if (err) 4478 break; 4479 spin_lock(&mddev->lock); 4480 if (!set_in_sync(mddev)) 4481 err = -EBUSY; 4482 spin_unlock(&mddev->lock); 4483 } else 4484 err = -EINVAL; 4485 break; 4486 case active: 4487 if (mddev->pers) { 4488 err = restart_array(mddev); 4489 if (err) 4490 break; 4491 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4492 wake_up(&mddev->sb_wait); 4493 err = 0; 4494 } else { 4495 mddev->ro = MD_RDWR; 4496 set_disk_ro(mddev->gendisk, 0); 4497 err = do_md_run(mddev); 4498 } 4499 break; 4500 default: 4501 err = -EINVAL; 4502 break; 4503 } 4504 4505 if (!err) { 4506 if (mddev->hold_active == UNTIL_IOCTL) 4507 mddev->hold_active = 0; 4508 sysfs_notify_dirent_safe(mddev->sysfs_state); 4509 } 4510 mddev_unlock(mddev); 4511 4512 if (st == readonly || st == read_auto || st == inactive || 4513 (err && st == clear)) 4514 clear_bit(MD_CLOSING, &mddev->flags); 4515 4516 return err ?: len; 4517 } 4518 static struct md_sysfs_entry md_array_state = 4519 __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 4520 4521 static ssize_t 4522 max_corrected_read_errors_show(struct mddev *mddev, char *page) { 4523 return sprintf(page, "%d\n", 4524 atomic_read(&mddev->max_corr_read_errors)); 4525 } 4526 4527 static ssize_t 4528 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) 4529 { 4530 unsigned int n; 4531 int rv; 4532 4533 rv = kstrtouint(buf, 10, &n); 4534 if (rv < 0) 4535 return rv; 4536 if (n > INT_MAX) 4537 return -EINVAL; 4538 atomic_set(&mddev->max_corr_read_errors, n); 4539 return len; 4540 } 4541 4542 static struct md_sysfs_entry max_corr_read_errors = 4543 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, 4544 max_corrected_read_errors_store); 4545 4546 static ssize_t 4547 null_show(struct mddev *mddev, char *page) 4548 { 4549 return -EINVAL; 4550 } 4551 4552 static ssize_t 4553 new_dev_store(struct mddev *mddev, const char *buf, size_t len) 4554 { 4555 /* buf must be %d:%d\n? giving major and minor numbers */ 4556 /* The new device is added to the array. 4557 * If the array has a persistent superblock, we read the 4558 * superblock to initialise info and check validity. 4559 * Otherwise, only checking done is that in bind_rdev_to_array, 4560 * which mainly checks size. 
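* For example, writing "8:16" asks for the device with major 8 and
* minor 16 (typically /dev/sdb) to be added to this array.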
4561 */ 4562 char *e; 4563 int major = simple_strtoul(buf, &e, 10); 4564 int minor; 4565 dev_t dev; 4566 struct md_rdev *rdev; 4567 int err; 4568 4569 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 4570 return -EINVAL; 4571 minor = simple_strtoul(e+1, &e, 10); 4572 if (*e && *e != '\n') 4573 return -EINVAL; 4574 dev = MKDEV(major, minor); 4575 if (major != MAJOR(dev) || 4576 minor != MINOR(dev)) 4577 return -EOVERFLOW; 4578 4579 err = mddev_suspend_and_lock(mddev); 4580 if (err) 4581 return err; 4582 if (mddev->persistent) { 4583 rdev = md_import_device(dev, mddev->major_version, 4584 mddev->minor_version); 4585 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 4586 struct md_rdev *rdev0 4587 = list_entry(mddev->disks.next, 4588 struct md_rdev, same_set); 4589 err = super_types[mddev->major_version] 4590 .load_super(rdev, rdev0, mddev->minor_version); 4591 if (err < 0) 4592 goto out; 4593 } 4594 } else if (mddev->external) 4595 rdev = md_import_device(dev, -2, -1); 4596 else 4597 rdev = md_import_device(dev, -1, -1); 4598 4599 if (IS_ERR(rdev)) { 4600 mddev_unlock_and_resume(mddev); 4601 return PTR_ERR(rdev); 4602 } 4603 err = bind_rdev_to_array(rdev, mddev); 4604 out: 4605 if (err) 4606 export_rdev(rdev, mddev); 4607 mddev_unlock_and_resume(mddev); 4608 if (!err) 4609 md_new_event(); 4610 return err ? err : len; 4611 } 4612 4613 static struct md_sysfs_entry md_new_device = 4614 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 4615 4616 static ssize_t 4617 bitmap_store(struct mddev *mddev, const char *buf, size_t len) 4618 { 4619 char *end; 4620 unsigned long chunk, end_chunk; 4621 int err; 4622 4623 err = mddev_lock(mddev); 4624 if (err) 4625 return err; 4626 if (!mddev->bitmap) 4627 goto out; 4628 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ 4629 while (*buf) { 4630 chunk = end_chunk = simple_strtoul(buf, &end, 0); 4631 if (buf == end) 4632 break; 4633 4634 if (*end == '-') { /* range */ 4635 buf = end + 1; 4636 end_chunk = simple_strtoul(buf, &end, 0); 4637 if (buf == end) 4638 break; 4639 } 4640 4641 if (*end && !isspace(*end)) 4642 break; 4643 4644 mddev->bitmap_ops->dirty_bits(mddev, chunk, end_chunk); 4645 buf = skip_spaces(end); 4646 } 4647 mddev->bitmap_ops->unplug(mddev, true); /* flush the bits to disk */ 4648 out: 4649 mddev_unlock(mddev); 4650 return len; 4651 } 4652 4653 static struct md_sysfs_entry md_bitmap = 4654 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 4655 4656 static ssize_t 4657 size_show(struct mddev *mddev, char *page) 4658 { 4659 return sprintf(page, "%llu\n", 4660 (unsigned long long)mddev->dev_sectors / 2); 4661 } 4662 4663 static int update_size(struct mddev *mddev, sector_t num_sectors); 4664 4665 static ssize_t 4666 size_store(struct mddev *mddev, const char *buf, size_t len) 4667 { 4668 /* If array is inactive, we can reduce the component size, but 4669 * not increase it (except from 0). 4670 * If array is active, we can try an on-line resize 4671 */ 4672 sector_t sectors; 4673 int err = strict_blocks_to_sectors(buf, §ors); 4674 4675 if (err < 0) 4676 return err; 4677 err = mddev_lock(mddev); 4678 if (err) 4679 return err; 4680 if (mddev->pers) { 4681 err = update_size(mddev, sectors); 4682 if (err == 0) 4683 md_update_sb(mddev, 1); 4684 } else { 4685 if (mddev->dev_sectors == 0 || 4686 mddev->dev_sectors > sectors) 4687 mddev->dev_sectors = sectors; 4688 else 4689 err = -ENOSPC; 4690 } 4691 mddev_unlock(mddev); 4692 return err ? 
err : len; 4693 } 4694 4695 static struct md_sysfs_entry md_size = 4696 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 4697 4698 /* Metadata version. 4699 * This is one of 4700 * 'none' for arrays with no metadata (good luck...) 4701 * 'external' for arrays with externally managed metadata, 4702 * or N.M for internally known formats 4703 */ 4704 static ssize_t 4705 metadata_show(struct mddev *mddev, char *page) 4706 { 4707 if (mddev->persistent) 4708 return sprintf(page, "%d.%d\n", 4709 mddev->major_version, mddev->minor_version); 4710 else if (mddev->external) 4711 return sprintf(page, "external:%s\n", mddev->metadata_type); 4712 else 4713 return sprintf(page, "none\n"); 4714 } 4715 4716 static ssize_t 4717 metadata_store(struct mddev *mddev, const char *buf, size_t len) 4718 { 4719 int major, minor; 4720 char *e; 4721 int err; 4722 /* Changing the details of 'external' metadata is 4723 * always permitted. Otherwise there must be 4724 * no devices attached to the array. 4725 */ 4726 4727 err = mddev_lock(mddev); 4728 if (err) 4729 return err; 4730 err = -EBUSY; 4731 if (mddev->external && strncmp(buf, "external:", 9) == 0) 4732 ; 4733 else if (!list_empty(&mddev->disks)) 4734 goto out_unlock; 4735 4736 err = 0; 4737 if (cmd_match(buf, "none")) { 4738 mddev->persistent = 0; 4739 mddev->external = 0; 4740 mddev->major_version = 0; 4741 mddev->minor_version = 90; 4742 goto out_unlock; 4743 } 4744 if (strncmp(buf, "external:", 9) == 0) { 4745 size_t namelen = len-9; 4746 if (namelen >= sizeof(mddev->metadata_type)) 4747 namelen = sizeof(mddev->metadata_type)-1; 4748 memcpy(mddev->metadata_type, buf+9, namelen); 4749 mddev->metadata_type[namelen] = 0; 4750 if (namelen && mddev->metadata_type[namelen-1] == '\n') 4751 mddev->metadata_type[--namelen] = 0; 4752 mddev->persistent = 0; 4753 mddev->external = 1; 4754 mddev->major_version = 0; 4755 mddev->minor_version = 90; 4756 goto out_unlock; 4757 } 4758 major = simple_strtoul(buf, &e, 10); 4759 err = -EINVAL; 4760 if (e==buf || *e != '.') 4761 goto out_unlock; 4762 buf = e+1; 4763 minor = simple_strtoul(buf, &e, 10); 4764 if (e==buf || (*e && *e != '\n') ) 4765 goto out_unlock; 4766 err = -ENOENT; 4767 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 4768 goto out_unlock; 4769 mddev->major_version = major; 4770 mddev->minor_version = minor; 4771 mddev->persistent = 1; 4772 mddev->external = 0; 4773 err = 0; 4774 out_unlock: 4775 mddev_unlock(mddev); 4776 return err ?: len; 4777 } 4778 4779 static struct md_sysfs_entry md_metadata = 4780 __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 4781 4782 enum sync_action md_sync_action(struct mddev *mddev) 4783 { 4784 unsigned long recovery = mddev->recovery; 4785 4786 /* 4787 * frozen has the highest priority, means running sync_thread will be 4788 * stopped immediately, and no new sync_thread can start. 4789 */ 4790 if (test_bit(MD_RECOVERY_FROZEN, &recovery)) 4791 return ACTION_FROZEN; 4792 4793 /* 4794 * read-only array can't register sync_thread, and it can only 4795 * add/remove spares. 4796 */ 4797 if (!md_is_rdwr(mddev)) 4798 return ACTION_IDLE; 4799 4800 /* 4801 * idle means no sync_thread is running, and no new sync_thread is 4802 * requested. 
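* In other words, neither MD_RECOVERY_RUNNING nor MD_RECOVERY_NEEDED
* is set, which is exactly what the test below checks.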
4803 */ 4804 if (!test_bit(MD_RECOVERY_RUNNING, &recovery) && 4805 !test_bit(MD_RECOVERY_NEEDED, &recovery)) 4806 return ACTION_IDLE; 4807 4808 if (test_bit(MD_RECOVERY_RESHAPE, &recovery) || 4809 mddev->reshape_position != MaxSector) 4810 return ACTION_RESHAPE; 4811 4812 if (test_bit(MD_RECOVERY_RECOVER, &recovery)) 4813 return ACTION_RECOVER; 4814 4815 if (test_bit(MD_RECOVERY_SYNC, &recovery)) { 4816 /* 4817 * MD_RECOVERY_CHECK must be paired with 4818 * MD_RECOVERY_REQUESTED. 4819 */ 4820 if (test_bit(MD_RECOVERY_CHECK, &recovery)) 4821 return ACTION_CHECK; 4822 if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) 4823 return ACTION_REPAIR; 4824 return ACTION_RESYNC; 4825 } 4826 4827 /* 4828 * MD_RECOVERY_NEEDED or MD_RECOVERY_RUNNING is set, however, no 4829 * sync_action is specified. 4830 */ 4831 return ACTION_IDLE; 4832 } 4833 4834 enum sync_action md_sync_action_by_name(const char *page) 4835 { 4836 enum sync_action action; 4837 4838 for (action = 0; action < NR_SYNC_ACTIONS; ++action) { 4839 if (cmd_match(page, action_name[action])) 4840 return action; 4841 } 4842 4843 return NR_SYNC_ACTIONS; 4844 } 4845 4846 const char *md_sync_action_name(enum sync_action action) 4847 { 4848 return action_name[action]; 4849 } 4850 4851 static ssize_t 4852 action_show(struct mddev *mddev, char *page) 4853 { 4854 enum sync_action action = md_sync_action(mddev); 4855 4856 return sprintf(page, "%s\n", md_sync_action_name(action)); 4857 } 4858 4859 /** 4860 * stop_sync_thread() - wait for sync_thread to stop if it's running. 4861 * @mddev: the array. 4862 * @locked: if set, reconfig_mutex will still be held after this function 4863 * return; if not set, reconfig_mutex will be released after this 4864 * function return. 4865 */ 4866 static void stop_sync_thread(struct mddev *mddev, bool locked) 4867 { 4868 int sync_seq = atomic_read(&mddev->sync_seq); 4869 4870 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 4871 if (!locked) 4872 mddev_unlock(mddev); 4873 return; 4874 } 4875 4876 mddev_unlock(mddev); 4877 4878 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4879 /* 4880 * Thread might be blocked waiting for metadata update which will now 4881 * never happen 4882 */ 4883 md_wakeup_thread_directly(mddev->sync_thread); 4884 if (work_pending(&mddev->sync_work)) 4885 flush_work(&mddev->sync_work); 4886 4887 wait_event(resync_wait, 4888 !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 4889 (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery) && 4890 sync_seq != atomic_read(&mddev->sync_seq))); 4891 4892 if (locked) 4893 mddev_lock_nointr(mddev); 4894 } 4895 4896 void md_idle_sync_thread(struct mddev *mddev) 4897 { 4898 lockdep_assert_held(&mddev->reconfig_mutex); 4899 4900 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4901 stop_sync_thread(mddev, true); 4902 } 4903 EXPORT_SYMBOL_GPL(md_idle_sync_thread); 4904 4905 void md_frozen_sync_thread(struct mddev *mddev) 4906 { 4907 lockdep_assert_held(&mddev->reconfig_mutex); 4908 4909 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4910 stop_sync_thread(mddev, true); 4911 } 4912 EXPORT_SYMBOL_GPL(md_frozen_sync_thread); 4913 4914 void md_unfrozen_sync_thread(struct mddev *mddev) 4915 { 4916 lockdep_assert_held(&mddev->reconfig_mutex); 4917 4918 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4919 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4920 md_wakeup_thread(mddev->thread); 4921 sysfs_notify_dirent_safe(mddev->sysfs_action); 4922 } 4923 EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread); 4924 4925 static int mddev_start_reshape(struct mddev *mddev) 4926 { 4927 
int ret; 4928 4929 if (mddev->pers->start_reshape == NULL) 4930 return -EINVAL; 4931 4932 if (mddev->reshape_position == MaxSector || 4933 mddev->pers->check_reshape == NULL || 4934 mddev->pers->check_reshape(mddev)) { 4935 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4936 ret = mddev->pers->start_reshape(mddev); 4937 if (ret) 4938 return ret; 4939 } else { 4940 /* 4941 * If reshape is still in progress, and md_check_recovery() can 4942 * continue to reshape, don't restart reshape because data can 4943 * be corrupted for raid456. 4944 */ 4945 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4946 } 4947 4948 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 4949 return 0; 4950 } 4951 4952 static ssize_t 4953 action_store(struct mddev *mddev, const char *page, size_t len) 4954 { 4955 int ret; 4956 enum sync_action action; 4957 4958 if (!mddev->pers || !mddev->pers->sync_request) 4959 return -EINVAL; 4960 4961 retry: 4962 if (work_busy(&mddev->sync_work)) 4963 flush_work(&mddev->sync_work); 4964 4965 ret = mddev_lock(mddev); 4966 if (ret) 4967 return ret; 4968 4969 if (work_busy(&mddev->sync_work)) { 4970 mddev_unlock(mddev); 4971 goto retry; 4972 } 4973 4974 action = md_sync_action_by_name(page); 4975 4976 /* TODO: mdadm rely on "idle" to start sync_thread. */ 4977 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 4978 switch (action) { 4979 case ACTION_FROZEN: 4980 md_frozen_sync_thread(mddev); 4981 ret = len; 4982 goto out; 4983 case ACTION_IDLE: 4984 md_idle_sync_thread(mddev); 4985 break; 4986 case ACTION_RESHAPE: 4987 case ACTION_RECOVER: 4988 case ACTION_CHECK: 4989 case ACTION_REPAIR: 4990 case ACTION_RESYNC: 4991 ret = -EBUSY; 4992 goto out; 4993 default: 4994 ret = -EINVAL; 4995 goto out; 4996 } 4997 } else { 4998 switch (action) { 4999 case ACTION_FROZEN: 5000 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5001 ret = len; 5002 goto out; 5003 case ACTION_RESHAPE: 5004 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5005 ret = mddev_start_reshape(mddev); 5006 if (ret) 5007 goto out; 5008 break; 5009 case ACTION_RECOVER: 5010 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5011 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 5012 break; 5013 case ACTION_CHECK: 5014 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5015 fallthrough; 5016 case ACTION_REPAIR: 5017 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 5018 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5019 fallthrough; 5020 case ACTION_RESYNC: 5021 case ACTION_IDLE: 5022 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5023 break; 5024 default: 5025 ret = -EINVAL; 5026 goto out; 5027 } 5028 } 5029 5030 if (mddev->ro == MD_AUTO_READ) { 5031 /* A write to sync_action is enough to justify 5032 * canceling read-auto mode 5033 */ 5034 mddev->ro = MD_RDWR; 5035 md_wakeup_thread(mddev->sync_thread); 5036 } 5037 5038 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5039 md_wakeup_thread(mddev->thread); 5040 sysfs_notify_dirent_safe(mddev->sysfs_action); 5041 ret = len; 5042 5043 out: 5044 mddev_unlock(mddev); 5045 return ret; 5046 } 5047 5048 static struct md_sysfs_entry md_scan_mode = 5049 __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 5050 5051 static ssize_t 5052 last_sync_action_show(struct mddev *mddev, char *page) 5053 { 5054 return sprintf(page, "%s\n", 5055 md_sync_action_name(mddev->last_sync_action)); 5056 } 5057 5058 static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); 5059 5060 static ssize_t 5061 mismatch_cnt_show(struct mddev *mddev, char *page) 5062 { 5063 return 
sprintf(page, "%llu\n", 5064 (unsigned long long) 5065 atomic64_read(&mddev->resync_mismatches)); 5066 } 5067 5068 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 5069 5070 static ssize_t 5071 sync_min_show(struct mddev *mddev, char *page) 5072 { 5073 return sprintf(page, "%d (%s)\n", speed_min(mddev), 5074 mddev->sync_speed_min ? "local": "system"); 5075 } 5076 5077 static ssize_t 5078 sync_min_store(struct mddev *mddev, const char *buf, size_t len) 5079 { 5080 unsigned int min; 5081 int rv; 5082 5083 if (strncmp(buf, "system", 6)==0) { 5084 min = 0; 5085 } else { 5086 rv = kstrtouint(buf, 10, &min); 5087 if (rv < 0) 5088 return rv; 5089 if (min == 0) 5090 return -EINVAL; 5091 } 5092 mddev->sync_speed_min = min; 5093 return len; 5094 } 5095 5096 static struct md_sysfs_entry md_sync_min = 5097 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 5098 5099 static ssize_t 5100 sync_max_show(struct mddev *mddev, char *page) 5101 { 5102 return sprintf(page, "%d (%s)\n", speed_max(mddev), 5103 mddev->sync_speed_max ? "local": "system"); 5104 } 5105 5106 static ssize_t 5107 sync_max_store(struct mddev *mddev, const char *buf, size_t len) 5108 { 5109 unsigned int max; 5110 int rv; 5111 5112 if (strncmp(buf, "system", 6)==0) { 5113 max = 0; 5114 } else { 5115 rv = kstrtouint(buf, 10, &max); 5116 if (rv < 0) 5117 return rv; 5118 if (max == 0) 5119 return -EINVAL; 5120 } 5121 mddev->sync_speed_max = max; 5122 return len; 5123 } 5124 5125 static struct md_sysfs_entry md_sync_max = 5126 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 5127 5128 static ssize_t 5129 degraded_show(struct mddev *mddev, char *page) 5130 { 5131 return sprintf(page, "%d\n", mddev->degraded); 5132 } 5133 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 5134 5135 static ssize_t 5136 sync_force_parallel_show(struct mddev *mddev, char *page) 5137 { 5138 return sprintf(page, "%d\n", mddev->parallel_resync); 5139 } 5140 5141 static ssize_t 5142 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) 5143 { 5144 long n; 5145 5146 if (kstrtol(buf, 10, &n)) 5147 return -EINVAL; 5148 5149 if (n != 0 && n != 1) 5150 return -EINVAL; 5151 5152 mddev->parallel_resync = n; 5153 5154 if (mddev->sync_thread) 5155 wake_up(&resync_wait); 5156 5157 return len; 5158 } 5159 5160 /* force parallel resync, even with shared block devices */ 5161 static struct md_sysfs_entry md_sync_force_parallel = 5162 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 5163 sync_force_parallel_show, sync_force_parallel_store); 5164 5165 static ssize_t 5166 sync_speed_show(struct mddev *mddev, char *page) 5167 { 5168 unsigned long resync, dt, db; 5169 if (mddev->curr_resync == MD_RESYNC_NONE) 5170 return sprintf(page, "none\n"); 5171 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 5172 dt = (jiffies - mddev->resync_mark) / HZ; 5173 if (!dt) dt++; 5174 db = resync - mddev->resync_mark_cnt; 5175 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 5176 } 5177 5178 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 5179 5180 static ssize_t 5181 sync_completed_show(struct mddev *mddev, char *page) 5182 { 5183 unsigned long long max_sectors, resync; 5184 5185 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5186 return sprintf(page, "none\n"); 5187 5188 if (mddev->curr_resync == MD_RESYNC_YIELDED || 5189 mddev->curr_resync == MD_RESYNC_DELAYED) 5190 return sprintf(page, "delayed\n"); 5191 5192 if (test_bit(MD_RECOVERY_SYNC, 
&mddev->recovery) || 5193 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5194 max_sectors = mddev->resync_max_sectors; 5195 else 5196 max_sectors = mddev->dev_sectors; 5197 5198 resync = mddev->curr_resync_completed; 5199 return sprintf(page, "%llu / %llu\n", resync, max_sectors); 5200 } 5201 5202 static struct md_sysfs_entry md_sync_completed = 5203 __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL); 5204 5205 static ssize_t 5206 min_sync_show(struct mddev *mddev, char *page) 5207 { 5208 return sprintf(page, "%llu\n", 5209 (unsigned long long)mddev->resync_min); 5210 } 5211 static ssize_t 5212 min_sync_store(struct mddev *mddev, const char *buf, size_t len) 5213 { 5214 unsigned long long min; 5215 int err; 5216 5217 if (kstrtoull(buf, 10, &min)) 5218 return -EINVAL; 5219 5220 spin_lock(&mddev->lock); 5221 err = -EINVAL; 5222 if (min > mddev->resync_max) 5223 goto out_unlock; 5224 5225 err = -EBUSY; 5226 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5227 goto out_unlock; 5228 5229 /* Round down to multiple of 4K for safety */ 5230 mddev->resync_min = round_down(min, 8); 5231 err = 0; 5232 5233 out_unlock: 5234 spin_unlock(&mddev->lock); 5235 return err ?: len; 5236 } 5237 5238 static struct md_sysfs_entry md_min_sync = 5239 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 5240 5241 static ssize_t 5242 max_sync_show(struct mddev *mddev, char *page) 5243 { 5244 if (mddev->resync_max == MaxSector) 5245 return sprintf(page, "max\n"); 5246 else 5247 return sprintf(page, "%llu\n", 5248 (unsigned long long)mddev->resync_max); 5249 } 5250 static ssize_t 5251 max_sync_store(struct mddev *mddev, const char *buf, size_t len) 5252 { 5253 int err; 5254 spin_lock(&mddev->lock); 5255 if (strncmp(buf, "max", 3) == 0) 5256 mddev->resync_max = MaxSector; 5257 else { 5258 unsigned long long max; 5259 int chunk; 5260 5261 err = -EINVAL; 5262 if (kstrtoull(buf, 10, &max)) 5263 goto out_unlock; 5264 if (max < mddev->resync_min) 5265 goto out_unlock; 5266 5267 err = -EBUSY; 5268 if (max < mddev->resync_max && md_is_rdwr(mddev) && 5269 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5270 goto out_unlock; 5271 5272 /* Must be a multiple of chunk_size */ 5273 chunk = mddev->chunk_sectors; 5274 if (chunk) { 5275 sector_t temp = max; 5276 5277 err = -EINVAL; 5278 if (sector_div(temp, chunk)) 5279 goto out_unlock; 5280 } 5281 mddev->resync_max = max; 5282 } 5283 wake_up(&mddev->recovery_wait); 5284 err = 0; 5285 out_unlock: 5286 spin_unlock(&mddev->lock); 5287 return err ?: len; 5288 } 5289 5290 static struct md_sysfs_entry md_max_sync = 5291 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 5292 5293 static ssize_t 5294 suspend_lo_show(struct mddev *mddev, char *page) 5295 { 5296 return sprintf(page, "%llu\n", 5297 (unsigned long long)READ_ONCE(mddev->suspend_lo)); 5298 } 5299 5300 static ssize_t 5301 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) 5302 { 5303 unsigned long long new; 5304 int err; 5305 5306 err = kstrtoull(buf, 10, &new); 5307 if (err < 0) 5308 return err; 5309 if (new != (sector_t)new) 5310 return -EINVAL; 5311 5312 err = mddev_suspend(mddev, true); 5313 if (err) 5314 return err; 5315 5316 WRITE_ONCE(mddev->suspend_lo, new); 5317 mddev_resume(mddev); 5318 5319 return len; 5320 } 5321 static struct md_sysfs_entry md_suspend_lo = 5322 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 5323 5324 static ssize_t 5325 suspend_hi_show(struct mddev *mddev, char *page) 5326 { 5327 return sprintf(page, 
"%llu\n", 5328 (unsigned long long)READ_ONCE(mddev->suspend_hi)); 5329 } 5330 5331 static ssize_t 5332 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) 5333 { 5334 unsigned long long new; 5335 int err; 5336 5337 err = kstrtoull(buf, 10, &new); 5338 if (err < 0) 5339 return err; 5340 if (new != (sector_t)new) 5341 return -EINVAL; 5342 5343 err = mddev_suspend(mddev, true); 5344 if (err) 5345 return err; 5346 5347 WRITE_ONCE(mddev->suspend_hi, new); 5348 mddev_resume(mddev); 5349 5350 return len; 5351 } 5352 static struct md_sysfs_entry md_suspend_hi = 5353 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 5354 5355 static ssize_t 5356 reshape_position_show(struct mddev *mddev, char *page) 5357 { 5358 if (mddev->reshape_position != MaxSector) 5359 return sprintf(page, "%llu\n", 5360 (unsigned long long)mddev->reshape_position); 5361 strcpy(page, "none\n"); 5362 return 5; 5363 } 5364 5365 static ssize_t 5366 reshape_position_store(struct mddev *mddev, const char *buf, size_t len) 5367 { 5368 struct md_rdev *rdev; 5369 unsigned long long new; 5370 int err; 5371 5372 err = kstrtoull(buf, 10, &new); 5373 if (err < 0) 5374 return err; 5375 if (new != (sector_t)new) 5376 return -EINVAL; 5377 err = mddev_lock(mddev); 5378 if (err) 5379 return err; 5380 err = -EBUSY; 5381 if (mddev->pers) 5382 goto unlock; 5383 mddev->reshape_position = new; 5384 mddev->delta_disks = 0; 5385 mddev->reshape_backwards = 0; 5386 mddev->new_level = mddev->level; 5387 mddev->new_layout = mddev->layout; 5388 mddev->new_chunk_sectors = mddev->chunk_sectors; 5389 rdev_for_each(rdev, mddev) 5390 rdev->new_data_offset = rdev->data_offset; 5391 err = 0; 5392 unlock: 5393 mddev_unlock(mddev); 5394 return err ?: len; 5395 } 5396 5397 static struct md_sysfs_entry md_reshape_position = 5398 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 5399 reshape_position_store); 5400 5401 static ssize_t 5402 reshape_direction_show(struct mddev *mddev, char *page) 5403 { 5404 return sprintf(page, "%s\n", 5405 mddev->reshape_backwards ? 
"backwards" : "forwards"); 5406 } 5407 5408 static ssize_t 5409 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) 5410 { 5411 int backwards = 0; 5412 int err; 5413 5414 if (cmd_match(buf, "forwards")) 5415 backwards = 0; 5416 else if (cmd_match(buf, "backwards")) 5417 backwards = 1; 5418 else 5419 return -EINVAL; 5420 if (mddev->reshape_backwards == backwards) 5421 return len; 5422 5423 err = mddev_lock(mddev); 5424 if (err) 5425 return err; 5426 /* check if we are allowed to change */ 5427 if (mddev->delta_disks) 5428 err = -EBUSY; 5429 else if (mddev->persistent && 5430 mddev->major_version == 0) 5431 err = -EINVAL; 5432 else 5433 mddev->reshape_backwards = backwards; 5434 mddev_unlock(mddev); 5435 return err ?: len; 5436 } 5437 5438 static struct md_sysfs_entry md_reshape_direction = 5439 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, 5440 reshape_direction_store); 5441 5442 static ssize_t 5443 array_size_show(struct mddev *mddev, char *page) 5444 { 5445 if (mddev->external_size) 5446 return sprintf(page, "%llu\n", 5447 (unsigned long long)mddev->array_sectors/2); 5448 else 5449 return sprintf(page, "default\n"); 5450 } 5451 5452 static ssize_t 5453 array_size_store(struct mddev *mddev, const char *buf, size_t len) 5454 { 5455 sector_t sectors; 5456 int err; 5457 5458 err = mddev_lock(mddev); 5459 if (err) 5460 return err; 5461 5462 /* cluster raid doesn't support change array_sectors */ 5463 if (mddev_is_clustered(mddev)) { 5464 mddev_unlock(mddev); 5465 return -EINVAL; 5466 } 5467 5468 if (strncmp(buf, "default", 7) == 0) { 5469 if (mddev->pers) 5470 sectors = mddev->pers->size(mddev, 0, 0); 5471 else 5472 sectors = mddev->array_sectors; 5473 5474 mddev->external_size = 0; 5475 } else { 5476 if (strict_blocks_to_sectors(buf, §ors) < 0) 5477 err = -EINVAL; 5478 else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) 5479 err = -E2BIG; 5480 else 5481 mddev->external_size = 1; 5482 } 5483 5484 if (!err) { 5485 mddev->array_sectors = sectors; 5486 if (mddev->pers) 5487 set_capacity_and_notify(mddev->gendisk, 5488 mddev->array_sectors); 5489 } 5490 mddev_unlock(mddev); 5491 return err ?: len; 5492 } 5493 5494 static struct md_sysfs_entry md_array_size = 5495 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, 5496 array_size_store); 5497 5498 static ssize_t 5499 consistency_policy_show(struct mddev *mddev, char *page) 5500 { 5501 int ret; 5502 5503 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 5504 ret = sprintf(page, "journal\n"); 5505 } else if (test_bit(MD_HAS_PPL, &mddev->flags)) { 5506 ret = sprintf(page, "ppl\n"); 5507 } else if (mddev->bitmap) { 5508 ret = sprintf(page, "bitmap\n"); 5509 } else if (mddev->pers) { 5510 if (mddev->pers->sync_request) 5511 ret = sprintf(page, "resync\n"); 5512 else 5513 ret = sprintf(page, "none\n"); 5514 } else { 5515 ret = sprintf(page, "unknown\n"); 5516 } 5517 5518 return ret; 5519 } 5520 5521 static ssize_t 5522 consistency_policy_store(struct mddev *mddev, const char *buf, size_t len) 5523 { 5524 int err = 0; 5525 5526 if (mddev->pers) { 5527 if (mddev->pers->change_consistency_policy) 5528 err = mddev->pers->change_consistency_policy(mddev, buf); 5529 else 5530 err = -EBUSY; 5531 } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) { 5532 set_bit(MD_HAS_PPL, &mddev->flags); 5533 } else { 5534 err = -EINVAL; 5535 } 5536 5537 return err ? 
err : len; 5538 } 5539 5540 static struct md_sysfs_entry md_consistency_policy = 5541 __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show, 5542 consistency_policy_store); 5543 5544 static ssize_t fail_last_dev_show(struct mddev *mddev, char *page) 5545 { 5546 return sprintf(page, "%d\n", mddev->fail_last_dev); 5547 } 5548 5549 /* 5550 * Setting fail_last_dev to true to allow last device to be forcibly removed 5551 * from RAID1/RAID10. 5552 */ 5553 static ssize_t 5554 fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len) 5555 { 5556 int ret; 5557 bool value; 5558 5559 ret = kstrtobool(buf, &value); 5560 if (ret) 5561 return ret; 5562 5563 if (value != mddev->fail_last_dev) 5564 mddev->fail_last_dev = value; 5565 5566 return len; 5567 } 5568 static struct md_sysfs_entry md_fail_last_dev = 5569 __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, 5570 fail_last_dev_store); 5571 5572 static ssize_t serialize_policy_show(struct mddev *mddev, char *page) 5573 { 5574 if (mddev->pers == NULL || (mddev->pers->level != 1)) 5575 return sprintf(page, "n/a\n"); 5576 else 5577 return sprintf(page, "%d\n", mddev->serialize_policy); 5578 } 5579 5580 /* 5581 * Setting serialize_policy to true to enforce write IO is not reordered 5582 * for raid1. 5583 */ 5584 static ssize_t 5585 serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) 5586 { 5587 int err; 5588 bool value; 5589 5590 err = kstrtobool(buf, &value); 5591 if (err) 5592 return err; 5593 5594 if (value == mddev->serialize_policy) 5595 return len; 5596 5597 err = mddev_suspend_and_lock(mddev); 5598 if (err) 5599 return err; 5600 if (mddev->pers == NULL || (mddev->pers->level != 1)) { 5601 pr_err("md: serialize_policy is only effective for raid1\n"); 5602 err = -EINVAL; 5603 goto unlock; 5604 } 5605 5606 if (value) 5607 mddev_create_serial_pool(mddev, NULL); 5608 else 5609 mddev_destroy_serial_pool(mddev, NULL); 5610 mddev->serialize_policy = value; 5611 unlock: 5612 mddev_unlock_and_resume(mddev); 5613 return err ?: len; 5614 } 5615 5616 static struct md_sysfs_entry md_serialize_policy = 5617 __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, 5618 serialize_policy_store); 5619 5620 5621 static struct attribute *md_default_attrs[] = { 5622 &md_level.attr, 5623 &md_new_level.attr, 5624 &md_layout.attr, 5625 &md_raid_disks.attr, 5626 &md_uuid.attr, 5627 &md_chunk_size.attr, 5628 &md_size.attr, 5629 &md_resync_start.attr, 5630 &md_metadata.attr, 5631 &md_new_device.attr, 5632 &md_safe_delay.attr, 5633 &md_array_state.attr, 5634 &md_reshape_position.attr, 5635 &md_reshape_direction.attr, 5636 &md_array_size.attr, 5637 &max_corr_read_errors.attr, 5638 &md_consistency_policy.attr, 5639 &md_fail_last_dev.attr, 5640 &md_serialize_policy.attr, 5641 NULL, 5642 }; 5643 5644 static const struct attribute_group md_default_group = { 5645 .attrs = md_default_attrs, 5646 }; 5647 5648 static struct attribute *md_redundancy_attrs[] = { 5649 &md_scan_mode.attr, 5650 &md_last_scan_mode.attr, 5651 &md_mismatches.attr, 5652 &md_sync_min.attr, 5653 &md_sync_max.attr, 5654 &md_sync_speed.attr, 5655 &md_sync_force_parallel.attr, 5656 &md_sync_completed.attr, 5657 &md_min_sync.attr, 5658 &md_max_sync.attr, 5659 &md_suspend_lo.attr, 5660 &md_suspend_hi.attr, 5661 &md_bitmap.attr, 5662 &md_degraded.attr, 5663 NULL, 5664 }; 5665 static const struct attribute_group md_redundancy_group = { 5666 .name = NULL, 5667 .attrs = md_redundancy_attrs, 5668 }; 5669 5670 static const struct attribute_group 
*md_attr_groups[] = { 5671 &md_default_group, 5672 &md_bitmap_group, 5673 NULL, 5674 }; 5675 5676 static ssize_t 5677 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 5678 { 5679 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 5680 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 5681 ssize_t rv; 5682 5683 if (!entry->show) 5684 return -EIO; 5685 spin_lock(&all_mddevs_lock); 5686 if (!mddev_get(mddev)) { 5687 spin_unlock(&all_mddevs_lock); 5688 return -EBUSY; 5689 } 5690 spin_unlock(&all_mddevs_lock); 5691 5692 rv = entry->show(mddev, page); 5693 mddev_put(mddev); 5694 return rv; 5695 } 5696 5697 static ssize_t 5698 md_attr_store(struct kobject *kobj, struct attribute *attr, 5699 const char *page, size_t length) 5700 { 5701 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 5702 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 5703 ssize_t rv; 5704 5705 if (!entry->store) 5706 return -EIO; 5707 if (!capable(CAP_SYS_ADMIN)) 5708 return -EACCES; 5709 spin_lock(&all_mddevs_lock); 5710 if (!mddev_get(mddev)) { 5711 spin_unlock(&all_mddevs_lock); 5712 return -EBUSY; 5713 } 5714 spin_unlock(&all_mddevs_lock); 5715 rv = entry->store(mddev, page, length); 5716 mddev_put(mddev); 5717 return rv; 5718 } 5719 5720 static void md_kobj_release(struct kobject *ko) 5721 { 5722 struct mddev *mddev = container_of(ko, struct mddev, kobj); 5723 5724 if (mddev->sysfs_state) 5725 sysfs_put(mddev->sysfs_state); 5726 if (mddev->sysfs_level) 5727 sysfs_put(mddev->sysfs_level); 5728 5729 del_gendisk(mddev->gendisk); 5730 put_disk(mddev->gendisk); 5731 } 5732 5733 static const struct sysfs_ops md_sysfs_ops = { 5734 .show = md_attr_show, 5735 .store = md_attr_store, 5736 }; 5737 static const struct kobj_type md_ktype = { 5738 .release = md_kobj_release, 5739 .sysfs_ops = &md_sysfs_ops, 5740 .default_groups = md_attr_groups, 5741 }; 5742 5743 int mdp_major = 0; 5744 5745 /* stack the limit for all rdevs into lim */ 5746 int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim, 5747 unsigned int flags) 5748 { 5749 struct md_rdev *rdev; 5750 5751 rdev_for_each(rdev, mddev) { 5752 queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset, 5753 mddev->gendisk->disk_name); 5754 if ((flags & MDDEV_STACK_INTEGRITY) && 5755 !queue_limits_stack_integrity_bdev(lim, rdev->bdev)) 5756 return -EINVAL; 5757 } 5758 5759 return 0; 5760 } 5761 EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits); 5762 5763 /* apply the extra stacking limits from a new rdev into mddev */ 5764 int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev) 5765 { 5766 struct queue_limits lim; 5767 5768 if (mddev_is_dm(mddev)) 5769 return 0; 5770 5771 lim = queue_limits_start_update(mddev->gendisk->queue); 5772 queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset, 5773 mddev->gendisk->disk_name); 5774 5775 if (!queue_limits_stack_integrity_bdev(&lim, rdev->bdev)) { 5776 pr_err("%s: incompatible integrity profile for %pg\n", 5777 mdname(mddev), rdev->bdev); 5778 queue_limits_cancel_update(mddev->gendisk->queue); 5779 return -ENXIO; 5780 } 5781 5782 return queue_limits_commit_update(mddev->gendisk->queue, &lim); 5783 } 5784 EXPORT_SYMBOL_GPL(mddev_stack_new_rdev); 5785 5786 /* update the optimal I/O size after a reshape */ 5787 void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes) 5788 { 5789 struct queue_limits lim; 5790 5791 if (mddev_is_dm(mddev)) 5792 return; 5793 5794 /* don't bother updating io_opt if we 
can't suspend the array */ 5795 if (mddev_suspend(mddev, false) < 0) 5796 return; 5797 lim = queue_limits_start_update(mddev->gendisk->queue); 5798 lim.io_opt = lim.io_min * nr_stripes; 5799 queue_limits_commit_update(mddev->gendisk->queue, &lim); 5800 mddev_resume(mddev); 5801 } 5802 EXPORT_SYMBOL_GPL(mddev_update_io_opt); 5803 5804 static void mddev_delayed_delete(struct work_struct *ws) 5805 { 5806 struct mddev *mddev = container_of(ws, struct mddev, del_work); 5807 5808 kobject_put(&mddev->kobj); 5809 } 5810 5811 void md_init_stacking_limits(struct queue_limits *lim) 5812 { 5813 blk_set_stacking_limits(lim); 5814 lim->features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | 5815 BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; 5816 } 5817 EXPORT_SYMBOL_GPL(md_init_stacking_limits); 5818 5819 struct mddev *md_alloc(dev_t dev, char *name) 5820 { 5821 /* 5822 * If dev is zero, name is the name of a device to allocate with 5823 * an arbitrary minor number. It will be "md_???" 5824 * If dev is non-zero it must be a device number with a MAJOR of 5825 * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then 5826 * the device is being created by opening a node in /dev. 5827 * If "name" is not NULL, the device is being created by 5828 * writing to /sys/module/md_mod/parameters/new_array. 5829 */ 5830 static DEFINE_MUTEX(disks_mutex); 5831 struct mddev *mddev; 5832 struct gendisk *disk; 5833 int partitioned; 5834 int shift; 5835 int unit; 5836 int error; 5837 5838 /* 5839 * Wait for any previous instance of this device to be completely 5840 * removed (mddev_delayed_delete). 5841 */ 5842 flush_workqueue(md_misc_wq); 5843 5844 mutex_lock(&disks_mutex); 5845 mddev = mddev_alloc(dev); 5846 if (IS_ERR(mddev)) { 5847 error = PTR_ERR(mddev); 5848 goto out_unlock; 5849 } 5850 5851 partitioned = (MAJOR(mddev->unit) != MD_MAJOR); 5852 shift = partitioned ? MdpMinorShift : 0; 5853 unit = MINOR(mddev->unit) >> shift; 5854 5855 if (name && !dev) { 5856 /* Need to ensure that 'name' is not a duplicate. 5857 */ 5858 struct mddev *mddev2; 5859 spin_lock(&all_mddevs_lock); 5860 5861 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 5862 if (mddev2->gendisk && 5863 strcmp(mddev2->gendisk->disk_name, name) == 0) { 5864 spin_unlock(&all_mddevs_lock); 5865 error = -EEXIST; 5866 goto out_free_mddev; 5867 } 5868 spin_unlock(&all_mddevs_lock); 5869 } 5870 if (name && dev) 5871 /* 5872 * Creating /dev/mdNNN via "newarray", so adjust hold_active. 5873 */ 5874 mddev->hold_active = UNTIL_STOP; 5875 5876 disk = blk_alloc_disk(NULL, NUMA_NO_NODE); 5877 if (IS_ERR(disk)) { 5878 error = PTR_ERR(disk); 5879 goto out_free_mddev; 5880 } 5881 5882 disk->major = MAJOR(mddev->unit); 5883 disk->first_minor = unit << shift; 5884 disk->minors = 1 << shift; 5885 if (name) 5886 strcpy(disk->disk_name, name); 5887 else if (partitioned) 5888 sprintf(disk->disk_name, "md_d%d", unit); 5889 else 5890 sprintf(disk->disk_name, "md%d", unit); 5891 disk->fops = &md_fops; 5892 disk->private_data = mddev; 5893 5894 disk->events |= DISK_EVENT_MEDIA_CHANGE; 5895 mddev->gendisk = disk; 5896 error = add_disk(disk); 5897 if (error) 5898 goto out_put_disk; 5899 5900 kobject_init(&mddev->kobj, &md_ktype); 5901 error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); 5902 if (error) { 5903 /* 5904 * The disk is already live at this point. Clear the hold flag 5905 * and let mddev_put take care of the deletion, as it isn't any 5906 * different from a normal close on last release now. 
5907 */ 5908 mddev->hold_active = 0; 5909 mutex_unlock(&disks_mutex); 5910 mddev_put(mddev); 5911 return ERR_PTR(error); 5912 } 5913 5914 kobject_uevent(&mddev->kobj, KOBJ_ADD); 5915 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); 5916 mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); 5917 mutex_unlock(&disks_mutex); 5918 return mddev; 5919 5920 out_put_disk: 5921 put_disk(disk); 5922 out_free_mddev: 5923 mddev_free(mddev); 5924 out_unlock: 5925 mutex_unlock(&disks_mutex); 5926 return ERR_PTR(error); 5927 } 5928 5929 static int md_alloc_and_put(dev_t dev, char *name) 5930 { 5931 struct mddev *mddev = md_alloc(dev, name); 5932 5933 if (IS_ERR(mddev)) 5934 return PTR_ERR(mddev); 5935 mddev_put(mddev); 5936 return 0; 5937 } 5938 5939 static void md_probe(dev_t dev) 5940 { 5941 if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512) 5942 return; 5943 if (create_on_open) 5944 md_alloc_and_put(dev, NULL); 5945 } 5946 5947 static int add_named_array(const char *val, const struct kernel_param *kp) 5948 { 5949 /* 5950 * val must be "md_*" or "mdNNN". 5951 * For "md_*" we allocate an array with a large free minor number, and 5952 * set the name to val. val must not already be an active name. 5953 * For "mdNNN" we allocate an array with the minor number NNN 5954 * which must not already be in use. 5955 */ 5956 int len = strlen(val); 5957 char buf[DISK_NAME_LEN]; 5958 unsigned long devnum; 5959 5960 while (len && val[len-1] == '\n') 5961 len--; 5962 if (len >= DISK_NAME_LEN) 5963 return -E2BIG; 5964 strscpy(buf, val, len+1); 5965 if (strncmp(buf, "md_", 3) == 0) 5966 return md_alloc_and_put(0, buf); 5967 if (strncmp(buf, "md", 2) == 0 && 5968 isdigit(buf[2]) && 5969 kstrtoul(buf+2, 10, &devnum) == 0 && 5970 devnum <= MINORMASK) 5971 return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL); 5972 5973 return -EINVAL; 5974 } 5975 5976 static void md_safemode_timeout(struct timer_list *t) 5977 { 5978 struct mddev *mddev = from_timer(mddev, t, safemode_timer); 5979 5980 mddev->safemode = 1; 5981 if (mddev->external) 5982 sysfs_notify_dirent_safe(mddev->sysfs_state); 5983 5984 md_wakeup_thread(mddev->thread); 5985 } 5986 5987 static int start_dirty_degraded; 5988 5989 int md_run(struct mddev *mddev) 5990 { 5991 int err; 5992 struct md_rdev *rdev; 5993 struct md_personality *pers; 5994 bool nowait = true; 5995 5996 if (list_empty(&mddev->disks)) 5997 /* cannot run an array with no devices.. */ 5998 return -EINVAL; 5999 6000 if (mddev->pers) 6001 return -EBUSY; 6002 /* Cannot run until previous stop completes properly */ 6003 if (mddev->sysfs_active) 6004 return -EBUSY; 6005 6006 /* 6007 * Analyze all RAID superblock(s) 6008 */ 6009 if (!mddev->raid_disks) { 6010 if (!mddev->persistent) 6011 return -EINVAL; 6012 err = analyze_sbs(mddev); 6013 if (err) 6014 return -EINVAL; 6015 } 6016 6017 if (mddev->level != LEVEL_NONE) 6018 request_module("md-level-%d", mddev->level); 6019 else if (mddev->clevel[0]) 6020 request_module("md-%s", mddev->clevel); 6021 6022 /* 6023 * Drop all container device buffers, from now on 6024 * the only valid external interface is through the md 6025 * device. 
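	 * (That is what the sync_blockdev()/invalidate_bdev() calls in the
	 * loop below are for: they flush and drop any page cache built up
	 * while the members were still opened directly.)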
6026 */ 6027 mddev->has_superblocks = false; 6028 rdev_for_each(rdev, mddev) { 6029 if (test_bit(Faulty, &rdev->flags)) 6030 continue; 6031 sync_blockdev(rdev->bdev); 6032 invalidate_bdev(rdev->bdev); 6033 if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) { 6034 mddev->ro = MD_RDONLY; 6035 if (!mddev_is_dm(mddev)) 6036 set_disk_ro(mddev->gendisk, 1); 6037 } 6038 6039 if (rdev->sb_page) 6040 mddev->has_superblocks = true; 6041 6042 /* perform some consistency tests on the device. 6043 * We don't want the data to overlap the metadata, 6044 * Internal Bitmap issues have been handled elsewhere. 6045 */ 6046 if (rdev->meta_bdev) { 6047 /* Nothing to check */; 6048 } else if (rdev->data_offset < rdev->sb_start) { 6049 if (mddev->dev_sectors && 6050 rdev->data_offset + mddev->dev_sectors 6051 > rdev->sb_start) { 6052 pr_warn("md: %s: data overlaps metadata\n", 6053 mdname(mddev)); 6054 return -EINVAL; 6055 } 6056 } else { 6057 if (rdev->sb_start + rdev->sb_size/512 6058 > rdev->data_offset) { 6059 pr_warn("md: %s: metadata overlaps data\n", 6060 mdname(mddev)); 6061 return -EINVAL; 6062 } 6063 } 6064 sysfs_notify_dirent_safe(rdev->sysfs_state); 6065 nowait = nowait && bdev_nowait(rdev->bdev); 6066 } 6067 6068 if (!bioset_initialized(&mddev->bio_set)) { 6069 err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 6070 if (err) 6071 return err; 6072 } 6073 if (!bioset_initialized(&mddev->sync_set)) { 6074 err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 6075 if (err) 6076 goto exit_bio_set; 6077 } 6078 6079 if (!bioset_initialized(&mddev->io_clone_set)) { 6080 err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE, 6081 offsetof(struct md_io_clone, bio_clone), 0); 6082 if (err) 6083 goto exit_sync_set; 6084 } 6085 6086 spin_lock(&pers_lock); 6087 pers = find_pers(mddev->level, mddev->clevel); 6088 if (!pers || !try_module_get(pers->owner)) { 6089 spin_unlock(&pers_lock); 6090 if (mddev->level != LEVEL_NONE) 6091 pr_warn("md: personality for level %d is not loaded!\n", 6092 mddev->level); 6093 else 6094 pr_warn("md: personality for level %s is not loaded!\n", 6095 mddev->clevel); 6096 err = -EINVAL; 6097 goto abort; 6098 } 6099 spin_unlock(&pers_lock); 6100 if (mddev->level != pers->level) { 6101 mddev->level = pers->level; 6102 mddev->new_level = pers->level; 6103 } 6104 strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 6105 6106 if (mddev->reshape_position != MaxSector && 6107 pers->start_reshape == NULL) { 6108 /* This personality cannot handle reshaping... */ 6109 module_put(pers->owner); 6110 err = -EINVAL; 6111 goto abort; 6112 } 6113 6114 if (pers->sync_request) { 6115 /* Warn if this is a potentially silly 6116 * configuration. 
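		 * For example, two members that are partitions of the same
		 * physical disk pass every other check but give no real
		 * protection against that disk failing; the bd_disk
		 * comparison below catches exactly that case.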
6117 */ 6118 struct md_rdev *rdev2; 6119 int warned = 0; 6120 6121 rdev_for_each(rdev, mddev) 6122 rdev_for_each(rdev2, mddev) { 6123 if (rdev < rdev2 && 6124 rdev->bdev->bd_disk == 6125 rdev2->bdev->bd_disk) { 6126 pr_warn("%s: WARNING: %pg appears to be on the same physical disk as %pg.\n", 6127 mdname(mddev), 6128 rdev->bdev, 6129 rdev2->bdev); 6130 warned = 1; 6131 } 6132 } 6133 6134 if (warned) 6135 pr_warn("True protection against single-disk failure might be compromised.\n"); 6136 } 6137 6138 /* dm-raid expect sync_thread to be frozen until resume */ 6139 if (mddev->gendisk) 6140 mddev->recovery = 0; 6141 6142 /* may be over-ridden by personality */ 6143 mddev->resync_max_sectors = mddev->dev_sectors; 6144 6145 mddev->ok_start_degraded = start_dirty_degraded; 6146 6147 if (start_readonly && md_is_rdwr(mddev)) 6148 mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */ 6149 6150 err = pers->run(mddev); 6151 if (err) 6152 pr_warn("md: pers->run() failed ...\n"); 6153 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { 6154 WARN_ONCE(!mddev->external_size, 6155 "%s: default size too small, but 'external_size' not in effect?\n", 6156 __func__); 6157 pr_warn("md: invalid array_size %llu > default size %llu\n", 6158 (unsigned long long)mddev->array_sectors / 2, 6159 (unsigned long long)pers->size(mddev, 0, 0) / 2); 6160 err = -EINVAL; 6161 } 6162 if (err == 0 && pers->sync_request && 6163 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { 6164 err = mddev->bitmap_ops->create(mddev, -1); 6165 if (err) 6166 pr_warn("%s: failed to create bitmap (%d)\n", 6167 mdname(mddev), err); 6168 } 6169 if (err) 6170 goto bitmap_abort; 6171 6172 if (mddev->bitmap_info.max_write_behind > 0) { 6173 bool create_pool = false; 6174 6175 rdev_for_each(rdev, mddev) { 6176 if (test_bit(WriteMostly, &rdev->flags) && 6177 rdev_init_serial(rdev)) 6178 create_pool = true; 6179 } 6180 if (create_pool && mddev->serial_info_pool == NULL) { 6181 mddev->serial_info_pool = 6182 mempool_create_kmalloc_pool(NR_SERIAL_INFOS, 6183 sizeof(struct serial_info)); 6184 if (!mddev->serial_info_pool) { 6185 err = -ENOMEM; 6186 goto bitmap_abort; 6187 } 6188 } 6189 } 6190 6191 if (pers->sync_request) { 6192 if (mddev->kobj.sd && 6193 sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 6194 pr_warn("md: cannot register extra attributes for %s\n", 6195 mdname(mddev)); 6196 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); 6197 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); 6198 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); 6199 } else if (mddev->ro == MD_AUTO_READ) 6200 mddev->ro = MD_RDWR; 6201 6202 atomic_set(&mddev->max_corr_read_errors, 6203 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); 6204 mddev->safemode = 0; 6205 if (mddev_is_clustered(mddev)) 6206 mddev->safemode_delay = 0; 6207 else 6208 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; 6209 mddev->in_sync = 1; 6210 smp_wmb(); 6211 spin_lock(&mddev->lock); 6212 mddev->pers = pers; 6213 spin_unlock(&mddev->lock); 6214 rdev_for_each(rdev, mddev) 6215 if (rdev->raid_disk >= 0) 6216 sysfs_link_rdev(mddev, rdev); /* failure here is OK */ 6217 6218 if (mddev->degraded && md_is_rdwr(mddev)) 6219 /* This ensures that recovering status is reported immediately 6220 * via sysfs - until a lack of spares is confirmed. 
6221 */ 6222 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6223 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6224 6225 if (mddev->sb_flags) 6226 md_update_sb(mddev, 0); 6227 6228 md_new_event(); 6229 return 0; 6230 6231 bitmap_abort: 6232 mddev_detach(mddev); 6233 if (mddev->private) 6234 pers->free(mddev, mddev->private); 6235 mddev->private = NULL; 6236 module_put(pers->owner); 6237 mddev->bitmap_ops->destroy(mddev); 6238 abort: 6239 bioset_exit(&mddev->io_clone_set); 6240 exit_sync_set: 6241 bioset_exit(&mddev->sync_set); 6242 exit_bio_set: 6243 bioset_exit(&mddev->bio_set); 6244 return err; 6245 } 6246 EXPORT_SYMBOL_GPL(md_run); 6247 6248 int do_md_run(struct mddev *mddev) 6249 { 6250 int err; 6251 6252 set_bit(MD_NOT_READY, &mddev->flags); 6253 err = md_run(mddev); 6254 if (err) 6255 goto out; 6256 6257 err = mddev->bitmap_ops->load(mddev); 6258 if (err) { 6259 mddev->bitmap_ops->destroy(mddev); 6260 goto out; 6261 } 6262 6263 if (mddev_is_clustered(mddev)) 6264 md_allow_write(mddev); 6265 6266 /* run start up tasks that require md_thread */ 6267 md_start(mddev); 6268 6269 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 6270 6271 set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); 6272 clear_bit(MD_NOT_READY, &mddev->flags); 6273 mddev->changed = 1; 6274 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 6275 sysfs_notify_dirent_safe(mddev->sysfs_state); 6276 sysfs_notify_dirent_safe(mddev->sysfs_action); 6277 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 6278 out: 6279 clear_bit(MD_NOT_READY, &mddev->flags); 6280 return err; 6281 } 6282 6283 int md_start(struct mddev *mddev) 6284 { 6285 int ret = 0; 6286 6287 if (mddev->pers->start) { 6288 set_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6289 ret = mddev->pers->start(mddev); 6290 clear_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6291 md_wakeup_thread(mddev->sync_thread); 6292 } 6293 return ret; 6294 } 6295 EXPORT_SYMBOL_GPL(md_start); 6296 6297 static int restart_array(struct mddev *mddev) 6298 { 6299 struct gendisk *disk = mddev->gendisk; 6300 struct md_rdev *rdev; 6301 bool has_journal = false; 6302 bool has_readonly = false; 6303 6304 /* Complain if it has no devices */ 6305 if (list_empty(&mddev->disks)) 6306 return -ENXIO; 6307 if (!mddev->pers) 6308 return -EINVAL; 6309 if (md_is_rdwr(mddev)) 6310 return -EBUSY; 6311 6312 rcu_read_lock(); 6313 rdev_for_each_rcu(rdev, mddev) { 6314 if (test_bit(Journal, &rdev->flags) && 6315 !test_bit(Faulty, &rdev->flags)) 6316 has_journal = true; 6317 if (rdev_read_only(rdev)) 6318 has_readonly = true; 6319 } 6320 rcu_read_unlock(); 6321 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal) 6322 /* Don't restart rw with journal missing/faulty */ 6323 return -EINVAL; 6324 if (has_readonly) 6325 return -EROFS; 6326 6327 mddev->safemode = 0; 6328 mddev->ro = MD_RDWR; 6329 set_disk_ro(disk, 0); 6330 pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); 6331 /* Kick recovery or resync if necessary */ 6332 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6333 md_wakeup_thread(mddev->sync_thread); 6334 sysfs_notify_dirent_safe(mddev->sysfs_state); 6335 return 0; 6336 } 6337 6338 static void md_clean(struct mddev *mddev) 6339 { 6340 mddev->array_sectors = 0; 6341 mddev->external_size = 0; 6342 mddev->dev_sectors = 0; 6343 mddev->raid_disks = 0; 6344 mddev->recovery_cp = 0; 6345 mddev->resync_min = 0; 6346 mddev->resync_max = MaxSector; 6347 mddev->reshape_position = MaxSector; 6348 /* we still need mddev->external in export_rdev, do not 
clear it yet */ 6349 mddev->persistent = 0; 6350 mddev->level = LEVEL_NONE; 6351 mddev->clevel[0] = 0; 6352 /* 6353 * Don't clear MD_CLOSING, or mddev can be opened again. 6354 * 'hold_active != 0' means mddev is still in the creation 6355 * process and will be used later. 6356 */ 6357 if (mddev->hold_active) 6358 mddev->flags = 0; 6359 else 6360 mddev->flags &= BIT_ULL_MASK(MD_CLOSING); 6361 mddev->sb_flags = 0; 6362 mddev->ro = MD_RDWR; 6363 mddev->metadata_type[0] = 0; 6364 mddev->chunk_sectors = 0; 6365 mddev->ctime = mddev->utime = 0; 6366 mddev->layout = 0; 6367 mddev->max_disks = 0; 6368 mddev->events = 0; 6369 mddev->can_decrease_events = 0; 6370 mddev->delta_disks = 0; 6371 mddev->reshape_backwards = 0; 6372 mddev->new_level = LEVEL_NONE; 6373 mddev->new_layout = 0; 6374 mddev->new_chunk_sectors = 0; 6375 mddev->curr_resync = MD_RESYNC_NONE; 6376 atomic64_set(&mddev->resync_mismatches, 0); 6377 mddev->suspend_lo = mddev->suspend_hi = 0; 6378 mddev->sync_speed_min = mddev->sync_speed_max = 0; 6379 mddev->recovery = 0; 6380 mddev->in_sync = 0; 6381 mddev->changed = 0; 6382 mddev->degraded = 0; 6383 mddev->safemode = 0; 6384 mddev->private = NULL; 6385 mddev->cluster_info = NULL; 6386 mddev->bitmap_info.offset = 0; 6387 mddev->bitmap_info.default_offset = 0; 6388 mddev->bitmap_info.default_space = 0; 6389 mddev->bitmap_info.chunksize = 0; 6390 mddev->bitmap_info.daemon_sleep = 0; 6391 mddev->bitmap_info.max_write_behind = 0; 6392 mddev->bitmap_info.nodes = 0; 6393 } 6394 6395 static void __md_stop_writes(struct mddev *mddev) 6396 { 6397 del_timer_sync(&mddev->safemode_timer); 6398 6399 if (mddev->pers && mddev->pers->quiesce) { 6400 mddev->pers->quiesce(mddev, 1); 6401 mddev->pers->quiesce(mddev, 0); 6402 } 6403 6404 mddev->bitmap_ops->flush(mddev); 6405 6406 if (md_is_rdwr(mddev) && 6407 ((!mddev->in_sync && !mddev_is_clustered(mddev)) || 6408 mddev->sb_flags)) { 6409 /* mark array as shutdown cleanly */ 6410 if (!mddev_is_clustered(mddev)) 6411 mddev->in_sync = 1; 6412 md_update_sb(mddev, 1); 6413 } 6414 /* disable policy to guarantee rdevs free resources for serialization */ 6415 mddev->serialize_policy = 0; 6416 mddev_destroy_serial_pool(mddev, NULL); 6417 } 6418 6419 void md_stop_writes(struct mddev *mddev) 6420 { 6421 mddev_lock_nointr(mddev); 6422 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6423 stop_sync_thread(mddev, true); 6424 __md_stop_writes(mddev); 6425 mddev_unlock(mddev); 6426 } 6427 EXPORT_SYMBOL_GPL(md_stop_writes); 6428 6429 static void mddev_detach(struct mddev *mddev) 6430 { 6431 mddev->bitmap_ops->wait_behind_writes(mddev); 6432 if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { 6433 mddev->pers->quiesce(mddev, 1); 6434 mddev->pers->quiesce(mddev, 0); 6435 } 6436 md_unregister_thread(mddev, &mddev->thread); 6437 6438 /* the unplug fn references 'conf' */ 6439 if (!mddev_is_dm(mddev)) 6440 blk_sync_queue(mddev->gendisk->queue); 6441 } 6442 6443 static void __md_stop(struct mddev *mddev) 6444 { 6445 struct md_personality *pers = mddev->pers; 6446 6447 mddev->bitmap_ops->destroy(mddev); 6448 mddev_detach(mddev); 6449 spin_lock(&mddev->lock); 6450 mddev->pers = NULL; 6451 spin_unlock(&mddev->lock); 6452 if (mddev->private) 6453 pers->free(mddev, mddev->private); 6454 mddev->private = NULL; 6455 if (pers->sync_request && mddev->to_remove == NULL) 6456 mddev->to_remove = &md_redundancy_group; 6457 module_put(pers->owner); 6458 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6459 6460 bioset_exit(&mddev->bio_set); 6461 
bioset_exit(&mddev->sync_set); 6462 bioset_exit(&mddev->io_clone_set); 6463 } 6464 6465 void md_stop(struct mddev *mddev) 6466 { 6467 lockdep_assert_held(&mddev->reconfig_mutex); 6468 6469 /* stop the array and free an attached data structures. 6470 * This is called from dm-raid 6471 */ 6472 __md_stop_writes(mddev); 6473 __md_stop(mddev); 6474 } 6475 6476 EXPORT_SYMBOL_GPL(md_stop); 6477 6478 /* ensure 'mddev->pers' exist before calling md_set_readonly() */ 6479 static int md_set_readonly(struct mddev *mddev) 6480 { 6481 int err = 0; 6482 int did_freeze = 0; 6483 6484 if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 6485 return -EBUSY; 6486 6487 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 6488 did_freeze = 1; 6489 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6490 } 6491 6492 stop_sync_thread(mddev, false); 6493 wait_event(mddev->sb_wait, 6494 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 6495 mddev_lock_nointr(mddev); 6496 6497 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 6498 pr_warn("md: %s still in use.\n",mdname(mddev)); 6499 err = -EBUSY; 6500 goto out; 6501 } 6502 6503 __md_stop_writes(mddev); 6504 6505 if (mddev->ro == MD_RDONLY) { 6506 err = -ENXIO; 6507 goto out; 6508 } 6509 6510 mddev->ro = MD_RDONLY; 6511 set_disk_ro(mddev->gendisk, 1); 6512 6513 out: 6514 if (!err || did_freeze) { 6515 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6516 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6517 sysfs_notify_dirent_safe(mddev->sysfs_state); 6518 } 6519 6520 return err; 6521 } 6522 6523 /* mode: 6524 * 0 - completely stop and dis-assemble array 6525 * 2 - stop but do not disassemble array 6526 */ 6527 static int do_md_stop(struct mddev *mddev, int mode) 6528 { 6529 struct gendisk *disk = mddev->gendisk; 6530 struct md_rdev *rdev; 6531 int did_freeze = 0; 6532 6533 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 6534 did_freeze = 1; 6535 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6536 } 6537 6538 stop_sync_thread(mddev, true); 6539 6540 if (mddev->sysfs_active || 6541 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 6542 pr_warn("md: %s still in use.\n",mdname(mddev)); 6543 if (did_freeze) { 6544 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6545 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6546 } 6547 return -EBUSY; 6548 } 6549 if (mddev->pers) { 6550 if (!md_is_rdwr(mddev)) 6551 set_disk_ro(disk, 0); 6552 6553 __md_stop_writes(mddev); 6554 __md_stop(mddev); 6555 6556 /* tell userspace to handle 'inactive' */ 6557 sysfs_notify_dirent_safe(mddev->sysfs_state); 6558 6559 rdev_for_each(rdev, mddev) 6560 if (rdev->raid_disk >= 0) 6561 sysfs_unlink_rdev(mddev, rdev); 6562 6563 set_capacity_and_notify(disk, 0); 6564 mddev->changed = 1; 6565 6566 if (!md_is_rdwr(mddev)) 6567 mddev->ro = MD_RDWR; 6568 } 6569 /* 6570 * Free resources if final stop 6571 */ 6572 if (mode == 0) { 6573 pr_info("md: %s stopped.\n", mdname(mddev)); 6574 6575 if (mddev->bitmap_info.file) { 6576 struct file *f = mddev->bitmap_info.file; 6577 spin_lock(&mddev->lock); 6578 mddev->bitmap_info.file = NULL; 6579 spin_unlock(&mddev->lock); 6580 fput(f); 6581 } 6582 mddev->bitmap_info.offset = 0; 6583 6584 export_array(mddev); 6585 6586 md_clean(mddev); 6587 if (mddev->hold_active == UNTIL_STOP) 6588 mddev->hold_active = 0; 6589 } 6590 md_new_event(); 6591 sysfs_notify_dirent_safe(mddev->sysfs_state); 6592 return 0; 6593 } 6594 6595 #ifndef MODULE 6596 static void autorun_array(struct mddev *mddev) 6597 { 6598 struct md_rdev *rdev; 6599 int err; 6600 6601 
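	/*
	 * Try to start the array that was just assembled from the candidate
	 * rdevs; if do_md_run() fails, tear it back down with do_md_stop().
	 */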
if (list_empty(&mddev->disks)) 6602 return; 6603 6604 pr_info("md: running: "); 6605 6606 rdev_for_each(rdev, mddev) { 6607 pr_cont("<%pg>", rdev->bdev); 6608 } 6609 pr_cont("\n"); 6610 6611 err = do_md_run(mddev); 6612 if (err) { 6613 pr_warn("md: do_md_run() returned %d\n", err); 6614 do_md_stop(mddev, 0); 6615 } 6616 } 6617 6618 /* 6619 * lets try to run arrays based on all disks that have arrived 6620 * until now. (those are in pending_raid_disks) 6621 * 6622 * the method: pick the first pending disk, collect all disks with 6623 * the same UUID, remove all from the pending list and put them into 6624 * the 'same_array' list. Then order this list based on superblock 6625 * update time (freshest comes first), kick out 'old' disks and 6626 * compare superblocks. If everything's fine then run it. 6627 * 6628 * If "unit" is allocated, then bump its reference count 6629 */ 6630 static void autorun_devices(int part) 6631 { 6632 struct md_rdev *rdev0, *rdev, *tmp; 6633 struct mddev *mddev; 6634 6635 pr_info("md: autorun ...\n"); 6636 while (!list_empty(&pending_raid_disks)) { 6637 int unit; 6638 dev_t dev; 6639 LIST_HEAD(candidates); 6640 rdev0 = list_entry(pending_raid_disks.next, 6641 struct md_rdev, same_set); 6642 6643 pr_debug("md: considering %pg ...\n", rdev0->bdev); 6644 INIT_LIST_HEAD(&candidates); 6645 rdev_for_each_list(rdev, tmp, &pending_raid_disks) 6646 if (super_90_load(rdev, rdev0, 0) >= 0) { 6647 pr_debug("md: adding %pg ...\n", 6648 rdev->bdev); 6649 list_move(&rdev->same_set, &candidates); 6650 } 6651 /* 6652 * now we have a set of devices, with all of them having 6653 * mostly sane superblocks. It's time to allocate the 6654 * mddev. 6655 */ 6656 if (part) { 6657 dev = MKDEV(mdp_major, 6658 rdev0->preferred_minor << MdpMinorShift); 6659 unit = MINOR(dev) >> MdpMinorShift; 6660 } else { 6661 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 6662 unit = MINOR(dev); 6663 } 6664 if (rdev0->preferred_minor != unit) { 6665 pr_warn("md: unit number in %pg is bad: %d\n", 6666 rdev0->bdev, rdev0->preferred_minor); 6667 break; 6668 } 6669 6670 mddev = md_alloc(dev, NULL); 6671 if (IS_ERR(mddev)) 6672 break; 6673 6674 if (mddev_suspend_and_lock(mddev)) 6675 pr_warn("md: %s locked, cannot run\n", mdname(mddev)); 6676 else if (mddev->raid_disks || mddev->major_version 6677 || !list_empty(&mddev->disks)) { 6678 pr_warn("md: %s already running, cannot run %pg\n", 6679 mdname(mddev), rdev0->bdev); 6680 mddev_unlock_and_resume(mddev); 6681 } else { 6682 pr_debug("md: created %s\n", mdname(mddev)); 6683 mddev->persistent = 1; 6684 rdev_for_each_list(rdev, tmp, &candidates) { 6685 list_del_init(&rdev->same_set); 6686 if (bind_rdev_to_array(rdev, mddev)) 6687 export_rdev(rdev, mddev); 6688 } 6689 autorun_array(mddev); 6690 mddev_unlock_and_resume(mddev); 6691 } 6692 /* on success, candidates will be empty, on error 6693 * it won't... 6694 */ 6695 rdev_for_each_list(rdev, tmp, &candidates) { 6696 list_del_init(&rdev->same_set); 6697 export_rdev(rdev, mddev); 6698 } 6699 mddev_put(mddev); 6700 } 6701 pr_info("md: ... 
autorun DONE.\n"); 6702 } 6703 #endif /* !MODULE */ 6704 6705 static int get_version(void __user *arg) 6706 { 6707 mdu_version_t ver; 6708 6709 ver.major = MD_MAJOR_VERSION; 6710 ver.minor = MD_MINOR_VERSION; 6711 ver.patchlevel = MD_PATCHLEVEL_VERSION; 6712 6713 if (copy_to_user(arg, &ver, sizeof(ver))) 6714 return -EFAULT; 6715 6716 return 0; 6717 } 6718 6719 static int get_array_info(struct mddev *mddev, void __user *arg) 6720 { 6721 mdu_array_info_t info; 6722 int nr,working,insync,failed,spare; 6723 struct md_rdev *rdev; 6724 6725 nr = working = insync = failed = spare = 0; 6726 rcu_read_lock(); 6727 rdev_for_each_rcu(rdev, mddev) { 6728 nr++; 6729 if (test_bit(Faulty, &rdev->flags)) 6730 failed++; 6731 else { 6732 working++; 6733 if (test_bit(In_sync, &rdev->flags)) 6734 insync++; 6735 else if (test_bit(Journal, &rdev->flags)) 6736 /* TODO: add journal count to md_u.h */ 6737 ; 6738 else 6739 spare++; 6740 } 6741 } 6742 rcu_read_unlock(); 6743 6744 info.major_version = mddev->major_version; 6745 info.minor_version = mddev->minor_version; 6746 info.patch_version = MD_PATCHLEVEL_VERSION; 6747 info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); 6748 info.level = mddev->level; 6749 info.size = mddev->dev_sectors / 2; 6750 if (info.size != mddev->dev_sectors / 2) /* overflow */ 6751 info.size = -1; 6752 info.nr_disks = nr; 6753 info.raid_disks = mddev->raid_disks; 6754 info.md_minor = mddev->md_minor; 6755 info.not_persistent= !mddev->persistent; 6756 6757 info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 6758 info.state = 0; 6759 if (mddev->in_sync) 6760 info.state = (1<<MD_SB_CLEAN); 6761 if (mddev->bitmap && mddev->bitmap_info.offset) 6762 info.state |= (1<<MD_SB_BITMAP_PRESENT); 6763 if (mddev_is_clustered(mddev)) 6764 info.state |= (1<<MD_SB_CLUSTERED); 6765 info.active_disks = insync; 6766 info.working_disks = working; 6767 info.failed_disks = failed; 6768 info.spare_disks = spare; 6769 6770 info.layout = mddev->layout; 6771 info.chunk_size = mddev->chunk_sectors << 9; 6772 6773 if (copy_to_user(arg, &info, sizeof(info))) 6774 return -EFAULT; 6775 6776 return 0; 6777 } 6778 6779 static int get_bitmap_file(struct mddev *mddev, void __user * arg) 6780 { 6781 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 6782 char *ptr; 6783 int err; 6784 6785 file = kzalloc(sizeof(*file), GFP_NOIO); 6786 if (!file) 6787 return -ENOMEM; 6788 6789 err = 0; 6790 spin_lock(&mddev->lock); 6791 /* bitmap enabled */ 6792 if (mddev->bitmap_info.file) { 6793 ptr = file_path(mddev->bitmap_info.file, file->pathname, 6794 sizeof(file->pathname)); 6795 if (IS_ERR(ptr)) 6796 err = PTR_ERR(ptr); 6797 else 6798 memmove(file->pathname, ptr, 6799 sizeof(file->pathname)-(ptr-file->pathname)); 6800 } 6801 spin_unlock(&mddev->lock); 6802 6803 if (err == 0 && 6804 copy_to_user(arg, file, sizeof(*file))) 6805 err = -EFAULT; 6806 6807 kfree(file); 6808 return err; 6809 } 6810 6811 static int get_disk_info(struct mddev *mddev, void __user * arg) 6812 { 6813 mdu_disk_info_t info; 6814 struct md_rdev *rdev; 6815 6816 if (copy_from_user(&info, arg, sizeof(info))) 6817 return -EFAULT; 6818 6819 rcu_read_lock(); 6820 rdev = md_find_rdev_nr_rcu(mddev, info.number); 6821 if (rdev) { 6822 info.major = MAJOR(rdev->bdev->bd_dev); 6823 info.minor = MINOR(rdev->bdev->bd_dev); 6824 info.raid_disk = rdev->raid_disk; 6825 info.state = 0; 6826 if (test_bit(Faulty, &rdev->flags)) 6827 info.state |= (1<<MD_DISK_FAULTY); 6828 else if (test_bit(In_sync, &rdev->flags)) { 6829 info.state |= (1<<MD_DISK_ACTIVE); 
6830 info.state |= (1<<MD_DISK_SYNC); 6831 } 6832 if (test_bit(Journal, &rdev->flags)) 6833 info.state |= (1<<MD_DISK_JOURNAL); 6834 if (test_bit(WriteMostly, &rdev->flags)) 6835 info.state |= (1<<MD_DISK_WRITEMOSTLY); 6836 if (test_bit(FailFast, &rdev->flags)) 6837 info.state |= (1<<MD_DISK_FAILFAST); 6838 } else { 6839 info.major = info.minor = 0; 6840 info.raid_disk = -1; 6841 info.state = (1<<MD_DISK_REMOVED); 6842 } 6843 rcu_read_unlock(); 6844 6845 if (copy_to_user(arg, &info, sizeof(info))) 6846 return -EFAULT; 6847 6848 return 0; 6849 } 6850 6851 int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) 6852 { 6853 struct md_rdev *rdev; 6854 dev_t dev = MKDEV(info->major,info->minor); 6855 6856 if (mddev_is_clustered(mddev) && 6857 !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { 6858 pr_warn("%s: Cannot add to clustered mddev.\n", 6859 mdname(mddev)); 6860 return -EINVAL; 6861 } 6862 6863 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 6864 return -EOVERFLOW; 6865 6866 if (!mddev->raid_disks) { 6867 int err; 6868 /* expecting a device which has a superblock */ 6869 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 6870 if (IS_ERR(rdev)) { 6871 pr_warn("md: md_import_device returned %ld\n", 6872 PTR_ERR(rdev)); 6873 return PTR_ERR(rdev); 6874 } 6875 if (!list_empty(&mddev->disks)) { 6876 struct md_rdev *rdev0 6877 = list_entry(mddev->disks.next, 6878 struct md_rdev, same_set); 6879 err = super_types[mddev->major_version] 6880 .load_super(rdev, rdev0, mddev->minor_version); 6881 if (err < 0) { 6882 pr_warn("md: %pg has different UUID to %pg\n", 6883 rdev->bdev, 6884 rdev0->bdev); 6885 export_rdev(rdev, mddev); 6886 return -EINVAL; 6887 } 6888 } 6889 err = bind_rdev_to_array(rdev, mddev); 6890 if (err) 6891 export_rdev(rdev, mddev); 6892 return err; 6893 } 6894 6895 /* 6896 * md_add_new_disk can be used once the array is assembled 6897 * to add "hot spares". They must already have a superblock 6898 * written 6899 */ 6900 if (mddev->pers) { 6901 int err; 6902 if (!mddev->pers->hot_add_disk) { 6903 pr_warn("%s: personality does not support diskops!\n", 6904 mdname(mddev)); 6905 return -EINVAL; 6906 } 6907 if (mddev->persistent) 6908 rdev = md_import_device(dev, mddev->major_version, 6909 mddev->minor_version); 6910 else 6911 rdev = md_import_device(dev, -1, -1); 6912 if (IS_ERR(rdev)) { 6913 pr_warn("md: md_import_device returned %ld\n", 6914 PTR_ERR(rdev)); 6915 return PTR_ERR(rdev); 6916 } 6917 /* set saved_raid_disk if appropriate */ 6918 if (!mddev->persistent) { 6919 if (info->state & (1<<MD_DISK_SYNC) && 6920 info->raid_disk < mddev->raid_disks) { 6921 rdev->raid_disk = info->raid_disk; 6922 clear_bit(Bitmap_sync, &rdev->flags); 6923 } else 6924 rdev->raid_disk = -1; 6925 rdev->saved_raid_disk = rdev->raid_disk; 6926 } else 6927 super_types[mddev->major_version]. 6928 validate_super(mddev, NULL/*freshest*/, rdev); 6929 if ((info->state & (1<<MD_DISK_SYNC)) && 6930 rdev->raid_disk != info->raid_disk) { 6931 /* This was a hot-add request, but events doesn't 6932 * match, so reject it. 
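			 * (Either validate_super() mapped the device to a
			 * different slot than the one requested, or the
			 * requested slot was out of range.)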
6933 */ 6934 export_rdev(rdev, mddev); 6935 return -EINVAL; 6936 } 6937 6938 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 6939 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 6940 set_bit(WriteMostly, &rdev->flags); 6941 else 6942 clear_bit(WriteMostly, &rdev->flags); 6943 if (info->state & (1<<MD_DISK_FAILFAST)) 6944 set_bit(FailFast, &rdev->flags); 6945 else 6946 clear_bit(FailFast, &rdev->flags); 6947 6948 if (info->state & (1<<MD_DISK_JOURNAL)) { 6949 struct md_rdev *rdev2; 6950 bool has_journal = false; 6951 6952 /* make sure no existing journal disk */ 6953 rdev_for_each(rdev2, mddev) { 6954 if (test_bit(Journal, &rdev2->flags)) { 6955 has_journal = true; 6956 break; 6957 } 6958 } 6959 if (has_journal || mddev->bitmap) { 6960 export_rdev(rdev, mddev); 6961 return -EBUSY; 6962 } 6963 set_bit(Journal, &rdev->flags); 6964 } 6965 /* 6966 * check whether the device shows up in other nodes 6967 */ 6968 if (mddev_is_clustered(mddev)) { 6969 if (info->state & (1 << MD_DISK_CANDIDATE)) 6970 set_bit(Candidate, &rdev->flags); 6971 else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { 6972 /* --add initiated by this node */ 6973 err = md_cluster_ops->add_new_disk(mddev, rdev); 6974 if (err) { 6975 export_rdev(rdev, mddev); 6976 return err; 6977 } 6978 } 6979 } 6980 6981 rdev->raid_disk = -1; 6982 err = bind_rdev_to_array(rdev, mddev); 6983 6984 if (err) 6985 export_rdev(rdev, mddev); 6986 6987 if (mddev_is_clustered(mddev)) { 6988 if (info->state & (1 << MD_DISK_CANDIDATE)) { 6989 if (!err) { 6990 err = md_cluster_ops->new_disk_ack(mddev, 6991 err == 0); 6992 if (err) 6993 md_kick_rdev_from_array(rdev); 6994 } 6995 } else { 6996 if (err) 6997 md_cluster_ops->add_new_disk_cancel(mddev); 6998 else 6999 err = add_bound_rdev(rdev); 7000 } 7001 7002 } else if (!err) 7003 err = add_bound_rdev(rdev); 7004 7005 return err; 7006 } 7007 7008 /* otherwise, md_add_new_disk is only allowed 7009 * for major_version==0 superblocks 7010 */ 7011 if (mddev->major_version != 0) { 7012 pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev)); 7013 return -EINVAL; 7014 } 7015 7016 if (!(info->state & (1<<MD_DISK_FAULTY))) { 7017 int err; 7018 rdev = md_import_device(dev, -1, 0); 7019 if (IS_ERR(rdev)) { 7020 pr_warn("md: error, md_import_device() returned %ld\n", 7021 PTR_ERR(rdev)); 7022 return PTR_ERR(rdev); 7023 } 7024 rdev->desc_nr = info->number; 7025 if (info->raid_disk < mddev->raid_disks) 7026 rdev->raid_disk = info->raid_disk; 7027 else 7028 rdev->raid_disk = -1; 7029 7030 if (rdev->raid_disk < mddev->raid_disks) 7031 if (info->state & (1<<MD_DISK_SYNC)) 7032 set_bit(In_sync, &rdev->flags); 7033 7034 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 7035 set_bit(WriteMostly, &rdev->flags); 7036 if (info->state & (1<<MD_DISK_FAILFAST)) 7037 set_bit(FailFast, &rdev->flags); 7038 7039 if (!mddev->persistent) { 7040 pr_debug("md: nonpersistent superblock ...\n"); 7041 rdev->sb_start = bdev_nr_sectors(rdev->bdev); 7042 } else 7043 rdev->sb_start = calc_dev_sboffset(rdev); 7044 rdev->sectors = rdev->sb_start; 7045 7046 err = bind_rdev_to_array(rdev, mddev); 7047 if (err) { 7048 export_rdev(rdev, mddev); 7049 return err; 7050 } 7051 } 7052 7053 return 0; 7054 } 7055 7056 static int hot_remove_disk(struct mddev *mddev, dev_t dev) 7057 { 7058 struct md_rdev *rdev; 7059 7060 if (!mddev->pers) 7061 return -ENODEV; 7062 7063 rdev = find_rdev(mddev, dev); 7064 if (!rdev) 7065 return -ENXIO; 7066 7067 if (rdev->raid_disk < 0) 7068 goto kick_rdev; 7069 7070 clear_bit(Blocked, &rdev->flags); 7071 
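	/*
	 * Blocked must be clear so that remove_and_add_spares() is allowed
	 * to detach the device; if it still occupies a raid slot afterwards
	 * the removal is rejected as busy below.
	 */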
remove_and_add_spares(mddev, rdev); 7072 7073 if (rdev->raid_disk >= 0) 7074 goto busy; 7075 7076 kick_rdev: 7077 if (mddev_is_clustered(mddev)) { 7078 if (md_cluster_ops->remove_disk(mddev, rdev)) 7079 goto busy; 7080 } 7081 7082 md_kick_rdev_from_array(rdev); 7083 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7084 if (!mddev->thread) 7085 md_update_sb(mddev, 1); 7086 md_new_event(); 7087 7088 return 0; 7089 busy: 7090 pr_debug("md: cannot remove active disk %pg from %s ...\n", 7091 rdev->bdev, mdname(mddev)); 7092 return -EBUSY; 7093 } 7094 7095 static int hot_add_disk(struct mddev *mddev, dev_t dev) 7096 { 7097 int err; 7098 struct md_rdev *rdev; 7099 7100 if (!mddev->pers) 7101 return -ENODEV; 7102 7103 if (mddev->major_version != 0) { 7104 pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n", 7105 mdname(mddev)); 7106 return -EINVAL; 7107 } 7108 if (!mddev->pers->hot_add_disk) { 7109 pr_warn("%s: personality does not support diskops!\n", 7110 mdname(mddev)); 7111 return -EINVAL; 7112 } 7113 7114 rdev = md_import_device(dev, -1, 0); 7115 if (IS_ERR(rdev)) { 7116 pr_warn("md: error, md_import_device() returned %ld\n", 7117 PTR_ERR(rdev)); 7118 return -EINVAL; 7119 } 7120 7121 if (mddev->persistent) 7122 rdev->sb_start = calc_dev_sboffset(rdev); 7123 else 7124 rdev->sb_start = bdev_nr_sectors(rdev->bdev); 7125 7126 rdev->sectors = rdev->sb_start; 7127 7128 if (test_bit(Faulty, &rdev->flags)) { 7129 pr_warn("md: can not hot-add faulty %pg disk to %s!\n", 7130 rdev->bdev, mdname(mddev)); 7131 err = -EINVAL; 7132 goto abort_export; 7133 } 7134 7135 clear_bit(In_sync, &rdev->flags); 7136 rdev->desc_nr = -1; 7137 rdev->saved_raid_disk = -1; 7138 err = bind_rdev_to_array(rdev, mddev); 7139 if (err) 7140 goto abort_export; 7141 7142 /* 7143 * The rest should better be atomic, we can have disk failures 7144 * noticed in interrupt contexts ... 7145 */ 7146 7147 rdev->raid_disk = -1; 7148 7149 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7150 if (!mddev->thread) 7151 md_update_sb(mddev, 1); 7152 /* 7153 * Kick recovery, maybe this spare has to be added to the 7154 * array immediately. 7155 */ 7156 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7157 md_new_event(); 7158 return 0; 7159 7160 abort_export: 7161 export_rdev(rdev, mddev); 7162 return err; 7163 } 7164 7165 static int set_bitmap_file(struct mddev *mddev, int fd) 7166 { 7167 int err = 0; 7168 7169 if (mddev->pers) { 7170 if (!mddev->pers->quiesce || !mddev->thread) 7171 return -EBUSY; 7172 if (mddev->recovery || mddev->sync_thread) 7173 return -EBUSY; 7174 /* we should be able to change the bitmap.. 
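		 * (the checks above ensure the personality can quiesce I/O and
		 * that no resync or recovery is currently running)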
*/ 7175 } 7176 7177 if (fd >= 0) { 7178 struct inode *inode; 7179 struct file *f; 7180 7181 if (mddev->bitmap || mddev->bitmap_info.file) 7182 return -EEXIST; /* cannot add when bitmap is present */ 7183 7184 if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) { 7185 pr_warn("%s: bitmap files not supported by this kernel\n", 7186 mdname(mddev)); 7187 return -EINVAL; 7188 } 7189 pr_warn("%s: using deprecated bitmap file support\n", 7190 mdname(mddev)); 7191 7192 f = fget(fd); 7193 7194 if (f == NULL) { 7195 pr_warn("%s: error: failed to get bitmap file\n", 7196 mdname(mddev)); 7197 return -EBADF; 7198 } 7199 7200 inode = f->f_mapping->host; 7201 if (!S_ISREG(inode->i_mode)) { 7202 pr_warn("%s: error: bitmap file must be a regular file\n", 7203 mdname(mddev)); 7204 err = -EBADF; 7205 } else if (!(f->f_mode & FMODE_WRITE)) { 7206 pr_warn("%s: error: bitmap file must open for write\n", 7207 mdname(mddev)); 7208 err = -EBADF; 7209 } else if (atomic_read(&inode->i_writecount) != 1) { 7210 pr_warn("%s: error: bitmap file is already in use\n", 7211 mdname(mddev)); 7212 err = -EBUSY; 7213 } 7214 if (err) { 7215 fput(f); 7216 return err; 7217 } 7218 mddev->bitmap_info.file = f; 7219 mddev->bitmap_info.offset = 0; /* file overrides offset */ 7220 } else if (mddev->bitmap == NULL) 7221 return -ENOENT; /* cannot remove what isn't there */ 7222 err = 0; 7223 if (mddev->pers) { 7224 if (fd >= 0) { 7225 err = mddev->bitmap_ops->create(mddev, -1); 7226 if (!err) 7227 err = mddev->bitmap_ops->load(mddev); 7228 7229 if (err) { 7230 mddev->bitmap_ops->destroy(mddev); 7231 fd = -1; 7232 } 7233 } else if (fd < 0) { 7234 mddev->bitmap_ops->destroy(mddev); 7235 } 7236 } 7237 7238 if (fd < 0) { 7239 struct file *f = mddev->bitmap_info.file; 7240 if (f) { 7241 spin_lock(&mddev->lock); 7242 mddev->bitmap_info.file = NULL; 7243 spin_unlock(&mddev->lock); 7244 fput(f); 7245 } 7246 } 7247 7248 return err; 7249 } 7250 7251 /* 7252 * md_set_array_info is used two different ways 7253 * The original usage is when creating a new array. 7254 * In this usage, raid_disks is > 0 and it together with 7255 * level, size, not_persistent,layout,chunksize determine the 7256 * shape of the array. 7257 * This will always create an array with a type-0.90.0 superblock. 7258 * The newer usage is when assembling an array. 7259 * In this case raid_disks will be 0, and the major_version field is 7260 * use to determine which style super-blocks are to be found on the devices. 7261 * The minor and patch _version numbers are also kept incase the 7262 * super_block handler wishes to interpret them. 7263 */ 7264 int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info) 7265 { 7266 if (info->raid_disks == 0) { 7267 /* just setting version number for superblock loading */ 7268 if (info->major_version < 0 || 7269 info->major_version >= ARRAY_SIZE(super_types) || 7270 super_types[info->major_version].name == NULL) { 7271 /* maybe try to auto-load a module? */ 7272 pr_warn("md: superblock version %d not known\n", 7273 info->major_version); 7274 return -EINVAL; 7275 } 7276 mddev->major_version = info->major_version; 7277 mddev->minor_version = info->minor_version; 7278 mddev->patch_version = info->patch_version; 7279 mddev->persistent = !info->not_persistent; 7280 /* ensure mddev_put doesn't delete this now that there 7281 * is some minimal configuration. 
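		 * (mddev_put() will not free an mddev once ctime is set, so
		 * this pins the partially configured array.)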
7282 */ 7283 mddev->ctime = ktime_get_real_seconds(); 7284 return 0; 7285 } 7286 mddev->major_version = MD_MAJOR_VERSION; 7287 mddev->minor_version = MD_MINOR_VERSION; 7288 mddev->patch_version = MD_PATCHLEVEL_VERSION; 7289 mddev->ctime = ktime_get_real_seconds(); 7290 7291 mddev->level = info->level; 7292 mddev->clevel[0] = 0; 7293 mddev->dev_sectors = 2 * (sector_t)info->size; 7294 mddev->raid_disks = info->raid_disks; 7295 /* don't set md_minor, it is determined by which /dev/md* was 7296 * openned 7297 */ 7298 if (info->state & (1<<MD_SB_CLEAN)) 7299 mddev->recovery_cp = MaxSector; 7300 else 7301 mddev->recovery_cp = 0; 7302 mddev->persistent = ! info->not_persistent; 7303 mddev->external = 0; 7304 7305 mddev->layout = info->layout; 7306 if (mddev->level == 0) 7307 /* Cannot trust RAID0 layout info here */ 7308 mddev->layout = -1; 7309 mddev->chunk_sectors = info->chunk_size >> 9; 7310 7311 if (mddev->persistent) { 7312 mddev->max_disks = MD_SB_DISKS; 7313 mddev->flags = 0; 7314 mddev->sb_flags = 0; 7315 } 7316 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7317 7318 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 7319 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 7320 mddev->bitmap_info.offset = 0; 7321 7322 mddev->reshape_position = MaxSector; 7323 7324 /* 7325 * Generate a 128 bit UUID 7326 */ 7327 get_random_bytes(mddev->uuid, 16); 7328 7329 mddev->new_level = mddev->level; 7330 mddev->new_chunk_sectors = mddev->chunk_sectors; 7331 mddev->new_layout = mddev->layout; 7332 mddev->delta_disks = 0; 7333 mddev->reshape_backwards = 0; 7334 7335 return 0; 7336 } 7337 7338 void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) 7339 { 7340 lockdep_assert_held(&mddev->reconfig_mutex); 7341 7342 if (mddev->external_size) 7343 return; 7344 7345 mddev->array_sectors = array_sectors; 7346 } 7347 EXPORT_SYMBOL(md_set_array_sectors); 7348 7349 static int update_size(struct mddev *mddev, sector_t num_sectors) 7350 { 7351 struct md_rdev *rdev; 7352 int rv; 7353 int fit = (num_sectors == 0); 7354 sector_t old_dev_sectors = mddev->dev_sectors; 7355 7356 if (mddev->pers->resize == NULL) 7357 return -EINVAL; 7358 /* The "num_sectors" is the number of sectors of each device that 7359 * is used. This can only make sense for arrays with redundancy. 7360 * linear and raid0 always use whatever space is available. We can only 7361 * consider changing this number if no resync or reconstruction is 7362 * happening, and if the new size is acceptable. It must fit before the 7363 * sb_start or, if that is <data_offset, it must fit before the size 7364 * of each device. If num_sectors is zero, we find the largest size 7365 * that fits. 
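	 * For example, with num_sectors == 0 the loop below settles on the
	 * smallest rdev->sectors value across all members, and that is what
	 * gets passed to the personality's ->resize() method.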
7366 */ 7367 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 7368 return -EBUSY; 7369 if (!md_is_rdwr(mddev)) 7370 return -EROFS; 7371 7372 rdev_for_each(rdev, mddev) { 7373 sector_t avail = rdev->sectors; 7374 7375 if (fit && (num_sectors == 0 || num_sectors > avail)) 7376 num_sectors = avail; 7377 if (avail < num_sectors) 7378 return -ENOSPC; 7379 } 7380 rv = mddev->pers->resize(mddev, num_sectors); 7381 if (!rv) { 7382 if (mddev_is_clustered(mddev)) 7383 md_cluster_ops->update_size(mddev, old_dev_sectors); 7384 else if (!mddev_is_dm(mddev)) 7385 set_capacity_and_notify(mddev->gendisk, 7386 mddev->array_sectors); 7387 } 7388 return rv; 7389 } 7390 7391 static int update_raid_disks(struct mddev *mddev, int raid_disks) 7392 { 7393 int rv; 7394 struct md_rdev *rdev; 7395 /* change the number of raid disks */ 7396 if (mddev->pers->check_reshape == NULL) 7397 return -EINVAL; 7398 if (!md_is_rdwr(mddev)) 7399 return -EROFS; 7400 if (raid_disks <= 0 || 7401 (mddev->max_disks && raid_disks >= mddev->max_disks)) 7402 return -EINVAL; 7403 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 7404 test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) || 7405 mddev->reshape_position != MaxSector) 7406 return -EBUSY; 7407 7408 rdev_for_each(rdev, mddev) { 7409 if (mddev->raid_disks < raid_disks && 7410 rdev->data_offset < rdev->new_data_offset) 7411 return -EINVAL; 7412 if (mddev->raid_disks > raid_disks && 7413 rdev->data_offset > rdev->new_data_offset) 7414 return -EINVAL; 7415 } 7416 7417 mddev->delta_disks = raid_disks - mddev->raid_disks; 7418 if (mddev->delta_disks < 0) 7419 mddev->reshape_backwards = 1; 7420 else if (mddev->delta_disks > 0) 7421 mddev->reshape_backwards = 0; 7422 7423 rv = mddev->pers->check_reshape(mddev); 7424 if (rv < 0) { 7425 mddev->delta_disks = 0; 7426 mddev->reshape_backwards = 0; 7427 } 7428 return rv; 7429 } 7430 7431 /* 7432 * update_array_info is used to change the configuration of an 7433 * on-line array. 7434 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 7435 * fields in the info are checked against the array. 7436 * Any differences that cannot be handled will cause an error. 7437 * Normally, only one change can be managed at a time. 
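 * For example, one call may resize the array, change raid_disks, change
 * the layout, or add/remove the internal bitmap (MD_SB_BITMAP_PRESENT) -
 * but asking for two of those at once fails with -EINVAL.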
7438 */ 7439 static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) 7440 { 7441 int rv = 0; 7442 int cnt = 0; 7443 int state = 0; 7444 7445 /* calculate expected state,ignoring low bits */ 7446 if (mddev->bitmap && mddev->bitmap_info.offset) 7447 state |= (1 << MD_SB_BITMAP_PRESENT); 7448 7449 if (mddev->major_version != info->major_version || 7450 mddev->minor_version != info->minor_version || 7451 /* mddev->patch_version != info->patch_version || */ 7452 mddev->ctime != info->ctime || 7453 mddev->level != info->level || 7454 /* mddev->layout != info->layout || */ 7455 mddev->persistent != !info->not_persistent || 7456 mddev->chunk_sectors != info->chunk_size >> 9 || 7457 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 7458 ((state^info->state) & 0xfffffe00) 7459 ) 7460 return -EINVAL; 7461 /* Check there is only one change */ 7462 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 7463 cnt++; 7464 if (mddev->raid_disks != info->raid_disks) 7465 cnt++; 7466 if (mddev->layout != info->layout) 7467 cnt++; 7468 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) 7469 cnt++; 7470 if (cnt == 0) 7471 return 0; 7472 if (cnt > 1) 7473 return -EINVAL; 7474 7475 if (mddev->layout != info->layout) { 7476 /* Change layout 7477 * we don't need to do anything at the md level, the 7478 * personality will take care of it all. 7479 */ 7480 if (mddev->pers->check_reshape == NULL) 7481 return -EINVAL; 7482 else { 7483 mddev->new_layout = info->layout; 7484 rv = mddev->pers->check_reshape(mddev); 7485 if (rv) 7486 mddev->new_layout = mddev->layout; 7487 return rv; 7488 } 7489 } 7490 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 7491 rv = update_size(mddev, (sector_t)info->size * 2); 7492 7493 if (mddev->raid_disks != info->raid_disks) 7494 rv = update_raid_disks(mddev, info->raid_disks); 7495 7496 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 7497 if (mddev->pers->quiesce == NULL || mddev->thread == NULL) { 7498 rv = -EINVAL; 7499 goto err; 7500 } 7501 if (mddev->recovery || mddev->sync_thread) { 7502 rv = -EBUSY; 7503 goto err; 7504 } 7505 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 7506 /* add the bitmap */ 7507 if (mddev->bitmap) { 7508 rv = -EEXIST; 7509 goto err; 7510 } 7511 if (mddev->bitmap_info.default_offset == 0) { 7512 rv = -EINVAL; 7513 goto err; 7514 } 7515 mddev->bitmap_info.offset = 7516 mddev->bitmap_info.default_offset; 7517 mddev->bitmap_info.space = 7518 mddev->bitmap_info.default_space; 7519 rv = mddev->bitmap_ops->create(mddev, -1); 7520 if (!rv) 7521 rv = mddev->bitmap_ops->load(mddev); 7522 7523 if (rv) 7524 mddev->bitmap_ops->destroy(mddev); 7525 } else { 7526 struct md_bitmap_stats stats; 7527 7528 rv = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 7529 if (rv) 7530 goto err; 7531 7532 if (stats.file) { 7533 rv = -EINVAL; 7534 goto err; 7535 } 7536 7537 if (mddev->bitmap_info.nodes) { 7538 /* hold PW on all the bitmap lock */ 7539 if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { 7540 pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n"); 7541 rv = -EPERM; 7542 md_cluster_ops->unlock_all_bitmaps(mddev); 7543 goto err; 7544 } 7545 7546 mddev->bitmap_info.nodes = 0; 7547 md_cluster_ops->leave(mddev); 7548 module_put(md_cluster_mod); 7549 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; 7550 } 7551 mddev->bitmap_ops->destroy(mddev); 7552 mddev->bitmap_info.offset = 0; 7553 } 7554 } 7555 md_update_sb(mddev, 1); 7556 return rv; 7557 err: 7558 return rv; 
7559 } 7560 7561 static int set_disk_faulty(struct mddev *mddev, dev_t dev) 7562 { 7563 struct md_rdev *rdev; 7564 int err = 0; 7565 7566 if (mddev->pers == NULL) 7567 return -ENODEV; 7568 7569 rcu_read_lock(); 7570 rdev = md_find_rdev_rcu(mddev, dev); 7571 if (!rdev) 7572 err = -ENODEV; 7573 else { 7574 md_error(mddev, rdev); 7575 if (test_bit(MD_BROKEN, &mddev->flags)) 7576 err = -EBUSY; 7577 } 7578 rcu_read_unlock(); 7579 return err; 7580 } 7581 7582 /* 7583 * We have a problem here : there is no easy way to give a CHS 7584 * virtual geometry. We currently pretend that we have a 2 heads 7585 * 4 sectors (with a BIG number of cylinders...). This drives 7586 * dosfs just mad... ;-) 7587 */ 7588 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 7589 { 7590 struct mddev *mddev = bdev->bd_disk->private_data; 7591 7592 geo->heads = 2; 7593 geo->sectors = 4; 7594 geo->cylinders = mddev->array_sectors / 8; 7595 return 0; 7596 } 7597 7598 static inline int md_ioctl_valid(unsigned int cmd) 7599 { 7600 switch (cmd) { 7601 case GET_ARRAY_INFO: 7602 case GET_DISK_INFO: 7603 case RAID_VERSION: 7604 return 0; 7605 case ADD_NEW_DISK: 7606 case GET_BITMAP_FILE: 7607 case HOT_ADD_DISK: 7608 case HOT_REMOVE_DISK: 7609 case RESTART_ARRAY_RW: 7610 case RUN_ARRAY: 7611 case SET_ARRAY_INFO: 7612 case SET_BITMAP_FILE: 7613 case SET_DISK_FAULTY: 7614 case STOP_ARRAY: 7615 case STOP_ARRAY_RO: 7616 case CLUSTERED_DISK_NACK: 7617 if (!capable(CAP_SYS_ADMIN)) 7618 return -EACCES; 7619 return 0; 7620 default: 7621 return -ENOTTY; 7622 } 7623 } 7624 7625 static bool md_ioctl_need_suspend(unsigned int cmd) 7626 { 7627 switch (cmd) { 7628 case ADD_NEW_DISK: 7629 case HOT_ADD_DISK: 7630 case HOT_REMOVE_DISK: 7631 case SET_BITMAP_FILE: 7632 case SET_ARRAY_INFO: 7633 return true; 7634 default: 7635 return false; 7636 } 7637 } 7638 7639 static int __md_set_array_info(struct mddev *mddev, void __user *argp) 7640 { 7641 mdu_array_info_t info; 7642 int err; 7643 7644 if (!argp) 7645 memset(&info, 0, sizeof(info)); 7646 else if (copy_from_user(&info, argp, sizeof(info))) 7647 return -EFAULT; 7648 7649 if (mddev->pers) { 7650 err = update_array_info(mddev, &info); 7651 if (err) 7652 pr_warn("md: couldn't update array info. %d\n", err); 7653 return err; 7654 } 7655 7656 if (!list_empty(&mddev->disks)) { 7657 pr_warn("md: array %s already has disks!\n", mdname(mddev)); 7658 return -EBUSY; 7659 } 7660 7661 if (mddev->raid_disks) { 7662 pr_warn("md: array %s already initialised!\n", mdname(mddev)); 7663 return -EBUSY; 7664 } 7665 7666 err = md_set_array_info(mddev, &info); 7667 if (err) 7668 pr_warn("md: couldn't set array info. 
%d\n", err); 7669 7670 return err; 7671 } 7672 7673 static int md_ioctl(struct block_device *bdev, blk_mode_t mode, 7674 unsigned int cmd, unsigned long arg) 7675 { 7676 int err = 0; 7677 void __user *argp = (void __user *)arg; 7678 struct mddev *mddev = NULL; 7679 7680 err = md_ioctl_valid(cmd); 7681 if (err) 7682 return err; 7683 7684 /* 7685 * Commands dealing with the RAID driver but not any 7686 * particular array: 7687 */ 7688 if (cmd == RAID_VERSION) 7689 return get_version(argp); 7690 7691 /* 7692 * Commands creating/starting a new array: 7693 */ 7694 7695 mddev = bdev->bd_disk->private_data; 7696 7697 /* Some actions do not requires the mutex */ 7698 switch (cmd) { 7699 case GET_ARRAY_INFO: 7700 if (!mddev->raid_disks && !mddev->external) 7701 return -ENODEV; 7702 return get_array_info(mddev, argp); 7703 7704 case GET_DISK_INFO: 7705 if (!mddev->raid_disks && !mddev->external) 7706 return -ENODEV; 7707 return get_disk_info(mddev, argp); 7708 7709 case SET_DISK_FAULTY: 7710 return set_disk_faulty(mddev, new_decode_dev(arg)); 7711 7712 case GET_BITMAP_FILE: 7713 return get_bitmap_file(mddev, argp); 7714 } 7715 7716 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { 7717 /* Need to flush page cache, and ensure no-one else opens 7718 * and writes 7719 */ 7720 err = mddev_set_closing_and_sync_blockdev(mddev, 1); 7721 if (err) 7722 return err; 7723 } 7724 7725 if (!md_is_rdwr(mddev)) 7726 flush_work(&mddev->sync_work); 7727 7728 err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) : 7729 mddev_lock(mddev); 7730 if (err) { 7731 pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n", 7732 err, cmd); 7733 goto out; 7734 } 7735 7736 if (cmd == SET_ARRAY_INFO) { 7737 err = __md_set_array_info(mddev, argp); 7738 goto unlock; 7739 } 7740 7741 /* 7742 * Commands querying/configuring an existing array: 7743 */ 7744 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 7745 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 7746 if ((!mddev->raid_disks && !mddev->external) 7747 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 7748 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 7749 && cmd != GET_BITMAP_FILE) { 7750 err = -ENODEV; 7751 goto unlock; 7752 } 7753 7754 /* 7755 * Commands even a read-only array can execute: 7756 */ 7757 switch (cmd) { 7758 case RESTART_ARRAY_RW: 7759 err = restart_array(mddev); 7760 goto unlock; 7761 7762 case STOP_ARRAY: 7763 err = do_md_stop(mddev, 0); 7764 goto unlock; 7765 7766 case STOP_ARRAY_RO: 7767 if (mddev->pers) 7768 err = md_set_readonly(mddev); 7769 goto unlock; 7770 7771 case HOT_REMOVE_DISK: 7772 err = hot_remove_disk(mddev, new_decode_dev(arg)); 7773 goto unlock; 7774 7775 case ADD_NEW_DISK: 7776 /* We can support ADD_NEW_DISK on read-only arrays 7777 * only if we are re-adding a preexisting device. 7778 * So require mddev->pers and MD_DISK_SYNC. 7779 */ 7780 if (mddev->pers) { 7781 mdu_disk_info_t info; 7782 if (copy_from_user(&info, argp, sizeof(info))) 7783 err = -EFAULT; 7784 else if (!(info.state & (1<<MD_DISK_SYNC))) 7785 /* Need to clear read-only for this */ 7786 break; 7787 else 7788 err = md_add_new_disk(mddev, &info); 7789 goto unlock; 7790 } 7791 break; 7792 } 7793 7794 /* 7795 * The remaining ioctls are changing the state of the 7796 * superblock, so we do not allow them on read-only arrays. 
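	 * An MD_AUTO_READ array is transparently switched back to read-write
	 * first (see below); a genuinely read-only array gets -EROFS.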
7797 */ 7798 if (!md_is_rdwr(mddev) && mddev->pers) { 7799 if (mddev->ro != MD_AUTO_READ) { 7800 err = -EROFS; 7801 goto unlock; 7802 } 7803 mddev->ro = MD_RDWR; 7804 sysfs_notify_dirent_safe(mddev->sysfs_state); 7805 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7806 /* mddev_unlock will wake thread */ 7807 /* If a device failed while we were read-only, we 7808 * need to make sure the metadata is updated now. 7809 */ 7810 if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) { 7811 mddev_unlock(mddev); 7812 wait_event(mddev->sb_wait, 7813 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) && 7814 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 7815 mddev_lock_nointr(mddev); 7816 } 7817 } 7818 7819 switch (cmd) { 7820 case ADD_NEW_DISK: 7821 { 7822 mdu_disk_info_t info; 7823 if (copy_from_user(&info, argp, sizeof(info))) 7824 err = -EFAULT; 7825 else 7826 err = md_add_new_disk(mddev, &info); 7827 goto unlock; 7828 } 7829 7830 case CLUSTERED_DISK_NACK: 7831 if (mddev_is_clustered(mddev)) 7832 md_cluster_ops->new_disk_ack(mddev, false); 7833 else 7834 err = -EINVAL; 7835 goto unlock; 7836 7837 case HOT_ADD_DISK: 7838 err = hot_add_disk(mddev, new_decode_dev(arg)); 7839 goto unlock; 7840 7841 case RUN_ARRAY: 7842 err = do_md_run(mddev); 7843 goto unlock; 7844 7845 case SET_BITMAP_FILE: 7846 err = set_bitmap_file(mddev, (int)arg); 7847 goto unlock; 7848 7849 default: 7850 err = -EINVAL; 7851 goto unlock; 7852 } 7853 7854 unlock: 7855 if (mddev->hold_active == UNTIL_IOCTL && 7856 err != -EINVAL) 7857 mddev->hold_active = 0; 7858 7859 md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) : 7860 mddev_unlock(mddev); 7861 7862 out: 7863 if (cmd == STOP_ARRAY_RO || (err && cmd == STOP_ARRAY)) 7864 clear_bit(MD_CLOSING, &mddev->flags); 7865 return err; 7866 } 7867 #ifdef CONFIG_COMPAT 7868 static int md_compat_ioctl(struct block_device *bdev, blk_mode_t mode, 7869 unsigned int cmd, unsigned long arg) 7870 { 7871 switch (cmd) { 7872 case HOT_REMOVE_DISK: 7873 case HOT_ADD_DISK: 7874 case SET_DISK_FAULTY: 7875 case SET_BITMAP_FILE: 7876 /* These take in integer arg, do not convert */ 7877 break; 7878 default: 7879 arg = (unsigned long)compat_ptr(arg); 7880 break; 7881 } 7882 7883 return md_ioctl(bdev, mode, cmd, arg); 7884 } 7885 #endif /* CONFIG_COMPAT */ 7886 7887 static int md_set_read_only(struct block_device *bdev, bool ro) 7888 { 7889 struct mddev *mddev = bdev->bd_disk->private_data; 7890 int err; 7891 7892 err = mddev_lock(mddev); 7893 if (err) 7894 return err; 7895 7896 if (!mddev->raid_disks && !mddev->external) { 7897 err = -ENODEV; 7898 goto out_unlock; 7899 } 7900 7901 /* 7902 * Transitioning to read-auto need only happen for arrays that call 7903 * md_write_start and which are not ready for writes yet. 
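	 * (e.g. clearing the ro flag via BLKROSET on an MD_RDONLY array
	 * restarts it and leaves it in MD_AUTO_READ until the first write.)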
7904 */ 7905 if (!ro && mddev->ro == MD_RDONLY && mddev->pers) { 7906 err = restart_array(mddev); 7907 if (err) 7908 goto out_unlock; 7909 mddev->ro = MD_AUTO_READ; 7910 } 7911 7912 out_unlock: 7913 mddev_unlock(mddev); 7914 return err; 7915 } 7916 7917 static int md_open(struct gendisk *disk, blk_mode_t mode) 7918 { 7919 struct mddev *mddev; 7920 int err; 7921 7922 spin_lock(&all_mddevs_lock); 7923 mddev = mddev_get(disk->private_data); 7924 spin_unlock(&all_mddevs_lock); 7925 if (!mddev) 7926 return -ENODEV; 7927 7928 err = mutex_lock_interruptible(&mddev->open_mutex); 7929 if (err) 7930 goto out; 7931 7932 err = -ENODEV; 7933 if (test_bit(MD_CLOSING, &mddev->flags)) 7934 goto out_unlock; 7935 7936 atomic_inc(&mddev->openers); 7937 mutex_unlock(&mddev->open_mutex); 7938 7939 disk_check_media_change(disk); 7940 return 0; 7941 7942 out_unlock: 7943 mutex_unlock(&mddev->open_mutex); 7944 out: 7945 mddev_put(mddev); 7946 return err; 7947 } 7948 7949 static void md_release(struct gendisk *disk) 7950 { 7951 struct mddev *mddev = disk->private_data; 7952 7953 BUG_ON(!mddev); 7954 atomic_dec(&mddev->openers); 7955 mddev_put(mddev); 7956 } 7957 7958 static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing) 7959 { 7960 struct mddev *mddev = disk->private_data; 7961 unsigned int ret = 0; 7962 7963 if (mddev->changed) 7964 ret = DISK_EVENT_MEDIA_CHANGE; 7965 mddev->changed = 0; 7966 return ret; 7967 } 7968 7969 static void md_free_disk(struct gendisk *disk) 7970 { 7971 struct mddev *mddev = disk->private_data; 7972 7973 mddev_free(mddev); 7974 } 7975 7976 const struct block_device_operations md_fops = 7977 { 7978 .owner = THIS_MODULE, 7979 .submit_bio = md_submit_bio, 7980 .open = md_open, 7981 .release = md_release, 7982 .ioctl = md_ioctl, 7983 #ifdef CONFIG_COMPAT 7984 .compat_ioctl = md_compat_ioctl, 7985 #endif 7986 .getgeo = md_getgeo, 7987 .check_events = md_check_events, 7988 .set_read_only = md_set_read_only, 7989 .free_disk = md_free_disk, 7990 }; 7991 7992 static int md_thread(void *arg) 7993 { 7994 struct md_thread *thread = arg; 7995 7996 /* 7997 * md_thread is a 'system-thread', it's priority should be very 7998 * high. We avoid resource deadlocks individually in each 7999 * raid personality. (RAID5 does preallocation) We also use RR and 8000 * the very same RT priority as kswapd, thus we will never get 8001 * into a priority inversion deadlock. 8002 * 8003 * we definitely have to have equal or higher priority than 8004 * bdflush, otherwise bdflush will deadlock if there are too 8005 * many dirty RAID5 blocks. 8006 */ 8007 8008 allow_signal(SIGKILL); 8009 while (!kthread_should_stop()) { 8010 8011 /* We need to wait INTERRUPTIBLE so that 8012 * we don't add to the load-average. 
8013 * That means we need to be sure no signals are 8014 * pending 8015 */ 8016 if (signal_pending(current)) 8017 flush_signals(current); 8018 8019 wait_event_interruptible_timeout 8020 (thread->wqueue, 8021 test_bit(THREAD_WAKEUP, &thread->flags) 8022 || kthread_should_stop() || kthread_should_park(), 8023 thread->timeout); 8024 8025 clear_bit(THREAD_WAKEUP, &thread->flags); 8026 if (kthread_should_park()) 8027 kthread_parkme(); 8028 if (!kthread_should_stop()) 8029 thread->run(thread); 8030 } 8031 8032 return 0; 8033 } 8034 8035 static void md_wakeup_thread_directly(struct md_thread __rcu *thread) 8036 { 8037 struct md_thread *t; 8038 8039 rcu_read_lock(); 8040 t = rcu_dereference(thread); 8041 if (t) 8042 wake_up_process(t->tsk); 8043 rcu_read_unlock(); 8044 } 8045 8046 void md_wakeup_thread(struct md_thread __rcu *thread) 8047 { 8048 struct md_thread *t; 8049 8050 rcu_read_lock(); 8051 t = rcu_dereference(thread); 8052 if (t) { 8053 pr_debug("md: waking up MD thread %s.\n", t->tsk->comm); 8054 set_bit(THREAD_WAKEUP, &t->flags); 8055 if (wq_has_sleeper(&t->wqueue)) 8056 wake_up(&t->wqueue); 8057 } 8058 rcu_read_unlock(); 8059 } 8060 EXPORT_SYMBOL(md_wakeup_thread); 8061 8062 struct md_thread *md_register_thread(void (*run) (struct md_thread *), 8063 struct mddev *mddev, const char *name) 8064 { 8065 struct md_thread *thread; 8066 8067 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); 8068 if (!thread) 8069 return NULL; 8070 8071 init_waitqueue_head(&thread->wqueue); 8072 8073 thread->run = run; 8074 thread->mddev = mddev; 8075 thread->timeout = MAX_SCHEDULE_TIMEOUT; 8076 thread->tsk = kthread_run(md_thread, thread, 8077 "%s_%s", 8078 mdname(thread->mddev), 8079 name); 8080 if (IS_ERR(thread->tsk)) { 8081 kfree(thread); 8082 return NULL; 8083 } 8084 return thread; 8085 } 8086 EXPORT_SYMBOL(md_register_thread); 8087 8088 void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp) 8089 { 8090 struct md_thread *thread = rcu_dereference_protected(*threadp, 8091 lockdep_is_held(&mddev->reconfig_mutex)); 8092 8093 if (!thread) 8094 return; 8095 8096 rcu_assign_pointer(*threadp, NULL); 8097 synchronize_rcu(); 8098 8099 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 8100 kthread_stop(thread->tsk); 8101 kfree(thread); 8102 } 8103 EXPORT_SYMBOL(md_unregister_thread); 8104 8105 void md_error(struct mddev *mddev, struct md_rdev *rdev) 8106 { 8107 if (!rdev || test_bit(Faulty, &rdev->flags)) 8108 return; 8109 8110 if (!mddev->pers || !mddev->pers->error_handler) 8111 return; 8112 mddev->pers->error_handler(mddev, rdev); 8113 8114 if (mddev->pers->level == 0 || mddev->pers->level == LEVEL_LINEAR) 8115 return; 8116 8117 if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) 8118 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8119 sysfs_notify_dirent_safe(rdev->sysfs_state); 8120 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8121 if (!test_bit(MD_BROKEN, &mddev->flags)) { 8122 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8123 md_wakeup_thread(mddev->thread); 8124 } 8125 if (mddev->event_work.func) 8126 queue_work(md_misc_wq, &mddev->event_work); 8127 md_new_event(); 8128 } 8129 EXPORT_SYMBOL(md_error); 8130 8131 /* seq_file implementation /proc/mdstat */ 8132 8133 static void status_unused(struct seq_file *seq) 8134 { 8135 int i = 0; 8136 struct md_rdev *rdev; 8137 8138 seq_printf(seq, "unused devices: "); 8139 8140 list_for_each_entry(rdev, &pending_raid_disks, same_set) { 8141 i++; 8142 seq_printf(seq, "%pg ", rdev->bdev); 8143 } 8144 if (!i) 
8145 seq_printf(seq, "<none>"); 8146 8147 seq_printf(seq, "\n"); 8148 } 8149 8150 static void status_personalities(struct seq_file *seq) 8151 { 8152 struct md_personality *pers; 8153 8154 seq_puts(seq, "Personalities : "); 8155 spin_lock(&pers_lock); 8156 list_for_each_entry(pers, &pers_list, list) 8157 seq_printf(seq, "[%s] ", pers->name); 8158 8159 spin_unlock(&pers_lock); 8160 seq_puts(seq, "\n"); 8161 } 8162 8163 static int status_resync(struct seq_file *seq, struct mddev *mddev) 8164 { 8165 sector_t max_sectors, resync, res; 8166 unsigned long dt, db = 0; 8167 sector_t rt, curr_mark_cnt, resync_mark_cnt; 8168 int scale, recovery_active; 8169 unsigned int per_milli; 8170 8171 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 8172 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 8173 max_sectors = mddev->resync_max_sectors; 8174 else 8175 max_sectors = mddev->dev_sectors; 8176 8177 resync = mddev->curr_resync; 8178 if (resync < MD_RESYNC_ACTIVE) { 8179 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 8180 /* Still cleaning up */ 8181 resync = max_sectors; 8182 } else if (resync > max_sectors) { 8183 resync = max_sectors; 8184 } else { 8185 res = atomic_read(&mddev->recovery_active); 8186 /* 8187 * Resync has started, but the subtraction has overflowed or 8188 * yielded one of the special values. Force it to active to 8189 * ensure the status reports an active resync. 8190 */ 8191 if (resync < res || resync - res < MD_RESYNC_ACTIVE) 8192 resync = MD_RESYNC_ACTIVE; 8193 else 8194 resync -= res; 8195 } 8196 8197 if (resync == MD_RESYNC_NONE) { 8198 if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { 8199 struct md_rdev *rdev; 8200 8201 rdev_for_each(rdev, mddev) 8202 if (rdev->raid_disk >= 0 && 8203 !test_bit(Faulty, &rdev->flags) && 8204 rdev->recovery_offset != MaxSector && 8205 rdev->recovery_offset) { 8206 seq_printf(seq, "\trecover=REMOTE"); 8207 return 1; 8208 } 8209 if (mddev->reshape_position != MaxSector) 8210 seq_printf(seq, "\treshape=REMOTE"); 8211 else 8212 seq_printf(seq, "\tresync=REMOTE"); 8213 return 1; 8214 } 8215 if (mddev->recovery_cp < MaxSector) { 8216 seq_printf(seq, "\tresync=PENDING"); 8217 return 1; 8218 } 8219 return 0; 8220 } 8221 if (resync < MD_RESYNC_ACTIVE) { 8222 seq_printf(seq, "\tresync=DELAYED"); 8223 return 1; 8224 } 8225 8226 WARN_ON(max_sectors == 0); 8227 /* Pick 'scale' such that (resync>>scale)*1000 will fit 8228 * in a sector_t, and (max_sectors>>scale) will fit in a 8229 * u32, as those are the requirements for sector_div. 8230 * Thus 'scale' must be at least 10 8231 */ 8232 scale = 10; 8233 if (sizeof(sector_t) > sizeof(unsigned long)) { 8234 while ( max_sectors/2 > (1ULL<<(scale+32))) 8235 scale++; 8236 } 8237 res = (resync>>scale)*1000; 8238 sector_div(res, (u32)((max_sectors>>scale)+1)); 8239 8240 per_milli = res; 8241 { 8242 int i, x = per_milli/50, y = 20-x; 8243 seq_printf(seq, "["); 8244 for (i = 0; i < x; i++) 8245 seq_printf(seq, "="); 8246 seq_printf(seq, ">"); 8247 for (i = 0; i < y; i++) 8248 seq_printf(seq, "."); 8249 seq_printf(seq, "] "); 8250 } 8251 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 8252 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 8253 "reshape" : 8254 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 8255 "check" : 8256 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 
8257 "resync" : "recovery"))), 8258 per_milli/10, per_milli % 10, 8259 (unsigned long long) resync/2, 8260 (unsigned long long) max_sectors/2); 8261 8262 /* 8263 * dt: time from mark until now 8264 * db: blocks written from mark until now 8265 * rt: remaining time 8266 * 8267 * rt is a sector_t, which is always 64bit now. We are keeping 8268 * the original algorithm, but it is not really necessary. 8269 * 8270 * Original algorithm: 8271 * So we divide before multiply in case it is 32bit and close 8272 * to the limit. 8273 * We scale the divisor (db) by 32 to avoid losing precision 8274 * near the end of resync when the number of remaining sectors 8275 * is close to 'db'. 8276 * We then divide rt by 32 after multiplying by db to compensate. 8277 * The '+1' avoids division by zero if db is very small. 8278 */ 8279 dt = ((jiffies - mddev->resync_mark) / HZ); 8280 if (!dt) dt++; 8281 8282 curr_mark_cnt = mddev->curr_mark_cnt; 8283 recovery_active = atomic_read(&mddev->recovery_active); 8284 resync_mark_cnt = mddev->resync_mark_cnt; 8285 8286 if (curr_mark_cnt >= (recovery_active + resync_mark_cnt)) 8287 db = curr_mark_cnt - (recovery_active + resync_mark_cnt); 8288 8289 rt = max_sectors - resync; /* number of remaining sectors */ 8290 rt = div64_u64(rt, db/32+1); 8291 rt *= dt; 8292 rt >>= 5; 8293 8294 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, 8295 ((unsigned long)rt % 60)/6); 8296 8297 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 8298 return 1; 8299 } 8300 8301 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 8302 __acquires(&all_mddevs_lock) 8303 { 8304 seq->poll_event = atomic_read(&md_event_count); 8305 spin_lock(&all_mddevs_lock); 8306 8307 return seq_list_start_head(&all_mddevs, *pos); 8308 } 8309 8310 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 8311 { 8312 return seq_list_next(v, &all_mddevs, pos); 8313 } 8314 8315 static void md_seq_stop(struct seq_file *seq, void *v) 8316 __releases(&all_mddevs_lock) 8317 { 8318 spin_unlock(&all_mddevs_lock); 8319 } 8320 8321 static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev) 8322 { 8323 struct md_bitmap_stats stats; 8324 unsigned long used_pages; 8325 unsigned long chunk_kb; 8326 int err; 8327 8328 err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 8329 if (err) 8330 return; 8331 8332 chunk_kb = mddev->bitmap_info.chunksize >> 10; 8333 used_pages = stats.pages - stats.missing_pages; 8334 8335 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], %lu%s chunk", 8336 used_pages, stats.pages, used_pages << (PAGE_SHIFT - 10), 8337 chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize, 8338 chunk_kb ? 
"KB" : "B"); 8339 8340 if (stats.file) { 8341 seq_puts(seq, ", file: "); 8342 seq_file_path(seq, stats.file, " \t\n"); 8343 } 8344 8345 seq_putc(seq, '\n'); 8346 } 8347 8348 static int md_seq_show(struct seq_file *seq, void *v) 8349 { 8350 struct mddev *mddev; 8351 sector_t sectors; 8352 struct md_rdev *rdev; 8353 8354 if (v == &all_mddevs) { 8355 status_personalities(seq); 8356 if (list_empty(&all_mddevs)) 8357 status_unused(seq); 8358 return 0; 8359 } 8360 8361 mddev = list_entry(v, struct mddev, all_mddevs); 8362 if (!mddev_get(mddev)) 8363 return 0; 8364 8365 spin_unlock(&all_mddevs_lock); 8366 8367 /* prevent bitmap to be freed after checking */ 8368 mutex_lock(&mddev->bitmap_info.mutex); 8369 8370 spin_lock(&mddev->lock); 8371 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 8372 seq_printf(seq, "%s : ", mdname(mddev)); 8373 if (mddev->pers) { 8374 if (test_bit(MD_BROKEN, &mddev->flags)) 8375 seq_printf(seq, "broken"); 8376 else 8377 seq_printf(seq, "active"); 8378 if (mddev->ro == MD_RDONLY) 8379 seq_printf(seq, " (read-only)"); 8380 if (mddev->ro == MD_AUTO_READ) 8381 seq_printf(seq, " (auto-read-only)"); 8382 seq_printf(seq, " %s", mddev->pers->name); 8383 } else { 8384 seq_printf(seq, "inactive"); 8385 } 8386 8387 sectors = 0; 8388 rcu_read_lock(); 8389 rdev_for_each_rcu(rdev, mddev) { 8390 seq_printf(seq, " %pg[%d]", rdev->bdev, rdev->desc_nr); 8391 8392 if (test_bit(WriteMostly, &rdev->flags)) 8393 seq_printf(seq, "(W)"); 8394 if (test_bit(Journal, &rdev->flags)) 8395 seq_printf(seq, "(J)"); 8396 if (test_bit(Faulty, &rdev->flags)) { 8397 seq_printf(seq, "(F)"); 8398 continue; 8399 } 8400 if (rdev->raid_disk < 0) 8401 seq_printf(seq, "(S)"); /* spare */ 8402 if (test_bit(Replacement, &rdev->flags)) 8403 seq_printf(seq, "(R)"); 8404 sectors += rdev->sectors; 8405 } 8406 rcu_read_unlock(); 8407 8408 if (!list_empty(&mddev->disks)) { 8409 if (mddev->pers) 8410 seq_printf(seq, "\n %llu blocks", 8411 (unsigned long long) 8412 mddev->array_sectors / 2); 8413 else 8414 seq_printf(seq, "\n %llu blocks", 8415 (unsigned long long)sectors / 2); 8416 } 8417 if (mddev->persistent) { 8418 if (mddev->major_version != 0 || 8419 mddev->minor_version != 90) { 8420 seq_printf(seq," super %d.%d", 8421 mddev->major_version, 8422 mddev->minor_version); 8423 } 8424 } else if (mddev->external) 8425 seq_printf(seq, " super external:%s", 8426 mddev->metadata_type); 8427 else 8428 seq_printf(seq, " super non-persistent"); 8429 8430 if (mddev->pers) { 8431 mddev->pers->status(seq, mddev); 8432 seq_printf(seq, "\n "); 8433 if (mddev->pers->sync_request) { 8434 if (status_resync(seq, mddev)) 8435 seq_printf(seq, "\n "); 8436 } 8437 } else 8438 seq_printf(seq, "\n "); 8439 8440 md_bitmap_status(seq, mddev); 8441 8442 seq_printf(seq, "\n"); 8443 } 8444 spin_unlock(&mddev->lock); 8445 mutex_unlock(&mddev->bitmap_info.mutex); 8446 spin_lock(&all_mddevs_lock); 8447 8448 if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs)) 8449 status_unused(seq); 8450 8451 if (atomic_dec_and_test(&mddev->active)) 8452 __mddev_put(mddev); 8453 8454 return 0; 8455 } 8456 8457 static const struct seq_operations md_seq_ops = { 8458 .start = md_seq_start, 8459 .next = md_seq_next, 8460 .stop = md_seq_stop, 8461 .show = md_seq_show, 8462 }; 8463 8464 static int md_seq_open(struct inode *inode, struct file *file) 8465 { 8466 struct seq_file *seq; 8467 int error; 8468 8469 error = seq_open(file, &md_seq_ops); 8470 if (error) 8471 return error; 8472 8473 seq = file->private_data; 8474 seq->poll_event 
= atomic_read(&md_event_count); 8475 return error; 8476 } 8477 8478 static int md_unloading; 8479 static __poll_t mdstat_poll(struct file *filp, poll_table *wait) 8480 { 8481 struct seq_file *seq = filp->private_data; 8482 __poll_t mask; 8483 8484 if (md_unloading) 8485 return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; 8486 poll_wait(filp, &md_event_waiters, wait); 8487 8488 /* always allow read */ 8489 mask = EPOLLIN | EPOLLRDNORM; 8490 8491 if (seq->poll_event != atomic_read(&md_event_count)) 8492 mask |= EPOLLERR | EPOLLPRI; 8493 return mask; 8494 } 8495 8496 static const struct proc_ops mdstat_proc_ops = { 8497 .proc_open = md_seq_open, 8498 .proc_read = seq_read, 8499 .proc_lseek = seq_lseek, 8500 .proc_release = seq_release, 8501 .proc_poll = mdstat_poll, 8502 }; 8503 8504 int register_md_personality(struct md_personality *p) 8505 { 8506 pr_debug("md: %s personality registered for level %d\n", 8507 p->name, p->level); 8508 spin_lock(&pers_lock); 8509 list_add_tail(&p->list, &pers_list); 8510 spin_unlock(&pers_lock); 8511 return 0; 8512 } 8513 EXPORT_SYMBOL(register_md_personality); 8514 8515 int unregister_md_personality(struct md_personality *p) 8516 { 8517 pr_debug("md: %s personality unregistered\n", p->name); 8518 spin_lock(&pers_lock); 8519 list_del_init(&p->list); 8520 spin_unlock(&pers_lock); 8521 return 0; 8522 } 8523 EXPORT_SYMBOL(unregister_md_personality); 8524 8525 int register_md_cluster_operations(const struct md_cluster_operations *ops, 8526 struct module *module) 8527 { 8528 int ret = 0; 8529 spin_lock(&pers_lock); 8530 if (md_cluster_ops != NULL) 8531 ret = -EALREADY; 8532 else { 8533 md_cluster_ops = ops; 8534 md_cluster_mod = module; 8535 } 8536 spin_unlock(&pers_lock); 8537 return ret; 8538 } 8539 EXPORT_SYMBOL(register_md_cluster_operations); 8540 8541 int unregister_md_cluster_operations(void) 8542 { 8543 spin_lock(&pers_lock); 8544 md_cluster_ops = NULL; 8545 spin_unlock(&pers_lock); 8546 return 0; 8547 } 8548 EXPORT_SYMBOL(unregister_md_cluster_operations); 8549 8550 int md_setup_cluster(struct mddev *mddev, int nodes) 8551 { 8552 int ret; 8553 if (!md_cluster_ops) 8554 request_module("md-cluster"); 8555 spin_lock(&pers_lock); 8556 /* ensure module won't be unloaded */ 8557 if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { 8558 pr_warn("can't find md-cluster module or get its reference.\n"); 8559 spin_unlock(&pers_lock); 8560 return -ENOENT; 8561 } 8562 spin_unlock(&pers_lock); 8563 8564 ret = md_cluster_ops->join(mddev, nodes); 8565 if (!ret) 8566 mddev->safemode_delay = 0; 8567 return ret; 8568 } 8569 8570 void md_cluster_stop(struct mddev *mddev) 8571 { 8572 if (!md_cluster_ops) 8573 return; 8574 md_cluster_ops->leave(mddev); 8575 module_put(md_cluster_mod); 8576 } 8577 8578 static int is_mddev_idle(struct mddev *mddev, int init) 8579 { 8580 struct md_rdev *rdev; 8581 int idle; 8582 int curr_events; 8583 8584 idle = 1; 8585 rcu_read_lock(); 8586 rdev_for_each_rcu(rdev, mddev) { 8587 struct gendisk *disk = rdev->bdev->bd_disk; 8588 8589 if (!init && !blk_queue_io_stat(disk->queue)) 8590 continue; 8591 8592 curr_events = (int)part_stat_read_accum(disk->part0, sectors) - 8593 atomic_read(&disk->sync_io); 8594 /* sync IO will cause sync_io to increase before the disk_stats 8595 * as sync_io is counted when a request starts, and 8596 * disk_stats is counted when it completes. 8597 * So resync activity will cause curr_events to be smaller than 8598 * when there was no such activity. 
8599 * non-sync IO will cause disk_stat to increase without 8600 * increasing sync_io so curr_events will (eventually) 8601 * be larger than it was before. Once it becomes 8602 * substantially larger, the test below will cause 8603 * the array to appear non-idle, and resync will slow 8604 * down. 8605 * If there is a lot of outstanding resync activity when 8606 * we set last_event to curr_events, then all that activity 8607 * completing might cause the array to appear non-idle 8608 * and resync will be slowed down even though there might 8609 * not have been non-resync activity. This will only 8610 * happen once though. 'last_events' will soon reflect 8611 * the state where there are few or no outstanding 8612 * resync requests, and further resync activity will 8613 * always make curr_events less than last_events. 8614 * 8615 */ 8616 if (init || curr_events - rdev->last_events > 64) { 8617 rdev->last_events = curr_events; 8618 idle = 0; 8619 } 8620 } 8621 rcu_read_unlock(); 8622 return idle; 8623 } 8624 8625 void md_done_sync(struct mddev *mddev, int blocks, int ok) 8626 { 8627 /* another "blocks" (512byte) blocks have been synced */ 8628 atomic_sub(blocks, &mddev->recovery_active); 8629 wake_up(&mddev->recovery_wait); 8630 if (!ok) { 8631 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8632 set_bit(MD_RECOVERY_ERROR, &mddev->recovery); 8633 md_wakeup_thread(mddev->thread); 8634 // stop recovery, signal do_sync .... 8635 } 8636 } 8637 EXPORT_SYMBOL(md_done_sync); 8638 8639 /* md_write_start(mddev, bi) 8640 * If we need to update some array metadata (e.g. 'active' flag 8641 * in superblock) before writing, schedule a superblock update 8642 * and wait for it to complete. 8643 * A return value of 'false' means that the write wasn't recorded 8644 * and cannot proceed as the array is being suspended. 8645 */ 8646 void md_write_start(struct mddev *mddev, struct bio *bi) 8647 { 8648 int did_change = 0; 8649 8650 if (bio_data_dir(bi) != WRITE) 8651 return; 8652 8653 BUG_ON(mddev->ro == MD_RDONLY); 8654 if (mddev->ro == MD_AUTO_READ) { 8655 /* need to switch to read/write */ 8656 mddev->ro = MD_RDWR; 8657 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8658 md_wakeup_thread(mddev->thread); 8659 md_wakeup_thread(mddev->sync_thread); 8660 did_change = 1; 8661 } 8662 rcu_read_lock(); 8663 percpu_ref_get(&mddev->writes_pending); 8664 smp_mb(); /* Match smp_mb in set_in_sync() */ 8665 if (mddev->safemode == 1) 8666 mddev->safemode = 0; 8667 /* sync_checkers is always 0 when writes_pending is in per-cpu mode */ 8668 if (mddev->in_sync || mddev->sync_checkers) { 8669 spin_lock(&mddev->lock); 8670 if (mddev->in_sync) { 8671 mddev->in_sync = 0; 8672 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 8673 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 8674 md_wakeup_thread(mddev->thread); 8675 did_change = 1; 8676 } 8677 spin_unlock(&mddev->lock); 8678 } 8679 rcu_read_unlock(); 8680 if (did_change) 8681 sysfs_notify_dirent_safe(mddev->sysfs_state); 8682 if (!mddev->has_superblocks) 8683 return; 8684 wait_event(mddev->sb_wait, 8685 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 8686 } 8687 EXPORT_SYMBOL(md_write_start); 8688 8689 /* md_write_inc can only be called when md_write_start() has 8690 * already been called at least once for the current request. 8691 * It increments the counter and is useful when a single request 8692 * is split into several parts. Each part causes an increment and 8693 * so needs a matching md_write_end().
8694 * Unlike md_write_start(), it is safe to call md_write_inc() inside 8695 * a spinlocked region. 8696 */ 8697 void md_write_inc(struct mddev *mddev, struct bio *bi) 8698 { 8699 if (bio_data_dir(bi) != WRITE) 8700 return; 8701 WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev)); 8702 percpu_ref_get(&mddev->writes_pending); 8703 } 8704 EXPORT_SYMBOL(md_write_inc); 8705 8706 void md_write_end(struct mddev *mddev) 8707 { 8708 percpu_ref_put(&mddev->writes_pending); 8709 8710 if (mddev->safemode == 2) 8711 md_wakeup_thread(mddev->thread); 8712 else if (mddev->safemode_delay) 8713 /* The roundup() ensures this only performs locking once 8714 * every ->safemode_delay jiffies 8715 */ 8716 mod_timer(&mddev->safemode_timer, 8717 roundup(jiffies, mddev->safemode_delay) + 8718 mddev->safemode_delay); 8719 } 8720 8721 EXPORT_SYMBOL(md_write_end); 8722 8723 /* This is used by raid0 and raid10 */ 8724 void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, 8725 struct bio *bio, sector_t start, sector_t size) 8726 { 8727 struct bio *discard_bio = NULL; 8728 8729 if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO, 8730 &discard_bio) || !discard_bio) 8731 return; 8732 8733 bio_chain(discard_bio, bio); 8734 bio_clone_blkg_association(discard_bio, bio); 8735 mddev_trace_remap(mddev, discard_bio, bio->bi_iter.bi_sector); 8736 submit_bio_noacct(discard_bio); 8737 } 8738 EXPORT_SYMBOL_GPL(md_submit_discard_bio); 8739 8740 static void md_bitmap_start(struct mddev *mddev, 8741 struct md_io_clone *md_io_clone) 8742 { 8743 if (mddev->pers->bitmap_sector) 8744 mddev->pers->bitmap_sector(mddev, &md_io_clone->offset, 8745 &md_io_clone->sectors); 8746 8747 mddev->bitmap_ops->startwrite(mddev, md_io_clone->offset, 8748 md_io_clone->sectors); 8749 } 8750 8751 static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone) 8752 { 8753 mddev->bitmap_ops->endwrite(mddev, md_io_clone->offset, 8754 md_io_clone->sectors); 8755 } 8756 8757 static void md_end_clone_io(struct bio *bio) 8758 { 8759 struct md_io_clone *md_io_clone = bio->bi_private; 8760 struct bio *orig_bio = md_io_clone->orig_bio; 8761 struct mddev *mddev = md_io_clone->mddev; 8762 8763 if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap) 8764 md_bitmap_end(mddev, md_io_clone); 8765 8766 if (bio->bi_status && !orig_bio->bi_status) 8767 orig_bio->bi_status = bio->bi_status; 8768 8769 if (md_io_clone->start_time) 8770 bio_end_io_acct(orig_bio, md_io_clone->start_time); 8771 8772 bio_put(bio); 8773 bio_endio(orig_bio); 8774 percpu_ref_put(&mddev->active_io); 8775 } 8776 8777 static void md_clone_bio(struct mddev *mddev, struct bio **bio) 8778 { 8779 struct block_device *bdev = (*bio)->bi_bdev; 8780 struct md_io_clone *md_io_clone; 8781 struct bio *clone = 8782 bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set); 8783 8784 md_io_clone = container_of(clone, struct md_io_clone, bio_clone); 8785 md_io_clone->orig_bio = *bio; 8786 md_io_clone->mddev = mddev; 8787 if (blk_queue_io_stat(bdev->bd_disk->queue)) 8788 md_io_clone->start_time = bio_start_io_acct(*bio); 8789 8790 if (bio_data_dir(*bio) == WRITE && mddev->bitmap) { 8791 md_io_clone->offset = (*bio)->bi_iter.bi_sector; 8792 md_io_clone->sectors = bio_sectors(*bio); 8793 md_bitmap_start(mddev, md_io_clone); 8794 } 8795 8796 clone->bi_end_io = md_end_clone_io; 8797 clone->bi_private = md_io_clone; 8798 *bio = clone; 8799 } 8800 8801 void md_account_bio(struct mddev *mddev, struct bio **bio) 8802 { 8803 percpu_ref_get(&mddev->active_io); 8804 md_clone_bio(mddev, 
bio); 8805 } 8806 EXPORT_SYMBOL_GPL(md_account_bio); 8807 8808 void md_free_cloned_bio(struct bio *bio) 8809 { 8810 struct md_io_clone *md_io_clone = bio->bi_private; 8811 struct bio *orig_bio = md_io_clone->orig_bio; 8812 struct mddev *mddev = md_io_clone->mddev; 8813 8814 if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap) 8815 md_bitmap_end(mddev, md_io_clone); 8816 8817 if (bio->bi_status && !orig_bio->bi_status) 8818 orig_bio->bi_status = bio->bi_status; 8819 8820 if (md_io_clone->start_time) 8821 bio_end_io_acct(orig_bio, md_io_clone->start_time); 8822 8823 bio_put(bio); 8824 percpu_ref_put(&mddev->active_io); 8825 } 8826 EXPORT_SYMBOL_GPL(md_free_cloned_bio); 8827 8828 /* md_allow_write(mddev) 8829 * Calling this ensures that the array is marked 'active' so that writes 8830 * may proceed without blocking. It is important to call this before 8831 * attempting a GFP_KERNEL allocation while holding the mddev lock. 8832 * Must be called with mddev_lock held. 8833 */ 8834 void md_allow_write(struct mddev *mddev) 8835 { 8836 if (!mddev->pers) 8837 return; 8838 if (!md_is_rdwr(mddev)) 8839 return; 8840 if (!mddev->pers->sync_request) 8841 return; 8842 8843 spin_lock(&mddev->lock); 8844 if (mddev->in_sync) { 8845 mddev->in_sync = 0; 8846 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 8847 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 8848 if (mddev->safemode_delay && 8849 mddev->safemode == 0) 8850 mddev->safemode = 1; 8851 spin_unlock(&mddev->lock); 8852 md_update_sb(mddev, 0); 8853 sysfs_notify_dirent_safe(mddev->sysfs_state); 8854 /* wait for the dirty state to be recorded in the metadata */ 8855 wait_event(mddev->sb_wait, 8856 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 8857 } else 8858 spin_unlock(&mddev->lock); 8859 } 8860 EXPORT_SYMBOL_GPL(md_allow_write); 8861 8862 static sector_t md_sync_max_sectors(struct mddev *mddev, 8863 enum sync_action action) 8864 { 8865 switch (action) { 8866 case ACTION_RESYNC: 8867 case ACTION_CHECK: 8868 case ACTION_REPAIR: 8869 atomic64_set(&mddev->resync_mismatches, 0); 8870 fallthrough; 8871 case ACTION_RESHAPE: 8872 return mddev->resync_max_sectors; 8873 case ACTION_RECOVER: 8874 return mddev->dev_sectors; 8875 default: 8876 return 0; 8877 } 8878 } 8879 8880 static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) 8881 { 8882 sector_t start = 0; 8883 struct md_rdev *rdev; 8884 8885 switch (action) { 8886 case ACTION_CHECK: 8887 case ACTION_REPAIR: 8888 return mddev->resync_min; 8889 case ACTION_RESYNC: 8890 if (!mddev->bitmap) 8891 return mddev->recovery_cp; 8892 return 0; 8893 case ACTION_RESHAPE: 8894 /* 8895 * If the original node aborts reshaping then we continue the 8896 * reshaping, so set again to avoid restart reshape from the 8897 * first beginning 8898 */ 8899 if (mddev_is_clustered(mddev) && 8900 mddev->reshape_position != MaxSector) 8901 return mddev->reshape_position; 8902 return 0; 8903 case ACTION_RECOVER: 8904 start = MaxSector; 8905 rcu_read_lock(); 8906 rdev_for_each_rcu(rdev, mddev) 8907 if (rdev->raid_disk >= 0 && 8908 !test_bit(Journal, &rdev->flags) && 8909 !test_bit(Faulty, &rdev->flags) && 8910 !test_bit(In_sync, &rdev->flags) && 8911 rdev->recovery_offset < start) 8912 start = rdev->recovery_offset; 8913 rcu_read_unlock(); 8914 8915 /* If there is a bitmap, we need to make sure all 8916 * writes that started before we added a spare 8917 * complete before we start doing a recovery. 
8918 * Otherwise the write might complete and (via 8919 * bitmap_endwrite) set a bit in the bitmap after the 8920 * recovery has checked that bit and skipped that 8921 * region. 8922 */ 8923 if (mddev->bitmap) { 8924 mddev->pers->quiesce(mddev, 1); 8925 mddev->pers->quiesce(mddev, 0); 8926 } 8927 return start; 8928 default: 8929 return MaxSector; 8930 } 8931 } 8932 8933 #define SYNC_MARKS 10 8934 #define SYNC_MARK_STEP (3*HZ) 8935 #define UPDATE_FREQUENCY (5*60*HZ) 8936 void md_do_sync(struct md_thread *thread) 8937 { 8938 struct mddev *mddev = thread->mddev; 8939 struct mddev *mddev2; 8940 unsigned int currspeed = 0, window; 8941 sector_t max_sectors,j, io_sectors, recovery_done; 8942 unsigned long mark[SYNC_MARKS]; 8943 unsigned long update_time; 8944 sector_t mark_cnt[SYNC_MARKS]; 8945 int last_mark,m; 8946 sector_t last_check; 8947 int skipped = 0; 8948 struct md_rdev *rdev; 8949 enum sync_action action; 8950 const char *desc; 8951 struct blk_plug plug; 8952 int ret; 8953 8954 /* just in case thread restarts... */ 8955 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 8956 return; 8957 8958 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 8959 goto skip; 8960 8961 if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) || 8962 !md_is_rdwr(mddev)) {/* never try to sync a read-only array */ 8963 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8964 goto skip; 8965 } 8966 8967 if (mddev_is_clustered(mddev)) { 8968 ret = md_cluster_ops->resync_start(mddev); 8969 if (ret) 8970 goto skip; 8971 8972 set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags); 8973 if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 8974 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || 8975 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) 8976 && ((unsigned long long)mddev->curr_resync_completed 8977 < (unsigned long long)mddev->resync_max_sectors)) 8978 goto skip; 8979 } 8980 8981 action = md_sync_action(mddev); 8982 desc = md_sync_action_name(action); 8983 mddev->last_sync_action = action; 8984 8985 /* 8986 * Before starting a resync we must have set curr_resync to 8987 * 2, and then checked that every "conflicting" array has curr_resync 8988 * less than ours. When we find one that is the same or higher 8989 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 8990 * to 1 if we choose to yield (based arbitrarily on the address of the mddev structure). 8991 * This will mean we have to start checking from the beginning again.
8992 * 8993 */ 8994 if (mddev_is_clustered(mddev)) 8995 md_cluster_ops->resync_start_notify(mddev); 8996 do { 8997 int mddev2_minor = -1; 8998 mddev->curr_resync = MD_RESYNC_DELAYED; 8999 9000 try_again: 9001 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9002 goto skip; 9003 spin_lock(&all_mddevs_lock); 9004 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) { 9005 if (test_bit(MD_DELETED, &mddev2->flags)) 9006 continue; 9007 if (mddev2 == mddev) 9008 continue; 9009 if (!mddev->parallel_resync 9010 && mddev2->curr_resync 9011 && match_mddev_units(mddev, mddev2)) { 9012 DEFINE_WAIT(wq); 9013 if (mddev < mddev2 && 9014 mddev->curr_resync == MD_RESYNC_DELAYED) { 9015 /* arbitrarily yield */ 9016 mddev->curr_resync = MD_RESYNC_YIELDED; 9017 wake_up(&resync_wait); 9018 } 9019 if (mddev > mddev2 && 9020 mddev->curr_resync == MD_RESYNC_YIELDED) 9021 /* no need to wait here, we can wait the next 9022 * time 'round when curr_resync == 2 9023 */ 9024 continue; 9025 /* We need to wait 'interruptible' so as not to 9026 * contribute to the load average, and not to 9027 * be caught by 'softlockup' 9028 */ 9029 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 9030 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9031 mddev2->curr_resync >= mddev->curr_resync) { 9032 if (mddev2_minor != mddev2->md_minor) { 9033 mddev2_minor = mddev2->md_minor; 9034 pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n", 9035 desc, mdname(mddev), 9036 mdname(mddev2)); 9037 } 9038 spin_unlock(&all_mddevs_lock); 9039 9040 if (signal_pending(current)) 9041 flush_signals(current); 9042 schedule(); 9043 finish_wait(&resync_wait, &wq); 9044 goto try_again; 9045 } 9046 finish_wait(&resync_wait, &wq); 9047 } 9048 } 9049 spin_unlock(&all_mddevs_lock); 9050 } while (mddev->curr_resync < MD_RESYNC_DELAYED); 9051 9052 max_sectors = md_sync_max_sectors(mddev, action); 9053 j = md_sync_position(mddev, action); 9054 9055 pr_info("md: %s of RAID array %s\n", desc, mdname(mddev)); 9056 pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev)); 9057 pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n", 9058 speed_max(mddev), desc); 9059 9060 is_mddev_idle(mddev, 1); /* this initializes IO event counters */ 9061 9062 io_sectors = 0; 9063 for (m = 0; m < SYNC_MARKS; m++) { 9064 mark[m] = jiffies; 9065 mark_cnt[m] = io_sectors; 9066 } 9067 last_mark = 0; 9068 mddev->resync_mark = mark[last_mark]; 9069 mddev->resync_mark_cnt = mark_cnt[last_mark]; 9070 9071 /* 9072 * Tune reconstruction: 9073 */ 9074 window = 32 * (PAGE_SIZE / 512); 9075 pr_debug("md: using %dk window, over a total of %lluk.\n", 9076 window/2, (unsigned long long)max_sectors/2); 9077 9078 atomic_set(&mddev->recovery_active, 0); 9079 last_check = 0; 9080 9081 if (j >= MD_RESYNC_ACTIVE) { 9082 pr_debug("md: resuming %s of %s from checkpoint.\n", 9083 desc, mdname(mddev)); 9084 mddev->curr_resync = j; 9085 } else 9086 mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */ 9087 mddev->curr_resync_completed = j; 9088 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9089 md_new_event(); 9090 update_time = jiffies; 9091 9092 blk_start_plug(&plug); 9093 while (j < max_sectors) { 9094 sector_t sectors; 9095 9096 skipped = 0; 9097 9098 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9099 ((mddev->curr_resync > mddev->curr_resync_completed && 9100 (mddev->curr_resync - mddev->curr_resync_completed) 9101 > (max_sectors >> 4)) || 9102 
time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || 9103 (j - mddev->curr_resync_completed)*2 9104 >= mddev->resync_max - mddev->curr_resync_completed || 9105 mddev->curr_resync_completed > mddev->resync_max 9106 )) { 9107 /* time to update curr_resync_completed */ 9108 wait_event(mddev->recovery_wait, 9109 atomic_read(&mddev->recovery_active) == 0); 9110 mddev->curr_resync_completed = j; 9111 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 9112 j > mddev->recovery_cp) 9113 mddev->recovery_cp = j; 9114 update_time = jiffies; 9115 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 9116 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9117 } 9118 9119 while (j >= mddev->resync_max && 9120 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9121 /* As this condition is controlled by user-space, 9122 * we can block indefinitely, so use '_interruptible' 9123 * to avoid triggering warnings. 9124 */ 9125 flush_signals(current); /* just in case */ 9126 wait_event_interruptible(mddev->recovery_wait, 9127 mddev->resync_max > j 9128 || test_bit(MD_RECOVERY_INTR, 9129 &mddev->recovery)); 9130 } 9131 9132 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9133 break; 9134 9135 sectors = mddev->pers->sync_request(mddev, j, max_sectors, 9136 &skipped); 9137 if (sectors == 0) { 9138 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9139 break; 9140 } 9141 9142 if (!skipped) { /* actual IO requested */ 9143 io_sectors += sectors; 9144 atomic_add(sectors, &mddev->recovery_active); 9145 } 9146 9147 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9148 break; 9149 9150 j += sectors; 9151 if (j > max_sectors) 9152 /* when skipping, extra large numbers can be returned. */ 9153 j = max_sectors; 9154 if (j >= MD_RESYNC_ACTIVE) 9155 mddev->curr_resync = j; 9156 mddev->curr_mark_cnt = io_sectors; 9157 if (last_check == 0) 9158 /* this is the earliest that rebuild will be 9159 * visible in /proc/mdstat 9160 */ 9161 md_new_event(); 9162 9163 if (last_check + window > io_sectors || j == max_sectors) 9164 continue; 9165 9166 last_check = io_sectors; 9167 repeat: 9168 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 9169 /* step marks */ 9170 int next = (last_mark+1) % SYNC_MARKS; 9171 9172 mddev->resync_mark = mark[next]; 9173 mddev->resync_mark_cnt = mark_cnt[next]; 9174 mark[next] = jiffies; 9175 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 9176 last_mark = next; 9177 } 9178 9179 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9180 break; 9181 9182 /* 9183 * this loop exits only if either when we are slower than 9184 * the 'hard' speed limit, or the system was IO-idle for 9185 * a jiffy. 9186 * the system might be non-idle CPU-wise, but we only care 9187 * about not overloading the IO subsystem. (things like an 9188 * e2fsck being done on the RAID array should execute fast) 9189 */ 9190 cond_resched(); 9191 9192 recovery_done = io_sectors - atomic_read(&mddev->recovery_active); 9193 currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2 9194 /((jiffies-mddev->resync_mark)/HZ +1) +1; 9195 9196 if (currspeed > speed_min(mddev)) { 9197 if (currspeed > speed_max(mddev)) { 9198 msleep(500); 9199 goto repeat; 9200 } 9201 if (!is_mddev_idle(mddev, 0)) { 9202 /* 9203 * Give other IO more of a chance. 9204 * The faster the devices, the less we wait. 9205 */ 9206 wait_event(mddev->recovery_wait, 9207 !atomic_read(&mddev->recovery_active)); 9208 } 9209 } 9210 } 9211 pr_info("md: %s: %s %s.\n",mdname(mddev), desc, 9212 test_bit(MD_RECOVERY_INTR, &mddev->recovery) 9213 ? 
"interrupted" : "done"); 9214 /* 9215 * this also signals 'finished resyncing' to md_stop 9216 */ 9217 blk_finish_plug(&plug); 9218 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 9219 9220 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9221 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9222 mddev->curr_resync >= MD_RESYNC_ACTIVE) { 9223 mddev->curr_resync_completed = mddev->curr_resync; 9224 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9225 } 9226 mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped); 9227 9228 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 9229 mddev->curr_resync > MD_RESYNC_ACTIVE) { 9230 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 9231 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9232 if (mddev->curr_resync >= mddev->recovery_cp) { 9233 pr_debug("md: checkpointing %s of %s.\n", 9234 desc, mdname(mddev)); 9235 if (test_bit(MD_RECOVERY_ERROR, 9236 &mddev->recovery)) 9237 mddev->recovery_cp = 9238 mddev->curr_resync_completed; 9239 else 9240 mddev->recovery_cp = 9241 mddev->curr_resync; 9242 } 9243 } else 9244 mddev->recovery_cp = MaxSector; 9245 } else { 9246 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9247 mddev->curr_resync = MaxSector; 9248 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9249 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { 9250 rcu_read_lock(); 9251 rdev_for_each_rcu(rdev, mddev) 9252 if (rdev->raid_disk >= 0 && 9253 mddev->delta_disks >= 0 && 9254 !test_bit(Journal, &rdev->flags) && 9255 !test_bit(Faulty, &rdev->flags) && 9256 !test_bit(In_sync, &rdev->flags) && 9257 rdev->recovery_offset < mddev->curr_resync) 9258 rdev->recovery_offset = mddev->curr_resync; 9259 rcu_read_unlock(); 9260 } 9261 } 9262 } 9263 skip: 9264 /* set CHANGE_PENDING here since maybe another update is needed, 9265 * so other nodes are informed. It should be harmless for normal 9266 * raid */ 9267 set_mask_bits(&mddev->sb_flags, 0, 9268 BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); 9269 9270 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9271 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9272 mddev->delta_disks > 0 && 9273 mddev->pers->finish_reshape && 9274 mddev->pers->size && 9275 !mddev_is_dm(mddev)) { 9276 mddev_lock_nointr(mddev); 9277 md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); 9278 mddev_unlock(mddev); 9279 if (!mddev_is_clustered(mddev)) 9280 set_capacity_and_notify(mddev->gendisk, 9281 mddev->array_sectors); 9282 } 9283 9284 spin_lock(&mddev->lock); 9285 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9286 /* We completed so min/max setting can be forgotten if used. */ 9287 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 9288 mddev->resync_min = 0; 9289 mddev->resync_max = MaxSector; 9290 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 9291 mddev->resync_min = mddev->curr_resync_completed; 9292 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 9293 mddev->curr_resync = MD_RESYNC_NONE; 9294 spin_unlock(&mddev->lock); 9295 9296 wake_up(&resync_wait); 9297 md_wakeup_thread(mddev->thread); 9298 return; 9299 } 9300 EXPORT_SYMBOL_GPL(md_do_sync); 9301 9302 static bool rdev_removeable(struct md_rdev *rdev) 9303 { 9304 /* rdev is not used. */ 9305 if (rdev->raid_disk < 0) 9306 return false; 9307 9308 /* There are still inflight io, don't remove this rdev. 
*/ 9309 if (atomic_read(&rdev->nr_pending)) 9310 return false; 9311 9312 /* 9313 * An error occurred but has not yet been acknowledged by the metadata 9314 * handler, don't remove this rdev. 9315 */ 9316 if (test_bit(Blocked, &rdev->flags)) 9317 return false; 9318 9319 /* Faulty rdev is not used, it's safe to remove it. */ 9320 if (test_bit(Faulty, &rdev->flags)) 9321 return true; 9322 9323 /* Journal disk can only be removed if it's faulty. */ 9324 if (test_bit(Journal, &rdev->flags)) 9325 return false; 9326 9327 /* 9328 * 'In_sync' is cleared while 'raid_disk' is valid, which means the 9329 * replacement has just become active from pers->spare_active(), and 9330 * then pers->hot_remove_disk() will replace this rdev with the replacement. 9331 */ 9332 if (!test_bit(In_sync, &rdev->flags)) 9333 return true; 9334 9335 return false; 9336 } 9337 9338 static bool rdev_is_spare(struct md_rdev *rdev) 9339 { 9340 return !test_bit(Candidate, &rdev->flags) && rdev->raid_disk >= 0 && 9341 !test_bit(In_sync, &rdev->flags) && 9342 !test_bit(Journal, &rdev->flags) && 9343 !test_bit(Faulty, &rdev->flags); 9344 } 9345 9346 static bool rdev_addable(struct md_rdev *rdev) 9347 { 9348 /* rdev is already used, don't add it again. */ 9349 if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 || 9350 test_bit(Faulty, &rdev->flags)) 9351 return false; 9352 9353 /* Allow adding a journal disk. */ 9354 if (test_bit(Journal, &rdev->flags)) 9355 return true; 9356 9357 /* Allow adding if the array is read-write. */ 9358 if (md_is_rdwr(rdev->mddev)) 9359 return true; 9360 9361 /* 9362 * For a read-only array, only allow re-adding an rdev. And if a bitmap is 9363 * used, don't allow re-adding an rdev that is too old. 9364 */ 9365 if (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags)) 9366 return true; 9367 9368 return false; 9369 } 9370 9371 static bool md_spares_need_change(struct mddev *mddev) 9372 { 9373 struct md_rdev *rdev; 9374 9375 rcu_read_lock(); 9376 rdev_for_each_rcu(rdev, mddev) { 9377 if (rdev_removeable(rdev) || rdev_addable(rdev)) { 9378 rcu_read_unlock(); 9379 return true; 9380 } 9381 } 9382 rcu_read_unlock(); 9383 return false; 9384 } 9385 9386 static int remove_and_add_spares(struct mddev *mddev, 9387 struct md_rdev *this) 9388 { 9389 struct md_rdev *rdev; 9390 int spares = 0; 9391 int removed = 0; 9392 9393 if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 9394 /* Mustn't remove devices when resync thread is running */ 9395 return 0; 9396 9397 rdev_for_each(rdev, mddev) { 9398 if ((this == NULL || rdev == this) && rdev_removeable(rdev) && 9399 !mddev->pers->hot_remove_disk(mddev, rdev)) { 9400 sysfs_unlink_rdev(mddev, rdev); 9401 rdev->saved_raid_disk = rdev->raid_disk; 9402 rdev->raid_disk = -1; 9403 removed++; 9404 } 9405 } 9406 9407 if (removed && mddev->kobj.sd) 9408 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 9409 9410 if (this && removed) 9411 goto no_add; 9412 9413 rdev_for_each(rdev, mddev) { 9414 if (this && this != rdev) 9415 continue; 9416 if (rdev_is_spare(rdev)) 9417 spares++; 9418 if (!rdev_addable(rdev)) 9419 continue; 9420 if (!test_bit(Journal, &rdev->flags)) 9421 rdev->recovery_offset = 0; 9422 if (mddev->pers->hot_add_disk(mddev, rdev) == 0) { 9423 /* failure here is OK */ 9424 sysfs_link_rdev(mddev, rdev); 9425 if (!test_bit(Journal, &rdev->flags)) 9426 spares++; 9427 md_new_event(); 9428 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9429 } 9430 } 9431 no_add: 9432 if (removed) 9433 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9434 return spares; 9435 } 9436
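/*
 * Usage sketch (illustrative only; md_choose_sync_action() and
 * md_start_sync() below are the real call sites): with reconfig_mutex
 * held and no sync thread running, spare handling boils down to
 *
 *	spares = remove_and_add_spares(mddev, NULL);
 *	if (spares)
 *		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 *
 * i.e. vacate whatever rdev_removeable() approves, add whatever
 * rdev_addable() approves, and start a recovery if that produced spares.
 */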
9437 static bool md_choose_sync_action(struct mddev *mddev, int *spares) 9438 { 9439 /* Check if reshape is in progress first. */ 9440 if (mddev->reshape_position != MaxSector) { 9441 if (mddev->pers->check_reshape == NULL || 9442 mddev->pers->check_reshape(mddev) != 0) 9443 return false; 9444 9445 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 9446 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9447 return true; 9448 } 9449 9450 /* 9451 * Remove any failed drives, then add spares if possible. Spares are 9452 * also removed and re-added, to allow the personality to fail the 9453 * re-add. 9454 */ 9455 *spares = remove_and_add_spares(mddev, NULL); 9456 if (*spares) { 9457 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9458 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 9459 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 9460 9461 /* Start new recovery. */ 9462 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9463 return true; 9464 } 9465 9466 /* Check if recovery is in progress. */ 9467 if (mddev->recovery_cp < MaxSector) { 9468 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9469 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9470 return true; 9471 } 9472 9473 /* Delay to choose resync/check/repair in md_do_sync(). */ 9474 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 9475 return true; 9476 9477 /* Nothing to be done */ 9478 return false; 9479 } 9480 9481 static void md_start_sync(struct work_struct *ws) 9482 { 9483 struct mddev *mddev = container_of(ws, struct mddev, sync_work); 9484 int spares = 0; 9485 bool suspend = false; 9486 char *name; 9487 9488 /* 9489 * If reshape is still in progress, spares won't be added or removed 9490 * from conf until reshape is done. 9491 */ 9492 if (mddev->reshape_position == MaxSector && 9493 md_spares_need_change(mddev)) { 9494 suspend = true; 9495 mddev_suspend(mddev, false); 9496 } 9497 9498 mddev_lock_nointr(mddev); 9499 if (!md_is_rdwr(mddev)) { 9500 /* 9501 * On a read-only array we can: 9502 * - remove failed devices 9503 * - add already-in_sync devices if the array itself is in-sync. 9504 * As we only add devices that are already in-sync, we can 9505 * activate the spares immediately. 9506 */ 9507 remove_and_add_spares(mddev, NULL); 9508 goto not_running; 9509 } 9510 9511 if (!md_choose_sync_action(mddev, &spares)) 9512 goto not_running; 9513 9514 if (!mddev->pers->sync_request) 9515 goto not_running; 9516 9517 /* 9518 * We are adding a device or devices to an array which has the bitmap 9519 * stored on all devices. So make sure all bitmap pages get written. 9520 */ 9521 if (spares) 9522 mddev->bitmap_ops->write_all(mddev); 9523 9524 name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ? 9525 "reshape" : "resync"; 9526 rcu_assign_pointer(mddev->sync_thread, 9527 md_register_thread(md_do_sync, mddev, name)); 9528 if (!mddev->sync_thread) { 9529 pr_warn("%s: could not start resync thread...\n", 9530 mdname(mddev)); 9531 /* leave the spares where they are, it shouldn't hurt */ 9532 goto not_running; 9533 } 9534 9535 mddev_unlock(mddev); 9536 /* 9537 * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should 9538 * not set it again. Otherwise, we may cause issue like this one: 9539 * https://bugzilla.kernel.org/show_bug.cgi?id=218200 9540 * Therefore, use __mddev_resume(mddev, false). 
9541 */ 9542 if (suspend) 9543 __mddev_resume(mddev, false); 9544 md_wakeup_thread(mddev->sync_thread); 9545 sysfs_notify_dirent_safe(mddev->sysfs_action); 9546 md_new_event(); 9547 return; 9548 9549 not_running: 9550 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9551 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 9552 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 9553 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 9554 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9555 mddev_unlock(mddev); 9556 /* 9557 * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should 9558 * not set it again. Otherwise, we may cause an issue like this one: 9559 * https://bugzilla.kernel.org/show_bug.cgi?id=218200 9560 * Therefore, use __mddev_resume(mddev, false). 9561 */ 9562 if (suspend) 9563 __mddev_resume(mddev, false); 9564 9565 wake_up(&resync_wait); 9566 if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && 9567 mddev->sysfs_action) 9568 sysfs_notify_dirent_safe(mddev->sysfs_action); 9569 } 9570 9571 static void unregister_sync_thread(struct mddev *mddev) 9572 { 9573 if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 9574 /* resync/recovery still happening */ 9575 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9576 return; 9577 } 9578 9579 if (WARN_ON_ONCE(!mddev->sync_thread)) 9580 return; 9581 9582 md_reap_sync_thread(mddev); 9583 } 9584 9585 /* 9586 * This routine is regularly called by all per-raid-array threads to 9587 * deal with generic issues like resync and super-block update. 9588 * Raid personalities that don't have a thread (linear/raid0) do not 9589 * need this as they never do any recovery or update the superblock. 9590 * 9591 * It does not do any resync itself, but rather "forks" off other threads 9592 * to do that as needed. 9593 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in 9594 * "->recovery" and create a thread at ->sync_thread. 9595 * When the thread finishes it sets MD_RECOVERY_DONE 9596 * and wakes up this thread, which will reap the thread and finish up. 9597 * This thread also removes any faulty devices (with nr_pending == 0). 9598 * 9599 * The overall approach is: 9600 * 1/ If the superblock needs updating, update it. 9601 * 2/ If a recovery thread is running, don't do anything else. 9602 * 3/ If recovery has finished, clean up, possibly marking spares active. 9603 * 4/ If there are any faulty devices, remove them. 9604 * 5/ If the array is degraded, try to add spare devices. 9605 * 6/ If the array has spares or is not in-sync, start a resync thread. 9606 */ 9607 void md_check_recovery(struct mddev *mddev) 9608 { 9609 if (mddev->bitmap) 9610 mddev->bitmap_ops->daemon_work(mddev); 9611 9612 if (signal_pending(current)) { 9613 if (mddev->pers->sync_request && !mddev->external) { 9614 pr_debug("md: %s in immediate safe mode\n", 9615 mdname(mddev)); 9616 mddev->safemode = 2; 9617 } 9618 flush_signals(current); 9619 } 9620 9621 if (!md_is_rdwr(mddev) && 9622 !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) && 9623 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 9624 return; 9625 if ( !
( 9626 (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) || 9627 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 9628 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 9629 (mddev->external == 0 && mddev->safemode == 1) || 9630 (mddev->safemode == 2 9631 && !mddev->in_sync && mddev->recovery_cp == MaxSector) 9632 )) 9633 return; 9634 9635 if (mddev_trylock(mddev)) { 9636 bool try_set_sync = mddev->safemode != 0; 9637 9638 if (!mddev->external && mddev->safemode == 1) 9639 mddev->safemode = 0; 9640 9641 if (!md_is_rdwr(mddev)) { 9642 struct md_rdev *rdev; 9643 9644 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 9645 unregister_sync_thread(mddev); 9646 goto unlock; 9647 } 9648 9649 if (!mddev->external && mddev->in_sync) 9650 /* 9651 * 'Blocked' flag not needed as failed devices 9652 * will be recorded if array switched to read/write. 9653 * Leaving it set will prevent the device 9654 * from being removed. 9655 */ 9656 rdev_for_each(rdev, mddev) 9657 clear_bit(Blocked, &rdev->flags); 9658 9659 /* 9660 * There is no thread, but we need to call 9661 * ->spare_active and clear saved_raid_disk 9662 */ 9663 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9664 md_reap_sync_thread(mddev); 9665 9666 /* 9667 * Let md_start_sync() to remove and add rdevs to the 9668 * array. 9669 */ 9670 if (md_spares_need_change(mddev)) { 9671 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9672 queue_work(md_misc_wq, &mddev->sync_work); 9673 } 9674 9675 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9676 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9677 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 9678 9679 goto unlock; 9680 } 9681 9682 if (mddev_is_clustered(mddev)) { 9683 struct md_rdev *rdev, *tmp; 9684 /* kick the device if another node issued a 9685 * remove disk. 9686 */ 9687 rdev_for_each_safe(rdev, tmp, mddev) { 9688 if (test_and_clear_bit(ClusterRemove, &rdev->flags) && 9689 rdev->raid_disk < 0) 9690 md_kick_rdev_from_array(rdev); 9691 } 9692 } 9693 9694 if (try_set_sync && !mddev->external && !mddev->in_sync) { 9695 spin_lock(&mddev->lock); 9696 set_in_sync(mddev); 9697 spin_unlock(&mddev->lock); 9698 } 9699 9700 if (mddev->sb_flags) 9701 md_update_sb(mddev, 0); 9702 9703 /* 9704 * Never start a new sync thread if MD_RECOVERY_RUNNING is 9705 * still set. 9706 */ 9707 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 9708 unregister_sync_thread(mddev); 9709 goto unlock; 9710 } 9711 9712 /* Set RUNNING before clearing NEEDED to avoid 9713 * any transients in the value of "sync_action". 
9714 */ 9715 mddev->curr_resync_completed = 0; 9716 spin_lock(&mddev->lock); 9717 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9718 spin_unlock(&mddev->lock); 9719 /* Clear some bits that don't mean anything, but 9720 * might be left set 9721 */ 9722 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 9723 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 9724 9725 if (test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) && 9726 !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 9727 queue_work(md_misc_wq, &mddev->sync_work); 9728 } else { 9729 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9730 wake_up(&resync_wait); 9731 } 9732 9733 unlock: 9734 wake_up(&mddev->sb_wait); 9735 mddev_unlock(mddev); 9736 } 9737 } 9738 EXPORT_SYMBOL(md_check_recovery); 9739 9740 void md_reap_sync_thread(struct mddev *mddev) 9741 { 9742 struct md_rdev *rdev; 9743 sector_t old_dev_sectors = mddev->dev_sectors; 9744 bool is_reshaped = false; 9745 9746 /* resync has finished, collect result */ 9747 md_unregister_thread(mddev, &mddev->sync_thread); 9748 atomic_inc(&mddev->sync_seq); 9749 9750 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9751 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 9752 mddev->degraded != mddev->raid_disks) { 9753 /* success...*/ 9754 /* activate any spares */ 9755 if (mddev->pers->spare_active(mddev)) { 9756 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 9757 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9758 } 9759 } 9760 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9761 mddev->pers->finish_reshape) { 9762 mddev->pers->finish_reshape(mddev); 9763 if (mddev_is_clustered(mddev)) 9764 is_reshaped = true; 9765 } 9766 9767 /* If array is no-longer degraded, then any saved_raid_disk 9768 * information must be scrapped. 9769 */ 9770 if (!mddev->degraded) 9771 rdev_for_each(rdev, mddev) 9772 rdev->saved_raid_disk = -1; 9773 9774 md_update_sb(mddev, 1); 9775 /* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can 9776 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by 9777 * clustered raid */ 9778 if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags)) 9779 md_cluster_ops->resync_finish(mddev); 9780 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9781 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 9782 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9783 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 9784 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 9785 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 9786 /* 9787 * We call md_cluster_ops->update_size here because sync_size could 9788 * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, 9789 * so it is time to update size across cluster. 
9790 */ 9791 if (mddev_is_clustered(mddev) && is_reshaped 9792 && !test_bit(MD_CLOSING, &mddev->flags)) 9793 md_cluster_ops->update_size(mddev, old_dev_sectors); 9794 /* flag recovery needed just to double check */ 9795 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9796 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9797 sysfs_notify_dirent_safe(mddev->sysfs_action); 9798 md_new_event(); 9799 if (mddev->event_work.func) 9800 queue_work(md_misc_wq, &mddev->event_work); 9801 wake_up(&resync_wait); 9802 } 9803 EXPORT_SYMBOL(md_reap_sync_thread); 9804 9805 void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) 9806 { 9807 sysfs_notify_dirent_safe(rdev->sysfs_state); 9808 wait_event_timeout(rdev->blocked_wait, !rdev_blocked(rdev), 9809 msecs_to_jiffies(5000)); 9810 rdev_dec_pending(rdev, mddev); 9811 } 9812 EXPORT_SYMBOL(md_wait_for_blocked_rdev); 9813 9814 void md_finish_reshape(struct mddev *mddev) 9815 { 9816 /* called by the personality module when a reshape completes. */ 9817 struct md_rdev *rdev; 9818 9819 rdev_for_each(rdev, mddev) { 9820 if (rdev->data_offset > rdev->new_data_offset) 9821 rdev->sectors += rdev->data_offset - rdev->new_data_offset; 9822 else 9823 rdev->sectors -= rdev->new_data_offset - rdev->data_offset; 9824 rdev->data_offset = rdev->new_data_offset; 9825 } 9826 } 9827 EXPORT_SYMBOL(md_finish_reshape); 9828 9829 /* Bad block management */ 9830 9831 /* Returns true on success, false on failure */ 9832 bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 9833 int is_new) 9834 { 9835 struct mddev *mddev = rdev->mddev; 9836 9837 /* 9838 * Recording new badblocks for a faulty rdev will force an unnecessary 9839 * superblock update. This is fragile for external management because a 9840 * userspace daemon may be trying to remove this device and a deadlock may 9841 * occur. This will probably be solved in mdadm, but it is safer to 9842 * avoid it.

/* Bad block management */

/* Returns true on success, false on failure */
bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
			int is_new)
{
	struct mddev *mddev = rdev->mddev;

	/*
	 * Recording new badblocks for a faulty rdev would force an
	 * unnecessary superblock update. This is fragile for externally
	 * managed arrays because a userspace daemon may be trying to remove
	 * the device at the same time and a deadlock could occur. This will
	 * probably be solved in mdadm, but it is safer to avoid it here.
	 */
	if (test_bit(Faulty, &rdev->flags))
		return true;

	if (is_new)
		s += rdev->new_data_offset;
	else
		s += rdev->data_offset;

	if (!badblocks_set(&rdev->badblocks, s, sectors, 0))
		return false;

	/* Make sure they get written out promptly */
	if (test_bit(ExternalBbl, &rdev->flags))
		sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks);
	sysfs_notify_dirent_safe(rdev->sysfs_state);
	set_mask_bits(&mddev->sb_flags, 0,
		      BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
	md_wakeup_thread(rdev->mddev->thread);
	return true;
}
EXPORT_SYMBOL_GPL(rdev_set_badblocks);

void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
			  int is_new)
{
	if (is_new)
		s += rdev->new_data_offset;
	else
		s += rdev->data_offset;

	if (!badblocks_clear(&rdev->badblocks, s, sectors))
		return;

	if (test_bit(ExternalBbl, &rdev->flags))
		sysfs_notify_dirent_safe(rdev->sysfs_badblocks);
}
EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
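
/*
 * Illustrative call sequence for the two helpers above from a personality's
 * write-error path (a sketch only; raid1/raid10/raid5 each have their own,
 * more involved retry logic). Here @rdev, @sector and @nr_sectors are assumed
 * to describe the failed range relative to the current data area
 * (is_new == 0):
 *
 *	if (!rdev_set_badblocks(rdev, sector, nr_sectors, 0))
 *		md_error(mddev, rdev);	// range could not be recorded
 *
 * and, once the range has been successfully rewritten:
 *
 *	rdev_clear_badblocks(rdev, sector, nr_sectors, 0);
 */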

static int md_notify_reboot(struct notifier_block *this,
			    unsigned long code, void *x)
{
	struct mddev *mddev, *n;
	int need_delay = 0;

	spin_lock(&all_mddevs_lock);
	list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
		if (!mddev_get(mddev))
			continue;
		spin_unlock(&all_mddevs_lock);
		if (mddev_trylock(mddev)) {
			if (mddev->pers)
				__md_stop_writes(mddev);
			if (mddev->persistent)
				mddev->safemode = 2;
			mddev_unlock(mddev);
		}
		need_delay = 1;
		mddev_put(mddev);
		spin_lock(&all_mddevs_lock);
	}
	spin_unlock(&all_mddevs_lock);

	/*
	 * Certain more exotic SCSI devices are known to be volatile with
	 * respect to overly early system reboots. While the right place to
	 * handle this is the individual low-level driver, we still want the
	 * RAID driver to be on the safe side, so give pending writes a
	 * moment to settle.
	 */
	if (need_delay)
		msleep(1000);

	return NOTIFY_DONE;
}

static struct notifier_block md_notifier = {
	.notifier_call	= md_notify_reboot,
	.next		= NULL,
	.priority	= INT_MAX, /* before any real devices */
};

static void md_geninit(void)
{
	pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));

	proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops);
}

static int __init md_init(void)
{
	int ret = -ENOMEM;

	md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
	if (!md_wq)
		goto err_wq;

	md_misc_wq = alloc_workqueue("md_misc", 0, 0);
	if (!md_misc_wq)
		goto err_misc_wq;

	md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND,
				       0);
	if (!md_bitmap_wq)
		goto err_bitmap_wq;

	ret = __register_blkdev(MD_MAJOR, "md", md_probe);
	if (ret < 0)
		goto err_md;

	ret = __register_blkdev(0, "mdp", md_probe);
	if (ret < 0)
		goto err_mdp;
	mdp_major = ret;

	register_reboot_notifier(&md_notifier);
	raid_table_header = register_sysctl("dev/raid", raid_table);

	md_geninit();
	return 0;

err_mdp:
	unregister_blkdev(MD_MAJOR, "md");
err_md:
	destroy_workqueue(md_bitmap_wq);
err_bitmap_wq:
	destroy_workqueue(md_misc_wq);
err_misc_wq:
	destroy_workqueue(md_wq);
err_wq:
	return ret;
}
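
/*
 * check_sb_changes() below is reached via md_reload_sb() when another node
 * of a clustered array has updated the on-disk metadata: the freshly
 * re-read superblock of @rdev is compared against local state and any size
 * change, device role change or reshape progress is applied locally.
 */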

static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	struct md_rdev *rdev2, *tmp;
	int role, ret;

	/*
	 * If the size was changed on another node then we need to
	 * do the resize here as well.
	 */
	if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
		ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
		if (ret)
			pr_info("md-cluster: resize failed\n");
		else
			mddev->bitmap_ops->update_sb(mddev->bitmap);
	}

	/* Check for change of roles in the active devices */
	rdev_for_each_safe(rdev2, tmp, mddev) {
		if (test_bit(Faulty, &rdev2->flags))
			continue;

		/* Check if the roles changed */
		role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);

		if (test_bit(Candidate, &rdev2->flags)) {
			if (role == MD_DISK_ROLE_FAULTY) {
				pr_info("md: Removing Candidate device %pg because add failed\n",
					rdev2->bdev);
				md_kick_rdev_from_array(rdev2);
				continue;
			} else
				clear_bit(Candidate, &rdev2->flags);
		}

		if (role != rdev2->raid_disk) {
			/*
			 * The device got activated on another node, unless a
			 * reshape is happening.
			 */
			if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE &&
			    !(le32_to_cpu(sb->feature_map) &
			      MD_FEATURE_RESHAPE_ACTIVE) &&
			    !md_cluster_ops->resync_status_get(mddev)) {
				/*
				 * -1 to make raid1_add_disk() set conf->fullsync
				 * to 1. This avoids skipping the sync when the
				 * remote node goes down during resync.
				 */
				if ((le32_to_cpu(sb->feature_map)
				     & MD_FEATURE_RECOVERY_OFFSET))
					rdev2->saved_raid_disk = -1;
				else
					rdev2->saved_raid_disk = role;
				ret = remove_and_add_spares(mddev, rdev2);
				pr_info("Activated spare: %pg\n",
					rdev2->bdev);
				/* wakeup mddev->thread here, so the array can
				 * perform resync with the newly activated disk */
				set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
				md_wakeup_thread(mddev->thread);
			}
			/* device faulty
			 * We just want to do the minimum to mark the disk
			 * as faulty. The recovery is performed by the
			 * node that initiated the error.
			 */
			if (role == MD_DISK_ROLE_FAULTY ||
			    role == MD_DISK_ROLE_JOURNAL) {
				md_error(mddev, rdev2);
				clear_bit(Blocked, &rdev2->flags);
			}
		}
	}

	if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) {
		ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
		if (ret)
			pr_warn("md: updating array disks failed. %d\n", ret);
	}

	/*
	 * mddev->delta_disks has already been updated in update_raid_disks,
	 * so now it is time to check for a reshape.
	 */
	if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
		/*
		 * A reshape is happening on the remote node, so we need to
		 * update reshape_position and call start_reshape.
		 */
		mddev->reshape_position = le64_to_cpu(sb->reshape_position);
		if (mddev->pers->update_reshape_pos)
			mddev->pers->update_reshape_pos(mddev);
		if (mddev->pers->start_reshape)
			mddev->pers->start_reshape(mddev);
	} else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
		   mddev->reshape_position != MaxSector &&
		   !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
		/* The reshape has just finished on another node. */
		mddev->reshape_position = MaxSector;
		if (mddev->pers->update_reshape_pos)
			mddev->pers->update_reshape_pos(mddev);
	}

	/* Finally set the event count to be up to date */
	mddev->events = le64_to_cpu(sb->events);
}
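
/*
 * Re-read the superblock of a single rdev from disk, keeping the old
 * sb_page around so it can be restored if the re-read fails. Used by
 * md_reload_sb() below after another cluster node has changed the metadata.
 */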
static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
	int err;
	struct page *swapout = rdev->sb_page;
	struct mdp_superblock_1 *sb;

	/* Store the sb page of the rdev in the swapout temporary
	 * variable in case we err in the future
	 */
	rdev->sb_page = NULL;
	err = alloc_disk_sb(rdev);
	if (err == 0) {
		ClearPageUptodate(rdev->sb_page);
		rdev->sb_loaded = 0;
		err = super_types[mddev->major_version].
			load_super(rdev, NULL, mddev->minor_version);
	}
	if (err < 0) {
		pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
			__func__, __LINE__, rdev->desc_nr, err);
		if (rdev->sb_page)
			put_page(rdev->sb_page);
		rdev->sb_page = swapout;
		rdev->sb_loaded = 1;
		return err;
	}

	sb = page_address(rdev->sb_page);
	/* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET
	 * is not set
	 */
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
		rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);

	/* The other node finished recovery, call spare_active to set
	 * the device In_sync and update mddev->degraded
	 */
	if (rdev->recovery_offset == MaxSector &&
	    !test_bit(In_sync, &rdev->flags) &&
	    mddev->pers->spare_active(mddev))
		sysfs_notify_dirent_safe(mddev->sysfs_degraded);

	put_page(swapout);
	return 0;
}

void md_reload_sb(struct mddev *mddev, int nr)
{
	struct md_rdev *rdev = NULL, *iter;
	int err;

	/* Find the rdev */
	rdev_for_each_rcu(iter, mddev) {
		if (iter->desc_nr == nr) {
			rdev = iter;
			break;
		}
	}

	if (!rdev) {
		pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
		return;
	}

	err = read_rdev(mddev, rdev);
	if (err < 0)
		return;

	check_sb_changes(mddev, rdev);

	/* Read all rdevs to update recovery_offset */
	rdev_for_each_rcu(rdev, mddev) {
		if (!test_bit(Faulty, &rdev->flags))
			read_rdev(mddev, rdev);
	}
}
EXPORT_SYMBOL(md_reload_sb);

#ifndef MODULE

/*
 * Searches all registered partitions for autorun RAID arrays
 * at boot time.
 */

static DEFINE_MUTEX(detected_devices_mutex);
static LIST_HEAD(all_detected_devices);
struct detected_devices_node {
	struct list_head list;
	dev_t dev;
};

void md_autodetect_dev(dev_t dev)
{
	struct detected_devices_node *node_detected_dev;

	node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
	if (node_detected_dev) {
		node_detected_dev->dev = dev;
		mutex_lock(&detected_devices_mutex);
		list_add_tail(&node_detected_dev->list, &all_detected_devices);
		mutex_unlock(&detected_devices_mutex);
	}
}

void md_autostart_arrays(int part)
{
	struct md_rdev *rdev;
	struct detected_devices_node *node_detected_dev;
	dev_t dev;
	int i_scanned, i_passed;

	i_scanned = 0;
	i_passed = 0;

	pr_info("md: Autodetecting RAID arrays.\n");

	mutex_lock(&detected_devices_mutex);
	while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
		i_scanned++;
		node_detected_dev = list_entry(all_detected_devices.next,
					       struct detected_devices_node, list);
		list_del(&node_detected_dev->list);
		dev = node_detected_dev->dev;
		kfree(node_detected_dev);
		mutex_unlock(&detected_devices_mutex);
		rdev = md_import_device(dev, 0, 90);
		mutex_lock(&detected_devices_mutex);
		if (IS_ERR(rdev))
			continue;

		if (test_bit(Faulty, &rdev->flags))
			continue;

		set_bit(AutoDetected, &rdev->flags);
		list_add(&rdev->same_set, &pending_raid_disks);
		i_passed++;
	}
	mutex_unlock(&detected_devices_mutex);

	pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);

	autorun_devices(part);
}
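
/*
 * The two helpers above implement in-kernel RAID autodetection for
 * non-modular builds: md_autodetect_dev() is called by the partition
 * scanning code for partitions marked as "Linux raid autodetect"
 * (type 0xfd), and md_autostart_arrays() is invoked once during boot by
 * the autodetect setup code outside this file to import the queued
 * devices and assemble any arrays found among them.
 */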

#endif /* !MODULE */

static __exit void md_exit(void)
{
	struct mddev *mddev, *n;
	int delay = 1;

	unregister_blkdev(MD_MAJOR, "md");
	unregister_blkdev(mdp_major, "mdp");
	unregister_reboot_notifier(&md_notifier);
	unregister_sysctl_table(raid_table_header);

	/* We cannot unload the module while some process is
	 * waiting for us in select() or poll() - wake them up
	 */
	md_unloading = 1;
	while (waitqueue_active(&md_event_waiters)) {
		/* not safe to leave yet */
		wake_up(&md_event_waiters);
		msleep(delay);
		delay += delay;
	}
	remove_proc_entry("mdstat", NULL);

	spin_lock(&all_mddevs_lock);
	list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
		if (!mddev_get(mddev))
			continue;
		spin_unlock(&all_mddevs_lock);
		export_array(mddev);
		mddev->ctime = 0;
		mddev->hold_active = 0;
		/*
		 * As the mddev is now fully clear, mddev_put will schedule
		 * the mddev for destruction by a workqueue, and the
		 * destroy_workqueue() below will wait for that to complete.
		 */
		mddev_put(mddev);
		spin_lock(&all_mddevs_lock);
	}
	spin_unlock(&all_mddevs_lock);

	destroy_workqueue(md_misc_wq);
	destroy_workqueue(md_bitmap_wq);
	destroy_workqueue(md_wq);
}

subsys_initcall(md_init);
module_exit(md_exit)

static int get_ro(char *buffer, const struct kernel_param *kp)
{
	return sprintf(buffer, "%d\n", start_readonly);
}

static int set_ro(const char *val, const struct kernel_param *kp)
{
	return kstrtouint(val, 10, (unsigned int *)&start_readonly);
}

module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
module_param(create_on_open, bool, S_IRUSR|S_IWUSR);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MD RAID framework");
MODULE_ALIAS("md");
MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
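
/*
 * Illustrative userspace use of the parameters defined above (a sketch that
 * assumes the driver is built as the md_mod module; the parameter names
 * follow the module_param* definitions above):
 *
 *	modprobe md_mod start_ro=1 start_dirty_degraded=1
 *	echo 1 > /sys/module/md_mod/parameters/create_on_open
 */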