1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 md.c : Multiple Devices driver for Linux 4 Copyright (C) 1998, 1999, 2000 Ingo Molnar 5 6 completely rewritten, based on the MD driver code from Marc Zyngier 7 8 Changes: 9 10 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar 11 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> 12 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> 13 - kerneld support by Boris Tobotras <boris@xtalk.msk.su> 14 - kmod support by: Cyrus Durgin 15 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> 16 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> 17 18 - lots of fixes and improvements to the RAID1/RAID5 and generic 19 RAID code (such as request based resynchronization): 20 21 Neil Brown <neilb@cse.unsw.edu.au>. 22 23 - persistent bitmap code 24 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. 25 26 27 Errors, Warnings, etc. 28 Please use: 29 pr_crit() for error conditions that risk data loss 30 pr_err() for error conditions that are unexpected, like an IO error 31 or internal inconsistency 32 pr_warn() for error conditions that could have been predicted, like 33 adding a device to an array when it has incompatible metadata 34 pr_info() for every interesting, very rare event, like an array starting 35 or stopping, or resync starting or stopping 36 pr_debug() for everything else. 37 38 */ 39 40 #include <linux/sched/mm.h> 41 #include <linux/sched/signal.h> 42 #include <linux/kthread.h> 43 #include <linux/blkdev.h> 44 #include <linux/blk-integrity.h> 45 #include <linux/badblocks.h> 46 #include <linux/sysctl.h> 47 #include <linux/seq_file.h> 48 #include <linux/fs.h> 49 #include <linux/poll.h> 50 #include <linux/ctype.h> 51 #include <linux/string.h> 52 #include <linux/hdreg.h> 53 #include <linux/proc_fs.h> 54 #include <linux/random.h> 55 #include <linux/major.h> 56 #include <linux/module.h> 57 #include <linux/reboot.h> 58 #include <linux/file.h> 59 #include <linux/compat.h> 60 #include <linux/delay.h> 61 #include <linux/raid/md_p.h> 62 #include <linux/raid/md_u.h> 63 #include <linux/raid/detect.h> 64 #include <linux/slab.h> 65 #include <linux/percpu-refcount.h> 66 #include <linux/part_stat.h> 67 68 #include "md.h" 69 #include "md-bitmap.h" 70 #include "md-cluster.h" 71 72 static const char *action_name[NR_SYNC_ACTIONS] = { 73 [ACTION_RESYNC] = "resync", 74 [ACTION_RECOVER] = "recover", 75 [ACTION_CHECK] = "check", 76 [ACTION_REPAIR] = "repair", 77 [ACTION_RESHAPE] = "reshape", 78 [ACTION_FROZEN] = "frozen", 79 [ACTION_IDLE] = "idle", 80 }; 81 82 /* pers_list is a list of registered personalities protected by pers_lock. */ 83 static LIST_HEAD(pers_list); 84 static DEFINE_SPINLOCK(pers_lock); 85 86 static const struct kobj_type md_ktype; 87 88 const struct md_cluster_operations *md_cluster_ops; 89 EXPORT_SYMBOL(md_cluster_ops); 90 static struct module *md_cluster_mod; 91 92 static DECLARE_WAIT_QUEUE_HEAD(resync_wait); 93 static struct workqueue_struct *md_wq; 94 95 /* 96 * This workqueue is used for sync_work to register new sync_thread, and for 97 * del_work to remove rdev, and for event_work that is only set by dm-raid. 98 * 99 * Note that sync_work will grab reconfig_mutex, hence never flush this 100 * workqueue with reconfig_mutex grabbed.
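 *
 * (For example, calling flush_workqueue(md_misc_wq) while holding
 * reconfig_mutex can deadlock: the flush waits for a queued sync_work,
 * and sync_work in turn waits for the mutex the caller already holds.)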
101 */ 102 static struct workqueue_struct *md_misc_wq; 103 struct workqueue_struct *md_bitmap_wq; 104 105 static int remove_and_add_spares(struct mddev *mddev, 106 struct md_rdev *this); 107 static void mddev_detach(struct mddev *mddev); 108 static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); 109 static void md_wakeup_thread_directly(struct md_thread __rcu *thread); 110 111 /* 112 * Default number of read corrections we'll attempt on an rdev 113 * before ejecting it from the array. We divide the read error 114 * count by 2 for every hour elapsed between read errors. 115 */ 116 #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20 117 /* Default safemode delay: 200 msec */ 118 #define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1) 119 /* 120 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 121 * is 1000 KB/sec, so the extra system load does not show up that much. 122 * Increase it if you want to have more _guaranteed_ speed. Note that 123 * the RAID driver will use the maximum available bandwidth if the IO 124 * subsystem is idle. There is also an 'absolute maximum' reconstruction 125 * speed limit - in case reconstruction slows down your system despite 126 * idle IO detection. 127 * 128 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. 129 * or /sys/block/mdX/md/sync_speed_{min,max} 130 */ 131 132 static int sysctl_speed_limit_min = 1000; 133 static int sysctl_speed_limit_max = 200000; 134 static inline int speed_min(struct mddev *mddev) 135 { 136 return mddev->sync_speed_min ? 137 mddev->sync_speed_min : sysctl_speed_limit_min; 138 } 139 140 static inline int speed_max(struct mddev *mddev) 141 { 142 return mddev->sync_speed_max ? 143 mddev->sync_speed_max : sysctl_speed_limit_max; 144 } 145 146 static void rdev_uninit_serial(struct md_rdev *rdev) 147 { 148 if (!test_and_clear_bit(CollisionCheck, &rdev->flags)) 149 return; 150 151 kvfree(rdev->serial); 152 rdev->serial = NULL; 153 } 154 155 static void rdevs_uninit_serial(struct mddev *mddev) 156 { 157 struct md_rdev *rdev; 158 159 rdev_for_each(rdev, mddev) 160 rdev_uninit_serial(rdev); 161 } 162 163 static int rdev_init_serial(struct md_rdev *rdev) 164 { 165 /* serial_nums equals BARRIER_BUCKETS_NR */ 166 int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t)))); 167 struct serial_in_rdev *serial = NULL; 168 169 if (test_bit(CollisionCheck, &rdev->flags)) 170 return 0; 171 172 serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums, 173 GFP_KERNEL); 174 if (!serial) 175 return -ENOMEM; 176 177 for (i = 0; i < serial_nums; i++) { 178 struct serial_in_rdev *serial_tmp = &serial[i]; 179 180 spin_lock_init(&serial_tmp->serial_lock); 181 serial_tmp->serial_rb = RB_ROOT_CACHED; 182 init_waitqueue_head(&serial_tmp->serial_io_wait); 183 } 184 185 rdev->serial = serial; 186 set_bit(CollisionCheck, &rdev->flags); 187 188 return 0; 189 } 190 191 static int rdevs_init_serial(struct mddev *mddev) 192 { 193 struct md_rdev *rdev; 194 int ret = 0; 195 196 rdev_for_each(rdev, mddev) { 197 ret = rdev_init_serial(rdev); 198 if (ret) 199 break; 200 } 201 202 /* Free all resources if the pool does not exist */ 203 if (ret && !mddev->serial_info_pool) 204 rdevs_uninit_serial(mddev); 205 206 return ret; 207 } 208 209 /* 210 * rdev needs to enable serialization if it meets the conditions: 211 * 1. it is a multi-queue device flagged with writemostly. 212 * 2. the write-behind mode is enabled.
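 *
 * (Both conditions have to hold at once; rdev_need_serial() below also
 * checks that write-behind is actually configured, i.e. that
 * bitmap_info.max_write_behind is non-zero.)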
213 */ 214 static int rdev_need_serial(struct md_rdev *rdev) 215 { 216 return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 && 217 rdev->bdev->bd_disk->queue->nr_hw_queues != 1 && 218 test_bit(WriteMostly, &rdev->flags)); 219 } 220 221 /* 222 * Init resource for rdev(s), then create serial_info_pool if: 223 * 1. rdev is the first device which return true from rdev_enable_serial. 224 * 2. rdev is NULL, means we want to enable serialization for all rdevs. 225 */ 226 void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev) 227 { 228 int ret = 0; 229 230 if (rdev && !rdev_need_serial(rdev) && 231 !test_bit(CollisionCheck, &rdev->flags)) 232 return; 233 234 if (!rdev) 235 ret = rdevs_init_serial(mddev); 236 else 237 ret = rdev_init_serial(rdev); 238 if (ret) 239 return; 240 241 if (mddev->serial_info_pool == NULL) { 242 /* 243 * already in memalloc noio context by 244 * mddev_suspend() 245 */ 246 mddev->serial_info_pool = 247 mempool_create_kmalloc_pool(NR_SERIAL_INFOS, 248 sizeof(struct serial_info)); 249 if (!mddev->serial_info_pool) { 250 rdevs_uninit_serial(mddev); 251 pr_err("can't alloc memory pool for serialization\n"); 252 } 253 } 254 } 255 256 /* 257 * Free resource from rdev(s), and destroy serial_info_pool under conditions: 258 * 1. rdev is the last device flaged with CollisionCheck. 259 * 2. when bitmap is destroyed while policy is not enabled. 260 * 3. for disable policy, the pool is destroyed only when no rdev needs it. 261 */ 262 void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) 263 { 264 if (rdev && !test_bit(CollisionCheck, &rdev->flags)) 265 return; 266 267 if (mddev->serial_info_pool) { 268 struct md_rdev *temp; 269 int num = 0; /* used to track if other rdevs need the pool */ 270 271 rdev_for_each(temp, mddev) { 272 if (!rdev) { 273 if (!mddev->serialize_policy || 274 !rdev_need_serial(temp)) 275 rdev_uninit_serial(temp); 276 else 277 num++; 278 } else if (temp != rdev && 279 test_bit(CollisionCheck, &temp->flags)) 280 num++; 281 } 282 283 if (rdev) 284 rdev_uninit_serial(rdev); 285 286 if (num) 287 pr_info("The mempool could be used by other devices\n"); 288 else { 289 mempool_destroy(mddev->serial_info_pool); 290 mddev->serial_info_pool = NULL; 291 } 292 } 293 } 294 295 static struct ctl_table_header *raid_table_header; 296 297 static struct ctl_table raid_table[] = { 298 { 299 .procname = "speed_limit_min", 300 .data = &sysctl_speed_limit_min, 301 .maxlen = sizeof(int), 302 .mode = S_IRUGO|S_IWUSR, 303 .proc_handler = proc_dointvec, 304 }, 305 { 306 .procname = "speed_limit_max", 307 .data = &sysctl_speed_limit_max, 308 .maxlen = sizeof(int), 309 .mode = S_IRUGO|S_IWUSR, 310 .proc_handler = proc_dointvec, 311 }, 312 }; 313 314 static int start_readonly; 315 316 /* 317 * The original mechanism for creating an md device is to create 318 * a device node in /dev and to open it. This causes races with device-close. 319 * The preferred method is to write to the "new_array" module parameter. 320 * This can avoid races. 321 * Setting create_on_open to false disables the original mechanism 322 * so all the races disappear. 323 */ 324 static bool create_on_open = true; 325 326 /* 327 * We have a system wide 'event count' that is incremented 328 * on any 'interesting' event, and readers of /proc/mdstat 329 * can use 'poll' or 'select' to find out when the event 330 * count increases. 
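 *
 * (A monitoring program can, for example, keep /proc/mdstat open, sleep
 * in poll()/select() until the file signals an exceptional condition,
 * and then re-read it to see what changed.)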
331 * 332 * Events are: 333 * start array, stop array, error, add device, remove device, 334 * start build, activate spare 335 */ 336 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); 337 static atomic_t md_event_count; 338 void md_new_event(void) 339 { 340 atomic_inc(&md_event_count); 341 wake_up(&md_event_waiters); 342 } 343 EXPORT_SYMBOL_GPL(md_new_event); 344 345 /* 346 * Enables to iterate over all existing md arrays 347 * all_mddevs_lock protects this list. 348 */ 349 static LIST_HEAD(all_mddevs); 350 static DEFINE_SPINLOCK(all_mddevs_lock); 351 352 static bool is_md_suspended(struct mddev *mddev) 353 { 354 return percpu_ref_is_dying(&mddev->active_io); 355 } 356 /* Rather than calling directly into the personality make_request function, 357 * IO requests come here first so that we can check if the device is 358 * being suspended pending a reconfiguration. 359 * We hold a refcount over the call to ->make_request. By the time that 360 * call has finished, the bio has been linked into some internal structure 361 * and so is visible to ->quiesce(), so we don't need the refcount any more. 362 */ 363 static bool is_suspended(struct mddev *mddev, struct bio *bio) 364 { 365 if (is_md_suspended(mddev)) 366 return true; 367 if (bio_data_dir(bio) != WRITE) 368 return false; 369 if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi)) 370 return false; 371 if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi)) 372 return false; 373 if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo)) 374 return false; 375 return true; 376 } 377 378 bool md_handle_request(struct mddev *mddev, struct bio *bio) 379 { 380 check_suspended: 381 if (is_suspended(mddev, bio)) { 382 DEFINE_WAIT(__wait); 383 /* Bail out if REQ_NOWAIT is set for the bio */ 384 if (bio->bi_opf & REQ_NOWAIT) { 385 bio_wouldblock_error(bio); 386 return true; 387 } 388 for (;;) { 389 prepare_to_wait(&mddev->sb_wait, &__wait, 390 TASK_UNINTERRUPTIBLE); 391 if (!is_suspended(mddev, bio)) 392 break; 393 schedule(); 394 } 395 finish_wait(&mddev->sb_wait, &__wait); 396 } 397 if (!percpu_ref_tryget_live(&mddev->active_io)) 398 goto check_suspended; 399 400 if (!mddev->pers->make_request(mddev, bio)) { 401 percpu_ref_put(&mddev->active_io); 402 if (!mddev->gendisk && mddev->pers->prepare_suspend) 403 return false; 404 goto check_suspended; 405 } 406 407 percpu_ref_put(&mddev->active_io); 408 return true; 409 } 410 EXPORT_SYMBOL(md_handle_request); 411 412 static void md_submit_bio(struct bio *bio) 413 { 414 const int rw = bio_data_dir(bio); 415 struct mddev *mddev = bio->bi_bdev->bd_disk->private_data; 416 417 if (mddev == NULL || mddev->pers == NULL) { 418 bio_io_error(bio); 419 return; 420 } 421 422 if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) { 423 bio_io_error(bio); 424 return; 425 } 426 427 bio = bio_split_to_limits(bio); 428 if (!bio) 429 return; 430 431 if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) { 432 if (bio_sectors(bio) != 0) 433 bio->bi_status = BLK_STS_IOERR; 434 bio_endio(bio); 435 return; 436 } 437 438 /* bio could be mergeable after passing to underlayer */ 439 bio->bi_opf &= ~REQ_NOMERGE; 440 441 md_handle_request(mddev, bio); 442 } 443 444 /* 445 * Make sure no new requests are submitted to the device, and any requests that 446 * have been submitted are completely handled. 
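 *
 * A typical caller (sketch only; real callers vary) suspends, performs the
 * reconfiguration, and resumes:
 *
 *	if (!mddev_suspend(mddev, true)) {
 *		... reconfigure the array ...
 *		mddev_resume(mddev);
 *	}
 *
 * Note that reconfig_mutex must not be held across mddev_suspend(); see
 * the lockdep assertion below.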
447 */ 448 int mddev_suspend(struct mddev *mddev, bool interruptible) 449 { 450 int err = 0; 451 452 /* 453 * holding reconfig_mutex while waiting for normal io will deadlock, because 454 * no other context could then update the super_block, and normal io can rely 455 * on the super_block being updated. 456 */ 457 lockdep_assert_not_held(&mddev->reconfig_mutex); 458 459 if (interruptible) 460 err = mutex_lock_interruptible(&mddev->suspend_mutex); 461 else 462 mutex_lock(&mddev->suspend_mutex); 463 if (err) 464 return err; 465 466 if (mddev->suspended) { 467 WRITE_ONCE(mddev->suspended, mddev->suspended + 1); 468 mutex_unlock(&mddev->suspend_mutex); 469 return 0; 470 } 471 472 percpu_ref_kill(&mddev->active_io); 473 if (interruptible) 474 err = wait_event_interruptible(mddev->sb_wait, 475 percpu_ref_is_zero(&mddev->active_io)); 476 else 477 wait_event(mddev->sb_wait, 478 percpu_ref_is_zero(&mddev->active_io)); 479 if (err) { 480 percpu_ref_resurrect(&mddev->active_io); 481 mutex_unlock(&mddev->suspend_mutex); 482 return err; 483 } 484 485 /* 486 * For raid456, io might be waiting for reshape to make progress, 487 * allow new reshape to start while waiting for io to be done to 488 * prevent deadlock. 489 */ 490 WRITE_ONCE(mddev->suspended, mddev->suspended + 1); 491 492 /* restrict memory reclaim I/O while the raid array is suspended */ 493 mddev->noio_flag = memalloc_noio_save(); 494 495 mutex_unlock(&mddev->suspend_mutex); 496 return 0; 497 } 498 EXPORT_SYMBOL_GPL(mddev_suspend); 499 500 static void __mddev_resume(struct mddev *mddev, bool recovery_needed) 501 { 502 lockdep_assert_not_held(&mddev->reconfig_mutex); 503 504 mutex_lock(&mddev->suspend_mutex); 505 WRITE_ONCE(mddev->suspended, mddev->suspended - 1); 506 if (mddev->suspended) { 507 mutex_unlock(&mddev->suspend_mutex); 508 return; 509 } 510 511 /* leave the memalloc noio scope entered in mddev_suspend() */ 512 memalloc_noio_restore(mddev->noio_flag); 513 514 percpu_ref_resurrect(&mddev->active_io); 515 wake_up(&mddev->sb_wait); 516 517 if (recovery_needed) 518 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 519 md_wakeup_thread(mddev->thread); 520 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 521 522 mutex_unlock(&mddev->suspend_mutex); 523 } 524 525 void mddev_resume(struct mddev *mddev) 526 { 527 return __mddev_resume(mddev, true); 528 } 529 EXPORT_SYMBOL_GPL(mddev_resume); 530 531 /* sync bdev before setting device to readonly or stopping raid */ 532 static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_num) 533 { 534 mutex_lock(&mddev->open_mutex); 535 if (mddev->pers && atomic_read(&mddev->openers) > opener_num) { 536 mutex_unlock(&mddev->open_mutex); 537 return -EBUSY; 538 } 539 if (test_and_set_bit(MD_CLOSING, &mddev->flags)) { 540 mutex_unlock(&mddev->open_mutex); 541 return -EBUSY; 542 } 543 mutex_unlock(&mddev->open_mutex); 544 545 sync_blockdev(mddev->gendisk->part0); 546 return 0; 547 } 548 549 /* 550 * Generic flush handling for md 551 */ 552 553 static void md_end_flush(struct bio *bio) 554 { 555 struct md_rdev *rdev = bio->bi_private; 556 struct mddev *mddev = rdev->mddev; 557 558 bio_put(bio); 559 560 rdev_dec_pending(rdev, mddev); 561 562 if (atomic_dec_and_test(&mddev->flush_pending)) 563 /* The pre-request flush has finished */ 564 queue_work(md_wq, &mddev->flush_work); 565 } 566 567 static void md_submit_flush_data(struct work_struct *ws); 568 569 static void submit_flushes(struct work_struct *ws) 570 { 571 struct mddev *mddev = container_of(ws, struct mddev, flush_work); 572 struct md_rdev *rdev; 573 574
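	/*
	 * Send an empty REQ_PREFLUSH bio to every active, non-Faulty member
	 * device; md_end_flush() drops flush_pending for each completion, and
	 * once all of them have finished, md_submit_flush_data() is queued to
	 * finish (or resubmit) the original flush bio.
	 */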
mddev->start_flush = ktime_get_boottime(); 575 INIT_WORK(&mddev->flush_work, md_submit_flush_data); 576 atomic_set(&mddev->flush_pending, 1); 577 rcu_read_lock(); 578 rdev_for_each_rcu(rdev, mddev) 579 if (rdev->raid_disk >= 0 && 580 !test_bit(Faulty, &rdev->flags)) { 581 struct bio *bi; 582 583 atomic_inc(&rdev->nr_pending); 584 rcu_read_unlock(); 585 bi = bio_alloc_bioset(rdev->bdev, 0, 586 REQ_OP_WRITE | REQ_PREFLUSH, 587 GFP_NOIO, &mddev->bio_set); 588 bi->bi_end_io = md_end_flush; 589 bi->bi_private = rdev; 590 atomic_inc(&mddev->flush_pending); 591 submit_bio(bi); 592 rcu_read_lock(); 593 } 594 rcu_read_unlock(); 595 if (atomic_dec_and_test(&mddev->flush_pending)) 596 queue_work(md_wq, &mddev->flush_work); 597 } 598 599 static void md_submit_flush_data(struct work_struct *ws) 600 { 601 struct mddev *mddev = container_of(ws, struct mddev, flush_work); 602 struct bio *bio = mddev->flush_bio; 603 604 /* 605 * must reset flush_bio before calling into md_handle_request to avoid a 606 * deadlock, because other bios that passed the md_handle_request suspend check 607 * could wait for this flush, and the md_handle_request call below could wait 608 * for those bios because of the suspend check 609 */ 610 spin_lock_irq(&mddev->lock); 611 mddev->prev_flush_start = mddev->start_flush; 612 mddev->flush_bio = NULL; 613 spin_unlock_irq(&mddev->lock); 614 wake_up(&mddev->sb_wait); 615 616 if (bio->bi_iter.bi_size == 0) { 617 /* an empty barrier - all done */ 618 bio_endio(bio); 619 } else { 620 bio->bi_opf &= ~REQ_PREFLUSH; 621 622 /* 623 * make_request() will never return an error here; it only 624 * returns an error from raid5_make_request() when used by dm-raid. 625 * Since dm always splits data and flush operations into 626 * two separate ios, the io size of a flush submitted by dm 627 * is always 0, so make_request() will not be called here. 628 */ 629 if (WARN_ON_ONCE(!mddev->pers->make_request(mddev, bio))) 630 bio_io_error(bio); 631 } 632 633 /* The pair is percpu_ref_get() from md_flush_request() */ 634 percpu_ref_put(&mddev->active_io); 635 } 636 637 /* 638 * Manages consolidation of flushes and submitting any flushes needed for 639 * a bio with REQ_PREFLUSH. Returns true if the bio is finished or is 640 * being finished in another context. Returns false if the flushing is 641 * complete but still needs the I/O portion of the bio to be processed. 642 */ 643 bool md_flush_request(struct mddev *mddev, struct bio *bio) 644 { 645 ktime_t req_start = ktime_get_boottime(); 646 spin_lock_irq(&mddev->lock); 647 /* flush requests wait until ongoing flush completes, 648 * hence coalescing all the pending requests. 649 */ 650 wait_event_lock_irq(mddev->sb_wait, 651 !mddev->flush_bio || 652 ktime_before(req_start, mddev->prev_flush_start), 653 mddev->lock); 654 /* new request after previous flush is completed */ 655 if (ktime_after(req_start, mddev->prev_flush_start)) { 656 WARN_ON(mddev->flush_bio); 657 /* 658 * Grab a reference to make sure mddev_suspend() will wait for 659 * this flush to be done. 660 * 661 * md_flush_request() is called under md_handle_request() and 662 * 'active_io' is already grabbed, hence percpu_ref_is_zero() 663 * won't pass, percpu_ref_tryget_live() can't be used because 664 * percpu_ref_kill() can be called by mddev_suspend() 665 * concurrently.
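 *
 * (The reference grabbed here is dropped again in md_submit_flush_data()
 * once the flush has been handled.)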
666 */ 667 WARN_ON(percpu_ref_is_zero(&mddev->active_io)); 668 percpu_ref_get(&mddev->active_io); 669 mddev->flush_bio = bio; 670 spin_unlock_irq(&mddev->lock); 671 INIT_WORK(&mddev->flush_work, submit_flushes); 672 queue_work(md_wq, &mddev->flush_work); 673 return true; 674 } 675 676 /* flush was performed for some other bio while we waited. */ 677 spin_unlock_irq(&mddev->lock); 678 if (bio->bi_iter.bi_size == 0) { 679 /* pure flush without data - all done */ 680 bio_endio(bio); 681 return true; 682 } 683 684 bio->bi_opf &= ~REQ_PREFLUSH; 685 return false; 686 } 687 EXPORT_SYMBOL(md_flush_request); 688 689 static inline struct mddev *mddev_get(struct mddev *mddev) 690 { 691 lockdep_assert_held(&all_mddevs_lock); 692 693 if (test_bit(MD_DELETED, &mddev->flags)) 694 return NULL; 695 atomic_inc(&mddev->active); 696 return mddev; 697 } 698 699 static void mddev_delayed_delete(struct work_struct *ws); 700 701 static void __mddev_put(struct mddev *mddev) 702 { 703 if (mddev->raid_disks || !list_empty(&mddev->disks) || 704 mddev->ctime || mddev->hold_active) 705 return; 706 707 /* Array is not configured at all, and not held active, so destroy it */ 708 set_bit(MD_DELETED, &mddev->flags); 709 710 /* 711 * Call queue_work inside the spinlock so that flush_workqueue() after 712 * mddev_find will succeed in waiting for the work to be done. 713 */ 714 queue_work(md_misc_wq, &mddev->del_work); 715 } 716 717 void mddev_put(struct mddev *mddev) 718 { 719 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 720 return; 721 722 __mddev_put(mddev); 723 spin_unlock(&all_mddevs_lock); 724 } 725 726 static void md_safemode_timeout(struct timer_list *t); 727 static void md_start_sync(struct work_struct *ws); 728 729 static void active_io_release(struct percpu_ref *ref) 730 { 731 struct mddev *mddev = container_of(ref, struct mddev, active_io); 732 733 wake_up(&mddev->sb_wait); 734 } 735 736 static void no_op(struct percpu_ref *r) {} 737 738 int mddev_init(struct mddev *mddev) 739 { 740 741 if (percpu_ref_init(&mddev->active_io, active_io_release, 742 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) 743 return -ENOMEM; 744 745 if (percpu_ref_init(&mddev->writes_pending, no_op, 746 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { 747 percpu_ref_exit(&mddev->active_io); 748 return -ENOMEM; 749 } 750 751 /* We want to start with the refcount at zero */ 752 percpu_ref_put(&mddev->writes_pending); 753 754 mutex_init(&mddev->open_mutex); 755 mutex_init(&mddev->reconfig_mutex); 756 mutex_init(&mddev->suspend_mutex); 757 mutex_init(&mddev->bitmap_info.mutex); 758 INIT_LIST_HEAD(&mddev->disks); 759 INIT_LIST_HEAD(&mddev->all_mddevs); 760 INIT_LIST_HEAD(&mddev->deleting); 761 timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); 762 atomic_set(&mddev->active, 1); 763 atomic_set(&mddev->openers, 0); 764 atomic_set(&mddev->sync_seq, 0); 765 spin_lock_init(&mddev->lock); 766 atomic_set(&mddev->flush_pending, 0); 767 init_waitqueue_head(&mddev->sb_wait); 768 init_waitqueue_head(&mddev->recovery_wait); 769 mddev->reshape_position = MaxSector; 770 mddev->reshape_backwards = 0; 771 mddev->last_sync_action = ACTION_IDLE; 772 mddev->resync_min = 0; 773 mddev->resync_max = MaxSector; 774 mddev->level = LEVEL_NONE; 775 776 INIT_WORK(&mddev->sync_work, md_start_sync); 777 INIT_WORK(&mddev->del_work, mddev_delayed_delete); 778 779 return 0; 780 } 781 EXPORT_SYMBOL_GPL(mddev_init); 782 783 void mddev_destroy(struct mddev *mddev) 784 { 785 percpu_ref_exit(&mddev->active_io); 786 percpu_ref_exit(&mddev->writes_pending); 787 } 788 
EXPORT_SYMBOL_GPL(mddev_destroy); 789 790 static struct mddev *mddev_find_locked(dev_t unit) 791 { 792 struct mddev *mddev; 793 794 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 795 if (mddev->unit == unit) 796 return mddev; 797 798 return NULL; 799 } 800 801 /* find an unused unit number */ 802 static dev_t mddev_alloc_unit(void) 803 { 804 static int next_minor = 512; 805 int start = next_minor; 806 bool is_free = false; 807 dev_t dev = 0; 808 809 while (!is_free) { 810 dev = MKDEV(MD_MAJOR, next_minor); 811 next_minor++; 812 if (next_minor > MINORMASK) 813 next_minor = 0; 814 if (next_minor == start) 815 return 0; /* Oh dear, all in use. */ 816 is_free = !mddev_find_locked(dev); 817 } 818 819 return dev; 820 } 821 822 static struct mddev *mddev_alloc(dev_t unit) 823 { 824 struct mddev *new; 825 int error; 826 827 if (unit && MAJOR(unit) != MD_MAJOR) 828 unit &= ~((1 << MdpMinorShift) - 1); 829 830 new = kzalloc(sizeof(*new), GFP_KERNEL); 831 if (!new) 832 return ERR_PTR(-ENOMEM); 833 834 error = mddev_init(new); 835 if (error) 836 goto out_free_new; 837 838 spin_lock(&all_mddevs_lock); 839 if (unit) { 840 error = -EEXIST; 841 if (mddev_find_locked(unit)) 842 goto out_destroy_new; 843 new->unit = unit; 844 if (MAJOR(unit) == MD_MAJOR) 845 new->md_minor = MINOR(unit); 846 else 847 new->md_minor = MINOR(unit) >> MdpMinorShift; 848 new->hold_active = UNTIL_IOCTL; 849 } else { 850 error = -ENODEV; 851 new->unit = mddev_alloc_unit(); 852 if (!new->unit) 853 goto out_destroy_new; 854 new->md_minor = MINOR(new->unit); 855 new->hold_active = UNTIL_STOP; 856 } 857 858 list_add(&new->all_mddevs, &all_mddevs); 859 spin_unlock(&all_mddevs_lock); 860 return new; 861 862 out_destroy_new: 863 spin_unlock(&all_mddevs_lock); 864 mddev_destroy(new); 865 out_free_new: 866 kfree(new); 867 return ERR_PTR(error); 868 } 869 870 static void mddev_free(struct mddev *mddev) 871 { 872 spin_lock(&all_mddevs_lock); 873 list_del(&mddev->all_mddevs); 874 spin_unlock(&all_mddevs_lock); 875 876 mddev_destroy(mddev); 877 kfree(mddev); 878 } 879 880 static const struct attribute_group md_redundancy_group; 881 882 void mddev_unlock(struct mddev *mddev) 883 { 884 struct md_rdev *rdev; 885 struct md_rdev *tmp; 886 LIST_HEAD(delete); 887 888 if (!list_empty(&mddev->deleting)) 889 list_splice_init(&mddev->deleting, &delete); 890 891 if (mddev->to_remove) { 892 /* These cannot be removed under reconfig_mutex as 893 * an access to the files will try to take reconfig_mutex 894 * while holding the file unremovable, which leads to 895 * a deadlock. 896 * So set sysfs_active while the remove is happening, 897 * and anything else which might set ->to_remove or may 898 * otherwise change the sysfs namespace will fail with 899 * -EBUSY if sysfs_active is still set. 900 * We set sysfs_active under reconfig_mutex and elsewhere 901 * test it under the same mutex to ensure its correct value 902 * is seen.
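 *
 * (Concretely, sysfs_remove_group() waits for any ->show()/->store()
 * callbacks on these files to finish, and such a callback may itself be
 * blocked waiting for reconfig_mutex, so the group must only be removed
 * after the mutex has been dropped.)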
903 */ 904 const struct attribute_group *to_remove = mddev->to_remove; 905 mddev->to_remove = NULL; 906 mddev->sysfs_active = 1; 907 mutex_unlock(&mddev->reconfig_mutex); 908 909 if (mddev->kobj.sd) { 910 if (to_remove != &md_redundancy_group) 911 sysfs_remove_group(&mddev->kobj, to_remove); 912 if (mddev->pers == NULL || 913 mddev->pers->sync_request == NULL) { 914 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 915 if (mddev->sysfs_action) 916 sysfs_put(mddev->sysfs_action); 917 if (mddev->sysfs_completed) 918 sysfs_put(mddev->sysfs_completed); 919 if (mddev->sysfs_degraded) 920 sysfs_put(mddev->sysfs_degraded); 921 mddev->sysfs_action = NULL; 922 mddev->sysfs_completed = NULL; 923 mddev->sysfs_degraded = NULL; 924 } 925 } 926 mddev->sysfs_active = 0; 927 } else 928 mutex_unlock(&mddev->reconfig_mutex); 929 930 md_wakeup_thread(mddev->thread); 931 wake_up(&mddev->sb_wait); 932 933 list_for_each_entry_safe(rdev, tmp, &delete, same_set) { 934 list_del_init(&rdev->same_set); 935 kobject_del(&rdev->kobj); 936 export_rdev(rdev, mddev); 937 } 938 } 939 EXPORT_SYMBOL_GPL(mddev_unlock); 940 941 struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr) 942 { 943 struct md_rdev *rdev; 944 945 rdev_for_each_rcu(rdev, mddev) 946 if (rdev->desc_nr == nr) 947 return rdev; 948 949 return NULL; 950 } 951 EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu); 952 953 static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) 954 { 955 struct md_rdev *rdev; 956 957 rdev_for_each(rdev, mddev) 958 if (rdev->bdev->bd_dev == dev) 959 return rdev; 960 961 return NULL; 962 } 963 964 struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev) 965 { 966 struct md_rdev *rdev; 967 968 rdev_for_each_rcu(rdev, mddev) 969 if (rdev->bdev->bd_dev == dev) 970 return rdev; 971 972 return NULL; 973 } 974 EXPORT_SYMBOL_GPL(md_find_rdev_rcu); 975 976 static struct md_personality *find_pers(int level, char *clevel) 977 { 978 struct md_personality *pers; 979 list_for_each_entry(pers, &pers_list, list) { 980 if (level != LEVEL_NONE && pers->level == level) 981 return pers; 982 if (strcmp(pers->name, clevel)==0) 983 return pers; 984 } 985 return NULL; 986 } 987 988 /* return the offset of the super block in 512byte sectors */ 989 static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) 990 { 991 return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev)); 992 } 993 994 static int alloc_disk_sb(struct md_rdev *rdev) 995 { 996 rdev->sb_page = alloc_page(GFP_KERNEL); 997 if (!rdev->sb_page) 998 return -ENOMEM; 999 return 0; 1000 } 1001 1002 void md_rdev_clear(struct md_rdev *rdev) 1003 { 1004 if (rdev->sb_page) { 1005 put_page(rdev->sb_page); 1006 rdev->sb_loaded = 0; 1007 rdev->sb_page = NULL; 1008 rdev->sb_start = 0; 1009 rdev->sectors = 0; 1010 } 1011 if (rdev->bb_page) { 1012 put_page(rdev->bb_page); 1013 rdev->bb_page = NULL; 1014 } 1015 badblocks_exit(&rdev->badblocks); 1016 } 1017 EXPORT_SYMBOL_GPL(md_rdev_clear); 1018 1019 static void super_written(struct bio *bio) 1020 { 1021 struct md_rdev *rdev = bio->bi_private; 1022 struct mddev *mddev = rdev->mddev; 1023 1024 if (bio->bi_status) { 1025 pr_err("md: %s gets error=%d\n", __func__, 1026 blk_status_to_errno(bio->bi_status)); 1027 md_error(mddev, rdev); 1028 if (!test_bit(Faulty, &rdev->flags) 1029 && (bio->bi_opf & MD_FAILFAST)) { 1030 set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags); 1031 set_bit(LastDev, &rdev->flags); 1032 } 1033 } else 1034 clear_bit(LastDev, &rdev->flags); 1035 1036 bio_put(bio); 1037 1038 rdev_dec_pending(rdev, mddev); 1039 1040 if 
(atomic_dec_and_test(&mddev->pending_writes)) 1041 wake_up(&mddev->sb_wait); 1042 } 1043 1044 void md_super_write(struct mddev *mddev, struct md_rdev *rdev, 1045 sector_t sector, int size, struct page *page) 1046 { 1047 /* write first size bytes of page to sector of rdev 1048 * Increment mddev->pending_writes before returning 1049 * and decrement it on completion, waking up sb_wait 1050 * if zero is reached. 1051 * If an error occurred, call md_error 1052 */ 1053 struct bio *bio; 1054 1055 if (!page) 1056 return; 1057 1058 if (test_bit(Faulty, &rdev->flags)) 1059 return; 1060 1061 bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev, 1062 1, 1063 REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_META 1064 | REQ_PREFLUSH | REQ_FUA, 1065 GFP_NOIO, &mddev->sync_set); 1066 1067 atomic_inc(&rdev->nr_pending); 1068 1069 bio->bi_iter.bi_sector = sector; 1070 __bio_add_page(bio, page, size, 0); 1071 bio->bi_private = rdev; 1072 bio->bi_end_io = super_written; 1073 1074 if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) && 1075 test_bit(FailFast, &rdev->flags) && 1076 !test_bit(LastDev, &rdev->flags)) 1077 bio->bi_opf |= MD_FAILFAST; 1078 1079 atomic_inc(&mddev->pending_writes); 1080 submit_bio(bio); 1081 } 1082 1083 int md_super_wait(struct mddev *mddev) 1084 { 1085 /* wait for all superblock writes that were scheduled to complete */ 1086 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); 1087 if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags)) 1088 return -EAGAIN; 1089 return 0; 1090 } 1091 1092 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, 1093 struct page *page, blk_opf_t opf, bool metadata_op) 1094 { 1095 struct bio bio; 1096 struct bio_vec bvec; 1097 1098 if (metadata_op && rdev->meta_bdev) 1099 bio_init(&bio, rdev->meta_bdev, &bvec, 1, opf); 1100 else 1101 bio_init(&bio, rdev->bdev, &bvec, 1, opf); 1102 1103 if (metadata_op) 1104 bio.bi_iter.bi_sector = sector + rdev->sb_start; 1105 else if (rdev->mddev->reshape_position != MaxSector && 1106 (rdev->mddev->reshape_backwards == 1107 (sector >= rdev->mddev->reshape_position))) 1108 bio.bi_iter.bi_sector = sector + rdev->new_data_offset; 1109 else 1110 bio.bi_iter.bi_sector = sector + rdev->data_offset; 1111 __bio_add_page(&bio, page, size, 0); 1112 1113 submit_bio_wait(&bio); 1114 1115 return !bio.bi_status; 1116 } 1117 EXPORT_SYMBOL_GPL(sync_page_io); 1118 1119 static int read_disk_sb(struct md_rdev *rdev, int size) 1120 { 1121 if (rdev->sb_loaded) 1122 return 0; 1123 1124 if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true)) 1125 goto fail; 1126 rdev->sb_loaded = 1; 1127 return 0; 1128 1129 fail: 1130 pr_err("md: disabled device %pg, could not read superblock.\n", 1131 rdev->bdev); 1132 return -EINVAL; 1133 } 1134 1135 static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 1136 { 1137 return sb1->set_uuid0 == sb2->set_uuid0 && 1138 sb1->set_uuid1 == sb2->set_uuid1 && 1139 sb1->set_uuid2 == sb2->set_uuid2 && 1140 sb1->set_uuid3 == sb2->set_uuid3; 1141 } 1142 1143 static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 1144 { 1145 int ret; 1146 mdp_super_t *tmp1, *tmp2; 1147 1148 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 1149 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 1150 1151 if (!tmp1 || !tmp2) { 1152 ret = 0; 1153 goto abort; 1154 } 1155 1156 *tmp1 = *sb1; 1157 *tmp2 = *sb2; 1158 1159 /* 1160 * nr_disks is not constant 1161 */ 1162 tmp1->nr_disks = 0; 1163 tmp2->nr_disks = 0; 1164 1165 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); 1166 abort: 
1167 kfree(tmp1); 1168 kfree(tmp2); 1169 return ret; 1170 } 1171 1172 static u32 md_csum_fold(u32 csum) 1173 { 1174 csum = (csum & 0xffff) + (csum >> 16); 1175 return (csum & 0xffff) + (csum >> 16); 1176 } 1177 1178 static unsigned int calc_sb_csum(mdp_super_t *sb) 1179 { 1180 u64 newcsum = 0; 1181 u32 *sb32 = (u32*)sb; 1182 int i; 1183 unsigned int disk_csum, csum; 1184 1185 disk_csum = sb->sb_csum; 1186 sb->sb_csum = 0; 1187 1188 for (i = 0; i < MD_SB_BYTES/4 ; i++) 1189 newcsum += sb32[i]; 1190 csum = (newcsum & 0xffffffff) + (newcsum>>32); 1191 1192 #ifdef CONFIG_ALPHA 1193 /* This used to use csum_partial, which was wrong for several 1194 * reasons including that different results are returned on 1195 * different architectures. It isn't critical that we get exactly 1196 * the same return value as before (we always csum_fold before 1197 * testing, and that removes any differences). However as we 1198 * know that csum_partial always returned a 16bit value on 1199 * alphas, do a fold to maximise conformity to previous behaviour. 1200 */ 1201 sb->sb_csum = md_csum_fold(disk_csum); 1202 #else 1203 sb->sb_csum = disk_csum; 1204 #endif 1205 return csum; 1206 } 1207 1208 /* 1209 * Handle superblock details. 1210 * We want to be able to handle multiple superblock formats 1211 * so we have a common interface to them all, and an array of 1212 * different handlers. 1213 * We rely on user-space to write the initial superblock, and support 1214 * reading and updating of superblocks. 1215 * Interface methods are: 1216 * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) 1217 * loads and validates a superblock on dev. 1218 * if refdev != NULL, compare superblocks on both devices 1219 * Return: 1220 * 0 - dev has a superblock that is compatible with refdev 1221 * 1 - dev has a superblock that is compatible and newer than refdev 1222 * so dev should be used as the refdev in future 1223 * -EINVAL superblock incompatible or invalid 1224 * -othererror e.g. -EIO 1225 * 1226 * int validate_super(struct mddev *mddev, struct md_rdev *dev) 1227 * Verify that dev is acceptable into mddev. 1228 * The first time, mddev->raid_disks will be 0, and data from 1229 * dev should be merged in. Subsequent calls check that dev 1230 * is new enough. Return 0 or -EINVAL 1231 * 1232 * void sync_super(struct mddev *mddev, struct md_rdev *dev) 1233 * Update the superblock for rdev with data in mddev 1234 * This does not write to disc. 1235 * 1236 */ 1237 1238 struct super_type { 1239 char *name; 1240 struct module *owner; 1241 int (*load_super)(struct md_rdev *rdev, 1242 struct md_rdev *refdev, 1243 int minor_version); 1244 int (*validate_super)(struct mddev *mddev, 1245 struct md_rdev *freshest, 1246 struct md_rdev *rdev); 1247 void (*sync_super)(struct mddev *mddev, 1248 struct md_rdev *rdev); 1249 unsigned long long (*rdev_size_change)(struct md_rdev *rdev, 1250 sector_t num_sectors); 1251 int (*allow_new_offset)(struct md_rdev *rdev, 1252 unsigned long long new_offset); 1253 }; 1254 1255 /* 1256 * Check that the given mddev has no bitmap. 1257 * 1258 * This function is called from the run method of all personalities that do not 1259 * support bitmaps. It prints an error message and returns non-zero if mddev 1260 * has a bitmap. Otherwise, it returns 0. 
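 *
 * A typical caller (sketch only) is a personality's run() method, which
 * refuses to start the array when a bitmap is present:
 *
 *	if (md_check_no_bitmap(mddev))
 *		return -EINVAL;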
1261 * 1262 */ 1263 int md_check_no_bitmap(struct mddev *mddev) 1264 { 1265 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) 1266 return 0; 1267 pr_warn("%s: bitmaps are not supported for %s\n", 1268 mdname(mddev), mddev->pers->name); 1269 return 1; 1270 } 1271 EXPORT_SYMBOL(md_check_no_bitmap); 1272 1273 /* 1274 * load_super for 0.90.0 1275 */ 1276 static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 1277 { 1278 mdp_super_t *sb; 1279 int ret; 1280 bool spare_disk = true; 1281 1282 /* 1283 * Calculate the position of the superblock (512byte sectors), 1284 * it's at the end of the disk. 1285 * 1286 * It also happens to be a multiple of 4Kb. 1287 */ 1288 rdev->sb_start = calc_dev_sboffset(rdev); 1289 1290 ret = read_disk_sb(rdev, MD_SB_BYTES); 1291 if (ret) 1292 return ret; 1293 1294 ret = -EINVAL; 1295 1296 sb = page_address(rdev->sb_page); 1297 1298 if (sb->md_magic != MD_SB_MAGIC) { 1299 pr_warn("md: invalid raid superblock magic on %pg\n", 1300 rdev->bdev); 1301 goto abort; 1302 } 1303 1304 if (sb->major_version != 0 || 1305 sb->minor_version < 90 || 1306 sb->minor_version > 91) { 1307 pr_warn("Bad version number %d.%d on %pg\n", 1308 sb->major_version, sb->minor_version, rdev->bdev); 1309 goto abort; 1310 } 1311 1312 if (sb->raid_disks <= 0) 1313 goto abort; 1314 1315 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { 1316 pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev); 1317 goto abort; 1318 } 1319 1320 rdev->preferred_minor = sb->md_minor; 1321 rdev->data_offset = 0; 1322 rdev->new_data_offset = 0; 1323 rdev->sb_size = MD_SB_BYTES; 1324 rdev->badblocks.shift = -1; 1325 1326 rdev->desc_nr = sb->this_disk.number; 1327 1328 /* not spare disk */ 1329 if (rdev->desc_nr >= 0 && rdev->desc_nr < MD_SB_DISKS && 1330 sb->disks[rdev->desc_nr].state & ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) 1331 spare_disk = false; 1332 1333 if (!refdev) { 1334 if (!spare_disk) 1335 ret = 1; 1336 else 1337 ret = 0; 1338 } else { 1339 __u64 ev1, ev2; 1340 mdp_super_t *refsb = page_address(refdev->sb_page); 1341 if (!md_uuid_equal(refsb, sb)) { 1342 pr_warn("md: %pg has different UUID to %pg\n", 1343 rdev->bdev, refdev->bdev); 1344 goto abort; 1345 } 1346 if (!md_sb_equal(refsb, sb)) { 1347 pr_warn("md: %pg has same UUID but different superblock to %pg\n", 1348 rdev->bdev, refdev->bdev); 1349 goto abort; 1350 } 1351 ev1 = md_event(sb); 1352 ev2 = md_event(refsb); 1353 1354 if (!spare_disk && ev1 > ev2) 1355 ret = 1; 1356 else 1357 ret = 0; 1358 } 1359 rdev->sectors = rdev->sb_start; 1360 /* Limit to 4TB as metadata cannot record more than that. 1361 * (not needed for Linear and RAID0 as metadata doesn't 1362 * record this size) 1363 */ 1364 if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1) 1365 rdev->sectors = (sector_t)(2ULL << 32) - 2; 1366 1367 if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) 1368 /* "this cannot possibly happen" ... 
*/ 1369 ret = -EINVAL; 1370 1371 abort: 1372 return ret; 1373 } 1374 1375 /* 1376 * validate_super for 0.90.0 1377 * note: we are not using "freshest" for 0.9 superblock 1378 */ 1379 static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) 1380 { 1381 mdp_disk_t *desc; 1382 mdp_super_t *sb = page_address(rdev->sb_page); 1383 __u64 ev1 = md_event(sb); 1384 1385 rdev->raid_disk = -1; 1386 clear_bit(Faulty, &rdev->flags); 1387 clear_bit(In_sync, &rdev->flags); 1388 clear_bit(Bitmap_sync, &rdev->flags); 1389 clear_bit(WriteMostly, &rdev->flags); 1390 1391 if (mddev->raid_disks == 0) { 1392 mddev->major_version = 0; 1393 mddev->minor_version = sb->minor_version; 1394 mddev->patch_version = sb->patch_version; 1395 mddev->external = 0; 1396 mddev->chunk_sectors = sb->chunk_size >> 9; 1397 mddev->ctime = sb->ctime; 1398 mddev->utime = sb->utime; 1399 mddev->level = sb->level; 1400 mddev->clevel[0] = 0; 1401 mddev->layout = sb->layout; 1402 mddev->raid_disks = sb->raid_disks; 1403 mddev->dev_sectors = ((sector_t)sb->size) * 2; 1404 mddev->events = ev1; 1405 mddev->bitmap_info.offset = 0; 1406 mddev->bitmap_info.space = 0; 1407 /* bitmap can use 60 K after the 4K superblocks */ 1408 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 1409 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 1410 mddev->reshape_backwards = 0; 1411 1412 if (mddev->minor_version >= 91) { 1413 mddev->reshape_position = sb->reshape_position; 1414 mddev->delta_disks = sb->delta_disks; 1415 mddev->new_level = sb->new_level; 1416 mddev->new_layout = sb->new_layout; 1417 mddev->new_chunk_sectors = sb->new_chunk >> 9; 1418 if (mddev->delta_disks < 0) 1419 mddev->reshape_backwards = 1; 1420 } else { 1421 mddev->reshape_position = MaxSector; 1422 mddev->delta_disks = 0; 1423 mddev->new_level = mddev->level; 1424 mddev->new_layout = mddev->layout; 1425 mddev->new_chunk_sectors = mddev->chunk_sectors; 1426 } 1427 if (mddev->level == 0) 1428 mddev->layout = -1; 1429 1430 if (sb->state & (1<<MD_SB_CLEAN)) 1431 mddev->recovery_cp = MaxSector; 1432 else { 1433 if (sb->events_hi == sb->cp_events_hi && 1434 sb->events_lo == sb->cp_events_lo) { 1435 mddev->recovery_cp = sb->recovery_cp; 1436 } else 1437 mddev->recovery_cp = 0; 1438 } 1439 1440 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 1441 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 1442 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 1443 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 1444 1445 mddev->max_disks = MD_SB_DISKS; 1446 1447 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 1448 mddev->bitmap_info.file == NULL) { 1449 mddev->bitmap_info.offset = 1450 mddev->bitmap_info.default_offset; 1451 mddev->bitmap_info.space = 1452 mddev->bitmap_info.default_space; 1453 } 1454 1455 } else if (mddev->pers == NULL) { 1456 /* Insist on good event counter while assembling, except 1457 * for spares (which don't need an event count) */ 1458 ++ev1; 1459 if (sb->disks[rdev->desc_nr].state & ( 1460 (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) 1461 if (ev1 < mddev->events) 1462 return -EINVAL; 1463 } else if (mddev->bitmap) { 1464 /* if adding to array with a bitmap, then we can accept an 1465 * older device ... but not too old. 
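 *
 * (If the device is even older than bitmap->events_cleared we return
 * here with raid_disk still -1, so it is treated like a newly added
 * spare and needs a full recovery; if it is only older than the array's
 * current event count, Bitmap_sync is set and the bitmap is used to
 * catch it up.)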
1466 */ 1467 if (ev1 < mddev->bitmap->events_cleared) 1468 return 0; 1469 if (ev1 < mddev->events) 1470 set_bit(Bitmap_sync, &rdev->flags); 1471 } else { 1472 if (ev1 < mddev->events) 1473 /* just a hot-add of a new device, leave raid_disk at -1 */ 1474 return 0; 1475 } 1476 1477 desc = sb->disks + rdev->desc_nr; 1478 1479 if (desc->state & (1<<MD_DISK_FAULTY)) 1480 set_bit(Faulty, &rdev->flags); 1481 else if (desc->state & (1<<MD_DISK_SYNC)) { 1482 set_bit(In_sync, &rdev->flags); 1483 rdev->raid_disk = desc->raid_disk; 1484 rdev->saved_raid_disk = desc->raid_disk; 1485 } else if (desc->state & (1<<MD_DISK_ACTIVE)) { 1486 /* active but not in sync implies recovery up to 1487 * reshape position. We don't know exactly where 1488 * that is, so set to zero for now 1489 */ 1490 if (mddev->minor_version >= 91) { 1491 rdev->recovery_offset = 0; 1492 rdev->raid_disk = desc->raid_disk; 1493 } 1494 } 1495 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 1496 set_bit(WriteMostly, &rdev->flags); 1497 if (desc->state & (1<<MD_DISK_FAILFAST)) 1498 set_bit(FailFast, &rdev->flags); 1499 return 0; 1500 } 1501 1502 /* 1503 * sync_super for 0.90.0 1504 */ 1505 static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) 1506 { 1507 mdp_super_t *sb; 1508 struct md_rdev *rdev2; 1509 int next_spare = mddev->raid_disks; 1510 1511 /* make rdev->sb match mddev data.. 1512 * 1513 * 1/ zero out disks 1514 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 1515 * 3/ any empty disks < next_spare become removed 1516 * 1517 * disks[0] gets initialised to REMOVED because 1518 * we cannot be sure from other fields if it has 1519 * been initialised or not. 1520 */ 1521 int i; 1522 int active=0, working=0,failed=0,spare=0,nr_disks=0; 1523 1524 rdev->sb_size = MD_SB_BYTES; 1525 1526 sb = page_address(rdev->sb_page); 1527 1528 memset(sb, 0, sizeof(*sb)); 1529 1530 sb->md_magic = MD_SB_MAGIC; 1531 sb->major_version = mddev->major_version; 1532 sb->patch_version = mddev->patch_version; 1533 sb->gvalid_words = 0; /* ignored */ 1534 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 1535 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 1536 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 1537 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 1538 1539 sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); 1540 sb->level = mddev->level; 1541 sb->size = mddev->dev_sectors / 2; 1542 sb->raid_disks = mddev->raid_disks; 1543 sb->md_minor = mddev->md_minor; 1544 sb->not_persistent = 0; 1545 sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 1546 sb->state = 0; 1547 sb->events_hi = (mddev->events>>32); 1548 sb->events_lo = (u32)mddev->events; 1549 1550 if (mddev->reshape_position == MaxSector) 1551 sb->minor_version = 90; 1552 else { 1553 sb->minor_version = 91; 1554 sb->reshape_position = mddev->reshape_position; 1555 sb->new_level = mddev->new_level; 1556 sb->delta_disks = mddev->delta_disks; 1557 sb->new_layout = mddev->new_layout; 1558 sb->new_chunk = mddev->new_chunk_sectors << 9; 1559 } 1560 mddev->minor_version = sb->minor_version; 1561 if (mddev->in_sync) 1562 { 1563 sb->recovery_cp = mddev->recovery_cp; 1564 sb->cp_events_hi = (mddev->events>>32); 1565 sb->cp_events_lo = (u32)mddev->events; 1566 if (mddev->recovery_cp == MaxSector) 1567 sb->state = (1<< MD_SB_CLEAN); 1568 } else 1569 sb->recovery_cp = 0; 1570 1571 sb->layout = mddev->layout; 1572 sb->chunk_size = mddev->chunk_sectors << 9; 1573 1574 if (mddev->bitmap && mddev->bitmap_info.file == NULL) 1575 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 1576 1577 
sb->disks[0].state = (1<<MD_DISK_REMOVED); 1578 rdev_for_each(rdev2, mddev) { 1579 mdp_disk_t *d; 1580 int desc_nr; 1581 int is_active = test_bit(In_sync, &rdev2->flags); 1582 1583 if (rdev2->raid_disk >= 0 && 1584 sb->minor_version >= 91) 1585 /* we have nowhere to store the recovery_offset, 1586 * but if it is not below the reshape_position, 1587 * we can piggy-back on that. 1588 */ 1589 is_active = 1; 1590 if (rdev2->raid_disk < 0 || 1591 test_bit(Faulty, &rdev2->flags)) 1592 is_active = 0; 1593 if (is_active) 1594 desc_nr = rdev2->raid_disk; 1595 else 1596 desc_nr = next_spare++; 1597 rdev2->desc_nr = desc_nr; 1598 d = &sb->disks[rdev2->desc_nr]; 1599 nr_disks++; 1600 d->number = rdev2->desc_nr; 1601 d->major = MAJOR(rdev2->bdev->bd_dev); 1602 d->minor = MINOR(rdev2->bdev->bd_dev); 1603 if (is_active) 1604 d->raid_disk = rdev2->raid_disk; 1605 else 1606 d->raid_disk = rdev2->desc_nr; /* compatibility */ 1607 if (test_bit(Faulty, &rdev2->flags)) 1608 d->state = (1<<MD_DISK_FAULTY); 1609 else if (is_active) { 1610 d->state = (1<<MD_DISK_ACTIVE); 1611 if (test_bit(In_sync, &rdev2->flags)) 1612 d->state |= (1<<MD_DISK_SYNC); 1613 active++; 1614 working++; 1615 } else { 1616 d->state = 0; 1617 spare++; 1618 working++; 1619 } 1620 if (test_bit(WriteMostly, &rdev2->flags)) 1621 d->state |= (1<<MD_DISK_WRITEMOSTLY); 1622 if (test_bit(FailFast, &rdev2->flags)) 1623 d->state |= (1<<MD_DISK_FAILFAST); 1624 } 1625 /* now set the "removed" and "faulty" bits on any missing devices */ 1626 for (i=0 ; i < mddev->raid_disks ; i++) { 1627 mdp_disk_t *d = &sb->disks[i]; 1628 if (d->state == 0 && d->number == 0) { 1629 d->number = i; 1630 d->raid_disk = i; 1631 d->state = (1<<MD_DISK_REMOVED); 1632 d->state |= (1<<MD_DISK_FAULTY); 1633 failed++; 1634 } 1635 } 1636 sb->nr_disks = nr_disks; 1637 sb->active_disks = active; 1638 sb->working_disks = working; 1639 sb->failed_disks = failed; 1640 sb->spare_disks = spare; 1641 1642 sb->this_disk = sb->disks[rdev->desc_nr]; 1643 sb->sb_csum = calc_sb_csum(sb); 1644 } 1645 1646 /* 1647 * rdev_size_change for 0.90.0 1648 */ 1649 static unsigned long long 1650 super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 1651 { 1652 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1653 return 0; /* component must fit device */ 1654 if (rdev->mddev->bitmap_info.offset) 1655 return 0; /* can't move bitmap */ 1656 rdev->sb_start = calc_dev_sboffset(rdev); 1657 if (!num_sectors || num_sectors > rdev->sb_start) 1658 num_sectors = rdev->sb_start; 1659 /* Limit to 4TB as metadata cannot record more than that. 1660 * 4TB == 2^32 KB, or 2*2^32 sectors. 
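 * ((2ULL << 32) - 2 sectors is 2^32 - 1 KB, the largest size the 32-bit,
 * KB-based 'size' field of the 0.90 superblock can record.)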
1661 */ 1662 if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) 1663 num_sectors = (sector_t)(2ULL << 32) - 2; 1664 do { 1665 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1666 rdev->sb_page); 1667 } while (md_super_wait(rdev->mddev) < 0); 1668 return num_sectors; 1669 } 1670 1671 static int 1672 super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) 1673 { 1674 /* non-zero offset changes not possible with v0.90 */ 1675 return new_offset == 0; 1676 } 1677 1678 /* 1679 * version 1 superblock 1680 */ 1681 1682 static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb) 1683 { 1684 __le32 disk_csum; 1685 u32 csum; 1686 unsigned long long newcsum; 1687 int size = 256 + le32_to_cpu(sb->max_dev)*2; 1688 __le32 *isuper = (__le32*)sb; 1689 1690 disk_csum = sb->sb_csum; 1691 sb->sb_csum = 0; 1692 newcsum = 0; 1693 for (; size >= 4; size -= 4) 1694 newcsum += le32_to_cpu(*isuper++); 1695 1696 if (size == 2) 1697 newcsum += le16_to_cpu(*(__le16*) isuper); 1698 1699 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 1700 sb->sb_csum = disk_csum; 1701 return cpu_to_le32(csum); 1702 } 1703 1704 static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 1705 { 1706 struct mdp_superblock_1 *sb; 1707 int ret; 1708 sector_t sb_start; 1709 sector_t sectors; 1710 int bmask; 1711 bool spare_disk = true; 1712 1713 /* 1714 * Calculate the position of the superblock in 512byte sectors. 1715 * It is always aligned to a 4K boundary and 1716 * depeding on minor_version, it can be: 1717 * 0: At least 8K, but less than 12K, from end of device 1718 * 1: At start of device 1719 * 2: 4K from start of device. 1720 */ 1721 switch(minor_version) { 1722 case 0: 1723 sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2; 1724 sb_start &= ~(sector_t)(4*2-1); 1725 break; 1726 case 1: 1727 sb_start = 0; 1728 break; 1729 case 2: 1730 sb_start = 8; 1731 break; 1732 default: 1733 return -EINVAL; 1734 } 1735 rdev->sb_start = sb_start; 1736 1737 /* superblock is rarely larger than 1K, but it can be larger, 1738 * and it is safe to read 4k, so we do that 1739 */ 1740 ret = read_disk_sb(rdev, 4096); 1741 if (ret) return ret; 1742 1743 sb = page_address(rdev->sb_page); 1744 1745 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1746 sb->major_version != cpu_to_le32(1) || 1747 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1748 le64_to_cpu(sb->super_offset) != rdev->sb_start || 1749 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1750 return -EINVAL; 1751 1752 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1753 pr_warn("md: invalid superblock checksum on %pg\n", 1754 rdev->bdev); 1755 return -EINVAL; 1756 } 1757 if (le64_to_cpu(sb->data_size) < 10) { 1758 pr_warn("md: data_size too small on %pg\n", 1759 rdev->bdev); 1760 return -EINVAL; 1761 } 1762 if (sb->pad0 || 1763 sb->pad3[0] || 1764 memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) 1765 /* Some padding is non-zero, might be a new feature */ 1766 return -EINVAL; 1767 1768 rdev->preferred_minor = 0xffff; 1769 rdev->data_offset = le64_to_cpu(sb->data_offset); 1770 rdev->new_data_offset = rdev->data_offset; 1771 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && 1772 (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) 1773 rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset); 1774 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1775 1776 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1777 bmask = 
queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 1778 if (rdev->sb_size & bmask) 1779 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1780 1781 if (minor_version 1782 && rdev->data_offset < sb_start + (rdev->sb_size/512)) 1783 return -EINVAL; 1784 if (minor_version 1785 && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) 1786 return -EINVAL; 1787 1788 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1789 1790 if (!rdev->bb_page) { 1791 rdev->bb_page = alloc_page(GFP_KERNEL); 1792 if (!rdev->bb_page) 1793 return -ENOMEM; 1794 } 1795 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && 1796 rdev->badblocks.count == 0) { 1797 /* need to load the bad block list. 1798 * Currently we limit it to one page. 1799 */ 1800 s32 offset; 1801 sector_t bb_sector; 1802 __le64 *bbp; 1803 int i; 1804 int sectors = le16_to_cpu(sb->bblog_size); 1805 if (sectors > (PAGE_SIZE / 512)) 1806 return -EINVAL; 1807 offset = le32_to_cpu(sb->bblog_offset); 1808 if (offset == 0) 1809 return -EINVAL; 1810 bb_sector = (long long)offset; 1811 if (!sync_page_io(rdev, bb_sector, sectors << 9, 1812 rdev->bb_page, REQ_OP_READ, true)) 1813 return -EIO; 1814 bbp = (__le64 *)page_address(rdev->bb_page); 1815 rdev->badblocks.shift = sb->bblog_shift; 1816 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { 1817 u64 bb = le64_to_cpu(*bbp); 1818 int count = bb & (0x3ff); 1819 u64 sector = bb >> 10; 1820 sector <<= sb->bblog_shift; 1821 count <<= sb->bblog_shift; 1822 if (bb + 1 == 0) 1823 break; 1824 if (badblocks_set(&rdev->badblocks, sector, count, 1)) 1825 return -EINVAL; 1826 } 1827 } else if (sb->bblog_offset != 0) 1828 rdev->badblocks.shift = 0; 1829 1830 if ((le32_to_cpu(sb->feature_map) & 1831 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) { 1832 rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset); 1833 rdev->ppl.size = le16_to_cpu(sb->ppl.size); 1834 rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset; 1835 } 1836 1837 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) && 1838 sb->level != 0) 1839 return -EINVAL; 1840 1841 /* not spare disk */ 1842 if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) && 1843 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || 1844 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) 1845 spare_disk = false; 1846 1847 if (!refdev) { 1848 if (!spare_disk) 1849 ret = 1; 1850 else 1851 ret = 0; 1852 } else { 1853 __u64 ev1, ev2; 1854 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); 1855 1856 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1857 sb->level != refsb->level || 1858 sb->layout != refsb->layout || 1859 sb->chunksize != refsb->chunksize) { 1860 pr_warn("md: %pg has strangely different superblock to %pg\n", 1861 rdev->bdev, 1862 refdev->bdev); 1863 return -EINVAL; 1864 } 1865 ev1 = le64_to_cpu(sb->events); 1866 ev2 = le64_to_cpu(refsb->events); 1867 1868 if (!spare_disk && ev1 > ev2) 1869 ret = 1; 1870 else 1871 ret = 0; 1872 } 1873 if (minor_version) 1874 sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; 1875 else 1876 sectors = rdev->sb_start; 1877 if (sectors < le64_to_cpu(sb->data_size)) 1878 return -EINVAL; 1879 rdev->sectors = le64_to_cpu(sb->data_size); 1880 return ret; 1881 } 1882 1883 static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev) 1884 { 1885 struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 1886 __u64 ev1 = le64_to_cpu(sb->events); 1887 int role; 1888 1889 rdev->raid_disk = -1; 1890 clear_bit(Faulty, &rdev->flags); 1891 
clear_bit(In_sync, &rdev->flags); 1892 clear_bit(Bitmap_sync, &rdev->flags); 1893 clear_bit(WriteMostly, &rdev->flags); 1894 1895 if (mddev->raid_disks == 0) { 1896 mddev->major_version = 1; 1897 mddev->patch_version = 0; 1898 mddev->external = 0; 1899 mddev->chunk_sectors = le32_to_cpu(sb->chunksize); 1900 mddev->ctime = le64_to_cpu(sb->ctime); 1901 mddev->utime = le64_to_cpu(sb->utime); 1902 mddev->level = le32_to_cpu(sb->level); 1903 mddev->clevel[0] = 0; 1904 mddev->layout = le32_to_cpu(sb->layout); 1905 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1906 mddev->dev_sectors = le64_to_cpu(sb->size); 1907 mddev->events = ev1; 1908 mddev->bitmap_info.offset = 0; 1909 mddev->bitmap_info.space = 0; 1910 /* Default location for bitmap is 1K after superblock 1911 * using 3K - total of 4K 1912 */ 1913 mddev->bitmap_info.default_offset = 1024 >> 9; 1914 mddev->bitmap_info.default_space = (4096-1024) >> 9; 1915 mddev->reshape_backwards = 0; 1916 1917 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 1918 memcpy(mddev->uuid, sb->set_uuid, 16); 1919 1920 mddev->max_disks = (4096-256)/2; 1921 1922 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1923 mddev->bitmap_info.file == NULL) { 1924 mddev->bitmap_info.offset = 1925 (__s32)le32_to_cpu(sb->bitmap_offset); 1926 /* Metadata doesn't record how much space is available. 1927 * For 1.0, we assume we can use up to the superblock 1928 * if before, else to 4K beyond superblock. 1929 * For others, assume no change is possible. 1930 */ 1931 if (mddev->minor_version > 0) 1932 mddev->bitmap_info.space = 0; 1933 else if (mddev->bitmap_info.offset > 0) 1934 mddev->bitmap_info.space = 1935 8 - mddev->bitmap_info.offset; 1936 else 1937 mddev->bitmap_info.space = 1938 -mddev->bitmap_info.offset; 1939 } 1940 1941 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 1942 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 1943 mddev->delta_disks = le32_to_cpu(sb->delta_disks); 1944 mddev->new_level = le32_to_cpu(sb->new_level); 1945 mddev->new_layout = le32_to_cpu(sb->new_layout); 1946 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); 1947 if (mddev->delta_disks < 0 || 1948 (mddev->delta_disks == 0 && 1949 (le32_to_cpu(sb->feature_map) 1950 & MD_FEATURE_RESHAPE_BACKWARDS))) 1951 mddev->reshape_backwards = 1; 1952 } else { 1953 mddev->reshape_position = MaxSector; 1954 mddev->delta_disks = 0; 1955 mddev->new_level = mddev->level; 1956 mddev->new_layout = mddev->layout; 1957 mddev->new_chunk_sectors = mddev->chunk_sectors; 1958 } 1959 1960 if (mddev->level == 0 && 1961 !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT)) 1962 mddev->layout = -1; 1963 1964 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) 1965 set_bit(MD_HAS_JOURNAL, &mddev->flags); 1966 1967 if (le32_to_cpu(sb->feature_map) & 1968 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) { 1969 if (le32_to_cpu(sb->feature_map) & 1970 (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL)) 1971 return -EINVAL; 1972 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) && 1973 (le32_to_cpu(sb->feature_map) & 1974 MD_FEATURE_MULTIPLE_PPLS)) 1975 return -EINVAL; 1976 set_bit(MD_HAS_PPL, &mddev->flags); 1977 } 1978 } else if (mddev->pers == NULL) { 1979 /* Insist of good event counter while assembling, except for 1980 * spares (which don't need an event count). 1981 * Similar to mdadm, we allow event counter difference of 1 1982 * from the freshest device. 
1983 */ 1984 if (rdev->desc_nr >= 0 && 1985 rdev->desc_nr < le32_to_cpu(sb->max_dev) && 1986 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || 1987 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) 1988 if (ev1 + 1 < mddev->events) 1989 return -EINVAL; 1990 } else if (mddev->bitmap) { 1991 /* If adding to array with a bitmap, then we can accept an 1992 * older device, but not too old. 1993 */ 1994 if (ev1 < mddev->bitmap->events_cleared) 1995 return 0; 1996 if (ev1 < mddev->events) 1997 set_bit(Bitmap_sync, &rdev->flags); 1998 } else { 1999 if (ev1 < mddev->events) 2000 /* just a hot-add of a new device, leave raid_disk at -1 */ 2001 return 0; 2002 } 2003 2004 if (rdev->desc_nr < 0 || 2005 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { 2006 role = MD_DISK_ROLE_SPARE; 2007 rdev->desc_nr = -1; 2008 } else if (mddev->pers == NULL && freshest && ev1 < mddev->events) { 2009 /* 2010 * If we are assembling, and our event counter is smaller than the 2011 * highest event counter, we cannot trust our superblock about the role. 2012 * It could happen that our rdev was marked as Faulty, and all other 2013 * superblocks were updated with +1 event counter. 2014 * Then, before the next superblock update, which typically happens when 2015 * remove_and_add_spares() removes the device from the array, there was 2016 * a crash or reboot. 2017 * If we allow current rdev without consulting the freshest superblock, 2018 * we could cause data corruption. 2019 * Note that in this case our event counter is smaller by 1 than the 2020 * highest, otherwise, this rdev would not be allowed into array; 2021 * both kernel and mdadm allow event counter difference of 1. 2022 */ 2023 struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page); 2024 u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev); 2025 2026 if (rdev->desc_nr >= freshest_max_dev) { 2027 /* this is unexpected, better not proceed */ 2028 pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n", 2029 mdname(mddev), rdev->bdev, rdev->desc_nr, 2030 freshest->bdev, freshest_max_dev); 2031 return -EUCLEAN; 2032 } 2033 2034 role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]); 2035 pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n", 2036 mdname(mddev), rdev->bdev, role, role, freshest->bdev); 2037 } else { 2038 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 2039 } 2040 switch (role) { 2041 case MD_DISK_ROLE_SPARE: /* spare */ 2042 break; 2043 case MD_DISK_ROLE_FAULTY: /* faulty */ 2044 set_bit(Faulty, &rdev->flags); 2045 break; 2046 case MD_DISK_ROLE_JOURNAL: /* journal device */ 2047 if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { 2048 /* journal device without journal feature */ 2049 pr_warn("md: journal device provided without journal feature, ignoring the device\n"); 2050 return -EINVAL; 2051 } 2052 set_bit(Journal, &rdev->flags); 2053 rdev->journal_tail = le64_to_cpu(sb->journal_tail); 2054 rdev->raid_disk = 0; 2055 break; 2056 default: 2057 rdev->saved_raid_disk = role; 2058 if ((le32_to_cpu(sb->feature_map) & 2059 MD_FEATURE_RECOVERY_OFFSET)) { 2060 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 2061 if (!(le32_to_cpu(sb->feature_map) & 2062 MD_FEATURE_RECOVERY_BITMAP)) 2063 rdev->saved_raid_disk = -1; 2064 } else { 2065 /* 2066 * If the array is FROZEN, then the device can't 2067 * be in_sync with rest of array. 
2068 */ 2069 if (!test_bit(MD_RECOVERY_FROZEN, 2070 &mddev->recovery)) 2071 set_bit(In_sync, &rdev->flags); 2072 } 2073 rdev->raid_disk = role; 2074 break; 2075 } 2076 if (sb->devflags & WriteMostly1) 2077 set_bit(WriteMostly, &rdev->flags); 2078 if (sb->devflags & FailFast1) 2079 set_bit(FailFast, &rdev->flags); 2080 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) 2081 set_bit(Replacement, &rdev->flags); 2082 2083 return 0; 2084 } 2085 2086 static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) 2087 { 2088 struct mdp_superblock_1 *sb; 2089 struct md_rdev *rdev2; 2090 int max_dev, i; 2091 /* make rdev->sb match mddev and rdev data. */ 2092 2093 sb = page_address(rdev->sb_page); 2094 2095 sb->feature_map = 0; 2096 sb->pad0 = 0; 2097 sb->recovery_offset = cpu_to_le64(0); 2098 memset(sb->pad3, 0, sizeof(sb->pad3)); 2099 2100 sb->utime = cpu_to_le64((__u64)mddev->utime); 2101 sb->events = cpu_to_le64(mddev->events); 2102 if (mddev->in_sync) 2103 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 2104 else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) 2105 sb->resync_offset = cpu_to_le64(MaxSector); 2106 else 2107 sb->resync_offset = cpu_to_le64(0); 2108 2109 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); 2110 2111 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 2112 sb->size = cpu_to_le64(mddev->dev_sectors); 2113 sb->chunksize = cpu_to_le32(mddev->chunk_sectors); 2114 sb->level = cpu_to_le32(mddev->level); 2115 sb->layout = cpu_to_le32(mddev->layout); 2116 if (test_bit(FailFast, &rdev->flags)) 2117 sb->devflags |= FailFast1; 2118 else 2119 sb->devflags &= ~FailFast1; 2120 2121 if (test_bit(WriteMostly, &rdev->flags)) 2122 sb->devflags |= WriteMostly1; 2123 else 2124 sb->devflags &= ~WriteMostly1; 2125 sb->data_offset = cpu_to_le64(rdev->data_offset); 2126 sb->data_size = cpu_to_le64(rdev->sectors); 2127 2128 if (mddev->bitmap && mddev->bitmap_info.file == NULL) { 2129 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); 2130 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 2131 } 2132 2133 if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && 2134 !test_bit(In_sync, &rdev->flags)) { 2135 sb->feature_map |= 2136 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 2137 sb->recovery_offset = 2138 cpu_to_le64(rdev->recovery_offset); 2139 if (rdev->saved_raid_disk >= 0 && mddev->bitmap) 2140 sb->feature_map |= 2141 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); 2142 } 2143 /* Note: recovery_offset and journal_tail share space */ 2144 if (test_bit(Journal, &rdev->flags)) 2145 sb->journal_tail = cpu_to_le64(rdev->journal_tail); 2146 if (test_bit(Replacement, &rdev->flags)) 2147 sb->feature_map |= 2148 cpu_to_le32(MD_FEATURE_REPLACEMENT); 2149 2150 if (mddev->reshape_position != MaxSector) { 2151 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 2152 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 2153 sb->new_layout = cpu_to_le32(mddev->new_layout); 2154 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 2155 sb->new_level = cpu_to_le32(mddev->new_level); 2156 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 2157 if (mddev->delta_disks == 0 && 2158 mddev->reshape_backwards) 2159 sb->feature_map 2160 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); 2161 if (rdev->new_data_offset != rdev->data_offset) { 2162 sb->feature_map 2163 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); 2164 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset 2165 - rdev->data_offset)); 2166 } 2167 } 2168 2169 if 
(mddev_is_clustered(mddev)) 2170 sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); 2171 2172 if (rdev->badblocks.count == 0) 2173 /* Nothing to do for bad blocks*/ ; 2174 else if (sb->bblog_offset == 0) 2175 /* Cannot record bad blocks on this device */ 2176 md_error(mddev, rdev); 2177 else { 2178 struct badblocks *bb = &rdev->badblocks; 2179 __le64 *bbp = (__le64 *)page_address(rdev->bb_page); 2180 u64 *p = bb->page; 2181 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); 2182 if (bb->changed) { 2183 unsigned seq; 2184 2185 retry: 2186 seq = read_seqbegin(&bb->lock); 2187 2188 memset(bbp, 0xff, PAGE_SIZE); 2189 2190 for (i = 0 ; i < bb->count ; i++) { 2191 u64 internal_bb = p[i]; 2192 u64 store_bb = ((BB_OFFSET(internal_bb) << 10) 2193 | BB_LEN(internal_bb)); 2194 bbp[i] = cpu_to_le64(store_bb); 2195 } 2196 bb->changed = 0; 2197 if (read_seqretry(&bb->lock, seq)) 2198 goto retry; 2199 2200 bb->sector = (rdev->sb_start + 2201 (int)le32_to_cpu(sb->bblog_offset)); 2202 bb->size = le16_to_cpu(sb->bblog_size); 2203 } 2204 } 2205 2206 max_dev = 0; 2207 rdev_for_each(rdev2, mddev) 2208 if (rdev2->desc_nr+1 > max_dev) 2209 max_dev = rdev2->desc_nr+1; 2210 2211 if (max_dev > le32_to_cpu(sb->max_dev)) { 2212 int bmask; 2213 sb->max_dev = cpu_to_le32(max_dev); 2214 rdev->sb_size = max_dev * 2 + 256; 2215 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 2216 if (rdev->sb_size & bmask) 2217 rdev->sb_size = (rdev->sb_size | bmask) + 1; 2218 } else 2219 max_dev = le32_to_cpu(sb->max_dev); 2220 2221 for (i=0; i<max_dev;i++) 2222 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 2223 2224 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) 2225 sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); 2226 2227 if (test_bit(MD_HAS_PPL, &mddev->flags)) { 2228 if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags)) 2229 sb->feature_map |= 2230 cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS); 2231 else 2232 sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); 2233 sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); 2234 sb->ppl.size = cpu_to_le16(rdev->ppl.size); 2235 } 2236 2237 rdev_for_each(rdev2, mddev) { 2238 i = rdev2->desc_nr; 2239 if (test_bit(Faulty, &rdev2->flags)) 2240 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); 2241 else if (test_bit(In_sync, &rdev2->flags)) 2242 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 2243 else if (test_bit(Journal, &rdev2->flags)) 2244 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); 2245 else if (rdev2->raid_disk >= 0) 2246 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 2247 else 2248 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 2249 } 2250 2251 sb->sb_csum = calc_sb_1_csum(sb); 2252 } 2253 2254 static sector_t super_1_choose_bm_space(sector_t dev_size) 2255 { 2256 sector_t bm_space; 2257 2258 /* if the device is bigger than 8Gig, save 64k for bitmap 2259 * usage, if bigger than 200Gig, save 128k 2260 */ 2261 if (dev_size < 64*2) 2262 bm_space = 0; 2263 else if (dev_size - 64*2 >= 200*1024*1024*2) 2264 bm_space = 128*2; 2265 else if (dev_size - 4*2 > 8*1024*1024*2) 2266 bm_space = 64*2; 2267 else 2268 bm_space = 4*2; 2269 return bm_space; 2270 } 2271 2272 static unsigned long long 2273 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 2274 { 2275 struct mdp_superblock_1 *sb; 2276 sector_t max_sectors; 2277 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 2278 return 0; /* component must fit device */ 2279 if (rdev->data_offset != rdev->new_data_offset) 2280 return 0; /* too confusing */ 2281 if (rdev->sb_start < 
rdev->data_offset) { 2282 /* minor versions 1 and 2; superblock before data */ 2283 max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; 2284 if (!num_sectors || num_sectors > max_sectors) 2285 num_sectors = max_sectors; 2286 } else if (rdev->mddev->bitmap_info.offset) { 2287 /* minor version 0 with bitmap we can't move */ 2288 return 0; 2289 } else { 2290 /* minor version 0; superblock after data */ 2291 sector_t sb_start, bm_space; 2292 sector_t dev_size = bdev_nr_sectors(rdev->bdev); 2293 2294 /* 8K is for superblock */ 2295 sb_start = dev_size - 8*2; 2296 sb_start &= ~(sector_t)(4*2 - 1); 2297 2298 bm_space = super_1_choose_bm_space(dev_size); 2299 2300 /* Space that can be used to store date needs to decrease 2301 * superblock bitmap space and bad block space(4K) 2302 */ 2303 max_sectors = sb_start - bm_space - 4*2; 2304 2305 if (!num_sectors || num_sectors > max_sectors) 2306 num_sectors = max_sectors; 2307 rdev->sb_start = sb_start; 2308 } 2309 sb = page_address(rdev->sb_page); 2310 sb->data_size = cpu_to_le64(num_sectors); 2311 sb->super_offset = cpu_to_le64(rdev->sb_start); 2312 sb->sb_csum = calc_sb_1_csum(sb); 2313 do { 2314 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 2315 rdev->sb_page); 2316 } while (md_super_wait(rdev->mddev) < 0); 2317 return num_sectors; 2318 2319 } 2320 2321 static int 2322 super_1_allow_new_offset(struct md_rdev *rdev, 2323 unsigned long long new_offset) 2324 { 2325 /* All necessary checks on new >= old have been done */ 2326 struct bitmap *bitmap; 2327 if (new_offset >= rdev->data_offset) 2328 return 1; 2329 2330 /* with 1.0 metadata, there is no metadata to tread on 2331 * so we can always move back */ 2332 if (rdev->mddev->minor_version == 0) 2333 return 1; 2334 2335 /* otherwise we must be sure not to step on 2336 * any metadata, so stay: 2337 * 36K beyond start of superblock 2338 * beyond end of badblocks 2339 * beyond write-intent bitmap 2340 */ 2341 if (rdev->sb_start + (32+4)*2 > new_offset) 2342 return 0; 2343 bitmap = rdev->mddev->bitmap; 2344 if (bitmap && !rdev->mddev->bitmap_info.file && 2345 rdev->sb_start + rdev->mddev->bitmap_info.offset + 2346 bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) 2347 return 0; 2348 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) 2349 return 0; 2350 2351 return 1; 2352 } 2353 2354 static struct super_type super_types[] = { 2355 [0] = { 2356 .name = "0.90.0", 2357 .owner = THIS_MODULE, 2358 .load_super = super_90_load, 2359 .validate_super = super_90_validate, 2360 .sync_super = super_90_sync, 2361 .rdev_size_change = super_90_rdev_size_change, 2362 .allow_new_offset = super_90_allow_new_offset, 2363 }, 2364 [1] = { 2365 .name = "md-1", 2366 .owner = THIS_MODULE, 2367 .load_super = super_1_load, 2368 .validate_super = super_1_validate, 2369 .sync_super = super_1_sync, 2370 .rdev_size_change = super_1_rdev_size_change, 2371 .allow_new_offset = super_1_allow_new_offset, 2372 }, 2373 }; 2374 2375 static void sync_super(struct mddev *mddev, struct md_rdev *rdev) 2376 { 2377 if (mddev->sync_super) { 2378 mddev->sync_super(mddev, rdev); 2379 return; 2380 } 2381 2382 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); 2383 2384 super_types[mddev->major_version].sync_super(mddev, rdev); 2385 } 2386 2387 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) 2388 { 2389 struct md_rdev *rdev, *rdev2; 2390 2391 rcu_read_lock(); 2392 rdev_for_each_rcu(rdev, mddev1) { 2393 if (test_bit(Faulty, &rdev->flags) || 2394 test_bit(Journal, &rdev->flags) 
|| 2395 rdev->raid_disk == -1) 2396 continue; 2397 rdev_for_each_rcu(rdev2, mddev2) { 2398 if (test_bit(Faulty, &rdev2->flags) || 2399 test_bit(Journal, &rdev2->flags) || 2400 rdev2->raid_disk == -1) 2401 continue; 2402 if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) { 2403 rcu_read_unlock(); 2404 return 1; 2405 } 2406 } 2407 } 2408 rcu_read_unlock(); 2409 return 0; 2410 } 2411 2412 static LIST_HEAD(pending_raid_disks); 2413 2414 /* 2415 * Try to register data integrity profile for an mddev 2416 * 2417 * This is called when an array is started and after a disk has been kicked 2418 * from the array. It only succeeds if all working and active component devices 2419 * are integrity capable with matching profiles. 2420 */ 2421 int md_integrity_register(struct mddev *mddev) 2422 { 2423 if (list_empty(&mddev->disks)) 2424 return 0; /* nothing to do */ 2425 if (mddev_is_dm(mddev) || !blk_get_integrity(mddev->gendisk)) 2426 return 0; /* shouldn't register */ 2427 2428 pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); 2429 if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || 2430 (mddev->level != 1 && mddev->level != 10 && 2431 bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) { 2432 /* 2433 * No need to handle the failure of bioset_integrity_create, 2434 * because the function is called by md_run() -> pers->run(), 2435 * md_run calls bioset_exit -> bioset_integrity_free in case 2436 * of failure case. 2437 */ 2438 pr_err("md: failed to create integrity pool for %s\n", 2439 mdname(mddev)); 2440 return -EINVAL; 2441 } 2442 return 0; 2443 } 2444 EXPORT_SYMBOL(md_integrity_register); 2445 2446 static bool rdev_read_only(struct md_rdev *rdev) 2447 { 2448 return bdev_read_only(rdev->bdev) || 2449 (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev)); 2450 } 2451 2452 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) 2453 { 2454 char b[BDEVNAME_SIZE]; 2455 int err; 2456 2457 /* prevent duplicates */ 2458 if (find_rdev(mddev, rdev->bdev->bd_dev)) 2459 return -EEXIST; 2460 2461 if (rdev_read_only(rdev) && mddev->pers) 2462 return -EROFS; 2463 2464 /* make sure rdev->sectors exceeds mddev->dev_sectors */ 2465 if (!test_bit(Journal, &rdev->flags) && 2466 rdev->sectors && 2467 (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { 2468 if (mddev->pers) { 2469 /* Cannot change size, so fail 2470 * If mddev->level <= 0, then we don't care 2471 * about aligning sizes (e.g. linear) 2472 */ 2473 if (mddev->level > 0) 2474 return -ENOSPC; 2475 } else 2476 mddev->dev_sectors = rdev->sectors; 2477 } 2478 2479 /* Verify rdev->desc_nr is unique. 
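	 * (desc_nr is this device's index into the superblock's dev_roles[]
	 * table; when the array is already running, the search below starts
	 * at raid_disks so a new spare lands above the active slots.)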
2480 * If it is -1, assign a free number, else 2481 * check number is not in use 2482 */ 2483 rcu_read_lock(); 2484 if (rdev->desc_nr < 0) { 2485 int choice = 0; 2486 if (mddev->pers) 2487 choice = mddev->raid_disks; 2488 while (md_find_rdev_nr_rcu(mddev, choice)) 2489 choice++; 2490 rdev->desc_nr = choice; 2491 } else { 2492 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { 2493 rcu_read_unlock(); 2494 return -EBUSY; 2495 } 2496 } 2497 rcu_read_unlock(); 2498 if (!test_bit(Journal, &rdev->flags) && 2499 mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { 2500 pr_warn("md: %s: array is limited to %d devices\n", 2501 mdname(mddev), mddev->max_disks); 2502 return -EBUSY; 2503 } 2504 snprintf(b, sizeof(b), "%pg", rdev->bdev); 2505 strreplace(b, '/', '!'); 2506 2507 rdev->mddev = mddev; 2508 pr_debug("md: bind<%s>\n", b); 2509 2510 if (mddev->raid_disks) 2511 mddev_create_serial_pool(mddev, rdev); 2512 2513 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 2514 goto fail; 2515 2516 /* failure here is OK */ 2517 err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block"); 2518 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); 2519 rdev->sysfs_unack_badblocks = 2520 sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks"); 2521 rdev->sysfs_badblocks = 2522 sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks"); 2523 2524 list_add_rcu(&rdev->same_set, &mddev->disks); 2525 bd_link_disk_holder(rdev->bdev, mddev->gendisk); 2526 2527 /* May as well allow recovery to be retried once */ 2528 mddev->recovery_disabled++; 2529 2530 return 0; 2531 2532 fail: 2533 pr_warn("md: failed to register dev-%s for %s\n", 2534 b, mdname(mddev)); 2535 mddev_destroy_serial_pool(mddev, rdev); 2536 return err; 2537 } 2538 2539 void md_autodetect_dev(dev_t dev); 2540 2541 /* just for claiming the bdev */ 2542 static struct md_rdev claim_rdev; 2543 2544 static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) 2545 { 2546 pr_debug("md: export_rdev(%pg)\n", rdev->bdev); 2547 md_rdev_clear(rdev); 2548 #ifndef MODULE 2549 if (test_bit(AutoDetected, &rdev->flags)) 2550 md_autodetect_dev(rdev->bdev->bd_dev); 2551 #endif 2552 fput(rdev->bdev_file); 2553 rdev->bdev = NULL; 2554 kobject_put(&rdev->kobj); 2555 } 2556 2557 static void md_kick_rdev_from_array(struct md_rdev *rdev) 2558 { 2559 struct mddev *mddev = rdev->mddev; 2560 2561 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); 2562 list_del_rcu(&rdev->same_set); 2563 pr_debug("md: unbind<%pg>\n", rdev->bdev); 2564 mddev_destroy_serial_pool(rdev->mddev, rdev); 2565 WRITE_ONCE(rdev->mddev, NULL); 2566 sysfs_remove_link(&rdev->kobj, "block"); 2567 sysfs_put(rdev->sysfs_state); 2568 sysfs_put(rdev->sysfs_unack_badblocks); 2569 sysfs_put(rdev->sysfs_badblocks); 2570 rdev->sysfs_state = NULL; 2571 rdev->sysfs_unack_badblocks = NULL; 2572 rdev->sysfs_badblocks = NULL; 2573 rdev->badblocks.count = 0; 2574 2575 synchronize_rcu(); 2576 2577 /* 2578 * kobject_del() will wait for all in progress writers to be done, where 2579 * reconfig_mutex is held, hence it can't be called under 2580 * reconfig_mutex and it's delayed to mddev_unlock(). 
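	 * To that end the rdev is parked on the mddev->deleting list below;
	 * mddev_unlock() walks that list after dropping reconfig_mutex and
	 * completes the teardown there.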
2581 */ 2582 list_add(&rdev->same_set, &mddev->deleting); 2583 } 2584 2585 static void export_array(struct mddev *mddev) 2586 { 2587 struct md_rdev *rdev; 2588 2589 while (!list_empty(&mddev->disks)) { 2590 rdev = list_first_entry(&mddev->disks, struct md_rdev, 2591 same_set); 2592 md_kick_rdev_from_array(rdev); 2593 } 2594 mddev->raid_disks = 0; 2595 mddev->major_version = 0; 2596 } 2597 2598 static bool set_in_sync(struct mddev *mddev) 2599 { 2600 lockdep_assert_held(&mddev->lock); 2601 if (!mddev->in_sync) { 2602 mddev->sync_checkers++; 2603 spin_unlock(&mddev->lock); 2604 percpu_ref_switch_to_atomic_sync(&mddev->writes_pending); 2605 spin_lock(&mddev->lock); 2606 if (!mddev->in_sync && 2607 percpu_ref_is_zero(&mddev->writes_pending)) { 2608 mddev->in_sync = 1; 2609 /* 2610 * Ensure ->in_sync is visible before we clear 2611 * ->sync_checkers. 2612 */ 2613 smp_mb(); 2614 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2615 sysfs_notify_dirent_safe(mddev->sysfs_state); 2616 } 2617 if (--mddev->sync_checkers == 0) 2618 percpu_ref_switch_to_percpu(&mddev->writes_pending); 2619 } 2620 if (mddev->safemode == 1) 2621 mddev->safemode = 0; 2622 return mddev->in_sync; 2623 } 2624 2625 static void sync_sbs(struct mddev *mddev, int nospares) 2626 { 2627 /* Update each superblock (in-memory image), but 2628 * if we are allowed to, skip spares which already 2629 * have the right event counter, or have one earlier 2630 * (which would mean they aren't being marked as dirty 2631 * with the rest of the array) 2632 */ 2633 struct md_rdev *rdev; 2634 rdev_for_each(rdev, mddev) { 2635 if (rdev->sb_events == mddev->events || 2636 (nospares && 2637 rdev->raid_disk < 0 && 2638 rdev->sb_events+1 == mddev->events)) { 2639 /* Don't update this superblock */ 2640 rdev->sb_loaded = 2; 2641 } else { 2642 sync_super(mddev, rdev); 2643 rdev->sb_loaded = 1; 2644 } 2645 } 2646 } 2647 2648 static bool does_sb_need_changing(struct mddev *mddev) 2649 { 2650 struct md_rdev *rdev = NULL, *iter; 2651 struct mdp_superblock_1 *sb; 2652 int role; 2653 2654 /* Find a good rdev */ 2655 rdev_for_each(iter, mddev) 2656 if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) { 2657 rdev = iter; 2658 break; 2659 } 2660 2661 /* No good device found. */ 2662 if (!rdev) 2663 return false; 2664 2665 sb = page_address(rdev->sb_page); 2666 /* Check if a device has become faulty or a spare become active */ 2667 rdev_for_each(rdev, mddev) { 2668 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 2669 /* Device activated? */ 2670 if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 && 2671 !test_bit(Faulty, &rdev->flags)) 2672 return true; 2673 /* Device turned faulty? 
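		 * i.e. the in-core rdev is now Faulty while the on-disk role
		 * still names an active slot (role < MD_DISK_ROLE_MAX).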
*/ 2674 if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX)) 2675 return true; 2676 } 2677 2678 /* Check if any mddev parameters have changed */ 2679 if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || 2680 (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || 2681 (mddev->layout != le32_to_cpu(sb->layout)) || 2682 (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || 2683 (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) 2684 return true; 2685 2686 return false; 2687 } 2688 2689 void md_update_sb(struct mddev *mddev, int force_change) 2690 { 2691 struct md_rdev *rdev; 2692 int sync_req; 2693 int nospares = 0; 2694 int any_badblocks_changed = 0; 2695 int ret = -1; 2696 2697 if (!md_is_rdwr(mddev)) { 2698 if (force_change) 2699 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2700 return; 2701 } 2702 2703 repeat: 2704 if (mddev_is_clustered(mddev)) { 2705 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2706 force_change = 1; 2707 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 2708 nospares = 1; 2709 ret = md_cluster_ops->metadata_update_start(mddev); 2710 /* Has someone else has updated the sb */ 2711 if (!does_sb_need_changing(mddev)) { 2712 if (ret == 0) 2713 md_cluster_ops->metadata_update_cancel(mddev); 2714 bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2715 BIT(MD_SB_CHANGE_DEVS) | 2716 BIT(MD_SB_CHANGE_CLEAN)); 2717 return; 2718 } 2719 } 2720 2721 /* 2722 * First make sure individual recovery_offsets are correct 2723 * curr_resync_completed can only be used during recovery. 2724 * During reshape/resync it might use array-addresses rather 2725 * that device addresses. 2726 */ 2727 rdev_for_each(rdev, mddev) { 2728 if (rdev->raid_disk >= 0 && 2729 mddev->delta_disks >= 0 && 2730 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 2731 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && 2732 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 2733 !test_bit(Journal, &rdev->flags) && 2734 !test_bit(In_sync, &rdev->flags) && 2735 mddev->curr_resync_completed > rdev->recovery_offset) 2736 rdev->recovery_offset = mddev->curr_resync_completed; 2737 2738 } 2739 if (!mddev->persistent) { 2740 clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2741 clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2742 if (!mddev->external) { 2743 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 2744 rdev_for_each(rdev, mddev) { 2745 if (rdev->badblocks.changed) { 2746 rdev->badblocks.changed = 0; 2747 ack_all_badblocks(&rdev->badblocks); 2748 md_error(mddev, rdev); 2749 } 2750 clear_bit(Blocked, &rdev->flags); 2751 clear_bit(BlockedBadBlocks, &rdev->flags); 2752 wake_up(&rdev->blocked_wait); 2753 } 2754 } 2755 wake_up(&mddev->sb_wait); 2756 return; 2757 } 2758 2759 spin_lock(&mddev->lock); 2760 2761 mddev->utime = ktime_get_real_seconds(); 2762 2763 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2764 force_change = 1; 2765 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 2766 /* just a clean<-> dirty transition, possibly leave spares alone, 2767 * though if events isn't the right even/odd, we will have to do 2768 * spares after all 2769 */ 2770 nospares = 1; 2771 if (force_change) 2772 nospares = 0; 2773 if (mddev->degraded) 2774 /* If the array is degraded, then skipping spares is both 2775 * dangerous and fairly pointless. 2776 * Dangerous because a device that was removed from the array 2777 * might have a event_count that still looks up-to-date, 2778 * so it can be re-added without a resync. 
2779 * Pointless because if there are any spares to skip, 2780 * then a recovery will happen and soon that array won't 2781 * be degraded any more and the spare can go back to sleep then. 2782 */ 2783 nospares = 0; 2784 2785 sync_req = mddev->in_sync; 2786 2787 /* If this is just a dirty<->clean transition, and the array is clean 2788 * and 'events' is odd, we can roll back to the previous clean state */ 2789 if (nospares 2790 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 2791 && mddev->can_decrease_events 2792 && mddev->events != 1) { 2793 mddev->events--; 2794 mddev->can_decrease_events = 0; 2795 } else { 2796 /* otherwise we have to go forward and ... */ 2797 mddev->events ++; 2798 mddev->can_decrease_events = nospares; 2799 } 2800 2801 /* 2802 * This 64-bit counter should never wrap. 2803 * Either we are in around ~1 trillion A.C., assuming 2804 * 1 reboot per second, or we have a bug... 2805 */ 2806 WARN_ON(mddev->events == 0); 2807 2808 rdev_for_each(rdev, mddev) { 2809 if (rdev->badblocks.changed) 2810 any_badblocks_changed++; 2811 if (test_bit(Faulty, &rdev->flags)) 2812 set_bit(FaultRecorded, &rdev->flags); 2813 } 2814 2815 sync_sbs(mddev, nospares); 2816 spin_unlock(&mddev->lock); 2817 2818 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", 2819 mdname(mddev), mddev->in_sync); 2820 2821 mddev_add_trace_msg(mddev, "md md_update_sb"); 2822 rewrite: 2823 md_bitmap_update_sb(mddev->bitmap); 2824 rdev_for_each(rdev, mddev) { 2825 if (rdev->sb_loaded != 1) 2826 continue; /* no noise on spare devices */ 2827 2828 if (!test_bit(Faulty, &rdev->flags)) { 2829 md_super_write(mddev,rdev, 2830 rdev->sb_start, rdev->sb_size, 2831 rdev->sb_page); 2832 pr_debug("md: (write) %pg's sb offset: %llu\n", 2833 rdev->bdev, 2834 (unsigned long long)rdev->sb_start); 2835 rdev->sb_events = mddev->events; 2836 if (rdev->badblocks.size) { 2837 md_super_write(mddev, rdev, 2838 rdev->badblocks.sector, 2839 rdev->badblocks.size << 9, 2840 rdev->bb_page); 2841 rdev->badblocks.size = 0; 2842 } 2843 2844 } else 2845 pr_debug("md: %pg (skipping faulty)\n", 2846 rdev->bdev); 2847 } 2848 if (md_super_wait(mddev) < 0) 2849 goto rewrite; 2850 /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ 2851 2852 if (mddev_is_clustered(mddev) && ret == 0) 2853 md_cluster_ops->metadata_update_finish(mddev); 2854 2855 if (mddev->in_sync != sync_req || 2856 !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2857 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN))) 2858 /* have to write it out again */ 2859 goto repeat; 2860 wake_up(&mddev->sb_wait); 2861 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2862 sysfs_notify_dirent_safe(mddev->sysfs_completed); 2863 2864 rdev_for_each(rdev, mddev) { 2865 if (test_and_clear_bit(FaultRecorded, &rdev->flags)) 2866 clear_bit(Blocked, &rdev->flags); 2867 2868 if (any_badblocks_changed) 2869 ack_all_badblocks(&rdev->badblocks); 2870 clear_bit(BlockedBadBlocks, &rdev->flags); 2871 wake_up(&rdev->blocked_wait); 2872 } 2873 } 2874 EXPORT_SYMBOL(md_update_sb); 2875 2876 static int add_bound_rdev(struct md_rdev *rdev) 2877 { 2878 struct mddev *mddev = rdev->mddev; 2879 int err = 0; 2880 bool add_journal = test_bit(Journal, &rdev->flags); 2881 2882 if (!mddev->pers->hot_remove_disk || add_journal) { 2883 /* If there is hot_add_disk but no hot_remove_disk 2884 * then added disks for geometry changes, 2885 * and should be added immediately. 2886 */ 2887 super_types[mddev->major_version]. 
2888 validate_super(mddev, NULL/*freshest*/, rdev); 2889 err = mddev->pers->hot_add_disk(mddev, rdev); 2890 if (err) { 2891 md_kick_rdev_from_array(rdev); 2892 return err; 2893 } 2894 } 2895 sysfs_notify_dirent_safe(rdev->sysfs_state); 2896 2897 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2898 if (mddev->degraded) 2899 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 2900 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2901 md_new_event(); 2902 return 0; 2903 } 2904 2905 /* words written to sysfs files may, or may not, be \n terminated. 2906 * We want to accept with case. For this we use cmd_match. 2907 */ 2908 static int cmd_match(const char *cmd, const char *str) 2909 { 2910 /* See if cmd, written into a sysfs file, matches 2911 * str. They must either be the same, or cmd can 2912 * have a trailing newline 2913 */ 2914 while (*cmd && *str && *cmd == *str) { 2915 cmd++; 2916 str++; 2917 } 2918 if (*cmd == '\n') 2919 cmd++; 2920 if (*str || *cmd) 2921 return 0; 2922 return 1; 2923 } 2924 2925 struct rdev_sysfs_entry { 2926 struct attribute attr; 2927 ssize_t (*show)(struct md_rdev *, char *); 2928 ssize_t (*store)(struct md_rdev *, const char *, size_t); 2929 }; 2930 2931 static ssize_t 2932 state_show(struct md_rdev *rdev, char *page) 2933 { 2934 char *sep = ","; 2935 size_t len = 0; 2936 unsigned long flags = READ_ONCE(rdev->flags); 2937 2938 if (test_bit(Faulty, &flags) || 2939 (!test_bit(ExternalBbl, &flags) && 2940 rdev->badblocks.unacked_exist)) 2941 len += sprintf(page+len, "faulty%s", sep); 2942 if (test_bit(In_sync, &flags)) 2943 len += sprintf(page+len, "in_sync%s", sep); 2944 if (test_bit(Journal, &flags)) 2945 len += sprintf(page+len, "journal%s", sep); 2946 if (test_bit(WriteMostly, &flags)) 2947 len += sprintf(page+len, "write_mostly%s", sep); 2948 if (test_bit(Blocked, &flags) || 2949 (rdev->badblocks.unacked_exist 2950 && !test_bit(Faulty, &flags))) 2951 len += sprintf(page+len, "blocked%s", sep); 2952 if (!test_bit(Faulty, &flags) && 2953 !test_bit(Journal, &flags) && 2954 !test_bit(In_sync, &flags)) 2955 len += sprintf(page+len, "spare%s", sep); 2956 if (test_bit(WriteErrorSeen, &flags)) 2957 len += sprintf(page+len, "write_error%s", sep); 2958 if (test_bit(WantReplacement, &flags)) 2959 len += sprintf(page+len, "want_replacement%s", sep); 2960 if (test_bit(Replacement, &flags)) 2961 len += sprintf(page+len, "replacement%s", sep); 2962 if (test_bit(ExternalBbl, &flags)) 2963 len += sprintf(page+len, "external_bbl%s", sep); 2964 if (test_bit(FailFast, &flags)) 2965 len += sprintf(page+len, "failfast%s", sep); 2966 2967 if (len) 2968 len -= strlen(sep); 2969 2970 return len+sprintf(page+len, "\n"); 2971 } 2972 2973 static ssize_t 2974 state_store(struct md_rdev *rdev, const char *buf, size_t len) 2975 { 2976 /* can write 2977 * faulty - simulates an error 2978 * remove - disconnects the device 2979 * writemostly - sets write_mostly 2980 * -writemostly - clears write_mostly 2981 * blocked - sets the Blocked flags 2982 * -blocked - clears the Blocked and possibly simulates an error 2983 * insync - sets Insync providing device isn't active 2984 * -insync - clear Insync for a device with a slot assigned, 2985 * so that it gets rebuilt based on bitmap 2986 * write_error - sets WriteErrorSeen 2987 * -write_error - clears WriteErrorSeen 2988 * {,-}failfast - set/clear FailFast 2989 */ 2990 2991 struct mddev *mddev = rdev->mddev; 2992 int err = -EINVAL; 2993 bool need_update_sb = false; 2994 2995 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 2996 md_error(rdev->mddev, rdev); 
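		/*
		 * md_error() may mark the whole array MD_BROKEN when the
		 * personality cannot survive losing this device (raid0/linear,
		 * or redundancy already exhausted); that case is reported back
		 * to the writer as -EBUSY below.
		 */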
2997 2998 if (test_bit(MD_BROKEN, &rdev->mddev->flags)) 2999 err = -EBUSY; 3000 else 3001 err = 0; 3002 } else if (cmd_match(buf, "remove")) { 3003 if (rdev->mddev->pers) { 3004 clear_bit(Blocked, &rdev->flags); 3005 remove_and_add_spares(rdev->mddev, rdev); 3006 } 3007 if (rdev->raid_disk >= 0) 3008 err = -EBUSY; 3009 else { 3010 err = 0; 3011 if (mddev_is_clustered(mddev)) 3012 err = md_cluster_ops->remove_disk(mddev, rdev); 3013 3014 if (err == 0) { 3015 md_kick_rdev_from_array(rdev); 3016 if (mddev->pers) 3017 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 3018 md_new_event(); 3019 } 3020 } 3021 } else if (cmd_match(buf, "writemostly")) { 3022 set_bit(WriteMostly, &rdev->flags); 3023 mddev_create_serial_pool(rdev->mddev, rdev); 3024 need_update_sb = true; 3025 err = 0; 3026 } else if (cmd_match(buf, "-writemostly")) { 3027 mddev_destroy_serial_pool(rdev->mddev, rdev); 3028 clear_bit(WriteMostly, &rdev->flags); 3029 need_update_sb = true; 3030 err = 0; 3031 } else if (cmd_match(buf, "blocked")) { 3032 set_bit(Blocked, &rdev->flags); 3033 err = 0; 3034 } else if (cmd_match(buf, "-blocked")) { 3035 if (!test_bit(Faulty, &rdev->flags) && 3036 !test_bit(ExternalBbl, &rdev->flags) && 3037 rdev->badblocks.unacked_exist) { 3038 /* metadata handler doesn't understand badblocks, 3039 * so we need to fail the device 3040 */ 3041 md_error(rdev->mddev, rdev); 3042 } 3043 clear_bit(Blocked, &rdev->flags); 3044 clear_bit(BlockedBadBlocks, &rdev->flags); 3045 wake_up(&rdev->blocked_wait); 3046 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3047 3048 err = 0; 3049 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 3050 set_bit(In_sync, &rdev->flags); 3051 err = 0; 3052 } else if (cmd_match(buf, "failfast")) { 3053 set_bit(FailFast, &rdev->flags); 3054 need_update_sb = true; 3055 err = 0; 3056 } else if (cmd_match(buf, "-failfast")) { 3057 clear_bit(FailFast, &rdev->flags); 3058 need_update_sb = true; 3059 err = 0; 3060 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && 3061 !test_bit(Journal, &rdev->flags)) { 3062 if (rdev->mddev->pers == NULL) { 3063 clear_bit(In_sync, &rdev->flags); 3064 rdev->saved_raid_disk = rdev->raid_disk; 3065 rdev->raid_disk = -1; 3066 err = 0; 3067 } 3068 } else if (cmd_match(buf, "write_error")) { 3069 set_bit(WriteErrorSeen, &rdev->flags); 3070 err = 0; 3071 } else if (cmd_match(buf, "-write_error")) { 3072 clear_bit(WriteErrorSeen, &rdev->flags); 3073 err = 0; 3074 } else if (cmd_match(buf, "want_replacement")) { 3075 /* Any non-spare device that is not a replacement can 3076 * become want_replacement at any time, but we then need to 3077 * check if recovery is needed. 3078 */ 3079 if (rdev->raid_disk >= 0 && 3080 !test_bit(Journal, &rdev->flags) && 3081 !test_bit(Replacement, &rdev->flags)) 3082 set_bit(WantReplacement, &rdev->flags); 3083 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3084 err = 0; 3085 } else if (cmd_match(buf, "-want_replacement")) { 3086 /* Clearing 'want_replacement' is always allowed. 3087 * Once replacements starts it is too late though. 3088 */ 3089 err = 0; 3090 clear_bit(WantReplacement, &rdev->flags); 3091 } else if (cmd_match(buf, "replacement")) { 3092 /* Can only set a device as a replacement when array has not 3093 * yet been started. Once running, replacement is automatic 3094 * from spares, or by assigning 'slot'. 
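		 * (A common sequence is to mark the ageing member
		 * 'want_replacement' and then add the new device as a spare or
		 * assign it the same 'slot'; recovery rebuilds onto it before
		 * the old member is retired.)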
3095 */ 3096 if (rdev->mddev->pers) 3097 err = -EBUSY; 3098 else { 3099 set_bit(Replacement, &rdev->flags); 3100 err = 0; 3101 } 3102 } else if (cmd_match(buf, "-replacement")) { 3103 /* Similarly, can only clear Replacement before start */ 3104 if (rdev->mddev->pers) 3105 err = -EBUSY; 3106 else { 3107 clear_bit(Replacement, &rdev->flags); 3108 err = 0; 3109 } 3110 } else if (cmd_match(buf, "re-add")) { 3111 if (!rdev->mddev->pers) 3112 err = -EINVAL; 3113 else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) && 3114 rdev->saved_raid_disk >= 0) { 3115 /* clear_bit is performed _after_ all the devices 3116 * have their local Faulty bit cleared. If any writes 3117 * happen in the meantime in the local node, they 3118 * will land in the local bitmap, which will be synced 3119 * by this node eventually 3120 */ 3121 if (!mddev_is_clustered(rdev->mddev) || 3122 (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { 3123 clear_bit(Faulty, &rdev->flags); 3124 err = add_bound_rdev(rdev); 3125 } 3126 } else 3127 err = -EBUSY; 3128 } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) { 3129 set_bit(ExternalBbl, &rdev->flags); 3130 rdev->badblocks.shift = 0; 3131 err = 0; 3132 } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) { 3133 clear_bit(ExternalBbl, &rdev->flags); 3134 err = 0; 3135 } 3136 if (need_update_sb) 3137 md_update_sb(mddev, 1); 3138 if (!err) 3139 sysfs_notify_dirent_safe(rdev->sysfs_state); 3140 return err ? err : len; 3141 } 3142 static struct rdev_sysfs_entry rdev_state = 3143 __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store); 3144 3145 static ssize_t 3146 errors_show(struct md_rdev *rdev, char *page) 3147 { 3148 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 3149 } 3150 3151 static ssize_t 3152 errors_store(struct md_rdev *rdev, const char *buf, size_t len) 3153 { 3154 unsigned int n; 3155 int rv; 3156 3157 rv = kstrtouint(buf, 10, &n); 3158 if (rv < 0) 3159 return rv; 3160 atomic_set(&rdev->corrected_errors, n); 3161 return len; 3162 } 3163 static struct rdev_sysfs_entry rdev_errors = 3164 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 3165 3166 static ssize_t 3167 slot_show(struct md_rdev *rdev, char *page) 3168 { 3169 if (test_bit(Journal, &rdev->flags)) 3170 return sprintf(page, "journal\n"); 3171 else if (rdev->raid_disk < 0) 3172 return sprintf(page, "none\n"); 3173 else 3174 return sprintf(page, "%d\n", rdev->raid_disk); 3175 } 3176 3177 static ssize_t 3178 slot_store(struct md_rdev *rdev, const char *buf, size_t len) 3179 { 3180 int slot; 3181 int err; 3182 3183 if (test_bit(Journal, &rdev->flags)) 3184 return -EBUSY; 3185 if (strncmp(buf, "none", 4)==0) 3186 slot = -1; 3187 else { 3188 err = kstrtouint(buf, 10, (unsigned int *)&slot); 3189 if (err < 0) 3190 return err; 3191 if (slot < 0) 3192 /* overflow */ 3193 return -ENOSPC; 3194 } 3195 if (rdev->mddev->pers && slot == -1) { 3196 /* Setting 'slot' on an active array requires also 3197 * updating the 'rd%d' link, and communicating 3198 * with the personality with ->hot_*_disk. 3199 * For now we only support removing 3200 * failed/spare devices. This normally happens automatically, 3201 * but not when the metadata is externally managed. 
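		 * An external metadata manager (mdmon, for example) does this
		 * by writing "none" to the 'slot' attribute; the
		 * strncmp("none") handling above turns that into slot == -1.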
3202 */ 3203 if (rdev->raid_disk == -1) 3204 return -EEXIST; 3205 /* personality does all needed checks */ 3206 if (rdev->mddev->pers->hot_remove_disk == NULL) 3207 return -EINVAL; 3208 clear_bit(Blocked, &rdev->flags); 3209 remove_and_add_spares(rdev->mddev, rdev); 3210 if (rdev->raid_disk >= 0) 3211 return -EBUSY; 3212 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3213 } else if (rdev->mddev->pers) { 3214 /* Activating a spare .. or possibly reactivating 3215 * if we ever get bitmaps working here. 3216 */ 3217 int err; 3218 3219 if (rdev->raid_disk != -1) 3220 return -EBUSY; 3221 3222 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) 3223 return -EBUSY; 3224 3225 if (rdev->mddev->pers->hot_add_disk == NULL) 3226 return -EINVAL; 3227 3228 if (slot >= rdev->mddev->raid_disks && 3229 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 3230 return -ENOSPC; 3231 3232 rdev->raid_disk = slot; 3233 if (test_bit(In_sync, &rdev->flags)) 3234 rdev->saved_raid_disk = slot; 3235 else 3236 rdev->saved_raid_disk = -1; 3237 clear_bit(In_sync, &rdev->flags); 3238 clear_bit(Bitmap_sync, &rdev->flags); 3239 err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev); 3240 if (err) { 3241 rdev->raid_disk = -1; 3242 return err; 3243 } else 3244 sysfs_notify_dirent_safe(rdev->sysfs_state); 3245 /* failure here is OK */; 3246 sysfs_link_rdev(rdev->mddev, rdev); 3247 /* don't wakeup anyone, leave that to userspace. */ 3248 } else { 3249 if (slot >= rdev->mddev->raid_disks && 3250 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 3251 return -ENOSPC; 3252 rdev->raid_disk = slot; 3253 /* assume it is working */ 3254 clear_bit(Faulty, &rdev->flags); 3255 clear_bit(WriteMostly, &rdev->flags); 3256 set_bit(In_sync, &rdev->flags); 3257 sysfs_notify_dirent_safe(rdev->sysfs_state); 3258 } 3259 return len; 3260 } 3261 3262 static struct rdev_sysfs_entry rdev_slot = 3263 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 3264 3265 static ssize_t 3266 offset_show(struct md_rdev *rdev, char *page) 3267 { 3268 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 3269 } 3270 3271 static ssize_t 3272 offset_store(struct md_rdev *rdev, const char *buf, size_t len) 3273 { 3274 unsigned long long offset; 3275 if (kstrtoull(buf, 10, &offset) < 0) 3276 return -EINVAL; 3277 if (rdev->mddev->pers && rdev->raid_disk >= 0) 3278 return -EBUSY; 3279 if (rdev->sectors && rdev->mddev->external) 3280 /* Must set offset before size, so overlap checks 3281 * can be sane */ 3282 return -EBUSY; 3283 rdev->data_offset = offset; 3284 rdev->new_data_offset = offset; 3285 return len; 3286 } 3287 3288 static struct rdev_sysfs_entry rdev_offset = 3289 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 3290 3291 static ssize_t new_offset_show(struct md_rdev *rdev, char *page) 3292 { 3293 return sprintf(page, "%llu\n", 3294 (unsigned long long)rdev->new_data_offset); 3295 } 3296 3297 static ssize_t new_offset_store(struct md_rdev *rdev, 3298 const char *buf, size_t len) 3299 { 3300 unsigned long long new_offset; 3301 struct mddev *mddev = rdev->mddev; 3302 3303 if (kstrtoull(buf, 10, &new_offset) < 0) 3304 return -EINVAL; 3305 3306 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3307 return -EBUSY; 3308 if (new_offset == rdev->data_offset) 3309 /* reset is always permitted */ 3310 ; 3311 else if (new_offset > rdev->data_offset) { 3312 /* must not push array size beyond rdev_sectors */ 3313 if (new_offset - rdev->data_offset 3314 + mddev->dev_sectors > rdev->sectors) 3315 return -E2BIG; 
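		/*
		 * Worked example: with mddev->dev_sectors == 1000 and
		 * rdev->sectors == 1024, data_offset may move forward by at
		 * most 24 sectors before the check above fails with -E2BIG.
		 */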
3316 } 3317 /* Metadata worries about other space details. */ 3318 3319 /* decreasing the offset is inconsistent with a backwards 3320 * reshape. 3321 */ 3322 if (new_offset < rdev->data_offset && 3323 mddev->reshape_backwards) 3324 return -EINVAL; 3325 /* Increasing offset is inconsistent with forwards 3326 * reshape. reshape_direction should be set to 3327 * 'backwards' first. 3328 */ 3329 if (new_offset > rdev->data_offset && 3330 !mddev->reshape_backwards) 3331 return -EINVAL; 3332 3333 if (mddev->pers && mddev->persistent && 3334 !super_types[mddev->major_version] 3335 .allow_new_offset(rdev, new_offset)) 3336 return -E2BIG; 3337 rdev->new_data_offset = new_offset; 3338 if (new_offset > rdev->data_offset) 3339 mddev->reshape_backwards = 1; 3340 else if (new_offset < rdev->data_offset) 3341 mddev->reshape_backwards = 0; 3342 3343 return len; 3344 } 3345 static struct rdev_sysfs_entry rdev_new_offset = 3346 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); 3347 3348 static ssize_t 3349 rdev_size_show(struct md_rdev *rdev, char *page) 3350 { 3351 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); 3352 } 3353 3354 static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b) 3355 { 3356 /* check if two start/length pairs overlap */ 3357 if (a->data_offset + a->sectors <= b->data_offset) 3358 return false; 3359 if (b->data_offset + b->sectors <= a->data_offset) 3360 return false; 3361 return true; 3362 } 3363 3364 static bool md_rdev_overlaps(struct md_rdev *rdev) 3365 { 3366 struct mddev *mddev; 3367 struct md_rdev *rdev2; 3368 3369 spin_lock(&all_mddevs_lock); 3370 list_for_each_entry(mddev, &all_mddevs, all_mddevs) { 3371 if (test_bit(MD_DELETED, &mddev->flags)) 3372 continue; 3373 rdev_for_each(rdev2, mddev) { 3374 if (rdev != rdev2 && rdev->bdev == rdev2->bdev && 3375 md_rdevs_overlap(rdev, rdev2)) { 3376 spin_unlock(&all_mddevs_lock); 3377 return true; 3378 } 3379 } 3380 } 3381 spin_unlock(&all_mddevs_lock); 3382 return false; 3383 } 3384 3385 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) 3386 { 3387 unsigned long long blocks; 3388 sector_t new; 3389 3390 if (kstrtoull(buf, 10, &blocks) < 0) 3391 return -EINVAL; 3392 3393 if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) 3394 return -EINVAL; /* sector conversion overflow */ 3395 3396 new = blocks * 2; 3397 if (new != blocks * 2) 3398 return -EINVAL; /* unsigned long long to sector_t overflow */ 3399 3400 *sectors = new; 3401 return 0; 3402 } 3403 3404 static ssize_t 3405 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) 3406 { 3407 struct mddev *my_mddev = rdev->mddev; 3408 sector_t oldsectors = rdev->sectors; 3409 sector_t sectors; 3410 3411 if (test_bit(Journal, &rdev->flags)) 3412 return -EBUSY; 3413 if (strict_blocks_to_sectors(buf, §ors) < 0) 3414 return -EINVAL; 3415 if (rdev->data_offset != rdev->new_data_offset) 3416 return -EINVAL; /* too confusing */ 3417 if (my_mddev->pers && rdev->raid_disk >= 0) { 3418 if (my_mddev->persistent) { 3419 sectors = super_types[my_mddev->major_version]. 
3420 rdev_size_change(rdev, sectors); 3421 if (!sectors) 3422 return -EBUSY; 3423 } else if (!sectors) 3424 sectors = bdev_nr_sectors(rdev->bdev) - 3425 rdev->data_offset; 3426 if (!my_mddev->pers->resize) 3427 /* Cannot change size for RAID0 or Linear etc */ 3428 return -EINVAL; 3429 } 3430 if (sectors < my_mddev->dev_sectors) 3431 return -EINVAL; /* component must fit device */ 3432 3433 rdev->sectors = sectors; 3434 3435 /* 3436 * Check that all other rdevs with the same bdev do not overlap. This 3437 * check does not provide a hard guarantee, it just helps avoid 3438 * dangerous mistakes. 3439 */ 3440 if (sectors > oldsectors && my_mddev->external && 3441 md_rdev_overlaps(rdev)) { 3442 /* 3443 * Someone else could have slipped in a size change here, but 3444 * doing so is just silly. We put oldsectors back because we 3445 * know it is safe, and trust userspace not to race with itself. 3446 */ 3447 rdev->sectors = oldsectors; 3448 return -EBUSY; 3449 } 3450 return len; 3451 } 3452 3453 static struct rdev_sysfs_entry rdev_size = 3454 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 3455 3456 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) 3457 { 3458 unsigned long long recovery_start = rdev->recovery_offset; 3459 3460 if (test_bit(In_sync, &rdev->flags) || 3461 recovery_start == MaxSector) 3462 return sprintf(page, "none\n"); 3463 3464 return sprintf(page, "%llu\n", recovery_start); 3465 } 3466 3467 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) 3468 { 3469 unsigned long long recovery_start; 3470 3471 if (cmd_match(buf, "none")) 3472 recovery_start = MaxSector; 3473 else if (kstrtoull(buf, 10, &recovery_start)) 3474 return -EINVAL; 3475 3476 if (rdev->mddev->pers && 3477 rdev->raid_disk >= 0) 3478 return -EBUSY; 3479 3480 rdev->recovery_offset = recovery_start; 3481 if (recovery_start == MaxSector) 3482 set_bit(In_sync, &rdev->flags); 3483 else 3484 clear_bit(In_sync, &rdev->flags); 3485 return len; 3486 } 3487 3488 static struct rdev_sysfs_entry rdev_recovery_start = 3489 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); 3490 3491 /* sysfs access to bad-blocks list. 3492 * We present two files. 3493 * 'bad-blocks' lists sector numbers and lengths of ranges that 3494 * are recorded as bad. The list is truncated to fit within 3495 * the one-page limit of sysfs. 3496 * Writing "sector length" to this file adds an acknowledged 3497 * bad block list. 3498 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet 3499 * been acknowledged. Writing to this file adds bad blocks 3500 * without acknowledging them. This is largely for testing. 
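 * For example (illustrative path, the dev-XXX name depends on the
 * member device):
 *   echo "1000 8" > /sys/block/md0/md/dev-sda/bad_blocks
 * records an acknowledged 8-sector bad range starting at sector 1000;
 * the same write to unacknowledged_bad_blocks leaves it unacknowledged.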
3501 */ 3502 static ssize_t bb_show(struct md_rdev *rdev, char *page) 3503 { 3504 return badblocks_show(&rdev->badblocks, page, 0); 3505 } 3506 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) 3507 { 3508 int rv = badblocks_store(&rdev->badblocks, page, len, 0); 3509 /* Maybe that ack was all we needed */ 3510 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) 3511 wake_up(&rdev->blocked_wait); 3512 return rv; 3513 } 3514 static struct rdev_sysfs_entry rdev_bad_blocks = 3515 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); 3516 3517 static ssize_t ubb_show(struct md_rdev *rdev, char *page) 3518 { 3519 return badblocks_show(&rdev->badblocks, page, 1); 3520 } 3521 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) 3522 { 3523 return badblocks_store(&rdev->badblocks, page, len, 1); 3524 } 3525 static struct rdev_sysfs_entry rdev_unack_bad_blocks = 3526 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); 3527 3528 static ssize_t 3529 ppl_sector_show(struct md_rdev *rdev, char *page) 3530 { 3531 return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector); 3532 } 3533 3534 static ssize_t 3535 ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len) 3536 { 3537 unsigned long long sector; 3538 3539 if (kstrtoull(buf, 10, §or) < 0) 3540 return -EINVAL; 3541 if (sector != (sector_t)sector) 3542 return -EINVAL; 3543 3544 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && 3545 rdev->raid_disk >= 0) 3546 return -EBUSY; 3547 3548 if (rdev->mddev->persistent) { 3549 if (rdev->mddev->major_version == 0) 3550 return -EINVAL; 3551 if ((sector > rdev->sb_start && 3552 sector - rdev->sb_start > S16_MAX) || 3553 (sector < rdev->sb_start && 3554 rdev->sb_start - sector > -S16_MIN)) 3555 return -EINVAL; 3556 rdev->ppl.offset = sector - rdev->sb_start; 3557 } else if (!rdev->mddev->external) { 3558 return -EBUSY; 3559 } 3560 rdev->ppl.sector = sector; 3561 return len; 3562 } 3563 3564 static struct rdev_sysfs_entry rdev_ppl_sector = 3565 __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store); 3566 3567 static ssize_t 3568 ppl_size_show(struct md_rdev *rdev, char *page) 3569 { 3570 return sprintf(page, "%u\n", rdev->ppl.size); 3571 } 3572 3573 static ssize_t 3574 ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len) 3575 { 3576 unsigned int size; 3577 3578 if (kstrtouint(buf, 10, &size) < 0) 3579 return -EINVAL; 3580 3581 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && 3582 rdev->raid_disk >= 0) 3583 return -EBUSY; 3584 3585 if (rdev->mddev->persistent) { 3586 if (rdev->mddev->major_version == 0) 3587 return -EINVAL; 3588 if (size > U16_MAX) 3589 return -EINVAL; 3590 } else if (!rdev->mddev->external) { 3591 return -EBUSY; 3592 } 3593 rdev->ppl.size = size; 3594 return len; 3595 } 3596 3597 static struct rdev_sysfs_entry rdev_ppl_size = 3598 __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store); 3599 3600 static struct attribute *rdev_default_attrs[] = { 3601 &rdev_state.attr, 3602 &rdev_errors.attr, 3603 &rdev_slot.attr, 3604 &rdev_offset.attr, 3605 &rdev_new_offset.attr, 3606 &rdev_size.attr, 3607 &rdev_recovery_start.attr, 3608 &rdev_bad_blocks.attr, 3609 &rdev_unack_bad_blocks.attr, 3610 &rdev_ppl_sector.attr, 3611 &rdev_ppl_size.attr, 3612 NULL, 3613 }; 3614 ATTRIBUTE_GROUPS(rdev_default); 3615 static ssize_t 3616 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3617 { 3618 struct rdev_sysfs_entry 
*entry = container_of(attr, struct rdev_sysfs_entry, attr); 3619 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3620 3621 if (!entry->show) 3622 return -EIO; 3623 if (!rdev->mddev) 3624 return -ENODEV; 3625 return entry->show(rdev, page); 3626 } 3627 3628 static ssize_t 3629 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 3630 const char *page, size_t length) 3631 { 3632 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3633 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3634 struct kernfs_node *kn = NULL; 3635 bool suspend = false; 3636 ssize_t rv; 3637 struct mddev *mddev = READ_ONCE(rdev->mddev); 3638 3639 if (!entry->store) 3640 return -EIO; 3641 if (!capable(CAP_SYS_ADMIN)) 3642 return -EACCES; 3643 if (!mddev) 3644 return -ENODEV; 3645 3646 if (entry->store == state_store) { 3647 if (cmd_match(page, "remove")) 3648 kn = sysfs_break_active_protection(kobj, attr); 3649 if (cmd_match(page, "remove") || cmd_match(page, "re-add") || 3650 cmd_match(page, "writemostly") || 3651 cmd_match(page, "-writemostly")) 3652 suspend = true; 3653 } 3654 3655 rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev); 3656 if (!rv) { 3657 if (rdev->mddev == NULL) 3658 rv = -ENODEV; 3659 else 3660 rv = entry->store(rdev, page, length); 3661 suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); 3662 } 3663 3664 if (kn) 3665 sysfs_unbreak_active_protection(kn); 3666 3667 return rv; 3668 } 3669 3670 static void rdev_free(struct kobject *ko) 3671 { 3672 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); 3673 kfree(rdev); 3674 } 3675 static const struct sysfs_ops rdev_sysfs_ops = { 3676 .show = rdev_attr_show, 3677 .store = rdev_attr_store, 3678 }; 3679 static const struct kobj_type rdev_ktype = { 3680 .release = rdev_free, 3681 .sysfs_ops = &rdev_sysfs_ops, 3682 .default_groups = rdev_default_groups, 3683 }; 3684 3685 int md_rdev_init(struct md_rdev *rdev) 3686 { 3687 rdev->desc_nr = -1; 3688 rdev->saved_raid_disk = -1; 3689 rdev->raid_disk = -1; 3690 rdev->flags = 0; 3691 rdev->data_offset = 0; 3692 rdev->new_data_offset = 0; 3693 rdev->sb_events = 0; 3694 rdev->last_read_error = 0; 3695 rdev->sb_loaded = 0; 3696 rdev->bb_page = NULL; 3697 atomic_set(&rdev->nr_pending, 0); 3698 atomic_set(&rdev->read_errors, 0); 3699 atomic_set(&rdev->corrected_errors, 0); 3700 3701 INIT_LIST_HEAD(&rdev->same_set); 3702 init_waitqueue_head(&rdev->blocked_wait); 3703 3704 /* Add space to store bad block list. 3705 * This reserves the space even on arrays where it cannot 3706 * be used - I wonder if that matters 3707 */ 3708 return badblocks_init(&rdev->badblocks, 0); 3709 } 3710 EXPORT_SYMBOL_GPL(md_rdev_init); 3711 3712 /* 3713 * Import a device. If 'super_format' >= 0, then sanity check the superblock 3714 * 3715 * mark the device faulty if: 3716 * 3717 * - the device is nonexistent (zero size) 3718 * - the device has no valid superblock 3719 * 3720 * a faulty rdev _never_ has rdev->sb set. 
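 * 'super_format' selects an entry in super_types[] (0 for 0.90 metadata,
 * 1 for 1.x); a negative value skips the superblock check entirely, and
 * -2 additionally claims the bdev with the shared 'claim_rdev' holder
 * instead of the rdev itself.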
3721 */ 3722 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) 3723 { 3724 struct md_rdev *rdev; 3725 sector_t size; 3726 int err; 3727 3728 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 3729 if (!rdev) 3730 return ERR_PTR(-ENOMEM); 3731 3732 err = md_rdev_init(rdev); 3733 if (err) 3734 goto out_free_rdev; 3735 err = alloc_disk_sb(rdev); 3736 if (err) 3737 goto out_clear_rdev; 3738 3739 rdev->bdev_file = bdev_file_open_by_dev(newdev, 3740 BLK_OPEN_READ | BLK_OPEN_WRITE, 3741 super_format == -2 ? &claim_rdev : rdev, NULL); 3742 if (IS_ERR(rdev->bdev_file)) { 3743 pr_warn("md: could not open device unknown-block(%u,%u).\n", 3744 MAJOR(newdev), MINOR(newdev)); 3745 err = PTR_ERR(rdev->bdev_file); 3746 goto out_clear_rdev; 3747 } 3748 rdev->bdev = file_bdev(rdev->bdev_file); 3749 3750 kobject_init(&rdev->kobj, &rdev_ktype); 3751 3752 size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS; 3753 if (!size) { 3754 pr_warn("md: %pg has zero or unknown size, marking faulty!\n", 3755 rdev->bdev); 3756 err = -EINVAL; 3757 goto out_blkdev_put; 3758 } 3759 3760 if (super_format >= 0) { 3761 err = super_types[super_format]. 3762 load_super(rdev, NULL, super_minor); 3763 if (err == -EINVAL) { 3764 pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n", 3765 rdev->bdev, 3766 super_format, super_minor); 3767 goto out_blkdev_put; 3768 } 3769 if (err < 0) { 3770 pr_warn("md: could not read %pg's sb, not importing!\n", 3771 rdev->bdev); 3772 goto out_blkdev_put; 3773 } 3774 } 3775 3776 return rdev; 3777 3778 out_blkdev_put: 3779 fput(rdev->bdev_file); 3780 out_clear_rdev: 3781 md_rdev_clear(rdev); 3782 out_free_rdev: 3783 kfree(rdev); 3784 return ERR_PTR(err); 3785 } 3786 3787 /* 3788 * Check a full RAID array for plausibility 3789 */ 3790 3791 static int analyze_sbs(struct mddev *mddev) 3792 { 3793 int i; 3794 struct md_rdev *rdev, *freshest, *tmp; 3795 3796 freshest = NULL; 3797 rdev_for_each_safe(rdev, tmp, mddev) 3798 switch (super_types[mddev->major_version]. 3799 load_super(rdev, freshest, mddev->minor_version)) { 3800 case 1: 3801 freshest = rdev; 3802 break; 3803 case 0: 3804 break; 3805 default: 3806 pr_warn("md: fatal superblock inconsistency in %pg -- removing from array\n", 3807 rdev->bdev); 3808 md_kick_rdev_from_array(rdev); 3809 } 3810 3811 /* Cannot find a valid fresh disk */ 3812 if (!freshest) { 3813 pr_warn("md: cannot find a valid disk\n"); 3814 return -EINVAL; 3815 } 3816 3817 super_types[mddev->major_version]. 3818 validate_super(mddev, NULL/*freshest*/, freshest); 3819 3820 i = 0; 3821 rdev_for_each_safe(rdev, tmp, mddev) { 3822 if (mddev->max_disks && 3823 (rdev->desc_nr >= mddev->max_disks || 3824 i > mddev->max_disks)) { 3825 pr_warn("md: %s: %pg: only %d devices permitted\n", 3826 mdname(mddev), rdev->bdev, 3827 mddev->max_disks); 3828 md_kick_rdev_from_array(rdev); 3829 continue; 3830 } 3831 if (rdev != freshest) { 3832 if (super_types[mddev->major_version]. 3833 validate_super(mddev, freshest, rdev)) { 3834 pr_warn("md: kicking non-fresh %pg from array!\n", 3835 rdev->bdev); 3836 md_kick_rdev_from_array(rdev); 3837 continue; 3838 } 3839 } 3840 if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks)) && 3841 !test_bit(Journal, &rdev->flags)) { 3842 rdev->raid_disk = -1; 3843 clear_bit(In_sync, &rdev->flags); 3844 } 3845 } 3846 3847 return 0; 3848 } 3849 3850 /* Read a fixed-point number. 3851 * Numbers in sysfs attributes should be in "standard" units where 3852 * possible, so time should be in seconds. 
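 * (safe_mode_delay below, for instance, is presented in seconds but
 * parsed into milliseconds with scale == 3, so the string "0.125"
 * becomes 125.)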
3853 * However we internally use a much smaller unit such as
3854 * milliseconds or jiffies.
3855 * This function takes a decimal number with a possible fractional
3856 * component, and produces an integer which is the result of
3857 * multiplying that number by 10^'scale',
3858 * all without any floating-point arithmetic. For example, with scale 3, "0.200" yields 200 and "5" yields 5000.
3859 */
3860 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
3861 {
3862 unsigned long result = 0;
3863 long decimals = -1;
3864 while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
3865 if (*cp == '.')
3866 decimals = 0;
3867 else if (decimals < scale) {
3868 unsigned int value;
3869 value = *cp - '0';
3870 result = result * 10 + value;
3871 if (decimals >= 0)
3872 decimals++;
3873 }
3874 cp++;
3875 }
3876 if (*cp == '\n')
3877 cp++;
3878 if (*cp)
3879 return -EINVAL;
3880 if (decimals < 0)
3881 decimals = 0;
3882 *res = result * int_pow(10, scale - decimals);
3883 return 0;
3884 }
3885
3886 static ssize_t
3887 safe_delay_show(struct mddev *mddev, char *page)
3888 {
3889 unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ;
3890
3891 return sprintf(page, "%u.%03u\n", msec/1000, msec%1000);
3892 }
3893 static ssize_t
3894 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3895 {
3896 unsigned long msec;
3897
3898 if (mddev_is_clustered(mddev)) {
3899 pr_warn("md: Safemode is disabled for clustered mode\n");
3900 return -EINVAL;
3901 }
3902
3903 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ)
3904 return -EINVAL;
3905 if (msec == 0)
3906 mddev->safemode_delay = 0;
3907 else {
3908 unsigned long old_delay = mddev->safemode_delay;
3909 unsigned long new_delay = (msec*HZ)/1000;
3910
3911 if (new_delay == 0)
3912 new_delay = 1;
3913 mddev->safemode_delay = new_delay;
3914 if (new_delay < old_delay || old_delay == 0)
3915 mod_timer(&mddev->safemode_timer, jiffies+1);
3916 }
3917 return len;
3918 }
3919 static struct md_sysfs_entry md_safe_delay =
3920 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR, safe_delay_show, safe_delay_store);
3921
3922 static ssize_t
3923 level_show(struct mddev *mddev, char *page)
3924 {
3925 struct md_personality *p;
3926 int ret;
3927 spin_lock(&mddev->lock);
3928 p = mddev->pers;
3929 if (p)
3930 ret = sprintf(page, "%s\n", p->name);
3931 else if (mddev->clevel[0])
3932 ret = sprintf(page, "%s\n", mddev->clevel);
3933 else if (mddev->level != LEVEL_NONE)
3934 ret = sprintf(page, "%d\n", mddev->level);
3935 else
3936 ret = 0;
3937 spin_unlock(&mddev->lock);
3938 return ret;
3939 }
3940
3941 static ssize_t
3942 level_store(struct mddev *mddev, const char *buf, size_t len)
3943 {
3944 char clevel[16];
3945 ssize_t rv;
3946 size_t slen = len;
3947 struct md_personality *pers, *oldpers;
3948 long level;
3949 void *priv, *oldpriv;
3950 struct md_rdev *rdev;
3951
3952 if (slen == 0 || slen >= sizeof(clevel))
3953 return -EINVAL;
3954
3955 rv = mddev_suspend_and_lock(mddev);
3956 if (rv)
3957 return rv;
3958
3959 if (mddev->pers == NULL) {
3960 memcpy(mddev->clevel, buf, slen);
3961 if (mddev->clevel[slen-1] == '\n')
3962 slen--;
3963 mddev->clevel[slen] = 0;
3964 mddev->level = LEVEL_NONE;
3965 rv = len;
3966 goto out_unlock;
3967 }
3968 rv = -EROFS;
3969 if (!md_is_rdwr(mddev))
3970 goto out_unlock;
3971
3972 /* request to change the personality. Need to ensure:
3973 * - array is not engaged in resync/recovery/reshape
3974 * - old personality can be suspended
3975 * - the new personality can take over the array.
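*
* In practice this path is driven from user space by writing a new level
* name (e.g. "raid5") to the array's 'level' sysfs attribute; whether the
* conversion is accepted is ultimately decided by the new personality's
* ->takeover() method below.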
3976 */ 3977 3978 rv = -EBUSY; 3979 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3980 mddev->reshape_position != MaxSector || 3981 mddev->sysfs_active) 3982 goto out_unlock; 3983 3984 rv = -EINVAL; 3985 if (!mddev->pers->quiesce) { 3986 pr_warn("md: %s: %s does not support online personality change\n", 3987 mdname(mddev), mddev->pers->name); 3988 goto out_unlock; 3989 } 3990 3991 /* Now find the new personality */ 3992 memcpy(clevel, buf, slen); 3993 if (clevel[slen-1] == '\n') 3994 slen--; 3995 clevel[slen] = 0; 3996 if (kstrtol(clevel, 10, &level)) 3997 level = LEVEL_NONE; 3998 3999 if (request_module("md-%s", clevel) != 0) 4000 request_module("md-level-%s", clevel); 4001 spin_lock(&pers_lock); 4002 pers = find_pers(level, clevel); 4003 if (!pers || !try_module_get(pers->owner)) { 4004 spin_unlock(&pers_lock); 4005 pr_warn("md: personality %s not loaded\n", clevel); 4006 rv = -EINVAL; 4007 goto out_unlock; 4008 } 4009 spin_unlock(&pers_lock); 4010 4011 if (pers == mddev->pers) { 4012 /* Nothing to do! */ 4013 module_put(pers->owner); 4014 rv = len; 4015 goto out_unlock; 4016 } 4017 if (!pers->takeover) { 4018 module_put(pers->owner); 4019 pr_warn("md: %s: %s does not support personality takeover\n", 4020 mdname(mddev), clevel); 4021 rv = -EINVAL; 4022 goto out_unlock; 4023 } 4024 4025 rdev_for_each(rdev, mddev) 4026 rdev->new_raid_disk = rdev->raid_disk; 4027 4028 /* ->takeover must set new_* and/or delta_disks 4029 * if it succeeds, and may set them when it fails. 4030 */ 4031 priv = pers->takeover(mddev); 4032 if (IS_ERR(priv)) { 4033 mddev->new_level = mddev->level; 4034 mddev->new_layout = mddev->layout; 4035 mddev->new_chunk_sectors = mddev->chunk_sectors; 4036 mddev->raid_disks -= mddev->delta_disks; 4037 mddev->delta_disks = 0; 4038 mddev->reshape_backwards = 0; 4039 module_put(pers->owner); 4040 pr_warn("md: %s: %s would not accept array\n", 4041 mdname(mddev), clevel); 4042 rv = PTR_ERR(priv); 4043 goto out_unlock; 4044 } 4045 4046 /* Looks like we have a winner */ 4047 mddev_detach(mddev); 4048 4049 spin_lock(&mddev->lock); 4050 oldpers = mddev->pers; 4051 oldpriv = mddev->private; 4052 mddev->pers = pers; 4053 mddev->private = priv; 4054 strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 4055 mddev->level = mddev->new_level; 4056 mddev->layout = mddev->new_layout; 4057 mddev->chunk_sectors = mddev->new_chunk_sectors; 4058 mddev->delta_disks = 0; 4059 mddev->reshape_backwards = 0; 4060 mddev->degraded = 0; 4061 spin_unlock(&mddev->lock); 4062 4063 if (oldpers->sync_request == NULL && 4064 mddev->external) { 4065 /* We are converting from a no-redundancy array 4066 * to a redundancy array and metadata is managed 4067 * externally so we need to be sure that writes 4068 * won't block due to a need to transition 4069 * clean->dirty 4070 * until external management is started. 
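*
* Setting in_sync to 0 and disabling safemode below achieves that: the
* array is treated as already dirty, and the kernel will not try to
* return it to a clean state on its own.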
4071 */ 4072 mddev->in_sync = 0; 4073 mddev->safemode_delay = 0; 4074 mddev->safemode = 0; 4075 } 4076 4077 oldpers->free(mddev, oldpriv); 4078 4079 if (oldpers->sync_request == NULL && 4080 pers->sync_request != NULL) { 4081 /* need to add the md_redundancy_group */ 4082 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 4083 pr_warn("md: cannot register extra attributes for %s\n", 4084 mdname(mddev)); 4085 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 4086 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); 4087 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); 4088 } 4089 if (oldpers->sync_request != NULL && 4090 pers->sync_request == NULL) { 4091 /* need to remove the md_redundancy_group */ 4092 if (mddev->to_remove == NULL) 4093 mddev->to_remove = &md_redundancy_group; 4094 } 4095 4096 module_put(oldpers->owner); 4097 4098 rdev_for_each(rdev, mddev) { 4099 if (rdev->raid_disk < 0) 4100 continue; 4101 if (rdev->new_raid_disk >= mddev->raid_disks) 4102 rdev->new_raid_disk = -1; 4103 if (rdev->new_raid_disk == rdev->raid_disk) 4104 continue; 4105 sysfs_unlink_rdev(mddev, rdev); 4106 } 4107 rdev_for_each(rdev, mddev) { 4108 if (rdev->raid_disk < 0) 4109 continue; 4110 if (rdev->new_raid_disk == rdev->raid_disk) 4111 continue; 4112 rdev->raid_disk = rdev->new_raid_disk; 4113 if (rdev->raid_disk < 0) 4114 clear_bit(In_sync, &rdev->flags); 4115 else { 4116 if (sysfs_link_rdev(mddev, rdev)) 4117 pr_warn("md: cannot register rd%d for %s after level change\n", 4118 rdev->raid_disk, mdname(mddev)); 4119 } 4120 } 4121 4122 if (pers->sync_request == NULL) { 4123 /* this is now an array without redundancy, so 4124 * it must always be in_sync 4125 */ 4126 mddev->in_sync = 1; 4127 del_timer_sync(&mddev->safemode_timer); 4128 } 4129 pers->run(mddev); 4130 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 4131 if (!mddev->thread) 4132 md_update_sb(mddev, 1); 4133 sysfs_notify_dirent_safe(mddev->sysfs_level); 4134 md_new_event(); 4135 rv = len; 4136 out_unlock: 4137 mddev_unlock_and_resume(mddev); 4138 return rv; 4139 } 4140 4141 static struct md_sysfs_entry md_level = 4142 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 4143 4144 static ssize_t 4145 layout_show(struct mddev *mddev, char *page) 4146 { 4147 /* just a number, not meaningful for all levels */ 4148 if (mddev->reshape_position != MaxSector && 4149 mddev->layout != mddev->new_layout) 4150 return sprintf(page, "%d (%d)\n", 4151 mddev->new_layout, mddev->layout); 4152 return sprintf(page, "%d\n", mddev->layout); 4153 } 4154 4155 static ssize_t 4156 layout_store(struct mddev *mddev, const char *buf, size_t len) 4157 { 4158 unsigned int n; 4159 int err; 4160 4161 err = kstrtouint(buf, 10, &n); 4162 if (err < 0) 4163 return err; 4164 err = mddev_lock(mddev); 4165 if (err) 4166 return err; 4167 4168 if (mddev->pers) { 4169 if (mddev->pers->check_reshape == NULL) 4170 err = -EBUSY; 4171 else if (!md_is_rdwr(mddev)) 4172 err = -EROFS; 4173 else { 4174 mddev->new_layout = n; 4175 err = mddev->pers->check_reshape(mddev); 4176 if (err) 4177 mddev->new_layout = mddev->layout; 4178 } 4179 } else { 4180 mddev->new_layout = n; 4181 if (mddev->reshape_position == MaxSector) 4182 mddev->layout = n; 4183 } 4184 mddev_unlock(mddev); 4185 return err ?: len; 4186 } 4187 static struct md_sysfs_entry md_layout = 4188 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 4189 4190 static ssize_t 4191 raid_disks_show(struct mddev *mddev, char *page) 4192 { 4193 if 
(mddev->raid_disks == 0) 4194 return 0; 4195 if (mddev->reshape_position != MaxSector && 4196 mddev->delta_disks != 0) 4197 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 4198 mddev->raid_disks - mddev->delta_disks); 4199 return sprintf(page, "%d\n", mddev->raid_disks); 4200 } 4201 4202 static int update_raid_disks(struct mddev *mddev, int raid_disks); 4203 4204 static ssize_t 4205 raid_disks_store(struct mddev *mddev, const char *buf, size_t len) 4206 { 4207 unsigned int n; 4208 int err; 4209 4210 err = kstrtouint(buf, 10, &n); 4211 if (err < 0) 4212 return err; 4213 4214 err = mddev_lock(mddev); 4215 if (err) 4216 return err; 4217 if (mddev->pers) 4218 err = update_raid_disks(mddev, n); 4219 else if (mddev->reshape_position != MaxSector) { 4220 struct md_rdev *rdev; 4221 int olddisks = mddev->raid_disks - mddev->delta_disks; 4222 4223 err = -EINVAL; 4224 rdev_for_each(rdev, mddev) { 4225 if (olddisks < n && 4226 rdev->data_offset < rdev->new_data_offset) 4227 goto out_unlock; 4228 if (olddisks > n && 4229 rdev->data_offset > rdev->new_data_offset) 4230 goto out_unlock; 4231 } 4232 err = 0; 4233 mddev->delta_disks = n - olddisks; 4234 mddev->raid_disks = n; 4235 mddev->reshape_backwards = (mddev->delta_disks < 0); 4236 } else 4237 mddev->raid_disks = n; 4238 out_unlock: 4239 mddev_unlock(mddev); 4240 return err ? err : len; 4241 } 4242 static struct md_sysfs_entry md_raid_disks = 4243 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 4244 4245 static ssize_t 4246 uuid_show(struct mddev *mddev, char *page) 4247 { 4248 return sprintf(page, "%pU\n", mddev->uuid); 4249 } 4250 static struct md_sysfs_entry md_uuid = 4251 __ATTR(uuid, S_IRUGO, uuid_show, NULL); 4252 4253 static ssize_t 4254 chunk_size_show(struct mddev *mddev, char *page) 4255 { 4256 if (mddev->reshape_position != MaxSector && 4257 mddev->chunk_sectors != mddev->new_chunk_sectors) 4258 return sprintf(page, "%d (%d)\n", 4259 mddev->new_chunk_sectors << 9, 4260 mddev->chunk_sectors << 9); 4261 return sprintf(page, "%d\n", mddev->chunk_sectors << 9); 4262 } 4263 4264 static ssize_t 4265 chunk_size_store(struct mddev *mddev, const char *buf, size_t len) 4266 { 4267 unsigned long n; 4268 int err; 4269 4270 err = kstrtoul(buf, 10, &n); 4271 if (err < 0) 4272 return err; 4273 4274 err = mddev_lock(mddev); 4275 if (err) 4276 return err; 4277 if (mddev->pers) { 4278 if (mddev->pers->check_reshape == NULL) 4279 err = -EBUSY; 4280 else if (!md_is_rdwr(mddev)) 4281 err = -EROFS; 4282 else { 4283 mddev->new_chunk_sectors = n >> 9; 4284 err = mddev->pers->check_reshape(mddev); 4285 if (err) 4286 mddev->new_chunk_sectors = mddev->chunk_sectors; 4287 } 4288 } else { 4289 mddev->new_chunk_sectors = n >> 9; 4290 if (mddev->reshape_position == MaxSector) 4291 mddev->chunk_sectors = n >> 9; 4292 } 4293 mddev_unlock(mddev); 4294 return err ?: len; 4295 } 4296 static struct md_sysfs_entry md_chunk_size = 4297 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 4298 4299 static ssize_t 4300 resync_start_show(struct mddev *mddev, char *page) 4301 { 4302 if (mddev->recovery_cp == MaxSector) 4303 return sprintf(page, "none\n"); 4304 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 4305 } 4306 4307 static ssize_t 4308 resync_start_store(struct mddev *mddev, const char *buf, size_t len) 4309 { 4310 unsigned long long n; 4311 int err; 4312 4313 if (cmd_match(buf, "none")) 4314 n = MaxSector; 4315 else { 4316 err = kstrtoull(buf, 10, &n); 4317 if (err < 0) 4318 return err; 4319 if (n != 
(sector_t)n) 4320 return -EINVAL; 4321 } 4322 4323 err = mddev_lock(mddev); 4324 if (err) 4325 return err; 4326 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 4327 err = -EBUSY; 4328 4329 if (!err) { 4330 mddev->recovery_cp = n; 4331 if (mddev->pers) 4332 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 4333 } 4334 mddev_unlock(mddev); 4335 return err ?: len; 4336 } 4337 static struct md_sysfs_entry md_resync_start = 4338 __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, 4339 resync_start_show, resync_start_store); 4340 4341 /* 4342 * The array state can be: 4343 * 4344 * clear 4345 * No devices, no size, no level 4346 * Equivalent to STOP_ARRAY ioctl 4347 * inactive 4348 * May have some settings, but array is not active 4349 * all IO results in error 4350 * When written, doesn't tear down array, but just stops it 4351 * suspended (not supported yet) 4352 * All IO requests will block. The array can be reconfigured. 4353 * Writing this, if accepted, will block until array is quiescent 4354 * readonly 4355 * no resync can happen. no superblocks get written. 4356 * write requests fail 4357 * read-auto 4358 * like readonly, but behaves like 'clean' on a write request. 4359 * 4360 * clean - no pending writes, but otherwise active. 4361 * When written to inactive array, starts without resync 4362 * If a write request arrives then 4363 * if metadata is known, mark 'dirty' and switch to 'active'. 4364 * if not known, block and switch to write-pending 4365 * If written to an active array that has pending writes, then fails. 4366 * active 4367 * fully active: IO and resync can be happening. 4368 * When written to inactive array, starts with resync 4369 * 4370 * write-pending 4371 * clean, but writes are blocked waiting for 'active' to be written. 4372 * 4373 * active-idle 4374 * like active, but no writes have been seen for a while (100msec). 4375 * 4376 * broken 4377 * Array is failed. It's useful because mounted-arrays aren't stopped 4378 * when array is failed, so this state will at least alert the user that 4379 * something is wrong. 
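*
* All of these states are visible through the 'array_state' sysfs
* attribute handled below, and the writable ones can be set through it
* as well; e.g. an idle redundant array normally reads back "clean",
* while writing "readonly" stops resync and superblock updates without
* tearing the array down.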
4380 */ 4381 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 4382 write_pending, active_idle, broken, bad_word}; 4383 static char *array_states[] = { 4384 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 4385 "write-pending", "active-idle", "broken", NULL }; 4386 4387 static int match_word(const char *word, char **list) 4388 { 4389 int n; 4390 for (n=0; list[n]; n++) 4391 if (cmd_match(word, list[n])) 4392 break; 4393 return n; 4394 } 4395 4396 static ssize_t 4397 array_state_show(struct mddev *mddev, char *page) 4398 { 4399 enum array_state st = inactive; 4400 4401 if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) { 4402 switch(mddev->ro) { 4403 case MD_RDONLY: 4404 st = readonly; 4405 break; 4406 case MD_AUTO_READ: 4407 st = read_auto; 4408 break; 4409 case MD_RDWR: 4410 spin_lock(&mddev->lock); 4411 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 4412 st = write_pending; 4413 else if (mddev->in_sync) 4414 st = clean; 4415 else if (mddev->safemode) 4416 st = active_idle; 4417 else 4418 st = active; 4419 spin_unlock(&mddev->lock); 4420 } 4421 4422 if (test_bit(MD_BROKEN, &mddev->flags) && st == clean) 4423 st = broken; 4424 } else { 4425 if (list_empty(&mddev->disks) && 4426 mddev->raid_disks == 0 && 4427 mddev->dev_sectors == 0) 4428 st = clear; 4429 else 4430 st = inactive; 4431 } 4432 return sprintf(page, "%s\n", array_states[st]); 4433 } 4434 4435 static int do_md_stop(struct mddev *mddev, int ro); 4436 static int md_set_readonly(struct mddev *mddev); 4437 static int restart_array(struct mddev *mddev); 4438 4439 static ssize_t 4440 array_state_store(struct mddev *mddev, const char *buf, size_t len) 4441 { 4442 int err = 0; 4443 enum array_state st = match_word(buf, array_states); 4444 4445 /* No lock dependent actions */ 4446 switch (st) { 4447 case suspended: /* not supported yet */ 4448 case write_pending: /* cannot be set */ 4449 case active_idle: /* cannot be set */ 4450 case broken: /* cannot be set */ 4451 case bad_word: 4452 return -EINVAL; 4453 case clear: 4454 case readonly: 4455 case inactive: 4456 case read_auto: 4457 if (!mddev->pers || !md_is_rdwr(mddev)) 4458 break; 4459 /* write sysfs will not open mddev and opener should be 0 */ 4460 err = mddev_set_closing_and_sync_blockdev(mddev, 0); 4461 if (err) 4462 return err; 4463 break; 4464 default: 4465 break; 4466 } 4467 4468 if (mddev->pers && (st == active || st == clean) && 4469 mddev->ro != MD_RDONLY) { 4470 /* don't take reconfig_mutex when toggling between 4471 * clean and active 4472 */ 4473 spin_lock(&mddev->lock); 4474 if (st == active) { 4475 restart_array(mddev); 4476 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4477 md_wakeup_thread(mddev->thread); 4478 wake_up(&mddev->sb_wait); 4479 } else /* st == clean */ { 4480 restart_array(mddev); 4481 if (!set_in_sync(mddev)) 4482 err = -EBUSY; 4483 } 4484 if (!err) 4485 sysfs_notify_dirent_safe(mddev->sysfs_state); 4486 spin_unlock(&mddev->lock); 4487 return err ?: len; 4488 } 4489 err = mddev_lock(mddev); 4490 if (err) 4491 return err; 4492 4493 switch (st) { 4494 case inactive: 4495 /* stop an active array, return 0 otherwise */ 4496 if (mddev->pers) 4497 err = do_md_stop(mddev, 2); 4498 break; 4499 case clear: 4500 err = do_md_stop(mddev, 0); 4501 break; 4502 case readonly: 4503 if (mddev->pers) 4504 err = md_set_readonly(mddev); 4505 else { 4506 mddev->ro = MD_RDONLY; 4507 set_disk_ro(mddev->gendisk, 1); 4508 err = do_md_run(mddev); 4509 } 4510 break; 4511 case read_auto: 4512 if 
(mddev->pers) { 4513 if (md_is_rdwr(mddev)) 4514 err = md_set_readonly(mddev); 4515 else if (mddev->ro == MD_RDONLY) 4516 err = restart_array(mddev); 4517 if (err == 0) { 4518 mddev->ro = MD_AUTO_READ; 4519 set_disk_ro(mddev->gendisk, 0); 4520 } 4521 } else { 4522 mddev->ro = MD_AUTO_READ; 4523 err = do_md_run(mddev); 4524 } 4525 break; 4526 case clean: 4527 if (mddev->pers) { 4528 err = restart_array(mddev); 4529 if (err) 4530 break; 4531 spin_lock(&mddev->lock); 4532 if (!set_in_sync(mddev)) 4533 err = -EBUSY; 4534 spin_unlock(&mddev->lock); 4535 } else 4536 err = -EINVAL; 4537 break; 4538 case active: 4539 if (mddev->pers) { 4540 err = restart_array(mddev); 4541 if (err) 4542 break; 4543 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4544 wake_up(&mddev->sb_wait); 4545 err = 0; 4546 } else { 4547 mddev->ro = MD_RDWR; 4548 set_disk_ro(mddev->gendisk, 0); 4549 err = do_md_run(mddev); 4550 } 4551 break; 4552 default: 4553 err = -EINVAL; 4554 break; 4555 } 4556 4557 if (!err) { 4558 if (mddev->hold_active == UNTIL_IOCTL) 4559 mddev->hold_active = 0; 4560 sysfs_notify_dirent_safe(mddev->sysfs_state); 4561 } 4562 mddev_unlock(mddev); 4563 4564 if (st == readonly || st == read_auto || st == inactive || 4565 (err && st == clear)) 4566 clear_bit(MD_CLOSING, &mddev->flags); 4567 4568 return err ?: len; 4569 } 4570 static struct md_sysfs_entry md_array_state = 4571 __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 4572 4573 static ssize_t 4574 max_corrected_read_errors_show(struct mddev *mddev, char *page) { 4575 return sprintf(page, "%d\n", 4576 atomic_read(&mddev->max_corr_read_errors)); 4577 } 4578 4579 static ssize_t 4580 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) 4581 { 4582 unsigned int n; 4583 int rv; 4584 4585 rv = kstrtouint(buf, 10, &n); 4586 if (rv < 0) 4587 return rv; 4588 if (n > INT_MAX) 4589 return -EINVAL; 4590 atomic_set(&mddev->max_corr_read_errors, n); 4591 return len; 4592 } 4593 4594 static struct md_sysfs_entry max_corr_read_errors = 4595 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, 4596 max_corrected_read_errors_store); 4597 4598 static ssize_t 4599 null_show(struct mddev *mddev, char *page) 4600 { 4601 return -EINVAL; 4602 } 4603 4604 static ssize_t 4605 new_dev_store(struct mddev *mddev, const char *buf, size_t len) 4606 { 4607 /* buf must be %d:%d\n? giving major and minor numbers */ 4608 /* The new device is added to the array. 4609 * If the array has a persistent superblock, we read the 4610 * superblock to initialise info and check validity. 4611 * Otherwise, only checking done is that in bind_rdev_to_array, 4612 * which mainly checks size. 
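*
* For example (illustrative device numbers), writing "8:16" to the
* 'new_dev' attribute asks md to open the block device with major 8 and
* minor 16, import it and bind it to this array.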
4613 */
4614 char *e;
4615 int major = simple_strtoul(buf, &e, 10);
4616 int minor;
4617 dev_t dev;
4618 struct md_rdev *rdev;
4619 int err;
4620
4621 if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
4622 return -EINVAL;
4623 minor = simple_strtoul(e+1, &e, 10);
4624 if (*e && *e != '\n')
4625 return -EINVAL;
4626 dev = MKDEV(major, minor);
4627 if (major != MAJOR(dev) ||
4628 minor != MINOR(dev))
4629 return -EOVERFLOW;
4630
4631 err = mddev_suspend_and_lock(mddev);
4632 if (err)
4633 return err;
4634 if (mddev->persistent) {
4635 rdev = md_import_device(dev, mddev->major_version,
4636 mddev->minor_version);
4637 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
4638 struct md_rdev *rdev0
4639 = list_entry(mddev->disks.next,
4640 struct md_rdev, same_set);
4641 err = super_types[mddev->major_version]
4642 .load_super(rdev, rdev0, mddev->minor_version);
4643 if (err < 0)
4644 goto out;
4645 }
4646 } else if (mddev->external)
4647 rdev = md_import_device(dev, -2, -1);
4648 else
4649 rdev = md_import_device(dev, -1, -1);
4650
4651 if (IS_ERR(rdev)) {
4652 mddev_unlock_and_resume(mddev);
4653 return PTR_ERR(rdev);
4654 }
4655 err = bind_rdev_to_array(rdev, mddev);
4656 out:
4657 if (err)
4658 export_rdev(rdev, mddev);
4659 mddev_unlock_and_resume(mddev);
4660 if (!err)
4661 md_new_event();
4662 return err ? err : len;
4663 }
4664
4665 static struct md_sysfs_entry md_new_device =
4666 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
4667
4668 static ssize_t
4669 bitmap_store(struct mddev *mddev, const char *buf, size_t len)
4670 {
4671 char *end;
4672 unsigned long chunk, end_chunk;
4673 int err;
4674
4675 err = mddev_lock(mddev);
4676 if (err)
4677 return err;
4678 if (!mddev->bitmap)
4679 goto out;
4680 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
4681 while (*buf) {
4682 chunk = end_chunk = simple_strtoul(buf, &end, 0);
4683 if (buf == end) break;
4684 if (*end == '-') { /* range */
4685 buf = end + 1;
4686 end_chunk = simple_strtoul(buf, &end, 0);
4687 if (buf == end) break;
4688 }
4689 if (*end && !isspace(*end)) break;
4690 md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
4691 buf = skip_spaces(end);
4692 }
4693 md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
4694 out:
4695 mddev_unlock(mddev);
4696 return len;
4697 }
4698
4699 static struct md_sysfs_entry md_bitmap =
4700 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
4701
4702 static ssize_t
4703 size_show(struct mddev *mddev, char *page)
4704 {
4705 return sprintf(page, "%llu\n",
4706 (unsigned long long)mddev->dev_sectors / 2);
4707 }
4708
4709 static int update_size(struct mddev *mddev, sector_t num_sectors);
4710
4711 static ssize_t
4712 size_store(struct mddev *mddev, const char *buf, size_t len)
4713 {
4714 /* If array is inactive, we can reduce the component size, but
4715 * not increase it (except from 0).
4716 * If array is active, we can try an on-line resize
4717 */
4718 sector_t sectors;
4719 int err = strict_blocks_to_sectors(buf, &sectors);
4720
4721 if (err < 0)
4722 return err;
4723 err = mddev_lock(mddev);
4724 if (err)
4725 return err;
4726 if (mddev->pers) {
4727 err = update_size(mddev, sectors);
4728 if (err == 0)
4729 md_update_sb(mddev, 1);
4730 } else {
4731 if (mddev->dev_sectors == 0 ||
4732 mddev->dev_sectors > sectors)
4733 mddev->dev_sectors = sectors;
4734 else
4735 err = -ENOSPC;
4736 }
4737 mddev_unlock(mddev);
4738 return err ?
err : len; 4739 } 4740 4741 static struct md_sysfs_entry md_size = 4742 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 4743 4744 /* Metadata version. 4745 * This is one of 4746 * 'none' for arrays with no metadata (good luck...) 4747 * 'external' for arrays with externally managed metadata, 4748 * or N.M for internally known formats 4749 */ 4750 static ssize_t 4751 metadata_show(struct mddev *mddev, char *page) 4752 { 4753 if (mddev->persistent) 4754 return sprintf(page, "%d.%d\n", 4755 mddev->major_version, mddev->minor_version); 4756 else if (mddev->external) 4757 return sprintf(page, "external:%s\n", mddev->metadata_type); 4758 else 4759 return sprintf(page, "none\n"); 4760 } 4761 4762 static ssize_t 4763 metadata_store(struct mddev *mddev, const char *buf, size_t len) 4764 { 4765 int major, minor; 4766 char *e; 4767 int err; 4768 /* Changing the details of 'external' metadata is 4769 * always permitted. Otherwise there must be 4770 * no devices attached to the array. 4771 */ 4772 4773 err = mddev_lock(mddev); 4774 if (err) 4775 return err; 4776 err = -EBUSY; 4777 if (mddev->external && strncmp(buf, "external:", 9) == 0) 4778 ; 4779 else if (!list_empty(&mddev->disks)) 4780 goto out_unlock; 4781 4782 err = 0; 4783 if (cmd_match(buf, "none")) { 4784 mddev->persistent = 0; 4785 mddev->external = 0; 4786 mddev->major_version = 0; 4787 mddev->minor_version = 90; 4788 goto out_unlock; 4789 } 4790 if (strncmp(buf, "external:", 9) == 0) { 4791 size_t namelen = len-9; 4792 if (namelen >= sizeof(mddev->metadata_type)) 4793 namelen = sizeof(mddev->metadata_type)-1; 4794 memcpy(mddev->metadata_type, buf+9, namelen); 4795 mddev->metadata_type[namelen] = 0; 4796 if (namelen && mddev->metadata_type[namelen-1] == '\n') 4797 mddev->metadata_type[--namelen] = 0; 4798 mddev->persistent = 0; 4799 mddev->external = 1; 4800 mddev->major_version = 0; 4801 mddev->minor_version = 90; 4802 goto out_unlock; 4803 } 4804 major = simple_strtoul(buf, &e, 10); 4805 err = -EINVAL; 4806 if (e==buf || *e != '.') 4807 goto out_unlock; 4808 buf = e+1; 4809 minor = simple_strtoul(buf, &e, 10); 4810 if (e==buf || (*e && *e != '\n') ) 4811 goto out_unlock; 4812 err = -ENOENT; 4813 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 4814 goto out_unlock; 4815 mddev->major_version = major; 4816 mddev->minor_version = minor; 4817 mddev->persistent = 1; 4818 mddev->external = 0; 4819 err = 0; 4820 out_unlock: 4821 mddev_unlock(mddev); 4822 return err ?: len; 4823 } 4824 4825 static struct md_sysfs_entry md_metadata = 4826 __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 4827 4828 enum sync_action md_sync_action(struct mddev *mddev) 4829 { 4830 unsigned long recovery = mddev->recovery; 4831 4832 /* 4833 * frozen has the highest priority, means running sync_thread will be 4834 * stopped immediately, and no new sync_thread can start. 4835 */ 4836 if (test_bit(MD_RECOVERY_FROZEN, &recovery)) 4837 return ACTION_FROZEN; 4838 4839 /* 4840 * read-only array can't register sync_thread, and it can only 4841 * add/remove spares. 4842 */ 4843 if (!md_is_rdwr(mddev)) 4844 return ACTION_IDLE; 4845 4846 /* 4847 * idle means no sync_thread is running, and no new sync_thread is 4848 * requested. 
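* (That is, neither MD_RECOVERY_RUNNING nor MD_RECOVERY_NEEDED is set,
* which is exactly what the test below checks.)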
4849 */ 4850 if (!test_bit(MD_RECOVERY_RUNNING, &recovery) && 4851 !test_bit(MD_RECOVERY_NEEDED, &recovery)) 4852 return ACTION_IDLE; 4853 4854 if (test_bit(MD_RECOVERY_RESHAPE, &recovery) || 4855 mddev->reshape_position != MaxSector) 4856 return ACTION_RESHAPE; 4857 4858 if (test_bit(MD_RECOVERY_RECOVER, &recovery)) 4859 return ACTION_RECOVER; 4860 4861 if (test_bit(MD_RECOVERY_SYNC, &recovery)) { 4862 /* 4863 * MD_RECOVERY_CHECK must be paired with 4864 * MD_RECOVERY_REQUESTED. 4865 */ 4866 if (test_bit(MD_RECOVERY_CHECK, &recovery)) 4867 return ACTION_CHECK; 4868 if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) 4869 return ACTION_REPAIR; 4870 return ACTION_RESYNC; 4871 } 4872 4873 /* 4874 * MD_RECOVERY_NEEDED or MD_RECOVERY_RUNNING is set, however, no 4875 * sync_action is specified. 4876 */ 4877 return ACTION_IDLE; 4878 } 4879 4880 enum sync_action md_sync_action_by_name(const char *page) 4881 { 4882 enum sync_action action; 4883 4884 for (action = 0; action < NR_SYNC_ACTIONS; ++action) { 4885 if (cmd_match(page, action_name[action])) 4886 return action; 4887 } 4888 4889 return NR_SYNC_ACTIONS; 4890 } 4891 4892 const char *md_sync_action_name(enum sync_action action) 4893 { 4894 return action_name[action]; 4895 } 4896 4897 static ssize_t 4898 action_show(struct mddev *mddev, char *page) 4899 { 4900 enum sync_action action = md_sync_action(mddev); 4901 4902 return sprintf(page, "%s\n", md_sync_action_name(action)); 4903 } 4904 4905 /** 4906 * stop_sync_thread() - wait for sync_thread to stop if it's running. 4907 * @mddev: the array. 4908 * @locked: if set, reconfig_mutex will still be held after this function 4909 * return; if not set, reconfig_mutex will be released after this 4910 * function return. 4911 */ 4912 static void stop_sync_thread(struct mddev *mddev, bool locked) 4913 { 4914 int sync_seq = atomic_read(&mddev->sync_seq); 4915 4916 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 4917 if (!locked) 4918 mddev_unlock(mddev); 4919 return; 4920 } 4921 4922 mddev_unlock(mddev); 4923 4924 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4925 /* 4926 * Thread might be blocked waiting for metadata update which will now 4927 * never happen 4928 */ 4929 md_wakeup_thread_directly(mddev->sync_thread); 4930 if (work_pending(&mddev->sync_work)) 4931 flush_work(&mddev->sync_work); 4932 4933 wait_event(resync_wait, 4934 !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 4935 (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery) && 4936 sync_seq != atomic_read(&mddev->sync_seq))); 4937 4938 if (locked) 4939 mddev_lock_nointr(mddev); 4940 } 4941 4942 void md_idle_sync_thread(struct mddev *mddev) 4943 { 4944 lockdep_assert_held(&mddev->reconfig_mutex); 4945 4946 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4947 stop_sync_thread(mddev, true); 4948 } 4949 EXPORT_SYMBOL_GPL(md_idle_sync_thread); 4950 4951 void md_frozen_sync_thread(struct mddev *mddev) 4952 { 4953 lockdep_assert_held(&mddev->reconfig_mutex); 4954 4955 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4956 stop_sync_thread(mddev, true); 4957 } 4958 EXPORT_SYMBOL_GPL(md_frozen_sync_thread); 4959 4960 void md_unfrozen_sync_thread(struct mddev *mddev) 4961 { 4962 lockdep_assert_held(&mddev->reconfig_mutex); 4963 4964 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4965 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4966 md_wakeup_thread(mddev->thread); 4967 sysfs_notify_dirent_safe(mddev->sysfs_action); 4968 } 4969 EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread); 4970 4971 static int mddev_start_reshape(struct mddev *mddev) 4972 { 4973 
int ret; 4974 4975 if (mddev->pers->start_reshape == NULL) 4976 return -EINVAL; 4977 4978 if (mddev->reshape_position == MaxSector || 4979 mddev->pers->check_reshape == NULL || 4980 mddev->pers->check_reshape(mddev)) { 4981 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4982 ret = mddev->pers->start_reshape(mddev); 4983 if (ret) 4984 return ret; 4985 } else { 4986 /* 4987 * If reshape is still in progress, and md_check_recovery() can 4988 * continue to reshape, don't restart reshape because data can 4989 * be corrupted for raid456. 4990 */ 4991 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4992 } 4993 4994 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 4995 return 0; 4996 } 4997 4998 static ssize_t 4999 action_store(struct mddev *mddev, const char *page, size_t len) 5000 { 5001 int ret; 5002 enum sync_action action; 5003 5004 if (!mddev->pers || !mddev->pers->sync_request) 5005 return -EINVAL; 5006 5007 retry: 5008 if (work_busy(&mddev->sync_work)) 5009 flush_work(&mddev->sync_work); 5010 5011 ret = mddev_lock(mddev); 5012 if (ret) 5013 return ret; 5014 5015 if (work_busy(&mddev->sync_work)) { 5016 mddev_unlock(mddev); 5017 goto retry; 5018 } 5019 5020 action = md_sync_action_by_name(page); 5021 5022 /* TODO: mdadm rely on "idle" to start sync_thread. */ 5023 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 5024 switch (action) { 5025 case ACTION_FROZEN: 5026 md_frozen_sync_thread(mddev); 5027 ret = len; 5028 goto out; 5029 case ACTION_IDLE: 5030 md_idle_sync_thread(mddev); 5031 break; 5032 case ACTION_RESHAPE: 5033 case ACTION_RECOVER: 5034 case ACTION_CHECK: 5035 case ACTION_REPAIR: 5036 case ACTION_RESYNC: 5037 ret = -EBUSY; 5038 goto out; 5039 default: 5040 ret = -EINVAL; 5041 goto out; 5042 } 5043 } else { 5044 switch (action) { 5045 case ACTION_FROZEN: 5046 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5047 ret = len; 5048 goto out; 5049 case ACTION_RESHAPE: 5050 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5051 ret = mddev_start_reshape(mddev); 5052 if (ret) 5053 goto out; 5054 break; 5055 case ACTION_RECOVER: 5056 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5057 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 5058 break; 5059 case ACTION_CHECK: 5060 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5061 fallthrough; 5062 case ACTION_REPAIR: 5063 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 5064 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5065 fallthrough; 5066 case ACTION_RESYNC: 5067 case ACTION_IDLE: 5068 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5069 break; 5070 default: 5071 ret = -EINVAL; 5072 goto out; 5073 } 5074 } 5075 5076 if (mddev->ro == MD_AUTO_READ) { 5077 /* A write to sync_action is enough to justify 5078 * canceling read-auto mode 5079 */ 5080 mddev->ro = MD_RDWR; 5081 md_wakeup_thread(mddev->sync_thread); 5082 } 5083 5084 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5085 md_wakeup_thread(mddev->thread); 5086 sysfs_notify_dirent_safe(mddev->sysfs_action); 5087 ret = len; 5088 5089 out: 5090 mddev_unlock(mddev); 5091 return ret; 5092 } 5093 5094 static struct md_sysfs_entry md_scan_mode = 5095 __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 5096 5097 static ssize_t 5098 last_sync_action_show(struct mddev *mddev, char *page) 5099 { 5100 return sprintf(page, "%s\n", 5101 md_sync_action_name(mddev->last_sync_action)); 5102 } 5103 5104 static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); 5105 5106 static ssize_t 5107 mismatch_cnt_show(struct mddev *mddev, char *page) 5108 { 5109 return 
sprintf(page, "%llu\n", 5110 (unsigned long long) 5111 atomic64_read(&mddev->resync_mismatches)); 5112 } 5113 5114 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 5115 5116 static ssize_t 5117 sync_min_show(struct mddev *mddev, char *page) 5118 { 5119 return sprintf(page, "%d (%s)\n", speed_min(mddev), 5120 mddev->sync_speed_min ? "local": "system"); 5121 } 5122 5123 static ssize_t 5124 sync_min_store(struct mddev *mddev, const char *buf, size_t len) 5125 { 5126 unsigned int min; 5127 int rv; 5128 5129 if (strncmp(buf, "system", 6)==0) { 5130 min = 0; 5131 } else { 5132 rv = kstrtouint(buf, 10, &min); 5133 if (rv < 0) 5134 return rv; 5135 if (min == 0) 5136 return -EINVAL; 5137 } 5138 mddev->sync_speed_min = min; 5139 return len; 5140 } 5141 5142 static struct md_sysfs_entry md_sync_min = 5143 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 5144 5145 static ssize_t 5146 sync_max_show(struct mddev *mddev, char *page) 5147 { 5148 return sprintf(page, "%d (%s)\n", speed_max(mddev), 5149 mddev->sync_speed_max ? "local": "system"); 5150 } 5151 5152 static ssize_t 5153 sync_max_store(struct mddev *mddev, const char *buf, size_t len) 5154 { 5155 unsigned int max; 5156 int rv; 5157 5158 if (strncmp(buf, "system", 6)==0) { 5159 max = 0; 5160 } else { 5161 rv = kstrtouint(buf, 10, &max); 5162 if (rv < 0) 5163 return rv; 5164 if (max == 0) 5165 return -EINVAL; 5166 } 5167 mddev->sync_speed_max = max; 5168 return len; 5169 } 5170 5171 static struct md_sysfs_entry md_sync_max = 5172 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 5173 5174 static ssize_t 5175 degraded_show(struct mddev *mddev, char *page) 5176 { 5177 return sprintf(page, "%d\n", mddev->degraded); 5178 } 5179 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 5180 5181 static ssize_t 5182 sync_force_parallel_show(struct mddev *mddev, char *page) 5183 { 5184 return sprintf(page, "%d\n", mddev->parallel_resync); 5185 } 5186 5187 static ssize_t 5188 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) 5189 { 5190 long n; 5191 5192 if (kstrtol(buf, 10, &n)) 5193 return -EINVAL; 5194 5195 if (n != 0 && n != 1) 5196 return -EINVAL; 5197 5198 mddev->parallel_resync = n; 5199 5200 if (mddev->sync_thread) 5201 wake_up(&resync_wait); 5202 5203 return len; 5204 } 5205 5206 /* force parallel resync, even with shared block devices */ 5207 static struct md_sysfs_entry md_sync_force_parallel = 5208 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 5209 sync_force_parallel_show, sync_force_parallel_store); 5210 5211 static ssize_t 5212 sync_speed_show(struct mddev *mddev, char *page) 5213 { 5214 unsigned long resync, dt, db; 5215 if (mddev->curr_resync == MD_RESYNC_NONE) 5216 return sprintf(page, "none\n"); 5217 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 5218 dt = (jiffies - mddev->resync_mark) / HZ; 5219 if (!dt) dt++; 5220 db = resync - mddev->resync_mark_cnt; 5221 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 5222 } 5223 5224 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 5225 5226 static ssize_t 5227 sync_completed_show(struct mddev *mddev, char *page) 5228 { 5229 unsigned long long max_sectors, resync; 5230 5231 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5232 return sprintf(page, "none\n"); 5233 5234 if (mddev->curr_resync == MD_RESYNC_YIELDED || 5235 mddev->curr_resync == MD_RESYNC_DELAYED) 5236 return sprintf(page, "delayed\n"); 5237 5238 if (test_bit(MD_RECOVERY_SYNC, 
&mddev->recovery) || 5239 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5240 max_sectors = mddev->resync_max_sectors; 5241 else 5242 max_sectors = mddev->dev_sectors; 5243 5244 resync = mddev->curr_resync_completed; 5245 return sprintf(page, "%llu / %llu\n", resync, max_sectors); 5246 } 5247 5248 static struct md_sysfs_entry md_sync_completed = 5249 __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL); 5250 5251 static ssize_t 5252 min_sync_show(struct mddev *mddev, char *page) 5253 { 5254 return sprintf(page, "%llu\n", 5255 (unsigned long long)mddev->resync_min); 5256 } 5257 static ssize_t 5258 min_sync_store(struct mddev *mddev, const char *buf, size_t len) 5259 { 5260 unsigned long long min; 5261 int err; 5262 5263 if (kstrtoull(buf, 10, &min)) 5264 return -EINVAL; 5265 5266 spin_lock(&mddev->lock); 5267 err = -EINVAL; 5268 if (min > mddev->resync_max) 5269 goto out_unlock; 5270 5271 err = -EBUSY; 5272 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5273 goto out_unlock; 5274 5275 /* Round down to multiple of 4K for safety */ 5276 mddev->resync_min = round_down(min, 8); 5277 err = 0; 5278 5279 out_unlock: 5280 spin_unlock(&mddev->lock); 5281 return err ?: len; 5282 } 5283 5284 static struct md_sysfs_entry md_min_sync = 5285 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 5286 5287 static ssize_t 5288 max_sync_show(struct mddev *mddev, char *page) 5289 { 5290 if (mddev->resync_max == MaxSector) 5291 return sprintf(page, "max\n"); 5292 else 5293 return sprintf(page, "%llu\n", 5294 (unsigned long long)mddev->resync_max); 5295 } 5296 static ssize_t 5297 max_sync_store(struct mddev *mddev, const char *buf, size_t len) 5298 { 5299 int err; 5300 spin_lock(&mddev->lock); 5301 if (strncmp(buf, "max", 3) == 0) 5302 mddev->resync_max = MaxSector; 5303 else { 5304 unsigned long long max; 5305 int chunk; 5306 5307 err = -EINVAL; 5308 if (kstrtoull(buf, 10, &max)) 5309 goto out_unlock; 5310 if (max < mddev->resync_min) 5311 goto out_unlock; 5312 5313 err = -EBUSY; 5314 if (max < mddev->resync_max && md_is_rdwr(mddev) && 5315 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5316 goto out_unlock; 5317 5318 /* Must be a multiple of chunk_size */ 5319 chunk = mddev->chunk_sectors; 5320 if (chunk) { 5321 sector_t temp = max; 5322 5323 err = -EINVAL; 5324 if (sector_div(temp, chunk)) 5325 goto out_unlock; 5326 } 5327 mddev->resync_max = max; 5328 } 5329 wake_up(&mddev->recovery_wait); 5330 err = 0; 5331 out_unlock: 5332 spin_unlock(&mddev->lock); 5333 return err ?: len; 5334 } 5335 5336 static struct md_sysfs_entry md_max_sync = 5337 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 5338 5339 static ssize_t 5340 suspend_lo_show(struct mddev *mddev, char *page) 5341 { 5342 return sprintf(page, "%llu\n", 5343 (unsigned long long)READ_ONCE(mddev->suspend_lo)); 5344 } 5345 5346 static ssize_t 5347 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) 5348 { 5349 unsigned long long new; 5350 int err; 5351 5352 err = kstrtoull(buf, 10, &new); 5353 if (err < 0) 5354 return err; 5355 if (new != (sector_t)new) 5356 return -EINVAL; 5357 5358 err = mddev_suspend(mddev, true); 5359 if (err) 5360 return err; 5361 5362 WRITE_ONCE(mddev->suspend_lo, new); 5363 mddev_resume(mddev); 5364 5365 return len; 5366 } 5367 static struct md_sysfs_entry md_suspend_lo = 5368 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 5369 5370 static ssize_t 5371 suspend_hi_show(struct mddev *mddev, char *page) 5372 { 5373 return sprintf(page, 
"%llu\n", 5374 (unsigned long long)READ_ONCE(mddev->suspend_hi)); 5375 } 5376 5377 static ssize_t 5378 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) 5379 { 5380 unsigned long long new; 5381 int err; 5382 5383 err = kstrtoull(buf, 10, &new); 5384 if (err < 0) 5385 return err; 5386 if (new != (sector_t)new) 5387 return -EINVAL; 5388 5389 err = mddev_suspend(mddev, true); 5390 if (err) 5391 return err; 5392 5393 WRITE_ONCE(mddev->suspend_hi, new); 5394 mddev_resume(mddev); 5395 5396 return len; 5397 } 5398 static struct md_sysfs_entry md_suspend_hi = 5399 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 5400 5401 static ssize_t 5402 reshape_position_show(struct mddev *mddev, char *page) 5403 { 5404 if (mddev->reshape_position != MaxSector) 5405 return sprintf(page, "%llu\n", 5406 (unsigned long long)mddev->reshape_position); 5407 strcpy(page, "none\n"); 5408 return 5; 5409 } 5410 5411 static ssize_t 5412 reshape_position_store(struct mddev *mddev, const char *buf, size_t len) 5413 { 5414 struct md_rdev *rdev; 5415 unsigned long long new; 5416 int err; 5417 5418 err = kstrtoull(buf, 10, &new); 5419 if (err < 0) 5420 return err; 5421 if (new != (sector_t)new) 5422 return -EINVAL; 5423 err = mddev_lock(mddev); 5424 if (err) 5425 return err; 5426 err = -EBUSY; 5427 if (mddev->pers) 5428 goto unlock; 5429 mddev->reshape_position = new; 5430 mddev->delta_disks = 0; 5431 mddev->reshape_backwards = 0; 5432 mddev->new_level = mddev->level; 5433 mddev->new_layout = mddev->layout; 5434 mddev->new_chunk_sectors = mddev->chunk_sectors; 5435 rdev_for_each(rdev, mddev) 5436 rdev->new_data_offset = rdev->data_offset; 5437 err = 0; 5438 unlock: 5439 mddev_unlock(mddev); 5440 return err ?: len; 5441 } 5442 5443 static struct md_sysfs_entry md_reshape_position = 5444 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 5445 reshape_position_store); 5446 5447 static ssize_t 5448 reshape_direction_show(struct mddev *mddev, char *page) 5449 { 5450 return sprintf(page, "%s\n", 5451 mddev->reshape_backwards ? 
"backwards" : "forwards"); 5452 } 5453 5454 static ssize_t 5455 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) 5456 { 5457 int backwards = 0; 5458 int err; 5459 5460 if (cmd_match(buf, "forwards")) 5461 backwards = 0; 5462 else if (cmd_match(buf, "backwards")) 5463 backwards = 1; 5464 else 5465 return -EINVAL; 5466 if (mddev->reshape_backwards == backwards) 5467 return len; 5468 5469 err = mddev_lock(mddev); 5470 if (err) 5471 return err; 5472 /* check if we are allowed to change */ 5473 if (mddev->delta_disks) 5474 err = -EBUSY; 5475 else if (mddev->persistent && 5476 mddev->major_version == 0) 5477 err = -EINVAL; 5478 else 5479 mddev->reshape_backwards = backwards; 5480 mddev_unlock(mddev); 5481 return err ?: len; 5482 } 5483 5484 static struct md_sysfs_entry md_reshape_direction = 5485 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, 5486 reshape_direction_store); 5487 5488 static ssize_t 5489 array_size_show(struct mddev *mddev, char *page) 5490 { 5491 if (mddev->external_size) 5492 return sprintf(page, "%llu\n", 5493 (unsigned long long)mddev->array_sectors/2); 5494 else 5495 return sprintf(page, "default\n"); 5496 } 5497 5498 static ssize_t 5499 array_size_store(struct mddev *mddev, const char *buf, size_t len) 5500 { 5501 sector_t sectors; 5502 int err; 5503 5504 err = mddev_lock(mddev); 5505 if (err) 5506 return err; 5507 5508 /* cluster raid doesn't support change array_sectors */ 5509 if (mddev_is_clustered(mddev)) { 5510 mddev_unlock(mddev); 5511 return -EINVAL; 5512 } 5513 5514 if (strncmp(buf, "default", 7) == 0) { 5515 if (mddev->pers) 5516 sectors = mddev->pers->size(mddev, 0, 0); 5517 else 5518 sectors = mddev->array_sectors; 5519 5520 mddev->external_size = 0; 5521 } else { 5522 if (strict_blocks_to_sectors(buf, §ors) < 0) 5523 err = -EINVAL; 5524 else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) 5525 err = -E2BIG; 5526 else 5527 mddev->external_size = 1; 5528 } 5529 5530 if (!err) { 5531 mddev->array_sectors = sectors; 5532 if (mddev->pers) 5533 set_capacity_and_notify(mddev->gendisk, 5534 mddev->array_sectors); 5535 } 5536 mddev_unlock(mddev); 5537 return err ?: len; 5538 } 5539 5540 static struct md_sysfs_entry md_array_size = 5541 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, 5542 array_size_store); 5543 5544 static ssize_t 5545 consistency_policy_show(struct mddev *mddev, char *page) 5546 { 5547 int ret; 5548 5549 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 5550 ret = sprintf(page, "journal\n"); 5551 } else if (test_bit(MD_HAS_PPL, &mddev->flags)) { 5552 ret = sprintf(page, "ppl\n"); 5553 } else if (mddev->bitmap) { 5554 ret = sprintf(page, "bitmap\n"); 5555 } else if (mddev->pers) { 5556 if (mddev->pers->sync_request) 5557 ret = sprintf(page, "resync\n"); 5558 else 5559 ret = sprintf(page, "none\n"); 5560 } else { 5561 ret = sprintf(page, "unknown\n"); 5562 } 5563 5564 return ret; 5565 } 5566 5567 static ssize_t 5568 consistency_policy_store(struct mddev *mddev, const char *buf, size_t len) 5569 { 5570 int err = 0; 5571 5572 if (mddev->pers) { 5573 if (mddev->pers->change_consistency_policy) 5574 err = mddev->pers->change_consistency_policy(mddev, buf); 5575 else 5576 err = -EBUSY; 5577 } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) { 5578 set_bit(MD_HAS_PPL, &mddev->flags); 5579 } else { 5580 err = -EINVAL; 5581 } 5582 5583 return err ? 
err : len; 5584 } 5585 5586 static struct md_sysfs_entry md_consistency_policy = 5587 __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show, 5588 consistency_policy_store); 5589 5590 static ssize_t fail_last_dev_show(struct mddev *mddev, char *page) 5591 { 5592 return sprintf(page, "%d\n", mddev->fail_last_dev); 5593 } 5594 5595 /* 5596 * Setting fail_last_dev to true to allow last device to be forcibly removed 5597 * from RAID1/RAID10. 5598 */ 5599 static ssize_t 5600 fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len) 5601 { 5602 int ret; 5603 bool value; 5604 5605 ret = kstrtobool(buf, &value); 5606 if (ret) 5607 return ret; 5608 5609 if (value != mddev->fail_last_dev) 5610 mddev->fail_last_dev = value; 5611 5612 return len; 5613 } 5614 static struct md_sysfs_entry md_fail_last_dev = 5615 __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, 5616 fail_last_dev_store); 5617 5618 static ssize_t serialize_policy_show(struct mddev *mddev, char *page) 5619 { 5620 if (mddev->pers == NULL || (mddev->pers->level != 1)) 5621 return sprintf(page, "n/a\n"); 5622 else 5623 return sprintf(page, "%d\n", mddev->serialize_policy); 5624 } 5625 5626 /* 5627 * Setting serialize_policy to true to enforce write IO is not reordered 5628 * for raid1. 5629 */ 5630 static ssize_t 5631 serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) 5632 { 5633 int err; 5634 bool value; 5635 5636 err = kstrtobool(buf, &value); 5637 if (err) 5638 return err; 5639 5640 if (value == mddev->serialize_policy) 5641 return len; 5642 5643 err = mddev_suspend_and_lock(mddev); 5644 if (err) 5645 return err; 5646 if (mddev->pers == NULL || (mddev->pers->level != 1)) { 5647 pr_err("md: serialize_policy is only effective for raid1\n"); 5648 err = -EINVAL; 5649 goto unlock; 5650 } 5651 5652 if (value) 5653 mddev_create_serial_pool(mddev, NULL); 5654 else 5655 mddev_destroy_serial_pool(mddev, NULL); 5656 mddev->serialize_policy = value; 5657 unlock: 5658 mddev_unlock_and_resume(mddev); 5659 return err ?: len; 5660 } 5661 5662 static struct md_sysfs_entry md_serialize_policy = 5663 __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, 5664 serialize_policy_store); 5665 5666 5667 static struct attribute *md_default_attrs[] = { 5668 &md_level.attr, 5669 &md_layout.attr, 5670 &md_raid_disks.attr, 5671 &md_uuid.attr, 5672 &md_chunk_size.attr, 5673 &md_size.attr, 5674 &md_resync_start.attr, 5675 &md_metadata.attr, 5676 &md_new_device.attr, 5677 &md_safe_delay.attr, 5678 &md_array_state.attr, 5679 &md_reshape_position.attr, 5680 &md_reshape_direction.attr, 5681 &md_array_size.attr, 5682 &max_corr_read_errors.attr, 5683 &md_consistency_policy.attr, 5684 &md_fail_last_dev.attr, 5685 &md_serialize_policy.attr, 5686 NULL, 5687 }; 5688 5689 static const struct attribute_group md_default_group = { 5690 .attrs = md_default_attrs, 5691 }; 5692 5693 static struct attribute *md_redundancy_attrs[] = { 5694 &md_scan_mode.attr, 5695 &md_last_scan_mode.attr, 5696 &md_mismatches.attr, 5697 &md_sync_min.attr, 5698 &md_sync_max.attr, 5699 &md_sync_speed.attr, 5700 &md_sync_force_parallel.attr, 5701 &md_sync_completed.attr, 5702 &md_min_sync.attr, 5703 &md_max_sync.attr, 5704 &md_suspend_lo.attr, 5705 &md_suspend_hi.attr, 5706 &md_bitmap.attr, 5707 &md_degraded.attr, 5708 NULL, 5709 }; 5710 static const struct attribute_group md_redundancy_group = { 5711 .name = NULL, 5712 .attrs = md_redundancy_attrs, 5713 }; 5714 5715 static const struct attribute_group *md_attr_groups[] = { 5716 
&md_default_group, 5717 &md_bitmap_group, 5718 NULL, 5719 }; 5720 5721 static ssize_t 5722 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 5723 { 5724 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 5725 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 5726 ssize_t rv; 5727 5728 if (!entry->show) 5729 return -EIO; 5730 spin_lock(&all_mddevs_lock); 5731 if (!mddev_get(mddev)) { 5732 spin_unlock(&all_mddevs_lock); 5733 return -EBUSY; 5734 } 5735 spin_unlock(&all_mddevs_lock); 5736 5737 rv = entry->show(mddev, page); 5738 mddev_put(mddev); 5739 return rv; 5740 } 5741 5742 static ssize_t 5743 md_attr_store(struct kobject *kobj, struct attribute *attr, 5744 const char *page, size_t length) 5745 { 5746 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 5747 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 5748 ssize_t rv; 5749 5750 if (!entry->store) 5751 return -EIO; 5752 if (!capable(CAP_SYS_ADMIN)) 5753 return -EACCES; 5754 spin_lock(&all_mddevs_lock); 5755 if (!mddev_get(mddev)) { 5756 spin_unlock(&all_mddevs_lock); 5757 return -EBUSY; 5758 } 5759 spin_unlock(&all_mddevs_lock); 5760 rv = entry->store(mddev, page, length); 5761 mddev_put(mddev); 5762 return rv; 5763 } 5764 5765 static void md_kobj_release(struct kobject *ko) 5766 { 5767 struct mddev *mddev = container_of(ko, struct mddev, kobj); 5768 5769 if (mddev->sysfs_state) 5770 sysfs_put(mddev->sysfs_state); 5771 if (mddev->sysfs_level) 5772 sysfs_put(mddev->sysfs_level); 5773 5774 del_gendisk(mddev->gendisk); 5775 put_disk(mddev->gendisk); 5776 } 5777 5778 static const struct sysfs_ops md_sysfs_ops = { 5779 .show = md_attr_show, 5780 .store = md_attr_store, 5781 }; 5782 static const struct kobj_type md_ktype = { 5783 .release = md_kobj_release, 5784 .sysfs_ops = &md_sysfs_ops, 5785 .default_groups = md_attr_groups, 5786 }; 5787 5788 int mdp_major = 0; 5789 5790 /* stack the limit for all rdevs into lim */ 5791 int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim, 5792 unsigned int flags) 5793 { 5794 struct md_rdev *rdev; 5795 5796 rdev_for_each(rdev, mddev) { 5797 queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset, 5798 mddev->gendisk->disk_name); 5799 if ((flags & MDDEV_STACK_INTEGRITY) && 5800 !queue_limits_stack_integrity_bdev(lim, rdev->bdev)) 5801 return -EINVAL; 5802 } 5803 5804 return 0; 5805 } 5806 EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits); 5807 5808 /* apply the extra stacking limits from a new rdev into mddev */ 5809 int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev) 5810 { 5811 struct queue_limits lim; 5812 5813 if (mddev_is_dm(mddev)) 5814 return 0; 5815 5816 lim = queue_limits_start_update(mddev->gendisk->queue); 5817 queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset, 5818 mddev->gendisk->disk_name); 5819 5820 if (!queue_limits_stack_integrity_bdev(&lim, rdev->bdev)) { 5821 pr_err("%s: incompatible integrity profile for %pg\n", 5822 mdname(mddev), rdev->bdev); 5823 queue_limits_cancel_update(mddev->gendisk->queue); 5824 return -ENXIO; 5825 } 5826 5827 return queue_limits_commit_update(mddev->gendisk->queue, &lim); 5828 } 5829 EXPORT_SYMBOL_GPL(mddev_stack_new_rdev); 5830 5831 /* update the optimal I/O size after a reshape */ 5832 void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes) 5833 { 5834 struct queue_limits lim; 5835 5836 if (mddev_is_dm(mddev)) 5837 return; 5838 5839 /* don't bother updating io_opt if we can't suspend the array */ 
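/* io_opt becomes io_min times nr_stripes, i.e. (typically) one full data stripe, so submitters can size their writes for full stripes after the reshape. */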
5840 if (mddev_suspend(mddev, false) < 0) 5841 return; 5842 lim = queue_limits_start_update(mddev->gendisk->queue); 5843 lim.io_opt = lim.io_min * nr_stripes; 5844 queue_limits_commit_update(mddev->gendisk->queue, &lim); 5845 mddev_resume(mddev); 5846 } 5847 EXPORT_SYMBOL_GPL(mddev_update_io_opt); 5848 5849 static void mddev_delayed_delete(struct work_struct *ws) 5850 { 5851 struct mddev *mddev = container_of(ws, struct mddev, del_work); 5852 5853 kobject_put(&mddev->kobj); 5854 } 5855 5856 void md_init_stacking_limits(struct queue_limits *lim) 5857 { 5858 blk_set_stacking_limits(lim); 5859 lim->features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | 5860 BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; 5861 } 5862 EXPORT_SYMBOL_GPL(md_init_stacking_limits); 5863 5864 struct mddev *md_alloc(dev_t dev, char *name) 5865 { 5866 /* 5867 * If dev is zero, name is the name of a device to allocate with 5868 * an arbitrary minor number. It will be "md_???" 5869 * If dev is non-zero it must be a device number with a MAJOR of 5870 * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then 5871 * the device is being created by opening a node in /dev. 5872 * If "name" is not NULL, the device is being created by 5873 * writing to /sys/module/md_mod/parameters/new_array. 5874 */ 5875 static DEFINE_MUTEX(disks_mutex); 5876 struct mddev *mddev; 5877 struct gendisk *disk; 5878 int partitioned; 5879 int shift; 5880 int unit; 5881 int error; 5882 5883 /* 5884 * Wait for any previous instance of this device to be completely 5885 * removed (mddev_delayed_delete). 5886 */ 5887 flush_workqueue(md_misc_wq); 5888 5889 mutex_lock(&disks_mutex); 5890 mddev = mddev_alloc(dev); 5891 if (IS_ERR(mddev)) { 5892 error = PTR_ERR(mddev); 5893 goto out_unlock; 5894 } 5895 5896 partitioned = (MAJOR(mddev->unit) != MD_MAJOR); 5897 shift = partitioned ? MdpMinorShift : 0; 5898 unit = MINOR(mddev->unit) >> shift; 5899 5900 if (name && !dev) { 5901 /* Need to ensure that 'name' is not a duplicate. 5902 */ 5903 struct mddev *mddev2; 5904 spin_lock(&all_mddevs_lock); 5905 5906 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 5907 if (mddev2->gendisk && 5908 strcmp(mddev2->gendisk->disk_name, name) == 0) { 5909 spin_unlock(&all_mddevs_lock); 5910 error = -EEXIST; 5911 goto out_free_mddev; 5912 } 5913 spin_unlock(&all_mddevs_lock); 5914 } 5915 if (name && dev) 5916 /* 5917 * Creating /dev/mdNNN via "newarray", so adjust hold_active. 5918 */ 5919 mddev->hold_active = UNTIL_STOP; 5920 5921 disk = blk_alloc_disk(NULL, NUMA_NO_NODE); 5922 if (IS_ERR(disk)) { 5923 error = PTR_ERR(disk); 5924 goto out_free_mddev; 5925 } 5926 5927 disk->major = MAJOR(mddev->unit); 5928 disk->first_minor = unit << shift; 5929 disk->minors = 1 << shift; 5930 if (name) 5931 strcpy(disk->disk_name, name); 5932 else if (partitioned) 5933 sprintf(disk->disk_name, "md_d%d", unit); 5934 else 5935 sprintf(disk->disk_name, "md%d", unit); 5936 disk->fops = &md_fops; 5937 disk->private_data = mddev; 5938 5939 disk->events |= DISK_EVENT_MEDIA_CHANGE; 5940 mddev->gendisk = disk; 5941 error = add_disk(disk); 5942 if (error) 5943 goto out_put_disk; 5944 5945 kobject_init(&mddev->kobj, &md_ktype); 5946 error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); 5947 if (error) { 5948 /* 5949 * The disk is already live at this point. Clear the hold flag 5950 * and let mddev_put take care of the deletion, as it isn't any 5951 * different from a normal close on last release now. 
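* (mddev_put() ends up scheduling mddev_delayed_delete() above, which
* drops the final kobject reference.)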
5952 */ 5953 mddev->hold_active = 0; 5954 mutex_unlock(&disks_mutex); 5955 mddev_put(mddev); 5956 return ERR_PTR(error); 5957 } 5958 5959 kobject_uevent(&mddev->kobj, KOBJ_ADD); 5960 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); 5961 mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); 5962 mutex_unlock(&disks_mutex); 5963 return mddev; 5964 5965 out_put_disk: 5966 put_disk(disk); 5967 out_free_mddev: 5968 mddev_free(mddev); 5969 out_unlock: 5970 mutex_unlock(&disks_mutex); 5971 return ERR_PTR(error); 5972 } 5973 5974 static int md_alloc_and_put(dev_t dev, char *name) 5975 { 5976 struct mddev *mddev = md_alloc(dev, name); 5977 5978 if (IS_ERR(mddev)) 5979 return PTR_ERR(mddev); 5980 mddev_put(mddev); 5981 return 0; 5982 } 5983 5984 static void md_probe(dev_t dev) 5985 { 5986 if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512) 5987 return; 5988 if (create_on_open) 5989 md_alloc_and_put(dev, NULL); 5990 } 5991 5992 static int add_named_array(const char *val, const struct kernel_param *kp) 5993 { 5994 /* 5995 * val must be "md_*" or "mdNNN". 5996 * For "md_*" we allocate an array with a large free minor number, and 5997 * set the name to val. val must not already be an active name. 5998 * For "mdNNN" we allocate an array with the minor number NNN 5999 * which must not already be in use. 6000 */ 6001 int len = strlen(val); 6002 char buf[DISK_NAME_LEN]; 6003 unsigned long devnum; 6004 6005 while (len && val[len-1] == '\n') 6006 len--; 6007 if (len >= DISK_NAME_LEN) 6008 return -E2BIG; 6009 strscpy(buf, val, len+1); 6010 if (strncmp(buf, "md_", 3) == 0) 6011 return md_alloc_and_put(0, buf); 6012 if (strncmp(buf, "md", 2) == 0 && 6013 isdigit(buf[2]) && 6014 kstrtoul(buf+2, 10, &devnum) == 0 && 6015 devnum <= MINORMASK) 6016 return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL); 6017 6018 return -EINVAL; 6019 } 6020 6021 static void md_safemode_timeout(struct timer_list *t) 6022 { 6023 struct mddev *mddev = from_timer(mddev, t, safemode_timer); 6024 6025 mddev->safemode = 1; 6026 if (mddev->external) 6027 sysfs_notify_dirent_safe(mddev->sysfs_state); 6028 6029 md_wakeup_thread(mddev->thread); 6030 } 6031 6032 static int start_dirty_degraded; 6033 6034 int md_run(struct mddev *mddev) 6035 { 6036 int err; 6037 struct md_rdev *rdev; 6038 struct md_personality *pers; 6039 bool nowait = true; 6040 6041 if (list_empty(&mddev->disks)) 6042 /* cannot run an array with no devices.. */ 6043 return -EINVAL; 6044 6045 if (mddev->pers) 6046 return -EBUSY; 6047 /* Cannot run until previous stop completes properly */ 6048 if (mddev->sysfs_active) 6049 return -EBUSY; 6050 6051 /* 6052 * Analyze all RAID superblock(s) 6053 */ 6054 if (!mddev->raid_disks) { 6055 if (!mddev->persistent) 6056 return -EINVAL; 6057 err = analyze_sbs(mddev); 6058 if (err) 6059 return -EINVAL; 6060 } 6061 6062 if (mddev->level != LEVEL_NONE) 6063 request_module("md-level-%d", mddev->level); 6064 else if (mddev->clevel[0]) 6065 request_module("md-%s", mddev->clevel); 6066 6067 /* 6068 * Drop all container device buffers, from now on 6069 * the only valid external interface is through the md 6070 * device. 
6071 */ 6072 mddev->has_superblocks = false; 6073 rdev_for_each(rdev, mddev) { 6074 if (test_bit(Faulty, &rdev->flags)) 6075 continue; 6076 sync_blockdev(rdev->bdev); 6077 invalidate_bdev(rdev->bdev); 6078 if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) { 6079 mddev->ro = MD_RDONLY; 6080 if (!mddev_is_dm(mddev)) 6081 set_disk_ro(mddev->gendisk, 1); 6082 } 6083 6084 if (rdev->sb_page) 6085 mddev->has_superblocks = true; 6086 6087 /* perform some consistency tests on the device. 6088 * We don't want the data to overlap the metadata, 6089 * Internal Bitmap issues have been handled elsewhere. 6090 */ 6091 if (rdev->meta_bdev) { 6092 /* Nothing to check */; 6093 } else if (rdev->data_offset < rdev->sb_start) { 6094 if (mddev->dev_sectors && 6095 rdev->data_offset + mddev->dev_sectors 6096 > rdev->sb_start) { 6097 pr_warn("md: %s: data overlaps metadata\n", 6098 mdname(mddev)); 6099 return -EINVAL; 6100 } 6101 } else { 6102 if (rdev->sb_start + rdev->sb_size/512 6103 > rdev->data_offset) { 6104 pr_warn("md: %s: metadata overlaps data\n", 6105 mdname(mddev)); 6106 return -EINVAL; 6107 } 6108 } 6109 sysfs_notify_dirent_safe(rdev->sysfs_state); 6110 nowait = nowait && bdev_nowait(rdev->bdev); 6111 } 6112 6113 if (!bioset_initialized(&mddev->bio_set)) { 6114 err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 6115 if (err) 6116 return err; 6117 } 6118 if (!bioset_initialized(&mddev->sync_set)) { 6119 err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 6120 if (err) 6121 goto exit_bio_set; 6122 } 6123 6124 if (!bioset_initialized(&mddev->io_clone_set)) { 6125 err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE, 6126 offsetof(struct md_io_clone, bio_clone), 0); 6127 if (err) 6128 goto exit_sync_set; 6129 } 6130 6131 spin_lock(&pers_lock); 6132 pers = find_pers(mddev->level, mddev->clevel); 6133 if (!pers || !try_module_get(pers->owner)) { 6134 spin_unlock(&pers_lock); 6135 if (mddev->level != LEVEL_NONE) 6136 pr_warn("md: personality for level %d is not loaded!\n", 6137 mddev->level); 6138 else 6139 pr_warn("md: personality for level %s is not loaded!\n", 6140 mddev->clevel); 6141 err = -EINVAL; 6142 goto abort; 6143 } 6144 spin_unlock(&pers_lock); 6145 if (mddev->level != pers->level) { 6146 mddev->level = pers->level; 6147 mddev->new_level = pers->level; 6148 } 6149 strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 6150 6151 if (mddev->reshape_position != MaxSector && 6152 pers->start_reshape == NULL) { 6153 /* This personality cannot handle reshaping... */ 6154 module_put(pers->owner); 6155 err = -EINVAL; 6156 goto abort; 6157 } 6158 6159 if (pers->sync_request) { 6160 /* Warn if this is a potentially silly 6161 * configuration. 
6162 */ 6163 struct md_rdev *rdev2; 6164 int warned = 0; 6165 6166 rdev_for_each(rdev, mddev) 6167 rdev_for_each(rdev2, mddev) { 6168 if (rdev < rdev2 && 6169 rdev->bdev->bd_disk == 6170 rdev2->bdev->bd_disk) { 6171 pr_warn("%s: WARNING: %pg appears to be on the same physical disk as %pg.\n", 6172 mdname(mddev), 6173 rdev->bdev, 6174 rdev2->bdev); 6175 warned = 1; 6176 } 6177 } 6178 6179 if (warned) 6180 pr_warn("True protection against single-disk failure might be compromised.\n"); 6181 } 6182 6183 /* dm-raid expect sync_thread to be frozen until resume */ 6184 if (mddev->gendisk) 6185 mddev->recovery = 0; 6186 6187 /* may be over-ridden by personality */ 6188 mddev->resync_max_sectors = mddev->dev_sectors; 6189 6190 mddev->ok_start_degraded = start_dirty_degraded; 6191 6192 if (start_readonly && md_is_rdwr(mddev)) 6193 mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */ 6194 6195 err = pers->run(mddev); 6196 if (err) 6197 pr_warn("md: pers->run() failed ...\n"); 6198 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { 6199 WARN_ONCE(!mddev->external_size, 6200 "%s: default size too small, but 'external_size' not in effect?\n", 6201 __func__); 6202 pr_warn("md: invalid array_size %llu > default size %llu\n", 6203 (unsigned long long)mddev->array_sectors / 2, 6204 (unsigned long long)pers->size(mddev, 0, 0) / 2); 6205 err = -EINVAL; 6206 } 6207 if (err == 0 && pers->sync_request && 6208 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { 6209 struct bitmap *bitmap; 6210 6211 bitmap = md_bitmap_create(mddev, -1); 6212 if (IS_ERR(bitmap)) { 6213 err = PTR_ERR(bitmap); 6214 pr_warn("%s: failed to create bitmap (%d)\n", 6215 mdname(mddev), err); 6216 } else 6217 mddev->bitmap = bitmap; 6218 6219 } 6220 if (err) 6221 goto bitmap_abort; 6222 6223 if (mddev->bitmap_info.max_write_behind > 0) { 6224 bool create_pool = false; 6225 6226 rdev_for_each(rdev, mddev) { 6227 if (test_bit(WriteMostly, &rdev->flags) && 6228 rdev_init_serial(rdev)) 6229 create_pool = true; 6230 } 6231 if (create_pool && mddev->serial_info_pool == NULL) { 6232 mddev->serial_info_pool = 6233 mempool_create_kmalloc_pool(NR_SERIAL_INFOS, 6234 sizeof(struct serial_info)); 6235 if (!mddev->serial_info_pool) { 6236 err = -ENOMEM; 6237 goto bitmap_abort; 6238 } 6239 } 6240 } 6241 6242 if (pers->sync_request) { 6243 if (mddev->kobj.sd && 6244 sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 6245 pr_warn("md: cannot register extra attributes for %s\n", 6246 mdname(mddev)); 6247 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); 6248 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); 6249 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); 6250 } else if (mddev->ro == MD_AUTO_READ) 6251 mddev->ro = MD_RDWR; 6252 6253 atomic_set(&mddev->max_corr_read_errors, 6254 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); 6255 mddev->safemode = 0; 6256 if (mddev_is_clustered(mddev)) 6257 mddev->safemode_delay = 0; 6258 else 6259 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; 6260 mddev->in_sync = 1; 6261 smp_wmb(); 6262 spin_lock(&mddev->lock); 6263 mddev->pers = pers; 6264 spin_unlock(&mddev->lock); 6265 rdev_for_each(rdev, mddev) 6266 if (rdev->raid_disk >= 0) 6267 sysfs_link_rdev(mddev, rdev); /* failure here is OK */ 6268 6269 if (mddev->degraded && md_is_rdwr(mddev)) 6270 /* This ensures that recovering status is reported immediately 6271 * via sysfs - until a lack of spares is confirmed. 
6272 */ 6273 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6274 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6275 6276 if (mddev->sb_flags) 6277 md_update_sb(mddev, 0); 6278 6279 md_new_event(); 6280 return 0; 6281 6282 bitmap_abort: 6283 mddev_detach(mddev); 6284 if (mddev->private) 6285 pers->free(mddev, mddev->private); 6286 mddev->private = NULL; 6287 module_put(pers->owner); 6288 md_bitmap_destroy(mddev); 6289 abort: 6290 bioset_exit(&mddev->io_clone_set); 6291 exit_sync_set: 6292 bioset_exit(&mddev->sync_set); 6293 exit_bio_set: 6294 bioset_exit(&mddev->bio_set); 6295 return err; 6296 } 6297 EXPORT_SYMBOL_GPL(md_run); 6298 6299 int do_md_run(struct mddev *mddev) 6300 { 6301 int err; 6302 6303 set_bit(MD_NOT_READY, &mddev->flags); 6304 err = md_run(mddev); 6305 if (err) 6306 goto out; 6307 err = md_bitmap_load(mddev); 6308 if (err) { 6309 md_bitmap_destroy(mddev); 6310 goto out; 6311 } 6312 6313 if (mddev_is_clustered(mddev)) 6314 md_allow_write(mddev); 6315 6316 /* run start up tasks that require md_thread */ 6317 md_start(mddev); 6318 6319 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 6320 6321 set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); 6322 clear_bit(MD_NOT_READY, &mddev->flags); 6323 mddev->changed = 1; 6324 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 6325 sysfs_notify_dirent_safe(mddev->sysfs_state); 6326 sysfs_notify_dirent_safe(mddev->sysfs_action); 6327 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 6328 out: 6329 clear_bit(MD_NOT_READY, &mddev->flags); 6330 return err; 6331 } 6332 6333 int md_start(struct mddev *mddev) 6334 { 6335 int ret = 0; 6336 6337 if (mddev->pers->start) { 6338 set_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6339 ret = mddev->pers->start(mddev); 6340 clear_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6341 md_wakeup_thread(mddev->sync_thread); 6342 } 6343 return ret; 6344 } 6345 EXPORT_SYMBOL_GPL(md_start); 6346 6347 static int restart_array(struct mddev *mddev) 6348 { 6349 struct gendisk *disk = mddev->gendisk; 6350 struct md_rdev *rdev; 6351 bool has_journal = false; 6352 bool has_readonly = false; 6353 6354 /* Complain if it has no devices */ 6355 if (list_empty(&mddev->disks)) 6356 return -ENXIO; 6357 if (!mddev->pers) 6358 return -EINVAL; 6359 if (md_is_rdwr(mddev)) 6360 return -EBUSY; 6361 6362 rcu_read_lock(); 6363 rdev_for_each_rcu(rdev, mddev) { 6364 if (test_bit(Journal, &rdev->flags) && 6365 !test_bit(Faulty, &rdev->flags)) 6366 has_journal = true; 6367 if (rdev_read_only(rdev)) 6368 has_readonly = true; 6369 } 6370 rcu_read_unlock(); 6371 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal) 6372 /* Don't restart rw with journal missing/faulty */ 6373 return -EINVAL; 6374 if (has_readonly) 6375 return -EROFS; 6376 6377 mddev->safemode = 0; 6378 mddev->ro = MD_RDWR; 6379 set_disk_ro(disk, 0); 6380 pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); 6381 /* Kick recovery or resync if necessary */ 6382 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6383 md_wakeup_thread(mddev->sync_thread); 6384 sysfs_notify_dirent_safe(mddev->sysfs_state); 6385 return 0; 6386 } 6387 6388 static void md_clean(struct mddev *mddev) 6389 { 6390 mddev->array_sectors = 0; 6391 mddev->external_size = 0; 6392 mddev->dev_sectors = 0; 6393 mddev->raid_disks = 0; 6394 mddev->recovery_cp = 0; 6395 mddev->resync_min = 0; 6396 mddev->resync_max = MaxSector; 6397 mddev->reshape_position = MaxSector; 6398 /* we still need mddev->external in export_rdev, do not clear it yet */ 6399 
mddev->persistent = 0; 6400 mddev->level = LEVEL_NONE; 6401 mddev->clevel[0] = 0; 6402 /* 6403 * Don't clear MD_CLOSING, or mddev can be opened again. 6404 * 'hold_active != 0' means mddev is still in the creation 6405 * process and will be used later. 6406 */ 6407 if (mddev->hold_active) 6408 mddev->flags = 0; 6409 else 6410 mddev->flags &= BIT_ULL_MASK(MD_CLOSING); 6411 mddev->sb_flags = 0; 6412 mddev->ro = MD_RDWR; 6413 mddev->metadata_type[0] = 0; 6414 mddev->chunk_sectors = 0; 6415 mddev->ctime = mddev->utime = 0; 6416 mddev->layout = 0; 6417 mddev->max_disks = 0; 6418 mddev->events = 0; 6419 mddev->can_decrease_events = 0; 6420 mddev->delta_disks = 0; 6421 mddev->reshape_backwards = 0; 6422 mddev->new_level = LEVEL_NONE; 6423 mddev->new_layout = 0; 6424 mddev->new_chunk_sectors = 0; 6425 mddev->curr_resync = MD_RESYNC_NONE; 6426 atomic64_set(&mddev->resync_mismatches, 0); 6427 mddev->suspend_lo = mddev->suspend_hi = 0; 6428 mddev->sync_speed_min = mddev->sync_speed_max = 0; 6429 mddev->recovery = 0; 6430 mddev->in_sync = 0; 6431 mddev->changed = 0; 6432 mddev->degraded = 0; 6433 mddev->safemode = 0; 6434 mddev->private = NULL; 6435 mddev->cluster_info = NULL; 6436 mddev->bitmap_info.offset = 0; 6437 mddev->bitmap_info.default_offset = 0; 6438 mddev->bitmap_info.default_space = 0; 6439 mddev->bitmap_info.chunksize = 0; 6440 mddev->bitmap_info.daemon_sleep = 0; 6441 mddev->bitmap_info.max_write_behind = 0; 6442 mddev->bitmap_info.nodes = 0; 6443 } 6444 6445 static void __md_stop_writes(struct mddev *mddev) 6446 { 6447 del_timer_sync(&mddev->safemode_timer); 6448 6449 if (mddev->pers && mddev->pers->quiesce) { 6450 mddev->pers->quiesce(mddev, 1); 6451 mddev->pers->quiesce(mddev, 0); 6452 } 6453 md_bitmap_flush(mddev); 6454 6455 if (md_is_rdwr(mddev) && 6456 ((!mddev->in_sync && !mddev_is_clustered(mddev)) || 6457 mddev->sb_flags)) { 6458 /* mark array as shutdown cleanly */ 6459 if (!mddev_is_clustered(mddev)) 6460 mddev->in_sync = 1; 6461 md_update_sb(mddev, 1); 6462 } 6463 /* disable policy to guarantee rdevs free resources for serialization */ 6464 mddev->serialize_policy = 0; 6465 mddev_destroy_serial_pool(mddev, NULL); 6466 } 6467 6468 void md_stop_writes(struct mddev *mddev) 6469 { 6470 mddev_lock_nointr(mddev); 6471 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6472 stop_sync_thread(mddev, true); 6473 __md_stop_writes(mddev); 6474 mddev_unlock(mddev); 6475 } 6476 EXPORT_SYMBOL_GPL(md_stop_writes); 6477 6478 static void mddev_detach(struct mddev *mddev) 6479 { 6480 md_bitmap_wait_behind_writes(mddev); 6481 if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { 6482 mddev->pers->quiesce(mddev, 1); 6483 mddev->pers->quiesce(mddev, 0); 6484 } 6485 md_unregister_thread(mddev, &mddev->thread); 6486 6487 /* the unplug fn references 'conf' */ 6488 if (!mddev_is_dm(mddev)) 6489 blk_sync_queue(mddev->gendisk->queue); 6490 } 6491 6492 static void __md_stop(struct mddev *mddev) 6493 { 6494 struct md_personality *pers = mddev->pers; 6495 md_bitmap_destroy(mddev); 6496 mddev_detach(mddev); 6497 spin_lock(&mddev->lock); 6498 mddev->pers = NULL; 6499 spin_unlock(&mddev->lock); 6500 if (mddev->private) 6501 pers->free(mddev, mddev->private); 6502 mddev->private = NULL; 6503 if (pers->sync_request && mddev->to_remove == NULL) 6504 mddev->to_remove = &md_redundancy_group; 6505 module_put(pers->owner); 6506 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6507 6508 bioset_exit(&mddev->bio_set); 6509 bioset_exit(&mddev->sync_set); 6510 bioset_exit(&mddev->io_clone_set); 6511 } 6512 
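/*
 * md_stop() tears down a running array on behalf of an external user
 * (dm-raid): it stops writes and releases the personality, but unlike
 * do_md_stop() it does not touch the gendisk or reset the array geometry.
 * The caller must hold reconfig_mutex, which is asserted below.  A minimal
 * calling sketch (illustrative, not copied from dm-raid):
 *
 *	mddev_lock_nointr(mddev);
 *	md_stop(mddev);
 *	mddev_unlock(mddev);
 */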
6513 void md_stop(struct mddev *mddev) 6514 { 6515 lockdep_assert_held(&mddev->reconfig_mutex); 6516 6517 /* stop the array and free an attached data structures. 6518 * This is called from dm-raid 6519 */ 6520 __md_stop_writes(mddev); 6521 __md_stop(mddev); 6522 } 6523 6524 EXPORT_SYMBOL_GPL(md_stop); 6525 6526 /* ensure 'mddev->pers' exist before calling md_set_readonly() */ 6527 static int md_set_readonly(struct mddev *mddev) 6528 { 6529 int err = 0; 6530 int did_freeze = 0; 6531 6532 if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 6533 return -EBUSY; 6534 6535 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 6536 did_freeze = 1; 6537 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6538 } 6539 6540 stop_sync_thread(mddev, false); 6541 wait_event(mddev->sb_wait, 6542 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 6543 mddev_lock_nointr(mddev); 6544 6545 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 6546 pr_warn("md: %s still in use.\n",mdname(mddev)); 6547 err = -EBUSY; 6548 goto out; 6549 } 6550 6551 __md_stop_writes(mddev); 6552 6553 if (mddev->ro == MD_RDONLY) { 6554 err = -ENXIO; 6555 goto out; 6556 } 6557 6558 mddev->ro = MD_RDONLY; 6559 set_disk_ro(mddev->gendisk, 1); 6560 6561 out: 6562 if (!err || did_freeze) { 6563 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6564 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6565 sysfs_notify_dirent_safe(mddev->sysfs_state); 6566 } 6567 6568 return err; 6569 } 6570 6571 /* mode: 6572 * 0 - completely stop and dis-assemble array 6573 * 2 - stop but do not disassemble array 6574 */ 6575 static int do_md_stop(struct mddev *mddev, int mode) 6576 { 6577 struct gendisk *disk = mddev->gendisk; 6578 struct md_rdev *rdev; 6579 int did_freeze = 0; 6580 6581 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 6582 did_freeze = 1; 6583 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6584 } 6585 6586 stop_sync_thread(mddev, true); 6587 6588 if (mddev->sysfs_active || 6589 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 6590 pr_warn("md: %s still in use.\n",mdname(mddev)); 6591 if (did_freeze) { 6592 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6593 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6594 } 6595 return -EBUSY; 6596 } 6597 if (mddev->pers) { 6598 if (!md_is_rdwr(mddev)) 6599 set_disk_ro(disk, 0); 6600 6601 __md_stop_writes(mddev); 6602 __md_stop(mddev); 6603 6604 /* tell userspace to handle 'inactive' */ 6605 sysfs_notify_dirent_safe(mddev->sysfs_state); 6606 6607 rdev_for_each(rdev, mddev) 6608 if (rdev->raid_disk >= 0) 6609 sysfs_unlink_rdev(mddev, rdev); 6610 6611 set_capacity_and_notify(disk, 0); 6612 mddev->changed = 1; 6613 6614 if (!md_is_rdwr(mddev)) 6615 mddev->ro = MD_RDWR; 6616 } 6617 /* 6618 * Free resources if final stop 6619 */ 6620 if (mode == 0) { 6621 pr_info("md: %s stopped.\n", mdname(mddev)); 6622 6623 if (mddev->bitmap_info.file) { 6624 struct file *f = mddev->bitmap_info.file; 6625 spin_lock(&mddev->lock); 6626 mddev->bitmap_info.file = NULL; 6627 spin_unlock(&mddev->lock); 6628 fput(f); 6629 } 6630 mddev->bitmap_info.offset = 0; 6631 6632 export_array(mddev); 6633 6634 md_clean(mddev); 6635 if (mddev->hold_active == UNTIL_STOP) 6636 mddev->hold_active = 0; 6637 } 6638 md_new_event(); 6639 sysfs_notify_dirent_safe(mddev->sysfs_state); 6640 return 0; 6641 } 6642 6643 #ifndef MODULE 6644 static void autorun_array(struct mddev *mddev) 6645 { 6646 struct md_rdev *rdev; 6647 int err; 6648 6649 if (list_empty(&mddev->disks)) 6650 return; 6651 6652 pr_info("md: running: "); 
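/* print every member on the "md: running:" line, then try to start the array */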
6653 6654 rdev_for_each(rdev, mddev) { 6655 pr_cont("<%pg>", rdev->bdev); 6656 } 6657 pr_cont("\n"); 6658 6659 err = do_md_run(mddev); 6660 if (err) { 6661 pr_warn("md: do_md_run() returned %d\n", err); 6662 do_md_stop(mddev, 0); 6663 } 6664 } 6665 6666 /* 6667 * lets try to run arrays based on all disks that have arrived 6668 * until now. (those are in pending_raid_disks) 6669 * 6670 * the method: pick the first pending disk, collect all disks with 6671 * the same UUID, remove all from the pending list and put them into 6672 * the 'same_array' list. Then order this list based on superblock 6673 * update time (freshest comes first), kick out 'old' disks and 6674 * compare superblocks. If everything's fine then run it. 6675 * 6676 * If "unit" is allocated, then bump its reference count 6677 */ 6678 static void autorun_devices(int part) 6679 { 6680 struct md_rdev *rdev0, *rdev, *tmp; 6681 struct mddev *mddev; 6682 6683 pr_info("md: autorun ...\n"); 6684 while (!list_empty(&pending_raid_disks)) { 6685 int unit; 6686 dev_t dev; 6687 LIST_HEAD(candidates); 6688 rdev0 = list_entry(pending_raid_disks.next, 6689 struct md_rdev, same_set); 6690 6691 pr_debug("md: considering %pg ...\n", rdev0->bdev); 6692 INIT_LIST_HEAD(&candidates); 6693 rdev_for_each_list(rdev, tmp, &pending_raid_disks) 6694 if (super_90_load(rdev, rdev0, 0) >= 0) { 6695 pr_debug("md: adding %pg ...\n", 6696 rdev->bdev); 6697 list_move(&rdev->same_set, &candidates); 6698 } 6699 /* 6700 * now we have a set of devices, with all of them having 6701 * mostly sane superblocks. It's time to allocate the 6702 * mddev. 6703 */ 6704 if (part) { 6705 dev = MKDEV(mdp_major, 6706 rdev0->preferred_minor << MdpMinorShift); 6707 unit = MINOR(dev) >> MdpMinorShift; 6708 } else { 6709 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 6710 unit = MINOR(dev); 6711 } 6712 if (rdev0->preferred_minor != unit) { 6713 pr_warn("md: unit number in %pg is bad: %d\n", 6714 rdev0->bdev, rdev0->preferred_minor); 6715 break; 6716 } 6717 6718 mddev = md_alloc(dev, NULL); 6719 if (IS_ERR(mddev)) 6720 break; 6721 6722 if (mddev_suspend_and_lock(mddev)) 6723 pr_warn("md: %s locked, cannot run\n", mdname(mddev)); 6724 else if (mddev->raid_disks || mddev->major_version 6725 || !list_empty(&mddev->disks)) { 6726 pr_warn("md: %s already running, cannot run %pg\n", 6727 mdname(mddev), rdev0->bdev); 6728 mddev_unlock_and_resume(mddev); 6729 } else { 6730 pr_debug("md: created %s\n", mdname(mddev)); 6731 mddev->persistent = 1; 6732 rdev_for_each_list(rdev, tmp, &candidates) { 6733 list_del_init(&rdev->same_set); 6734 if (bind_rdev_to_array(rdev, mddev)) 6735 export_rdev(rdev, mddev); 6736 } 6737 autorun_array(mddev); 6738 mddev_unlock_and_resume(mddev); 6739 } 6740 /* on success, candidates will be empty, on error 6741 * it won't... 6742 */ 6743 rdev_for_each_list(rdev, tmp, &candidates) { 6744 list_del_init(&rdev->same_set); 6745 export_rdev(rdev, mddev); 6746 } 6747 mddev_put(mddev); 6748 } 6749 pr_info("md: ... 
autorun DONE.\n"); 6750 } 6751 #endif /* !MODULE */ 6752 6753 static int get_version(void __user *arg) 6754 { 6755 mdu_version_t ver; 6756 6757 ver.major = MD_MAJOR_VERSION; 6758 ver.minor = MD_MINOR_VERSION; 6759 ver.patchlevel = MD_PATCHLEVEL_VERSION; 6760 6761 if (copy_to_user(arg, &ver, sizeof(ver))) 6762 return -EFAULT; 6763 6764 return 0; 6765 } 6766 6767 static int get_array_info(struct mddev *mddev, void __user *arg) 6768 { 6769 mdu_array_info_t info; 6770 int nr,working,insync,failed,spare; 6771 struct md_rdev *rdev; 6772 6773 nr = working = insync = failed = spare = 0; 6774 rcu_read_lock(); 6775 rdev_for_each_rcu(rdev, mddev) { 6776 nr++; 6777 if (test_bit(Faulty, &rdev->flags)) 6778 failed++; 6779 else { 6780 working++; 6781 if (test_bit(In_sync, &rdev->flags)) 6782 insync++; 6783 else if (test_bit(Journal, &rdev->flags)) 6784 /* TODO: add journal count to md_u.h */ 6785 ; 6786 else 6787 spare++; 6788 } 6789 } 6790 rcu_read_unlock(); 6791 6792 info.major_version = mddev->major_version; 6793 info.minor_version = mddev->minor_version; 6794 info.patch_version = MD_PATCHLEVEL_VERSION; 6795 info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); 6796 info.level = mddev->level; 6797 info.size = mddev->dev_sectors / 2; 6798 if (info.size != mddev->dev_sectors / 2) /* overflow */ 6799 info.size = -1; 6800 info.nr_disks = nr; 6801 info.raid_disks = mddev->raid_disks; 6802 info.md_minor = mddev->md_minor; 6803 info.not_persistent= !mddev->persistent; 6804 6805 info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 6806 info.state = 0; 6807 if (mddev->in_sync) 6808 info.state = (1<<MD_SB_CLEAN); 6809 if (mddev->bitmap && mddev->bitmap_info.offset) 6810 info.state |= (1<<MD_SB_BITMAP_PRESENT); 6811 if (mddev_is_clustered(mddev)) 6812 info.state |= (1<<MD_SB_CLUSTERED); 6813 info.active_disks = insync; 6814 info.working_disks = working; 6815 info.failed_disks = failed; 6816 info.spare_disks = spare; 6817 6818 info.layout = mddev->layout; 6819 info.chunk_size = mddev->chunk_sectors << 9; 6820 6821 if (copy_to_user(arg, &info, sizeof(info))) 6822 return -EFAULT; 6823 6824 return 0; 6825 } 6826 6827 static int get_bitmap_file(struct mddev *mddev, void __user * arg) 6828 { 6829 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 6830 char *ptr; 6831 int err; 6832 6833 file = kzalloc(sizeof(*file), GFP_NOIO); 6834 if (!file) 6835 return -ENOMEM; 6836 6837 err = 0; 6838 spin_lock(&mddev->lock); 6839 /* bitmap enabled */ 6840 if (mddev->bitmap_info.file) { 6841 ptr = file_path(mddev->bitmap_info.file, file->pathname, 6842 sizeof(file->pathname)); 6843 if (IS_ERR(ptr)) 6844 err = PTR_ERR(ptr); 6845 else 6846 memmove(file->pathname, ptr, 6847 sizeof(file->pathname)-(ptr-file->pathname)); 6848 } 6849 spin_unlock(&mddev->lock); 6850 6851 if (err == 0 && 6852 copy_to_user(arg, file, sizeof(*file))) 6853 err = -EFAULT; 6854 6855 kfree(file); 6856 return err; 6857 } 6858 6859 static int get_disk_info(struct mddev *mddev, void __user * arg) 6860 { 6861 mdu_disk_info_t info; 6862 struct md_rdev *rdev; 6863 6864 if (copy_from_user(&info, arg, sizeof(info))) 6865 return -EFAULT; 6866 6867 rcu_read_lock(); 6868 rdev = md_find_rdev_nr_rcu(mddev, info.number); 6869 if (rdev) { 6870 info.major = MAJOR(rdev->bdev->bd_dev); 6871 info.minor = MINOR(rdev->bdev->bd_dev); 6872 info.raid_disk = rdev->raid_disk; 6873 info.state = 0; 6874 if (test_bit(Faulty, &rdev->flags)) 6875 info.state |= (1<<MD_DISK_FAULTY); 6876 else if (test_bit(In_sync, &rdev->flags)) { 6877 info.state |= (1<<MD_DISK_ACTIVE); 
6878 info.state |= (1<<MD_DISK_SYNC); 6879 } 6880 if (test_bit(Journal, &rdev->flags)) 6881 info.state |= (1<<MD_DISK_JOURNAL); 6882 if (test_bit(WriteMostly, &rdev->flags)) 6883 info.state |= (1<<MD_DISK_WRITEMOSTLY); 6884 if (test_bit(FailFast, &rdev->flags)) 6885 info.state |= (1<<MD_DISK_FAILFAST); 6886 } else { 6887 info.major = info.minor = 0; 6888 info.raid_disk = -1; 6889 info.state = (1<<MD_DISK_REMOVED); 6890 } 6891 rcu_read_unlock(); 6892 6893 if (copy_to_user(arg, &info, sizeof(info))) 6894 return -EFAULT; 6895 6896 return 0; 6897 } 6898 6899 int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) 6900 { 6901 struct md_rdev *rdev; 6902 dev_t dev = MKDEV(info->major,info->minor); 6903 6904 if (mddev_is_clustered(mddev) && 6905 !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { 6906 pr_warn("%s: Cannot add to clustered mddev.\n", 6907 mdname(mddev)); 6908 return -EINVAL; 6909 } 6910 6911 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 6912 return -EOVERFLOW; 6913 6914 if (!mddev->raid_disks) { 6915 int err; 6916 /* expecting a device which has a superblock */ 6917 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 6918 if (IS_ERR(rdev)) { 6919 pr_warn("md: md_import_device returned %ld\n", 6920 PTR_ERR(rdev)); 6921 return PTR_ERR(rdev); 6922 } 6923 if (!list_empty(&mddev->disks)) { 6924 struct md_rdev *rdev0 6925 = list_entry(mddev->disks.next, 6926 struct md_rdev, same_set); 6927 err = super_types[mddev->major_version] 6928 .load_super(rdev, rdev0, mddev->minor_version); 6929 if (err < 0) { 6930 pr_warn("md: %pg has different UUID to %pg\n", 6931 rdev->bdev, 6932 rdev0->bdev); 6933 export_rdev(rdev, mddev); 6934 return -EINVAL; 6935 } 6936 } 6937 err = bind_rdev_to_array(rdev, mddev); 6938 if (err) 6939 export_rdev(rdev, mddev); 6940 return err; 6941 } 6942 6943 /* 6944 * md_add_new_disk can be used once the array is assembled 6945 * to add "hot spares". They must already have a superblock 6946 * written 6947 */ 6948 if (mddev->pers) { 6949 int err; 6950 if (!mddev->pers->hot_add_disk) { 6951 pr_warn("%s: personality does not support diskops!\n", 6952 mdname(mddev)); 6953 return -EINVAL; 6954 } 6955 if (mddev->persistent) 6956 rdev = md_import_device(dev, mddev->major_version, 6957 mddev->minor_version); 6958 else 6959 rdev = md_import_device(dev, -1, -1); 6960 if (IS_ERR(rdev)) { 6961 pr_warn("md: md_import_device returned %ld\n", 6962 PTR_ERR(rdev)); 6963 return PTR_ERR(rdev); 6964 } 6965 /* set saved_raid_disk if appropriate */ 6966 if (!mddev->persistent) { 6967 if (info->state & (1<<MD_DISK_SYNC) && 6968 info->raid_disk < mddev->raid_disks) { 6969 rdev->raid_disk = info->raid_disk; 6970 clear_bit(Bitmap_sync, &rdev->flags); 6971 } else 6972 rdev->raid_disk = -1; 6973 rdev->saved_raid_disk = rdev->raid_disk; 6974 } else 6975 super_types[mddev->major_version]. 6976 validate_super(mddev, NULL/*freshest*/, rdev); 6977 if ((info->state & (1<<MD_DISK_SYNC)) && 6978 rdev->raid_disk != info->raid_disk) { 6979 /* This was a hot-add request, but events doesn't 6980 * match, so reject it. 
6981 */ 6982 export_rdev(rdev, mddev); 6983 return -EINVAL; 6984 } 6985 6986 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 6987 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 6988 set_bit(WriteMostly, &rdev->flags); 6989 else 6990 clear_bit(WriteMostly, &rdev->flags); 6991 if (info->state & (1<<MD_DISK_FAILFAST)) 6992 set_bit(FailFast, &rdev->flags); 6993 else 6994 clear_bit(FailFast, &rdev->flags); 6995 6996 if (info->state & (1<<MD_DISK_JOURNAL)) { 6997 struct md_rdev *rdev2; 6998 bool has_journal = false; 6999 7000 /* make sure no existing journal disk */ 7001 rdev_for_each(rdev2, mddev) { 7002 if (test_bit(Journal, &rdev2->flags)) { 7003 has_journal = true; 7004 break; 7005 } 7006 } 7007 if (has_journal || mddev->bitmap) { 7008 export_rdev(rdev, mddev); 7009 return -EBUSY; 7010 } 7011 set_bit(Journal, &rdev->flags); 7012 } 7013 /* 7014 * check whether the device shows up in other nodes 7015 */ 7016 if (mddev_is_clustered(mddev)) { 7017 if (info->state & (1 << MD_DISK_CANDIDATE)) 7018 set_bit(Candidate, &rdev->flags); 7019 else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { 7020 /* --add initiated by this node */ 7021 err = md_cluster_ops->add_new_disk(mddev, rdev); 7022 if (err) { 7023 export_rdev(rdev, mddev); 7024 return err; 7025 } 7026 } 7027 } 7028 7029 rdev->raid_disk = -1; 7030 err = bind_rdev_to_array(rdev, mddev); 7031 7032 if (err) 7033 export_rdev(rdev, mddev); 7034 7035 if (mddev_is_clustered(mddev)) { 7036 if (info->state & (1 << MD_DISK_CANDIDATE)) { 7037 if (!err) { 7038 err = md_cluster_ops->new_disk_ack(mddev, 7039 err == 0); 7040 if (err) 7041 md_kick_rdev_from_array(rdev); 7042 } 7043 } else { 7044 if (err) 7045 md_cluster_ops->add_new_disk_cancel(mddev); 7046 else 7047 err = add_bound_rdev(rdev); 7048 } 7049 7050 } else if (!err) 7051 err = add_bound_rdev(rdev); 7052 7053 return err; 7054 } 7055 7056 /* otherwise, md_add_new_disk is only allowed 7057 * for major_version==0 superblocks 7058 */ 7059 if (mddev->major_version != 0) { 7060 pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev)); 7061 return -EINVAL; 7062 } 7063 7064 if (!(info->state & (1<<MD_DISK_FAULTY))) { 7065 int err; 7066 rdev = md_import_device(dev, -1, 0); 7067 if (IS_ERR(rdev)) { 7068 pr_warn("md: error, md_import_device() returned %ld\n", 7069 PTR_ERR(rdev)); 7070 return PTR_ERR(rdev); 7071 } 7072 rdev->desc_nr = info->number; 7073 if (info->raid_disk < mddev->raid_disks) 7074 rdev->raid_disk = info->raid_disk; 7075 else 7076 rdev->raid_disk = -1; 7077 7078 if (rdev->raid_disk < mddev->raid_disks) 7079 if (info->state & (1<<MD_DISK_SYNC)) 7080 set_bit(In_sync, &rdev->flags); 7081 7082 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 7083 set_bit(WriteMostly, &rdev->flags); 7084 if (info->state & (1<<MD_DISK_FAILFAST)) 7085 set_bit(FailFast, &rdev->flags); 7086 7087 if (!mddev->persistent) { 7088 pr_debug("md: nonpersistent superblock ...\n"); 7089 rdev->sb_start = bdev_nr_sectors(rdev->bdev); 7090 } else 7091 rdev->sb_start = calc_dev_sboffset(rdev); 7092 rdev->sectors = rdev->sb_start; 7093 7094 err = bind_rdev_to_array(rdev, mddev); 7095 if (err) { 7096 export_rdev(rdev, mddev); 7097 return err; 7098 } 7099 } 7100 7101 return 0; 7102 } 7103 7104 static int hot_remove_disk(struct mddev *mddev, dev_t dev) 7105 { 7106 struct md_rdev *rdev; 7107 7108 if (!mddev->pers) 7109 return -ENODEV; 7110 7111 rdev = find_rdev(mddev, dev); 7112 if (!rdev) 7113 return -ENXIO; 7114 7115 if (rdev->raid_disk < 0) 7116 goto kick_rdev; 7117 7118 clear_bit(Blocked, &rdev->flags); 7119 
remove_and_add_spares(mddev, rdev); 7120 7121 if (rdev->raid_disk >= 0) 7122 goto busy; 7123 7124 kick_rdev: 7125 if (mddev_is_clustered(mddev)) { 7126 if (md_cluster_ops->remove_disk(mddev, rdev)) 7127 goto busy; 7128 } 7129 7130 md_kick_rdev_from_array(rdev); 7131 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7132 if (!mddev->thread) 7133 md_update_sb(mddev, 1); 7134 md_new_event(); 7135 7136 return 0; 7137 busy: 7138 pr_debug("md: cannot remove active disk %pg from %s ...\n", 7139 rdev->bdev, mdname(mddev)); 7140 return -EBUSY; 7141 } 7142 7143 static int hot_add_disk(struct mddev *mddev, dev_t dev) 7144 { 7145 int err; 7146 struct md_rdev *rdev; 7147 7148 if (!mddev->pers) 7149 return -ENODEV; 7150 7151 if (mddev->major_version != 0) { 7152 pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n", 7153 mdname(mddev)); 7154 return -EINVAL; 7155 } 7156 if (!mddev->pers->hot_add_disk) { 7157 pr_warn("%s: personality does not support diskops!\n", 7158 mdname(mddev)); 7159 return -EINVAL; 7160 } 7161 7162 rdev = md_import_device(dev, -1, 0); 7163 if (IS_ERR(rdev)) { 7164 pr_warn("md: error, md_import_device() returned %ld\n", 7165 PTR_ERR(rdev)); 7166 return -EINVAL; 7167 } 7168 7169 if (mddev->persistent) 7170 rdev->sb_start = calc_dev_sboffset(rdev); 7171 else 7172 rdev->sb_start = bdev_nr_sectors(rdev->bdev); 7173 7174 rdev->sectors = rdev->sb_start; 7175 7176 if (test_bit(Faulty, &rdev->flags)) { 7177 pr_warn("md: can not hot-add faulty %pg disk to %s!\n", 7178 rdev->bdev, mdname(mddev)); 7179 err = -EINVAL; 7180 goto abort_export; 7181 } 7182 7183 clear_bit(In_sync, &rdev->flags); 7184 rdev->desc_nr = -1; 7185 rdev->saved_raid_disk = -1; 7186 err = bind_rdev_to_array(rdev, mddev); 7187 if (err) 7188 goto abort_export; 7189 7190 /* 7191 * The rest should better be atomic, we can have disk failures 7192 * noticed in interrupt contexts ... 7193 */ 7194 7195 rdev->raid_disk = -1; 7196 7197 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7198 if (!mddev->thread) 7199 md_update_sb(mddev, 1); 7200 /* 7201 * Kick recovery, maybe this spare has to be added to the 7202 * array immediately. 7203 */ 7204 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7205 md_new_event(); 7206 return 0; 7207 7208 abort_export: 7209 export_rdev(rdev, mddev); 7210 return err; 7211 } 7212 7213 static int set_bitmap_file(struct mddev *mddev, int fd) 7214 { 7215 int err = 0; 7216 7217 if (mddev->pers) { 7218 if (!mddev->pers->quiesce || !mddev->thread) 7219 return -EBUSY; 7220 if (mddev->recovery || mddev->sync_thread) 7221 return -EBUSY; 7222 /* we should be able to change the bitmap.. 
*/ 7223 } 7224 7225 if (fd >= 0) { 7226 struct inode *inode; 7227 struct file *f; 7228 7229 if (mddev->bitmap || mddev->bitmap_info.file) 7230 return -EEXIST; /* cannot add when bitmap is present */ 7231 7232 if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) { 7233 pr_warn("%s: bitmap files not supported by this kernel\n", 7234 mdname(mddev)); 7235 return -EINVAL; 7236 } 7237 pr_warn("%s: using deprecated bitmap file support\n", 7238 mdname(mddev)); 7239 7240 f = fget(fd); 7241 7242 if (f == NULL) { 7243 pr_warn("%s: error: failed to get bitmap file\n", 7244 mdname(mddev)); 7245 return -EBADF; 7246 } 7247 7248 inode = f->f_mapping->host; 7249 if (!S_ISREG(inode->i_mode)) { 7250 pr_warn("%s: error: bitmap file must be a regular file\n", 7251 mdname(mddev)); 7252 err = -EBADF; 7253 } else if (!(f->f_mode & FMODE_WRITE)) { 7254 pr_warn("%s: error: bitmap file must be opened for write\n", 7255 mdname(mddev)); 7256 err = -EBADF; 7257 } else if (atomic_read(&inode->i_writecount) != 1) { 7258 pr_warn("%s: error: bitmap file is already in use\n", 7259 mdname(mddev)); 7260 err = -EBUSY; 7261 } 7262 if (err) { 7263 fput(f); 7264 return err; 7265 } 7266 mddev->bitmap_info.file = f; 7267 mddev->bitmap_info.offset = 0; /* file overrides offset */ 7268 } else if (mddev->bitmap == NULL) 7269 return -ENOENT; /* cannot remove what isn't there */ 7270 err = 0; 7271 if (mddev->pers) { 7272 if (fd >= 0) { 7273 struct bitmap *bitmap; 7274 7275 bitmap = md_bitmap_create(mddev, -1); 7276 if (!IS_ERR(bitmap)) { 7277 mddev->bitmap = bitmap; 7278 err = md_bitmap_load(mddev); 7279 } else 7280 err = PTR_ERR(bitmap); 7281 if (err) { 7282 md_bitmap_destroy(mddev); 7283 fd = -1; 7284 } 7285 } else if (fd < 0) { 7286 md_bitmap_destroy(mddev); 7287 } 7288 } 7289 if (fd < 0) { 7290 struct file *f = mddev->bitmap_info.file; 7291 if (f) { 7292 spin_lock(&mddev->lock); 7293 mddev->bitmap_info.file = NULL; 7294 spin_unlock(&mddev->lock); 7295 fput(f); 7296 } 7297 } 7298 7299 return err; 7300 } 7301 7302 /* 7303 * md_set_array_info is used in two different ways. 7304 * The original usage is when creating a new array. 7305 * In this usage, raid_disks is > 0 and it together with 7306 * level, size, not_persistent, layout, chunksize determine the 7307 * shape of the array. 7308 * This will always create an array with a type-0.90.0 superblock. 7309 * The newer usage is when assembling an array. 7310 * In this case raid_disks will be 0, and the major_version field is 7311 * used to determine which style of superblock is to be found on the devices. 7312 * The minor and patch _version numbers are also kept in case the 7313 * super_block handler wishes to interpret them. 7314 */ 7315 int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info) 7316 { 7317 if (info->raid_disks == 0) { 7318 /* just setting version number for superblock loading */ 7319 if (info->major_version < 0 || 7320 info->major_version >= ARRAY_SIZE(super_types) || 7321 super_types[info->major_version].name == NULL) { 7322 /* maybe try to auto-load a module? */ 7323 pr_warn("md: superblock version %d not known\n", 7324 info->major_version); 7325 return -EINVAL; 7326 } 7327 mddev->major_version = info->major_version; 7328 mddev->minor_version = info->minor_version; 7329 mddev->patch_version = info->patch_version; 7330 mddev->persistent = !info->not_persistent; 7331 /* ensure mddev_put doesn't delete this now that there 7332 * is some minimal configuration.
7333 */ 7334 mddev->ctime = ktime_get_real_seconds(); 7335 return 0; 7336 } 7337 mddev->major_version = MD_MAJOR_VERSION; 7338 mddev->minor_version = MD_MINOR_VERSION; 7339 mddev->patch_version = MD_PATCHLEVEL_VERSION; 7340 mddev->ctime = ktime_get_real_seconds(); 7341 7342 mddev->level = info->level; 7343 mddev->clevel[0] = 0; 7344 mddev->dev_sectors = 2 * (sector_t)info->size; 7345 mddev->raid_disks = info->raid_disks; 7346 /* don't set md_minor, it is determined by which /dev/md* was 7347 * opened 7348 */ 7349 if (info->state & (1<<MD_SB_CLEAN)) 7350 mddev->recovery_cp = MaxSector; 7351 else 7352 mddev->recovery_cp = 0; 7353 mddev->persistent = ! info->not_persistent; 7354 mddev->external = 0; 7355 7356 mddev->layout = info->layout; 7357 if (mddev->level == 0) 7358 /* Cannot trust RAID0 layout info here */ 7359 mddev->layout = -1; 7360 mddev->chunk_sectors = info->chunk_size >> 9; 7361 7362 if (mddev->persistent) { 7363 mddev->max_disks = MD_SB_DISKS; 7364 mddev->flags = 0; 7365 mddev->sb_flags = 0; 7366 } 7367 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7368 7369 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 7370 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 7371 mddev->bitmap_info.offset = 0; 7372 7373 mddev->reshape_position = MaxSector; 7374 7375 /* 7376 * Generate a 128-bit UUID 7377 */ 7378 get_random_bytes(mddev->uuid, 16); 7379 7380 mddev->new_level = mddev->level; 7381 mddev->new_chunk_sectors = mddev->chunk_sectors; 7382 mddev->new_layout = mddev->layout; 7383 mddev->delta_disks = 0; 7384 mddev->reshape_backwards = 0; 7385 7386 return 0; 7387 } 7388 7389 void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) 7390 { 7391 lockdep_assert_held(&mddev->reconfig_mutex); 7392 7393 if (mddev->external_size) 7394 return; 7395 7396 mddev->array_sectors = array_sectors; 7397 } 7398 EXPORT_SYMBOL(md_set_array_sectors); 7399 7400 static int update_size(struct mddev *mddev, sector_t num_sectors) 7401 { 7402 struct md_rdev *rdev; 7403 int rv; 7404 int fit = (num_sectors == 0); 7405 sector_t old_dev_sectors = mddev->dev_sectors; 7406 7407 if (mddev->pers->resize == NULL) 7408 return -EINVAL; 7409 /* The "num_sectors" is the number of sectors of each device that 7410 * is used. This can only make sense for arrays with redundancy. 7411 * linear and raid0 always use whatever space is available. We can only 7412 * consider changing this number if no resync or reconstruction is 7413 * happening, and if the new size is acceptable. It must fit before the 7414 * sb_start or, if that is <data_offset, it must fit before the size 7415 * of each device. If num_sectors is zero, we find the largest size 7416 * that fits.
7417 */ 7418 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 7419 return -EBUSY; 7420 if (!md_is_rdwr(mddev)) 7421 return -EROFS; 7422 7423 rdev_for_each(rdev, mddev) { 7424 sector_t avail = rdev->sectors; 7425 7426 if (fit && (num_sectors == 0 || num_sectors > avail)) 7427 num_sectors = avail; 7428 if (avail < num_sectors) 7429 return -ENOSPC; 7430 } 7431 rv = mddev->pers->resize(mddev, num_sectors); 7432 if (!rv) { 7433 if (mddev_is_clustered(mddev)) 7434 md_cluster_ops->update_size(mddev, old_dev_sectors); 7435 else if (!mddev_is_dm(mddev)) 7436 set_capacity_and_notify(mddev->gendisk, 7437 mddev->array_sectors); 7438 } 7439 return rv; 7440 } 7441 7442 static int update_raid_disks(struct mddev *mddev, int raid_disks) 7443 { 7444 int rv; 7445 struct md_rdev *rdev; 7446 /* change the number of raid disks */ 7447 if (mddev->pers->check_reshape == NULL) 7448 return -EINVAL; 7449 if (!md_is_rdwr(mddev)) 7450 return -EROFS; 7451 if (raid_disks <= 0 || 7452 (mddev->max_disks && raid_disks >= mddev->max_disks)) 7453 return -EINVAL; 7454 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 7455 test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) || 7456 mddev->reshape_position != MaxSector) 7457 return -EBUSY; 7458 7459 rdev_for_each(rdev, mddev) { 7460 if (mddev->raid_disks < raid_disks && 7461 rdev->data_offset < rdev->new_data_offset) 7462 return -EINVAL; 7463 if (mddev->raid_disks > raid_disks && 7464 rdev->data_offset > rdev->new_data_offset) 7465 return -EINVAL; 7466 } 7467 7468 mddev->delta_disks = raid_disks - mddev->raid_disks; 7469 if (mddev->delta_disks < 0) 7470 mddev->reshape_backwards = 1; 7471 else if (mddev->delta_disks > 0) 7472 mddev->reshape_backwards = 0; 7473 7474 rv = mddev->pers->check_reshape(mddev); 7475 if (rv < 0) { 7476 mddev->delta_disks = 0; 7477 mddev->reshape_backwards = 0; 7478 } 7479 return rv; 7480 } 7481 7482 /* 7483 * update_array_info is used to change the configuration of an 7484 * on-line array. 7485 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 7486 * fields in the info are checked against the array. 7487 * Any differences that cannot be handled will cause an error. 7488 * Normally, only one change can be managed at a time. 
7489 */ 7490 static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) 7491 { 7492 int rv = 0; 7493 int cnt = 0; 7494 int state = 0; 7495 7496 /* calculate expected state,ignoring low bits */ 7497 if (mddev->bitmap && mddev->bitmap_info.offset) 7498 state |= (1 << MD_SB_BITMAP_PRESENT); 7499 7500 if (mddev->major_version != info->major_version || 7501 mddev->minor_version != info->minor_version || 7502 /* mddev->patch_version != info->patch_version || */ 7503 mddev->ctime != info->ctime || 7504 mddev->level != info->level || 7505 /* mddev->layout != info->layout || */ 7506 mddev->persistent != !info->not_persistent || 7507 mddev->chunk_sectors != info->chunk_size >> 9 || 7508 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 7509 ((state^info->state) & 0xfffffe00) 7510 ) 7511 return -EINVAL; 7512 /* Check there is only one change */ 7513 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 7514 cnt++; 7515 if (mddev->raid_disks != info->raid_disks) 7516 cnt++; 7517 if (mddev->layout != info->layout) 7518 cnt++; 7519 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) 7520 cnt++; 7521 if (cnt == 0) 7522 return 0; 7523 if (cnt > 1) 7524 return -EINVAL; 7525 7526 if (mddev->layout != info->layout) { 7527 /* Change layout 7528 * we don't need to do anything at the md level, the 7529 * personality will take care of it all. 7530 */ 7531 if (mddev->pers->check_reshape == NULL) 7532 return -EINVAL; 7533 else { 7534 mddev->new_layout = info->layout; 7535 rv = mddev->pers->check_reshape(mddev); 7536 if (rv) 7537 mddev->new_layout = mddev->layout; 7538 return rv; 7539 } 7540 } 7541 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 7542 rv = update_size(mddev, (sector_t)info->size * 2); 7543 7544 if (mddev->raid_disks != info->raid_disks) 7545 rv = update_raid_disks(mddev, info->raid_disks); 7546 7547 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 7548 if (mddev->pers->quiesce == NULL || mddev->thread == NULL) { 7549 rv = -EINVAL; 7550 goto err; 7551 } 7552 if (mddev->recovery || mddev->sync_thread) { 7553 rv = -EBUSY; 7554 goto err; 7555 } 7556 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 7557 struct bitmap *bitmap; 7558 /* add the bitmap */ 7559 if (mddev->bitmap) { 7560 rv = -EEXIST; 7561 goto err; 7562 } 7563 if (mddev->bitmap_info.default_offset == 0) { 7564 rv = -EINVAL; 7565 goto err; 7566 } 7567 mddev->bitmap_info.offset = 7568 mddev->bitmap_info.default_offset; 7569 mddev->bitmap_info.space = 7570 mddev->bitmap_info.default_space; 7571 bitmap = md_bitmap_create(mddev, -1); 7572 if (!IS_ERR(bitmap)) { 7573 mddev->bitmap = bitmap; 7574 rv = md_bitmap_load(mddev); 7575 } else 7576 rv = PTR_ERR(bitmap); 7577 if (rv) 7578 md_bitmap_destroy(mddev); 7579 } else { 7580 /* remove the bitmap */ 7581 if (!mddev->bitmap) { 7582 rv = -ENOENT; 7583 goto err; 7584 } 7585 if (mddev->bitmap->storage.file) { 7586 rv = -EINVAL; 7587 goto err; 7588 } 7589 if (mddev->bitmap_info.nodes) { 7590 /* hold PW on all the bitmap lock */ 7591 if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { 7592 pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n"); 7593 rv = -EPERM; 7594 md_cluster_ops->unlock_all_bitmaps(mddev); 7595 goto err; 7596 } 7597 7598 mddev->bitmap_info.nodes = 0; 7599 md_cluster_ops->leave(mddev); 7600 module_put(md_cluster_mod); 7601 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; 7602 } 7603 md_bitmap_destroy(mddev); 7604 mddev->bitmap_info.offset = 0; 7605 } 7606 } 7607 md_update_sb(mddev, 
1); 7608 return rv; 7609 err: 7610 return rv; 7611 } 7612 7613 static int set_disk_faulty(struct mddev *mddev, dev_t dev) 7614 { 7615 struct md_rdev *rdev; 7616 int err = 0; 7617 7618 if (mddev->pers == NULL) 7619 return -ENODEV; 7620 7621 rcu_read_lock(); 7622 rdev = md_find_rdev_rcu(mddev, dev); 7623 if (!rdev) 7624 err = -ENODEV; 7625 else { 7626 md_error(mddev, rdev); 7627 if (test_bit(MD_BROKEN, &mddev->flags)) 7628 err = -EBUSY; 7629 } 7630 rcu_read_unlock(); 7631 return err; 7632 } 7633 7634 /* 7635 * We have a problem here : there is no easy way to give a CHS 7636 * virtual geometry. We currently pretend that we have a 2 heads 7637 * 4 sectors (with a BIG number of cylinders...). This drives 7638 * dosfs just mad... ;-) 7639 */ 7640 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 7641 { 7642 struct mddev *mddev = bdev->bd_disk->private_data; 7643 7644 geo->heads = 2; 7645 geo->sectors = 4; 7646 geo->cylinders = mddev->array_sectors / 8; 7647 return 0; 7648 } 7649 7650 static inline int md_ioctl_valid(unsigned int cmd) 7651 { 7652 switch (cmd) { 7653 case GET_ARRAY_INFO: 7654 case GET_DISK_INFO: 7655 case RAID_VERSION: 7656 return 0; 7657 case ADD_NEW_DISK: 7658 case GET_BITMAP_FILE: 7659 case HOT_ADD_DISK: 7660 case HOT_REMOVE_DISK: 7661 case RESTART_ARRAY_RW: 7662 case RUN_ARRAY: 7663 case SET_ARRAY_INFO: 7664 case SET_BITMAP_FILE: 7665 case SET_DISK_FAULTY: 7666 case STOP_ARRAY: 7667 case STOP_ARRAY_RO: 7668 case CLUSTERED_DISK_NACK: 7669 if (!capable(CAP_SYS_ADMIN)) 7670 return -EACCES; 7671 return 0; 7672 default: 7673 return -ENOTTY; 7674 } 7675 } 7676 7677 static bool md_ioctl_need_suspend(unsigned int cmd) 7678 { 7679 switch (cmd) { 7680 case ADD_NEW_DISK: 7681 case HOT_ADD_DISK: 7682 case HOT_REMOVE_DISK: 7683 case SET_BITMAP_FILE: 7684 case SET_ARRAY_INFO: 7685 return true; 7686 default: 7687 return false; 7688 } 7689 } 7690 7691 static int __md_set_array_info(struct mddev *mddev, void __user *argp) 7692 { 7693 mdu_array_info_t info; 7694 int err; 7695 7696 if (!argp) 7697 memset(&info, 0, sizeof(info)); 7698 else if (copy_from_user(&info, argp, sizeof(info))) 7699 return -EFAULT; 7700 7701 if (mddev->pers) { 7702 err = update_array_info(mddev, &info); 7703 if (err) 7704 pr_warn("md: couldn't update array info. %d\n", err); 7705 return err; 7706 } 7707 7708 if (!list_empty(&mddev->disks)) { 7709 pr_warn("md: array %s already has disks!\n", mdname(mddev)); 7710 return -EBUSY; 7711 } 7712 7713 if (mddev->raid_disks) { 7714 pr_warn("md: array %s already initialised!\n", mdname(mddev)); 7715 return -EBUSY; 7716 } 7717 7718 err = md_set_array_info(mddev, &info); 7719 if (err) 7720 pr_warn("md: couldn't set array info. 
%d\n", err); 7721 7722 return err; 7723 } 7724 7725 static int md_ioctl(struct block_device *bdev, blk_mode_t mode, 7726 unsigned int cmd, unsigned long arg) 7727 { 7728 int err = 0; 7729 void __user *argp = (void __user *)arg; 7730 struct mddev *mddev = NULL; 7731 7732 err = md_ioctl_valid(cmd); 7733 if (err) 7734 return err; 7735 7736 /* 7737 * Commands dealing with the RAID driver but not any 7738 * particular array: 7739 */ 7740 if (cmd == RAID_VERSION) 7741 return get_version(argp); 7742 7743 /* 7744 * Commands creating/starting a new array: 7745 */ 7746 7747 mddev = bdev->bd_disk->private_data; 7748 7749 /* Some actions do not requires the mutex */ 7750 switch (cmd) { 7751 case GET_ARRAY_INFO: 7752 if (!mddev->raid_disks && !mddev->external) 7753 return -ENODEV; 7754 return get_array_info(mddev, argp); 7755 7756 case GET_DISK_INFO: 7757 if (!mddev->raid_disks && !mddev->external) 7758 return -ENODEV; 7759 return get_disk_info(mddev, argp); 7760 7761 case SET_DISK_FAULTY: 7762 return set_disk_faulty(mddev, new_decode_dev(arg)); 7763 7764 case GET_BITMAP_FILE: 7765 return get_bitmap_file(mddev, argp); 7766 } 7767 7768 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { 7769 /* Need to flush page cache, and ensure no-one else opens 7770 * and writes 7771 */ 7772 err = mddev_set_closing_and_sync_blockdev(mddev, 1); 7773 if (err) 7774 return err; 7775 } 7776 7777 if (!md_is_rdwr(mddev)) 7778 flush_work(&mddev->sync_work); 7779 7780 err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) : 7781 mddev_lock(mddev); 7782 if (err) { 7783 pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n", 7784 err, cmd); 7785 goto out; 7786 } 7787 7788 if (cmd == SET_ARRAY_INFO) { 7789 err = __md_set_array_info(mddev, argp); 7790 goto unlock; 7791 } 7792 7793 /* 7794 * Commands querying/configuring an existing array: 7795 */ 7796 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 7797 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 7798 if ((!mddev->raid_disks && !mddev->external) 7799 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 7800 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 7801 && cmd != GET_BITMAP_FILE) { 7802 err = -ENODEV; 7803 goto unlock; 7804 } 7805 7806 /* 7807 * Commands even a read-only array can execute: 7808 */ 7809 switch (cmd) { 7810 case RESTART_ARRAY_RW: 7811 err = restart_array(mddev); 7812 goto unlock; 7813 7814 case STOP_ARRAY: 7815 err = do_md_stop(mddev, 0); 7816 goto unlock; 7817 7818 case STOP_ARRAY_RO: 7819 if (mddev->pers) 7820 err = md_set_readonly(mddev); 7821 goto unlock; 7822 7823 case HOT_REMOVE_DISK: 7824 err = hot_remove_disk(mddev, new_decode_dev(arg)); 7825 goto unlock; 7826 7827 case ADD_NEW_DISK: 7828 /* We can support ADD_NEW_DISK on read-only arrays 7829 * only if we are re-adding a preexisting device. 7830 * So require mddev->pers and MD_DISK_SYNC. 7831 */ 7832 if (mddev->pers) { 7833 mdu_disk_info_t info; 7834 if (copy_from_user(&info, argp, sizeof(info))) 7835 err = -EFAULT; 7836 else if (!(info.state & (1<<MD_DISK_SYNC))) 7837 /* Need to clear read-only for this */ 7838 break; 7839 else 7840 err = md_add_new_disk(mddev, &info); 7841 goto unlock; 7842 } 7843 break; 7844 } 7845 7846 /* 7847 * The remaining ioctls are changing the state of the 7848 * superblock, so we do not allow them on read-only arrays. 
7849 */ 7850 if (!md_is_rdwr(mddev) && mddev->pers) { 7851 if (mddev->ro != MD_AUTO_READ) { 7852 err = -EROFS; 7853 goto unlock; 7854 } 7855 mddev->ro = MD_RDWR; 7856 sysfs_notify_dirent_safe(mddev->sysfs_state); 7857 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7858 /* mddev_unlock will wake thread */ 7859 /* If a device failed while we were read-only, we 7860 * need to make sure the metadata is updated now. 7861 */ 7862 if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) { 7863 mddev_unlock(mddev); 7864 wait_event(mddev->sb_wait, 7865 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) && 7866 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 7867 mddev_lock_nointr(mddev); 7868 } 7869 } 7870 7871 switch (cmd) { 7872 case ADD_NEW_DISK: 7873 { 7874 mdu_disk_info_t info; 7875 if (copy_from_user(&info, argp, sizeof(info))) 7876 err = -EFAULT; 7877 else 7878 err = md_add_new_disk(mddev, &info); 7879 goto unlock; 7880 } 7881 7882 case CLUSTERED_DISK_NACK: 7883 if (mddev_is_clustered(mddev)) 7884 md_cluster_ops->new_disk_ack(mddev, false); 7885 else 7886 err = -EINVAL; 7887 goto unlock; 7888 7889 case HOT_ADD_DISK: 7890 err = hot_add_disk(mddev, new_decode_dev(arg)); 7891 goto unlock; 7892 7893 case RUN_ARRAY: 7894 err = do_md_run(mddev); 7895 goto unlock; 7896 7897 case SET_BITMAP_FILE: 7898 err = set_bitmap_file(mddev, (int)arg); 7899 goto unlock; 7900 7901 default: 7902 err = -EINVAL; 7903 goto unlock; 7904 } 7905 7906 unlock: 7907 if (mddev->hold_active == UNTIL_IOCTL && 7908 err != -EINVAL) 7909 mddev->hold_active = 0; 7910 7911 md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) : 7912 mddev_unlock(mddev); 7913 7914 out: 7915 if (cmd == STOP_ARRAY_RO || (err && cmd == STOP_ARRAY)) 7916 clear_bit(MD_CLOSING, &mddev->flags); 7917 return err; 7918 } 7919 #ifdef CONFIG_COMPAT 7920 static int md_compat_ioctl(struct block_device *bdev, blk_mode_t mode, 7921 unsigned int cmd, unsigned long arg) 7922 { 7923 switch (cmd) { 7924 case HOT_REMOVE_DISK: 7925 case HOT_ADD_DISK: 7926 case SET_DISK_FAULTY: 7927 case SET_BITMAP_FILE: 7928 /* These take in integer arg, do not convert */ 7929 break; 7930 default: 7931 arg = (unsigned long)compat_ptr(arg); 7932 break; 7933 } 7934 7935 return md_ioctl(bdev, mode, cmd, arg); 7936 } 7937 #endif /* CONFIG_COMPAT */ 7938 7939 static int md_set_read_only(struct block_device *bdev, bool ro) 7940 { 7941 struct mddev *mddev = bdev->bd_disk->private_data; 7942 int err; 7943 7944 err = mddev_lock(mddev); 7945 if (err) 7946 return err; 7947 7948 if (!mddev->raid_disks && !mddev->external) { 7949 err = -ENODEV; 7950 goto out_unlock; 7951 } 7952 7953 /* 7954 * Transitioning to read-auto need only happen for arrays that call 7955 * md_write_start and which are not ready for writes yet. 
7956 */ 7957 if (!ro && mddev->ro == MD_RDONLY && mddev->pers) { 7958 err = restart_array(mddev); 7959 if (err) 7960 goto out_unlock; 7961 mddev->ro = MD_AUTO_READ; 7962 } 7963 7964 out_unlock: 7965 mddev_unlock(mddev); 7966 return err; 7967 } 7968 7969 static int md_open(struct gendisk *disk, blk_mode_t mode) 7970 { 7971 struct mddev *mddev; 7972 int err; 7973 7974 spin_lock(&all_mddevs_lock); 7975 mddev = mddev_get(disk->private_data); 7976 spin_unlock(&all_mddevs_lock); 7977 if (!mddev) 7978 return -ENODEV; 7979 7980 err = mutex_lock_interruptible(&mddev->open_mutex); 7981 if (err) 7982 goto out; 7983 7984 err = -ENODEV; 7985 if (test_bit(MD_CLOSING, &mddev->flags)) 7986 goto out_unlock; 7987 7988 atomic_inc(&mddev->openers); 7989 mutex_unlock(&mddev->open_mutex); 7990 7991 disk_check_media_change(disk); 7992 return 0; 7993 7994 out_unlock: 7995 mutex_unlock(&mddev->open_mutex); 7996 out: 7997 mddev_put(mddev); 7998 return err; 7999 } 8000 8001 static void md_release(struct gendisk *disk) 8002 { 8003 struct mddev *mddev = disk->private_data; 8004 8005 BUG_ON(!mddev); 8006 atomic_dec(&mddev->openers); 8007 mddev_put(mddev); 8008 } 8009 8010 static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing) 8011 { 8012 struct mddev *mddev = disk->private_data; 8013 unsigned int ret = 0; 8014 8015 if (mddev->changed) 8016 ret = DISK_EVENT_MEDIA_CHANGE; 8017 mddev->changed = 0; 8018 return ret; 8019 } 8020 8021 static void md_free_disk(struct gendisk *disk) 8022 { 8023 struct mddev *mddev = disk->private_data; 8024 8025 mddev_free(mddev); 8026 } 8027 8028 const struct block_device_operations md_fops = 8029 { 8030 .owner = THIS_MODULE, 8031 .submit_bio = md_submit_bio, 8032 .open = md_open, 8033 .release = md_release, 8034 .ioctl = md_ioctl, 8035 #ifdef CONFIG_COMPAT 8036 .compat_ioctl = md_compat_ioctl, 8037 #endif 8038 .getgeo = md_getgeo, 8039 .check_events = md_check_events, 8040 .set_read_only = md_set_read_only, 8041 .free_disk = md_free_disk, 8042 }; 8043 8044 static int md_thread(void *arg) 8045 { 8046 struct md_thread *thread = arg; 8047 8048 /* 8049 * md_thread is a 'system-thread', it's priority should be very 8050 * high. We avoid resource deadlocks individually in each 8051 * raid personality. (RAID5 does preallocation) We also use RR and 8052 * the very same RT priority as kswapd, thus we will never get 8053 * into a priority inversion deadlock. 8054 * 8055 * we definitely have to have equal or higher priority than 8056 * bdflush, otherwise bdflush will deadlock if there are too 8057 * many dirty RAID5 blocks. 8058 */ 8059 8060 allow_signal(SIGKILL); 8061 while (!kthread_should_stop()) { 8062 8063 /* We need to wait INTERRUPTIBLE so that 8064 * we don't add to the load-average. 
8065 * That means we need to be sure no signals are 8066 * pending 8067 */ 8068 if (signal_pending(current)) 8069 flush_signals(current); 8070 8071 wait_event_interruptible_timeout 8072 (thread->wqueue, 8073 test_bit(THREAD_WAKEUP, &thread->flags) 8074 || kthread_should_stop() || kthread_should_park(), 8075 thread->timeout); 8076 8077 clear_bit(THREAD_WAKEUP, &thread->flags); 8078 if (kthread_should_park()) 8079 kthread_parkme(); 8080 if (!kthread_should_stop()) 8081 thread->run(thread); 8082 } 8083 8084 return 0; 8085 } 8086 8087 static void md_wakeup_thread_directly(struct md_thread __rcu *thread) 8088 { 8089 struct md_thread *t; 8090 8091 rcu_read_lock(); 8092 t = rcu_dereference(thread); 8093 if (t) 8094 wake_up_process(t->tsk); 8095 rcu_read_unlock(); 8096 } 8097 8098 void md_wakeup_thread(struct md_thread __rcu *thread) 8099 { 8100 struct md_thread *t; 8101 8102 rcu_read_lock(); 8103 t = rcu_dereference(thread); 8104 if (t) { 8105 pr_debug("md: waking up MD thread %s.\n", t->tsk->comm); 8106 set_bit(THREAD_WAKEUP, &t->flags); 8107 if (wq_has_sleeper(&t->wqueue)) 8108 wake_up(&t->wqueue); 8109 } 8110 rcu_read_unlock(); 8111 } 8112 EXPORT_SYMBOL(md_wakeup_thread); 8113 8114 struct md_thread *md_register_thread(void (*run) (struct md_thread *), 8115 struct mddev *mddev, const char *name) 8116 { 8117 struct md_thread *thread; 8118 8119 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); 8120 if (!thread) 8121 return NULL; 8122 8123 init_waitqueue_head(&thread->wqueue); 8124 8125 thread->run = run; 8126 thread->mddev = mddev; 8127 thread->timeout = MAX_SCHEDULE_TIMEOUT; 8128 thread->tsk = kthread_run(md_thread, thread, 8129 "%s_%s", 8130 mdname(thread->mddev), 8131 name); 8132 if (IS_ERR(thread->tsk)) { 8133 kfree(thread); 8134 return NULL; 8135 } 8136 return thread; 8137 } 8138 EXPORT_SYMBOL(md_register_thread); 8139 8140 void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp) 8141 { 8142 struct md_thread *thread = rcu_dereference_protected(*threadp, 8143 lockdep_is_held(&mddev->reconfig_mutex)); 8144 8145 if (!thread) 8146 return; 8147 8148 rcu_assign_pointer(*threadp, NULL); 8149 synchronize_rcu(); 8150 8151 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 8152 kthread_stop(thread->tsk); 8153 kfree(thread); 8154 } 8155 EXPORT_SYMBOL(md_unregister_thread); 8156 8157 void md_error(struct mddev *mddev, struct md_rdev *rdev) 8158 { 8159 if (!rdev || test_bit(Faulty, &rdev->flags)) 8160 return; 8161 8162 if (!mddev->pers || !mddev->pers->error_handler) 8163 return; 8164 mddev->pers->error_handler(mddev, rdev); 8165 8166 if (mddev->pers->level == 0) 8167 return; 8168 8169 if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) 8170 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8171 sysfs_notify_dirent_safe(rdev->sysfs_state); 8172 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8173 if (!test_bit(MD_BROKEN, &mddev->flags)) { 8174 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8175 md_wakeup_thread(mddev->thread); 8176 } 8177 if (mddev->event_work.func) 8178 queue_work(md_misc_wq, &mddev->event_work); 8179 md_new_event(); 8180 } 8181 EXPORT_SYMBOL(md_error); 8182 8183 /* seq_file implementation /proc/mdstat */ 8184 8185 static void status_unused(struct seq_file *seq) 8186 { 8187 int i = 0; 8188 struct md_rdev *rdev; 8189 8190 seq_printf(seq, "unused devices: "); 8191 8192 list_for_each_entry(rdev, &pending_raid_disks, same_set) { 8193 i++; 8194 seq_printf(seq, "%pg ", rdev->bdev); 8195 } 8196 if (!i) 8197 seq_printf(seq, "<none>"); 8198 
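	/*
	 * The resulting /proc/mdstat trailer looks like
	 * "unused devices: sdc1 sdd1" (hypothetical device names), or
	 * "unused devices: <none>" when the pending list is empty.
	 */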
8199 seq_printf(seq, "\n"); 8200 } 8201 8202 static void status_personalities(struct seq_file *seq) 8203 { 8204 struct md_personality *pers; 8205 8206 seq_puts(seq, "Personalities : "); 8207 spin_lock(&pers_lock); 8208 list_for_each_entry(pers, &pers_list, list) 8209 seq_printf(seq, "[%s] ", pers->name); 8210 8211 spin_unlock(&pers_lock); 8212 seq_puts(seq, "\n"); 8213 } 8214 8215 static int status_resync(struct seq_file *seq, struct mddev *mddev) 8216 { 8217 sector_t max_sectors, resync, res; 8218 unsigned long dt, db = 0; 8219 sector_t rt, curr_mark_cnt, resync_mark_cnt; 8220 int scale, recovery_active; 8221 unsigned int per_milli; 8222 8223 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 8224 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 8225 max_sectors = mddev->resync_max_sectors; 8226 else 8227 max_sectors = mddev->dev_sectors; 8228 8229 resync = mddev->curr_resync; 8230 if (resync < MD_RESYNC_ACTIVE) { 8231 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 8232 /* Still cleaning up */ 8233 resync = max_sectors; 8234 } else if (resync > max_sectors) { 8235 resync = max_sectors; 8236 } else { 8237 res = atomic_read(&mddev->recovery_active); 8238 /* 8239 * Resync has started, but the subtraction has overflowed or 8240 * yielded one of the special values. Force it to active to 8241 * ensure the status reports an active resync. 8242 */ 8243 if (resync < res || resync - res < MD_RESYNC_ACTIVE) 8244 resync = MD_RESYNC_ACTIVE; 8245 else 8246 resync -= res; 8247 } 8248 8249 if (resync == MD_RESYNC_NONE) { 8250 if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { 8251 struct md_rdev *rdev; 8252 8253 rdev_for_each(rdev, mddev) 8254 if (rdev->raid_disk >= 0 && 8255 !test_bit(Faulty, &rdev->flags) && 8256 rdev->recovery_offset != MaxSector && 8257 rdev->recovery_offset) { 8258 seq_printf(seq, "\trecover=REMOTE"); 8259 return 1; 8260 } 8261 if (mddev->reshape_position != MaxSector) 8262 seq_printf(seq, "\treshape=REMOTE"); 8263 else 8264 seq_printf(seq, "\tresync=REMOTE"); 8265 return 1; 8266 } 8267 if (mddev->recovery_cp < MaxSector) { 8268 seq_printf(seq, "\tresync=PENDING"); 8269 return 1; 8270 } 8271 return 0; 8272 } 8273 if (resync < MD_RESYNC_ACTIVE) { 8274 seq_printf(seq, "\tresync=DELAYED"); 8275 return 1; 8276 } 8277 8278 WARN_ON(max_sectors == 0); 8279 /* Pick 'scale' such that (resync>>scale)*1000 will fit 8280 * in a sector_t, and (max_sectors>>scale) will fit in a 8281 * u32, as those are the requirements for sector_div. 8282 * Thus 'scale' must be at least 10 8283 */ 8284 scale = 10; 8285 if (sizeof(sector_t) > sizeof(unsigned long)) { 8286 while ( max_sectors/2 > (1ULL<<(scale+32))) 8287 scale++; 8288 } 8289 res = (resync>>scale)*1000; 8290 sector_div(res, (u32)((max_sectors>>scale)+1)); 8291 8292 per_milli = res; 8293 { 8294 int i, x = per_milli/50, y = 20-x; 8295 seq_printf(seq, "["); 8296 for (i = 0; i < x; i++) 8297 seq_printf(seq, "="); 8298 seq_printf(seq, ">"); 8299 for (i = 0; i < y; i++) 8300 seq_printf(seq, "."); 8301 seq_printf(seq, "] "); 8302 } 8303 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 8304 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 8305 "reshape" : 8306 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 8307 "check" : 8308 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 
8309 "resync" : "recovery"))), 8310 per_milli/10, per_milli % 10, 8311 (unsigned long long) resync/2, 8312 (unsigned long long) max_sectors/2); 8313 8314 /* 8315 * dt: time from mark until now 8316 * db: blocks written from mark until now 8317 * rt: remaining time 8318 * 8319 * rt is a sector_t, which is always 64bit now. We are keeping 8320 * the original algorithm, but it is not really necessary. 8321 * 8322 * Original algorithm: 8323 * So we divide before multiply in case it is 32bit and close 8324 * to the limit. 8325 * We scale the divisor (db) by 32 to avoid losing precision 8326 * near the end of resync when the number of remaining sectors 8327 * is close to 'db'. 8328 * We then divide rt by 32 after multiplying by db to compensate. 8329 * The '+1' avoids division by zero if db is very small. 8330 */ 8331 dt = ((jiffies - mddev->resync_mark) / HZ); 8332 if (!dt) dt++; 8333 8334 curr_mark_cnt = mddev->curr_mark_cnt; 8335 recovery_active = atomic_read(&mddev->recovery_active); 8336 resync_mark_cnt = mddev->resync_mark_cnt; 8337 8338 if (curr_mark_cnt >= (recovery_active + resync_mark_cnt)) 8339 db = curr_mark_cnt - (recovery_active + resync_mark_cnt); 8340 8341 rt = max_sectors - resync; /* number of remaining sectors */ 8342 rt = div64_u64(rt, db/32+1); 8343 rt *= dt; 8344 rt >>= 5; 8345 8346 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, 8347 ((unsigned long)rt % 60)/6); 8348 8349 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 8350 return 1; 8351 } 8352 8353 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 8354 __acquires(&all_mddevs_lock) 8355 { 8356 seq->poll_event = atomic_read(&md_event_count); 8357 spin_lock(&all_mddevs_lock); 8358 8359 return seq_list_start_head(&all_mddevs, *pos); 8360 } 8361 8362 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 8363 { 8364 return seq_list_next(v, &all_mddevs, pos); 8365 } 8366 8367 static void md_seq_stop(struct seq_file *seq, void *v) 8368 __releases(&all_mddevs_lock) 8369 { 8370 spin_unlock(&all_mddevs_lock); 8371 } 8372 8373 static int md_seq_show(struct seq_file *seq, void *v) 8374 { 8375 struct mddev *mddev; 8376 sector_t sectors; 8377 struct md_rdev *rdev; 8378 8379 if (v == &all_mddevs) { 8380 status_personalities(seq); 8381 if (list_empty(&all_mddevs)) 8382 status_unused(seq); 8383 return 0; 8384 } 8385 8386 mddev = list_entry(v, struct mddev, all_mddevs); 8387 if (!mddev_get(mddev)) 8388 return 0; 8389 8390 spin_unlock(&all_mddevs_lock); 8391 spin_lock(&mddev->lock); 8392 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 8393 seq_printf(seq, "%s : %sactive", mdname(mddev), 8394 mddev->pers ? 
"" : "in"); 8395 if (mddev->pers) { 8396 if (mddev->ro == MD_RDONLY) 8397 seq_printf(seq, " (read-only)"); 8398 if (mddev->ro == MD_AUTO_READ) 8399 seq_printf(seq, " (auto-read-only)"); 8400 seq_printf(seq, " %s", mddev->pers->name); 8401 } 8402 8403 sectors = 0; 8404 rcu_read_lock(); 8405 rdev_for_each_rcu(rdev, mddev) { 8406 seq_printf(seq, " %pg[%d]", rdev->bdev, rdev->desc_nr); 8407 8408 if (test_bit(WriteMostly, &rdev->flags)) 8409 seq_printf(seq, "(W)"); 8410 if (test_bit(Journal, &rdev->flags)) 8411 seq_printf(seq, "(J)"); 8412 if (test_bit(Faulty, &rdev->flags)) { 8413 seq_printf(seq, "(F)"); 8414 continue; 8415 } 8416 if (rdev->raid_disk < 0) 8417 seq_printf(seq, "(S)"); /* spare */ 8418 if (test_bit(Replacement, &rdev->flags)) 8419 seq_printf(seq, "(R)"); 8420 sectors += rdev->sectors; 8421 } 8422 rcu_read_unlock(); 8423 8424 if (!list_empty(&mddev->disks)) { 8425 if (mddev->pers) 8426 seq_printf(seq, "\n %llu blocks", 8427 (unsigned long long) 8428 mddev->array_sectors / 2); 8429 else 8430 seq_printf(seq, "\n %llu blocks", 8431 (unsigned long long)sectors / 2); 8432 } 8433 if (mddev->persistent) { 8434 if (mddev->major_version != 0 || 8435 mddev->minor_version != 90) { 8436 seq_printf(seq," super %d.%d", 8437 mddev->major_version, 8438 mddev->minor_version); 8439 } 8440 } else if (mddev->external) 8441 seq_printf(seq, " super external:%s", 8442 mddev->metadata_type); 8443 else 8444 seq_printf(seq, " super non-persistent"); 8445 8446 if (mddev->pers) { 8447 mddev->pers->status(seq, mddev); 8448 seq_printf(seq, "\n "); 8449 if (mddev->pers->sync_request) { 8450 if (status_resync(seq, mddev)) 8451 seq_printf(seq, "\n "); 8452 } 8453 } else 8454 seq_printf(seq, "\n "); 8455 8456 md_bitmap_status(seq, mddev->bitmap); 8457 8458 seq_printf(seq, "\n"); 8459 } 8460 spin_unlock(&mddev->lock); 8461 spin_lock(&all_mddevs_lock); 8462 8463 if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs)) 8464 status_unused(seq); 8465 8466 if (atomic_dec_and_test(&mddev->active)) 8467 __mddev_put(mddev); 8468 8469 return 0; 8470 } 8471 8472 static const struct seq_operations md_seq_ops = { 8473 .start = md_seq_start, 8474 .next = md_seq_next, 8475 .stop = md_seq_stop, 8476 .show = md_seq_show, 8477 }; 8478 8479 static int md_seq_open(struct inode *inode, struct file *file) 8480 { 8481 struct seq_file *seq; 8482 int error; 8483 8484 error = seq_open(file, &md_seq_ops); 8485 if (error) 8486 return error; 8487 8488 seq = file->private_data; 8489 seq->poll_event = atomic_read(&md_event_count); 8490 return error; 8491 } 8492 8493 static int md_unloading; 8494 static __poll_t mdstat_poll(struct file *filp, poll_table *wait) 8495 { 8496 struct seq_file *seq = filp->private_data; 8497 __poll_t mask; 8498 8499 if (md_unloading) 8500 return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; 8501 poll_wait(filp, &md_event_waiters, wait); 8502 8503 /* always allow read */ 8504 mask = EPOLLIN | EPOLLRDNORM; 8505 8506 if (seq->poll_event != atomic_read(&md_event_count)) 8507 mask |= EPOLLERR | EPOLLPRI; 8508 return mask; 8509 } 8510 8511 static const struct proc_ops mdstat_proc_ops = { 8512 .proc_open = md_seq_open, 8513 .proc_read = seq_read, 8514 .proc_lseek = seq_lseek, 8515 .proc_release = seq_release, 8516 .proc_poll = mdstat_poll, 8517 }; 8518 8519 int register_md_personality(struct md_personality *p) 8520 { 8521 pr_debug("md: %s personality registered for level %d\n", 8522 p->name, p->level); 8523 spin_lock(&pers_lock); 8524 list_add_tail(&p->list, &pers_list); 8525 spin_unlock(&pers_lock); 8526 return 0; 
8527 } 8528 EXPORT_SYMBOL(register_md_personality); 8529 8530 int unregister_md_personality(struct md_personality *p) 8531 { 8532 pr_debug("md: %s personality unregistered\n", p->name); 8533 spin_lock(&pers_lock); 8534 list_del_init(&p->list); 8535 spin_unlock(&pers_lock); 8536 return 0; 8537 } 8538 EXPORT_SYMBOL(unregister_md_personality); 8539 8540 int register_md_cluster_operations(const struct md_cluster_operations *ops, 8541 struct module *module) 8542 { 8543 int ret = 0; 8544 spin_lock(&pers_lock); 8545 if (md_cluster_ops != NULL) 8546 ret = -EALREADY; 8547 else { 8548 md_cluster_ops = ops; 8549 md_cluster_mod = module; 8550 } 8551 spin_unlock(&pers_lock); 8552 return ret; 8553 } 8554 EXPORT_SYMBOL(register_md_cluster_operations); 8555 8556 int unregister_md_cluster_operations(void) 8557 { 8558 spin_lock(&pers_lock); 8559 md_cluster_ops = NULL; 8560 spin_unlock(&pers_lock); 8561 return 0; 8562 } 8563 EXPORT_SYMBOL(unregister_md_cluster_operations); 8564 8565 int md_setup_cluster(struct mddev *mddev, int nodes) 8566 { 8567 int ret; 8568 if (!md_cluster_ops) 8569 request_module("md-cluster"); 8570 spin_lock(&pers_lock); 8571 /* ensure module won't be unloaded */ 8572 if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { 8573 pr_warn("can't find md-cluster module or get its reference.\n"); 8574 spin_unlock(&pers_lock); 8575 return -ENOENT; 8576 } 8577 spin_unlock(&pers_lock); 8578 8579 ret = md_cluster_ops->join(mddev, nodes); 8580 if (!ret) 8581 mddev->safemode_delay = 0; 8582 return ret; 8583 } 8584 8585 void md_cluster_stop(struct mddev *mddev) 8586 { 8587 if (!md_cluster_ops) 8588 return; 8589 md_cluster_ops->leave(mddev); 8590 module_put(md_cluster_mod); 8591 } 8592 8593 static int is_mddev_idle(struct mddev *mddev, int init) 8594 { 8595 struct md_rdev *rdev; 8596 int idle; 8597 int curr_events; 8598 8599 idle = 1; 8600 rcu_read_lock(); 8601 rdev_for_each_rcu(rdev, mddev) { 8602 struct gendisk *disk = rdev->bdev->bd_disk; 8603 8604 if (!init && !blk_queue_io_stat(disk->queue)) 8605 continue; 8606 8607 curr_events = (int)part_stat_read_accum(disk->part0, sectors) - 8608 atomic_read(&disk->sync_io); 8609 /* sync IO will cause sync_io to increase before the disk_stats 8610 * as sync_io is counted when a request starts, and 8611 * disk_stats is counted when it completes. 8612 * So resync activity will cause curr_events to be smaller than 8613 * when there was no such activity. 8614 * non-sync IO will cause disk_stat to increase without 8615 * increasing sync_io so curr_events will (eventually) 8616 * be larger than it was before. Once it becomes 8617 * substantially larger, the test below will cause 8618 * the array to appear non-idle, and resync will slow 8619 * down. 8620 * If there is a lot of outstanding resync activity when 8621 * we set last_event to curr_events, then all that activity 8622 * completing might cause the array to appear non-idle 8623 * and resync will be slowed down even though there might 8624 * not have been non-resync activity. This will only 8625 * happen once though. 'last_events' will soon reflect 8626 * the state where there is little or no outstanding 8627 * resync requests, and further resync activity will 8628 * always make curr_events less than last_events. 
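 *
 * Worked example with illustrative numbers: if last_events was 1000 and,
 * since it was sampled, 2000 sectors of ordinary IO plus any amount of
 * resync IO completed, curr_events grows to roughly 3000 (the resync
 * sectors cancel out against sync_io).  3000 - 1000 is well above the
 * threshold of 64 used below, so last_events is refreshed, the rdev is
 * counted as busy and resync is throttled.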
8629 * 8630 */ 8631 if (init || curr_events - rdev->last_events > 64) { 8632 rdev->last_events = curr_events; 8633 idle = 0; 8634 } 8635 } 8636 rcu_read_unlock(); 8637 return idle; 8638 } 8639 8640 void md_done_sync(struct mddev *mddev, int blocks, int ok) 8641 { 8642 /* another "blocks" (512byte) blocks have been synced */ 8643 atomic_sub(blocks, &mddev->recovery_active); 8644 wake_up(&mddev->recovery_wait); 8645 if (!ok) { 8646 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8647 set_bit(MD_RECOVERY_ERROR, &mddev->recovery); 8648 md_wakeup_thread(mddev->thread); 8649 // stop recovery, signal do_sync .... 8650 } 8651 } 8652 EXPORT_SYMBOL(md_done_sync); 8653 8654 /* md_write_start(mddev, bi) 8655 * If we need to update some array metadata (e.g. 'active' flag 8656 * in superblock) before writing, schedule a superblock update 8657 * and wait for it to complete. 8658 * A return value of 'false' means that the write wasn't recorded 8659 * and cannot proceed as the array is being suspend. 8660 */ 8661 void md_write_start(struct mddev *mddev, struct bio *bi) 8662 { 8663 int did_change = 0; 8664 8665 if (bio_data_dir(bi) != WRITE) 8666 return; 8667 8668 BUG_ON(mddev->ro == MD_RDONLY); 8669 if (mddev->ro == MD_AUTO_READ) { 8670 /* need to switch to read/write */ 8671 flush_work(&mddev->sync_work); 8672 mddev->ro = MD_RDWR; 8673 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8674 md_wakeup_thread(mddev->thread); 8675 md_wakeup_thread(mddev->sync_thread); 8676 did_change = 1; 8677 } 8678 rcu_read_lock(); 8679 percpu_ref_get(&mddev->writes_pending); 8680 smp_mb(); /* Match smp_mb in set_in_sync() */ 8681 if (mddev->safemode == 1) 8682 mddev->safemode = 0; 8683 /* sync_checkers is always 0 when writes_pending is in per-cpu mode */ 8684 if (mddev->in_sync || mddev->sync_checkers) { 8685 spin_lock(&mddev->lock); 8686 if (mddev->in_sync) { 8687 mddev->in_sync = 0; 8688 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 8689 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 8690 md_wakeup_thread(mddev->thread); 8691 did_change = 1; 8692 } 8693 spin_unlock(&mddev->lock); 8694 } 8695 rcu_read_unlock(); 8696 if (did_change) 8697 sysfs_notify_dirent_safe(mddev->sysfs_state); 8698 if (!mddev->has_superblocks) 8699 return; 8700 wait_event(mddev->sb_wait, 8701 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 8702 } 8703 EXPORT_SYMBOL(md_write_start); 8704 8705 /* md_write_inc can only be called when md_write_start() has 8706 * already been called at least once of the current request. 8707 * It increments the counter and is useful when a single request 8708 * is split into several parts. Each part causes an increment and 8709 * so needs a matching md_write_end(). 8710 * Unlike md_write_start(), it is safe to call md_write_inc() inside 8711 * a spinlocked region. 
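 *
 * Reference-counting sketch: md_write_start() takes one reference on
 * writes_pending for the original request, every md_write_inc() takes one
 * more for an additional part, and each md_write_end() drops exactly one,
 * so the calls must balance before set_in_sync() can mark the array clean
 * again.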
8712 */ 8713 void md_write_inc(struct mddev *mddev, struct bio *bi) 8714 { 8715 if (bio_data_dir(bi) != WRITE) 8716 return; 8717 WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev)); 8718 percpu_ref_get(&mddev->writes_pending); 8719 } 8720 EXPORT_SYMBOL(md_write_inc); 8721 8722 void md_write_end(struct mddev *mddev) 8723 { 8724 percpu_ref_put(&mddev->writes_pending); 8725 8726 if (mddev->safemode == 2) 8727 md_wakeup_thread(mddev->thread); 8728 else if (mddev->safemode_delay) 8729 /* The roundup() ensures this only performs locking once 8730 * every ->safemode_delay jiffies 8731 */ 8732 mod_timer(&mddev->safemode_timer, 8733 roundup(jiffies, mddev->safemode_delay) + 8734 mddev->safemode_delay); 8735 } 8736 8737 EXPORT_SYMBOL(md_write_end); 8738 8739 /* This is used by raid0 and raid10 */ 8740 void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, 8741 struct bio *bio, sector_t start, sector_t size) 8742 { 8743 struct bio *discard_bio = NULL; 8744 8745 if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO, 8746 &discard_bio) || !discard_bio) 8747 return; 8748 8749 bio_chain(discard_bio, bio); 8750 bio_clone_blkg_association(discard_bio, bio); 8751 mddev_trace_remap(mddev, discard_bio, bio->bi_iter.bi_sector); 8752 submit_bio_noacct(discard_bio); 8753 } 8754 EXPORT_SYMBOL_GPL(md_submit_discard_bio); 8755 8756 static void md_end_clone_io(struct bio *bio) 8757 { 8758 struct md_io_clone *md_io_clone = bio->bi_private; 8759 struct bio *orig_bio = md_io_clone->orig_bio; 8760 struct mddev *mddev = md_io_clone->mddev; 8761 8762 if (bio->bi_status && !orig_bio->bi_status) 8763 orig_bio->bi_status = bio->bi_status; 8764 8765 if (md_io_clone->start_time) 8766 bio_end_io_acct(orig_bio, md_io_clone->start_time); 8767 8768 bio_put(bio); 8769 bio_endio(orig_bio); 8770 percpu_ref_put(&mddev->active_io); 8771 } 8772 8773 static void md_clone_bio(struct mddev *mddev, struct bio **bio) 8774 { 8775 struct block_device *bdev = (*bio)->bi_bdev; 8776 struct md_io_clone *md_io_clone; 8777 struct bio *clone = 8778 bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set); 8779 8780 md_io_clone = container_of(clone, struct md_io_clone, bio_clone); 8781 md_io_clone->orig_bio = *bio; 8782 md_io_clone->mddev = mddev; 8783 if (blk_queue_io_stat(bdev->bd_disk->queue)) 8784 md_io_clone->start_time = bio_start_io_acct(*bio); 8785 8786 clone->bi_end_io = md_end_clone_io; 8787 clone->bi_private = md_io_clone; 8788 *bio = clone; 8789 } 8790 8791 void md_account_bio(struct mddev *mddev, struct bio **bio) 8792 { 8793 percpu_ref_get(&mddev->active_io); 8794 md_clone_bio(mddev, bio); 8795 } 8796 EXPORT_SYMBOL_GPL(md_account_bio); 8797 8798 void md_free_cloned_bio(struct bio *bio) 8799 { 8800 struct md_io_clone *md_io_clone = bio->bi_private; 8801 struct bio *orig_bio = md_io_clone->orig_bio; 8802 struct mddev *mddev = md_io_clone->mddev; 8803 8804 if (bio->bi_status && !orig_bio->bi_status) 8805 orig_bio->bi_status = bio->bi_status; 8806 8807 if (md_io_clone->start_time) 8808 bio_end_io_acct(orig_bio, md_io_clone->start_time); 8809 8810 bio_put(bio); 8811 percpu_ref_put(&mddev->active_io); 8812 } 8813 EXPORT_SYMBOL_GPL(md_free_cloned_bio); 8814 8815 /* md_allow_write(mddev) 8816 * Calling this ensures that the array is marked 'active' so that writes 8817 * may proceed without blocking. It is important to call this before 8818 * attempting a GFP_KERNEL allocation while holding the mddev lock. 8819 * Must be called with mddev_lock held. 
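 *
 * Illustrative call pattern (a sketch, not taken from a specific
 * personality; new_conf is a hypothetical allocation):
 *
 *	mddev_lock(mddev);
 *	md_allow_write(mddev);
 *	new_conf = kzalloc(sizeof(*new_conf), GFP_KERNEL);
 *	...
 *	mddev_unlock(mddev);
 *
 * The idea is that, with the array already marked active, writeback
 * triggered by the allocation does not end up waiting for a superblock
 * update that in turn needs the mddev lock we are holding.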
8820 */ 8821 void md_allow_write(struct mddev *mddev) 8822 { 8823 if (!mddev->pers) 8824 return; 8825 if (!md_is_rdwr(mddev)) 8826 return; 8827 if (!mddev->pers->sync_request) 8828 return; 8829 8830 spin_lock(&mddev->lock); 8831 if (mddev->in_sync) { 8832 mddev->in_sync = 0; 8833 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 8834 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 8835 if (mddev->safemode_delay && 8836 mddev->safemode == 0) 8837 mddev->safemode = 1; 8838 spin_unlock(&mddev->lock); 8839 md_update_sb(mddev, 0); 8840 sysfs_notify_dirent_safe(mddev->sysfs_state); 8841 /* wait for the dirty state to be recorded in the metadata */ 8842 wait_event(mddev->sb_wait, 8843 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 8844 } else 8845 spin_unlock(&mddev->lock); 8846 } 8847 EXPORT_SYMBOL_GPL(md_allow_write); 8848 8849 static sector_t md_sync_max_sectors(struct mddev *mddev, 8850 enum sync_action action) 8851 { 8852 switch (action) { 8853 case ACTION_RESYNC: 8854 case ACTION_CHECK: 8855 case ACTION_REPAIR: 8856 atomic64_set(&mddev->resync_mismatches, 0); 8857 fallthrough; 8858 case ACTION_RESHAPE: 8859 return mddev->resync_max_sectors; 8860 case ACTION_RECOVER: 8861 return mddev->dev_sectors; 8862 default: 8863 return 0; 8864 } 8865 } 8866 8867 static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) 8868 { 8869 sector_t start = 0; 8870 struct md_rdev *rdev; 8871 8872 switch (action) { 8873 case ACTION_CHECK: 8874 case ACTION_REPAIR: 8875 return mddev->resync_min; 8876 case ACTION_RESYNC: 8877 if (!mddev->bitmap) 8878 return mddev->recovery_cp; 8879 return 0; 8880 case ACTION_RESHAPE: 8881 /* 8882 * If the original node aborts reshaping then we continue the 8883 * reshaping, so set again to avoid restart reshape from the 8884 * first beginning 8885 */ 8886 if (mddev_is_clustered(mddev) && 8887 mddev->reshape_position != MaxSector) 8888 return mddev->reshape_position; 8889 return 0; 8890 case ACTION_RECOVER: 8891 start = MaxSector; 8892 rcu_read_lock(); 8893 rdev_for_each_rcu(rdev, mddev) 8894 if (rdev->raid_disk >= 0 && 8895 !test_bit(Journal, &rdev->flags) && 8896 !test_bit(Faulty, &rdev->flags) && 8897 !test_bit(In_sync, &rdev->flags) && 8898 rdev->recovery_offset < start) 8899 start = rdev->recovery_offset; 8900 rcu_read_unlock(); 8901 8902 /* If there is a bitmap, we need to make sure all 8903 * writes that started before we added a spare 8904 * complete before we start doing a recovery. 8905 * Otherwise the write might complete and (via 8906 * bitmap_endwrite) set a bit in the bitmap after the 8907 * recovery has checked that bit and skipped that 8908 * region. 8909 */ 8910 if (mddev->bitmap) { 8911 mddev->pers->quiesce(mddev, 1); 8912 mddev->pers->quiesce(mddev, 0); 8913 } 8914 return start; 8915 default: 8916 return MaxSector; 8917 } 8918 } 8919 8920 #define SYNC_MARKS 10 8921 #define SYNC_MARK_STEP (3*HZ) 8922 #define UPDATE_FREQUENCY (5*60*HZ) 8923 void md_do_sync(struct md_thread *thread) 8924 { 8925 struct mddev *mddev = thread->mddev; 8926 struct mddev *mddev2; 8927 unsigned int currspeed = 0, window; 8928 sector_t max_sectors,j, io_sectors, recovery_done; 8929 unsigned long mark[SYNC_MARKS]; 8930 unsigned long update_time; 8931 sector_t mark_cnt[SYNC_MARKS]; 8932 int last_mark,m; 8933 sector_t last_check; 8934 int skipped = 0; 8935 struct md_rdev *rdev; 8936 enum sync_action action; 8937 const char *desc; 8938 struct blk_plug plug; 8939 int ret; 8940 8941 /* just incase thread restarts... 
*/ 8942 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 8943 return; 8944 8945 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 8946 goto skip; 8947 8948 if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) || 8949 !md_is_rdwr(mddev)) {/* never try to sync a read-only array */ 8950 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8951 goto skip; 8952 } 8953 8954 if (mddev_is_clustered(mddev)) { 8955 ret = md_cluster_ops->resync_start(mddev); 8956 if (ret) 8957 goto skip; 8958 8959 set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags); 8960 if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 8961 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || 8962 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) 8963 && ((unsigned long long)mddev->curr_resync_completed 8964 < (unsigned long long)mddev->resync_max_sectors)) 8965 goto skip; 8966 } 8967 8968 action = md_sync_action(mddev); 8969 desc = md_sync_action_name(action); 8970 mddev->last_sync_action = action; 8971 8972 /* 8973 * Before starting a resync we must have set curr_resync to 8974 * 2, and then checked that every "conflicting" array has curr_resync 8975 * less than ours. When we find one that is the same or higher 8976 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 8977 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 8978 * This will mean we have to start checking from the beginning again. 8979 * 8980 */ 8981 if (mddev_is_clustered(mddev)) 8982 md_cluster_ops->resync_start_notify(mddev); 8983 do { 8984 int mddev2_minor = -1; 8985 mddev->curr_resync = MD_RESYNC_DELAYED; 8986 8987 try_again: 8988 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 8989 goto skip; 8990 spin_lock(&all_mddevs_lock); 8991 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) { 8992 if (test_bit(MD_DELETED, &mddev2->flags)) 8993 continue; 8994 if (mddev2 == mddev) 8995 continue; 8996 if (!mddev->parallel_resync 8997 && mddev2->curr_resync 8998 && match_mddev_units(mddev, mddev2)) { 8999 DEFINE_WAIT(wq); 9000 if (mddev < mddev2 && 9001 mddev->curr_resync == MD_RESYNC_DELAYED) { 9002 /* arbitrarily yield */ 9003 mddev->curr_resync = MD_RESYNC_YIELDED; 9004 wake_up(&resync_wait); 9005 } 9006 if (mddev > mddev2 && 9007 mddev->curr_resync == MD_RESYNC_YIELDED) 9008 /* no need to wait here, we can wait the next 9009 * time 'round when curr_resync == 2 9010 */ 9011 continue; 9012 /* We need to wait 'interruptible' so as not to 9013 * contribute to the load average, and not to 9014 * be caught by 'softlockup' 9015 */ 9016 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 9017 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9018 mddev2->curr_resync >= mddev->curr_resync) { 9019 if (mddev2_minor != mddev2->md_minor) { 9020 mddev2_minor = mddev2->md_minor; 9021 pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n", 9022 desc, mdname(mddev), 9023 mdname(mddev2)); 9024 } 9025 spin_unlock(&all_mddevs_lock); 9026 9027 if (signal_pending(current)) 9028 flush_signals(current); 9029 schedule(); 9030 finish_wait(&resync_wait, &wq); 9031 goto try_again; 9032 } 9033 finish_wait(&resync_wait, &wq); 9034 } 9035 } 9036 spin_unlock(&all_mddevs_lock); 9037 } while (mddev->curr_resync < MD_RESYNC_DELAYED); 9038 9039 max_sectors = md_sync_max_sectors(mddev, action); 9040 j = md_sync_position(mddev, action); 9041 9042 pr_info("md: %s of RAID array %s\n", desc, mdname(mddev)); 9043 pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev)); 9044 pr_debug("md: using maximum 
available idle IO bandwidth (but not more than %d KB/sec) for %s.\n", 9045 speed_max(mddev), desc); 9046 9047 is_mddev_idle(mddev, 1); /* this initializes IO event counters */ 9048 9049 io_sectors = 0; 9050 for (m = 0; m < SYNC_MARKS; m++) { 9051 mark[m] = jiffies; 9052 mark_cnt[m] = io_sectors; 9053 } 9054 last_mark = 0; 9055 mddev->resync_mark = mark[last_mark]; 9056 mddev->resync_mark_cnt = mark_cnt[last_mark]; 9057 9058 /* 9059 * Tune reconstruction: 9060 */ 9061 window = 32 * (PAGE_SIZE / 512); 9062 pr_debug("md: using %dk window, over a total of %lluk.\n", 9063 window/2, (unsigned long long)max_sectors/2); 9064 9065 atomic_set(&mddev->recovery_active, 0); 9066 last_check = 0; 9067 9068 if (j >= MD_RESYNC_ACTIVE) { 9069 pr_debug("md: resuming %s of %s from checkpoint.\n", 9070 desc, mdname(mddev)); 9071 mddev->curr_resync = j; 9072 } else 9073 mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */ 9074 mddev->curr_resync_completed = j; 9075 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9076 md_new_event(); 9077 update_time = jiffies; 9078 9079 blk_start_plug(&plug); 9080 while (j < max_sectors) { 9081 sector_t sectors; 9082 9083 skipped = 0; 9084 9085 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9086 ((mddev->curr_resync > mddev->curr_resync_completed && 9087 (mddev->curr_resync - mddev->curr_resync_completed) 9088 > (max_sectors >> 4)) || 9089 time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || 9090 (j - mddev->curr_resync_completed)*2 9091 >= mddev->resync_max - mddev->curr_resync_completed || 9092 mddev->curr_resync_completed > mddev->resync_max 9093 )) { 9094 /* time to update curr_resync_completed */ 9095 wait_event(mddev->recovery_wait, 9096 atomic_read(&mddev->recovery_active) == 0); 9097 mddev->curr_resync_completed = j; 9098 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 9099 j > mddev->recovery_cp) 9100 mddev->recovery_cp = j; 9101 update_time = jiffies; 9102 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 9103 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9104 } 9105 9106 while (j >= mddev->resync_max && 9107 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9108 /* As this condition is controlled by user-space, 9109 * we can block indefinitely, so use '_interruptible' 9110 * to avoid triggering warnings. 9111 */ 9112 flush_signals(current); /* just in case */ 9113 wait_event_interruptible(mddev->recovery_wait, 9114 mddev->resync_max > j 9115 || test_bit(MD_RECOVERY_INTR, 9116 &mddev->recovery)); 9117 } 9118 9119 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9120 break; 9121 9122 sectors = mddev->pers->sync_request(mddev, j, max_sectors, 9123 &skipped); 9124 if (sectors == 0) { 9125 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9126 break; 9127 } 9128 9129 if (!skipped) { /* actual IO requested */ 9130 io_sectors += sectors; 9131 atomic_add(sectors, &mddev->recovery_active); 9132 } 9133 9134 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9135 break; 9136 9137 j += sectors; 9138 if (j > max_sectors) 9139 /* when skipping, extra large numbers can be returned. 
*/ 9140 j = max_sectors; 9141 if (j >= MD_RESYNC_ACTIVE) 9142 mddev->curr_resync = j; 9143 mddev->curr_mark_cnt = io_sectors; 9144 if (last_check == 0) 9145 /* this is the earliest that rebuild will be 9146 * visible in /proc/mdstat 9147 */ 9148 md_new_event(); 9149 9150 if (last_check + window > io_sectors || j == max_sectors) 9151 continue; 9152 9153 last_check = io_sectors; 9154 repeat: 9155 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 9156 /* step marks */ 9157 int next = (last_mark+1) % SYNC_MARKS; 9158 9159 mddev->resync_mark = mark[next]; 9160 mddev->resync_mark_cnt = mark_cnt[next]; 9161 mark[next] = jiffies; 9162 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 9163 last_mark = next; 9164 } 9165 9166 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9167 break; 9168 9169 /* 9170 * this loop exits only if either when we are slower than 9171 * the 'hard' speed limit, or the system was IO-idle for 9172 * a jiffy. 9173 * the system might be non-idle CPU-wise, but we only care 9174 * about not overloading the IO subsystem. (things like an 9175 * e2fsck being done on the RAID array should execute fast) 9176 */ 9177 cond_resched(); 9178 9179 recovery_done = io_sectors - atomic_read(&mddev->recovery_active); 9180 currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2 9181 /((jiffies-mddev->resync_mark)/HZ +1) +1; 9182 9183 if (currspeed > speed_min(mddev)) { 9184 if (currspeed > speed_max(mddev)) { 9185 msleep(500); 9186 goto repeat; 9187 } 9188 if (!is_mddev_idle(mddev, 0)) { 9189 /* 9190 * Give other IO more of a chance. 9191 * The faster the devices, the less we wait. 9192 */ 9193 wait_event(mddev->recovery_wait, 9194 !atomic_read(&mddev->recovery_active)); 9195 } 9196 } 9197 } 9198 pr_info("md: %s: %s %s.\n",mdname(mddev), desc, 9199 test_bit(MD_RECOVERY_INTR, &mddev->recovery) 9200 ? 
"interrupted" : "done"); 9201 /* 9202 * this also signals 'finished resyncing' to md_stop 9203 */ 9204 blk_finish_plug(&plug); 9205 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 9206 9207 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9208 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9209 mddev->curr_resync >= MD_RESYNC_ACTIVE) { 9210 mddev->curr_resync_completed = mddev->curr_resync; 9211 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9212 } 9213 mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped); 9214 9215 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 9216 mddev->curr_resync > MD_RESYNC_ACTIVE) { 9217 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 9218 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9219 if (mddev->curr_resync >= mddev->recovery_cp) { 9220 pr_debug("md: checkpointing %s of %s.\n", 9221 desc, mdname(mddev)); 9222 if (test_bit(MD_RECOVERY_ERROR, 9223 &mddev->recovery)) 9224 mddev->recovery_cp = 9225 mddev->curr_resync_completed; 9226 else 9227 mddev->recovery_cp = 9228 mddev->curr_resync; 9229 } 9230 } else 9231 mddev->recovery_cp = MaxSector; 9232 } else { 9233 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9234 mddev->curr_resync = MaxSector; 9235 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9236 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { 9237 rcu_read_lock(); 9238 rdev_for_each_rcu(rdev, mddev) 9239 if (rdev->raid_disk >= 0 && 9240 mddev->delta_disks >= 0 && 9241 !test_bit(Journal, &rdev->flags) && 9242 !test_bit(Faulty, &rdev->flags) && 9243 !test_bit(In_sync, &rdev->flags) && 9244 rdev->recovery_offset < mddev->curr_resync) 9245 rdev->recovery_offset = mddev->curr_resync; 9246 rcu_read_unlock(); 9247 } 9248 } 9249 } 9250 skip: 9251 /* set CHANGE_PENDING here since maybe another update is needed, 9252 * so other nodes are informed. It should be harmless for normal 9253 * raid */ 9254 set_mask_bits(&mddev->sb_flags, 0, 9255 BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); 9256 9257 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9258 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9259 mddev->delta_disks > 0 && 9260 mddev->pers->finish_reshape && 9261 mddev->pers->size && 9262 !mddev_is_dm(mddev)) { 9263 mddev_lock_nointr(mddev); 9264 md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); 9265 mddev_unlock(mddev); 9266 if (!mddev_is_clustered(mddev)) 9267 set_capacity_and_notify(mddev->gendisk, 9268 mddev->array_sectors); 9269 } 9270 9271 spin_lock(&mddev->lock); 9272 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9273 /* We completed so min/max setting can be forgotten if used. */ 9274 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 9275 mddev->resync_min = 0; 9276 mddev->resync_max = MaxSector; 9277 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 9278 mddev->resync_min = mddev->curr_resync_completed; 9279 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 9280 mddev->curr_resync = MD_RESYNC_NONE; 9281 spin_unlock(&mddev->lock); 9282 9283 wake_up(&resync_wait); 9284 md_wakeup_thread(mddev->thread); 9285 return; 9286 } 9287 EXPORT_SYMBOL_GPL(md_do_sync); 9288 9289 static bool rdev_removeable(struct md_rdev *rdev) 9290 { 9291 /* rdev is not used. */ 9292 if (rdev->raid_disk < 0) 9293 return false; 9294 9295 /* There are still inflight io, don't remove this rdev. 
*/ 9296 if (atomic_read(&rdev->nr_pending)) 9297 return false; 9298 9299 /* 9300 * An error occurred but has not yet been acknowledged by the metadata 9301 * handler, don't remove this rdev. 9302 */ 9303 if (test_bit(Blocked, &rdev->flags)) 9304 return false; 9305 9306 /* Fautly rdev is not used, it's safe to remove it. */ 9307 if (test_bit(Faulty, &rdev->flags)) 9308 return true; 9309 9310 /* Journal disk can only be removed if it's faulty. */ 9311 if (test_bit(Journal, &rdev->flags)) 9312 return false; 9313 9314 /* 9315 * 'In_sync' is cleared while 'raid_disk' is valid, which means 9316 * replacement has just become active from pers->spare_active(), and 9317 * then pers->hot_remove_disk() will replace this rdev with replacement. 9318 */ 9319 if (!test_bit(In_sync, &rdev->flags)) 9320 return true; 9321 9322 return false; 9323 } 9324 9325 static bool rdev_is_spare(struct md_rdev *rdev) 9326 { 9327 return !test_bit(Candidate, &rdev->flags) && rdev->raid_disk >= 0 && 9328 !test_bit(In_sync, &rdev->flags) && 9329 !test_bit(Journal, &rdev->flags) && 9330 !test_bit(Faulty, &rdev->flags); 9331 } 9332 9333 static bool rdev_addable(struct md_rdev *rdev) 9334 { 9335 /* rdev is already used, don't add it again. */ 9336 if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 || 9337 test_bit(Faulty, &rdev->flags)) 9338 return false; 9339 9340 /* Allow to add journal disk. */ 9341 if (test_bit(Journal, &rdev->flags)) 9342 return true; 9343 9344 /* Allow to add if array is read-write. */ 9345 if (md_is_rdwr(rdev->mddev)) 9346 return true; 9347 9348 /* 9349 * For read-only array, only allow to readd a rdev. And if bitmap is 9350 * used, don't allow to readd a rdev that is too old. 9351 */ 9352 if (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags)) 9353 return true; 9354 9355 return false; 9356 } 9357 9358 static bool md_spares_need_change(struct mddev *mddev) 9359 { 9360 struct md_rdev *rdev; 9361 9362 rcu_read_lock(); 9363 rdev_for_each_rcu(rdev, mddev) { 9364 if (rdev_removeable(rdev) || rdev_addable(rdev)) { 9365 rcu_read_unlock(); 9366 return true; 9367 } 9368 } 9369 rcu_read_unlock(); 9370 return false; 9371 } 9372 9373 static int remove_and_add_spares(struct mddev *mddev, 9374 struct md_rdev *this) 9375 { 9376 struct md_rdev *rdev; 9377 int spares = 0; 9378 int removed = 0; 9379 9380 if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 9381 /* Mustn't remove devices when resync thread is running */ 9382 return 0; 9383 9384 rdev_for_each(rdev, mddev) { 9385 if ((this == NULL || rdev == this) && rdev_removeable(rdev) && 9386 !mddev->pers->hot_remove_disk(mddev, rdev)) { 9387 sysfs_unlink_rdev(mddev, rdev); 9388 rdev->saved_raid_disk = rdev->raid_disk; 9389 rdev->raid_disk = -1; 9390 removed++; 9391 } 9392 } 9393 9394 if (removed && mddev->kobj.sd) 9395 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 9396 9397 if (this && removed) 9398 goto no_add; 9399 9400 rdev_for_each(rdev, mddev) { 9401 if (this && this != rdev) 9402 continue; 9403 if (rdev_is_spare(rdev)) 9404 spares++; 9405 if (!rdev_addable(rdev)) 9406 continue; 9407 if (!test_bit(Journal, &rdev->flags)) 9408 rdev->recovery_offset = 0; 9409 if (mddev->pers->hot_add_disk(mddev, rdev) == 0) { 9410 /* failure here is OK */ 9411 sysfs_link_rdev(mddev, rdev); 9412 if (!test_bit(Journal, &rdev->flags)) 9413 spares++; 9414 md_new_event(); 9415 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9416 } 9417 } 9418 no_add: 9419 if (removed) 9420 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9421 return spares; 9422 } 9423 
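/*
 * Summary of the decision order implemented below in
 * md_choose_sync_action():
 *  1. resume an interrupted reshape, if one is recorded;
 *  2. otherwise remove failed devices and add spares, starting a recovery
 *     if any spares were added;
 *  3. otherwise continue an incomplete resync (recovery_cp < MaxSector);
 *  4. otherwise honour a user-requested resync/check/repair;
 *  5. otherwise report that nothing needs to be done.
 */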
9424 static bool md_choose_sync_action(struct mddev *mddev, int *spares) 9425 { 9426 /* Check if reshape is in progress first. */ 9427 if (mddev->reshape_position != MaxSector) { 9428 if (mddev->pers->check_reshape == NULL || 9429 mddev->pers->check_reshape(mddev) != 0) 9430 return false; 9431 9432 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 9433 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9434 return true; 9435 } 9436 9437 /* 9438 * Remove any failed drives, then add spares if possible. Spares are 9439 * also removed and re-added, to allow the personality to fail the 9440 * re-add. 9441 */ 9442 *spares = remove_and_add_spares(mddev, NULL); 9443 if (*spares) { 9444 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9445 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 9446 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 9447 9448 /* Start new recovery. */ 9449 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9450 return true; 9451 } 9452 9453 /* Check if recovery is in progress. */ 9454 if (mddev->recovery_cp < MaxSector) { 9455 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9456 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9457 return true; 9458 } 9459 9460 /* Delay to choose resync/check/repair in md_do_sync(). */ 9461 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 9462 return true; 9463 9464 /* Nothing to be done */ 9465 return false; 9466 } 9467 9468 static void md_start_sync(struct work_struct *ws) 9469 { 9470 struct mddev *mddev = container_of(ws, struct mddev, sync_work); 9471 int spares = 0; 9472 bool suspend = false; 9473 char *name; 9474 9475 /* 9476 * If reshape is still in progress, spares won't be added or removed 9477 * from conf until reshape is done. 9478 */ 9479 if (mddev->reshape_position == MaxSector && 9480 md_spares_need_change(mddev)) { 9481 suspend = true; 9482 mddev_suspend(mddev, false); 9483 } 9484 9485 mddev_lock_nointr(mddev); 9486 if (!md_is_rdwr(mddev)) { 9487 /* 9488 * On a read-only array we can: 9489 * - remove failed devices 9490 * - add already-in_sync devices if the array itself is in-sync. 9491 * As we only add devices that are already in-sync, we can 9492 * activate the spares immediately. 9493 */ 9494 remove_and_add_spares(mddev, NULL); 9495 goto not_running; 9496 } 9497 9498 if (!md_choose_sync_action(mddev, &spares)) 9499 goto not_running; 9500 9501 if (!mddev->pers->sync_request) 9502 goto not_running; 9503 9504 /* 9505 * We are adding a device or devices to an array which has the bitmap 9506 * stored on all devices. So make sure all bitmap pages get written. 9507 */ 9508 if (spares) 9509 md_bitmap_write_all(mddev->bitmap); 9510 9511 name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ? 9512 "reshape" : "resync"; 9513 rcu_assign_pointer(mddev->sync_thread, 9514 md_register_thread(md_do_sync, mddev, name)); 9515 if (!mddev->sync_thread) { 9516 pr_warn("%s: could not start resync thread...\n", 9517 mdname(mddev)); 9518 /* leave the spares where they are, it shouldn't hurt */ 9519 goto not_running; 9520 } 9521 9522 mddev_unlock(mddev); 9523 /* 9524 * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should 9525 * not set it again. Otherwise, we may cause issue like this one: 9526 * https://bugzilla.kernel.org/show_bug.cgi?id=218200 9527 * Therefore, use __mddev_resume(mddev, false). 
9528 */ 9529 if (suspend) 9530 __mddev_resume(mddev, false); 9531 md_wakeup_thread(mddev->sync_thread); 9532 sysfs_notify_dirent_safe(mddev->sysfs_action); 9533 md_new_event(); 9534 return; 9535 9536 not_running: 9537 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9538 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 9539 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 9540 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 9541 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9542 mddev_unlock(mddev); 9543 /* 9544 * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should 9545 * not set it again. Otherwise, we may cause issue like this one: 9546 * https://bugzilla.kernel.org/show_bug.cgi?id=218200 9547 * Therefore, use __mddev_resume(mddev, false). 9548 */ 9549 if (suspend) 9550 __mddev_resume(mddev, false); 9551 9552 wake_up(&resync_wait); 9553 if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && 9554 mddev->sysfs_action) 9555 sysfs_notify_dirent_safe(mddev->sysfs_action); 9556 } 9557 9558 static void unregister_sync_thread(struct mddev *mddev) 9559 { 9560 if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 9561 /* resync/recovery still happening */ 9562 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9563 return; 9564 } 9565 9566 if (WARN_ON_ONCE(!mddev->sync_thread)) 9567 return; 9568 9569 md_reap_sync_thread(mddev); 9570 } 9571 9572 /* 9573 * This routine is regularly called by all per-raid-array threads to 9574 * deal with generic issues like resync and super-block update. 9575 * Raid personalities that don't have a thread (linear/raid0) do not 9576 * need this as they never do any recovery or update the superblock. 9577 * 9578 * It does not do any resync itself, but rather "forks" off other threads 9579 * to do that as needed. 9580 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in 9581 * "->recovery" and create a thread at ->sync_thread. 9582 * When the thread finishes it sets MD_RECOVERY_DONE 9583 * and wakeups up this thread which will reap the thread and finish up. 9584 * This thread also removes any faulty devices (with nr_pending == 0). 9585 * 9586 * The overall approach is: 9587 * 1/ if the superblock needs updating, update it. 9588 * 2/ If a recovery thread is running, don't do anything else. 9589 * 3/ If recovery has finished, clean up, possibly marking spares active. 9590 * 4/ If there are any faulty devices, remove them. 9591 * 5/ If array is degraded, try to add spares devices 9592 * 6/ If array has spares or is not in-sync, start a resync thread. 9593 */ 9594 void md_check_recovery(struct mddev *mddev) 9595 { 9596 if (mddev->bitmap) 9597 md_bitmap_daemon_work(mddev); 9598 9599 if (signal_pending(current)) { 9600 if (mddev->pers->sync_request && !mddev->external) { 9601 pr_debug("md: %s in immediate safe mode\n", 9602 mdname(mddev)); 9603 mddev->safemode = 2; 9604 } 9605 flush_signals(current); 9606 } 9607 9608 if (!md_is_rdwr(mddev) && 9609 !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) && 9610 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 9611 return; 9612 if ( ! 
( 9613 (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) || 9614 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 9615 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 9616 (mddev->external == 0 && mddev->safemode == 1) || 9617 (mddev->safemode == 2 9618 && !mddev->in_sync && mddev->recovery_cp == MaxSector) 9619 )) 9620 return; 9621 9622 if (mddev_trylock(mddev)) { 9623 bool try_set_sync = mddev->safemode != 0; 9624 9625 if (!mddev->external && mddev->safemode == 1) 9626 mddev->safemode = 0; 9627 9628 if (!md_is_rdwr(mddev)) { 9629 struct md_rdev *rdev; 9630 9631 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 9632 unregister_sync_thread(mddev); 9633 goto unlock; 9634 } 9635 9636 if (!mddev->external && mddev->in_sync) 9637 /* 9638 * 'Blocked' flag not needed as failed devices 9639 * will be recorded if array switched to read/write. 9640 * Leaving it set will prevent the device 9641 * from being removed. 9642 */ 9643 rdev_for_each(rdev, mddev) 9644 clear_bit(Blocked, &rdev->flags); 9645 9646 /* 9647 * There is no thread, but we need to call 9648 * ->spare_active and clear saved_raid_disk 9649 */ 9650 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9651 md_reap_sync_thread(mddev); 9652 9653 /* 9654 * Let md_start_sync() to remove and add rdevs to the 9655 * array. 9656 */ 9657 if (md_spares_need_change(mddev)) { 9658 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9659 queue_work(md_misc_wq, &mddev->sync_work); 9660 } 9661 9662 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9663 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9664 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 9665 9666 goto unlock; 9667 } 9668 9669 if (mddev_is_clustered(mddev)) { 9670 struct md_rdev *rdev, *tmp; 9671 /* kick the device if another node issued a 9672 * remove disk. 9673 */ 9674 rdev_for_each_safe(rdev, tmp, mddev) { 9675 if (test_and_clear_bit(ClusterRemove, &rdev->flags) && 9676 rdev->raid_disk < 0) 9677 md_kick_rdev_from_array(rdev); 9678 } 9679 } 9680 9681 if (try_set_sync && !mddev->external && !mddev->in_sync) { 9682 spin_lock(&mddev->lock); 9683 set_in_sync(mddev); 9684 spin_unlock(&mddev->lock); 9685 } 9686 9687 if (mddev->sb_flags) 9688 md_update_sb(mddev, 0); 9689 9690 /* 9691 * Never start a new sync thread if MD_RECOVERY_RUNNING is 9692 * still set. 9693 */ 9694 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 9695 unregister_sync_thread(mddev); 9696 goto unlock; 9697 } 9698 9699 /* Set RUNNING before clearing NEEDED to avoid 9700 * any transients in the value of "sync_action". 
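	 * ("sync_action" is derived from these recovery bits, so a sysfs
	 *  reader between the two updates could otherwise briefly see
	 *  "idle" even though a sync is about to start.)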
9701 */ 9702 mddev->curr_resync_completed = 0; 9703 spin_lock(&mddev->lock); 9704 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9705 spin_unlock(&mddev->lock); 9706 /* Clear some bits that don't mean anything, but 9707 * might be left set 9708 */ 9709 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 9710 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 9711 9712 if (test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) && 9713 !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 9714 queue_work(md_misc_wq, &mddev->sync_work); 9715 } else { 9716 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9717 wake_up(&resync_wait); 9718 } 9719 9720 unlock: 9721 wake_up(&mddev->sb_wait); 9722 mddev_unlock(mddev); 9723 } 9724 } 9725 EXPORT_SYMBOL(md_check_recovery); 9726 9727 void md_reap_sync_thread(struct mddev *mddev) 9728 { 9729 struct md_rdev *rdev; 9730 sector_t old_dev_sectors = mddev->dev_sectors; 9731 bool is_reshaped = false; 9732 9733 /* resync has finished, collect result */ 9734 md_unregister_thread(mddev, &mddev->sync_thread); 9735 atomic_inc(&mddev->sync_seq); 9736 9737 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9738 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 9739 mddev->degraded != mddev->raid_disks) { 9740 /* success...*/ 9741 /* activate any spares */ 9742 if (mddev->pers->spare_active(mddev)) { 9743 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 9744 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9745 } 9746 } 9747 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9748 mddev->pers->finish_reshape) { 9749 mddev->pers->finish_reshape(mddev); 9750 if (mddev_is_clustered(mddev)) 9751 is_reshaped = true; 9752 } 9753 9754 /* If array is no-longer degraded, then any saved_raid_disk 9755 * information must be scrapped. 9756 */ 9757 if (!mddev->degraded) 9758 rdev_for_each(rdev, mddev) 9759 rdev->saved_raid_disk = -1; 9760 9761 md_update_sb(mddev, 1); 9762 /* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can 9763 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by 9764 * clustered raid */ 9765 if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags)) 9766 md_cluster_ops->resync_finish(mddev); 9767 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9768 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 9769 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9770 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 9771 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 9772 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 9773 /* 9774 * We call md_cluster_ops->update_size here because sync_size could 9775 * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, 9776 * so it is time to update size across cluster. 
9777 */ 9778 if (mddev_is_clustered(mddev) && is_reshaped 9779 && !test_bit(MD_CLOSING, &mddev->flags)) 9780 md_cluster_ops->update_size(mddev, old_dev_sectors); 9781 /* flag recovery needed just to double check */ 9782 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9783 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9784 sysfs_notify_dirent_safe(mddev->sysfs_action); 9785 md_new_event(); 9786 if (mddev->event_work.func) 9787 queue_work(md_misc_wq, &mddev->event_work); 9788 wake_up(&resync_wait); 9789 } 9790 EXPORT_SYMBOL(md_reap_sync_thread); 9791 9792 void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) 9793 { 9794 sysfs_notify_dirent_safe(rdev->sysfs_state); 9795 wait_event_timeout(rdev->blocked_wait, 9796 !test_bit(Blocked, &rdev->flags) && 9797 !test_bit(BlockedBadBlocks, &rdev->flags), 9798 msecs_to_jiffies(5000)); 9799 rdev_dec_pending(rdev, mddev); 9800 } 9801 EXPORT_SYMBOL(md_wait_for_blocked_rdev); 9802 9803 void md_finish_reshape(struct mddev *mddev) 9804 { 9805 /* called be personality module when reshape completes. */ 9806 struct md_rdev *rdev; 9807 9808 rdev_for_each(rdev, mddev) { 9809 if (rdev->data_offset > rdev->new_data_offset) 9810 rdev->sectors += rdev->data_offset - rdev->new_data_offset; 9811 else 9812 rdev->sectors -= rdev->new_data_offset - rdev->data_offset; 9813 rdev->data_offset = rdev->new_data_offset; 9814 } 9815 } 9816 EXPORT_SYMBOL(md_finish_reshape); 9817 9818 /* Bad block management */ 9819 9820 /* Returns 1 on success, 0 on failure */ 9821 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 9822 int is_new) 9823 { 9824 struct mddev *mddev = rdev->mddev; 9825 int rv; 9826 if (is_new) 9827 s += rdev->new_data_offset; 9828 else 9829 s += rdev->data_offset; 9830 rv = badblocks_set(&rdev->badblocks, s, sectors, 0); 9831 if (rv == 0) { 9832 /* Make sure they get written out promptly */ 9833 if (test_bit(ExternalBbl, &rdev->flags)) 9834 sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); 9835 sysfs_notify_dirent_safe(rdev->sysfs_state); 9836 set_mask_bits(&mddev->sb_flags, 0, 9837 BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); 9838 md_wakeup_thread(rdev->mddev->thread); 9839 return 1; 9840 } else 9841 return 0; 9842 } 9843 EXPORT_SYMBOL_GPL(rdev_set_badblocks); 9844 9845 int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 9846 int is_new) 9847 { 9848 int rv; 9849 if (is_new) 9850 s += rdev->new_data_offset; 9851 else 9852 s += rdev->data_offset; 9853 rv = badblocks_clear(&rdev->badblocks, s, sectors); 9854 if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags)) 9855 sysfs_notify_dirent_safe(rdev->sysfs_badblocks); 9856 return rv; 9857 } 9858 EXPORT_SYMBOL_GPL(rdev_clear_badblocks); 9859 9860 static int md_notify_reboot(struct notifier_block *this, 9861 unsigned long code, void *x) 9862 { 9863 struct mddev *mddev, *n; 9864 int need_delay = 0; 9865 9866 spin_lock(&all_mddevs_lock); 9867 list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { 9868 if (!mddev_get(mddev)) 9869 continue; 9870 spin_unlock(&all_mddevs_lock); 9871 if (mddev_trylock(mddev)) { 9872 if (mddev->pers) 9873 __md_stop_writes(mddev); 9874 if (mddev->persistent) 9875 mddev->safemode = 2; 9876 mddev_unlock(mddev); 9877 } 9878 need_delay = 1; 9879 mddev_put(mddev); 9880 spin_lock(&all_mddevs_lock); 9881 } 9882 spin_unlock(&all_mddevs_lock); 9883 9884 /* 9885 * certain more exotic SCSI devices are known to be 9886 * volatile wrt too early system reboots. 
While the 9887 * right place to handle this issue is the given 9888 * driver, we do want to have a safe RAID driver ... 9889 */ 9890 if (need_delay) 9891 msleep(1000); 9892 9893 return NOTIFY_DONE; 9894 } 9895 9896 static struct notifier_block md_notifier = { 9897 .notifier_call = md_notify_reboot, 9898 .next = NULL, 9899 .priority = INT_MAX, /* before any real devices */ 9900 }; 9901 9902 static void md_geninit(void) 9903 { 9904 pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 9905 9906 proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops); 9907 } 9908 9909 static int __init md_init(void) 9910 { 9911 int ret = -ENOMEM; 9912 9913 md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); 9914 if (!md_wq) 9915 goto err_wq; 9916 9917 md_misc_wq = alloc_workqueue("md_misc", 0, 0); 9918 if (!md_misc_wq) 9919 goto err_misc_wq; 9920 9921 md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND, 9922 0); 9923 if (!md_bitmap_wq) 9924 goto err_bitmap_wq; 9925 9926 ret = __register_blkdev(MD_MAJOR, "md", md_probe); 9927 if (ret < 0) 9928 goto err_md; 9929 9930 ret = __register_blkdev(0, "mdp", md_probe); 9931 if (ret < 0) 9932 goto err_mdp; 9933 mdp_major = ret; 9934 9935 register_reboot_notifier(&md_notifier); 9936 raid_table_header = register_sysctl("dev/raid", raid_table); 9937 9938 md_geninit(); 9939 return 0; 9940 9941 err_mdp: 9942 unregister_blkdev(MD_MAJOR, "md"); 9943 err_md: 9944 destroy_workqueue(md_bitmap_wq); 9945 err_bitmap_wq: 9946 destroy_workqueue(md_misc_wq); 9947 err_misc_wq: 9948 destroy_workqueue(md_wq); 9949 err_wq: 9950 return ret; 9951 } 9952 9953 static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) 9954 { 9955 struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 9956 struct md_rdev *rdev2, *tmp; 9957 int role, ret; 9958 9959 /* 9960 * If size is changed in another node then we need to 9961 * do resize as well. 9962 */ 9963 if (mddev->dev_sectors != le64_to_cpu(sb->size)) { 9964 ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size)); 9965 if (ret) 9966 pr_info("md-cluster: resize failed\n"); 9967 else 9968 md_bitmap_update_sb(mddev->bitmap); 9969 } 9970 9971 /* Check for change of roles in the active devices */ 9972 rdev_for_each_safe(rdev2, tmp, mddev) { 9973 if (test_bit(Faulty, &rdev2->flags)) 9974 continue; 9975 9976 /* Check if the roles changed */ 9977 role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); 9978 9979 if (test_bit(Candidate, &rdev2->flags)) { 9980 if (role == MD_DISK_ROLE_FAULTY) { 9981 pr_info("md: Removing Candidate device %pg because add failed\n", 9982 rdev2->bdev); 9983 md_kick_rdev_from_array(rdev2); 9984 continue; 9985 } 9986 else 9987 clear_bit(Candidate, &rdev2->flags); 9988 } 9989 9990 if (role != rdev2->raid_disk) { 9991 /* 9992 * got activated except reshape is happening. 9993 */ 9994 if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE && 9995 !(le32_to_cpu(sb->feature_map) & 9996 MD_FEATURE_RESHAPE_ACTIVE) && 9997 !md_cluster_ops->resync_status_get(mddev)) { 9998 /* 9999 * -1 to make raid1_add_disk() set conf->fullsync 10000 * to 1. This could avoid skipping sync when the 10001 * remote node is down during resyncing. 
static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	struct md_rdev *rdev2, *tmp;
	int role, ret;

	/*
	 * If size is changed in another node then we need to
	 * do resize as well.
	 */
	if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
		ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
		if (ret)
			pr_info("md-cluster: resize failed\n");
		else
			md_bitmap_update_sb(mddev->bitmap);
	}

	/* Check for change of roles in the active devices */
	rdev_for_each_safe(rdev2, tmp, mddev) {
		if (test_bit(Faulty, &rdev2->flags))
			continue;

		/* Check if the roles changed */
		role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);

		if (test_bit(Candidate, &rdev2->flags)) {
			if (role == MD_DISK_ROLE_FAULTY) {
				pr_info("md: Removing Candidate device %pg because add failed\n",
					rdev2->bdev);
				md_kick_rdev_from_array(rdev2);
				continue;
			} else
				clear_bit(Candidate, &rdev2->flags);
		}

		if (role != rdev2->raid_disk) {
			/*
			 * The device got activated, unless a reshape is
			 * happening.
			 */
			if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE &&
			    !(le32_to_cpu(sb->feature_map) &
			      MD_FEATURE_RESHAPE_ACTIVE) &&
			    !md_cluster_ops->resync_status_get(mddev)) {
				/*
				 * Use -1 to make raid1_add_disk() set
				 * conf->fullsync to 1.  This avoids skipping
				 * the sync when the remote node goes down
				 * during resync.
				 */
				if ((le32_to_cpu(sb->feature_map)
				     & MD_FEATURE_RECOVERY_OFFSET))
					rdev2->saved_raid_disk = -1;
				else
					rdev2->saved_raid_disk = role;
				ret = remove_and_add_spares(mddev, rdev2);
				pr_info("Activated spare: %pg\n",
					rdev2->bdev);
				/*
				 * Wake up mddev->thread here, so the array
				 * can resync with the newly activated disk.
				 */
				set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
				md_wakeup_thread(mddev->thread);
			}
			/*
			 * Device faulty: we just want to do the minimum to
			 * mark the disk as faulty.  The recovery is performed
			 * by the node that initiated the error.
			 */
			if (role == MD_DISK_ROLE_FAULTY ||
			    role == MD_DISK_ROLE_JOURNAL) {
				md_error(mddev, rdev2);
				clear_bit(Blocked, &rdev2->flags);
			}
		}
	}

	if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) {
		ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
		if (ret)
			pr_warn("md: updating array disks failed. %d\n", ret);
	}

	/*
	 * mddev->delta_disks has already been updated in update_raid_disks(),
	 * so this is the time to check for a reshape.
	 */
	if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
		/*
		 * A reshape is happening in the remote node, so we need to
		 * update reshape_position and call start_reshape.
		 */
		mddev->reshape_position = le64_to_cpu(sb->reshape_position);
		if (mddev->pers->update_reshape_pos)
			mddev->pers->update_reshape_pos(mddev);
		if (mddev->pers->start_reshape)
			mddev->pers->start_reshape(mddev);
	} else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
		   mddev->reshape_position != MaxSector &&
		   !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
		/* The reshape has just finished in another node. */
		mddev->reshape_position = MaxSector;
		if (mddev->pers->update_reshape_pos)
			mddev->pers->update_reshape_pos(mddev);
	}

	/* Finally set the event count to be up to date */
	mddev->events = le64_to_cpu(sb->events);
}
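/*
 * Annotation (not part of the original source): check_sb_changes() is the
 * md-cluster path that reconciles local state with a superblock written by
 * another node.  It works through the fields in order: array size,
 * per-device roles (activating spares or marking devices faulty/journal),
 * the raid_disks count, the reshape position, and finally the event count.
 */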
static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
	int err;
	struct page *swapout = rdev->sb_page;
	struct mdp_superblock_1 *sb;

	/* Store the sb page of the rdev in the swapout temporary
	 * variable in case we err in the future
	 */
	rdev->sb_page = NULL;
	err = alloc_disk_sb(rdev);
	if (err == 0) {
		ClearPageUptodate(rdev->sb_page);
		rdev->sb_loaded = 0;
		err = super_types[mddev->major_version].
			load_super(rdev, NULL, mddev->minor_version);
	}
	if (err < 0) {
		pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
			__func__, __LINE__, rdev->desc_nr, err);
		if (rdev->sb_page)
			put_page(rdev->sb_page);
		rdev->sb_page = swapout;
		rdev->sb_loaded = 1;
		return err;
	}

	sb = page_address(rdev->sb_page);
	/* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET
	 * is not set
	 */
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
		rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);

	/* The other node finished recovery, call spare_active to set
	 * device In_sync and mddev->degraded
	 */
	if (rdev->recovery_offset == MaxSector &&
	    !test_bit(In_sync, &rdev->flags) &&
	    mddev->pers->spare_active(mddev))
		sysfs_notify_dirent_safe(mddev->sysfs_degraded);

	put_page(swapout);
	return 0;
}

void md_reload_sb(struct mddev *mddev, int nr)
{
	struct md_rdev *rdev = NULL, *iter;
	int err;

	/* Find the rdev */
	rdev_for_each_rcu(iter, mddev) {
		if (iter->desc_nr == nr) {
			rdev = iter;
			break;
		}
	}

	if (!rdev) {
		pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
		return;
	}

	err = read_rdev(mddev, rdev);
	if (err < 0)
		return;

	check_sb_changes(mddev, rdev);

	/* Read all rdevs to update recovery_offset */
	rdev_for_each_rcu(rdev, mddev) {
		if (!test_bit(Faulty, &rdev->flags))
			read_rdev(mddev, rdev);
	}
}
EXPORT_SYMBOL(md_reload_sb);

#ifndef MODULE

/*
 * Searches all registered partitions for autorun RAID arrays
 * at boot time.
 */

static DEFINE_MUTEX(detected_devices_mutex);
static LIST_HEAD(all_detected_devices);
struct detected_devices_node {
	struct list_head list;
	dev_t dev;
};

void md_autodetect_dev(dev_t dev)
{
	struct detected_devices_node *node_detected_dev;

	node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
	if (node_detected_dev) {
		node_detected_dev->dev = dev;
		mutex_lock(&detected_devices_mutex);
		list_add_tail(&node_detected_dev->list, &all_detected_devices);
		mutex_unlock(&detected_devices_mutex);
	}
}

void md_autostart_arrays(int part)
{
	struct md_rdev *rdev;
	struct detected_devices_node *node_detected_dev;
	dev_t dev;
	int i_scanned, i_passed;

	i_scanned = 0;
	i_passed = 0;

	pr_info("md: Autodetecting RAID arrays.\n");

	mutex_lock(&detected_devices_mutex);
	while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
		i_scanned++;
		node_detected_dev = list_entry(all_detected_devices.next,
					struct detected_devices_node, list);
		list_del(&node_detected_dev->list);
		dev = node_detected_dev->dev;
		kfree(node_detected_dev);
		mutex_unlock(&detected_devices_mutex);
		rdev = md_import_device(dev, 0, 90);
		mutex_lock(&detected_devices_mutex);
		if (IS_ERR(rdev))
			continue;

		if (test_bit(Faulty, &rdev->flags))
			continue;

		set_bit(AutoDetected, &rdev->flags);
		list_add(&rdev->same_set, &pending_raid_disks);
		i_passed++;
	}
	mutex_unlock(&detected_devices_mutex);

	pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);

	autorun_devices(part);
}

#endif /* !MODULE */
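/*
 * Annotation (not part of the original source): md_autodetect_dev() is
 * invoked from the partition-scanning code at boot for partitions of the
 * Linux RAID autodetect type (0xfd) when md is built in;
 * md_autostart_arrays() then imports each queued device and lets
 * autorun_devices() assemble the arrays.  Boot-time autodetection can
 * typically be suppressed with the "raid=noautodetect" kernel parameter.
 */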
static __exit void md_exit(void)
{
	struct mddev *mddev, *n;
	int delay = 1;

	unregister_blkdev(MD_MAJOR, "md");
	unregister_blkdev(mdp_major, "mdp");
	unregister_reboot_notifier(&md_notifier);
	unregister_sysctl_table(raid_table_header);

	/* We cannot unload the modules while some process is
	 * waiting for us in select() or poll() - wake them up
	 */
	md_unloading = 1;
	while (waitqueue_active(&md_event_waiters)) {
		/* not safe to leave yet */
		wake_up(&md_event_waiters);
		msleep(delay);
		delay += delay;
	}
	remove_proc_entry("mdstat", NULL);

	spin_lock(&all_mddevs_lock);
	list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
		if (!mddev_get(mddev))
			continue;
		spin_unlock(&all_mddevs_lock);
		export_array(mddev);
		mddev->ctime = 0;
		mddev->hold_active = 0;
		/*
		 * As the mddev is now fully clear, mddev_put will schedule
		 * the mddev for destruction by a workqueue, and the
		 * destroy_workqueue() below will wait for that to complete.
		 */
		mddev_put(mddev);
		spin_lock(&all_mddevs_lock);
	}
	spin_unlock(&all_mddevs_lock);

	destroy_workqueue(md_misc_wq);
	destroy_workqueue(md_bitmap_wq);
	destroy_workqueue(md_wq);
}

subsys_initcall(md_init);
module_exit(md_exit)

static int get_ro(char *buffer, const struct kernel_param *kp)
{
	return sprintf(buffer, "%d\n", start_readonly);
}

static int set_ro(const char *val, const struct kernel_param *kp)
{
	return kstrtouint(val, 10, (unsigned int *)&start_readonly);
}

module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
module_param(create_on_open, bool, S_IRUSR|S_IWUSR);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MD RAID framework");
MODULE_ALIAS("md");
MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
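/*
 * Usage note (annotation, not part of the original source): when md is built
 * as a module these knobs are typically set at load time, e.g.
 * "modprobe md_mod start_ro=1", or afterwards through
 * /sys/module/md_mod/parameters/.  start_ro=1 makes newly started arrays
 * come up read-only until they are first written to or explicitly switched
 * to read-write; start_dirty_degraded allows starting an array that is both
 * dirty and degraded, which risks data corruption and is mainly intended for
 * root-on-RAID setups.
 */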