// SPDX-License-Identifier: GPL-2.0-or-later
/*
   md.c : Multiple Devices driver for Linux
     Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.


   Errors, Warnings, etc.
   Please use:
     pr_crit() for error conditions that risk data loss
     pr_err() for error conditions that are unexpected, like an IO error
         or internal inconsistency
     pr_warn() for error conditions that could have been predicted, like
         adding a device to an array when it has incompatible metadata
     pr_info() for interesting, very rare events, like an array starting
         or stopping, or resync starting or stopping
     pr_debug() for everything else.

*/

#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/badblocks.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/major.h>
#include <linux/module.h>
#include <linux/reboot.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/raid/detect.h>
#include <linux/slab.h>
#include <linux/percpu-refcount.h>
#include <linux/part_stat.h>

#include <trace/events/block.h>
#include "md.h"
#include "md-bitmap.h"
#include "md-cluster.h"

/* pers_list is a list of registered personalities protected by pers_lock. */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

static const struct kobj_type md_ktype;

struct md_cluster_operations *md_cluster_ops;
EXPORT_SYMBOL(md_cluster_ops);
static struct module *md_cluster_mod;

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
static struct workqueue_struct *md_misc_wq;
struct workqueue_struct *md_bitmap_wq;

static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);
static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);
static void md_wakeup_thread_directly(struct md_thread __rcu *thread);

enum md_ro_state {
	MD_RDWR,
	MD_RDONLY,
	MD_AUTO_READ,
	MD_MAX_STATE
};

static bool md_is_rdwr(struct mddev *mddev)
{
	return (mddev->ro == MD_RDWR);
}

/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
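 *
 * Worked example (illustrative only): a device that has accumulated 16
 * corrected read errors and hits its next read error two hours later is
 * treated as having 16 / 2 / 2 = 4 such errors, comfortably below the
 * default limit of 20 defined just below, so it is not ejected.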
110 */ 111 #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20 112 /* Default safemode delay: 200 msec */ 113 #define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1) 114 /* 115 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 116 * is 1000 KB/sec, so the extra system load does not show up that much. 117 * Increase it if you want to have more _guaranteed_ speed. Note that 118 * the RAID driver will use the maximum available bandwidth if the IO 119 * subsystem is idle. There is also an 'absolute maximum' reconstruction 120 * speed limit - in case reconstruction slows down your system despite 121 * idle IO detection. 122 * 123 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. 124 * or /sys/block/mdX/md/sync_speed_{min,max} 125 */ 126 127 static int sysctl_speed_limit_min = 1000; 128 static int sysctl_speed_limit_max = 200000; 129 static inline int speed_min(struct mddev *mddev) 130 { 131 return mddev->sync_speed_min ? 132 mddev->sync_speed_min : sysctl_speed_limit_min; 133 } 134 135 static inline int speed_max(struct mddev *mddev) 136 { 137 return mddev->sync_speed_max ? 138 mddev->sync_speed_max : sysctl_speed_limit_max; 139 } 140 141 static void rdev_uninit_serial(struct md_rdev *rdev) 142 { 143 if (!test_and_clear_bit(CollisionCheck, &rdev->flags)) 144 return; 145 146 kvfree(rdev->serial); 147 rdev->serial = NULL; 148 } 149 150 static void rdevs_uninit_serial(struct mddev *mddev) 151 { 152 struct md_rdev *rdev; 153 154 rdev_for_each(rdev, mddev) 155 rdev_uninit_serial(rdev); 156 } 157 158 static int rdev_init_serial(struct md_rdev *rdev) 159 { 160 /* serial_nums equals with BARRIER_BUCKETS_NR */ 161 int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t)))); 162 struct serial_in_rdev *serial = NULL; 163 164 if (test_bit(CollisionCheck, &rdev->flags)) 165 return 0; 166 167 serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums, 168 GFP_KERNEL); 169 if (!serial) 170 return -ENOMEM; 171 172 for (i = 0; i < serial_nums; i++) { 173 struct serial_in_rdev *serial_tmp = &serial[i]; 174 175 spin_lock_init(&serial_tmp->serial_lock); 176 serial_tmp->serial_rb = RB_ROOT_CACHED; 177 init_waitqueue_head(&serial_tmp->serial_io_wait); 178 } 179 180 rdev->serial = serial; 181 set_bit(CollisionCheck, &rdev->flags); 182 183 return 0; 184 } 185 186 static int rdevs_init_serial(struct mddev *mddev) 187 { 188 struct md_rdev *rdev; 189 int ret = 0; 190 191 rdev_for_each(rdev, mddev) { 192 ret = rdev_init_serial(rdev); 193 if (ret) 194 break; 195 } 196 197 /* Free all resources if pool is not existed */ 198 if (ret && !mddev->serial_info_pool) 199 rdevs_uninit_serial(mddev); 200 201 return ret; 202 } 203 204 /* 205 * rdev needs to enable serial stuffs if it meets the conditions: 206 * 1. it is multi-queue device flaged with writemostly. 207 * 2. the write-behind mode is enabled. 208 */ 209 static int rdev_need_serial(struct md_rdev *rdev) 210 { 211 return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 && 212 rdev->bdev->bd_disk->queue->nr_hw_queues != 1 && 213 test_bit(WriteMostly, &rdev->flags)); 214 } 215 216 /* 217 * Init resource for rdev(s), then create serial_info_pool if: 218 * 1. rdev is the first device which return true from rdev_enable_serial. 219 * 2. rdev is NULL, means we want to enable serialization for all rdevs. 
220 */ 221 void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev) 222 { 223 int ret = 0; 224 225 if (rdev && !rdev_need_serial(rdev) && 226 !test_bit(CollisionCheck, &rdev->flags)) 227 return; 228 229 if (!rdev) 230 ret = rdevs_init_serial(mddev); 231 else 232 ret = rdev_init_serial(rdev); 233 if (ret) 234 return; 235 236 if (mddev->serial_info_pool == NULL) { 237 /* 238 * already in memalloc noio context by 239 * mddev_suspend() 240 */ 241 mddev->serial_info_pool = 242 mempool_create_kmalloc_pool(NR_SERIAL_INFOS, 243 sizeof(struct serial_info)); 244 if (!mddev->serial_info_pool) { 245 rdevs_uninit_serial(mddev); 246 pr_err("can't alloc memory pool for serialization\n"); 247 } 248 } 249 } 250 251 /* 252 * Free resource from rdev(s), and destroy serial_info_pool under conditions: 253 * 1. rdev is the last device flaged with CollisionCheck. 254 * 2. when bitmap is destroyed while policy is not enabled. 255 * 3. for disable policy, the pool is destroyed only when no rdev needs it. 256 */ 257 void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) 258 { 259 if (rdev && !test_bit(CollisionCheck, &rdev->flags)) 260 return; 261 262 if (mddev->serial_info_pool) { 263 struct md_rdev *temp; 264 int num = 0; /* used to track if other rdevs need the pool */ 265 266 rdev_for_each(temp, mddev) { 267 if (!rdev) { 268 if (!mddev->serialize_policy || 269 !rdev_need_serial(temp)) 270 rdev_uninit_serial(temp); 271 else 272 num++; 273 } else if (temp != rdev && 274 test_bit(CollisionCheck, &temp->flags)) 275 num++; 276 } 277 278 if (rdev) 279 rdev_uninit_serial(rdev); 280 281 if (num) 282 pr_info("The mempool could be used by other devices\n"); 283 else { 284 mempool_destroy(mddev->serial_info_pool); 285 mddev->serial_info_pool = NULL; 286 } 287 } 288 } 289 290 static struct ctl_table_header *raid_table_header; 291 292 static struct ctl_table raid_table[] = { 293 { 294 .procname = "speed_limit_min", 295 .data = &sysctl_speed_limit_min, 296 .maxlen = sizeof(int), 297 .mode = S_IRUGO|S_IWUSR, 298 .proc_handler = proc_dointvec, 299 }, 300 { 301 .procname = "speed_limit_max", 302 .data = &sysctl_speed_limit_max, 303 .maxlen = sizeof(int), 304 .mode = S_IRUGO|S_IWUSR, 305 .proc_handler = proc_dointvec, 306 }, 307 }; 308 309 static int start_readonly; 310 311 /* 312 * The original mechanism for creating an md device is to create 313 * a device node in /dev and to open it. This causes races with device-close. 314 * The preferred method is to write to the "new_array" module parameter. 315 * This can avoid races. 316 * Setting create_on_open to false disables the original mechanism 317 * so all the races disappear. 318 */ 319 static bool create_on_open = true; 320 321 /* 322 * We have a system wide 'event count' that is incremented 323 * on any 'interesting' event, and readers of /proc/mdstat 324 * can use 'poll' or 'select' to find out when the event 325 * count increases. 326 * 327 * Events are: 328 * start array, stop array, error, add device, remove device, 329 * start build, activate spare 330 */ 331 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); 332 static atomic_t md_event_count; 333 void md_new_event(void) 334 { 335 atomic_inc(&md_event_count); 336 wake_up(&md_event_waiters); 337 } 338 EXPORT_SYMBOL_GPL(md_new_event); 339 340 /* 341 * Enables to iterate over all existing md arrays 342 * all_mddevs_lock protects this list. 
343 */ 344 static LIST_HEAD(all_mddevs); 345 static DEFINE_SPINLOCK(all_mddevs_lock); 346 347 static bool is_md_suspended(struct mddev *mddev) 348 { 349 return percpu_ref_is_dying(&mddev->active_io); 350 } 351 /* Rather than calling directly into the personality make_request function, 352 * IO requests come here first so that we can check if the device is 353 * being suspended pending a reconfiguration. 354 * We hold a refcount over the call to ->make_request. By the time that 355 * call has finished, the bio has been linked into some internal structure 356 * and so is visible to ->quiesce(), so we don't need the refcount any more. 357 */ 358 static bool is_suspended(struct mddev *mddev, struct bio *bio) 359 { 360 if (is_md_suspended(mddev)) 361 return true; 362 if (bio_data_dir(bio) != WRITE) 363 return false; 364 if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi)) 365 return false; 366 if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi)) 367 return false; 368 if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo)) 369 return false; 370 return true; 371 } 372 373 void md_handle_request(struct mddev *mddev, struct bio *bio) 374 { 375 check_suspended: 376 if (is_suspended(mddev, bio)) { 377 DEFINE_WAIT(__wait); 378 /* Bail out if REQ_NOWAIT is set for the bio */ 379 if (bio->bi_opf & REQ_NOWAIT) { 380 bio_wouldblock_error(bio); 381 return; 382 } 383 for (;;) { 384 prepare_to_wait(&mddev->sb_wait, &__wait, 385 TASK_UNINTERRUPTIBLE); 386 if (!is_suspended(mddev, bio)) 387 break; 388 schedule(); 389 } 390 finish_wait(&mddev->sb_wait, &__wait); 391 } 392 if (!percpu_ref_tryget_live(&mddev->active_io)) 393 goto check_suspended; 394 395 if (!mddev->pers->make_request(mddev, bio)) { 396 percpu_ref_put(&mddev->active_io); 397 goto check_suspended; 398 } 399 400 percpu_ref_put(&mddev->active_io); 401 } 402 EXPORT_SYMBOL(md_handle_request); 403 404 static void md_submit_bio(struct bio *bio) 405 { 406 const int rw = bio_data_dir(bio); 407 struct mddev *mddev = bio->bi_bdev->bd_disk->private_data; 408 409 if (mddev == NULL || mddev->pers == NULL) { 410 bio_io_error(bio); 411 return; 412 } 413 414 if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) { 415 bio_io_error(bio); 416 return; 417 } 418 419 bio = bio_split_to_limits(bio); 420 if (!bio) 421 return; 422 423 if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) { 424 if (bio_sectors(bio) != 0) 425 bio->bi_status = BLK_STS_IOERR; 426 bio_endio(bio); 427 return; 428 } 429 430 /* bio could be mergeable after passing to underlayer */ 431 bio->bi_opf &= ~REQ_NOMERGE; 432 433 md_handle_request(mddev, bio); 434 } 435 436 /* 437 * Make sure no new requests are submitted to the device, and any requests that 438 * have been submitted are completely handled. 439 */ 440 int mddev_suspend(struct mddev *mddev, bool interruptible) 441 { 442 int err = 0; 443 444 /* 445 * hold reconfig_mutex to wait for normal io will deadlock, because 446 * other context can't update super_block, and normal io can rely on 447 * updating super_block. 
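 *
 * A reconfiguration path is therefore expected to suspend first and only
 * then take reconfig_mutex, resuming after the unlock. A rough sketch of
 * that calling pattern (illustrative, not a literal call site; helpers
 * along the lines of mddev_suspend_and_lock() in md.h wrap this sequence):
 *
 *	err = mddev_suspend(mddev, true);
 *	if (err)
 *		return err;
 *	err = mddev_lock(mddev);
 *	if (err) {
 *		mddev_resume(mddev);
 *		return err;
 *	}
 *	... reconfigure the array ...
 *	mddev_unlock(mddev);
 *	mddev_resume(mddev);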
448 */ 449 lockdep_assert_not_held(&mddev->reconfig_mutex); 450 451 if (interruptible) 452 err = mutex_lock_interruptible(&mddev->suspend_mutex); 453 else 454 mutex_lock(&mddev->suspend_mutex); 455 if (err) 456 return err; 457 458 if (mddev->suspended) { 459 WRITE_ONCE(mddev->suspended, mddev->suspended + 1); 460 mutex_unlock(&mddev->suspend_mutex); 461 return 0; 462 } 463 464 percpu_ref_kill(&mddev->active_io); 465 if (interruptible) 466 err = wait_event_interruptible(mddev->sb_wait, 467 percpu_ref_is_zero(&mddev->active_io)); 468 else 469 wait_event(mddev->sb_wait, 470 percpu_ref_is_zero(&mddev->active_io)); 471 if (err) { 472 percpu_ref_resurrect(&mddev->active_io); 473 mutex_unlock(&mddev->suspend_mutex); 474 return err; 475 } 476 477 /* 478 * For raid456, io might be waiting for reshape to make progress, 479 * allow new reshape to start while waiting for io to be done to 480 * prevent deadlock. 481 */ 482 WRITE_ONCE(mddev->suspended, mddev->suspended + 1); 483 484 del_timer_sync(&mddev->safemode_timer); 485 /* restrict memory reclaim I/O during raid array is suspend */ 486 mddev->noio_flag = memalloc_noio_save(); 487 488 mutex_unlock(&mddev->suspend_mutex); 489 return 0; 490 } 491 EXPORT_SYMBOL_GPL(mddev_suspend); 492 493 static void __mddev_resume(struct mddev *mddev, bool recovery_needed) 494 { 495 lockdep_assert_not_held(&mddev->reconfig_mutex); 496 497 mutex_lock(&mddev->suspend_mutex); 498 WRITE_ONCE(mddev->suspended, mddev->suspended - 1); 499 if (mddev->suspended) { 500 mutex_unlock(&mddev->suspend_mutex); 501 return; 502 } 503 504 /* entred the memalloc scope from mddev_suspend() */ 505 memalloc_noio_restore(mddev->noio_flag); 506 507 percpu_ref_resurrect(&mddev->active_io); 508 wake_up(&mddev->sb_wait); 509 510 if (recovery_needed) 511 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 512 md_wakeup_thread(mddev->thread); 513 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 514 515 mutex_unlock(&mddev->suspend_mutex); 516 } 517 518 void mddev_resume(struct mddev *mddev) 519 { 520 return __mddev_resume(mddev, true); 521 } 522 EXPORT_SYMBOL_GPL(mddev_resume); 523 524 /* 525 * Generic flush handling for md 526 */ 527 528 static void md_end_flush(struct bio *bio) 529 { 530 struct md_rdev *rdev = bio->bi_private; 531 struct mddev *mddev = rdev->mddev; 532 533 bio_put(bio); 534 535 rdev_dec_pending(rdev, mddev); 536 537 if (atomic_dec_and_test(&mddev->flush_pending)) { 538 /* The pre-request flush has finished */ 539 queue_work(md_wq, &mddev->flush_work); 540 } 541 } 542 543 static void md_submit_flush_data(struct work_struct *ws); 544 545 static void submit_flushes(struct work_struct *ws) 546 { 547 struct mddev *mddev = container_of(ws, struct mddev, flush_work); 548 struct md_rdev *rdev; 549 550 mddev->start_flush = ktime_get_boottime(); 551 INIT_WORK(&mddev->flush_work, md_submit_flush_data); 552 atomic_set(&mddev->flush_pending, 1); 553 rcu_read_lock(); 554 rdev_for_each_rcu(rdev, mddev) 555 if (rdev->raid_disk >= 0 && 556 !test_bit(Faulty, &rdev->flags)) { 557 /* Take two references, one is dropped 558 * when request finishes, one after 559 * we reclaim rcu_read_lock 560 */ 561 struct bio *bi; 562 atomic_inc(&rdev->nr_pending); 563 atomic_inc(&rdev->nr_pending); 564 rcu_read_unlock(); 565 bi = bio_alloc_bioset(rdev->bdev, 0, 566 REQ_OP_WRITE | REQ_PREFLUSH, 567 GFP_NOIO, &mddev->bio_set); 568 bi->bi_end_io = md_end_flush; 569 bi->bi_private = rdev; 570 atomic_inc(&mddev->flush_pending); 571 submit_bio(bi); 572 rcu_read_lock(); 573 rdev_dec_pending(rdev, 
mddev); 574 } 575 rcu_read_unlock(); 576 if (atomic_dec_and_test(&mddev->flush_pending)) 577 queue_work(md_wq, &mddev->flush_work); 578 } 579 580 static void md_submit_flush_data(struct work_struct *ws) 581 { 582 struct mddev *mddev = container_of(ws, struct mddev, flush_work); 583 struct bio *bio = mddev->flush_bio; 584 585 /* 586 * must reset flush_bio before calling into md_handle_request to avoid a 587 * deadlock, because other bios passed md_handle_request suspend check 588 * could wait for this and below md_handle_request could wait for those 589 * bios because of suspend check 590 */ 591 spin_lock_irq(&mddev->lock); 592 mddev->prev_flush_start = mddev->start_flush; 593 mddev->flush_bio = NULL; 594 spin_unlock_irq(&mddev->lock); 595 wake_up(&mddev->sb_wait); 596 597 if (bio->bi_iter.bi_size == 0) { 598 /* an empty barrier - all done */ 599 bio_endio(bio); 600 } else { 601 bio->bi_opf &= ~REQ_PREFLUSH; 602 md_handle_request(mddev, bio); 603 } 604 } 605 606 /* 607 * Manages consolidation of flushes and submitting any flushes needed for 608 * a bio with REQ_PREFLUSH. Returns true if the bio is finished or is 609 * being finished in another context. Returns false if the flushing is 610 * complete but still needs the I/O portion of the bio to be processed. 611 */ 612 bool md_flush_request(struct mddev *mddev, struct bio *bio) 613 { 614 ktime_t req_start = ktime_get_boottime(); 615 spin_lock_irq(&mddev->lock); 616 /* flush requests wait until ongoing flush completes, 617 * hence coalescing all the pending requests. 618 */ 619 wait_event_lock_irq(mddev->sb_wait, 620 !mddev->flush_bio || 621 ktime_before(req_start, mddev->prev_flush_start), 622 mddev->lock); 623 /* new request after previous flush is completed */ 624 if (ktime_after(req_start, mddev->prev_flush_start)) { 625 WARN_ON(mddev->flush_bio); 626 mddev->flush_bio = bio; 627 bio = NULL; 628 } 629 spin_unlock_irq(&mddev->lock); 630 631 if (!bio) { 632 INIT_WORK(&mddev->flush_work, submit_flushes); 633 queue_work(md_wq, &mddev->flush_work); 634 } else { 635 /* flush was performed for some other bio while we waited. */ 636 if (bio->bi_iter.bi_size == 0) 637 /* an empty barrier - all done */ 638 bio_endio(bio); 639 else { 640 bio->bi_opf &= ~REQ_PREFLUSH; 641 return false; 642 } 643 } 644 return true; 645 } 646 EXPORT_SYMBOL(md_flush_request); 647 648 static inline struct mddev *mddev_get(struct mddev *mddev) 649 { 650 lockdep_assert_held(&all_mddevs_lock); 651 652 if (test_bit(MD_DELETED, &mddev->flags)) 653 return NULL; 654 atomic_inc(&mddev->active); 655 return mddev; 656 } 657 658 static void mddev_delayed_delete(struct work_struct *ws); 659 660 static void __mddev_put(struct mddev *mddev) 661 { 662 if (mddev->raid_disks || !list_empty(&mddev->disks) || 663 mddev->ctime || mddev->hold_active) 664 return; 665 666 /* Array is not configured at all, and not held active, so destroy it */ 667 set_bit(MD_DELETED, &mddev->flags); 668 669 /* 670 * Call queue_work inside the spinlock so that flush_workqueue() after 671 * mddev_find will succeed in waiting for the work to be done. 
672 */ 673 queue_work(md_misc_wq, &mddev->del_work); 674 } 675 676 void mddev_put(struct mddev *mddev) 677 { 678 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 679 return; 680 681 __mddev_put(mddev); 682 spin_unlock(&all_mddevs_lock); 683 } 684 685 static void md_safemode_timeout(struct timer_list *t); 686 static void md_start_sync(struct work_struct *ws); 687 688 static void active_io_release(struct percpu_ref *ref) 689 { 690 struct mddev *mddev = container_of(ref, struct mddev, active_io); 691 692 wake_up(&mddev->sb_wait); 693 } 694 695 static void no_op(struct percpu_ref *r) {} 696 697 int mddev_init(struct mddev *mddev) 698 { 699 700 if (percpu_ref_init(&mddev->active_io, active_io_release, 701 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) 702 return -ENOMEM; 703 704 if (percpu_ref_init(&mddev->writes_pending, no_op, 705 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { 706 percpu_ref_exit(&mddev->active_io); 707 return -ENOMEM; 708 } 709 710 /* We want to start with the refcount at zero */ 711 percpu_ref_put(&mddev->writes_pending); 712 713 mutex_init(&mddev->open_mutex); 714 mutex_init(&mddev->reconfig_mutex); 715 mutex_init(&mddev->sync_mutex); 716 mutex_init(&mddev->suspend_mutex); 717 mutex_init(&mddev->bitmap_info.mutex); 718 INIT_LIST_HEAD(&mddev->disks); 719 INIT_LIST_HEAD(&mddev->all_mddevs); 720 INIT_LIST_HEAD(&mddev->deleting); 721 timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); 722 atomic_set(&mddev->active, 1); 723 atomic_set(&mddev->openers, 0); 724 atomic_set(&mddev->sync_seq, 0); 725 spin_lock_init(&mddev->lock); 726 atomic_set(&mddev->flush_pending, 0); 727 init_waitqueue_head(&mddev->sb_wait); 728 init_waitqueue_head(&mddev->recovery_wait); 729 mddev->reshape_position = MaxSector; 730 mddev->reshape_backwards = 0; 731 mddev->last_sync_action = "none"; 732 mddev->resync_min = 0; 733 mddev->resync_max = MaxSector; 734 mddev->level = LEVEL_NONE; 735 736 INIT_WORK(&mddev->sync_work, md_start_sync); 737 INIT_WORK(&mddev->del_work, mddev_delayed_delete); 738 739 return 0; 740 } 741 EXPORT_SYMBOL_GPL(mddev_init); 742 743 void mddev_destroy(struct mddev *mddev) 744 { 745 percpu_ref_exit(&mddev->active_io); 746 percpu_ref_exit(&mddev->writes_pending); 747 } 748 EXPORT_SYMBOL_GPL(mddev_destroy); 749 750 static struct mddev *mddev_find_locked(dev_t unit) 751 { 752 struct mddev *mddev; 753 754 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 755 if (mddev->unit == unit) 756 return mddev; 757 758 return NULL; 759 } 760 761 /* find an unused unit number */ 762 static dev_t mddev_alloc_unit(void) 763 { 764 static int next_minor = 512; 765 int start = next_minor; 766 bool is_free = 0; 767 dev_t dev = 0; 768 769 while (!is_free) { 770 dev = MKDEV(MD_MAJOR, next_minor); 771 next_minor++; 772 if (next_minor > MINORMASK) 773 next_minor = 0; 774 if (next_minor == start) 775 return 0; /* Oh dear, all in use. 
 */
		is_free = !mddev_find_locked(dev);
	}

	return dev;
}

static struct mddev *mddev_alloc(dev_t unit)
{
	struct mddev *new;
	int error;

	if (unit && MAJOR(unit) != MD_MAJOR)
		unit &= ~((1 << MdpMinorShift) - 1);

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return ERR_PTR(-ENOMEM);

	error = mddev_init(new);
	if (error)
		goto out_free_new;

	spin_lock(&all_mddevs_lock);
	if (unit) {
		error = -EEXIST;
		if (mddev_find_locked(unit))
			goto out_destroy_new;
		new->unit = unit;
		if (MAJOR(unit) == MD_MAJOR)
			new->md_minor = MINOR(unit);
		else
			new->md_minor = MINOR(unit) >> MdpMinorShift;
		new->hold_active = UNTIL_IOCTL;
	} else {
		error = -ENODEV;
		new->unit = mddev_alloc_unit();
		if (!new->unit)
			goto out_destroy_new;
		new->md_minor = MINOR(new->unit);
		new->hold_active = UNTIL_STOP;
	}

	list_add(&new->all_mddevs, &all_mddevs);
	spin_unlock(&all_mddevs_lock);
	return new;

out_destroy_new:
	spin_unlock(&all_mddevs_lock);
	mddev_destroy(new);
out_free_new:
	kfree(new);
	return ERR_PTR(error);
}

static void mddev_free(struct mddev *mddev)
{
	spin_lock(&all_mddevs_lock);
	list_del(&mddev->all_mddevs);
	spin_unlock(&all_mddevs_lock);

	mddev_destroy(mddev);
	kfree(mddev);
}

static const struct attribute_group md_redundancy_group;

void mddev_unlock(struct mddev *mddev)
{
	struct md_rdev *rdev;
	struct md_rdev *tmp;
	LIST_HEAD(delete);

	if (!list_empty(&mddev->deleting))
		list_splice_init(&mddev->deleting, &delete);

	if (mddev->to_remove) {
		/* These cannot be removed under reconfig_mutex as
		 * an access to the files will try to take reconfig_mutex
		 * while holding the file unremovable, which leads to
		 * a deadlock.
		 * So set sysfs_active while the removal is happening,
		 * and anything else which might set ->to_remove or may
		 * otherwise change the sysfs namespace will fail with
		 * -EBUSY if sysfs_active is still set.
		 * We set sysfs_active under reconfig_mutex and elsewhere
		 * test it under the same mutex to ensure its correct value
		 * is seen.
863 */ 864 const struct attribute_group *to_remove = mddev->to_remove; 865 mddev->to_remove = NULL; 866 mddev->sysfs_active = 1; 867 mutex_unlock(&mddev->reconfig_mutex); 868 869 if (mddev->kobj.sd) { 870 if (to_remove != &md_redundancy_group) 871 sysfs_remove_group(&mddev->kobj, to_remove); 872 if (mddev->pers == NULL || 873 mddev->pers->sync_request == NULL) { 874 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 875 if (mddev->sysfs_action) 876 sysfs_put(mddev->sysfs_action); 877 if (mddev->sysfs_completed) 878 sysfs_put(mddev->sysfs_completed); 879 if (mddev->sysfs_degraded) 880 sysfs_put(mddev->sysfs_degraded); 881 mddev->sysfs_action = NULL; 882 mddev->sysfs_completed = NULL; 883 mddev->sysfs_degraded = NULL; 884 } 885 } 886 mddev->sysfs_active = 0; 887 } else 888 mutex_unlock(&mddev->reconfig_mutex); 889 890 md_wakeup_thread(mddev->thread); 891 wake_up(&mddev->sb_wait); 892 893 list_for_each_entry_safe(rdev, tmp, &delete, same_set) { 894 list_del_init(&rdev->same_set); 895 kobject_del(&rdev->kobj); 896 export_rdev(rdev, mddev); 897 } 898 } 899 EXPORT_SYMBOL_GPL(mddev_unlock); 900 901 struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr) 902 { 903 struct md_rdev *rdev; 904 905 rdev_for_each_rcu(rdev, mddev) 906 if (rdev->desc_nr == nr) 907 return rdev; 908 909 return NULL; 910 } 911 EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu); 912 913 static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) 914 { 915 struct md_rdev *rdev; 916 917 rdev_for_each(rdev, mddev) 918 if (rdev->bdev->bd_dev == dev) 919 return rdev; 920 921 return NULL; 922 } 923 924 struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev) 925 { 926 struct md_rdev *rdev; 927 928 rdev_for_each_rcu(rdev, mddev) 929 if (rdev->bdev->bd_dev == dev) 930 return rdev; 931 932 return NULL; 933 } 934 EXPORT_SYMBOL_GPL(md_find_rdev_rcu); 935 936 static struct md_personality *find_pers(int level, char *clevel) 937 { 938 struct md_personality *pers; 939 list_for_each_entry(pers, &pers_list, list) { 940 if (level != LEVEL_NONE && pers->level == level) 941 return pers; 942 if (strcmp(pers->name, clevel)==0) 943 return pers; 944 } 945 return NULL; 946 } 947 948 /* return the offset of the super block in 512byte sectors */ 949 static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) 950 { 951 return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev)); 952 } 953 954 static int alloc_disk_sb(struct md_rdev *rdev) 955 { 956 rdev->sb_page = alloc_page(GFP_KERNEL); 957 if (!rdev->sb_page) 958 return -ENOMEM; 959 return 0; 960 } 961 962 void md_rdev_clear(struct md_rdev *rdev) 963 { 964 if (rdev->sb_page) { 965 put_page(rdev->sb_page); 966 rdev->sb_loaded = 0; 967 rdev->sb_page = NULL; 968 rdev->sb_start = 0; 969 rdev->sectors = 0; 970 } 971 if (rdev->bb_page) { 972 put_page(rdev->bb_page); 973 rdev->bb_page = NULL; 974 } 975 badblocks_exit(&rdev->badblocks); 976 } 977 EXPORT_SYMBOL_GPL(md_rdev_clear); 978 979 static void super_written(struct bio *bio) 980 { 981 struct md_rdev *rdev = bio->bi_private; 982 struct mddev *mddev = rdev->mddev; 983 984 if (bio->bi_status) { 985 pr_err("md: %s gets error=%d\n", __func__, 986 blk_status_to_errno(bio->bi_status)); 987 md_error(mddev, rdev); 988 if (!test_bit(Faulty, &rdev->flags) 989 && (bio->bi_opf & MD_FAILFAST)) { 990 set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags); 991 set_bit(LastDev, &rdev->flags); 992 } 993 } else 994 clear_bit(LastDev, &rdev->flags); 995 996 bio_put(bio); 997 998 rdev_dec_pending(rdev, mddev); 999 1000 if (atomic_dec_and_test(&mddev->pending_writes)) 
1001 wake_up(&mddev->sb_wait); 1002 } 1003 1004 void md_super_write(struct mddev *mddev, struct md_rdev *rdev, 1005 sector_t sector, int size, struct page *page) 1006 { 1007 /* write first size bytes of page to sector of rdev 1008 * Increment mddev->pending_writes before returning 1009 * and decrement it on completion, waking up sb_wait 1010 * if zero is reached. 1011 * If an error occurred, call md_error 1012 */ 1013 struct bio *bio; 1014 1015 if (!page) 1016 return; 1017 1018 if (test_bit(Faulty, &rdev->flags)) 1019 return; 1020 1021 bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev, 1022 1, 1023 REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA, 1024 GFP_NOIO, &mddev->sync_set); 1025 1026 atomic_inc(&rdev->nr_pending); 1027 1028 bio->bi_iter.bi_sector = sector; 1029 __bio_add_page(bio, page, size, 0); 1030 bio->bi_private = rdev; 1031 bio->bi_end_io = super_written; 1032 1033 if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) && 1034 test_bit(FailFast, &rdev->flags) && 1035 !test_bit(LastDev, &rdev->flags)) 1036 bio->bi_opf |= MD_FAILFAST; 1037 1038 atomic_inc(&mddev->pending_writes); 1039 submit_bio(bio); 1040 } 1041 1042 int md_super_wait(struct mddev *mddev) 1043 { 1044 /* wait for all superblock writes that were scheduled to complete */ 1045 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); 1046 if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags)) 1047 return -EAGAIN; 1048 return 0; 1049 } 1050 1051 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, 1052 struct page *page, blk_opf_t opf, bool metadata_op) 1053 { 1054 struct bio bio; 1055 struct bio_vec bvec; 1056 1057 if (metadata_op && rdev->meta_bdev) 1058 bio_init(&bio, rdev->meta_bdev, &bvec, 1, opf); 1059 else 1060 bio_init(&bio, rdev->bdev, &bvec, 1, opf); 1061 1062 if (metadata_op) 1063 bio.bi_iter.bi_sector = sector + rdev->sb_start; 1064 else if (rdev->mddev->reshape_position != MaxSector && 1065 (rdev->mddev->reshape_backwards == 1066 (sector >= rdev->mddev->reshape_position))) 1067 bio.bi_iter.bi_sector = sector + rdev->new_data_offset; 1068 else 1069 bio.bi_iter.bi_sector = sector + rdev->data_offset; 1070 __bio_add_page(&bio, page, size, 0); 1071 1072 submit_bio_wait(&bio); 1073 1074 return !bio.bi_status; 1075 } 1076 EXPORT_SYMBOL_GPL(sync_page_io); 1077 1078 static int read_disk_sb(struct md_rdev *rdev, int size) 1079 { 1080 if (rdev->sb_loaded) 1081 return 0; 1082 1083 if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true)) 1084 goto fail; 1085 rdev->sb_loaded = 1; 1086 return 0; 1087 1088 fail: 1089 pr_err("md: disabled device %pg, could not read superblock.\n", 1090 rdev->bdev); 1091 return -EINVAL; 1092 } 1093 1094 static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 1095 { 1096 return sb1->set_uuid0 == sb2->set_uuid0 && 1097 sb1->set_uuid1 == sb2->set_uuid1 && 1098 sb1->set_uuid2 == sb2->set_uuid2 && 1099 sb1->set_uuid3 == sb2->set_uuid3; 1100 } 1101 1102 static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 1103 { 1104 int ret; 1105 mdp_super_t *tmp1, *tmp2; 1106 1107 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 1108 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 1109 1110 if (!tmp1 || !tmp2) { 1111 ret = 0; 1112 goto abort; 1113 } 1114 1115 *tmp1 = *sb1; 1116 *tmp2 = *sb2; 1117 1118 /* 1119 * nr_disks is not constant 1120 */ 1121 tmp1->nr_disks = 0; 1122 tmp2->nr_disks = 0; 1123 1124 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); 1125 abort: 1126 kfree(tmp1); 1127 kfree(tmp2); 1128 return ret; 1129 } 1130 1131 
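/*
 * Illustration: md_csum_fold() below reduces a 32-bit sum to 16 bits with
 * end-around carry, e.g.
 *	md_csum_fold(0x0001ffff) -> 0xffff + 0x0001 = 0x10000
 *	                         -> 0x0000 + 0x0001 = 0x0001
 * Both sides of the v0.90 checksum comparison in super_90_load() are passed
 * through this fold, so architectures whose csum_partial() historically
 * produced differently-folded results still compare equal.
 */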
static u32 md_csum_fold(u32 csum) 1132 { 1133 csum = (csum & 0xffff) + (csum >> 16); 1134 return (csum & 0xffff) + (csum >> 16); 1135 } 1136 1137 static unsigned int calc_sb_csum(mdp_super_t *sb) 1138 { 1139 u64 newcsum = 0; 1140 u32 *sb32 = (u32*)sb; 1141 int i; 1142 unsigned int disk_csum, csum; 1143 1144 disk_csum = sb->sb_csum; 1145 sb->sb_csum = 0; 1146 1147 for (i = 0; i < MD_SB_BYTES/4 ; i++) 1148 newcsum += sb32[i]; 1149 csum = (newcsum & 0xffffffff) + (newcsum>>32); 1150 1151 #ifdef CONFIG_ALPHA 1152 /* This used to use csum_partial, which was wrong for several 1153 * reasons including that different results are returned on 1154 * different architectures. It isn't critical that we get exactly 1155 * the same return value as before (we always csum_fold before 1156 * testing, and that removes any differences). However as we 1157 * know that csum_partial always returned a 16bit value on 1158 * alphas, do a fold to maximise conformity to previous behaviour. 1159 */ 1160 sb->sb_csum = md_csum_fold(disk_csum); 1161 #else 1162 sb->sb_csum = disk_csum; 1163 #endif 1164 return csum; 1165 } 1166 1167 /* 1168 * Handle superblock details. 1169 * We want to be able to handle multiple superblock formats 1170 * so we have a common interface to them all, and an array of 1171 * different handlers. 1172 * We rely on user-space to write the initial superblock, and support 1173 * reading and updating of superblocks. 1174 * Interface methods are: 1175 * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) 1176 * loads and validates a superblock on dev. 1177 * if refdev != NULL, compare superblocks on both devices 1178 * Return: 1179 * 0 - dev has a superblock that is compatible with refdev 1180 * 1 - dev has a superblock that is compatible and newer than refdev 1181 * so dev should be used as the refdev in future 1182 * -EINVAL superblock incompatible or invalid 1183 * -othererror e.g. -EIO 1184 * 1185 * int validate_super(struct mddev *mddev, struct md_rdev *dev) 1186 * Verify that dev is acceptable into mddev. 1187 * The first time, mddev->raid_disks will be 0, and data from 1188 * dev should be merged in. Subsequent calls check that dev 1189 * is new enough. Return 0 or -EINVAL 1190 * 1191 * void sync_super(struct mddev *mddev, struct md_rdev *dev) 1192 * Update the superblock for rdev with data in mddev 1193 * This does not write to disc. 1194 * 1195 */ 1196 1197 struct super_type { 1198 char *name; 1199 struct module *owner; 1200 int (*load_super)(struct md_rdev *rdev, 1201 struct md_rdev *refdev, 1202 int minor_version); 1203 int (*validate_super)(struct mddev *mddev, 1204 struct md_rdev *rdev); 1205 void (*sync_super)(struct mddev *mddev, 1206 struct md_rdev *rdev); 1207 unsigned long long (*rdev_size_change)(struct md_rdev *rdev, 1208 sector_t num_sectors); 1209 int (*allow_new_offset)(struct md_rdev *rdev, 1210 unsigned long long new_offset); 1211 }; 1212 1213 /* 1214 * Check that the given mddev has no bitmap. 1215 * 1216 * This function is called from the run method of all personalities that do not 1217 * support bitmaps. It prints an error message and returns non-zero if mddev 1218 * has a bitmap. Otherwise, it returns 0. 
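 *
 * A bitmap-less personality typically calls it first thing in its ->run()
 * method, along these lines (sketch in the style of raid0):
 *
 *	static int raid0_run(struct mddev *mddev)
 *	{
 *		if (md_check_no_bitmap(mddev))
 *			return -EINVAL;
 *		...
 *	}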
1219 * 1220 */ 1221 int md_check_no_bitmap(struct mddev *mddev) 1222 { 1223 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) 1224 return 0; 1225 pr_warn("%s: bitmaps are not supported for %s\n", 1226 mdname(mddev), mddev->pers->name); 1227 return 1; 1228 } 1229 EXPORT_SYMBOL(md_check_no_bitmap); 1230 1231 /* 1232 * load_super for 0.90.0 1233 */ 1234 static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 1235 { 1236 mdp_super_t *sb; 1237 int ret; 1238 bool spare_disk = true; 1239 1240 /* 1241 * Calculate the position of the superblock (512byte sectors), 1242 * it's at the end of the disk. 1243 * 1244 * It also happens to be a multiple of 4Kb. 1245 */ 1246 rdev->sb_start = calc_dev_sboffset(rdev); 1247 1248 ret = read_disk_sb(rdev, MD_SB_BYTES); 1249 if (ret) 1250 return ret; 1251 1252 ret = -EINVAL; 1253 1254 sb = page_address(rdev->sb_page); 1255 1256 if (sb->md_magic != MD_SB_MAGIC) { 1257 pr_warn("md: invalid raid superblock magic on %pg\n", 1258 rdev->bdev); 1259 goto abort; 1260 } 1261 1262 if (sb->major_version != 0 || 1263 sb->minor_version < 90 || 1264 sb->minor_version > 91) { 1265 pr_warn("Bad version number %d.%d on %pg\n", 1266 sb->major_version, sb->minor_version, rdev->bdev); 1267 goto abort; 1268 } 1269 1270 if (sb->raid_disks <= 0) 1271 goto abort; 1272 1273 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { 1274 pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev); 1275 goto abort; 1276 } 1277 1278 rdev->preferred_minor = sb->md_minor; 1279 rdev->data_offset = 0; 1280 rdev->new_data_offset = 0; 1281 rdev->sb_size = MD_SB_BYTES; 1282 rdev->badblocks.shift = -1; 1283 1284 if (sb->level == LEVEL_MULTIPATH) 1285 rdev->desc_nr = -1; 1286 else 1287 rdev->desc_nr = sb->this_disk.number; 1288 1289 /* not spare disk, or LEVEL_MULTIPATH */ 1290 if (sb->level == LEVEL_MULTIPATH || 1291 (rdev->desc_nr >= 0 && 1292 rdev->desc_nr < MD_SB_DISKS && 1293 sb->disks[rdev->desc_nr].state & 1294 ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))) 1295 spare_disk = false; 1296 1297 if (!refdev) { 1298 if (!spare_disk) 1299 ret = 1; 1300 else 1301 ret = 0; 1302 } else { 1303 __u64 ev1, ev2; 1304 mdp_super_t *refsb = page_address(refdev->sb_page); 1305 if (!md_uuid_equal(refsb, sb)) { 1306 pr_warn("md: %pg has different UUID to %pg\n", 1307 rdev->bdev, refdev->bdev); 1308 goto abort; 1309 } 1310 if (!md_sb_equal(refsb, sb)) { 1311 pr_warn("md: %pg has same UUID but different superblock to %pg\n", 1312 rdev->bdev, refdev->bdev); 1313 goto abort; 1314 } 1315 ev1 = md_event(sb); 1316 ev2 = md_event(refsb); 1317 1318 if (!spare_disk && ev1 > ev2) 1319 ret = 1; 1320 else 1321 ret = 0; 1322 } 1323 rdev->sectors = rdev->sb_start; 1324 /* Limit to 4TB as metadata cannot record more than that. 1325 * (not needed for Linear and RAID0 as metadata doesn't 1326 * record this size) 1327 */ 1328 if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1) 1329 rdev->sectors = (sector_t)(2ULL << 32) - 2; 1330 1331 if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) 1332 /* "this cannot possibly happen" ... 
*/ 1333 ret = -EINVAL; 1334 1335 abort: 1336 return ret; 1337 } 1338 1339 /* 1340 * validate_super for 0.90.0 1341 */ 1342 static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) 1343 { 1344 mdp_disk_t *desc; 1345 mdp_super_t *sb = page_address(rdev->sb_page); 1346 __u64 ev1 = md_event(sb); 1347 1348 rdev->raid_disk = -1; 1349 clear_bit(Faulty, &rdev->flags); 1350 clear_bit(In_sync, &rdev->flags); 1351 clear_bit(Bitmap_sync, &rdev->flags); 1352 clear_bit(WriteMostly, &rdev->flags); 1353 1354 if (mddev->raid_disks == 0) { 1355 mddev->major_version = 0; 1356 mddev->minor_version = sb->minor_version; 1357 mddev->patch_version = sb->patch_version; 1358 mddev->external = 0; 1359 mddev->chunk_sectors = sb->chunk_size >> 9; 1360 mddev->ctime = sb->ctime; 1361 mddev->utime = sb->utime; 1362 mddev->level = sb->level; 1363 mddev->clevel[0] = 0; 1364 mddev->layout = sb->layout; 1365 mddev->raid_disks = sb->raid_disks; 1366 mddev->dev_sectors = ((sector_t)sb->size) * 2; 1367 mddev->events = ev1; 1368 mddev->bitmap_info.offset = 0; 1369 mddev->bitmap_info.space = 0; 1370 /* bitmap can use 60 K after the 4K superblocks */ 1371 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 1372 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 1373 mddev->reshape_backwards = 0; 1374 1375 if (mddev->minor_version >= 91) { 1376 mddev->reshape_position = sb->reshape_position; 1377 mddev->delta_disks = sb->delta_disks; 1378 mddev->new_level = sb->new_level; 1379 mddev->new_layout = sb->new_layout; 1380 mddev->new_chunk_sectors = sb->new_chunk >> 9; 1381 if (mddev->delta_disks < 0) 1382 mddev->reshape_backwards = 1; 1383 } else { 1384 mddev->reshape_position = MaxSector; 1385 mddev->delta_disks = 0; 1386 mddev->new_level = mddev->level; 1387 mddev->new_layout = mddev->layout; 1388 mddev->new_chunk_sectors = mddev->chunk_sectors; 1389 } 1390 if (mddev->level == 0) 1391 mddev->layout = -1; 1392 1393 if (sb->state & (1<<MD_SB_CLEAN)) 1394 mddev->recovery_cp = MaxSector; 1395 else { 1396 if (sb->events_hi == sb->cp_events_hi && 1397 sb->events_lo == sb->cp_events_lo) { 1398 mddev->recovery_cp = sb->recovery_cp; 1399 } else 1400 mddev->recovery_cp = 0; 1401 } 1402 1403 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 1404 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 1405 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 1406 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 1407 1408 mddev->max_disks = MD_SB_DISKS; 1409 1410 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 1411 mddev->bitmap_info.file == NULL) { 1412 mddev->bitmap_info.offset = 1413 mddev->bitmap_info.default_offset; 1414 mddev->bitmap_info.space = 1415 mddev->bitmap_info.default_space; 1416 } 1417 1418 } else if (mddev->pers == NULL) { 1419 /* Insist on good event counter while assembling, except 1420 * for spares (which don't need an event count) */ 1421 ++ev1; 1422 if (sb->disks[rdev->desc_nr].state & ( 1423 (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) 1424 if (ev1 < mddev->events) 1425 return -EINVAL; 1426 } else if (mddev->bitmap) { 1427 /* if adding to array with a bitmap, then we can accept an 1428 * older device ... but not too old. 
1429 */ 1430 if (ev1 < mddev->bitmap->events_cleared) 1431 return 0; 1432 if (ev1 < mddev->events) 1433 set_bit(Bitmap_sync, &rdev->flags); 1434 } else { 1435 if (ev1 < mddev->events) 1436 /* just a hot-add of a new device, leave raid_disk at -1 */ 1437 return 0; 1438 } 1439 1440 if (mddev->level != LEVEL_MULTIPATH) { 1441 desc = sb->disks + rdev->desc_nr; 1442 1443 if (desc->state & (1<<MD_DISK_FAULTY)) 1444 set_bit(Faulty, &rdev->flags); 1445 else if (desc->state & (1<<MD_DISK_SYNC) /* && 1446 desc->raid_disk < mddev->raid_disks */) { 1447 set_bit(In_sync, &rdev->flags); 1448 rdev->raid_disk = desc->raid_disk; 1449 rdev->saved_raid_disk = desc->raid_disk; 1450 } else if (desc->state & (1<<MD_DISK_ACTIVE)) { 1451 /* active but not in sync implies recovery up to 1452 * reshape position. We don't know exactly where 1453 * that is, so set to zero for now */ 1454 if (mddev->minor_version >= 91) { 1455 rdev->recovery_offset = 0; 1456 rdev->raid_disk = desc->raid_disk; 1457 } 1458 } 1459 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 1460 set_bit(WriteMostly, &rdev->flags); 1461 if (desc->state & (1<<MD_DISK_FAILFAST)) 1462 set_bit(FailFast, &rdev->flags); 1463 } else /* MULTIPATH are always insync */ 1464 set_bit(In_sync, &rdev->flags); 1465 return 0; 1466 } 1467 1468 /* 1469 * sync_super for 0.90.0 1470 */ 1471 static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) 1472 { 1473 mdp_super_t *sb; 1474 struct md_rdev *rdev2; 1475 int next_spare = mddev->raid_disks; 1476 1477 /* make rdev->sb match mddev data.. 1478 * 1479 * 1/ zero out disks 1480 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 1481 * 3/ any empty disks < next_spare become removed 1482 * 1483 * disks[0] gets initialised to REMOVED because 1484 * we cannot be sure from other fields if it has 1485 * been initialised or not. 
1486 */ 1487 int i; 1488 int active=0, working=0,failed=0,spare=0,nr_disks=0; 1489 1490 rdev->sb_size = MD_SB_BYTES; 1491 1492 sb = page_address(rdev->sb_page); 1493 1494 memset(sb, 0, sizeof(*sb)); 1495 1496 sb->md_magic = MD_SB_MAGIC; 1497 sb->major_version = mddev->major_version; 1498 sb->patch_version = mddev->patch_version; 1499 sb->gvalid_words = 0; /* ignored */ 1500 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 1501 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 1502 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 1503 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 1504 1505 sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); 1506 sb->level = mddev->level; 1507 sb->size = mddev->dev_sectors / 2; 1508 sb->raid_disks = mddev->raid_disks; 1509 sb->md_minor = mddev->md_minor; 1510 sb->not_persistent = 0; 1511 sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 1512 sb->state = 0; 1513 sb->events_hi = (mddev->events>>32); 1514 sb->events_lo = (u32)mddev->events; 1515 1516 if (mddev->reshape_position == MaxSector) 1517 sb->minor_version = 90; 1518 else { 1519 sb->minor_version = 91; 1520 sb->reshape_position = mddev->reshape_position; 1521 sb->new_level = mddev->new_level; 1522 sb->delta_disks = mddev->delta_disks; 1523 sb->new_layout = mddev->new_layout; 1524 sb->new_chunk = mddev->new_chunk_sectors << 9; 1525 } 1526 mddev->minor_version = sb->minor_version; 1527 if (mddev->in_sync) 1528 { 1529 sb->recovery_cp = mddev->recovery_cp; 1530 sb->cp_events_hi = (mddev->events>>32); 1531 sb->cp_events_lo = (u32)mddev->events; 1532 if (mddev->recovery_cp == MaxSector) 1533 sb->state = (1<< MD_SB_CLEAN); 1534 } else 1535 sb->recovery_cp = 0; 1536 1537 sb->layout = mddev->layout; 1538 sb->chunk_size = mddev->chunk_sectors << 9; 1539 1540 if (mddev->bitmap && mddev->bitmap_info.file == NULL) 1541 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 1542 1543 sb->disks[0].state = (1<<MD_DISK_REMOVED); 1544 rdev_for_each(rdev2, mddev) { 1545 mdp_disk_t *d; 1546 int desc_nr; 1547 int is_active = test_bit(In_sync, &rdev2->flags); 1548 1549 if (rdev2->raid_disk >= 0 && 1550 sb->minor_version >= 91) 1551 /* we have nowhere to store the recovery_offset, 1552 * but if it is not below the reshape_position, 1553 * we can piggy-back on that. 
1554 */ 1555 is_active = 1; 1556 if (rdev2->raid_disk < 0 || 1557 test_bit(Faulty, &rdev2->flags)) 1558 is_active = 0; 1559 if (is_active) 1560 desc_nr = rdev2->raid_disk; 1561 else 1562 desc_nr = next_spare++; 1563 rdev2->desc_nr = desc_nr; 1564 d = &sb->disks[rdev2->desc_nr]; 1565 nr_disks++; 1566 d->number = rdev2->desc_nr; 1567 d->major = MAJOR(rdev2->bdev->bd_dev); 1568 d->minor = MINOR(rdev2->bdev->bd_dev); 1569 if (is_active) 1570 d->raid_disk = rdev2->raid_disk; 1571 else 1572 d->raid_disk = rdev2->desc_nr; /* compatibility */ 1573 if (test_bit(Faulty, &rdev2->flags)) 1574 d->state = (1<<MD_DISK_FAULTY); 1575 else if (is_active) { 1576 d->state = (1<<MD_DISK_ACTIVE); 1577 if (test_bit(In_sync, &rdev2->flags)) 1578 d->state |= (1<<MD_DISK_SYNC); 1579 active++; 1580 working++; 1581 } else { 1582 d->state = 0; 1583 spare++; 1584 working++; 1585 } 1586 if (test_bit(WriteMostly, &rdev2->flags)) 1587 d->state |= (1<<MD_DISK_WRITEMOSTLY); 1588 if (test_bit(FailFast, &rdev2->flags)) 1589 d->state |= (1<<MD_DISK_FAILFAST); 1590 } 1591 /* now set the "removed" and "faulty" bits on any missing devices */ 1592 for (i=0 ; i < mddev->raid_disks ; i++) { 1593 mdp_disk_t *d = &sb->disks[i]; 1594 if (d->state == 0 && d->number == 0) { 1595 d->number = i; 1596 d->raid_disk = i; 1597 d->state = (1<<MD_DISK_REMOVED); 1598 d->state |= (1<<MD_DISK_FAULTY); 1599 failed++; 1600 } 1601 } 1602 sb->nr_disks = nr_disks; 1603 sb->active_disks = active; 1604 sb->working_disks = working; 1605 sb->failed_disks = failed; 1606 sb->spare_disks = spare; 1607 1608 sb->this_disk = sb->disks[rdev->desc_nr]; 1609 sb->sb_csum = calc_sb_csum(sb); 1610 } 1611 1612 /* 1613 * rdev_size_change for 0.90.0 1614 */ 1615 static unsigned long long 1616 super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 1617 { 1618 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1619 return 0; /* component must fit device */ 1620 if (rdev->mddev->bitmap_info.offset) 1621 return 0; /* can't move bitmap */ 1622 rdev->sb_start = calc_dev_sboffset(rdev); 1623 if (!num_sectors || num_sectors > rdev->sb_start) 1624 num_sectors = rdev->sb_start; 1625 /* Limit to 4TB as metadata cannot record more than that. 1626 * 4TB == 2^32 KB, or 2*2^32 sectors. 
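 * (2ULL << 32) sectors is 2^33 x 512 B = 4 TiB; clamping to two sectors
 * less keeps the size expressible as a 32-bit KB count (2^32 - 1 KB),
 * which is all the v0.90 superblock's 'size' field can hold.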
1627 */ 1628 if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) 1629 num_sectors = (sector_t)(2ULL << 32) - 2; 1630 do { 1631 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1632 rdev->sb_page); 1633 } while (md_super_wait(rdev->mddev) < 0); 1634 return num_sectors; 1635 } 1636 1637 static int 1638 super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) 1639 { 1640 /* non-zero offset changes not possible with v0.90 */ 1641 return new_offset == 0; 1642 } 1643 1644 /* 1645 * version 1 superblock 1646 */ 1647 1648 static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb) 1649 { 1650 __le32 disk_csum; 1651 u32 csum; 1652 unsigned long long newcsum; 1653 int size = 256 + le32_to_cpu(sb->max_dev)*2; 1654 __le32 *isuper = (__le32*)sb; 1655 1656 disk_csum = sb->sb_csum; 1657 sb->sb_csum = 0; 1658 newcsum = 0; 1659 for (; size >= 4; size -= 4) 1660 newcsum += le32_to_cpu(*isuper++); 1661 1662 if (size == 2) 1663 newcsum += le16_to_cpu(*(__le16*) isuper); 1664 1665 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 1666 sb->sb_csum = disk_csum; 1667 return cpu_to_le32(csum); 1668 } 1669 1670 static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 1671 { 1672 struct mdp_superblock_1 *sb; 1673 int ret; 1674 sector_t sb_start; 1675 sector_t sectors; 1676 int bmask; 1677 bool spare_disk = true; 1678 1679 /* 1680 * Calculate the position of the superblock in 512byte sectors. 1681 * It is always aligned to a 4K boundary and 1682 * depeding on minor_version, it can be: 1683 * 0: At least 8K, but less than 12K, from end of device 1684 * 1: At start of device 1685 * 2: 4K from start of device. 1686 */ 1687 switch(minor_version) { 1688 case 0: 1689 sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2; 1690 sb_start &= ~(sector_t)(4*2-1); 1691 break; 1692 case 1: 1693 sb_start = 0; 1694 break; 1695 case 2: 1696 sb_start = 8; 1697 break; 1698 default: 1699 return -EINVAL; 1700 } 1701 rdev->sb_start = sb_start; 1702 1703 /* superblock is rarely larger than 1K, but it can be larger, 1704 * and it is safe to read 4k, so we do that 1705 */ 1706 ret = read_disk_sb(rdev, 4096); 1707 if (ret) return ret; 1708 1709 sb = page_address(rdev->sb_page); 1710 1711 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1712 sb->major_version != cpu_to_le32(1) || 1713 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1714 le64_to_cpu(sb->super_offset) != rdev->sb_start || 1715 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1716 return -EINVAL; 1717 1718 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1719 pr_warn("md: invalid superblock checksum on %pg\n", 1720 rdev->bdev); 1721 return -EINVAL; 1722 } 1723 if (le64_to_cpu(sb->data_size) < 10) { 1724 pr_warn("md: data_size too small on %pg\n", 1725 rdev->bdev); 1726 return -EINVAL; 1727 } 1728 if (sb->pad0 || 1729 sb->pad3[0] || 1730 memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) 1731 /* Some padding is non-zero, might be a new feature */ 1732 return -EINVAL; 1733 1734 rdev->preferred_minor = 0xffff; 1735 rdev->data_offset = le64_to_cpu(sb->data_offset); 1736 rdev->new_data_offset = rdev->data_offset; 1737 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && 1738 (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) 1739 rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset); 1740 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1741 1742 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1743 bmask = 
queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 1744 if (rdev->sb_size & bmask) 1745 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1746 1747 if (minor_version 1748 && rdev->data_offset < sb_start + (rdev->sb_size/512)) 1749 return -EINVAL; 1750 if (minor_version 1751 && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) 1752 return -EINVAL; 1753 1754 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) 1755 rdev->desc_nr = -1; 1756 else 1757 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1758 1759 if (!rdev->bb_page) { 1760 rdev->bb_page = alloc_page(GFP_KERNEL); 1761 if (!rdev->bb_page) 1762 return -ENOMEM; 1763 } 1764 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && 1765 rdev->badblocks.count == 0) { 1766 /* need to load the bad block list. 1767 * Currently we limit it to one page. 1768 */ 1769 s32 offset; 1770 sector_t bb_sector; 1771 __le64 *bbp; 1772 int i; 1773 int sectors = le16_to_cpu(sb->bblog_size); 1774 if (sectors > (PAGE_SIZE / 512)) 1775 return -EINVAL; 1776 offset = le32_to_cpu(sb->bblog_offset); 1777 if (offset == 0) 1778 return -EINVAL; 1779 bb_sector = (long long)offset; 1780 if (!sync_page_io(rdev, bb_sector, sectors << 9, 1781 rdev->bb_page, REQ_OP_READ, true)) 1782 return -EIO; 1783 bbp = (__le64 *)page_address(rdev->bb_page); 1784 rdev->badblocks.shift = sb->bblog_shift; 1785 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { 1786 u64 bb = le64_to_cpu(*bbp); 1787 int count = bb & (0x3ff); 1788 u64 sector = bb >> 10; 1789 sector <<= sb->bblog_shift; 1790 count <<= sb->bblog_shift; 1791 if (bb + 1 == 0) 1792 break; 1793 if (badblocks_set(&rdev->badblocks, sector, count, 1)) 1794 return -EINVAL; 1795 } 1796 } else if (sb->bblog_offset != 0) 1797 rdev->badblocks.shift = 0; 1798 1799 if ((le32_to_cpu(sb->feature_map) & 1800 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) { 1801 rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset); 1802 rdev->ppl.size = le16_to_cpu(sb->ppl.size); 1803 rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset; 1804 } 1805 1806 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) && 1807 sb->level != 0) 1808 return -EINVAL; 1809 1810 /* not spare disk, or LEVEL_MULTIPATH */ 1811 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH) || 1812 (rdev->desc_nr >= 0 && 1813 rdev->desc_nr < le32_to_cpu(sb->max_dev) && 1814 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || 1815 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))) 1816 spare_disk = false; 1817 1818 if (!refdev) { 1819 if (!spare_disk) 1820 ret = 1; 1821 else 1822 ret = 0; 1823 } else { 1824 __u64 ev1, ev2; 1825 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); 1826 1827 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1828 sb->level != refsb->level || 1829 sb->layout != refsb->layout || 1830 sb->chunksize != refsb->chunksize) { 1831 pr_warn("md: %pg has strangely different superblock to %pg\n", 1832 rdev->bdev, 1833 refdev->bdev); 1834 return -EINVAL; 1835 } 1836 ev1 = le64_to_cpu(sb->events); 1837 ev2 = le64_to_cpu(refsb->events); 1838 1839 if (!spare_disk && ev1 > ev2) 1840 ret = 1; 1841 else 1842 ret = 0; 1843 } 1844 if (minor_version) 1845 sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; 1846 else 1847 sectors = rdev->sb_start; 1848 if (sectors < le64_to_cpu(sb->data_size)) 1849 return -EINVAL; 1850 rdev->sectors = le64_to_cpu(sb->data_size); 1851 return ret; 1852 } 1853 1854 static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) 1855 { 1856 struct mdp_superblock_1 *sb = 
page_address(rdev->sb_page); 1857 __u64 ev1 = le64_to_cpu(sb->events); 1858 1859 rdev->raid_disk = -1; 1860 clear_bit(Faulty, &rdev->flags); 1861 clear_bit(In_sync, &rdev->flags); 1862 clear_bit(Bitmap_sync, &rdev->flags); 1863 clear_bit(WriteMostly, &rdev->flags); 1864 1865 if (mddev->raid_disks == 0) { 1866 mddev->major_version = 1; 1867 mddev->patch_version = 0; 1868 mddev->external = 0; 1869 mddev->chunk_sectors = le32_to_cpu(sb->chunksize); 1870 mddev->ctime = le64_to_cpu(sb->ctime); 1871 mddev->utime = le64_to_cpu(sb->utime); 1872 mddev->level = le32_to_cpu(sb->level); 1873 mddev->clevel[0] = 0; 1874 mddev->layout = le32_to_cpu(sb->layout); 1875 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1876 mddev->dev_sectors = le64_to_cpu(sb->size); 1877 mddev->events = ev1; 1878 mddev->bitmap_info.offset = 0; 1879 mddev->bitmap_info.space = 0; 1880 /* Default location for bitmap is 1K after superblock 1881 * using 3K - total of 4K 1882 */ 1883 mddev->bitmap_info.default_offset = 1024 >> 9; 1884 mddev->bitmap_info.default_space = (4096-1024) >> 9; 1885 mddev->reshape_backwards = 0; 1886 1887 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 1888 memcpy(mddev->uuid, sb->set_uuid, 16); 1889 1890 mddev->max_disks = (4096-256)/2; 1891 1892 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1893 mddev->bitmap_info.file == NULL) { 1894 mddev->bitmap_info.offset = 1895 (__s32)le32_to_cpu(sb->bitmap_offset); 1896 /* Metadata doesn't record how much space is available. 1897 * For 1.0, we assume we can use up to the superblock 1898 * if before, else to 4K beyond superblock. 1899 * For others, assume no change is possible. 1900 */ 1901 if (mddev->minor_version > 0) 1902 mddev->bitmap_info.space = 0; 1903 else if (mddev->bitmap_info.offset > 0) 1904 mddev->bitmap_info.space = 1905 8 - mddev->bitmap_info.offset; 1906 else 1907 mddev->bitmap_info.space = 1908 -mddev->bitmap_info.offset; 1909 } 1910 1911 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 1912 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 1913 mddev->delta_disks = le32_to_cpu(sb->delta_disks); 1914 mddev->new_level = le32_to_cpu(sb->new_level); 1915 mddev->new_layout = le32_to_cpu(sb->new_layout); 1916 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); 1917 if (mddev->delta_disks < 0 || 1918 (mddev->delta_disks == 0 && 1919 (le32_to_cpu(sb->feature_map) 1920 & MD_FEATURE_RESHAPE_BACKWARDS))) 1921 mddev->reshape_backwards = 1; 1922 } else { 1923 mddev->reshape_position = MaxSector; 1924 mddev->delta_disks = 0; 1925 mddev->new_level = mddev->level; 1926 mddev->new_layout = mddev->layout; 1927 mddev->new_chunk_sectors = mddev->chunk_sectors; 1928 } 1929 1930 if (mddev->level == 0 && 1931 !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT)) 1932 mddev->layout = -1; 1933 1934 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) 1935 set_bit(MD_HAS_JOURNAL, &mddev->flags); 1936 1937 if (le32_to_cpu(sb->feature_map) & 1938 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) { 1939 if (le32_to_cpu(sb->feature_map) & 1940 (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL)) 1941 return -EINVAL; 1942 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) && 1943 (le32_to_cpu(sb->feature_map) & 1944 MD_FEATURE_MULTIPLE_PPLS)) 1945 return -EINVAL; 1946 set_bit(MD_HAS_PPL, &mddev->flags); 1947 } 1948 } else if (mddev->pers == NULL) { 1949 /* Insist of good event counter while assembling, except for 1950 * spares (which don't need an event count) */ 1951 ++ev1; 1952 if (rdev->desc_nr >= 0 && 1953 rdev->desc_nr 
< le32_to_cpu(sb->max_dev) && 1954 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || 1955 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) 1956 if (ev1 < mddev->events) 1957 return -EINVAL; 1958 } else if (mddev->bitmap) { 1959 /* If adding to array with a bitmap, then we can accept an 1960 * older device, but not too old. 1961 */ 1962 if (ev1 < mddev->bitmap->events_cleared) 1963 return 0; 1964 if (ev1 < mddev->events) 1965 set_bit(Bitmap_sync, &rdev->flags); 1966 } else { 1967 if (ev1 < mddev->events) 1968 /* just a hot-add of a new device, leave raid_disk at -1 */ 1969 return 0; 1970 } 1971 if (mddev->level != LEVEL_MULTIPATH) { 1972 int role; 1973 if (rdev->desc_nr < 0 || 1974 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { 1975 role = MD_DISK_ROLE_SPARE; 1976 rdev->desc_nr = -1; 1977 } else 1978 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1979 switch(role) { 1980 case MD_DISK_ROLE_SPARE: /* spare */ 1981 break; 1982 case MD_DISK_ROLE_FAULTY: /* faulty */ 1983 set_bit(Faulty, &rdev->flags); 1984 break; 1985 case MD_DISK_ROLE_JOURNAL: /* journal device */ 1986 if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { 1987 /* journal device without journal feature */ 1988 pr_warn("md: journal device provided without journal feature, ignoring the device\n"); 1989 return -EINVAL; 1990 } 1991 set_bit(Journal, &rdev->flags); 1992 rdev->journal_tail = le64_to_cpu(sb->journal_tail); 1993 rdev->raid_disk = 0; 1994 break; 1995 default: 1996 rdev->saved_raid_disk = role; 1997 if ((le32_to_cpu(sb->feature_map) & 1998 MD_FEATURE_RECOVERY_OFFSET)) { 1999 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 2000 if (!(le32_to_cpu(sb->feature_map) & 2001 MD_FEATURE_RECOVERY_BITMAP)) 2002 rdev->saved_raid_disk = -1; 2003 } else { 2004 /* 2005 * If the array is FROZEN, then the device can't 2006 * be in_sync with rest of array. 2007 */ 2008 if (!test_bit(MD_RECOVERY_FROZEN, 2009 &mddev->recovery)) 2010 set_bit(In_sync, &rdev->flags); 2011 } 2012 rdev->raid_disk = role; 2013 break; 2014 } 2015 if (sb->devflags & WriteMostly1) 2016 set_bit(WriteMostly, &rdev->flags); 2017 if (sb->devflags & FailFast1) 2018 set_bit(FailFast, &rdev->flags); 2019 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) 2020 set_bit(Replacement, &rdev->flags); 2021 } else /* MULTIPATH are always insync */ 2022 set_bit(In_sync, &rdev->flags); 2023 2024 return 0; 2025 } 2026 2027 static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) 2028 { 2029 struct mdp_superblock_1 *sb; 2030 struct md_rdev *rdev2; 2031 int max_dev, i; 2032 /* make rdev->sb match mddev and rdev data. 
*/ 2033 2034 sb = page_address(rdev->sb_page); 2035 2036 sb->feature_map = 0; 2037 sb->pad0 = 0; 2038 sb->recovery_offset = cpu_to_le64(0); 2039 memset(sb->pad3, 0, sizeof(sb->pad3)); 2040 2041 sb->utime = cpu_to_le64((__u64)mddev->utime); 2042 sb->events = cpu_to_le64(mddev->events); 2043 if (mddev->in_sync) 2044 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 2045 else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) 2046 sb->resync_offset = cpu_to_le64(MaxSector); 2047 else 2048 sb->resync_offset = cpu_to_le64(0); 2049 2050 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); 2051 2052 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 2053 sb->size = cpu_to_le64(mddev->dev_sectors); 2054 sb->chunksize = cpu_to_le32(mddev->chunk_sectors); 2055 sb->level = cpu_to_le32(mddev->level); 2056 sb->layout = cpu_to_le32(mddev->layout); 2057 if (test_bit(FailFast, &rdev->flags)) 2058 sb->devflags |= FailFast1; 2059 else 2060 sb->devflags &= ~FailFast1; 2061 2062 if (test_bit(WriteMostly, &rdev->flags)) 2063 sb->devflags |= WriteMostly1; 2064 else 2065 sb->devflags &= ~WriteMostly1; 2066 sb->data_offset = cpu_to_le64(rdev->data_offset); 2067 sb->data_size = cpu_to_le64(rdev->sectors); 2068 2069 if (mddev->bitmap && mddev->bitmap_info.file == NULL) { 2070 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); 2071 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 2072 } 2073 2074 if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && 2075 !test_bit(In_sync, &rdev->flags)) { 2076 sb->feature_map |= 2077 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 2078 sb->recovery_offset = 2079 cpu_to_le64(rdev->recovery_offset); 2080 if (rdev->saved_raid_disk >= 0 && mddev->bitmap) 2081 sb->feature_map |= 2082 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); 2083 } 2084 /* Note: recovery_offset and journal_tail share space */ 2085 if (test_bit(Journal, &rdev->flags)) 2086 sb->journal_tail = cpu_to_le64(rdev->journal_tail); 2087 if (test_bit(Replacement, &rdev->flags)) 2088 sb->feature_map |= 2089 cpu_to_le32(MD_FEATURE_REPLACEMENT); 2090 2091 if (mddev->reshape_position != MaxSector) { 2092 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 2093 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 2094 sb->new_layout = cpu_to_le32(mddev->new_layout); 2095 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 2096 sb->new_level = cpu_to_le32(mddev->new_level); 2097 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 2098 if (mddev->delta_disks == 0 && 2099 mddev->reshape_backwards) 2100 sb->feature_map 2101 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); 2102 if (rdev->new_data_offset != rdev->data_offset) { 2103 sb->feature_map 2104 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); 2105 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset 2106 - rdev->data_offset)); 2107 } 2108 } 2109 2110 if (mddev_is_clustered(mddev)) 2111 sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); 2112 2113 if (rdev->badblocks.count == 0) 2114 /* Nothing to do for bad blocks*/ ; 2115 else if (sb->bblog_offset == 0) 2116 /* Cannot record bad blocks on this device */ 2117 md_error(mddev, rdev); 2118 else { 2119 struct badblocks *bb = &rdev->badblocks; 2120 __le64 *bbp = (__le64 *)page_address(rdev->bb_page); 2121 u64 *p = bb->page; 2122 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); 2123 if (bb->changed) { 2124 unsigned seq; 2125 2126 retry: 2127 seq = read_seqbegin(&bb->lock); 2128 2129 memset(bbp, 0xff, PAGE_SIZE); 2130 2131 for (i = 0 ; i < bb->count ; i++) { 2132 u64 
internal_bb = p[i]; 2133 u64 store_bb = ((BB_OFFSET(internal_bb) << 10) 2134 | BB_LEN(internal_bb)); 2135 bbp[i] = cpu_to_le64(store_bb); 2136 } 2137 bb->changed = 0; 2138 if (read_seqretry(&bb->lock, seq)) 2139 goto retry; 2140 2141 bb->sector = (rdev->sb_start + 2142 (int)le32_to_cpu(sb->bblog_offset)); 2143 bb->size = le16_to_cpu(sb->bblog_size); 2144 } 2145 } 2146 2147 max_dev = 0; 2148 rdev_for_each(rdev2, mddev) 2149 if (rdev2->desc_nr+1 > max_dev) 2150 max_dev = rdev2->desc_nr+1; 2151 2152 if (max_dev > le32_to_cpu(sb->max_dev)) { 2153 int bmask; 2154 sb->max_dev = cpu_to_le32(max_dev); 2155 rdev->sb_size = max_dev * 2 + 256; 2156 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 2157 if (rdev->sb_size & bmask) 2158 rdev->sb_size = (rdev->sb_size | bmask) + 1; 2159 } else 2160 max_dev = le32_to_cpu(sb->max_dev); 2161 2162 for (i=0; i<max_dev;i++) 2163 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 2164 2165 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) 2166 sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); 2167 2168 if (test_bit(MD_HAS_PPL, &mddev->flags)) { 2169 if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags)) 2170 sb->feature_map |= 2171 cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS); 2172 else 2173 sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); 2174 sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); 2175 sb->ppl.size = cpu_to_le16(rdev->ppl.size); 2176 } 2177 2178 rdev_for_each(rdev2, mddev) { 2179 i = rdev2->desc_nr; 2180 if (test_bit(Faulty, &rdev2->flags)) 2181 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); 2182 else if (test_bit(In_sync, &rdev2->flags)) 2183 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 2184 else if (test_bit(Journal, &rdev2->flags)) 2185 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); 2186 else if (rdev2->raid_disk >= 0) 2187 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 2188 else 2189 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 2190 } 2191 2192 sb->sb_csum = calc_sb_1_csum(sb); 2193 } 2194 2195 static sector_t super_1_choose_bm_space(sector_t dev_size) 2196 { 2197 sector_t bm_space; 2198 2199 /* if the device is bigger than 8Gig, save 64k for bitmap 2200 * usage, if bigger than 200Gig, save 128k 2201 */ 2202 if (dev_size < 64*2) 2203 bm_space = 0; 2204 else if (dev_size - 64*2 >= 200*1024*1024*2) 2205 bm_space = 128*2; 2206 else if (dev_size - 4*2 > 8*1024*1024*2) 2207 bm_space = 64*2; 2208 else 2209 bm_space = 4*2; 2210 return bm_space; 2211 } 2212 2213 static unsigned long long 2214 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 2215 { 2216 struct mdp_superblock_1 *sb; 2217 sector_t max_sectors; 2218 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 2219 return 0; /* component must fit device */ 2220 if (rdev->data_offset != rdev->new_data_offset) 2221 return 0; /* too confusing */ 2222 if (rdev->sb_start < rdev->data_offset) { 2223 /* minor versions 1 and 2; superblock before data */ 2224 max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; 2225 if (!num_sectors || num_sectors > max_sectors) 2226 num_sectors = max_sectors; 2227 } else if (rdev->mddev->bitmap_info.offset) { 2228 /* minor version 0 with bitmap we can't move */ 2229 return 0; 2230 } else { 2231 /* minor version 0; superblock after data */ 2232 sector_t sb_start, bm_space; 2233 sector_t dev_size = bdev_nr_sectors(rdev->bdev); 2234 2235 /* 8K is for superblock */ 2236 sb_start = dev_size - 8*2; 2237 sb_start &= ~(sector_t)(4*2 - 1); 2238 2239 bm_space = super_1_choose_bm_space(dev_size); 2240 2241 /* Space that 
can be used to store date needs to decrease 2242 * superblock bitmap space and bad block space(4K) 2243 */ 2244 max_sectors = sb_start - bm_space - 4*2; 2245 2246 if (!num_sectors || num_sectors > max_sectors) 2247 num_sectors = max_sectors; 2248 rdev->sb_start = sb_start; 2249 } 2250 sb = page_address(rdev->sb_page); 2251 sb->data_size = cpu_to_le64(num_sectors); 2252 sb->super_offset = cpu_to_le64(rdev->sb_start); 2253 sb->sb_csum = calc_sb_1_csum(sb); 2254 do { 2255 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 2256 rdev->sb_page); 2257 } while (md_super_wait(rdev->mddev) < 0); 2258 return num_sectors; 2259 2260 } 2261 2262 static int 2263 super_1_allow_new_offset(struct md_rdev *rdev, 2264 unsigned long long new_offset) 2265 { 2266 /* All necessary checks on new >= old have been done */ 2267 struct bitmap *bitmap; 2268 if (new_offset >= rdev->data_offset) 2269 return 1; 2270 2271 /* with 1.0 metadata, there is no metadata to tread on 2272 * so we can always move back */ 2273 if (rdev->mddev->minor_version == 0) 2274 return 1; 2275 2276 /* otherwise we must be sure not to step on 2277 * any metadata, so stay: 2278 * 36K beyond start of superblock 2279 * beyond end of badblocks 2280 * beyond write-intent bitmap 2281 */ 2282 if (rdev->sb_start + (32+4)*2 > new_offset) 2283 return 0; 2284 bitmap = rdev->mddev->bitmap; 2285 if (bitmap && !rdev->mddev->bitmap_info.file && 2286 rdev->sb_start + rdev->mddev->bitmap_info.offset + 2287 bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) 2288 return 0; 2289 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) 2290 return 0; 2291 2292 return 1; 2293 } 2294 2295 static struct super_type super_types[] = { 2296 [0] = { 2297 .name = "0.90.0", 2298 .owner = THIS_MODULE, 2299 .load_super = super_90_load, 2300 .validate_super = super_90_validate, 2301 .sync_super = super_90_sync, 2302 .rdev_size_change = super_90_rdev_size_change, 2303 .allow_new_offset = super_90_allow_new_offset, 2304 }, 2305 [1] = { 2306 .name = "md-1", 2307 .owner = THIS_MODULE, 2308 .load_super = super_1_load, 2309 .validate_super = super_1_validate, 2310 .sync_super = super_1_sync, 2311 .rdev_size_change = super_1_rdev_size_change, 2312 .allow_new_offset = super_1_allow_new_offset, 2313 }, 2314 }; 2315 2316 static void sync_super(struct mddev *mddev, struct md_rdev *rdev) 2317 { 2318 if (mddev->sync_super) { 2319 mddev->sync_super(mddev, rdev); 2320 return; 2321 } 2322 2323 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); 2324 2325 super_types[mddev->major_version].sync_super(mddev, rdev); 2326 } 2327 2328 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) 2329 { 2330 struct md_rdev *rdev, *rdev2; 2331 2332 rcu_read_lock(); 2333 rdev_for_each_rcu(rdev, mddev1) { 2334 if (test_bit(Faulty, &rdev->flags) || 2335 test_bit(Journal, &rdev->flags) || 2336 rdev->raid_disk == -1) 2337 continue; 2338 rdev_for_each_rcu(rdev2, mddev2) { 2339 if (test_bit(Faulty, &rdev2->flags) || 2340 test_bit(Journal, &rdev2->flags) || 2341 rdev2->raid_disk == -1) 2342 continue; 2343 if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) { 2344 rcu_read_unlock(); 2345 return 1; 2346 } 2347 } 2348 } 2349 rcu_read_unlock(); 2350 return 0; 2351 } 2352 2353 static LIST_HEAD(pending_raid_disks); 2354 2355 /* 2356 * Try to register data integrity profile for an mddev 2357 * 2358 * This is called when an array is started and after a disk has been kicked 2359 * from the array. 
It only succeeds if all working and active component devices 2360 * are integrity capable with matching profiles. 2361 */ 2362 int md_integrity_register(struct mddev *mddev) 2363 { 2364 struct md_rdev *rdev, *reference = NULL; 2365 2366 if (list_empty(&mddev->disks)) 2367 return 0; /* nothing to do */ 2368 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) 2369 return 0; /* shouldn't register, or already is */ 2370 rdev_for_each(rdev, mddev) { 2371 /* skip spares and non-functional disks */ 2372 if (test_bit(Faulty, &rdev->flags)) 2373 continue; 2374 if (rdev->raid_disk < 0) 2375 continue; 2376 if (!reference) { 2377 /* Use the first rdev as the reference */ 2378 reference = rdev; 2379 continue; 2380 } 2381 /* does this rdev's profile match the reference profile? */ 2382 if (blk_integrity_compare(reference->bdev->bd_disk, 2383 rdev->bdev->bd_disk) < 0) 2384 return -EINVAL; 2385 } 2386 if (!reference || !bdev_get_integrity(reference->bdev)) 2387 return 0; 2388 /* 2389 * All component devices are integrity capable and have matching 2390 * profiles, register the common profile for the md device. 2391 */ 2392 blk_integrity_register(mddev->gendisk, 2393 bdev_get_integrity(reference->bdev)); 2394 2395 pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); 2396 if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || 2397 (mddev->level != 1 && mddev->level != 10 && 2398 bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) { 2399 /* 2400 * No need to handle the failure of bioset_integrity_create, 2401 * because the function is called by md_run() -> pers->run(), 2402 * md_run calls bioset_exit -> bioset_integrity_free in case 2403 * of failure case. 2404 */ 2405 pr_err("md: failed to create integrity pool for %s\n", 2406 mdname(mddev)); 2407 return -EINVAL; 2408 } 2409 return 0; 2410 } 2411 EXPORT_SYMBOL(md_integrity_register); 2412 2413 /* 2414 * Attempt to add an rdev, but only if it is consistent with the current 2415 * integrity profile 2416 */ 2417 int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) 2418 { 2419 struct blk_integrity *bi_mddev; 2420 2421 if (!mddev->gendisk) 2422 return 0; 2423 2424 bi_mddev = blk_get_integrity(mddev->gendisk); 2425 2426 if (!bi_mddev) /* nothing to do */ 2427 return 0; 2428 2429 if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { 2430 pr_err("%s: incompatible integrity profile for %pg\n", 2431 mdname(mddev), rdev->bdev); 2432 return -ENXIO; 2433 } 2434 2435 return 0; 2436 } 2437 EXPORT_SYMBOL(md_integrity_add_rdev); 2438 2439 static bool rdev_read_only(struct md_rdev *rdev) 2440 { 2441 return bdev_read_only(rdev->bdev) || 2442 (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev)); 2443 } 2444 2445 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) 2446 { 2447 char b[BDEVNAME_SIZE]; 2448 int err; 2449 2450 /* prevent duplicates */ 2451 if (find_rdev(mddev, rdev->bdev->bd_dev)) 2452 return -EEXIST; 2453 2454 if (rdev_read_only(rdev) && mddev->pers) 2455 return -EROFS; 2456 2457 /* make sure rdev->sectors exceeds mddev->dev_sectors */ 2458 if (!test_bit(Journal, &rdev->flags) && 2459 rdev->sectors && 2460 (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { 2461 if (mddev->pers) { 2462 /* Cannot change size, so fail 2463 * If mddev->level <= 0, then we don't care 2464 * about aligning sizes (e.g. linear) 2465 */ 2466 if (mddev->level > 0) 2467 return -ENOSPC; 2468 } else 2469 mddev->dev_sectors = rdev->sectors; 2470 } 2471 2472 /* Verify rdev->desc_nr is unique. 
2473 * If it is -1, assign a free number, else 2474 * check number is not in use 2475 */ 2476 rcu_read_lock(); 2477 if (rdev->desc_nr < 0) { 2478 int choice = 0; 2479 if (mddev->pers) 2480 choice = mddev->raid_disks; 2481 while (md_find_rdev_nr_rcu(mddev, choice)) 2482 choice++; 2483 rdev->desc_nr = choice; 2484 } else { 2485 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { 2486 rcu_read_unlock(); 2487 return -EBUSY; 2488 } 2489 } 2490 rcu_read_unlock(); 2491 if (!test_bit(Journal, &rdev->flags) && 2492 mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { 2493 pr_warn("md: %s: array is limited to %d devices\n", 2494 mdname(mddev), mddev->max_disks); 2495 return -EBUSY; 2496 } 2497 snprintf(b, sizeof(b), "%pg", rdev->bdev); 2498 strreplace(b, '/', '!'); 2499 2500 rdev->mddev = mddev; 2501 pr_debug("md: bind<%s>\n", b); 2502 2503 if (mddev->raid_disks) 2504 mddev_create_serial_pool(mddev, rdev); 2505 2506 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 2507 goto fail; 2508 2509 /* failure here is OK */ 2510 err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block"); 2511 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); 2512 rdev->sysfs_unack_badblocks = 2513 sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks"); 2514 rdev->sysfs_badblocks = 2515 sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks"); 2516 2517 list_add_rcu(&rdev->same_set, &mddev->disks); 2518 bd_link_disk_holder(rdev->bdev, mddev->gendisk); 2519 2520 /* May as well allow recovery to be retried once */ 2521 mddev->recovery_disabled++; 2522 2523 return 0; 2524 2525 fail: 2526 pr_warn("md: failed to register dev-%s for %s\n", 2527 b, mdname(mddev)); 2528 return err; 2529 } 2530 2531 void md_autodetect_dev(dev_t dev); 2532 2533 /* just for claiming the bdev */ 2534 static struct md_rdev claim_rdev; 2535 2536 static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) 2537 { 2538 pr_debug("md: export_rdev(%pg)\n", rdev->bdev); 2539 md_rdev_clear(rdev); 2540 #ifndef MODULE 2541 if (test_bit(AutoDetected, &rdev->flags)) 2542 md_autodetect_dev(rdev->bdev->bd_dev); 2543 #endif 2544 bdev_release(rdev->bdev_handle); 2545 rdev->bdev = NULL; 2546 kobject_put(&rdev->kobj); 2547 } 2548 2549 static void md_kick_rdev_from_array(struct md_rdev *rdev) 2550 { 2551 struct mddev *mddev = rdev->mddev; 2552 2553 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); 2554 list_del_rcu(&rdev->same_set); 2555 pr_debug("md: unbind<%pg>\n", rdev->bdev); 2556 mddev_destroy_serial_pool(rdev->mddev, rdev); 2557 rdev->mddev = NULL; 2558 sysfs_remove_link(&rdev->kobj, "block"); 2559 sysfs_put(rdev->sysfs_state); 2560 sysfs_put(rdev->sysfs_unack_badblocks); 2561 sysfs_put(rdev->sysfs_badblocks); 2562 rdev->sysfs_state = NULL; 2563 rdev->sysfs_unack_badblocks = NULL; 2564 rdev->sysfs_badblocks = NULL; 2565 rdev->badblocks.count = 0; 2566 2567 synchronize_rcu(); 2568 2569 /* 2570 * kobject_del() will wait for all in progress writers to be done, where 2571 * reconfig_mutex is held, hence it can't be called under 2572 * reconfig_mutex and it's delayed to mddev_unlock(). 
2573 */ 2574 list_add(&rdev->same_set, &mddev->deleting); 2575 } 2576 2577 static void export_array(struct mddev *mddev) 2578 { 2579 struct md_rdev *rdev; 2580 2581 while (!list_empty(&mddev->disks)) { 2582 rdev = list_first_entry(&mddev->disks, struct md_rdev, 2583 same_set); 2584 md_kick_rdev_from_array(rdev); 2585 } 2586 mddev->raid_disks = 0; 2587 mddev->major_version = 0; 2588 } 2589 2590 static bool set_in_sync(struct mddev *mddev) 2591 { 2592 lockdep_assert_held(&mddev->lock); 2593 if (!mddev->in_sync) { 2594 mddev->sync_checkers++; 2595 spin_unlock(&mddev->lock); 2596 percpu_ref_switch_to_atomic_sync(&mddev->writes_pending); 2597 spin_lock(&mddev->lock); 2598 if (!mddev->in_sync && 2599 percpu_ref_is_zero(&mddev->writes_pending)) { 2600 mddev->in_sync = 1; 2601 /* 2602 * Ensure ->in_sync is visible before we clear 2603 * ->sync_checkers. 2604 */ 2605 smp_mb(); 2606 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2607 sysfs_notify_dirent_safe(mddev->sysfs_state); 2608 } 2609 if (--mddev->sync_checkers == 0) 2610 percpu_ref_switch_to_percpu(&mddev->writes_pending); 2611 } 2612 if (mddev->safemode == 1) 2613 mddev->safemode = 0; 2614 return mddev->in_sync; 2615 } 2616 2617 static void sync_sbs(struct mddev *mddev, int nospares) 2618 { 2619 /* Update each superblock (in-memory image), but 2620 * if we are allowed to, skip spares which already 2621 * have the right event counter, or have one earlier 2622 * (which would mean they aren't being marked as dirty 2623 * with the rest of the array) 2624 */ 2625 struct md_rdev *rdev; 2626 rdev_for_each(rdev, mddev) { 2627 if (rdev->sb_events == mddev->events || 2628 (nospares && 2629 rdev->raid_disk < 0 && 2630 rdev->sb_events+1 == mddev->events)) { 2631 /* Don't update this superblock */ 2632 rdev->sb_loaded = 2; 2633 } else { 2634 sync_super(mddev, rdev); 2635 rdev->sb_loaded = 1; 2636 } 2637 } 2638 } 2639 2640 static bool does_sb_need_changing(struct mddev *mddev) 2641 { 2642 struct md_rdev *rdev = NULL, *iter; 2643 struct mdp_superblock_1 *sb; 2644 int role; 2645 2646 /* Find a good rdev */ 2647 rdev_for_each(iter, mddev) 2648 if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) { 2649 rdev = iter; 2650 break; 2651 } 2652 2653 /* No good device found. */ 2654 if (!rdev) 2655 return false; 2656 2657 sb = page_address(rdev->sb_page); 2658 /* Check if a device has become faulty or a spare become active */ 2659 rdev_for_each(rdev, mddev) { 2660 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 2661 /* Device activated? */ 2662 if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 && 2663 !test_bit(Faulty, &rdev->flags)) 2664 return true; 2665 /* Device turned faulty? 
*/
2666 if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX))
2667 return true;
2668 }
2669
2670 /* Check if any mddev parameters have changed */
2671 if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
2672 (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
2673 (mddev->layout != le32_to_cpu(sb->layout)) ||
2674 (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
2675 (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
2676 return true;
2677
2678 return false;
2679 }
2680
2681 void md_update_sb(struct mddev *mddev, int force_change)
2682 {
2683 struct md_rdev *rdev;
2684 int sync_req;
2685 int nospares = 0;
2686 int any_badblocks_changed = 0;
2687 int ret = -1;
2688
2689 if (!md_is_rdwr(mddev)) {
2690 if (force_change)
2691 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2692 return;
2693 }
2694
2695 repeat:
2696 if (mddev_is_clustered(mddev)) {
2697 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2698 force_change = 1;
2699 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2700 nospares = 1;
2701 ret = md_cluster_ops->metadata_update_start(mddev);
2702 /* Has someone else updated the sb? */
2703 if (!does_sb_need_changing(mddev)) {
2704 if (ret == 0)
2705 md_cluster_ops->metadata_update_cancel(mddev);
2706 bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2707 BIT(MD_SB_CHANGE_DEVS) |
2708 BIT(MD_SB_CHANGE_CLEAN));
2709 return;
2710 }
2711 }
2712
2713 /*
2714 * First make sure individual recovery_offsets are correct.
2715 * curr_resync_completed can only be used during recovery.
2716 * During reshape/resync it might use array-addresses rather
2717 * than device addresses.
2718 */
2719 rdev_for_each(rdev, mddev) {
2720 if (rdev->raid_disk >= 0 &&
2721 mddev->delta_disks >= 0 &&
2722 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
2723 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
2724 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
2725 !test_bit(Journal, &rdev->flags) &&
2726 !test_bit(In_sync, &rdev->flags) &&
2727 mddev->curr_resync_completed > rdev->recovery_offset)
2728 rdev->recovery_offset = mddev->curr_resync_completed;
2729
2730 }
2731 if (!mddev->persistent) {
2732 clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2733 clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2734 if (!mddev->external) {
2735 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
2736 rdev_for_each(rdev, mddev) {
2737 if (rdev->badblocks.changed) {
2738 rdev->badblocks.changed = 0;
2739 ack_all_badblocks(&rdev->badblocks);
2740 md_error(mddev, rdev);
2741 }
2742 clear_bit(Blocked, &rdev->flags);
2743 clear_bit(BlockedBadBlocks, &rdev->flags);
2744 wake_up(&rdev->blocked_wait);
2745 }
2746 }
2747 wake_up(&mddev->sb_wait);
2748 return;
2749 }
2750
2751 spin_lock(&mddev->lock);
2752
2753 mddev->utime = ktime_get_real_seconds();
2754
2755 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2756 force_change = 1;
2757 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2758 /* just a clean <-> dirty transition, possibly leave spares alone,
2759 * though if events isn't the right even/odd, we will have to do
2760 * spares after all
2761 */
2762 nospares = 1;
2763 if (force_change)
2764 nospares = 0;
2765 if (mddev->degraded)
2766 /* If the array is degraded, then skipping spares is both
2767 * dangerous and fairly pointless.
2768 * Dangerous because a device that was removed from the array
2769 * might have an event_count that still looks up-to-date,
2770 * so it can be re-added without a resync.
2771 * Pointless because if there are any spares to skip, 2772 * then a recovery will happen and soon that array won't 2773 * be degraded any more and the spare can go back to sleep then. 2774 */ 2775 nospares = 0; 2776 2777 sync_req = mddev->in_sync; 2778 2779 /* If this is just a dirty<->clean transition, and the array is clean 2780 * and 'events' is odd, we can roll back to the previous clean state */ 2781 if (nospares 2782 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 2783 && mddev->can_decrease_events 2784 && mddev->events != 1) { 2785 mddev->events--; 2786 mddev->can_decrease_events = 0; 2787 } else { 2788 /* otherwise we have to go forward and ... */ 2789 mddev->events ++; 2790 mddev->can_decrease_events = nospares; 2791 } 2792 2793 /* 2794 * This 64-bit counter should never wrap. 2795 * Either we are in around ~1 trillion A.C., assuming 2796 * 1 reboot per second, or we have a bug... 2797 */ 2798 WARN_ON(mddev->events == 0); 2799 2800 rdev_for_each(rdev, mddev) { 2801 if (rdev->badblocks.changed) 2802 any_badblocks_changed++; 2803 if (test_bit(Faulty, &rdev->flags)) 2804 set_bit(FaultRecorded, &rdev->flags); 2805 } 2806 2807 sync_sbs(mddev, nospares); 2808 spin_unlock(&mddev->lock); 2809 2810 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", 2811 mdname(mddev), mddev->in_sync); 2812 2813 if (mddev->queue) 2814 blk_add_trace_msg(mddev->queue, "md md_update_sb"); 2815 rewrite: 2816 md_bitmap_update_sb(mddev->bitmap); 2817 rdev_for_each(rdev, mddev) { 2818 if (rdev->sb_loaded != 1) 2819 continue; /* no noise on spare devices */ 2820 2821 if (!test_bit(Faulty, &rdev->flags)) { 2822 md_super_write(mddev,rdev, 2823 rdev->sb_start, rdev->sb_size, 2824 rdev->sb_page); 2825 pr_debug("md: (write) %pg's sb offset: %llu\n", 2826 rdev->bdev, 2827 (unsigned long long)rdev->sb_start); 2828 rdev->sb_events = mddev->events; 2829 if (rdev->badblocks.size) { 2830 md_super_write(mddev, rdev, 2831 rdev->badblocks.sector, 2832 rdev->badblocks.size << 9, 2833 rdev->bb_page); 2834 rdev->badblocks.size = 0; 2835 } 2836 2837 } else 2838 pr_debug("md: %pg (skipping faulty)\n", 2839 rdev->bdev); 2840 2841 if (mddev->level == LEVEL_MULTIPATH) 2842 /* only need to write one superblock... 
*/
2843 break;
2844 }
2845 if (md_super_wait(mddev) < 0)
2846 goto rewrite;
2847 /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */
2848
2849 if (mddev_is_clustered(mddev) && ret == 0)
2850 md_cluster_ops->metadata_update_finish(mddev);
2851
2852 if (mddev->in_sync != sync_req ||
2853 !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2854 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
2855 /* have to write it out again */
2856 goto repeat;
2857 wake_up(&mddev->sb_wait);
2858 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2859 sysfs_notify_dirent_safe(mddev->sysfs_completed);
2860
2861 rdev_for_each(rdev, mddev) {
2862 if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2863 clear_bit(Blocked, &rdev->flags);
2864
2865 if (any_badblocks_changed)
2866 ack_all_badblocks(&rdev->badblocks);
2867 clear_bit(BlockedBadBlocks, &rdev->flags);
2868 wake_up(&rdev->blocked_wait);
2869 }
2870 }
2871 EXPORT_SYMBOL(md_update_sb);
2872
2873 static int add_bound_rdev(struct md_rdev *rdev)
2874 {
2875 struct mddev *mddev = rdev->mddev;
2876 int err = 0;
2877 bool add_journal = test_bit(Journal, &rdev->flags);
2878
2879 if (!mddev->pers->hot_remove_disk || add_journal) {
2880 /* If there is hot_add_disk but no hot_remove_disk
2881 * then added disks are for geometry changes,
2882 * and should be added immediately.
2883 */
2884 super_types[mddev->major_version].
2885 validate_super(mddev, rdev);
2886 err = mddev->pers->hot_add_disk(mddev, rdev);
2887 if (err) {
2888 md_kick_rdev_from_array(rdev);
2889 return err;
2890 }
2891 }
2892 sysfs_notify_dirent_safe(rdev->sysfs_state);
2893
2894 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2895 if (mddev->degraded)
2896 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
2897 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2898 md_new_event();
2899 md_wakeup_thread(mddev->thread);
2900 return 0;
2901 }
2902
2903 /* words written to sysfs files may, or may not, be \n terminated.
2904 * We want to accept either case. For this we use cmd_match.
2905 */
2906 static int cmd_match(const char *cmd, const char *str)
2907 {
2908 /* See if cmd, written into a sysfs file, matches
2909 * str.
They must either be the same, or cmd can 2910 * have a trailing newline 2911 */ 2912 while (*cmd && *str && *cmd == *str) { 2913 cmd++; 2914 str++; 2915 } 2916 if (*cmd == '\n') 2917 cmd++; 2918 if (*str || *cmd) 2919 return 0; 2920 return 1; 2921 } 2922 2923 struct rdev_sysfs_entry { 2924 struct attribute attr; 2925 ssize_t (*show)(struct md_rdev *, char *); 2926 ssize_t (*store)(struct md_rdev *, const char *, size_t); 2927 }; 2928 2929 static ssize_t 2930 state_show(struct md_rdev *rdev, char *page) 2931 { 2932 char *sep = ","; 2933 size_t len = 0; 2934 unsigned long flags = READ_ONCE(rdev->flags); 2935 2936 if (test_bit(Faulty, &flags) || 2937 (!test_bit(ExternalBbl, &flags) && 2938 rdev->badblocks.unacked_exist)) 2939 len += sprintf(page+len, "faulty%s", sep); 2940 if (test_bit(In_sync, &flags)) 2941 len += sprintf(page+len, "in_sync%s", sep); 2942 if (test_bit(Journal, &flags)) 2943 len += sprintf(page+len, "journal%s", sep); 2944 if (test_bit(WriteMostly, &flags)) 2945 len += sprintf(page+len, "write_mostly%s", sep); 2946 if (test_bit(Blocked, &flags) || 2947 (rdev->badblocks.unacked_exist 2948 && !test_bit(Faulty, &flags))) 2949 len += sprintf(page+len, "blocked%s", sep); 2950 if (!test_bit(Faulty, &flags) && 2951 !test_bit(Journal, &flags) && 2952 !test_bit(In_sync, &flags)) 2953 len += sprintf(page+len, "spare%s", sep); 2954 if (test_bit(WriteErrorSeen, &flags)) 2955 len += sprintf(page+len, "write_error%s", sep); 2956 if (test_bit(WantReplacement, &flags)) 2957 len += sprintf(page+len, "want_replacement%s", sep); 2958 if (test_bit(Replacement, &flags)) 2959 len += sprintf(page+len, "replacement%s", sep); 2960 if (test_bit(ExternalBbl, &flags)) 2961 len += sprintf(page+len, "external_bbl%s", sep); 2962 if (test_bit(FailFast, &flags)) 2963 len += sprintf(page+len, "failfast%s", sep); 2964 2965 if (len) 2966 len -= strlen(sep); 2967 2968 return len+sprintf(page+len, "\n"); 2969 } 2970 2971 static ssize_t 2972 state_store(struct md_rdev *rdev, const char *buf, size_t len) 2973 { 2974 /* can write 2975 * faulty - simulates an error 2976 * remove - disconnects the device 2977 * writemostly - sets write_mostly 2978 * -writemostly - clears write_mostly 2979 * blocked - sets the Blocked flags 2980 * -blocked - clears the Blocked and possibly simulates an error 2981 * insync - sets Insync providing device isn't active 2982 * -insync - clear Insync for a device with a slot assigned, 2983 * so that it gets rebuilt based on bitmap 2984 * write_error - sets WriteErrorSeen 2985 * -write_error - clears WriteErrorSeen 2986 * {,-}failfast - set/clear FailFast 2987 */ 2988 2989 struct mddev *mddev = rdev->mddev; 2990 int err = -EINVAL; 2991 bool need_update_sb = false; 2992 2993 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 2994 md_error(rdev->mddev, rdev); 2995 2996 if (test_bit(MD_BROKEN, &rdev->mddev->flags)) 2997 err = -EBUSY; 2998 else 2999 err = 0; 3000 } else if (cmd_match(buf, "remove")) { 3001 if (rdev->mddev->pers) { 3002 clear_bit(Blocked, &rdev->flags); 3003 remove_and_add_spares(rdev->mddev, rdev); 3004 } 3005 if (rdev->raid_disk >= 0) 3006 err = -EBUSY; 3007 else { 3008 err = 0; 3009 if (mddev_is_clustered(mddev)) 3010 err = md_cluster_ops->remove_disk(mddev, rdev); 3011 3012 if (err == 0) { 3013 md_kick_rdev_from_array(rdev); 3014 if (mddev->pers) { 3015 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 3016 md_wakeup_thread(mddev->thread); 3017 } 3018 md_new_event(); 3019 } 3020 } 3021 } else if (cmd_match(buf, "writemostly")) { 3022 set_bit(WriteMostly, &rdev->flags); 3023 
mddev_create_serial_pool(rdev->mddev, rdev); 3024 need_update_sb = true; 3025 err = 0; 3026 } else if (cmd_match(buf, "-writemostly")) { 3027 mddev_destroy_serial_pool(rdev->mddev, rdev); 3028 clear_bit(WriteMostly, &rdev->flags); 3029 need_update_sb = true; 3030 err = 0; 3031 } else if (cmd_match(buf, "blocked")) { 3032 set_bit(Blocked, &rdev->flags); 3033 err = 0; 3034 } else if (cmd_match(buf, "-blocked")) { 3035 if (!test_bit(Faulty, &rdev->flags) && 3036 !test_bit(ExternalBbl, &rdev->flags) && 3037 rdev->badblocks.unacked_exist) { 3038 /* metadata handler doesn't understand badblocks, 3039 * so we need to fail the device 3040 */ 3041 md_error(rdev->mddev, rdev); 3042 } 3043 clear_bit(Blocked, &rdev->flags); 3044 clear_bit(BlockedBadBlocks, &rdev->flags); 3045 wake_up(&rdev->blocked_wait); 3046 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3047 md_wakeup_thread(rdev->mddev->thread); 3048 3049 err = 0; 3050 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 3051 set_bit(In_sync, &rdev->flags); 3052 err = 0; 3053 } else if (cmd_match(buf, "failfast")) { 3054 set_bit(FailFast, &rdev->flags); 3055 need_update_sb = true; 3056 err = 0; 3057 } else if (cmd_match(buf, "-failfast")) { 3058 clear_bit(FailFast, &rdev->flags); 3059 need_update_sb = true; 3060 err = 0; 3061 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && 3062 !test_bit(Journal, &rdev->flags)) { 3063 if (rdev->mddev->pers == NULL) { 3064 clear_bit(In_sync, &rdev->flags); 3065 rdev->saved_raid_disk = rdev->raid_disk; 3066 rdev->raid_disk = -1; 3067 err = 0; 3068 } 3069 } else if (cmd_match(buf, "write_error")) { 3070 set_bit(WriteErrorSeen, &rdev->flags); 3071 err = 0; 3072 } else if (cmd_match(buf, "-write_error")) { 3073 clear_bit(WriteErrorSeen, &rdev->flags); 3074 err = 0; 3075 } else if (cmd_match(buf, "want_replacement")) { 3076 /* Any non-spare device that is not a replacement can 3077 * become want_replacement at any time, but we then need to 3078 * check if recovery is needed. 3079 */ 3080 if (rdev->raid_disk >= 0 && 3081 !test_bit(Journal, &rdev->flags) && 3082 !test_bit(Replacement, &rdev->flags)) 3083 set_bit(WantReplacement, &rdev->flags); 3084 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3085 md_wakeup_thread(rdev->mddev->thread); 3086 err = 0; 3087 } else if (cmd_match(buf, "-want_replacement")) { 3088 /* Clearing 'want_replacement' is always allowed. 3089 * Once replacements starts it is too late though. 3090 */ 3091 err = 0; 3092 clear_bit(WantReplacement, &rdev->flags); 3093 } else if (cmd_match(buf, "replacement")) { 3094 /* Can only set a device as a replacement when array has not 3095 * yet been started. Once running, replacement is automatic 3096 * from spares, or by assigning 'slot'. 3097 */ 3098 if (rdev->mddev->pers) 3099 err = -EBUSY; 3100 else { 3101 set_bit(Replacement, &rdev->flags); 3102 err = 0; 3103 } 3104 } else if (cmd_match(buf, "-replacement")) { 3105 /* Similarly, can only clear Replacement before start */ 3106 if (rdev->mddev->pers) 3107 err = -EBUSY; 3108 else { 3109 clear_bit(Replacement, &rdev->flags); 3110 err = 0; 3111 } 3112 } else if (cmd_match(buf, "re-add")) { 3113 if (!rdev->mddev->pers) 3114 err = -EINVAL; 3115 else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) && 3116 rdev->saved_raid_disk >= 0) { 3117 /* clear_bit is performed _after_ all the devices 3118 * have their local Faulty bit cleared. 
If any writes 3119 * happen in the meantime in the local node, they 3120 * will land in the local bitmap, which will be synced 3121 * by this node eventually 3122 */ 3123 if (!mddev_is_clustered(rdev->mddev) || 3124 (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { 3125 clear_bit(Faulty, &rdev->flags); 3126 err = add_bound_rdev(rdev); 3127 } 3128 } else 3129 err = -EBUSY; 3130 } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) { 3131 set_bit(ExternalBbl, &rdev->flags); 3132 rdev->badblocks.shift = 0; 3133 err = 0; 3134 } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) { 3135 clear_bit(ExternalBbl, &rdev->flags); 3136 err = 0; 3137 } 3138 if (need_update_sb) 3139 md_update_sb(mddev, 1); 3140 if (!err) 3141 sysfs_notify_dirent_safe(rdev->sysfs_state); 3142 return err ? err : len; 3143 } 3144 static struct rdev_sysfs_entry rdev_state = 3145 __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store); 3146 3147 static ssize_t 3148 errors_show(struct md_rdev *rdev, char *page) 3149 { 3150 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 3151 } 3152 3153 static ssize_t 3154 errors_store(struct md_rdev *rdev, const char *buf, size_t len) 3155 { 3156 unsigned int n; 3157 int rv; 3158 3159 rv = kstrtouint(buf, 10, &n); 3160 if (rv < 0) 3161 return rv; 3162 atomic_set(&rdev->corrected_errors, n); 3163 return len; 3164 } 3165 static struct rdev_sysfs_entry rdev_errors = 3166 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 3167 3168 static ssize_t 3169 slot_show(struct md_rdev *rdev, char *page) 3170 { 3171 if (test_bit(Journal, &rdev->flags)) 3172 return sprintf(page, "journal\n"); 3173 else if (rdev->raid_disk < 0) 3174 return sprintf(page, "none\n"); 3175 else 3176 return sprintf(page, "%d\n", rdev->raid_disk); 3177 } 3178 3179 static ssize_t 3180 slot_store(struct md_rdev *rdev, const char *buf, size_t len) 3181 { 3182 int slot; 3183 int err; 3184 3185 if (test_bit(Journal, &rdev->flags)) 3186 return -EBUSY; 3187 if (strncmp(buf, "none", 4)==0) 3188 slot = -1; 3189 else { 3190 err = kstrtouint(buf, 10, (unsigned int *)&slot); 3191 if (err < 0) 3192 return err; 3193 if (slot < 0) 3194 /* overflow */ 3195 return -ENOSPC; 3196 } 3197 if (rdev->mddev->pers && slot == -1) { 3198 /* Setting 'slot' on an active array requires also 3199 * updating the 'rd%d' link, and communicating 3200 * with the personality with ->hot_*_disk. 3201 * For now we only support removing 3202 * failed/spare devices. This normally happens automatically, 3203 * but not when the metadata is externally managed. 3204 */ 3205 if (rdev->raid_disk == -1) 3206 return -EEXIST; 3207 /* personality does all needed checks */ 3208 if (rdev->mddev->pers->hot_remove_disk == NULL) 3209 return -EINVAL; 3210 clear_bit(Blocked, &rdev->flags); 3211 remove_and_add_spares(rdev->mddev, rdev); 3212 if (rdev->raid_disk >= 0) 3213 return -EBUSY; 3214 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3215 md_wakeup_thread(rdev->mddev->thread); 3216 } else if (rdev->mddev->pers) { 3217 /* Activating a spare .. or possibly reactivating 3218 * if we ever get bitmaps working here. 
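*
* Illustrative usage (hypothetical device names, assuming the usual
* /sys/block/mdX/md/dev-YYY sysfs layout): writing "2" to the rdev's
* "slot" attribute asks the personality to hot-add the device into
* slot 2, while writing "none" requests removal of a failed or spare
* device as described above.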
3219 */ 3220 int err; 3221 3222 if (rdev->raid_disk != -1) 3223 return -EBUSY; 3224 3225 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) 3226 return -EBUSY; 3227 3228 if (rdev->mddev->pers->hot_add_disk == NULL) 3229 return -EINVAL; 3230 3231 if (slot >= rdev->mddev->raid_disks && 3232 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 3233 return -ENOSPC; 3234 3235 rdev->raid_disk = slot; 3236 if (test_bit(In_sync, &rdev->flags)) 3237 rdev->saved_raid_disk = slot; 3238 else 3239 rdev->saved_raid_disk = -1; 3240 clear_bit(In_sync, &rdev->flags); 3241 clear_bit(Bitmap_sync, &rdev->flags); 3242 err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev); 3243 if (err) { 3244 rdev->raid_disk = -1; 3245 return err; 3246 } else 3247 sysfs_notify_dirent_safe(rdev->sysfs_state); 3248 /* failure here is OK */; 3249 sysfs_link_rdev(rdev->mddev, rdev); 3250 /* don't wakeup anyone, leave that to userspace. */ 3251 } else { 3252 if (slot >= rdev->mddev->raid_disks && 3253 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 3254 return -ENOSPC; 3255 rdev->raid_disk = slot; 3256 /* assume it is working */ 3257 clear_bit(Faulty, &rdev->flags); 3258 clear_bit(WriteMostly, &rdev->flags); 3259 set_bit(In_sync, &rdev->flags); 3260 sysfs_notify_dirent_safe(rdev->sysfs_state); 3261 } 3262 return len; 3263 } 3264 3265 static struct rdev_sysfs_entry rdev_slot = 3266 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 3267 3268 static ssize_t 3269 offset_show(struct md_rdev *rdev, char *page) 3270 { 3271 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 3272 } 3273 3274 static ssize_t 3275 offset_store(struct md_rdev *rdev, const char *buf, size_t len) 3276 { 3277 unsigned long long offset; 3278 if (kstrtoull(buf, 10, &offset) < 0) 3279 return -EINVAL; 3280 if (rdev->mddev->pers && rdev->raid_disk >= 0) 3281 return -EBUSY; 3282 if (rdev->sectors && rdev->mddev->external) 3283 /* Must set offset before size, so overlap checks 3284 * can be sane */ 3285 return -EBUSY; 3286 rdev->data_offset = offset; 3287 rdev->new_data_offset = offset; 3288 return len; 3289 } 3290 3291 static struct rdev_sysfs_entry rdev_offset = 3292 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 3293 3294 static ssize_t new_offset_show(struct md_rdev *rdev, char *page) 3295 { 3296 return sprintf(page, "%llu\n", 3297 (unsigned long long)rdev->new_data_offset); 3298 } 3299 3300 static ssize_t new_offset_store(struct md_rdev *rdev, 3301 const char *buf, size_t len) 3302 { 3303 unsigned long long new_offset; 3304 struct mddev *mddev = rdev->mddev; 3305 3306 if (kstrtoull(buf, 10, &new_offset) < 0) 3307 return -EINVAL; 3308 3309 if (mddev->sync_thread || 3310 test_bit(MD_RECOVERY_RUNNING,&mddev->recovery)) 3311 return -EBUSY; 3312 if (new_offset == rdev->data_offset) 3313 /* reset is always permitted */ 3314 ; 3315 else if (new_offset > rdev->data_offset) { 3316 /* must not push array size beyond rdev_sectors */ 3317 if (new_offset - rdev->data_offset 3318 + mddev->dev_sectors > rdev->sectors) 3319 return -E2BIG; 3320 } 3321 /* Metadata worries about other space details. */ 3322 3323 /* decreasing the offset is inconsistent with a backwards 3324 * reshape. 3325 */ 3326 if (new_offset < rdev->data_offset && 3327 mddev->reshape_backwards) 3328 return -EINVAL; 3329 /* Increasing offset is inconsistent with forwards 3330 * reshape. reshape_direction should be set to 3331 * 'backwards' first. 
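* (For example, with a typical sysfs layout one would first write
* "backwards" to the array's reshape_direction attribute and only then
* write the larger value to this rdev's new_offset file; illustrative
* only.)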
3332 */
3333 if (new_offset > rdev->data_offset &&
3334 !mddev->reshape_backwards)
3335 return -EINVAL;
3336
3337 if (mddev->pers && mddev->persistent &&
3338 !super_types[mddev->major_version]
3339 .allow_new_offset(rdev, new_offset))
3340 return -E2BIG;
3341 rdev->new_data_offset = new_offset;
3342 if (new_offset > rdev->data_offset)
3343 mddev->reshape_backwards = 1;
3344 else if (new_offset < rdev->data_offset)
3345 mddev->reshape_backwards = 0;
3346
3347 return len;
3348 }
3349 static struct rdev_sysfs_entry rdev_new_offset =
3350 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
3351
3352 static ssize_t
3353 rdev_size_show(struct md_rdev *rdev, char *page)
3354 {
3355 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
3356 }
3357
3358 static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b)
3359 {
3360 /* check if two start/length pairs overlap */
3361 if (a->data_offset + a->sectors <= b->data_offset)
3362 return false;
3363 if (b->data_offset + b->sectors <= a->data_offset)
3364 return false;
3365 return true;
3366 }
3367
3368 static bool md_rdev_overlaps(struct md_rdev *rdev)
3369 {
3370 struct mddev *mddev;
3371 struct md_rdev *rdev2;
3372
3373 spin_lock(&all_mddevs_lock);
3374 list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
3375 if (test_bit(MD_DELETED, &mddev->flags))
3376 continue;
3377 rdev_for_each(rdev2, mddev) {
3378 if (rdev != rdev2 && rdev->bdev == rdev2->bdev &&
3379 md_rdevs_overlap(rdev, rdev2)) {
3380 spin_unlock(&all_mddevs_lock);
3381 return true;
3382 }
3383 }
3384 }
3385 spin_unlock(&all_mddevs_lock);
3386 return false;
3387 }
3388
3389 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
3390 {
3391 unsigned long long blocks;
3392 sector_t new;
3393
3394 if (kstrtoull(buf, 10, &blocks) < 0)
3395 return -EINVAL;
3396
3397 if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
3398 return -EINVAL; /* sector conversion overflow */
3399
3400 new = blocks * 2;
3401 if (new != blocks * 2)
3402 return -EINVAL; /* unsigned long long to sector_t overflow */
3403
3404 *sectors = new;
3405 return 0;
3406 }
3407
3408 static ssize_t
3409 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3410 {
3411 struct mddev *my_mddev = rdev->mddev;
3412 sector_t oldsectors = rdev->sectors;
3413 sector_t sectors;
3414
3415 if (test_bit(Journal, &rdev->flags))
3416 return -EBUSY;
3417 if (strict_blocks_to_sectors(buf, &sectors) < 0)
3418 return -EINVAL;
3419 if (rdev->data_offset != rdev->new_data_offset)
3420 return -EINVAL; /* too confusing */
3421 if (my_mddev->pers && rdev->raid_disk >= 0) {
3422 if (my_mddev->persistent) {
3423 sectors = super_types[my_mddev->major_version].
3424 rdev_size_change(rdev, sectors);
3425 if (!sectors)
3426 return -EBUSY;
3427 } else if (!sectors)
3428 sectors = bdev_nr_sectors(rdev->bdev) -
3429 rdev->data_offset;
3430 if (!my_mddev->pers->resize)
3431 /* Cannot change size for RAID0 or Linear etc */
3432 return -EINVAL;
3433 }
3434 if (sectors < my_mddev->dev_sectors)
3435 return -EINVAL; /* component must fit device */
3436
3437 rdev->sectors = sectors;
3438
3439 /*
3440 * Check that all other rdevs with the same bdev do not overlap. This
3441 * check does not provide a hard guarantee, it just helps avoid
3442 * dangerous mistakes.
3443 */
3444 if (sectors > oldsectors && my_mddev->external &&
3445 md_rdev_overlaps(rdev)) {
3446 /*
3447 * Someone else could have slipped in a size change here, but
3448 * doing so is just silly.
We put oldsectors back because we
3449 * know it is safe, and trust userspace not to race with itself.
3450 */
3451 rdev->sectors = oldsectors;
3452 return -EBUSY;
3453 }
3454 return len;
3455 }
3456
3457 static struct rdev_sysfs_entry rdev_size =
3458 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
3459
3460 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
3461 {
3462 unsigned long long recovery_start = rdev->recovery_offset;
3463
3464 if (test_bit(In_sync, &rdev->flags) ||
3465 recovery_start == MaxSector)
3466 return sprintf(page, "none\n");
3467
3468 return sprintf(page, "%llu\n", recovery_start);
3469 }
3470
3471 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
3472 {
3473 unsigned long long recovery_start;
3474
3475 if (cmd_match(buf, "none"))
3476 recovery_start = MaxSector;
3477 else if (kstrtoull(buf, 10, &recovery_start))
3478 return -EINVAL;
3479
3480 if (rdev->mddev->pers &&
3481 rdev->raid_disk >= 0)
3482 return -EBUSY;
3483
3484 rdev->recovery_offset = recovery_start;
3485 if (recovery_start == MaxSector)
3486 set_bit(In_sync, &rdev->flags);
3487 else
3488 clear_bit(In_sync, &rdev->flags);
3489 return len;
3490 }
3491
3492 static struct rdev_sysfs_entry rdev_recovery_start =
3493 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
3494
3495 /* sysfs access to bad-blocks list.
3496 * We present two files.
3497 * 'bad-blocks' lists sector numbers and lengths of ranges that
3498 * are recorded as bad. The list is truncated to fit within
3499 * the one-page limit of sysfs.
3500 * Writing "sector length" to this file adds an acknowledged
3501 * bad block to the list.
3502 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
3503 * been acknowledged. Writing to this file adds bad blocks
3504 * without acknowledging them. This is largely for testing.
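*
* Example (illustrative only; exact sysfs paths depend on the setup):
* "echo '4096 8' > /sys/block/md0/md/dev-sda1/bad_blocks" records an
* acknowledged 8-sector bad range starting at sector 4096 on that rdev,
* while writing the same string to unacknowledged_bad_blocks records it
* without acknowledgement.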
3505 */
3506 static ssize_t bb_show(struct md_rdev *rdev, char *page)
3507 {
3508 return badblocks_show(&rdev->badblocks, page, 0);
3509 }
3510 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
3511 {
3512 int rv = badblocks_store(&rdev->badblocks, page, len, 0);
3513 /* Maybe that ack was all we needed */
3514 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
3515 wake_up(&rdev->blocked_wait);
3516 return rv;
3517 }
3518 static struct rdev_sysfs_entry rdev_bad_blocks =
3519 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
3520
3521 static ssize_t ubb_show(struct md_rdev *rdev, char *page)
3522 {
3523 return badblocks_show(&rdev->badblocks, page, 1);
3524 }
3525 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
3526 {
3527 return badblocks_store(&rdev->badblocks, page, len, 1);
3528 }
3529 static struct rdev_sysfs_entry rdev_unack_bad_blocks =
3530 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
3531
3532 static ssize_t
3533 ppl_sector_show(struct md_rdev *rdev, char *page)
3534 {
3535 return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector);
3536 }
3537
3538 static ssize_t
3539 ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len)
3540 {
3541 unsigned long long sector;
3542
3543 if (kstrtoull(buf, 10, &sector) < 0)
3544 return -EINVAL;
3545 if (sector != (sector_t)sector)
3546 return -EINVAL;
3547
3548 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3549 rdev->raid_disk >= 0)
3550 return -EBUSY;
3551
3552 if (rdev->mddev->persistent) {
3553 if (rdev->mddev->major_version == 0)
3554 return -EINVAL;
3555 if ((sector > rdev->sb_start &&
3556 sector - rdev->sb_start > S16_MAX) ||
3557 (sector < rdev->sb_start &&
3558 rdev->sb_start - sector > -S16_MIN))
3559 return -EINVAL;
3560 rdev->ppl.offset = sector - rdev->sb_start;
3561 } else if (!rdev->mddev->external) {
3562 return -EBUSY;
3563 }
3564 rdev->ppl.sector = sector;
3565 return len;
3566 }
3567
3568 static struct rdev_sysfs_entry rdev_ppl_sector =
3569 __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store);
3570
3571 static ssize_t
3572 ppl_size_show(struct md_rdev *rdev, char *page)
3573 {
3574 return sprintf(page, "%u\n", rdev->ppl.size);
3575 }
3576
3577 static ssize_t
3578 ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3579 {
3580 unsigned int size;
3581
3582 if (kstrtouint(buf, 10, &size) < 0)
3583 return -EINVAL;
3584
3585 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3586 rdev->raid_disk >= 0)
3587 return -EBUSY;
3588
3589 if (rdev->mddev->persistent) {
3590 if (rdev->mddev->major_version == 0)
3591 return -EINVAL;
3592 if (size > U16_MAX)
3593 return -EINVAL;
3594 } else if (!rdev->mddev->external) {
3595 return -EBUSY;
3596 }
3597 rdev->ppl.size = size;
3598 return len;
3599 }
3600
3601 static struct rdev_sysfs_entry rdev_ppl_size =
3602 __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store);
3603
3604 static struct attribute *rdev_default_attrs[] = {
3605 &rdev_state.attr,
3606 &rdev_errors.attr,
3607 &rdev_slot.attr,
3608 &rdev_offset.attr,
3609 &rdev_new_offset.attr,
3610 &rdev_size.attr,
3611 &rdev_recovery_start.attr,
3612 &rdev_bad_blocks.attr,
3613 &rdev_unack_bad_blocks.attr,
3614 &rdev_ppl_sector.attr,
3615 &rdev_ppl_size.attr,
3616 NULL,
3617 };
3618 ATTRIBUTE_GROUPS(rdev_default);
3619 static ssize_t
3620 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3621 {
3622 struct rdev_sysfs_entry
*entry = container_of(attr, struct rdev_sysfs_entry, attr); 3623 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3624 3625 if (!entry->show) 3626 return -EIO; 3627 if (!rdev->mddev) 3628 return -ENODEV; 3629 return entry->show(rdev, page); 3630 } 3631 3632 static ssize_t 3633 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 3634 const char *page, size_t length) 3635 { 3636 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3637 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3638 struct kernfs_node *kn = NULL; 3639 bool suspend = false; 3640 ssize_t rv; 3641 struct mddev *mddev = rdev->mddev; 3642 3643 if (!entry->store) 3644 return -EIO; 3645 if (!capable(CAP_SYS_ADMIN)) 3646 return -EACCES; 3647 if (!mddev) 3648 return -ENODEV; 3649 3650 if (entry->store == state_store) { 3651 if (cmd_match(page, "remove")) 3652 kn = sysfs_break_active_protection(kobj, attr); 3653 if (cmd_match(page, "remove") || cmd_match(page, "re-add") || 3654 cmd_match(page, "writemostly") || 3655 cmd_match(page, "-writemostly")) 3656 suspend = true; 3657 } 3658 3659 rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev); 3660 if (!rv) { 3661 if (rdev->mddev == NULL) 3662 rv = -ENODEV; 3663 else 3664 rv = entry->store(rdev, page, length); 3665 suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); 3666 } 3667 3668 if (kn) 3669 sysfs_unbreak_active_protection(kn); 3670 3671 return rv; 3672 } 3673 3674 static void rdev_free(struct kobject *ko) 3675 { 3676 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); 3677 kfree(rdev); 3678 } 3679 static const struct sysfs_ops rdev_sysfs_ops = { 3680 .show = rdev_attr_show, 3681 .store = rdev_attr_store, 3682 }; 3683 static const struct kobj_type rdev_ktype = { 3684 .release = rdev_free, 3685 .sysfs_ops = &rdev_sysfs_ops, 3686 .default_groups = rdev_default_groups, 3687 }; 3688 3689 int md_rdev_init(struct md_rdev *rdev) 3690 { 3691 rdev->desc_nr = -1; 3692 rdev->saved_raid_disk = -1; 3693 rdev->raid_disk = -1; 3694 rdev->flags = 0; 3695 rdev->data_offset = 0; 3696 rdev->new_data_offset = 0; 3697 rdev->sb_events = 0; 3698 rdev->last_read_error = 0; 3699 rdev->sb_loaded = 0; 3700 rdev->bb_page = NULL; 3701 atomic_set(&rdev->nr_pending, 0); 3702 atomic_set(&rdev->read_errors, 0); 3703 atomic_set(&rdev->corrected_errors, 0); 3704 3705 INIT_LIST_HEAD(&rdev->same_set); 3706 init_waitqueue_head(&rdev->blocked_wait); 3707 3708 /* Add space to store bad block list. 3709 * This reserves the space even on arrays where it cannot 3710 * be used - I wonder if that matters 3711 */ 3712 return badblocks_init(&rdev->badblocks, 0); 3713 } 3714 EXPORT_SYMBOL_GPL(md_rdev_init); 3715 3716 /* 3717 * Import a device. If 'super_format' >= 0, then sanity check the superblock 3718 * 3719 * mark the device faulty if: 3720 * 3721 * - the device is nonexistent (zero size) 3722 * - the device has no valid superblock 3723 * 3724 * a faulty rdev _never_ has rdev->sb set. 3725 */ 3726 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) 3727 { 3728 struct md_rdev *rdev; 3729 sector_t size; 3730 int err; 3731 3732 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 3733 if (!rdev) 3734 return ERR_PTR(-ENOMEM); 3735 3736 err = md_rdev_init(rdev); 3737 if (err) 3738 goto out_free_rdev; 3739 err = alloc_disk_sb(rdev); 3740 if (err) 3741 goto out_clear_rdev; 3742 3743 rdev->bdev_handle = bdev_open_by_dev(newdev, 3744 BLK_OPEN_READ | BLK_OPEN_WRITE, 3745 super_format == -2 ? 
&claim_rdev : rdev, NULL); 3746 if (IS_ERR(rdev->bdev_handle)) { 3747 pr_warn("md: could not open device unknown-block(%u,%u).\n", 3748 MAJOR(newdev), MINOR(newdev)); 3749 err = PTR_ERR(rdev->bdev_handle); 3750 goto out_clear_rdev; 3751 } 3752 rdev->bdev = rdev->bdev_handle->bdev; 3753 3754 kobject_init(&rdev->kobj, &rdev_ktype); 3755 3756 size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS; 3757 if (!size) { 3758 pr_warn("md: %pg has zero or unknown size, marking faulty!\n", 3759 rdev->bdev); 3760 err = -EINVAL; 3761 goto out_blkdev_put; 3762 } 3763 3764 if (super_format >= 0) { 3765 err = super_types[super_format]. 3766 load_super(rdev, NULL, super_minor); 3767 if (err == -EINVAL) { 3768 pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n", 3769 rdev->bdev, 3770 super_format, super_minor); 3771 goto out_blkdev_put; 3772 } 3773 if (err < 0) { 3774 pr_warn("md: could not read %pg's sb, not importing!\n", 3775 rdev->bdev); 3776 goto out_blkdev_put; 3777 } 3778 } 3779 3780 return rdev; 3781 3782 out_blkdev_put: 3783 bdev_release(rdev->bdev_handle); 3784 out_clear_rdev: 3785 md_rdev_clear(rdev); 3786 out_free_rdev: 3787 kfree(rdev); 3788 return ERR_PTR(err); 3789 } 3790 3791 /* 3792 * Check a full RAID array for plausibility 3793 */ 3794 3795 static int analyze_sbs(struct mddev *mddev) 3796 { 3797 int i; 3798 struct md_rdev *rdev, *freshest, *tmp; 3799 3800 freshest = NULL; 3801 rdev_for_each_safe(rdev, tmp, mddev) 3802 switch (super_types[mddev->major_version]. 3803 load_super(rdev, freshest, mddev->minor_version)) { 3804 case 1: 3805 freshest = rdev; 3806 break; 3807 case 0: 3808 break; 3809 default: 3810 pr_warn("md: fatal superblock inconsistency in %pg -- removing from array\n", 3811 rdev->bdev); 3812 md_kick_rdev_from_array(rdev); 3813 } 3814 3815 /* Cannot find a valid fresh disk */ 3816 if (!freshest) { 3817 pr_warn("md: cannot find a valid disk\n"); 3818 return -EINVAL; 3819 } 3820 3821 super_types[mddev->major_version]. 3822 validate_super(mddev, freshest); 3823 3824 i = 0; 3825 rdev_for_each_safe(rdev, tmp, mddev) { 3826 if (mddev->max_disks && 3827 (rdev->desc_nr >= mddev->max_disks || 3828 i > mddev->max_disks)) { 3829 pr_warn("md: %s: %pg: only %d devices permitted\n", 3830 mdname(mddev), rdev->bdev, 3831 mddev->max_disks); 3832 md_kick_rdev_from_array(rdev); 3833 continue; 3834 } 3835 if (rdev != freshest) { 3836 if (super_types[mddev->major_version]. 3837 validate_super(mddev, rdev)) { 3838 pr_warn("md: kicking non-fresh %pg from array!\n", 3839 rdev->bdev); 3840 md_kick_rdev_from_array(rdev); 3841 continue; 3842 } 3843 } 3844 if (mddev->level == LEVEL_MULTIPATH) { 3845 rdev->desc_nr = i++; 3846 rdev->raid_disk = rdev->desc_nr; 3847 set_bit(In_sync, &rdev->flags); 3848 } else if (rdev->raid_disk >= 3849 (mddev->raid_disks - min(0, mddev->delta_disks)) && 3850 !test_bit(Journal, &rdev->flags)) { 3851 rdev->raid_disk = -1; 3852 clear_bit(In_sync, &rdev->flags); 3853 } 3854 } 3855 3856 return 0; 3857 } 3858 3859 /* Read a fixed-point number. 3860 * Numbers in sysfs attributes should be in "standard" units where 3861 * possible, so time should be in seconds. 3862 * However we internally use a a much smaller unit such as 3863 * milliseconds or jiffies. 3864 * This function takes a decimal number with a possible fractional 3865 * component, and produces an integer which is the result of 3866 * multiplying that number by 10^'scale'. 3867 * all without any floating-point arithmetic. 
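*
* For example (worked through the loop below): "2.5" with scale == 3
* yields 2500 (2.5 seconds expressed as milliseconds), and "0.04" with
* scale == 3 yields 40.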
3868 */ 3869 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) 3870 { 3871 unsigned long result = 0; 3872 long decimals = -1; 3873 while (isdigit(*cp) || (*cp == '.' && decimals < 0)) { 3874 if (*cp == '.') 3875 decimals = 0; 3876 else if (decimals < scale) { 3877 unsigned int value; 3878 value = *cp - '0'; 3879 result = result * 10 + value; 3880 if (decimals >= 0) 3881 decimals++; 3882 } 3883 cp++; 3884 } 3885 if (*cp == '\n') 3886 cp++; 3887 if (*cp) 3888 return -EINVAL; 3889 if (decimals < 0) 3890 decimals = 0; 3891 *res = result * int_pow(10, scale - decimals); 3892 return 0; 3893 } 3894 3895 static ssize_t 3896 safe_delay_show(struct mddev *mddev, char *page) 3897 { 3898 unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ; 3899 3900 return sprintf(page, "%u.%03u\n", msec/1000, msec%1000); 3901 } 3902 static ssize_t 3903 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) 3904 { 3905 unsigned long msec; 3906 3907 if (mddev_is_clustered(mddev)) { 3908 pr_warn("md: Safemode is disabled for clustered mode\n"); 3909 return -EINVAL; 3910 } 3911 3912 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ) 3913 return -EINVAL; 3914 if (msec == 0) 3915 mddev->safemode_delay = 0; 3916 else { 3917 unsigned long old_delay = mddev->safemode_delay; 3918 unsigned long new_delay = (msec*HZ)/1000; 3919 3920 if (new_delay == 0) 3921 new_delay = 1; 3922 mddev->safemode_delay = new_delay; 3923 if (new_delay < old_delay || old_delay == 0) 3924 mod_timer(&mddev->safemode_timer, jiffies+1); 3925 } 3926 return len; 3927 } 3928 static struct md_sysfs_entry md_safe_delay = 3929 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 3930 3931 static ssize_t 3932 level_show(struct mddev *mddev, char *page) 3933 { 3934 struct md_personality *p; 3935 int ret; 3936 spin_lock(&mddev->lock); 3937 p = mddev->pers; 3938 if (p) 3939 ret = sprintf(page, "%s\n", p->name); 3940 else if (mddev->clevel[0]) 3941 ret = sprintf(page, "%s\n", mddev->clevel); 3942 else if (mddev->level != LEVEL_NONE) 3943 ret = sprintf(page, "%d\n", mddev->level); 3944 else 3945 ret = 0; 3946 spin_unlock(&mddev->lock); 3947 return ret; 3948 } 3949 3950 static ssize_t 3951 level_store(struct mddev *mddev, const char *buf, size_t len) 3952 { 3953 char clevel[16]; 3954 ssize_t rv; 3955 size_t slen = len; 3956 struct md_personality *pers, *oldpers; 3957 long level; 3958 void *priv, *oldpriv; 3959 struct md_rdev *rdev; 3960 3961 if (slen == 0 || slen >= sizeof(clevel)) 3962 return -EINVAL; 3963 3964 rv = mddev_suspend_and_lock(mddev); 3965 if (rv) 3966 return rv; 3967 3968 if (mddev->pers == NULL) { 3969 memcpy(mddev->clevel, buf, slen); 3970 if (mddev->clevel[slen-1] == '\n') 3971 slen--; 3972 mddev->clevel[slen] = 0; 3973 mddev->level = LEVEL_NONE; 3974 rv = len; 3975 goto out_unlock; 3976 } 3977 rv = -EROFS; 3978 if (!md_is_rdwr(mddev)) 3979 goto out_unlock; 3980 3981 /* request to change the personality. Need to ensure: 3982 * - array is not engaged in resync/recovery/reshape 3983 * - old personality can be suspended 3984 * - new personality will access other array. 
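 * A typical request is e.g. "echo raid6 > /sys/block/md0/md/level" on a
 * running raid5 array; it can only succeed if the new personality's
 * ->takeover() method accepts the current array geometry.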
3985 */ 3986 3987 rv = -EBUSY; 3988 if (mddev->sync_thread || 3989 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3990 mddev->reshape_position != MaxSector || 3991 mddev->sysfs_active) 3992 goto out_unlock; 3993 3994 rv = -EINVAL; 3995 if (!mddev->pers->quiesce) { 3996 pr_warn("md: %s: %s does not support online personality change\n", 3997 mdname(mddev), mddev->pers->name); 3998 goto out_unlock; 3999 } 4000 4001 /* Now find the new personality */ 4002 memcpy(clevel, buf, slen); 4003 if (clevel[slen-1] == '\n') 4004 slen--; 4005 clevel[slen] = 0; 4006 if (kstrtol(clevel, 10, &level)) 4007 level = LEVEL_NONE; 4008 4009 if (request_module("md-%s", clevel) != 0) 4010 request_module("md-level-%s", clevel); 4011 spin_lock(&pers_lock); 4012 pers = find_pers(level, clevel); 4013 if (!pers || !try_module_get(pers->owner)) { 4014 spin_unlock(&pers_lock); 4015 pr_warn("md: personality %s not loaded\n", clevel); 4016 rv = -EINVAL; 4017 goto out_unlock; 4018 } 4019 spin_unlock(&pers_lock); 4020 4021 if (pers == mddev->pers) { 4022 /* Nothing to do! */ 4023 module_put(pers->owner); 4024 rv = len; 4025 goto out_unlock; 4026 } 4027 if (!pers->takeover) { 4028 module_put(pers->owner); 4029 pr_warn("md: %s: %s does not support personality takeover\n", 4030 mdname(mddev), clevel); 4031 rv = -EINVAL; 4032 goto out_unlock; 4033 } 4034 4035 rdev_for_each(rdev, mddev) 4036 rdev->new_raid_disk = rdev->raid_disk; 4037 4038 /* ->takeover must set new_* and/or delta_disks 4039 * if it succeeds, and may set them when it fails. 4040 */ 4041 priv = pers->takeover(mddev); 4042 if (IS_ERR(priv)) { 4043 mddev->new_level = mddev->level; 4044 mddev->new_layout = mddev->layout; 4045 mddev->new_chunk_sectors = mddev->chunk_sectors; 4046 mddev->raid_disks -= mddev->delta_disks; 4047 mddev->delta_disks = 0; 4048 mddev->reshape_backwards = 0; 4049 module_put(pers->owner); 4050 pr_warn("md: %s: %s would not accept array\n", 4051 mdname(mddev), clevel); 4052 rv = PTR_ERR(priv); 4053 goto out_unlock; 4054 } 4055 4056 /* Looks like we have a winner */ 4057 mddev_detach(mddev); 4058 4059 spin_lock(&mddev->lock); 4060 oldpers = mddev->pers; 4061 oldpriv = mddev->private; 4062 mddev->pers = pers; 4063 mddev->private = priv; 4064 strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 4065 mddev->level = mddev->new_level; 4066 mddev->layout = mddev->new_layout; 4067 mddev->chunk_sectors = mddev->new_chunk_sectors; 4068 mddev->delta_disks = 0; 4069 mddev->reshape_backwards = 0; 4070 mddev->degraded = 0; 4071 spin_unlock(&mddev->lock); 4072 4073 if (oldpers->sync_request == NULL && 4074 mddev->external) { 4075 /* We are converting from a no-redundancy array 4076 * to a redundancy array and metadata is managed 4077 * externally so we need to be sure that writes 4078 * won't block due to a need to transition 4079 * clean->dirty 4080 * until external management is started. 
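 * (With externally managed metadata that transition is acknowledged from
 * user space, typically by mdmon writing to the array_state attribute, so
 * blocking on it here could stall writes indefinitely.)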
4081 */ 4082 mddev->in_sync = 0; 4083 mddev->safemode_delay = 0; 4084 mddev->safemode = 0; 4085 } 4086 4087 oldpers->free(mddev, oldpriv); 4088 4089 if (oldpers->sync_request == NULL && 4090 pers->sync_request != NULL) { 4091 /* need to add the md_redundancy_group */ 4092 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 4093 pr_warn("md: cannot register extra attributes for %s\n", 4094 mdname(mddev)); 4095 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 4096 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); 4097 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); 4098 } 4099 if (oldpers->sync_request != NULL && 4100 pers->sync_request == NULL) { 4101 /* need to remove the md_redundancy_group */ 4102 if (mddev->to_remove == NULL) 4103 mddev->to_remove = &md_redundancy_group; 4104 } 4105 4106 module_put(oldpers->owner); 4107 4108 rdev_for_each(rdev, mddev) { 4109 if (rdev->raid_disk < 0) 4110 continue; 4111 if (rdev->new_raid_disk >= mddev->raid_disks) 4112 rdev->new_raid_disk = -1; 4113 if (rdev->new_raid_disk == rdev->raid_disk) 4114 continue; 4115 sysfs_unlink_rdev(mddev, rdev); 4116 } 4117 rdev_for_each(rdev, mddev) { 4118 if (rdev->raid_disk < 0) 4119 continue; 4120 if (rdev->new_raid_disk == rdev->raid_disk) 4121 continue; 4122 rdev->raid_disk = rdev->new_raid_disk; 4123 if (rdev->raid_disk < 0) 4124 clear_bit(In_sync, &rdev->flags); 4125 else { 4126 if (sysfs_link_rdev(mddev, rdev)) 4127 pr_warn("md: cannot register rd%d for %s after level change\n", 4128 rdev->raid_disk, mdname(mddev)); 4129 } 4130 } 4131 4132 if (pers->sync_request == NULL) { 4133 /* this is now an array without redundancy, so 4134 * it must always be in_sync 4135 */ 4136 mddev->in_sync = 1; 4137 del_timer_sync(&mddev->safemode_timer); 4138 } 4139 blk_set_stacking_limits(&mddev->queue->limits); 4140 pers->run(mddev); 4141 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 4142 if (!mddev->thread) 4143 md_update_sb(mddev, 1); 4144 sysfs_notify_dirent_safe(mddev->sysfs_level); 4145 md_new_event(); 4146 rv = len; 4147 out_unlock: 4148 mddev_unlock_and_resume(mddev); 4149 return rv; 4150 } 4151 4152 static struct md_sysfs_entry md_level = 4153 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 4154 4155 static ssize_t 4156 layout_show(struct mddev *mddev, char *page) 4157 { 4158 /* just a number, not meaningful for all levels */ 4159 if (mddev->reshape_position != MaxSector && 4160 mddev->layout != mddev->new_layout) 4161 return sprintf(page, "%d (%d)\n", 4162 mddev->new_layout, mddev->layout); 4163 return sprintf(page, "%d\n", mddev->layout); 4164 } 4165 4166 static ssize_t 4167 layout_store(struct mddev *mddev, const char *buf, size_t len) 4168 { 4169 unsigned int n; 4170 int err; 4171 4172 err = kstrtouint(buf, 10, &n); 4173 if (err < 0) 4174 return err; 4175 err = mddev_lock(mddev); 4176 if (err) 4177 return err; 4178 4179 if (mddev->pers) { 4180 if (mddev->pers->check_reshape == NULL) 4181 err = -EBUSY; 4182 else if (!md_is_rdwr(mddev)) 4183 err = -EROFS; 4184 else { 4185 mddev->new_layout = n; 4186 err = mddev->pers->check_reshape(mddev); 4187 if (err) 4188 mddev->new_layout = mddev->layout; 4189 } 4190 } else { 4191 mddev->new_layout = n; 4192 if (mddev->reshape_position == MaxSector) 4193 mddev->layout = n; 4194 } 4195 mddev_unlock(mddev); 4196 return err ?: len; 4197 } 4198 static struct md_sysfs_entry md_layout = 4199 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 4200 4201 static ssize_t 4202 
raid_disks_show(struct mddev *mddev, char *page) 4203 { 4204 if (mddev->raid_disks == 0) 4205 return 0; 4206 if (mddev->reshape_position != MaxSector && 4207 mddev->delta_disks != 0) 4208 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 4209 mddev->raid_disks - mddev->delta_disks); 4210 return sprintf(page, "%d\n", mddev->raid_disks); 4211 } 4212 4213 static int update_raid_disks(struct mddev *mddev, int raid_disks); 4214 4215 static ssize_t 4216 raid_disks_store(struct mddev *mddev, const char *buf, size_t len) 4217 { 4218 unsigned int n; 4219 int err; 4220 4221 err = kstrtouint(buf, 10, &n); 4222 if (err < 0) 4223 return err; 4224 4225 err = mddev_lock(mddev); 4226 if (err) 4227 return err; 4228 if (mddev->pers) 4229 err = update_raid_disks(mddev, n); 4230 else if (mddev->reshape_position != MaxSector) { 4231 struct md_rdev *rdev; 4232 int olddisks = mddev->raid_disks - mddev->delta_disks; 4233 4234 err = -EINVAL; 4235 rdev_for_each(rdev, mddev) { 4236 if (olddisks < n && 4237 rdev->data_offset < rdev->new_data_offset) 4238 goto out_unlock; 4239 if (olddisks > n && 4240 rdev->data_offset > rdev->new_data_offset) 4241 goto out_unlock; 4242 } 4243 err = 0; 4244 mddev->delta_disks = n - olddisks; 4245 mddev->raid_disks = n; 4246 mddev->reshape_backwards = (mddev->delta_disks < 0); 4247 } else 4248 mddev->raid_disks = n; 4249 out_unlock: 4250 mddev_unlock(mddev); 4251 return err ? err : len; 4252 } 4253 static struct md_sysfs_entry md_raid_disks = 4254 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 4255 4256 static ssize_t 4257 uuid_show(struct mddev *mddev, char *page) 4258 { 4259 return sprintf(page, "%pU\n", mddev->uuid); 4260 } 4261 static struct md_sysfs_entry md_uuid = 4262 __ATTR(uuid, S_IRUGO, uuid_show, NULL); 4263 4264 static ssize_t 4265 chunk_size_show(struct mddev *mddev, char *page) 4266 { 4267 if (mddev->reshape_position != MaxSector && 4268 mddev->chunk_sectors != mddev->new_chunk_sectors) 4269 return sprintf(page, "%d (%d)\n", 4270 mddev->new_chunk_sectors << 9, 4271 mddev->chunk_sectors << 9); 4272 return sprintf(page, "%d\n", mddev->chunk_sectors << 9); 4273 } 4274 4275 static ssize_t 4276 chunk_size_store(struct mddev *mddev, const char *buf, size_t len) 4277 { 4278 unsigned long n; 4279 int err; 4280 4281 err = kstrtoul(buf, 10, &n); 4282 if (err < 0) 4283 return err; 4284 4285 err = mddev_lock(mddev); 4286 if (err) 4287 return err; 4288 if (mddev->pers) { 4289 if (mddev->pers->check_reshape == NULL) 4290 err = -EBUSY; 4291 else if (!md_is_rdwr(mddev)) 4292 err = -EROFS; 4293 else { 4294 mddev->new_chunk_sectors = n >> 9; 4295 err = mddev->pers->check_reshape(mddev); 4296 if (err) 4297 mddev->new_chunk_sectors = mddev->chunk_sectors; 4298 } 4299 } else { 4300 mddev->new_chunk_sectors = n >> 9; 4301 if (mddev->reshape_position == MaxSector) 4302 mddev->chunk_sectors = n >> 9; 4303 } 4304 mddev_unlock(mddev); 4305 return err ?: len; 4306 } 4307 static struct md_sysfs_entry md_chunk_size = 4308 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 4309 4310 static ssize_t 4311 resync_start_show(struct mddev *mddev, char *page) 4312 { 4313 if (mddev->recovery_cp == MaxSector) 4314 return sprintf(page, "none\n"); 4315 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 4316 } 4317 4318 static ssize_t 4319 resync_start_store(struct mddev *mddev, const char *buf, size_t len) 4320 { 4321 unsigned long long n; 4322 int err; 4323 4324 if (cmd_match(buf, "none")) 4325 n = MaxSector; 4326 else { 4327 err = 
kstrtoull(buf, 10, &n); 4328 if (err < 0) 4329 return err; 4330 if (n != (sector_t)n) 4331 return -EINVAL; 4332 } 4333 4334 err = mddev_lock(mddev); 4335 if (err) 4336 return err; 4337 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 4338 err = -EBUSY; 4339 4340 if (!err) { 4341 mddev->recovery_cp = n; 4342 if (mddev->pers) 4343 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 4344 } 4345 mddev_unlock(mddev); 4346 return err ?: len; 4347 } 4348 static struct md_sysfs_entry md_resync_start = 4349 __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, 4350 resync_start_show, resync_start_store); 4351 4352 /* 4353 * The array state can be: 4354 * 4355 * clear 4356 * No devices, no size, no level 4357 * Equivalent to STOP_ARRAY ioctl 4358 * inactive 4359 * May have some settings, but array is not active 4360 * all IO results in error 4361 * When written, doesn't tear down array, but just stops it 4362 * suspended (not supported yet) 4363 * All IO requests will block. The array can be reconfigured. 4364 * Writing this, if accepted, will block until array is quiescent 4365 * readonly 4366 * no resync can happen. no superblocks get written. 4367 * write requests fail 4368 * read-auto 4369 * like readonly, but behaves like 'clean' on a write request. 4370 * 4371 * clean - no pending writes, but otherwise active. 4372 * When written to inactive array, starts without resync 4373 * If a write request arrives then 4374 * if metadata is known, mark 'dirty' and switch to 'active'. 4375 * if not known, block and switch to write-pending 4376 * If written to an active array that has pending writes, then fails. 4377 * active 4378 * fully active: IO and resync can be happening. 4379 * When written to inactive array, starts with resync 4380 * 4381 * write-pending 4382 * clean, but writes are blocked waiting for 'active' to be written. 4383 * 4384 * active-idle 4385 * like active, but no writes have been seen for a while (100msec). 4386 * 4387 * broken 4388 * Array is failed. It's useful because mounted-arrays aren't stopped 4389 * when array is failed, so this state will at least alert the user that 4390 * something is wrong. 
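 *
 * These states are exposed through the 'array_state' sysfs attribute; for
 * example "echo inactive > /sys/block/md0/md/array_state" stops I/O on the
 * array without tearing it down.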
4391 */ 4392 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 4393 write_pending, active_idle, broken, bad_word}; 4394 static char *array_states[] = { 4395 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 4396 "write-pending", "active-idle", "broken", NULL }; 4397 4398 static int match_word(const char *word, char **list) 4399 { 4400 int n; 4401 for (n=0; list[n]; n++) 4402 if (cmd_match(word, list[n])) 4403 break; 4404 return n; 4405 } 4406 4407 static ssize_t 4408 array_state_show(struct mddev *mddev, char *page) 4409 { 4410 enum array_state st = inactive; 4411 4412 if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) { 4413 switch(mddev->ro) { 4414 case MD_RDONLY: 4415 st = readonly; 4416 break; 4417 case MD_AUTO_READ: 4418 st = read_auto; 4419 break; 4420 case MD_RDWR: 4421 spin_lock(&mddev->lock); 4422 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 4423 st = write_pending; 4424 else if (mddev->in_sync) 4425 st = clean; 4426 else if (mddev->safemode) 4427 st = active_idle; 4428 else 4429 st = active; 4430 spin_unlock(&mddev->lock); 4431 } 4432 4433 if (test_bit(MD_BROKEN, &mddev->flags) && st == clean) 4434 st = broken; 4435 } else { 4436 if (list_empty(&mddev->disks) && 4437 mddev->raid_disks == 0 && 4438 mddev->dev_sectors == 0) 4439 st = clear; 4440 else 4441 st = inactive; 4442 } 4443 return sprintf(page, "%s\n", array_states[st]); 4444 } 4445 4446 static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev); 4447 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev); 4448 static int restart_array(struct mddev *mddev); 4449 4450 static ssize_t 4451 array_state_store(struct mddev *mddev, const char *buf, size_t len) 4452 { 4453 int err = 0; 4454 enum array_state st = match_word(buf, array_states); 4455 4456 /* No lock dependent actions */ 4457 switch (st) { 4458 case suspended: /* not supported yet */ 4459 case write_pending: /* cannot be set */ 4460 case active_idle: /* cannot be set */ 4461 case broken: /* cannot be set */ 4462 case bad_word: 4463 return -EINVAL; 4464 default: 4465 break; 4466 } 4467 4468 if (mddev->pers && (st == active || st == clean) && 4469 mddev->ro != MD_RDONLY) { 4470 /* don't take reconfig_mutex when toggling between 4471 * clean and active 4472 */ 4473 spin_lock(&mddev->lock); 4474 if (st == active) { 4475 restart_array(mddev); 4476 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4477 md_wakeup_thread(mddev->thread); 4478 wake_up(&mddev->sb_wait); 4479 } else /* st == clean */ { 4480 restart_array(mddev); 4481 if (!set_in_sync(mddev)) 4482 err = -EBUSY; 4483 } 4484 if (!err) 4485 sysfs_notify_dirent_safe(mddev->sysfs_state); 4486 spin_unlock(&mddev->lock); 4487 return err ?: len; 4488 } 4489 err = mddev_lock(mddev); 4490 if (err) 4491 return err; 4492 4493 switch (st) { 4494 case inactive: 4495 /* stop an active array, return 0 otherwise */ 4496 if (mddev->pers) 4497 err = do_md_stop(mddev, 2, NULL); 4498 break; 4499 case clear: 4500 err = do_md_stop(mddev, 0, NULL); 4501 break; 4502 case readonly: 4503 if (mddev->pers) 4504 err = md_set_readonly(mddev, NULL); 4505 else { 4506 mddev->ro = MD_RDONLY; 4507 set_disk_ro(mddev->gendisk, 1); 4508 err = do_md_run(mddev); 4509 } 4510 break; 4511 case read_auto: 4512 if (mddev->pers) { 4513 if (md_is_rdwr(mddev)) 4514 err = md_set_readonly(mddev, NULL); 4515 else if (mddev->ro == MD_RDONLY) 4516 err = restart_array(mddev); 4517 if (err == 0) { 4518 mddev->ro = MD_AUTO_READ; 4519 
set_disk_ro(mddev->gendisk, 0); 4520 } 4521 } else { 4522 mddev->ro = MD_AUTO_READ; 4523 err = do_md_run(mddev); 4524 } 4525 break; 4526 case clean: 4527 if (mddev->pers) { 4528 err = restart_array(mddev); 4529 if (err) 4530 break; 4531 spin_lock(&mddev->lock); 4532 if (!set_in_sync(mddev)) 4533 err = -EBUSY; 4534 spin_unlock(&mddev->lock); 4535 } else 4536 err = -EINVAL; 4537 break; 4538 case active: 4539 if (mddev->pers) { 4540 err = restart_array(mddev); 4541 if (err) 4542 break; 4543 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4544 wake_up(&mddev->sb_wait); 4545 err = 0; 4546 } else { 4547 mddev->ro = MD_RDWR; 4548 set_disk_ro(mddev->gendisk, 0); 4549 err = do_md_run(mddev); 4550 } 4551 break; 4552 default: 4553 err = -EINVAL; 4554 break; 4555 } 4556 4557 if (!err) { 4558 if (mddev->hold_active == UNTIL_IOCTL) 4559 mddev->hold_active = 0; 4560 sysfs_notify_dirent_safe(mddev->sysfs_state); 4561 } 4562 mddev_unlock(mddev); 4563 return err ?: len; 4564 } 4565 static struct md_sysfs_entry md_array_state = 4566 __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 4567 4568 static ssize_t 4569 max_corrected_read_errors_show(struct mddev *mddev, char *page) { 4570 return sprintf(page, "%d\n", 4571 atomic_read(&mddev->max_corr_read_errors)); 4572 } 4573 4574 static ssize_t 4575 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) 4576 { 4577 unsigned int n; 4578 int rv; 4579 4580 rv = kstrtouint(buf, 10, &n); 4581 if (rv < 0) 4582 return rv; 4583 if (n > INT_MAX) 4584 return -EINVAL; 4585 atomic_set(&mddev->max_corr_read_errors, n); 4586 return len; 4587 } 4588 4589 static struct md_sysfs_entry max_corr_read_errors = 4590 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, 4591 max_corrected_read_errors_store); 4592 4593 static ssize_t 4594 null_show(struct mddev *mddev, char *page) 4595 { 4596 return -EINVAL; 4597 } 4598 4599 static ssize_t 4600 new_dev_store(struct mddev *mddev, const char *buf, size_t len) 4601 { 4602 /* buf must be %d:%d\n? giving major and minor numbers */ 4603 /* The new device is added to the array. 4604 * If the array has a persistent superblock, we read the 4605 * superblock to initialise info and check validity. 4606 * Otherwise, only checking done is that in bind_rdev_to_array, 4607 * which mainly checks size. 4608 */ 4609 char *e; 4610 int major = simple_strtoul(buf, &e, 10); 4611 int minor; 4612 dev_t dev; 4613 struct md_rdev *rdev; 4614 int err; 4615 4616 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 4617 return -EINVAL; 4618 minor = simple_strtoul(e+1, &e, 10); 4619 if (*e && *e != '\n') 4620 return -EINVAL; 4621 dev = MKDEV(major, minor); 4622 if (major != MAJOR(dev) || 4623 minor != MINOR(dev)) 4624 return -EOVERFLOW; 4625 4626 err = mddev_suspend_and_lock(mddev); 4627 if (err) 4628 return err; 4629 if (mddev->persistent) { 4630 rdev = md_import_device(dev, mddev->major_version, 4631 mddev->minor_version); 4632 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 4633 struct md_rdev *rdev0 4634 = list_entry(mddev->disks.next, 4635 struct md_rdev, same_set); 4636 err = super_types[mddev->major_version] 4637 .load_super(rdev, rdev0, mddev->minor_version); 4638 if (err < 0) 4639 goto out; 4640 } 4641 } else if (mddev->external) 4642 rdev = md_import_device(dev, -2, -1); 4643 else 4644 rdev = md_import_device(dev, -1, -1); 4645 4646 if (IS_ERR(rdev)) { 4647 mddev_unlock_and_resume(mddev); 4648 return PTR_ERR(rdev); 4649 } 4650 err = bind_rdev_to_array(rdev, mddev); 4651 out: 4652 if (err) 4653 export_rdev(rdev, mddev); 4654 mddev_unlock_and_resume(mddev); 4655 if (!err) 4656 md_new_event(); 4657 return err ? err : len; 4658 } 4659 4660 static struct md_sysfs_entry md_new_device = 4661 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 4662 4663 static ssize_t 4664 bitmap_store(struct mddev *mddev, const char *buf, size_t len) 4665 { 4666 char *end; 4667 unsigned long chunk, end_chunk; 4668 int err; 4669 4670 err = mddev_lock(mddev); 4671 if (err) 4672 return err; 4673 if (!mddev->bitmap) 4674 goto out; 4675 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ 4676 while (*buf) { 4677 chunk = end_chunk = simple_strtoul(buf, &end, 0); 4678 if (buf == end) break; 4679 if (*end == '-') { /* range */ 4680 buf = end + 1; 4681 end_chunk = simple_strtoul(buf, &end, 0); 4682 if (buf == end) break; 4683 } 4684 if (*end && !isspace(*end)) break; 4685 md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); 4686 buf = skip_spaces(end); 4687 } 4688 md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ 4689 out: 4690 mddev_unlock(mddev); 4691 return len; 4692 } 4693 4694 static struct md_sysfs_entry md_bitmap = 4695 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 4696 4697 static ssize_t 4698 size_show(struct mddev *mddev, char *page) 4699 { 4700 return sprintf(page, "%llu\n", 4701 (unsigned long long)mddev->dev_sectors / 2); 4702 } 4703 4704 static int update_size(struct mddev *mddev, sector_t num_sectors); 4705 4706 static ssize_t 4707 size_store(struct mddev *mddev, const char *buf, size_t len) 4708 { 4709 /* If array is inactive, we can reduce the component size, but 4710 * not increase it (except from 0). 4711 * If array is active, we can try an on-line resize 4712 */ 4713 sector_t sectors; 4714 int err = strict_blocks_to_sectors(buf, &sectors); 4715 4716 if (err < 0) 4717 return err; 4718 err = mddev_lock(mddev); 4719 if (err) 4720 return err; 4721 if (mddev->pers) { 4722 err = update_size(mddev, sectors); 4723 if (err == 0) 4724 md_update_sb(mddev, 1); 4725 } else { 4726 if (mddev->dev_sectors == 0 || 4727 mddev->dev_sectors > sectors) 4728 mddev->dev_sectors = sectors; 4729 else 4730 err = -ENOSPC; 4731 } 4732 mddev_unlock(mddev); 4733 return err ?
err : len; 4734 } 4735 4736 static struct md_sysfs_entry md_size = 4737 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 4738 4739 /* Metadata version. 4740 * This is one of 4741 * 'none' for arrays with no metadata (good luck...) 4742 * 'external' for arrays with externally managed metadata, 4743 * or N.M for internally known formats 4744 */ 4745 static ssize_t 4746 metadata_show(struct mddev *mddev, char *page) 4747 { 4748 if (mddev->persistent) 4749 return sprintf(page, "%d.%d\n", 4750 mddev->major_version, mddev->minor_version); 4751 else if (mddev->external) 4752 return sprintf(page, "external:%s\n", mddev->metadata_type); 4753 else 4754 return sprintf(page, "none\n"); 4755 } 4756 4757 static ssize_t 4758 metadata_store(struct mddev *mddev, const char *buf, size_t len) 4759 { 4760 int major, minor; 4761 char *e; 4762 int err; 4763 /* Changing the details of 'external' metadata is 4764 * always permitted. Otherwise there must be 4765 * no devices attached to the array. 4766 */ 4767 4768 err = mddev_lock(mddev); 4769 if (err) 4770 return err; 4771 err = -EBUSY; 4772 if (mddev->external && strncmp(buf, "external:", 9) == 0) 4773 ; 4774 else if (!list_empty(&mddev->disks)) 4775 goto out_unlock; 4776 4777 err = 0; 4778 if (cmd_match(buf, "none")) { 4779 mddev->persistent = 0; 4780 mddev->external = 0; 4781 mddev->major_version = 0; 4782 mddev->minor_version = 90; 4783 goto out_unlock; 4784 } 4785 if (strncmp(buf, "external:", 9) == 0) { 4786 size_t namelen = len-9; 4787 if (namelen >= sizeof(mddev->metadata_type)) 4788 namelen = sizeof(mddev->metadata_type)-1; 4789 memcpy(mddev->metadata_type, buf+9, namelen); 4790 mddev->metadata_type[namelen] = 0; 4791 if (namelen && mddev->metadata_type[namelen-1] == '\n') 4792 mddev->metadata_type[--namelen] = 0; 4793 mddev->persistent = 0; 4794 mddev->external = 1; 4795 mddev->major_version = 0; 4796 mddev->minor_version = 90; 4797 goto out_unlock; 4798 } 4799 major = simple_strtoul(buf, &e, 10); 4800 err = -EINVAL; 4801 if (e==buf || *e != '.') 4802 goto out_unlock; 4803 buf = e+1; 4804 minor = simple_strtoul(buf, &e, 10); 4805 if (e==buf || (*e && *e != '\n') ) 4806 goto out_unlock; 4807 err = -ENOENT; 4808 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 4809 goto out_unlock; 4810 mddev->major_version = major; 4811 mddev->minor_version = minor; 4812 mddev->persistent = 1; 4813 mddev->external = 0; 4814 err = 0; 4815 out_unlock: 4816 mddev_unlock(mddev); 4817 return err ?: len; 4818 } 4819 4820 static struct md_sysfs_entry md_metadata = 4821 __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 4822 4823 static ssize_t 4824 action_show(struct mddev *mddev, char *page) 4825 { 4826 char *type = "idle"; 4827 unsigned long recovery = mddev->recovery; 4828 if (test_bit(MD_RECOVERY_FROZEN, &recovery)) 4829 type = "frozen"; 4830 else if (test_bit(MD_RECOVERY_RUNNING, &recovery) || 4831 (md_is_rdwr(mddev) && test_bit(MD_RECOVERY_NEEDED, &recovery))) { 4832 if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) 4833 type = "reshape"; 4834 else if (test_bit(MD_RECOVERY_SYNC, &recovery)) { 4835 if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) 4836 type = "resync"; 4837 else if (test_bit(MD_RECOVERY_CHECK, &recovery)) 4838 type = "check"; 4839 else 4840 type = "repair"; 4841 } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) 4842 type = "recover"; 4843 else if (mddev->reshape_position != MaxSector) 4844 type = "reshape"; 4845 } 4846 return sprintf(page, "%s\n", type); 4847 } 4848 4849 /** 4850 * 
stop_sync_thread() - wait for sync_thread to stop if it's running. 4851 * @mddev: the array. 4852 * @locked: if set, reconfig_mutex will still be held after this function 4853 * returns; if not set, reconfig_mutex will be released after this 4854 * function returns. 4855 * @check_seq: if set, only wait for the current running sync_thread to stop; note 4856 * that a new sync_thread can still start. 4857 */ 4858 static void stop_sync_thread(struct mddev *mddev, bool locked, bool check_seq) 4859 { 4860 int sync_seq; 4861 4862 if (check_seq) 4863 sync_seq = atomic_read(&mddev->sync_seq); 4864 4865 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 4866 if (!locked) 4867 mddev_unlock(mddev); 4868 return; 4869 } 4870 4871 mddev_unlock(mddev); 4872 4873 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4874 /* 4875 * Thread might be blocked waiting for metadata update which will now 4876 * never happen 4877 */ 4878 md_wakeup_thread_directly(mddev->sync_thread); 4879 if (work_pending(&mddev->sync_work)) 4880 flush_work(&mddev->sync_work); 4881 4882 wait_event(resync_wait, 4883 !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 4884 (check_seq && sync_seq != atomic_read(&mddev->sync_seq))); 4885 4886 if (locked) 4887 mddev_lock_nointr(mddev); 4888 } 4889 4890 static void idle_sync_thread(struct mddev *mddev) 4891 { 4892 mutex_lock(&mddev->sync_mutex); 4893 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4894 4895 if (mddev_lock(mddev)) { 4896 mutex_unlock(&mddev->sync_mutex); 4897 return; 4898 } 4899 4900 stop_sync_thread(mddev, false, true); 4901 mutex_unlock(&mddev->sync_mutex); 4902 } 4903 4904 static void frozen_sync_thread(struct mddev *mddev) 4905 { 4906 mutex_lock(&mddev->sync_mutex); 4907 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4908 4909 if (mddev_lock(mddev)) { 4910 mutex_unlock(&mddev->sync_mutex); 4911 return; 4912 } 4913 4914 stop_sync_thread(mddev, false, false); 4915 mutex_unlock(&mddev->sync_mutex); 4916 } 4917 4918 static ssize_t 4919 action_store(struct mddev *mddev, const char *page, size_t len) 4920 { 4921 if (!mddev->pers || !mddev->pers->sync_request) 4922 return -EINVAL; 4923 4924 4925 if (cmd_match(page, "idle")) 4926 idle_sync_thread(mddev); 4927 else if (cmd_match(page, "frozen")) 4928 frozen_sync_thread(mddev); 4929 else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4930 return -EBUSY; 4931 else if (cmd_match(page, "resync")) 4932 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4933 else if (cmd_match(page, "recover")) { 4934 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4935 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 4936 } else if (cmd_match(page, "reshape")) { 4937 int err; 4938 if (mddev->pers->start_reshape == NULL) 4939 return -EINVAL; 4940 err = mddev_lock(mddev); 4941 if (!err) { 4942 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 4943 err = -EBUSY; 4944 } else if (mddev->reshape_position == MaxSector || 4945 mddev->pers->check_reshape == NULL || 4946 mddev->pers->check_reshape(mddev)) { 4947 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4948 err = mddev->pers->start_reshape(mddev); 4949 } else { 4950 /* 4951 * If reshape is still in progress, and 4952 * md_check_recovery() can continue to reshape, 4953 * don't restart reshape because data can be 4954 * corrupted for raid456.
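 * Just clear MD_RECOVERY_FROZEN below and let md_check_recovery()
 * pick the existing reshape back up.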
4955 */ 4956 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4957 } 4958 mddev_unlock(mddev); 4959 } 4960 if (err) 4961 return err; 4962 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 4963 } else { 4964 if (cmd_match(page, "check")) 4965 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4966 else if (!cmd_match(page, "repair")) 4967 return -EINVAL; 4968 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4969 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 4970 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4971 } 4972 if (mddev->ro == MD_AUTO_READ) { 4973 /* A write to sync_action is enough to justify 4974 * canceling read-auto mode 4975 */ 4976 flush_work(&mddev->sync_work); 4977 mddev->ro = MD_RDWR; 4978 md_wakeup_thread(mddev->sync_thread); 4979 } 4980 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4981 md_wakeup_thread(mddev->thread); 4982 sysfs_notify_dirent_safe(mddev->sysfs_action); 4983 return len; 4984 } 4985 4986 static struct md_sysfs_entry md_scan_mode = 4987 __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 4988 4989 static ssize_t 4990 last_sync_action_show(struct mddev *mddev, char *page) 4991 { 4992 return sprintf(page, "%s\n", mddev->last_sync_action); 4993 } 4994 4995 static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); 4996 4997 static ssize_t 4998 mismatch_cnt_show(struct mddev *mddev, char *page) 4999 { 5000 return sprintf(page, "%llu\n", 5001 (unsigned long long) 5002 atomic64_read(&mddev->resync_mismatches)); 5003 } 5004 5005 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 5006 5007 static ssize_t 5008 sync_min_show(struct mddev *mddev, char *page) 5009 { 5010 return sprintf(page, "%d (%s)\n", speed_min(mddev), 5011 mddev->sync_speed_min ? "local": "system"); 5012 } 5013 5014 static ssize_t 5015 sync_min_store(struct mddev *mddev, const char *buf, size_t len) 5016 { 5017 unsigned int min; 5018 int rv; 5019 5020 if (strncmp(buf, "system", 6)==0) { 5021 min = 0; 5022 } else { 5023 rv = kstrtouint(buf, 10, &min); 5024 if (rv < 0) 5025 return rv; 5026 if (min == 0) 5027 return -EINVAL; 5028 } 5029 mddev->sync_speed_min = min; 5030 return len; 5031 } 5032 5033 static struct md_sysfs_entry md_sync_min = 5034 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 5035 5036 static ssize_t 5037 sync_max_show(struct mddev *mddev, char *page) 5038 { 5039 return sprintf(page, "%d (%s)\n", speed_max(mddev), 5040 mddev->sync_speed_max ? 
"local": "system"); 5041 } 5042 5043 static ssize_t 5044 sync_max_store(struct mddev *mddev, const char *buf, size_t len) 5045 { 5046 unsigned int max; 5047 int rv; 5048 5049 if (strncmp(buf, "system", 6)==0) { 5050 max = 0; 5051 } else { 5052 rv = kstrtouint(buf, 10, &max); 5053 if (rv < 0) 5054 return rv; 5055 if (max == 0) 5056 return -EINVAL; 5057 } 5058 mddev->sync_speed_max = max; 5059 return len; 5060 } 5061 5062 static struct md_sysfs_entry md_sync_max = 5063 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 5064 5065 static ssize_t 5066 degraded_show(struct mddev *mddev, char *page) 5067 { 5068 return sprintf(page, "%d\n", mddev->degraded); 5069 } 5070 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 5071 5072 static ssize_t 5073 sync_force_parallel_show(struct mddev *mddev, char *page) 5074 { 5075 return sprintf(page, "%d\n", mddev->parallel_resync); 5076 } 5077 5078 static ssize_t 5079 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) 5080 { 5081 long n; 5082 5083 if (kstrtol(buf, 10, &n)) 5084 return -EINVAL; 5085 5086 if (n != 0 && n != 1) 5087 return -EINVAL; 5088 5089 mddev->parallel_resync = n; 5090 5091 if (mddev->sync_thread) 5092 wake_up(&resync_wait); 5093 5094 return len; 5095 } 5096 5097 /* force parallel resync, even with shared block devices */ 5098 static struct md_sysfs_entry md_sync_force_parallel = 5099 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 5100 sync_force_parallel_show, sync_force_parallel_store); 5101 5102 static ssize_t 5103 sync_speed_show(struct mddev *mddev, char *page) 5104 { 5105 unsigned long resync, dt, db; 5106 if (mddev->curr_resync == MD_RESYNC_NONE) 5107 return sprintf(page, "none\n"); 5108 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 5109 dt = (jiffies - mddev->resync_mark) / HZ; 5110 if (!dt) dt++; 5111 db = resync - mddev->resync_mark_cnt; 5112 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 5113 } 5114 5115 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 5116 5117 static ssize_t 5118 sync_completed_show(struct mddev *mddev, char *page) 5119 { 5120 unsigned long long max_sectors, resync; 5121 5122 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5123 return sprintf(page, "none\n"); 5124 5125 if (mddev->curr_resync == MD_RESYNC_YIELDED || 5126 mddev->curr_resync == MD_RESYNC_DELAYED) 5127 return sprintf(page, "delayed\n"); 5128 5129 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 5130 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5131 max_sectors = mddev->resync_max_sectors; 5132 else 5133 max_sectors = mddev->dev_sectors; 5134 5135 resync = mddev->curr_resync_completed; 5136 return sprintf(page, "%llu / %llu\n", resync, max_sectors); 5137 } 5138 5139 static struct md_sysfs_entry md_sync_completed = 5140 __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL); 5141 5142 static ssize_t 5143 min_sync_show(struct mddev *mddev, char *page) 5144 { 5145 return sprintf(page, "%llu\n", 5146 (unsigned long long)mddev->resync_min); 5147 } 5148 static ssize_t 5149 min_sync_store(struct mddev *mddev, const char *buf, size_t len) 5150 { 5151 unsigned long long min; 5152 int err; 5153 5154 if (kstrtoull(buf, 10, &min)) 5155 return -EINVAL; 5156 5157 spin_lock(&mddev->lock); 5158 err = -EINVAL; 5159 if (min > mddev->resync_max) 5160 goto out_unlock; 5161 5162 err = -EBUSY; 5163 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5164 goto out_unlock; 5165 5166 /* Round down to multiple of 4K for safety */ 5167 
mddev->resync_min = round_down(min, 8); 5168 err = 0; 5169 5170 out_unlock: 5171 spin_unlock(&mddev->lock); 5172 return err ?: len; 5173 } 5174 5175 static struct md_sysfs_entry md_min_sync = 5176 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 5177 5178 static ssize_t 5179 max_sync_show(struct mddev *mddev, char *page) 5180 { 5181 if (mddev->resync_max == MaxSector) 5182 return sprintf(page, "max\n"); 5183 else 5184 return sprintf(page, "%llu\n", 5185 (unsigned long long)mddev->resync_max); 5186 } 5187 static ssize_t 5188 max_sync_store(struct mddev *mddev, const char *buf, size_t len) 5189 { 5190 int err; 5191 spin_lock(&mddev->lock); 5192 if (strncmp(buf, "max", 3) == 0) 5193 mddev->resync_max = MaxSector; 5194 else { 5195 unsigned long long max; 5196 int chunk; 5197 5198 err = -EINVAL; 5199 if (kstrtoull(buf, 10, &max)) 5200 goto out_unlock; 5201 if (max < mddev->resync_min) 5202 goto out_unlock; 5203 5204 err = -EBUSY; 5205 if (max < mddev->resync_max && md_is_rdwr(mddev) && 5206 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5207 goto out_unlock; 5208 5209 /* Must be a multiple of chunk_size */ 5210 chunk = mddev->chunk_sectors; 5211 if (chunk) { 5212 sector_t temp = max; 5213 5214 err = -EINVAL; 5215 if (sector_div(temp, chunk)) 5216 goto out_unlock; 5217 } 5218 mddev->resync_max = max; 5219 } 5220 wake_up(&mddev->recovery_wait); 5221 err = 0; 5222 out_unlock: 5223 spin_unlock(&mddev->lock); 5224 return err ?: len; 5225 } 5226 5227 static struct md_sysfs_entry md_max_sync = 5228 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 5229 5230 static ssize_t 5231 suspend_lo_show(struct mddev *mddev, char *page) 5232 { 5233 return sprintf(page, "%llu\n", 5234 (unsigned long long)READ_ONCE(mddev->suspend_lo)); 5235 } 5236 5237 static ssize_t 5238 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) 5239 { 5240 unsigned long long new; 5241 int err; 5242 5243 err = kstrtoull(buf, 10, &new); 5244 if (err < 0) 5245 return err; 5246 if (new != (sector_t)new) 5247 return -EINVAL; 5248 5249 err = mddev_suspend(mddev, true); 5250 if (err) 5251 return err; 5252 5253 WRITE_ONCE(mddev->suspend_lo, new); 5254 mddev_resume(mddev); 5255 5256 return len; 5257 } 5258 static struct md_sysfs_entry md_suspend_lo = 5259 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 5260 5261 static ssize_t 5262 suspend_hi_show(struct mddev *mddev, char *page) 5263 { 5264 return sprintf(page, "%llu\n", 5265 (unsigned long long)READ_ONCE(mddev->suspend_hi)); 5266 } 5267 5268 static ssize_t 5269 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) 5270 { 5271 unsigned long long new; 5272 int err; 5273 5274 err = kstrtoull(buf, 10, &new); 5275 if (err < 0) 5276 return err; 5277 if (new != (sector_t)new) 5278 return -EINVAL; 5279 5280 err = mddev_suspend(mddev, true); 5281 if (err) 5282 return err; 5283 5284 WRITE_ONCE(mddev->suspend_hi, new); 5285 mddev_resume(mddev); 5286 5287 return len; 5288 } 5289 static struct md_sysfs_entry md_suspend_hi = 5290 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 5291 5292 static ssize_t 5293 reshape_position_show(struct mddev *mddev, char *page) 5294 { 5295 if (mddev->reshape_position != MaxSector) 5296 return sprintf(page, "%llu\n", 5297 (unsigned long long)mddev->reshape_position); 5298 strcpy(page, "none\n"); 5299 return 5; 5300 } 5301 5302 static ssize_t 5303 reshape_position_store(struct mddev *mddev, const char *buf, size_t len) 5304 { 5305 struct md_rdev *rdev; 5306 unsigned 
long long new; 5307 int err; 5308 5309 err = kstrtoull(buf, 10, &new); 5310 if (err < 0) 5311 return err; 5312 if (new != (sector_t)new) 5313 return -EINVAL; 5314 err = mddev_lock(mddev); 5315 if (err) 5316 return err; 5317 err = -EBUSY; 5318 if (mddev->pers) 5319 goto unlock; 5320 mddev->reshape_position = new; 5321 mddev->delta_disks = 0; 5322 mddev->reshape_backwards = 0; 5323 mddev->new_level = mddev->level; 5324 mddev->new_layout = mddev->layout; 5325 mddev->new_chunk_sectors = mddev->chunk_sectors; 5326 rdev_for_each(rdev, mddev) 5327 rdev->new_data_offset = rdev->data_offset; 5328 err = 0; 5329 unlock: 5330 mddev_unlock(mddev); 5331 return err ?: len; 5332 } 5333 5334 static struct md_sysfs_entry md_reshape_position = 5335 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 5336 reshape_position_store); 5337 5338 static ssize_t 5339 reshape_direction_show(struct mddev *mddev, char *page) 5340 { 5341 return sprintf(page, "%s\n", 5342 mddev->reshape_backwards ? "backwards" : "forwards"); 5343 } 5344 5345 static ssize_t 5346 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) 5347 { 5348 int backwards = 0; 5349 int err; 5350 5351 if (cmd_match(buf, "forwards")) 5352 backwards = 0; 5353 else if (cmd_match(buf, "backwards")) 5354 backwards = 1; 5355 else 5356 return -EINVAL; 5357 if (mddev->reshape_backwards == backwards) 5358 return len; 5359 5360 err = mddev_lock(mddev); 5361 if (err) 5362 return err; 5363 /* check if we are allowed to change */ 5364 if (mddev->delta_disks) 5365 err = -EBUSY; 5366 else if (mddev->persistent && 5367 mddev->major_version == 0) 5368 err = -EINVAL; 5369 else 5370 mddev->reshape_backwards = backwards; 5371 mddev_unlock(mddev); 5372 return err ?: len; 5373 } 5374 5375 static struct md_sysfs_entry md_reshape_direction = 5376 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, 5377 reshape_direction_store); 5378 5379 static ssize_t 5380 array_size_show(struct mddev *mddev, char *page) 5381 { 5382 if (mddev->external_size) 5383 return sprintf(page, "%llu\n", 5384 (unsigned long long)mddev->array_sectors/2); 5385 else 5386 return sprintf(page, "default\n"); 5387 } 5388 5389 static ssize_t 5390 array_size_store(struct mddev *mddev, const char *buf, size_t len) 5391 { 5392 sector_t sectors; 5393 int err; 5394 5395 err = mddev_lock(mddev); 5396 if (err) 5397 return err; 5398 5399 /* cluster raid doesn't support change array_sectors */ 5400 if (mddev_is_clustered(mddev)) { 5401 mddev_unlock(mddev); 5402 return -EINVAL; 5403 } 5404 5405 if (strncmp(buf, "default", 7) == 0) { 5406 if (mddev->pers) 5407 sectors = mddev->pers->size(mddev, 0, 0); 5408 else 5409 sectors = mddev->array_sectors; 5410 5411 mddev->external_size = 0; 5412 } else { 5413 if (strict_blocks_to_sectors(buf, &sectors) < 0) 5414 err = -EINVAL; 5415 else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) 5416 err = -E2BIG; 5417 else 5418 mddev->external_size = 1; 5419 } 5420 5421 if (!err) { 5422 mddev->array_sectors = sectors; 5423 if (mddev->pers) 5424 set_capacity_and_notify(mddev->gendisk, 5425 mddev->array_sectors); 5426 } 5427 mddev_unlock(mddev); 5428 return err ?: len; 5429 } 5430 5431 static struct md_sysfs_entry md_array_size = 5432 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, 5433 array_size_store); 5434 5435 static ssize_t 5436 consistency_policy_show(struct mddev *mddev, char *page) 5437 { 5438 int ret; 5439 5440 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 5441 ret = sprintf(page, "journal\n"); 5442 } else if
(test_bit(MD_HAS_PPL, &mddev->flags)) { 5443 ret = sprintf(page, "ppl\n"); 5444 } else if (mddev->bitmap) { 5445 ret = sprintf(page, "bitmap\n"); 5446 } else if (mddev->pers) { 5447 if (mddev->pers->sync_request) 5448 ret = sprintf(page, "resync\n"); 5449 else 5450 ret = sprintf(page, "none\n"); 5451 } else { 5452 ret = sprintf(page, "unknown\n"); 5453 } 5454 5455 return ret; 5456 } 5457 5458 static ssize_t 5459 consistency_policy_store(struct mddev *mddev, const char *buf, size_t len) 5460 { 5461 int err = 0; 5462 5463 if (mddev->pers) { 5464 if (mddev->pers->change_consistency_policy) 5465 err = mddev->pers->change_consistency_policy(mddev, buf); 5466 else 5467 err = -EBUSY; 5468 } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) { 5469 set_bit(MD_HAS_PPL, &mddev->flags); 5470 } else { 5471 err = -EINVAL; 5472 } 5473 5474 return err ? err : len; 5475 } 5476 5477 static struct md_sysfs_entry md_consistency_policy = 5478 __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show, 5479 consistency_policy_store); 5480 5481 static ssize_t fail_last_dev_show(struct mddev *mddev, char *page) 5482 { 5483 return sprintf(page, "%d\n", mddev->fail_last_dev); 5484 } 5485 5486 /* 5487 * Setting fail_last_dev to true to allow last device to be forcibly removed 5488 * from RAID1/RAID10. 5489 */ 5490 static ssize_t 5491 fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len) 5492 { 5493 int ret; 5494 bool value; 5495 5496 ret = kstrtobool(buf, &value); 5497 if (ret) 5498 return ret; 5499 5500 if (value != mddev->fail_last_dev) 5501 mddev->fail_last_dev = value; 5502 5503 return len; 5504 } 5505 static struct md_sysfs_entry md_fail_last_dev = 5506 __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, 5507 fail_last_dev_store); 5508 5509 static ssize_t serialize_policy_show(struct mddev *mddev, char *page) 5510 { 5511 if (mddev->pers == NULL || (mddev->pers->level != 1)) 5512 return sprintf(page, "n/a\n"); 5513 else 5514 return sprintf(page, "%d\n", mddev->serialize_policy); 5515 } 5516 5517 /* 5518 * Setting serialize_policy to true to enforce write IO is not reordered 5519 * for raid1. 
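 * e.g. "echo 1 > /sys/block/md0/md/serialize_policy" enables it by setting up
 * the serialization pool via mddev_create_serial_pool() below.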
5520 */ 5521 static ssize_t 5522 serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) 5523 { 5524 int err; 5525 bool value; 5526 5527 err = kstrtobool(buf, &value); 5528 if (err) 5529 return err; 5530 5531 if (value == mddev->serialize_policy) 5532 return len; 5533 5534 err = mddev_suspend_and_lock(mddev); 5535 if (err) 5536 return err; 5537 if (mddev->pers == NULL || (mddev->pers->level != 1)) { 5538 pr_err("md: serialize_policy is only effective for raid1\n"); 5539 err = -EINVAL; 5540 goto unlock; 5541 } 5542 5543 if (value) 5544 mddev_create_serial_pool(mddev, NULL); 5545 else 5546 mddev_destroy_serial_pool(mddev, NULL); 5547 mddev->serialize_policy = value; 5548 unlock: 5549 mddev_unlock_and_resume(mddev); 5550 return err ?: len; 5551 } 5552 5553 static struct md_sysfs_entry md_serialize_policy = 5554 __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, 5555 serialize_policy_store); 5556 5557 5558 static struct attribute *md_default_attrs[] = { 5559 &md_level.attr, 5560 &md_layout.attr, 5561 &md_raid_disks.attr, 5562 &md_uuid.attr, 5563 &md_chunk_size.attr, 5564 &md_size.attr, 5565 &md_resync_start.attr, 5566 &md_metadata.attr, 5567 &md_new_device.attr, 5568 &md_safe_delay.attr, 5569 &md_array_state.attr, 5570 &md_reshape_position.attr, 5571 &md_reshape_direction.attr, 5572 &md_array_size.attr, 5573 &max_corr_read_errors.attr, 5574 &md_consistency_policy.attr, 5575 &md_fail_last_dev.attr, 5576 &md_serialize_policy.attr, 5577 NULL, 5578 }; 5579 5580 static const struct attribute_group md_default_group = { 5581 .attrs = md_default_attrs, 5582 }; 5583 5584 static struct attribute *md_redundancy_attrs[] = { 5585 &md_scan_mode.attr, 5586 &md_last_scan_mode.attr, 5587 &md_mismatches.attr, 5588 &md_sync_min.attr, 5589 &md_sync_max.attr, 5590 &md_sync_speed.attr, 5591 &md_sync_force_parallel.attr, 5592 &md_sync_completed.attr, 5593 &md_min_sync.attr, 5594 &md_max_sync.attr, 5595 &md_suspend_lo.attr, 5596 &md_suspend_hi.attr, 5597 &md_bitmap.attr, 5598 &md_degraded.attr, 5599 NULL, 5600 }; 5601 static const struct attribute_group md_redundancy_group = { 5602 .name = NULL, 5603 .attrs = md_redundancy_attrs, 5604 }; 5605 5606 static const struct attribute_group *md_attr_groups[] = { 5607 &md_default_group, 5608 &md_bitmap_group, 5609 NULL, 5610 }; 5611 5612 static ssize_t 5613 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 5614 { 5615 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 5616 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 5617 ssize_t rv; 5618 5619 if (!entry->show) 5620 return -EIO; 5621 spin_lock(&all_mddevs_lock); 5622 if (!mddev_get(mddev)) { 5623 spin_unlock(&all_mddevs_lock); 5624 return -EBUSY; 5625 } 5626 spin_unlock(&all_mddevs_lock); 5627 5628 rv = entry->show(mddev, page); 5629 mddev_put(mddev); 5630 return rv; 5631 } 5632 5633 static ssize_t 5634 md_attr_store(struct kobject *kobj, struct attribute *attr, 5635 const char *page, size_t length) 5636 { 5637 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 5638 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 5639 ssize_t rv; 5640 5641 if (!entry->store) 5642 return -EIO; 5643 if (!capable(CAP_SYS_ADMIN)) 5644 return -EACCES; 5645 spin_lock(&all_mddevs_lock); 5646 if (!mddev_get(mddev)) { 5647 spin_unlock(&all_mddevs_lock); 5648 return -EBUSY; 5649 } 5650 spin_unlock(&all_mddevs_lock); 5651 rv = entry->store(mddev, page, length); 5652 mddev_put(mddev); 5653 return rv; 5654 } 5655 
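/*
 * kobject ->release callback for the per-array "md" kobject: drop the
 * cached sysfs dirents and release the gendisk that md_alloc() registered
 * it under.
 */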
5656 static void md_kobj_release(struct kobject *ko) 5657 { 5658 struct mddev *mddev = container_of(ko, struct mddev, kobj); 5659 5660 if (mddev->sysfs_state) 5661 sysfs_put(mddev->sysfs_state); 5662 if (mddev->sysfs_level) 5663 sysfs_put(mddev->sysfs_level); 5664 5665 del_gendisk(mddev->gendisk); 5666 put_disk(mddev->gendisk); 5667 } 5668 5669 static const struct sysfs_ops md_sysfs_ops = { 5670 .show = md_attr_show, 5671 .store = md_attr_store, 5672 }; 5673 static const struct kobj_type md_ktype = { 5674 .release = md_kobj_release, 5675 .sysfs_ops = &md_sysfs_ops, 5676 .default_groups = md_attr_groups, 5677 }; 5678 5679 int mdp_major = 0; 5680 5681 static void mddev_delayed_delete(struct work_struct *ws) 5682 { 5683 struct mddev *mddev = container_of(ws, struct mddev, del_work); 5684 5685 kobject_put(&mddev->kobj); 5686 } 5687 5688 struct mddev *md_alloc(dev_t dev, char *name) 5689 { 5690 /* 5691 * If dev is zero, name is the name of a device to allocate with 5692 * an arbitrary minor number. It will be "md_???" 5693 * If dev is non-zero it must be a device number with a MAJOR of 5694 * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then 5695 * the device is being created by opening a node in /dev. 5696 * If "name" is not NULL, the device is being created by 5697 * writing to /sys/module/md_mod/parameters/new_array. 5698 */ 5699 static DEFINE_MUTEX(disks_mutex); 5700 struct mddev *mddev; 5701 struct gendisk *disk; 5702 int partitioned; 5703 int shift; 5704 int unit; 5705 int error ; 5706 5707 /* 5708 * Wait for any previous instance of this device to be completely 5709 * removed (mddev_delayed_delete). 5710 */ 5711 flush_workqueue(md_misc_wq); 5712 5713 mutex_lock(&disks_mutex); 5714 mddev = mddev_alloc(dev); 5715 if (IS_ERR(mddev)) { 5716 error = PTR_ERR(mddev); 5717 goto out_unlock; 5718 } 5719 5720 partitioned = (MAJOR(mddev->unit) != MD_MAJOR); 5721 shift = partitioned ? MdpMinorShift : 0; 5722 unit = MINOR(mddev->unit) >> shift; 5723 5724 if (name && !dev) { 5725 /* Need to ensure that 'name' is not a duplicate. 5726 */ 5727 struct mddev *mddev2; 5728 spin_lock(&all_mddevs_lock); 5729 5730 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 5731 if (mddev2->gendisk && 5732 strcmp(mddev2->gendisk->disk_name, name) == 0) { 5733 spin_unlock(&all_mddevs_lock); 5734 error = -EEXIST; 5735 goto out_free_mddev; 5736 } 5737 spin_unlock(&all_mddevs_lock); 5738 } 5739 if (name && dev) 5740 /* 5741 * Creating /dev/mdNNN via "newarray", so adjust hold_active. 5742 */ 5743 mddev->hold_active = UNTIL_STOP; 5744 5745 error = -ENOMEM; 5746 disk = blk_alloc_disk(NUMA_NO_NODE); 5747 if (!disk) 5748 goto out_free_mddev; 5749 5750 disk->major = MAJOR(mddev->unit); 5751 disk->first_minor = unit << shift; 5752 disk->minors = 1 << shift; 5753 if (name) 5754 strcpy(disk->disk_name, name); 5755 else if (partitioned) 5756 sprintf(disk->disk_name, "md_d%d", unit); 5757 else 5758 sprintf(disk->disk_name, "md%d", unit); 5759 disk->fops = &md_fops; 5760 disk->private_data = mddev; 5761 5762 mddev->queue = disk->queue; 5763 blk_set_stacking_limits(&mddev->queue->limits); 5764 blk_queue_write_cache(mddev->queue, true, true); 5765 disk->events |= DISK_EVENT_MEDIA_CHANGE; 5766 mddev->gendisk = disk; 5767 error = add_disk(disk); 5768 if (error) 5769 goto out_put_disk; 5770 5771 kobject_init(&mddev->kobj, &md_ktype); 5772 error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); 5773 if (error) { 5774 /* 5775 * The disk is already live at this point. 
Clear the hold flag 5776 * and let mddev_put take care of the deletion, as it isn't any 5777 * different from a normal close on last release now. 5778 */ 5779 mddev->hold_active = 0; 5780 mutex_unlock(&disks_mutex); 5781 mddev_put(mddev); 5782 return ERR_PTR(error); 5783 } 5784 5785 kobject_uevent(&mddev->kobj, KOBJ_ADD); 5786 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); 5787 mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); 5788 mutex_unlock(&disks_mutex); 5789 return mddev; 5790 5791 out_put_disk: 5792 put_disk(disk); 5793 out_free_mddev: 5794 mddev_free(mddev); 5795 out_unlock: 5796 mutex_unlock(&disks_mutex); 5797 return ERR_PTR(error); 5798 } 5799 5800 static int md_alloc_and_put(dev_t dev, char *name) 5801 { 5802 struct mddev *mddev = md_alloc(dev, name); 5803 5804 if (IS_ERR(mddev)) 5805 return PTR_ERR(mddev); 5806 mddev_put(mddev); 5807 return 0; 5808 } 5809 5810 static void md_probe(dev_t dev) 5811 { 5812 if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512) 5813 return; 5814 if (create_on_open) 5815 md_alloc_and_put(dev, NULL); 5816 } 5817 5818 static int add_named_array(const char *val, const struct kernel_param *kp) 5819 { 5820 /* 5821 * val must be "md_*" or "mdNNN". 5822 * For "md_*" we allocate an array with a large free minor number, and 5823 * set the name to val. val must not already be an active name. 5824 * For "mdNNN" we allocate an array with the minor number NNN 5825 * which must not already be in use. 5826 */ 5827 int len = strlen(val); 5828 char buf[DISK_NAME_LEN]; 5829 unsigned long devnum; 5830 5831 while (len && val[len-1] == '\n') 5832 len--; 5833 if (len >= DISK_NAME_LEN) 5834 return -E2BIG; 5835 strscpy(buf, val, len+1); 5836 if (strncmp(buf, "md_", 3) == 0) 5837 return md_alloc_and_put(0, buf); 5838 if (strncmp(buf, "md", 2) == 0 && 5839 isdigit(buf[2]) && 5840 kstrtoul(buf+2, 10, &devnum) == 0 && 5841 devnum <= MINORMASK) 5842 return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL); 5843 5844 return -EINVAL; 5845 } 5846 5847 static void md_safemode_timeout(struct timer_list *t) 5848 { 5849 struct mddev *mddev = from_timer(mddev, t, safemode_timer); 5850 5851 mddev->safemode = 1; 5852 if (mddev->external) 5853 sysfs_notify_dirent_safe(mddev->sysfs_state); 5854 5855 md_wakeup_thread(mddev->thread); 5856 } 5857 5858 static int start_dirty_degraded; 5859 5860 int md_run(struct mddev *mddev) 5861 { 5862 int err; 5863 struct md_rdev *rdev; 5864 struct md_personality *pers; 5865 bool nowait = true; 5866 5867 if (list_empty(&mddev->disks)) 5868 /* cannot run an array with no devices.. */ 5869 return -EINVAL; 5870 5871 if (mddev->pers) 5872 return -EBUSY; 5873 /* Cannot run until previous stop completes properly */ 5874 if (mddev->sysfs_active) 5875 return -EBUSY; 5876 5877 /* 5878 * Analyze all RAID superblock(s) 5879 */ 5880 if (!mddev->raid_disks) { 5881 if (!mddev->persistent) 5882 return -EINVAL; 5883 err = analyze_sbs(mddev); 5884 if (err) 5885 return -EINVAL; 5886 } 5887 5888 if (mddev->level != LEVEL_NONE) 5889 request_module("md-level-%d", mddev->level); 5890 else if (mddev->clevel[0]) 5891 request_module("md-%s", mddev->clevel); 5892 5893 /* 5894 * Drop all container device buffers, from now on 5895 * the only valid external interface is through the md 5896 * device. 
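 * (sync_blockdev() and invalidate_bdev() below flush out and drop any page
 * cache pages that were populated through the member device nodes.)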
5897 */ 5898 mddev->has_superblocks = false; 5899 rdev_for_each(rdev, mddev) { 5900 if (test_bit(Faulty, &rdev->flags)) 5901 continue; 5902 sync_blockdev(rdev->bdev); 5903 invalidate_bdev(rdev->bdev); 5904 if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) { 5905 mddev->ro = MD_RDONLY; 5906 if (mddev->gendisk) 5907 set_disk_ro(mddev->gendisk, 1); 5908 } 5909 5910 if (rdev->sb_page) 5911 mddev->has_superblocks = true; 5912 5913 /* perform some consistency tests on the device. 5914 * We don't want the data to overlap the metadata, 5915 * Internal Bitmap issues have been handled elsewhere. 5916 */ 5917 if (rdev->meta_bdev) { 5918 /* Nothing to check */; 5919 } else if (rdev->data_offset < rdev->sb_start) { 5920 if (mddev->dev_sectors && 5921 rdev->data_offset + mddev->dev_sectors 5922 > rdev->sb_start) { 5923 pr_warn("md: %s: data overlaps metadata\n", 5924 mdname(mddev)); 5925 return -EINVAL; 5926 } 5927 } else { 5928 if (rdev->sb_start + rdev->sb_size/512 5929 > rdev->data_offset) { 5930 pr_warn("md: %s: metadata overlaps data\n", 5931 mdname(mddev)); 5932 return -EINVAL; 5933 } 5934 } 5935 sysfs_notify_dirent_safe(rdev->sysfs_state); 5936 nowait = nowait && bdev_nowait(rdev->bdev); 5937 } 5938 5939 if (!bioset_initialized(&mddev->bio_set)) { 5940 err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 5941 if (err) 5942 return err; 5943 } 5944 if (!bioset_initialized(&mddev->sync_set)) { 5945 err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 5946 if (err) 5947 goto exit_bio_set; 5948 } 5949 5950 if (!bioset_initialized(&mddev->io_clone_set)) { 5951 err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE, 5952 offsetof(struct md_io_clone, bio_clone), 0); 5953 if (err) 5954 goto exit_sync_set; 5955 } 5956 5957 spin_lock(&pers_lock); 5958 pers = find_pers(mddev->level, mddev->clevel); 5959 if (!pers || !try_module_get(pers->owner)) { 5960 spin_unlock(&pers_lock); 5961 if (mddev->level != LEVEL_NONE) 5962 pr_warn("md: personality for level %d is not loaded!\n", 5963 mddev->level); 5964 else 5965 pr_warn("md: personality for level %s is not loaded!\n", 5966 mddev->clevel); 5967 err = -EINVAL; 5968 goto abort; 5969 } 5970 spin_unlock(&pers_lock); 5971 if (mddev->level != pers->level) { 5972 mddev->level = pers->level; 5973 mddev->new_level = pers->level; 5974 } 5975 strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 5976 5977 if (mddev->reshape_position != MaxSector && 5978 pers->start_reshape == NULL) { 5979 /* This personality cannot handle reshaping... */ 5980 module_put(pers->owner); 5981 err = -EINVAL; 5982 goto abort; 5983 } 5984 5985 if (pers->sync_request) { 5986 /* Warn if this is a potentially silly 5987 * configuration. 
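 * "Silly" here means two array members sitting on the same underlying
 * gendisk (e.g. two partitions of one drive): the nested loop below
 * compares bd_disk pointers, and the rdev < rdev2 ordering keeps each
 * offending pair from being reported twice.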
5988 */ 5989 struct md_rdev *rdev2; 5990 int warned = 0; 5991 5992 rdev_for_each(rdev, mddev) 5993 rdev_for_each(rdev2, mddev) { 5994 if (rdev < rdev2 && 5995 rdev->bdev->bd_disk == 5996 rdev2->bdev->bd_disk) { 5997 pr_warn("%s: WARNING: %pg appears to be on the same physical disk as %pg.\n", 5998 mdname(mddev), 5999 rdev->bdev, 6000 rdev2->bdev); 6001 warned = 1; 6002 } 6003 } 6004 6005 if (warned) 6006 pr_warn("True protection against single-disk failure might be compromised.\n"); 6007 } 6008 6009 mddev->recovery = 0; 6010 /* may be over-ridden by personality */ 6011 mddev->resync_max_sectors = mddev->dev_sectors; 6012 6013 mddev->ok_start_degraded = start_dirty_degraded; 6014 6015 if (start_readonly && md_is_rdwr(mddev)) 6016 mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */ 6017 6018 err = pers->run(mddev); 6019 if (err) 6020 pr_warn("md: pers->run() failed ...\n"); 6021 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { 6022 WARN_ONCE(!mddev->external_size, 6023 "%s: default size too small, but 'external_size' not in effect?\n", 6024 __func__); 6025 pr_warn("md: invalid array_size %llu > default size %llu\n", 6026 (unsigned long long)mddev->array_sectors / 2, 6027 (unsigned long long)pers->size(mddev, 0, 0) / 2); 6028 err = -EINVAL; 6029 } 6030 if (err == 0 && pers->sync_request && 6031 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { 6032 struct bitmap *bitmap; 6033 6034 bitmap = md_bitmap_create(mddev, -1); 6035 if (IS_ERR(bitmap)) { 6036 err = PTR_ERR(bitmap); 6037 pr_warn("%s: failed to create bitmap (%d)\n", 6038 mdname(mddev), err); 6039 } else 6040 mddev->bitmap = bitmap; 6041 6042 } 6043 if (err) 6044 goto bitmap_abort; 6045 6046 if (mddev->bitmap_info.max_write_behind > 0) { 6047 bool create_pool = false; 6048 6049 rdev_for_each(rdev, mddev) { 6050 if (test_bit(WriteMostly, &rdev->flags) && 6051 rdev_init_serial(rdev)) 6052 create_pool = true; 6053 } 6054 if (create_pool && mddev->serial_info_pool == NULL) { 6055 mddev->serial_info_pool = 6056 mempool_create_kmalloc_pool(NR_SERIAL_INFOS, 6057 sizeof(struct serial_info)); 6058 if (!mddev->serial_info_pool) { 6059 err = -ENOMEM; 6060 goto bitmap_abort; 6061 } 6062 } 6063 } 6064 6065 if (mddev->queue) { 6066 bool nonrot = true; 6067 6068 rdev_for_each(rdev, mddev) { 6069 if (rdev->raid_disk >= 0 && !bdev_nonrot(rdev->bdev)) { 6070 nonrot = false; 6071 break; 6072 } 6073 } 6074 if (mddev->degraded) 6075 nonrot = false; 6076 if (nonrot) 6077 blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue); 6078 else 6079 blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue); 6080 blk_queue_flag_set(QUEUE_FLAG_IO_STAT, mddev->queue); 6081 6082 /* Set the NOWAIT flags if all underlying devices support it */ 6083 if (nowait) 6084 blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue); 6085 } 6086 if (pers->sync_request) { 6087 if (mddev->kobj.sd && 6088 sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 6089 pr_warn("md: cannot register extra attributes for %s\n", 6090 mdname(mddev)); 6091 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); 6092 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); 6093 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); 6094 } else if (mddev->ro == MD_AUTO_READ) 6095 mddev->ro = MD_RDWR; 6096 6097 atomic_set(&mddev->max_corr_read_errors, 6098 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); 6099 mddev->safemode = 0; 6100 if (mddev_is_clustered(mddev)) 6101 mddev->safemode_delay = 0; 6102 else 6103 
mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; 6104 mddev->in_sync = 1; 6105 smp_wmb(); 6106 spin_lock(&mddev->lock); 6107 mddev->pers = pers; 6108 spin_unlock(&mddev->lock); 6109 rdev_for_each(rdev, mddev) 6110 if (rdev->raid_disk >= 0) 6111 sysfs_link_rdev(mddev, rdev); /* failure here is OK */ 6112 6113 if (mddev->degraded && md_is_rdwr(mddev)) 6114 /* This ensures that recovering status is reported immediately 6115 * via sysfs - until a lack of spares is confirmed. 6116 */ 6117 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6118 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6119 6120 if (mddev->sb_flags) 6121 md_update_sb(mddev, 0); 6122 6123 md_new_event(); 6124 return 0; 6125 6126 bitmap_abort: 6127 mddev_detach(mddev); 6128 if (mddev->private) 6129 pers->free(mddev, mddev->private); 6130 mddev->private = NULL; 6131 module_put(pers->owner); 6132 md_bitmap_destroy(mddev); 6133 abort: 6134 bioset_exit(&mddev->io_clone_set); 6135 exit_sync_set: 6136 bioset_exit(&mddev->sync_set); 6137 exit_bio_set: 6138 bioset_exit(&mddev->bio_set); 6139 return err; 6140 } 6141 EXPORT_SYMBOL_GPL(md_run); 6142 6143 int do_md_run(struct mddev *mddev) 6144 { 6145 int err; 6146 6147 set_bit(MD_NOT_READY, &mddev->flags); 6148 err = md_run(mddev); 6149 if (err) 6150 goto out; 6151 err = md_bitmap_load(mddev); 6152 if (err) { 6153 md_bitmap_destroy(mddev); 6154 goto out; 6155 } 6156 6157 if (mddev_is_clustered(mddev)) 6158 md_allow_write(mddev); 6159 6160 /* run start up tasks that require md_thread */ 6161 md_start(mddev); 6162 6163 md_wakeup_thread(mddev->thread); 6164 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 6165 6166 set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); 6167 clear_bit(MD_NOT_READY, &mddev->flags); 6168 mddev->changed = 1; 6169 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 6170 sysfs_notify_dirent_safe(mddev->sysfs_state); 6171 sysfs_notify_dirent_safe(mddev->sysfs_action); 6172 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 6173 out: 6174 clear_bit(MD_NOT_READY, &mddev->flags); 6175 return err; 6176 } 6177 6178 int md_start(struct mddev *mddev) 6179 { 6180 int ret = 0; 6181 6182 if (mddev->pers->start) { 6183 set_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6184 md_wakeup_thread(mddev->thread); 6185 ret = mddev->pers->start(mddev); 6186 clear_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6187 md_wakeup_thread(mddev->sync_thread); 6188 } 6189 return ret; 6190 } 6191 EXPORT_SYMBOL_GPL(md_start); 6192 6193 static int restart_array(struct mddev *mddev) 6194 { 6195 struct gendisk *disk = mddev->gendisk; 6196 struct md_rdev *rdev; 6197 bool has_journal = false; 6198 bool has_readonly = false; 6199 6200 /* Complain if it has no devices */ 6201 if (list_empty(&mddev->disks)) 6202 return -ENXIO; 6203 if (!mddev->pers) 6204 return -EINVAL; 6205 if (md_is_rdwr(mddev)) 6206 return -EBUSY; 6207 6208 rcu_read_lock(); 6209 rdev_for_each_rcu(rdev, mddev) { 6210 if (test_bit(Journal, &rdev->flags) && 6211 !test_bit(Faulty, &rdev->flags)) 6212 has_journal = true; 6213 if (rdev_read_only(rdev)) 6214 has_readonly = true; 6215 } 6216 rcu_read_unlock(); 6217 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal) 6218 /* Don't restart rw with journal missing/faulty */ 6219 return -EINVAL; 6220 if (has_readonly) 6221 return -EROFS; 6222 6223 mddev->safemode = 0; 6224 mddev->ro = MD_RDWR; 6225 set_disk_ro(disk, 0); 6226 pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); 6227 /* Kick recovery or resync if necessary */ 6228 
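	/*
	 * Raising MD_RECOVERY_NEEDED and waking both the md thread and the
	 * sync thread lets any recovery or resync that was parked while the
	 * array was read-only pick up again.
	 */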
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6229 md_wakeup_thread(mddev->thread); 6230 md_wakeup_thread(mddev->sync_thread); 6231 sysfs_notify_dirent_safe(mddev->sysfs_state); 6232 return 0; 6233 } 6234 6235 static void md_clean(struct mddev *mddev) 6236 { 6237 mddev->array_sectors = 0; 6238 mddev->external_size = 0; 6239 mddev->dev_sectors = 0; 6240 mddev->raid_disks = 0; 6241 mddev->recovery_cp = 0; 6242 mddev->resync_min = 0; 6243 mddev->resync_max = MaxSector; 6244 mddev->reshape_position = MaxSector; 6245 /* we still need mddev->external in export_rdev, do not clear it yet */ 6246 mddev->persistent = 0; 6247 mddev->level = LEVEL_NONE; 6248 mddev->clevel[0] = 0; 6249 mddev->flags = 0; 6250 mddev->sb_flags = 0; 6251 mddev->ro = MD_RDWR; 6252 mddev->metadata_type[0] = 0; 6253 mddev->chunk_sectors = 0; 6254 mddev->ctime = mddev->utime = 0; 6255 mddev->layout = 0; 6256 mddev->max_disks = 0; 6257 mddev->events = 0; 6258 mddev->can_decrease_events = 0; 6259 mddev->delta_disks = 0; 6260 mddev->reshape_backwards = 0; 6261 mddev->new_level = LEVEL_NONE; 6262 mddev->new_layout = 0; 6263 mddev->new_chunk_sectors = 0; 6264 mddev->curr_resync = MD_RESYNC_NONE; 6265 atomic64_set(&mddev->resync_mismatches, 0); 6266 mddev->suspend_lo = mddev->suspend_hi = 0; 6267 mddev->sync_speed_min = mddev->sync_speed_max = 0; 6268 mddev->recovery = 0; 6269 mddev->in_sync = 0; 6270 mddev->changed = 0; 6271 mddev->degraded = 0; 6272 mddev->safemode = 0; 6273 mddev->private = NULL; 6274 mddev->cluster_info = NULL; 6275 mddev->bitmap_info.offset = 0; 6276 mddev->bitmap_info.default_offset = 0; 6277 mddev->bitmap_info.default_space = 0; 6278 mddev->bitmap_info.chunksize = 0; 6279 mddev->bitmap_info.daemon_sleep = 0; 6280 mddev->bitmap_info.max_write_behind = 0; 6281 mddev->bitmap_info.nodes = 0; 6282 } 6283 6284 static void __md_stop_writes(struct mddev *mddev) 6285 { 6286 stop_sync_thread(mddev, true, false); 6287 del_timer_sync(&mddev->safemode_timer); 6288 6289 if (mddev->pers && mddev->pers->quiesce) { 6290 mddev->pers->quiesce(mddev, 1); 6291 mddev->pers->quiesce(mddev, 0); 6292 } 6293 md_bitmap_flush(mddev); 6294 6295 if (md_is_rdwr(mddev) && 6296 ((!mddev->in_sync && !mddev_is_clustered(mddev)) || 6297 mddev->sb_flags)) { 6298 /* mark array as shutdown cleanly */ 6299 if (!mddev_is_clustered(mddev)) 6300 mddev->in_sync = 1; 6301 md_update_sb(mddev, 1); 6302 } 6303 /* disable policy to guarantee rdevs free resources for serialization */ 6304 mddev->serialize_policy = 0; 6305 mddev_destroy_serial_pool(mddev, NULL); 6306 } 6307 6308 void md_stop_writes(struct mddev *mddev) 6309 { 6310 mddev_lock_nointr(mddev); 6311 __md_stop_writes(mddev); 6312 mddev_unlock(mddev); 6313 } 6314 EXPORT_SYMBOL_GPL(md_stop_writes); 6315 6316 static void mddev_detach(struct mddev *mddev) 6317 { 6318 md_bitmap_wait_behind_writes(mddev); 6319 if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { 6320 mddev->pers->quiesce(mddev, 1); 6321 mddev->pers->quiesce(mddev, 0); 6322 } 6323 md_unregister_thread(mddev, &mddev->thread); 6324 if (mddev->queue) 6325 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 6326 } 6327 6328 static void __md_stop(struct mddev *mddev) 6329 { 6330 struct md_personality *pers = mddev->pers; 6331 md_bitmap_destroy(mddev); 6332 mddev_detach(mddev); 6333 /* Ensure ->event_work is done */ 6334 if (mddev->event_work.func) 6335 flush_workqueue(md_misc_wq); 6336 spin_lock(&mddev->lock); 6337 mddev->pers = NULL; 6338 spin_unlock(&mddev->lock); 6339 if (mddev->private) 6340 
pers->free(mddev, mddev->private); 6341 mddev->private = NULL; 6342 if (pers->sync_request && mddev->to_remove == NULL) 6343 mddev->to_remove = &md_redundancy_group; 6344 module_put(pers->owner); 6345 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6346 6347 bioset_exit(&mddev->bio_set); 6348 bioset_exit(&mddev->sync_set); 6349 bioset_exit(&mddev->io_clone_set); 6350 } 6351 6352 void md_stop(struct mddev *mddev) 6353 { 6354 lockdep_assert_held(&mddev->reconfig_mutex); 6355 6356 /* stop the array and free an attached data structures. 6357 * This is called from dm-raid 6358 */ 6359 __md_stop_writes(mddev); 6360 __md_stop(mddev); 6361 } 6362 6363 EXPORT_SYMBOL_GPL(md_stop); 6364 6365 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) 6366 { 6367 int err = 0; 6368 int did_freeze = 0; 6369 6370 if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 6371 return -EBUSY; 6372 6373 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 6374 did_freeze = 1; 6375 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6376 md_wakeup_thread(mddev->thread); 6377 } 6378 6379 stop_sync_thread(mddev, false, false); 6380 wait_event(mddev->sb_wait, 6381 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 6382 mddev_lock_nointr(mddev); 6383 6384 mutex_lock(&mddev->open_mutex); 6385 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || 6386 mddev->sync_thread || 6387 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 6388 pr_warn("md: %s still in use.\n",mdname(mddev)); 6389 err = -EBUSY; 6390 goto out; 6391 } 6392 6393 if (mddev->pers) { 6394 __md_stop_writes(mddev); 6395 6396 if (mddev->ro == MD_RDONLY) { 6397 err = -ENXIO; 6398 goto out; 6399 } 6400 6401 mddev->ro = MD_RDONLY; 6402 set_disk_ro(mddev->gendisk, 1); 6403 } 6404 6405 out: 6406 if ((mddev->pers && !err) || did_freeze) { 6407 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6408 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6409 md_wakeup_thread(mddev->thread); 6410 sysfs_notify_dirent_safe(mddev->sysfs_state); 6411 } 6412 6413 mutex_unlock(&mddev->open_mutex); 6414 return err; 6415 } 6416 6417 /* mode: 6418 * 0 - completely stop and dis-assemble array 6419 * 2 - stop but do not disassemble array 6420 */ 6421 static int do_md_stop(struct mddev *mddev, int mode, 6422 struct block_device *bdev) 6423 { 6424 struct gendisk *disk = mddev->gendisk; 6425 struct md_rdev *rdev; 6426 int did_freeze = 0; 6427 6428 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 6429 did_freeze = 1; 6430 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6431 md_wakeup_thread(mddev->thread); 6432 } 6433 6434 stop_sync_thread(mddev, true, false); 6435 6436 mutex_lock(&mddev->open_mutex); 6437 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || 6438 mddev->sysfs_active || 6439 mddev->sync_thread || 6440 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 6441 pr_warn("md: %s still in use.\n",mdname(mddev)); 6442 mutex_unlock(&mddev->open_mutex); 6443 if (did_freeze) { 6444 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6445 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6446 md_wakeup_thread(mddev->thread); 6447 } 6448 return -EBUSY; 6449 } 6450 if (mddev->pers) { 6451 if (!md_is_rdwr(mddev)) 6452 set_disk_ro(disk, 0); 6453 6454 __md_stop_writes(mddev); 6455 __md_stop(mddev); 6456 6457 /* tell userspace to handle 'inactive' */ 6458 sysfs_notify_dirent_safe(mddev->sysfs_state); 6459 6460 rdev_for_each(rdev, mddev) 6461 if (rdev->raid_disk >= 0) 6462 sysfs_unlink_rdev(mddev, rdev); 6463 6464 set_capacity_and_notify(disk, 
0); 6465 mutex_unlock(&mddev->open_mutex); 6466 mddev->changed = 1; 6467 6468 if (!md_is_rdwr(mddev)) 6469 mddev->ro = MD_RDWR; 6470 } else 6471 mutex_unlock(&mddev->open_mutex); 6472 /* 6473 * Free resources if final stop 6474 */ 6475 if (mode == 0) { 6476 pr_info("md: %s stopped.\n", mdname(mddev)); 6477 6478 if (mddev->bitmap_info.file) { 6479 struct file *f = mddev->bitmap_info.file; 6480 spin_lock(&mddev->lock); 6481 mddev->bitmap_info.file = NULL; 6482 spin_unlock(&mddev->lock); 6483 fput(f); 6484 } 6485 mddev->bitmap_info.offset = 0; 6486 6487 export_array(mddev); 6488 6489 md_clean(mddev); 6490 if (mddev->hold_active == UNTIL_STOP) 6491 mddev->hold_active = 0; 6492 } 6493 md_new_event(); 6494 sysfs_notify_dirent_safe(mddev->sysfs_state); 6495 return 0; 6496 } 6497 6498 #ifndef MODULE 6499 static void autorun_array(struct mddev *mddev) 6500 { 6501 struct md_rdev *rdev; 6502 int err; 6503 6504 if (list_empty(&mddev->disks)) 6505 return; 6506 6507 pr_info("md: running: "); 6508 6509 rdev_for_each(rdev, mddev) { 6510 pr_cont("<%pg>", rdev->bdev); 6511 } 6512 pr_cont("\n"); 6513 6514 err = do_md_run(mddev); 6515 if (err) { 6516 pr_warn("md: do_md_run() returned %d\n", err); 6517 do_md_stop(mddev, 0, NULL); 6518 } 6519 } 6520 6521 /* 6522 * lets try to run arrays based on all disks that have arrived 6523 * until now. (those are in pending_raid_disks) 6524 * 6525 * the method: pick the first pending disk, collect all disks with 6526 * the same UUID, remove all from the pending list and put them into 6527 * the 'same_array' list. Then order this list based on superblock 6528 * update time (freshest comes first), kick out 'old' disks and 6529 * compare superblocks. If everything's fine then run it. 6530 * 6531 * If "unit" is allocated, then bump its reference count 6532 */ 6533 static void autorun_devices(int part) 6534 { 6535 struct md_rdev *rdev0, *rdev, *tmp; 6536 struct mddev *mddev; 6537 6538 pr_info("md: autorun ...\n"); 6539 while (!list_empty(&pending_raid_disks)) { 6540 int unit; 6541 dev_t dev; 6542 LIST_HEAD(candidates); 6543 rdev0 = list_entry(pending_raid_disks.next, 6544 struct md_rdev, same_set); 6545 6546 pr_debug("md: considering %pg ...\n", rdev0->bdev); 6547 INIT_LIST_HEAD(&candidates); 6548 rdev_for_each_list(rdev, tmp, &pending_raid_disks) 6549 if (super_90_load(rdev, rdev0, 0) >= 0) { 6550 pr_debug("md: adding %pg ...\n", 6551 rdev->bdev); 6552 list_move(&rdev->same_set, &candidates); 6553 } 6554 /* 6555 * now we have a set of devices, with all of them having 6556 * mostly sane superblocks. It's time to allocate the 6557 * mddev. 
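 *
 * Concretely: every rdev that super_90_load() matched against rdev0's
 * UUID has been moved from pending_raid_disks onto 'candidates'.  Below
 * we derive the unit number from rdev0->preferred_minor, allocate the
 * mddev, and bind each candidate with bind_rdev_to_array(); whatever is
 * still on 'candidates' afterwards (e.g. because the mddev was locked
 * or already running) is released again through export_rdev().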
6558 */ 6559 if (part) { 6560 dev = MKDEV(mdp_major, 6561 rdev0->preferred_minor << MdpMinorShift); 6562 unit = MINOR(dev) >> MdpMinorShift; 6563 } else { 6564 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 6565 unit = MINOR(dev); 6566 } 6567 if (rdev0->preferred_minor != unit) { 6568 pr_warn("md: unit number in %pg is bad: %d\n", 6569 rdev0->bdev, rdev0->preferred_minor); 6570 break; 6571 } 6572 6573 mddev = md_alloc(dev, NULL); 6574 if (IS_ERR(mddev)) 6575 break; 6576 6577 if (mddev_suspend_and_lock(mddev)) 6578 pr_warn("md: %s locked, cannot run\n", mdname(mddev)); 6579 else if (mddev->raid_disks || mddev->major_version 6580 || !list_empty(&mddev->disks)) { 6581 pr_warn("md: %s already running, cannot run %pg\n", 6582 mdname(mddev), rdev0->bdev); 6583 mddev_unlock_and_resume(mddev); 6584 } else { 6585 pr_debug("md: created %s\n", mdname(mddev)); 6586 mddev->persistent = 1; 6587 rdev_for_each_list(rdev, tmp, &candidates) { 6588 list_del_init(&rdev->same_set); 6589 if (bind_rdev_to_array(rdev, mddev)) 6590 export_rdev(rdev, mddev); 6591 } 6592 autorun_array(mddev); 6593 mddev_unlock_and_resume(mddev); 6594 } 6595 /* on success, candidates will be empty, on error 6596 * it won't... 6597 */ 6598 rdev_for_each_list(rdev, tmp, &candidates) { 6599 list_del_init(&rdev->same_set); 6600 export_rdev(rdev, mddev); 6601 } 6602 mddev_put(mddev); 6603 } 6604 pr_info("md: ... autorun DONE.\n"); 6605 } 6606 #endif /* !MODULE */ 6607 6608 static int get_version(void __user *arg) 6609 { 6610 mdu_version_t ver; 6611 6612 ver.major = MD_MAJOR_VERSION; 6613 ver.minor = MD_MINOR_VERSION; 6614 ver.patchlevel = MD_PATCHLEVEL_VERSION; 6615 6616 if (copy_to_user(arg, &ver, sizeof(ver))) 6617 return -EFAULT; 6618 6619 return 0; 6620 } 6621 6622 static int get_array_info(struct mddev *mddev, void __user *arg) 6623 { 6624 mdu_array_info_t info; 6625 int nr,working,insync,failed,spare; 6626 struct md_rdev *rdev; 6627 6628 nr = working = insync = failed = spare = 0; 6629 rcu_read_lock(); 6630 rdev_for_each_rcu(rdev, mddev) { 6631 nr++; 6632 if (test_bit(Faulty, &rdev->flags)) 6633 failed++; 6634 else { 6635 working++; 6636 if (test_bit(In_sync, &rdev->flags)) 6637 insync++; 6638 else if (test_bit(Journal, &rdev->flags)) 6639 /* TODO: add journal count to md_u.h */ 6640 ; 6641 else 6642 spare++; 6643 } 6644 } 6645 rcu_read_unlock(); 6646 6647 info.major_version = mddev->major_version; 6648 info.minor_version = mddev->minor_version; 6649 info.patch_version = MD_PATCHLEVEL_VERSION; 6650 info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); 6651 info.level = mddev->level; 6652 info.size = mddev->dev_sectors / 2; 6653 if (info.size != mddev->dev_sectors / 2) /* overflow */ 6654 info.size = -1; 6655 info.nr_disks = nr; 6656 info.raid_disks = mddev->raid_disks; 6657 info.md_minor = mddev->md_minor; 6658 info.not_persistent= !mddev->persistent; 6659 6660 info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 6661 info.state = 0; 6662 if (mddev->in_sync) 6663 info.state = (1<<MD_SB_CLEAN); 6664 if (mddev->bitmap && mddev->bitmap_info.offset) 6665 info.state |= (1<<MD_SB_BITMAP_PRESENT); 6666 if (mddev_is_clustered(mddev)) 6667 info.state |= (1<<MD_SB_CLUSTERED); 6668 info.active_disks = insync; 6669 info.working_disks = working; 6670 info.failed_disks = failed; 6671 info.spare_disks = spare; 6672 6673 info.layout = mddev->layout; 6674 info.chunk_size = mddev->chunk_sectors << 9; 6675 6676 if (copy_to_user(arg, &info, sizeof(info))) 6677 return -EFAULT; 6678 6679 return 0; 6680 } 6681 6682 static int 
get_bitmap_file(struct mddev *mddev, void __user * arg) 6683 { 6684 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 6685 char *ptr; 6686 int err; 6687 6688 file = kzalloc(sizeof(*file), GFP_NOIO); 6689 if (!file) 6690 return -ENOMEM; 6691 6692 err = 0; 6693 spin_lock(&mddev->lock); 6694 /* bitmap enabled */ 6695 if (mddev->bitmap_info.file) { 6696 ptr = file_path(mddev->bitmap_info.file, file->pathname, 6697 sizeof(file->pathname)); 6698 if (IS_ERR(ptr)) 6699 err = PTR_ERR(ptr); 6700 else 6701 memmove(file->pathname, ptr, 6702 sizeof(file->pathname)-(ptr-file->pathname)); 6703 } 6704 spin_unlock(&mddev->lock); 6705 6706 if (err == 0 && 6707 copy_to_user(arg, file, sizeof(*file))) 6708 err = -EFAULT; 6709 6710 kfree(file); 6711 return err; 6712 } 6713 6714 static int get_disk_info(struct mddev *mddev, void __user * arg) 6715 { 6716 mdu_disk_info_t info; 6717 struct md_rdev *rdev; 6718 6719 if (copy_from_user(&info, arg, sizeof(info))) 6720 return -EFAULT; 6721 6722 rcu_read_lock(); 6723 rdev = md_find_rdev_nr_rcu(mddev, info.number); 6724 if (rdev) { 6725 info.major = MAJOR(rdev->bdev->bd_dev); 6726 info.minor = MINOR(rdev->bdev->bd_dev); 6727 info.raid_disk = rdev->raid_disk; 6728 info.state = 0; 6729 if (test_bit(Faulty, &rdev->flags)) 6730 info.state |= (1<<MD_DISK_FAULTY); 6731 else if (test_bit(In_sync, &rdev->flags)) { 6732 info.state |= (1<<MD_DISK_ACTIVE); 6733 info.state |= (1<<MD_DISK_SYNC); 6734 } 6735 if (test_bit(Journal, &rdev->flags)) 6736 info.state |= (1<<MD_DISK_JOURNAL); 6737 if (test_bit(WriteMostly, &rdev->flags)) 6738 info.state |= (1<<MD_DISK_WRITEMOSTLY); 6739 if (test_bit(FailFast, &rdev->flags)) 6740 info.state |= (1<<MD_DISK_FAILFAST); 6741 } else { 6742 info.major = info.minor = 0; 6743 info.raid_disk = -1; 6744 info.state = (1<<MD_DISK_REMOVED); 6745 } 6746 rcu_read_unlock(); 6747 6748 if (copy_to_user(arg, &info, sizeof(info))) 6749 return -EFAULT; 6750 6751 return 0; 6752 } 6753 6754 int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) 6755 { 6756 struct md_rdev *rdev; 6757 dev_t dev = MKDEV(info->major,info->minor); 6758 6759 if (mddev_is_clustered(mddev) && 6760 !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { 6761 pr_warn("%s: Cannot add to clustered mddev.\n", 6762 mdname(mddev)); 6763 return -EINVAL; 6764 } 6765 6766 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 6767 return -EOVERFLOW; 6768 6769 if (!mddev->raid_disks) { 6770 int err; 6771 /* expecting a device which has a superblock */ 6772 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 6773 if (IS_ERR(rdev)) { 6774 pr_warn("md: md_import_device returned %ld\n", 6775 PTR_ERR(rdev)); 6776 return PTR_ERR(rdev); 6777 } 6778 if (!list_empty(&mddev->disks)) { 6779 struct md_rdev *rdev0 6780 = list_entry(mddev->disks.next, 6781 struct md_rdev, same_set); 6782 err = super_types[mddev->major_version] 6783 .load_super(rdev, rdev0, mddev->minor_version); 6784 if (err < 0) { 6785 pr_warn("md: %pg has different UUID to %pg\n", 6786 rdev->bdev, 6787 rdev0->bdev); 6788 export_rdev(rdev, mddev); 6789 return -EINVAL; 6790 } 6791 } 6792 err = bind_rdev_to_array(rdev, mddev); 6793 if (err) 6794 export_rdev(rdev, mddev); 6795 return err; 6796 } 6797 6798 /* 6799 * md_add_new_disk can be used once the array is assembled 6800 * to add "hot spares". 
They must already have a superblock 6801 * written 6802 */ 6803 if (mddev->pers) { 6804 int err; 6805 if (!mddev->pers->hot_add_disk) { 6806 pr_warn("%s: personality does not support diskops!\n", 6807 mdname(mddev)); 6808 return -EINVAL; 6809 } 6810 if (mddev->persistent) 6811 rdev = md_import_device(dev, mddev->major_version, 6812 mddev->minor_version); 6813 else 6814 rdev = md_import_device(dev, -1, -1); 6815 if (IS_ERR(rdev)) { 6816 pr_warn("md: md_import_device returned %ld\n", 6817 PTR_ERR(rdev)); 6818 return PTR_ERR(rdev); 6819 } 6820 /* set saved_raid_disk if appropriate */ 6821 if (!mddev->persistent) { 6822 if (info->state & (1<<MD_DISK_SYNC) && 6823 info->raid_disk < mddev->raid_disks) { 6824 rdev->raid_disk = info->raid_disk; 6825 clear_bit(Bitmap_sync, &rdev->flags); 6826 } else 6827 rdev->raid_disk = -1; 6828 rdev->saved_raid_disk = rdev->raid_disk; 6829 } else 6830 super_types[mddev->major_version]. 6831 validate_super(mddev, rdev); 6832 if ((info->state & (1<<MD_DISK_SYNC)) && 6833 rdev->raid_disk != info->raid_disk) { 6834 /* This was a hot-add request, but events doesn't 6835 * match, so reject it. 6836 */ 6837 export_rdev(rdev, mddev); 6838 return -EINVAL; 6839 } 6840 6841 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 6842 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 6843 set_bit(WriteMostly, &rdev->flags); 6844 else 6845 clear_bit(WriteMostly, &rdev->flags); 6846 if (info->state & (1<<MD_DISK_FAILFAST)) 6847 set_bit(FailFast, &rdev->flags); 6848 else 6849 clear_bit(FailFast, &rdev->flags); 6850 6851 if (info->state & (1<<MD_DISK_JOURNAL)) { 6852 struct md_rdev *rdev2; 6853 bool has_journal = false; 6854 6855 /* make sure no existing journal disk */ 6856 rdev_for_each(rdev2, mddev) { 6857 if (test_bit(Journal, &rdev2->flags)) { 6858 has_journal = true; 6859 break; 6860 } 6861 } 6862 if (has_journal || mddev->bitmap) { 6863 export_rdev(rdev, mddev); 6864 return -EBUSY; 6865 } 6866 set_bit(Journal, &rdev->flags); 6867 } 6868 /* 6869 * check whether the device shows up in other nodes 6870 */ 6871 if (mddev_is_clustered(mddev)) { 6872 if (info->state & (1 << MD_DISK_CANDIDATE)) 6873 set_bit(Candidate, &rdev->flags); 6874 else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { 6875 /* --add initiated by this node */ 6876 err = md_cluster_ops->add_new_disk(mddev, rdev); 6877 if (err) { 6878 export_rdev(rdev, mddev); 6879 return err; 6880 } 6881 } 6882 } 6883 6884 rdev->raid_disk = -1; 6885 err = bind_rdev_to_array(rdev, mddev); 6886 6887 if (err) 6888 export_rdev(rdev, mddev); 6889 6890 if (mddev_is_clustered(mddev)) { 6891 if (info->state & (1 << MD_DISK_CANDIDATE)) { 6892 if (!err) { 6893 err = md_cluster_ops->new_disk_ack(mddev, 6894 err == 0); 6895 if (err) 6896 md_kick_rdev_from_array(rdev); 6897 } 6898 } else { 6899 if (err) 6900 md_cluster_ops->add_new_disk_cancel(mddev); 6901 else 6902 err = add_bound_rdev(rdev); 6903 } 6904 6905 } else if (!err) 6906 err = add_bound_rdev(rdev); 6907 6908 return err; 6909 } 6910 6911 /* otherwise, md_add_new_disk is only allowed 6912 * for major_version==0 superblocks 6913 */ 6914 if (mddev->major_version != 0) { 6915 pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev)); 6916 return -EINVAL; 6917 } 6918 6919 if (!(info->state & (1<<MD_DISK_FAULTY))) { 6920 int err; 6921 rdev = md_import_device(dev, -1, 0); 6922 if (IS_ERR(rdev)) { 6923 pr_warn("md: error, md_import_device() returned %ld\n", 6924 PTR_ERR(rdev)); 6925 return PTR_ERR(rdev); 6926 } 6927 rdev->desc_nr = info->number; 6928 if (info->raid_disk < mddev->raid_disks) 
6929 rdev->raid_disk = info->raid_disk; 6930 else 6931 rdev->raid_disk = -1; 6932 6933 if (rdev->raid_disk < mddev->raid_disks) 6934 if (info->state & (1<<MD_DISK_SYNC)) 6935 set_bit(In_sync, &rdev->flags); 6936 6937 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 6938 set_bit(WriteMostly, &rdev->flags); 6939 if (info->state & (1<<MD_DISK_FAILFAST)) 6940 set_bit(FailFast, &rdev->flags); 6941 6942 if (!mddev->persistent) { 6943 pr_debug("md: nonpersistent superblock ...\n"); 6944 rdev->sb_start = bdev_nr_sectors(rdev->bdev); 6945 } else 6946 rdev->sb_start = calc_dev_sboffset(rdev); 6947 rdev->sectors = rdev->sb_start; 6948 6949 err = bind_rdev_to_array(rdev, mddev); 6950 if (err) { 6951 export_rdev(rdev, mddev); 6952 return err; 6953 } 6954 } 6955 6956 return 0; 6957 } 6958 6959 static int hot_remove_disk(struct mddev *mddev, dev_t dev) 6960 { 6961 struct md_rdev *rdev; 6962 6963 if (!mddev->pers) 6964 return -ENODEV; 6965 6966 rdev = find_rdev(mddev, dev); 6967 if (!rdev) 6968 return -ENXIO; 6969 6970 if (rdev->raid_disk < 0) 6971 goto kick_rdev; 6972 6973 clear_bit(Blocked, &rdev->flags); 6974 remove_and_add_spares(mddev, rdev); 6975 6976 if (rdev->raid_disk >= 0) 6977 goto busy; 6978 6979 kick_rdev: 6980 if (mddev_is_clustered(mddev)) { 6981 if (md_cluster_ops->remove_disk(mddev, rdev)) 6982 goto busy; 6983 } 6984 6985 md_kick_rdev_from_array(rdev); 6986 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 6987 if (mddev->thread) 6988 md_wakeup_thread(mddev->thread); 6989 else 6990 md_update_sb(mddev, 1); 6991 md_new_event(); 6992 6993 return 0; 6994 busy: 6995 pr_debug("md: cannot remove active disk %pg from %s ...\n", 6996 rdev->bdev, mdname(mddev)); 6997 return -EBUSY; 6998 } 6999 7000 static int hot_add_disk(struct mddev *mddev, dev_t dev) 7001 { 7002 int err; 7003 struct md_rdev *rdev; 7004 7005 if (!mddev->pers) 7006 return -ENODEV; 7007 7008 if (mddev->major_version != 0) { 7009 pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n", 7010 mdname(mddev)); 7011 return -EINVAL; 7012 } 7013 if (!mddev->pers->hot_add_disk) { 7014 pr_warn("%s: personality does not support diskops!\n", 7015 mdname(mddev)); 7016 return -EINVAL; 7017 } 7018 7019 rdev = md_import_device(dev, -1, 0); 7020 if (IS_ERR(rdev)) { 7021 pr_warn("md: error, md_import_device() returned %ld\n", 7022 PTR_ERR(rdev)); 7023 return -EINVAL; 7024 } 7025 7026 if (mddev->persistent) 7027 rdev->sb_start = calc_dev_sboffset(rdev); 7028 else 7029 rdev->sb_start = bdev_nr_sectors(rdev->bdev); 7030 7031 rdev->sectors = rdev->sb_start; 7032 7033 if (test_bit(Faulty, &rdev->flags)) { 7034 pr_warn("md: can not hot-add faulty %pg disk to %s!\n", 7035 rdev->bdev, mdname(mddev)); 7036 err = -EINVAL; 7037 goto abort_export; 7038 } 7039 7040 clear_bit(In_sync, &rdev->flags); 7041 rdev->desc_nr = -1; 7042 rdev->saved_raid_disk = -1; 7043 err = bind_rdev_to_array(rdev, mddev); 7044 if (err) 7045 goto abort_export; 7046 7047 /* 7048 * The rest should better be atomic, we can have disk failures 7049 * noticed in interrupt contexts ... 7050 */ 7051 7052 rdev->raid_disk = -1; 7053 7054 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7055 if (!mddev->thread) 7056 md_update_sb(mddev, 1); 7057 /* 7058 * If the new disk does not support REQ_NOWAIT, 7059 * disable on the whole MD. 
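 * This mirrors md_run(), which only sets QUEUE_FLAG_NOWAIT when every
 * member reported bdev_nowait(); once a device that cannot honour
 * REQ_NOWAIT is hot-added, the flag has to come off the whole array
 * queue again.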
7060 */ 7061 if (!bdev_nowait(rdev->bdev)) { 7062 pr_info("%s: Disabling nowait because %pg does not support nowait\n", 7063 mdname(mddev), rdev->bdev); 7064 blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue); 7065 } 7066 /* 7067 * Kick recovery, maybe this spare has to be added to the 7068 * array immediately. 7069 */ 7070 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7071 md_wakeup_thread(mddev->thread); 7072 md_new_event(); 7073 return 0; 7074 7075 abort_export: 7076 export_rdev(rdev, mddev); 7077 return err; 7078 } 7079 7080 static int set_bitmap_file(struct mddev *mddev, int fd) 7081 { 7082 int err = 0; 7083 7084 if (mddev->pers) { 7085 if (!mddev->pers->quiesce || !mddev->thread) 7086 return -EBUSY; 7087 if (mddev->recovery || mddev->sync_thread) 7088 return -EBUSY; 7089 /* we should be able to change the bitmap.. */ 7090 } 7091 7092 if (fd >= 0) { 7093 struct inode *inode; 7094 struct file *f; 7095 7096 if (mddev->bitmap || mddev->bitmap_info.file) 7097 return -EEXIST; /* cannot add when bitmap is present */ 7098 7099 if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) { 7100 pr_warn("%s: bitmap files not supported by this kernel\n", 7101 mdname(mddev)); 7102 return -EINVAL; 7103 } 7104 pr_warn("%s: using deprecated bitmap file support\n", 7105 mdname(mddev)); 7106 7107 f = fget(fd); 7108 7109 if (f == NULL) { 7110 pr_warn("%s: error: failed to get bitmap file\n", 7111 mdname(mddev)); 7112 return -EBADF; 7113 } 7114 7115 inode = f->f_mapping->host; 7116 if (!S_ISREG(inode->i_mode)) { 7117 pr_warn("%s: error: bitmap file must be a regular file\n", 7118 mdname(mddev)); 7119 err = -EBADF; 7120 } else if (!(f->f_mode & FMODE_WRITE)) { 7121 pr_warn("%s: error: bitmap file must open for write\n", 7122 mdname(mddev)); 7123 err = -EBADF; 7124 } else if (atomic_read(&inode->i_writecount) != 1) { 7125 pr_warn("%s: error: bitmap file is already in use\n", 7126 mdname(mddev)); 7127 err = -EBUSY; 7128 } 7129 if (err) { 7130 fput(f); 7131 return err; 7132 } 7133 mddev->bitmap_info.file = f; 7134 mddev->bitmap_info.offset = 0; /* file overrides offset */ 7135 } else if (mddev->bitmap == NULL) 7136 return -ENOENT; /* cannot remove what isn't there */ 7137 err = 0; 7138 if (mddev->pers) { 7139 if (fd >= 0) { 7140 struct bitmap *bitmap; 7141 7142 bitmap = md_bitmap_create(mddev, -1); 7143 if (!IS_ERR(bitmap)) { 7144 mddev->bitmap = bitmap; 7145 err = md_bitmap_load(mddev); 7146 } else 7147 err = PTR_ERR(bitmap); 7148 if (err) { 7149 md_bitmap_destroy(mddev); 7150 fd = -1; 7151 } 7152 } else if (fd < 0) { 7153 md_bitmap_destroy(mddev); 7154 } 7155 } 7156 if (fd < 0) { 7157 struct file *f = mddev->bitmap_info.file; 7158 if (f) { 7159 spin_lock(&mddev->lock); 7160 mddev->bitmap_info.file = NULL; 7161 spin_unlock(&mddev->lock); 7162 fput(f); 7163 } 7164 } 7165 7166 return err; 7167 } 7168 7169 /* 7170 * md_set_array_info is used two different ways 7171 * The original usage is when creating a new array. 7172 * In this usage, raid_disks is > 0 and it together with 7173 * level, size, not_persistent,layout,chunksize determine the 7174 * shape of the array. 7175 * This will always create an array with a type-0.90.0 superblock. 7176 * The newer usage is when assembling an array. 7177 * In this case raid_disks will be 0, and the major_version field is 7178 * use to determine which style super-blocks are to be found on the devices. 7179 * The minor and patch _version numbers are also kept incase the 7180 * super_block handler wishes to interpret them. 
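 *
 * A minimal sketch of the creation path as seen from user space
 * (illustrative only, not taken from mdadm; md_fd is assumed to be an
 * open fd on the array's /dev node, error handling omitted):
 *
 *	mdu_array_info_t info = { 0 };
 *	info.level = 1;				// RAID1
 *	info.raid_disks = 2;
 *	info.size = 1024 * 1024;		// per-device size in KiB (1 GiB)
 *	ioctl(md_fd, SET_ARRAY_INFO, &info);	// ends up in this function
 *	// ...then ADD_NEW_DISK for each member, followed by RUN_ARRAY
 *
 * The assembly path instead passes raid_disks == 0 and only the
 * major/minor/patch version triple, so all this function records is
 * which super_types[] handler to use later.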
7181 */ 7182 int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info) 7183 { 7184 if (info->raid_disks == 0) { 7185 /* just setting version number for superblock loading */ 7186 if (info->major_version < 0 || 7187 info->major_version >= ARRAY_SIZE(super_types) || 7188 super_types[info->major_version].name == NULL) { 7189 /* maybe try to auto-load a module? */ 7190 pr_warn("md: superblock version %d not known\n", 7191 info->major_version); 7192 return -EINVAL; 7193 } 7194 mddev->major_version = info->major_version; 7195 mddev->minor_version = info->minor_version; 7196 mddev->patch_version = info->patch_version; 7197 mddev->persistent = !info->not_persistent; 7198 /* ensure mddev_put doesn't delete this now that there 7199 * is some minimal configuration. 7200 */ 7201 mddev->ctime = ktime_get_real_seconds(); 7202 return 0; 7203 } 7204 mddev->major_version = MD_MAJOR_VERSION; 7205 mddev->minor_version = MD_MINOR_VERSION; 7206 mddev->patch_version = MD_PATCHLEVEL_VERSION; 7207 mddev->ctime = ktime_get_real_seconds(); 7208 7209 mddev->level = info->level; 7210 mddev->clevel[0] = 0; 7211 mddev->dev_sectors = 2 * (sector_t)info->size; 7212 mddev->raid_disks = info->raid_disks; 7213 /* don't set md_minor, it is determined by which /dev/md* was 7214 * openned 7215 */ 7216 if (info->state & (1<<MD_SB_CLEAN)) 7217 mddev->recovery_cp = MaxSector; 7218 else 7219 mddev->recovery_cp = 0; 7220 mddev->persistent = ! info->not_persistent; 7221 mddev->external = 0; 7222 7223 mddev->layout = info->layout; 7224 if (mddev->level == 0) 7225 /* Cannot trust RAID0 layout info here */ 7226 mddev->layout = -1; 7227 mddev->chunk_sectors = info->chunk_size >> 9; 7228 7229 if (mddev->persistent) { 7230 mddev->max_disks = MD_SB_DISKS; 7231 mddev->flags = 0; 7232 mddev->sb_flags = 0; 7233 } 7234 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7235 7236 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 7237 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 7238 mddev->bitmap_info.offset = 0; 7239 7240 mddev->reshape_position = MaxSector; 7241 7242 /* 7243 * Generate a 128 bit UUID 7244 */ 7245 get_random_bytes(mddev->uuid, 16); 7246 7247 mddev->new_level = mddev->level; 7248 mddev->new_chunk_sectors = mddev->chunk_sectors; 7249 mddev->new_layout = mddev->layout; 7250 mddev->delta_disks = 0; 7251 mddev->reshape_backwards = 0; 7252 7253 return 0; 7254 } 7255 7256 void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) 7257 { 7258 lockdep_assert_held(&mddev->reconfig_mutex); 7259 7260 if (mddev->external_size) 7261 return; 7262 7263 mddev->array_sectors = array_sectors; 7264 } 7265 EXPORT_SYMBOL(md_set_array_sectors); 7266 7267 static int update_size(struct mddev *mddev, sector_t num_sectors) 7268 { 7269 struct md_rdev *rdev; 7270 int rv; 7271 int fit = (num_sectors == 0); 7272 sector_t old_dev_sectors = mddev->dev_sectors; 7273 7274 if (mddev->pers->resize == NULL) 7275 return -EINVAL; 7276 /* The "num_sectors" is the number of sectors of each device that 7277 * is used. This can only make sense for arrays with redundancy. 7278 * linear and raid0 always use whatever space is available. We can only 7279 * consider changing this number if no resync or reconstruction is 7280 * happening, and if the new size is acceptable. It must fit before the 7281 * sb_start or, if that is <data_offset, it must fit before the size 7282 * of each device. If num_sectors is zero, we find the largest size 7283 * that fits. 
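 *
 * Worked example (illustrative numbers): with num_sectors == 0 and two
 * members offering 1000 and 800 usable sectors, the loop below settles
 * on 800, i.e. the largest size every device can hold, before calling
 * pers->resize().  An explicit request for 900 sectors would instead
 * fail with -ENOSPC, because the smaller member cannot provide it.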
7284 */ 7285 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 7286 mddev->sync_thread) 7287 return -EBUSY; 7288 if (!md_is_rdwr(mddev)) 7289 return -EROFS; 7290 7291 rdev_for_each(rdev, mddev) { 7292 sector_t avail = rdev->sectors; 7293 7294 if (fit && (num_sectors == 0 || num_sectors > avail)) 7295 num_sectors = avail; 7296 if (avail < num_sectors) 7297 return -ENOSPC; 7298 } 7299 rv = mddev->pers->resize(mddev, num_sectors); 7300 if (!rv) { 7301 if (mddev_is_clustered(mddev)) 7302 md_cluster_ops->update_size(mddev, old_dev_sectors); 7303 else if (mddev->queue) { 7304 set_capacity_and_notify(mddev->gendisk, 7305 mddev->array_sectors); 7306 } 7307 } 7308 return rv; 7309 } 7310 7311 static int update_raid_disks(struct mddev *mddev, int raid_disks) 7312 { 7313 int rv; 7314 struct md_rdev *rdev; 7315 /* change the number of raid disks */ 7316 if (mddev->pers->check_reshape == NULL) 7317 return -EINVAL; 7318 if (!md_is_rdwr(mddev)) 7319 return -EROFS; 7320 if (raid_disks <= 0 || 7321 (mddev->max_disks && raid_disks >= mddev->max_disks)) 7322 return -EINVAL; 7323 if (mddev->sync_thread || 7324 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 7325 test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) || 7326 mddev->reshape_position != MaxSector) 7327 return -EBUSY; 7328 7329 rdev_for_each(rdev, mddev) { 7330 if (mddev->raid_disks < raid_disks && 7331 rdev->data_offset < rdev->new_data_offset) 7332 return -EINVAL; 7333 if (mddev->raid_disks > raid_disks && 7334 rdev->data_offset > rdev->new_data_offset) 7335 return -EINVAL; 7336 } 7337 7338 mddev->delta_disks = raid_disks - mddev->raid_disks; 7339 if (mddev->delta_disks < 0) 7340 mddev->reshape_backwards = 1; 7341 else if (mddev->delta_disks > 0) 7342 mddev->reshape_backwards = 0; 7343 7344 rv = mddev->pers->check_reshape(mddev); 7345 if (rv < 0) { 7346 mddev->delta_disks = 0; 7347 mddev->reshape_backwards = 0; 7348 } 7349 return rv; 7350 } 7351 7352 /* 7353 * update_array_info is used to change the configuration of an 7354 * on-line array. 7355 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 7356 * fields in the info are checked against the array. 7357 * Any differences that cannot be handled will cause an error. 7358 * Normally, only one change can be managed at a time. 
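 *
 * The single changes handled below are: a new per-device size (via
 * update_size()), a new raid_disks count (via update_raid_disks()), a
 * layout change handed to pers->check_reshape(), and adding or removing
 * an internal bitmap.  Requesting more than one of them in a single
 * call returns -EINVAL.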
7359 */ 7360 static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) 7361 { 7362 int rv = 0; 7363 int cnt = 0; 7364 int state = 0; 7365 7366 /* calculate expected state,ignoring low bits */ 7367 if (mddev->bitmap && mddev->bitmap_info.offset) 7368 state |= (1 << MD_SB_BITMAP_PRESENT); 7369 7370 if (mddev->major_version != info->major_version || 7371 mddev->minor_version != info->minor_version || 7372 /* mddev->patch_version != info->patch_version || */ 7373 mddev->ctime != info->ctime || 7374 mddev->level != info->level || 7375 /* mddev->layout != info->layout || */ 7376 mddev->persistent != !info->not_persistent || 7377 mddev->chunk_sectors != info->chunk_size >> 9 || 7378 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 7379 ((state^info->state) & 0xfffffe00) 7380 ) 7381 return -EINVAL; 7382 /* Check there is only one change */ 7383 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 7384 cnt++; 7385 if (mddev->raid_disks != info->raid_disks) 7386 cnt++; 7387 if (mddev->layout != info->layout) 7388 cnt++; 7389 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) 7390 cnt++; 7391 if (cnt == 0) 7392 return 0; 7393 if (cnt > 1) 7394 return -EINVAL; 7395 7396 if (mddev->layout != info->layout) { 7397 /* Change layout 7398 * we don't need to do anything at the md level, the 7399 * personality will take care of it all. 7400 */ 7401 if (mddev->pers->check_reshape == NULL) 7402 return -EINVAL; 7403 else { 7404 mddev->new_layout = info->layout; 7405 rv = mddev->pers->check_reshape(mddev); 7406 if (rv) 7407 mddev->new_layout = mddev->layout; 7408 return rv; 7409 } 7410 } 7411 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 7412 rv = update_size(mddev, (sector_t)info->size * 2); 7413 7414 if (mddev->raid_disks != info->raid_disks) 7415 rv = update_raid_disks(mddev, info->raid_disks); 7416 7417 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 7418 if (mddev->pers->quiesce == NULL || mddev->thread == NULL) { 7419 rv = -EINVAL; 7420 goto err; 7421 } 7422 if (mddev->recovery || mddev->sync_thread) { 7423 rv = -EBUSY; 7424 goto err; 7425 } 7426 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 7427 struct bitmap *bitmap; 7428 /* add the bitmap */ 7429 if (mddev->bitmap) { 7430 rv = -EEXIST; 7431 goto err; 7432 } 7433 if (mddev->bitmap_info.default_offset == 0) { 7434 rv = -EINVAL; 7435 goto err; 7436 } 7437 mddev->bitmap_info.offset = 7438 mddev->bitmap_info.default_offset; 7439 mddev->bitmap_info.space = 7440 mddev->bitmap_info.default_space; 7441 bitmap = md_bitmap_create(mddev, -1); 7442 if (!IS_ERR(bitmap)) { 7443 mddev->bitmap = bitmap; 7444 rv = md_bitmap_load(mddev); 7445 } else 7446 rv = PTR_ERR(bitmap); 7447 if (rv) 7448 md_bitmap_destroy(mddev); 7449 } else { 7450 /* remove the bitmap */ 7451 if (!mddev->bitmap) { 7452 rv = -ENOENT; 7453 goto err; 7454 } 7455 if (mddev->bitmap->storage.file) { 7456 rv = -EINVAL; 7457 goto err; 7458 } 7459 if (mddev->bitmap_info.nodes) { 7460 /* hold PW on all the bitmap lock */ 7461 if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { 7462 pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n"); 7463 rv = -EPERM; 7464 md_cluster_ops->unlock_all_bitmaps(mddev); 7465 goto err; 7466 } 7467 7468 mddev->bitmap_info.nodes = 0; 7469 md_cluster_ops->leave(mddev); 7470 module_put(md_cluster_mod); 7471 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; 7472 } 7473 md_bitmap_destroy(mddev); 7474 mddev->bitmap_info.offset = 0; 7475 } 7476 } 7477 md_update_sb(mddev, 
1); 7478 return rv; 7479 err: 7480 return rv; 7481 } 7482 7483 static int set_disk_faulty(struct mddev *mddev, dev_t dev) 7484 { 7485 struct md_rdev *rdev; 7486 int err = 0; 7487 7488 if (mddev->pers == NULL) 7489 return -ENODEV; 7490 7491 rcu_read_lock(); 7492 rdev = md_find_rdev_rcu(mddev, dev); 7493 if (!rdev) 7494 err = -ENODEV; 7495 else { 7496 md_error(mddev, rdev); 7497 if (test_bit(MD_BROKEN, &mddev->flags)) 7498 err = -EBUSY; 7499 } 7500 rcu_read_unlock(); 7501 return err; 7502 } 7503 7504 /* 7505 * We have a problem here : there is no easy way to give a CHS 7506 * virtual geometry. We currently pretend that we have a 2 heads 7507 * 4 sectors (with a BIG number of cylinders...). This drives 7508 * dosfs just mad... ;-) 7509 */ 7510 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 7511 { 7512 struct mddev *mddev = bdev->bd_disk->private_data; 7513 7514 geo->heads = 2; 7515 geo->sectors = 4; 7516 geo->cylinders = mddev->array_sectors / 8; 7517 return 0; 7518 } 7519 7520 static inline bool md_ioctl_valid(unsigned int cmd) 7521 { 7522 switch (cmd) { 7523 case ADD_NEW_DISK: 7524 case GET_ARRAY_INFO: 7525 case GET_BITMAP_FILE: 7526 case GET_DISK_INFO: 7527 case HOT_ADD_DISK: 7528 case HOT_REMOVE_DISK: 7529 case RAID_VERSION: 7530 case RESTART_ARRAY_RW: 7531 case RUN_ARRAY: 7532 case SET_ARRAY_INFO: 7533 case SET_BITMAP_FILE: 7534 case SET_DISK_FAULTY: 7535 case STOP_ARRAY: 7536 case STOP_ARRAY_RO: 7537 case CLUSTERED_DISK_NACK: 7538 return true; 7539 default: 7540 return false; 7541 } 7542 } 7543 7544 static bool md_ioctl_need_suspend(unsigned int cmd) 7545 { 7546 switch (cmd) { 7547 case ADD_NEW_DISK: 7548 case HOT_ADD_DISK: 7549 case HOT_REMOVE_DISK: 7550 case SET_BITMAP_FILE: 7551 case SET_ARRAY_INFO: 7552 return true; 7553 default: 7554 return false; 7555 } 7556 } 7557 7558 static int __md_set_array_info(struct mddev *mddev, void __user *argp) 7559 { 7560 mdu_array_info_t info; 7561 int err; 7562 7563 if (!argp) 7564 memset(&info, 0, sizeof(info)); 7565 else if (copy_from_user(&info, argp, sizeof(info))) 7566 return -EFAULT; 7567 7568 if (mddev->pers) { 7569 err = update_array_info(mddev, &info); 7570 if (err) 7571 pr_warn("md: couldn't update array info. %d\n", err); 7572 return err; 7573 } 7574 7575 if (!list_empty(&mddev->disks)) { 7576 pr_warn("md: array %s already has disks!\n", mdname(mddev)); 7577 return -EBUSY; 7578 } 7579 7580 if (mddev->raid_disks) { 7581 pr_warn("md: array %s already initialised!\n", mdname(mddev)); 7582 return -EBUSY; 7583 } 7584 7585 err = md_set_array_info(mddev, &info); 7586 if (err) 7587 pr_warn("md: couldn't set array info. 
%d\n", err); 7588 7589 return err; 7590 } 7591 7592 static int md_ioctl(struct block_device *bdev, blk_mode_t mode, 7593 unsigned int cmd, unsigned long arg) 7594 { 7595 int err = 0; 7596 void __user *argp = (void __user *)arg; 7597 struct mddev *mddev = NULL; 7598 bool did_set_md_closing = false; 7599 7600 if (!md_ioctl_valid(cmd)) 7601 return -ENOTTY; 7602 7603 switch (cmd) { 7604 case RAID_VERSION: 7605 case GET_ARRAY_INFO: 7606 case GET_DISK_INFO: 7607 break; 7608 default: 7609 if (!capable(CAP_SYS_ADMIN)) 7610 return -EACCES; 7611 } 7612 7613 /* 7614 * Commands dealing with the RAID driver but not any 7615 * particular array: 7616 */ 7617 switch (cmd) { 7618 case RAID_VERSION: 7619 err = get_version(argp); 7620 goto out; 7621 default:; 7622 } 7623 7624 /* 7625 * Commands creating/starting a new array: 7626 */ 7627 7628 mddev = bdev->bd_disk->private_data; 7629 7630 if (!mddev) { 7631 BUG(); 7632 goto out; 7633 } 7634 7635 /* Some actions do not requires the mutex */ 7636 switch (cmd) { 7637 case GET_ARRAY_INFO: 7638 if (!mddev->raid_disks && !mddev->external) 7639 err = -ENODEV; 7640 else 7641 err = get_array_info(mddev, argp); 7642 goto out; 7643 7644 case GET_DISK_INFO: 7645 if (!mddev->raid_disks && !mddev->external) 7646 err = -ENODEV; 7647 else 7648 err = get_disk_info(mddev, argp); 7649 goto out; 7650 7651 case SET_DISK_FAULTY: 7652 err = set_disk_faulty(mddev, new_decode_dev(arg)); 7653 goto out; 7654 7655 case GET_BITMAP_FILE: 7656 err = get_bitmap_file(mddev, argp); 7657 goto out; 7658 7659 } 7660 7661 if (cmd == HOT_REMOVE_DISK) 7662 /* need to ensure recovery thread has run */ 7663 wait_event_interruptible_timeout(mddev->sb_wait, 7664 !test_bit(MD_RECOVERY_NEEDED, 7665 &mddev->recovery), 7666 msecs_to_jiffies(5000)); 7667 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { 7668 /* Need to flush page cache, and ensure no-one else opens 7669 * and writes 7670 */ 7671 mutex_lock(&mddev->open_mutex); 7672 if (mddev->pers && atomic_read(&mddev->openers) > 1) { 7673 mutex_unlock(&mddev->open_mutex); 7674 err = -EBUSY; 7675 goto out; 7676 } 7677 if (test_and_set_bit(MD_CLOSING, &mddev->flags)) { 7678 mutex_unlock(&mddev->open_mutex); 7679 err = -EBUSY; 7680 goto out; 7681 } 7682 did_set_md_closing = true; 7683 mutex_unlock(&mddev->open_mutex); 7684 sync_blockdev(bdev); 7685 } 7686 7687 if (!md_is_rdwr(mddev)) 7688 flush_work(&mddev->sync_work); 7689 7690 err = md_ioctl_need_suspend(cmd) ? 
mddev_suspend_and_lock(mddev) : 7691 mddev_lock(mddev); 7692 if (err) { 7693 pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n", 7694 err, cmd); 7695 goto out; 7696 } 7697 7698 if (cmd == SET_ARRAY_INFO) { 7699 err = __md_set_array_info(mddev, argp); 7700 goto unlock; 7701 } 7702 7703 /* 7704 * Commands querying/configuring an existing array: 7705 */ 7706 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 7707 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 7708 if ((!mddev->raid_disks && !mddev->external) 7709 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 7710 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 7711 && cmd != GET_BITMAP_FILE) { 7712 err = -ENODEV; 7713 goto unlock; 7714 } 7715 7716 /* 7717 * Commands even a read-only array can execute: 7718 */ 7719 switch (cmd) { 7720 case RESTART_ARRAY_RW: 7721 err = restart_array(mddev); 7722 goto unlock; 7723 7724 case STOP_ARRAY: 7725 err = do_md_stop(mddev, 0, bdev); 7726 goto unlock; 7727 7728 case STOP_ARRAY_RO: 7729 err = md_set_readonly(mddev, bdev); 7730 goto unlock; 7731 7732 case HOT_REMOVE_DISK: 7733 err = hot_remove_disk(mddev, new_decode_dev(arg)); 7734 goto unlock; 7735 7736 case ADD_NEW_DISK: 7737 /* We can support ADD_NEW_DISK on read-only arrays 7738 * only if we are re-adding a preexisting device. 7739 * So require mddev->pers and MD_DISK_SYNC. 7740 */ 7741 if (mddev->pers) { 7742 mdu_disk_info_t info; 7743 if (copy_from_user(&info, argp, sizeof(info))) 7744 err = -EFAULT; 7745 else if (!(info.state & (1<<MD_DISK_SYNC))) 7746 /* Need to clear read-only for this */ 7747 break; 7748 else 7749 err = md_add_new_disk(mddev, &info); 7750 goto unlock; 7751 } 7752 break; 7753 } 7754 7755 /* 7756 * The remaining ioctls are changing the state of the 7757 * superblock, so we do not allow them on read-only arrays. 7758 */ 7759 if (!md_is_rdwr(mddev) && mddev->pers) { 7760 if (mddev->ro != MD_AUTO_READ) { 7761 err = -EROFS; 7762 goto unlock; 7763 } 7764 mddev->ro = MD_RDWR; 7765 sysfs_notify_dirent_safe(mddev->sysfs_state); 7766 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7767 /* mddev_unlock will wake thread */ 7768 /* If a device failed while we were read-only, we 7769 * need to make sure the metadata is updated now. 7770 */ 7771 if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) { 7772 mddev_unlock(mddev); 7773 wait_event(mddev->sb_wait, 7774 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) && 7775 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 7776 mddev_lock_nointr(mddev); 7777 } 7778 } 7779 7780 switch (cmd) { 7781 case ADD_NEW_DISK: 7782 { 7783 mdu_disk_info_t info; 7784 if (copy_from_user(&info, argp, sizeof(info))) 7785 err = -EFAULT; 7786 else 7787 err = md_add_new_disk(mddev, &info); 7788 goto unlock; 7789 } 7790 7791 case CLUSTERED_DISK_NACK: 7792 if (mddev_is_clustered(mddev)) 7793 md_cluster_ops->new_disk_ack(mddev, false); 7794 else 7795 err = -EINVAL; 7796 goto unlock; 7797 7798 case HOT_ADD_DISK: 7799 err = hot_add_disk(mddev, new_decode_dev(arg)); 7800 goto unlock; 7801 7802 case RUN_ARRAY: 7803 err = do_md_run(mddev); 7804 goto unlock; 7805 7806 case SET_BITMAP_FILE: 7807 err = set_bitmap_file(mddev, (int)arg); 7808 goto unlock; 7809 7810 default: 7811 err = -EINVAL; 7812 goto unlock; 7813 } 7814 7815 unlock: 7816 if (mddev->hold_active == UNTIL_IOCTL && 7817 err != -EINVAL) 7818 mddev->hold_active = 0; 7819 7820 md_ioctl_need_suspend(cmd) ? 
mddev_unlock_and_resume(mddev) : 7821 mddev_unlock(mddev); 7822 7823 out: 7824 if(did_set_md_closing) 7825 clear_bit(MD_CLOSING, &mddev->flags); 7826 return err; 7827 } 7828 #ifdef CONFIG_COMPAT 7829 static int md_compat_ioctl(struct block_device *bdev, blk_mode_t mode, 7830 unsigned int cmd, unsigned long arg) 7831 { 7832 switch (cmd) { 7833 case HOT_REMOVE_DISK: 7834 case HOT_ADD_DISK: 7835 case SET_DISK_FAULTY: 7836 case SET_BITMAP_FILE: 7837 /* These take in integer arg, do not convert */ 7838 break; 7839 default: 7840 arg = (unsigned long)compat_ptr(arg); 7841 break; 7842 } 7843 7844 return md_ioctl(bdev, mode, cmd, arg); 7845 } 7846 #endif /* CONFIG_COMPAT */ 7847 7848 static int md_set_read_only(struct block_device *bdev, bool ro) 7849 { 7850 struct mddev *mddev = bdev->bd_disk->private_data; 7851 int err; 7852 7853 err = mddev_lock(mddev); 7854 if (err) 7855 return err; 7856 7857 if (!mddev->raid_disks && !mddev->external) { 7858 err = -ENODEV; 7859 goto out_unlock; 7860 } 7861 7862 /* 7863 * Transitioning to read-auto need only happen for arrays that call 7864 * md_write_start and which are not ready for writes yet. 7865 */ 7866 if (!ro && mddev->ro == MD_RDONLY && mddev->pers) { 7867 err = restart_array(mddev); 7868 if (err) 7869 goto out_unlock; 7870 mddev->ro = MD_AUTO_READ; 7871 } 7872 7873 out_unlock: 7874 mddev_unlock(mddev); 7875 return err; 7876 } 7877 7878 static int md_open(struct gendisk *disk, blk_mode_t mode) 7879 { 7880 struct mddev *mddev; 7881 int err; 7882 7883 spin_lock(&all_mddevs_lock); 7884 mddev = mddev_get(disk->private_data); 7885 spin_unlock(&all_mddevs_lock); 7886 if (!mddev) 7887 return -ENODEV; 7888 7889 err = mutex_lock_interruptible(&mddev->open_mutex); 7890 if (err) 7891 goto out; 7892 7893 err = -ENODEV; 7894 if (test_bit(MD_CLOSING, &mddev->flags)) 7895 goto out_unlock; 7896 7897 atomic_inc(&mddev->openers); 7898 mutex_unlock(&mddev->open_mutex); 7899 7900 disk_check_media_change(disk); 7901 return 0; 7902 7903 out_unlock: 7904 mutex_unlock(&mddev->open_mutex); 7905 out: 7906 mddev_put(mddev); 7907 return err; 7908 } 7909 7910 static void md_release(struct gendisk *disk) 7911 { 7912 struct mddev *mddev = disk->private_data; 7913 7914 BUG_ON(!mddev); 7915 atomic_dec(&mddev->openers); 7916 mddev_put(mddev); 7917 } 7918 7919 static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing) 7920 { 7921 struct mddev *mddev = disk->private_data; 7922 unsigned int ret = 0; 7923 7924 if (mddev->changed) 7925 ret = DISK_EVENT_MEDIA_CHANGE; 7926 mddev->changed = 0; 7927 return ret; 7928 } 7929 7930 static void md_free_disk(struct gendisk *disk) 7931 { 7932 struct mddev *mddev = disk->private_data; 7933 7934 mddev_free(mddev); 7935 } 7936 7937 const struct block_device_operations md_fops = 7938 { 7939 .owner = THIS_MODULE, 7940 .submit_bio = md_submit_bio, 7941 .open = md_open, 7942 .release = md_release, 7943 .ioctl = md_ioctl, 7944 #ifdef CONFIG_COMPAT 7945 .compat_ioctl = md_compat_ioctl, 7946 #endif 7947 .getgeo = md_getgeo, 7948 .check_events = md_check_events, 7949 .set_read_only = md_set_read_only, 7950 .free_disk = md_free_disk, 7951 }; 7952 7953 static int md_thread(void *arg) 7954 { 7955 struct md_thread *thread = arg; 7956 7957 /* 7958 * md_thread is a 'system-thread', it's priority should be very 7959 * high. We avoid resource deadlocks individually in each 7960 * raid personality. 
(RAID5 does preallocation) We also use RR and 7961 * the very same RT priority as kswapd, thus we will never get 7962 * into a priority inversion deadlock. 7963 * 7964 * we definitely have to have equal or higher priority than 7965 * bdflush, otherwise bdflush will deadlock if there are too 7966 * many dirty RAID5 blocks. 7967 */ 7968 7969 allow_signal(SIGKILL); 7970 while (!kthread_should_stop()) { 7971 7972 /* We need to wait INTERRUPTIBLE so that 7973 * we don't add to the load-average. 7974 * That means we need to be sure no signals are 7975 * pending 7976 */ 7977 if (signal_pending(current)) 7978 flush_signals(current); 7979 7980 wait_event_interruptible_timeout 7981 (thread->wqueue, 7982 test_bit(THREAD_WAKEUP, &thread->flags) 7983 || kthread_should_stop() || kthread_should_park(), 7984 thread->timeout); 7985 7986 clear_bit(THREAD_WAKEUP, &thread->flags); 7987 if (kthread_should_park()) 7988 kthread_parkme(); 7989 if (!kthread_should_stop()) 7990 thread->run(thread); 7991 } 7992 7993 return 0; 7994 } 7995 7996 static void md_wakeup_thread_directly(struct md_thread __rcu *thread) 7997 { 7998 struct md_thread *t; 7999 8000 rcu_read_lock(); 8001 t = rcu_dereference(thread); 8002 if (t) 8003 wake_up_process(t->tsk); 8004 rcu_read_unlock(); 8005 } 8006 8007 void md_wakeup_thread(struct md_thread __rcu *thread) 8008 { 8009 struct md_thread *t; 8010 8011 rcu_read_lock(); 8012 t = rcu_dereference(thread); 8013 if (t) { 8014 pr_debug("md: waking up MD thread %s.\n", t->tsk->comm); 8015 set_bit(THREAD_WAKEUP, &t->flags); 8016 wake_up(&t->wqueue); 8017 } 8018 rcu_read_unlock(); 8019 } 8020 EXPORT_SYMBOL(md_wakeup_thread); 8021 8022 struct md_thread *md_register_thread(void (*run) (struct md_thread *), 8023 struct mddev *mddev, const char *name) 8024 { 8025 struct md_thread *thread; 8026 8027 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); 8028 if (!thread) 8029 return NULL; 8030 8031 init_waitqueue_head(&thread->wqueue); 8032 8033 thread->run = run; 8034 thread->mddev = mddev; 8035 thread->timeout = MAX_SCHEDULE_TIMEOUT; 8036 thread->tsk = kthread_run(md_thread, thread, 8037 "%s_%s", 8038 mdname(thread->mddev), 8039 name); 8040 if (IS_ERR(thread->tsk)) { 8041 kfree(thread); 8042 return NULL; 8043 } 8044 return thread; 8045 } 8046 EXPORT_SYMBOL(md_register_thread); 8047 8048 void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp) 8049 { 8050 struct md_thread *thread = rcu_dereference_protected(*threadp, 8051 lockdep_is_held(&mddev->reconfig_mutex)); 8052 8053 if (!thread) 8054 return; 8055 8056 rcu_assign_pointer(*threadp, NULL); 8057 synchronize_rcu(); 8058 8059 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 8060 kthread_stop(thread->tsk); 8061 kfree(thread); 8062 } 8063 EXPORT_SYMBOL(md_unregister_thread); 8064 8065 void md_error(struct mddev *mddev, struct md_rdev *rdev) 8066 { 8067 if (!rdev || test_bit(Faulty, &rdev->flags)) 8068 return; 8069 8070 if (!mddev->pers || !mddev->pers->error_handler) 8071 return; 8072 mddev->pers->error_handler(mddev, rdev); 8073 8074 if (mddev->pers->level == 0 || mddev->pers->level == LEVEL_LINEAR) 8075 return; 8076 8077 if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) 8078 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8079 sysfs_notify_dirent_safe(rdev->sysfs_state); 8080 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8081 if (!test_bit(MD_BROKEN, &mddev->flags)) { 8082 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8083 md_wakeup_thread(mddev->thread); 8084 } 8085 if (mddev->event_work.func) 
8086 queue_work(md_misc_wq, &mddev->event_work); 8087 md_new_event(); 8088 } 8089 EXPORT_SYMBOL(md_error); 8090 8091 /* seq_file implementation /proc/mdstat */ 8092 8093 static void status_unused(struct seq_file *seq) 8094 { 8095 int i = 0; 8096 struct md_rdev *rdev; 8097 8098 seq_printf(seq, "unused devices: "); 8099 8100 list_for_each_entry(rdev, &pending_raid_disks, same_set) { 8101 i++; 8102 seq_printf(seq, "%pg ", rdev->bdev); 8103 } 8104 if (!i) 8105 seq_printf(seq, "<none>"); 8106 8107 seq_printf(seq, "\n"); 8108 } 8109 8110 static int status_resync(struct seq_file *seq, struct mddev *mddev) 8111 { 8112 sector_t max_sectors, resync, res; 8113 unsigned long dt, db = 0; 8114 sector_t rt, curr_mark_cnt, resync_mark_cnt; 8115 int scale, recovery_active; 8116 unsigned int per_milli; 8117 8118 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 8119 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 8120 max_sectors = mddev->resync_max_sectors; 8121 else 8122 max_sectors = mddev->dev_sectors; 8123 8124 resync = mddev->curr_resync; 8125 if (resync < MD_RESYNC_ACTIVE) { 8126 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 8127 /* Still cleaning up */ 8128 resync = max_sectors; 8129 } else if (resync > max_sectors) { 8130 resync = max_sectors; 8131 } else { 8132 res = atomic_read(&mddev->recovery_active); 8133 /* 8134 * Resync has started, but the subtraction has overflowed or 8135 * yielded one of the special values. Force it to active to 8136 * ensure the status reports an active resync. 8137 */ 8138 if (resync < res || resync - res < MD_RESYNC_ACTIVE) 8139 resync = MD_RESYNC_ACTIVE; 8140 else 8141 resync -= res; 8142 } 8143 8144 if (resync == MD_RESYNC_NONE) { 8145 if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { 8146 struct md_rdev *rdev; 8147 8148 rdev_for_each(rdev, mddev) 8149 if (rdev->raid_disk >= 0 && 8150 !test_bit(Faulty, &rdev->flags) && 8151 rdev->recovery_offset != MaxSector && 8152 rdev->recovery_offset) { 8153 seq_printf(seq, "\trecover=REMOTE"); 8154 return 1; 8155 } 8156 if (mddev->reshape_position != MaxSector) 8157 seq_printf(seq, "\treshape=REMOTE"); 8158 else 8159 seq_printf(seq, "\tresync=REMOTE"); 8160 return 1; 8161 } 8162 if (mddev->recovery_cp < MaxSector) { 8163 seq_printf(seq, "\tresync=PENDING"); 8164 return 1; 8165 } 8166 return 0; 8167 } 8168 if (resync < MD_RESYNC_ACTIVE) { 8169 seq_printf(seq, "\tresync=DELAYED"); 8170 return 1; 8171 } 8172 8173 WARN_ON(max_sectors == 0); 8174 /* Pick 'scale' such that (resync>>scale)*1000 will fit 8175 * in a sector_t, and (max_sectors>>scale) will fit in a 8176 * u32, as those are the requirements for sector_div. 8177 * Thus 'scale' must be at least 10 8178 */ 8179 scale = 10; 8180 if (sizeof(sector_t) > sizeof(unsigned long)) { 8181 while ( max_sectors/2 > (1ULL<<(scale+32))) 8182 scale++; 8183 } 8184 res = (resync>>scale)*1000; 8185 sector_div(res, (u32)((max_sectors>>scale)+1)); 8186 8187 per_milli = res; 8188 { 8189 int i, x = per_milli/50, y = 20-x; 8190 seq_printf(seq, "["); 8191 for (i = 0; i < x; i++) 8192 seq_printf(seq, "="); 8193 seq_printf(seq, ">"); 8194 for (i = 0; i < y; i++) 8195 seq_printf(seq, "."); 8196 seq_printf(seq, "] "); 8197 } 8198 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 8199 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 8200 "reshape" : 8201 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 8202 "check" : 8203 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 
8204 "resync" : "recovery"))), 8205 per_milli/10, per_milli % 10, 8206 (unsigned long long) resync/2, 8207 (unsigned long long) max_sectors/2); 8208 8209 /* 8210 * dt: time from mark until now 8211 * db: blocks written from mark until now 8212 * rt: remaining time 8213 * 8214 * rt is a sector_t, which is always 64bit now. We are keeping 8215 * the original algorithm, but it is not really necessary. 8216 * 8217 * Original algorithm: 8218 * So we divide before multiply in case it is 32bit and close 8219 * to the limit. 8220 * We scale the divisor (db) by 32 to avoid losing precision 8221 * near the end of resync when the number of remaining sectors 8222 * is close to 'db'. 8223 * We then divide rt by 32 after multiplying by db to compensate. 8224 * The '+1' avoids division by zero if db is very small. 8225 */ 8226 dt = ((jiffies - mddev->resync_mark) / HZ); 8227 if (!dt) dt++; 8228 8229 curr_mark_cnt = mddev->curr_mark_cnt; 8230 recovery_active = atomic_read(&mddev->recovery_active); 8231 resync_mark_cnt = mddev->resync_mark_cnt; 8232 8233 if (curr_mark_cnt >= (recovery_active + resync_mark_cnt)) 8234 db = curr_mark_cnt - (recovery_active + resync_mark_cnt); 8235 8236 rt = max_sectors - resync; /* number of remaining sectors */ 8237 rt = div64_u64(rt, db/32+1); 8238 rt *= dt; 8239 rt >>= 5; 8240 8241 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, 8242 ((unsigned long)rt % 60)/6); 8243 8244 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 8245 return 1; 8246 } 8247 8248 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 8249 __acquires(&all_mddevs_lock) 8250 { 8251 struct md_personality *pers; 8252 8253 seq_puts(seq, "Personalities : "); 8254 spin_lock(&pers_lock); 8255 list_for_each_entry(pers, &pers_list, list) 8256 seq_printf(seq, "[%s] ", pers->name); 8257 8258 spin_unlock(&pers_lock); 8259 seq_puts(seq, "\n"); 8260 seq->poll_event = atomic_read(&md_event_count); 8261 8262 spin_lock(&all_mddevs_lock); 8263 8264 return seq_list_start(&all_mddevs, *pos); 8265 } 8266 8267 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 8268 { 8269 return seq_list_next(v, &all_mddevs, pos); 8270 } 8271 8272 static void md_seq_stop(struct seq_file *seq, void *v) 8273 __releases(&all_mddevs_lock) 8274 { 8275 status_unused(seq); 8276 spin_unlock(&all_mddevs_lock); 8277 } 8278 8279 static int md_seq_show(struct seq_file *seq, void *v) 8280 { 8281 struct mddev *mddev = list_entry(v, struct mddev, all_mddevs); 8282 sector_t sectors; 8283 struct md_rdev *rdev; 8284 8285 if (!mddev_get(mddev)) 8286 return 0; 8287 8288 spin_unlock(&all_mddevs_lock); 8289 spin_lock(&mddev->lock); 8290 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 8291 seq_printf(seq, "%s : %sactive", mdname(mddev), 8292 mddev->pers ? 
"" : "in"); 8293 if (mddev->pers) { 8294 if (mddev->ro == MD_RDONLY) 8295 seq_printf(seq, " (read-only)"); 8296 if (mddev->ro == MD_AUTO_READ) 8297 seq_printf(seq, " (auto-read-only)"); 8298 seq_printf(seq, " %s", mddev->pers->name); 8299 } 8300 8301 sectors = 0; 8302 rcu_read_lock(); 8303 rdev_for_each_rcu(rdev, mddev) { 8304 seq_printf(seq, " %pg[%d]", rdev->bdev, rdev->desc_nr); 8305 8306 if (test_bit(WriteMostly, &rdev->flags)) 8307 seq_printf(seq, "(W)"); 8308 if (test_bit(Journal, &rdev->flags)) 8309 seq_printf(seq, "(J)"); 8310 if (test_bit(Faulty, &rdev->flags)) { 8311 seq_printf(seq, "(F)"); 8312 continue; 8313 } 8314 if (rdev->raid_disk < 0) 8315 seq_printf(seq, "(S)"); /* spare */ 8316 if (test_bit(Replacement, &rdev->flags)) 8317 seq_printf(seq, "(R)"); 8318 sectors += rdev->sectors; 8319 } 8320 rcu_read_unlock(); 8321 8322 if (!list_empty(&mddev->disks)) { 8323 if (mddev->pers) 8324 seq_printf(seq, "\n %llu blocks", 8325 (unsigned long long) 8326 mddev->array_sectors / 2); 8327 else 8328 seq_printf(seq, "\n %llu blocks", 8329 (unsigned long long)sectors / 2); 8330 } 8331 if (mddev->persistent) { 8332 if (mddev->major_version != 0 || 8333 mddev->minor_version != 90) { 8334 seq_printf(seq," super %d.%d", 8335 mddev->major_version, 8336 mddev->minor_version); 8337 } 8338 } else if (mddev->external) 8339 seq_printf(seq, " super external:%s", 8340 mddev->metadata_type); 8341 else 8342 seq_printf(seq, " super non-persistent"); 8343 8344 if (mddev->pers) { 8345 mddev->pers->status(seq, mddev); 8346 seq_printf(seq, "\n "); 8347 if (mddev->pers->sync_request) { 8348 if (status_resync(seq, mddev)) 8349 seq_printf(seq, "\n "); 8350 } 8351 } else 8352 seq_printf(seq, "\n "); 8353 8354 md_bitmap_status(seq, mddev->bitmap); 8355 8356 seq_printf(seq, "\n"); 8357 } 8358 spin_unlock(&mddev->lock); 8359 spin_lock(&all_mddevs_lock); 8360 if (atomic_dec_and_test(&mddev->active)) 8361 __mddev_put(mddev); 8362 8363 return 0; 8364 } 8365 8366 static const struct seq_operations md_seq_ops = { 8367 .start = md_seq_start, 8368 .next = md_seq_next, 8369 .stop = md_seq_stop, 8370 .show = md_seq_show, 8371 }; 8372 8373 static int md_seq_open(struct inode *inode, struct file *file) 8374 { 8375 struct seq_file *seq; 8376 int error; 8377 8378 error = seq_open(file, &md_seq_ops); 8379 if (error) 8380 return error; 8381 8382 seq = file->private_data; 8383 seq->poll_event = atomic_read(&md_event_count); 8384 return error; 8385 } 8386 8387 static int md_unloading; 8388 static __poll_t mdstat_poll(struct file *filp, poll_table *wait) 8389 { 8390 struct seq_file *seq = filp->private_data; 8391 __poll_t mask; 8392 8393 if (md_unloading) 8394 return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; 8395 poll_wait(filp, &md_event_waiters, wait); 8396 8397 /* always allow read */ 8398 mask = EPOLLIN | EPOLLRDNORM; 8399 8400 if (seq->poll_event != atomic_read(&md_event_count)) 8401 mask |= EPOLLERR | EPOLLPRI; 8402 return mask; 8403 } 8404 8405 static const struct proc_ops mdstat_proc_ops = { 8406 .proc_open = md_seq_open, 8407 .proc_read = seq_read, 8408 .proc_lseek = seq_lseek, 8409 .proc_release = seq_release, 8410 .proc_poll = mdstat_poll, 8411 }; 8412 8413 int register_md_personality(struct md_personality *p) 8414 { 8415 pr_debug("md: %s personality registered for level %d\n", 8416 p->name, p->level); 8417 spin_lock(&pers_lock); 8418 list_add_tail(&p->list, &pers_list); 8419 spin_unlock(&pers_lock); 8420 return 0; 8421 } 8422 EXPORT_SYMBOL(register_md_personality); 8423 8424 int unregister_md_personality(struct 
md_personality *p) 8425 { 8426 pr_debug("md: %s personality unregistered\n", p->name); 8427 spin_lock(&pers_lock); 8428 list_del_init(&p->list); 8429 spin_unlock(&pers_lock); 8430 return 0; 8431 } 8432 EXPORT_SYMBOL(unregister_md_personality); 8433 8434 int register_md_cluster_operations(struct md_cluster_operations *ops, 8435 struct module *module) 8436 { 8437 int ret = 0; 8438 spin_lock(&pers_lock); 8439 if (md_cluster_ops != NULL) 8440 ret = -EALREADY; 8441 else { 8442 md_cluster_ops = ops; 8443 md_cluster_mod = module; 8444 } 8445 spin_unlock(&pers_lock); 8446 return ret; 8447 } 8448 EXPORT_SYMBOL(register_md_cluster_operations); 8449 8450 int unregister_md_cluster_operations(void) 8451 { 8452 spin_lock(&pers_lock); 8453 md_cluster_ops = NULL; 8454 spin_unlock(&pers_lock); 8455 return 0; 8456 } 8457 EXPORT_SYMBOL(unregister_md_cluster_operations); 8458 8459 int md_setup_cluster(struct mddev *mddev, int nodes) 8460 { 8461 int ret; 8462 if (!md_cluster_ops) 8463 request_module("md-cluster"); 8464 spin_lock(&pers_lock); 8465 /* ensure module won't be unloaded */ 8466 if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { 8467 pr_warn("can't find md-cluster module or get its reference.\n"); 8468 spin_unlock(&pers_lock); 8469 return -ENOENT; 8470 } 8471 spin_unlock(&pers_lock); 8472 8473 ret = md_cluster_ops->join(mddev, nodes); 8474 if (!ret) 8475 mddev->safemode_delay = 0; 8476 return ret; 8477 } 8478 8479 void md_cluster_stop(struct mddev *mddev) 8480 { 8481 if (!md_cluster_ops) 8482 return; 8483 md_cluster_ops->leave(mddev); 8484 module_put(md_cluster_mod); 8485 } 8486 8487 static int is_mddev_idle(struct mddev *mddev, int init) 8488 { 8489 struct md_rdev *rdev; 8490 int idle; 8491 int curr_events; 8492 8493 idle = 1; 8494 rcu_read_lock(); 8495 rdev_for_each_rcu(rdev, mddev) { 8496 struct gendisk *disk = rdev->bdev->bd_disk; 8497 curr_events = (int)part_stat_read_accum(disk->part0, sectors) - 8498 atomic_read(&disk->sync_io); 8499 /* sync IO will cause sync_io to increase before the disk_stats 8500 * as sync_io is counted when a request starts, and 8501 * disk_stats is counted when it completes. 8502 * So resync activity will cause curr_events to be smaller than 8503 * when there was no such activity. 8504 * non-sync IO will cause disk_stat to increase without 8505 * increasing sync_io so curr_events will (eventually) 8506 * be larger than it was before. Once it becomes 8507 * substantially larger, the test below will cause 8508 * the array to appear non-idle, and resync will slow 8509 * down. 8510 * If there is a lot of outstanding resync activity when 8511 * we set last_event to curr_events, then all that activity 8512 * completing might cause the array to appear non-idle 8513 * and resync will be slowed down even though there might 8514 * not have been non-resync activity. This will only 8515 * happen once though. 'last_events' will soon reflect 8516 * the state where there is little or no outstanding 8517 * resync requests, and further resync activity will 8518 * always make curr_events less than last_events. 
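		 *
		 * Worked example (illustrative numbers only, not taken from the
		 * source): if part_stat_read_accum() reports 10064 sectors
		 * completed while sync_io records 10000 sync sectors started,
		 * curr_events is 64; the array is only treated as non-idle once
		 * curr_events grows more than 64 beyond the stored last_events,
		 * i.e. once enough non-resync IO has completed to matter.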
		 *
		 */
		if (init || curr_events - rdev->last_events > 64) {
			rdev->last_events = curr_events;
			idle = 0;
		}
	}
	rcu_read_unlock();
	return idle;
}

void md_done_sync(struct mddev *mddev, int blocks, int ok)
{
	/* another "blocks" (512byte) blocks have been synced */
	atomic_sub(blocks, &mddev->recovery_active);
	wake_up(&mddev->recovery_wait);
	if (!ok) {
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		// stop recovery, signal do_sync ....
	}
}
EXPORT_SYMBOL(md_done_sync);

/* md_write_start(mddev, bi)
 * If we need to update some array metadata (e.g. 'active' flag
 * in superblock) before writing, schedule a superblock update
 * and wait for it to complete.
 * A return value of 'false' means that the write wasn't recorded
 * and cannot proceed as the array is being suspended.
 */
bool md_write_start(struct mddev *mddev, struct bio *bi)
{
	int did_change = 0;

	if (bio_data_dir(bi) != WRITE)
		return true;

	BUG_ON(mddev->ro == MD_RDONLY);
	if (mddev->ro == MD_AUTO_READ) {
		/* need to switch to read/write */
		flush_work(&mddev->sync_work);
		mddev->ro = MD_RDWR;
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		md_wakeup_thread(mddev->sync_thread);
		did_change = 1;
	}
	rcu_read_lock();
	percpu_ref_get(&mddev->writes_pending);
	smp_mb(); /* Match smp_mb in set_in_sync() */
	if (mddev->safemode == 1)
		mddev->safemode = 0;
	/* sync_checkers is always 0 when writes_pending is in per-cpu mode */
	if (mddev->in_sync || mddev->sync_checkers) {
		spin_lock(&mddev->lock);
		if (mddev->in_sync) {
			mddev->in_sync = 0;
			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
			set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
			md_wakeup_thread(mddev->thread);
			did_change = 1;
		}
		spin_unlock(&mddev->lock);
	}
	rcu_read_unlock();
	if (did_change)
		sysfs_notify_dirent_safe(mddev->sysfs_state);
	if (!mddev->has_superblocks)
		return true;
	wait_event(mddev->sb_wait,
		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
		   is_md_suspended(mddev));
	if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
		percpu_ref_put(&mddev->writes_pending);
		return false;
	}
	return true;
}
EXPORT_SYMBOL(md_write_start);

/* md_write_inc can only be called when md_write_start() has
 * already been called at least once for the current request.
 * It increments the counter and is useful when a single request
 * is split into several parts. Each part causes an increment and
 * so needs a matching md_write_end().
 * Unlike md_write_start(), it is safe to call md_write_inc() inside
 * a spinlocked region.
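 *
 * Purely illustrative sketch (not a quote of any specific personality) of
 * the expected pairing when one incoming bio is split into two parts:
 *
 *	if (!md_write_start(mddev, bio))
 *		return;				// array is suspended
 *	md_write_inc(mddev, bio);		// account the second part
 *	...submit both parts...
 *	md_write_end(mddev);			// matches md_write_inc()
 *	md_write_end(mddev);			// matches md_write_start()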
8608 */ 8609 void md_write_inc(struct mddev *mddev, struct bio *bi) 8610 { 8611 if (bio_data_dir(bi) != WRITE) 8612 return; 8613 WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev)); 8614 percpu_ref_get(&mddev->writes_pending); 8615 } 8616 EXPORT_SYMBOL(md_write_inc); 8617 8618 void md_write_end(struct mddev *mddev) 8619 { 8620 percpu_ref_put(&mddev->writes_pending); 8621 8622 if (mddev->safemode == 2) 8623 md_wakeup_thread(mddev->thread); 8624 else if (mddev->safemode_delay) 8625 /* The roundup() ensures this only performs locking once 8626 * every ->safemode_delay jiffies 8627 */ 8628 mod_timer(&mddev->safemode_timer, 8629 roundup(jiffies, mddev->safemode_delay) + 8630 mddev->safemode_delay); 8631 } 8632 8633 EXPORT_SYMBOL(md_write_end); 8634 8635 /* This is used by raid0 and raid10 */ 8636 void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, 8637 struct bio *bio, sector_t start, sector_t size) 8638 { 8639 struct bio *discard_bio = NULL; 8640 8641 if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO, 8642 &discard_bio) || !discard_bio) 8643 return; 8644 8645 bio_chain(discard_bio, bio); 8646 bio_clone_blkg_association(discard_bio, bio); 8647 if (mddev->gendisk) 8648 trace_block_bio_remap(discard_bio, 8649 disk_devt(mddev->gendisk), 8650 bio->bi_iter.bi_sector); 8651 submit_bio_noacct(discard_bio); 8652 } 8653 EXPORT_SYMBOL_GPL(md_submit_discard_bio); 8654 8655 static void md_end_clone_io(struct bio *bio) 8656 { 8657 struct md_io_clone *md_io_clone = bio->bi_private; 8658 struct bio *orig_bio = md_io_clone->orig_bio; 8659 struct mddev *mddev = md_io_clone->mddev; 8660 8661 if (bio->bi_status && !orig_bio->bi_status) 8662 orig_bio->bi_status = bio->bi_status; 8663 8664 if (md_io_clone->start_time) 8665 bio_end_io_acct(orig_bio, md_io_clone->start_time); 8666 8667 bio_put(bio); 8668 bio_endio(orig_bio); 8669 percpu_ref_put(&mddev->active_io); 8670 } 8671 8672 static void md_clone_bio(struct mddev *mddev, struct bio **bio) 8673 { 8674 struct block_device *bdev = (*bio)->bi_bdev; 8675 struct md_io_clone *md_io_clone; 8676 struct bio *clone = 8677 bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set); 8678 8679 md_io_clone = container_of(clone, struct md_io_clone, bio_clone); 8680 md_io_clone->orig_bio = *bio; 8681 md_io_clone->mddev = mddev; 8682 if (blk_queue_io_stat(bdev->bd_disk->queue)) 8683 md_io_clone->start_time = bio_start_io_acct(*bio); 8684 8685 clone->bi_end_io = md_end_clone_io; 8686 clone->bi_private = md_io_clone; 8687 *bio = clone; 8688 } 8689 8690 void md_account_bio(struct mddev *mddev, struct bio **bio) 8691 { 8692 percpu_ref_get(&mddev->active_io); 8693 md_clone_bio(mddev, bio); 8694 } 8695 EXPORT_SYMBOL_GPL(md_account_bio); 8696 8697 /* md_allow_write(mddev) 8698 * Calling this ensures that the array is marked 'active' so that writes 8699 * may proceed without blocking. It is important to call this before 8700 * attempting a GFP_KERNEL allocation while holding the mddev lock. 8701 * Must be called with mddev_lock held. 
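 *
 * Illustrative calling pattern (a sketch under those assumptions, not a
 * quote of any real caller):
 *
 *	mddev_lock(mddev);
 *	md_allow_write(mddev);
 *	new = kzalloc(sz, GFP_KERNEL);	// reclaim writeback to the array
 *					// can now proceed without blocking
 *	...
 *	mddev_unlock(mddev);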
8702 */ 8703 void md_allow_write(struct mddev *mddev) 8704 { 8705 if (!mddev->pers) 8706 return; 8707 if (!md_is_rdwr(mddev)) 8708 return; 8709 if (!mddev->pers->sync_request) 8710 return; 8711 8712 spin_lock(&mddev->lock); 8713 if (mddev->in_sync) { 8714 mddev->in_sync = 0; 8715 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 8716 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 8717 if (mddev->safemode_delay && 8718 mddev->safemode == 0) 8719 mddev->safemode = 1; 8720 spin_unlock(&mddev->lock); 8721 md_update_sb(mddev, 0); 8722 sysfs_notify_dirent_safe(mddev->sysfs_state); 8723 /* wait for the dirty state to be recorded in the metadata */ 8724 wait_event(mddev->sb_wait, 8725 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 8726 } else 8727 spin_unlock(&mddev->lock); 8728 } 8729 EXPORT_SYMBOL_GPL(md_allow_write); 8730 8731 #define SYNC_MARKS 10 8732 #define SYNC_MARK_STEP (3*HZ) 8733 #define UPDATE_FREQUENCY (5*60*HZ) 8734 void md_do_sync(struct md_thread *thread) 8735 { 8736 struct mddev *mddev = thread->mddev; 8737 struct mddev *mddev2; 8738 unsigned int currspeed = 0, window; 8739 sector_t max_sectors,j, io_sectors, recovery_done; 8740 unsigned long mark[SYNC_MARKS]; 8741 unsigned long update_time; 8742 sector_t mark_cnt[SYNC_MARKS]; 8743 int last_mark,m; 8744 sector_t last_check; 8745 int skipped = 0; 8746 struct md_rdev *rdev; 8747 char *desc, *action = NULL; 8748 struct blk_plug plug; 8749 int ret; 8750 8751 /* just incase thread restarts... */ 8752 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 8753 test_bit(MD_RECOVERY_WAIT, &mddev->recovery)) 8754 return; 8755 if (!md_is_rdwr(mddev)) {/* never try to sync a read-only array */ 8756 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8757 return; 8758 } 8759 8760 if (mddev_is_clustered(mddev)) { 8761 ret = md_cluster_ops->resync_start(mddev); 8762 if (ret) 8763 goto skip; 8764 8765 set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags); 8766 if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 8767 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || 8768 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) 8769 && ((unsigned long long)mddev->curr_resync_completed 8770 < (unsigned long long)mddev->resync_max_sectors)) 8771 goto skip; 8772 } 8773 8774 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 8775 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { 8776 desc = "data-check"; 8777 action = "check"; 8778 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 8779 desc = "requested-resync"; 8780 action = "repair"; 8781 } else 8782 desc = "resync"; 8783 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 8784 desc = "reshape"; 8785 else 8786 desc = "recovery"; 8787 8788 mddev->last_sync_action = action ?: desc; 8789 8790 /* 8791 * Before starting a resync we must have set curr_resync to 8792 * 2, and then checked that every "conflicting" array has curr_resync 8793 * less than ours. When we find one that is the same or higher 8794 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 8795 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 8796 * This will mean we have to start checking from the beginning again. 
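	 *
	 * In the code below the value 2 referred to here is MD_RESYNC_DELAYED
	 * and the value 1 is MD_RESYNC_YIELDED; curr_resync only moves on to
	 * real sector counts (>= MD_RESYNC_ACTIVE) once no conflicting array
	 * is found.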
8797 * 8798 */ 8799 8800 do { 8801 int mddev2_minor = -1; 8802 mddev->curr_resync = MD_RESYNC_DELAYED; 8803 8804 try_again: 8805 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 8806 goto skip; 8807 spin_lock(&all_mddevs_lock); 8808 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) { 8809 if (test_bit(MD_DELETED, &mddev2->flags)) 8810 continue; 8811 if (mddev2 == mddev) 8812 continue; 8813 if (!mddev->parallel_resync 8814 && mddev2->curr_resync 8815 && match_mddev_units(mddev, mddev2)) { 8816 DEFINE_WAIT(wq); 8817 if (mddev < mddev2 && 8818 mddev->curr_resync == MD_RESYNC_DELAYED) { 8819 /* arbitrarily yield */ 8820 mddev->curr_resync = MD_RESYNC_YIELDED; 8821 wake_up(&resync_wait); 8822 } 8823 if (mddev > mddev2 && 8824 mddev->curr_resync == MD_RESYNC_YIELDED) 8825 /* no need to wait here, we can wait the next 8826 * time 'round when curr_resync == 2 8827 */ 8828 continue; 8829 /* We need to wait 'interruptible' so as not to 8830 * contribute to the load average, and not to 8831 * be caught by 'softlockup' 8832 */ 8833 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 8834 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 8835 mddev2->curr_resync >= mddev->curr_resync) { 8836 if (mddev2_minor != mddev2->md_minor) { 8837 mddev2_minor = mddev2->md_minor; 8838 pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n", 8839 desc, mdname(mddev), 8840 mdname(mddev2)); 8841 } 8842 spin_unlock(&all_mddevs_lock); 8843 8844 if (signal_pending(current)) 8845 flush_signals(current); 8846 schedule(); 8847 finish_wait(&resync_wait, &wq); 8848 goto try_again; 8849 } 8850 finish_wait(&resync_wait, &wq); 8851 } 8852 } 8853 spin_unlock(&all_mddevs_lock); 8854 } while (mddev->curr_resync < MD_RESYNC_DELAYED); 8855 8856 j = 0; 8857 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 8858 /* resync follows the size requested by the personality, 8859 * which defaults to physical size, but can be virtual size 8860 */ 8861 max_sectors = mddev->resync_max_sectors; 8862 atomic64_set(&mddev->resync_mismatches, 0); 8863 /* we don't use the checkpoint if there's a bitmap */ 8864 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 8865 j = mddev->resync_min; 8866 else if (!mddev->bitmap) 8867 j = mddev->recovery_cp; 8868 8869 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 8870 max_sectors = mddev->resync_max_sectors; 8871 /* 8872 * If the original node aborts reshaping then we continue the 8873 * reshaping, so set j again to avoid restart reshape from the 8874 * first beginning 8875 */ 8876 if (mddev_is_clustered(mddev) && 8877 mddev->reshape_position != MaxSector) 8878 j = mddev->reshape_position; 8879 } else { 8880 /* recovery follows the physical size of devices */ 8881 max_sectors = mddev->dev_sectors; 8882 j = MaxSector; 8883 rcu_read_lock(); 8884 rdev_for_each_rcu(rdev, mddev) 8885 if (rdev->raid_disk >= 0 && 8886 !test_bit(Journal, &rdev->flags) && 8887 !test_bit(Faulty, &rdev->flags) && 8888 !test_bit(In_sync, &rdev->flags) && 8889 rdev->recovery_offset < j) 8890 j = rdev->recovery_offset; 8891 rcu_read_unlock(); 8892 8893 /* If there is a bitmap, we need to make sure all 8894 * writes that started before we added a spare 8895 * complete before we start doing a recovery. 8896 * Otherwise the write might complete and (via 8897 * bitmap_endwrite) set a bit in the bitmap after the 8898 * recovery has checked that bit and skipped that 8899 * region. 
8900 */ 8901 if (mddev->bitmap) { 8902 mddev->pers->quiesce(mddev, 1); 8903 mddev->pers->quiesce(mddev, 0); 8904 } 8905 } 8906 8907 pr_info("md: %s of RAID array %s\n", desc, mdname(mddev)); 8908 pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev)); 8909 pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n", 8910 speed_max(mddev), desc); 8911 8912 is_mddev_idle(mddev, 1); /* this initializes IO event counters */ 8913 8914 io_sectors = 0; 8915 for (m = 0; m < SYNC_MARKS; m++) { 8916 mark[m] = jiffies; 8917 mark_cnt[m] = io_sectors; 8918 } 8919 last_mark = 0; 8920 mddev->resync_mark = mark[last_mark]; 8921 mddev->resync_mark_cnt = mark_cnt[last_mark]; 8922 8923 /* 8924 * Tune reconstruction: 8925 */ 8926 window = 32 * (PAGE_SIZE / 512); 8927 pr_debug("md: using %dk window, over a total of %lluk.\n", 8928 window/2, (unsigned long long)max_sectors/2); 8929 8930 atomic_set(&mddev->recovery_active, 0); 8931 last_check = 0; 8932 8933 if (j >= MD_RESYNC_ACTIVE) { 8934 pr_debug("md: resuming %s of %s from checkpoint.\n", 8935 desc, mdname(mddev)); 8936 mddev->curr_resync = j; 8937 } else 8938 mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */ 8939 mddev->curr_resync_completed = j; 8940 sysfs_notify_dirent_safe(mddev->sysfs_completed); 8941 md_new_event(); 8942 update_time = jiffies; 8943 8944 blk_start_plug(&plug); 8945 while (j < max_sectors) { 8946 sector_t sectors; 8947 8948 skipped = 0; 8949 8950 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 8951 ((mddev->curr_resync > mddev->curr_resync_completed && 8952 (mddev->curr_resync - mddev->curr_resync_completed) 8953 > (max_sectors >> 4)) || 8954 time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || 8955 (j - mddev->curr_resync_completed)*2 8956 >= mddev->resync_max - mddev->curr_resync_completed || 8957 mddev->curr_resync_completed > mddev->resync_max 8958 )) { 8959 /* time to update curr_resync_completed */ 8960 wait_event(mddev->recovery_wait, 8961 atomic_read(&mddev->recovery_active) == 0); 8962 mddev->curr_resync_completed = j; 8963 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 8964 j > mddev->recovery_cp) 8965 mddev->recovery_cp = j; 8966 update_time = jiffies; 8967 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 8968 sysfs_notify_dirent_safe(mddev->sysfs_completed); 8969 } 8970 8971 while (j >= mddev->resync_max && 8972 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 8973 /* As this condition is controlled by user-space, 8974 * we can block indefinitely, so use '_interruptible' 8975 * to avoid triggering warnings. 8976 */ 8977 flush_signals(current); /* just in case */ 8978 wait_event_interruptible(mddev->recovery_wait, 8979 mddev->resync_max > j 8980 || test_bit(MD_RECOVERY_INTR, 8981 &mddev->recovery)); 8982 } 8983 8984 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 8985 break; 8986 8987 sectors = mddev->pers->sync_request(mddev, j, &skipped); 8988 if (sectors == 0) { 8989 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8990 break; 8991 } 8992 8993 if (!skipped) { /* actual IO requested */ 8994 io_sectors += sectors; 8995 atomic_add(sectors, &mddev->recovery_active); 8996 } 8997 8998 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 8999 break; 9000 9001 j += sectors; 9002 if (j > max_sectors) 9003 /* when skipping, extra large numbers can be returned. 
*/ 9004 j = max_sectors; 9005 if (j >= MD_RESYNC_ACTIVE) 9006 mddev->curr_resync = j; 9007 mddev->curr_mark_cnt = io_sectors; 9008 if (last_check == 0) 9009 /* this is the earliest that rebuild will be 9010 * visible in /proc/mdstat 9011 */ 9012 md_new_event(); 9013 9014 if (last_check + window > io_sectors || j == max_sectors) 9015 continue; 9016 9017 last_check = io_sectors; 9018 repeat: 9019 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 9020 /* step marks */ 9021 int next = (last_mark+1) % SYNC_MARKS; 9022 9023 mddev->resync_mark = mark[next]; 9024 mddev->resync_mark_cnt = mark_cnt[next]; 9025 mark[next] = jiffies; 9026 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 9027 last_mark = next; 9028 } 9029 9030 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9031 break; 9032 9033 /* 9034 * this loop exits only if either when we are slower than 9035 * the 'hard' speed limit, or the system was IO-idle for 9036 * a jiffy. 9037 * the system might be non-idle CPU-wise, but we only care 9038 * about not overloading the IO subsystem. (things like an 9039 * e2fsck being done on the RAID array should execute fast) 9040 */ 9041 cond_resched(); 9042 9043 recovery_done = io_sectors - atomic_read(&mddev->recovery_active); 9044 currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2 9045 /((jiffies-mddev->resync_mark)/HZ +1) +1; 9046 9047 if (currspeed > speed_min(mddev)) { 9048 if (currspeed > speed_max(mddev)) { 9049 msleep(500); 9050 goto repeat; 9051 } 9052 if (!is_mddev_idle(mddev, 0)) { 9053 /* 9054 * Give other IO more of a chance. 9055 * The faster the devices, the less we wait. 9056 */ 9057 wait_event(mddev->recovery_wait, 9058 !atomic_read(&mddev->recovery_active)); 9059 } 9060 } 9061 } 9062 pr_info("md: %s: %s %s.\n",mdname(mddev), desc, 9063 test_bit(MD_RECOVERY_INTR, &mddev->recovery) 9064 ? 
"interrupted" : "done"); 9065 /* 9066 * this also signals 'finished resyncing' to md_stop 9067 */ 9068 blk_finish_plug(&plug); 9069 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 9070 9071 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9072 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9073 mddev->curr_resync >= MD_RESYNC_ACTIVE) { 9074 mddev->curr_resync_completed = mddev->curr_resync; 9075 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9076 } 9077 mddev->pers->sync_request(mddev, max_sectors, &skipped); 9078 9079 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 9080 mddev->curr_resync > MD_RESYNC_ACTIVE) { 9081 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 9082 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9083 if (mddev->curr_resync >= mddev->recovery_cp) { 9084 pr_debug("md: checkpointing %s of %s.\n", 9085 desc, mdname(mddev)); 9086 if (test_bit(MD_RECOVERY_ERROR, 9087 &mddev->recovery)) 9088 mddev->recovery_cp = 9089 mddev->curr_resync_completed; 9090 else 9091 mddev->recovery_cp = 9092 mddev->curr_resync; 9093 } 9094 } else 9095 mddev->recovery_cp = MaxSector; 9096 } else { 9097 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9098 mddev->curr_resync = MaxSector; 9099 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9100 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { 9101 rcu_read_lock(); 9102 rdev_for_each_rcu(rdev, mddev) 9103 if (rdev->raid_disk >= 0 && 9104 mddev->delta_disks >= 0 && 9105 !test_bit(Journal, &rdev->flags) && 9106 !test_bit(Faulty, &rdev->flags) && 9107 !test_bit(In_sync, &rdev->flags) && 9108 rdev->recovery_offset < mddev->curr_resync) 9109 rdev->recovery_offset = mddev->curr_resync; 9110 rcu_read_unlock(); 9111 } 9112 } 9113 } 9114 skip: 9115 /* set CHANGE_PENDING here since maybe another update is needed, 9116 * so other nodes are informed. It should be harmless for normal 9117 * raid */ 9118 set_mask_bits(&mddev->sb_flags, 0, 9119 BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); 9120 9121 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9122 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9123 mddev->delta_disks > 0 && 9124 mddev->pers->finish_reshape && 9125 mddev->pers->size && 9126 mddev->queue) { 9127 mddev_lock_nointr(mddev); 9128 md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); 9129 mddev_unlock(mddev); 9130 if (!mddev_is_clustered(mddev)) 9131 set_capacity_and_notify(mddev->gendisk, 9132 mddev->array_sectors); 9133 } 9134 9135 spin_lock(&mddev->lock); 9136 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9137 /* We completed so min/max setting can be forgotten if used. */ 9138 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 9139 mddev->resync_min = 0; 9140 mddev->resync_max = MaxSector; 9141 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 9142 mddev->resync_min = mddev->curr_resync_completed; 9143 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 9144 mddev->curr_resync = MD_RESYNC_NONE; 9145 spin_unlock(&mddev->lock); 9146 9147 wake_up(&resync_wait); 9148 md_wakeup_thread(mddev->thread); 9149 return; 9150 } 9151 EXPORT_SYMBOL_GPL(md_do_sync); 9152 9153 static bool rdev_removeable(struct md_rdev *rdev) 9154 { 9155 /* rdev is not used. */ 9156 if (rdev->raid_disk < 0) 9157 return false; 9158 9159 /* There are still inflight io, don't remove this rdev. */ 9160 if (atomic_read(&rdev->nr_pending)) 9161 return false; 9162 9163 /* 9164 * An error occurred but has not yet been acknowledged by the metadata 9165 * handler, don't remove this rdev. 
	 */
	if (test_bit(Blocked, &rdev->flags))
		return false;

	/* Faulty rdev is not used, it's safe to remove it. */
	if (test_bit(Faulty, &rdev->flags))
		return true;

	/* Journal disk can only be removed if it's faulty. */
	if (test_bit(Journal, &rdev->flags))
		return false;

	/*
	 * 'In_sync' is cleared while 'raid_disk' is valid, which means
	 * replacement has just become active from pers->spare_active(), and
	 * then pers->hot_remove_disk() will replace this rdev with replacement.
	 */
	if (!test_bit(In_sync, &rdev->flags))
		return true;

	return false;
}

static bool rdev_is_spare(struct md_rdev *rdev)
{
	return !test_bit(Candidate, &rdev->flags) && rdev->raid_disk >= 0 &&
	       !test_bit(In_sync, &rdev->flags) &&
	       !test_bit(Journal, &rdev->flags) &&
	       !test_bit(Faulty, &rdev->flags);
}

static bool rdev_addable(struct md_rdev *rdev)
{
	/* rdev is already used, don't add it again. */
	if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 ||
	    test_bit(Faulty, &rdev->flags))
		return false;

	/* Allow adding a journal disk. */
	if (test_bit(Journal, &rdev->flags))
		return true;

	/* Allow adding if the array is read-write. */
	if (md_is_rdwr(rdev->mddev))
		return true;

	/*
	 * For a read-only array, only allow re-adding an rdev. And if a
	 * bitmap is used, don't allow re-adding an rdev that is too old.
	 */
	if (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags))
		return true;

	return false;
}

static bool md_spares_need_change(struct mddev *mddev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev)
		if (rdev_removeable(rdev) || rdev_addable(rdev))
			return true;
	return false;
}

static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this)
{
	struct md_rdev *rdev;
	int spares = 0;
	int removed = 0;
	bool remove_some = false;

	if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		/* Mustn't remove devices when resync thread is running */
		return 0;

	rdev_for_each(rdev, mddev) {
		if ((this == NULL || rdev == this) &&
		    rdev->raid_disk >= 0 &&
		    !test_bit(Blocked, &rdev->flags) &&
		    test_bit(Faulty, &rdev->flags) &&
		    atomic_read(&rdev->nr_pending)==0) {
			/* Faulty non-Blocked devices with nr_pending == 0
			 * never get nr_pending incremented,
			 * never get Faulty cleared, and never get Blocked set.
9253 * So we can synchronize_rcu now rather than once per device 9254 */ 9255 remove_some = true; 9256 set_bit(RemoveSynchronized, &rdev->flags); 9257 } 9258 } 9259 9260 if (remove_some) 9261 synchronize_rcu(); 9262 rdev_for_each(rdev, mddev) { 9263 if ((this == NULL || rdev == this) && 9264 (test_bit(RemoveSynchronized, &rdev->flags) || 9265 rdev_removeable(rdev))) { 9266 if (mddev->pers->hot_remove_disk( 9267 mddev, rdev) == 0) { 9268 sysfs_unlink_rdev(mddev, rdev); 9269 rdev->saved_raid_disk = rdev->raid_disk; 9270 rdev->raid_disk = -1; 9271 removed++; 9272 } 9273 } 9274 if (remove_some && test_bit(RemoveSynchronized, &rdev->flags)) 9275 clear_bit(RemoveSynchronized, &rdev->flags); 9276 } 9277 9278 if (removed && mddev->kobj.sd) 9279 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 9280 9281 if (this && removed) 9282 goto no_add; 9283 9284 rdev_for_each(rdev, mddev) { 9285 if (this && this != rdev) 9286 continue; 9287 if (rdev_is_spare(rdev)) 9288 spares++; 9289 if (!rdev_addable(rdev)) 9290 continue; 9291 if (!test_bit(Journal, &rdev->flags)) 9292 rdev->recovery_offset = 0; 9293 if (mddev->pers->hot_add_disk(mddev, rdev) == 0) { 9294 /* failure here is OK */ 9295 sysfs_link_rdev(mddev, rdev); 9296 if (!test_bit(Journal, &rdev->flags)) 9297 spares++; 9298 md_new_event(); 9299 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9300 } 9301 } 9302 no_add: 9303 if (removed) 9304 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9305 return spares; 9306 } 9307 9308 static bool md_choose_sync_action(struct mddev *mddev, int *spares) 9309 { 9310 /* Check if reshape is in progress first. */ 9311 if (mddev->reshape_position != MaxSector) { 9312 if (mddev->pers->check_reshape == NULL || 9313 mddev->pers->check_reshape(mddev) != 0) 9314 return false; 9315 9316 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 9317 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9318 return true; 9319 } 9320 9321 /* 9322 * Remove any failed drives, then add spares if possible. Spares are 9323 * also removed and re-added, to allow the personality to fail the 9324 * re-add. 9325 */ 9326 *spares = remove_and_add_spares(mddev, NULL); 9327 if (*spares) { 9328 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9329 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 9330 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 9331 9332 /* Start new recovery. */ 9333 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9334 return true; 9335 } 9336 9337 /* Check if recovery is in progress. */ 9338 if (mddev->recovery_cp < MaxSector) { 9339 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9340 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9341 return true; 9342 } 9343 9344 /* Delay to choose resync/check/repair in md_do_sync(). */ 9345 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 9346 return true; 9347 9348 /* Nothing to be done */ 9349 return false; 9350 } 9351 9352 static void md_start_sync(struct work_struct *ws) 9353 { 9354 struct mddev *mddev = container_of(ws, struct mddev, sync_work); 9355 int spares = 0; 9356 bool suspend = false; 9357 9358 if (md_spares_need_change(mddev)) 9359 suspend = true; 9360 9361 suspend ? mddev_suspend_and_lock_nointr(mddev) : 9362 mddev_lock_nointr(mddev); 9363 9364 if (!md_is_rdwr(mddev)) { 9365 /* 9366 * On a read-only array we can: 9367 * - remove failed devices 9368 * - add already-in_sync devices if the array itself is in-sync. 9369 * As we only add devices that are already in-sync, we can 9370 * activate the spares immediately. 
		 */
		remove_and_add_spares(mddev, NULL);
		goto not_running;
	}

	if (!md_choose_sync_action(mddev, &spares))
		goto not_running;

	if (!mddev->pers->sync_request)
		goto not_running;

	/*
	 * We are adding a device or devices to an array which has the bitmap
	 * stored on all devices. So make sure all bitmap pages get written.
	 */
	if (spares)
		md_bitmap_write_all(mddev->bitmap);

	rcu_assign_pointer(mddev->sync_thread,
			   md_register_thread(md_do_sync, mddev, "resync"));
	if (!mddev->sync_thread) {
		pr_warn("%s: could not start resync thread...\n",
			mdname(mddev));
		/* leave the spares where they are, it shouldn't hurt */
		goto not_running;
	}

	mddev_unlock(mddev);
	/*
	 * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should
	 * not set it again. Otherwise, we may cause an issue like this one:
	 * https://bugzilla.kernel.org/show_bug.cgi?id=218200
	 * Therefore, use __mddev_resume(mddev, false).
	 */
	if (suspend)
		__mddev_resume(mddev, false);
	md_wakeup_thread(mddev->sync_thread);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	md_new_event();
	return;

not_running:
	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
	mddev_unlock(mddev);
	/*
	 * md_start_sync was triggered by MD_RECOVERY_NEEDED, so we should
	 * not set it again. Otherwise, we may cause an issue like this one:
	 * https://bugzilla.kernel.org/show_bug.cgi?id=218200
	 * Therefore, use __mddev_resume(mddev, false).
	 */
	if (suspend)
		__mddev_resume(mddev, false);

	wake_up(&resync_wait);
	if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
	    mddev->sysfs_action)
		sysfs_notify_dirent_safe(mddev->sysfs_action);
}

/*
 * This routine is regularly called by all per-raid-array threads to
 * deal with generic issues like resync and super-block update.
 * Raid personalities that don't have a thread (linear/raid0) do not
 * need this as they never do any recovery or update the superblock.
 *
 * It does not do any resync itself, but rather "forks" off other threads
 * to do that as needed.
 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
 * "->recovery" and create a thread at ->sync_thread.
 * When the thread finishes it sets MD_RECOVERY_DONE
 * and wakes up this thread, which will reap the thread and finish up.
 * This thread also removes any faulty devices (with nr_pending == 0).
 *
 * The overall approach is:
 *  1/ if the superblock needs updating, update it.
 *  2/ If a recovery thread is running, don't do anything else.
 *  3/ If recovery has finished, clean up, possibly marking spares active.
 *  4/ If there are any faulty devices, remove them.
 *  5/ If array is degraded, try to add spare devices.
 *  6/ If array has spares or is not in-sync, start a resync thread.
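 *
 * Purely illustrative sketch (not a real personality) of the expected call
 * site in a per-array thread; my_raidd and the IO handling are hypothetical:
 *
 *	static void my_raidd(struct md_thread *thread)
 *	{
 *		struct mddev *mddev = thread->mddev;
 *
 *		md_check_recovery(mddev);
 *		...handle any queued IO...
 *	}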
9455 */ 9456 void md_check_recovery(struct mddev *mddev) 9457 { 9458 if (READ_ONCE(mddev->suspended)) 9459 return; 9460 9461 if (mddev->bitmap) 9462 md_bitmap_daemon_work(mddev); 9463 9464 if (signal_pending(current)) { 9465 if (mddev->pers->sync_request && !mddev->external) { 9466 pr_debug("md: %s in immediate safe mode\n", 9467 mdname(mddev)); 9468 mddev->safemode = 2; 9469 } 9470 flush_signals(current); 9471 } 9472 9473 if (!md_is_rdwr(mddev) && 9474 !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 9475 return; 9476 if ( ! ( 9477 (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) || 9478 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 9479 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 9480 (mddev->external == 0 && mddev->safemode == 1) || 9481 (mddev->safemode == 2 9482 && !mddev->in_sync && mddev->recovery_cp == MaxSector) 9483 )) 9484 return; 9485 9486 if (mddev_trylock(mddev)) { 9487 bool try_set_sync = mddev->safemode != 0; 9488 9489 if (!mddev->external && mddev->safemode == 1) 9490 mddev->safemode = 0; 9491 9492 if (!md_is_rdwr(mddev)) { 9493 struct md_rdev *rdev; 9494 9495 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 9496 /* sync_work already queued. */ 9497 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9498 goto unlock; 9499 } 9500 9501 if (!mddev->external && mddev->in_sync) 9502 /* 9503 * 'Blocked' flag not needed as failed devices 9504 * will be recorded if array switched to read/write. 9505 * Leaving it set will prevent the device 9506 * from being removed. 9507 */ 9508 rdev_for_each(rdev, mddev) 9509 clear_bit(Blocked, &rdev->flags); 9510 9511 /* 9512 * There is no thread, but we need to call 9513 * ->spare_active and clear saved_raid_disk 9514 */ 9515 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9516 md_reap_sync_thread(mddev); 9517 9518 /* 9519 * Let md_start_sync() to remove and add rdevs to the 9520 * array. 9521 */ 9522 if (md_spares_need_change(mddev)) { 9523 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9524 queue_work(md_misc_wq, &mddev->sync_work); 9525 } 9526 9527 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9528 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9529 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 9530 9531 goto unlock; 9532 } 9533 9534 if (mddev_is_clustered(mddev)) { 9535 struct md_rdev *rdev, *tmp; 9536 /* kick the device if another node issued a 9537 * remove disk. 9538 */ 9539 rdev_for_each_safe(rdev, tmp, mddev) { 9540 if (test_and_clear_bit(ClusterRemove, &rdev->flags) && 9541 rdev->raid_disk < 0) 9542 md_kick_rdev_from_array(rdev); 9543 } 9544 } 9545 9546 if (try_set_sync && !mddev->external && !mddev->in_sync) { 9547 spin_lock(&mddev->lock); 9548 set_in_sync(mddev); 9549 spin_unlock(&mddev->lock); 9550 } 9551 9552 if (mddev->sb_flags) 9553 md_update_sb(mddev, 0); 9554 9555 /* 9556 * Never start a new sync thread if MD_RECOVERY_RUNNING is 9557 * still set. 9558 */ 9559 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 9560 if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 9561 /* resync/recovery still happening */ 9562 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9563 goto unlock; 9564 } 9565 9566 if (WARN_ON_ONCE(!mddev->sync_thread)) 9567 goto unlock; 9568 9569 md_reap_sync_thread(mddev); 9570 goto unlock; 9571 } 9572 9573 /* Set RUNNING before clearing NEEDED to avoid 9574 * any transients in the value of "sync_action". 
9575 */ 9576 mddev->curr_resync_completed = 0; 9577 spin_lock(&mddev->lock); 9578 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9579 spin_unlock(&mddev->lock); 9580 /* Clear some bits that don't mean anything, but 9581 * might be left set 9582 */ 9583 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 9584 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 9585 9586 if (test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) && 9587 !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 9588 queue_work(md_misc_wq, &mddev->sync_work); 9589 } else { 9590 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9591 wake_up(&resync_wait); 9592 } 9593 9594 unlock: 9595 wake_up(&mddev->sb_wait); 9596 mddev_unlock(mddev); 9597 } 9598 } 9599 EXPORT_SYMBOL(md_check_recovery); 9600 9601 void md_reap_sync_thread(struct mddev *mddev) 9602 { 9603 struct md_rdev *rdev; 9604 sector_t old_dev_sectors = mddev->dev_sectors; 9605 bool is_reshaped = false; 9606 9607 /* resync has finished, collect result */ 9608 md_unregister_thread(mddev, &mddev->sync_thread); 9609 atomic_inc(&mddev->sync_seq); 9610 9611 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9612 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 9613 mddev->degraded != mddev->raid_disks) { 9614 /* success...*/ 9615 /* activate any spares */ 9616 if (mddev->pers->spare_active(mddev)) { 9617 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 9618 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9619 } 9620 } 9621 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9622 mddev->pers->finish_reshape) { 9623 mddev->pers->finish_reshape(mddev); 9624 if (mddev_is_clustered(mddev)) 9625 is_reshaped = true; 9626 } 9627 9628 /* If array is no-longer degraded, then any saved_raid_disk 9629 * information must be scrapped. 9630 */ 9631 if (!mddev->degraded) 9632 rdev_for_each(rdev, mddev) 9633 rdev->saved_raid_disk = -1; 9634 9635 md_update_sb(mddev, 1); 9636 /* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can 9637 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by 9638 * clustered raid */ 9639 if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags)) 9640 md_cluster_ops->resync_finish(mddev); 9641 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9642 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 9643 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9644 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 9645 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 9646 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 9647 /* 9648 * We call md_cluster_ops->update_size here because sync_size could 9649 * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, 9650 * so it is time to update size across cluster. 
9651 */ 9652 if (mddev_is_clustered(mddev) && is_reshaped 9653 && !test_bit(MD_CLOSING, &mddev->flags)) 9654 md_cluster_ops->update_size(mddev, old_dev_sectors); 9655 /* flag recovery needed just to double check */ 9656 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9657 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9658 sysfs_notify_dirent_safe(mddev->sysfs_action); 9659 md_new_event(); 9660 if (mddev->event_work.func) 9661 queue_work(md_misc_wq, &mddev->event_work); 9662 wake_up(&resync_wait); 9663 } 9664 EXPORT_SYMBOL(md_reap_sync_thread); 9665 9666 void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) 9667 { 9668 sysfs_notify_dirent_safe(rdev->sysfs_state); 9669 wait_event_timeout(rdev->blocked_wait, 9670 !test_bit(Blocked, &rdev->flags) && 9671 !test_bit(BlockedBadBlocks, &rdev->flags), 9672 msecs_to_jiffies(5000)); 9673 rdev_dec_pending(rdev, mddev); 9674 } 9675 EXPORT_SYMBOL(md_wait_for_blocked_rdev); 9676 9677 void md_finish_reshape(struct mddev *mddev) 9678 { 9679 /* called be personality module when reshape completes. */ 9680 struct md_rdev *rdev; 9681 9682 rdev_for_each(rdev, mddev) { 9683 if (rdev->data_offset > rdev->new_data_offset) 9684 rdev->sectors += rdev->data_offset - rdev->new_data_offset; 9685 else 9686 rdev->sectors -= rdev->new_data_offset - rdev->data_offset; 9687 rdev->data_offset = rdev->new_data_offset; 9688 } 9689 } 9690 EXPORT_SYMBOL(md_finish_reshape); 9691 9692 /* Bad block management */ 9693 9694 /* Returns 1 on success, 0 on failure */ 9695 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 9696 int is_new) 9697 { 9698 struct mddev *mddev = rdev->mddev; 9699 int rv; 9700 if (is_new) 9701 s += rdev->new_data_offset; 9702 else 9703 s += rdev->data_offset; 9704 rv = badblocks_set(&rdev->badblocks, s, sectors, 0); 9705 if (rv == 0) { 9706 /* Make sure they get written out promptly */ 9707 if (test_bit(ExternalBbl, &rdev->flags)) 9708 sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); 9709 sysfs_notify_dirent_safe(rdev->sysfs_state); 9710 set_mask_bits(&mddev->sb_flags, 0, 9711 BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); 9712 md_wakeup_thread(rdev->mddev->thread); 9713 return 1; 9714 } else 9715 return 0; 9716 } 9717 EXPORT_SYMBOL_GPL(rdev_set_badblocks); 9718 9719 int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 9720 int is_new) 9721 { 9722 int rv; 9723 if (is_new) 9724 s += rdev->new_data_offset; 9725 else 9726 s += rdev->data_offset; 9727 rv = badblocks_clear(&rdev->badblocks, s, sectors); 9728 if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags)) 9729 sysfs_notify_dirent_safe(rdev->sysfs_badblocks); 9730 return rv; 9731 } 9732 EXPORT_SYMBOL_GPL(rdev_clear_badblocks); 9733 9734 static int md_notify_reboot(struct notifier_block *this, 9735 unsigned long code, void *x) 9736 { 9737 struct mddev *mddev, *n; 9738 int need_delay = 0; 9739 9740 spin_lock(&all_mddevs_lock); 9741 list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { 9742 if (!mddev_get(mddev)) 9743 continue; 9744 spin_unlock(&all_mddevs_lock); 9745 if (mddev_trylock(mddev)) { 9746 if (mddev->pers) 9747 __md_stop_writes(mddev); 9748 if (mddev->persistent) 9749 mddev->safemode = 2; 9750 mddev_unlock(mddev); 9751 } 9752 need_delay = 1; 9753 mddev_put(mddev); 9754 spin_lock(&all_mddevs_lock); 9755 } 9756 spin_unlock(&all_mddevs_lock); 9757 9758 /* 9759 * certain more exotic SCSI devices are known to be 9760 * volatile wrt too early system reboots. 
While the 9761 * right place to handle this issue is the given 9762 * driver, we do want to have a safe RAID driver ... 9763 */ 9764 if (need_delay) 9765 msleep(1000); 9766 9767 return NOTIFY_DONE; 9768 } 9769 9770 static struct notifier_block md_notifier = { 9771 .notifier_call = md_notify_reboot, 9772 .next = NULL, 9773 .priority = INT_MAX, /* before any real devices */ 9774 }; 9775 9776 static void md_geninit(void) 9777 { 9778 pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 9779 9780 proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops); 9781 } 9782 9783 static int __init md_init(void) 9784 { 9785 int ret = -ENOMEM; 9786 9787 md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); 9788 if (!md_wq) 9789 goto err_wq; 9790 9791 md_misc_wq = alloc_workqueue("md_misc", 0, 0); 9792 if (!md_misc_wq) 9793 goto err_misc_wq; 9794 9795 md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND, 9796 0); 9797 if (!md_bitmap_wq) 9798 goto err_bitmap_wq; 9799 9800 ret = __register_blkdev(MD_MAJOR, "md", md_probe); 9801 if (ret < 0) 9802 goto err_md; 9803 9804 ret = __register_blkdev(0, "mdp", md_probe); 9805 if (ret < 0) 9806 goto err_mdp; 9807 mdp_major = ret; 9808 9809 register_reboot_notifier(&md_notifier); 9810 raid_table_header = register_sysctl("dev/raid", raid_table); 9811 9812 md_geninit(); 9813 return 0; 9814 9815 err_mdp: 9816 unregister_blkdev(MD_MAJOR, "md"); 9817 err_md: 9818 destroy_workqueue(md_bitmap_wq); 9819 err_bitmap_wq: 9820 destroy_workqueue(md_misc_wq); 9821 err_misc_wq: 9822 destroy_workqueue(md_wq); 9823 err_wq: 9824 return ret; 9825 } 9826 9827 static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) 9828 { 9829 struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 9830 struct md_rdev *rdev2, *tmp; 9831 int role, ret; 9832 9833 /* 9834 * If size is changed in another node then we need to 9835 * do resize as well. 9836 */ 9837 if (mddev->dev_sectors != le64_to_cpu(sb->size)) { 9838 ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size)); 9839 if (ret) 9840 pr_info("md-cluster: resize failed\n"); 9841 else 9842 md_bitmap_update_sb(mddev->bitmap); 9843 } 9844 9845 /* Check for change of roles in the active devices */ 9846 rdev_for_each_safe(rdev2, tmp, mddev) { 9847 if (test_bit(Faulty, &rdev2->flags)) 9848 continue; 9849 9850 /* Check if the roles changed */ 9851 role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); 9852 9853 if (test_bit(Candidate, &rdev2->flags)) { 9854 if (role == MD_DISK_ROLE_FAULTY) { 9855 pr_info("md: Removing Candidate device %pg because add failed\n", 9856 rdev2->bdev); 9857 md_kick_rdev_from_array(rdev2); 9858 continue; 9859 } 9860 else 9861 clear_bit(Candidate, &rdev2->flags); 9862 } 9863 9864 if (role != rdev2->raid_disk) { 9865 /* 9866 * got activated except reshape is happening. 9867 */ 9868 if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE && 9869 !(le32_to_cpu(sb->feature_map) & 9870 MD_FEATURE_RESHAPE_ACTIVE)) { 9871 rdev2->saved_raid_disk = role; 9872 ret = remove_and_add_spares(mddev, rdev2); 9873 pr_info("Activated spare: %pg\n", 9874 rdev2->bdev); 9875 /* wakeup mddev->thread here, so array could 9876 * perform resync with the new activated disk */ 9877 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9878 md_wakeup_thread(mddev->thread); 9879 } 9880 /* device faulty 9881 * We just want to do the minimum to mark the disk 9882 * as faulty. The recovery is performed by the 9883 * one who initiated the error. 
static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
{
        struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
        struct md_rdev *rdev2, *tmp;
        int role, ret;

        /*
         * If the size was changed on another node, we need to
         * resize here as well.
         */
        if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
                ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
                if (ret)
                        pr_info("md-cluster: resize failed\n");
                else
                        md_bitmap_update_sb(mddev->bitmap);
        }

        /* Check for change of roles in the active devices */
        rdev_for_each_safe(rdev2, tmp, mddev) {
                if (test_bit(Faulty, &rdev2->flags))
                        continue;

                /* Check if the roles changed */
                role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);

                if (test_bit(Candidate, &rdev2->flags)) {
                        if (role == MD_DISK_ROLE_FAULTY) {
                                pr_info("md: Removing Candidate device %pg because add failed\n",
                                        rdev2->bdev);
                                md_kick_rdev_from_array(rdev2);
                                continue;
                        } else
                                clear_bit(Candidate, &rdev2->flags);
                }

                if (role != rdev2->raid_disk) {
                        /*
                         * The device got activated on another node; activate
                         * it here too, unless a reshape is happening.
                         */
                        if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE &&
                            !(le32_to_cpu(sb->feature_map) &
                              MD_FEATURE_RESHAPE_ACTIVE)) {
                                rdev2->saved_raid_disk = role;
                                ret = remove_and_add_spares(mddev, rdev2);
                                pr_info("Activated spare: %pg\n",
                                        rdev2->bdev);
                                /* wake up mddev->thread here, so the array can
                                 * perform a resync with the newly activated disk */
                                set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
                                md_wakeup_thread(mddev->thread);
                        }
                        /* Device faulty.
                         * We just want to do the minimum to mark the disk
                         * as faulty.  The recovery is performed by the
                         * node that initiated the error.
                         */
                        if (role == MD_DISK_ROLE_FAULTY ||
                            role == MD_DISK_ROLE_JOURNAL) {
                                md_error(mddev, rdev2);
                                clear_bit(Blocked, &rdev2->flags);
                        }
                }
        }

        if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) {
                ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
                if (ret)
                        pr_warn("md: updating array disks failed. %d\n", ret);
        }

        /*
         * mddev->delta_disks has already been updated in update_raid_disks(),
         * so now check whether a reshape needs to be picked up.
         */
        if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
            (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
                /*
                 * A reshape is happening on the remote node; update
                 * reshape_position and call start_reshape.
                 */
                mddev->reshape_position = le64_to_cpu(sb->reshape_position);
                if (mddev->pers->update_reshape_pos)
                        mddev->pers->update_reshape_pos(mddev);
                if (mddev->pers->start_reshape)
                        mddev->pers->start_reshape(mddev);
        } else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
                   mddev->reshape_position != MaxSector &&
                   !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
                /* The reshape has just finished on the other node. */
                mddev->reshape_position = MaxSector;
                if (mddev->pers->update_reshape_pos)
                        mddev->pers->update_reshape_pos(mddev);
        }

        /* Finally bring the event count up to date */
        mddev->events = le64_to_cpu(sb->events);
}

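/*
 * Re-read the superblock of a single rdev from disk, used when another
 * cluster node has updated the metadata.  If loading fails, the previously
 * cached superblock page is put back so the in-memory state stays valid.
 */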
static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
        int err;
        struct page *swapout = rdev->sb_page;
        struct mdp_superblock_1 *sb;

        /*
         * Keep the current sb page in 'swapout' so that it can be
         * restored if reloading the superblock fails.
         */
        rdev->sb_page = NULL;
        err = alloc_disk_sb(rdev);
        if (err == 0) {
                ClearPageUptodate(rdev->sb_page);
                rdev->sb_loaded = 0;
                err = super_types[mddev->major_version].
                        load_super(rdev, NULL, mddev->minor_version);
        }
        if (err < 0) {
                pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
                        __func__, __LINE__, rdev->desc_nr, err);
                if (rdev->sb_page)
                        put_page(rdev->sb_page);
                rdev->sb_page = swapout;
                rdev->sb_loaded = 1;
                return err;
        }

        sb = page_address(rdev->sb_page);
        /*
         * Only pick up recovery_offset if the superblock says it is valid
         * (MD_FEATURE_RECOVERY_OFFSET is set).
         */
        if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
                rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);

        /* The other node finished recovery, call spare_active to set
         * the device In_sync and update mddev->degraded.
         */
        if (rdev->recovery_offset == MaxSector &&
            !test_bit(In_sync, &rdev->flags) &&
            mddev->pers->spare_active(mddev))
                sysfs_notify_dirent_safe(mddev->sysfs_degraded);

        put_page(swapout);
        return 0;
}

void md_reload_sb(struct mddev *mddev, int nr)
{
        struct md_rdev *rdev = NULL, *iter;
        int err;

        /* Find the rdev */
        rdev_for_each_rcu(iter, mddev) {
                if (iter->desc_nr == nr) {
                        rdev = iter;
                        break;
                }
        }

        if (!rdev) {
                pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
                return;
        }

        err = read_rdev(mddev, rdev);
        if (err < 0)
                return;

        check_sb_changes(mddev, rdev);

        /* Read all rdevs to update recovery_offset */
        rdev_for_each_rcu(rdev, mddev) {
                if (!test_bit(Faulty, &rdev->flags))
                        read_rdev(mddev, rdev);
        }
}
EXPORT_SYMBOL(md_reload_sb);

#ifndef MODULE

/*
 * Searches all registered partitions for autorun RAID arrays
 * at boot time.
 */

static DEFINE_MUTEX(detected_devices_mutex);
static LIST_HEAD(all_detected_devices);
struct detected_devices_node {
        struct list_head list;
        dev_t dev;
};

void md_autodetect_dev(dev_t dev)
{
        struct detected_devices_node *node_detected_dev;

        node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
        if (node_detected_dev) {
                node_detected_dev->dev = dev;
                mutex_lock(&detected_devices_mutex);
                list_add_tail(&node_detected_dev->list, &all_detected_devices);
                mutex_unlock(&detected_devices_mutex);
        }
}

void md_autostart_arrays(int part)
{
        struct md_rdev *rdev;
        struct detected_devices_node *node_detected_dev;
        dev_t dev;
        int i_scanned, i_passed;

        i_scanned = 0;
        i_passed = 0;

        pr_info("md: Autodetecting RAID arrays.\n");

        mutex_lock(&detected_devices_mutex);
        while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
                i_scanned++;
                node_detected_dev = list_entry(all_detected_devices.next,
                                               struct detected_devices_node, list);
                list_del(&node_detected_dev->list);
                dev = node_detected_dev->dev;
                kfree(node_detected_dev);
                mutex_unlock(&detected_devices_mutex);
                rdev = md_import_device(dev, 0, 90);
                mutex_lock(&detected_devices_mutex);
                if (IS_ERR(rdev))
                        continue;

                if (test_bit(Faulty, &rdev->flags))
                        continue;

                set_bit(AutoDetected, &rdev->flags);
                list_add(&rdev->same_set, &pending_raid_disks);
                i_passed++;
        }
        mutex_unlock(&detected_devices_mutex);

        pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);

        autorun_devices(part);
}

#endif /* !MODULE */

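/*
 * Module unload: wake up and wait out any pollers of /proc/mdstat before
 * removing the proc entry, then export every remaining array.  mddev_put()
 * defers the final mddev destruction to a workqueue, which the
 * destroy_workqueue() calls below wait for.
 */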
static __exit void md_exit(void)
{
        struct mddev *mddev, *n;
        int delay = 1;

        unregister_blkdev(MD_MAJOR, "md");
        unregister_blkdev(mdp_major, "mdp");
        unregister_reboot_notifier(&md_notifier);
        unregister_sysctl_table(raid_table_header);

        /* We cannot unload the modules while some process is
         * waiting for us in select() or poll() - wake them up
         */
        md_unloading = 1;
        while (waitqueue_active(&md_event_waiters)) {
                /* not safe to leave yet */
                wake_up(&md_event_waiters);
                msleep(delay);
                delay += delay;
        }
        remove_proc_entry("mdstat", NULL);

        spin_lock(&all_mddevs_lock);
        list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
                if (!mddev_get(mddev))
                        continue;
                spin_unlock(&all_mddevs_lock);
                export_array(mddev);
                mddev->ctime = 0;
                mddev->hold_active = 0;
                /*
                 * As the mddev is now fully clear, mddev_put will schedule
                 * the mddev for destruction by a workqueue, and the
                 * destroy_workqueue() below will wait for that to complete.
                 */
                mddev_put(mddev);
                spin_lock(&all_mddevs_lock);
        }
        spin_unlock(&all_mddevs_lock);

        destroy_workqueue(md_misc_wq);
        destroy_workqueue(md_bitmap_wq);
        destroy_workqueue(md_wq);
}

subsys_initcall(md_init);
module_exit(md_exit)

static int get_ro(char *buffer, const struct kernel_param *kp)
{
        return sprintf(buffer, "%d\n", start_readonly);
}
static int set_ro(const char *val, const struct kernel_param *kp)
{
        return kstrtouint(val, 10, (unsigned int *)&start_readonly);
}

module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
module_param(create_on_open, bool, S_IRUSR|S_IWUSR);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MD RAID framework");
MODULE_ALIAS("md");
MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
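
/*
 * Note on the module parameters above: they live in the "md_mod" parameter
 * namespace, so arrays can, for example, be started read-only by booting
 * with "md_mod.start_ro=1" (or "modprobe md-mod start_ro=1"), and the
 * writable parameters can also be changed at runtime through
 * /sys/module/md_mod/parameters/.
 */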