// SPDX-License-Identifier: GPL-2.0-or-later
/*
   md.c : Multiple Devices driver for Linux
     Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.


   Errors, Warnings, etc.
   Please use:
     pr_crit() for error conditions that risk data loss
     pr_err() for error conditions that are unexpected, like an IO error
         or internal inconsistency
     pr_warn() for error conditions that could have been predicted, like
         adding a device to an array when it has incompatible metadata
     pr_info() for interesting, very rare events, like an array starting
         or stopping, or resync starting or stopping
     pr_debug() for everything else.

*/

#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/badblocks.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/major.h>
#include <linux/module.h>
#include <linux/reboot.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/raid/detect.h>
#include <linux/slab.h>
#include <linux/percpu-refcount.h>
#include <linux/part_stat.h>

#include <trace/events/block.h>
#include "md.h"
#include "md-bitmap.h"
#include "md-cluster.h"

/* pers_list is a list of registered personalities protected by pers_lock. */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

static const struct kobj_type md_ktype;

struct md_cluster_operations *md_cluster_ops;
EXPORT_SYMBOL(md_cluster_ops);
static struct module *md_cluster_mod;

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
static struct workqueue_struct *md_misc_wq;
struct workqueue_struct *md_bitmap_wq;

static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);
static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);
static void md_wakeup_thread_directly(struct md_thread __rcu *thread);

enum md_ro_state {
	MD_RDWR,
	MD_RDONLY,
	MD_AUTO_READ,
	MD_MAX_STATE
};

static bool md_is_rdwr(struct mddev *mddev)
{
	return (mddev->ro == MD_RDWR);
}

/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
 */
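/*
 * Editor's illustration (hypothetical helper, not part of the driver): the
 * decay rule described above halves the recorded read error count for every
 * full hour that passed since the previous read error, so a device only
 * approaches the maximum if errors keep arriving faster than the decay.
 */
static inline int example_decayed_read_errors(int read_errors,
					      unsigned int hours_since_last)
{
	/* shifting by >= the width of int is undefined, clamp first */
	if (hours_since_last >= 31)
		return 0;
	return read_errors >> hours_since_last;
}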
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
/* Default safemode delay: 200 msec */
#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * You can change it via /proc/sys/dev/raid/speed_limit_min and _max,
 * or /sys/block/mdX/md/sync_speed_{min,max}.
 */

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static inline int speed_min(struct mddev *mddev)
{
	return mddev->sync_speed_min ?
		mddev->sync_speed_min : sysctl_speed_limit_min;
}

static inline int speed_max(struct mddev *mddev)
{
	return mddev->sync_speed_max ?
		mddev->sync_speed_max : sysctl_speed_limit_max;
}

static void rdev_uninit_serial(struct md_rdev *rdev)
{
	if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
		return;

	kvfree(rdev->serial);
	rdev->serial = NULL;
}

static void rdevs_uninit_serial(struct mddev *mddev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev)
		rdev_uninit_serial(rdev);
}

static int rdev_init_serial(struct md_rdev *rdev)
{
	/* serial_nums equals BARRIER_BUCKETS_NR */
	int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t))));
	struct serial_in_rdev *serial = NULL;

	if (test_bit(CollisionCheck, &rdev->flags))
		return 0;

	serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums,
			  GFP_KERNEL);
	if (!serial)
		return -ENOMEM;

	for (i = 0; i < serial_nums; i++) {
		struct serial_in_rdev *serial_tmp = &serial[i];

		spin_lock_init(&serial_tmp->serial_lock);
		serial_tmp->serial_rb = RB_ROOT_CACHED;
		init_waitqueue_head(&serial_tmp->serial_io_wait);
	}

	rdev->serial = serial;
	set_bit(CollisionCheck, &rdev->flags);

	return 0;
}

static int rdevs_init_serial(struct mddev *mddev)
{
	struct md_rdev *rdev;
	int ret = 0;

	rdev_for_each(rdev, mddev) {
		ret = rdev_init_serial(rdev);
		if (ret)
			break;
	}

	/* Free all resources if the pool doesn't exist */
	if (ret && !mddev->serial_info_pool)
		rdevs_uninit_serial(mddev);

	return ret;
}

/*
 * rdev needs to enable serial stuff if it meets the conditions:
 * 1. it is a multi-queue device flagged with writemostly.
 * 2. the write-behind mode is enabled.
 */
static int rdev_need_serial(struct md_rdev *rdev)
{
	return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 &&
		rdev->bdev->bd_disk->queue->nr_hw_queues != 1 &&
		test_bit(WriteMostly, &rdev->flags));
}
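/*
 * Editor's note (illustrative arithmetic, not part of the driver): the
 * bucket count used by rdev_init_serial() above works out as follows on a
 * typical configuration.  With 4K pages (PAGE_SHIFT == 12) and a 4-byte
 * atomic_t,
 *
 *	serial_nums = 1 << (PAGE_SHIFT - ilog2(sizeof(atomic_t)))
 *		    = 1 << (12 - 2)
 *		    = 1024
 *
 * struct serial_in_rdev buckets per rdev, which the comment in
 * rdev_init_serial() says is meant to match BARRIER_BUCKETS_NR.
 */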
/*
 * Init resource for rdev(s), then create serial_info_pool if:
 * 1. rdev is the first device which returns true from rdev_need_serial().
 * 2. rdev is NULL, meaning we want to enable serialization for all rdevs.
 */
void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
{
	int ret = 0;

	if (rdev && !rdev_need_serial(rdev) &&
	    !test_bit(CollisionCheck, &rdev->flags))
		return;

	if (!rdev)
		ret = rdevs_init_serial(mddev);
	else
		ret = rdev_init_serial(rdev);
	if (ret)
		return;

	if (mddev->serial_info_pool == NULL) {
		/*
		 * already in memalloc noio context by
		 * mddev_suspend()
		 */
		mddev->serial_info_pool =
			mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
						sizeof(struct serial_info));
		if (!mddev->serial_info_pool) {
			rdevs_uninit_serial(mddev);
			pr_err("can't alloc memory pool for serialization\n");
		}
	}
}

/*
 * Free resource from rdev(s), and destroy serial_info_pool under conditions:
 * 1. rdev is the last device flagged with CollisionCheck.
 * 2. when bitmap is destroyed while policy is not enabled.
 * 3. when disabling the policy, the pool is destroyed only when no rdev
 *    needs it.
 */
void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
{
	if (rdev && !test_bit(CollisionCheck, &rdev->flags))
		return;

	if (mddev->serial_info_pool) {
		struct md_rdev *temp;
		int num = 0; /* used to track if other rdevs need the pool */

		rdev_for_each(temp, mddev) {
			if (!rdev) {
				if (!mddev->serialize_policy ||
				    !rdev_need_serial(temp))
					rdev_uninit_serial(temp);
				else
					num++;
			} else if (temp != rdev &&
				   test_bit(CollisionCheck, &temp->flags))
				num++;
		}

		if (rdev)
			rdev_uninit_serial(rdev);

		if (num)
			pr_info("The mempool could be used by other devices\n");
		else {
			mempool_destroy(mddev->serial_info_pool);
			mddev->serial_info_pool = NULL;
		}
	}
}

static struct ctl_table_header *raid_table_header;

static struct ctl_table raid_table[] = {
	{
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static int start_readonly;

/*
 * The original mechanism for creating an md device is to create
 * a device node in /dev and to open it.  This causes races with device-close.
 * The preferred method is to write to the "new_array" module parameter.
 * This can avoid races.
 * Setting create_on_open to false disables the original mechanism
 * so all the races disappear.
 */
static bool create_on_open = true;

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
void md_new_event(void)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);
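/*
 * Editor's sketch (illustrative only, not part of the driver): a consumer of
 * the event count, such as the /proc/mdstat poll support later in this file,
 * conceptually remembers the count it last reported and sleeps until
 * md_new_event() bumps it.
 */
static inline void example_wait_for_md_event(atomic_t *last_seen)
{
	wait_event(md_event_waiters,
		   atomic_read(&md_event_count) != atomic_read(last_seen));
	atomic_set(last_seen, atomic_read(&md_event_count));
}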
/*
 * Enables iteration over all existing md arrays.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);

static bool is_md_suspended(struct mddev *mddev)
{
	return percpu_ref_is_dying(&mddev->active_io);
}
/* Rather than calling directly into the personality make_request function,
 * IO requests come here first so that we can check if the device
 * is being suspended pending a reconfiguration.
 * We hold a refcount over the call to ->make_request.  By the time that
 * call has finished, the bio has been linked into some internal structure
 * and so is visible to ->quiesce(), so we don't need the refcount any more.
 */
static bool is_suspended(struct mddev *mddev, struct bio *bio)
{
	if (is_md_suspended(mddev))
		return true;
	if (bio_data_dir(bio) != WRITE)
		return false;
	if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi))
		return false;
	if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi))
		return false;
	if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo))
		return false;
	return true;
}

void md_handle_request(struct mddev *mddev, struct bio *bio)
{
check_suspended:
	if (is_suspended(mddev, bio)) {
		DEFINE_WAIT(__wait);
		/* Bail out if REQ_NOWAIT is set for the bio */
		if (bio->bi_opf & REQ_NOWAIT) {
			bio_wouldblock_error(bio);
			return;
		}
		for (;;) {
			prepare_to_wait(&mddev->sb_wait, &__wait,
					TASK_UNINTERRUPTIBLE);
			if (!is_suspended(mddev, bio))
				break;
			schedule();
		}
		finish_wait(&mddev->sb_wait, &__wait);
	}
	if (!percpu_ref_tryget_live(&mddev->active_io))
		goto check_suspended;

	if (!mddev->pers->make_request(mddev, bio)) {
		percpu_ref_put(&mddev->active_io);
		goto check_suspended;
	}

	percpu_ref_put(&mddev->active_io);
}
EXPORT_SYMBOL(md_handle_request);

static void md_submit_bio(struct bio *bio)
{
	const int rw = bio_data_dir(bio);
	struct mddev *mddev = bio->bi_bdev->bd_disk->private_data;

	if (mddev == NULL || mddev->pers == NULL) {
		bio_io_error(bio);
		return;
	}

	if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
		bio_io_error(bio);
		return;
	}

	bio = bio_split_to_limits(bio);
	if (!bio)
		return;

	if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) {
		if (bio_sectors(bio) != 0)
			bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
		return;
	}

	/* bio could be mergeable after passing to underlayer */
	bio->bi_opf &= ~REQ_NOMERGE;

	md_handle_request(mddev, bio);
}
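/*
 * Editor's note (illustrative, not part of the driver): for the partial
 * suspension handled by is_suspended() above, only WRITEs that overlap
 * [suspend_lo, suspend_hi) are held back.  With suspend_lo = 1000 and
 * suspend_hi = 2000 (sectors), for example:
 *
 *	WRITE at sector 2048, 8 sectors  - starts at/after suspend_hi: passes
 *	WRITE at sector  900, 8 sectors  - ends before suspend_lo: passes
 *	WRITE at sector 1990, 32 sectors - overlaps the range: waits
 *
 * READs, and any bio when suspend_lo >= suspend_hi, are only held when the
 * whole array is suspended (is_md_suspended()).
 */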
/*
 * Make sure no new requests are submitted to the device, and any requests that
 * have been submitted are completely handled.
 */
int mddev_suspend(struct mddev *mddev, bool interruptible)
{
	int err = 0;

	/*
	 * Holding reconfig_mutex to wait for normal io will deadlock, because
	 * other contexts can't update the super_block, and normal io can rely
	 * on updating the super_block.
	 */
	lockdep_assert_not_held(&mddev->reconfig_mutex);

	if (interruptible)
		err = mutex_lock_interruptible(&mddev->suspend_mutex);
	else
		mutex_lock(&mddev->suspend_mutex);
	if (err)
		return err;

	if (mddev->suspended) {
		WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
		mutex_unlock(&mddev->suspend_mutex);
		return 0;
	}

	percpu_ref_kill(&mddev->active_io);
	if (interruptible)
		err = wait_event_interruptible(mddev->sb_wait,
				percpu_ref_is_zero(&mddev->active_io));
	else
		wait_event(mddev->sb_wait,
				percpu_ref_is_zero(&mddev->active_io));
	if (err) {
		percpu_ref_resurrect(&mddev->active_io);
		mutex_unlock(&mddev->suspend_mutex);
		return err;
	}

	/*
	 * For raid456, io might be waiting for reshape to make progress,
	 * allow new reshape to start while waiting for io to be done to
	 * prevent deadlock.
	 */
	WRITE_ONCE(mddev->suspended, mddev->suspended + 1);

	del_timer_sync(&mddev->safemode_timer);
	/* restrict memory reclaim I/O while the raid array is suspended */
	mddev->noio_flag = memalloc_noio_save();

	mutex_unlock(&mddev->suspend_mutex);
	return 0;
}
EXPORT_SYMBOL_GPL(mddev_suspend);

void mddev_resume(struct mddev *mddev)
{
	lockdep_assert_not_held(&mddev->reconfig_mutex);

	mutex_lock(&mddev->suspend_mutex);
	WRITE_ONCE(mddev->suspended, mddev->suspended - 1);
	if (mddev->suspended) {
		mutex_unlock(&mddev->suspend_mutex);
		return;
	}

	/* entered the memalloc scope from mddev_suspend() */
	memalloc_noio_restore(mddev->noio_flag);

	percpu_ref_resurrect(&mddev->active_io);
	wake_up(&mddev->sb_wait);

	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */

	mutex_unlock(&mddev->suspend_mutex);
}
EXPORT_SYMBOL_GPL(mddev_resume);

/*
 * Generic flush handling for md
 */

static void md_end_flush(struct bio *bio)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	bio_put(bio);

	rdev_dec_pending(rdev, mddev);

	if (atomic_dec_and_test(&mddev->flush_pending)) {
		/* The pre-request flush has finished */
		queue_work(md_wq, &mddev->flush_work);
	}
}

static void md_submit_flush_data(struct work_struct *ws);

static void submit_flushes(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct md_rdev *rdev;

	mddev->start_flush = ktime_get_boottime();
	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
	atomic_set(&mddev->flush_pending, 1);
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev)
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Faulty, &rdev->flags)) {
			/* Take two references, one is dropped
			 * when request finishes, one after
			 * we release rcu_read_lock
			 */
			struct bio *bi;
			atomic_inc(&rdev->nr_pending);
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			bi = bio_alloc_bioset(rdev->bdev, 0,
					      REQ_OP_WRITE | REQ_PREFLUSH,
					      GFP_NOIO, &mddev->bio_set);
			bi->bi_end_io = md_end_flush;
			bi->bi_private = rdev;
			atomic_inc(&mddev->flush_pending);
			submit_bio(bi);
			rcu_read_lock();
			rdev_dec_pending(rdev, mddev);
		}
	rcu_read_unlock();
	if (atomic_dec_and_test(&mddev->flush_pending))
		queue_work(md_wq, &mddev->flush_work);
}
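/*
 * Editor's sketch (illustrative, not a real caller in this driver): the
 * intended pairing of the suspend API defined above.  Note the lockdep
 * assertion in mddev_suspend(): callers must not hold reconfig_mutex while
 * suspending.
 */
static inline int example_reconfigure(struct mddev *mddev)
{
	int err = mddev_suspend(mddev, true);	/* interruptible variant */

	if (err)
		return err;
	/* ... change configuration while no normal I/O is in flight ... */
	mddev_resume(mddev);
	return 0;
}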
static void md_submit_flush_data(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct bio *bio = mddev->flush_bio;

	/*
	 * flush_bio must be reset before calling into md_handle_request to
	 * avoid a deadlock: other bios that already passed the
	 * md_handle_request suspend check could wait for this flush, while
	 * the md_handle_request call below could wait for those bios because
	 * of the suspend check.
	 */
	spin_lock_irq(&mddev->lock);
	mddev->prev_flush_start = mddev->start_flush;
	mddev->flush_bio = NULL;
	spin_unlock_irq(&mddev->lock);
	wake_up(&mddev->sb_wait);

	if (bio->bi_iter.bi_size == 0) {
		/* an empty barrier - all done */
		bio_endio(bio);
	} else {
		bio->bi_opf &= ~REQ_PREFLUSH;
		md_handle_request(mddev, bio);
	}
}

/*
 * Manages consolidation of flushes and submitting any flushes needed for
 * a bio with REQ_PREFLUSH.  Returns true if the bio is finished or is
 * being finished in another context.  Returns false if the flushing is
 * complete but still needs the I/O portion of the bio to be processed.
 */
bool md_flush_request(struct mddev *mddev, struct bio *bio)
{
	ktime_t req_start = ktime_get_boottime();
	spin_lock_irq(&mddev->lock);
	/* flush requests wait until ongoing flush completes,
	 * hence coalescing all the pending requests.
	 */
	wait_event_lock_irq(mddev->sb_wait,
			    !mddev->flush_bio ||
			    ktime_before(req_start, mddev->prev_flush_start),
			    mddev->lock);
	/* new request after previous flush is completed */
	if (ktime_after(req_start, mddev->prev_flush_start)) {
		WARN_ON(mddev->flush_bio);
		mddev->flush_bio = bio;
		bio = NULL;
	}
	spin_unlock_irq(&mddev->lock);

	if (!bio) {
		INIT_WORK(&mddev->flush_work, submit_flushes);
		queue_work(md_wq, &mddev->flush_work);
	} else {
		/* flush was performed for some other bio while we waited. */
		if (bio->bi_iter.bi_size == 0)
			/* an empty barrier - all done */
			bio_endio(bio);
		else {
			bio->bi_opf &= ~REQ_PREFLUSH;
			return false;
		}
	}
	return true;
}
EXPORT_SYMBOL(md_flush_request);
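/*
 * Editor's sketch (illustrative only): roughly how a personality's
 * ->make_request() consumes the helper above.  If md_flush_request()
 * returns true the bio is finished or being finished elsewhere and the
 * personality is done; if it returns false the flush part is complete and
 * the data portion of the bio still has to be processed.
 */
static inline bool example_handle_preflush(struct mddev *mddev,
					   struct bio *bio)
{
	if ((bio->bi_opf & REQ_PREFLUSH) && md_flush_request(mddev, bio))
		return true;	/* nothing further to submit */
	return false;		/* caller continues with the data portion */
}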
static inline struct mddev *mddev_get(struct mddev *mddev)
{
	lockdep_assert_held(&all_mddevs_lock);

	if (test_bit(MD_DELETED, &mddev->flags))
		return NULL;
	atomic_inc(&mddev->active);
	return mddev;
}

static void mddev_delayed_delete(struct work_struct *ws);

static void __mddev_put(struct mddev *mddev)
{
	if (mddev->raid_disks || !list_empty(&mddev->disks) ||
	    mddev->ctime || mddev->hold_active)
		return;

	/* Array is not configured at all, and not held active, so destroy it */
	set_bit(MD_DELETED, &mddev->flags);

	/*
	 * Call queue_work inside the spinlock so that flush_workqueue() after
	 * mddev_find will succeed in waiting for the work to be done.
	 */
	queue_work(md_misc_wq, &mddev->del_work);
}

void mddev_put(struct mddev *mddev)
{
	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
		return;

	__mddev_put(mddev);
	spin_unlock(&all_mddevs_lock);
}

static void md_safemode_timeout(struct timer_list *t);
static void md_start_sync(struct work_struct *ws);

static void active_io_release(struct percpu_ref *ref)
{
	struct mddev *mddev = container_of(ref, struct mddev, active_io);

	wake_up(&mddev->sb_wait);
}

static void no_op(struct percpu_ref *r) {}

int mddev_init(struct mddev *mddev)
{
	if (percpu_ref_init(&mddev->active_io, active_io_release,
			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
		return -ENOMEM;

	if (percpu_ref_init(&mddev->writes_pending, no_op,
			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
		percpu_ref_exit(&mddev->active_io);
		return -ENOMEM;
	}

	/* We want to start with the refcount at zero */
	percpu_ref_put(&mddev->writes_pending);

	mutex_init(&mddev->open_mutex);
	mutex_init(&mddev->reconfig_mutex);
	mutex_init(&mddev->sync_mutex);
	mutex_init(&mddev->suspend_mutex);
	mutex_init(&mddev->bitmap_info.mutex);
	INIT_LIST_HEAD(&mddev->disks);
	INIT_LIST_HEAD(&mddev->all_mddevs);
	INIT_LIST_HEAD(&mddev->deleting);
	timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
	atomic_set(&mddev->active, 1);
	atomic_set(&mddev->openers, 0);
	atomic_set(&mddev->sync_seq, 0);
	spin_lock_init(&mddev->lock);
	atomic_set(&mddev->flush_pending, 0);
	init_waitqueue_head(&mddev->sb_wait);
	init_waitqueue_head(&mddev->recovery_wait);
	mddev->reshape_position = MaxSector;
	mddev->reshape_backwards = 0;
	mddev->last_sync_action = "none";
	mddev->resync_min = 0;
	mddev->resync_max = MaxSector;
	mddev->level = LEVEL_NONE;

	INIT_WORK(&mddev->sync_work, md_start_sync);
	INIT_WORK(&mddev->del_work, mddev_delayed_delete);

	return 0;
}
EXPORT_SYMBOL_GPL(mddev_init);

void mddev_destroy(struct mddev *mddev)
{
	percpu_ref_exit(&mddev->active_io);
	percpu_ref_exit(&mddev->writes_pending);
}
EXPORT_SYMBOL_GPL(mddev_destroy);

static struct mddev *mddev_find_locked(dev_t unit)
{
	struct mddev *mddev;

	list_for_each_entry(mddev, &all_mddevs, all_mddevs)
		if (mddev->unit == unit)
			return mddev;

	return NULL;
}

/* find an unused unit number */
static dev_t mddev_alloc_unit(void)
{
	static int next_minor = 512;
	int start = next_minor;
	bool is_free = false;
	dev_t dev = 0;

	while (!is_free) {
		dev = MKDEV(MD_MAJOR, next_minor);
		next_minor++;
		if (next_minor > MINORMASK)
			next_minor = 0;
		if (next_minor == start)
			return 0;		/* Oh dear, all in use. */
		is_free = !mddev_find_locked(dev);
	}

	return dev;
}
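/*
 * Editor's sketch (illustrative only): the lookup-and-hold pattern the
 * helpers above are designed for - find an array under all_mddevs_lock,
 * take a reference with mddev_get(), and drop it later with mddev_put().
 */
static inline struct mddev *example_get_mddev(dev_t unit)
{
	struct mddev *mddev;

	spin_lock(&all_mddevs_lock);
	mddev = mddev_find_locked(unit);
	if (mddev)
		mddev = mddev_get(mddev);	/* NULL if being deleted */
	spin_unlock(&all_mddevs_lock);

	return mddev;	/* caller eventually does mddev_put(mddev) */
}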
*/ 771 is_free = !mddev_find_locked(dev); 772 } 773 774 return dev; 775 } 776 777 static struct mddev *mddev_alloc(dev_t unit) 778 { 779 struct mddev *new; 780 int error; 781 782 if (unit && MAJOR(unit) != MD_MAJOR) 783 unit &= ~((1 << MdpMinorShift) - 1); 784 785 new = kzalloc(sizeof(*new), GFP_KERNEL); 786 if (!new) 787 return ERR_PTR(-ENOMEM); 788 789 error = mddev_init(new); 790 if (error) 791 goto out_free_new; 792 793 spin_lock(&all_mddevs_lock); 794 if (unit) { 795 error = -EEXIST; 796 if (mddev_find_locked(unit)) 797 goto out_destroy_new; 798 new->unit = unit; 799 if (MAJOR(unit) == MD_MAJOR) 800 new->md_minor = MINOR(unit); 801 else 802 new->md_minor = MINOR(unit) >> MdpMinorShift; 803 new->hold_active = UNTIL_IOCTL; 804 } else { 805 error = -ENODEV; 806 new->unit = mddev_alloc_unit(); 807 if (!new->unit) 808 goto out_destroy_new; 809 new->md_minor = MINOR(new->unit); 810 new->hold_active = UNTIL_STOP; 811 } 812 813 list_add(&new->all_mddevs, &all_mddevs); 814 spin_unlock(&all_mddevs_lock); 815 return new; 816 817 out_destroy_new: 818 spin_unlock(&all_mddevs_lock); 819 mddev_destroy(new); 820 out_free_new: 821 kfree(new); 822 return ERR_PTR(error); 823 } 824 825 static void mddev_free(struct mddev *mddev) 826 { 827 spin_lock(&all_mddevs_lock); 828 list_del(&mddev->all_mddevs); 829 spin_unlock(&all_mddevs_lock); 830 831 mddev_destroy(mddev); 832 kfree(mddev); 833 } 834 835 static const struct attribute_group md_redundancy_group; 836 837 void mddev_unlock(struct mddev *mddev) 838 { 839 struct md_rdev *rdev; 840 struct md_rdev *tmp; 841 LIST_HEAD(delete); 842 843 if (!list_empty(&mddev->deleting)) 844 list_splice_init(&mddev->deleting, &delete); 845 846 if (mddev->to_remove) { 847 /* These cannot be removed under reconfig_mutex as 848 * an access to the files will try to take reconfig_mutex 849 * while holding the file unremovable, which leads to 850 * a deadlock. 851 * So hold set sysfs_active while the remove in happeing, 852 * and anything else which might set ->to_remove or my 853 * otherwise change the sysfs namespace will fail with 854 * -EBUSY if sysfs_active is still set. 855 * We set sysfs_active under reconfig_mutex and elsewhere 856 * test it under the same mutex to ensure its correct value 857 * is seen. 
		 */
		const struct attribute_group *to_remove = mddev->to_remove;
		mddev->to_remove = NULL;
		mddev->sysfs_active = 1;
		mutex_unlock(&mddev->reconfig_mutex);

		if (mddev->kobj.sd) {
			if (to_remove != &md_redundancy_group)
				sysfs_remove_group(&mddev->kobj, to_remove);
			if (mddev->pers == NULL ||
			    mddev->pers->sync_request == NULL) {
				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
				if (mddev->sysfs_action)
					sysfs_put(mddev->sysfs_action);
				if (mddev->sysfs_completed)
					sysfs_put(mddev->sysfs_completed);
				if (mddev->sysfs_degraded)
					sysfs_put(mddev->sysfs_degraded);
				mddev->sysfs_action = NULL;
				mddev->sysfs_completed = NULL;
				mddev->sysfs_degraded = NULL;
			}
		}
		mddev->sysfs_active = 0;
	} else
		mutex_unlock(&mddev->reconfig_mutex);

	md_wakeup_thread(mddev->thread);
	wake_up(&mddev->sb_wait);

	list_for_each_entry_safe(rdev, tmp, &delete, same_set) {
		list_del_init(&rdev->same_set);
		kobject_del(&rdev->kobj);
		export_rdev(rdev, mddev);
	}
}
EXPORT_SYMBOL_GPL(mddev_unlock);

struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
{
	struct md_rdev *rdev;

	rdev_for_each_rcu(rdev, mddev)
		if (rdev->desc_nr == nr)
			return rdev;

	return NULL;
}
EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);

static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev)
		if (rdev->bdev->bd_dev == dev)
			return rdev;

	return NULL;
}

struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;

	rdev_for_each_rcu(rdev, mddev)
		if (rdev->bdev->bd_dev == dev)
			return rdev;

	return NULL;
}
EXPORT_SYMBOL_GPL(md_find_rdev_rcu);

static struct md_personality *find_pers(int level, char *clevel)
{
	struct md_personality *pers;
	list_for_each_entry(pers, &pers_list, list) {
		if (level != LEVEL_NONE && pers->level == level)
			return pers;
		if (strcmp(pers->name, clevel) == 0)
			return pers;
	}
	return NULL;
}

/* return the offset of the super block in 512byte sectors */
static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
{
	return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev));
}

static int alloc_disk_sb(struct md_rdev *rdev)
{
	rdev->sb_page = alloc_page(GFP_KERNEL);
	if (!rdev->sb_page)
		return -ENOMEM;
	return 0;
}

void md_rdev_clear(struct md_rdev *rdev)
{
	if (rdev->sb_page) {
		put_page(rdev->sb_page);
		rdev->sb_loaded = 0;
		rdev->sb_page = NULL;
		rdev->sb_start = 0;
		rdev->sectors = 0;
	}
	if (rdev->bb_page) {
		put_page(rdev->bb_page);
		rdev->bb_page = NULL;
	}
	badblocks_exit(&rdev->badblocks);
}
EXPORT_SYMBOL_GPL(md_rdev_clear);

static void super_written(struct bio *bio)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	if (bio->bi_status) {
		pr_err("md: %s gets error=%d\n", __func__,
		       blk_status_to_errno(bio->bi_status));
		md_error(mddev, rdev);
		if (!test_bit(Faulty, &rdev->flags)
		    && (bio->bi_opf & MD_FAILFAST)) {
			set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
			set_bit(LastDev, &rdev->flags);
		}
	} else
		clear_bit(LastDev, &rdev->flags);

	bio_put(bio);

	rdev_dec_pending(rdev, mddev);

	if (atomic_dec_and_test(&mddev->pending_writes))
		wake_up(&mddev->sb_wait);
}
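/*
 * Editor's sketch (illustrative only): super_written() above completes
 * writes started with md_super_write(), and md_super_wait() lets the caller
 * wait for all of them (both are defined just below and declared in md.h).
 * A metadata update therefore follows a submit-all-then-wait pattern roughly
 * like this; the real update paths in this driver are more involved.
 */
static inline void example_write_all_supers(struct mddev *mddev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev)
		md_super_write(mddev, rdev, rdev->sb_start,
			       rdev->sb_size, rdev->sb_page);

	/* -EAGAIN means a failfast write needs to be retried */
	if (md_super_wait(mddev) < 0)
		pr_debug("%s: superblock write needs a rewrite\n",
			 mdname(mddev));
}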
void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
		    sector_t sector, int size, struct page *page)
{
	/* write first size bytes of page to sector of rdev
	 * Increment mddev->pending_writes before returning
	 * and decrement it on completion, waking up sb_wait
	 * if zero is reached.
	 * If an error occurred, call md_error
	 */
	struct bio *bio;

	if (!page)
		return;

	if (test_bit(Faulty, &rdev->flags))
		return;

	bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev,
			       1,
			       REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA,
			       GFP_NOIO, &mddev->sync_set);

	atomic_inc(&rdev->nr_pending);

	bio->bi_iter.bi_sector = sector;
	__bio_add_page(bio, page, size, 0);
	bio->bi_private = rdev;
	bio->bi_end_io = super_written;

	if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
	    test_bit(FailFast, &rdev->flags) &&
	    !test_bit(LastDev, &rdev->flags))
		bio->bi_opf |= MD_FAILFAST;

	atomic_inc(&mddev->pending_writes);
	submit_bio(bio);
}

int md_super_wait(struct mddev *mddev)
{
	/* wait for all superblock writes that were scheduled to complete */
	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes) == 0);
	if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
		return -EAGAIN;
	return 0;
}

int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
		 struct page *page, blk_opf_t opf, bool metadata_op)
{
	struct bio bio;
	struct bio_vec bvec;

	if (metadata_op && rdev->meta_bdev)
		bio_init(&bio, rdev->meta_bdev, &bvec, 1, opf);
	else
		bio_init(&bio, rdev->bdev, &bvec, 1, opf);

	if (metadata_op)
		bio.bi_iter.bi_sector = sector + rdev->sb_start;
	else if (rdev->mddev->reshape_position != MaxSector &&
		 (rdev->mddev->reshape_backwards ==
		  (sector >= rdev->mddev->reshape_position)))
		bio.bi_iter.bi_sector = sector + rdev->new_data_offset;
	else
		bio.bi_iter.bi_sector = sector + rdev->data_offset;
	__bio_add_page(&bio, page, size, 0);

	submit_bio_wait(&bio);

	return !bio.bi_status;
}
EXPORT_SYMBOL_GPL(sync_page_io);

static int read_disk_sb(struct md_rdev *rdev, int size)
{
	if (rdev->sb_loaded)
		return 0;

	if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true))
		goto fail;
	rdev->sb_loaded = 1;
	return 0;

fail:
	pr_err("md: disabled device %pg, could not read superblock.\n",
	       rdev->bdev);
	return -EINVAL;
}

static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	return	sb1->set_uuid0 == sb2->set_uuid0 &&
		sb1->set_uuid1 == sb2->set_uuid1 &&
		sb1->set_uuid2 == sb2->set_uuid2 &&
		sb1->set_uuid3 == sb2->set_uuid3;
}

static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	int ret;
	mdp_super_t *tmp1, *tmp2;

	tmp1 = kmalloc(sizeof(*tmp1), GFP_KERNEL);
	tmp2 = kmalloc(sizeof(*tmp2), GFP_KERNEL);

	if (!tmp1 || !tmp2) {
		ret = 0;
		goto abort;
	}

	*tmp1 = *sb1;
	*tmp2 = *sb2;

	/*
	 * nr_disks is not constant
	 */
	tmp1->nr_disks = 0;
	tmp2->nr_disks = 0;

	ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
abort:
	kfree(tmp1);
	kfree(tmp2);
	return ret;
}
static u32 md_csum_fold(u32 csum)
{
	csum = (csum & 0xffff) + (csum >> 16);
	return (csum & 0xffff) + (csum >> 16);
}

static unsigned int calc_sb_csum(mdp_super_t *sb)
{
	u64 newcsum = 0;
	u32 *sb32 = (u32 *)sb;
	int i;
	unsigned int disk_csum, csum;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;

	for (i = 0; i < MD_SB_BYTES/4 ; i++)
		newcsum += sb32[i];
	csum = (newcsum & 0xffffffff) + (newcsum>>32);

#ifdef CONFIG_ALPHA
	/* This used to use csum_partial, which was wrong for several
	 * reasons including that different results are returned on
	 * different architectures.  It isn't critical that we get exactly
	 * the same return value as before (we always csum_fold before
	 * testing, and that removes any differences).  However as we
	 * know that csum_partial always returned a 16bit value on
	 * alphas, do a fold to maximise conformity to previous behaviour.
	 */
	sb->sb_csum = md_csum_fold(disk_csum);
#else
	sb->sb_csum = disk_csum;
#endif
	return csum;
}

/*
 * Handle superblock details.
 * We want to be able to handle multiple superblock formats
 * so we have a common interface to them all, and an array of
 * different handlers.
 * We rely on user-space to write the initial superblock, and support
 * reading and updating of superblocks.
 * Interface methods are:
 *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
 *      loads and validates a superblock on dev.
 *      if refdev != NULL, compare superblocks on both devices
 *    Return:
 *      0 - dev has a superblock that is compatible with refdev
 *      1 - dev has a superblock that is compatible and newer than refdev
 *          so dev should be used as the refdev in future
 *     -EINVAL superblock incompatible or invalid
 *     -othererror e.g. -EIO
 *
 *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
 *      Verify that dev is acceptable into mddev.
 *       The first time, mddev->raid_disks will be 0, and data from
 *       dev should be merged in.  Subsequent calls check that dev
 *       is new enough.  Return 0 or -EINVAL
 *
 *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
 *     Update the superblock for rdev with data in mddev
 *     This does not write to disc.
 *
 */

struct super_type  {
	char		    *name;
	struct module	    *owner;
	int		    (*load_super)(struct md_rdev *rdev,
					  struct md_rdev *refdev,
					  int minor_version);
	int		    (*validate_super)(struct mddev *mddev,
					      struct md_rdev *rdev);
	void		    (*sync_super)(struct mddev *mddev,
					  struct md_rdev *rdev);
	unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
						sector_t num_sectors);
	int		    (*allow_new_offset)(struct md_rdev *rdev,
						unsigned long long new_offset);
};
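/*
 * Editor's sketch (illustrative only, compiled out): the 0.90 handlers that
 * follow are wired into the common interface above through an entry shaped
 * roughly like this.  The driver keeps such entries in a table of
 * struct super_type; the actual table is defined later in md.c and is not
 * shown in this excerpt.
 */
#if 0	/* example only - field names follow struct super_type above */
static struct super_type example_super_90 = {
	.name		  = "0.90.0",
	.owner		  = THIS_MODULE,
	.load_super	  = super_90_load,
	.validate_super	  = super_90_validate,
	.sync_super	  = super_90_sync,
	.rdev_size_change = super_90_rdev_size_change,
	.allow_new_offset = super_90_allow_new_offset,
};
#endif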
/*
 * Check that the given mddev has no bitmap.
 *
 * This function is called from the run method of all personalities that do not
 * support bitmaps. It prints an error message and returns non-zero if mddev
 * has a bitmap. Otherwise, it returns 0.
 *
 */
int md_check_no_bitmap(struct mddev *mddev)
{
	if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
		return 0;
	pr_warn("%s: bitmaps are not supported for %s\n",
		mdname(mddev), mddev->pers->name);
	return 1;
}
EXPORT_SYMBOL(md_check_no_bitmap);

/*
 * load_super for 0.90.0
 */
static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	mdp_super_t *sb;
	int ret;
	bool spare_disk = true;

	/*
	 * Calculate the position of the superblock (512byte sectors),
	 * it's at the end of the disk.
	 *
	 * It also happens to be a multiple of 4Kb.
	 */
	rdev->sb_start = calc_dev_sboffset(rdev);

	ret = read_disk_sb(rdev, MD_SB_BYTES);
	if (ret)
		return ret;

	ret = -EINVAL;

	sb = page_address(rdev->sb_page);

	if (sb->md_magic != MD_SB_MAGIC) {
		pr_warn("md: invalid raid superblock magic on %pg\n",
			rdev->bdev);
		goto abort;
	}

	if (sb->major_version != 0 ||
	    sb->minor_version < 90 ||
	    sb->minor_version > 91) {
		pr_warn("Bad version number %d.%d on %pg\n",
			sb->major_version, sb->minor_version, rdev->bdev);
		goto abort;
	}

	if (sb->raid_disks <= 0)
		goto abort;

	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
		pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev);
		goto abort;
	}

	rdev->preferred_minor = sb->md_minor;
	rdev->data_offset = 0;
	rdev->new_data_offset = 0;
	rdev->sb_size = MD_SB_BYTES;
	rdev->badblocks.shift = -1;

	if (sb->level == LEVEL_MULTIPATH)
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = sb->this_disk.number;

	/* not spare disk, or LEVEL_MULTIPATH */
	if (sb->level == LEVEL_MULTIPATH ||
	    (rdev->desc_nr >= 0 &&
	     rdev->desc_nr < MD_SB_DISKS &&
	     sb->disks[rdev->desc_nr].state &
	     ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))))
		spare_disk = false;

	if (!refdev) {
		if (!spare_disk)
			ret = 1;
		else
			ret = 0;
	} else {
		__u64 ev1, ev2;
		mdp_super_t *refsb = page_address(refdev->sb_page);
		if (!md_uuid_equal(refsb, sb)) {
			pr_warn("md: %pg has different UUID to %pg\n",
				rdev->bdev, refdev->bdev);
			goto abort;
		}
		if (!md_sb_equal(refsb, sb)) {
			pr_warn("md: %pg has same UUID but different superblock to %pg\n",
				rdev->bdev, refdev->bdev);
			goto abort;
		}
		ev1 = md_event(sb);
		ev2 = md_event(refsb);

		if (!spare_disk && ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	rdev->sectors = rdev->sb_start;
	/* Limit to 4TB as metadata cannot record more than that.
	 * (not needed for Linear and RAID0 as metadata doesn't
	 * record this size)
	 */
	if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1)
		rdev->sectors = (sector_t)(2ULL << 32) - 2;

	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
		/* "this cannot possibly happen" ... */
		ret = -EINVAL;

abort:
	return ret;
}
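/*
 * Editor's note (illustrative arithmetic): the 0.90 superblock stores the
 * component size in a 32-bit field of kilobytes, so the largest
 * representable size is
 *
 *	2^32 KB = 2^32 * 1024 B = 4 TiB = 2^33 sectors of 512 B
 *
 * which is why super_90_load() above clamps rdev->sectors to
 * (2ULL << 32) - 2, i.e. just under 2^33 sectors, for levels that record
 * the device size.
 */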
/*
 * validate_super for 0.90.0
 */
static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
{
	mdp_disk_t *desc;
	mdp_super_t *sb = page_address(rdev->sb_page);
	__u64 ev1 = md_event(sb);

	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(Bitmap_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);

	if (mddev->raid_disks == 0) {
		mddev->major_version = 0;
		mddev->minor_version = sb->minor_version;
		mddev->patch_version = sb->patch_version;
		mddev->external = 0;
		mddev->chunk_sectors = sb->chunk_size >> 9;
		mddev->ctime = sb->ctime;
		mddev->utime = sb->utime;
		mddev->level = sb->level;
		mddev->clevel[0] = 0;
		mddev->layout = sb->layout;
		mddev->raid_disks = sb->raid_disks;
		mddev->dev_sectors = ((sector_t)sb->size) * 2;
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
		/* bitmap can use 60 K after the 4K superblocks */
		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
		mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
		mddev->reshape_backwards = 0;

		if (mddev->minor_version >= 91) {
			mddev->reshape_position = sb->reshape_position;
			mddev->delta_disks = sb->delta_disks;
			mddev->new_level = sb->new_level;
			mddev->new_layout = sb->new_layout;
			mddev->new_chunk_sectors = sb->new_chunk >> 9;
			if (mddev->delta_disks < 0)
				mddev->reshape_backwards = 1;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}
		if (mddev->level == 0)
			mddev->layout = -1;

		if (sb->state & (1<<MD_SB_CLEAN))
			mddev->recovery_cp = MaxSector;
		else {
			if (sb->events_hi == sb->cp_events_hi &&
			    sb->events_lo == sb->cp_events_lo) {
				mddev->recovery_cp = sb->recovery_cp;
			} else
				mddev->recovery_cp = 0;
		}

		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
		memcpy(mddev->uuid+12, &sb->set_uuid3, 4);

		mddev->max_disks = MD_SB_DISKS;

		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
		    mddev->bitmap_info.file == NULL) {
			mddev->bitmap_info.offset =
				mddev->bitmap_info.default_offset;
			mddev->bitmap_info.space =
				mddev->bitmap_info.default_space;
		}

	} else if (mddev->pers == NULL) {
		/* Insist on good event counter while assembling, except
		 * for spares (which don't need an event count) */
		++ev1;
		if (sb->disks[rdev->desc_nr].state & (
			(1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
			if (ev1 < mddev->events)
				return -EINVAL;
	} else if (mddev->bitmap) {
		/* if adding to array with a bitmap, then we can accept an
		 * older device ... but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}

	if (mddev->level != LEVEL_MULTIPATH) {
		desc = sb->disks + rdev->desc_nr;

		if (desc->state & (1<<MD_DISK_FAULTY))
			set_bit(Faulty, &rdev->flags);
		else if (desc->state & (1<<MD_DISK_SYNC) /* &&
			    desc->raid_disk < mddev->raid_disks */) {
			set_bit(In_sync, &rdev->flags);
			rdev->raid_disk = desc->raid_disk;
			rdev->saved_raid_disk = desc->raid_disk;
		} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
			/* active but not in sync implies recovery up to
			 * reshape position.  We don't know exactly where
			 * that is, so set to zero for now */
			if (mddev->minor_version >= 91) {
				rdev->recovery_offset = 0;
				rdev->raid_disk = desc->raid_disk;
			}
		}
		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
		if (desc->state & (1<<MD_DISK_FAILFAST))
			set_bit(FailFast, &rdev->flags);
	} else /* MULTIPATH are always insync */
		set_bit(In_sync, &rdev->flags);
	return 0;
}

/*
 * sync_super for 0.90.0
 */
static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
{
	mdp_super_t *sb;
	struct md_rdev *rdev2;
	int next_spare = mddev->raid_disks;

	/* make rdev->sb match mddev data..
	 *
	 * 1/ zero out disks
	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
	 * 3/ any empty disks < next_spare become removed
	 *
	 * disks[0] gets initialised to REMOVED because
	 * we cannot be sure from other fields if it has
	 * been initialised or not.
	 */
	int i;
	int active = 0, working = 0, failed = 0, spare = 0, nr_disks = 0;

	rdev->sb_size = MD_SB_BYTES;

	sb = page_address(rdev->sb_page);

	memset(sb, 0, sizeof(*sb));

	sb->md_magic = MD_SB_MAGIC;
	sb->major_version = mddev->major_version;
	sb->patch_version = mddev->patch_version;
	sb->gvalid_words  = 0; /* ignored */
	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
	memcpy(&sb->set_uuid3, mddev->uuid+12, 4);

	sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
	sb->level = mddev->level;
	sb->size = mddev->dev_sectors / 2;
	sb->raid_disks = mddev->raid_disks;
	sb->md_minor = mddev->md_minor;
	sb->not_persistent = 0;
	sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
	sb->state = 0;
	sb->events_hi = (mddev->events>>32);
	sb->events_lo = (u32)mddev->events;

	if (mddev->reshape_position == MaxSector)
		sb->minor_version = 90;
	else {
		sb->minor_version = 91;
		sb->reshape_position = mddev->reshape_position;
		sb->new_level = mddev->new_level;
		sb->delta_disks = mddev->delta_disks;
		sb->new_layout = mddev->new_layout;
		sb->new_chunk = mddev->new_chunk_sectors << 9;
	}
	mddev->minor_version = sb->minor_version;
	if (mddev->in_sync) {
		sb->recovery_cp = mddev->recovery_cp;
		sb->cp_events_hi = (mddev->events>>32);
		sb->cp_events_lo = (u32)mddev->events;
		if (mddev->recovery_cp == MaxSector)
			sb->state = (1<< MD_SB_CLEAN);
	} else
		sb->recovery_cp = 0;

	sb->layout = mddev->layout;
	sb->chunk_size = mddev->chunk_sectors << 9;

	if (mddev->bitmap && mddev->bitmap_info.file == NULL)
		sb->state |= (1<<MD_SB_BITMAP_PRESENT);

	sb->disks[0].state = (1<<MD_DISK_REMOVED);
	rdev_for_each(rdev2, mddev) {
		mdp_disk_t *d;
		int desc_nr;
		int is_active = test_bit(In_sync, &rdev2->flags);

		if (rdev2->raid_disk >= 0 &&
		    sb->minor_version >= 91)
			/* we have nowhere to store the recovery_offset,
			 * but if it is not below the reshape_position,
			 * we can piggy-back on that.
1549 */ 1550 is_active = 1; 1551 if (rdev2->raid_disk < 0 || 1552 test_bit(Faulty, &rdev2->flags)) 1553 is_active = 0; 1554 if (is_active) 1555 desc_nr = rdev2->raid_disk; 1556 else 1557 desc_nr = next_spare++; 1558 rdev2->desc_nr = desc_nr; 1559 d = &sb->disks[rdev2->desc_nr]; 1560 nr_disks++; 1561 d->number = rdev2->desc_nr; 1562 d->major = MAJOR(rdev2->bdev->bd_dev); 1563 d->minor = MINOR(rdev2->bdev->bd_dev); 1564 if (is_active) 1565 d->raid_disk = rdev2->raid_disk; 1566 else 1567 d->raid_disk = rdev2->desc_nr; /* compatibility */ 1568 if (test_bit(Faulty, &rdev2->flags)) 1569 d->state = (1<<MD_DISK_FAULTY); 1570 else if (is_active) { 1571 d->state = (1<<MD_DISK_ACTIVE); 1572 if (test_bit(In_sync, &rdev2->flags)) 1573 d->state |= (1<<MD_DISK_SYNC); 1574 active++; 1575 working++; 1576 } else { 1577 d->state = 0; 1578 spare++; 1579 working++; 1580 } 1581 if (test_bit(WriteMostly, &rdev2->flags)) 1582 d->state |= (1<<MD_DISK_WRITEMOSTLY); 1583 if (test_bit(FailFast, &rdev2->flags)) 1584 d->state |= (1<<MD_DISK_FAILFAST); 1585 } 1586 /* now set the "removed" and "faulty" bits on any missing devices */ 1587 for (i=0 ; i < mddev->raid_disks ; i++) { 1588 mdp_disk_t *d = &sb->disks[i]; 1589 if (d->state == 0 && d->number == 0) { 1590 d->number = i; 1591 d->raid_disk = i; 1592 d->state = (1<<MD_DISK_REMOVED); 1593 d->state |= (1<<MD_DISK_FAULTY); 1594 failed++; 1595 } 1596 } 1597 sb->nr_disks = nr_disks; 1598 sb->active_disks = active; 1599 sb->working_disks = working; 1600 sb->failed_disks = failed; 1601 sb->spare_disks = spare; 1602 1603 sb->this_disk = sb->disks[rdev->desc_nr]; 1604 sb->sb_csum = calc_sb_csum(sb); 1605 } 1606 1607 /* 1608 * rdev_size_change for 0.90.0 1609 */ 1610 static unsigned long long 1611 super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 1612 { 1613 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1614 return 0; /* component must fit device */ 1615 if (rdev->mddev->bitmap_info.offset) 1616 return 0; /* can't move bitmap */ 1617 rdev->sb_start = calc_dev_sboffset(rdev); 1618 if (!num_sectors || num_sectors > rdev->sb_start) 1619 num_sectors = rdev->sb_start; 1620 /* Limit to 4TB as metadata cannot record more than that. 1621 * 4TB == 2^32 KB, or 2*2^32 sectors. 
1622 */ 1623 if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) 1624 num_sectors = (sector_t)(2ULL << 32) - 2; 1625 do { 1626 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1627 rdev->sb_page); 1628 } while (md_super_wait(rdev->mddev) < 0); 1629 return num_sectors; 1630 } 1631 1632 static int 1633 super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) 1634 { 1635 /* non-zero offset changes not possible with v0.90 */ 1636 return new_offset == 0; 1637 } 1638 1639 /* 1640 * version 1 superblock 1641 */ 1642 1643 static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb) 1644 { 1645 __le32 disk_csum; 1646 u32 csum; 1647 unsigned long long newcsum; 1648 int size = 256 + le32_to_cpu(sb->max_dev)*2; 1649 __le32 *isuper = (__le32*)sb; 1650 1651 disk_csum = sb->sb_csum; 1652 sb->sb_csum = 0; 1653 newcsum = 0; 1654 for (; size >= 4; size -= 4) 1655 newcsum += le32_to_cpu(*isuper++); 1656 1657 if (size == 2) 1658 newcsum += le16_to_cpu(*(__le16*) isuper); 1659 1660 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 1661 sb->sb_csum = disk_csum; 1662 return cpu_to_le32(csum); 1663 } 1664 1665 static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 1666 { 1667 struct mdp_superblock_1 *sb; 1668 int ret; 1669 sector_t sb_start; 1670 sector_t sectors; 1671 int bmask; 1672 bool spare_disk = true; 1673 1674 /* 1675 * Calculate the position of the superblock in 512byte sectors. 1676 * It is always aligned to a 4K boundary and 1677 * depeding on minor_version, it can be: 1678 * 0: At least 8K, but less than 12K, from end of device 1679 * 1: At start of device 1680 * 2: 4K from start of device. 1681 */ 1682 switch(minor_version) { 1683 case 0: 1684 sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2; 1685 sb_start &= ~(sector_t)(4*2-1); 1686 break; 1687 case 1: 1688 sb_start = 0; 1689 break; 1690 case 2: 1691 sb_start = 8; 1692 break; 1693 default: 1694 return -EINVAL; 1695 } 1696 rdev->sb_start = sb_start; 1697 1698 /* superblock is rarely larger than 1K, but it can be larger, 1699 * and it is safe to read 4k, so we do that 1700 */ 1701 ret = read_disk_sb(rdev, 4096); 1702 if (ret) return ret; 1703 1704 sb = page_address(rdev->sb_page); 1705 1706 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1707 sb->major_version != cpu_to_le32(1) || 1708 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1709 le64_to_cpu(sb->super_offset) != rdev->sb_start || 1710 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1711 return -EINVAL; 1712 1713 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1714 pr_warn("md: invalid superblock checksum on %pg\n", 1715 rdev->bdev); 1716 return -EINVAL; 1717 } 1718 if (le64_to_cpu(sb->data_size) < 10) { 1719 pr_warn("md: data_size too small on %pg\n", 1720 rdev->bdev); 1721 return -EINVAL; 1722 } 1723 if (sb->pad0 || 1724 sb->pad3[0] || 1725 memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) 1726 /* Some padding is non-zero, might be a new feature */ 1727 return -EINVAL; 1728 1729 rdev->preferred_minor = 0xffff; 1730 rdev->data_offset = le64_to_cpu(sb->data_offset); 1731 rdev->new_data_offset = rdev->data_offset; 1732 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && 1733 (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) 1734 rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset); 1735 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1736 1737 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1738 bmask = 
	if (rdev->sb_size & bmask)
		rdev->sb_size = (rdev->sb_size | bmask) + 1;

	if (minor_version
	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;
	if (minor_version
	    && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;

	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = le32_to_cpu(sb->dev_number);

	if (!rdev->bb_page) {
		rdev->bb_page = alloc_page(GFP_KERNEL);
		if (!rdev->bb_page)
			return -ENOMEM;
	}
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
	    rdev->badblocks.count == 0) {
		/* need to load the bad block list.
		 * Currently we limit it to one page.
		 */
		s32 offset;
		sector_t bb_sector;
		__le64 *bbp;
		int i;
		int sectors = le16_to_cpu(sb->bblog_size);
		if (sectors > (PAGE_SIZE / 512))
			return -EINVAL;
		offset = le32_to_cpu(sb->bblog_offset);
		if (offset == 0)
			return -EINVAL;
		bb_sector = (long long)offset;
		if (!sync_page_io(rdev, bb_sector, sectors << 9,
				  rdev->bb_page, REQ_OP_READ, true))
			return -EIO;
		bbp = (__le64 *)page_address(rdev->bb_page);
		rdev->badblocks.shift = sb->bblog_shift;
		for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
			u64 bb = le64_to_cpu(*bbp);
			int count = bb & (0x3ff);
			u64 sector = bb >> 10;
			sector <<= sb->bblog_shift;
			count <<= sb->bblog_shift;
			if (bb + 1 == 0)
				break;
			if (badblocks_set(&rdev->badblocks, sector, count, 1))
				return -EINVAL;
		}
	} else if (sb->bblog_offset != 0)
		rdev->badblocks.shift = 0;

	if ((le32_to_cpu(sb->feature_map) &
	    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) {
		rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
		rdev->ppl.size = le16_to_cpu(sb->ppl.size);
		rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
	}

	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) &&
	    sb->level != 0)
		return -EINVAL;

	/* not spare disk, or LEVEL_MULTIPATH */
	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH) ||
	    (rdev->desc_nr >= 0 &&
	     rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
	     (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
	      le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)))
		spare_disk = false;

	if (!refdev) {
		if (!spare_disk)
			ret = 1;
		else
			ret = 0;
	} else {
		__u64 ev1, ev2;
		struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);

		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
		    sb->level != refsb->level ||
		    sb->layout != refsb->layout ||
		    sb->chunksize != refsb->chunksize) {
			pr_warn("md: %pg has strangely different superblock to %pg\n",
				rdev->bdev,
				refdev->bdev);
			return -EINVAL;
		}
		ev1 = le64_to_cpu(sb->events);
		ev2 = le64_to_cpu(refsb->events);

		if (!spare_disk && ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	if (minor_version)
		sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset;
	else
		sectors = rdev->sb_start;
	if (sectors < le64_to_cpu(sb->data_size))
		return -EINVAL;
	rdev->sectors = le64_to_cpu(sb->data_size);
	return ret;
}

static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	__u64 ev1 = le64_to_cpu(sb->events);

	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(Bitmap_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);

	if (mddev->raid_disks == 0) {
		mddev->major_version = 1;
		mddev->patch_version = 0;
		mddev->external = 0;
		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
		mddev->ctime = le64_to_cpu(sb->ctime);
		mddev->utime = le64_to_cpu(sb->utime);
		mddev->level = le32_to_cpu(sb->level);
		mddev->clevel[0] = 0;
		mddev->layout = le32_to_cpu(sb->layout);
		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
		mddev->dev_sectors = le64_to_cpu(sb->size);
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
		/* Default location for bitmap is 1K after superblock
		 * using 3K - total of 4K
		 */
		mddev->bitmap_info.default_offset = 1024 >> 9;
		mddev->bitmap_info.default_space = (4096-1024) >> 9;
		mddev->reshape_backwards = 0;

		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
		memcpy(mddev->uuid, sb->set_uuid, 16);

		mddev->max_disks =  (4096-256)/2;

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
		    mddev->bitmap_info.file == NULL) {
			mddev->bitmap_info.offset =
				(__s32)le32_to_cpu(sb->bitmap_offset);
			/* Metadata doesn't record how much space is available.
			 * For 1.0, we assume we can use up to the superblock
			 * if before, else to 4K beyond superblock.
			 * For others, assume no change is possible.
			 */
			if (mddev->minor_version > 0)
				mddev->bitmap_info.space = 0;
			else if (mddev->bitmap_info.offset > 0)
				mddev->bitmap_info.space =
					8 - mddev->bitmap_info.offset;
			else
				mddev->bitmap_info.space =
					-mddev->bitmap_info.offset;
		}

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
			mddev->new_level = le32_to_cpu(sb->new_level);
			mddev->new_layout = le32_to_cpu(sb->new_layout);
			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
			if (mddev->delta_disks < 0 ||
			    (mddev->delta_disks == 0 &&
			     (le32_to_cpu(sb->feature_map)
			      & MD_FEATURE_RESHAPE_BACKWARDS)))
				mddev->reshape_backwards = 1;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}

		if (mddev->level == 0 &&
		    !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT))
			mddev->layout = -1;

		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
			set_bit(MD_HAS_JOURNAL, &mddev->flags);

		if (le32_to_cpu(sb->feature_map) &
		    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) {
			if (le32_to_cpu(sb->feature_map) &
			    (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
				return -EINVAL;
			if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
			    (le32_to_cpu(sb->feature_map) &
			     MD_FEATURE_MULTIPLE_PPLS))
				return -EINVAL;
			set_bit(MD_HAS_PPL, &mddev->flags);
		}
	} else if (mddev->pers == NULL) {
		/* Insist on good event counter while assembling, except for
		 * spares (which don't need an event count) */
		++ev1;
		if (rdev->desc_nr >= 0 &&
		    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
< le32_to_cpu(sb->max_dev) && 1949 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || 1950 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) 1951 if (ev1 < mddev->events) 1952 return -EINVAL; 1953 } else if (mddev->bitmap) { 1954 /* If adding to array with a bitmap, then we can accept an 1955 * older device, but not too old. 1956 */ 1957 if (ev1 < mddev->bitmap->events_cleared) 1958 return 0; 1959 if (ev1 < mddev->events) 1960 set_bit(Bitmap_sync, &rdev->flags); 1961 } else { 1962 if (ev1 < mddev->events) 1963 /* just a hot-add of a new device, leave raid_disk at -1 */ 1964 return 0; 1965 } 1966 if (mddev->level != LEVEL_MULTIPATH) { 1967 int role; 1968 if (rdev->desc_nr < 0 || 1969 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { 1970 role = MD_DISK_ROLE_SPARE; 1971 rdev->desc_nr = -1; 1972 } else 1973 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1974 switch(role) { 1975 case MD_DISK_ROLE_SPARE: /* spare */ 1976 break; 1977 case MD_DISK_ROLE_FAULTY: /* faulty */ 1978 set_bit(Faulty, &rdev->flags); 1979 break; 1980 case MD_DISK_ROLE_JOURNAL: /* journal device */ 1981 if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { 1982 /* journal device without journal feature */ 1983 pr_warn("md: journal device provided without journal feature, ignoring the device\n"); 1984 return -EINVAL; 1985 } 1986 set_bit(Journal, &rdev->flags); 1987 rdev->journal_tail = le64_to_cpu(sb->journal_tail); 1988 rdev->raid_disk = 0; 1989 break; 1990 default: 1991 rdev->saved_raid_disk = role; 1992 if ((le32_to_cpu(sb->feature_map) & 1993 MD_FEATURE_RECOVERY_OFFSET)) { 1994 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 1995 if (!(le32_to_cpu(sb->feature_map) & 1996 MD_FEATURE_RECOVERY_BITMAP)) 1997 rdev->saved_raid_disk = -1; 1998 } else { 1999 /* 2000 * If the array is FROZEN, then the device can't 2001 * be in_sync with rest of array. 2002 */ 2003 if (!test_bit(MD_RECOVERY_FROZEN, 2004 &mddev->recovery)) 2005 set_bit(In_sync, &rdev->flags); 2006 } 2007 rdev->raid_disk = role; 2008 break; 2009 } 2010 if (sb->devflags & WriteMostly1) 2011 set_bit(WriteMostly, &rdev->flags); 2012 if (sb->devflags & FailFast1) 2013 set_bit(FailFast, &rdev->flags); 2014 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) 2015 set_bit(Replacement, &rdev->flags); 2016 } else /* MULTIPATH are always insync */ 2017 set_bit(In_sync, &rdev->flags); 2018 2019 return 0; 2020 } 2021 2022 static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) 2023 { 2024 struct mdp_superblock_1 *sb; 2025 struct md_rdev *rdev2; 2026 int max_dev, i; 2027 /* make rdev->sb match mddev and rdev data. 
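 *
 * As a concrete illustration of the mapping done below: an array that
 * is in_sync records its current recovery_cp as resync_offset, an
 * array that is only journal-clean (MD_JOURNAL_CLEAN) records
 * MaxSector, and a dirty array records 0 so that a later assembly
 * treats it as needing a resync.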
*/ 2028 2029 sb = page_address(rdev->sb_page); 2030 2031 sb->feature_map = 0; 2032 sb->pad0 = 0; 2033 sb->recovery_offset = cpu_to_le64(0); 2034 memset(sb->pad3, 0, sizeof(sb->pad3)); 2035 2036 sb->utime = cpu_to_le64((__u64)mddev->utime); 2037 sb->events = cpu_to_le64(mddev->events); 2038 if (mddev->in_sync) 2039 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 2040 else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) 2041 sb->resync_offset = cpu_to_le64(MaxSector); 2042 else 2043 sb->resync_offset = cpu_to_le64(0); 2044 2045 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); 2046 2047 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 2048 sb->size = cpu_to_le64(mddev->dev_sectors); 2049 sb->chunksize = cpu_to_le32(mddev->chunk_sectors); 2050 sb->level = cpu_to_le32(mddev->level); 2051 sb->layout = cpu_to_le32(mddev->layout); 2052 if (test_bit(FailFast, &rdev->flags)) 2053 sb->devflags |= FailFast1; 2054 else 2055 sb->devflags &= ~FailFast1; 2056 2057 if (test_bit(WriteMostly, &rdev->flags)) 2058 sb->devflags |= WriteMostly1; 2059 else 2060 sb->devflags &= ~WriteMostly1; 2061 sb->data_offset = cpu_to_le64(rdev->data_offset); 2062 sb->data_size = cpu_to_le64(rdev->sectors); 2063 2064 if (mddev->bitmap && mddev->bitmap_info.file == NULL) { 2065 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); 2066 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 2067 } 2068 2069 if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && 2070 !test_bit(In_sync, &rdev->flags)) { 2071 sb->feature_map |= 2072 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 2073 sb->recovery_offset = 2074 cpu_to_le64(rdev->recovery_offset); 2075 if (rdev->saved_raid_disk >= 0 && mddev->bitmap) 2076 sb->feature_map |= 2077 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); 2078 } 2079 /* Note: recovery_offset and journal_tail share space */ 2080 if (test_bit(Journal, &rdev->flags)) 2081 sb->journal_tail = cpu_to_le64(rdev->journal_tail); 2082 if (test_bit(Replacement, &rdev->flags)) 2083 sb->feature_map |= 2084 cpu_to_le32(MD_FEATURE_REPLACEMENT); 2085 2086 if (mddev->reshape_position != MaxSector) { 2087 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 2088 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 2089 sb->new_layout = cpu_to_le32(mddev->new_layout); 2090 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 2091 sb->new_level = cpu_to_le32(mddev->new_level); 2092 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 2093 if (mddev->delta_disks == 0 && 2094 mddev->reshape_backwards) 2095 sb->feature_map 2096 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); 2097 if (rdev->new_data_offset != rdev->data_offset) { 2098 sb->feature_map 2099 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); 2100 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset 2101 - rdev->data_offset)); 2102 } 2103 } 2104 2105 if (mddev_is_clustered(mddev)) 2106 sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); 2107 2108 if (rdev->badblocks.count == 0) 2109 /* Nothing to do for bad blocks*/ ; 2110 else if (sb->bblog_offset == 0) 2111 /* Cannot record bad blocks on this device */ 2112 md_error(mddev, rdev); 2113 else { 2114 struct badblocks *bb = &rdev->badblocks; 2115 __le64 *bbp = (__le64 *)page_address(rdev->bb_page); 2116 u64 *p = bb->page; 2117 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); 2118 if (bb->changed) { 2119 unsigned seq; 2120 2121 retry: 2122 seq = read_seqbegin(&bb->lock); 2123 2124 memset(bbp, 0xff, PAGE_SIZE); 2125 2126 for (i = 0 ; i < bb->count ; i++) { 2127 u64 
internal_bb = p[i]; 2128 u64 store_bb = ((BB_OFFSET(internal_bb) << 10) 2129 | BB_LEN(internal_bb)); 2130 bbp[i] = cpu_to_le64(store_bb); 2131 } 2132 bb->changed = 0; 2133 if (read_seqretry(&bb->lock, seq)) 2134 goto retry; 2135 2136 bb->sector = (rdev->sb_start + 2137 (int)le32_to_cpu(sb->bblog_offset)); 2138 bb->size = le16_to_cpu(sb->bblog_size); 2139 } 2140 } 2141 2142 max_dev = 0; 2143 rdev_for_each(rdev2, mddev) 2144 if (rdev2->desc_nr+1 > max_dev) 2145 max_dev = rdev2->desc_nr+1; 2146 2147 if (max_dev > le32_to_cpu(sb->max_dev)) { 2148 int bmask; 2149 sb->max_dev = cpu_to_le32(max_dev); 2150 rdev->sb_size = max_dev * 2 + 256; 2151 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 2152 if (rdev->sb_size & bmask) 2153 rdev->sb_size = (rdev->sb_size | bmask) + 1; 2154 } else 2155 max_dev = le32_to_cpu(sb->max_dev); 2156 2157 for (i=0; i<max_dev;i++) 2158 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 2159 2160 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) 2161 sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); 2162 2163 if (test_bit(MD_HAS_PPL, &mddev->flags)) { 2164 if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags)) 2165 sb->feature_map |= 2166 cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS); 2167 else 2168 sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); 2169 sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); 2170 sb->ppl.size = cpu_to_le16(rdev->ppl.size); 2171 } 2172 2173 rdev_for_each(rdev2, mddev) { 2174 i = rdev2->desc_nr; 2175 if (test_bit(Faulty, &rdev2->flags)) 2176 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); 2177 else if (test_bit(In_sync, &rdev2->flags)) 2178 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 2179 else if (test_bit(Journal, &rdev2->flags)) 2180 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); 2181 else if (rdev2->raid_disk >= 0) 2182 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 2183 else 2184 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 2185 } 2186 2187 sb->sb_csum = calc_sb_1_csum(sb); 2188 } 2189 2190 static sector_t super_1_choose_bm_space(sector_t dev_size) 2191 { 2192 sector_t bm_space; 2193 2194 /* if the device is bigger than 8Gig, save 64k for bitmap 2195 * usage, if bigger than 200Gig, save 128k 2196 */ 2197 if (dev_size < 64*2) 2198 bm_space = 0; 2199 else if (dev_size - 64*2 >= 200*1024*1024*2) 2200 bm_space = 128*2; 2201 else if (dev_size - 4*2 > 8*1024*1024*2) 2202 bm_space = 64*2; 2203 else 2204 bm_space = 4*2; 2205 return bm_space; 2206 } 2207 2208 static unsigned long long 2209 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 2210 { 2211 struct mdp_superblock_1 *sb; 2212 sector_t max_sectors; 2213 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 2214 return 0; /* component must fit device */ 2215 if (rdev->data_offset != rdev->new_data_offset) 2216 return 0; /* too confusing */ 2217 if (rdev->sb_start < rdev->data_offset) { 2218 /* minor versions 1 and 2; superblock before data */ 2219 max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset; 2220 if (!num_sectors || num_sectors > max_sectors) 2221 num_sectors = max_sectors; 2222 } else if (rdev->mddev->bitmap_info.offset) { 2223 /* minor version 0 with bitmap we can't move */ 2224 return 0; 2225 } else { 2226 /* minor version 0; superblock after data */ 2227 sector_t sb_start, bm_space; 2228 sector_t dev_size = bdev_nr_sectors(rdev->bdev); 2229 2230 /* 8K is for superblock */ 2231 sb_start = dev_size - 8*2; 2232 sb_start &= ~(sector_t)(4*2 - 1); 2233 2234 bm_space = super_1_choose_bm_space(dev_size); 2235 2236 /* Space that 
can be used to store date needs to decrease 2237 * superblock bitmap space and bad block space(4K) 2238 */ 2239 max_sectors = sb_start - bm_space - 4*2; 2240 2241 if (!num_sectors || num_sectors > max_sectors) 2242 num_sectors = max_sectors; 2243 rdev->sb_start = sb_start; 2244 } 2245 sb = page_address(rdev->sb_page); 2246 sb->data_size = cpu_to_le64(num_sectors); 2247 sb->super_offset = cpu_to_le64(rdev->sb_start); 2248 sb->sb_csum = calc_sb_1_csum(sb); 2249 do { 2250 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 2251 rdev->sb_page); 2252 } while (md_super_wait(rdev->mddev) < 0); 2253 return num_sectors; 2254 2255 } 2256 2257 static int 2258 super_1_allow_new_offset(struct md_rdev *rdev, 2259 unsigned long long new_offset) 2260 { 2261 /* All necessary checks on new >= old have been done */ 2262 struct bitmap *bitmap; 2263 if (new_offset >= rdev->data_offset) 2264 return 1; 2265 2266 /* with 1.0 metadata, there is no metadata to tread on 2267 * so we can always move back */ 2268 if (rdev->mddev->minor_version == 0) 2269 return 1; 2270 2271 /* otherwise we must be sure not to step on 2272 * any metadata, so stay: 2273 * 36K beyond start of superblock 2274 * beyond end of badblocks 2275 * beyond write-intent bitmap 2276 */ 2277 if (rdev->sb_start + (32+4)*2 > new_offset) 2278 return 0; 2279 bitmap = rdev->mddev->bitmap; 2280 if (bitmap && !rdev->mddev->bitmap_info.file && 2281 rdev->sb_start + rdev->mddev->bitmap_info.offset + 2282 bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) 2283 return 0; 2284 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) 2285 return 0; 2286 2287 return 1; 2288 } 2289 2290 static struct super_type super_types[] = { 2291 [0] = { 2292 .name = "0.90.0", 2293 .owner = THIS_MODULE, 2294 .load_super = super_90_load, 2295 .validate_super = super_90_validate, 2296 .sync_super = super_90_sync, 2297 .rdev_size_change = super_90_rdev_size_change, 2298 .allow_new_offset = super_90_allow_new_offset, 2299 }, 2300 [1] = { 2301 .name = "md-1", 2302 .owner = THIS_MODULE, 2303 .load_super = super_1_load, 2304 .validate_super = super_1_validate, 2305 .sync_super = super_1_sync, 2306 .rdev_size_change = super_1_rdev_size_change, 2307 .allow_new_offset = super_1_allow_new_offset, 2308 }, 2309 }; 2310 2311 static void sync_super(struct mddev *mddev, struct md_rdev *rdev) 2312 { 2313 if (mddev->sync_super) { 2314 mddev->sync_super(mddev, rdev); 2315 return; 2316 } 2317 2318 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); 2319 2320 super_types[mddev->major_version].sync_super(mddev, rdev); 2321 } 2322 2323 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) 2324 { 2325 struct md_rdev *rdev, *rdev2; 2326 2327 rcu_read_lock(); 2328 rdev_for_each_rcu(rdev, mddev1) { 2329 if (test_bit(Faulty, &rdev->flags) || 2330 test_bit(Journal, &rdev->flags) || 2331 rdev->raid_disk == -1) 2332 continue; 2333 rdev_for_each_rcu(rdev2, mddev2) { 2334 if (test_bit(Faulty, &rdev2->flags) || 2335 test_bit(Journal, &rdev2->flags) || 2336 rdev2->raid_disk == -1) 2337 continue; 2338 if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) { 2339 rcu_read_unlock(); 2340 return 1; 2341 } 2342 } 2343 } 2344 rcu_read_unlock(); 2345 return 0; 2346 } 2347 2348 static LIST_HEAD(pending_raid_disks); 2349 2350 /* 2351 * Try to register data integrity profile for an mddev 2352 * 2353 * This is called when an array is started and after a disk has been kicked 2354 * from the array. 
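 *
 * For example (hypothetical setup): if every working, active member of
 * a RAID1 pair exposes the same integrity profile (e.g. T10 PI), that
 * common profile is registered for the md gendisk; if a member's
 * profile differs from the reference device's, the call fails with
 * -EINVAL, and if the reference device carries no integrity profile at
 * all the function simply returns 0 without registering anything.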
It only succeeds if all working and active component devices 2355 * are integrity capable with matching profiles. 2356 */ 2357 int md_integrity_register(struct mddev *mddev) 2358 { 2359 struct md_rdev *rdev, *reference = NULL; 2360 2361 if (list_empty(&mddev->disks)) 2362 return 0; /* nothing to do */ 2363 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) 2364 return 0; /* shouldn't register, or already is */ 2365 rdev_for_each(rdev, mddev) { 2366 /* skip spares and non-functional disks */ 2367 if (test_bit(Faulty, &rdev->flags)) 2368 continue; 2369 if (rdev->raid_disk < 0) 2370 continue; 2371 if (!reference) { 2372 /* Use the first rdev as the reference */ 2373 reference = rdev; 2374 continue; 2375 } 2376 /* does this rdev's profile match the reference profile? */ 2377 if (blk_integrity_compare(reference->bdev->bd_disk, 2378 rdev->bdev->bd_disk) < 0) 2379 return -EINVAL; 2380 } 2381 if (!reference || !bdev_get_integrity(reference->bdev)) 2382 return 0; 2383 /* 2384 * All component devices are integrity capable and have matching 2385 * profiles, register the common profile for the md device. 2386 */ 2387 blk_integrity_register(mddev->gendisk, 2388 bdev_get_integrity(reference->bdev)); 2389 2390 pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); 2391 if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || 2392 (mddev->level != 1 && mddev->level != 10 && 2393 bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) { 2394 /* 2395 * No need to handle the failure of bioset_integrity_create, 2396 * because the function is called by md_run() -> pers->run(), 2397 * md_run calls bioset_exit -> bioset_integrity_free in case 2398 * of failure case. 2399 */ 2400 pr_err("md: failed to create integrity pool for %s\n", 2401 mdname(mddev)); 2402 return -EINVAL; 2403 } 2404 return 0; 2405 } 2406 EXPORT_SYMBOL(md_integrity_register); 2407 2408 /* 2409 * Attempt to add an rdev, but only if it is consistent with the current 2410 * integrity profile 2411 */ 2412 int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) 2413 { 2414 struct blk_integrity *bi_mddev; 2415 2416 if (!mddev->gendisk) 2417 return 0; 2418 2419 bi_mddev = blk_get_integrity(mddev->gendisk); 2420 2421 if (!bi_mddev) /* nothing to do */ 2422 return 0; 2423 2424 if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { 2425 pr_err("%s: incompatible integrity profile for %pg\n", 2426 mdname(mddev), rdev->bdev); 2427 return -ENXIO; 2428 } 2429 2430 return 0; 2431 } 2432 EXPORT_SYMBOL(md_integrity_add_rdev); 2433 2434 static bool rdev_read_only(struct md_rdev *rdev) 2435 { 2436 return bdev_read_only(rdev->bdev) || 2437 (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev)); 2438 } 2439 2440 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) 2441 { 2442 char b[BDEVNAME_SIZE]; 2443 int err; 2444 2445 /* prevent duplicates */ 2446 if (find_rdev(mddev, rdev->bdev->bd_dev)) 2447 return -EEXIST; 2448 2449 if (rdev_read_only(rdev) && mddev->pers) 2450 return -EROFS; 2451 2452 /* make sure rdev->sectors exceeds mddev->dev_sectors */ 2453 if (!test_bit(Journal, &rdev->flags) && 2454 rdev->sectors && 2455 (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { 2456 if (mddev->pers) { 2457 /* Cannot change size, so fail 2458 * If mddev->level <= 0, then we don't care 2459 * about aligning sizes (e.g. linear) 2460 */ 2461 if (mddev->level > 0) 2462 return -ENOSPC; 2463 } else 2464 mddev->dev_sectors = rdev->sectors; 2465 } 2466 2467 /* Verify rdev->desc_nr is unique. 
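 * (Illustration with made-up numbers: a device hot-added to a running
 * 4-disk array with desc_nr == -1 is given the first free number
 * counting up from mddev->raid_disks, i.e. 4 if nothing else claims
 * it; a device arriving with a desc_nr already bound to this array is
 * rejected with -EBUSY.)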
2468 * If it is -1, assign a free number, else 2469 * check number is not in use 2470 */ 2471 rcu_read_lock(); 2472 if (rdev->desc_nr < 0) { 2473 int choice = 0; 2474 if (mddev->pers) 2475 choice = mddev->raid_disks; 2476 while (md_find_rdev_nr_rcu(mddev, choice)) 2477 choice++; 2478 rdev->desc_nr = choice; 2479 } else { 2480 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { 2481 rcu_read_unlock(); 2482 return -EBUSY; 2483 } 2484 } 2485 rcu_read_unlock(); 2486 if (!test_bit(Journal, &rdev->flags) && 2487 mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { 2488 pr_warn("md: %s: array is limited to %d devices\n", 2489 mdname(mddev), mddev->max_disks); 2490 return -EBUSY; 2491 } 2492 snprintf(b, sizeof(b), "%pg", rdev->bdev); 2493 strreplace(b, '/', '!'); 2494 2495 rdev->mddev = mddev; 2496 pr_debug("md: bind<%s>\n", b); 2497 2498 if (mddev->raid_disks) 2499 mddev_create_serial_pool(mddev, rdev); 2500 2501 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 2502 goto fail; 2503 2504 /* failure here is OK */ 2505 err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block"); 2506 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); 2507 rdev->sysfs_unack_badblocks = 2508 sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks"); 2509 rdev->sysfs_badblocks = 2510 sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks"); 2511 2512 list_add_rcu(&rdev->same_set, &mddev->disks); 2513 bd_link_disk_holder(rdev->bdev, mddev->gendisk); 2514 2515 /* May as well allow recovery to be retried once */ 2516 mddev->recovery_disabled++; 2517 2518 return 0; 2519 2520 fail: 2521 pr_warn("md: failed to register dev-%s for %s\n", 2522 b, mdname(mddev)); 2523 return err; 2524 } 2525 2526 void md_autodetect_dev(dev_t dev); 2527 2528 /* just for claiming the bdev */ 2529 static struct md_rdev claim_rdev; 2530 2531 static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) 2532 { 2533 pr_debug("md: export_rdev(%pg)\n", rdev->bdev); 2534 md_rdev_clear(rdev); 2535 #ifndef MODULE 2536 if (test_bit(AutoDetected, &rdev->flags)) 2537 md_autodetect_dev(rdev->bdev->bd_dev); 2538 #endif 2539 bdev_release(rdev->bdev_handle); 2540 rdev->bdev = NULL; 2541 kobject_put(&rdev->kobj); 2542 } 2543 2544 static void md_kick_rdev_from_array(struct md_rdev *rdev) 2545 { 2546 struct mddev *mddev = rdev->mddev; 2547 2548 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); 2549 list_del_rcu(&rdev->same_set); 2550 pr_debug("md: unbind<%pg>\n", rdev->bdev); 2551 mddev_destroy_serial_pool(rdev->mddev, rdev); 2552 rdev->mddev = NULL; 2553 sysfs_remove_link(&rdev->kobj, "block"); 2554 sysfs_put(rdev->sysfs_state); 2555 sysfs_put(rdev->sysfs_unack_badblocks); 2556 sysfs_put(rdev->sysfs_badblocks); 2557 rdev->sysfs_state = NULL; 2558 rdev->sysfs_unack_badblocks = NULL; 2559 rdev->sysfs_badblocks = NULL; 2560 rdev->badblocks.count = 0; 2561 2562 synchronize_rcu(); 2563 2564 /* 2565 * kobject_del() will wait for all in progress writers to be done, where 2566 * reconfig_mutex is held, hence it can't be called under 2567 * reconfig_mutex and it's delayed to mddev_unlock(). 
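 *
 * Rough life cycle from here on (inferred from the note above rather
 * than spelled out in this function): the rdev is parked on
 * mddev->deleting below, and the deferred kobject_del()/export_rdev()
 * work is expected to run from mddev_unlock() once reconfig_mutex has
 * been dropped.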
2568 */ 2569 list_add(&rdev->same_set, &mddev->deleting); 2570 } 2571 2572 static void export_array(struct mddev *mddev) 2573 { 2574 struct md_rdev *rdev; 2575 2576 while (!list_empty(&mddev->disks)) { 2577 rdev = list_first_entry(&mddev->disks, struct md_rdev, 2578 same_set); 2579 md_kick_rdev_from_array(rdev); 2580 } 2581 mddev->raid_disks = 0; 2582 mddev->major_version = 0; 2583 } 2584 2585 static bool set_in_sync(struct mddev *mddev) 2586 { 2587 lockdep_assert_held(&mddev->lock); 2588 if (!mddev->in_sync) { 2589 mddev->sync_checkers++; 2590 spin_unlock(&mddev->lock); 2591 percpu_ref_switch_to_atomic_sync(&mddev->writes_pending); 2592 spin_lock(&mddev->lock); 2593 if (!mddev->in_sync && 2594 percpu_ref_is_zero(&mddev->writes_pending)) { 2595 mddev->in_sync = 1; 2596 /* 2597 * Ensure ->in_sync is visible before we clear 2598 * ->sync_checkers. 2599 */ 2600 smp_mb(); 2601 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2602 sysfs_notify_dirent_safe(mddev->sysfs_state); 2603 } 2604 if (--mddev->sync_checkers == 0) 2605 percpu_ref_switch_to_percpu(&mddev->writes_pending); 2606 } 2607 if (mddev->safemode == 1) 2608 mddev->safemode = 0; 2609 return mddev->in_sync; 2610 } 2611 2612 static void sync_sbs(struct mddev *mddev, int nospares) 2613 { 2614 /* Update each superblock (in-memory image), but 2615 * if we are allowed to, skip spares which already 2616 * have the right event counter, or have one earlier 2617 * (which would mean they aren't being marked as dirty 2618 * with the rest of the array) 2619 */ 2620 struct md_rdev *rdev; 2621 rdev_for_each(rdev, mddev) { 2622 if (rdev->sb_events == mddev->events || 2623 (nospares && 2624 rdev->raid_disk < 0 && 2625 rdev->sb_events+1 == mddev->events)) { 2626 /* Don't update this superblock */ 2627 rdev->sb_loaded = 2; 2628 } else { 2629 sync_super(mddev, rdev); 2630 rdev->sb_loaded = 1; 2631 } 2632 } 2633 } 2634 2635 static bool does_sb_need_changing(struct mddev *mddev) 2636 { 2637 struct md_rdev *rdev = NULL, *iter; 2638 struct mdp_superblock_1 *sb; 2639 int role; 2640 2641 /* Find a good rdev */ 2642 rdev_for_each(iter, mddev) 2643 if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) { 2644 rdev = iter; 2645 break; 2646 } 2647 2648 /* No good device found. */ 2649 if (!rdev) 2650 return false; 2651 2652 sb = page_address(rdev->sb_page); 2653 /* Check if a device has become faulty or a spare become active */ 2654 rdev_for_each(rdev, mddev) { 2655 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 2656 /* Device activated? */ 2657 if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 && 2658 !test_bit(Faulty, &rdev->flags)) 2659 return true; 2660 /* Device turned faulty? 
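 * (e.g. the on-disk dev_roles[] entry still holds an ordinary role
 * value below MD_DISK_ROLE_MAX, but the rdev is now flagged Faulty in
 * memory, so the superblock is stale and needs rewriting)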
*/ 2661 if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX)) 2662 return true; 2663 } 2664 2665 /* Check if any mddev parameters have changed */ 2666 if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || 2667 (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || 2668 (mddev->layout != le32_to_cpu(sb->layout)) || 2669 (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || 2670 (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) 2671 return true; 2672 2673 return false; 2674 } 2675 2676 void md_update_sb(struct mddev *mddev, int force_change) 2677 { 2678 struct md_rdev *rdev; 2679 int sync_req; 2680 int nospares = 0; 2681 int any_badblocks_changed = 0; 2682 int ret = -1; 2683 2684 if (!md_is_rdwr(mddev)) { 2685 if (force_change) 2686 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2687 return; 2688 } 2689 2690 repeat: 2691 if (mddev_is_clustered(mddev)) { 2692 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2693 force_change = 1; 2694 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 2695 nospares = 1; 2696 ret = md_cluster_ops->metadata_update_start(mddev); 2697 /* Has someone else has updated the sb */ 2698 if (!does_sb_need_changing(mddev)) { 2699 if (ret == 0) 2700 md_cluster_ops->metadata_update_cancel(mddev); 2701 bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2702 BIT(MD_SB_CHANGE_DEVS) | 2703 BIT(MD_SB_CHANGE_CLEAN)); 2704 return; 2705 } 2706 } 2707 2708 /* 2709 * First make sure individual recovery_offsets are correct 2710 * curr_resync_completed can only be used during recovery. 2711 * During reshape/resync it might use array-addresses rather 2712 * that device addresses. 2713 */ 2714 rdev_for_each(rdev, mddev) { 2715 if (rdev->raid_disk >= 0 && 2716 mddev->delta_disks >= 0 && 2717 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 2718 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && 2719 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 2720 !test_bit(Journal, &rdev->flags) && 2721 !test_bit(In_sync, &rdev->flags) && 2722 mddev->curr_resync_completed > rdev->recovery_offset) 2723 rdev->recovery_offset = mddev->curr_resync_completed; 2724 2725 } 2726 if (!mddev->persistent) { 2727 clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2728 clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2729 if (!mddev->external) { 2730 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 2731 rdev_for_each(rdev, mddev) { 2732 if (rdev->badblocks.changed) { 2733 rdev->badblocks.changed = 0; 2734 ack_all_badblocks(&rdev->badblocks); 2735 md_error(mddev, rdev); 2736 } 2737 clear_bit(Blocked, &rdev->flags); 2738 clear_bit(BlockedBadBlocks, &rdev->flags); 2739 wake_up(&rdev->blocked_wait); 2740 } 2741 } 2742 wake_up(&mddev->sb_wait); 2743 return; 2744 } 2745 2746 spin_lock(&mddev->lock); 2747 2748 mddev->utime = ktime_get_real_seconds(); 2749 2750 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2751 force_change = 1; 2752 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 2753 /* just a clean<-> dirty transition, possibly leave spares alone, 2754 * though if events isn't the right even/odd, we will have to do 2755 * spares after all 2756 */ 2757 nospares = 1; 2758 if (force_change) 2759 nospares = 0; 2760 if (mddev->degraded) 2761 /* If the array is degraded, then skipping spares is both 2762 * dangerous and fairly pointless. 2763 * Dangerous because a device that was removed from the array 2764 * might have a event_count that still looks up-to-date, 2765 * so it can be re-added without a resync. 
2766 * Pointless because if there are any spares to skip, 2767 * then a recovery will happen and soon that array won't 2768 * be degraded any more and the spare can go back to sleep then. 2769 */ 2770 nospares = 0; 2771 2772 sync_req = mddev->in_sync; 2773 2774 /* If this is just a dirty<->clean transition, and the array is clean 2775 * and 'events' is odd, we can roll back to the previous clean state */ 2776 if (nospares 2777 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 2778 && mddev->can_decrease_events 2779 && mddev->events != 1) { 2780 mddev->events--; 2781 mddev->can_decrease_events = 0; 2782 } else { 2783 /* otherwise we have to go forward and ... */ 2784 mddev->events ++; 2785 mddev->can_decrease_events = nospares; 2786 } 2787 2788 /* 2789 * This 64-bit counter should never wrap. 2790 * Either we are in around ~1 trillion A.C., assuming 2791 * 1 reboot per second, or we have a bug... 2792 */ 2793 WARN_ON(mddev->events == 0); 2794 2795 rdev_for_each(rdev, mddev) { 2796 if (rdev->badblocks.changed) 2797 any_badblocks_changed++; 2798 if (test_bit(Faulty, &rdev->flags)) 2799 set_bit(FaultRecorded, &rdev->flags); 2800 } 2801 2802 sync_sbs(mddev, nospares); 2803 spin_unlock(&mddev->lock); 2804 2805 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", 2806 mdname(mddev), mddev->in_sync); 2807 2808 if (mddev->queue) 2809 blk_add_trace_msg(mddev->queue, "md md_update_sb"); 2810 rewrite: 2811 md_bitmap_update_sb(mddev->bitmap); 2812 rdev_for_each(rdev, mddev) { 2813 if (rdev->sb_loaded != 1) 2814 continue; /* no noise on spare devices */ 2815 2816 if (!test_bit(Faulty, &rdev->flags)) { 2817 md_super_write(mddev,rdev, 2818 rdev->sb_start, rdev->sb_size, 2819 rdev->sb_page); 2820 pr_debug("md: (write) %pg's sb offset: %llu\n", 2821 rdev->bdev, 2822 (unsigned long long)rdev->sb_start); 2823 rdev->sb_events = mddev->events; 2824 if (rdev->badblocks.size) { 2825 md_super_write(mddev, rdev, 2826 rdev->badblocks.sector, 2827 rdev->badblocks.size << 9, 2828 rdev->bb_page); 2829 rdev->badblocks.size = 0; 2830 } 2831 2832 } else 2833 pr_debug("md: %pg (skipping faulty)\n", 2834 rdev->bdev); 2835 2836 if (mddev->level == LEVEL_MULTIPATH) 2837 /* only need to write one superblock... 
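 * (presumably because each md-multipath member is just another path to
 * the same underlying device, so a single copy of the metadata covers
 * all of them)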
*/ 2838 break; 2839 } 2840 if (md_super_wait(mddev) < 0) 2841 goto rewrite; 2842 /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ 2843 2844 if (mddev_is_clustered(mddev) && ret == 0) 2845 md_cluster_ops->metadata_update_finish(mddev); 2846 2847 if (mddev->in_sync != sync_req || 2848 !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2849 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN))) 2850 /* have to write it out again */ 2851 goto repeat; 2852 wake_up(&mddev->sb_wait); 2853 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2854 sysfs_notify_dirent_safe(mddev->sysfs_completed); 2855 2856 rdev_for_each(rdev, mddev) { 2857 if (test_and_clear_bit(FaultRecorded, &rdev->flags)) 2858 clear_bit(Blocked, &rdev->flags); 2859 2860 if (any_badblocks_changed) 2861 ack_all_badblocks(&rdev->badblocks); 2862 clear_bit(BlockedBadBlocks, &rdev->flags); 2863 wake_up(&rdev->blocked_wait); 2864 } 2865 } 2866 EXPORT_SYMBOL(md_update_sb); 2867 2868 static int add_bound_rdev(struct md_rdev *rdev) 2869 { 2870 struct mddev *mddev = rdev->mddev; 2871 int err = 0; 2872 bool add_journal = test_bit(Journal, &rdev->flags); 2873 2874 if (!mddev->pers->hot_remove_disk || add_journal) { 2875 /* If there is hot_add_disk but no hot_remove_disk 2876 * then added disks for geometry changes, 2877 * and should be added immediately. 2878 */ 2879 super_types[mddev->major_version]. 2880 validate_super(mddev, rdev); 2881 err = mddev->pers->hot_add_disk(mddev, rdev); 2882 if (err) { 2883 md_kick_rdev_from_array(rdev); 2884 return err; 2885 } 2886 } 2887 sysfs_notify_dirent_safe(rdev->sysfs_state); 2888 2889 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2890 if (mddev->degraded) 2891 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 2892 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2893 md_new_event(); 2894 md_wakeup_thread(mddev->thread); 2895 return 0; 2896 } 2897 2898 /* words written to sysfs files may, or may not, be \n terminated. 2899 * We want to accept with case. For this we use cmd_match. 2900 */ 2901 static int cmd_match(const char *cmd, const char *str) 2902 { 2903 /* See if cmd, written into a sysfs file, matches 2904 * str. 
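 * For example, cmd_match("faulty\n", "faulty") and
 * cmd_match("faulty", "faulty") both return 1, while
 * cmd_match("faulty_x", "faulty") and cmd_match("fault", "faulty")
 * return 0 (illustrative strings only).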
They must either be the same, or cmd can 2905 * have a trailing newline 2906 */ 2907 while (*cmd && *str && *cmd == *str) { 2908 cmd++; 2909 str++; 2910 } 2911 if (*cmd == '\n') 2912 cmd++; 2913 if (*str || *cmd) 2914 return 0; 2915 return 1; 2916 } 2917 2918 struct rdev_sysfs_entry { 2919 struct attribute attr; 2920 ssize_t (*show)(struct md_rdev *, char *); 2921 ssize_t (*store)(struct md_rdev *, const char *, size_t); 2922 }; 2923 2924 static ssize_t 2925 state_show(struct md_rdev *rdev, char *page) 2926 { 2927 char *sep = ","; 2928 size_t len = 0; 2929 unsigned long flags = READ_ONCE(rdev->flags); 2930 2931 if (test_bit(Faulty, &flags) || 2932 (!test_bit(ExternalBbl, &flags) && 2933 rdev->badblocks.unacked_exist)) 2934 len += sprintf(page+len, "faulty%s", sep); 2935 if (test_bit(In_sync, &flags)) 2936 len += sprintf(page+len, "in_sync%s", sep); 2937 if (test_bit(Journal, &flags)) 2938 len += sprintf(page+len, "journal%s", sep); 2939 if (test_bit(WriteMostly, &flags)) 2940 len += sprintf(page+len, "write_mostly%s", sep); 2941 if (test_bit(Blocked, &flags) || 2942 (rdev->badblocks.unacked_exist 2943 && !test_bit(Faulty, &flags))) 2944 len += sprintf(page+len, "blocked%s", sep); 2945 if (!test_bit(Faulty, &flags) && 2946 !test_bit(Journal, &flags) && 2947 !test_bit(In_sync, &flags)) 2948 len += sprintf(page+len, "spare%s", sep); 2949 if (test_bit(WriteErrorSeen, &flags)) 2950 len += sprintf(page+len, "write_error%s", sep); 2951 if (test_bit(WantReplacement, &flags)) 2952 len += sprintf(page+len, "want_replacement%s", sep); 2953 if (test_bit(Replacement, &flags)) 2954 len += sprintf(page+len, "replacement%s", sep); 2955 if (test_bit(ExternalBbl, &flags)) 2956 len += sprintf(page+len, "external_bbl%s", sep); 2957 if (test_bit(FailFast, &flags)) 2958 len += sprintf(page+len, "failfast%s", sep); 2959 2960 if (len) 2961 len -= strlen(sep); 2962 2963 return len+sprintf(page+len, "\n"); 2964 } 2965 2966 static ssize_t 2967 state_store(struct md_rdev *rdev, const char *buf, size_t len) 2968 { 2969 /* can write 2970 * faulty - simulates an error 2971 * remove - disconnects the device 2972 * writemostly - sets write_mostly 2973 * -writemostly - clears write_mostly 2974 * blocked - sets the Blocked flags 2975 * -blocked - clears the Blocked and possibly simulates an error 2976 * insync - sets Insync providing device isn't active 2977 * -insync - clear Insync for a device with a slot assigned, 2978 * so that it gets rebuilt based on bitmap 2979 * write_error - sets WriteErrorSeen 2980 * -write_error - clears WriteErrorSeen 2981 * {,-}failfast - set/clear FailFast 2982 */ 2983 2984 struct mddev *mddev = rdev->mddev; 2985 int err = -EINVAL; 2986 bool need_update_sb = false; 2987 2988 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 2989 md_error(rdev->mddev, rdev); 2990 2991 if (test_bit(MD_BROKEN, &rdev->mddev->flags)) 2992 err = -EBUSY; 2993 else 2994 err = 0; 2995 } else if (cmd_match(buf, "remove")) { 2996 if (rdev->mddev->pers) { 2997 clear_bit(Blocked, &rdev->flags); 2998 remove_and_add_spares(rdev->mddev, rdev); 2999 } 3000 if (rdev->raid_disk >= 0) 3001 err = -EBUSY; 3002 else { 3003 err = 0; 3004 if (mddev_is_clustered(mddev)) 3005 err = md_cluster_ops->remove_disk(mddev, rdev); 3006 3007 if (err == 0) { 3008 md_kick_rdev_from_array(rdev); 3009 if (mddev->pers) { 3010 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 3011 md_wakeup_thread(mddev->thread); 3012 } 3013 md_new_event(); 3014 } 3015 } 3016 } else if (cmd_match(buf, "writemostly")) { 3017 set_bit(WriteMostly, &rdev->flags); 3018 
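/*
 * Illustrative use of this branch from userspace; the sysfs paths
 * follow the conventional /sys/block/mdX/md/dev-YYY layout and the
 * device names are hypothetical:
 *
 *   echo writemostly > /sys/block/md0/md/dev-sdb1/state
 *   cat /sys/block/md0/md/dev-sdb1/state
 *       in_sync,write_mostly
 */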
mddev_create_serial_pool(rdev->mddev, rdev); 3019 need_update_sb = true; 3020 err = 0; 3021 } else if (cmd_match(buf, "-writemostly")) { 3022 mddev_destroy_serial_pool(rdev->mddev, rdev); 3023 clear_bit(WriteMostly, &rdev->flags); 3024 need_update_sb = true; 3025 err = 0; 3026 } else if (cmd_match(buf, "blocked")) { 3027 set_bit(Blocked, &rdev->flags); 3028 err = 0; 3029 } else if (cmd_match(buf, "-blocked")) { 3030 if (!test_bit(Faulty, &rdev->flags) && 3031 !test_bit(ExternalBbl, &rdev->flags) && 3032 rdev->badblocks.unacked_exist) { 3033 /* metadata handler doesn't understand badblocks, 3034 * so we need to fail the device 3035 */ 3036 md_error(rdev->mddev, rdev); 3037 } 3038 clear_bit(Blocked, &rdev->flags); 3039 clear_bit(BlockedBadBlocks, &rdev->flags); 3040 wake_up(&rdev->blocked_wait); 3041 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3042 md_wakeup_thread(rdev->mddev->thread); 3043 3044 err = 0; 3045 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 3046 set_bit(In_sync, &rdev->flags); 3047 err = 0; 3048 } else if (cmd_match(buf, "failfast")) { 3049 set_bit(FailFast, &rdev->flags); 3050 need_update_sb = true; 3051 err = 0; 3052 } else if (cmd_match(buf, "-failfast")) { 3053 clear_bit(FailFast, &rdev->flags); 3054 need_update_sb = true; 3055 err = 0; 3056 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && 3057 !test_bit(Journal, &rdev->flags)) { 3058 if (rdev->mddev->pers == NULL) { 3059 clear_bit(In_sync, &rdev->flags); 3060 rdev->saved_raid_disk = rdev->raid_disk; 3061 rdev->raid_disk = -1; 3062 err = 0; 3063 } 3064 } else if (cmd_match(buf, "write_error")) { 3065 set_bit(WriteErrorSeen, &rdev->flags); 3066 err = 0; 3067 } else if (cmd_match(buf, "-write_error")) { 3068 clear_bit(WriteErrorSeen, &rdev->flags); 3069 err = 0; 3070 } else if (cmd_match(buf, "want_replacement")) { 3071 /* Any non-spare device that is not a replacement can 3072 * become want_replacement at any time, but we then need to 3073 * check if recovery is needed. 3074 */ 3075 if (rdev->raid_disk >= 0 && 3076 !test_bit(Journal, &rdev->flags) && 3077 !test_bit(Replacement, &rdev->flags)) 3078 set_bit(WantReplacement, &rdev->flags); 3079 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3080 md_wakeup_thread(rdev->mddev->thread); 3081 err = 0; 3082 } else if (cmd_match(buf, "-want_replacement")) { 3083 /* Clearing 'want_replacement' is always allowed. 3084 * Once replacements starts it is too late though. 3085 */ 3086 err = 0; 3087 clear_bit(WantReplacement, &rdev->flags); 3088 } else if (cmd_match(buf, "replacement")) { 3089 /* Can only set a device as a replacement when array has not 3090 * yet been started. Once running, replacement is automatic 3091 * from spares, or by assigning 'slot'. 3092 */ 3093 if (rdev->mddev->pers) 3094 err = -EBUSY; 3095 else { 3096 set_bit(Replacement, &rdev->flags); 3097 err = 0; 3098 } 3099 } else if (cmd_match(buf, "-replacement")) { 3100 /* Similarly, can only clear Replacement before start */ 3101 if (rdev->mddev->pers) 3102 err = -EBUSY; 3103 else { 3104 clear_bit(Replacement, &rdev->flags); 3105 err = 0; 3106 } 3107 } else if (cmd_match(buf, "re-add")) { 3108 if (!rdev->mddev->pers) 3109 err = -EINVAL; 3110 else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) && 3111 rdev->saved_raid_disk >= 0) { 3112 /* clear_bit is performed _after_ all the devices 3113 * have their local Faulty bit cleared. 
If any writes 3114 * happen in the meantime in the local node, they 3115 * will land in the local bitmap, which will be synced 3116 * by this node eventually 3117 */ 3118 if (!mddev_is_clustered(rdev->mddev) || 3119 (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { 3120 clear_bit(Faulty, &rdev->flags); 3121 err = add_bound_rdev(rdev); 3122 } 3123 } else 3124 err = -EBUSY; 3125 } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) { 3126 set_bit(ExternalBbl, &rdev->flags); 3127 rdev->badblocks.shift = 0; 3128 err = 0; 3129 } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) { 3130 clear_bit(ExternalBbl, &rdev->flags); 3131 err = 0; 3132 } 3133 if (need_update_sb) 3134 md_update_sb(mddev, 1); 3135 if (!err) 3136 sysfs_notify_dirent_safe(rdev->sysfs_state); 3137 return err ? err : len; 3138 } 3139 static struct rdev_sysfs_entry rdev_state = 3140 __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store); 3141 3142 static ssize_t 3143 errors_show(struct md_rdev *rdev, char *page) 3144 { 3145 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 3146 } 3147 3148 static ssize_t 3149 errors_store(struct md_rdev *rdev, const char *buf, size_t len) 3150 { 3151 unsigned int n; 3152 int rv; 3153 3154 rv = kstrtouint(buf, 10, &n); 3155 if (rv < 0) 3156 return rv; 3157 atomic_set(&rdev->corrected_errors, n); 3158 return len; 3159 } 3160 static struct rdev_sysfs_entry rdev_errors = 3161 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 3162 3163 static ssize_t 3164 slot_show(struct md_rdev *rdev, char *page) 3165 { 3166 if (test_bit(Journal, &rdev->flags)) 3167 return sprintf(page, "journal\n"); 3168 else if (rdev->raid_disk < 0) 3169 return sprintf(page, "none\n"); 3170 else 3171 return sprintf(page, "%d\n", rdev->raid_disk); 3172 } 3173 3174 static ssize_t 3175 slot_store(struct md_rdev *rdev, const char *buf, size_t len) 3176 { 3177 int slot; 3178 int err; 3179 3180 if (test_bit(Journal, &rdev->flags)) 3181 return -EBUSY; 3182 if (strncmp(buf, "none", 4)==0) 3183 slot = -1; 3184 else { 3185 err = kstrtouint(buf, 10, (unsigned int *)&slot); 3186 if (err < 0) 3187 return err; 3188 if (slot < 0) 3189 /* overflow */ 3190 return -ENOSPC; 3191 } 3192 if (rdev->mddev->pers && slot == -1) { 3193 /* Setting 'slot' on an active array requires also 3194 * updating the 'rd%d' link, and communicating 3195 * with the personality with ->hot_*_disk. 3196 * For now we only support removing 3197 * failed/spare devices. This normally happens automatically, 3198 * but not when the metadata is externally managed. 3199 */ 3200 if (rdev->raid_disk == -1) 3201 return -EEXIST; 3202 /* personality does all needed checks */ 3203 if (rdev->mddev->pers->hot_remove_disk == NULL) 3204 return -EINVAL; 3205 clear_bit(Blocked, &rdev->flags); 3206 remove_and_add_spares(rdev->mddev, rdev); 3207 if (rdev->raid_disk >= 0) 3208 return -EBUSY; 3209 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3210 md_wakeup_thread(rdev->mddev->thread); 3211 } else if (rdev->mddev->pers) { 3212 /* Activating a spare .. or possibly reactivating 3213 * if we ever get bitmaps working here. 
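 *
 * Illustrative usage (hypothetical device names, conventional md
 * sysfs layout):
 *
 *   echo 2 > /sys/block/md0/md/dev-sdc1/slot    # activate this spare in slot 2
 *   echo none > /sys/block/md0/md/dev-sdc1/slot # detach a failed/spare device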
3214 */ 3215 int err; 3216 3217 if (rdev->raid_disk != -1) 3218 return -EBUSY; 3219 3220 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) 3221 return -EBUSY; 3222 3223 if (rdev->mddev->pers->hot_add_disk == NULL) 3224 return -EINVAL; 3225 3226 if (slot >= rdev->mddev->raid_disks && 3227 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 3228 return -ENOSPC; 3229 3230 rdev->raid_disk = slot; 3231 if (test_bit(In_sync, &rdev->flags)) 3232 rdev->saved_raid_disk = slot; 3233 else 3234 rdev->saved_raid_disk = -1; 3235 clear_bit(In_sync, &rdev->flags); 3236 clear_bit(Bitmap_sync, &rdev->flags); 3237 err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev); 3238 if (err) { 3239 rdev->raid_disk = -1; 3240 return err; 3241 } else 3242 sysfs_notify_dirent_safe(rdev->sysfs_state); 3243 /* failure here is OK */; 3244 sysfs_link_rdev(rdev->mddev, rdev); 3245 /* don't wakeup anyone, leave that to userspace. */ 3246 } else { 3247 if (slot >= rdev->mddev->raid_disks && 3248 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 3249 return -ENOSPC; 3250 rdev->raid_disk = slot; 3251 /* assume it is working */ 3252 clear_bit(Faulty, &rdev->flags); 3253 clear_bit(WriteMostly, &rdev->flags); 3254 set_bit(In_sync, &rdev->flags); 3255 sysfs_notify_dirent_safe(rdev->sysfs_state); 3256 } 3257 return len; 3258 } 3259 3260 static struct rdev_sysfs_entry rdev_slot = 3261 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 3262 3263 static ssize_t 3264 offset_show(struct md_rdev *rdev, char *page) 3265 { 3266 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 3267 } 3268 3269 static ssize_t 3270 offset_store(struct md_rdev *rdev, const char *buf, size_t len) 3271 { 3272 unsigned long long offset; 3273 if (kstrtoull(buf, 10, &offset) < 0) 3274 return -EINVAL; 3275 if (rdev->mddev->pers && rdev->raid_disk >= 0) 3276 return -EBUSY; 3277 if (rdev->sectors && rdev->mddev->external) 3278 /* Must set offset before size, so overlap checks 3279 * can be sane */ 3280 return -EBUSY; 3281 rdev->data_offset = offset; 3282 rdev->new_data_offset = offset; 3283 return len; 3284 } 3285 3286 static struct rdev_sysfs_entry rdev_offset = 3287 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 3288 3289 static ssize_t new_offset_show(struct md_rdev *rdev, char *page) 3290 { 3291 return sprintf(page, "%llu\n", 3292 (unsigned long long)rdev->new_data_offset); 3293 } 3294 3295 static ssize_t new_offset_store(struct md_rdev *rdev, 3296 const char *buf, size_t len) 3297 { 3298 unsigned long long new_offset; 3299 struct mddev *mddev = rdev->mddev; 3300 3301 if (kstrtoull(buf, 10, &new_offset) < 0) 3302 return -EINVAL; 3303 3304 if (mddev->sync_thread || 3305 test_bit(MD_RECOVERY_RUNNING,&mddev->recovery)) 3306 return -EBUSY; 3307 if (new_offset == rdev->data_offset) 3308 /* reset is always permitted */ 3309 ; 3310 else if (new_offset > rdev->data_offset) { 3311 /* must not push array size beyond rdev_sectors */ 3312 if (new_offset - rdev->data_offset 3313 + mddev->dev_sectors > rdev->sectors) 3314 return -E2BIG; 3315 } 3316 /* Metadata worries about other space details. */ 3317 3318 /* decreasing the offset is inconsistent with a backwards 3319 * reshape. 3320 */ 3321 if (new_offset < rdev->data_offset && 3322 mddev->reshape_backwards) 3323 return -EINVAL; 3324 /* Increasing offset is inconsistent with forwards 3325 * reshape. reshape_direction should be set to 3326 * 'backwards' first. 
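 *
 * Illustration of these rules (made-up numbers): with data_offset at
 * 2048 sectors, writing 4096 is only accepted while reshape_backwards
 * is already set, writing 1024 only while it is clear, and writing
 * 2048 itself is always allowed as a reset.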
3327 */ 3328 if (new_offset > rdev->data_offset && 3329 !mddev->reshape_backwards) 3330 return -EINVAL; 3331 3332 if (mddev->pers && mddev->persistent && 3333 !super_types[mddev->major_version] 3334 .allow_new_offset(rdev, new_offset)) 3335 return -E2BIG; 3336 rdev->new_data_offset = new_offset; 3337 if (new_offset > rdev->data_offset) 3338 mddev->reshape_backwards = 1; 3339 else if (new_offset < rdev->data_offset) 3340 mddev->reshape_backwards = 0; 3341 3342 return len; 3343 } 3344 static struct rdev_sysfs_entry rdev_new_offset = 3345 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); 3346 3347 static ssize_t 3348 rdev_size_show(struct md_rdev *rdev, char *page) 3349 { 3350 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); 3351 } 3352 3353 static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b) 3354 { 3355 /* check if two start/length pairs overlap */ 3356 if (a->data_offset + a->sectors <= b->data_offset) 3357 return false; 3358 if (b->data_offset + b->sectors <= a->data_offset) 3359 return false; 3360 return true; 3361 } 3362 3363 static bool md_rdev_overlaps(struct md_rdev *rdev) 3364 { 3365 struct mddev *mddev; 3366 struct md_rdev *rdev2; 3367 3368 spin_lock(&all_mddevs_lock); 3369 list_for_each_entry(mddev, &all_mddevs, all_mddevs) { 3370 if (test_bit(MD_DELETED, &mddev->flags)) 3371 continue; 3372 rdev_for_each(rdev2, mddev) { 3373 if (rdev != rdev2 && rdev->bdev == rdev2->bdev && 3374 md_rdevs_overlap(rdev, rdev2)) { 3375 spin_unlock(&all_mddevs_lock); 3376 return true; 3377 } 3378 } 3379 } 3380 spin_unlock(&all_mddevs_lock); 3381 return false; 3382 } 3383 3384 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) 3385 { 3386 unsigned long long blocks; 3387 sector_t new; 3388 3389 if (kstrtoull(buf, 10, &blocks) < 0) 3390 return -EINVAL; 3391 3392 if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) 3393 return -EINVAL; /* sector conversion overflow */ 3394 3395 new = blocks * 2; 3396 if (new != blocks * 2) 3397 return -EINVAL; /* unsigned long long to sector_t overflow */ 3398 3399 *sectors = new; 3400 return 0; 3401 } 3402 3403 static ssize_t 3404 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) 3405 { 3406 struct mddev *my_mddev = rdev->mddev; 3407 sector_t oldsectors = rdev->sectors; 3408 sector_t sectors; 3409 3410 if (test_bit(Journal, &rdev->flags)) 3411 return -EBUSY; 3412 if (strict_blocks_to_sectors(buf, §ors) < 0) 3413 return -EINVAL; 3414 if (rdev->data_offset != rdev->new_data_offset) 3415 return -EINVAL; /* too confusing */ 3416 if (my_mddev->pers && rdev->raid_disk >= 0) { 3417 if (my_mddev->persistent) { 3418 sectors = super_types[my_mddev->major_version]. 3419 rdev_size_change(rdev, sectors); 3420 if (!sectors) 3421 return -EBUSY; 3422 } else if (!sectors) 3423 sectors = bdev_nr_sectors(rdev->bdev) - 3424 rdev->data_offset; 3425 if (!my_mddev->pers->resize) 3426 /* Cannot change size for RAID0 or Linear etc */ 3427 return -EINVAL; 3428 } 3429 if (sectors < my_mddev->dev_sectors) 3430 return -EINVAL; /* component must fit device */ 3431 3432 rdev->sectors = sectors; 3433 3434 /* 3435 * Check that all other rdevs with the same bdev do not overlap. This 3436 * check does not provide a hard guarantee, it just helps avoid 3437 * dangerous mistakes. 3438 */ 3439 if (sectors > oldsectors && my_mddev->external && 3440 md_rdev_overlaps(rdev)) { 3441 /* 3442 * Someone else could have slipped in a size change here, but 3443 * doing so is just silly. 
We put oldsectors back because we 3444 * know it is safe, and trust userspace not to race with itself. 3445 */ 3446 rdev->sectors = oldsectors; 3447 return -EBUSY; 3448 } 3449 return len; 3450 } 3451 3452 static struct rdev_sysfs_entry rdev_size = 3453 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 3454 3455 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) 3456 { 3457 unsigned long long recovery_start = rdev->recovery_offset; 3458 3459 if (test_bit(In_sync, &rdev->flags) || 3460 recovery_start == MaxSector) 3461 return sprintf(page, "none\n"); 3462 3463 return sprintf(page, "%llu\n", recovery_start); 3464 } 3465 3466 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) 3467 { 3468 unsigned long long recovery_start; 3469 3470 if (cmd_match(buf, "none")) 3471 recovery_start = MaxSector; 3472 else if (kstrtoull(buf, 10, &recovery_start)) 3473 return -EINVAL; 3474 3475 if (rdev->mddev->pers && 3476 rdev->raid_disk >= 0) 3477 return -EBUSY; 3478 3479 rdev->recovery_offset = recovery_start; 3480 if (recovery_start == MaxSector) 3481 set_bit(In_sync, &rdev->flags); 3482 else 3483 clear_bit(In_sync, &rdev->flags); 3484 return len; 3485 } 3486 3487 static struct rdev_sysfs_entry rdev_recovery_start = 3488 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); 3489 3490 /* sysfs access to the bad_blocks list. 3491 * We present two files. 3492 * 'bad_blocks' lists sector numbers and lengths of ranges that 3493 * are recorded as bad. The list is truncated to fit within 3494 * the one-page limit of sysfs. 3495 * Writing "sector length" to this file adds an acknowledged 3496 * bad block to the list. 3497 * 'unacknowledged_bad_blocks' lists bad blocks that have not yet 3498 * been acknowledged. Writing to this file adds bad blocks 3499 * without acknowledging them. This is largely for testing.
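 *
 * Illustrative usage (conventional md sysfs layout, hypothetical
 * device name):
 *
 *   echo "4096 8" > /sys/block/md0/md/dev-sdb1/bad_blocks
 *   cat /sys/block/md0/md/dev-sdb1/bad_blocks
 *       4096 8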
3500 */ 3501 static ssize_t bb_show(struct md_rdev *rdev, char *page) 3502 { 3503 return badblocks_show(&rdev->badblocks, page, 0); 3504 } 3505 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) 3506 { 3507 int rv = badblocks_store(&rdev->badblocks, page, len, 0); 3508 /* Maybe that ack was all we needed */ 3509 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) 3510 wake_up(&rdev->blocked_wait); 3511 return rv; 3512 } 3513 static struct rdev_sysfs_entry rdev_bad_blocks = 3514 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); 3515 3516 static ssize_t ubb_show(struct md_rdev *rdev, char *page) 3517 { 3518 return badblocks_show(&rdev->badblocks, page, 1); 3519 } 3520 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) 3521 { 3522 return badblocks_store(&rdev->badblocks, page, len, 1); 3523 } 3524 static struct rdev_sysfs_entry rdev_unack_bad_blocks = 3525 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); 3526 3527 static ssize_t 3528 ppl_sector_show(struct md_rdev *rdev, char *page) 3529 { 3530 return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector); 3531 } 3532 3533 static ssize_t 3534 ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len) 3535 { 3536 unsigned long long sector; 3537 3538 if (kstrtoull(buf, 10, §or) < 0) 3539 return -EINVAL; 3540 if (sector != (sector_t)sector) 3541 return -EINVAL; 3542 3543 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && 3544 rdev->raid_disk >= 0) 3545 return -EBUSY; 3546 3547 if (rdev->mddev->persistent) { 3548 if (rdev->mddev->major_version == 0) 3549 return -EINVAL; 3550 if ((sector > rdev->sb_start && 3551 sector - rdev->sb_start > S16_MAX) || 3552 (sector < rdev->sb_start && 3553 rdev->sb_start - sector > -S16_MIN)) 3554 return -EINVAL; 3555 rdev->ppl.offset = sector - rdev->sb_start; 3556 } else if (!rdev->mddev->external) { 3557 return -EBUSY; 3558 } 3559 rdev->ppl.sector = sector; 3560 return len; 3561 } 3562 3563 static struct rdev_sysfs_entry rdev_ppl_sector = 3564 __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store); 3565 3566 static ssize_t 3567 ppl_size_show(struct md_rdev *rdev, char *page) 3568 { 3569 return sprintf(page, "%u\n", rdev->ppl.size); 3570 } 3571 3572 static ssize_t 3573 ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len) 3574 { 3575 unsigned int size; 3576 3577 if (kstrtouint(buf, 10, &size) < 0) 3578 return -EINVAL; 3579 3580 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) && 3581 rdev->raid_disk >= 0) 3582 return -EBUSY; 3583 3584 if (rdev->mddev->persistent) { 3585 if (rdev->mddev->major_version == 0) 3586 return -EINVAL; 3587 if (size > U16_MAX) 3588 return -EINVAL; 3589 } else if (!rdev->mddev->external) { 3590 return -EBUSY; 3591 } 3592 rdev->ppl.size = size; 3593 return len; 3594 } 3595 3596 static struct rdev_sysfs_entry rdev_ppl_size = 3597 __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store); 3598 3599 static struct attribute *rdev_default_attrs[] = { 3600 &rdev_state.attr, 3601 &rdev_errors.attr, 3602 &rdev_slot.attr, 3603 &rdev_offset.attr, 3604 &rdev_new_offset.attr, 3605 &rdev_size.attr, 3606 &rdev_recovery_start.attr, 3607 &rdev_bad_blocks.attr, 3608 &rdev_unack_bad_blocks.attr, 3609 &rdev_ppl_sector.attr, 3610 &rdev_ppl_size.attr, 3611 NULL, 3612 }; 3613 ATTRIBUTE_GROUPS(rdev_default); 3614 static ssize_t 3615 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3616 { 3617 struct rdev_sysfs_entry 
*entry = container_of(attr, struct rdev_sysfs_entry, attr); 3618 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3619 3620 if (!entry->show) 3621 return -EIO; 3622 if (!rdev->mddev) 3623 return -ENODEV; 3624 return entry->show(rdev, page); 3625 } 3626 3627 static ssize_t 3628 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 3629 const char *page, size_t length) 3630 { 3631 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3632 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3633 struct kernfs_node *kn = NULL; 3634 bool suspend = false; 3635 ssize_t rv; 3636 struct mddev *mddev = rdev->mddev; 3637 3638 if (!entry->store) 3639 return -EIO; 3640 if (!capable(CAP_SYS_ADMIN)) 3641 return -EACCES; 3642 if (!mddev) 3643 return -ENODEV; 3644 3645 if (entry->store == state_store) { 3646 if (cmd_match(page, "remove")) 3647 kn = sysfs_break_active_protection(kobj, attr); 3648 if (cmd_match(page, "remove") || cmd_match(page, "re-add") || 3649 cmd_match(page, "writemostly") || 3650 cmd_match(page, "-writemostly")) 3651 suspend = true; 3652 } 3653 3654 rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev); 3655 if (!rv) { 3656 if (rdev->mddev == NULL) 3657 rv = -ENODEV; 3658 else 3659 rv = entry->store(rdev, page, length); 3660 suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); 3661 } 3662 3663 if (kn) 3664 sysfs_unbreak_active_protection(kn); 3665 3666 return rv; 3667 } 3668 3669 static void rdev_free(struct kobject *ko) 3670 { 3671 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); 3672 kfree(rdev); 3673 } 3674 static const struct sysfs_ops rdev_sysfs_ops = { 3675 .show = rdev_attr_show, 3676 .store = rdev_attr_store, 3677 }; 3678 static const struct kobj_type rdev_ktype = { 3679 .release = rdev_free, 3680 .sysfs_ops = &rdev_sysfs_ops, 3681 .default_groups = rdev_default_groups, 3682 }; 3683 3684 int md_rdev_init(struct md_rdev *rdev) 3685 { 3686 rdev->desc_nr = -1; 3687 rdev->saved_raid_disk = -1; 3688 rdev->raid_disk = -1; 3689 rdev->flags = 0; 3690 rdev->data_offset = 0; 3691 rdev->new_data_offset = 0; 3692 rdev->sb_events = 0; 3693 rdev->last_read_error = 0; 3694 rdev->sb_loaded = 0; 3695 rdev->bb_page = NULL; 3696 atomic_set(&rdev->nr_pending, 0); 3697 atomic_set(&rdev->read_errors, 0); 3698 atomic_set(&rdev->corrected_errors, 0); 3699 3700 INIT_LIST_HEAD(&rdev->same_set); 3701 init_waitqueue_head(&rdev->blocked_wait); 3702 3703 /* Add space to store bad block list. 3704 * This reserves the space even on arrays where it cannot 3705 * be used - I wonder if that matters 3706 */ 3707 return badblocks_init(&rdev->badblocks, 0); 3708 } 3709 EXPORT_SYMBOL_GPL(md_rdev_init); 3710 3711 /* 3712 * Import a device. If 'super_format' >= 0, then sanity check the superblock 3713 * 3714 * mark the device faulty if: 3715 * 3716 * - the device is nonexistent (zero size) 3717 * - the device has no valid superblock 3718 * 3719 * a faulty rdev _never_ has rdev->sb set. 3720 */ 3721 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) 3722 { 3723 struct md_rdev *rdev; 3724 sector_t size; 3725 int err; 3726 3727 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 3728 if (!rdev) 3729 return ERR_PTR(-ENOMEM); 3730 3731 err = md_rdev_init(rdev); 3732 if (err) 3733 goto out_free_rdev; 3734 err = alloc_disk_sb(rdev); 3735 if (err) 3736 goto out_clear_rdev; 3737 3738 rdev->bdev_handle = bdev_open_by_dev(newdev, 3739 BLK_OPEN_READ | BLK_OPEN_WRITE, 3740 super_format == -2 ? 
&claim_rdev : rdev, NULL); 3741 if (IS_ERR(rdev->bdev_handle)) { 3742 pr_warn("md: could not open device unknown-block(%u,%u).\n", 3743 MAJOR(newdev), MINOR(newdev)); 3744 err = PTR_ERR(rdev->bdev_handle); 3745 goto out_clear_rdev; 3746 } 3747 rdev->bdev = rdev->bdev_handle->bdev; 3748 3749 kobject_init(&rdev->kobj, &rdev_ktype); 3750 3751 size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS; 3752 if (!size) { 3753 pr_warn("md: %pg has zero or unknown size, marking faulty!\n", 3754 rdev->bdev); 3755 err = -EINVAL; 3756 goto out_blkdev_put; 3757 } 3758 3759 if (super_format >= 0) { 3760 err = super_types[super_format]. 3761 load_super(rdev, NULL, super_minor); 3762 if (err == -EINVAL) { 3763 pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n", 3764 rdev->bdev, 3765 super_format, super_minor); 3766 goto out_blkdev_put; 3767 } 3768 if (err < 0) { 3769 pr_warn("md: could not read %pg's sb, not importing!\n", 3770 rdev->bdev); 3771 goto out_blkdev_put; 3772 } 3773 } 3774 3775 return rdev; 3776 3777 out_blkdev_put: 3778 bdev_release(rdev->bdev_handle); 3779 out_clear_rdev: 3780 md_rdev_clear(rdev); 3781 out_free_rdev: 3782 kfree(rdev); 3783 return ERR_PTR(err); 3784 } 3785 3786 /* 3787 * Check a full RAID array for plausibility 3788 */ 3789 3790 static int analyze_sbs(struct mddev *mddev) 3791 { 3792 int i; 3793 struct md_rdev *rdev, *freshest, *tmp; 3794 3795 freshest = NULL; 3796 rdev_for_each_safe(rdev, tmp, mddev) 3797 switch (super_types[mddev->major_version]. 3798 load_super(rdev, freshest, mddev->minor_version)) { 3799 case 1: 3800 freshest = rdev; 3801 break; 3802 case 0: 3803 break; 3804 default: 3805 pr_warn("md: fatal superblock inconsistency in %pg -- removing from array\n", 3806 rdev->bdev); 3807 md_kick_rdev_from_array(rdev); 3808 } 3809 3810 /* Cannot find a valid fresh disk */ 3811 if (!freshest) { 3812 pr_warn("md: cannot find a valid disk\n"); 3813 return -EINVAL; 3814 } 3815 3816 super_types[mddev->major_version]. 3817 validate_super(mddev, freshest); 3818 3819 i = 0; 3820 rdev_for_each_safe(rdev, tmp, mddev) { 3821 if (mddev->max_disks && 3822 (rdev->desc_nr >= mddev->max_disks || 3823 i > mddev->max_disks)) { 3824 pr_warn("md: %s: %pg: only %d devices permitted\n", 3825 mdname(mddev), rdev->bdev, 3826 mddev->max_disks); 3827 md_kick_rdev_from_array(rdev); 3828 continue; 3829 } 3830 if (rdev != freshest) { 3831 if (super_types[mddev->major_version]. 3832 validate_super(mddev, rdev)) { 3833 pr_warn("md: kicking non-fresh %pg from array!\n", 3834 rdev->bdev); 3835 md_kick_rdev_from_array(rdev); 3836 continue; 3837 } 3838 } 3839 if (mddev->level == LEVEL_MULTIPATH) { 3840 rdev->desc_nr = i++; 3841 rdev->raid_disk = rdev->desc_nr; 3842 set_bit(In_sync, &rdev->flags); 3843 } else if (rdev->raid_disk >= 3844 (mddev->raid_disks - min(0, mddev->delta_disks)) && 3845 !test_bit(Journal, &rdev->flags)) { 3846 rdev->raid_disk = -1; 3847 clear_bit(In_sync, &rdev->flags); 3848 } 3849 } 3850 3851 return 0; 3852 } 3853 3854 /* Read a fixed-point number. 3855 * Numbers in sysfs attributes should be in "standard" units where 3856 * possible, so time should be in seconds. 3857 * However we internally use a a much smaller unit such as 3858 * milliseconds or jiffies. 3859 * This function takes a decimal number with a possible fractional 3860 * component, and produces an integer which is the result of 3861 * multiplying that number by 10^'scale'. 3862 * all without any floating-point arithmetic. 
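 *
 * Worked examples with scale == 3 (the value safe_delay_store below
 * passes in):
 *   "0.2"    -> 200
 *   "1.5\n"  -> 1500
 *   "12"     -> 12000
 *   "1.2345" -> 1234   (digits beyond the requested scale are ignored)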
3863 */ 3864 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) 3865 { 3866 unsigned long result = 0; 3867 long decimals = -1; 3868 while (isdigit(*cp) || (*cp == '.' && decimals < 0)) { 3869 if (*cp == '.') 3870 decimals = 0; 3871 else if (decimals < scale) { 3872 unsigned int value; 3873 value = *cp - '0'; 3874 result = result * 10 + value; 3875 if (decimals >= 0) 3876 decimals++; 3877 } 3878 cp++; 3879 } 3880 if (*cp == '\n') 3881 cp++; 3882 if (*cp) 3883 return -EINVAL; 3884 if (decimals < 0) 3885 decimals = 0; 3886 *res = result * int_pow(10, scale - decimals); 3887 return 0; 3888 } 3889 3890 static ssize_t 3891 safe_delay_show(struct mddev *mddev, char *page) 3892 { 3893 unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ; 3894 3895 return sprintf(page, "%u.%03u\n", msec/1000, msec%1000); 3896 } 3897 static ssize_t 3898 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) 3899 { 3900 unsigned long msec; 3901 3902 if (mddev_is_clustered(mddev)) { 3903 pr_warn("md: Safemode is disabled for clustered mode\n"); 3904 return -EINVAL; 3905 } 3906 3907 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ) 3908 return -EINVAL; 3909 if (msec == 0) 3910 mddev->safemode_delay = 0; 3911 else { 3912 unsigned long old_delay = mddev->safemode_delay; 3913 unsigned long new_delay = (msec*HZ)/1000; 3914 3915 if (new_delay == 0) 3916 new_delay = 1; 3917 mddev->safemode_delay = new_delay; 3918 if (new_delay < old_delay || old_delay == 0) 3919 mod_timer(&mddev->safemode_timer, jiffies+1); 3920 } 3921 return len; 3922 } 3923 static struct md_sysfs_entry md_safe_delay = 3924 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 3925 3926 static ssize_t 3927 level_show(struct mddev *mddev, char *page) 3928 { 3929 struct md_personality *p; 3930 int ret; 3931 spin_lock(&mddev->lock); 3932 p = mddev->pers; 3933 if (p) 3934 ret = sprintf(page, "%s\n", p->name); 3935 else if (mddev->clevel[0]) 3936 ret = sprintf(page, "%s\n", mddev->clevel); 3937 else if (mddev->level != LEVEL_NONE) 3938 ret = sprintf(page, "%d\n", mddev->level); 3939 else 3940 ret = 0; 3941 spin_unlock(&mddev->lock); 3942 return ret; 3943 } 3944 3945 static ssize_t 3946 level_store(struct mddev *mddev, const char *buf, size_t len) 3947 { 3948 char clevel[16]; 3949 ssize_t rv; 3950 size_t slen = len; 3951 struct md_personality *pers, *oldpers; 3952 long level; 3953 void *priv, *oldpriv; 3954 struct md_rdev *rdev; 3955 3956 if (slen == 0 || slen >= sizeof(clevel)) 3957 return -EINVAL; 3958 3959 rv = mddev_suspend_and_lock(mddev); 3960 if (rv) 3961 return rv; 3962 3963 if (mddev->pers == NULL) { 3964 memcpy(mddev->clevel, buf, slen); 3965 if (mddev->clevel[slen-1] == '\n') 3966 slen--; 3967 mddev->clevel[slen] = 0; 3968 mddev->level = LEVEL_NONE; 3969 rv = len; 3970 goto out_unlock; 3971 } 3972 rv = -EROFS; 3973 if (!md_is_rdwr(mddev)) 3974 goto out_unlock; 3975 3976 /* request to change the personality. Need to ensure: 3977 * - array is not engaged in resync/recovery/reshape 3978 * - old personality can be suspended 3979 * - new personality will access other array. 
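 *
 * For example, assuming the new personality's ->takeover() accepts
 * the current geometry (raid5 does for a two-drive raid1),
 * "echo raid5 > md/level" converts the array in place; the checks
 * below reject the request with -EBUSY while a resync, recovery or
 * reshape is running.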
3980 */ 3981 3982 rv = -EBUSY; 3983 if (mddev->sync_thread || 3984 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3985 mddev->reshape_position != MaxSector || 3986 mddev->sysfs_active) 3987 goto out_unlock; 3988 3989 rv = -EINVAL; 3990 if (!mddev->pers->quiesce) { 3991 pr_warn("md: %s: %s does not support online personality change\n", 3992 mdname(mddev), mddev->pers->name); 3993 goto out_unlock; 3994 } 3995 3996 /* Now find the new personality */ 3997 memcpy(clevel, buf, slen); 3998 if (clevel[slen-1] == '\n') 3999 slen--; 4000 clevel[slen] = 0; 4001 if (kstrtol(clevel, 10, &level)) 4002 level = LEVEL_NONE; 4003 4004 if (request_module("md-%s", clevel) != 0) 4005 request_module("md-level-%s", clevel); 4006 spin_lock(&pers_lock); 4007 pers = find_pers(level, clevel); 4008 if (!pers || !try_module_get(pers->owner)) { 4009 spin_unlock(&pers_lock); 4010 pr_warn("md: personality %s not loaded\n", clevel); 4011 rv = -EINVAL; 4012 goto out_unlock; 4013 } 4014 spin_unlock(&pers_lock); 4015 4016 if (pers == mddev->pers) { 4017 /* Nothing to do! */ 4018 module_put(pers->owner); 4019 rv = len; 4020 goto out_unlock; 4021 } 4022 if (!pers->takeover) { 4023 module_put(pers->owner); 4024 pr_warn("md: %s: %s does not support personality takeover\n", 4025 mdname(mddev), clevel); 4026 rv = -EINVAL; 4027 goto out_unlock; 4028 } 4029 4030 rdev_for_each(rdev, mddev) 4031 rdev->new_raid_disk = rdev->raid_disk; 4032 4033 /* ->takeover must set new_* and/or delta_disks 4034 * if it succeeds, and may set them when it fails. 4035 */ 4036 priv = pers->takeover(mddev); 4037 if (IS_ERR(priv)) { 4038 mddev->new_level = mddev->level; 4039 mddev->new_layout = mddev->layout; 4040 mddev->new_chunk_sectors = mddev->chunk_sectors; 4041 mddev->raid_disks -= mddev->delta_disks; 4042 mddev->delta_disks = 0; 4043 mddev->reshape_backwards = 0; 4044 module_put(pers->owner); 4045 pr_warn("md: %s: %s would not accept array\n", 4046 mdname(mddev), clevel); 4047 rv = PTR_ERR(priv); 4048 goto out_unlock; 4049 } 4050 4051 /* Looks like we have a winner */ 4052 mddev_detach(mddev); 4053 4054 spin_lock(&mddev->lock); 4055 oldpers = mddev->pers; 4056 oldpriv = mddev->private; 4057 mddev->pers = pers; 4058 mddev->private = priv; 4059 strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 4060 mddev->level = mddev->new_level; 4061 mddev->layout = mddev->new_layout; 4062 mddev->chunk_sectors = mddev->new_chunk_sectors; 4063 mddev->delta_disks = 0; 4064 mddev->reshape_backwards = 0; 4065 mddev->degraded = 0; 4066 spin_unlock(&mddev->lock); 4067 4068 if (oldpers->sync_request == NULL && 4069 mddev->external) { 4070 /* We are converting from a no-redundancy array 4071 * to a redundancy array and metadata is managed 4072 * externally so we need to be sure that writes 4073 * won't block due to a need to transition 4074 * clean->dirty 4075 * until external management is started. 
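 *
 * (With external metadata that clean->dirty transition is normally
 * acknowledged by a userspace agent such as mdmon via the
 * 'write-pending' array_state; no such agent is managing the
 * array yet at this point.)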
4076 */ 4077 mddev->in_sync = 0; 4078 mddev->safemode_delay = 0; 4079 mddev->safemode = 0; 4080 } 4081 4082 oldpers->free(mddev, oldpriv); 4083 4084 if (oldpers->sync_request == NULL && 4085 pers->sync_request != NULL) { 4086 /* need to add the md_redundancy_group */ 4087 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 4088 pr_warn("md: cannot register extra attributes for %s\n", 4089 mdname(mddev)); 4090 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 4091 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); 4092 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); 4093 } 4094 if (oldpers->sync_request != NULL && 4095 pers->sync_request == NULL) { 4096 /* need to remove the md_redundancy_group */ 4097 if (mddev->to_remove == NULL) 4098 mddev->to_remove = &md_redundancy_group; 4099 } 4100 4101 module_put(oldpers->owner); 4102 4103 rdev_for_each(rdev, mddev) { 4104 if (rdev->raid_disk < 0) 4105 continue; 4106 if (rdev->new_raid_disk >= mddev->raid_disks) 4107 rdev->new_raid_disk = -1; 4108 if (rdev->new_raid_disk == rdev->raid_disk) 4109 continue; 4110 sysfs_unlink_rdev(mddev, rdev); 4111 } 4112 rdev_for_each(rdev, mddev) { 4113 if (rdev->raid_disk < 0) 4114 continue; 4115 if (rdev->new_raid_disk == rdev->raid_disk) 4116 continue; 4117 rdev->raid_disk = rdev->new_raid_disk; 4118 if (rdev->raid_disk < 0) 4119 clear_bit(In_sync, &rdev->flags); 4120 else { 4121 if (sysfs_link_rdev(mddev, rdev)) 4122 pr_warn("md: cannot register rd%d for %s after level change\n", 4123 rdev->raid_disk, mdname(mddev)); 4124 } 4125 } 4126 4127 if (pers->sync_request == NULL) { 4128 /* this is now an array without redundancy, so 4129 * it must always be in_sync 4130 */ 4131 mddev->in_sync = 1; 4132 del_timer_sync(&mddev->safemode_timer); 4133 } 4134 blk_set_stacking_limits(&mddev->queue->limits); 4135 pers->run(mddev); 4136 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 4137 if (!mddev->thread) 4138 md_update_sb(mddev, 1); 4139 sysfs_notify_dirent_safe(mddev->sysfs_level); 4140 md_new_event(); 4141 rv = len; 4142 out_unlock: 4143 mddev_unlock_and_resume(mddev); 4144 return rv; 4145 } 4146 4147 static struct md_sysfs_entry md_level = 4148 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 4149 4150 static ssize_t 4151 layout_show(struct mddev *mddev, char *page) 4152 { 4153 /* just a number, not meaningful for all levels */ 4154 if (mddev->reshape_position != MaxSector && 4155 mddev->layout != mddev->new_layout) 4156 return sprintf(page, "%d (%d)\n", 4157 mddev->new_layout, mddev->layout); 4158 return sprintf(page, "%d\n", mddev->layout); 4159 } 4160 4161 static ssize_t 4162 layout_store(struct mddev *mddev, const char *buf, size_t len) 4163 { 4164 unsigned int n; 4165 int err; 4166 4167 err = kstrtouint(buf, 10, &n); 4168 if (err < 0) 4169 return err; 4170 err = mddev_lock(mddev); 4171 if (err) 4172 return err; 4173 4174 if (mddev->pers) { 4175 if (mddev->pers->check_reshape == NULL) 4176 err = -EBUSY; 4177 else if (!md_is_rdwr(mddev)) 4178 err = -EROFS; 4179 else { 4180 mddev->new_layout = n; 4181 err = mddev->pers->check_reshape(mddev); 4182 if (err) 4183 mddev->new_layout = mddev->layout; 4184 } 4185 } else { 4186 mddev->new_layout = n; 4187 if (mddev->reshape_position == MaxSector) 4188 mddev->layout = n; 4189 } 4190 mddev_unlock(mddev); 4191 return err ?: len; 4192 } 4193 static struct md_sysfs_entry md_layout = 4194 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 4195 4196 static ssize_t 4197 
raid_disks_show(struct mddev *mddev, char *page) 4198 { 4199 if (mddev->raid_disks == 0) 4200 return 0; 4201 if (mddev->reshape_position != MaxSector && 4202 mddev->delta_disks != 0) 4203 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 4204 mddev->raid_disks - mddev->delta_disks); 4205 return sprintf(page, "%d\n", mddev->raid_disks); 4206 } 4207 4208 static int update_raid_disks(struct mddev *mddev, int raid_disks); 4209 4210 static ssize_t 4211 raid_disks_store(struct mddev *mddev, const char *buf, size_t len) 4212 { 4213 unsigned int n; 4214 int err; 4215 4216 err = kstrtouint(buf, 10, &n); 4217 if (err < 0) 4218 return err; 4219 4220 err = mddev_lock(mddev); 4221 if (err) 4222 return err; 4223 if (mddev->pers) 4224 err = update_raid_disks(mddev, n); 4225 else if (mddev->reshape_position != MaxSector) { 4226 struct md_rdev *rdev; 4227 int olddisks = mddev->raid_disks - mddev->delta_disks; 4228 4229 err = -EINVAL; 4230 rdev_for_each(rdev, mddev) { 4231 if (olddisks < n && 4232 rdev->data_offset < rdev->new_data_offset) 4233 goto out_unlock; 4234 if (olddisks > n && 4235 rdev->data_offset > rdev->new_data_offset) 4236 goto out_unlock; 4237 } 4238 err = 0; 4239 mddev->delta_disks = n - olddisks; 4240 mddev->raid_disks = n; 4241 mddev->reshape_backwards = (mddev->delta_disks < 0); 4242 } else 4243 mddev->raid_disks = n; 4244 out_unlock: 4245 mddev_unlock(mddev); 4246 return err ? err : len; 4247 } 4248 static struct md_sysfs_entry md_raid_disks = 4249 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 4250 4251 static ssize_t 4252 uuid_show(struct mddev *mddev, char *page) 4253 { 4254 return sprintf(page, "%pU\n", mddev->uuid); 4255 } 4256 static struct md_sysfs_entry md_uuid = 4257 __ATTR(uuid, S_IRUGO, uuid_show, NULL); 4258 4259 static ssize_t 4260 chunk_size_show(struct mddev *mddev, char *page) 4261 { 4262 if (mddev->reshape_position != MaxSector && 4263 mddev->chunk_sectors != mddev->new_chunk_sectors) 4264 return sprintf(page, "%d (%d)\n", 4265 mddev->new_chunk_sectors << 9, 4266 mddev->chunk_sectors << 9); 4267 return sprintf(page, "%d\n", mddev->chunk_sectors << 9); 4268 } 4269 4270 static ssize_t 4271 chunk_size_store(struct mddev *mddev, const char *buf, size_t len) 4272 { 4273 unsigned long n; 4274 int err; 4275 4276 err = kstrtoul(buf, 10, &n); 4277 if (err < 0) 4278 return err; 4279 4280 err = mddev_lock(mddev); 4281 if (err) 4282 return err; 4283 if (mddev->pers) { 4284 if (mddev->pers->check_reshape == NULL) 4285 err = -EBUSY; 4286 else if (!md_is_rdwr(mddev)) 4287 err = -EROFS; 4288 else { 4289 mddev->new_chunk_sectors = n >> 9; 4290 err = mddev->pers->check_reshape(mddev); 4291 if (err) 4292 mddev->new_chunk_sectors = mddev->chunk_sectors; 4293 } 4294 } else { 4295 mddev->new_chunk_sectors = n >> 9; 4296 if (mddev->reshape_position == MaxSector) 4297 mddev->chunk_sectors = n >> 9; 4298 } 4299 mddev_unlock(mddev); 4300 return err ?: len; 4301 } 4302 static struct md_sysfs_entry md_chunk_size = 4303 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 4304 4305 static ssize_t 4306 resync_start_show(struct mddev *mddev, char *page) 4307 { 4308 if (mddev->recovery_cp == MaxSector) 4309 return sprintf(page, "none\n"); 4310 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 4311 } 4312 4313 static ssize_t 4314 resync_start_store(struct mddev *mddev, const char *buf, size_t len) 4315 { 4316 unsigned long long n; 4317 int err; 4318 4319 if (cmd_match(buf, "none")) 4320 n = MaxSector; 4321 else { 4322 err = 
kstrtoull(buf, 10, &n); 4323 if (err < 0) 4324 return err; 4325 if (n != (sector_t)n) 4326 return -EINVAL; 4327 } 4328 4329 err = mddev_lock(mddev); 4330 if (err) 4331 return err; 4332 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 4333 err = -EBUSY; 4334 4335 if (!err) { 4336 mddev->recovery_cp = n; 4337 if (mddev->pers) 4338 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 4339 } 4340 mddev_unlock(mddev); 4341 return err ?: len; 4342 } 4343 static struct md_sysfs_entry md_resync_start = 4344 __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, 4345 resync_start_show, resync_start_store); 4346 4347 /* 4348 * The array state can be: 4349 * 4350 * clear 4351 * No devices, no size, no level 4352 * Equivalent to STOP_ARRAY ioctl 4353 * inactive 4354 * May have some settings, but array is not active 4355 * all IO results in error 4356 * When written, doesn't tear down array, but just stops it 4357 * suspended (not supported yet) 4358 * All IO requests will block. The array can be reconfigured. 4359 * Writing this, if accepted, will block until array is quiescent 4360 * readonly 4361 * no resync can happen. no superblocks get written. 4362 * write requests fail 4363 * read-auto 4364 * like readonly, but behaves like 'clean' on a write request. 4365 * 4366 * clean - no pending writes, but otherwise active. 4367 * When written to inactive array, starts without resync 4368 * If a write request arrives then 4369 * if metadata is known, mark 'dirty' and switch to 'active'. 4370 * if not known, block and switch to write-pending 4371 * If written to an active array that has pending writes, then fails. 4372 * active 4373 * fully active: IO and resync can be happening. 4374 * When written to inactive array, starts with resync 4375 * 4376 * write-pending 4377 * clean, but writes are blocked waiting for 'active' to be written. 4378 * 4379 * active-idle 4380 * like active, but no writes have been seen for a while (100msec). 4381 * 4382 * broken 4383 * Array is failed. It's useful because mounted-arrays aren't stopped 4384 * when array is failed, so this state will at least alert the user that 4385 * something is wrong. 
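 *
 * All of these are reported through the sysfs file 'array_state',
 * and the settable ones are written to the same file, e.g.
 *	cat /sys/block/md0/md/array_state
 *	echo readonly > /sys/block/md0/md/array_state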
4386 */ 4387 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 4388 write_pending, active_idle, broken, bad_word}; 4389 static char *array_states[] = { 4390 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 4391 "write-pending", "active-idle", "broken", NULL }; 4392 4393 static int match_word(const char *word, char **list) 4394 { 4395 int n; 4396 for (n=0; list[n]; n++) 4397 if (cmd_match(word, list[n])) 4398 break; 4399 return n; 4400 } 4401 4402 static ssize_t 4403 array_state_show(struct mddev *mddev, char *page) 4404 { 4405 enum array_state st = inactive; 4406 4407 if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) { 4408 switch(mddev->ro) { 4409 case MD_RDONLY: 4410 st = readonly; 4411 break; 4412 case MD_AUTO_READ: 4413 st = read_auto; 4414 break; 4415 case MD_RDWR: 4416 spin_lock(&mddev->lock); 4417 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 4418 st = write_pending; 4419 else if (mddev->in_sync) 4420 st = clean; 4421 else if (mddev->safemode) 4422 st = active_idle; 4423 else 4424 st = active; 4425 spin_unlock(&mddev->lock); 4426 } 4427 4428 if (test_bit(MD_BROKEN, &mddev->flags) && st == clean) 4429 st = broken; 4430 } else { 4431 if (list_empty(&mddev->disks) && 4432 mddev->raid_disks == 0 && 4433 mddev->dev_sectors == 0) 4434 st = clear; 4435 else 4436 st = inactive; 4437 } 4438 return sprintf(page, "%s\n", array_states[st]); 4439 } 4440 4441 static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev); 4442 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev); 4443 static int restart_array(struct mddev *mddev); 4444 4445 static ssize_t 4446 array_state_store(struct mddev *mddev, const char *buf, size_t len) 4447 { 4448 int err = 0; 4449 enum array_state st = match_word(buf, array_states); 4450 4451 /* No lock dependent actions */ 4452 switch (st) { 4453 case suspended: /* not supported yet */ 4454 case write_pending: /* cannot be set */ 4455 case active_idle: /* cannot be set */ 4456 case broken: /* cannot be set */ 4457 case bad_word: 4458 return -EINVAL; 4459 default: 4460 break; 4461 } 4462 4463 if (mddev->pers && (st == active || st == clean) && 4464 mddev->ro != MD_RDONLY) { 4465 /* don't take reconfig_mutex when toggling between 4466 * clean and active 4467 */ 4468 spin_lock(&mddev->lock); 4469 if (st == active) { 4470 restart_array(mddev); 4471 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4472 md_wakeup_thread(mddev->thread); 4473 wake_up(&mddev->sb_wait); 4474 } else /* st == clean */ { 4475 restart_array(mddev); 4476 if (!set_in_sync(mddev)) 4477 err = -EBUSY; 4478 } 4479 if (!err) 4480 sysfs_notify_dirent_safe(mddev->sysfs_state); 4481 spin_unlock(&mddev->lock); 4482 return err ?: len; 4483 } 4484 err = mddev_lock(mddev); 4485 if (err) 4486 return err; 4487 4488 switch (st) { 4489 case inactive: 4490 /* stop an active array, return 0 otherwise */ 4491 if (mddev->pers) 4492 err = do_md_stop(mddev, 2, NULL); 4493 break; 4494 case clear: 4495 err = do_md_stop(mddev, 0, NULL); 4496 break; 4497 case readonly: 4498 if (mddev->pers) 4499 err = md_set_readonly(mddev, NULL); 4500 else { 4501 mddev->ro = MD_RDONLY; 4502 set_disk_ro(mddev->gendisk, 1); 4503 err = do_md_run(mddev); 4504 } 4505 break; 4506 case read_auto: 4507 if (mddev->pers) { 4508 if (md_is_rdwr(mddev)) 4509 err = md_set_readonly(mddev, NULL); 4510 else if (mddev->ro == MD_RDONLY) 4511 err = restart_array(mddev); 4512 if (err == 0) { 4513 mddev->ro = MD_AUTO_READ; 4514 
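			/* read-auto: the disk is writable again, but md
			 * avoids superblock updates and resync until a
			 * write request actually arrives.
			 */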
set_disk_ro(mddev->gendisk, 0); 4515 } 4516 } else { 4517 mddev->ro = MD_AUTO_READ; 4518 err = do_md_run(mddev); 4519 } 4520 break; 4521 case clean: 4522 if (mddev->pers) { 4523 err = restart_array(mddev); 4524 if (err) 4525 break; 4526 spin_lock(&mddev->lock); 4527 if (!set_in_sync(mddev)) 4528 err = -EBUSY; 4529 spin_unlock(&mddev->lock); 4530 } else 4531 err = -EINVAL; 4532 break; 4533 case active: 4534 if (mddev->pers) { 4535 err = restart_array(mddev); 4536 if (err) 4537 break; 4538 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4539 wake_up(&mddev->sb_wait); 4540 err = 0; 4541 } else { 4542 mddev->ro = MD_RDWR; 4543 set_disk_ro(mddev->gendisk, 0); 4544 err = do_md_run(mddev); 4545 } 4546 break; 4547 default: 4548 err = -EINVAL; 4549 break; 4550 } 4551 4552 if (!err) { 4553 if (mddev->hold_active == UNTIL_IOCTL) 4554 mddev->hold_active = 0; 4555 sysfs_notify_dirent_safe(mddev->sysfs_state); 4556 } 4557 mddev_unlock(mddev); 4558 return err ?: len; 4559 } 4560 static struct md_sysfs_entry md_array_state = 4561 __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 4562 4563 static ssize_t 4564 max_corrected_read_errors_show(struct mddev *mddev, char *page) { 4565 return sprintf(page, "%d\n", 4566 atomic_read(&mddev->max_corr_read_errors)); 4567 } 4568 4569 static ssize_t 4570 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) 4571 { 4572 unsigned int n; 4573 int rv; 4574 4575 rv = kstrtouint(buf, 10, &n); 4576 if (rv < 0) 4577 return rv; 4578 if (n > INT_MAX) 4579 return -EINVAL; 4580 atomic_set(&mddev->max_corr_read_errors, n); 4581 return len; 4582 } 4583 4584 static struct md_sysfs_entry max_corr_read_errors = 4585 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, 4586 max_corrected_read_errors_store); 4587 4588 static ssize_t 4589 null_show(struct mddev *mddev, char *page) 4590 { 4591 return -EINVAL; 4592 } 4593 4594 static ssize_t 4595 new_dev_store(struct mddev *mddev, const char *buf, size_t len) 4596 { 4597 /* buf must be %d:%d\n? giving major and minor numbers */ 4598 /* The new device is added to the array. 4599 * If the array has a persistent superblock, we read the 4600 * superblock to initialise info and check validity. 4601 * Otherwise, only checking done is that in bind_rdev_to_array, 4602 * which mainly checks size. 
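 *
 * e.g. "echo 8:32 > new_dev" adds the block device with major 8
 * and minor 32 (typically /dev/sdc) to this array.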
4603 */ 4604 char *e; 4605 int major = simple_strtoul(buf, &e, 10); 4606 int minor; 4607 dev_t dev; 4608 struct md_rdev *rdev; 4609 int err; 4610 4611 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 4612 return -EINVAL; 4613 minor = simple_strtoul(e+1, &e, 10); 4614 if (*e && *e != '\n') 4615 return -EINVAL; 4616 dev = MKDEV(major, minor); 4617 if (major != MAJOR(dev) || 4618 minor != MINOR(dev)) 4619 return -EOVERFLOW; 4620 4621 err = mddev_suspend_and_lock(mddev); 4622 if (err) 4623 return err; 4624 if (mddev->persistent) { 4625 rdev = md_import_device(dev, mddev->major_version, 4626 mddev->minor_version); 4627 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 4628 struct md_rdev *rdev0 4629 = list_entry(mddev->disks.next, 4630 struct md_rdev, same_set); 4631 err = super_types[mddev->major_version] 4632 .load_super(rdev, rdev0, mddev->minor_version); 4633 if (err < 0) 4634 goto out; 4635 } 4636 } else if (mddev->external) 4637 rdev = md_import_device(dev, -2, -1); 4638 else 4639 rdev = md_import_device(dev, -1, -1); 4640 4641 if (IS_ERR(rdev)) { 4642 mddev_unlock_and_resume(mddev); 4643 return PTR_ERR(rdev); 4644 } 4645 err = bind_rdev_to_array(rdev, mddev); 4646 out: 4647 if (err) 4648 export_rdev(rdev, mddev); 4649 mddev_unlock_and_resume(mddev); 4650 if (!err) 4651 md_new_event(); 4652 return err ? err : len; 4653 } 4654 4655 static struct md_sysfs_entry md_new_device = 4656 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 4657 4658 static ssize_t 4659 bitmap_store(struct mddev *mddev, const char *buf, size_t len) 4660 { 4661 char *end; 4662 unsigned long chunk, end_chunk; 4663 int err; 4664 4665 err = mddev_lock(mddev); 4666 if (err) 4667 return err; 4668 if (!mddev->bitmap) 4669 goto out; 4670 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ 4671 while (*buf) { 4672 chunk = end_chunk = simple_strtoul(buf, &end, 0); 4673 if (buf == end) break; 4674 if (*end == '-') { /* range */ 4675 buf = end + 1; 4676 end_chunk = simple_strtoul(buf, &end, 0); 4677 if (buf == end) break; 4678 } 4679 if (*end && !isspace(*end)) break; 4680 md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); 4681 buf = skip_spaces(end); 4682 } 4683 md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ 4684 out: 4685 mddev_unlock(mddev); 4686 return len; 4687 } 4688 4689 static struct md_sysfs_entry md_bitmap = 4690 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 4691 4692 static ssize_t 4693 size_show(struct mddev *mddev, char *page) 4694 { 4695 return sprintf(page, "%llu\n", 4696 (unsigned long long)mddev->dev_sectors / 2); 4697 } 4698 4699 static int update_size(struct mddev *mddev, sector_t num_sectors); 4700 4701 static ssize_t 4702 size_store(struct mddev *mddev, const char *buf, size_t len) 4703 { 4704 /* If array is inactive, we can reduce the component size, but 4705 * not increase it (except from 0). 4706 * If array is active, we can try an on-line resize 4707 */ 4708 sector_t sectors; 4709 int err = strict_blocks_to_sectors(buf, §ors); 4710 4711 if (err < 0) 4712 return err; 4713 err = mddev_lock(mddev); 4714 if (err) 4715 return err; 4716 if (mddev->pers) { 4717 err = update_size(mddev, sectors); 4718 if (err == 0) 4719 md_update_sb(mddev, 1); 4720 } else { 4721 if (mddev->dev_sectors == 0 || 4722 mddev->dev_sectors > sectors) 4723 mddev->dev_sectors = sectors; 4724 else 4725 err = -ENOSPC; 4726 } 4727 mddev_unlock(mddev); 4728 return err ? 
err : len; 4729 } 4730 4731 static struct md_sysfs_entry md_size = 4732 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 4733 4734 /* Metadata version. 4735 * This is one of 4736 * 'none' for arrays with no metadata (good luck...) 4737 * 'external' for arrays with externally managed metadata, 4738 * or N.M for internally known formats 4739 */ 4740 static ssize_t 4741 metadata_show(struct mddev *mddev, char *page) 4742 { 4743 if (mddev->persistent) 4744 return sprintf(page, "%d.%d\n", 4745 mddev->major_version, mddev->minor_version); 4746 else if (mddev->external) 4747 return sprintf(page, "external:%s\n", mddev->metadata_type); 4748 else 4749 return sprintf(page, "none\n"); 4750 } 4751 4752 static ssize_t 4753 metadata_store(struct mddev *mddev, const char *buf, size_t len) 4754 { 4755 int major, minor; 4756 char *e; 4757 int err; 4758 /* Changing the details of 'external' metadata is 4759 * always permitted. Otherwise there must be 4760 * no devices attached to the array. 4761 */ 4762 4763 err = mddev_lock(mddev); 4764 if (err) 4765 return err; 4766 err = -EBUSY; 4767 if (mddev->external && strncmp(buf, "external:", 9) == 0) 4768 ; 4769 else if (!list_empty(&mddev->disks)) 4770 goto out_unlock; 4771 4772 err = 0; 4773 if (cmd_match(buf, "none")) { 4774 mddev->persistent = 0; 4775 mddev->external = 0; 4776 mddev->major_version = 0; 4777 mddev->minor_version = 90; 4778 goto out_unlock; 4779 } 4780 if (strncmp(buf, "external:", 9) == 0) { 4781 size_t namelen = len-9; 4782 if (namelen >= sizeof(mddev->metadata_type)) 4783 namelen = sizeof(mddev->metadata_type)-1; 4784 memcpy(mddev->metadata_type, buf+9, namelen); 4785 mddev->metadata_type[namelen] = 0; 4786 if (namelen && mddev->metadata_type[namelen-1] == '\n') 4787 mddev->metadata_type[--namelen] = 0; 4788 mddev->persistent = 0; 4789 mddev->external = 1; 4790 mddev->major_version = 0; 4791 mddev->minor_version = 90; 4792 goto out_unlock; 4793 } 4794 major = simple_strtoul(buf, &e, 10); 4795 err = -EINVAL; 4796 if (e==buf || *e != '.') 4797 goto out_unlock; 4798 buf = e+1; 4799 minor = simple_strtoul(buf, &e, 10); 4800 if (e==buf || (*e && *e != '\n') ) 4801 goto out_unlock; 4802 err = -ENOENT; 4803 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 4804 goto out_unlock; 4805 mddev->major_version = major; 4806 mddev->minor_version = minor; 4807 mddev->persistent = 1; 4808 mddev->external = 0; 4809 err = 0; 4810 out_unlock: 4811 mddev_unlock(mddev); 4812 return err ?: len; 4813 } 4814 4815 static struct md_sysfs_entry md_metadata = 4816 __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 4817 4818 static ssize_t 4819 action_show(struct mddev *mddev, char *page) 4820 { 4821 char *type = "idle"; 4822 unsigned long recovery = mddev->recovery; 4823 if (test_bit(MD_RECOVERY_FROZEN, &recovery)) 4824 type = "frozen"; 4825 else if (test_bit(MD_RECOVERY_RUNNING, &recovery) || 4826 (md_is_rdwr(mddev) && test_bit(MD_RECOVERY_NEEDED, &recovery))) { 4827 if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) 4828 type = "reshape"; 4829 else if (test_bit(MD_RECOVERY_SYNC, &recovery)) { 4830 if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) 4831 type = "resync"; 4832 else if (test_bit(MD_RECOVERY_CHECK, &recovery)) 4833 type = "check"; 4834 else 4835 type = "repair"; 4836 } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) 4837 type = "recover"; 4838 else if (mddev->reshape_position != MaxSector) 4839 type = "reshape"; 4840 } 4841 return sprintf(page, "%s\n", type); 4842 } 4843 4844 static void 
stop_sync_thread(struct mddev *mddev) 4845 { 4846 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4847 return; 4848 4849 if (mddev_lock(mddev)) 4850 return; 4851 4852 /* 4853 * Check again in case MD_RECOVERY_RUNNING is cleared before lock is 4854 * held. 4855 */ 4856 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 4857 mddev_unlock(mddev); 4858 return; 4859 } 4860 4861 if (work_pending(&mddev->del_work)) 4862 flush_workqueue(md_misc_wq); 4863 4864 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4865 /* 4866 * Thread might be blocked waiting for metadata update which will now 4867 * never happen 4868 */ 4869 md_wakeup_thread_directly(mddev->sync_thread); 4870 4871 mddev_unlock(mddev); 4872 } 4873 4874 static void idle_sync_thread(struct mddev *mddev) 4875 { 4876 int sync_seq = atomic_read(&mddev->sync_seq); 4877 4878 mutex_lock(&mddev->sync_mutex); 4879 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4880 stop_sync_thread(mddev); 4881 4882 wait_event(resync_wait, sync_seq != atomic_read(&mddev->sync_seq) || 4883 !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); 4884 4885 mutex_unlock(&mddev->sync_mutex); 4886 } 4887 4888 static void frozen_sync_thread(struct mddev *mddev) 4889 { 4890 mutex_lock(&mddev->sync_mutex); 4891 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4892 stop_sync_thread(mddev); 4893 4894 wait_event(resync_wait, mddev->sync_thread == NULL && 4895 !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); 4896 4897 mutex_unlock(&mddev->sync_mutex); 4898 } 4899 4900 static ssize_t 4901 action_store(struct mddev *mddev, const char *page, size_t len) 4902 { 4903 if (!mddev->pers || !mddev->pers->sync_request) 4904 return -EINVAL; 4905 4906 4907 if (cmd_match(page, "idle")) 4908 idle_sync_thread(mddev); 4909 else if (cmd_match(page, "frozen")) 4910 frozen_sync_thread(mddev); 4911 else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4912 return -EBUSY; 4913 else if (cmd_match(page, "resync")) 4914 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4915 else if (cmd_match(page, "recover")) { 4916 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4917 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 4918 } else if (cmd_match(page, "reshape")) { 4919 int err; 4920 if (mddev->pers->start_reshape == NULL) 4921 return -EINVAL; 4922 err = mddev_lock(mddev); 4923 if (!err) { 4924 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 4925 err = -EBUSY; 4926 } else if (mddev->reshape_position == MaxSector || 4927 mddev->pers->check_reshape == NULL || 4928 mddev->pers->check_reshape(mddev)) { 4929 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4930 err = mddev->pers->start_reshape(mddev); 4931 } else { 4932 /* 4933 * If reshape is still in progress, and 4934 * md_check_recovery() can continue to reshape, 4935 * don't restart reshape because data can be 4936 * corrupted for raid456. 
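 * Clearing MD_RECOVERY_FROZEN below, together with the
 * MD_RECOVERY_NEEDED set at the end of this function, lets
 * md_check_recovery() carry on with the reshape already in
 * progress.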
4937 */ 4938 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4939 } 4940 mddev_unlock(mddev); 4941 } 4942 if (err) 4943 return err; 4944 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 4945 } else { 4946 if (cmd_match(page, "check")) 4947 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4948 else if (!cmd_match(page, "repair")) 4949 return -EINVAL; 4950 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4951 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 4952 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4953 } 4954 if (mddev->ro == MD_AUTO_READ) { 4955 /* A write to sync_action is enough to justify 4956 * canceling read-auto mode 4957 */ 4958 flush_work(&mddev->sync_work); 4959 mddev->ro = MD_RDWR; 4960 md_wakeup_thread(mddev->sync_thread); 4961 } 4962 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4963 md_wakeup_thread(mddev->thread); 4964 sysfs_notify_dirent_safe(mddev->sysfs_action); 4965 return len; 4966 } 4967 4968 static struct md_sysfs_entry md_scan_mode = 4969 __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 4970 4971 static ssize_t 4972 last_sync_action_show(struct mddev *mddev, char *page) 4973 { 4974 return sprintf(page, "%s\n", mddev->last_sync_action); 4975 } 4976 4977 static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); 4978 4979 static ssize_t 4980 mismatch_cnt_show(struct mddev *mddev, char *page) 4981 { 4982 return sprintf(page, "%llu\n", 4983 (unsigned long long) 4984 atomic64_read(&mddev->resync_mismatches)); 4985 } 4986 4987 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 4988 4989 static ssize_t 4990 sync_min_show(struct mddev *mddev, char *page) 4991 { 4992 return sprintf(page, "%d (%s)\n", speed_min(mddev), 4993 mddev->sync_speed_min ? "local": "system"); 4994 } 4995 4996 static ssize_t 4997 sync_min_store(struct mddev *mddev, const char *buf, size_t len) 4998 { 4999 unsigned int min; 5000 int rv; 5001 5002 if (strncmp(buf, "system", 6)==0) { 5003 min = 0; 5004 } else { 5005 rv = kstrtouint(buf, 10, &min); 5006 if (rv < 0) 5007 return rv; 5008 if (min == 0) 5009 return -EINVAL; 5010 } 5011 mddev->sync_speed_min = min; 5012 return len; 5013 } 5014 5015 static struct md_sysfs_entry md_sync_min = 5016 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 5017 5018 static ssize_t 5019 sync_max_show(struct mddev *mddev, char *page) 5020 { 5021 return sprintf(page, "%d (%s)\n", speed_max(mddev), 5022 mddev->sync_speed_max ? 
"local": "system"); 5023 } 5024 5025 static ssize_t 5026 sync_max_store(struct mddev *mddev, const char *buf, size_t len) 5027 { 5028 unsigned int max; 5029 int rv; 5030 5031 if (strncmp(buf, "system", 6)==0) { 5032 max = 0; 5033 } else { 5034 rv = kstrtouint(buf, 10, &max); 5035 if (rv < 0) 5036 return rv; 5037 if (max == 0) 5038 return -EINVAL; 5039 } 5040 mddev->sync_speed_max = max; 5041 return len; 5042 } 5043 5044 static struct md_sysfs_entry md_sync_max = 5045 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 5046 5047 static ssize_t 5048 degraded_show(struct mddev *mddev, char *page) 5049 { 5050 return sprintf(page, "%d\n", mddev->degraded); 5051 } 5052 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 5053 5054 static ssize_t 5055 sync_force_parallel_show(struct mddev *mddev, char *page) 5056 { 5057 return sprintf(page, "%d\n", mddev->parallel_resync); 5058 } 5059 5060 static ssize_t 5061 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) 5062 { 5063 long n; 5064 5065 if (kstrtol(buf, 10, &n)) 5066 return -EINVAL; 5067 5068 if (n != 0 && n != 1) 5069 return -EINVAL; 5070 5071 mddev->parallel_resync = n; 5072 5073 if (mddev->sync_thread) 5074 wake_up(&resync_wait); 5075 5076 return len; 5077 } 5078 5079 /* force parallel resync, even with shared block devices */ 5080 static struct md_sysfs_entry md_sync_force_parallel = 5081 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 5082 sync_force_parallel_show, sync_force_parallel_store); 5083 5084 static ssize_t 5085 sync_speed_show(struct mddev *mddev, char *page) 5086 { 5087 unsigned long resync, dt, db; 5088 if (mddev->curr_resync == MD_RESYNC_NONE) 5089 return sprintf(page, "none\n"); 5090 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 5091 dt = (jiffies - mddev->resync_mark) / HZ; 5092 if (!dt) dt++; 5093 db = resync - mddev->resync_mark_cnt; 5094 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 5095 } 5096 5097 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 5098 5099 static ssize_t 5100 sync_completed_show(struct mddev *mddev, char *page) 5101 { 5102 unsigned long long max_sectors, resync; 5103 5104 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5105 return sprintf(page, "none\n"); 5106 5107 if (mddev->curr_resync == MD_RESYNC_YIELDED || 5108 mddev->curr_resync == MD_RESYNC_DELAYED) 5109 return sprintf(page, "delayed\n"); 5110 5111 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 5112 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5113 max_sectors = mddev->resync_max_sectors; 5114 else 5115 max_sectors = mddev->dev_sectors; 5116 5117 resync = mddev->curr_resync_completed; 5118 return sprintf(page, "%llu / %llu\n", resync, max_sectors); 5119 } 5120 5121 static struct md_sysfs_entry md_sync_completed = 5122 __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL); 5123 5124 static ssize_t 5125 min_sync_show(struct mddev *mddev, char *page) 5126 { 5127 return sprintf(page, "%llu\n", 5128 (unsigned long long)mddev->resync_min); 5129 } 5130 static ssize_t 5131 min_sync_store(struct mddev *mddev, const char *buf, size_t len) 5132 { 5133 unsigned long long min; 5134 int err; 5135 5136 if (kstrtoull(buf, 10, &min)) 5137 return -EINVAL; 5138 5139 spin_lock(&mddev->lock); 5140 err = -EINVAL; 5141 if (min > mddev->resync_max) 5142 goto out_unlock; 5143 5144 err = -EBUSY; 5145 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5146 goto out_unlock; 5147 5148 /* Round down to multiple of 4K for safety */ 5149 
mddev->resync_min = round_down(min, 8); 5150 err = 0; 5151 5152 out_unlock: 5153 spin_unlock(&mddev->lock); 5154 return err ?: len; 5155 } 5156 5157 static struct md_sysfs_entry md_min_sync = 5158 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 5159 5160 static ssize_t 5161 max_sync_show(struct mddev *mddev, char *page) 5162 { 5163 if (mddev->resync_max == MaxSector) 5164 return sprintf(page, "max\n"); 5165 else 5166 return sprintf(page, "%llu\n", 5167 (unsigned long long)mddev->resync_max); 5168 } 5169 static ssize_t 5170 max_sync_store(struct mddev *mddev, const char *buf, size_t len) 5171 { 5172 int err; 5173 spin_lock(&mddev->lock); 5174 if (strncmp(buf, "max", 3) == 0) 5175 mddev->resync_max = MaxSector; 5176 else { 5177 unsigned long long max; 5178 int chunk; 5179 5180 err = -EINVAL; 5181 if (kstrtoull(buf, 10, &max)) 5182 goto out_unlock; 5183 if (max < mddev->resync_min) 5184 goto out_unlock; 5185 5186 err = -EBUSY; 5187 if (max < mddev->resync_max && md_is_rdwr(mddev) && 5188 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5189 goto out_unlock; 5190 5191 /* Must be a multiple of chunk_size */ 5192 chunk = mddev->chunk_sectors; 5193 if (chunk) { 5194 sector_t temp = max; 5195 5196 err = -EINVAL; 5197 if (sector_div(temp, chunk)) 5198 goto out_unlock; 5199 } 5200 mddev->resync_max = max; 5201 } 5202 wake_up(&mddev->recovery_wait); 5203 err = 0; 5204 out_unlock: 5205 spin_unlock(&mddev->lock); 5206 return err ?: len; 5207 } 5208 5209 static struct md_sysfs_entry md_max_sync = 5210 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 5211 5212 static ssize_t 5213 suspend_lo_show(struct mddev *mddev, char *page) 5214 { 5215 return sprintf(page, "%llu\n", 5216 (unsigned long long)READ_ONCE(mddev->suspend_lo)); 5217 } 5218 5219 static ssize_t 5220 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) 5221 { 5222 unsigned long long new; 5223 int err; 5224 5225 err = kstrtoull(buf, 10, &new); 5226 if (err < 0) 5227 return err; 5228 if (new != (sector_t)new) 5229 return -EINVAL; 5230 5231 err = mddev_suspend(mddev, true); 5232 if (err) 5233 return err; 5234 5235 WRITE_ONCE(mddev->suspend_lo, new); 5236 mddev_resume(mddev); 5237 5238 return len; 5239 } 5240 static struct md_sysfs_entry md_suspend_lo = 5241 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 5242 5243 static ssize_t 5244 suspend_hi_show(struct mddev *mddev, char *page) 5245 { 5246 return sprintf(page, "%llu\n", 5247 (unsigned long long)READ_ONCE(mddev->suspend_hi)); 5248 } 5249 5250 static ssize_t 5251 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) 5252 { 5253 unsigned long long new; 5254 int err; 5255 5256 err = kstrtoull(buf, 10, &new); 5257 if (err < 0) 5258 return err; 5259 if (new != (sector_t)new) 5260 return -EINVAL; 5261 5262 err = mddev_suspend(mddev, true); 5263 if (err) 5264 return err; 5265 5266 WRITE_ONCE(mddev->suspend_hi, new); 5267 mddev_resume(mddev); 5268 5269 return len; 5270 } 5271 static struct md_sysfs_entry md_suspend_hi = 5272 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 5273 5274 static ssize_t 5275 reshape_position_show(struct mddev *mddev, char *page) 5276 { 5277 if (mddev->reshape_position != MaxSector) 5278 return sprintf(page, "%llu\n", 5279 (unsigned long long)mddev->reshape_position); 5280 strcpy(page, "none\n"); 5281 return 5; 5282 } 5283 5284 static ssize_t 5285 reshape_position_store(struct mddev *mddev, const char *buf, size_t len) 5286 { 5287 struct md_rdev *rdev; 5288 unsigned 
long long new; 5289 int err; 5290 5291 err = kstrtoull(buf, 10, &new); 5292 if (err < 0) 5293 return err; 5294 if (new != (sector_t)new) 5295 return -EINVAL; 5296 err = mddev_lock(mddev); 5297 if (err) 5298 return err; 5299 err = -EBUSY; 5300 if (mddev->pers) 5301 goto unlock; 5302 mddev->reshape_position = new; 5303 mddev->delta_disks = 0; 5304 mddev->reshape_backwards = 0; 5305 mddev->new_level = mddev->level; 5306 mddev->new_layout = mddev->layout; 5307 mddev->new_chunk_sectors = mddev->chunk_sectors; 5308 rdev_for_each(rdev, mddev) 5309 rdev->new_data_offset = rdev->data_offset; 5310 err = 0; 5311 unlock: 5312 mddev_unlock(mddev); 5313 return err ?: len; 5314 } 5315 5316 static struct md_sysfs_entry md_reshape_position = 5317 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 5318 reshape_position_store); 5319 5320 static ssize_t 5321 reshape_direction_show(struct mddev *mddev, char *page) 5322 { 5323 return sprintf(page, "%s\n", 5324 mddev->reshape_backwards ? "backwards" : "forwards"); 5325 } 5326 5327 static ssize_t 5328 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) 5329 { 5330 int backwards = 0; 5331 int err; 5332 5333 if (cmd_match(buf, "forwards")) 5334 backwards = 0; 5335 else if (cmd_match(buf, "backwards")) 5336 backwards = 1; 5337 else 5338 return -EINVAL; 5339 if (mddev->reshape_backwards == backwards) 5340 return len; 5341 5342 err = mddev_lock(mddev); 5343 if (err) 5344 return err; 5345 /* check if we are allowed to change */ 5346 if (mddev->delta_disks) 5347 err = -EBUSY; 5348 else if (mddev->persistent && 5349 mddev->major_version == 0) 5350 err = -EINVAL; 5351 else 5352 mddev->reshape_backwards = backwards; 5353 mddev_unlock(mddev); 5354 return err ?: len; 5355 } 5356 5357 static struct md_sysfs_entry md_reshape_direction = 5358 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, 5359 reshape_direction_store); 5360 5361 static ssize_t 5362 array_size_show(struct mddev *mddev, char *page) 5363 { 5364 if (mddev->external_size) 5365 return sprintf(page, "%llu\n", 5366 (unsigned long long)mddev->array_sectors/2); 5367 else 5368 return sprintf(page, "default\n"); 5369 } 5370 5371 static ssize_t 5372 array_size_store(struct mddev *mddev, const char *buf, size_t len) 5373 { 5374 sector_t sectors; 5375 int err; 5376 5377 err = mddev_lock(mddev); 5378 if (err) 5379 return err; 5380 5381 /* cluster raid doesn't support change array_sectors */ 5382 if (mddev_is_clustered(mddev)) { 5383 mddev_unlock(mddev); 5384 return -EINVAL; 5385 } 5386 5387 if (strncmp(buf, "default", 7) == 0) { 5388 if (mddev->pers) 5389 sectors = mddev->pers->size(mddev, 0, 0); 5390 else 5391 sectors = mddev->array_sectors; 5392 5393 mddev->external_size = 0; 5394 } else { 5395 if (strict_blocks_to_sectors(buf, §ors) < 0) 5396 err = -EINVAL; 5397 else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) 5398 err = -E2BIG; 5399 else 5400 mddev->external_size = 1; 5401 } 5402 5403 if (!err) { 5404 mddev->array_sectors = sectors; 5405 if (mddev->pers) 5406 set_capacity_and_notify(mddev->gendisk, 5407 mddev->array_sectors); 5408 } 5409 mddev_unlock(mddev); 5410 return err ?: len; 5411 } 5412 5413 static struct md_sysfs_entry md_array_size = 5414 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, 5415 array_size_store); 5416 5417 static ssize_t 5418 consistency_policy_show(struct mddev *mddev, char *page) 5419 { 5420 int ret; 5421 5422 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 5423 ret = sprintf(page, "journal\n"); 5424 } else if 
(test_bit(MD_HAS_PPL, &mddev->flags)) { 5425 ret = sprintf(page, "ppl\n"); 5426 } else if (mddev->bitmap) { 5427 ret = sprintf(page, "bitmap\n"); 5428 } else if (mddev->pers) { 5429 if (mddev->pers->sync_request) 5430 ret = sprintf(page, "resync\n"); 5431 else 5432 ret = sprintf(page, "none\n"); 5433 } else { 5434 ret = sprintf(page, "unknown\n"); 5435 } 5436 5437 return ret; 5438 } 5439 5440 static ssize_t 5441 consistency_policy_store(struct mddev *mddev, const char *buf, size_t len) 5442 { 5443 int err = 0; 5444 5445 if (mddev->pers) { 5446 if (mddev->pers->change_consistency_policy) 5447 err = mddev->pers->change_consistency_policy(mddev, buf); 5448 else 5449 err = -EBUSY; 5450 } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) { 5451 set_bit(MD_HAS_PPL, &mddev->flags); 5452 } else { 5453 err = -EINVAL; 5454 } 5455 5456 return err ? err : len; 5457 } 5458 5459 static struct md_sysfs_entry md_consistency_policy = 5460 __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show, 5461 consistency_policy_store); 5462 5463 static ssize_t fail_last_dev_show(struct mddev *mddev, char *page) 5464 { 5465 return sprintf(page, "%d\n", mddev->fail_last_dev); 5466 } 5467 5468 /* 5469 * Setting fail_last_dev to true to allow last device to be forcibly removed 5470 * from RAID1/RAID10. 5471 */ 5472 static ssize_t 5473 fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len) 5474 { 5475 int ret; 5476 bool value; 5477 5478 ret = kstrtobool(buf, &value); 5479 if (ret) 5480 return ret; 5481 5482 if (value != mddev->fail_last_dev) 5483 mddev->fail_last_dev = value; 5484 5485 return len; 5486 } 5487 static struct md_sysfs_entry md_fail_last_dev = 5488 __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, 5489 fail_last_dev_store); 5490 5491 static ssize_t serialize_policy_show(struct mddev *mddev, char *page) 5492 { 5493 if (mddev->pers == NULL || (mddev->pers->level != 1)) 5494 return sprintf(page, "n/a\n"); 5495 else 5496 return sprintf(page, "%d\n", mddev->serialize_policy); 5497 } 5498 5499 /* 5500 * Setting serialize_policy to true to enforce write IO is not reordered 5501 * for raid1. 
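 * e.g. "echo 1 > /sys/block/mdX/md/serialize_policy" makes each
 * member rdev allocate the serialization trees set up by
 * rdev_init_serial(), so overlapping writes cannot be reordered.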
5502 */ 5503 static ssize_t 5504 serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) 5505 { 5506 int err; 5507 bool value; 5508 5509 err = kstrtobool(buf, &value); 5510 if (err) 5511 return err; 5512 5513 if (value == mddev->serialize_policy) 5514 return len; 5515 5516 err = mddev_suspend_and_lock(mddev); 5517 if (err) 5518 return err; 5519 if (mddev->pers == NULL || (mddev->pers->level != 1)) { 5520 pr_err("md: serialize_policy is only effective for raid1\n"); 5521 err = -EINVAL; 5522 goto unlock; 5523 } 5524 5525 if (value) 5526 mddev_create_serial_pool(mddev, NULL); 5527 else 5528 mddev_destroy_serial_pool(mddev, NULL); 5529 mddev->serialize_policy = value; 5530 unlock: 5531 mddev_unlock_and_resume(mddev); 5532 return err ?: len; 5533 } 5534 5535 static struct md_sysfs_entry md_serialize_policy = 5536 __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, 5537 serialize_policy_store); 5538 5539 5540 static struct attribute *md_default_attrs[] = { 5541 &md_level.attr, 5542 &md_layout.attr, 5543 &md_raid_disks.attr, 5544 &md_uuid.attr, 5545 &md_chunk_size.attr, 5546 &md_size.attr, 5547 &md_resync_start.attr, 5548 &md_metadata.attr, 5549 &md_new_device.attr, 5550 &md_safe_delay.attr, 5551 &md_array_state.attr, 5552 &md_reshape_position.attr, 5553 &md_reshape_direction.attr, 5554 &md_array_size.attr, 5555 &max_corr_read_errors.attr, 5556 &md_consistency_policy.attr, 5557 &md_fail_last_dev.attr, 5558 &md_serialize_policy.attr, 5559 NULL, 5560 }; 5561 5562 static const struct attribute_group md_default_group = { 5563 .attrs = md_default_attrs, 5564 }; 5565 5566 static struct attribute *md_redundancy_attrs[] = { 5567 &md_scan_mode.attr, 5568 &md_last_scan_mode.attr, 5569 &md_mismatches.attr, 5570 &md_sync_min.attr, 5571 &md_sync_max.attr, 5572 &md_sync_speed.attr, 5573 &md_sync_force_parallel.attr, 5574 &md_sync_completed.attr, 5575 &md_min_sync.attr, 5576 &md_max_sync.attr, 5577 &md_suspend_lo.attr, 5578 &md_suspend_hi.attr, 5579 &md_bitmap.attr, 5580 &md_degraded.attr, 5581 NULL, 5582 }; 5583 static const struct attribute_group md_redundancy_group = { 5584 .name = NULL, 5585 .attrs = md_redundancy_attrs, 5586 }; 5587 5588 static const struct attribute_group *md_attr_groups[] = { 5589 &md_default_group, 5590 &md_bitmap_group, 5591 NULL, 5592 }; 5593 5594 static ssize_t 5595 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 5596 { 5597 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 5598 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 5599 ssize_t rv; 5600 5601 if (!entry->show) 5602 return -EIO; 5603 spin_lock(&all_mddevs_lock); 5604 if (!mddev_get(mddev)) { 5605 spin_unlock(&all_mddevs_lock); 5606 return -EBUSY; 5607 } 5608 spin_unlock(&all_mddevs_lock); 5609 5610 rv = entry->show(mddev, page); 5611 mddev_put(mddev); 5612 return rv; 5613 } 5614 5615 static ssize_t 5616 md_attr_store(struct kobject *kobj, struct attribute *attr, 5617 const char *page, size_t length) 5618 { 5619 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 5620 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 5621 ssize_t rv; 5622 5623 if (!entry->store) 5624 return -EIO; 5625 if (!capable(CAP_SYS_ADMIN)) 5626 return -EACCES; 5627 spin_lock(&all_mddevs_lock); 5628 if (!mddev_get(mddev)) { 5629 spin_unlock(&all_mddevs_lock); 5630 return -EBUSY; 5631 } 5632 spin_unlock(&all_mddevs_lock); 5633 rv = entry->store(mddev, page, length); 5634 mddev_put(mddev); 5635 return rv; 5636 } 5637 
5638 static void md_kobj_release(struct kobject *ko) 5639 { 5640 struct mddev *mddev = container_of(ko, struct mddev, kobj); 5641 5642 if (mddev->sysfs_state) 5643 sysfs_put(mddev->sysfs_state); 5644 if (mddev->sysfs_level) 5645 sysfs_put(mddev->sysfs_level); 5646 5647 del_gendisk(mddev->gendisk); 5648 put_disk(mddev->gendisk); 5649 } 5650 5651 static const struct sysfs_ops md_sysfs_ops = { 5652 .show = md_attr_show, 5653 .store = md_attr_store, 5654 }; 5655 static const struct kobj_type md_ktype = { 5656 .release = md_kobj_release, 5657 .sysfs_ops = &md_sysfs_ops, 5658 .default_groups = md_attr_groups, 5659 }; 5660 5661 int mdp_major = 0; 5662 5663 static void mddev_delayed_delete(struct work_struct *ws) 5664 { 5665 struct mddev *mddev = container_of(ws, struct mddev, del_work); 5666 5667 kobject_put(&mddev->kobj); 5668 } 5669 5670 struct mddev *md_alloc(dev_t dev, char *name) 5671 { 5672 /* 5673 * If dev is zero, name is the name of a device to allocate with 5674 * an arbitrary minor number. It will be "md_???" 5675 * If dev is non-zero it must be a device number with a MAJOR of 5676 * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then 5677 * the device is being created by opening a node in /dev. 5678 * If "name" is not NULL, the device is being created by 5679 * writing to /sys/module/md_mod/parameters/new_array. 5680 */ 5681 static DEFINE_MUTEX(disks_mutex); 5682 struct mddev *mddev; 5683 struct gendisk *disk; 5684 int partitioned; 5685 int shift; 5686 int unit; 5687 int error ; 5688 5689 /* 5690 * Wait for any previous instance of this device to be completely 5691 * removed (mddev_delayed_delete). 5692 */ 5693 flush_workqueue(md_misc_wq); 5694 5695 mutex_lock(&disks_mutex); 5696 mddev = mddev_alloc(dev); 5697 if (IS_ERR(mddev)) { 5698 error = PTR_ERR(mddev); 5699 goto out_unlock; 5700 } 5701 5702 partitioned = (MAJOR(mddev->unit) != MD_MAJOR); 5703 shift = partitioned ? MdpMinorShift : 0; 5704 unit = MINOR(mddev->unit) >> shift; 5705 5706 if (name && !dev) { 5707 /* Need to ensure that 'name' is not a duplicate. 5708 */ 5709 struct mddev *mddev2; 5710 spin_lock(&all_mddevs_lock); 5711 5712 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 5713 if (mddev2->gendisk && 5714 strcmp(mddev2->gendisk->disk_name, name) == 0) { 5715 spin_unlock(&all_mddevs_lock); 5716 error = -EEXIST; 5717 goto out_free_mddev; 5718 } 5719 spin_unlock(&all_mddevs_lock); 5720 } 5721 if (name && dev) 5722 /* 5723 * Creating /dev/mdNNN via "newarray", so adjust hold_active. 5724 */ 5725 mddev->hold_active = UNTIL_STOP; 5726 5727 error = -ENOMEM; 5728 disk = blk_alloc_disk(NUMA_NO_NODE); 5729 if (!disk) 5730 goto out_free_mddev; 5731 5732 disk->major = MAJOR(mddev->unit); 5733 disk->first_minor = unit << shift; 5734 disk->minors = 1 << shift; 5735 if (name) 5736 strcpy(disk->disk_name, name); 5737 else if (partitioned) 5738 sprintf(disk->disk_name, "md_d%d", unit); 5739 else 5740 sprintf(disk->disk_name, "md%d", unit); 5741 disk->fops = &md_fops; 5742 disk->private_data = mddev; 5743 5744 mddev->queue = disk->queue; 5745 blk_set_stacking_limits(&mddev->queue->limits); 5746 blk_queue_write_cache(mddev->queue, true, true); 5747 disk->events |= DISK_EVENT_MEDIA_CHANGE; 5748 mddev->gendisk = disk; 5749 error = add_disk(disk); 5750 if (error) 5751 goto out_put_disk; 5752 5753 kobject_init(&mddev->kobj, &md_ktype); 5754 error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); 5755 if (error) { 5756 /* 5757 * The disk is already live at this point. 
Clear the hold flag 5758 * and let mddev_put take care of the deletion, as it isn't any 5759 * different from a normal close on last release now. 5760 */ 5761 mddev->hold_active = 0; 5762 mutex_unlock(&disks_mutex); 5763 mddev_put(mddev); 5764 return ERR_PTR(error); 5765 } 5766 5767 kobject_uevent(&mddev->kobj, KOBJ_ADD); 5768 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); 5769 mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); 5770 mutex_unlock(&disks_mutex); 5771 return mddev; 5772 5773 out_put_disk: 5774 put_disk(disk); 5775 out_free_mddev: 5776 mddev_free(mddev); 5777 out_unlock: 5778 mutex_unlock(&disks_mutex); 5779 return ERR_PTR(error); 5780 } 5781 5782 static int md_alloc_and_put(dev_t dev, char *name) 5783 { 5784 struct mddev *mddev = md_alloc(dev, name); 5785 5786 if (IS_ERR(mddev)) 5787 return PTR_ERR(mddev); 5788 mddev_put(mddev); 5789 return 0; 5790 } 5791 5792 static void md_probe(dev_t dev) 5793 { 5794 if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512) 5795 return; 5796 if (create_on_open) 5797 md_alloc_and_put(dev, NULL); 5798 } 5799 5800 static int add_named_array(const char *val, const struct kernel_param *kp) 5801 { 5802 /* 5803 * val must be "md_*" or "mdNNN". 5804 * For "md_*" we allocate an array with a large free minor number, and 5805 * set the name to val. val must not already be an active name. 5806 * For "mdNNN" we allocate an array with the minor number NNN 5807 * which must not already be in use. 5808 */ 5809 int len = strlen(val); 5810 char buf[DISK_NAME_LEN]; 5811 unsigned long devnum; 5812 5813 while (len && val[len-1] == '\n') 5814 len--; 5815 if (len >= DISK_NAME_LEN) 5816 return -E2BIG; 5817 strscpy(buf, val, len+1); 5818 if (strncmp(buf, "md_", 3) == 0) 5819 return md_alloc_and_put(0, buf); 5820 if (strncmp(buf, "md", 2) == 0 && 5821 isdigit(buf[2]) && 5822 kstrtoul(buf+2, 10, &devnum) == 0 && 5823 devnum <= MINORMASK) 5824 return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL); 5825 5826 return -EINVAL; 5827 } 5828 5829 static void md_safemode_timeout(struct timer_list *t) 5830 { 5831 struct mddev *mddev = from_timer(mddev, t, safemode_timer); 5832 5833 mddev->safemode = 1; 5834 if (mddev->external) 5835 sysfs_notify_dirent_safe(mddev->sysfs_state); 5836 5837 md_wakeup_thread(mddev->thread); 5838 } 5839 5840 static int start_dirty_degraded; 5841 5842 int md_run(struct mddev *mddev) 5843 { 5844 int err; 5845 struct md_rdev *rdev; 5846 struct md_personality *pers; 5847 bool nowait = true; 5848 5849 if (list_empty(&mddev->disks)) 5850 /* cannot run an array with no devices.. */ 5851 return -EINVAL; 5852 5853 if (mddev->pers) 5854 return -EBUSY; 5855 /* Cannot run until previous stop completes properly */ 5856 if (mddev->sysfs_active) 5857 return -EBUSY; 5858 5859 /* 5860 * Analyze all RAID superblock(s) 5861 */ 5862 if (!mddev->raid_disks) { 5863 if (!mddev->persistent) 5864 return -EINVAL; 5865 err = analyze_sbs(mddev); 5866 if (err) 5867 return -EINVAL; 5868 } 5869 5870 if (mddev->level != LEVEL_NONE) 5871 request_module("md-level-%d", mddev->level); 5872 else if (mddev->clevel[0]) 5873 request_module("md-%s", mddev->clevel); 5874 5875 /* 5876 * Drop all container device buffers, from now on 5877 * the only valid external interface is through the md 5878 * device. 
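 * (The sync_blockdev()/invalidate_bdev() calls below flush and drop
 * any page cache the member devices accumulated before md claimed
 * them.)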
5879 */ 5880 mddev->has_superblocks = false; 5881 rdev_for_each(rdev, mddev) { 5882 if (test_bit(Faulty, &rdev->flags)) 5883 continue; 5884 sync_blockdev(rdev->bdev); 5885 invalidate_bdev(rdev->bdev); 5886 if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) { 5887 mddev->ro = MD_RDONLY; 5888 if (mddev->gendisk) 5889 set_disk_ro(mddev->gendisk, 1); 5890 } 5891 5892 if (rdev->sb_page) 5893 mddev->has_superblocks = true; 5894 5895 /* perform some consistency tests on the device. 5896 * We don't want the data to overlap the metadata, 5897 * Internal Bitmap issues have been handled elsewhere. 5898 */ 5899 if (rdev->meta_bdev) { 5900 /* Nothing to check */; 5901 } else if (rdev->data_offset < rdev->sb_start) { 5902 if (mddev->dev_sectors && 5903 rdev->data_offset + mddev->dev_sectors 5904 > rdev->sb_start) { 5905 pr_warn("md: %s: data overlaps metadata\n", 5906 mdname(mddev)); 5907 return -EINVAL; 5908 } 5909 } else { 5910 if (rdev->sb_start + rdev->sb_size/512 5911 > rdev->data_offset) { 5912 pr_warn("md: %s: metadata overlaps data\n", 5913 mdname(mddev)); 5914 return -EINVAL; 5915 } 5916 } 5917 sysfs_notify_dirent_safe(rdev->sysfs_state); 5918 nowait = nowait && bdev_nowait(rdev->bdev); 5919 } 5920 5921 if (!bioset_initialized(&mddev->bio_set)) { 5922 err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 5923 if (err) 5924 return err; 5925 } 5926 if (!bioset_initialized(&mddev->sync_set)) { 5927 err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 5928 if (err) 5929 goto exit_bio_set; 5930 } 5931 5932 if (!bioset_initialized(&mddev->io_clone_set)) { 5933 err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE, 5934 offsetof(struct md_io_clone, bio_clone), 0); 5935 if (err) 5936 goto exit_sync_set; 5937 } 5938 5939 spin_lock(&pers_lock); 5940 pers = find_pers(mddev->level, mddev->clevel); 5941 if (!pers || !try_module_get(pers->owner)) { 5942 spin_unlock(&pers_lock); 5943 if (mddev->level != LEVEL_NONE) 5944 pr_warn("md: personality for level %d is not loaded!\n", 5945 mddev->level); 5946 else 5947 pr_warn("md: personality for level %s is not loaded!\n", 5948 mddev->clevel); 5949 err = -EINVAL; 5950 goto abort; 5951 } 5952 spin_unlock(&pers_lock); 5953 if (mddev->level != pers->level) { 5954 mddev->level = pers->level; 5955 mddev->new_level = pers->level; 5956 } 5957 strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 5958 5959 if (mddev->reshape_position != MaxSector && 5960 pers->start_reshape == NULL) { 5961 /* This personality cannot handle reshaping... */ 5962 module_put(pers->owner); 5963 err = -EINVAL; 5964 goto abort; 5965 } 5966 5967 if (pers->sync_request) { 5968 /* Warn if this is a potentially silly 5969 * configuration. 
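 * e.g. two partitions of one physical disk used as both halves of
 * a raid1: the loop below warns about members that share a gendisk.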
5970 */ 5971 struct md_rdev *rdev2; 5972 int warned = 0; 5973 5974 rdev_for_each(rdev, mddev) 5975 rdev_for_each(rdev2, mddev) { 5976 if (rdev < rdev2 && 5977 rdev->bdev->bd_disk == 5978 rdev2->bdev->bd_disk) { 5979 pr_warn("%s: WARNING: %pg appears to be on the same physical disk as %pg.\n", 5980 mdname(mddev), 5981 rdev->bdev, 5982 rdev2->bdev); 5983 warned = 1; 5984 } 5985 } 5986 5987 if (warned) 5988 pr_warn("True protection against single-disk failure might be compromised.\n"); 5989 } 5990 5991 mddev->recovery = 0; 5992 /* may be over-ridden by personality */ 5993 mddev->resync_max_sectors = mddev->dev_sectors; 5994 5995 mddev->ok_start_degraded = start_dirty_degraded; 5996 5997 if (start_readonly && md_is_rdwr(mddev)) 5998 mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */ 5999 6000 err = pers->run(mddev); 6001 if (err) 6002 pr_warn("md: pers->run() failed ...\n"); 6003 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { 6004 WARN_ONCE(!mddev->external_size, 6005 "%s: default size too small, but 'external_size' not in effect?\n", 6006 __func__); 6007 pr_warn("md: invalid array_size %llu > default size %llu\n", 6008 (unsigned long long)mddev->array_sectors / 2, 6009 (unsigned long long)pers->size(mddev, 0, 0) / 2); 6010 err = -EINVAL; 6011 } 6012 if (err == 0 && pers->sync_request && 6013 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { 6014 struct bitmap *bitmap; 6015 6016 bitmap = md_bitmap_create(mddev, -1); 6017 if (IS_ERR(bitmap)) { 6018 err = PTR_ERR(bitmap); 6019 pr_warn("%s: failed to create bitmap (%d)\n", 6020 mdname(mddev), err); 6021 } else 6022 mddev->bitmap = bitmap; 6023 6024 } 6025 if (err) 6026 goto bitmap_abort; 6027 6028 if (mddev->bitmap_info.max_write_behind > 0) { 6029 bool create_pool = false; 6030 6031 rdev_for_each(rdev, mddev) { 6032 if (test_bit(WriteMostly, &rdev->flags) && 6033 rdev_init_serial(rdev)) 6034 create_pool = true; 6035 } 6036 if (create_pool && mddev->serial_info_pool == NULL) { 6037 mddev->serial_info_pool = 6038 mempool_create_kmalloc_pool(NR_SERIAL_INFOS, 6039 sizeof(struct serial_info)); 6040 if (!mddev->serial_info_pool) { 6041 err = -ENOMEM; 6042 goto bitmap_abort; 6043 } 6044 } 6045 } 6046 6047 if (mddev->queue) { 6048 bool nonrot = true; 6049 6050 rdev_for_each(rdev, mddev) { 6051 if (rdev->raid_disk >= 0 && !bdev_nonrot(rdev->bdev)) { 6052 nonrot = false; 6053 break; 6054 } 6055 } 6056 if (mddev->degraded) 6057 nonrot = false; 6058 if (nonrot) 6059 blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue); 6060 else 6061 blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue); 6062 blk_queue_flag_set(QUEUE_FLAG_IO_STAT, mddev->queue); 6063 6064 /* Set the NOWAIT flags if all underlying devices support it */ 6065 if (nowait) 6066 blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue); 6067 } 6068 if (pers->sync_request) { 6069 if (mddev->kobj.sd && 6070 sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 6071 pr_warn("md: cannot register extra attributes for %s\n", 6072 mdname(mddev)); 6073 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); 6074 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); 6075 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); 6076 } else if (mddev->ro == MD_AUTO_READ) 6077 mddev->ro = MD_RDWR; 6078 6079 atomic_set(&mddev->max_corr_read_errors, 6080 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); 6081 mddev->safemode = 0; 6082 if (mddev_is_clustered(mddev)) 6083 mddev->safemode_delay = 0; 6084 else 6085 
mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; 6086 mddev->in_sync = 1; 6087 smp_wmb(); 6088 spin_lock(&mddev->lock); 6089 mddev->pers = pers; 6090 spin_unlock(&mddev->lock); 6091 rdev_for_each(rdev, mddev) 6092 if (rdev->raid_disk >= 0) 6093 sysfs_link_rdev(mddev, rdev); /* failure here is OK */ 6094 6095 if (mddev->degraded && md_is_rdwr(mddev)) 6096 /* This ensures that recovering status is reported immediately 6097 * via sysfs - until a lack of spares is confirmed. 6098 */ 6099 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6100 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6101 6102 if (mddev->sb_flags) 6103 md_update_sb(mddev, 0); 6104 6105 md_new_event(); 6106 return 0; 6107 6108 bitmap_abort: 6109 mddev_detach(mddev); 6110 if (mddev->private) 6111 pers->free(mddev, mddev->private); 6112 mddev->private = NULL; 6113 module_put(pers->owner); 6114 md_bitmap_destroy(mddev); 6115 abort: 6116 bioset_exit(&mddev->io_clone_set); 6117 exit_sync_set: 6118 bioset_exit(&mddev->sync_set); 6119 exit_bio_set: 6120 bioset_exit(&mddev->bio_set); 6121 return err; 6122 } 6123 EXPORT_SYMBOL_GPL(md_run); 6124 6125 int do_md_run(struct mddev *mddev) 6126 { 6127 int err; 6128 6129 set_bit(MD_NOT_READY, &mddev->flags); 6130 err = md_run(mddev); 6131 if (err) 6132 goto out; 6133 err = md_bitmap_load(mddev); 6134 if (err) { 6135 md_bitmap_destroy(mddev); 6136 goto out; 6137 } 6138 6139 if (mddev_is_clustered(mddev)) 6140 md_allow_write(mddev); 6141 6142 /* run start up tasks that require md_thread */ 6143 md_start(mddev); 6144 6145 md_wakeup_thread(mddev->thread); 6146 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 6147 6148 set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); 6149 clear_bit(MD_NOT_READY, &mddev->flags); 6150 mddev->changed = 1; 6151 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 6152 sysfs_notify_dirent_safe(mddev->sysfs_state); 6153 sysfs_notify_dirent_safe(mddev->sysfs_action); 6154 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 6155 out: 6156 clear_bit(MD_NOT_READY, &mddev->flags); 6157 return err; 6158 } 6159 6160 int md_start(struct mddev *mddev) 6161 { 6162 int ret = 0; 6163 6164 if (mddev->pers->start) { 6165 set_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6166 md_wakeup_thread(mddev->thread); 6167 ret = mddev->pers->start(mddev); 6168 clear_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6169 md_wakeup_thread(mddev->sync_thread); 6170 } 6171 return ret; 6172 } 6173 EXPORT_SYMBOL_GPL(md_start); 6174 6175 static int restart_array(struct mddev *mddev) 6176 { 6177 struct gendisk *disk = mddev->gendisk; 6178 struct md_rdev *rdev; 6179 bool has_journal = false; 6180 bool has_readonly = false; 6181 6182 /* Complain if it has no devices */ 6183 if (list_empty(&mddev->disks)) 6184 return -ENXIO; 6185 if (!mddev->pers) 6186 return -EINVAL; 6187 if (md_is_rdwr(mddev)) 6188 return -EBUSY; 6189 6190 rcu_read_lock(); 6191 rdev_for_each_rcu(rdev, mddev) { 6192 if (test_bit(Journal, &rdev->flags) && 6193 !test_bit(Faulty, &rdev->flags)) 6194 has_journal = true; 6195 if (rdev_read_only(rdev)) 6196 has_readonly = true; 6197 } 6198 rcu_read_unlock(); 6199 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal) 6200 /* Don't restart rw with journal missing/faulty */ 6201 return -EINVAL; 6202 if (has_readonly) 6203 return -EROFS; 6204 6205 mddev->safemode = 0; 6206 mddev->ro = MD_RDWR; 6207 set_disk_ro(disk, 0); 6208 pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); 6209 /* Kick recovery or resync if necessary */ 6210 
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6211 md_wakeup_thread(mddev->thread); 6212 md_wakeup_thread(mddev->sync_thread); 6213 sysfs_notify_dirent_safe(mddev->sysfs_state); 6214 return 0; 6215 } 6216 6217 static void md_clean(struct mddev *mddev) 6218 { 6219 mddev->array_sectors = 0; 6220 mddev->external_size = 0; 6221 mddev->dev_sectors = 0; 6222 mddev->raid_disks = 0; 6223 mddev->recovery_cp = 0; 6224 mddev->resync_min = 0; 6225 mddev->resync_max = MaxSector; 6226 mddev->reshape_position = MaxSector; 6227 /* we still need mddev->external in export_rdev, do not clear it yet */ 6228 mddev->persistent = 0; 6229 mddev->level = LEVEL_NONE; 6230 mddev->clevel[0] = 0; 6231 mddev->flags = 0; 6232 mddev->sb_flags = 0; 6233 mddev->ro = MD_RDWR; 6234 mddev->metadata_type[0] = 0; 6235 mddev->chunk_sectors = 0; 6236 mddev->ctime = mddev->utime = 0; 6237 mddev->layout = 0; 6238 mddev->max_disks = 0; 6239 mddev->events = 0; 6240 mddev->can_decrease_events = 0; 6241 mddev->delta_disks = 0; 6242 mddev->reshape_backwards = 0; 6243 mddev->new_level = LEVEL_NONE; 6244 mddev->new_layout = 0; 6245 mddev->new_chunk_sectors = 0; 6246 mddev->curr_resync = MD_RESYNC_NONE; 6247 atomic64_set(&mddev->resync_mismatches, 0); 6248 mddev->suspend_lo = mddev->suspend_hi = 0; 6249 mddev->sync_speed_min = mddev->sync_speed_max = 0; 6250 mddev->recovery = 0; 6251 mddev->in_sync = 0; 6252 mddev->changed = 0; 6253 mddev->degraded = 0; 6254 mddev->safemode = 0; 6255 mddev->private = NULL; 6256 mddev->cluster_info = NULL; 6257 mddev->bitmap_info.offset = 0; 6258 mddev->bitmap_info.default_offset = 0; 6259 mddev->bitmap_info.default_space = 0; 6260 mddev->bitmap_info.chunksize = 0; 6261 mddev->bitmap_info.daemon_sleep = 0; 6262 mddev->bitmap_info.max_write_behind = 0; 6263 mddev->bitmap_info.nodes = 0; 6264 } 6265 6266 static void __md_stop_writes(struct mddev *mddev) 6267 { 6268 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6269 if (work_pending(&mddev->del_work)) 6270 flush_workqueue(md_misc_wq); 6271 if (mddev->sync_thread) { 6272 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6273 md_reap_sync_thread(mddev); 6274 } 6275 6276 del_timer_sync(&mddev->safemode_timer); 6277 6278 if (mddev->pers && mddev->pers->quiesce) { 6279 mddev->pers->quiesce(mddev, 1); 6280 mddev->pers->quiesce(mddev, 0); 6281 } 6282 md_bitmap_flush(mddev); 6283 6284 if (md_is_rdwr(mddev) && 6285 ((!mddev->in_sync && !mddev_is_clustered(mddev)) || 6286 mddev->sb_flags)) { 6287 /* mark array as shutdown cleanly */ 6288 if (!mddev_is_clustered(mddev)) 6289 mddev->in_sync = 1; 6290 md_update_sb(mddev, 1); 6291 } 6292 /* disable policy to guarantee rdevs free resources for serialization */ 6293 mddev->serialize_policy = 0; 6294 mddev_destroy_serial_pool(mddev, NULL); 6295 } 6296 6297 void md_stop_writes(struct mddev *mddev) 6298 { 6299 mddev_lock_nointr(mddev); 6300 __md_stop_writes(mddev); 6301 mddev_unlock(mddev); 6302 } 6303 EXPORT_SYMBOL_GPL(md_stop_writes); 6304 6305 static void mddev_detach(struct mddev *mddev) 6306 { 6307 md_bitmap_wait_behind_writes(mddev); 6308 if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { 6309 mddev->pers->quiesce(mddev, 1); 6310 mddev->pers->quiesce(mddev, 0); 6311 } 6312 md_unregister_thread(mddev, &mddev->thread); 6313 if (mddev->queue) 6314 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 6315 } 6316 6317 static void __md_stop(struct mddev *mddev) 6318 { 6319 struct md_personality *pers = mddev->pers; 6320 md_bitmap_destroy(mddev); 6321 mddev_detach(mddev); 6322 /* Ensure ->event_work 
is done */ 6323 if (mddev->event_work.func) 6324 flush_workqueue(md_misc_wq); 6325 spin_lock(&mddev->lock); 6326 mddev->pers = NULL; 6327 spin_unlock(&mddev->lock); 6328 if (mddev->private) 6329 pers->free(mddev, mddev->private); 6330 mddev->private = NULL; 6331 if (pers->sync_request && mddev->to_remove == NULL) 6332 mddev->to_remove = &md_redundancy_group; 6333 module_put(pers->owner); 6334 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6335 6336 bioset_exit(&mddev->bio_set); 6337 bioset_exit(&mddev->sync_set); 6338 bioset_exit(&mddev->io_clone_set); 6339 } 6340 6341 void md_stop(struct mddev *mddev) 6342 { 6343 lockdep_assert_held(&mddev->reconfig_mutex); 6344 6345 /* stop the array and free an attached data structures. 6346 * This is called from dm-raid 6347 */ 6348 __md_stop_writes(mddev); 6349 __md_stop(mddev); 6350 } 6351 6352 EXPORT_SYMBOL_GPL(md_stop); 6353 6354 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) 6355 { 6356 int err = 0; 6357 int did_freeze = 0; 6358 6359 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 6360 did_freeze = 1; 6361 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6362 md_wakeup_thread(mddev->thread); 6363 } 6364 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 6365 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6366 6367 /* 6368 * Thread might be blocked waiting for metadata update which will now 6369 * never happen 6370 */ 6371 md_wakeup_thread_directly(mddev->sync_thread); 6372 6373 if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 6374 return -EBUSY; 6375 mddev_unlock(mddev); 6376 wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, 6377 &mddev->recovery)); 6378 wait_event(mddev->sb_wait, 6379 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 6380 mddev_lock_nointr(mddev); 6381 6382 mutex_lock(&mddev->open_mutex); 6383 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || 6384 mddev->sync_thread || 6385 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 6386 pr_warn("md: %s still in use.\n",mdname(mddev)); 6387 if (did_freeze) { 6388 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6389 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6390 md_wakeup_thread(mddev->thread); 6391 } 6392 err = -EBUSY; 6393 goto out; 6394 } 6395 if (mddev->pers) { 6396 __md_stop_writes(mddev); 6397 6398 err = -ENXIO; 6399 if (mddev->ro == MD_RDONLY) 6400 goto out; 6401 mddev->ro = MD_RDONLY; 6402 set_disk_ro(mddev->gendisk, 1); 6403 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6404 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6405 md_wakeup_thread(mddev->thread); 6406 sysfs_notify_dirent_safe(mddev->sysfs_state); 6407 err = 0; 6408 } 6409 out: 6410 mutex_unlock(&mddev->open_mutex); 6411 return err; 6412 } 6413 6414 /* mode: 6415 * 0 - completely stop and dis-assemble array 6416 * 2 - stop but do not disassemble array 6417 */ 6418 static int do_md_stop(struct mddev *mddev, int mode, 6419 struct block_device *bdev) 6420 { 6421 struct gendisk *disk = mddev->gendisk; 6422 struct md_rdev *rdev; 6423 int did_freeze = 0; 6424 6425 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 6426 did_freeze = 1; 6427 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6428 md_wakeup_thread(mddev->thread); 6429 } 6430 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 6431 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6432 6433 /* 6434 * Thread might be blocked waiting for metadata update which will now 6435 * never happen 6436 */ 6437 md_wakeup_thread_directly(mddev->sync_thread); 6438 6439 mddev_unlock(mddev); 6440 
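	/*
	 * Wait for the sync thread to exit with the reconfig mutex dropped
	 * (mddev_unlock above); mddev_lock_nointr below re-takes it before
	 * the openers check.
	 */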
wait_event(resync_wait, (mddev->sync_thread == NULL && 6441 !test_bit(MD_RECOVERY_RUNNING, 6442 &mddev->recovery))); 6443 mddev_lock_nointr(mddev); 6444 6445 mutex_lock(&mddev->open_mutex); 6446 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || 6447 mddev->sysfs_active || 6448 mddev->sync_thread || 6449 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 6450 pr_warn("md: %s still in use.\n",mdname(mddev)); 6451 mutex_unlock(&mddev->open_mutex); 6452 if (did_freeze) { 6453 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6454 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6455 md_wakeup_thread(mddev->thread); 6456 } 6457 return -EBUSY; 6458 } 6459 if (mddev->pers) { 6460 if (!md_is_rdwr(mddev)) 6461 set_disk_ro(disk, 0); 6462 6463 __md_stop_writes(mddev); 6464 __md_stop(mddev); 6465 6466 /* tell userspace to handle 'inactive' */ 6467 sysfs_notify_dirent_safe(mddev->sysfs_state); 6468 6469 rdev_for_each(rdev, mddev) 6470 if (rdev->raid_disk >= 0) 6471 sysfs_unlink_rdev(mddev, rdev); 6472 6473 set_capacity_and_notify(disk, 0); 6474 mutex_unlock(&mddev->open_mutex); 6475 mddev->changed = 1; 6476 6477 if (!md_is_rdwr(mddev)) 6478 mddev->ro = MD_RDWR; 6479 } else 6480 mutex_unlock(&mddev->open_mutex); 6481 /* 6482 * Free resources if final stop 6483 */ 6484 if (mode == 0) { 6485 pr_info("md: %s stopped.\n", mdname(mddev)); 6486 6487 if (mddev->bitmap_info.file) { 6488 struct file *f = mddev->bitmap_info.file; 6489 spin_lock(&mddev->lock); 6490 mddev->bitmap_info.file = NULL; 6491 spin_unlock(&mddev->lock); 6492 fput(f); 6493 } 6494 mddev->bitmap_info.offset = 0; 6495 6496 export_array(mddev); 6497 6498 md_clean(mddev); 6499 if (mddev->hold_active == UNTIL_STOP) 6500 mddev->hold_active = 0; 6501 } 6502 md_new_event(); 6503 sysfs_notify_dirent_safe(mddev->sysfs_state); 6504 return 0; 6505 } 6506 6507 #ifndef MODULE 6508 static void autorun_array(struct mddev *mddev) 6509 { 6510 struct md_rdev *rdev; 6511 int err; 6512 6513 if (list_empty(&mddev->disks)) 6514 return; 6515 6516 pr_info("md: running: "); 6517 6518 rdev_for_each(rdev, mddev) { 6519 pr_cont("<%pg>", rdev->bdev); 6520 } 6521 pr_cont("\n"); 6522 6523 err = do_md_run(mddev); 6524 if (err) { 6525 pr_warn("md: do_md_run() returned %d\n", err); 6526 do_md_stop(mddev, 0, NULL); 6527 } 6528 } 6529 6530 /* 6531 * lets try to run arrays based on all disks that have arrived 6532 * until now. (those are in pending_raid_disks) 6533 * 6534 * the method: pick the first pending disk, collect all disks with 6535 * the same UUID, remove all from the pending list and put them into 6536 * the 'same_array' list. Then order this list based on superblock 6537 * update time (freshest comes first), kick out 'old' disks and 6538 * compare superblocks. If everything's fine then run it. 
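 * (The UUID matching below is done by super_90_load(), comparing each
 *  pending rdev's superblock against the first one, rdev0.)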
6539 * 6540 * If "unit" is allocated, then bump its reference count 6541 */ 6542 static void autorun_devices(int part) 6543 { 6544 struct md_rdev *rdev0, *rdev, *tmp; 6545 struct mddev *mddev; 6546 6547 pr_info("md: autorun ...\n"); 6548 while (!list_empty(&pending_raid_disks)) { 6549 int unit; 6550 dev_t dev; 6551 LIST_HEAD(candidates); 6552 rdev0 = list_entry(pending_raid_disks.next, 6553 struct md_rdev, same_set); 6554 6555 pr_debug("md: considering %pg ...\n", rdev0->bdev); 6556 INIT_LIST_HEAD(&candidates); 6557 rdev_for_each_list(rdev, tmp, &pending_raid_disks) 6558 if (super_90_load(rdev, rdev0, 0) >= 0) { 6559 pr_debug("md: adding %pg ...\n", 6560 rdev->bdev); 6561 list_move(&rdev->same_set, &candidates); 6562 } 6563 /* 6564 * now we have a set of devices, with all of them having 6565 * mostly sane superblocks. It's time to allocate the 6566 * mddev. 6567 */ 6568 if (part) { 6569 dev = MKDEV(mdp_major, 6570 rdev0->preferred_minor << MdpMinorShift); 6571 unit = MINOR(dev) >> MdpMinorShift; 6572 } else { 6573 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 6574 unit = MINOR(dev); 6575 } 6576 if (rdev0->preferred_minor != unit) { 6577 pr_warn("md: unit number in %pg is bad: %d\n", 6578 rdev0->bdev, rdev0->preferred_minor); 6579 break; 6580 } 6581 6582 mddev = md_alloc(dev, NULL); 6583 if (IS_ERR(mddev)) 6584 break; 6585 6586 if (mddev_suspend_and_lock(mddev)) 6587 pr_warn("md: %s locked, cannot run\n", mdname(mddev)); 6588 else if (mddev->raid_disks || mddev->major_version 6589 || !list_empty(&mddev->disks)) { 6590 pr_warn("md: %s already running, cannot run %pg\n", 6591 mdname(mddev), rdev0->bdev); 6592 mddev_unlock_and_resume(mddev); 6593 } else { 6594 pr_debug("md: created %s\n", mdname(mddev)); 6595 mddev->persistent = 1; 6596 rdev_for_each_list(rdev, tmp, &candidates) { 6597 list_del_init(&rdev->same_set); 6598 if (bind_rdev_to_array(rdev, mddev)) 6599 export_rdev(rdev, mddev); 6600 } 6601 autorun_array(mddev); 6602 mddev_unlock_and_resume(mddev); 6603 } 6604 /* on success, candidates will be empty, on error 6605 * it won't... 6606 */ 6607 rdev_for_each_list(rdev, tmp, &candidates) { 6608 list_del_init(&rdev->same_set); 6609 export_rdev(rdev, mddev); 6610 } 6611 mddev_put(mddev); 6612 } 6613 pr_info("md: ... 
autorun DONE.\n"); 6614 } 6615 #endif /* !MODULE */ 6616 6617 static int get_version(void __user *arg) 6618 { 6619 mdu_version_t ver; 6620 6621 ver.major = MD_MAJOR_VERSION; 6622 ver.minor = MD_MINOR_VERSION; 6623 ver.patchlevel = MD_PATCHLEVEL_VERSION; 6624 6625 if (copy_to_user(arg, &ver, sizeof(ver))) 6626 return -EFAULT; 6627 6628 return 0; 6629 } 6630 6631 static int get_array_info(struct mddev *mddev, void __user *arg) 6632 { 6633 mdu_array_info_t info; 6634 int nr,working,insync,failed,spare; 6635 struct md_rdev *rdev; 6636 6637 nr = working = insync = failed = spare = 0; 6638 rcu_read_lock(); 6639 rdev_for_each_rcu(rdev, mddev) { 6640 nr++; 6641 if (test_bit(Faulty, &rdev->flags)) 6642 failed++; 6643 else { 6644 working++; 6645 if (test_bit(In_sync, &rdev->flags)) 6646 insync++; 6647 else if (test_bit(Journal, &rdev->flags)) 6648 /* TODO: add journal count to md_u.h */ 6649 ; 6650 else 6651 spare++; 6652 } 6653 } 6654 rcu_read_unlock(); 6655 6656 info.major_version = mddev->major_version; 6657 info.minor_version = mddev->minor_version; 6658 info.patch_version = MD_PATCHLEVEL_VERSION; 6659 info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); 6660 info.level = mddev->level; 6661 info.size = mddev->dev_sectors / 2; 6662 if (info.size != mddev->dev_sectors / 2) /* overflow */ 6663 info.size = -1; 6664 info.nr_disks = nr; 6665 info.raid_disks = mddev->raid_disks; 6666 info.md_minor = mddev->md_minor; 6667 info.not_persistent= !mddev->persistent; 6668 6669 info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 6670 info.state = 0; 6671 if (mddev->in_sync) 6672 info.state = (1<<MD_SB_CLEAN); 6673 if (mddev->bitmap && mddev->bitmap_info.offset) 6674 info.state |= (1<<MD_SB_BITMAP_PRESENT); 6675 if (mddev_is_clustered(mddev)) 6676 info.state |= (1<<MD_SB_CLUSTERED); 6677 info.active_disks = insync; 6678 info.working_disks = working; 6679 info.failed_disks = failed; 6680 info.spare_disks = spare; 6681 6682 info.layout = mddev->layout; 6683 info.chunk_size = mddev->chunk_sectors << 9; 6684 6685 if (copy_to_user(arg, &info, sizeof(info))) 6686 return -EFAULT; 6687 6688 return 0; 6689 } 6690 6691 static int get_bitmap_file(struct mddev *mddev, void __user * arg) 6692 { 6693 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 6694 char *ptr; 6695 int err; 6696 6697 file = kzalloc(sizeof(*file), GFP_NOIO); 6698 if (!file) 6699 return -ENOMEM; 6700 6701 err = 0; 6702 spin_lock(&mddev->lock); 6703 /* bitmap enabled */ 6704 if (mddev->bitmap_info.file) { 6705 ptr = file_path(mddev->bitmap_info.file, file->pathname, 6706 sizeof(file->pathname)); 6707 if (IS_ERR(ptr)) 6708 err = PTR_ERR(ptr); 6709 else 6710 memmove(file->pathname, ptr, 6711 sizeof(file->pathname)-(ptr-file->pathname)); 6712 } 6713 spin_unlock(&mddev->lock); 6714 6715 if (err == 0 && 6716 copy_to_user(arg, file, sizeof(*file))) 6717 err = -EFAULT; 6718 6719 kfree(file); 6720 return err; 6721 } 6722 6723 static int get_disk_info(struct mddev *mddev, void __user * arg) 6724 { 6725 mdu_disk_info_t info; 6726 struct md_rdev *rdev; 6727 6728 if (copy_from_user(&info, arg, sizeof(info))) 6729 return -EFAULT; 6730 6731 rcu_read_lock(); 6732 rdev = md_find_rdev_nr_rcu(mddev, info.number); 6733 if (rdev) { 6734 info.major = MAJOR(rdev->bdev->bd_dev); 6735 info.minor = MINOR(rdev->bdev->bd_dev); 6736 info.raid_disk = rdev->raid_disk; 6737 info.state = 0; 6738 if (test_bit(Faulty, &rdev->flags)) 6739 info.state |= (1<<MD_DISK_FAULTY); 6740 else if (test_bit(In_sync, &rdev->flags)) { 6741 info.state |= (1<<MD_DISK_ACTIVE); 
6742 info.state |= (1<<MD_DISK_SYNC); 6743 } 6744 if (test_bit(Journal, &rdev->flags)) 6745 info.state |= (1<<MD_DISK_JOURNAL); 6746 if (test_bit(WriteMostly, &rdev->flags)) 6747 info.state |= (1<<MD_DISK_WRITEMOSTLY); 6748 if (test_bit(FailFast, &rdev->flags)) 6749 info.state |= (1<<MD_DISK_FAILFAST); 6750 } else { 6751 info.major = info.minor = 0; 6752 info.raid_disk = -1; 6753 info.state = (1<<MD_DISK_REMOVED); 6754 } 6755 rcu_read_unlock(); 6756 6757 if (copy_to_user(arg, &info, sizeof(info))) 6758 return -EFAULT; 6759 6760 return 0; 6761 } 6762 6763 int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) 6764 { 6765 struct md_rdev *rdev; 6766 dev_t dev = MKDEV(info->major,info->minor); 6767 6768 if (mddev_is_clustered(mddev) && 6769 !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { 6770 pr_warn("%s: Cannot add to clustered mddev.\n", 6771 mdname(mddev)); 6772 return -EINVAL; 6773 } 6774 6775 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 6776 return -EOVERFLOW; 6777 6778 if (!mddev->raid_disks) { 6779 int err; 6780 /* expecting a device which has a superblock */ 6781 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 6782 if (IS_ERR(rdev)) { 6783 pr_warn("md: md_import_device returned %ld\n", 6784 PTR_ERR(rdev)); 6785 return PTR_ERR(rdev); 6786 } 6787 if (!list_empty(&mddev->disks)) { 6788 struct md_rdev *rdev0 6789 = list_entry(mddev->disks.next, 6790 struct md_rdev, same_set); 6791 err = super_types[mddev->major_version] 6792 .load_super(rdev, rdev0, mddev->minor_version); 6793 if (err < 0) { 6794 pr_warn("md: %pg has different UUID to %pg\n", 6795 rdev->bdev, 6796 rdev0->bdev); 6797 export_rdev(rdev, mddev); 6798 return -EINVAL; 6799 } 6800 } 6801 err = bind_rdev_to_array(rdev, mddev); 6802 if (err) 6803 export_rdev(rdev, mddev); 6804 return err; 6805 } 6806 6807 /* 6808 * md_add_new_disk can be used once the array is assembled 6809 * to add "hot spares". They must already have a superblock 6810 * written 6811 */ 6812 if (mddev->pers) { 6813 int err; 6814 if (!mddev->pers->hot_add_disk) { 6815 pr_warn("%s: personality does not support diskops!\n", 6816 mdname(mddev)); 6817 return -EINVAL; 6818 } 6819 if (mddev->persistent) 6820 rdev = md_import_device(dev, mddev->major_version, 6821 mddev->minor_version); 6822 else 6823 rdev = md_import_device(dev, -1, -1); 6824 if (IS_ERR(rdev)) { 6825 pr_warn("md: md_import_device returned %ld\n", 6826 PTR_ERR(rdev)); 6827 return PTR_ERR(rdev); 6828 } 6829 /* set saved_raid_disk if appropriate */ 6830 if (!mddev->persistent) { 6831 if (info->state & (1<<MD_DISK_SYNC) && 6832 info->raid_disk < mddev->raid_disks) { 6833 rdev->raid_disk = info->raid_disk; 6834 clear_bit(Bitmap_sync, &rdev->flags); 6835 } else 6836 rdev->raid_disk = -1; 6837 rdev->saved_raid_disk = rdev->raid_disk; 6838 } else 6839 super_types[mddev->major_version]. 6840 validate_super(mddev, rdev); 6841 if ((info->state & (1<<MD_DISK_SYNC)) && 6842 rdev->raid_disk != info->raid_disk) { 6843 /* This was a hot-add request, but events doesn't 6844 * match, so reject it. 
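 * (rdev->raid_disk was filled in from the superblock by
 *  validate_super() above, so a mismatch means this is not a clean
 *  re-add of that slot.)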
6845 */ 6846 export_rdev(rdev, mddev); 6847 return -EINVAL; 6848 } 6849 6850 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 6851 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 6852 set_bit(WriteMostly, &rdev->flags); 6853 else 6854 clear_bit(WriteMostly, &rdev->flags); 6855 if (info->state & (1<<MD_DISK_FAILFAST)) 6856 set_bit(FailFast, &rdev->flags); 6857 else 6858 clear_bit(FailFast, &rdev->flags); 6859 6860 if (info->state & (1<<MD_DISK_JOURNAL)) { 6861 struct md_rdev *rdev2; 6862 bool has_journal = false; 6863 6864 /* make sure no existing journal disk */ 6865 rdev_for_each(rdev2, mddev) { 6866 if (test_bit(Journal, &rdev2->flags)) { 6867 has_journal = true; 6868 break; 6869 } 6870 } 6871 if (has_journal || mddev->bitmap) { 6872 export_rdev(rdev, mddev); 6873 return -EBUSY; 6874 } 6875 set_bit(Journal, &rdev->flags); 6876 } 6877 /* 6878 * check whether the device shows up in other nodes 6879 */ 6880 if (mddev_is_clustered(mddev)) { 6881 if (info->state & (1 << MD_DISK_CANDIDATE)) 6882 set_bit(Candidate, &rdev->flags); 6883 else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { 6884 /* --add initiated by this node */ 6885 err = md_cluster_ops->add_new_disk(mddev, rdev); 6886 if (err) { 6887 export_rdev(rdev, mddev); 6888 return err; 6889 } 6890 } 6891 } 6892 6893 rdev->raid_disk = -1; 6894 err = bind_rdev_to_array(rdev, mddev); 6895 6896 if (err) 6897 export_rdev(rdev, mddev); 6898 6899 if (mddev_is_clustered(mddev)) { 6900 if (info->state & (1 << MD_DISK_CANDIDATE)) { 6901 if (!err) { 6902 err = md_cluster_ops->new_disk_ack(mddev, 6903 err == 0); 6904 if (err) 6905 md_kick_rdev_from_array(rdev); 6906 } 6907 } else { 6908 if (err) 6909 md_cluster_ops->add_new_disk_cancel(mddev); 6910 else 6911 err = add_bound_rdev(rdev); 6912 } 6913 6914 } else if (!err) 6915 err = add_bound_rdev(rdev); 6916 6917 return err; 6918 } 6919 6920 /* otherwise, md_add_new_disk is only allowed 6921 * for major_version==0 superblocks 6922 */ 6923 if (mddev->major_version != 0) { 6924 pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev)); 6925 return -EINVAL; 6926 } 6927 6928 if (!(info->state & (1<<MD_DISK_FAULTY))) { 6929 int err; 6930 rdev = md_import_device(dev, -1, 0); 6931 if (IS_ERR(rdev)) { 6932 pr_warn("md: error, md_import_device() returned %ld\n", 6933 PTR_ERR(rdev)); 6934 return PTR_ERR(rdev); 6935 } 6936 rdev->desc_nr = info->number; 6937 if (info->raid_disk < mddev->raid_disks) 6938 rdev->raid_disk = info->raid_disk; 6939 else 6940 rdev->raid_disk = -1; 6941 6942 if (rdev->raid_disk < mddev->raid_disks) 6943 if (info->state & (1<<MD_DISK_SYNC)) 6944 set_bit(In_sync, &rdev->flags); 6945 6946 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 6947 set_bit(WriteMostly, &rdev->flags); 6948 if (info->state & (1<<MD_DISK_FAILFAST)) 6949 set_bit(FailFast, &rdev->flags); 6950 6951 if (!mddev->persistent) { 6952 pr_debug("md: nonpersistent superblock ...\n"); 6953 rdev->sb_start = bdev_nr_sectors(rdev->bdev); 6954 } else 6955 rdev->sb_start = calc_dev_sboffset(rdev); 6956 rdev->sectors = rdev->sb_start; 6957 6958 err = bind_rdev_to_array(rdev, mddev); 6959 if (err) { 6960 export_rdev(rdev, mddev); 6961 return err; 6962 } 6963 } 6964 6965 return 0; 6966 } 6967 6968 static int hot_remove_disk(struct mddev *mddev, dev_t dev) 6969 { 6970 struct md_rdev *rdev; 6971 6972 if (!mddev->pers) 6973 return -ENODEV; 6974 6975 rdev = find_rdev(mddev, dev); 6976 if (!rdev) 6977 return -ENXIO; 6978 6979 if (rdev->raid_disk < 0) 6980 goto kick_rdev; 6981 6982 clear_bit(Blocked, &rdev->flags); 6983 
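	/*
	 * Ask the personality to drop the device from its slot; if it is
	 * still active afterwards (raid_disk >= 0) we report -EBUSY below.
	 */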
remove_and_add_spares(mddev, rdev); 6984 6985 if (rdev->raid_disk >= 0) 6986 goto busy; 6987 6988 kick_rdev: 6989 if (mddev_is_clustered(mddev)) { 6990 if (md_cluster_ops->remove_disk(mddev, rdev)) 6991 goto busy; 6992 } 6993 6994 md_kick_rdev_from_array(rdev); 6995 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 6996 if (mddev->thread) 6997 md_wakeup_thread(mddev->thread); 6998 else 6999 md_update_sb(mddev, 1); 7000 md_new_event(); 7001 7002 return 0; 7003 busy: 7004 pr_debug("md: cannot remove active disk %pg from %s ...\n", 7005 rdev->bdev, mdname(mddev)); 7006 return -EBUSY; 7007 } 7008 7009 static int hot_add_disk(struct mddev *mddev, dev_t dev) 7010 { 7011 int err; 7012 struct md_rdev *rdev; 7013 7014 if (!mddev->pers) 7015 return -ENODEV; 7016 7017 if (mddev->major_version != 0) { 7018 pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n", 7019 mdname(mddev)); 7020 return -EINVAL; 7021 } 7022 if (!mddev->pers->hot_add_disk) { 7023 pr_warn("%s: personality does not support diskops!\n", 7024 mdname(mddev)); 7025 return -EINVAL; 7026 } 7027 7028 rdev = md_import_device(dev, -1, 0); 7029 if (IS_ERR(rdev)) { 7030 pr_warn("md: error, md_import_device() returned %ld\n", 7031 PTR_ERR(rdev)); 7032 return -EINVAL; 7033 } 7034 7035 if (mddev->persistent) 7036 rdev->sb_start = calc_dev_sboffset(rdev); 7037 else 7038 rdev->sb_start = bdev_nr_sectors(rdev->bdev); 7039 7040 rdev->sectors = rdev->sb_start; 7041 7042 if (test_bit(Faulty, &rdev->flags)) { 7043 pr_warn("md: can not hot-add faulty %pg disk to %s!\n", 7044 rdev->bdev, mdname(mddev)); 7045 err = -EINVAL; 7046 goto abort_export; 7047 } 7048 7049 clear_bit(In_sync, &rdev->flags); 7050 rdev->desc_nr = -1; 7051 rdev->saved_raid_disk = -1; 7052 err = bind_rdev_to_array(rdev, mddev); 7053 if (err) 7054 goto abort_export; 7055 7056 /* 7057 * The rest should better be atomic, we can have disk failures 7058 * noticed in interrupt contexts ... 7059 */ 7060 7061 rdev->raid_disk = -1; 7062 7063 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7064 if (!mddev->thread) 7065 md_update_sb(mddev, 1); 7066 /* 7067 * If the new disk does not support REQ_NOWAIT, 7068 * disable on the whole MD. 7069 */ 7070 if (!bdev_nowait(rdev->bdev)) { 7071 pr_info("%s: Disabling nowait because %pg does not support nowait\n", 7072 mdname(mddev), rdev->bdev); 7073 blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue); 7074 } 7075 /* 7076 * Kick recovery, maybe this spare has to be added to the 7077 * array immediately. 7078 */ 7079 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7080 md_wakeup_thread(mddev->thread); 7081 md_new_event(); 7082 return 0; 7083 7084 abort_export: 7085 export_rdev(rdev, mddev); 7086 return err; 7087 } 7088 7089 static int set_bitmap_file(struct mddev *mddev, int fd) 7090 { 7091 int err = 0; 7092 7093 if (mddev->pers) { 7094 if (!mddev->pers->quiesce || !mddev->thread) 7095 return -EBUSY; 7096 if (mddev->recovery || mddev->sync_thread) 7097 return -EBUSY; 7098 /* we should be able to change the bitmap.. 
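 * (no resync or recovery is running and the personality can quiesce,
 *  so a bitmap can be created or torn down safely)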
*/ 7099 } 7100 7101 if (fd >= 0) { 7102 struct inode *inode; 7103 struct file *f; 7104 7105 if (mddev->bitmap || mddev->bitmap_info.file) 7106 return -EEXIST; /* cannot add when bitmap is present */ 7107 7108 if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) { 7109 pr_warn("%s: bitmap files not supported by this kernel\n", 7110 mdname(mddev)); 7111 return -EINVAL; 7112 } 7113 pr_warn("%s: using deprecated bitmap file support\n", 7114 mdname(mddev)); 7115 7116 f = fget(fd); 7117 7118 if (f == NULL) { 7119 pr_warn("%s: error: failed to get bitmap file\n", 7120 mdname(mddev)); 7121 return -EBADF; 7122 } 7123 7124 inode = f->f_mapping->host; 7125 if (!S_ISREG(inode->i_mode)) { 7126 pr_warn("%s: error: bitmap file must be a regular file\n", 7127 mdname(mddev)); 7128 err = -EBADF; 7129 } else if (!(f->f_mode & FMODE_WRITE)) { 7130 pr_warn("%s: error: bitmap file must open for write\n", 7131 mdname(mddev)); 7132 err = -EBADF; 7133 } else if (atomic_read(&inode->i_writecount) != 1) { 7134 pr_warn("%s: error: bitmap file is already in use\n", 7135 mdname(mddev)); 7136 err = -EBUSY; 7137 } 7138 if (err) { 7139 fput(f); 7140 return err; 7141 } 7142 mddev->bitmap_info.file = f; 7143 mddev->bitmap_info.offset = 0; /* file overrides offset */ 7144 } else if (mddev->bitmap == NULL) 7145 return -ENOENT; /* cannot remove what isn't there */ 7146 err = 0; 7147 if (mddev->pers) { 7148 if (fd >= 0) { 7149 struct bitmap *bitmap; 7150 7151 bitmap = md_bitmap_create(mddev, -1); 7152 if (!IS_ERR(bitmap)) { 7153 mddev->bitmap = bitmap; 7154 err = md_bitmap_load(mddev); 7155 } else 7156 err = PTR_ERR(bitmap); 7157 if (err) { 7158 md_bitmap_destroy(mddev); 7159 fd = -1; 7160 } 7161 } else if (fd < 0) { 7162 md_bitmap_destroy(mddev); 7163 } 7164 } 7165 if (fd < 0) { 7166 struct file *f = mddev->bitmap_info.file; 7167 if (f) { 7168 spin_lock(&mddev->lock); 7169 mddev->bitmap_info.file = NULL; 7170 spin_unlock(&mddev->lock); 7171 fput(f); 7172 } 7173 } 7174 7175 return err; 7176 } 7177 7178 /* 7179 * md_set_array_info is used two different ways 7180 * The original usage is when creating a new array. 7181 * In this usage, raid_disks is > 0 and it together with 7182 * level, size, not_persistent,layout,chunksize determine the 7183 * shape of the array. 7184 * This will always create an array with a type-0.90.0 superblock. 7185 * The newer usage is when assembling an array. 7186 * In this case raid_disks will be 0, and the major_version field is 7187 * use to determine which style super-blocks are to be found on the devices. 7188 * The minor and patch _version numbers are also kept incase the 7189 * super_block handler wishes to interpret them. 7190 */ 7191 int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info) 7192 { 7193 if (info->raid_disks == 0) { 7194 /* just setting version number for superblock loading */ 7195 if (info->major_version < 0 || 7196 info->major_version >= ARRAY_SIZE(super_types) || 7197 super_types[info->major_version].name == NULL) { 7198 /* maybe try to auto-load a module? */ 7199 pr_warn("md: superblock version %d not known\n", 7200 info->major_version); 7201 return -EINVAL; 7202 } 7203 mddev->major_version = info->major_version; 7204 mddev->minor_version = info->minor_version; 7205 mddev->patch_version = info->patch_version; 7206 mddev->persistent = !info->not_persistent; 7207 /* ensure mddev_put doesn't delete this now that there 7208 * is some minimal configuration. 
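 * (the non-zero ctime set below appears to be what keeps mddev_put
 *  from treating the device as unconfigured)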
7209 */ 7210 mddev->ctime = ktime_get_real_seconds(); 7211 return 0; 7212 } 7213 mddev->major_version = MD_MAJOR_VERSION; 7214 mddev->minor_version = MD_MINOR_VERSION; 7215 mddev->patch_version = MD_PATCHLEVEL_VERSION; 7216 mddev->ctime = ktime_get_real_seconds(); 7217 7218 mddev->level = info->level; 7219 mddev->clevel[0] = 0; 7220 mddev->dev_sectors = 2 * (sector_t)info->size; 7221 mddev->raid_disks = info->raid_disks; 7222 /* don't set md_minor, it is determined by which /dev/md* was 7223 * openned 7224 */ 7225 if (info->state & (1<<MD_SB_CLEAN)) 7226 mddev->recovery_cp = MaxSector; 7227 else 7228 mddev->recovery_cp = 0; 7229 mddev->persistent = ! info->not_persistent; 7230 mddev->external = 0; 7231 7232 mddev->layout = info->layout; 7233 if (mddev->level == 0) 7234 /* Cannot trust RAID0 layout info here */ 7235 mddev->layout = -1; 7236 mddev->chunk_sectors = info->chunk_size >> 9; 7237 7238 if (mddev->persistent) { 7239 mddev->max_disks = MD_SB_DISKS; 7240 mddev->flags = 0; 7241 mddev->sb_flags = 0; 7242 } 7243 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7244 7245 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 7246 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 7247 mddev->bitmap_info.offset = 0; 7248 7249 mddev->reshape_position = MaxSector; 7250 7251 /* 7252 * Generate a 128 bit UUID 7253 */ 7254 get_random_bytes(mddev->uuid, 16); 7255 7256 mddev->new_level = mddev->level; 7257 mddev->new_chunk_sectors = mddev->chunk_sectors; 7258 mddev->new_layout = mddev->layout; 7259 mddev->delta_disks = 0; 7260 mddev->reshape_backwards = 0; 7261 7262 return 0; 7263 } 7264 7265 void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) 7266 { 7267 lockdep_assert_held(&mddev->reconfig_mutex); 7268 7269 if (mddev->external_size) 7270 return; 7271 7272 mddev->array_sectors = array_sectors; 7273 } 7274 EXPORT_SYMBOL(md_set_array_sectors); 7275 7276 static int update_size(struct mddev *mddev, sector_t num_sectors) 7277 { 7278 struct md_rdev *rdev; 7279 int rv; 7280 int fit = (num_sectors == 0); 7281 sector_t old_dev_sectors = mddev->dev_sectors; 7282 7283 if (mddev->pers->resize == NULL) 7284 return -EINVAL; 7285 /* The "num_sectors" is the number of sectors of each device that 7286 * is used. This can only make sense for arrays with redundancy. 7287 * linear and raid0 always use whatever space is available. We can only 7288 * consider changing this number if no resync or reconstruction is 7289 * happening, and if the new size is acceptable. It must fit before the 7290 * sb_start or, if that is <data_offset, it must fit before the size 7291 * of each device. If num_sectors is zero, we find the largest size 7292 * that fits. 
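 * (when num_sectors is zero, the rdev_for_each() loop below clamps it
 *  down to the smallest member's available space)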
7293 */ 7294 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 7295 mddev->sync_thread) 7296 return -EBUSY; 7297 if (!md_is_rdwr(mddev)) 7298 return -EROFS; 7299 7300 rdev_for_each(rdev, mddev) { 7301 sector_t avail = rdev->sectors; 7302 7303 if (fit && (num_sectors == 0 || num_sectors > avail)) 7304 num_sectors = avail; 7305 if (avail < num_sectors) 7306 return -ENOSPC; 7307 } 7308 rv = mddev->pers->resize(mddev, num_sectors); 7309 if (!rv) { 7310 if (mddev_is_clustered(mddev)) 7311 md_cluster_ops->update_size(mddev, old_dev_sectors); 7312 else if (mddev->queue) { 7313 set_capacity_and_notify(mddev->gendisk, 7314 mddev->array_sectors); 7315 } 7316 } 7317 return rv; 7318 } 7319 7320 static int update_raid_disks(struct mddev *mddev, int raid_disks) 7321 { 7322 int rv; 7323 struct md_rdev *rdev; 7324 /* change the number of raid disks */ 7325 if (mddev->pers->check_reshape == NULL) 7326 return -EINVAL; 7327 if (!md_is_rdwr(mddev)) 7328 return -EROFS; 7329 if (raid_disks <= 0 || 7330 (mddev->max_disks && raid_disks >= mddev->max_disks)) 7331 return -EINVAL; 7332 if (mddev->sync_thread || 7333 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 7334 test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) || 7335 mddev->reshape_position != MaxSector) 7336 return -EBUSY; 7337 7338 rdev_for_each(rdev, mddev) { 7339 if (mddev->raid_disks < raid_disks && 7340 rdev->data_offset < rdev->new_data_offset) 7341 return -EINVAL; 7342 if (mddev->raid_disks > raid_disks && 7343 rdev->data_offset > rdev->new_data_offset) 7344 return -EINVAL; 7345 } 7346 7347 mddev->delta_disks = raid_disks - mddev->raid_disks; 7348 if (mddev->delta_disks < 0) 7349 mddev->reshape_backwards = 1; 7350 else if (mddev->delta_disks > 0) 7351 mddev->reshape_backwards = 0; 7352 7353 rv = mddev->pers->check_reshape(mddev); 7354 if (rv < 0) { 7355 mddev->delta_disks = 0; 7356 mddev->reshape_backwards = 0; 7357 } 7358 return rv; 7359 } 7360 7361 /* 7362 * update_array_info is used to change the configuration of an 7363 * on-line array. 7364 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 7365 * fields in the info are checked against the array. 7366 * Any differences that cannot be handled will cause an error. 7367 * Normally, only one change can be managed at a time. 
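 * (size, raid_disks, layout and bitmap presence are the changes
 *  counted below; asking for more than one at once returns -EINVAL)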
7368 */ 7369 static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) 7370 { 7371 int rv = 0; 7372 int cnt = 0; 7373 int state = 0; 7374 7375 /* calculate expected state,ignoring low bits */ 7376 if (mddev->bitmap && mddev->bitmap_info.offset) 7377 state |= (1 << MD_SB_BITMAP_PRESENT); 7378 7379 if (mddev->major_version != info->major_version || 7380 mddev->minor_version != info->minor_version || 7381 /* mddev->patch_version != info->patch_version || */ 7382 mddev->ctime != info->ctime || 7383 mddev->level != info->level || 7384 /* mddev->layout != info->layout || */ 7385 mddev->persistent != !info->not_persistent || 7386 mddev->chunk_sectors != info->chunk_size >> 9 || 7387 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 7388 ((state^info->state) & 0xfffffe00) 7389 ) 7390 return -EINVAL; 7391 /* Check there is only one change */ 7392 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 7393 cnt++; 7394 if (mddev->raid_disks != info->raid_disks) 7395 cnt++; 7396 if (mddev->layout != info->layout) 7397 cnt++; 7398 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) 7399 cnt++; 7400 if (cnt == 0) 7401 return 0; 7402 if (cnt > 1) 7403 return -EINVAL; 7404 7405 if (mddev->layout != info->layout) { 7406 /* Change layout 7407 * we don't need to do anything at the md level, the 7408 * personality will take care of it all. 7409 */ 7410 if (mddev->pers->check_reshape == NULL) 7411 return -EINVAL; 7412 else { 7413 mddev->new_layout = info->layout; 7414 rv = mddev->pers->check_reshape(mddev); 7415 if (rv) 7416 mddev->new_layout = mddev->layout; 7417 return rv; 7418 } 7419 } 7420 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 7421 rv = update_size(mddev, (sector_t)info->size * 2); 7422 7423 if (mddev->raid_disks != info->raid_disks) 7424 rv = update_raid_disks(mddev, info->raid_disks); 7425 7426 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 7427 if (mddev->pers->quiesce == NULL || mddev->thread == NULL) { 7428 rv = -EINVAL; 7429 goto err; 7430 } 7431 if (mddev->recovery || mddev->sync_thread) { 7432 rv = -EBUSY; 7433 goto err; 7434 } 7435 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 7436 struct bitmap *bitmap; 7437 /* add the bitmap */ 7438 if (mddev->bitmap) { 7439 rv = -EEXIST; 7440 goto err; 7441 } 7442 if (mddev->bitmap_info.default_offset == 0) { 7443 rv = -EINVAL; 7444 goto err; 7445 } 7446 mddev->bitmap_info.offset = 7447 mddev->bitmap_info.default_offset; 7448 mddev->bitmap_info.space = 7449 mddev->bitmap_info.default_space; 7450 bitmap = md_bitmap_create(mddev, -1); 7451 if (!IS_ERR(bitmap)) { 7452 mddev->bitmap = bitmap; 7453 rv = md_bitmap_load(mddev); 7454 } else 7455 rv = PTR_ERR(bitmap); 7456 if (rv) 7457 md_bitmap_destroy(mddev); 7458 } else { 7459 /* remove the bitmap */ 7460 if (!mddev->bitmap) { 7461 rv = -ENOENT; 7462 goto err; 7463 } 7464 if (mddev->bitmap->storage.file) { 7465 rv = -EINVAL; 7466 goto err; 7467 } 7468 if (mddev->bitmap_info.nodes) { 7469 /* hold PW on all the bitmap lock */ 7470 if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { 7471 pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n"); 7472 rv = -EPERM; 7473 md_cluster_ops->unlock_all_bitmaps(mddev); 7474 goto err; 7475 } 7476 7477 mddev->bitmap_info.nodes = 0; 7478 md_cluster_ops->leave(mddev); 7479 module_put(md_cluster_mod); 7480 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; 7481 } 7482 md_bitmap_destroy(mddev); 7483 mddev->bitmap_info.offset = 0; 7484 } 7485 } 7486 md_update_sb(mddev, 
1); 7487 return rv; 7488 err: 7489 return rv; 7490 } 7491 7492 static int set_disk_faulty(struct mddev *mddev, dev_t dev) 7493 { 7494 struct md_rdev *rdev; 7495 int err = 0; 7496 7497 if (mddev->pers == NULL) 7498 return -ENODEV; 7499 7500 rcu_read_lock(); 7501 rdev = md_find_rdev_rcu(mddev, dev); 7502 if (!rdev) 7503 err = -ENODEV; 7504 else { 7505 md_error(mddev, rdev); 7506 if (test_bit(MD_BROKEN, &mddev->flags)) 7507 err = -EBUSY; 7508 } 7509 rcu_read_unlock(); 7510 return err; 7511 } 7512 7513 /* 7514 * We have a problem here : there is no easy way to give a CHS 7515 * virtual geometry. We currently pretend that we have a 2 heads 7516 * 4 sectors (with a BIG number of cylinders...). This drives 7517 * dosfs just mad... ;-) 7518 */ 7519 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 7520 { 7521 struct mddev *mddev = bdev->bd_disk->private_data; 7522 7523 geo->heads = 2; 7524 geo->sectors = 4; 7525 geo->cylinders = mddev->array_sectors / 8; 7526 return 0; 7527 } 7528 7529 static inline bool md_ioctl_valid(unsigned int cmd) 7530 { 7531 switch (cmd) { 7532 case ADD_NEW_DISK: 7533 case GET_ARRAY_INFO: 7534 case GET_BITMAP_FILE: 7535 case GET_DISK_INFO: 7536 case HOT_ADD_DISK: 7537 case HOT_REMOVE_DISK: 7538 case RAID_VERSION: 7539 case RESTART_ARRAY_RW: 7540 case RUN_ARRAY: 7541 case SET_ARRAY_INFO: 7542 case SET_BITMAP_FILE: 7543 case SET_DISK_FAULTY: 7544 case STOP_ARRAY: 7545 case STOP_ARRAY_RO: 7546 case CLUSTERED_DISK_NACK: 7547 return true; 7548 default: 7549 return false; 7550 } 7551 } 7552 7553 static bool md_ioctl_need_suspend(unsigned int cmd) 7554 { 7555 switch (cmd) { 7556 case ADD_NEW_DISK: 7557 case HOT_ADD_DISK: 7558 case HOT_REMOVE_DISK: 7559 case SET_BITMAP_FILE: 7560 case SET_ARRAY_INFO: 7561 return true; 7562 default: 7563 return false; 7564 } 7565 } 7566 7567 static int __md_set_array_info(struct mddev *mddev, void __user *argp) 7568 { 7569 mdu_array_info_t info; 7570 int err; 7571 7572 if (!argp) 7573 memset(&info, 0, sizeof(info)); 7574 else if (copy_from_user(&info, argp, sizeof(info))) 7575 return -EFAULT; 7576 7577 if (mddev->pers) { 7578 err = update_array_info(mddev, &info); 7579 if (err) 7580 pr_warn("md: couldn't update array info. %d\n", err); 7581 return err; 7582 } 7583 7584 if (!list_empty(&mddev->disks)) { 7585 pr_warn("md: array %s already has disks!\n", mdname(mddev)); 7586 return -EBUSY; 7587 } 7588 7589 if (mddev->raid_disks) { 7590 pr_warn("md: array %s already initialised!\n", mdname(mddev)); 7591 return -EBUSY; 7592 } 7593 7594 err = md_set_array_info(mddev, &info); 7595 if (err) 7596 pr_warn("md: couldn't set array info. 
%d\n", err); 7597 7598 return err; 7599 } 7600 7601 static int md_ioctl(struct block_device *bdev, blk_mode_t mode, 7602 unsigned int cmd, unsigned long arg) 7603 { 7604 int err = 0; 7605 void __user *argp = (void __user *)arg; 7606 struct mddev *mddev = NULL; 7607 bool did_set_md_closing = false; 7608 7609 if (!md_ioctl_valid(cmd)) 7610 return -ENOTTY; 7611 7612 switch (cmd) { 7613 case RAID_VERSION: 7614 case GET_ARRAY_INFO: 7615 case GET_DISK_INFO: 7616 break; 7617 default: 7618 if (!capable(CAP_SYS_ADMIN)) 7619 return -EACCES; 7620 } 7621 7622 /* 7623 * Commands dealing with the RAID driver but not any 7624 * particular array: 7625 */ 7626 switch (cmd) { 7627 case RAID_VERSION: 7628 err = get_version(argp); 7629 goto out; 7630 default:; 7631 } 7632 7633 /* 7634 * Commands creating/starting a new array: 7635 */ 7636 7637 mddev = bdev->bd_disk->private_data; 7638 7639 if (!mddev) { 7640 BUG(); 7641 goto out; 7642 } 7643 7644 /* Some actions do not requires the mutex */ 7645 switch (cmd) { 7646 case GET_ARRAY_INFO: 7647 if (!mddev->raid_disks && !mddev->external) 7648 err = -ENODEV; 7649 else 7650 err = get_array_info(mddev, argp); 7651 goto out; 7652 7653 case GET_DISK_INFO: 7654 if (!mddev->raid_disks && !mddev->external) 7655 err = -ENODEV; 7656 else 7657 err = get_disk_info(mddev, argp); 7658 goto out; 7659 7660 case SET_DISK_FAULTY: 7661 err = set_disk_faulty(mddev, new_decode_dev(arg)); 7662 goto out; 7663 7664 case GET_BITMAP_FILE: 7665 err = get_bitmap_file(mddev, argp); 7666 goto out; 7667 7668 } 7669 7670 if (cmd == HOT_REMOVE_DISK) 7671 /* need to ensure recovery thread has run */ 7672 wait_event_interruptible_timeout(mddev->sb_wait, 7673 !test_bit(MD_RECOVERY_NEEDED, 7674 &mddev->recovery), 7675 msecs_to_jiffies(5000)); 7676 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { 7677 /* Need to flush page cache, and ensure no-one else opens 7678 * and writes 7679 */ 7680 mutex_lock(&mddev->open_mutex); 7681 if (mddev->pers && atomic_read(&mddev->openers) > 1) { 7682 mutex_unlock(&mddev->open_mutex); 7683 err = -EBUSY; 7684 goto out; 7685 } 7686 if (test_and_set_bit(MD_CLOSING, &mddev->flags)) { 7687 mutex_unlock(&mddev->open_mutex); 7688 err = -EBUSY; 7689 goto out; 7690 } 7691 did_set_md_closing = true; 7692 mutex_unlock(&mddev->open_mutex); 7693 sync_blockdev(bdev); 7694 } 7695 7696 if (!md_is_rdwr(mddev)) 7697 flush_work(&mddev->sync_work); 7698 7699 err = md_ioctl_need_suspend(cmd) ? 
mddev_suspend_and_lock(mddev) : 7700 mddev_lock(mddev); 7701 if (err) { 7702 pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n", 7703 err, cmd); 7704 goto out; 7705 } 7706 7707 if (cmd == SET_ARRAY_INFO) { 7708 err = __md_set_array_info(mddev, argp); 7709 goto unlock; 7710 } 7711 7712 /* 7713 * Commands querying/configuring an existing array: 7714 */ 7715 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 7716 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 7717 if ((!mddev->raid_disks && !mddev->external) 7718 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 7719 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 7720 && cmd != GET_BITMAP_FILE) { 7721 err = -ENODEV; 7722 goto unlock; 7723 } 7724 7725 /* 7726 * Commands even a read-only array can execute: 7727 */ 7728 switch (cmd) { 7729 case RESTART_ARRAY_RW: 7730 err = restart_array(mddev); 7731 goto unlock; 7732 7733 case STOP_ARRAY: 7734 err = do_md_stop(mddev, 0, bdev); 7735 goto unlock; 7736 7737 case STOP_ARRAY_RO: 7738 err = md_set_readonly(mddev, bdev); 7739 goto unlock; 7740 7741 case HOT_REMOVE_DISK: 7742 err = hot_remove_disk(mddev, new_decode_dev(arg)); 7743 goto unlock; 7744 7745 case ADD_NEW_DISK: 7746 /* We can support ADD_NEW_DISK on read-only arrays 7747 * only if we are re-adding a preexisting device. 7748 * So require mddev->pers and MD_DISK_SYNC. 7749 */ 7750 if (mddev->pers) { 7751 mdu_disk_info_t info; 7752 if (copy_from_user(&info, argp, sizeof(info))) 7753 err = -EFAULT; 7754 else if (!(info.state & (1<<MD_DISK_SYNC))) 7755 /* Need to clear read-only for this */ 7756 break; 7757 else 7758 err = md_add_new_disk(mddev, &info); 7759 goto unlock; 7760 } 7761 break; 7762 } 7763 7764 /* 7765 * The remaining ioctls are changing the state of the 7766 * superblock, so we do not allow them on read-only arrays. 7767 */ 7768 if (!md_is_rdwr(mddev) && mddev->pers) { 7769 if (mddev->ro != MD_AUTO_READ) { 7770 err = -EROFS; 7771 goto unlock; 7772 } 7773 mddev->ro = MD_RDWR; 7774 sysfs_notify_dirent_safe(mddev->sysfs_state); 7775 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7776 /* mddev_unlock will wake thread */ 7777 /* If a device failed while we were read-only, we 7778 * need to make sure the metadata is updated now. 7779 */ 7780 if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) { 7781 mddev_unlock(mddev); 7782 wait_event(mddev->sb_wait, 7783 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) && 7784 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 7785 mddev_lock_nointr(mddev); 7786 } 7787 } 7788 7789 switch (cmd) { 7790 case ADD_NEW_DISK: 7791 { 7792 mdu_disk_info_t info; 7793 if (copy_from_user(&info, argp, sizeof(info))) 7794 err = -EFAULT; 7795 else 7796 err = md_add_new_disk(mddev, &info); 7797 goto unlock; 7798 } 7799 7800 case CLUSTERED_DISK_NACK: 7801 if (mddev_is_clustered(mddev)) 7802 md_cluster_ops->new_disk_ack(mddev, false); 7803 else 7804 err = -EINVAL; 7805 goto unlock; 7806 7807 case HOT_ADD_DISK: 7808 err = hot_add_disk(mddev, new_decode_dev(arg)); 7809 goto unlock; 7810 7811 case RUN_ARRAY: 7812 err = do_md_run(mddev); 7813 goto unlock; 7814 7815 case SET_BITMAP_FILE: 7816 err = set_bitmap_file(mddev, (int)arg); 7817 goto unlock; 7818 7819 default: 7820 err = -EINVAL; 7821 goto unlock; 7822 } 7823 7824 unlock: 7825 if (mddev->hold_active == UNTIL_IOCTL && 7826 err != -EINVAL) 7827 mddev->hold_active = 0; 7828 7829 md_ioctl_need_suspend(cmd) ? 
mddev_unlock_and_resume(mddev) : 7830 mddev_unlock(mddev); 7831 7832 out: 7833 if(did_set_md_closing) 7834 clear_bit(MD_CLOSING, &mddev->flags); 7835 return err; 7836 } 7837 #ifdef CONFIG_COMPAT 7838 static int md_compat_ioctl(struct block_device *bdev, blk_mode_t mode, 7839 unsigned int cmd, unsigned long arg) 7840 { 7841 switch (cmd) { 7842 case HOT_REMOVE_DISK: 7843 case HOT_ADD_DISK: 7844 case SET_DISK_FAULTY: 7845 case SET_BITMAP_FILE: 7846 /* These take in integer arg, do not convert */ 7847 break; 7848 default: 7849 arg = (unsigned long)compat_ptr(arg); 7850 break; 7851 } 7852 7853 return md_ioctl(bdev, mode, cmd, arg); 7854 } 7855 #endif /* CONFIG_COMPAT */ 7856 7857 static int md_set_read_only(struct block_device *bdev, bool ro) 7858 { 7859 struct mddev *mddev = bdev->bd_disk->private_data; 7860 int err; 7861 7862 err = mddev_lock(mddev); 7863 if (err) 7864 return err; 7865 7866 if (!mddev->raid_disks && !mddev->external) { 7867 err = -ENODEV; 7868 goto out_unlock; 7869 } 7870 7871 /* 7872 * Transitioning to read-auto need only happen for arrays that call 7873 * md_write_start and which are not ready for writes yet. 7874 */ 7875 if (!ro && mddev->ro == MD_RDONLY && mddev->pers) { 7876 err = restart_array(mddev); 7877 if (err) 7878 goto out_unlock; 7879 mddev->ro = MD_AUTO_READ; 7880 } 7881 7882 out_unlock: 7883 mddev_unlock(mddev); 7884 return err; 7885 } 7886 7887 static int md_open(struct gendisk *disk, blk_mode_t mode) 7888 { 7889 struct mddev *mddev; 7890 int err; 7891 7892 spin_lock(&all_mddevs_lock); 7893 mddev = mddev_get(disk->private_data); 7894 spin_unlock(&all_mddevs_lock); 7895 if (!mddev) 7896 return -ENODEV; 7897 7898 err = mutex_lock_interruptible(&mddev->open_mutex); 7899 if (err) 7900 goto out; 7901 7902 err = -ENODEV; 7903 if (test_bit(MD_CLOSING, &mddev->flags)) 7904 goto out_unlock; 7905 7906 atomic_inc(&mddev->openers); 7907 mutex_unlock(&mddev->open_mutex); 7908 7909 disk_check_media_change(disk); 7910 return 0; 7911 7912 out_unlock: 7913 mutex_unlock(&mddev->open_mutex); 7914 out: 7915 mddev_put(mddev); 7916 return err; 7917 } 7918 7919 static void md_release(struct gendisk *disk) 7920 { 7921 struct mddev *mddev = disk->private_data; 7922 7923 BUG_ON(!mddev); 7924 atomic_dec(&mddev->openers); 7925 mddev_put(mddev); 7926 } 7927 7928 static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing) 7929 { 7930 struct mddev *mddev = disk->private_data; 7931 unsigned int ret = 0; 7932 7933 if (mddev->changed) 7934 ret = DISK_EVENT_MEDIA_CHANGE; 7935 mddev->changed = 0; 7936 return ret; 7937 } 7938 7939 static void md_free_disk(struct gendisk *disk) 7940 { 7941 struct mddev *mddev = disk->private_data; 7942 7943 mddev_free(mddev); 7944 } 7945 7946 const struct block_device_operations md_fops = 7947 { 7948 .owner = THIS_MODULE, 7949 .submit_bio = md_submit_bio, 7950 .open = md_open, 7951 .release = md_release, 7952 .ioctl = md_ioctl, 7953 #ifdef CONFIG_COMPAT 7954 .compat_ioctl = md_compat_ioctl, 7955 #endif 7956 .getgeo = md_getgeo, 7957 .check_events = md_check_events, 7958 .set_read_only = md_set_read_only, 7959 .free_disk = md_free_disk, 7960 }; 7961 7962 static int md_thread(void *arg) 7963 { 7964 struct md_thread *thread = arg; 7965 7966 /* 7967 * md_thread is a 'system-thread', it's priority should be very 7968 * high. We avoid resource deadlocks individually in each 7969 * raid personality. 
(RAID5 does preallocation) We also use RR and 7970 * the very same RT priority as kswapd, thus we will never get 7971 * into a priority inversion deadlock. 7972 * 7973 * we definitely have to have equal or higher priority than 7974 * bdflush, otherwise bdflush will deadlock if there are too 7975 * many dirty RAID5 blocks. 7976 */ 7977 7978 allow_signal(SIGKILL); 7979 while (!kthread_should_stop()) { 7980 7981 /* We need to wait INTERRUPTIBLE so that 7982 * we don't add to the load-average. 7983 * That means we need to be sure no signals are 7984 * pending 7985 */ 7986 if (signal_pending(current)) 7987 flush_signals(current); 7988 7989 wait_event_interruptible_timeout 7990 (thread->wqueue, 7991 test_bit(THREAD_WAKEUP, &thread->flags) 7992 || kthread_should_stop() || kthread_should_park(), 7993 thread->timeout); 7994 7995 clear_bit(THREAD_WAKEUP, &thread->flags); 7996 if (kthread_should_park()) 7997 kthread_parkme(); 7998 if (!kthread_should_stop()) 7999 thread->run(thread); 8000 } 8001 8002 return 0; 8003 } 8004 8005 static void md_wakeup_thread_directly(struct md_thread __rcu *thread) 8006 { 8007 struct md_thread *t; 8008 8009 rcu_read_lock(); 8010 t = rcu_dereference(thread); 8011 if (t) 8012 wake_up_process(t->tsk); 8013 rcu_read_unlock(); 8014 } 8015 8016 void md_wakeup_thread(struct md_thread __rcu *thread) 8017 { 8018 struct md_thread *t; 8019 8020 rcu_read_lock(); 8021 t = rcu_dereference(thread); 8022 if (t) { 8023 pr_debug("md: waking up MD thread %s.\n", t->tsk->comm); 8024 set_bit(THREAD_WAKEUP, &t->flags); 8025 wake_up(&t->wqueue); 8026 } 8027 rcu_read_unlock(); 8028 } 8029 EXPORT_SYMBOL(md_wakeup_thread); 8030 8031 struct md_thread *md_register_thread(void (*run) (struct md_thread *), 8032 struct mddev *mddev, const char *name) 8033 { 8034 struct md_thread *thread; 8035 8036 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); 8037 if (!thread) 8038 return NULL; 8039 8040 init_waitqueue_head(&thread->wqueue); 8041 8042 thread->run = run; 8043 thread->mddev = mddev; 8044 thread->timeout = MAX_SCHEDULE_TIMEOUT; 8045 thread->tsk = kthread_run(md_thread, thread, 8046 "%s_%s", 8047 mdname(thread->mddev), 8048 name); 8049 if (IS_ERR(thread->tsk)) { 8050 kfree(thread); 8051 return NULL; 8052 } 8053 return thread; 8054 } 8055 EXPORT_SYMBOL(md_register_thread); 8056 8057 void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp) 8058 { 8059 struct md_thread *thread = rcu_dereference_protected(*threadp, 8060 lockdep_is_held(&mddev->reconfig_mutex)); 8061 8062 if (!thread) 8063 return; 8064 8065 rcu_assign_pointer(*threadp, NULL); 8066 synchronize_rcu(); 8067 8068 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 8069 kthread_stop(thread->tsk); 8070 kfree(thread); 8071 } 8072 EXPORT_SYMBOL(md_unregister_thread); 8073 8074 void md_error(struct mddev *mddev, struct md_rdev *rdev) 8075 { 8076 if (!rdev || test_bit(Faulty, &rdev->flags)) 8077 return; 8078 8079 if (!mddev->pers || !mddev->pers->error_handler) 8080 return; 8081 mddev->pers->error_handler(mddev, rdev); 8082 8083 if (mddev->pers->level == 0 || mddev->pers->level == LEVEL_LINEAR) 8084 return; 8085 8086 if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) 8087 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8088 sysfs_notify_dirent_safe(rdev->sysfs_state); 8089 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8090 if (!test_bit(MD_BROKEN, &mddev->flags)) { 8091 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8092 md_wakeup_thread(mddev->thread); 8093 } 8094 if (mddev->event_work.func) 
8095 queue_work(md_misc_wq, &mddev->event_work); 8096 md_new_event(); 8097 } 8098 EXPORT_SYMBOL(md_error); 8099 8100 /* seq_file implementation /proc/mdstat */ 8101 8102 static void status_unused(struct seq_file *seq) 8103 { 8104 int i = 0; 8105 struct md_rdev *rdev; 8106 8107 seq_printf(seq, "unused devices: "); 8108 8109 list_for_each_entry(rdev, &pending_raid_disks, same_set) { 8110 i++; 8111 seq_printf(seq, "%pg ", rdev->bdev); 8112 } 8113 if (!i) 8114 seq_printf(seq, "<none>"); 8115 8116 seq_printf(seq, "\n"); 8117 } 8118 8119 static int status_resync(struct seq_file *seq, struct mddev *mddev) 8120 { 8121 sector_t max_sectors, resync, res; 8122 unsigned long dt, db = 0; 8123 sector_t rt, curr_mark_cnt, resync_mark_cnt; 8124 int scale, recovery_active; 8125 unsigned int per_milli; 8126 8127 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 8128 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 8129 max_sectors = mddev->resync_max_sectors; 8130 else 8131 max_sectors = mddev->dev_sectors; 8132 8133 resync = mddev->curr_resync; 8134 if (resync < MD_RESYNC_ACTIVE) { 8135 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 8136 /* Still cleaning up */ 8137 resync = max_sectors; 8138 } else if (resync > max_sectors) { 8139 resync = max_sectors; 8140 } else { 8141 res = atomic_read(&mddev->recovery_active); 8142 /* 8143 * Resync has started, but the subtraction has overflowed or 8144 * yielded one of the special values. Force it to active to 8145 * ensure the status reports an active resync. 8146 */ 8147 if (resync < res || resync - res < MD_RESYNC_ACTIVE) 8148 resync = MD_RESYNC_ACTIVE; 8149 else 8150 resync -= res; 8151 } 8152 8153 if (resync == MD_RESYNC_NONE) { 8154 if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { 8155 struct md_rdev *rdev; 8156 8157 rdev_for_each(rdev, mddev) 8158 if (rdev->raid_disk >= 0 && 8159 !test_bit(Faulty, &rdev->flags) && 8160 rdev->recovery_offset != MaxSector && 8161 rdev->recovery_offset) { 8162 seq_printf(seq, "\trecover=REMOTE"); 8163 return 1; 8164 } 8165 if (mddev->reshape_position != MaxSector) 8166 seq_printf(seq, "\treshape=REMOTE"); 8167 else 8168 seq_printf(seq, "\tresync=REMOTE"); 8169 return 1; 8170 } 8171 if (mddev->recovery_cp < MaxSector) { 8172 seq_printf(seq, "\tresync=PENDING"); 8173 return 1; 8174 } 8175 return 0; 8176 } 8177 if (resync < MD_RESYNC_ACTIVE) { 8178 seq_printf(seq, "\tresync=DELAYED"); 8179 return 1; 8180 } 8181 8182 WARN_ON(max_sectors == 0); 8183 /* Pick 'scale' such that (resync>>scale)*1000 will fit 8184 * in a sector_t, and (max_sectors>>scale) will fit in a 8185 * u32, as those are the requirements for sector_div. 8186 * Thus 'scale' must be at least 10 8187 */ 8188 scale = 10; 8189 if (sizeof(sector_t) > sizeof(unsigned long)) { 8190 while ( max_sectors/2 > (1ULL<<(scale+32))) 8191 scale++; 8192 } 8193 res = (resync>>scale)*1000; 8194 sector_div(res, (u32)((max_sectors>>scale)+1)); 8195 8196 per_milli = res; 8197 { 8198 int i, x = per_milli/50, y = 20-x; 8199 seq_printf(seq, "["); 8200 for (i = 0; i < x; i++) 8201 seq_printf(seq, "="); 8202 seq_printf(seq, ">"); 8203 for (i = 0; i < y; i++) 8204 seq_printf(seq, "."); 8205 seq_printf(seq, "] "); 8206 } 8207 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 8208 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 8209 "reshape" : 8210 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 8211 "check" : 8212 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 
8213 "resync" : "recovery"))), 8214 per_milli/10, per_milli % 10, 8215 (unsigned long long) resync/2, 8216 (unsigned long long) max_sectors/2); 8217 8218 /* 8219 * dt: time from mark until now 8220 * db: blocks written from mark until now 8221 * rt: remaining time 8222 * 8223 * rt is a sector_t, which is always 64bit now. We are keeping 8224 * the original algorithm, but it is not really necessary. 8225 * 8226 * Original algorithm: 8227 * So we divide before multiply in case it is 32bit and close 8228 * to the limit. 8229 * We scale the divisor (db) by 32 to avoid losing precision 8230 * near the end of resync when the number of remaining sectors 8231 * is close to 'db'. 8232 * We then divide rt by 32 after multiplying by db to compensate. 8233 * The '+1' avoids division by zero if db is very small. 8234 */ 8235 dt = ((jiffies - mddev->resync_mark) / HZ); 8236 if (!dt) dt++; 8237 8238 curr_mark_cnt = mddev->curr_mark_cnt; 8239 recovery_active = atomic_read(&mddev->recovery_active); 8240 resync_mark_cnt = mddev->resync_mark_cnt; 8241 8242 if (curr_mark_cnt >= (recovery_active + resync_mark_cnt)) 8243 db = curr_mark_cnt - (recovery_active + resync_mark_cnt); 8244 8245 rt = max_sectors - resync; /* number of remaining sectors */ 8246 rt = div64_u64(rt, db/32+1); 8247 rt *= dt; 8248 rt >>= 5; 8249 8250 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, 8251 ((unsigned long)rt % 60)/6); 8252 8253 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 8254 return 1; 8255 } 8256 8257 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 8258 __acquires(&all_mddevs_lock) 8259 { 8260 struct md_personality *pers; 8261 8262 seq_puts(seq, "Personalities : "); 8263 spin_lock(&pers_lock); 8264 list_for_each_entry(pers, &pers_list, list) 8265 seq_printf(seq, "[%s] ", pers->name); 8266 8267 spin_unlock(&pers_lock); 8268 seq_puts(seq, "\n"); 8269 seq->poll_event = atomic_read(&md_event_count); 8270 8271 spin_lock(&all_mddevs_lock); 8272 8273 return seq_list_start(&all_mddevs, *pos); 8274 } 8275 8276 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 8277 { 8278 return seq_list_next(v, &all_mddevs, pos); 8279 } 8280 8281 static void md_seq_stop(struct seq_file *seq, void *v) 8282 __releases(&all_mddevs_lock) 8283 { 8284 status_unused(seq); 8285 spin_unlock(&all_mddevs_lock); 8286 } 8287 8288 static int md_seq_show(struct seq_file *seq, void *v) 8289 { 8290 struct mddev *mddev = list_entry(v, struct mddev, all_mddevs); 8291 sector_t sectors; 8292 struct md_rdev *rdev; 8293 8294 if (!mddev_get(mddev)) 8295 return 0; 8296 8297 spin_unlock(&all_mddevs_lock); 8298 spin_lock(&mddev->lock); 8299 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 8300 seq_printf(seq, "%s : %sactive", mdname(mddev), 8301 mddev->pers ? 
"" : "in"); 8302 if (mddev->pers) { 8303 if (mddev->ro == MD_RDONLY) 8304 seq_printf(seq, " (read-only)"); 8305 if (mddev->ro == MD_AUTO_READ) 8306 seq_printf(seq, " (auto-read-only)"); 8307 seq_printf(seq, " %s", mddev->pers->name); 8308 } 8309 8310 sectors = 0; 8311 rcu_read_lock(); 8312 rdev_for_each_rcu(rdev, mddev) { 8313 seq_printf(seq, " %pg[%d]", rdev->bdev, rdev->desc_nr); 8314 8315 if (test_bit(WriteMostly, &rdev->flags)) 8316 seq_printf(seq, "(W)"); 8317 if (test_bit(Journal, &rdev->flags)) 8318 seq_printf(seq, "(J)"); 8319 if (test_bit(Faulty, &rdev->flags)) { 8320 seq_printf(seq, "(F)"); 8321 continue; 8322 } 8323 if (rdev->raid_disk < 0) 8324 seq_printf(seq, "(S)"); /* spare */ 8325 if (test_bit(Replacement, &rdev->flags)) 8326 seq_printf(seq, "(R)"); 8327 sectors += rdev->sectors; 8328 } 8329 rcu_read_unlock(); 8330 8331 if (!list_empty(&mddev->disks)) { 8332 if (mddev->pers) 8333 seq_printf(seq, "\n %llu blocks", 8334 (unsigned long long) 8335 mddev->array_sectors / 2); 8336 else 8337 seq_printf(seq, "\n %llu blocks", 8338 (unsigned long long)sectors / 2); 8339 } 8340 if (mddev->persistent) { 8341 if (mddev->major_version != 0 || 8342 mddev->minor_version != 90) { 8343 seq_printf(seq," super %d.%d", 8344 mddev->major_version, 8345 mddev->minor_version); 8346 } 8347 } else if (mddev->external) 8348 seq_printf(seq, " super external:%s", 8349 mddev->metadata_type); 8350 else 8351 seq_printf(seq, " super non-persistent"); 8352 8353 if (mddev->pers) { 8354 mddev->pers->status(seq, mddev); 8355 seq_printf(seq, "\n "); 8356 if (mddev->pers->sync_request) { 8357 if (status_resync(seq, mddev)) 8358 seq_printf(seq, "\n "); 8359 } 8360 } else 8361 seq_printf(seq, "\n "); 8362 8363 md_bitmap_status(seq, mddev->bitmap); 8364 8365 seq_printf(seq, "\n"); 8366 } 8367 spin_unlock(&mddev->lock); 8368 spin_lock(&all_mddevs_lock); 8369 if (atomic_dec_and_test(&mddev->active)) 8370 __mddev_put(mddev); 8371 8372 return 0; 8373 } 8374 8375 static const struct seq_operations md_seq_ops = { 8376 .start = md_seq_start, 8377 .next = md_seq_next, 8378 .stop = md_seq_stop, 8379 .show = md_seq_show, 8380 }; 8381 8382 static int md_seq_open(struct inode *inode, struct file *file) 8383 { 8384 struct seq_file *seq; 8385 int error; 8386 8387 error = seq_open(file, &md_seq_ops); 8388 if (error) 8389 return error; 8390 8391 seq = file->private_data; 8392 seq->poll_event = atomic_read(&md_event_count); 8393 return error; 8394 } 8395 8396 static int md_unloading; 8397 static __poll_t mdstat_poll(struct file *filp, poll_table *wait) 8398 { 8399 struct seq_file *seq = filp->private_data; 8400 __poll_t mask; 8401 8402 if (md_unloading) 8403 return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; 8404 poll_wait(filp, &md_event_waiters, wait); 8405 8406 /* always allow read */ 8407 mask = EPOLLIN | EPOLLRDNORM; 8408 8409 if (seq->poll_event != atomic_read(&md_event_count)) 8410 mask |= EPOLLERR | EPOLLPRI; 8411 return mask; 8412 } 8413 8414 static const struct proc_ops mdstat_proc_ops = { 8415 .proc_open = md_seq_open, 8416 .proc_read = seq_read, 8417 .proc_lseek = seq_lseek, 8418 .proc_release = seq_release, 8419 .proc_poll = mdstat_poll, 8420 }; 8421 8422 int register_md_personality(struct md_personality *p) 8423 { 8424 pr_debug("md: %s personality registered for level %d\n", 8425 p->name, p->level); 8426 spin_lock(&pers_lock); 8427 list_add_tail(&p->list, &pers_list); 8428 spin_unlock(&pers_lock); 8429 return 0; 8430 } 8431 EXPORT_SYMBOL(register_md_personality); 8432 8433 int unregister_md_personality(struct 
md_personality *p) 8434 { 8435 pr_debug("md: %s personality unregistered\n", p->name); 8436 spin_lock(&pers_lock); 8437 list_del_init(&p->list); 8438 spin_unlock(&pers_lock); 8439 return 0; 8440 } 8441 EXPORT_SYMBOL(unregister_md_personality); 8442 8443 int register_md_cluster_operations(struct md_cluster_operations *ops, 8444 struct module *module) 8445 { 8446 int ret = 0; 8447 spin_lock(&pers_lock); 8448 if (md_cluster_ops != NULL) 8449 ret = -EALREADY; 8450 else { 8451 md_cluster_ops = ops; 8452 md_cluster_mod = module; 8453 } 8454 spin_unlock(&pers_lock); 8455 return ret; 8456 } 8457 EXPORT_SYMBOL(register_md_cluster_operations); 8458 8459 int unregister_md_cluster_operations(void) 8460 { 8461 spin_lock(&pers_lock); 8462 md_cluster_ops = NULL; 8463 spin_unlock(&pers_lock); 8464 return 0; 8465 } 8466 EXPORT_SYMBOL(unregister_md_cluster_operations); 8467 8468 int md_setup_cluster(struct mddev *mddev, int nodes) 8469 { 8470 int ret; 8471 if (!md_cluster_ops) 8472 request_module("md-cluster"); 8473 spin_lock(&pers_lock); 8474 /* ensure module won't be unloaded */ 8475 if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { 8476 pr_warn("can't find md-cluster module or get its reference.\n"); 8477 spin_unlock(&pers_lock); 8478 return -ENOENT; 8479 } 8480 spin_unlock(&pers_lock); 8481 8482 ret = md_cluster_ops->join(mddev, nodes); 8483 if (!ret) 8484 mddev->safemode_delay = 0; 8485 return ret; 8486 } 8487 8488 void md_cluster_stop(struct mddev *mddev) 8489 { 8490 if (!md_cluster_ops) 8491 return; 8492 md_cluster_ops->leave(mddev); 8493 module_put(md_cluster_mod); 8494 } 8495 8496 static int is_mddev_idle(struct mddev *mddev, int init) 8497 { 8498 struct md_rdev *rdev; 8499 int idle; 8500 int curr_events; 8501 8502 idle = 1; 8503 rcu_read_lock(); 8504 rdev_for_each_rcu(rdev, mddev) { 8505 struct gendisk *disk = rdev->bdev->bd_disk; 8506 curr_events = (int)part_stat_read_accum(disk->part0, sectors) - 8507 atomic_read(&disk->sync_io); 8508 /* sync IO will cause sync_io to increase before the disk_stats 8509 * as sync_io is counted when a request starts, and 8510 * disk_stats is counted when it completes. 8511 * So resync activity will cause curr_events to be smaller than 8512 * when there was no such activity. 8513 * non-sync IO will cause disk_stat to increase without 8514 * increasing sync_io so curr_events will (eventually) 8515 * be larger than it was before. Once it becomes 8516 * substantially larger, the test below will cause 8517 * the array to appear non-idle, and resync will slow 8518 * down. 8519 * If there is a lot of outstanding resync activity when 8520 * we set last_event to curr_events, then all that activity 8521 * completing might cause the array to appear non-idle 8522 * and resync will be slowed down even though there might 8523 * not have been non-resync activity. This will only 8524 * happen once though. 'last_events' will soon reflect 8525 * the state where there is little or no outstanding 8526 * resync requests, and further resync activity will 8527 * always make curr_events less than last_events. 
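 * Roughly: curr_events rises relative to last_events only when sectors
 * are transferred by requests that never touch sync_io, i.e. ordinary
 * non-resync IO (modulo the one-off transient described above).  A gap
 * of more than 64 sectors (32KiB with 512-byte sectors) since the last
 * sample marks the array as not idle below.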
8528 * 8529 */ 8530 if (init || curr_events - rdev->last_events > 64) { 8531 rdev->last_events = curr_events; 8532 idle = 0; 8533 } 8534 } 8535 rcu_read_unlock(); 8536 return idle; 8537 } 8538 8539 void md_done_sync(struct mddev *mddev, int blocks, int ok) 8540 { 8541 /* another "blocks" (512byte) blocks have been synced */ 8542 atomic_sub(blocks, &mddev->recovery_active); 8543 wake_up(&mddev->recovery_wait); 8544 if (!ok) { 8545 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8546 set_bit(MD_RECOVERY_ERROR, &mddev->recovery); 8547 md_wakeup_thread(mddev->thread); 8548 // stop recovery, signal do_sync .... 8549 } 8550 } 8551 EXPORT_SYMBOL(md_done_sync); 8552 8553 /* md_write_start(mddev, bi) 8554 * If we need to update some array metadata (e.g. 'active' flag 8555 * in superblock) before writing, schedule a superblock update 8556 * and wait for it to complete. 8557 * A return value of 'false' means that the write wasn't recorded 8558 * and cannot proceed as the array is being suspend. 8559 */ 8560 bool md_write_start(struct mddev *mddev, struct bio *bi) 8561 { 8562 int did_change = 0; 8563 8564 if (bio_data_dir(bi) != WRITE) 8565 return true; 8566 8567 BUG_ON(mddev->ro == MD_RDONLY); 8568 if (mddev->ro == MD_AUTO_READ) { 8569 /* need to switch to read/write */ 8570 flush_work(&mddev->sync_work); 8571 mddev->ro = MD_RDWR; 8572 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8573 md_wakeup_thread(mddev->thread); 8574 md_wakeup_thread(mddev->sync_thread); 8575 did_change = 1; 8576 } 8577 rcu_read_lock(); 8578 percpu_ref_get(&mddev->writes_pending); 8579 smp_mb(); /* Match smp_mb in set_in_sync() */ 8580 if (mddev->safemode == 1) 8581 mddev->safemode = 0; 8582 /* sync_checkers is always 0 when writes_pending is in per-cpu mode */ 8583 if (mddev->in_sync || mddev->sync_checkers) { 8584 spin_lock(&mddev->lock); 8585 if (mddev->in_sync) { 8586 mddev->in_sync = 0; 8587 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 8588 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 8589 md_wakeup_thread(mddev->thread); 8590 did_change = 1; 8591 } 8592 spin_unlock(&mddev->lock); 8593 } 8594 rcu_read_unlock(); 8595 if (did_change) 8596 sysfs_notify_dirent_safe(mddev->sysfs_state); 8597 if (!mddev->has_superblocks) 8598 return true; 8599 wait_event(mddev->sb_wait, 8600 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) || 8601 is_md_suspended(mddev)); 8602 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 8603 percpu_ref_put(&mddev->writes_pending); 8604 return false; 8605 } 8606 return true; 8607 } 8608 EXPORT_SYMBOL(md_write_start); 8609 8610 /* md_write_inc can only be called when md_write_start() has 8611 * already been called at least once of the current request. 8612 * It increments the counter and is useful when a single request 8613 * is split into several parts. Each part causes an increment and 8614 * so needs a matching md_write_end(). 8615 * Unlike md_write_start(), it is safe to call md_write_inc() inside 8616 * a spinlocked region. 
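 *
 * A typical pattern (sketch only): a personality calls md_write_start()
 * on the original bio, md_write_inc() for each extra sub-bio the request
 * is split into, and a matching md_write_end() as each piece completes.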
8617 */ 8618 void md_write_inc(struct mddev *mddev, struct bio *bi) 8619 { 8620 if (bio_data_dir(bi) != WRITE) 8621 return; 8622 WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev)); 8623 percpu_ref_get(&mddev->writes_pending); 8624 } 8625 EXPORT_SYMBOL(md_write_inc); 8626 8627 void md_write_end(struct mddev *mddev) 8628 { 8629 percpu_ref_put(&mddev->writes_pending); 8630 8631 if (mddev->safemode == 2) 8632 md_wakeup_thread(mddev->thread); 8633 else if (mddev->safemode_delay) 8634 /* The roundup() ensures this only performs locking once 8635 * every ->safemode_delay jiffies 8636 */ 8637 mod_timer(&mddev->safemode_timer, 8638 roundup(jiffies, mddev->safemode_delay) + 8639 mddev->safemode_delay); 8640 } 8641 8642 EXPORT_SYMBOL(md_write_end); 8643 8644 /* This is used by raid0 and raid10 */ 8645 void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, 8646 struct bio *bio, sector_t start, sector_t size) 8647 { 8648 struct bio *discard_bio = NULL; 8649 8650 if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO, 8651 &discard_bio) || !discard_bio) 8652 return; 8653 8654 bio_chain(discard_bio, bio); 8655 bio_clone_blkg_association(discard_bio, bio); 8656 if (mddev->gendisk) 8657 trace_block_bio_remap(discard_bio, 8658 disk_devt(mddev->gendisk), 8659 bio->bi_iter.bi_sector); 8660 submit_bio_noacct(discard_bio); 8661 } 8662 EXPORT_SYMBOL_GPL(md_submit_discard_bio); 8663 8664 static void md_end_clone_io(struct bio *bio) 8665 { 8666 struct md_io_clone *md_io_clone = bio->bi_private; 8667 struct bio *orig_bio = md_io_clone->orig_bio; 8668 struct mddev *mddev = md_io_clone->mddev; 8669 8670 orig_bio->bi_status = bio->bi_status; 8671 8672 if (md_io_clone->start_time) 8673 bio_end_io_acct(orig_bio, md_io_clone->start_time); 8674 8675 bio_put(bio); 8676 bio_endio(orig_bio); 8677 percpu_ref_put(&mddev->active_io); 8678 } 8679 8680 static void md_clone_bio(struct mddev *mddev, struct bio **bio) 8681 { 8682 struct block_device *bdev = (*bio)->bi_bdev; 8683 struct md_io_clone *md_io_clone; 8684 struct bio *clone = 8685 bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set); 8686 8687 md_io_clone = container_of(clone, struct md_io_clone, bio_clone); 8688 md_io_clone->orig_bio = *bio; 8689 md_io_clone->mddev = mddev; 8690 if (blk_queue_io_stat(bdev->bd_disk->queue)) 8691 md_io_clone->start_time = bio_start_io_acct(*bio); 8692 8693 clone->bi_end_io = md_end_clone_io; 8694 clone->bi_private = md_io_clone; 8695 *bio = clone; 8696 } 8697 8698 void md_account_bio(struct mddev *mddev, struct bio **bio) 8699 { 8700 percpu_ref_get(&mddev->active_io); 8701 md_clone_bio(mddev, bio); 8702 } 8703 EXPORT_SYMBOL_GPL(md_account_bio); 8704 8705 /* md_allow_write(mddev) 8706 * Calling this ensures that the array is marked 'active' so that writes 8707 * may proceed without blocking. It is important to call this before 8708 * attempting a GFP_KERNEL allocation while holding the mddev lock. 8709 * Must be called with mddev_lock held. 
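 *
 * Minimal usage sketch (names are illustrative):
 *	md_allow_write(mddev);
 *	buf = kmalloc(size, GFP_KERNEL);
 * Roughly: memory reclaim during the allocation may need to write to the
 * array, and such writes could otherwise block behind a superblock update
 * that cannot happen while we hold the reconfig mutex.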
8710 */ 8711 void md_allow_write(struct mddev *mddev) 8712 { 8713 if (!mddev->pers) 8714 return; 8715 if (!md_is_rdwr(mddev)) 8716 return; 8717 if (!mddev->pers->sync_request) 8718 return; 8719 8720 spin_lock(&mddev->lock); 8721 if (mddev->in_sync) { 8722 mddev->in_sync = 0; 8723 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 8724 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 8725 if (mddev->safemode_delay && 8726 mddev->safemode == 0) 8727 mddev->safemode = 1; 8728 spin_unlock(&mddev->lock); 8729 md_update_sb(mddev, 0); 8730 sysfs_notify_dirent_safe(mddev->sysfs_state); 8731 /* wait for the dirty state to be recorded in the metadata */ 8732 wait_event(mddev->sb_wait, 8733 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 8734 } else 8735 spin_unlock(&mddev->lock); 8736 } 8737 EXPORT_SYMBOL_GPL(md_allow_write); 8738 8739 #define SYNC_MARKS 10 8740 #define SYNC_MARK_STEP (3*HZ) 8741 #define UPDATE_FREQUENCY (5*60*HZ) 8742 void md_do_sync(struct md_thread *thread) 8743 { 8744 struct mddev *mddev = thread->mddev; 8745 struct mddev *mddev2; 8746 unsigned int currspeed = 0, window; 8747 sector_t max_sectors,j, io_sectors, recovery_done; 8748 unsigned long mark[SYNC_MARKS]; 8749 unsigned long update_time; 8750 sector_t mark_cnt[SYNC_MARKS]; 8751 int last_mark,m; 8752 sector_t last_check; 8753 int skipped = 0; 8754 struct md_rdev *rdev; 8755 char *desc, *action = NULL; 8756 struct blk_plug plug; 8757 int ret; 8758 8759 /* just incase thread restarts... */ 8760 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 8761 test_bit(MD_RECOVERY_WAIT, &mddev->recovery)) 8762 return; 8763 if (!md_is_rdwr(mddev)) {/* never try to sync a read-only array */ 8764 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8765 return; 8766 } 8767 8768 if (mddev_is_clustered(mddev)) { 8769 ret = md_cluster_ops->resync_start(mddev); 8770 if (ret) 8771 goto skip; 8772 8773 set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags); 8774 if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 8775 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || 8776 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) 8777 && ((unsigned long long)mddev->curr_resync_completed 8778 < (unsigned long long)mddev->resync_max_sectors)) 8779 goto skip; 8780 } 8781 8782 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 8783 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { 8784 desc = "data-check"; 8785 action = "check"; 8786 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 8787 desc = "requested-resync"; 8788 action = "repair"; 8789 } else 8790 desc = "resync"; 8791 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 8792 desc = "reshape"; 8793 else 8794 desc = "recovery"; 8795 8796 mddev->last_sync_action = action ?: desc; 8797 8798 /* 8799 * Before starting a resync we must have set curr_resync to 8800 * 2, and then checked that every "conflicting" array has curr_resync 8801 * less than ours. When we find one that is the same or higher 8802 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 8803 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 8804 * This will mean we have to start checking from the beginning again. 
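 * (In the code below "2" is spelled MD_RESYNC_DELAYED and "1" is
 * MD_RESYNC_YIELDED.)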
8805 * 8806 */ 8807 8808 do { 8809 int mddev2_minor = -1; 8810 mddev->curr_resync = MD_RESYNC_DELAYED; 8811 8812 try_again: 8813 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 8814 goto skip; 8815 spin_lock(&all_mddevs_lock); 8816 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) { 8817 if (test_bit(MD_DELETED, &mddev2->flags)) 8818 continue; 8819 if (mddev2 == mddev) 8820 continue; 8821 if (!mddev->parallel_resync 8822 && mddev2->curr_resync 8823 && match_mddev_units(mddev, mddev2)) { 8824 DEFINE_WAIT(wq); 8825 if (mddev < mddev2 && 8826 mddev->curr_resync == MD_RESYNC_DELAYED) { 8827 /* arbitrarily yield */ 8828 mddev->curr_resync = MD_RESYNC_YIELDED; 8829 wake_up(&resync_wait); 8830 } 8831 if (mddev > mddev2 && 8832 mddev->curr_resync == MD_RESYNC_YIELDED) 8833 /* no need to wait here, we can wait the next 8834 * time 'round when curr_resync == 2 8835 */ 8836 continue; 8837 /* We need to wait 'interruptible' so as not to 8838 * contribute to the load average, and not to 8839 * be caught by 'softlockup' 8840 */ 8841 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 8842 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 8843 mddev2->curr_resync >= mddev->curr_resync) { 8844 if (mddev2_minor != mddev2->md_minor) { 8845 mddev2_minor = mddev2->md_minor; 8846 pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n", 8847 desc, mdname(mddev), 8848 mdname(mddev2)); 8849 } 8850 spin_unlock(&all_mddevs_lock); 8851 8852 if (signal_pending(current)) 8853 flush_signals(current); 8854 schedule(); 8855 finish_wait(&resync_wait, &wq); 8856 goto try_again; 8857 } 8858 finish_wait(&resync_wait, &wq); 8859 } 8860 } 8861 spin_unlock(&all_mddevs_lock); 8862 } while (mddev->curr_resync < MD_RESYNC_DELAYED); 8863 8864 j = 0; 8865 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 8866 /* resync follows the size requested by the personality, 8867 * which defaults to physical size, but can be virtual size 8868 */ 8869 max_sectors = mddev->resync_max_sectors; 8870 atomic64_set(&mddev->resync_mismatches, 0); 8871 /* we don't use the checkpoint if there's a bitmap */ 8872 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 8873 j = mddev->resync_min; 8874 else if (!mddev->bitmap) 8875 j = mddev->recovery_cp; 8876 8877 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 8878 max_sectors = mddev->resync_max_sectors; 8879 /* 8880 * If the original node aborts reshaping then we continue the 8881 * reshaping, so set j again to avoid restart reshape from the 8882 * first beginning 8883 */ 8884 if (mddev_is_clustered(mddev) && 8885 mddev->reshape_position != MaxSector) 8886 j = mddev->reshape_position; 8887 } else { 8888 /* recovery follows the physical size of devices */ 8889 max_sectors = mddev->dev_sectors; 8890 j = MaxSector; 8891 rcu_read_lock(); 8892 rdev_for_each_rcu(rdev, mddev) 8893 if (rdev->raid_disk >= 0 && 8894 !test_bit(Journal, &rdev->flags) && 8895 !test_bit(Faulty, &rdev->flags) && 8896 !test_bit(In_sync, &rdev->flags) && 8897 rdev->recovery_offset < j) 8898 j = rdev->recovery_offset; 8899 rcu_read_unlock(); 8900 8901 /* If there is a bitmap, we need to make sure all 8902 * writes that started before we added a spare 8903 * complete before we start doing a recovery. 8904 * Otherwise the write might complete and (via 8905 * bitmap_endwrite) set a bit in the bitmap after the 8906 * recovery has checked that bit and skipped that 8907 * region. 
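 * The back-to-back quiesce(1)/quiesce(0) calls below simply drain
 * any such in-flight writes before the recovery pass starts.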
8908 */ 8909 if (mddev->bitmap) { 8910 mddev->pers->quiesce(mddev, 1); 8911 mddev->pers->quiesce(mddev, 0); 8912 } 8913 } 8914 8915 pr_info("md: %s of RAID array %s\n", desc, mdname(mddev)); 8916 pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev)); 8917 pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n", 8918 speed_max(mddev), desc); 8919 8920 is_mddev_idle(mddev, 1); /* this initializes IO event counters */ 8921 8922 io_sectors = 0; 8923 for (m = 0; m < SYNC_MARKS; m++) { 8924 mark[m] = jiffies; 8925 mark_cnt[m] = io_sectors; 8926 } 8927 last_mark = 0; 8928 mddev->resync_mark = mark[last_mark]; 8929 mddev->resync_mark_cnt = mark_cnt[last_mark]; 8930 8931 /* 8932 * Tune reconstruction: 8933 */ 8934 window = 32 * (PAGE_SIZE / 512); 8935 pr_debug("md: using %dk window, over a total of %lluk.\n", 8936 window/2, (unsigned long long)max_sectors/2); 8937 8938 atomic_set(&mddev->recovery_active, 0); 8939 last_check = 0; 8940 8941 if (j >= MD_RESYNC_ACTIVE) { 8942 pr_debug("md: resuming %s of %s from checkpoint.\n", 8943 desc, mdname(mddev)); 8944 mddev->curr_resync = j; 8945 } else 8946 mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */ 8947 mddev->curr_resync_completed = j; 8948 sysfs_notify_dirent_safe(mddev->sysfs_completed); 8949 md_new_event(); 8950 update_time = jiffies; 8951 8952 blk_start_plug(&plug); 8953 while (j < max_sectors) { 8954 sector_t sectors; 8955 8956 skipped = 0; 8957 8958 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 8959 ((mddev->curr_resync > mddev->curr_resync_completed && 8960 (mddev->curr_resync - mddev->curr_resync_completed) 8961 > (max_sectors >> 4)) || 8962 time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || 8963 (j - mddev->curr_resync_completed)*2 8964 >= mddev->resync_max - mddev->curr_resync_completed || 8965 mddev->curr_resync_completed > mddev->resync_max 8966 )) { 8967 /* time to update curr_resync_completed */ 8968 wait_event(mddev->recovery_wait, 8969 atomic_read(&mddev->recovery_active) == 0); 8970 mddev->curr_resync_completed = j; 8971 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 8972 j > mddev->recovery_cp) 8973 mddev->recovery_cp = j; 8974 update_time = jiffies; 8975 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 8976 sysfs_notify_dirent_safe(mddev->sysfs_completed); 8977 } 8978 8979 while (j >= mddev->resync_max && 8980 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 8981 /* As this condition is controlled by user-space, 8982 * we can block indefinitely, so use '_interruptible' 8983 * to avoid triggering warnings. 8984 */ 8985 flush_signals(current); /* just in case */ 8986 wait_event_interruptible(mddev->recovery_wait, 8987 mddev->resync_max > j 8988 || test_bit(MD_RECOVERY_INTR, 8989 &mddev->recovery)); 8990 } 8991 8992 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 8993 break; 8994 8995 sectors = mddev->pers->sync_request(mddev, j, &skipped); 8996 if (sectors == 0) { 8997 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8998 break; 8999 } 9000 9001 if (!skipped) { /* actual IO requested */ 9002 io_sectors += sectors; 9003 atomic_add(sectors, &mddev->recovery_active); 9004 } 9005 9006 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9007 break; 9008 9009 j += sectors; 9010 if (j > max_sectors) 9011 /* when skipping, extra large numbers can be returned. 
*/ 9012 j = max_sectors; 9013 if (j >= MD_RESYNC_ACTIVE) 9014 mddev->curr_resync = j; 9015 mddev->curr_mark_cnt = io_sectors; 9016 if (last_check == 0) 9017 /* this is the earliest that rebuild will be 9018 * visible in /proc/mdstat 9019 */ 9020 md_new_event(); 9021 9022 if (last_check + window > io_sectors || j == max_sectors) 9023 continue; 9024 9025 last_check = io_sectors; 9026 repeat: 9027 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 9028 /* step marks */ 9029 int next = (last_mark+1) % SYNC_MARKS; 9030 9031 mddev->resync_mark = mark[next]; 9032 mddev->resync_mark_cnt = mark_cnt[next]; 9033 mark[next] = jiffies; 9034 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 9035 last_mark = next; 9036 } 9037 9038 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9039 break; 9040 9041 /* 9042 * this loop exits only if either when we are slower than 9043 * the 'hard' speed limit, or the system was IO-idle for 9044 * a jiffy. 9045 * the system might be non-idle CPU-wise, but we only care 9046 * about not overloading the IO subsystem. (things like an 9047 * e2fsck being done on the RAID array should execute fast) 9048 */ 9049 cond_resched(); 9050 9051 recovery_done = io_sectors - atomic_read(&mddev->recovery_active); 9052 currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2 9053 /((jiffies-mddev->resync_mark)/HZ +1) +1; 9054 9055 if (currspeed > speed_min(mddev)) { 9056 if (currspeed > speed_max(mddev)) { 9057 msleep(500); 9058 goto repeat; 9059 } 9060 if (!is_mddev_idle(mddev, 0)) { 9061 /* 9062 * Give other IO more of a chance. 9063 * The faster the devices, the less we wait. 9064 */ 9065 wait_event(mddev->recovery_wait, 9066 !atomic_read(&mddev->recovery_active)); 9067 } 9068 } 9069 } 9070 pr_info("md: %s: %s %s.\n",mdname(mddev), desc, 9071 test_bit(MD_RECOVERY_INTR, &mddev->recovery) 9072 ? 
"interrupted" : "done"); 9073 /* 9074 * this also signals 'finished resyncing' to md_stop 9075 */ 9076 blk_finish_plug(&plug); 9077 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 9078 9079 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9080 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9081 mddev->curr_resync >= MD_RESYNC_ACTIVE) { 9082 mddev->curr_resync_completed = mddev->curr_resync; 9083 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9084 } 9085 mddev->pers->sync_request(mddev, max_sectors, &skipped); 9086 9087 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 9088 mddev->curr_resync > MD_RESYNC_ACTIVE) { 9089 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 9090 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9091 if (mddev->curr_resync >= mddev->recovery_cp) { 9092 pr_debug("md: checkpointing %s of %s.\n", 9093 desc, mdname(mddev)); 9094 if (test_bit(MD_RECOVERY_ERROR, 9095 &mddev->recovery)) 9096 mddev->recovery_cp = 9097 mddev->curr_resync_completed; 9098 else 9099 mddev->recovery_cp = 9100 mddev->curr_resync; 9101 } 9102 } else 9103 mddev->recovery_cp = MaxSector; 9104 } else { 9105 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9106 mddev->curr_resync = MaxSector; 9107 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9108 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { 9109 rcu_read_lock(); 9110 rdev_for_each_rcu(rdev, mddev) 9111 if (rdev->raid_disk >= 0 && 9112 mddev->delta_disks >= 0 && 9113 !test_bit(Journal, &rdev->flags) && 9114 !test_bit(Faulty, &rdev->flags) && 9115 !test_bit(In_sync, &rdev->flags) && 9116 rdev->recovery_offset < mddev->curr_resync) 9117 rdev->recovery_offset = mddev->curr_resync; 9118 rcu_read_unlock(); 9119 } 9120 } 9121 } 9122 skip: 9123 /* set CHANGE_PENDING here since maybe another update is needed, 9124 * so other nodes are informed. It should be harmless for normal 9125 * raid */ 9126 set_mask_bits(&mddev->sb_flags, 0, 9127 BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); 9128 9129 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9130 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9131 mddev->delta_disks > 0 && 9132 mddev->pers->finish_reshape && 9133 mddev->pers->size && 9134 mddev->queue) { 9135 mddev_lock_nointr(mddev); 9136 md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); 9137 mddev_unlock(mddev); 9138 if (!mddev_is_clustered(mddev)) 9139 set_capacity_and_notify(mddev->gendisk, 9140 mddev->array_sectors); 9141 } 9142 9143 spin_lock(&mddev->lock); 9144 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9145 /* We completed so min/max setting can be forgotten if used. */ 9146 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 9147 mddev->resync_min = 0; 9148 mddev->resync_max = MaxSector; 9149 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 9150 mddev->resync_min = mddev->curr_resync_completed; 9151 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 9152 mddev->curr_resync = MD_RESYNC_NONE; 9153 spin_unlock(&mddev->lock); 9154 9155 wake_up(&resync_wait); 9156 md_wakeup_thread(mddev->thread); 9157 return; 9158 } 9159 EXPORT_SYMBOL_GPL(md_do_sync); 9160 9161 static bool rdev_removeable(struct md_rdev *rdev) 9162 { 9163 /* rdev is not used. */ 9164 if (rdev->raid_disk < 0) 9165 return false; 9166 9167 /* There are still inflight io, don't remove this rdev. */ 9168 if (atomic_read(&rdev->nr_pending)) 9169 return false; 9170 9171 /* 9172 * An error occurred but has not yet been acknowledged by the metadata 9173 * handler, don't remove this rdev. 
9174 */ 9175 if (test_bit(Blocked, &rdev->flags)) 9176 return false; 9177 9178 /* Fautly rdev is not used, it's safe to remove it. */ 9179 if (test_bit(Faulty, &rdev->flags)) 9180 return true; 9181 9182 /* Journal disk can only be removed if it's faulty. */ 9183 if (test_bit(Journal, &rdev->flags)) 9184 return false; 9185 9186 /* 9187 * 'In_sync' is cleared while 'raid_disk' is valid, which means 9188 * replacement has just become active from pers->spare_active(), and 9189 * then pers->hot_remove_disk() will replace this rdev with replacement. 9190 */ 9191 if (!test_bit(In_sync, &rdev->flags)) 9192 return true; 9193 9194 return false; 9195 } 9196 9197 static bool rdev_is_spare(struct md_rdev *rdev) 9198 { 9199 return !test_bit(Candidate, &rdev->flags) && rdev->raid_disk >= 0 && 9200 !test_bit(In_sync, &rdev->flags) && 9201 !test_bit(Journal, &rdev->flags) && 9202 !test_bit(Faulty, &rdev->flags); 9203 } 9204 9205 static bool rdev_addable(struct md_rdev *rdev) 9206 { 9207 /* rdev is already used, don't add it again. */ 9208 if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 || 9209 test_bit(Faulty, &rdev->flags)) 9210 return false; 9211 9212 /* Allow to add journal disk. */ 9213 if (test_bit(Journal, &rdev->flags)) 9214 return true; 9215 9216 /* Allow to add if array is read-write. */ 9217 if (md_is_rdwr(rdev->mddev)) 9218 return true; 9219 9220 /* 9221 * For read-only array, only allow to readd a rdev. And if bitmap is 9222 * used, don't allow to readd a rdev that is too old. 9223 */ 9224 if (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags)) 9225 return true; 9226 9227 return false; 9228 } 9229 9230 static bool md_spares_need_change(struct mddev *mddev) 9231 { 9232 struct md_rdev *rdev; 9233 9234 rdev_for_each(rdev, mddev) 9235 if (rdev_removeable(rdev) || rdev_addable(rdev)) 9236 return true; 9237 return false; 9238 } 9239 9240 static int remove_and_add_spares(struct mddev *mddev, 9241 struct md_rdev *this) 9242 { 9243 struct md_rdev *rdev; 9244 int spares = 0; 9245 int removed = 0; 9246 bool remove_some = false; 9247 9248 if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 9249 /* Mustn't remove devices when resync thread is running */ 9250 return 0; 9251 9252 rdev_for_each(rdev, mddev) { 9253 if ((this == NULL || rdev == this) && 9254 rdev->raid_disk >= 0 && 9255 !test_bit(Blocked, &rdev->flags) && 9256 test_bit(Faulty, &rdev->flags) && 9257 atomic_read(&rdev->nr_pending)==0) { 9258 /* Faulty non-Blocked devices with nr_pending == 0 9259 * never get nr_pending incremented, 9260 * never get Faulty cleared, and never get Blocked set. 
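 * (i.e. once seen in that state here, they stay in that state.)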
9261 * So we can synchronize_rcu now rather than once per device 9262 */ 9263 remove_some = true; 9264 set_bit(RemoveSynchronized, &rdev->flags); 9265 } 9266 } 9267 9268 if (remove_some) 9269 synchronize_rcu(); 9270 rdev_for_each(rdev, mddev) { 9271 if ((this == NULL || rdev == this) && 9272 (test_bit(RemoveSynchronized, &rdev->flags) || 9273 rdev_removeable(rdev))) { 9274 if (mddev->pers->hot_remove_disk( 9275 mddev, rdev) == 0) { 9276 sysfs_unlink_rdev(mddev, rdev); 9277 rdev->saved_raid_disk = rdev->raid_disk; 9278 rdev->raid_disk = -1; 9279 removed++; 9280 } 9281 } 9282 if (remove_some && test_bit(RemoveSynchronized, &rdev->flags)) 9283 clear_bit(RemoveSynchronized, &rdev->flags); 9284 } 9285 9286 if (removed && mddev->kobj.sd) 9287 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 9288 9289 if (this && removed) 9290 goto no_add; 9291 9292 rdev_for_each(rdev, mddev) { 9293 if (this && this != rdev) 9294 continue; 9295 if (rdev_is_spare(rdev)) 9296 spares++; 9297 if (!rdev_addable(rdev)) 9298 continue; 9299 if (!test_bit(Journal, &rdev->flags)) 9300 rdev->recovery_offset = 0; 9301 if (mddev->pers->hot_add_disk(mddev, rdev) == 0) { 9302 /* failure here is OK */ 9303 sysfs_link_rdev(mddev, rdev); 9304 if (!test_bit(Journal, &rdev->flags)) 9305 spares++; 9306 md_new_event(); 9307 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9308 } 9309 } 9310 no_add: 9311 if (removed) 9312 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9313 return spares; 9314 } 9315 9316 static bool md_choose_sync_action(struct mddev *mddev, int *spares) 9317 { 9318 /* Check if reshape is in progress first. */ 9319 if (mddev->reshape_position != MaxSector) { 9320 if (mddev->pers->check_reshape == NULL || 9321 mddev->pers->check_reshape(mddev) != 0) 9322 return false; 9323 9324 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 9325 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9326 return true; 9327 } 9328 9329 /* 9330 * Remove any failed drives, then add spares if possible. Spares are 9331 * also removed and re-added, to allow the personality to fail the 9332 * re-add. 9333 */ 9334 *spares = remove_and_add_spares(mddev, NULL); 9335 if (*spares) { 9336 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9337 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 9338 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 9339 9340 /* Start new recovery. */ 9341 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9342 return true; 9343 } 9344 9345 /* Check if recovery is in progress. */ 9346 if (mddev->recovery_cp < MaxSector) { 9347 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9348 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9349 return true; 9350 } 9351 9352 /* Delay to choose resync/check/repair in md_do_sync(). */ 9353 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 9354 return true; 9355 9356 /* Nothing to be done */ 9357 return false; 9358 } 9359 9360 static void md_start_sync(struct work_struct *ws) 9361 { 9362 struct mddev *mddev = container_of(ws, struct mddev, sync_work); 9363 int spares = 0; 9364 bool suspend = false; 9365 9366 if (md_spares_need_change(mddev)) 9367 suspend = true; 9368 9369 suspend ? mddev_suspend_and_lock_nointr(mddev) : 9370 mddev_lock_nointr(mddev); 9371 9372 if (!md_is_rdwr(mddev)) { 9373 /* 9374 * On a read-only array we can: 9375 * - remove failed devices 9376 * - add already-in_sync devices if the array itself is in-sync. 9377 * As we only add devices that are already in-sync, we can 9378 * activate the spares immediately. 
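 * No sync thread is started in this case; we go straight to the
 * not_running path below.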
9379 */ 9380 remove_and_add_spares(mddev, NULL); 9381 goto not_running; 9382 } 9383 9384 if (!md_choose_sync_action(mddev, &spares)) 9385 goto not_running; 9386 9387 if (!mddev->pers->sync_request) 9388 goto not_running; 9389 9390 /* 9391 * We are adding a device or devices to an array which has the bitmap 9392 * stored on all devices. So make sure all bitmap pages get written. 9393 */ 9394 if (spares) 9395 md_bitmap_write_all(mddev->bitmap); 9396 9397 rcu_assign_pointer(mddev->sync_thread, 9398 md_register_thread(md_do_sync, mddev, "resync")); 9399 if (!mddev->sync_thread) { 9400 pr_warn("%s: could not start resync thread...\n", 9401 mdname(mddev)); 9402 /* leave the spares where they are, it shouldn't hurt */ 9403 goto not_running; 9404 } 9405 9406 suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); 9407 md_wakeup_thread(mddev->sync_thread); 9408 sysfs_notify_dirent_safe(mddev->sysfs_action); 9409 md_new_event(); 9410 return; 9411 9412 not_running: 9413 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9414 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 9415 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 9416 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 9417 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9418 suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); 9419 9420 wake_up(&resync_wait); 9421 if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && 9422 mddev->sysfs_action) 9423 sysfs_notify_dirent_safe(mddev->sysfs_action); 9424 } 9425 9426 /* 9427 * This routine is regularly called by all per-raid-array threads to 9428 * deal with generic issues like resync and super-block update. 9429 * Raid personalities that don't have a thread (linear/raid0) do not 9430 * need this as they never do any recovery or update the superblock. 9431 * 9432 * It does not do any resync itself, but rather "forks" off other threads 9433 * to do that as needed. 9434 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in 9435 * "->recovery" and create a thread at ->sync_thread. 9436 * When the thread finishes it sets MD_RECOVERY_DONE 9437 * and wakeups up this thread which will reap the thread and finish up. 9438 * This thread also removes any faulty devices (with nr_pending == 0). 9439 * 9440 * The overall approach is: 9441 * 1/ if the superblock needs updating, update it. 9442 * 2/ If a recovery thread is running, don't do anything else. 9443 * 3/ If recovery has finished, clean up, possibly marking spares active. 9444 * 4/ If there are any faulty devices, remove them. 9445 * 5/ If array is degraded, try to add spares devices 9446 * 6/ If array has spares or is not in-sync, start a resync thread. 9447 */ 9448 void md_check_recovery(struct mddev *mddev) 9449 { 9450 if (READ_ONCE(mddev->suspended)) 9451 return; 9452 9453 if (mddev->bitmap) 9454 md_bitmap_daemon_work(mddev); 9455 9456 if (signal_pending(current)) { 9457 if (mddev->pers->sync_request && !mddev->external) { 9458 pr_debug("md: %s in immediate safe mode\n", 9459 mdname(mddev)); 9460 mddev->safemode = 2; 9461 } 9462 flush_signals(current); 9463 } 9464 9465 if (!md_is_rdwr(mddev) && 9466 !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 9467 return; 9468 if ( ! 
( 9469 (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) || 9470 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 9471 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 9472 (mddev->external == 0 && mddev->safemode == 1) || 9473 (mddev->safemode == 2 9474 && !mddev->in_sync && mddev->recovery_cp == MaxSector) 9475 )) 9476 return; 9477 9478 if (mddev_trylock(mddev)) { 9479 bool try_set_sync = mddev->safemode != 0; 9480 9481 if (!mddev->external && mddev->safemode == 1) 9482 mddev->safemode = 0; 9483 9484 if (!md_is_rdwr(mddev)) { 9485 struct md_rdev *rdev; 9486 9487 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 9488 /* sync_work already queued. */ 9489 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9490 goto unlock; 9491 } 9492 9493 if (!mddev->external && mddev->in_sync) 9494 /* 9495 * 'Blocked' flag not needed as failed devices 9496 * will be recorded if array switched to read/write. 9497 * Leaving it set will prevent the device 9498 * from being removed. 9499 */ 9500 rdev_for_each(rdev, mddev) 9501 clear_bit(Blocked, &rdev->flags); 9502 9503 /* 9504 * There is no thread, but we need to call 9505 * ->spare_active and clear saved_raid_disk 9506 */ 9507 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9508 md_reap_sync_thread(mddev); 9509 9510 /* 9511 * Let md_start_sync() to remove and add rdevs to the 9512 * array. 9513 */ 9514 if (md_spares_need_change(mddev)) { 9515 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9516 queue_work(md_misc_wq, &mddev->sync_work); 9517 } 9518 9519 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9520 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9521 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 9522 9523 goto unlock; 9524 } 9525 9526 if (mddev_is_clustered(mddev)) { 9527 struct md_rdev *rdev, *tmp; 9528 /* kick the device if another node issued a 9529 * remove disk. 9530 */ 9531 rdev_for_each_safe(rdev, tmp, mddev) { 9532 if (test_and_clear_bit(ClusterRemove, &rdev->flags) && 9533 rdev->raid_disk < 0) 9534 md_kick_rdev_from_array(rdev); 9535 } 9536 } 9537 9538 if (try_set_sync && !mddev->external && !mddev->in_sync) { 9539 spin_lock(&mddev->lock); 9540 set_in_sync(mddev); 9541 spin_unlock(&mddev->lock); 9542 } 9543 9544 if (mddev->sb_flags) 9545 md_update_sb(mddev, 0); 9546 9547 /* 9548 * Never start a new sync thread if MD_RECOVERY_RUNNING is 9549 * still set. 9550 */ 9551 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 9552 if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 9553 /* resync/recovery still happening */ 9554 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9555 goto unlock; 9556 } 9557 9558 if (WARN_ON_ONCE(!mddev->sync_thread)) 9559 goto unlock; 9560 9561 md_reap_sync_thread(mddev); 9562 goto unlock; 9563 } 9564 9565 /* Set RUNNING before clearing NEEDED to avoid 9566 * any transients in the value of "sync_action". 
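 * (Otherwise a sysfs reader of 'sync_action' could briefly see neither
 * bit set and report "idle" while work is still being scheduled.)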
9567 */ 9568 mddev->curr_resync_completed = 0; 9569 spin_lock(&mddev->lock); 9570 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9571 spin_unlock(&mddev->lock); 9572 /* Clear some bits that don't mean anything, but 9573 * might be left set 9574 */ 9575 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 9576 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 9577 9578 if (test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) && 9579 !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 9580 queue_work(md_misc_wq, &mddev->sync_work); 9581 } else { 9582 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9583 wake_up(&resync_wait); 9584 } 9585 9586 unlock: 9587 wake_up(&mddev->sb_wait); 9588 mddev_unlock(mddev); 9589 } 9590 } 9591 EXPORT_SYMBOL(md_check_recovery); 9592 9593 void md_reap_sync_thread(struct mddev *mddev) 9594 { 9595 struct md_rdev *rdev; 9596 sector_t old_dev_sectors = mddev->dev_sectors; 9597 bool is_reshaped = false; 9598 9599 /* resync has finished, collect result */ 9600 md_unregister_thread(mddev, &mddev->sync_thread); 9601 atomic_inc(&mddev->sync_seq); 9602 9603 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9604 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 9605 mddev->degraded != mddev->raid_disks) { 9606 /* success...*/ 9607 /* activate any spares */ 9608 if (mddev->pers->spare_active(mddev)) { 9609 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 9610 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9611 } 9612 } 9613 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9614 mddev->pers->finish_reshape) { 9615 mddev->pers->finish_reshape(mddev); 9616 if (mddev_is_clustered(mddev)) 9617 is_reshaped = true; 9618 } 9619 9620 /* If array is no-longer degraded, then any saved_raid_disk 9621 * information must be scrapped. 9622 */ 9623 if (!mddev->degraded) 9624 rdev_for_each(rdev, mddev) 9625 rdev->saved_raid_disk = -1; 9626 9627 md_update_sb(mddev, 1); 9628 /* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can 9629 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by 9630 * clustered raid */ 9631 if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags)) 9632 md_cluster_ops->resync_finish(mddev); 9633 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9634 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 9635 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9636 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 9637 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 9638 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 9639 /* 9640 * We call md_cluster_ops->update_size here because sync_size could 9641 * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, 9642 * so it is time to update size across cluster. 
9643 */ 9644 if (mddev_is_clustered(mddev) && is_reshaped 9645 && !test_bit(MD_CLOSING, &mddev->flags)) 9646 md_cluster_ops->update_size(mddev, old_dev_sectors); 9647 /* flag recovery needed just to double check */ 9648 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9649 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9650 sysfs_notify_dirent_safe(mddev->sysfs_action); 9651 md_new_event(); 9652 if (mddev->event_work.func) 9653 queue_work(md_misc_wq, &mddev->event_work); 9654 wake_up(&resync_wait); 9655 } 9656 EXPORT_SYMBOL(md_reap_sync_thread); 9657 9658 void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) 9659 { 9660 sysfs_notify_dirent_safe(rdev->sysfs_state); 9661 wait_event_timeout(rdev->blocked_wait, 9662 !test_bit(Blocked, &rdev->flags) && 9663 !test_bit(BlockedBadBlocks, &rdev->flags), 9664 msecs_to_jiffies(5000)); 9665 rdev_dec_pending(rdev, mddev); 9666 } 9667 EXPORT_SYMBOL(md_wait_for_blocked_rdev); 9668 9669 void md_finish_reshape(struct mddev *mddev) 9670 { 9671 /* called be personality module when reshape completes. */ 9672 struct md_rdev *rdev; 9673 9674 rdev_for_each(rdev, mddev) { 9675 if (rdev->data_offset > rdev->new_data_offset) 9676 rdev->sectors += rdev->data_offset - rdev->new_data_offset; 9677 else 9678 rdev->sectors -= rdev->new_data_offset - rdev->data_offset; 9679 rdev->data_offset = rdev->new_data_offset; 9680 } 9681 } 9682 EXPORT_SYMBOL(md_finish_reshape); 9683 9684 /* Bad block management */ 9685 9686 /* Returns 1 on success, 0 on failure */ 9687 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 9688 int is_new) 9689 { 9690 struct mddev *mddev = rdev->mddev; 9691 int rv; 9692 if (is_new) 9693 s += rdev->new_data_offset; 9694 else 9695 s += rdev->data_offset; 9696 rv = badblocks_set(&rdev->badblocks, s, sectors, 0); 9697 if (rv == 0) { 9698 /* Make sure they get written out promptly */ 9699 if (test_bit(ExternalBbl, &rdev->flags)) 9700 sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); 9701 sysfs_notify_dirent_safe(rdev->sysfs_state); 9702 set_mask_bits(&mddev->sb_flags, 0, 9703 BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); 9704 md_wakeup_thread(rdev->mddev->thread); 9705 return 1; 9706 } else 9707 return 0; 9708 } 9709 EXPORT_SYMBOL_GPL(rdev_set_badblocks); 9710 9711 int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 9712 int is_new) 9713 { 9714 int rv; 9715 if (is_new) 9716 s += rdev->new_data_offset; 9717 else 9718 s += rdev->data_offset; 9719 rv = badblocks_clear(&rdev->badblocks, s, sectors); 9720 if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags)) 9721 sysfs_notify_dirent_safe(rdev->sysfs_badblocks); 9722 return rv; 9723 } 9724 EXPORT_SYMBOL_GPL(rdev_clear_badblocks); 9725 9726 static int md_notify_reboot(struct notifier_block *this, 9727 unsigned long code, void *x) 9728 { 9729 struct mddev *mddev, *n; 9730 int need_delay = 0; 9731 9732 spin_lock(&all_mddevs_lock); 9733 list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { 9734 if (!mddev_get(mddev)) 9735 continue; 9736 spin_unlock(&all_mddevs_lock); 9737 if (mddev_trylock(mddev)) { 9738 if (mddev->pers) 9739 __md_stop_writes(mddev); 9740 if (mddev->persistent) 9741 mddev->safemode = 2; 9742 mddev_unlock(mddev); 9743 } 9744 need_delay = 1; 9745 mddev_put(mddev); 9746 spin_lock(&all_mddevs_lock); 9747 } 9748 spin_unlock(&all_mddevs_lock); 9749 9750 /* 9751 * certain more exotic SCSI devices are known to be 9752 * volatile wrt too early system reboots. 
While the 9753 * right place to handle this issue is the given 9754 * driver, we do want to have a safe RAID driver ... 9755 */ 9756 if (need_delay) 9757 msleep(1000); 9758 9759 return NOTIFY_DONE; 9760 } 9761 9762 static struct notifier_block md_notifier = { 9763 .notifier_call = md_notify_reboot, 9764 .next = NULL, 9765 .priority = INT_MAX, /* before any real devices */ 9766 }; 9767 9768 static void md_geninit(void) 9769 { 9770 pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 9771 9772 proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops); 9773 } 9774 9775 static int __init md_init(void) 9776 { 9777 int ret = -ENOMEM; 9778 9779 md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); 9780 if (!md_wq) 9781 goto err_wq; 9782 9783 md_misc_wq = alloc_workqueue("md_misc", 0, 0); 9784 if (!md_misc_wq) 9785 goto err_misc_wq; 9786 9787 md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND, 9788 0); 9789 if (!md_bitmap_wq) 9790 goto err_bitmap_wq; 9791 9792 ret = __register_blkdev(MD_MAJOR, "md", md_probe); 9793 if (ret < 0) 9794 goto err_md; 9795 9796 ret = __register_blkdev(0, "mdp", md_probe); 9797 if (ret < 0) 9798 goto err_mdp; 9799 mdp_major = ret; 9800 9801 register_reboot_notifier(&md_notifier); 9802 raid_table_header = register_sysctl("dev/raid", raid_table); 9803 9804 md_geninit(); 9805 return 0; 9806 9807 err_mdp: 9808 unregister_blkdev(MD_MAJOR, "md"); 9809 err_md: 9810 destroy_workqueue(md_bitmap_wq); 9811 err_bitmap_wq: 9812 destroy_workqueue(md_misc_wq); 9813 err_misc_wq: 9814 destroy_workqueue(md_wq); 9815 err_wq: 9816 return ret; 9817 } 9818 9819 static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) 9820 { 9821 struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 9822 struct md_rdev *rdev2, *tmp; 9823 int role, ret; 9824 9825 /* 9826 * If size is changed in another node then we need to 9827 * do resize as well. 9828 */ 9829 if (mddev->dev_sectors != le64_to_cpu(sb->size)) { 9830 ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size)); 9831 if (ret) 9832 pr_info("md-cluster: resize failed\n"); 9833 else 9834 md_bitmap_update_sb(mddev->bitmap); 9835 } 9836 9837 /* Check for change of roles in the active devices */ 9838 rdev_for_each_safe(rdev2, tmp, mddev) { 9839 if (test_bit(Faulty, &rdev2->flags)) 9840 continue; 9841 9842 /* Check if the roles changed */ 9843 role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); 9844 9845 if (test_bit(Candidate, &rdev2->flags)) { 9846 if (role == MD_DISK_ROLE_FAULTY) { 9847 pr_info("md: Removing Candidate device %pg because add failed\n", 9848 rdev2->bdev); 9849 md_kick_rdev_from_array(rdev2); 9850 continue; 9851 } 9852 else 9853 clear_bit(Candidate, &rdev2->flags); 9854 } 9855 9856 if (role != rdev2->raid_disk) { 9857 /* 9858 * got activated except reshape is happening. 9859 */ 9860 if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE && 9861 !(le32_to_cpu(sb->feature_map) & 9862 MD_FEATURE_RESHAPE_ACTIVE)) { 9863 rdev2->saved_raid_disk = role; 9864 ret = remove_and_add_spares(mddev, rdev2); 9865 pr_info("Activated spare: %pg\n", 9866 rdev2->bdev); 9867 /* wakeup mddev->thread here, so array could 9868 * perform resync with the new activated disk */ 9869 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9870 md_wakeup_thread(mddev->thread); 9871 } 9872 /* device faulty 9873 * We just want to do the minimum to mark the disk 9874 * as faulty. The recovery is performed by the 9875 * one who initiated the error. 
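 * md_error() below marks the device Faulty locally, and clearing
 * Blocked afterwards allows it to be removed without waiting for a
 * further acknowledgement.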
9876 */ 9877 if (role == MD_DISK_ROLE_FAULTY || 9878 role == MD_DISK_ROLE_JOURNAL) { 9879 md_error(mddev, rdev2); 9880 clear_bit(Blocked, &rdev2->flags); 9881 } 9882 } 9883 } 9884 9885 if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) { 9886 ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks)); 9887 if (ret) 9888 pr_warn("md: updating array disks failed. %d\n", ret); 9889 } 9890 9891 /* 9892 * Since mddev->delta_disks has already updated in update_raid_disks, 9893 * so it is time to check reshape. 9894 */ 9895 if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && 9896 (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 9897 /* 9898 * reshape is happening in the remote node, we need to 9899 * update reshape_position and call start_reshape. 9900 */ 9901 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 9902 if (mddev->pers->update_reshape_pos) 9903 mddev->pers->update_reshape_pos(mddev); 9904 if (mddev->pers->start_reshape) 9905 mddev->pers->start_reshape(mddev); 9906 } else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && 9907 mddev->reshape_position != MaxSector && 9908 !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 9909 /* reshape is just done in another node. */ 9910 mddev->reshape_position = MaxSector; 9911 if (mddev->pers->update_reshape_pos) 9912 mddev->pers->update_reshape_pos(mddev); 9913 } 9914 9915 /* Finally set the event to be up to date */ 9916 mddev->events = le64_to_cpu(sb->events); 9917 } 9918 9919 static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) 9920 { 9921 int err; 9922 struct page *swapout = rdev->sb_page; 9923 struct mdp_superblock_1 *sb; 9924 9925 /* Store the sb page of the rdev in the swapout temporary 9926 * variable in case we err in the future 9927 */ 9928 rdev->sb_page = NULL; 9929 err = alloc_disk_sb(rdev); 9930 if (err == 0) { 9931 ClearPageUptodate(rdev->sb_page); 9932 rdev->sb_loaded = 0; 9933 err = super_types[mddev->major_version]. 9934 load_super(rdev, NULL, mddev->minor_version); 9935 } 9936 if (err < 0) { 9937 pr_warn("%s: %d Could not reload rdev(%d) err: %d. 
9919 static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
9920 {
9921 int err;
9922 struct page *swapout = rdev->sb_page;
9923 struct mdp_superblock_1 *sb;
9924
9925 /* Store the sb page of the rdev in the swapout temporary
9926 * variable, so it can be restored if we fail later.
9927 */
9928 rdev->sb_page = NULL;
9929 err = alloc_disk_sb(rdev);
9930 if (err == 0) {
9931 ClearPageUptodate(rdev->sb_page);
9932 rdev->sb_loaded = 0;
9933 err = super_types[mddev->major_version].
9934 load_super(rdev, NULL, mddev->minor_version);
9935 }
9936 if (err < 0) {
9937 pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
9938 __func__, __LINE__, rdev->desc_nr, err);
9939 if (rdev->sb_page)
9940 put_page(rdev->sb_page);
9941 rdev->sb_page = swapout;
9942 rdev->sb_loaded = 1;
9943 return err;
9944 }
9945
9946 sb = page_address(rdev->sb_page);
9947 /* Pick up the new recovery_offset if MD_FEATURE_RECOVERY_OFFSET
9948 * is set in the feature map.
9949 */
9950
9951 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
9952 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
9953
9954 /* The other node finished recovery, call spare_active to set
9955 * the device In_sync and update mddev->degraded
9956 */
9957 if (rdev->recovery_offset == MaxSector &&
9958 !test_bit(In_sync, &rdev->flags) &&
9959 mddev->pers->spare_active(mddev))
9960 sysfs_notify_dirent_safe(mddev->sysfs_degraded);
9961
9962 put_page(swapout);
9963 return 0;
9964 }
9965
9966 void md_reload_sb(struct mddev *mddev, int nr)
9967 {
9968 struct md_rdev *rdev = NULL, *iter;
9969 int err;
9970
9971 /* Find the rdev */
9972 rdev_for_each_rcu(iter, mddev) {
9973 if (iter->desc_nr == nr) {
9974 rdev = iter;
9975 break;
9976 }
9977 }
9978
9979 if (!rdev) {
9980 pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
9981 return;
9982 }
9983
9984 err = read_rdev(mddev, rdev);
9985 if (err < 0)
9986 return;
9987
9988 check_sb_changes(mddev, rdev);
9989
9990 /* Re-read all rdevs to update recovery_offset */
9991 rdev_for_each_rcu(rdev, mddev) {
9992 if (!test_bit(Faulty, &rdev->flags))
9993 read_rdev(mddev, rdev);
9994 }
9995 }
9996 EXPORT_SYMBOL(md_reload_sb);
9997
9998 #ifndef MODULE
9999
10000 /*
10001 * Searches all registered partitions for autorun RAID arrays
10002 * at boot time.
10003 */
10004
10005 static DEFINE_MUTEX(detected_devices_mutex);
10006 static LIST_HEAD(all_detected_devices);
10007 struct detected_devices_node {
10008 struct list_head list;
10009 dev_t dev;
10010 };
10011
10012 void md_autodetect_dev(dev_t dev)
10013 {
10014 struct detected_devices_node *node_detected_dev;
10015
10016 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
10017 if (node_detected_dev) {
10018 node_detected_dev->dev = dev;
10019 mutex_lock(&detected_devices_mutex);
10020 list_add_tail(&node_detected_dev->list, &all_detected_devices);
10021 mutex_unlock(&detected_devices_mutex);
10022 }
10023 }
10024
10025 void md_autostart_arrays(int part)
10026 {
10027 struct md_rdev *rdev;
10028 struct detected_devices_node *node_detected_dev;
10029 dev_t dev;
10030 int i_scanned, i_passed;
10031
10032 i_scanned = 0;
10033 i_passed = 0;
10034
10035 pr_info("md: Autodetecting RAID arrays.\n");
10036
10037 mutex_lock(&detected_devices_mutex);
10038 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
10039 i_scanned++;
10040 node_detected_dev = list_entry(all_detected_devices.next,
10041 struct detected_devices_node, list);
10042 list_del(&node_detected_dev->list);
10043 dev = node_detected_dev->dev;
10044 kfree(node_detected_dev);
10045 mutex_unlock(&detected_devices_mutex);
10046 rdev = md_import_device(dev, 0, 90);
10047 mutex_lock(&detected_devices_mutex);
10048 if (IS_ERR(rdev))
10049 continue;
10050
10051 if (test_bit(Faulty, &rdev->flags))
10052 continue;
10053
10054 set_bit(AutoDetected, &rdev->flags);
10055 list_add(&rdev->same_set, &pending_raid_disks);
10056 i_passed++;
10057 }
10058 mutex_unlock(&detected_devices_mutex);
10059
10060 pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);
10061
10062 autorun_devices(part);
10063 }
10064
10065 #endif /* !MODULE */
10066
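/*
 * Module unload path: wake up anyone sleeping in poll() or select() on
 * /proc/mdstat (typically a monitoring tool such as mdadm in monitor mode),
 * stop every remaining array, and only then destroy the workqueues, once the
 * deferred mddev destruction scheduled by mddev_put() has completed.
 */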
10067 static __exit void md_exit(void)
10068 {
10069 struct mddev *mddev, *n;
10070 int delay = 1;
10071
10072 unregister_blkdev(MD_MAJOR, "md");
10073 unregister_blkdev(mdp_major, "mdp");
10074 unregister_reboot_notifier(&md_notifier);
10075 unregister_sysctl_table(raid_table_header);
10076
10077 /* We cannot unload the modules while some process is
10078 * waiting for us in select() or poll() - wake them up
10079 */
10080 md_unloading = 1;
10081 while (waitqueue_active(&md_event_waiters)) {
10082 /* not safe to leave yet */
10083 wake_up(&md_event_waiters);
10084 msleep(delay);
10085 delay += delay;
10086 }
10087 remove_proc_entry("mdstat", NULL);
10088
10089 spin_lock(&all_mddevs_lock);
10090 list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
10091 if (!mddev_get(mddev))
10092 continue;
10093 spin_unlock(&all_mddevs_lock);
10094 export_array(mddev);
10095 mddev->ctime = 0;
10096 mddev->hold_active = 0;
10097 /*
10098 * As the mddev is now fully clear, mddev_put will schedule
10099 * the mddev for destruction by a workqueue, and the
10100 * destroy_workqueue() below will wait for that to complete.
10101 */
10102 mddev_put(mddev);
10103 spin_lock(&all_mddevs_lock);
10104 }
10105 spin_unlock(&all_mddevs_lock);
10106
10107 destroy_workqueue(md_misc_wq);
10108 destroy_workqueue(md_bitmap_wq);
10109 destroy_workqueue(md_wq);
10110 }
10111
10112 subsys_initcall(md_init);
10113 module_exit(md_exit);
10114
10115 static int get_ro(char *buffer, const struct kernel_param *kp)
10116 {
10117 return sprintf(buffer, "%d\n", start_readonly);
10118 }
10119 static int set_ro(const char *val, const struct kernel_param *kp)
10120 {
10121 return kstrtouint(val, 10, (unsigned int *)&start_readonly);
10122 }
10123
10124 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
10125 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
10126 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
10127 module_param(create_on_open, bool, S_IRUSR|S_IWUSR);
10128
10129 MODULE_LICENSE("GPL");
10130 MODULE_DESCRIPTION("MD RAID framework");
10131 MODULE_ALIAS("md");
10132 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
10133
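/*
 * A usage sketch for the module parameters declared above (assuming the MD
 * core is built as the md-mod module; the sysfs paths below follow from that
 * assumption):
 *
 *   modprobe md-mod start_ro=1 start_dirty_degraded=0
 *   echo 1 > /sys/module/md_mod/parameters/create_on_open
 *   echo md127 > /sys/module/md_mod/parameters/new_array
 *
 * When MD is built in instead, the same parameters can be given on the
 * kernel command line, e.g. md_mod.start_ro=1. new_array is write-only
 * (S_IWUSR) and creates an array device when a name or number is written.
 */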