// SPDX-License-Identifier: GPL-2.0-or-later
/*
   md.c : Multiple Devices driver for Linux
     Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.


   Errors, Warnings, etc.
   Please use:
     pr_crit() for error conditions that risk data loss
     pr_err() for error conditions that are unexpected, like an IO error
         or internal inconsistency
     pr_warn() for error conditions that could have been predicted, like
         adding a device to an array when it has incompatible metadata
     pr_info() for interesting, very rare events, like an array starting
         or stopping, or a resync starting or stopping
     pr_debug() for everything else.

*/

#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/badblocks.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/major.h>
#include <linux/module.h>
#include <linux/reboot.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/raid/detect.h>
#include <linux/slab.h>
#include <linux/percpu-refcount.h>
#include <linux/part_stat.h>

#include <trace/events/block.h>
#include "md.h"
#include "md-bitmap.h"
#include "md-cluster.h"

/* pers_list is a list of registered personalities protected by pers_lock. */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

static const struct kobj_type md_ktype;

struct md_cluster_operations *md_cluster_ops;
EXPORT_SYMBOL(md_cluster_ops);
static struct module *md_cluster_mod;

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
static struct workqueue_struct *md_misc_wq;
struct workqueue_struct *md_bitmap_wq;

static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);
static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);
static void md_wakeup_thread_directly(struct md_thread __rcu *thread);

enum md_ro_state {
	MD_RDWR,
	MD_RDONLY,
	MD_AUTO_READ,
	MD_MAX_STATE
};

static bool md_is_rdwr(struct mddev *mddev)
{
	return (mddev->ro == MD_RDWR);
}

/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
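/*
 * Illustrative sketch (an assumption for exposition, not code from this
 * file; the actual decay is done in the personalities, e.g. raid10): the
 * policy above amounts to halving a stored read-error count once for
 * every hour elapsed since the last read error.
 *
 *	static int decay_read_errors(int read_errors, unsigned int hours)
 *	{
 *		while (hours-- && read_errors)
 *			read_errors /= 2;
 *		return read_errors;
 *	}
 */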
/* Default safemode delay: 200 msec */
#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * You can change it via /proc/sys/dev/raid/speed_limit_min and _max,
 * or via /sys/block/mdX/md/sync_speed_{min,max}.
 */

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static inline int speed_min(struct mddev *mddev)
{
	return mddev->sync_speed_min ?
		mddev->sync_speed_min : sysctl_speed_limit_min;
}

static inline int speed_max(struct mddev *mddev)
{
	return mddev->sync_speed_max ?
		mddev->sync_speed_max : sysctl_speed_limit_max;
}

static void rdev_uninit_serial(struct md_rdev *rdev)
{
	if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
		return;

	kvfree(rdev->serial);
	rdev->serial = NULL;
}

static void rdevs_uninit_serial(struct mddev *mddev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev)
		rdev_uninit_serial(rdev);
}

static int rdev_init_serial(struct md_rdev *rdev)
{
	/* serial_nums equals BARRIER_BUCKETS_NR */
	int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t))));
	struct serial_in_rdev *serial = NULL;

	if (test_bit(CollisionCheck, &rdev->flags))
		return 0;

	serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums,
			  GFP_KERNEL);
	if (!serial)
		return -ENOMEM;

	for (i = 0; i < serial_nums; i++) {
		struct serial_in_rdev *serial_tmp = &serial[i];

		spin_lock_init(&serial_tmp->serial_lock);
		serial_tmp->serial_rb = RB_ROOT_CACHED;
		init_waitqueue_head(&serial_tmp->serial_io_wait);
	}

	rdev->serial = serial;
	set_bit(CollisionCheck, &rdev->flags);

	return 0;
}
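/*
 * Worked example for the bucket count above, assuming a common config of
 * 4 KiB pages (PAGE_SHIFT == 12) and a 4-byte atomic_t:
 * serial_nums = 1 << (12 - ilog2(4)) = 1 << 10 = 1024 buckets, matching
 * the BARRIER_BUCKETS_NR definition used by the raid1 code.
 */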
static int rdevs_init_serial(struct mddev *mddev)
{
	struct md_rdev *rdev;
	int ret = 0;

	rdev_for_each(rdev, mddev) {
		ret = rdev_init_serial(rdev);
		if (ret)
			break;
	}

	/* Free all resources if the pool doesn't exist */
	if (ret && !mddev->serial_info_pool)
		rdevs_uninit_serial(mddev);

	return ret;
}

/*
 * rdev needs to enable the serial machinery if it meets both conditions:
 * 1. it is a multi-queue device flagged with writemostly.
 * 2. the write-behind mode is enabled.
 */
static int rdev_need_serial(struct md_rdev *rdev)
{
	return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 &&
		rdev->bdev->bd_disk->queue->nr_hw_queues != 1 &&
		test_bit(WriteMostly, &rdev->flags));
}

/*
 * Init resources for rdev(s), then create serial_info_pool if:
 * 1. rdev is the first device that returns true from rdev_need_serial.
 * 2. rdev is NULL, meaning we want to enable serialization for all rdevs.
 */
void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
{
	int ret = 0;

	if (rdev && !rdev_need_serial(rdev) &&
	    !test_bit(CollisionCheck, &rdev->flags))
		return;

	if (!rdev)
		ret = rdevs_init_serial(mddev);
	else
		ret = rdev_init_serial(rdev);
	if (ret)
		return;

	if (mddev->serial_info_pool == NULL) {
		/*
		 * already in memalloc noio context by
		 * mddev_suspend()
		 */
		mddev->serial_info_pool =
			mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
						sizeof(struct serial_info));
		if (!mddev->serial_info_pool) {
			rdevs_uninit_serial(mddev);
			pr_err("can't alloc memory pool for serialization\n");
		}
	}
}

/*
 * Free resources from rdev(s), and destroy serial_info_pool under these
 * conditions:
 * 1. rdev is the last device flagged with CollisionCheck.
 * 2. when the bitmap is destroyed while the policy is not enabled.
 * 3. for the disable policy, the pool is destroyed only when no rdev needs it.
 */
void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
{
	if (rdev && !test_bit(CollisionCheck, &rdev->flags))
		return;

	if (mddev->serial_info_pool) {
		struct md_rdev *temp;
		int num = 0; /* used to track if other rdevs need the pool */

		rdev_for_each(temp, mddev) {
			if (!rdev) {
				if (!mddev->serialize_policy ||
				    !rdev_need_serial(temp))
					rdev_uninit_serial(temp);
				else
					num++;
			} else if (temp != rdev &&
				   test_bit(CollisionCheck, &temp->flags))
				num++;
		}

		if (rdev)
			rdev_uninit_serial(rdev);

		if (num)
			pr_info("The mempool could be used by other devices\n");
		else {
			mempool_destroy(mddev->serial_info_pool);
			mddev->serial_info_pool = NULL;
		}
	}
}

static struct ctl_table_header *raid_table_header;

static struct ctl_table raid_table[] = {
	{
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
};

static int start_readonly;

/*
 * The original mechanism for creating an md device is to create
 * a device node in /dev and to open it. This causes races with device-close.
 * The preferred method is to write to the "new_array" module parameter.
 * This can avoid races.
 * Setting create_on_open to false disables the original mechanism
 * so all the races disappear.
 */
static bool create_on_open = true;

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
void md_new_event(void)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);
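/*
 * Illustrative userspace sketch (not part of the driver) of consuming
 * this event count through /proc/mdstat, as described above; re-read the
 * file between wakeups:
 *
 *	int fd = open("/proc/mdstat", O_RDONLY);
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLPRI };
 *
 *	read(fd, buf, sizeof(buf));
 *	poll(&pfd, 1, -1);	// wakes when md_event_count increases
 */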
/*
 * Enables iteration over all existing md arrays.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);

static bool is_md_suspended(struct mddev *mddev)
{
	return percpu_ref_is_dying(&mddev->active_io);
}

/* Rather than calling directly into the personality make_request function,
 * IO requests come here first so that we can check if the device is
 * being suspended pending a reconfiguration.
 * We hold a refcount over the call to ->make_request. By the time that
 * call has finished, the bio has been linked into some internal structure
 * and so is visible to ->quiesce(), so we don't need the refcount any more.
 */
static bool is_suspended(struct mddev *mddev, struct bio *bio)
{
	if (is_md_suspended(mddev))
		return true;
	if (bio_data_dir(bio) != WRITE)
		return false;
	if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi))
		return false;
	if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi))
		return false;
	if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo))
		return false;
	return true;
}

void md_handle_request(struct mddev *mddev, struct bio *bio)
{
check_suspended:
	if (is_suspended(mddev, bio)) {
		DEFINE_WAIT(__wait);
		/* Bail out if REQ_NOWAIT is set for the bio */
		if (bio->bi_opf & REQ_NOWAIT) {
			bio_wouldblock_error(bio);
			return;
		}
		for (;;) {
			prepare_to_wait(&mddev->sb_wait, &__wait,
					TASK_UNINTERRUPTIBLE);
			if (!is_suspended(mddev, bio))
				break;
			schedule();
		}
		finish_wait(&mddev->sb_wait, &__wait);
	}
	if (!percpu_ref_tryget_live(&mddev->active_io))
		goto check_suspended;

	if (!mddev->pers->make_request(mddev, bio)) {
		percpu_ref_put(&mddev->active_io);
		goto check_suspended;
	}

	percpu_ref_put(&mddev->active_io);
}
EXPORT_SYMBOL(md_handle_request);

static void md_submit_bio(struct bio *bio)
{
	const int rw = bio_data_dir(bio);
	struct mddev *mddev = bio->bi_bdev->bd_disk->private_data;

	if (mddev == NULL || mddev->pers == NULL) {
		bio_io_error(bio);
		return;
	}

	if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
		bio_io_error(bio);
		return;
	}

	bio = bio_split_to_limits(bio);
	if (!bio)
		return;

	if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) {
		if (bio_sectors(bio) != 0)
			bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
		return;
	}

	/* bio could be mergeable after passing to underlayer */
	bio->bi_opf &= ~REQ_NOMERGE;

	md_handle_request(mddev, bio);
}

/*
 * Make sure no new requests are submitted to the device, and any requests that
 * have been submitted are completely handled.
 */
int mddev_suspend(struct mddev *mddev, bool interruptible)
{
	int err = 0;

	/*
	 * Holding reconfig_mutex while waiting for normal io would deadlock,
	 * because other contexts can't update the super_block, and normal io
	 * can rely on updating the super_block.
	 */
	lockdep_assert_not_held(&mddev->reconfig_mutex);

	if (interruptible)
		err = mutex_lock_interruptible(&mddev->suspend_mutex);
	else
		mutex_lock(&mddev->suspend_mutex);
	if (err)
		return err;

	if (mddev->suspended) {
		WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
		mutex_unlock(&mddev->suspend_mutex);
		return 0;
	}

	percpu_ref_kill(&mddev->active_io);
	if (interruptible)
		err = wait_event_interruptible(mddev->sb_wait,
				percpu_ref_is_zero(&mddev->active_io));
	else
		wait_event(mddev->sb_wait,
				percpu_ref_is_zero(&mddev->active_io));
	if (err) {
		percpu_ref_resurrect(&mddev->active_io);
		mutex_unlock(&mddev->suspend_mutex);
		return err;
	}

	/*
	 * For raid456, io might be waiting for reshape to make progress,
	 * allow new reshape to start while waiting for io to be done to
	 * prevent deadlock.
	 */
	WRITE_ONCE(mddev->suspended, mddev->suspended + 1);

	del_timer_sync(&mddev->safemode_timer);
	/* restrict memory reclaim I/O while the raid array is suspended */
	mddev->noio_flag = memalloc_noio_save();

	mutex_unlock(&mddev->suspend_mutex);
	return 0;
}
EXPORT_SYMBOL_GPL(mddev_suspend);

void mddev_resume(struct mddev *mddev)
{
	lockdep_assert_not_held(&mddev->reconfig_mutex);

	mutex_lock(&mddev->suspend_mutex);
	WRITE_ONCE(mddev->suspended, mddev->suspended - 1);
	if (mddev->suspended) {
		mutex_unlock(&mddev->suspend_mutex);
		return;
	}

	/* entered the memalloc scope from mddev_suspend() */
	memalloc_noio_restore(mddev->noio_flag);

	percpu_ref_resurrect(&mddev->active_io);
	wake_up(&mddev->sb_wait);

	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */

	mutex_unlock(&mddev->suspend_mutex);
}
EXPORT_SYMBOL_GPL(mddev_resume);
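/*
 * Minimal usage sketch (illustrative; "reconfigure" stands in for any
 * caller-specific work): reconfig_mutex must not be held, and every
 * successful mddev_suspend() must be paired with an mddev_resume().
 *
 *	int err = mddev_suspend(mddev, true);
 *
 *	if (err)
 *		return err;
 *	reconfigure(mddev);	// no normal io is in flight here
 *	mddev_resume(mddev);
 */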
/*
 * Generic flush handling for md
 */

static void md_end_flush(struct bio *bio)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	bio_put(bio);

	rdev_dec_pending(rdev, mddev);

	if (atomic_dec_and_test(&mddev->flush_pending)) {
		/* The pair is percpu_ref_get() from md_flush_request() */
		percpu_ref_put(&mddev->active_io);

		/* The pre-request flush has finished */
		queue_work(md_wq, &mddev->flush_work);
	}
}

static void md_submit_flush_data(struct work_struct *ws);

static void submit_flushes(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct md_rdev *rdev;

	mddev->start_flush = ktime_get_boottime();
	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
	atomic_set(&mddev->flush_pending, 1);
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev)
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Faulty, &rdev->flags)) {
			struct bio *bi;

			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			bi = bio_alloc_bioset(rdev->bdev, 0,
					      REQ_OP_WRITE | REQ_PREFLUSH,
					      GFP_NOIO, &mddev->bio_set);
			bi->bi_end_io = md_end_flush;
			bi->bi_private = rdev;
			atomic_inc(&mddev->flush_pending);
			submit_bio(bi);
			rcu_read_lock();
		}
	rcu_read_unlock();
	if (atomic_dec_and_test(&mddev->flush_pending))
		queue_work(md_wq, &mddev->flush_work);
}

static void md_submit_flush_data(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct bio *bio = mddev->flush_bio;

	/*
	 * must reset flush_bio before calling into md_handle_request to avoid a
	 * deadlock, because other bios passed md_handle_request suspend check
	 * could wait for this and below md_handle_request could wait for those
	 * bios because of suspend check
	 */
	spin_lock_irq(&mddev->lock);
	mddev->prev_flush_start = mddev->start_flush;
	mddev->flush_bio = NULL;
	spin_unlock_irq(&mddev->lock);
	wake_up(&mddev->sb_wait);

	if (bio->bi_iter.bi_size == 0) {
		/* an empty barrier - all done */
		bio_endio(bio);
	} else {
		bio->bi_opf &= ~REQ_PREFLUSH;
		md_handle_request(mddev, bio);
	}
}

/*
 * Manages consolidation of flushes and submitting any flushes needed for
 * a bio with REQ_PREFLUSH. Returns true if the bio is finished or is
 * being finished in another context. Returns false if the flushing is
 * complete but still needs the I/O portion of the bio to be processed.
 */
bool md_flush_request(struct mddev *mddev, struct bio *bio)
{
	ktime_t req_start = ktime_get_boottime();
	spin_lock_irq(&mddev->lock);
	/* flush requests wait until ongoing flush completes,
	 * hence coalescing all the pending requests.
	 */
	wait_event_lock_irq(mddev->sb_wait,
			    !mddev->flush_bio ||
			    ktime_before(req_start, mddev->prev_flush_start),
			    mddev->lock);
	/* new request after previous flush is completed */
	if (ktime_after(req_start, mddev->prev_flush_start)) {
		WARN_ON(mddev->flush_bio);
		/*
		 * Grab a reference to make sure mddev_suspend() will wait for
		 * this flush to be done.
		 *
		 * md_flush_request() is called under md_handle_request() and
		 * 'active_io' is already grabbed, hence percpu_ref_is_zero()
		 * won't pass, percpu_ref_tryget_live() can't be used because
		 * percpu_ref_kill() can be called by mddev_suspend()
		 * concurrently.
		 */
		WARN_ON(percpu_ref_is_zero(&mddev->active_io));
		percpu_ref_get(&mddev->active_io);
		mddev->flush_bio = bio;
		bio = NULL;
	}
	spin_unlock_irq(&mddev->lock);

	if (!bio) {
		INIT_WORK(&mddev->flush_work, submit_flushes);
		queue_work(md_wq, &mddev->flush_work);
	} else {
		/* flush was performed for some other bio while we waited. */
		if (bio->bi_iter.bi_size == 0)
			/* an empty barrier - all done */
			bio_endio(bio);
		else {
			bio->bi_opf &= ~REQ_PREFLUSH;
			return false;
		}
	}
	return true;
}
EXPORT_SYMBOL(md_flush_request);
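/*
 * Usage sketch for a personality's ->make_request (illustrative, following
 * the contract documented above): hand REQ_PREFLUSH bios to
 * md_flush_request() first, and only process the data portion when it
 * returns false.
 *
 *	if (unlikely(bio->bi_opf & REQ_PREFLUSH) &&
 *	    md_flush_request(mddev, bio))
 *		return true;	// finished, or owned by the flush machinery
 *	// flush already done; carry on with the data portion of the bio
 */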
static inline struct mddev *mddev_get(struct mddev *mddev)
{
	lockdep_assert_held(&all_mddevs_lock);

	if (test_bit(MD_DELETED, &mddev->flags))
		return NULL;
	atomic_inc(&mddev->active);
	return mddev;
}

static void mddev_delayed_delete(struct work_struct *ws);

static void __mddev_put(struct mddev *mddev)
{
	if (mddev->raid_disks || !list_empty(&mddev->disks) ||
	    mddev->ctime || mddev->hold_active)
		return;

	/* Array is not configured at all, and not held active, so destroy it */
	set_bit(MD_DELETED, &mddev->flags);

	/*
	 * Call queue_work inside the spinlock so that flush_workqueue() after
	 * mddev_find will succeed in waiting for the work to be done.
	 */
	queue_work(md_misc_wq, &mddev->del_work);
}

void mddev_put(struct mddev *mddev)
{
	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
		return;

	__mddev_put(mddev);
	spin_unlock(&all_mddevs_lock);
}
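/*
 * Lookup sketch (illustrative): mddev_get() asserts all_mddevs_lock, so a
 * caller resolving a unit number typically does:
 *
 *	spin_lock(&all_mddevs_lock);
 *	mddev = mddev_find_locked(unit);	// defined below
 *	if (mddev)
 *		mddev = mddev_get(mddev);	// NULL if being deleted
 *	spin_unlock(&all_mddevs_lock);
 *	...
 *	if (mddev)
 *		mddev_put(mddev);
 */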
static void md_safemode_timeout(struct timer_list *t);
static void md_start_sync(struct work_struct *ws);

static void active_io_release(struct percpu_ref *ref)
{
	struct mddev *mddev = container_of(ref, struct mddev, active_io);

	wake_up(&mddev->sb_wait);
}

static void no_op(struct percpu_ref *r) {}

int mddev_init(struct mddev *mddev)
{
	if (percpu_ref_init(&mddev->active_io, active_io_release,
			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
		return -ENOMEM;

	if (percpu_ref_init(&mddev->writes_pending, no_op,
			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
		percpu_ref_exit(&mddev->active_io);
		return -ENOMEM;
	}

	/* We want to start with the refcount at zero */
	percpu_ref_put(&mddev->writes_pending);

	mutex_init(&mddev->open_mutex);
	mutex_init(&mddev->reconfig_mutex);
	mutex_init(&mddev->sync_mutex);
	mutex_init(&mddev->suspend_mutex);
	mutex_init(&mddev->bitmap_info.mutex);
	INIT_LIST_HEAD(&mddev->disks);
	INIT_LIST_HEAD(&mddev->all_mddevs);
	INIT_LIST_HEAD(&mddev->deleting);
	timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
	atomic_set(&mddev->active, 1);
	atomic_set(&mddev->openers, 0);
	atomic_set(&mddev->sync_seq, 0);
	spin_lock_init(&mddev->lock);
	atomic_set(&mddev->flush_pending, 0);
	init_waitqueue_head(&mddev->sb_wait);
	init_waitqueue_head(&mddev->recovery_wait);
	mddev->reshape_position = MaxSector;
	mddev->reshape_backwards = 0;
	mddev->last_sync_action = "none";
	mddev->resync_min = 0;
	mddev->resync_max = MaxSector;
	mddev->level = LEVEL_NONE;

	INIT_WORK(&mddev->sync_work, md_start_sync);
	INIT_WORK(&mddev->del_work, mddev_delayed_delete);

	return 0;
}
EXPORT_SYMBOL_GPL(mddev_init);

void mddev_destroy(struct mddev *mddev)
{
	percpu_ref_exit(&mddev->active_io);
	percpu_ref_exit(&mddev->writes_pending);
}
EXPORT_SYMBOL_GPL(mddev_destroy);

static struct mddev *mddev_find_locked(dev_t unit)
{
	struct mddev *mddev;

	list_for_each_entry(mddev, &all_mddevs, all_mddevs)
		if (mddev->unit == unit)
			return mddev;

	return NULL;
}

/* find an unused unit number */
static dev_t mddev_alloc_unit(void)
{
	static int next_minor = 512;
	int start = next_minor;
	bool is_free = 0;
	dev_t dev = 0;

	while (!is_free) {
		dev = MKDEV(MD_MAJOR, next_minor);
		next_minor++;
		if (next_minor > MINORMASK)
			next_minor = 0;
		if (next_minor == start)
			return 0;	/* Oh dear, all in use. */
		is_free = !mddev_find_locked(dev);
	}

	return dev;
}

static struct mddev *mddev_alloc(dev_t unit)
{
	struct mddev *new;
	int error;

	if (unit && MAJOR(unit) != MD_MAJOR)
		unit &= ~((1 << MdpMinorShift) - 1);

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return ERR_PTR(-ENOMEM);

	error = mddev_init(new);
	if (error)
		goto out_free_new;

	spin_lock(&all_mddevs_lock);
	if (unit) {
		error = -EEXIST;
		if (mddev_find_locked(unit))
			goto out_destroy_new;
		new->unit = unit;
		if (MAJOR(unit) == MD_MAJOR)
			new->md_minor = MINOR(unit);
		else
			new->md_minor = MINOR(unit) >> MdpMinorShift;
		new->hold_active = UNTIL_IOCTL;
	} else {
		error = -ENODEV;
		new->unit = mddev_alloc_unit();
		if (!new->unit)
			goto out_destroy_new;
		new->md_minor = MINOR(new->unit);
		new->hold_active = UNTIL_STOP;
	}

	list_add(&new->all_mddevs, &all_mddevs);
	spin_unlock(&all_mddevs_lock);
	return new;

out_destroy_new:
	spin_unlock(&all_mddevs_lock);
	mddev_destroy(new);
out_free_new:
	kfree(new);
	return ERR_PTR(error);
}
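/*
 * Worked example for the partitionable (mdp) unit handling above, with
 * MdpMinorShift == 6 as defined in md.h: a request for an mdp device with
 * minor 70 is rounded down by unit &= ~((1 << 6) - 1) to minor 64, and
 * md_minor becomes MINOR(unit) >> 6 == 1, i.e. the second mdp array.
 */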
static void mddev_free(struct mddev *mddev)
{
	spin_lock(&all_mddevs_lock);
	list_del(&mddev->all_mddevs);
	spin_unlock(&all_mddevs_lock);

	mddev_destroy(mddev);
	kfree(mddev);
}

static const struct attribute_group md_redundancy_group;

void mddev_unlock(struct mddev *mddev)
{
	struct md_rdev *rdev;
	struct md_rdev *tmp;
	LIST_HEAD(delete);

	if (!list_empty(&mddev->deleting))
		list_splice_init(&mddev->deleting, &delete);

	if (mddev->to_remove) {
		/* These cannot be removed under reconfig_mutex as
		 * an access to the files will try to take reconfig_mutex
		 * while holding the file unremovable, which leads to
		 * a deadlock.
		 * So we set sysfs_active while the removal is happening,
		 * and anything else which might set ->to_remove or may
		 * otherwise change the sysfs namespace will fail with
		 * -EBUSY if sysfs_active is still set.
		 * We set sysfs_active under reconfig_mutex and elsewhere
		 * test it under the same mutex to ensure its correct value
		 * is seen.
		 */
		const struct attribute_group *to_remove = mddev->to_remove;
		mddev->to_remove = NULL;
		mddev->sysfs_active = 1;
		mutex_unlock(&mddev->reconfig_mutex);

		if (mddev->kobj.sd) {
			if (to_remove != &md_redundancy_group)
				sysfs_remove_group(&mddev->kobj, to_remove);
			if (mddev->pers == NULL ||
			    mddev->pers->sync_request == NULL) {
				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
				if (mddev->sysfs_action)
					sysfs_put(mddev->sysfs_action);
				if (mddev->sysfs_completed)
					sysfs_put(mddev->sysfs_completed);
				if (mddev->sysfs_degraded)
					sysfs_put(mddev->sysfs_degraded);
				mddev->sysfs_action = NULL;
				mddev->sysfs_completed = NULL;
				mddev->sysfs_degraded = NULL;
			}
		}
		mddev->sysfs_active = 0;
	} else
		mutex_unlock(&mddev->reconfig_mutex);

	md_wakeup_thread(mddev->thread);
	wake_up(&mddev->sb_wait);

	list_for_each_entry_safe(rdev, tmp, &delete, same_set) {
		list_del_init(&rdev->same_set);
		kobject_del(&rdev->kobj);
		export_rdev(rdev, mddev);
	}
}
EXPORT_SYMBOL_GPL(mddev_unlock);

struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
{
	struct md_rdev *rdev;

	rdev_for_each_rcu(rdev, mddev)
		if (rdev->desc_nr == nr)
			return rdev;

	return NULL;
}
EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);

static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev)
		if (rdev->bdev->bd_dev == dev)
			return rdev;

	return NULL;
}

struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;

	rdev_for_each_rcu(rdev, mddev)
		if (rdev->bdev->bd_dev == dev)
			return rdev;

	return NULL;
}
EXPORT_SYMBOL_GPL(md_find_rdev_rcu);

static struct md_personality *find_pers(int level, char *clevel)
{
	struct md_personality *pers;
	list_for_each_entry(pers, &pers_list, list) {
		if (level != LEVEL_NONE && pers->level == level)
			return pers;
		if (strcmp(pers->name, clevel) == 0)
			return pers;
	}
	return NULL;
}

/* return the offset of the super block in 512byte sectors */
static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
{
	return MD_NEW_SIZE_SECTORS(bdev_nr_sectors(rdev->bdev));
}
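/*
 * Worked example (using MD_NEW_SIZE_SECTORS from md_p.h, where
 * MD_RESERVED_SECTORS is 128, i.e. 64 KiB): for a 1000000-sector device,
 * the offset is (1000000 & ~127) - 128 = 999936 - 128 = 999808 sectors,
 * placing the 0.90 superblock in the last reserved 64 KiB of the device.
 */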
static int alloc_disk_sb(struct md_rdev *rdev)
{
	rdev->sb_page = alloc_page(GFP_KERNEL);
	if (!rdev->sb_page)
		return -ENOMEM;
	return 0;
}

void md_rdev_clear(struct md_rdev *rdev)
{
	if (rdev->sb_page) {
		put_page(rdev->sb_page);
		rdev->sb_loaded = 0;
		rdev->sb_page = NULL;
		rdev->sb_start = 0;
		rdev->sectors = 0;
	}
	if (rdev->bb_page) {
		put_page(rdev->bb_page);
		rdev->bb_page = NULL;
	}
	badblocks_exit(&rdev->badblocks);
}
EXPORT_SYMBOL_GPL(md_rdev_clear);

static void super_written(struct bio *bio)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	if (bio->bi_status) {
		pr_err("md: %s gets error=%d\n", __func__,
		       blk_status_to_errno(bio->bi_status));
		md_error(mddev, rdev);
		if (!test_bit(Faulty, &rdev->flags)
		    && (bio->bi_opf & MD_FAILFAST)) {
			set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
			set_bit(LastDev, &rdev->flags);
		}
	} else
		clear_bit(LastDev, &rdev->flags);

	bio_put(bio);

	rdev_dec_pending(rdev, mddev);

	if (atomic_dec_and_test(&mddev->pending_writes))
		wake_up(&mddev->sb_wait);
}

void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
		    sector_t sector, int size, struct page *page)
{
	/* write first size bytes of page to sector of rdev
	 * Increment mddev->pending_writes before returning
	 * and decrement it on completion, waking up sb_wait
	 * if zero is reached.
	 * If an error occurred, call md_error
	 */
	struct bio *bio;

	if (!page)
		return;

	if (test_bit(Faulty, &rdev->flags))
		return;

	bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev,
			       1,
			       REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_META
				   | REQ_PREFLUSH | REQ_FUA,
			       GFP_NOIO, &mddev->sync_set);

	atomic_inc(&rdev->nr_pending);

	bio->bi_iter.bi_sector = sector;
	__bio_add_page(bio, page, size, 0);
	bio->bi_private = rdev;
	bio->bi_end_io = super_written;

	if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
	    test_bit(FailFast, &rdev->flags) &&
	    !test_bit(LastDev, &rdev->flags))
		bio->bi_opf |= MD_FAILFAST;

	atomic_inc(&mddev->pending_writes);
	submit_bio(bio);
}

int md_super_wait(struct mddev *mddev)
{
	/* wait for all superblock writes that were scheduled to complete */
	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes) == 0);
	if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
		return -EAGAIN;
	return 0;
}

int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
		 struct page *page, blk_opf_t opf, bool metadata_op)
{
	struct bio bio;
	struct bio_vec bvec;

	if (metadata_op && rdev->meta_bdev)
		bio_init(&bio, rdev->meta_bdev, &bvec, 1, opf);
	else
		bio_init(&bio, rdev->bdev, &bvec, 1, opf);

	if (metadata_op)
		bio.bi_iter.bi_sector = sector + rdev->sb_start;
	else if (rdev->mddev->reshape_position != MaxSector &&
		 (rdev->mddev->reshape_backwards ==
		  (sector >= rdev->mddev->reshape_position)))
		bio.bi_iter.bi_sector = sector + rdev->new_data_offset;
	else
		bio.bi_iter.bi_sector = sector + rdev->data_offset;
	__bio_add_page(&bio, page, size, 0);

	submit_bio_wait(&bio);

	return !bio.bi_status;
}
EXPORT_SYMBOL_GPL(sync_page_io);

static int read_disk_sb(struct md_rdev *rdev, int size)
{
	if (rdev->sb_loaded)
		return 0;

	if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, true))
		goto fail;
	rdev->sb_loaded = 1;
	return 0;

fail:
	pr_err("md: disabled device %pg, could not read superblock.\n",
	       rdev->bdev);
	return -EINVAL;
}

static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	return	sb1->set_uuid0 == sb2->set_uuid0 &&
		sb1->set_uuid1 == sb2->set_uuid1 &&
		sb1->set_uuid2 == sb2->set_uuid2 &&
		sb1->set_uuid3 == sb2->set_uuid3;
}
static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	int ret;
	mdp_super_t *tmp1, *tmp2;

	tmp1 = kmalloc(sizeof(*tmp1), GFP_KERNEL);
	tmp2 = kmalloc(sizeof(*tmp2), GFP_KERNEL);

	if (!tmp1 || !tmp2) {
		ret = 0;
		goto abort;
	}

	*tmp1 = *sb1;
	*tmp2 = *sb2;

	/*
	 * nr_disks is not constant
	 */
	tmp1->nr_disks = 0;
	tmp2->nr_disks = 0;

	ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
abort:
	kfree(tmp1);
	kfree(tmp2);
	return ret;
}

static u32 md_csum_fold(u32 csum)
{
	csum = (csum & 0xffff) + (csum >> 16);
	return (csum & 0xffff) + (csum >> 16);
}

static unsigned int calc_sb_csum(mdp_super_t *sb)
{
	u64 newcsum = 0;
	u32 *sb32 = (u32 *)sb;
	int i;
	unsigned int disk_csum, csum;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;

	for (i = 0; i < MD_SB_BYTES/4 ; i++)
		newcsum += sb32[i];
	csum = (newcsum & 0xffffffff) + (newcsum >> 32);

#ifdef CONFIG_ALPHA
	/* This used to use csum_partial, which was wrong for several
	 * reasons including that different results are returned on
	 * different architectures.  It isn't critical that we get exactly
	 * the same return value as before (we always csum_fold before
	 * testing, and that removes any differences).  However as we
	 * know that csum_partial always returned a 16bit value on
	 * alphas, do a fold to maximise conformity to previous behaviour.
	 */
	sb->sb_csum = md_csum_fold(disk_csum);
#else
	sb->sb_csum = disk_csum;
#endif
	return csum;
}
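/*
 * Worked example for md_csum_fold() above: folding 0x89abcdef gives
 * (0xcdef + 0x89ab) = 0x1579a on the first pass and (0x579a + 0x1) =
 * 0x579b on the second; the second pass absorbs the carry, so the
 * result always fits in 16 bits.
 */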
/*
 * Handle superblock details.
 * We want to be able to handle multiple superblock formats
 * so we have a common interface to them all, and an array of
 * different handlers.
 * We rely on user-space to write the initial superblock, and support
 * reading and updating of superblocks.
 * Interface methods are:
 *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
 *      loads and validates a superblock on dev.
 *      if refdev != NULL, compare superblocks on both devices
 *    Return:
 *      0 - dev has a superblock that is compatible with refdev
 *      1 - dev has a superblock that is compatible and newer than refdev
 *          so dev should be used as the refdev in future
 *     -EINVAL superblock incompatible or invalid
 *     -othererror e.g. -EIO
 *
 *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
 *      Verify that dev is acceptable into mddev.
 *       The first time, mddev->raid_disks will be 0, and data from
 *       dev should be merged in.  Subsequent calls check that dev
 *       is new enough.  Return 0 or -EINVAL
 *
 *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
 *      Update the superblock for rdev with data in mddev
 *      This does not write to disc.
 */

struct super_type  {
	char		    *name;
	struct module	    *owner;
	int		    (*load_super)(struct md_rdev *rdev,
					  struct md_rdev *refdev,
					  int minor_version);
	int		    (*validate_super)(struct mddev *mddev,
					      struct md_rdev *freshest,
					      struct md_rdev *rdev);
	void		    (*sync_super)(struct mddev *mddev,
					  struct md_rdev *rdev);
	unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
						sector_t num_sectors);
	int		    (*allow_new_offset)(struct md_rdev *rdev,
						unsigned long long new_offset);
};
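/*
 * Dispatch sketch (illustrative) of the load_super convention documented
 * above, assuming the super_types[] table defined later in this file: a
 * caller assembling an array keeps the "newest" device as the reference.
 *
 *	err = super_types[ver].load_super(rdev, refdev, minor_version);
 *	if (err == 1)
 *		refdev = rdev;	// rdev is newer; use as refdev from now on
 *	else if (err < 0)
 *		...		// incompatible or unreadable superblock
 */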
/*
 * Check that the given mddev has no bitmap.
 *
 * This function is called from the run method of all personalities that do not
 * support bitmaps. It prints an error message and returns non-zero if mddev
 * has a bitmap. Otherwise, it returns 0.
 */
int md_check_no_bitmap(struct mddev *mddev)
{
	if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
		return 0;
	pr_warn("%s: bitmaps are not supported for %s\n",
		mdname(mddev), mddev->pers->name);
	return 1;
}
EXPORT_SYMBOL(md_check_no_bitmap);

/*
 * load_super for 0.90.0
 */
static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	mdp_super_t *sb;
	int ret;
	bool spare_disk = true;

	/*
	 * Calculate the position of the superblock (512byte sectors);
	 * it's at the end of the disk.
	 *
	 * It also happens to be a multiple of 4Kb.
	 */
	rdev->sb_start = calc_dev_sboffset(rdev);

	ret = read_disk_sb(rdev, MD_SB_BYTES);
	if (ret)
		return ret;

	ret = -EINVAL;

	sb = page_address(rdev->sb_page);

	if (sb->md_magic != MD_SB_MAGIC) {
		pr_warn("md: invalid raid superblock magic on %pg\n",
			rdev->bdev);
		goto abort;
	}

	if (sb->major_version != 0 ||
	    sb->minor_version < 90 ||
	    sb->minor_version > 91) {
		pr_warn("Bad version number %d.%d on %pg\n",
			sb->major_version, sb->minor_version, rdev->bdev);
		goto abort;
	}

	if (sb->raid_disks <= 0)
		goto abort;

	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
		pr_warn("md: invalid superblock checksum on %pg\n", rdev->bdev);
		goto abort;
	}

	rdev->preferred_minor = sb->md_minor;
	rdev->data_offset = 0;
	rdev->new_data_offset = 0;
	rdev->sb_size = MD_SB_BYTES;
	rdev->badblocks.shift = -1;

	rdev->desc_nr = sb->this_disk.number;

	/* not spare disk */
	if (rdev->desc_nr >= 0 && rdev->desc_nr < MD_SB_DISKS &&
	    sb->disks[rdev->desc_nr].state & ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
		spare_disk = false;

	if (!refdev) {
		if (!spare_disk)
			ret = 1;
		else
			ret = 0;
	} else {
		__u64 ev1, ev2;
		mdp_super_t *refsb = page_address(refdev->sb_page);
		if (!md_uuid_equal(refsb, sb)) {
			pr_warn("md: %pg has different UUID to %pg\n",
				rdev->bdev, refdev->bdev);
			goto abort;
		}
		if (!md_sb_equal(refsb, sb)) {
			pr_warn("md: %pg has same UUID but different superblock to %pg\n",
				rdev->bdev, refdev->bdev);
			goto abort;
		}
		ev1 = md_event(sb);
		ev2 = md_event(refsb);

		if (!spare_disk && ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	rdev->sectors = rdev->sb_start;
	/* Limit to 4TB as metadata cannot record more than that.
	 * (not needed for Linear and RAID0 as metadata doesn't
	 * record this size)
	 */
	if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1)
		rdev->sectors = (sector_t)(2ULL << 32) - 2;

	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
		/* "this cannot possibly happen" ... */
		ret = -EINVAL;

abort:
	return ret;
}
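/*
 * Arithmetic behind the 4TB clamp above: the 0.90 'size' field is a
 * 32-bit count of KiB, so it tops out at 2^32 KiB = 4 TiB, which is
 * (2ULL << 32) sectors of 512 bytes; two sectors are subtracted to stay
 * 1 KiB-aligned just below that limit.
 */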
/*
 * validate_super for 0.90.0
 * note: we are not using "freshest" for 0.9 superblock
 */
static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
{
	mdp_disk_t *desc;
	mdp_super_t *sb = page_address(rdev->sb_page);
	__u64 ev1 = md_event(sb);

	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(Bitmap_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);

	if (mddev->raid_disks == 0) {
		mddev->major_version = 0;
		mddev->minor_version = sb->minor_version;
		mddev->patch_version = sb->patch_version;
		mddev->external = 0;
		mddev->chunk_sectors = sb->chunk_size >> 9;
		mddev->ctime = sb->ctime;
		mddev->utime = sb->utime;
		mddev->level = sb->level;
		mddev->clevel[0] = 0;
		mddev->layout = sb->layout;
		mddev->raid_disks = sb->raid_disks;
		mddev->dev_sectors = ((sector_t)sb->size) * 2;
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
		/* bitmap can use 60 K after the 4K superblocks */
		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
		mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
		mddev->reshape_backwards = 0;

		if (mddev->minor_version >= 91) {
			mddev->reshape_position = sb->reshape_position;
			mddev->delta_disks = sb->delta_disks;
			mddev->new_level = sb->new_level;
			mddev->new_layout = sb->new_layout;
			mddev->new_chunk_sectors = sb->new_chunk >> 9;
			if (mddev->delta_disks < 0)
				mddev->reshape_backwards = 1;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}
		if (mddev->level == 0)
			mddev->layout = -1;

		if (sb->state & (1<<MD_SB_CLEAN))
			mddev->recovery_cp = MaxSector;
		else {
			if (sb->events_hi == sb->cp_events_hi &&
			    sb->events_lo == sb->cp_events_lo) {
				mddev->recovery_cp = sb->recovery_cp;
			} else
				mddev->recovery_cp = 0;
		}

		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
		memcpy(mddev->uuid+12, &sb->set_uuid3, 4);

		mddev->max_disks = MD_SB_DISKS;

		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
		    mddev->bitmap_info.file == NULL) {
			mddev->bitmap_info.offset =
				mddev->bitmap_info.default_offset;
			mddev->bitmap_info.space =
				mddev->bitmap_info.default_space;
		}

	} else if (mddev->pers == NULL) {
		/* Insist on a good event counter while assembling, except
		 * for spares (which don't need an event count) */
		++ev1;
		if (sb->disks[rdev->desc_nr].state & (
			    (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
			if (ev1 < mddev->events)
				return -EINVAL;
	} else if (mddev->bitmap) {
		/* if adding to array with a bitmap, then we can accept an
		 * older device ... but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}

	desc = sb->disks + rdev->desc_nr;

	if (desc->state & (1<<MD_DISK_FAULTY))
		set_bit(Faulty, &rdev->flags);
	else if (desc->state & (1<<MD_DISK_SYNC)) {
		set_bit(In_sync, &rdev->flags);
		rdev->raid_disk = desc->raid_disk;
		rdev->saved_raid_disk = desc->raid_disk;
	} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
		/* active but not in sync implies recovery up to
		 * reshape position.  We don't know exactly where
		 * that is, so set to zero for now
		 */
		if (mddev->minor_version >= 91) {
			rdev->recovery_offset = 0;
			rdev->raid_disk = desc->raid_disk;
		}
	}
	if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
		set_bit(WriteMostly, &rdev->flags);
	if (desc->state & (1<<MD_DISK_FAILFAST))
		set_bit(FailFast, &rdev->flags);
	return 0;
}
/*
 * sync_super for 0.90.0
 */
static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
{
	mdp_super_t *sb;
	struct md_rdev *rdev2;
	int next_spare = mddev->raid_disks;

	/* make rdev->sb match mddev data..
	 *
	 * 1/ zero out disks
	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
	 * 3/ any empty disks < next_spare become removed
	 *
	 * disks[0] gets initialised to REMOVED because
	 * we cannot be sure from other fields if it has
	 * been initialised or not.
	 */
	int i;
	int active = 0, working = 0, failed = 0, spare = 0, nr_disks = 0;

	rdev->sb_size = MD_SB_BYTES;

	sb = page_address(rdev->sb_page);

	memset(sb, 0, sizeof(*sb));

	sb->md_magic = MD_SB_MAGIC;
	sb->major_version = mddev->major_version;
	sb->patch_version = mddev->patch_version;
	sb->gvalid_words  = 0; /* ignored */
	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
	memcpy(&sb->set_uuid3, mddev->uuid+12, 4);

	sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
	sb->level = mddev->level;
	sb->size = mddev->dev_sectors / 2;
	sb->raid_disks = mddev->raid_disks;
	sb->md_minor = mddev->md_minor;
	sb->not_persistent = 0;
	sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
	sb->state = 0;
	sb->events_hi = (mddev->events>>32);
	sb->events_lo = (u32)mddev->events;

	if (mddev->reshape_position == MaxSector)
		sb->minor_version = 90;
	else {
		sb->minor_version = 91;
		sb->reshape_position = mddev->reshape_position;
		sb->new_level = mddev->new_level;
		sb->delta_disks = mddev->delta_disks;
		sb->new_layout = mddev->new_layout;
		sb->new_chunk = mddev->new_chunk_sectors << 9;
	}
	mddev->minor_version = sb->minor_version;
	if (mddev->in_sync) {
		sb->recovery_cp = mddev->recovery_cp;
		sb->cp_events_hi = (mddev->events>>32);
		sb->cp_events_lo = (u32)mddev->events;
		if (mddev->recovery_cp == MaxSector)
			sb->state = (1<< MD_SB_CLEAN);
	} else
		sb->recovery_cp = 0;

	sb->layout = mddev->layout;
	sb->chunk_size = mddev->chunk_sectors << 9;

	if (mddev->bitmap && mddev->bitmap_info.file == NULL)
		sb->state |= (1<<MD_SB_BITMAP_PRESENT);

	sb->disks[0].state = (1<<MD_DISK_REMOVED);
	rdev_for_each(rdev2, mddev) {
		mdp_disk_t *d;
		int desc_nr;
		int is_active = test_bit(In_sync, &rdev2->flags);

		if (rdev2->raid_disk >= 0 &&
		    sb->minor_version >= 91)
			/* we have nowhere to store the recovery_offset,
			 * but if it is not below the reshape_position,
			 * we can piggy-back on that.
			 */
			is_active = 1;
		if (rdev2->raid_disk < 0 ||
		    test_bit(Faulty, &rdev2->flags))
			is_active = 0;
		if (is_active)
			desc_nr = rdev2->raid_disk;
		else
			desc_nr = next_spare++;
		rdev2->desc_nr = desc_nr;
		d = &sb->disks[rdev2->desc_nr];
		nr_disks++;
		d->number = rdev2->desc_nr;
		d->major = MAJOR(rdev2->bdev->bd_dev);
		d->minor = MINOR(rdev2->bdev->bd_dev);
		if (is_active)
			d->raid_disk = rdev2->raid_disk;
		else
			d->raid_disk = rdev2->desc_nr; /* compatibility */
		if (test_bit(Faulty, &rdev2->flags))
			d->state = (1<<MD_DISK_FAULTY);
		else if (is_active) {
			d->state = (1<<MD_DISK_ACTIVE);
			if (test_bit(In_sync, &rdev2->flags))
				d->state |= (1<<MD_DISK_SYNC);
			active++;
			working++;
		} else {
			d->state = 0;
			spare++;
			working++;
		}
		if (test_bit(WriteMostly, &rdev2->flags))
			d->state |= (1<<MD_DISK_WRITEMOSTLY);
		if (test_bit(FailFast, &rdev2->flags))
			d->state |= (1<<MD_DISK_FAILFAST);
	}
	/* now set the "removed" and "faulty" bits on any missing devices */
	for (i = 0; i < mddev->raid_disks; i++) {
		mdp_disk_t *d = &sb->disks[i];
		if (d->state == 0 && d->number == 0) {
			d->number = i;
			d->raid_disk = i;
			d->state = (1<<MD_DISK_REMOVED);
			d->state |= (1<<MD_DISK_FAULTY);
			failed++;
		}
	}
	sb->nr_disks = nr_disks;
	sb->active_disks = active;
	sb->working_disks = working;
	sb->failed_disks = failed;
	sb->spare_disks = spare;

	sb->this_disk = sb->disks[rdev->desc_nr];
	sb->sb_csum = calc_sb_csum(sb);
}
/*
 * rdev_size_change for 0.90.0
 */
static unsigned long long
super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
{
	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
		return 0; /* component must fit device */
	if (rdev->mddev->bitmap_info.offset)
		return 0; /* can't move bitmap */
	rdev->sb_start = calc_dev_sboffset(rdev);
	if (!num_sectors || num_sectors > rdev->sb_start)
		num_sectors = rdev->sb_start;
	/* Limit to 4TB as metadata cannot record more than that.
	 * 4TB == 2^32 KB, or 2*2^32 sectors.
	 */
	if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
		num_sectors = (sector_t)(2ULL << 32) - 2;
	do {
		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
			       rdev->sb_page);
	} while (md_super_wait(rdev->mddev) < 0);
	return num_sectors;
}

static int
super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
{
	/* non-zero offset changes not possible with v0.90 */
	return new_offset == 0;
}

/*
 * version 1 superblock
 */

static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
{
	__le32 disk_csum;
	u32 csum;
	unsigned long long newcsum;
	int size = 256 + le32_to_cpu(sb->max_dev)*2;
	__le32 *isuper = (__le32 *)sb;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
	newcsum = 0;
	for (; size >= 4; size -= 4)
		newcsum += le32_to_cpu(*isuper++);

	if (size == 2)
		newcsum += le16_to_cpu(*(__le16 *) isuper);

	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
	sb->sb_csum = disk_csum;
	return cpu_to_le32(csum);
}
static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	struct mdp_superblock_1 *sb;
	int ret;
	sector_t sb_start;
	sector_t sectors;
	int bmask;
	bool spare_disk = true;

	/*
	 * Calculate the position of the superblock in 512byte sectors.
	 * It is always aligned to a 4K boundary and
	 * depending on minor_version, it can be:
	 * 0: At least 8K, but less than 12K, from end of device
	 * 1: At start of device
	 * 2: 4K from start of device.
	 */
	switch (minor_version) {
	case 0:
		sb_start = bdev_nr_sectors(rdev->bdev) - 8 * 2;
		sb_start &= ~(sector_t)(4*2-1);
		break;
	case 1:
		sb_start = 0;
		break;
	case 2:
		sb_start = 8;
		break;
	default:
		return -EINVAL;
	}
	rdev->sb_start = sb_start;

	/* superblock is rarely larger than 1K, but it can be larger,
	 * and it is safe to read 4k, so we do that
	 */
	ret = read_disk_sb(rdev, 4096);
	if (ret)
		return ret;

	sb = page_address(rdev->sb_page);

	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
	    sb->major_version != cpu_to_le32(1) ||
	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
		return -EINVAL;

	if (calc_sb_1_csum(sb) != sb->sb_csum) {
		pr_warn("md: invalid superblock checksum on %pg\n",
			rdev->bdev);
		return -EINVAL;
	}
	if (le64_to_cpu(sb->data_size) < 10) {
		pr_warn("md: data_size too small on %pg\n",
			rdev->bdev);
		return -EINVAL;
	}
	if (sb->pad0 ||
	    sb->pad3[0] ||
	    memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
		/* Some padding is non-zero, might be a new feature */
		return -EINVAL;

	rdev->preferred_minor = 0xffff;
	rdev->data_offset = le64_to_cpu(sb->data_offset);
	rdev->new_data_offset = rdev->data_offset;
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
		rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));

	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
	if (rdev->sb_size & bmask)
		rdev->sb_size = (rdev->sb_size | bmask) + 1;

	if (minor_version
	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;
	if (minor_version
	    && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;

	rdev->desc_nr = le32_to_cpu(sb->dev_number);

	if (!rdev->bb_page) {
		rdev->bb_page = alloc_page(GFP_KERNEL);
		if (!rdev->bb_page)
			return -ENOMEM;
	}
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
	    rdev->badblocks.count == 0) {
		/* need to load the bad block list.
		 * Currently we limit it to one page.
		 */
		s32 offset;
		sector_t bb_sector;
		__le64 *bbp;
		int i;
		int sectors = le16_to_cpu(sb->bblog_size);
		if (sectors > (PAGE_SIZE / 512))
			return -EINVAL;
		offset = le32_to_cpu(sb->bblog_offset);
		if (offset == 0)
			return -EINVAL;
		bb_sector = (long long)offset;
		if (!sync_page_io(rdev, bb_sector, sectors << 9,
				  rdev->bb_page, REQ_OP_READ, true))
			return -EIO;
		bbp = (__le64 *)page_address(rdev->bb_page);
		rdev->badblocks.shift = sb->bblog_shift;
		for (i = 0; i < (sectors << (9-3)); i++, bbp++) {
			u64 bb = le64_to_cpu(*bbp);
			int count = bb & (0x3ff);
			u64 sector = bb >> 10;
			sector <<= sb->bblog_shift;
			count <<= sb->bblog_shift;
			if (bb + 1 == 0)
				break;
			if (badblocks_set(&rdev->badblocks, sector, count, 1))
				return -EINVAL;
		}
	} else if (sb->bblog_offset != 0)
		rdev->badblocks.shift = 0;

	if ((le32_to_cpu(sb->feature_map) &
	    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) {
		rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
		rdev->ppl.size = le16_to_cpu(sb->ppl.size);
		rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
	}

	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) &&
	    sb->level != 0)
		return -EINVAL;

	/* not spare disk */
	if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
	    (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
	     le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
		spare_disk = false;

	if (!refdev) {
		if (!spare_disk)
			ret = 1;
		else
			ret = 0;
	} else {
		__u64 ev1, ev2;
		struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);

		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
		    sb->level != refsb->level ||
		    sb->layout != refsb->layout ||
		    sb->chunksize != refsb->chunksize) {
			pr_warn("md: %pg has strangely different superblock to %pg\n",
				rdev->bdev,
				refdev->bdev);
			return -EINVAL;
		}
		ev1 = le64_to_cpu(sb->events);
		ev2 = le64_to_cpu(refsb->events);

		if (!spare_disk && ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	if (minor_version)
		sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset;
	else
		sectors = rdev->sb_start;
	if (sectors < le64_to_cpu(sb->data_size))
		return -EINVAL;
	rdev->sectors = le64_to_cpu(sb->data_size);
	return ret;
}
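/*
 * Worked example for the on-disk bad-block format decoded above: each
 * 64-bit entry packs the start sector in the high 54 bits and a length
 * in the low 10 bits. With bblog_shift == 0, the entry 0x12005 means 5
 * bad sectors starting at sector 0x48 (0x12005 >> 10); an all-ones entry
 * terminates the list.
 */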
static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	__u64 ev1 = le64_to_cpu(sb->events);
	int role;

	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(Bitmap_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);

	if (mddev->raid_disks == 0) {
		mddev->major_version = 1;
		mddev->patch_version = 0;
		mddev->external = 0;
		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
		mddev->ctime = le64_to_cpu(sb->ctime);
		mddev->utime = le64_to_cpu(sb->utime);
		mddev->level = le32_to_cpu(sb->level);
		mddev->clevel[0] = 0;
		mddev->layout = le32_to_cpu(sb->layout);
		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
		mddev->dev_sectors = le64_to_cpu(sb->size);
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
		/* Default location for bitmap is 1K after superblock
		 * using 3K - total of 4K
		 */
		mddev->bitmap_info.default_offset = 1024 >> 9;
		mddev->bitmap_info.default_space = (4096-1024) >> 9;
		mddev->reshape_backwards = 0;

		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
		memcpy(mddev->uuid, sb->set_uuid, 16);

		mddev->max_disks =  (4096-256)/2;

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
		    mddev->bitmap_info.file == NULL) {
			mddev->bitmap_info.offset =
				(__s32)le32_to_cpu(sb->bitmap_offset);
			/* Metadata doesn't record how much space is available.
			 * For 1.0, we assume we can use up to the superblock
			 * if before, else to 4K beyond superblock.
			 * For others, assume no change is possible.
			 */
			if (mddev->minor_version > 0)
				mddev->bitmap_info.space = 0;
			else if (mddev->bitmap_info.offset > 0)
				mddev->bitmap_info.space =
					8 - mddev->bitmap_info.offset;
			else
				mddev->bitmap_info.space =
					-mddev->bitmap_info.offset;
		}

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
			mddev->new_level = le32_to_cpu(sb->new_level);
			mddev->new_layout = le32_to_cpu(sb->new_layout);
			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
			if (mddev->delta_disks < 0 ||
			    (mddev->delta_disks == 0 &&
			     (le32_to_cpu(sb->feature_map)
			      & MD_FEATURE_RESHAPE_BACKWARDS)))
				mddev->reshape_backwards = 1;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}

		if (mddev->level == 0 &&
		    !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT))
			mddev->layout = -1;

		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
			set_bit(MD_HAS_JOURNAL, &mddev->flags);

		if (le32_to_cpu(sb->feature_map) &
		    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) {
			if (le32_to_cpu(sb->feature_map) &
			    (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
				return -EINVAL;
			if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
			    (le32_to_cpu(sb->feature_map) &
			     MD_FEATURE_MULTIPLE_PPLS))
				return -EINVAL;
			set_bit(MD_HAS_PPL, &mddev->flags);
		}
	} else if (mddev->pers == NULL) {
		/* Insist on a good event counter while assembling, except for
		 * spares (which don't need an event count).
		 * Similar to mdadm, we allow event counter difference of 1
		 * from the freshest device.
1947 */ 1948 if (rdev->desc_nr >= 0 && 1949 rdev->desc_nr < le32_to_cpu(sb->max_dev) && 1950 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || 1951 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) 1952 if (ev1 + 1 < mddev->events) 1953 return -EINVAL; 1954 } else if (mddev->bitmap) { 1955 /* If adding to array with a bitmap, then we can accept an 1956 * older device, but not too old. 1957 */ 1958 if (ev1 < mddev->bitmap->events_cleared) 1959 return 0; 1960 if (ev1 < mddev->events) 1961 set_bit(Bitmap_sync, &rdev->flags); 1962 } else { 1963 if (ev1 < mddev->events) 1964 /* just a hot-add of a new device, leave raid_disk at -1 */ 1965 return 0; 1966 } 1967 1968 if (rdev->desc_nr < 0 || 1969 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { 1970 role = MD_DISK_ROLE_SPARE; 1971 rdev->desc_nr = -1; 1972 } else if (mddev->pers == NULL && freshest && ev1 < mddev->events) { 1973 /* 1974 * If we are assembling, and our event counter is smaller than the 1975 * highest event counter, we cannot trust our superblock about the role. 1976 * It could happen that our rdev was marked as Faulty, and all other 1977 * superblocks were updated with +1 event counter. 1978 * Then, before the next superblock update, which typically happens when 1979 * remove_and_add_spares() removes the device from the array, there was 1980 * a crash or reboot. 1981 * If we allow current rdev without consulting the freshest superblock, 1982 * we could cause data corruption. 1983 * Note that in this case our event counter is smaller by 1 than the 1984 * highest, otherwise, this rdev would not be allowed into array; 1985 * both kernel and mdadm allow event counter difference of 1. 1986 */ 1987 struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page); 1988 u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev); 1989 1990 if (rdev->desc_nr >= freshest_max_dev) { 1991 /* this is unexpected, better not proceed */ 1992 pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n", 1993 mdname(mddev), rdev->bdev, rdev->desc_nr, 1994 freshest->bdev, freshest_max_dev); 1995 return -EUCLEAN; 1996 } 1997 1998 role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]); 1999 pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n", 2000 mdname(mddev), rdev->bdev, role, role, freshest->bdev); 2001 } else { 2002 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 2003 } 2004 switch (role) { 2005 case MD_DISK_ROLE_SPARE: /* spare */ 2006 break; 2007 case MD_DISK_ROLE_FAULTY: /* faulty */ 2008 set_bit(Faulty, &rdev->flags); 2009 break; 2010 case MD_DISK_ROLE_JOURNAL: /* journal device */ 2011 if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { 2012 /* journal device without journal feature */ 2013 pr_warn("md: journal device provided without journal feature, ignoring the device\n"); 2014 return -EINVAL; 2015 } 2016 set_bit(Journal, &rdev->flags); 2017 rdev->journal_tail = le64_to_cpu(sb->journal_tail); 2018 rdev->raid_disk = 0; 2019 break; 2020 default: 2021 rdev->saved_raid_disk = role; 2022 if ((le32_to_cpu(sb->feature_map) & 2023 MD_FEATURE_RECOVERY_OFFSET)) { 2024 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 2025 if (!(le32_to_cpu(sb->feature_map) & 2026 MD_FEATURE_RECOVERY_BITMAP)) 2027 rdev->saved_raid_disk = -1; 2028 } else { 2029 /* 2030 * If the array is FROZEN, then the device can't 2031 * be in_sync with rest of array. 
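* In_sync is deliberately left clear in that case, so the device will
* be recovered once the array is unfrozen.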
2032 */ 2033 if (!test_bit(MD_RECOVERY_FROZEN, 2034 &mddev->recovery)) 2035 set_bit(In_sync, &rdev->flags); 2036 } 2037 rdev->raid_disk = role; 2038 break; 2039 } 2040 if (sb->devflags & WriteMostly1) 2041 set_bit(WriteMostly, &rdev->flags); 2042 if (sb->devflags & FailFast1) 2043 set_bit(FailFast, &rdev->flags); 2044 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) 2045 set_bit(Replacement, &rdev->flags); 2046 2047 return 0; 2048 } 2049 2050 static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) 2051 { 2052 struct mdp_superblock_1 *sb; 2053 struct md_rdev *rdev2; 2054 int max_dev, i; 2055 /* make rdev->sb match mddev and rdev data. */ 2056 2057 sb = page_address(rdev->sb_page); 2058 2059 sb->feature_map = 0; 2060 sb->pad0 = 0; 2061 sb->recovery_offset = cpu_to_le64(0); 2062 memset(sb->pad3, 0, sizeof(sb->pad3)); 2063 2064 sb->utime = cpu_to_le64((__u64)mddev->utime); 2065 sb->events = cpu_to_le64(mddev->events); 2066 if (mddev->in_sync) 2067 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 2068 else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) 2069 sb->resync_offset = cpu_to_le64(MaxSector); 2070 else 2071 sb->resync_offset = cpu_to_le64(0); 2072 2073 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); 2074 2075 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 2076 sb->size = cpu_to_le64(mddev->dev_sectors); 2077 sb->chunksize = cpu_to_le32(mddev->chunk_sectors); 2078 sb->level = cpu_to_le32(mddev->level); 2079 sb->layout = cpu_to_le32(mddev->layout); 2080 if (test_bit(FailFast, &rdev->flags)) 2081 sb->devflags |= FailFast1; 2082 else 2083 sb->devflags &= ~FailFast1; 2084 2085 if (test_bit(WriteMostly, &rdev->flags)) 2086 sb->devflags |= WriteMostly1; 2087 else 2088 sb->devflags &= ~WriteMostly1; 2089 sb->data_offset = cpu_to_le64(rdev->data_offset); 2090 sb->data_size = cpu_to_le64(rdev->sectors); 2091 2092 if (mddev->bitmap && mddev->bitmap_info.file == NULL) { 2093 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); 2094 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 2095 } 2096 2097 if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && 2098 !test_bit(In_sync, &rdev->flags)) { 2099 sb->feature_map |= 2100 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 2101 sb->recovery_offset = 2102 cpu_to_le64(rdev->recovery_offset); 2103 if (rdev->saved_raid_disk >= 0 && mddev->bitmap) 2104 sb->feature_map |= 2105 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); 2106 } 2107 /* Note: recovery_offset and journal_tail share space */ 2108 if (test_bit(Journal, &rdev->flags)) 2109 sb->journal_tail = cpu_to_le64(rdev->journal_tail); 2110 if (test_bit(Replacement, &rdev->flags)) 2111 sb->feature_map |= 2112 cpu_to_le32(MD_FEATURE_REPLACEMENT); 2113 2114 if (mddev->reshape_position != MaxSector) { 2115 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 2116 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 2117 sb->new_layout = cpu_to_le32(mddev->new_layout); 2118 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 2119 sb->new_level = cpu_to_le32(mddev->new_level); 2120 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 2121 if (mddev->delta_disks == 0 && 2122 mddev->reshape_backwards) 2123 sb->feature_map 2124 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); 2125 if (rdev->new_data_offset != rdev->data_offset) { 2126 sb->feature_map 2127 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); 2128 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset 2129 - rdev->data_offset)); 2130 } 2131 } 2132 2133 if 
(mddev_is_clustered(mddev)) 2134 sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); 2135 2136 if (rdev->badblocks.count == 0) 2137 /* Nothing to do for bad blocks*/ ; 2138 else if (sb->bblog_offset == 0) 2139 /* Cannot record bad blocks on this device */ 2140 md_error(mddev, rdev); 2141 else { 2142 struct badblocks *bb = &rdev->badblocks; 2143 __le64 *bbp = (__le64 *)page_address(rdev->bb_page); 2144 u64 *p = bb->page; 2145 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); 2146 if (bb->changed) { 2147 unsigned seq; 2148 2149 retry: 2150 seq = read_seqbegin(&bb->lock); 2151 2152 memset(bbp, 0xff, PAGE_SIZE); 2153 2154 for (i = 0 ; i < bb->count ; i++) { 2155 u64 internal_bb = p[i]; 2156 u64 store_bb = ((BB_OFFSET(internal_bb) << 10) 2157 | BB_LEN(internal_bb)); 2158 bbp[i] = cpu_to_le64(store_bb); 2159 } 2160 bb->changed = 0; 2161 if (read_seqretry(&bb->lock, seq)) 2162 goto retry; 2163 2164 bb->sector = (rdev->sb_start + 2165 (int)le32_to_cpu(sb->bblog_offset)); 2166 bb->size = le16_to_cpu(sb->bblog_size); 2167 } 2168 } 2169 2170 max_dev = 0; 2171 rdev_for_each(rdev2, mddev) 2172 if (rdev2->desc_nr+1 > max_dev) 2173 max_dev = rdev2->desc_nr+1; 2174 2175 if (max_dev > le32_to_cpu(sb->max_dev)) { 2176 int bmask; 2177 sb->max_dev = cpu_to_le32(max_dev); 2178 rdev->sb_size = max_dev * 2 + 256; 2179 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 2180 if (rdev->sb_size & bmask) 2181 rdev->sb_size = (rdev->sb_size | bmask) + 1; 2182 } else 2183 max_dev = le32_to_cpu(sb->max_dev); 2184 2185 for (i=0; i<max_dev;i++) 2186 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 2187 2188 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) 2189 sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); 2190 2191 if (test_bit(MD_HAS_PPL, &mddev->flags)) { 2192 if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags)) 2193 sb->feature_map |= 2194 cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS); 2195 else 2196 sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL); 2197 sb->ppl.offset = cpu_to_le16(rdev->ppl.offset); 2198 sb->ppl.size = cpu_to_le16(rdev->ppl.size); 2199 } 2200 2201 rdev_for_each(rdev2, mddev) { 2202 i = rdev2->desc_nr; 2203 if (test_bit(Faulty, &rdev2->flags)) 2204 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); 2205 else if (test_bit(In_sync, &rdev2->flags)) 2206 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 2207 else if (test_bit(Journal, &rdev2->flags)) 2208 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); 2209 else if (rdev2->raid_disk >= 0) 2210 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 2211 else 2212 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 2213 } 2214 2215 sb->sb_csum = calc_sb_1_csum(sb); 2216 } 2217 2218 static sector_t super_1_choose_bm_space(sector_t dev_size) 2219 { 2220 sector_t bm_space; 2221 2222 /* if the device is bigger than 8Gig, save 64k for bitmap 2223 * usage, if bigger than 200Gig, save 128k 2224 */ 2225 if (dev_size < 64*2) 2226 bm_space = 0; 2227 else if (dev_size - 64*2 >= 200*1024*1024*2) 2228 bm_space = 128*2; 2229 else if (dev_size - 4*2 > 8*1024*1024*2) 2230 bm_space = 64*2; 2231 else 2232 bm_space = 4*2; 2233 return bm_space; 2234 } 2235 2236 static unsigned long long 2237 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 2238 { 2239 struct mdp_superblock_1 *sb; 2240 sector_t max_sectors; 2241 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 2242 return 0; /* component must fit device */ 2243 if (rdev->data_offset != rdev->new_data_offset) 2244 return 0; /* too confusing */ 2245 if (rdev->sb_start < 
rdev->data_offset) {
2246 /* minor versions 1 and 2; superblock before data */
2247 max_sectors = bdev_nr_sectors(rdev->bdev) - rdev->data_offset;
2248 if (!num_sectors || num_sectors > max_sectors)
2249 num_sectors = max_sectors;
2250 } else if (rdev->mddev->bitmap_info.offset) {
2251 /* minor version 0 with bitmap we can't move */
2252 return 0;
2253 } else {
2254 /* minor version 0; superblock after data */
2255 sector_t sb_start, bm_space;
2256 sector_t dev_size = bdev_nr_sectors(rdev->bdev);
2257
2258 /* 8K is for superblock */
2259 sb_start = dev_size - 8*2;
2260 sb_start &= ~(sector_t)(4*2 - 1);
2261
2262 bm_space = super_1_choose_bm_space(dev_size);
2263
2264 /* Space that can be used to store data needs to decrease
2265 * superblock bitmap space and bad block space (4K)
2266 */
2267 max_sectors = sb_start - bm_space - 4*2;
2268
2269 if (!num_sectors || num_sectors > max_sectors)
2270 num_sectors = max_sectors;
2271 rdev->sb_start = sb_start;
2272 }
2273 sb = page_address(rdev->sb_page);
2274 sb->data_size = cpu_to_le64(num_sectors);
2275 sb->super_offset = cpu_to_le64(rdev->sb_start);
2276 sb->sb_csum = calc_sb_1_csum(sb);
2277 do {
2278 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
2279 rdev->sb_page);
2280 } while (md_super_wait(rdev->mddev) < 0);
2281 return num_sectors;
2282
2283 }
2284
2285 static int
2286 super_1_allow_new_offset(struct md_rdev *rdev,
2287 unsigned long long new_offset)
2288 {
2289 /* All necessary checks on new >= old have been done */
2290 struct bitmap *bitmap;
2291 if (new_offset >= rdev->data_offset)
2292 return 1;
2293
2294 /* with 1.0 metadata, there is no metadata to tread on
2295 * so we can always move back */
2296 if (rdev->mddev->minor_version == 0)
2297 return 1;
2298
2299 /* otherwise we must be sure not to step on
2300 * any metadata, so stay:
2301 * 36K beyond start of superblock
2302 * beyond end of badblocks
2303 * beyond write-intent bitmap
2304 */
2305 if (rdev->sb_start + (32+4)*2 > new_offset)
2306 return 0;
2307 bitmap = rdev->mddev->bitmap;
2308 if (bitmap && !rdev->mddev->bitmap_info.file &&
2309 rdev->sb_start + rdev->mddev->bitmap_info.offset +
2310 bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
2311 return 0;
2312 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
2313 return 0;
2314
2315 return 1;
2316 }
2317
2318 static struct super_type super_types[] = {
2319 [0] = {
2320 .name = "0.90.0",
2321 .owner = THIS_MODULE,
2322 .load_super = super_90_load,
2323 .validate_super = super_90_validate,
2324 .sync_super = super_90_sync,
2325 .rdev_size_change = super_90_rdev_size_change,
2326 .allow_new_offset = super_90_allow_new_offset,
2327 },
2328 [1] = {
2329 .name = "md-1",
2330 .owner = THIS_MODULE,
2331 .load_super = super_1_load,
2332 .validate_super = super_1_validate,
2333 .sync_super = super_1_sync,
2334 .rdev_size_change = super_1_rdev_size_change,
2335 .allow_new_offset = super_1_allow_new_offset,
2336 },
2337 };
2338
2339 static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
2340 {
2341 if (mddev->sync_super) {
2342 mddev->sync_super(mddev, rdev);
2343 return;
2344 }
2345
2346 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
2347
2348 super_types[mddev->major_version].sync_super(mddev, rdev);
2349 }
2350
2351 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
2352 {
2353 struct md_rdev *rdev, *rdev2;
2354
2355 rcu_read_lock();
2356 rdev_for_each_rcu(rdev, mddev1) {
2357 if (test_bit(Faulty, &rdev->flags) ||
2358 test_bit(Journal, &rdev->flags)
||
2359 rdev->raid_disk == -1)
2360 continue;
2361 rdev_for_each_rcu(rdev2, mddev2) {
2362 if (test_bit(Faulty, &rdev2->flags) ||
2363 test_bit(Journal, &rdev2->flags) ||
2364 rdev2->raid_disk == -1)
2365 continue;
2366 if (rdev->bdev->bd_disk == rdev2->bdev->bd_disk) {
2367 rcu_read_unlock();
2368 return 1;
2369 }
2370 }
2371 }
2372 rcu_read_unlock();
2373 return 0;
2374 }
2375
2376 static LIST_HEAD(pending_raid_disks);
2377
2378 /*
2379 * Try to register data integrity profile for an mddev
2380 *
2381 * This is called when an array is started and after a disk has been kicked
2382 * from the array. It only succeeds if all working and active component devices
2383 * are integrity capable with matching profiles.
2384 */
2385 int md_integrity_register(struct mddev *mddev)
2386 {
2387 struct md_rdev *rdev, *reference = NULL;
2388
2389 if (list_empty(&mddev->disks))
2390 return 0; /* nothing to do */
2391 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
2392 return 0; /* shouldn't register, or already is */
2393 rdev_for_each(rdev, mddev) {
2394 /* skip spares and non-functional disks */
2395 if (test_bit(Faulty, &rdev->flags))
2396 continue;
2397 if (rdev->raid_disk < 0)
2398 continue;
2399 if (!reference) {
2400 /* Use the first rdev as the reference */
2401 reference = rdev;
2402 continue;
2403 }
2404 /* does this rdev's profile match the reference profile? */
2405 if (blk_integrity_compare(reference->bdev->bd_disk,
2406 rdev->bdev->bd_disk) < 0)
2407 return -EINVAL;
2408 }
2409 if (!reference || !bdev_get_integrity(reference->bdev))
2410 return 0;
2411 /*
2412 * All component devices are integrity capable and have matching
2413 * profiles, register the common profile for the md device.
2414 */
2415 blk_integrity_register(mddev->gendisk,
2416 bdev_get_integrity(reference->bdev));
2417
2418 pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
2419 if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) ||
2420 (mddev->level != 1 && mddev->level != 10 &&
2421 bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) {
2422 /*
2423 * No need to handle the failure of bioset_integrity_create,
2424 * because the function is called by md_run() -> pers->run(),
2425 * md_run calls bioset_exit -> bioset_integrity_free in case
2426 * of failure.
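* (Note: the condition above short-circuits, and levels 1 and 10
* never create an integrity pool for io_clone_set at all.)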
2427 */ 2428 pr_err("md: failed to create integrity pool for %s\n", 2429 mdname(mddev)); 2430 return -EINVAL; 2431 } 2432 return 0; 2433 } 2434 EXPORT_SYMBOL(md_integrity_register); 2435 2436 /* 2437 * Attempt to add an rdev, but only if it is consistent with the current 2438 * integrity profile 2439 */ 2440 int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) 2441 { 2442 struct blk_integrity *bi_mddev; 2443 2444 if (!mddev->gendisk) 2445 return 0; 2446 2447 bi_mddev = blk_get_integrity(mddev->gendisk); 2448 2449 if (!bi_mddev) /* nothing to do */ 2450 return 0; 2451 2452 if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { 2453 pr_err("%s: incompatible integrity profile for %pg\n", 2454 mdname(mddev), rdev->bdev); 2455 return -ENXIO; 2456 } 2457 2458 return 0; 2459 } 2460 EXPORT_SYMBOL(md_integrity_add_rdev); 2461 2462 static bool rdev_read_only(struct md_rdev *rdev) 2463 { 2464 return bdev_read_only(rdev->bdev) || 2465 (rdev->meta_bdev && bdev_read_only(rdev->meta_bdev)); 2466 } 2467 2468 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) 2469 { 2470 char b[BDEVNAME_SIZE]; 2471 int err; 2472 2473 /* prevent duplicates */ 2474 if (find_rdev(mddev, rdev->bdev->bd_dev)) 2475 return -EEXIST; 2476 2477 if (rdev_read_only(rdev) && mddev->pers) 2478 return -EROFS; 2479 2480 /* make sure rdev->sectors exceeds mddev->dev_sectors */ 2481 if (!test_bit(Journal, &rdev->flags) && 2482 rdev->sectors && 2483 (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { 2484 if (mddev->pers) { 2485 /* Cannot change size, so fail 2486 * If mddev->level <= 0, then we don't care 2487 * about aligning sizes (e.g. linear) 2488 */ 2489 if (mddev->level > 0) 2490 return -ENOSPC; 2491 } else 2492 mddev->dev_sectors = rdev->sectors; 2493 } 2494 2495 /* Verify rdev->desc_nr is unique. 
2496 * If it is -1, assign a free number, else 2497 * check number is not in use 2498 */ 2499 rcu_read_lock(); 2500 if (rdev->desc_nr < 0) { 2501 int choice = 0; 2502 if (mddev->pers) 2503 choice = mddev->raid_disks; 2504 while (md_find_rdev_nr_rcu(mddev, choice)) 2505 choice++; 2506 rdev->desc_nr = choice; 2507 } else { 2508 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { 2509 rcu_read_unlock(); 2510 return -EBUSY; 2511 } 2512 } 2513 rcu_read_unlock(); 2514 if (!test_bit(Journal, &rdev->flags) && 2515 mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { 2516 pr_warn("md: %s: array is limited to %d devices\n", 2517 mdname(mddev), mddev->max_disks); 2518 return -EBUSY; 2519 } 2520 snprintf(b, sizeof(b), "%pg", rdev->bdev); 2521 strreplace(b, '/', '!'); 2522 2523 rdev->mddev = mddev; 2524 pr_debug("md: bind<%s>\n", b); 2525 2526 if (mddev->raid_disks) 2527 mddev_create_serial_pool(mddev, rdev); 2528 2529 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 2530 goto fail; 2531 2532 /* failure here is OK */ 2533 err = sysfs_create_link(&rdev->kobj, bdev_kobj(rdev->bdev), "block"); 2534 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); 2535 rdev->sysfs_unack_badblocks = 2536 sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks"); 2537 rdev->sysfs_badblocks = 2538 sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks"); 2539 2540 list_add_rcu(&rdev->same_set, &mddev->disks); 2541 bd_link_disk_holder(rdev->bdev, mddev->gendisk); 2542 2543 /* May as well allow recovery to be retried once */ 2544 mddev->recovery_disabled++; 2545 2546 return 0; 2547 2548 fail: 2549 pr_warn("md: failed to register dev-%s for %s\n", 2550 b, mdname(mddev)); 2551 return err; 2552 } 2553 2554 void md_autodetect_dev(dev_t dev); 2555 2556 /* just for claiming the bdev */ 2557 static struct md_rdev claim_rdev; 2558 2559 static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) 2560 { 2561 pr_debug("md: export_rdev(%pg)\n", rdev->bdev); 2562 md_rdev_clear(rdev); 2563 #ifndef MODULE 2564 if (test_bit(AutoDetected, &rdev->flags)) 2565 md_autodetect_dev(rdev->bdev->bd_dev); 2566 #endif 2567 bdev_release(rdev->bdev_handle); 2568 rdev->bdev = NULL; 2569 kobject_put(&rdev->kobj); 2570 } 2571 2572 static void md_kick_rdev_from_array(struct md_rdev *rdev) 2573 { 2574 struct mddev *mddev = rdev->mddev; 2575 2576 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); 2577 list_del_rcu(&rdev->same_set); 2578 pr_debug("md: unbind<%pg>\n", rdev->bdev); 2579 mddev_destroy_serial_pool(rdev->mddev, rdev); 2580 rdev->mddev = NULL; 2581 sysfs_remove_link(&rdev->kobj, "block"); 2582 sysfs_put(rdev->sysfs_state); 2583 sysfs_put(rdev->sysfs_unack_badblocks); 2584 sysfs_put(rdev->sysfs_badblocks); 2585 rdev->sysfs_state = NULL; 2586 rdev->sysfs_unack_badblocks = NULL; 2587 rdev->sysfs_badblocks = NULL; 2588 rdev->badblocks.count = 0; 2589 2590 synchronize_rcu(); 2591 2592 /* 2593 * kobject_del() will wait for all in progress writers to be done, where 2594 * reconfig_mutex is held, hence it can't be called under 2595 * reconfig_mutex and it's delayed to mddev_unlock(). 
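* The rdev is parked on mddev->deleting below instead; mddev_unlock()
* drains that list and performs the final kobject_del() and export of
* the rdev.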
2596 */ 2597 list_add(&rdev->same_set, &mddev->deleting); 2598 } 2599 2600 static void export_array(struct mddev *mddev) 2601 { 2602 struct md_rdev *rdev; 2603 2604 while (!list_empty(&mddev->disks)) { 2605 rdev = list_first_entry(&mddev->disks, struct md_rdev, 2606 same_set); 2607 md_kick_rdev_from_array(rdev); 2608 } 2609 mddev->raid_disks = 0; 2610 mddev->major_version = 0; 2611 } 2612 2613 static bool set_in_sync(struct mddev *mddev) 2614 { 2615 lockdep_assert_held(&mddev->lock); 2616 if (!mddev->in_sync) { 2617 mddev->sync_checkers++; 2618 spin_unlock(&mddev->lock); 2619 percpu_ref_switch_to_atomic_sync(&mddev->writes_pending); 2620 spin_lock(&mddev->lock); 2621 if (!mddev->in_sync && 2622 percpu_ref_is_zero(&mddev->writes_pending)) { 2623 mddev->in_sync = 1; 2624 /* 2625 * Ensure ->in_sync is visible before we clear 2626 * ->sync_checkers. 2627 */ 2628 smp_mb(); 2629 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2630 sysfs_notify_dirent_safe(mddev->sysfs_state); 2631 } 2632 if (--mddev->sync_checkers == 0) 2633 percpu_ref_switch_to_percpu(&mddev->writes_pending); 2634 } 2635 if (mddev->safemode == 1) 2636 mddev->safemode = 0; 2637 return mddev->in_sync; 2638 } 2639 2640 static void sync_sbs(struct mddev *mddev, int nospares) 2641 { 2642 /* Update each superblock (in-memory image), but 2643 * if we are allowed to, skip spares which already 2644 * have the right event counter, or have one earlier 2645 * (which would mean they aren't being marked as dirty 2646 * with the rest of the array) 2647 */ 2648 struct md_rdev *rdev; 2649 rdev_for_each(rdev, mddev) { 2650 if (rdev->sb_events == mddev->events || 2651 (nospares && 2652 rdev->raid_disk < 0 && 2653 rdev->sb_events+1 == mddev->events)) { 2654 /* Don't update this superblock */ 2655 rdev->sb_loaded = 2; 2656 } else { 2657 sync_super(mddev, rdev); 2658 rdev->sb_loaded = 1; 2659 } 2660 } 2661 } 2662 2663 static bool does_sb_need_changing(struct mddev *mddev) 2664 { 2665 struct md_rdev *rdev = NULL, *iter; 2666 struct mdp_superblock_1 *sb; 2667 int role; 2668 2669 /* Find a good rdev */ 2670 rdev_for_each(iter, mddev) 2671 if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) { 2672 rdev = iter; 2673 break; 2674 } 2675 2676 /* No good device found. */ 2677 if (!rdev) 2678 return false; 2679 2680 sb = page_address(rdev->sb_page); 2681 /* Check if a device has become faulty or a spare become active */ 2682 rdev_for_each(rdev, mddev) { 2683 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 2684 /* Device activated? */ 2685 if (role == MD_DISK_ROLE_SPARE && rdev->raid_disk >= 0 && 2686 !test_bit(Faulty, &rdev->flags)) 2687 return true; 2688 /* Device turned faulty? 
*/
2689 if (test_bit(Faulty, &rdev->flags) && (role < MD_DISK_ROLE_MAX))
2690 return true;
2691 }
2692
2693 /* Check if any mddev parameters have changed */
2694 if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
2695 (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
2696 (mddev->layout != le32_to_cpu(sb->layout)) ||
2697 (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
2698 (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
2699 return true;
2700
2701 return false;
2702 }
2703
2704 void md_update_sb(struct mddev *mddev, int force_change)
2705 {
2706 struct md_rdev *rdev;
2707 int sync_req;
2708 int nospares = 0;
2709 int any_badblocks_changed = 0;
2710 int ret = -1;
2711
2712 if (!md_is_rdwr(mddev)) {
2713 if (force_change)
2714 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2715 return;
2716 }
2717
2718 repeat:
2719 if (mddev_is_clustered(mddev)) {
2720 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2721 force_change = 1;
2722 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2723 nospares = 1;
2724 ret = md_cluster_ops->metadata_update_start(mddev);
2725 /* Has someone else updated the sb? */
2726 if (!does_sb_need_changing(mddev)) {
2727 if (ret == 0)
2728 md_cluster_ops->metadata_update_cancel(mddev);
2729 bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2730 BIT(MD_SB_CHANGE_DEVS) |
2731 BIT(MD_SB_CHANGE_CLEAN));
2732 return;
2733 }
2734 }
2735
2736 /*
2737 * First make sure individual recovery_offsets are correct
2738 * curr_resync_completed can only be used during recovery.
2739 * During reshape/resync it might use array-addresses rather
2740 * than device addresses.
2741 */
2742 rdev_for_each(rdev, mddev) {
2743 if (rdev->raid_disk >= 0 &&
2744 mddev->delta_disks >= 0 &&
2745 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
2746 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
2747 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
2748 !test_bit(Journal, &rdev->flags) &&
2749 !test_bit(In_sync, &rdev->flags) &&
2750 mddev->curr_resync_completed > rdev->recovery_offset)
2751 rdev->recovery_offset = mddev->curr_resync_completed;
2752
2753 }
2754 if (!mddev->persistent) {
2755 clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2756 clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2757 if (!mddev->external) {
2758 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
2759 rdev_for_each(rdev, mddev) {
2760 if (rdev->badblocks.changed) {
2761 rdev->badblocks.changed = 0;
2762 ack_all_badblocks(&rdev->badblocks);
2763 md_error(mddev, rdev);
2764 }
2765 clear_bit(Blocked, &rdev->flags);
2766 clear_bit(BlockedBadBlocks, &rdev->flags);
2767 wake_up(&rdev->blocked_wait);
2768 }
2769 }
2770 wake_up(&mddev->sb_wait);
2771 return;
2772 }
2773
2774 spin_lock(&mddev->lock);
2775
2776 mddev->utime = ktime_get_real_seconds();
2777
2778 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2779 force_change = 1;
2780 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2781 /* just a clean<-> dirty transition, possibly leave spares alone,
2782 * though if events isn't the right even/odd, we will have to do
2783 * spares after all
2784 */
2785 nospares = 1;
2786 if (force_change)
2787 nospares = 0;
2788 if (mddev->degraded)
2789 /* If the array is degraded, then skipping spares is both
2790 * dangerous and fairly pointless.
2791 * Dangerous because a device that was removed from the array
2792 * might have an event_count that still looks up-to-date,
2793 * so it can be re-added without a resync.
2794 * Pointless because if there are any spares to skip, 2795 * then a recovery will happen and soon that array won't 2796 * be degraded any more and the spare can go back to sleep then. 2797 */ 2798 nospares = 0; 2799 2800 sync_req = mddev->in_sync; 2801 2802 /* If this is just a dirty<->clean transition, and the array is clean 2803 * and 'events' is odd, we can roll back to the previous clean state */ 2804 if (nospares 2805 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 2806 && mddev->can_decrease_events 2807 && mddev->events != 1) { 2808 mddev->events--; 2809 mddev->can_decrease_events = 0; 2810 } else { 2811 /* otherwise we have to go forward and ... */ 2812 mddev->events ++; 2813 mddev->can_decrease_events = nospares; 2814 } 2815 2816 /* 2817 * This 64-bit counter should never wrap. 2818 * Either we are in around ~1 trillion A.C., assuming 2819 * 1 reboot per second, or we have a bug... 2820 */ 2821 WARN_ON(mddev->events == 0); 2822 2823 rdev_for_each(rdev, mddev) { 2824 if (rdev->badblocks.changed) 2825 any_badblocks_changed++; 2826 if (test_bit(Faulty, &rdev->flags)) 2827 set_bit(FaultRecorded, &rdev->flags); 2828 } 2829 2830 sync_sbs(mddev, nospares); 2831 spin_unlock(&mddev->lock); 2832 2833 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", 2834 mdname(mddev), mddev->in_sync); 2835 2836 if (mddev->queue) 2837 blk_add_trace_msg(mddev->queue, "md md_update_sb"); 2838 rewrite: 2839 md_bitmap_update_sb(mddev->bitmap); 2840 rdev_for_each(rdev, mddev) { 2841 if (rdev->sb_loaded != 1) 2842 continue; /* no noise on spare devices */ 2843 2844 if (!test_bit(Faulty, &rdev->flags)) { 2845 md_super_write(mddev,rdev, 2846 rdev->sb_start, rdev->sb_size, 2847 rdev->sb_page); 2848 pr_debug("md: (write) %pg's sb offset: %llu\n", 2849 rdev->bdev, 2850 (unsigned long long)rdev->sb_start); 2851 rdev->sb_events = mddev->events; 2852 if (rdev->badblocks.size) { 2853 md_super_write(mddev, rdev, 2854 rdev->badblocks.sector, 2855 rdev->badblocks.size << 9, 2856 rdev->bb_page); 2857 rdev->badblocks.size = 0; 2858 } 2859 2860 } else 2861 pr_debug("md: %pg (skipping faulty)\n", 2862 rdev->bdev); 2863 } 2864 if (md_super_wait(mddev) < 0) 2865 goto rewrite; 2866 /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ 2867 2868 if (mddev_is_clustered(mddev) && ret == 0) 2869 md_cluster_ops->metadata_update_finish(mddev); 2870 2871 if (mddev->in_sync != sync_req || 2872 !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2873 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN))) 2874 /* have to write it out again */ 2875 goto repeat; 2876 wake_up(&mddev->sb_wait); 2877 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2878 sysfs_notify_dirent_safe(mddev->sysfs_completed); 2879 2880 rdev_for_each(rdev, mddev) { 2881 if (test_and_clear_bit(FaultRecorded, &rdev->flags)) 2882 clear_bit(Blocked, &rdev->flags); 2883 2884 if (any_badblocks_changed) 2885 ack_all_badblocks(&rdev->badblocks); 2886 clear_bit(BlockedBadBlocks, &rdev->flags); 2887 wake_up(&rdev->blocked_wait); 2888 } 2889 } 2890 EXPORT_SYMBOL(md_update_sb); 2891 2892 static int add_bound_rdev(struct md_rdev *rdev) 2893 { 2894 struct mddev *mddev = rdev->mddev; 2895 int err = 0; 2896 bool add_journal = test_bit(Journal, &rdev->flags); 2897 2898 if (!mddev->pers->hot_remove_disk || add_journal) { 2899 /* If there is hot_add_disk but no hot_remove_disk 2900 * then added disks for geometry changes, 2901 * and should be added immediately. 2902 */ 2903 super_types[mddev->major_version]. 
2904 validate_super(mddev, NULL/*freshest*/, rdev); 2905 err = mddev->pers->hot_add_disk(mddev, rdev); 2906 if (err) { 2907 md_kick_rdev_from_array(rdev); 2908 return err; 2909 } 2910 } 2911 sysfs_notify_dirent_safe(rdev->sysfs_state); 2912 2913 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2914 if (mddev->degraded) 2915 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 2916 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2917 md_new_event(); 2918 md_wakeup_thread(mddev->thread); 2919 return 0; 2920 } 2921 2922 /* words written to sysfs files may, or may not, be \n terminated. 2923 * We want to accept with case. For this we use cmd_match. 2924 */ 2925 static int cmd_match(const char *cmd, const char *str) 2926 { 2927 /* See if cmd, written into a sysfs file, matches 2928 * str. They must either be the same, or cmd can 2929 * have a trailing newline 2930 */ 2931 while (*cmd && *str && *cmd == *str) { 2932 cmd++; 2933 str++; 2934 } 2935 if (*cmd == '\n') 2936 cmd++; 2937 if (*str || *cmd) 2938 return 0; 2939 return 1; 2940 } 2941 2942 struct rdev_sysfs_entry { 2943 struct attribute attr; 2944 ssize_t (*show)(struct md_rdev *, char *); 2945 ssize_t (*store)(struct md_rdev *, const char *, size_t); 2946 }; 2947 2948 static ssize_t 2949 state_show(struct md_rdev *rdev, char *page) 2950 { 2951 char *sep = ","; 2952 size_t len = 0; 2953 unsigned long flags = READ_ONCE(rdev->flags); 2954 2955 if (test_bit(Faulty, &flags) || 2956 (!test_bit(ExternalBbl, &flags) && 2957 rdev->badblocks.unacked_exist)) 2958 len += sprintf(page+len, "faulty%s", sep); 2959 if (test_bit(In_sync, &flags)) 2960 len += sprintf(page+len, "in_sync%s", sep); 2961 if (test_bit(Journal, &flags)) 2962 len += sprintf(page+len, "journal%s", sep); 2963 if (test_bit(WriteMostly, &flags)) 2964 len += sprintf(page+len, "write_mostly%s", sep); 2965 if (test_bit(Blocked, &flags) || 2966 (rdev->badblocks.unacked_exist 2967 && !test_bit(Faulty, &flags))) 2968 len += sprintf(page+len, "blocked%s", sep); 2969 if (!test_bit(Faulty, &flags) && 2970 !test_bit(Journal, &flags) && 2971 !test_bit(In_sync, &flags)) 2972 len += sprintf(page+len, "spare%s", sep); 2973 if (test_bit(WriteErrorSeen, &flags)) 2974 len += sprintf(page+len, "write_error%s", sep); 2975 if (test_bit(WantReplacement, &flags)) 2976 len += sprintf(page+len, "want_replacement%s", sep); 2977 if (test_bit(Replacement, &flags)) 2978 len += sprintf(page+len, "replacement%s", sep); 2979 if (test_bit(ExternalBbl, &flags)) 2980 len += sprintf(page+len, "external_bbl%s", sep); 2981 if (test_bit(FailFast, &flags)) 2982 len += sprintf(page+len, "failfast%s", sep); 2983 2984 if (len) 2985 len -= strlen(sep); 2986 2987 return len+sprintf(page+len, "\n"); 2988 } 2989 2990 static ssize_t 2991 state_store(struct md_rdev *rdev, const char *buf, size_t len) 2992 { 2993 /* can write 2994 * faulty - simulates an error 2995 * remove - disconnects the device 2996 * writemostly - sets write_mostly 2997 * -writemostly - clears write_mostly 2998 * blocked - sets the Blocked flags 2999 * -blocked - clears the Blocked and possibly simulates an error 3000 * insync - sets Insync providing device isn't active 3001 * -insync - clear Insync for a device with a slot assigned, 3002 * so that it gets rebuilt based on bitmap 3003 * write_error - sets WriteErrorSeen 3004 * -write_error - clears WriteErrorSeen 3005 * {,-}failfast - set/clear FailFast 3006 */ 3007 3008 struct mddev *mddev = rdev->mddev; 3009 int err = -EINVAL; 3010 bool need_update_sb = false; 3011 3012 if (cmd_match(buf, "faulty") && 
rdev->mddev->pers) {
3013 md_error(rdev->mddev, rdev);
3014
3015 if (test_bit(MD_BROKEN, &rdev->mddev->flags))
3016 err = -EBUSY;
3017 else
3018 err = 0;
3019 } else if (cmd_match(buf, "remove")) {
3020 if (rdev->mddev->pers) {
3021 clear_bit(Blocked, &rdev->flags);
3022 remove_and_add_spares(rdev->mddev, rdev);
3023 }
3024 if (rdev->raid_disk >= 0)
3025 err = -EBUSY;
3026 else {
3027 err = 0;
3028 if (mddev_is_clustered(mddev))
3029 err = md_cluster_ops->remove_disk(mddev, rdev);
3030
3031 if (err == 0) {
3032 md_kick_rdev_from_array(rdev);
3033 if (mddev->pers) {
3034 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
3035 md_wakeup_thread(mddev->thread);
3036 }
3037 md_new_event();
3038 }
3039 }
3040 } else if (cmd_match(buf, "writemostly")) {
3041 set_bit(WriteMostly, &rdev->flags);
3042 mddev_create_serial_pool(rdev->mddev, rdev);
3043 need_update_sb = true;
3044 err = 0;
3045 } else if (cmd_match(buf, "-writemostly")) {
3046 mddev_destroy_serial_pool(rdev->mddev, rdev);
3047 clear_bit(WriteMostly, &rdev->flags);
3048 need_update_sb = true;
3049 err = 0;
3050 } else if (cmd_match(buf, "blocked")) {
3051 set_bit(Blocked, &rdev->flags);
3052 err = 0;
3053 } else if (cmd_match(buf, "-blocked")) {
3054 if (!test_bit(Faulty, &rdev->flags) &&
3055 !test_bit(ExternalBbl, &rdev->flags) &&
3056 rdev->badblocks.unacked_exist) {
3057 /* metadata handler doesn't understand badblocks,
3058 * so we need to fail the device
3059 */
3060 md_error(rdev->mddev, rdev);
3061 }
3062 clear_bit(Blocked, &rdev->flags);
3063 clear_bit(BlockedBadBlocks, &rdev->flags);
3064 wake_up(&rdev->blocked_wait);
3065 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3066 md_wakeup_thread(rdev->mddev->thread);
3067
3068 err = 0;
3069 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
3070 set_bit(In_sync, &rdev->flags);
3071 err = 0;
3072 } else if (cmd_match(buf, "failfast")) {
3073 set_bit(FailFast, &rdev->flags);
3074 need_update_sb = true;
3075 err = 0;
3076 } else if (cmd_match(buf, "-failfast")) {
3077 clear_bit(FailFast, &rdev->flags);
3078 need_update_sb = true;
3079 err = 0;
3080 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
3081 !test_bit(Journal, &rdev->flags)) {
3082 if (rdev->mddev->pers == NULL) {
3083 clear_bit(In_sync, &rdev->flags);
3084 rdev->saved_raid_disk = rdev->raid_disk;
3085 rdev->raid_disk = -1;
3086 err = 0;
3087 }
3088 } else if (cmd_match(buf, "write_error")) {
3089 set_bit(WriteErrorSeen, &rdev->flags);
3090 err = 0;
3091 } else if (cmd_match(buf, "-write_error")) {
3092 clear_bit(WriteErrorSeen, &rdev->flags);
3093 err = 0;
3094 } else if (cmd_match(buf, "want_replacement")) {
3095 /* Any non-spare device that is not a replacement can
3096 * become want_replacement at any time, but we then need to
3097 * check if recovery is needed.
3098 */
3099 if (rdev->raid_disk >= 0 &&
3100 !test_bit(Journal, &rdev->flags) &&
3101 !test_bit(Replacement, &rdev->flags))
3102 set_bit(WantReplacement, &rdev->flags);
3103 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3104 md_wakeup_thread(rdev->mddev->thread);
3105 err = 0;
3106 } else if (cmd_match(buf, "-want_replacement")) {
3107 /* Clearing 'want_replacement' is always allowed.
3108 * Once replacement starts it is too late though.
3109 */
3110 err = 0;
3111 clear_bit(WantReplacement, &rdev->flags);
3112 } else if (cmd_match(buf, "replacement")) {
3113 /* Can only set a device as a replacement when array has not
3114 * yet been started. Once running, replacement is automatic
3115 * from spares, or by assigning 'slot'.
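* (A running array typically gains a replacement by writing
* 'want_replacement' above instead; that is the path mdadm's
* --replace uses.)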
3116 */ 3117 if (rdev->mddev->pers) 3118 err = -EBUSY; 3119 else { 3120 set_bit(Replacement, &rdev->flags); 3121 err = 0; 3122 } 3123 } else if (cmd_match(buf, "-replacement")) { 3124 /* Similarly, can only clear Replacement before start */ 3125 if (rdev->mddev->pers) 3126 err = -EBUSY; 3127 else { 3128 clear_bit(Replacement, &rdev->flags); 3129 err = 0; 3130 } 3131 } else if (cmd_match(buf, "re-add")) { 3132 if (!rdev->mddev->pers) 3133 err = -EINVAL; 3134 else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) && 3135 rdev->saved_raid_disk >= 0) { 3136 /* clear_bit is performed _after_ all the devices 3137 * have their local Faulty bit cleared. If any writes 3138 * happen in the meantime in the local node, they 3139 * will land in the local bitmap, which will be synced 3140 * by this node eventually 3141 */ 3142 if (!mddev_is_clustered(rdev->mddev) || 3143 (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { 3144 clear_bit(Faulty, &rdev->flags); 3145 err = add_bound_rdev(rdev); 3146 } 3147 } else 3148 err = -EBUSY; 3149 } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) { 3150 set_bit(ExternalBbl, &rdev->flags); 3151 rdev->badblocks.shift = 0; 3152 err = 0; 3153 } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) { 3154 clear_bit(ExternalBbl, &rdev->flags); 3155 err = 0; 3156 } 3157 if (need_update_sb) 3158 md_update_sb(mddev, 1); 3159 if (!err) 3160 sysfs_notify_dirent_safe(rdev->sysfs_state); 3161 return err ? err : len; 3162 } 3163 static struct rdev_sysfs_entry rdev_state = 3164 __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store); 3165 3166 static ssize_t 3167 errors_show(struct md_rdev *rdev, char *page) 3168 { 3169 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 3170 } 3171 3172 static ssize_t 3173 errors_store(struct md_rdev *rdev, const char *buf, size_t len) 3174 { 3175 unsigned int n; 3176 int rv; 3177 3178 rv = kstrtouint(buf, 10, &n); 3179 if (rv < 0) 3180 return rv; 3181 atomic_set(&rdev->corrected_errors, n); 3182 return len; 3183 } 3184 static struct rdev_sysfs_entry rdev_errors = 3185 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 3186 3187 static ssize_t 3188 slot_show(struct md_rdev *rdev, char *page) 3189 { 3190 if (test_bit(Journal, &rdev->flags)) 3191 return sprintf(page, "journal\n"); 3192 else if (rdev->raid_disk < 0) 3193 return sprintf(page, "none\n"); 3194 else 3195 return sprintf(page, "%d\n", rdev->raid_disk); 3196 } 3197 3198 static ssize_t 3199 slot_store(struct md_rdev *rdev, const char *buf, size_t len) 3200 { 3201 int slot; 3202 int err; 3203 3204 if (test_bit(Journal, &rdev->flags)) 3205 return -EBUSY; 3206 if (strncmp(buf, "none", 4)==0) 3207 slot = -1; 3208 else { 3209 err = kstrtouint(buf, 10, (unsigned int *)&slot); 3210 if (err < 0) 3211 return err; 3212 if (slot < 0) 3213 /* overflow */ 3214 return -ENOSPC; 3215 } 3216 if (rdev->mddev->pers && slot == -1) { 3217 /* Setting 'slot' on an active array requires also 3218 * updating the 'rd%d' link, and communicating 3219 * with the personality with ->hot_*_disk. 3220 * For now we only support removing 3221 * failed/spare devices. This normally happens automatically, 3222 * but not when the metadata is externally managed. 
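* With externally managed metadata the management tool is expected to
* write 'none' to this file itself to detach the device.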
3223 */ 3224 if (rdev->raid_disk == -1) 3225 return -EEXIST; 3226 /* personality does all needed checks */ 3227 if (rdev->mddev->pers->hot_remove_disk == NULL) 3228 return -EINVAL; 3229 clear_bit(Blocked, &rdev->flags); 3230 remove_and_add_spares(rdev->mddev, rdev); 3231 if (rdev->raid_disk >= 0) 3232 return -EBUSY; 3233 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 3234 md_wakeup_thread(rdev->mddev->thread); 3235 } else if (rdev->mddev->pers) { 3236 /* Activating a spare .. or possibly reactivating 3237 * if we ever get bitmaps working here. 3238 */ 3239 int err; 3240 3241 if (rdev->raid_disk != -1) 3242 return -EBUSY; 3243 3244 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) 3245 return -EBUSY; 3246 3247 if (rdev->mddev->pers->hot_add_disk == NULL) 3248 return -EINVAL; 3249 3250 if (slot >= rdev->mddev->raid_disks && 3251 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 3252 return -ENOSPC; 3253 3254 rdev->raid_disk = slot; 3255 if (test_bit(In_sync, &rdev->flags)) 3256 rdev->saved_raid_disk = slot; 3257 else 3258 rdev->saved_raid_disk = -1; 3259 clear_bit(In_sync, &rdev->flags); 3260 clear_bit(Bitmap_sync, &rdev->flags); 3261 err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev); 3262 if (err) { 3263 rdev->raid_disk = -1; 3264 return err; 3265 } else 3266 sysfs_notify_dirent_safe(rdev->sysfs_state); 3267 /* failure here is OK */; 3268 sysfs_link_rdev(rdev->mddev, rdev); 3269 /* don't wakeup anyone, leave that to userspace. */ 3270 } else { 3271 if (slot >= rdev->mddev->raid_disks && 3272 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 3273 return -ENOSPC; 3274 rdev->raid_disk = slot; 3275 /* assume it is working */ 3276 clear_bit(Faulty, &rdev->flags); 3277 clear_bit(WriteMostly, &rdev->flags); 3278 set_bit(In_sync, &rdev->flags); 3279 sysfs_notify_dirent_safe(rdev->sysfs_state); 3280 } 3281 return len; 3282 } 3283 3284 static struct rdev_sysfs_entry rdev_slot = 3285 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 3286 3287 static ssize_t 3288 offset_show(struct md_rdev *rdev, char *page) 3289 { 3290 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 3291 } 3292 3293 static ssize_t 3294 offset_store(struct md_rdev *rdev, const char *buf, size_t len) 3295 { 3296 unsigned long long offset; 3297 if (kstrtoull(buf, 10, &offset) < 0) 3298 return -EINVAL; 3299 if (rdev->mddev->pers && rdev->raid_disk >= 0) 3300 return -EBUSY; 3301 if (rdev->sectors && rdev->mddev->external) 3302 /* Must set offset before size, so overlap checks 3303 * can be sane */ 3304 return -EBUSY; 3305 rdev->data_offset = offset; 3306 rdev->new_data_offset = offset; 3307 return len; 3308 } 3309 3310 static struct rdev_sysfs_entry rdev_offset = 3311 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 3312 3313 static ssize_t new_offset_show(struct md_rdev *rdev, char *page) 3314 { 3315 return sprintf(page, "%llu\n", 3316 (unsigned long long)rdev->new_data_offset); 3317 } 3318 3319 static ssize_t new_offset_store(struct md_rdev *rdev, 3320 const char *buf, size_t len) 3321 { 3322 unsigned long long new_offset; 3323 struct mddev *mddev = rdev->mddev; 3324 3325 if (kstrtoull(buf, 10, &new_offset) < 0) 3326 return -EINVAL; 3327 3328 if (mddev->sync_thread || 3329 test_bit(MD_RECOVERY_RUNNING,&mddev->recovery)) 3330 return -EBUSY; 3331 if (new_offset == rdev->data_offset) 3332 /* reset is always permitted */ 3333 ; 3334 else if (new_offset > rdev->data_offset) { 3335 /* must not push array size beyond rdev_sectors */ 3336 if (new_offset - 
rdev->data_offset
3337 + mddev->dev_sectors > rdev->sectors)
3338 return -E2BIG;
3339 }
3340 /* Metadata worries about other space details. */
3341
3342 /* decreasing the offset is inconsistent with a backwards
3343 * reshape.
3344 */
3345 if (new_offset < rdev->data_offset &&
3346 mddev->reshape_backwards)
3347 return -EINVAL;
3348 /* Increasing offset is inconsistent with forwards
3349 * reshape. reshape_direction should be set to
3350 * 'backwards' first.
3351 */
3352 if (new_offset > rdev->data_offset &&
3353 !mddev->reshape_backwards)
3354 return -EINVAL;
3355
3356 if (mddev->pers && mddev->persistent &&
3357 !super_types[mddev->major_version]
3358 .allow_new_offset(rdev, new_offset))
3359 return -E2BIG;
3360 rdev->new_data_offset = new_offset;
3361 if (new_offset > rdev->data_offset)
3362 mddev->reshape_backwards = 1;
3363 else if (new_offset < rdev->data_offset)
3364 mddev->reshape_backwards = 0;
3365
3366 return len;
3367 }
3368 static struct rdev_sysfs_entry rdev_new_offset =
3369 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
3370
3371 static ssize_t
3372 rdev_size_show(struct md_rdev *rdev, char *page)
3373 {
3374 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
3375 }
3376
3377 static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b)
3378 {
3379 /* check if two start/length pairs overlap */
3380 if (a->data_offset + a->sectors <= b->data_offset)
3381 return false;
3382 if (b->data_offset + b->sectors <= a->data_offset)
3383 return false;
3384 return true;
3385 }
3386
3387 static bool md_rdev_overlaps(struct md_rdev *rdev)
3388 {
3389 struct mddev *mddev;
3390 struct md_rdev *rdev2;
3391
3392 spin_lock(&all_mddevs_lock);
3393 list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
3394 if (test_bit(MD_DELETED, &mddev->flags))
3395 continue;
3396 rdev_for_each(rdev2, mddev) {
3397 if (rdev != rdev2 && rdev->bdev == rdev2->bdev &&
3398 md_rdevs_overlap(rdev, rdev2)) {
3399 spin_unlock(&all_mddevs_lock);
3400 return true;
3401 }
3402 }
3403 }
3404 spin_unlock(&all_mddevs_lock);
3405 return false;
3406 }
3407
3408 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
3409 {
3410 unsigned long long blocks;
3411 sector_t new;
3412
3413 if (kstrtoull(buf, 10, &blocks) < 0)
3414 return -EINVAL;
3415
3416 if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
3417 return -EINVAL; /* sector conversion overflow */
3418
3419 new = blocks * 2;
3420 if (new != blocks * 2)
3421 return -EINVAL; /* unsigned long long to sector_t overflow */
3422
3423 *sectors = new;
3424 return 0;
3425 }
3426
3427 static ssize_t
3428 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3429 {
3430 struct mddev *my_mddev = rdev->mddev;
3431 sector_t oldsectors = rdev->sectors;
3432 sector_t sectors;
3433
3434 if (test_bit(Journal, &rdev->flags))
3435 return -EBUSY;
3436 if (strict_blocks_to_sectors(buf, &sectors) < 0)
3437 return -EINVAL;
3438 if (rdev->data_offset != rdev->new_data_offset)
3439 return -EINVAL; /* too confusing */
3440 if (my_mddev->pers && rdev->raid_disk >= 0) {
3441 if (my_mddev->persistent) {
3442 sectors = super_types[my_mddev->major_version].
3443 rdev_size_change(rdev, sectors); 3444 if (!sectors) 3445 return -EBUSY; 3446 } else if (!sectors) 3447 sectors = bdev_nr_sectors(rdev->bdev) - 3448 rdev->data_offset; 3449 if (!my_mddev->pers->resize) 3450 /* Cannot change size for RAID0 or Linear etc */ 3451 return -EINVAL; 3452 } 3453 if (sectors < my_mddev->dev_sectors) 3454 return -EINVAL; /* component must fit device */ 3455 3456 rdev->sectors = sectors; 3457 3458 /* 3459 * Check that all other rdevs with the same bdev do not overlap. This 3460 * check does not provide a hard guarantee, it just helps avoid 3461 * dangerous mistakes. 3462 */ 3463 if (sectors > oldsectors && my_mddev->external && 3464 md_rdev_overlaps(rdev)) { 3465 /* 3466 * Someone else could have slipped in a size change here, but 3467 * doing so is just silly. We put oldsectors back because we 3468 * know it is safe, and trust userspace not to race with itself. 3469 */ 3470 rdev->sectors = oldsectors; 3471 return -EBUSY; 3472 } 3473 return len; 3474 } 3475 3476 static struct rdev_sysfs_entry rdev_size = 3477 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 3478 3479 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) 3480 { 3481 unsigned long long recovery_start = rdev->recovery_offset; 3482 3483 if (test_bit(In_sync, &rdev->flags) || 3484 recovery_start == MaxSector) 3485 return sprintf(page, "none\n"); 3486 3487 return sprintf(page, "%llu\n", recovery_start); 3488 } 3489 3490 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) 3491 { 3492 unsigned long long recovery_start; 3493 3494 if (cmd_match(buf, "none")) 3495 recovery_start = MaxSector; 3496 else if (kstrtoull(buf, 10, &recovery_start)) 3497 return -EINVAL; 3498 3499 if (rdev->mddev->pers && 3500 rdev->raid_disk >= 0) 3501 return -EBUSY; 3502 3503 rdev->recovery_offset = recovery_start; 3504 if (recovery_start == MaxSector) 3505 set_bit(In_sync, &rdev->flags); 3506 else 3507 clear_bit(In_sync, &rdev->flags); 3508 return len; 3509 } 3510 3511 static struct rdev_sysfs_entry rdev_recovery_start = 3512 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); 3513 3514 /* sysfs access to bad-blocks list. 3515 * We present two files. 3516 * 'bad-blocks' lists sector numbers and lengths of ranges that 3517 * are recorded as bad. The list is truncated to fit within 3518 * the one-page limit of sysfs. 3519 * Writing "sector length" to this file adds an acknowledged 3520 * bad block list. 3521 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet 3522 * been acknowledged. Writing to this file adds bad blocks 3523 * without acknowledging them. This is largely for testing. 
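* For example, writing '1050 8' to bad_blocks records (and
* acknowledges) an 8-sector bad range starting at sector 1050.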
3524 */
3525 static ssize_t bb_show(struct md_rdev *rdev, char *page)
3526 {
3527 return badblocks_show(&rdev->badblocks, page, 0);
3528 }
3529 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
3530 {
3531 int rv = badblocks_store(&rdev->badblocks, page, len, 0);
3532 /* Maybe that ack was all we needed */
3533 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
3534 wake_up(&rdev->blocked_wait);
3535 return rv;
3536 }
3537 static struct rdev_sysfs_entry rdev_bad_blocks =
3538 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
3539
3540 static ssize_t ubb_show(struct md_rdev *rdev, char *page)
3541 {
3542 return badblocks_show(&rdev->badblocks, page, 1);
3543 }
3544 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
3545 {
3546 return badblocks_store(&rdev->badblocks, page, len, 1);
3547 }
3548 static struct rdev_sysfs_entry rdev_unack_bad_blocks =
3549 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
3550
3551 static ssize_t
3552 ppl_sector_show(struct md_rdev *rdev, char *page)
3553 {
3554 return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector);
3555 }
3556
3557 static ssize_t
3558 ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len)
3559 {
3560 unsigned long long sector;
3561
3562 if (kstrtoull(buf, 10, &sector) < 0)
3563 return -EINVAL;
3564 if (sector != (sector_t)sector)
3565 return -EINVAL;
3566
3567 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3568 rdev->raid_disk >= 0)
3569 return -EBUSY;
3570
3571 if (rdev->mddev->persistent) {
3572 if (rdev->mddev->major_version == 0)
3573 return -EINVAL;
3574 if ((sector > rdev->sb_start &&
3575 sector - rdev->sb_start > S16_MAX) ||
3576 (sector < rdev->sb_start &&
3577 rdev->sb_start - sector > -S16_MIN))
3578 return -EINVAL;
3579 rdev->ppl.offset = sector - rdev->sb_start;
3580 } else if (!rdev->mddev->external) {
3581 return -EBUSY;
3582 }
3583 rdev->ppl.sector = sector;
3584 return len;
3585 }
3586
3587 static struct rdev_sysfs_entry rdev_ppl_sector =
3588 __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store);
3589
3590 static ssize_t
3591 ppl_size_show(struct md_rdev *rdev, char *page)
3592 {
3593 return sprintf(page, "%u\n", rdev->ppl.size);
3594 }
3595
3596 static ssize_t
3597 ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3598 {
3599 unsigned int size;
3600
3601 if (kstrtouint(buf, 10, &size) < 0)
3602 return -EINVAL;
3603
3604 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3605 rdev->raid_disk >= 0)
3606 return -EBUSY;
3607
3608 if (rdev->mddev->persistent) {
3609 if (rdev->mddev->major_version == 0)
3610 return -EINVAL;
3611 if (size > U16_MAX)
3612 return -EINVAL;
3613 } else if (!rdev->mddev->external) {
3614 return -EBUSY;
3615 }
3616 rdev->ppl.size = size;
3617 return len;
3618 }
3619
3620 static struct rdev_sysfs_entry rdev_ppl_size =
3621 __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store);
3622
3623 static struct attribute *rdev_default_attrs[] = {
3624 &rdev_state.attr,
3625 &rdev_errors.attr,
3626 &rdev_slot.attr,
3627 &rdev_offset.attr,
3628 &rdev_new_offset.attr,
3629 &rdev_size.attr,
3630 &rdev_recovery_start.attr,
3631 &rdev_bad_blocks.attr,
3632 &rdev_unack_bad_blocks.attr,
3633 &rdev_ppl_sector.attr,
3634 &rdev_ppl_size.attr,
3635 NULL,
3636 };
3637 ATTRIBUTE_GROUPS(rdev_default);
3638 static ssize_t
3639 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3640 {
3641 struct rdev_sysfs_entry
*entry = container_of(attr, struct rdev_sysfs_entry, attr); 3642 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3643 3644 if (!entry->show) 3645 return -EIO; 3646 if (!rdev->mddev) 3647 return -ENODEV; 3648 return entry->show(rdev, page); 3649 } 3650 3651 static ssize_t 3652 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 3653 const char *page, size_t length) 3654 { 3655 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3656 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3657 struct kernfs_node *kn = NULL; 3658 bool suspend = false; 3659 ssize_t rv; 3660 struct mddev *mddev = rdev->mddev; 3661 3662 if (!entry->store) 3663 return -EIO; 3664 if (!capable(CAP_SYS_ADMIN)) 3665 return -EACCES; 3666 if (!mddev) 3667 return -ENODEV; 3668 3669 if (entry->store == state_store) { 3670 if (cmd_match(page, "remove")) 3671 kn = sysfs_break_active_protection(kobj, attr); 3672 if (cmd_match(page, "remove") || cmd_match(page, "re-add") || 3673 cmd_match(page, "writemostly") || 3674 cmd_match(page, "-writemostly")) 3675 suspend = true; 3676 } 3677 3678 rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev); 3679 if (!rv) { 3680 if (rdev->mddev == NULL) 3681 rv = -ENODEV; 3682 else 3683 rv = entry->store(rdev, page, length); 3684 suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); 3685 } 3686 3687 if (kn) 3688 sysfs_unbreak_active_protection(kn); 3689 3690 return rv; 3691 } 3692 3693 static void rdev_free(struct kobject *ko) 3694 { 3695 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); 3696 kfree(rdev); 3697 } 3698 static const struct sysfs_ops rdev_sysfs_ops = { 3699 .show = rdev_attr_show, 3700 .store = rdev_attr_store, 3701 }; 3702 static const struct kobj_type rdev_ktype = { 3703 .release = rdev_free, 3704 .sysfs_ops = &rdev_sysfs_ops, 3705 .default_groups = rdev_default_groups, 3706 }; 3707 3708 int md_rdev_init(struct md_rdev *rdev) 3709 { 3710 rdev->desc_nr = -1; 3711 rdev->saved_raid_disk = -1; 3712 rdev->raid_disk = -1; 3713 rdev->flags = 0; 3714 rdev->data_offset = 0; 3715 rdev->new_data_offset = 0; 3716 rdev->sb_events = 0; 3717 rdev->last_read_error = 0; 3718 rdev->sb_loaded = 0; 3719 rdev->bb_page = NULL; 3720 atomic_set(&rdev->nr_pending, 0); 3721 atomic_set(&rdev->read_errors, 0); 3722 atomic_set(&rdev->corrected_errors, 0); 3723 3724 INIT_LIST_HEAD(&rdev->same_set); 3725 init_waitqueue_head(&rdev->blocked_wait); 3726 3727 /* Add space to store bad block list. 3728 * This reserves the space even on arrays where it cannot 3729 * be used - I wonder if that matters 3730 */ 3731 return badblocks_init(&rdev->badblocks, 0); 3732 } 3733 EXPORT_SYMBOL_GPL(md_rdev_init); 3734 3735 /* 3736 * Import a device. If 'super_format' >= 0, then sanity check the superblock 3737 * 3738 * mark the device faulty if: 3739 * 3740 * - the device is nonexistent (zero size) 3741 * - the device has no valid superblock 3742 * 3743 * a faulty rdev _never_ has rdev->sb set. 3744 */ 3745 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) 3746 { 3747 struct md_rdev *rdev; 3748 sector_t size; 3749 int err; 3750 3751 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 3752 if (!rdev) 3753 return ERR_PTR(-ENOMEM); 3754 3755 err = md_rdev_init(rdev); 3756 if (err) 3757 goto out_free_rdev; 3758 err = alloc_disk_sb(rdev); 3759 if (err) 3760 goto out_clear_rdev; 3761 3762 rdev->bdev_handle = bdev_open_by_dev(newdev, 3763 BLK_OPEN_READ | BLK_OPEN_WRITE, 3764 super_format == -2 ? 
&claim_rdev : rdev, NULL);
3765 	if (IS_ERR(rdev->bdev_handle)) {
3766 		pr_warn("md: could not open device unknown-block(%u,%u).\n",
3767 			MAJOR(newdev), MINOR(newdev));
3768 		err = PTR_ERR(rdev->bdev_handle);
3769 		goto out_clear_rdev;
3770 	}
3771 	rdev->bdev = rdev->bdev_handle->bdev;
3772
3773 	kobject_init(&rdev->kobj, &rdev_ktype);
3774
3775 	size = bdev_nr_bytes(rdev->bdev) >> BLOCK_SIZE_BITS;
3776 	if (!size) {
3777 		pr_warn("md: %pg has zero or unknown size, marking faulty!\n",
3778 			rdev->bdev);
3779 		err = -EINVAL;
3780 		goto out_blkdev_put;
3781 	}
3782
3783 	if (super_format >= 0) {
3784 		err = super_types[super_format].
3785 			load_super(rdev, NULL, super_minor);
3786 		if (err == -EINVAL) {
3787 			pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n",
3788 				rdev->bdev,
3789 				super_format, super_minor);
3790 			goto out_blkdev_put;
3791 		}
3792 		if (err < 0) {
3793 			pr_warn("md: could not read %pg's sb, not importing!\n",
3794 				rdev->bdev);
3795 			goto out_blkdev_put;
3796 		}
3797 	}
3798
3799 	return rdev;
3800
3801 out_blkdev_put:
3802 	bdev_release(rdev->bdev_handle);
3803 out_clear_rdev:
3804 	md_rdev_clear(rdev);
3805 out_free_rdev:
3806 	kfree(rdev);
3807 	return ERR_PTR(err);
3808 }
3809
3810 /*
3811  * Check a full RAID array for plausibility
3812  */
3813
3814 static int analyze_sbs(struct mddev *mddev)
3815 {
3816 	int i;
3817 	struct md_rdev *rdev, *freshest, *tmp;
3818
3819 	freshest = NULL;
3820 	rdev_for_each_safe(rdev, tmp, mddev)
3821 		switch (super_types[mddev->major_version].
3822 			load_super(rdev, freshest, mddev->minor_version)) {
3823 		case 1:
3824 			freshest = rdev;
3825 			break;
3826 		case 0:
3827 			break;
3828 		default:
3829 			pr_warn("md: fatal superblock inconsistency in %pg -- removing from array\n",
3830 				rdev->bdev);
3831 			md_kick_rdev_from_array(rdev);
3832 		}
3833
3834 	/* Cannot find a valid fresh disk */
3835 	if (!freshest) {
3836 		pr_warn("md: cannot find a valid disk\n");
3837 		return -EINVAL;
3838 	}
3839
3840 	super_types[mddev->major_version].
3841 		validate_super(mddev, NULL/*freshest*/, freshest);
3842
3843 	i = 0;
3844 	rdev_for_each_safe(rdev, tmp, mddev) {
3845 		if (mddev->max_disks &&
3846 		    (rdev->desc_nr >= mddev->max_disks ||
3847 		     i > mddev->max_disks)) {
3848 			pr_warn("md: %s: %pg: only %d devices permitted\n",
3849 				mdname(mddev), rdev->bdev,
3850 				mddev->max_disks);
3851 			md_kick_rdev_from_array(rdev);
3852 			continue;
3853 		}
3854 		if (rdev != freshest) {
3855 			if (super_types[mddev->major_version].
3856 			    validate_super(mddev, freshest, rdev)) {
3857 				pr_warn("md: kicking non-fresh %pg from array!\n",
3858 					rdev->bdev);
3859 				md_kick_rdev_from_array(rdev);
3860 				continue;
3861 			}
3862 		}
3863 		if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks)) &&
3864 		    !test_bit(Journal, &rdev->flags)) {
3865 			rdev->raid_disk = -1;
3866 			clear_bit(In_sync, &rdev->flags);
3867 		}
3868 	}
3869
3870 	return 0;
3871 }
3872
3873 /* Read a fixed-point number.
3874  * Numbers in sysfs attributes should be in "standard" units where
3875  * possible, so time should be in seconds.
3876  * However we internally use a much smaller unit such as
3877  * milliseconds or jiffies.
3878  * This function takes a decimal number with a possible fractional
3879  * component, and produces an integer which is the result of
3880  * multiplying that number by 10^'scale',
3881  * all without any floating-point arithmetic.
 * For example, parsing "1.473" with scale == 3 yields 1473, and "2"
 * with scale == 3 yields 2000.
3882  */
3883 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
3884 {
3885 	unsigned long result = 0;
3886 	long decimals = -1;
3887 	while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
3888 		if (*cp == '.')
3889 			decimals = 0;
3890 		else if (decimals < scale) {
3891 			unsigned int value;
3892 			value = *cp - '0';
3893 			result = result * 10 + value;
3894 			if (decimals >= 0)
3895 				decimals++;
3896 		}
3897 		cp++;
3898 	}
3899 	if (*cp == '\n')
3900 		cp++;
3901 	if (*cp)
3902 		return -EINVAL;
3903 	if (decimals < 0)
3904 		decimals = 0;
3905 	*res = result * int_pow(10, scale - decimals);
3906 	return 0;
3907 }
3908
3909 static ssize_t
3910 safe_delay_show(struct mddev *mddev, char *page)
3911 {
3912 	unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ;
3913
3914 	return sprintf(page, "%u.%03u\n", msec/1000, msec%1000);
3915 }
3916 static ssize_t
3917 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3918 {
3919 	unsigned long msec;
3920
3921 	if (mddev_is_clustered(mddev)) {
3922 		pr_warn("md: Safemode is disabled for clustered mode\n");
3923 		return -EINVAL;
3924 	}
3925
3926 	if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ)
3927 		return -EINVAL;
3928 	if (msec == 0)
3929 		mddev->safemode_delay = 0;
3930 	else {
3931 		unsigned long old_delay = mddev->safemode_delay;
3932 		unsigned long new_delay = (msec*HZ)/1000;
3933
3934 		if (new_delay == 0)
3935 			new_delay = 1;
3936 		mddev->safemode_delay = new_delay;
3937 		if (new_delay < old_delay || old_delay == 0)
3938 			mod_timer(&mddev->safemode_timer, jiffies+1);
3939 	}
3940 	return len;
3941 }
3942 static struct md_sysfs_entry md_safe_delay =
3943 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR, safe_delay_show, safe_delay_store);
3944
3945 static ssize_t
3946 level_show(struct mddev *mddev, char *page)
3947 {
3948 	struct md_personality *p;
3949 	int ret;
3950 	spin_lock(&mddev->lock);
3951 	p = mddev->pers;
3952 	if (p)
3953 		ret = sprintf(page, "%s\n", p->name);
3954 	else if (mddev->clevel[0])
3955 		ret = sprintf(page, "%s\n", mddev->clevel);
3956 	else if (mddev->level != LEVEL_NONE)
3957 		ret = sprintf(page, "%d\n", mddev->level);
3958 	else
3959 		ret = 0;
3960 	spin_unlock(&mddev->lock);
3961 	return ret;
3962 }
3963
3964 static ssize_t
3965 level_store(struct mddev *mddev, const char *buf, size_t len)
3966 {
3967 	char clevel[16];
3968 	ssize_t rv;
3969 	size_t slen = len;
3970 	struct md_personality *pers, *oldpers;
3971 	long level;
3972 	void *priv, *oldpriv;
3973 	struct md_rdev *rdev;
3974
3975 	if (slen == 0 || slen >= sizeof(clevel))
3976 		return -EINVAL;
3977
3978 	rv = mddev_suspend_and_lock(mddev);
3979 	if (rv)
3980 		return rv;
3981
3982 	if (mddev->pers == NULL) {
3983 		memcpy(mddev->clevel, buf, slen);
3984 		if (mddev->clevel[slen-1] == '\n')
3985 			slen--;
3986 		mddev->clevel[slen] = 0;
3987 		mddev->level = LEVEL_NONE;
3988 		rv = len;
3989 		goto out_unlock;
3990 	}
3991 	rv = -EROFS;
3992 	if (!md_is_rdwr(mddev))
3993 		goto out_unlock;
3994
3995 	/* request to change the personality. Need to ensure:
3996 	 *  - array is not engaged in resync/recovery/reshape
3997 	 *  - old personality can be suspended
3998 	 *  - new personality can take over the array.
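	 *    (As a hedged illustration: a raid0 array is typically taken
	 *    over by the raid10 or raid5 personality; exactly which
	 *    conversions work depends on the ->takeover methods the
	 *    loaded personalities provide.)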
3999 */ 4000 4001 rv = -EBUSY; 4002 if (mddev->sync_thread || 4003 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 4004 mddev->reshape_position != MaxSector || 4005 mddev->sysfs_active) 4006 goto out_unlock; 4007 4008 rv = -EINVAL; 4009 if (!mddev->pers->quiesce) { 4010 pr_warn("md: %s: %s does not support online personality change\n", 4011 mdname(mddev), mddev->pers->name); 4012 goto out_unlock; 4013 } 4014 4015 /* Now find the new personality */ 4016 memcpy(clevel, buf, slen); 4017 if (clevel[slen-1] == '\n') 4018 slen--; 4019 clevel[slen] = 0; 4020 if (kstrtol(clevel, 10, &level)) 4021 level = LEVEL_NONE; 4022 4023 if (request_module("md-%s", clevel) != 0) 4024 request_module("md-level-%s", clevel); 4025 spin_lock(&pers_lock); 4026 pers = find_pers(level, clevel); 4027 if (!pers || !try_module_get(pers->owner)) { 4028 spin_unlock(&pers_lock); 4029 pr_warn("md: personality %s not loaded\n", clevel); 4030 rv = -EINVAL; 4031 goto out_unlock; 4032 } 4033 spin_unlock(&pers_lock); 4034 4035 if (pers == mddev->pers) { 4036 /* Nothing to do! */ 4037 module_put(pers->owner); 4038 rv = len; 4039 goto out_unlock; 4040 } 4041 if (!pers->takeover) { 4042 module_put(pers->owner); 4043 pr_warn("md: %s: %s does not support personality takeover\n", 4044 mdname(mddev), clevel); 4045 rv = -EINVAL; 4046 goto out_unlock; 4047 } 4048 4049 rdev_for_each(rdev, mddev) 4050 rdev->new_raid_disk = rdev->raid_disk; 4051 4052 /* ->takeover must set new_* and/or delta_disks 4053 * if it succeeds, and may set them when it fails. 4054 */ 4055 priv = pers->takeover(mddev); 4056 if (IS_ERR(priv)) { 4057 mddev->new_level = mddev->level; 4058 mddev->new_layout = mddev->layout; 4059 mddev->new_chunk_sectors = mddev->chunk_sectors; 4060 mddev->raid_disks -= mddev->delta_disks; 4061 mddev->delta_disks = 0; 4062 mddev->reshape_backwards = 0; 4063 module_put(pers->owner); 4064 pr_warn("md: %s: %s would not accept array\n", 4065 mdname(mddev), clevel); 4066 rv = PTR_ERR(priv); 4067 goto out_unlock; 4068 } 4069 4070 /* Looks like we have a winner */ 4071 mddev_detach(mddev); 4072 4073 spin_lock(&mddev->lock); 4074 oldpers = mddev->pers; 4075 oldpriv = mddev->private; 4076 mddev->pers = pers; 4077 mddev->private = priv; 4078 strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 4079 mddev->level = mddev->new_level; 4080 mddev->layout = mddev->new_layout; 4081 mddev->chunk_sectors = mddev->new_chunk_sectors; 4082 mddev->delta_disks = 0; 4083 mddev->reshape_backwards = 0; 4084 mddev->degraded = 0; 4085 spin_unlock(&mddev->lock); 4086 4087 if (oldpers->sync_request == NULL && 4088 mddev->external) { 4089 /* We are converting from a no-redundancy array 4090 * to a redundancy array and metadata is managed 4091 * externally so we need to be sure that writes 4092 * won't block due to a need to transition 4093 * clean->dirty 4094 * until external management is started. 
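	 * (The "external management" here is a userspace metadata agent,
	 * for example mdmon, which then drives the clean->dirty
	 * transition itself.)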
4095 */ 4096 mddev->in_sync = 0; 4097 mddev->safemode_delay = 0; 4098 mddev->safemode = 0; 4099 } 4100 4101 oldpers->free(mddev, oldpriv); 4102 4103 if (oldpers->sync_request == NULL && 4104 pers->sync_request != NULL) { 4105 /* need to add the md_redundancy_group */ 4106 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 4107 pr_warn("md: cannot register extra attributes for %s\n", 4108 mdname(mddev)); 4109 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 4110 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); 4111 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); 4112 } 4113 if (oldpers->sync_request != NULL && 4114 pers->sync_request == NULL) { 4115 /* need to remove the md_redundancy_group */ 4116 if (mddev->to_remove == NULL) 4117 mddev->to_remove = &md_redundancy_group; 4118 } 4119 4120 module_put(oldpers->owner); 4121 4122 rdev_for_each(rdev, mddev) { 4123 if (rdev->raid_disk < 0) 4124 continue; 4125 if (rdev->new_raid_disk >= mddev->raid_disks) 4126 rdev->new_raid_disk = -1; 4127 if (rdev->new_raid_disk == rdev->raid_disk) 4128 continue; 4129 sysfs_unlink_rdev(mddev, rdev); 4130 } 4131 rdev_for_each(rdev, mddev) { 4132 if (rdev->raid_disk < 0) 4133 continue; 4134 if (rdev->new_raid_disk == rdev->raid_disk) 4135 continue; 4136 rdev->raid_disk = rdev->new_raid_disk; 4137 if (rdev->raid_disk < 0) 4138 clear_bit(In_sync, &rdev->flags); 4139 else { 4140 if (sysfs_link_rdev(mddev, rdev)) 4141 pr_warn("md: cannot register rd%d for %s after level change\n", 4142 rdev->raid_disk, mdname(mddev)); 4143 } 4144 } 4145 4146 if (pers->sync_request == NULL) { 4147 /* this is now an array without redundancy, so 4148 * it must always be in_sync 4149 */ 4150 mddev->in_sync = 1; 4151 del_timer_sync(&mddev->safemode_timer); 4152 } 4153 blk_set_stacking_limits(&mddev->queue->limits); 4154 pers->run(mddev); 4155 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 4156 if (!mddev->thread) 4157 md_update_sb(mddev, 1); 4158 sysfs_notify_dirent_safe(mddev->sysfs_level); 4159 md_new_event(); 4160 rv = len; 4161 out_unlock: 4162 mddev_unlock_and_resume(mddev); 4163 return rv; 4164 } 4165 4166 static struct md_sysfs_entry md_level = 4167 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 4168 4169 static ssize_t 4170 layout_show(struct mddev *mddev, char *page) 4171 { 4172 /* just a number, not meaningful for all levels */ 4173 if (mddev->reshape_position != MaxSector && 4174 mddev->layout != mddev->new_layout) 4175 return sprintf(page, "%d (%d)\n", 4176 mddev->new_layout, mddev->layout); 4177 return sprintf(page, "%d\n", mddev->layout); 4178 } 4179 4180 static ssize_t 4181 layout_store(struct mddev *mddev, const char *buf, size_t len) 4182 { 4183 unsigned int n; 4184 int err; 4185 4186 err = kstrtouint(buf, 10, &n); 4187 if (err < 0) 4188 return err; 4189 err = mddev_lock(mddev); 4190 if (err) 4191 return err; 4192 4193 if (mddev->pers) { 4194 if (mddev->pers->check_reshape == NULL) 4195 err = -EBUSY; 4196 else if (!md_is_rdwr(mddev)) 4197 err = -EROFS; 4198 else { 4199 mddev->new_layout = n; 4200 err = mddev->pers->check_reshape(mddev); 4201 if (err) 4202 mddev->new_layout = mddev->layout; 4203 } 4204 } else { 4205 mddev->new_layout = n; 4206 if (mddev->reshape_position == MaxSector) 4207 mddev->layout = n; 4208 } 4209 mddev_unlock(mddev); 4210 return err ?: len; 4211 } 4212 static struct md_sysfs_entry md_layout = 4213 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 4214 4215 static ssize_t 4216 
raid_disks_show(struct mddev *mddev, char *page) 4217 { 4218 if (mddev->raid_disks == 0) 4219 return 0; 4220 if (mddev->reshape_position != MaxSector && 4221 mddev->delta_disks != 0) 4222 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 4223 mddev->raid_disks - mddev->delta_disks); 4224 return sprintf(page, "%d\n", mddev->raid_disks); 4225 } 4226 4227 static int update_raid_disks(struct mddev *mddev, int raid_disks); 4228 4229 static ssize_t 4230 raid_disks_store(struct mddev *mddev, const char *buf, size_t len) 4231 { 4232 unsigned int n; 4233 int err; 4234 4235 err = kstrtouint(buf, 10, &n); 4236 if (err < 0) 4237 return err; 4238 4239 err = mddev_lock(mddev); 4240 if (err) 4241 return err; 4242 if (mddev->pers) 4243 err = update_raid_disks(mddev, n); 4244 else if (mddev->reshape_position != MaxSector) { 4245 struct md_rdev *rdev; 4246 int olddisks = mddev->raid_disks - mddev->delta_disks; 4247 4248 err = -EINVAL; 4249 rdev_for_each(rdev, mddev) { 4250 if (olddisks < n && 4251 rdev->data_offset < rdev->new_data_offset) 4252 goto out_unlock; 4253 if (olddisks > n && 4254 rdev->data_offset > rdev->new_data_offset) 4255 goto out_unlock; 4256 } 4257 err = 0; 4258 mddev->delta_disks = n - olddisks; 4259 mddev->raid_disks = n; 4260 mddev->reshape_backwards = (mddev->delta_disks < 0); 4261 } else 4262 mddev->raid_disks = n; 4263 out_unlock: 4264 mddev_unlock(mddev); 4265 return err ? err : len; 4266 } 4267 static struct md_sysfs_entry md_raid_disks = 4268 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 4269 4270 static ssize_t 4271 uuid_show(struct mddev *mddev, char *page) 4272 { 4273 return sprintf(page, "%pU\n", mddev->uuid); 4274 } 4275 static struct md_sysfs_entry md_uuid = 4276 __ATTR(uuid, S_IRUGO, uuid_show, NULL); 4277 4278 static ssize_t 4279 chunk_size_show(struct mddev *mddev, char *page) 4280 { 4281 if (mddev->reshape_position != MaxSector && 4282 mddev->chunk_sectors != mddev->new_chunk_sectors) 4283 return sprintf(page, "%d (%d)\n", 4284 mddev->new_chunk_sectors << 9, 4285 mddev->chunk_sectors << 9); 4286 return sprintf(page, "%d\n", mddev->chunk_sectors << 9); 4287 } 4288 4289 static ssize_t 4290 chunk_size_store(struct mddev *mddev, const char *buf, size_t len) 4291 { 4292 unsigned long n; 4293 int err; 4294 4295 err = kstrtoul(buf, 10, &n); 4296 if (err < 0) 4297 return err; 4298 4299 err = mddev_lock(mddev); 4300 if (err) 4301 return err; 4302 if (mddev->pers) { 4303 if (mddev->pers->check_reshape == NULL) 4304 err = -EBUSY; 4305 else if (!md_is_rdwr(mddev)) 4306 err = -EROFS; 4307 else { 4308 mddev->new_chunk_sectors = n >> 9; 4309 err = mddev->pers->check_reshape(mddev); 4310 if (err) 4311 mddev->new_chunk_sectors = mddev->chunk_sectors; 4312 } 4313 } else { 4314 mddev->new_chunk_sectors = n >> 9; 4315 if (mddev->reshape_position == MaxSector) 4316 mddev->chunk_sectors = n >> 9; 4317 } 4318 mddev_unlock(mddev); 4319 return err ?: len; 4320 } 4321 static struct md_sysfs_entry md_chunk_size = 4322 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 4323 4324 static ssize_t 4325 resync_start_show(struct mddev *mddev, char *page) 4326 { 4327 if (mddev->recovery_cp == MaxSector) 4328 return sprintf(page, "none\n"); 4329 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 4330 } 4331 4332 static ssize_t 4333 resync_start_store(struct mddev *mddev, const char *buf, size_t len) 4334 { 4335 unsigned long long n; 4336 int err; 4337 4338 if (cmd_match(buf, "none")) 4339 n = MaxSector; 4340 else { 4341 err = 
kstrtoull(buf, 10, &n); 4342 if (err < 0) 4343 return err; 4344 if (n != (sector_t)n) 4345 return -EINVAL; 4346 } 4347 4348 err = mddev_lock(mddev); 4349 if (err) 4350 return err; 4351 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 4352 err = -EBUSY; 4353 4354 if (!err) { 4355 mddev->recovery_cp = n; 4356 if (mddev->pers) 4357 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 4358 } 4359 mddev_unlock(mddev); 4360 return err ?: len; 4361 } 4362 static struct md_sysfs_entry md_resync_start = 4363 __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, 4364 resync_start_show, resync_start_store); 4365 4366 /* 4367 * The array state can be: 4368 * 4369 * clear 4370 * No devices, no size, no level 4371 * Equivalent to STOP_ARRAY ioctl 4372 * inactive 4373 * May have some settings, but array is not active 4374 * all IO results in error 4375 * When written, doesn't tear down array, but just stops it 4376 * suspended (not supported yet) 4377 * All IO requests will block. The array can be reconfigured. 4378 * Writing this, if accepted, will block until array is quiescent 4379 * readonly 4380 * no resync can happen. no superblocks get written. 4381 * write requests fail 4382 * read-auto 4383 * like readonly, but behaves like 'clean' on a write request. 4384 * 4385 * clean - no pending writes, but otherwise active. 4386 * When written to inactive array, starts without resync 4387 * If a write request arrives then 4388 * if metadata is known, mark 'dirty' and switch to 'active'. 4389 * if not known, block and switch to write-pending 4390 * If written to an active array that has pending writes, then fails. 4391 * active 4392 * fully active: IO and resync can be happening. 4393 * When written to inactive array, starts with resync 4394 * 4395 * write-pending 4396 * clean, but writes are blocked waiting for 'active' to be written. 4397 * 4398 * active-idle 4399 * like active, but no writes have been seen for a while (100msec). 4400 * 4401 * broken 4402 * Array is failed. It's useful because mounted-arrays aren't stopped 4403 * when array is failed, so this state will at least alert the user that 4404 * something is wrong. 
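 *
 * Usage sketch (hypothetical device name): states are driven from
 * userspace through sysfs, e.g.
 *     echo readonly > /sys/block/md0/md/array_state
 *     echo active > /sys/block/md0/md/array_state
 * "suspended", "write-pending", "active-idle" and "broken" cannot be
 * set by writing; array_state_store() below rejects them with -EINVAL.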
4405 */ 4406 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 4407 write_pending, active_idle, broken, bad_word}; 4408 static char *array_states[] = { 4409 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 4410 "write-pending", "active-idle", "broken", NULL }; 4411 4412 static int match_word(const char *word, char **list) 4413 { 4414 int n; 4415 for (n=0; list[n]; n++) 4416 if (cmd_match(word, list[n])) 4417 break; 4418 return n; 4419 } 4420 4421 static ssize_t 4422 array_state_show(struct mddev *mddev, char *page) 4423 { 4424 enum array_state st = inactive; 4425 4426 if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) { 4427 switch(mddev->ro) { 4428 case MD_RDONLY: 4429 st = readonly; 4430 break; 4431 case MD_AUTO_READ: 4432 st = read_auto; 4433 break; 4434 case MD_RDWR: 4435 spin_lock(&mddev->lock); 4436 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 4437 st = write_pending; 4438 else if (mddev->in_sync) 4439 st = clean; 4440 else if (mddev->safemode) 4441 st = active_idle; 4442 else 4443 st = active; 4444 spin_unlock(&mddev->lock); 4445 } 4446 4447 if (test_bit(MD_BROKEN, &mddev->flags) && st == clean) 4448 st = broken; 4449 } else { 4450 if (list_empty(&mddev->disks) && 4451 mddev->raid_disks == 0 && 4452 mddev->dev_sectors == 0) 4453 st = clear; 4454 else 4455 st = inactive; 4456 } 4457 return sprintf(page, "%s\n", array_states[st]); 4458 } 4459 4460 static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev); 4461 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev); 4462 static int restart_array(struct mddev *mddev); 4463 4464 static ssize_t 4465 array_state_store(struct mddev *mddev, const char *buf, size_t len) 4466 { 4467 int err = 0; 4468 enum array_state st = match_word(buf, array_states); 4469 4470 /* No lock dependent actions */ 4471 switch (st) { 4472 case suspended: /* not supported yet */ 4473 case write_pending: /* cannot be set */ 4474 case active_idle: /* cannot be set */ 4475 case broken: /* cannot be set */ 4476 case bad_word: 4477 return -EINVAL; 4478 default: 4479 break; 4480 } 4481 4482 if (mddev->pers && (st == active || st == clean) && 4483 mddev->ro != MD_RDONLY) { 4484 /* don't take reconfig_mutex when toggling between 4485 * clean and active 4486 */ 4487 spin_lock(&mddev->lock); 4488 if (st == active) { 4489 restart_array(mddev); 4490 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4491 md_wakeup_thread(mddev->thread); 4492 wake_up(&mddev->sb_wait); 4493 } else /* st == clean */ { 4494 restart_array(mddev); 4495 if (!set_in_sync(mddev)) 4496 err = -EBUSY; 4497 } 4498 if (!err) 4499 sysfs_notify_dirent_safe(mddev->sysfs_state); 4500 spin_unlock(&mddev->lock); 4501 return err ?: len; 4502 } 4503 err = mddev_lock(mddev); 4504 if (err) 4505 return err; 4506 4507 switch (st) { 4508 case inactive: 4509 /* stop an active array, return 0 otherwise */ 4510 if (mddev->pers) 4511 err = do_md_stop(mddev, 2, NULL); 4512 break; 4513 case clear: 4514 err = do_md_stop(mddev, 0, NULL); 4515 break; 4516 case readonly: 4517 if (mddev->pers) 4518 err = md_set_readonly(mddev, NULL); 4519 else { 4520 mddev->ro = MD_RDONLY; 4521 set_disk_ro(mddev->gendisk, 1); 4522 err = do_md_run(mddev); 4523 } 4524 break; 4525 case read_auto: 4526 if (mddev->pers) { 4527 if (md_is_rdwr(mddev)) 4528 err = md_set_readonly(mddev, NULL); 4529 else if (mddev->ro == MD_RDONLY) 4530 err = restart_array(mddev); 4531 if (err == 0) { 4532 mddev->ro = MD_AUTO_READ; 4533 
set_disk_ro(mddev->gendisk, 0); 4534 } 4535 } else { 4536 mddev->ro = MD_AUTO_READ; 4537 err = do_md_run(mddev); 4538 } 4539 break; 4540 case clean: 4541 if (mddev->pers) { 4542 err = restart_array(mddev); 4543 if (err) 4544 break; 4545 spin_lock(&mddev->lock); 4546 if (!set_in_sync(mddev)) 4547 err = -EBUSY; 4548 spin_unlock(&mddev->lock); 4549 } else 4550 err = -EINVAL; 4551 break; 4552 case active: 4553 if (mddev->pers) { 4554 err = restart_array(mddev); 4555 if (err) 4556 break; 4557 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4558 wake_up(&mddev->sb_wait); 4559 err = 0; 4560 } else { 4561 mddev->ro = MD_RDWR; 4562 set_disk_ro(mddev->gendisk, 0); 4563 err = do_md_run(mddev); 4564 } 4565 break; 4566 default: 4567 err = -EINVAL; 4568 break; 4569 } 4570 4571 if (!err) { 4572 if (mddev->hold_active == UNTIL_IOCTL) 4573 mddev->hold_active = 0; 4574 sysfs_notify_dirent_safe(mddev->sysfs_state); 4575 } 4576 mddev_unlock(mddev); 4577 return err ?: len; 4578 } 4579 static struct md_sysfs_entry md_array_state = 4580 __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 4581 4582 static ssize_t 4583 max_corrected_read_errors_show(struct mddev *mddev, char *page) { 4584 return sprintf(page, "%d\n", 4585 atomic_read(&mddev->max_corr_read_errors)); 4586 } 4587 4588 static ssize_t 4589 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) 4590 { 4591 unsigned int n; 4592 int rv; 4593 4594 rv = kstrtouint(buf, 10, &n); 4595 if (rv < 0) 4596 return rv; 4597 if (n > INT_MAX) 4598 return -EINVAL; 4599 atomic_set(&mddev->max_corr_read_errors, n); 4600 return len; 4601 } 4602 4603 static struct md_sysfs_entry max_corr_read_errors = 4604 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, 4605 max_corrected_read_errors_store); 4606 4607 static ssize_t 4608 null_show(struct mddev *mddev, char *page) 4609 { 4610 return -EINVAL; 4611 } 4612 4613 static ssize_t 4614 new_dev_store(struct mddev *mddev, const char *buf, size_t len) 4615 { 4616 /* buf must be %d:%d\n? giving major and minor numbers */ 4617 /* The new device is added to the array. 4618 * If the array has a persistent superblock, we read the 4619 * superblock to initialise info and check validity. 4620 * Otherwise, only checking done is that in bind_rdev_to_array, 4621 * which mainly checks size. 
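 * For example, writing "8:16" (major 8, minor 16; a hypothetical SCSI
 * disk) imports that device and binds it to this array.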
4622  */
4623 	char *e;
4624 	int major = simple_strtoul(buf, &e, 10);
4625 	int minor;
4626 	dev_t dev;
4627 	struct md_rdev *rdev;
4628 	int err;
4629
4630 	if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
4631 		return -EINVAL;
4632 	minor = simple_strtoul(e+1, &e, 10);
4633 	if (*e && *e != '\n')
4634 		return -EINVAL;
4635 	dev = MKDEV(major, minor);
4636 	if (major != MAJOR(dev) ||
4637 	    minor != MINOR(dev))
4638 		return -EOVERFLOW;
4639
4640 	err = mddev_suspend_and_lock(mddev);
4641 	if (err)
4642 		return err;
4643 	if (mddev->persistent) {
4644 		rdev = md_import_device(dev, mddev->major_version,
4645 					mddev->minor_version);
4646 		if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
4647 			struct md_rdev *rdev0
4648 				= list_entry(mddev->disks.next,
4649 					     struct md_rdev, same_set);
4650 			err = super_types[mddev->major_version]
4651 				.load_super(rdev, rdev0, mddev->minor_version);
4652 			if (err < 0)
4653 				goto out;
4654 		}
4655 	} else if (mddev->external)
4656 		rdev = md_import_device(dev, -2, -1);
4657 	else
4658 		rdev = md_import_device(dev, -1, -1);
4659
4660 	if (IS_ERR(rdev)) {
4661 		mddev_unlock_and_resume(mddev);
4662 		return PTR_ERR(rdev);
4663 	}
4664 	err = bind_rdev_to_array(rdev, mddev);
4665  out:
4666 	if (err)
4667 		export_rdev(rdev, mddev);
4668 	mddev_unlock_and_resume(mddev);
4669 	if (!err)
4670 		md_new_event();
4671 	return err ? err : len;
4672 }
4673
4674 static struct md_sysfs_entry md_new_device =
4675 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
4676
4677 static ssize_t
4678 bitmap_store(struct mddev *mddev, const char *buf, size_t len)
4679 {
4680 	char *end;
4681 	unsigned long chunk, end_chunk;
4682 	int err;
4683
4684 	err = mddev_lock(mddev);
4685 	if (err)
4686 		return err;
4687 	if (!mddev->bitmap)
4688 		goto out;
4689 	/* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
4690 	while (*buf) {
4691 		chunk = end_chunk = simple_strtoul(buf, &end, 0);
4692 		if (buf == end) break;
4693 		if (*end == '-') { /* range */
4694 			buf = end + 1;
4695 			end_chunk = simple_strtoul(buf, &end, 0);
4696 			if (buf == end) break;
4697 		}
4698 		if (*end && !isspace(*end)) break;
4699 		md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
4700 		buf = skip_spaces(end);
4701 	}
4702 	md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
4703  out:
4704 	mddev_unlock(mddev);
4705 	return len;
4706 }
4707
4708 static struct md_sysfs_entry md_bitmap =
4709 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
4710
4711 static ssize_t
4712 size_show(struct mddev *mddev, char *page)
4713 {
4714 	return sprintf(page, "%llu\n",
4715 		(unsigned long long)mddev->dev_sectors / 2);
4716 }
4717
4718 static int update_size(struct mddev *mddev, sector_t num_sectors);
4719
4720 static ssize_t
4721 size_store(struct mddev *mddev, const char *buf, size_t len)
4722 {
4723 	/* If array is inactive, we can reduce the component size, but
4724 	 * not increase it (except from 0).
4725 	 * If array is active, we can try an on-line resize
4726 	 */
4727 	sector_t sectors;
4728 	int err = strict_blocks_to_sectors(buf, &sectors);
4729
4730 	if (err < 0)
4731 		return err;
4732 	err = mddev_lock(mddev);
4733 	if (err)
4734 		return err;
4735 	if (mddev->pers) {
4736 		err = update_size(mddev, sectors);
4737 		if (err == 0)
4738 			md_update_sb(mddev, 1);
4739 	} else {
4740 		if (mddev->dev_sectors == 0 ||
4741 		    mddev->dev_sectors > sectors)
4742 			mddev->dev_sectors = sectors;
4743 		else
4744 			err = -ENOSPC;
4745 	}
4746 	mddev_unlock(mddev);
4747 	return err ?
err : len; 4748 } 4749 4750 static struct md_sysfs_entry md_size = 4751 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 4752 4753 /* Metadata version. 4754 * This is one of 4755 * 'none' for arrays with no metadata (good luck...) 4756 * 'external' for arrays with externally managed metadata, 4757 * or N.M for internally known formats 4758 */ 4759 static ssize_t 4760 metadata_show(struct mddev *mddev, char *page) 4761 { 4762 if (mddev->persistent) 4763 return sprintf(page, "%d.%d\n", 4764 mddev->major_version, mddev->minor_version); 4765 else if (mddev->external) 4766 return sprintf(page, "external:%s\n", mddev->metadata_type); 4767 else 4768 return sprintf(page, "none\n"); 4769 } 4770 4771 static ssize_t 4772 metadata_store(struct mddev *mddev, const char *buf, size_t len) 4773 { 4774 int major, minor; 4775 char *e; 4776 int err; 4777 /* Changing the details of 'external' metadata is 4778 * always permitted. Otherwise there must be 4779 * no devices attached to the array. 4780 */ 4781 4782 err = mddev_lock(mddev); 4783 if (err) 4784 return err; 4785 err = -EBUSY; 4786 if (mddev->external && strncmp(buf, "external:", 9) == 0) 4787 ; 4788 else if (!list_empty(&mddev->disks)) 4789 goto out_unlock; 4790 4791 err = 0; 4792 if (cmd_match(buf, "none")) { 4793 mddev->persistent = 0; 4794 mddev->external = 0; 4795 mddev->major_version = 0; 4796 mddev->minor_version = 90; 4797 goto out_unlock; 4798 } 4799 if (strncmp(buf, "external:", 9) == 0) { 4800 size_t namelen = len-9; 4801 if (namelen >= sizeof(mddev->metadata_type)) 4802 namelen = sizeof(mddev->metadata_type)-1; 4803 memcpy(mddev->metadata_type, buf+9, namelen); 4804 mddev->metadata_type[namelen] = 0; 4805 if (namelen && mddev->metadata_type[namelen-1] == '\n') 4806 mddev->metadata_type[--namelen] = 0; 4807 mddev->persistent = 0; 4808 mddev->external = 1; 4809 mddev->major_version = 0; 4810 mddev->minor_version = 90; 4811 goto out_unlock; 4812 } 4813 major = simple_strtoul(buf, &e, 10); 4814 err = -EINVAL; 4815 if (e==buf || *e != '.') 4816 goto out_unlock; 4817 buf = e+1; 4818 minor = simple_strtoul(buf, &e, 10); 4819 if (e==buf || (*e && *e != '\n') ) 4820 goto out_unlock; 4821 err = -ENOENT; 4822 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 4823 goto out_unlock; 4824 mddev->major_version = major; 4825 mddev->minor_version = minor; 4826 mddev->persistent = 1; 4827 mddev->external = 0; 4828 err = 0; 4829 out_unlock: 4830 mddev_unlock(mddev); 4831 return err ?: len; 4832 } 4833 4834 static struct md_sysfs_entry md_metadata = 4835 __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 4836 4837 static ssize_t 4838 action_show(struct mddev *mddev, char *page) 4839 { 4840 char *type = "idle"; 4841 unsigned long recovery = mddev->recovery; 4842 if (test_bit(MD_RECOVERY_FROZEN, &recovery)) 4843 type = "frozen"; 4844 else if (test_bit(MD_RECOVERY_RUNNING, &recovery) || 4845 (md_is_rdwr(mddev) && test_bit(MD_RECOVERY_NEEDED, &recovery))) { 4846 if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) 4847 type = "reshape"; 4848 else if (test_bit(MD_RECOVERY_SYNC, &recovery)) { 4849 if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) 4850 type = "resync"; 4851 else if (test_bit(MD_RECOVERY_CHECK, &recovery)) 4852 type = "check"; 4853 else 4854 type = "repair"; 4855 } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) 4856 type = "recover"; 4857 else if (mddev->reshape_position != MaxSector) 4858 type = "reshape"; 4859 } 4860 return sprintf(page, "%s\n", type); 4861 } 4862 4863 static void 
stop_sync_thread(struct mddev *mddev) 4864 { 4865 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4866 return; 4867 4868 if (mddev_lock(mddev)) 4869 return; 4870 4871 /* 4872 * Check again in case MD_RECOVERY_RUNNING is cleared before lock is 4873 * held. 4874 */ 4875 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 4876 mddev_unlock(mddev); 4877 return; 4878 } 4879 4880 if (work_pending(&mddev->del_work)) 4881 flush_workqueue(md_misc_wq); 4882 4883 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4884 /* 4885 * Thread might be blocked waiting for metadata update which will now 4886 * never happen 4887 */ 4888 md_wakeup_thread_directly(mddev->sync_thread); 4889 4890 mddev_unlock(mddev); 4891 } 4892 4893 static void idle_sync_thread(struct mddev *mddev) 4894 { 4895 int sync_seq = atomic_read(&mddev->sync_seq); 4896 4897 mutex_lock(&mddev->sync_mutex); 4898 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4899 stop_sync_thread(mddev); 4900 4901 wait_event(resync_wait, sync_seq != atomic_read(&mddev->sync_seq) || 4902 !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); 4903 4904 mutex_unlock(&mddev->sync_mutex); 4905 } 4906 4907 static void frozen_sync_thread(struct mddev *mddev) 4908 { 4909 mutex_lock(&mddev->sync_mutex); 4910 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4911 stop_sync_thread(mddev); 4912 4913 wait_event(resync_wait, mddev->sync_thread == NULL && 4914 !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); 4915 4916 mutex_unlock(&mddev->sync_mutex); 4917 } 4918 4919 static ssize_t 4920 action_store(struct mddev *mddev, const char *page, size_t len) 4921 { 4922 if (!mddev->pers || !mddev->pers->sync_request) 4923 return -EINVAL; 4924 4925 4926 if (cmd_match(page, "idle")) 4927 idle_sync_thread(mddev); 4928 else if (cmd_match(page, "frozen")) 4929 frozen_sync_thread(mddev); 4930 else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4931 return -EBUSY; 4932 else if (cmd_match(page, "resync")) 4933 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4934 else if (cmd_match(page, "recover")) { 4935 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4936 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 4937 } else if (cmd_match(page, "reshape")) { 4938 int err; 4939 if (mddev->pers->start_reshape == NULL) 4940 return -EINVAL; 4941 err = mddev_lock(mddev); 4942 if (!err) { 4943 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 4944 err = -EBUSY; 4945 } else if (mddev->reshape_position == MaxSector || 4946 mddev->pers->check_reshape == NULL || 4947 mddev->pers->check_reshape(mddev)) { 4948 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4949 err = mddev->pers->start_reshape(mddev); 4950 } else { 4951 /* 4952 * If reshape is still in progress, and 4953 * md_check_recovery() can continue to reshape, 4954 * don't restart reshape because data can be 4955 * corrupted for raid456. 
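 * (For example, a partially-completed raid5 reshape: clearing
 * MD_RECOVERY_FROZEN below lets md_check_recovery() carry on with the
 * reshape already in progress instead of starting a new one.)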
4956 */ 4957 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4958 } 4959 mddev_unlock(mddev); 4960 } 4961 if (err) 4962 return err; 4963 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 4964 } else { 4965 if (cmd_match(page, "check")) 4966 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4967 else if (!cmd_match(page, "repair")) 4968 return -EINVAL; 4969 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4970 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 4971 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4972 } 4973 if (mddev->ro == MD_AUTO_READ) { 4974 /* A write to sync_action is enough to justify 4975 * canceling read-auto mode 4976 */ 4977 flush_work(&mddev->sync_work); 4978 mddev->ro = MD_RDWR; 4979 md_wakeup_thread(mddev->sync_thread); 4980 } 4981 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4982 md_wakeup_thread(mddev->thread); 4983 sysfs_notify_dirent_safe(mddev->sysfs_action); 4984 return len; 4985 } 4986 4987 static struct md_sysfs_entry md_scan_mode = 4988 __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 4989 4990 static ssize_t 4991 last_sync_action_show(struct mddev *mddev, char *page) 4992 { 4993 return sprintf(page, "%s\n", mddev->last_sync_action); 4994 } 4995 4996 static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); 4997 4998 static ssize_t 4999 mismatch_cnt_show(struct mddev *mddev, char *page) 5000 { 5001 return sprintf(page, "%llu\n", 5002 (unsigned long long) 5003 atomic64_read(&mddev->resync_mismatches)); 5004 } 5005 5006 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 5007 5008 static ssize_t 5009 sync_min_show(struct mddev *mddev, char *page) 5010 { 5011 return sprintf(page, "%d (%s)\n", speed_min(mddev), 5012 mddev->sync_speed_min ? "local": "system"); 5013 } 5014 5015 static ssize_t 5016 sync_min_store(struct mddev *mddev, const char *buf, size_t len) 5017 { 5018 unsigned int min; 5019 int rv; 5020 5021 if (strncmp(buf, "system", 6)==0) { 5022 min = 0; 5023 } else { 5024 rv = kstrtouint(buf, 10, &min); 5025 if (rv < 0) 5026 return rv; 5027 if (min == 0) 5028 return -EINVAL; 5029 } 5030 mddev->sync_speed_min = min; 5031 return len; 5032 } 5033 5034 static struct md_sysfs_entry md_sync_min = 5035 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 5036 5037 static ssize_t 5038 sync_max_show(struct mddev *mddev, char *page) 5039 { 5040 return sprintf(page, "%d (%s)\n", speed_max(mddev), 5041 mddev->sync_speed_max ? 
"local": "system"); 5042 } 5043 5044 static ssize_t 5045 sync_max_store(struct mddev *mddev, const char *buf, size_t len) 5046 { 5047 unsigned int max; 5048 int rv; 5049 5050 if (strncmp(buf, "system", 6)==0) { 5051 max = 0; 5052 } else { 5053 rv = kstrtouint(buf, 10, &max); 5054 if (rv < 0) 5055 return rv; 5056 if (max == 0) 5057 return -EINVAL; 5058 } 5059 mddev->sync_speed_max = max; 5060 return len; 5061 } 5062 5063 static struct md_sysfs_entry md_sync_max = 5064 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 5065 5066 static ssize_t 5067 degraded_show(struct mddev *mddev, char *page) 5068 { 5069 return sprintf(page, "%d\n", mddev->degraded); 5070 } 5071 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 5072 5073 static ssize_t 5074 sync_force_parallel_show(struct mddev *mddev, char *page) 5075 { 5076 return sprintf(page, "%d\n", mddev->parallel_resync); 5077 } 5078 5079 static ssize_t 5080 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) 5081 { 5082 long n; 5083 5084 if (kstrtol(buf, 10, &n)) 5085 return -EINVAL; 5086 5087 if (n != 0 && n != 1) 5088 return -EINVAL; 5089 5090 mddev->parallel_resync = n; 5091 5092 if (mddev->sync_thread) 5093 wake_up(&resync_wait); 5094 5095 return len; 5096 } 5097 5098 /* force parallel resync, even with shared block devices */ 5099 static struct md_sysfs_entry md_sync_force_parallel = 5100 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 5101 sync_force_parallel_show, sync_force_parallel_store); 5102 5103 static ssize_t 5104 sync_speed_show(struct mddev *mddev, char *page) 5105 { 5106 unsigned long resync, dt, db; 5107 if (mddev->curr_resync == MD_RESYNC_NONE) 5108 return sprintf(page, "none\n"); 5109 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 5110 dt = (jiffies - mddev->resync_mark) / HZ; 5111 if (!dt) dt++; 5112 db = resync - mddev->resync_mark_cnt; 5113 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 5114 } 5115 5116 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 5117 5118 static ssize_t 5119 sync_completed_show(struct mddev *mddev, char *page) 5120 { 5121 unsigned long long max_sectors, resync; 5122 5123 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5124 return sprintf(page, "none\n"); 5125 5126 if (mddev->curr_resync == MD_RESYNC_YIELDED || 5127 mddev->curr_resync == MD_RESYNC_DELAYED) 5128 return sprintf(page, "delayed\n"); 5129 5130 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 5131 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5132 max_sectors = mddev->resync_max_sectors; 5133 else 5134 max_sectors = mddev->dev_sectors; 5135 5136 resync = mddev->curr_resync_completed; 5137 return sprintf(page, "%llu / %llu\n", resync, max_sectors); 5138 } 5139 5140 static struct md_sysfs_entry md_sync_completed = 5141 __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL); 5142 5143 static ssize_t 5144 min_sync_show(struct mddev *mddev, char *page) 5145 { 5146 return sprintf(page, "%llu\n", 5147 (unsigned long long)mddev->resync_min); 5148 } 5149 static ssize_t 5150 min_sync_store(struct mddev *mddev, const char *buf, size_t len) 5151 { 5152 unsigned long long min; 5153 int err; 5154 5155 if (kstrtoull(buf, 10, &min)) 5156 return -EINVAL; 5157 5158 spin_lock(&mddev->lock); 5159 err = -EINVAL; 5160 if (min > mddev->resync_max) 5161 goto out_unlock; 5162 5163 err = -EBUSY; 5164 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5165 goto out_unlock; 5166 5167 /* Round down to multiple of 4K for safety */ 5168 
mddev->resync_min = round_down(min, 8); 5169 err = 0; 5170 5171 out_unlock: 5172 spin_unlock(&mddev->lock); 5173 return err ?: len; 5174 } 5175 5176 static struct md_sysfs_entry md_min_sync = 5177 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 5178 5179 static ssize_t 5180 max_sync_show(struct mddev *mddev, char *page) 5181 { 5182 if (mddev->resync_max == MaxSector) 5183 return sprintf(page, "max\n"); 5184 else 5185 return sprintf(page, "%llu\n", 5186 (unsigned long long)mddev->resync_max); 5187 } 5188 static ssize_t 5189 max_sync_store(struct mddev *mddev, const char *buf, size_t len) 5190 { 5191 int err; 5192 spin_lock(&mddev->lock); 5193 if (strncmp(buf, "max", 3) == 0) 5194 mddev->resync_max = MaxSector; 5195 else { 5196 unsigned long long max; 5197 int chunk; 5198 5199 err = -EINVAL; 5200 if (kstrtoull(buf, 10, &max)) 5201 goto out_unlock; 5202 if (max < mddev->resync_min) 5203 goto out_unlock; 5204 5205 err = -EBUSY; 5206 if (max < mddev->resync_max && md_is_rdwr(mddev) && 5207 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5208 goto out_unlock; 5209 5210 /* Must be a multiple of chunk_size */ 5211 chunk = mddev->chunk_sectors; 5212 if (chunk) { 5213 sector_t temp = max; 5214 5215 err = -EINVAL; 5216 if (sector_div(temp, chunk)) 5217 goto out_unlock; 5218 } 5219 mddev->resync_max = max; 5220 } 5221 wake_up(&mddev->recovery_wait); 5222 err = 0; 5223 out_unlock: 5224 spin_unlock(&mddev->lock); 5225 return err ?: len; 5226 } 5227 5228 static struct md_sysfs_entry md_max_sync = 5229 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 5230 5231 static ssize_t 5232 suspend_lo_show(struct mddev *mddev, char *page) 5233 { 5234 return sprintf(page, "%llu\n", 5235 (unsigned long long)READ_ONCE(mddev->suspend_lo)); 5236 } 5237 5238 static ssize_t 5239 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) 5240 { 5241 unsigned long long new; 5242 int err; 5243 5244 err = kstrtoull(buf, 10, &new); 5245 if (err < 0) 5246 return err; 5247 if (new != (sector_t)new) 5248 return -EINVAL; 5249 5250 err = mddev_suspend(mddev, true); 5251 if (err) 5252 return err; 5253 5254 WRITE_ONCE(mddev->suspend_lo, new); 5255 mddev_resume(mddev); 5256 5257 return len; 5258 } 5259 static struct md_sysfs_entry md_suspend_lo = 5260 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 5261 5262 static ssize_t 5263 suspend_hi_show(struct mddev *mddev, char *page) 5264 { 5265 return sprintf(page, "%llu\n", 5266 (unsigned long long)READ_ONCE(mddev->suspend_hi)); 5267 } 5268 5269 static ssize_t 5270 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) 5271 { 5272 unsigned long long new; 5273 int err; 5274 5275 err = kstrtoull(buf, 10, &new); 5276 if (err < 0) 5277 return err; 5278 if (new != (sector_t)new) 5279 return -EINVAL; 5280 5281 err = mddev_suspend(mddev, true); 5282 if (err) 5283 return err; 5284 5285 WRITE_ONCE(mddev->suspend_hi, new); 5286 mddev_resume(mddev); 5287 5288 return len; 5289 } 5290 static struct md_sysfs_entry md_suspend_hi = 5291 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 5292 5293 static ssize_t 5294 reshape_position_show(struct mddev *mddev, char *page) 5295 { 5296 if (mddev->reshape_position != MaxSector) 5297 return sprintf(page, "%llu\n", 5298 (unsigned long long)mddev->reshape_position); 5299 strcpy(page, "none\n"); 5300 return 5; 5301 } 5302 5303 static ssize_t 5304 reshape_position_store(struct mddev *mddev, const char *buf, size_t len) 5305 { 5306 struct md_rdev *rdev; 5307 unsigned 
long long new;
5308 	int err;
5309
5310 	err = kstrtoull(buf, 10, &new);
5311 	if (err < 0)
5312 		return err;
5313 	if (new != (sector_t)new)
5314 		return -EINVAL;
5315 	err = mddev_lock(mddev);
5316 	if (err)
5317 		return err;
5318 	err = -EBUSY;
5319 	if (mddev->pers)
5320 		goto unlock;
5321 	mddev->reshape_position = new;
5322 	mddev->delta_disks = 0;
5323 	mddev->reshape_backwards = 0;
5324 	mddev->new_level = mddev->level;
5325 	mddev->new_layout = mddev->layout;
5326 	mddev->new_chunk_sectors = mddev->chunk_sectors;
5327 	rdev_for_each(rdev, mddev)
5328 		rdev->new_data_offset = rdev->data_offset;
5329 	err = 0;
5330 unlock:
5331 	mddev_unlock(mddev);
5332 	return err ?: len;
5333 }
5334
5335 static struct md_sysfs_entry md_reshape_position =
5336 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
5337        reshape_position_store);
5338
5339 static ssize_t
5340 reshape_direction_show(struct mddev *mddev, char *page)
5341 {
5342 	return sprintf(page, "%s\n",
5343 		       mddev->reshape_backwards ? "backwards" : "forwards");
5344 }
5345
5346 static ssize_t
5347 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
5348 {
5349 	int backwards = 0;
5350 	int err;
5351
5352 	if (cmd_match(buf, "forwards"))
5353 		backwards = 0;
5354 	else if (cmd_match(buf, "backwards"))
5355 		backwards = 1;
5356 	else
5357 		return -EINVAL;
5358 	if (mddev->reshape_backwards == backwards)
5359 		return len;
5360
5361 	err = mddev_lock(mddev);
5362 	if (err)
5363 		return err;
5364 	/* check if we are allowed to change */
5365 	if (mddev->delta_disks)
5366 		err = -EBUSY;
5367 	else if (mddev->persistent &&
5368 	    mddev->major_version == 0)
5369 		err = -EINVAL;
5370 	else
5371 		mddev->reshape_backwards = backwards;
5372 	mddev_unlock(mddev);
5373 	return err ?: len;
5374 }
5375
5376 static struct md_sysfs_entry md_reshape_direction =
5377 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
5378        reshape_direction_store);
5379
5380 static ssize_t
5381 array_size_show(struct mddev *mddev, char *page)
5382 {
5383 	if (mddev->external_size)
5384 		return sprintf(page, "%llu\n",
5385 			       (unsigned long long)mddev->array_sectors/2);
5386 	else
5387 		return sprintf(page, "default\n");
5388 }
5389
5390 static ssize_t
5391 array_size_store(struct mddev *mddev, const char *buf, size_t len)
5392 {
5393 	sector_t sectors;
5394 	int err;
5395
5396 	err = mddev_lock(mddev);
5397 	if (err)
5398 		return err;
5399
5400 	/* cluster raid doesn't support changing array_sectors */
5401 	if (mddev_is_clustered(mddev)) {
5402 		mddev_unlock(mddev);
5403 		return -EINVAL;
5404 	}
5405
5406 	if (strncmp(buf, "default", 7) == 0) {
5407 		if (mddev->pers)
5408 			sectors = mddev->pers->size(mddev, 0, 0);
5409 		else
5410 			sectors = mddev->array_sectors;
5411
5412 		mddev->external_size = 0;
5413 	} else {
5414 		if (strict_blocks_to_sectors(buf, &sectors) < 0)
5415 			err = -EINVAL;
5416 		else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
5417 			err = -E2BIG;
5418 		else
5419 			mddev->external_size = 1;
5420 	}
5421
5422 	if (!err) {
5423 		mddev->array_sectors = sectors;
5424 		if (mddev->pers)
5425 			set_capacity_and_notify(mddev->gendisk,
5426 						mddev->array_sectors);
5427 	}
5428 	mddev_unlock(mddev);
5429 	return err ?: len;
5430 }
5431
5432 static struct md_sysfs_entry md_array_size =
5433 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
5434        array_size_store);
5435
5436 static ssize_t
5437 consistency_policy_show(struct mddev *mddev, char *page)
5438 {
5439 	int ret;
5440
5441 	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
5442 		ret = sprintf(page, "journal\n");
5443 	} else if (test_bit(MD_HAS_PPL, &mddev->flags)) {
5444 		ret = sprintf(page, "ppl\n");
5445 	} else if (mddev->bitmap) {
5446 		ret = sprintf(page, "bitmap\n");
5447 	} else if (mddev->pers) {
5448 		if (mddev->pers->sync_request)
5449 			ret = sprintf(page, "resync\n");
5450 		else
5451 			ret = sprintf(page, "none\n");
5452 	} else {
5453 		ret = sprintf(page, "unknown\n");
5454 	}
5455
5456 	return ret;
5457 }
5458
5459 static ssize_t
5460 consistency_policy_store(struct mddev *mddev, const char *buf, size_t len)
5461 {
5462 	int err = 0;
5463
5464 	if (mddev->pers) {
5465 		if (mddev->pers->change_consistency_policy)
5466 			err = mddev->pers->change_consistency_policy(mddev, buf);
5467 		else
5468 			err = -EBUSY;
5469 	} else if (mddev->external && strncmp(buf, "ppl", 3) == 0) {
5470 		set_bit(MD_HAS_PPL, &mddev->flags);
5471 	} else {
5472 		err = -EINVAL;
5473 	}
5474
5475 	return err ? err : len;
5476 }
5477
5478 static struct md_sysfs_entry md_consistency_policy =
5479 __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
5480        consistency_policy_store);
5481
5482 static ssize_t fail_last_dev_show(struct mddev *mddev, char *page)
5483 {
5484 	return sprintf(page, "%d\n", mddev->fail_last_dev);
5485 }
5486
5487 /*
5488  * Setting fail_last_dev to true allows the last device to be forcibly
5489  * removed from RAID1/RAID10.
5490  */
5491 static ssize_t
5492 fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len)
5493 {
5494 	int ret;
5495 	bool value;
5496
5497 	ret = kstrtobool(buf, &value);
5498 	if (ret)
5499 		return ret;
5500
5501 	if (value != mddev->fail_last_dev)
5502 		mddev->fail_last_dev = value;
5503
5504 	return len;
5505 }
5506 static struct md_sysfs_entry md_fail_last_dev =
5507 __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show,
5508        fail_last_dev_store);
5509
5510 static ssize_t serialize_policy_show(struct mddev *mddev, char *page)
5511 {
5512 	if (mddev->pers == NULL || (mddev->pers->level != 1))
5513 		return sprintf(page, "n/a\n");
5514 	else
5515 		return sprintf(page, "%d\n", mddev->serialize_policy);
5516 }
5517
5518 /*
5519  * Setting serialize_policy to true enforces that write IO is not
5520  * reordered for raid1.
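 * (Usage sketch with a hypothetical device: writing 1 to
 * /sys/block/md0/md/serialize_policy enables it; the store handler
 * below rejects the request unless a raid1 personality is running.)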
5521 */ 5522 static ssize_t 5523 serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) 5524 { 5525 int err; 5526 bool value; 5527 5528 err = kstrtobool(buf, &value); 5529 if (err) 5530 return err; 5531 5532 if (value == mddev->serialize_policy) 5533 return len; 5534 5535 err = mddev_suspend_and_lock(mddev); 5536 if (err) 5537 return err; 5538 if (mddev->pers == NULL || (mddev->pers->level != 1)) { 5539 pr_err("md: serialize_policy is only effective for raid1\n"); 5540 err = -EINVAL; 5541 goto unlock; 5542 } 5543 5544 if (value) 5545 mddev_create_serial_pool(mddev, NULL); 5546 else 5547 mddev_destroy_serial_pool(mddev, NULL); 5548 mddev->serialize_policy = value; 5549 unlock: 5550 mddev_unlock_and_resume(mddev); 5551 return err ?: len; 5552 } 5553 5554 static struct md_sysfs_entry md_serialize_policy = 5555 __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, 5556 serialize_policy_store); 5557 5558 5559 static struct attribute *md_default_attrs[] = { 5560 &md_level.attr, 5561 &md_layout.attr, 5562 &md_raid_disks.attr, 5563 &md_uuid.attr, 5564 &md_chunk_size.attr, 5565 &md_size.attr, 5566 &md_resync_start.attr, 5567 &md_metadata.attr, 5568 &md_new_device.attr, 5569 &md_safe_delay.attr, 5570 &md_array_state.attr, 5571 &md_reshape_position.attr, 5572 &md_reshape_direction.attr, 5573 &md_array_size.attr, 5574 &max_corr_read_errors.attr, 5575 &md_consistency_policy.attr, 5576 &md_fail_last_dev.attr, 5577 &md_serialize_policy.attr, 5578 NULL, 5579 }; 5580 5581 static const struct attribute_group md_default_group = { 5582 .attrs = md_default_attrs, 5583 }; 5584 5585 static struct attribute *md_redundancy_attrs[] = { 5586 &md_scan_mode.attr, 5587 &md_last_scan_mode.attr, 5588 &md_mismatches.attr, 5589 &md_sync_min.attr, 5590 &md_sync_max.attr, 5591 &md_sync_speed.attr, 5592 &md_sync_force_parallel.attr, 5593 &md_sync_completed.attr, 5594 &md_min_sync.attr, 5595 &md_max_sync.attr, 5596 &md_suspend_lo.attr, 5597 &md_suspend_hi.attr, 5598 &md_bitmap.attr, 5599 &md_degraded.attr, 5600 NULL, 5601 }; 5602 static const struct attribute_group md_redundancy_group = { 5603 .name = NULL, 5604 .attrs = md_redundancy_attrs, 5605 }; 5606 5607 static const struct attribute_group *md_attr_groups[] = { 5608 &md_default_group, 5609 &md_bitmap_group, 5610 NULL, 5611 }; 5612 5613 static ssize_t 5614 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 5615 { 5616 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 5617 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 5618 ssize_t rv; 5619 5620 if (!entry->show) 5621 return -EIO; 5622 spin_lock(&all_mddevs_lock); 5623 if (!mddev_get(mddev)) { 5624 spin_unlock(&all_mddevs_lock); 5625 return -EBUSY; 5626 } 5627 spin_unlock(&all_mddevs_lock); 5628 5629 rv = entry->show(mddev, page); 5630 mddev_put(mddev); 5631 return rv; 5632 } 5633 5634 static ssize_t 5635 md_attr_store(struct kobject *kobj, struct attribute *attr, 5636 const char *page, size_t length) 5637 { 5638 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 5639 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 5640 ssize_t rv; 5641 5642 if (!entry->store) 5643 return -EIO; 5644 if (!capable(CAP_SYS_ADMIN)) 5645 return -EACCES; 5646 spin_lock(&all_mddevs_lock); 5647 if (!mddev_get(mddev)) { 5648 spin_unlock(&all_mddevs_lock); 5649 return -EBUSY; 5650 } 5651 spin_unlock(&all_mddevs_lock); 5652 rv = entry->store(mddev, page, length); 5653 mddev_put(mddev); 5654 return rv; 5655 } 5656 
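/*
 * Illustrative sketch (not part of the driver) of how the plumbing above is
 * used: a struct md_sysfs_entry pairs show/store callbacks with an attribute,
 * and md_attr_show()/md_attr_store() recover the mddev via container_of() and
 * pin it with mddev_get() before calling back.  The names "demo" and
 * "demo_show" are hypothetical:
 *
 *	static ssize_t demo_show(struct mddev *mddev, char *page)
 *	{
 *		return sprintf(page, "%d\n", mddev->degraded);
 *	}
 *	static struct md_sysfs_entry md_demo = __ATTR_RO(demo);
 *
 * Listing &md_demo.attr in md_default_attrs[] would expose it as
 * /sys/block/mdX/md/demo, with reads dispatched through md_attr_show().
 */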
5657 static void md_kobj_release(struct kobject *ko) 5658 { 5659 struct mddev *mddev = container_of(ko, struct mddev, kobj); 5660 5661 if (mddev->sysfs_state) 5662 sysfs_put(mddev->sysfs_state); 5663 if (mddev->sysfs_level) 5664 sysfs_put(mddev->sysfs_level); 5665 5666 del_gendisk(mddev->gendisk); 5667 put_disk(mddev->gendisk); 5668 } 5669 5670 static const struct sysfs_ops md_sysfs_ops = { 5671 .show = md_attr_show, 5672 .store = md_attr_store, 5673 }; 5674 static const struct kobj_type md_ktype = { 5675 .release = md_kobj_release, 5676 .sysfs_ops = &md_sysfs_ops, 5677 .default_groups = md_attr_groups, 5678 }; 5679 5680 int mdp_major = 0; 5681 5682 static void mddev_delayed_delete(struct work_struct *ws) 5683 { 5684 struct mddev *mddev = container_of(ws, struct mddev, del_work); 5685 5686 kobject_put(&mddev->kobj); 5687 } 5688 5689 struct mddev *md_alloc(dev_t dev, char *name) 5690 { 5691 /* 5692 * If dev is zero, name is the name of a device to allocate with 5693 * an arbitrary minor number. It will be "md_???" 5694 * If dev is non-zero it must be a device number with a MAJOR of 5695 * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then 5696 * the device is being created by opening a node in /dev. 5697 * If "name" is not NULL, the device is being created by 5698 * writing to /sys/module/md_mod/parameters/new_array. 5699 */ 5700 static DEFINE_MUTEX(disks_mutex); 5701 struct mddev *mddev; 5702 struct gendisk *disk; 5703 int partitioned; 5704 int shift; 5705 int unit; 5706 int error ; 5707 5708 /* 5709 * Wait for any previous instance of this device to be completely 5710 * removed (mddev_delayed_delete). 5711 */ 5712 flush_workqueue(md_misc_wq); 5713 5714 mutex_lock(&disks_mutex); 5715 mddev = mddev_alloc(dev); 5716 if (IS_ERR(mddev)) { 5717 error = PTR_ERR(mddev); 5718 goto out_unlock; 5719 } 5720 5721 partitioned = (MAJOR(mddev->unit) != MD_MAJOR); 5722 shift = partitioned ? MdpMinorShift : 0; 5723 unit = MINOR(mddev->unit) >> shift; 5724 5725 if (name && !dev) { 5726 /* Need to ensure that 'name' is not a duplicate. 5727 */ 5728 struct mddev *mddev2; 5729 spin_lock(&all_mddevs_lock); 5730 5731 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 5732 if (mddev2->gendisk && 5733 strcmp(mddev2->gendisk->disk_name, name) == 0) { 5734 spin_unlock(&all_mddevs_lock); 5735 error = -EEXIST; 5736 goto out_free_mddev; 5737 } 5738 spin_unlock(&all_mddevs_lock); 5739 } 5740 if (name && dev) 5741 /* 5742 * Creating /dev/mdNNN via "newarray", so adjust hold_active. 5743 */ 5744 mddev->hold_active = UNTIL_STOP; 5745 5746 error = -ENOMEM; 5747 disk = blk_alloc_disk(NUMA_NO_NODE); 5748 if (!disk) 5749 goto out_free_mddev; 5750 5751 disk->major = MAJOR(mddev->unit); 5752 disk->first_minor = unit << shift; 5753 disk->minors = 1 << shift; 5754 if (name) 5755 strcpy(disk->disk_name, name); 5756 else if (partitioned) 5757 sprintf(disk->disk_name, "md_d%d", unit); 5758 else 5759 sprintf(disk->disk_name, "md%d", unit); 5760 disk->fops = &md_fops; 5761 disk->private_data = mddev; 5762 5763 mddev->queue = disk->queue; 5764 blk_set_stacking_limits(&mddev->queue->limits); 5765 blk_queue_write_cache(mddev->queue, true, true); 5766 disk->events |= DISK_EVENT_MEDIA_CHANGE; 5767 mddev->gendisk = disk; 5768 error = add_disk(disk); 5769 if (error) 5770 goto out_put_disk; 5771 5772 kobject_init(&mddev->kobj, &md_ktype); 5773 error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); 5774 if (error) { 5775 /* 5776 * The disk is already live at this point. 
Clear the hold flag 5777 * and let mddev_put take care of the deletion, as it isn't any 5778 * different from a normal close on last release now. 5779 */ 5780 mddev->hold_active = 0; 5781 mutex_unlock(&disks_mutex); 5782 mddev_put(mddev); 5783 return ERR_PTR(error); 5784 } 5785 5786 kobject_uevent(&mddev->kobj, KOBJ_ADD); 5787 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); 5788 mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); 5789 mutex_unlock(&disks_mutex); 5790 return mddev; 5791 5792 out_put_disk: 5793 put_disk(disk); 5794 out_free_mddev: 5795 mddev_free(mddev); 5796 out_unlock: 5797 mutex_unlock(&disks_mutex); 5798 return ERR_PTR(error); 5799 } 5800 5801 static int md_alloc_and_put(dev_t dev, char *name) 5802 { 5803 struct mddev *mddev = md_alloc(dev, name); 5804 5805 if (IS_ERR(mddev)) 5806 return PTR_ERR(mddev); 5807 mddev_put(mddev); 5808 return 0; 5809 } 5810 5811 static void md_probe(dev_t dev) 5812 { 5813 if (MAJOR(dev) == MD_MAJOR && MINOR(dev) >= 512) 5814 return; 5815 if (create_on_open) 5816 md_alloc_and_put(dev, NULL); 5817 } 5818 5819 static int add_named_array(const char *val, const struct kernel_param *kp) 5820 { 5821 /* 5822 * val must be "md_*" or "mdNNN". 5823 * For "md_*" we allocate an array with a large free minor number, and 5824 * set the name to val. val must not already be an active name. 5825 * For "mdNNN" we allocate an array with the minor number NNN 5826 * which must not already be in use. 5827 */ 5828 int len = strlen(val); 5829 char buf[DISK_NAME_LEN]; 5830 unsigned long devnum; 5831 5832 while (len && val[len-1] == '\n') 5833 len--; 5834 if (len >= DISK_NAME_LEN) 5835 return -E2BIG; 5836 strscpy(buf, val, len+1); 5837 if (strncmp(buf, "md_", 3) == 0) 5838 return md_alloc_and_put(0, buf); 5839 if (strncmp(buf, "md", 2) == 0 && 5840 isdigit(buf[2]) && 5841 kstrtoul(buf+2, 10, &devnum) == 0 && 5842 devnum <= MINORMASK) 5843 return md_alloc_and_put(MKDEV(MD_MAJOR, devnum), NULL); 5844 5845 return -EINVAL; 5846 } 5847 5848 static void md_safemode_timeout(struct timer_list *t) 5849 { 5850 struct mddev *mddev = from_timer(mddev, t, safemode_timer); 5851 5852 mddev->safemode = 1; 5853 if (mddev->external) 5854 sysfs_notify_dirent_safe(mddev->sysfs_state); 5855 5856 md_wakeup_thread(mddev->thread); 5857 } 5858 5859 static int start_dirty_degraded; 5860 5861 int md_run(struct mddev *mddev) 5862 { 5863 int err; 5864 struct md_rdev *rdev; 5865 struct md_personality *pers; 5866 bool nowait = true; 5867 5868 if (list_empty(&mddev->disks)) 5869 /* cannot run an array with no devices.. */ 5870 return -EINVAL; 5871 5872 if (mddev->pers) 5873 return -EBUSY; 5874 /* Cannot run until previous stop completes properly */ 5875 if (mddev->sysfs_active) 5876 return -EBUSY; 5877 5878 /* 5879 * Analyze all RAID superblock(s) 5880 */ 5881 if (!mddev->raid_disks) { 5882 if (!mddev->persistent) 5883 return -EINVAL; 5884 err = analyze_sbs(mddev); 5885 if (err) 5886 return -EINVAL; 5887 } 5888 5889 if (mddev->level != LEVEL_NONE) 5890 request_module("md-level-%d", mddev->level); 5891 else if (mddev->clevel[0]) 5892 request_module("md-%s", mddev->clevel); 5893 5894 /* 5895 * Drop all container device buffers, from now on 5896 * the only valid external interface is through the md 5897 * device. 
5898 */ 5899 mddev->has_superblocks = false; 5900 rdev_for_each(rdev, mddev) { 5901 if (test_bit(Faulty, &rdev->flags)) 5902 continue; 5903 sync_blockdev(rdev->bdev); 5904 invalidate_bdev(rdev->bdev); 5905 if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) { 5906 mddev->ro = MD_RDONLY; 5907 if (mddev->gendisk) 5908 set_disk_ro(mddev->gendisk, 1); 5909 } 5910 5911 if (rdev->sb_page) 5912 mddev->has_superblocks = true; 5913 5914 /* perform some consistency tests on the device. 5915 * We don't want the data to overlap the metadata, 5916 * Internal Bitmap issues have been handled elsewhere. 5917 */ 5918 if (rdev->meta_bdev) { 5919 /* Nothing to check */; 5920 } else if (rdev->data_offset < rdev->sb_start) { 5921 if (mddev->dev_sectors && 5922 rdev->data_offset + mddev->dev_sectors 5923 > rdev->sb_start) { 5924 pr_warn("md: %s: data overlaps metadata\n", 5925 mdname(mddev)); 5926 return -EINVAL; 5927 } 5928 } else { 5929 if (rdev->sb_start + rdev->sb_size/512 5930 > rdev->data_offset) { 5931 pr_warn("md: %s: metadata overlaps data\n", 5932 mdname(mddev)); 5933 return -EINVAL; 5934 } 5935 } 5936 sysfs_notify_dirent_safe(rdev->sysfs_state); 5937 nowait = nowait && bdev_nowait(rdev->bdev); 5938 } 5939 5940 if (!bioset_initialized(&mddev->bio_set)) { 5941 err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 5942 if (err) 5943 return err; 5944 } 5945 if (!bioset_initialized(&mddev->sync_set)) { 5946 err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); 5947 if (err) 5948 goto exit_bio_set; 5949 } 5950 5951 if (!bioset_initialized(&mddev->io_clone_set)) { 5952 err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE, 5953 offsetof(struct md_io_clone, bio_clone), 0); 5954 if (err) 5955 goto exit_sync_set; 5956 } 5957 5958 spin_lock(&pers_lock); 5959 pers = find_pers(mddev->level, mddev->clevel); 5960 if (!pers || !try_module_get(pers->owner)) { 5961 spin_unlock(&pers_lock); 5962 if (mddev->level != LEVEL_NONE) 5963 pr_warn("md: personality for level %d is not loaded!\n", 5964 mddev->level); 5965 else 5966 pr_warn("md: personality for level %s is not loaded!\n", 5967 mddev->clevel); 5968 err = -EINVAL; 5969 goto abort; 5970 } 5971 spin_unlock(&pers_lock); 5972 if (mddev->level != pers->level) { 5973 mddev->level = pers->level; 5974 mddev->new_level = pers->level; 5975 } 5976 strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 5977 5978 if (mddev->reshape_position != MaxSector && 5979 pers->start_reshape == NULL) { 5980 /* This personality cannot handle reshaping... */ 5981 module_put(pers->owner); 5982 err = -EINVAL; 5983 goto abort; 5984 } 5985 5986 if (pers->sync_request) { 5987 /* Warn if this is a potentially silly 5988 * configuration. 
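		 * (e.g. two members that are partitions of one physical
		 * disk, which would defeat the redundancy)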
5989 */ 5990 struct md_rdev *rdev2; 5991 int warned = 0; 5992 5993 rdev_for_each(rdev, mddev) 5994 rdev_for_each(rdev2, mddev) { 5995 if (rdev < rdev2 && 5996 rdev->bdev->bd_disk == 5997 rdev2->bdev->bd_disk) { 5998 pr_warn("%s: WARNING: %pg appears to be on the same physical disk as %pg.\n", 5999 mdname(mddev), 6000 rdev->bdev, 6001 rdev2->bdev); 6002 warned = 1; 6003 } 6004 } 6005 6006 if (warned) 6007 pr_warn("True protection against single-disk failure might be compromised.\n"); 6008 } 6009 6010 mddev->recovery = 0; 6011 /* may be over-ridden by personality */ 6012 mddev->resync_max_sectors = mddev->dev_sectors; 6013 6014 mddev->ok_start_degraded = start_dirty_degraded; 6015 6016 if (start_readonly && md_is_rdwr(mddev)) 6017 mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */ 6018 6019 err = pers->run(mddev); 6020 if (err) 6021 pr_warn("md: pers->run() failed ...\n"); 6022 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { 6023 WARN_ONCE(!mddev->external_size, 6024 "%s: default size too small, but 'external_size' not in effect?\n", 6025 __func__); 6026 pr_warn("md: invalid array_size %llu > default size %llu\n", 6027 (unsigned long long)mddev->array_sectors / 2, 6028 (unsigned long long)pers->size(mddev, 0, 0) / 2); 6029 err = -EINVAL; 6030 } 6031 if (err == 0 && pers->sync_request && 6032 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { 6033 struct bitmap *bitmap; 6034 6035 bitmap = md_bitmap_create(mddev, -1); 6036 if (IS_ERR(bitmap)) { 6037 err = PTR_ERR(bitmap); 6038 pr_warn("%s: failed to create bitmap (%d)\n", 6039 mdname(mddev), err); 6040 } else 6041 mddev->bitmap = bitmap; 6042 6043 } 6044 if (err) 6045 goto bitmap_abort; 6046 6047 if (mddev->bitmap_info.max_write_behind > 0) { 6048 bool create_pool = false; 6049 6050 rdev_for_each(rdev, mddev) { 6051 if (test_bit(WriteMostly, &rdev->flags) && 6052 rdev_init_serial(rdev)) 6053 create_pool = true; 6054 } 6055 if (create_pool && mddev->serial_info_pool == NULL) { 6056 mddev->serial_info_pool = 6057 mempool_create_kmalloc_pool(NR_SERIAL_INFOS, 6058 sizeof(struct serial_info)); 6059 if (!mddev->serial_info_pool) { 6060 err = -ENOMEM; 6061 goto bitmap_abort; 6062 } 6063 } 6064 } 6065 6066 if (mddev->queue) { 6067 bool nonrot = true; 6068 6069 rdev_for_each(rdev, mddev) { 6070 if (rdev->raid_disk >= 0 && !bdev_nonrot(rdev->bdev)) { 6071 nonrot = false; 6072 break; 6073 } 6074 } 6075 if (mddev->degraded) 6076 nonrot = false; 6077 if (nonrot) 6078 blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue); 6079 else 6080 blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue); 6081 blk_queue_flag_set(QUEUE_FLAG_IO_STAT, mddev->queue); 6082 6083 /* Set the NOWAIT flags if all underlying devices support it */ 6084 if (nowait) 6085 blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue); 6086 } 6087 if (pers->sync_request) { 6088 if (mddev->kobj.sd && 6089 sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 6090 pr_warn("md: cannot register extra attributes for %s\n", 6091 mdname(mddev)); 6092 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); 6093 mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); 6094 mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); 6095 } else if (mddev->ro == MD_AUTO_READ) 6096 mddev->ro = MD_RDWR; 6097 6098 atomic_set(&mddev->max_corr_read_errors, 6099 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); 6100 mddev->safemode = 0; 6101 if (mddev_is_clustered(mddev)) 6102 mddev->safemode_delay = 0; 6103 else 6104 
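		/* stand-alone arrays use the short default delay so an
		 * idle array is soon marked clean */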
mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; 6105 mddev->in_sync = 1; 6106 smp_wmb(); 6107 spin_lock(&mddev->lock); 6108 mddev->pers = pers; 6109 spin_unlock(&mddev->lock); 6110 rdev_for_each(rdev, mddev) 6111 if (rdev->raid_disk >= 0) 6112 sysfs_link_rdev(mddev, rdev); /* failure here is OK */ 6113 6114 if (mddev->degraded && md_is_rdwr(mddev)) 6115 /* This ensures that recovering status is reported immediately 6116 * via sysfs - until a lack of spares is confirmed. 6117 */ 6118 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6119 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6120 6121 if (mddev->sb_flags) 6122 md_update_sb(mddev, 0); 6123 6124 md_new_event(); 6125 return 0; 6126 6127 bitmap_abort: 6128 mddev_detach(mddev); 6129 if (mddev->private) 6130 pers->free(mddev, mddev->private); 6131 mddev->private = NULL; 6132 module_put(pers->owner); 6133 md_bitmap_destroy(mddev); 6134 abort: 6135 bioset_exit(&mddev->io_clone_set); 6136 exit_sync_set: 6137 bioset_exit(&mddev->sync_set); 6138 exit_bio_set: 6139 bioset_exit(&mddev->bio_set); 6140 return err; 6141 } 6142 EXPORT_SYMBOL_GPL(md_run); 6143 6144 int do_md_run(struct mddev *mddev) 6145 { 6146 int err; 6147 6148 set_bit(MD_NOT_READY, &mddev->flags); 6149 err = md_run(mddev); 6150 if (err) 6151 goto out; 6152 err = md_bitmap_load(mddev); 6153 if (err) { 6154 md_bitmap_destroy(mddev); 6155 goto out; 6156 } 6157 6158 if (mddev_is_clustered(mddev)) 6159 md_allow_write(mddev); 6160 6161 /* run start up tasks that require md_thread */ 6162 md_start(mddev); 6163 6164 md_wakeup_thread(mddev->thread); 6165 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 6166 6167 set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); 6168 clear_bit(MD_NOT_READY, &mddev->flags); 6169 mddev->changed = 1; 6170 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 6171 sysfs_notify_dirent_safe(mddev->sysfs_state); 6172 sysfs_notify_dirent_safe(mddev->sysfs_action); 6173 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 6174 out: 6175 clear_bit(MD_NOT_READY, &mddev->flags); 6176 return err; 6177 } 6178 6179 int md_start(struct mddev *mddev) 6180 { 6181 int ret = 0; 6182 6183 if (mddev->pers->start) { 6184 set_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6185 md_wakeup_thread(mddev->thread); 6186 ret = mddev->pers->start(mddev); 6187 clear_bit(MD_RECOVERY_WAIT, &mddev->recovery); 6188 md_wakeup_thread(mddev->sync_thread); 6189 } 6190 return ret; 6191 } 6192 EXPORT_SYMBOL_GPL(md_start); 6193 6194 static int restart_array(struct mddev *mddev) 6195 { 6196 struct gendisk *disk = mddev->gendisk; 6197 struct md_rdev *rdev; 6198 bool has_journal = false; 6199 bool has_readonly = false; 6200 6201 /* Complain if it has no devices */ 6202 if (list_empty(&mddev->disks)) 6203 return -ENXIO; 6204 if (!mddev->pers) 6205 return -EINVAL; 6206 if (md_is_rdwr(mddev)) 6207 return -EBUSY; 6208 6209 rcu_read_lock(); 6210 rdev_for_each_rcu(rdev, mddev) { 6211 if (test_bit(Journal, &rdev->flags) && 6212 !test_bit(Faulty, &rdev->flags)) 6213 has_journal = true; 6214 if (rdev_read_only(rdev)) 6215 has_readonly = true; 6216 } 6217 rcu_read_unlock(); 6218 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal) 6219 /* Don't restart rw with journal missing/faulty */ 6220 return -EINVAL; 6221 if (has_readonly) 6222 return -EROFS; 6223 6224 mddev->safemode = 0; 6225 mddev->ro = MD_RDWR; 6226 set_disk_ro(disk, 0); 6227 pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); 6228 /* Kick recovery or resync if necessary */ 6229 
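	/* (the raid thread will notice MD_RECOVERY_NEEDED and run
	 * md_check_recovery() to restart any resync or recovery) */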
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6230 md_wakeup_thread(mddev->thread); 6231 md_wakeup_thread(mddev->sync_thread); 6232 sysfs_notify_dirent_safe(mddev->sysfs_state); 6233 return 0; 6234 } 6235 6236 static void md_clean(struct mddev *mddev) 6237 { 6238 mddev->array_sectors = 0; 6239 mddev->external_size = 0; 6240 mddev->dev_sectors = 0; 6241 mddev->raid_disks = 0; 6242 mddev->recovery_cp = 0; 6243 mddev->resync_min = 0; 6244 mddev->resync_max = MaxSector; 6245 mddev->reshape_position = MaxSector; 6246 /* we still need mddev->external in export_rdev, do not clear it yet */ 6247 mddev->persistent = 0; 6248 mddev->level = LEVEL_NONE; 6249 mddev->clevel[0] = 0; 6250 mddev->flags = 0; 6251 mddev->sb_flags = 0; 6252 mddev->ro = MD_RDWR; 6253 mddev->metadata_type[0] = 0; 6254 mddev->chunk_sectors = 0; 6255 mddev->ctime = mddev->utime = 0; 6256 mddev->layout = 0; 6257 mddev->max_disks = 0; 6258 mddev->events = 0; 6259 mddev->can_decrease_events = 0; 6260 mddev->delta_disks = 0; 6261 mddev->reshape_backwards = 0; 6262 mddev->new_level = LEVEL_NONE; 6263 mddev->new_layout = 0; 6264 mddev->new_chunk_sectors = 0; 6265 mddev->curr_resync = MD_RESYNC_NONE; 6266 atomic64_set(&mddev->resync_mismatches, 0); 6267 mddev->suspend_lo = mddev->suspend_hi = 0; 6268 mddev->sync_speed_min = mddev->sync_speed_max = 0; 6269 mddev->recovery = 0; 6270 mddev->in_sync = 0; 6271 mddev->changed = 0; 6272 mddev->degraded = 0; 6273 mddev->safemode = 0; 6274 mddev->private = NULL; 6275 mddev->cluster_info = NULL; 6276 mddev->bitmap_info.offset = 0; 6277 mddev->bitmap_info.default_offset = 0; 6278 mddev->bitmap_info.default_space = 0; 6279 mddev->bitmap_info.chunksize = 0; 6280 mddev->bitmap_info.daemon_sleep = 0; 6281 mddev->bitmap_info.max_write_behind = 0; 6282 mddev->bitmap_info.nodes = 0; 6283 } 6284 6285 static void __md_stop_writes(struct mddev *mddev) 6286 { 6287 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6288 if (work_pending(&mddev->del_work)) 6289 flush_workqueue(md_misc_wq); 6290 if (mddev->sync_thread) { 6291 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6292 md_reap_sync_thread(mddev); 6293 } 6294 6295 del_timer_sync(&mddev->safemode_timer); 6296 6297 if (mddev->pers && mddev->pers->quiesce) { 6298 mddev->pers->quiesce(mddev, 1); 6299 mddev->pers->quiesce(mddev, 0); 6300 } 6301 md_bitmap_flush(mddev); 6302 6303 if (md_is_rdwr(mddev) && 6304 ((!mddev->in_sync && !mddev_is_clustered(mddev)) || 6305 mddev->sb_flags)) { 6306 /* mark array as shutdown cleanly */ 6307 if (!mddev_is_clustered(mddev)) 6308 mddev->in_sync = 1; 6309 md_update_sb(mddev, 1); 6310 } 6311 /* disable policy to guarantee rdevs free resources for serialization */ 6312 mddev->serialize_policy = 0; 6313 mddev_destroy_serial_pool(mddev, NULL); 6314 } 6315 6316 void md_stop_writes(struct mddev *mddev) 6317 { 6318 mddev_lock_nointr(mddev); 6319 __md_stop_writes(mddev); 6320 mddev_unlock(mddev); 6321 } 6322 EXPORT_SYMBOL_GPL(md_stop_writes); 6323 6324 static void mddev_detach(struct mddev *mddev) 6325 { 6326 md_bitmap_wait_behind_writes(mddev); 6327 if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { 6328 mddev->pers->quiesce(mddev, 1); 6329 mddev->pers->quiesce(mddev, 0); 6330 } 6331 md_unregister_thread(mddev, &mddev->thread); 6332 if (mddev->queue) 6333 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 6334 } 6335 6336 static void __md_stop(struct mddev *mddev) 6337 { 6338 struct md_personality *pers = mddev->pers; 6339 md_bitmap_destroy(mddev); 6340 mddev_detach(mddev); 6341 /* Ensure ->event_work 
is done */
	if (mddev->event_work.func)
		flush_workqueue(md_misc_wq);
	spin_lock(&mddev->lock);
	mddev->pers = NULL;
	spin_unlock(&mddev->lock);
	if (mddev->private)
		pers->free(mddev, mddev->private);
	mddev->private = NULL;
	if (pers->sync_request && mddev->to_remove == NULL)
		mddev->to_remove = &md_redundancy_group;
	module_put(pers->owner);
	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);

	bioset_exit(&mddev->bio_set);
	bioset_exit(&mddev->sync_set);
	bioset_exit(&mddev->io_clone_set);
}

void md_stop(struct mddev *mddev)
{
	lockdep_assert_held(&mddev->reconfig_mutex);

	/* stop the array and free any attached data structures.
	 * This is called from dm-raid
	 */
	__md_stop_writes(mddev);
	__md_stop(mddev);
}

EXPORT_SYMBOL_GPL(md_stop);

static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
{
	int err = 0;
	int did_freeze = 0;

	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
		did_freeze = 1;
		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	}
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);

	/*
	 * Thread might be blocked waiting for metadata update which will now
	 * never happen
	 */
	md_wakeup_thread_directly(mddev->sync_thread);

	if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
		return -EBUSY;
	mddev_unlock(mddev);
	wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING,
					  &mddev->recovery));
	wait_event(mddev->sb_wait,
		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
	mddev_lock_nointr(mddev);

	mutex_lock(&mddev->open_mutex);
	if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
	    mddev->sync_thread ||
	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
		pr_warn("md: %s still in use.\n", mdname(mddev));
		if (did_freeze) {
			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			md_wakeup_thread(mddev->thread);
		}
		err = -EBUSY;
		goto out;
	}
	if (mddev->pers) {
		__md_stop_writes(mddev);

		err = -ENXIO;
		if (mddev->ro == MD_RDONLY)
			goto out;
		mddev->ro = MD_RDONLY;
		set_disk_ro(mddev->gendisk, 1);
		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		sysfs_notify_dirent_safe(mddev->sysfs_state);
		err = 0;
	}
out:
	mutex_unlock(&mddev->open_mutex);
	return err;
}

/* mode:
 *   0 - completely stop and disassemble array
 *   2 - stop but do not disassemble array
 */
static int do_md_stop(struct mddev *mddev, int mode,
		      struct block_device *bdev)
{
	struct gendisk *disk = mddev->gendisk;
	struct md_rdev *rdev;
	int did_freeze = 0;

	if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
		did_freeze = 1;
		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	}
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);

	/*
	 * Thread might be blocked waiting for metadata update which will now
	 * never happen
	 */
	md_wakeup_thread_directly(mddev->sync_thread);

	mddev_unlock(mddev);
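	/*
	 * Wait with the reconfig mutex dropped so the sync thread can
	 * exit and clean up; the mutex is retaken below.
	 */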
wait_event(resync_wait, (mddev->sync_thread == NULL && 6460 !test_bit(MD_RECOVERY_RUNNING, 6461 &mddev->recovery))); 6462 mddev_lock_nointr(mddev); 6463 6464 mutex_lock(&mddev->open_mutex); 6465 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || 6466 mddev->sysfs_active || 6467 mddev->sync_thread || 6468 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 6469 pr_warn("md: %s still in use.\n",mdname(mddev)); 6470 mutex_unlock(&mddev->open_mutex); 6471 if (did_freeze) { 6472 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 6473 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6474 md_wakeup_thread(mddev->thread); 6475 } 6476 return -EBUSY; 6477 } 6478 if (mddev->pers) { 6479 if (!md_is_rdwr(mddev)) 6480 set_disk_ro(disk, 0); 6481 6482 __md_stop_writes(mddev); 6483 __md_stop(mddev); 6484 6485 /* tell userspace to handle 'inactive' */ 6486 sysfs_notify_dirent_safe(mddev->sysfs_state); 6487 6488 rdev_for_each(rdev, mddev) 6489 if (rdev->raid_disk >= 0) 6490 sysfs_unlink_rdev(mddev, rdev); 6491 6492 set_capacity_and_notify(disk, 0); 6493 mutex_unlock(&mddev->open_mutex); 6494 mddev->changed = 1; 6495 6496 if (!md_is_rdwr(mddev)) 6497 mddev->ro = MD_RDWR; 6498 } else 6499 mutex_unlock(&mddev->open_mutex); 6500 /* 6501 * Free resources if final stop 6502 */ 6503 if (mode == 0) { 6504 pr_info("md: %s stopped.\n", mdname(mddev)); 6505 6506 if (mddev->bitmap_info.file) { 6507 struct file *f = mddev->bitmap_info.file; 6508 spin_lock(&mddev->lock); 6509 mddev->bitmap_info.file = NULL; 6510 spin_unlock(&mddev->lock); 6511 fput(f); 6512 } 6513 mddev->bitmap_info.offset = 0; 6514 6515 export_array(mddev); 6516 6517 md_clean(mddev); 6518 if (mddev->hold_active == UNTIL_STOP) 6519 mddev->hold_active = 0; 6520 } 6521 md_new_event(); 6522 sysfs_notify_dirent_safe(mddev->sysfs_state); 6523 return 0; 6524 } 6525 6526 #ifndef MODULE 6527 static void autorun_array(struct mddev *mddev) 6528 { 6529 struct md_rdev *rdev; 6530 int err; 6531 6532 if (list_empty(&mddev->disks)) 6533 return; 6534 6535 pr_info("md: running: "); 6536 6537 rdev_for_each(rdev, mddev) { 6538 pr_cont("<%pg>", rdev->bdev); 6539 } 6540 pr_cont("\n"); 6541 6542 err = do_md_run(mddev); 6543 if (err) { 6544 pr_warn("md: do_md_run() returned %d\n", err); 6545 do_md_stop(mddev, 0, NULL); 6546 } 6547 } 6548 6549 /* 6550 * lets try to run arrays based on all disks that have arrived 6551 * until now. (those are in pending_raid_disks) 6552 * 6553 * the method: pick the first pending disk, collect all disks with 6554 * the same UUID, remove all from the pending list and put them into 6555 * the 'same_array' list. Then order this list based on superblock 6556 * update time (freshest comes first), kick out 'old' disks and 6557 * compare superblocks. If everything's fine then run it. 
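 * Disks whose superblocks do not match the chosen device stay on
 * the pending list and are considered again on a later iteration.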
6558 * 6559 * If "unit" is allocated, then bump its reference count 6560 */ 6561 static void autorun_devices(int part) 6562 { 6563 struct md_rdev *rdev0, *rdev, *tmp; 6564 struct mddev *mddev; 6565 6566 pr_info("md: autorun ...\n"); 6567 while (!list_empty(&pending_raid_disks)) { 6568 int unit; 6569 dev_t dev; 6570 LIST_HEAD(candidates); 6571 rdev0 = list_entry(pending_raid_disks.next, 6572 struct md_rdev, same_set); 6573 6574 pr_debug("md: considering %pg ...\n", rdev0->bdev); 6575 INIT_LIST_HEAD(&candidates); 6576 rdev_for_each_list(rdev, tmp, &pending_raid_disks) 6577 if (super_90_load(rdev, rdev0, 0) >= 0) { 6578 pr_debug("md: adding %pg ...\n", 6579 rdev->bdev); 6580 list_move(&rdev->same_set, &candidates); 6581 } 6582 /* 6583 * now we have a set of devices, with all of them having 6584 * mostly sane superblocks. It's time to allocate the 6585 * mddev. 6586 */ 6587 if (part) { 6588 dev = MKDEV(mdp_major, 6589 rdev0->preferred_minor << MdpMinorShift); 6590 unit = MINOR(dev) >> MdpMinorShift; 6591 } else { 6592 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 6593 unit = MINOR(dev); 6594 } 6595 if (rdev0->preferred_minor != unit) { 6596 pr_warn("md: unit number in %pg is bad: %d\n", 6597 rdev0->bdev, rdev0->preferred_minor); 6598 break; 6599 } 6600 6601 mddev = md_alloc(dev, NULL); 6602 if (IS_ERR(mddev)) 6603 break; 6604 6605 if (mddev_suspend_and_lock(mddev)) 6606 pr_warn("md: %s locked, cannot run\n", mdname(mddev)); 6607 else if (mddev->raid_disks || mddev->major_version 6608 || !list_empty(&mddev->disks)) { 6609 pr_warn("md: %s already running, cannot run %pg\n", 6610 mdname(mddev), rdev0->bdev); 6611 mddev_unlock_and_resume(mddev); 6612 } else { 6613 pr_debug("md: created %s\n", mdname(mddev)); 6614 mddev->persistent = 1; 6615 rdev_for_each_list(rdev, tmp, &candidates) { 6616 list_del_init(&rdev->same_set); 6617 if (bind_rdev_to_array(rdev, mddev)) 6618 export_rdev(rdev, mddev); 6619 } 6620 autorun_array(mddev); 6621 mddev_unlock_and_resume(mddev); 6622 } 6623 /* on success, candidates will be empty, on error 6624 * it won't... 6625 */ 6626 rdev_for_each_list(rdev, tmp, &candidates) { 6627 list_del_init(&rdev->same_set); 6628 export_rdev(rdev, mddev); 6629 } 6630 mddev_put(mddev); 6631 } 6632 pr_info("md: ... 
autorun DONE.\n"); 6633 } 6634 #endif /* !MODULE */ 6635 6636 static int get_version(void __user *arg) 6637 { 6638 mdu_version_t ver; 6639 6640 ver.major = MD_MAJOR_VERSION; 6641 ver.minor = MD_MINOR_VERSION; 6642 ver.patchlevel = MD_PATCHLEVEL_VERSION; 6643 6644 if (copy_to_user(arg, &ver, sizeof(ver))) 6645 return -EFAULT; 6646 6647 return 0; 6648 } 6649 6650 static int get_array_info(struct mddev *mddev, void __user *arg) 6651 { 6652 mdu_array_info_t info; 6653 int nr,working,insync,failed,spare; 6654 struct md_rdev *rdev; 6655 6656 nr = working = insync = failed = spare = 0; 6657 rcu_read_lock(); 6658 rdev_for_each_rcu(rdev, mddev) { 6659 nr++; 6660 if (test_bit(Faulty, &rdev->flags)) 6661 failed++; 6662 else { 6663 working++; 6664 if (test_bit(In_sync, &rdev->flags)) 6665 insync++; 6666 else if (test_bit(Journal, &rdev->flags)) 6667 /* TODO: add journal count to md_u.h */ 6668 ; 6669 else 6670 spare++; 6671 } 6672 } 6673 rcu_read_unlock(); 6674 6675 info.major_version = mddev->major_version; 6676 info.minor_version = mddev->minor_version; 6677 info.patch_version = MD_PATCHLEVEL_VERSION; 6678 info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); 6679 info.level = mddev->level; 6680 info.size = mddev->dev_sectors / 2; 6681 if (info.size != mddev->dev_sectors / 2) /* overflow */ 6682 info.size = -1; 6683 info.nr_disks = nr; 6684 info.raid_disks = mddev->raid_disks; 6685 info.md_minor = mddev->md_minor; 6686 info.not_persistent= !mddev->persistent; 6687 6688 info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 6689 info.state = 0; 6690 if (mddev->in_sync) 6691 info.state = (1<<MD_SB_CLEAN); 6692 if (mddev->bitmap && mddev->bitmap_info.offset) 6693 info.state |= (1<<MD_SB_BITMAP_PRESENT); 6694 if (mddev_is_clustered(mddev)) 6695 info.state |= (1<<MD_SB_CLUSTERED); 6696 info.active_disks = insync; 6697 info.working_disks = working; 6698 info.failed_disks = failed; 6699 info.spare_disks = spare; 6700 6701 info.layout = mddev->layout; 6702 info.chunk_size = mddev->chunk_sectors << 9; 6703 6704 if (copy_to_user(arg, &info, sizeof(info))) 6705 return -EFAULT; 6706 6707 return 0; 6708 } 6709 6710 static int get_bitmap_file(struct mddev *mddev, void __user * arg) 6711 { 6712 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 6713 char *ptr; 6714 int err; 6715 6716 file = kzalloc(sizeof(*file), GFP_NOIO); 6717 if (!file) 6718 return -ENOMEM; 6719 6720 err = 0; 6721 spin_lock(&mddev->lock); 6722 /* bitmap enabled */ 6723 if (mddev->bitmap_info.file) { 6724 ptr = file_path(mddev->bitmap_info.file, file->pathname, 6725 sizeof(file->pathname)); 6726 if (IS_ERR(ptr)) 6727 err = PTR_ERR(ptr); 6728 else 6729 memmove(file->pathname, ptr, 6730 sizeof(file->pathname)-(ptr-file->pathname)); 6731 } 6732 spin_unlock(&mddev->lock); 6733 6734 if (err == 0 && 6735 copy_to_user(arg, file, sizeof(*file))) 6736 err = -EFAULT; 6737 6738 kfree(file); 6739 return err; 6740 } 6741 6742 static int get_disk_info(struct mddev *mddev, void __user * arg) 6743 { 6744 mdu_disk_info_t info; 6745 struct md_rdev *rdev; 6746 6747 if (copy_from_user(&info, arg, sizeof(info))) 6748 return -EFAULT; 6749 6750 rcu_read_lock(); 6751 rdev = md_find_rdev_nr_rcu(mddev, info.number); 6752 if (rdev) { 6753 info.major = MAJOR(rdev->bdev->bd_dev); 6754 info.minor = MINOR(rdev->bdev->bd_dev); 6755 info.raid_disk = rdev->raid_disk; 6756 info.state = 0; 6757 if (test_bit(Faulty, &rdev->flags)) 6758 info.state |= (1<<MD_DISK_FAULTY); 6759 else if (test_bit(In_sync, &rdev->flags)) { 6760 info.state |= (1<<MD_DISK_ACTIVE); 
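			/* an in-sync disk reports both ACTIVE and SYNC */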
6761 info.state |= (1<<MD_DISK_SYNC); 6762 } 6763 if (test_bit(Journal, &rdev->flags)) 6764 info.state |= (1<<MD_DISK_JOURNAL); 6765 if (test_bit(WriteMostly, &rdev->flags)) 6766 info.state |= (1<<MD_DISK_WRITEMOSTLY); 6767 if (test_bit(FailFast, &rdev->flags)) 6768 info.state |= (1<<MD_DISK_FAILFAST); 6769 } else { 6770 info.major = info.minor = 0; 6771 info.raid_disk = -1; 6772 info.state = (1<<MD_DISK_REMOVED); 6773 } 6774 rcu_read_unlock(); 6775 6776 if (copy_to_user(arg, &info, sizeof(info))) 6777 return -EFAULT; 6778 6779 return 0; 6780 } 6781 6782 int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) 6783 { 6784 struct md_rdev *rdev; 6785 dev_t dev = MKDEV(info->major,info->minor); 6786 6787 if (mddev_is_clustered(mddev) && 6788 !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { 6789 pr_warn("%s: Cannot add to clustered mddev.\n", 6790 mdname(mddev)); 6791 return -EINVAL; 6792 } 6793 6794 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 6795 return -EOVERFLOW; 6796 6797 if (!mddev->raid_disks) { 6798 int err; 6799 /* expecting a device which has a superblock */ 6800 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 6801 if (IS_ERR(rdev)) { 6802 pr_warn("md: md_import_device returned %ld\n", 6803 PTR_ERR(rdev)); 6804 return PTR_ERR(rdev); 6805 } 6806 if (!list_empty(&mddev->disks)) { 6807 struct md_rdev *rdev0 6808 = list_entry(mddev->disks.next, 6809 struct md_rdev, same_set); 6810 err = super_types[mddev->major_version] 6811 .load_super(rdev, rdev0, mddev->minor_version); 6812 if (err < 0) { 6813 pr_warn("md: %pg has different UUID to %pg\n", 6814 rdev->bdev, 6815 rdev0->bdev); 6816 export_rdev(rdev, mddev); 6817 return -EINVAL; 6818 } 6819 } 6820 err = bind_rdev_to_array(rdev, mddev); 6821 if (err) 6822 export_rdev(rdev, mddev); 6823 return err; 6824 } 6825 6826 /* 6827 * md_add_new_disk can be used once the array is assembled 6828 * to add "hot spares". They must already have a superblock 6829 * written 6830 */ 6831 if (mddev->pers) { 6832 int err; 6833 if (!mddev->pers->hot_add_disk) { 6834 pr_warn("%s: personality does not support diskops!\n", 6835 mdname(mddev)); 6836 return -EINVAL; 6837 } 6838 if (mddev->persistent) 6839 rdev = md_import_device(dev, mddev->major_version, 6840 mddev->minor_version); 6841 else 6842 rdev = md_import_device(dev, -1, -1); 6843 if (IS_ERR(rdev)) { 6844 pr_warn("md: md_import_device returned %ld\n", 6845 PTR_ERR(rdev)); 6846 return PTR_ERR(rdev); 6847 } 6848 /* set saved_raid_disk if appropriate */ 6849 if (!mddev->persistent) { 6850 if (info->state & (1<<MD_DISK_SYNC) && 6851 info->raid_disk < mddev->raid_disks) { 6852 rdev->raid_disk = info->raid_disk; 6853 clear_bit(Bitmap_sync, &rdev->flags); 6854 } else 6855 rdev->raid_disk = -1; 6856 rdev->saved_raid_disk = rdev->raid_disk; 6857 } else 6858 super_types[mddev->major_version]. 6859 validate_super(mddev, NULL/*freshest*/, rdev); 6860 if ((info->state & (1<<MD_DISK_SYNC)) && 6861 rdev->raid_disk != info->raid_disk) { 6862 /* This was a hot-add request, but events doesn't 6863 * match, so reject it. 
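		 * (the superblock or the array state assigned this device
		 * a different slot than the one the request asked for)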
6864 */ 6865 export_rdev(rdev, mddev); 6866 return -EINVAL; 6867 } 6868 6869 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 6870 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 6871 set_bit(WriteMostly, &rdev->flags); 6872 else 6873 clear_bit(WriteMostly, &rdev->flags); 6874 if (info->state & (1<<MD_DISK_FAILFAST)) 6875 set_bit(FailFast, &rdev->flags); 6876 else 6877 clear_bit(FailFast, &rdev->flags); 6878 6879 if (info->state & (1<<MD_DISK_JOURNAL)) { 6880 struct md_rdev *rdev2; 6881 bool has_journal = false; 6882 6883 /* make sure no existing journal disk */ 6884 rdev_for_each(rdev2, mddev) { 6885 if (test_bit(Journal, &rdev2->flags)) { 6886 has_journal = true; 6887 break; 6888 } 6889 } 6890 if (has_journal || mddev->bitmap) { 6891 export_rdev(rdev, mddev); 6892 return -EBUSY; 6893 } 6894 set_bit(Journal, &rdev->flags); 6895 } 6896 /* 6897 * check whether the device shows up in other nodes 6898 */ 6899 if (mddev_is_clustered(mddev)) { 6900 if (info->state & (1 << MD_DISK_CANDIDATE)) 6901 set_bit(Candidate, &rdev->flags); 6902 else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { 6903 /* --add initiated by this node */ 6904 err = md_cluster_ops->add_new_disk(mddev, rdev); 6905 if (err) { 6906 export_rdev(rdev, mddev); 6907 return err; 6908 } 6909 } 6910 } 6911 6912 rdev->raid_disk = -1; 6913 err = bind_rdev_to_array(rdev, mddev); 6914 6915 if (err) 6916 export_rdev(rdev, mddev); 6917 6918 if (mddev_is_clustered(mddev)) { 6919 if (info->state & (1 << MD_DISK_CANDIDATE)) { 6920 if (!err) { 6921 err = md_cluster_ops->new_disk_ack(mddev, 6922 err == 0); 6923 if (err) 6924 md_kick_rdev_from_array(rdev); 6925 } 6926 } else { 6927 if (err) 6928 md_cluster_ops->add_new_disk_cancel(mddev); 6929 else 6930 err = add_bound_rdev(rdev); 6931 } 6932 6933 } else if (!err) 6934 err = add_bound_rdev(rdev); 6935 6936 return err; 6937 } 6938 6939 /* otherwise, md_add_new_disk is only allowed 6940 * for major_version==0 superblocks 6941 */ 6942 if (mddev->major_version != 0) { 6943 pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev)); 6944 return -EINVAL; 6945 } 6946 6947 if (!(info->state & (1<<MD_DISK_FAULTY))) { 6948 int err; 6949 rdev = md_import_device(dev, -1, 0); 6950 if (IS_ERR(rdev)) { 6951 pr_warn("md: error, md_import_device() returned %ld\n", 6952 PTR_ERR(rdev)); 6953 return PTR_ERR(rdev); 6954 } 6955 rdev->desc_nr = info->number; 6956 if (info->raid_disk < mddev->raid_disks) 6957 rdev->raid_disk = info->raid_disk; 6958 else 6959 rdev->raid_disk = -1; 6960 6961 if (rdev->raid_disk < mddev->raid_disks) 6962 if (info->state & (1<<MD_DISK_SYNC)) 6963 set_bit(In_sync, &rdev->flags); 6964 6965 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 6966 set_bit(WriteMostly, &rdev->flags); 6967 if (info->state & (1<<MD_DISK_FAILFAST)) 6968 set_bit(FailFast, &rdev->flags); 6969 6970 if (!mddev->persistent) { 6971 pr_debug("md: nonpersistent superblock ...\n"); 6972 rdev->sb_start = bdev_nr_sectors(rdev->bdev); 6973 } else 6974 rdev->sb_start = calc_dev_sboffset(rdev); 6975 rdev->sectors = rdev->sb_start; 6976 6977 err = bind_rdev_to_array(rdev, mddev); 6978 if (err) { 6979 export_rdev(rdev, mddev); 6980 return err; 6981 } 6982 } 6983 6984 return 0; 6985 } 6986 6987 static int hot_remove_disk(struct mddev *mddev, dev_t dev) 6988 { 6989 struct md_rdev *rdev; 6990 6991 if (!mddev->pers) 6992 return -ENODEV; 6993 6994 rdev = find_rdev(mddev, dev); 6995 if (!rdev) 6996 return -ENXIO; 6997 6998 if (rdev->raid_disk < 0) 6999 goto kick_rdev; 7000 7001 clear_bit(Blocked, &rdev->flags); 7002 
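	/*
	 * Try to detach the device: if it is still active afterwards
	 * (raid_disk >= 0), it is busy and cannot be removed.
	 */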
remove_and_add_spares(mddev, rdev); 7003 7004 if (rdev->raid_disk >= 0) 7005 goto busy; 7006 7007 kick_rdev: 7008 if (mddev_is_clustered(mddev)) { 7009 if (md_cluster_ops->remove_disk(mddev, rdev)) 7010 goto busy; 7011 } 7012 7013 md_kick_rdev_from_array(rdev); 7014 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7015 if (mddev->thread) 7016 md_wakeup_thread(mddev->thread); 7017 else 7018 md_update_sb(mddev, 1); 7019 md_new_event(); 7020 7021 return 0; 7022 busy: 7023 pr_debug("md: cannot remove active disk %pg from %s ...\n", 7024 rdev->bdev, mdname(mddev)); 7025 return -EBUSY; 7026 } 7027 7028 static int hot_add_disk(struct mddev *mddev, dev_t dev) 7029 { 7030 int err; 7031 struct md_rdev *rdev; 7032 7033 if (!mddev->pers) 7034 return -ENODEV; 7035 7036 if (mddev->major_version != 0) { 7037 pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n", 7038 mdname(mddev)); 7039 return -EINVAL; 7040 } 7041 if (!mddev->pers->hot_add_disk) { 7042 pr_warn("%s: personality does not support diskops!\n", 7043 mdname(mddev)); 7044 return -EINVAL; 7045 } 7046 7047 rdev = md_import_device(dev, -1, 0); 7048 if (IS_ERR(rdev)) { 7049 pr_warn("md: error, md_import_device() returned %ld\n", 7050 PTR_ERR(rdev)); 7051 return -EINVAL; 7052 } 7053 7054 if (mddev->persistent) 7055 rdev->sb_start = calc_dev_sboffset(rdev); 7056 else 7057 rdev->sb_start = bdev_nr_sectors(rdev->bdev); 7058 7059 rdev->sectors = rdev->sb_start; 7060 7061 if (test_bit(Faulty, &rdev->flags)) { 7062 pr_warn("md: can not hot-add faulty %pg disk to %s!\n", 7063 rdev->bdev, mdname(mddev)); 7064 err = -EINVAL; 7065 goto abort_export; 7066 } 7067 7068 clear_bit(In_sync, &rdev->flags); 7069 rdev->desc_nr = -1; 7070 rdev->saved_raid_disk = -1; 7071 err = bind_rdev_to_array(rdev, mddev); 7072 if (err) 7073 goto abort_export; 7074 7075 /* 7076 * The rest should better be atomic, we can have disk failures 7077 * noticed in interrupt contexts ... 7078 */ 7079 7080 rdev->raid_disk = -1; 7081 7082 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 7083 if (!mddev->thread) 7084 md_update_sb(mddev, 1); 7085 /* 7086 * If the new disk does not support REQ_NOWAIT, 7087 * disable on the whole MD. 7088 */ 7089 if (!bdev_nowait(rdev->bdev)) { 7090 pr_info("%s: Disabling nowait because %pg does not support nowait\n", 7091 mdname(mddev), rdev->bdev); 7092 blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue); 7093 } 7094 /* 7095 * Kick recovery, maybe this spare has to be added to the 7096 * array immediately. 7097 */ 7098 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7099 md_wakeup_thread(mddev->thread); 7100 md_new_event(); 7101 return 0; 7102 7103 abort_export: 7104 export_rdev(rdev, mddev); 7105 return err; 7106 } 7107 7108 static int set_bitmap_file(struct mddev *mddev, int fd) 7109 { 7110 int err = 0; 7111 7112 if (mddev->pers) { 7113 if (!mddev->pers->quiesce || !mddev->thread) 7114 return -EBUSY; 7115 if (mddev->recovery || mddev->sync_thread) 7116 return -EBUSY; 7117 /* we should be able to change the bitmap.. 
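		 * (the personality can quiesce and no resync or recovery
		 * is running; both were checked above)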
 */
	}

	if (fd >= 0) {
		struct inode *inode;
		struct file *f;

		if (mddev->bitmap || mddev->bitmap_info.file)
			return -EEXIST; /* cannot add when bitmap is present */

		if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) {
			pr_warn("%s: bitmap files not supported by this kernel\n",
				mdname(mddev));
			return -EINVAL;
		}
		pr_warn("%s: using deprecated bitmap file support\n",
			mdname(mddev));

		f = fget(fd);

		if (f == NULL) {
			pr_warn("%s: error: failed to get bitmap file\n",
				mdname(mddev));
			return -EBADF;
		}

		inode = f->f_mapping->host;
		if (!S_ISREG(inode->i_mode)) {
			pr_warn("%s: error: bitmap file must be a regular file\n",
				mdname(mddev));
			err = -EBADF;
		} else if (!(f->f_mode & FMODE_WRITE)) {
			pr_warn("%s: error: bitmap file must be open for write\n",
				mdname(mddev));
			err = -EBADF;
		} else if (atomic_read(&inode->i_writecount) != 1) {
			pr_warn("%s: error: bitmap file is already in use\n",
				mdname(mddev));
			err = -EBUSY;
		}
		if (err) {
			fput(f);
			return err;
		}
		mddev->bitmap_info.file = f;
		mddev->bitmap_info.offset = 0; /* file overrides offset */
	} else if (mddev->bitmap == NULL)
		return -ENOENT; /* cannot remove what isn't there */
	err = 0;
	if (mddev->pers) {
		if (fd >= 0) {
			struct bitmap *bitmap;

			bitmap = md_bitmap_create(mddev, -1);
			if (!IS_ERR(bitmap)) {
				mddev->bitmap = bitmap;
				err = md_bitmap_load(mddev);
			} else
				err = PTR_ERR(bitmap);
			if (err) {
				md_bitmap_destroy(mddev);
				fd = -1;
			}
		} else if (fd < 0) {
			md_bitmap_destroy(mddev);
		}
	}
	if (fd < 0) {
		struct file *f = mddev->bitmap_info.file;
		if (f) {
			spin_lock(&mddev->lock);
			mddev->bitmap_info.file = NULL;
			spin_unlock(&mddev->lock);
			fput(f);
		}
	}

	return err;
}

/*
 * md_set_array_info is used in two different ways.
 * The original usage is when creating a new array.
 * In this usage, raid_disks is > 0 and it together with
 *  level, size, not_persistent, layout, chunksize determine the
 *  shape of the array.
 *  This will always create an array with a type-0.90.0 superblock.
 * The newer usage is when assembling an array.
 *  In this case raid_disks will be 0, and the major_version field is
 *  used to determine which style super-blocks are to be found on the devices.
 *  The minor and patch _version numbers are also kept in case the
 *  super_block handler wishes to interpret them.
 */
int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info)
{
	if (info->raid_disks == 0) {
		/* just setting version number for superblock loading */
		if (info->major_version < 0 ||
		    info->major_version >= ARRAY_SIZE(super_types) ||
		    super_types[info->major_version].name == NULL) {
			/* maybe try to auto-load a module? */
			pr_warn("md: superblock version %d not known\n",
				info->major_version);
			return -EINVAL;
		}
		mddev->major_version = info->major_version;
		mddev->minor_version = info->minor_version;
		mddev->patch_version = info->patch_version;
		mddev->persistent = !info->not_persistent;
		/* ensure mddev_put doesn't delete this now that there
		 * is some minimal configuration.
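		 * (mddev_put() will not free an mddev whose ctime is
		 * non-zero)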
		 */
		mddev->ctime = ktime_get_real_seconds();
		return 0;
	}
	mddev->major_version = MD_MAJOR_VERSION;
	mddev->minor_version = MD_MINOR_VERSION;
	mddev->patch_version = MD_PATCHLEVEL_VERSION;
	mddev->ctime = ktime_get_real_seconds();

	mddev->level = info->level;
	mddev->clevel[0] = 0;
	mddev->dev_sectors = 2 * (sector_t)info->size;
	mddev->raid_disks = info->raid_disks;
	/* don't set md_minor, it is determined by which /dev/md* was
	 * opened
	 */
	if (info->state & (1<<MD_SB_CLEAN))
		mddev->recovery_cp = MaxSector;
	else
		mddev->recovery_cp = 0;
	mddev->persistent = !info->not_persistent;
	mddev->external = 0;

	mddev->layout = info->layout;
	if (mddev->level == 0)
		/* Cannot trust RAID0 layout info here */
		mddev->layout = -1;
	mddev->chunk_sectors = info->chunk_size >> 9;

	if (mddev->persistent) {
		mddev->max_disks = MD_SB_DISKS;
		mddev->flags = 0;
		mddev->sb_flags = 0;
	}
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);

	mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
	mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
	mddev->bitmap_info.offset = 0;

	mddev->reshape_position = MaxSector;

	/*
	 * Generate a 128 bit UUID
	 */
	get_random_bytes(mddev->uuid, 16);

	mddev->new_level = mddev->level;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
	mddev->new_layout = mddev->layout;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;

	return 0;
}

void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
{
	lockdep_assert_held(&mddev->reconfig_mutex);

	if (mddev->external_size)
		return;

	mddev->array_sectors = array_sectors;
}
EXPORT_SYMBOL(md_set_array_sectors);

static int update_size(struct mddev *mddev, sector_t num_sectors)
{
	struct md_rdev *rdev;
	int rv;
	int fit = (num_sectors == 0);
	sector_t old_dev_sectors = mddev->dev_sectors;

	if (mddev->pers->resize == NULL)
		return -EINVAL;
	/* The "num_sectors" is the number of sectors of each device that
	 * is used.  This can only make sense for arrays with redundancy.
	 * linear and raid0 always use whatever space is available. We can only
	 * consider changing this number if no resync or reconstruction is
	 * happening, and if the new size is acceptable. It must fit before the
	 * sb_start or, if that is <data_offset, it must fit before the size
	 * of each device.  If num_sectors is zero, we find the largest size
	 * that fits.
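	 * (userspace tools typically pass 0 to request the maximum,
	 * e.g. for a "--grow --size=max" style request, though the
	 * exact behaviour depends on the tool)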
7312 */ 7313 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 7314 mddev->sync_thread) 7315 return -EBUSY; 7316 if (!md_is_rdwr(mddev)) 7317 return -EROFS; 7318 7319 rdev_for_each(rdev, mddev) { 7320 sector_t avail = rdev->sectors; 7321 7322 if (fit && (num_sectors == 0 || num_sectors > avail)) 7323 num_sectors = avail; 7324 if (avail < num_sectors) 7325 return -ENOSPC; 7326 } 7327 rv = mddev->pers->resize(mddev, num_sectors); 7328 if (!rv) { 7329 if (mddev_is_clustered(mddev)) 7330 md_cluster_ops->update_size(mddev, old_dev_sectors); 7331 else if (mddev->queue) { 7332 set_capacity_and_notify(mddev->gendisk, 7333 mddev->array_sectors); 7334 } 7335 } 7336 return rv; 7337 } 7338 7339 static int update_raid_disks(struct mddev *mddev, int raid_disks) 7340 { 7341 int rv; 7342 struct md_rdev *rdev; 7343 /* change the number of raid disks */ 7344 if (mddev->pers->check_reshape == NULL) 7345 return -EINVAL; 7346 if (!md_is_rdwr(mddev)) 7347 return -EROFS; 7348 if (raid_disks <= 0 || 7349 (mddev->max_disks && raid_disks >= mddev->max_disks)) 7350 return -EINVAL; 7351 if (mddev->sync_thread || 7352 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 7353 test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) || 7354 mddev->reshape_position != MaxSector) 7355 return -EBUSY; 7356 7357 rdev_for_each(rdev, mddev) { 7358 if (mddev->raid_disks < raid_disks && 7359 rdev->data_offset < rdev->new_data_offset) 7360 return -EINVAL; 7361 if (mddev->raid_disks > raid_disks && 7362 rdev->data_offset > rdev->new_data_offset) 7363 return -EINVAL; 7364 } 7365 7366 mddev->delta_disks = raid_disks - mddev->raid_disks; 7367 if (mddev->delta_disks < 0) 7368 mddev->reshape_backwards = 1; 7369 else if (mddev->delta_disks > 0) 7370 mddev->reshape_backwards = 0; 7371 7372 rv = mddev->pers->check_reshape(mddev); 7373 if (rv < 0) { 7374 mddev->delta_disks = 0; 7375 mddev->reshape_backwards = 0; 7376 } 7377 return rv; 7378 } 7379 7380 /* 7381 * update_array_info is used to change the configuration of an 7382 * on-line array. 7383 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 7384 * fields in the info are checked against the array. 7385 * Any differences that cannot be handled will cause an error. 7386 * Normally, only one change can be managed at a time. 
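 * The changes handled below are: size, raid_disks, layout, and
 * adding or removing a write-intent bitmap.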
7387 */ 7388 static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) 7389 { 7390 int rv = 0; 7391 int cnt = 0; 7392 int state = 0; 7393 7394 /* calculate expected state,ignoring low bits */ 7395 if (mddev->bitmap && mddev->bitmap_info.offset) 7396 state |= (1 << MD_SB_BITMAP_PRESENT); 7397 7398 if (mddev->major_version != info->major_version || 7399 mddev->minor_version != info->minor_version || 7400 /* mddev->patch_version != info->patch_version || */ 7401 mddev->ctime != info->ctime || 7402 mddev->level != info->level || 7403 /* mddev->layout != info->layout || */ 7404 mddev->persistent != !info->not_persistent || 7405 mddev->chunk_sectors != info->chunk_size >> 9 || 7406 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 7407 ((state^info->state) & 0xfffffe00) 7408 ) 7409 return -EINVAL; 7410 /* Check there is only one change */ 7411 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 7412 cnt++; 7413 if (mddev->raid_disks != info->raid_disks) 7414 cnt++; 7415 if (mddev->layout != info->layout) 7416 cnt++; 7417 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) 7418 cnt++; 7419 if (cnt == 0) 7420 return 0; 7421 if (cnt > 1) 7422 return -EINVAL; 7423 7424 if (mddev->layout != info->layout) { 7425 /* Change layout 7426 * we don't need to do anything at the md level, the 7427 * personality will take care of it all. 7428 */ 7429 if (mddev->pers->check_reshape == NULL) 7430 return -EINVAL; 7431 else { 7432 mddev->new_layout = info->layout; 7433 rv = mddev->pers->check_reshape(mddev); 7434 if (rv) 7435 mddev->new_layout = mddev->layout; 7436 return rv; 7437 } 7438 } 7439 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 7440 rv = update_size(mddev, (sector_t)info->size * 2); 7441 7442 if (mddev->raid_disks != info->raid_disks) 7443 rv = update_raid_disks(mddev, info->raid_disks); 7444 7445 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 7446 if (mddev->pers->quiesce == NULL || mddev->thread == NULL) { 7447 rv = -EINVAL; 7448 goto err; 7449 } 7450 if (mddev->recovery || mddev->sync_thread) { 7451 rv = -EBUSY; 7452 goto err; 7453 } 7454 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 7455 struct bitmap *bitmap; 7456 /* add the bitmap */ 7457 if (mddev->bitmap) { 7458 rv = -EEXIST; 7459 goto err; 7460 } 7461 if (mddev->bitmap_info.default_offset == 0) { 7462 rv = -EINVAL; 7463 goto err; 7464 } 7465 mddev->bitmap_info.offset = 7466 mddev->bitmap_info.default_offset; 7467 mddev->bitmap_info.space = 7468 mddev->bitmap_info.default_space; 7469 bitmap = md_bitmap_create(mddev, -1); 7470 if (!IS_ERR(bitmap)) { 7471 mddev->bitmap = bitmap; 7472 rv = md_bitmap_load(mddev); 7473 } else 7474 rv = PTR_ERR(bitmap); 7475 if (rv) 7476 md_bitmap_destroy(mddev); 7477 } else { 7478 /* remove the bitmap */ 7479 if (!mddev->bitmap) { 7480 rv = -ENOENT; 7481 goto err; 7482 } 7483 if (mddev->bitmap->storage.file) { 7484 rv = -EINVAL; 7485 goto err; 7486 } 7487 if (mddev->bitmap_info.nodes) { 7488 /* hold PW on all the bitmap lock */ 7489 if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { 7490 pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n"); 7491 rv = -EPERM; 7492 md_cluster_ops->unlock_all_bitmaps(mddev); 7493 goto err; 7494 } 7495 7496 mddev->bitmap_info.nodes = 0; 7497 md_cluster_ops->leave(mddev); 7498 module_put(md_cluster_mod); 7499 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; 7500 } 7501 md_bitmap_destroy(mddev); 7502 mddev->bitmap_info.offset = 0; 7503 } 7504 } 7505 md_update_sb(mddev, 
1); 7506 return rv; 7507 err: 7508 return rv; 7509 } 7510 7511 static int set_disk_faulty(struct mddev *mddev, dev_t dev) 7512 { 7513 struct md_rdev *rdev; 7514 int err = 0; 7515 7516 if (mddev->pers == NULL) 7517 return -ENODEV; 7518 7519 rcu_read_lock(); 7520 rdev = md_find_rdev_rcu(mddev, dev); 7521 if (!rdev) 7522 err = -ENODEV; 7523 else { 7524 md_error(mddev, rdev); 7525 if (test_bit(MD_BROKEN, &mddev->flags)) 7526 err = -EBUSY; 7527 } 7528 rcu_read_unlock(); 7529 return err; 7530 } 7531 7532 /* 7533 * We have a problem here : there is no easy way to give a CHS 7534 * virtual geometry. We currently pretend that we have a 2 heads 7535 * 4 sectors (with a BIG number of cylinders...). This drives 7536 * dosfs just mad... ;-) 7537 */ 7538 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 7539 { 7540 struct mddev *mddev = bdev->bd_disk->private_data; 7541 7542 geo->heads = 2; 7543 geo->sectors = 4; 7544 geo->cylinders = mddev->array_sectors / 8; 7545 return 0; 7546 } 7547 7548 static inline bool md_ioctl_valid(unsigned int cmd) 7549 { 7550 switch (cmd) { 7551 case ADD_NEW_DISK: 7552 case GET_ARRAY_INFO: 7553 case GET_BITMAP_FILE: 7554 case GET_DISK_INFO: 7555 case HOT_ADD_DISK: 7556 case HOT_REMOVE_DISK: 7557 case RAID_VERSION: 7558 case RESTART_ARRAY_RW: 7559 case RUN_ARRAY: 7560 case SET_ARRAY_INFO: 7561 case SET_BITMAP_FILE: 7562 case SET_DISK_FAULTY: 7563 case STOP_ARRAY: 7564 case STOP_ARRAY_RO: 7565 case CLUSTERED_DISK_NACK: 7566 return true; 7567 default: 7568 return false; 7569 } 7570 } 7571 7572 static bool md_ioctl_need_suspend(unsigned int cmd) 7573 { 7574 switch (cmd) { 7575 case ADD_NEW_DISK: 7576 case HOT_ADD_DISK: 7577 case HOT_REMOVE_DISK: 7578 case SET_BITMAP_FILE: 7579 case SET_ARRAY_INFO: 7580 return true; 7581 default: 7582 return false; 7583 } 7584 } 7585 7586 static int __md_set_array_info(struct mddev *mddev, void __user *argp) 7587 { 7588 mdu_array_info_t info; 7589 int err; 7590 7591 if (!argp) 7592 memset(&info, 0, sizeof(info)); 7593 else if (copy_from_user(&info, argp, sizeof(info))) 7594 return -EFAULT; 7595 7596 if (mddev->pers) { 7597 err = update_array_info(mddev, &info); 7598 if (err) 7599 pr_warn("md: couldn't update array info. %d\n", err); 7600 return err; 7601 } 7602 7603 if (!list_empty(&mddev->disks)) { 7604 pr_warn("md: array %s already has disks!\n", mdname(mddev)); 7605 return -EBUSY; 7606 } 7607 7608 if (mddev->raid_disks) { 7609 pr_warn("md: array %s already initialised!\n", mdname(mddev)); 7610 return -EBUSY; 7611 } 7612 7613 err = md_set_array_info(mddev, &info); 7614 if (err) 7615 pr_warn("md: couldn't set array info. 
%d\n", err); 7616 7617 return err; 7618 } 7619 7620 static int md_ioctl(struct block_device *bdev, blk_mode_t mode, 7621 unsigned int cmd, unsigned long arg) 7622 { 7623 int err = 0; 7624 void __user *argp = (void __user *)arg; 7625 struct mddev *mddev = NULL; 7626 bool did_set_md_closing = false; 7627 7628 if (!md_ioctl_valid(cmd)) 7629 return -ENOTTY; 7630 7631 switch (cmd) { 7632 case RAID_VERSION: 7633 case GET_ARRAY_INFO: 7634 case GET_DISK_INFO: 7635 break; 7636 default: 7637 if (!capable(CAP_SYS_ADMIN)) 7638 return -EACCES; 7639 } 7640 7641 /* 7642 * Commands dealing with the RAID driver but not any 7643 * particular array: 7644 */ 7645 switch (cmd) { 7646 case RAID_VERSION: 7647 err = get_version(argp); 7648 goto out; 7649 default:; 7650 } 7651 7652 /* 7653 * Commands creating/starting a new array: 7654 */ 7655 7656 mddev = bdev->bd_disk->private_data; 7657 7658 if (!mddev) { 7659 BUG(); 7660 goto out; 7661 } 7662 7663 /* Some actions do not requires the mutex */ 7664 switch (cmd) { 7665 case GET_ARRAY_INFO: 7666 if (!mddev->raid_disks && !mddev->external) 7667 err = -ENODEV; 7668 else 7669 err = get_array_info(mddev, argp); 7670 goto out; 7671 7672 case GET_DISK_INFO: 7673 if (!mddev->raid_disks && !mddev->external) 7674 err = -ENODEV; 7675 else 7676 err = get_disk_info(mddev, argp); 7677 goto out; 7678 7679 case SET_DISK_FAULTY: 7680 err = set_disk_faulty(mddev, new_decode_dev(arg)); 7681 goto out; 7682 7683 case GET_BITMAP_FILE: 7684 err = get_bitmap_file(mddev, argp); 7685 goto out; 7686 7687 } 7688 7689 if (cmd == HOT_REMOVE_DISK) 7690 /* need to ensure recovery thread has run */ 7691 wait_event_interruptible_timeout(mddev->sb_wait, 7692 !test_bit(MD_RECOVERY_NEEDED, 7693 &mddev->recovery), 7694 msecs_to_jiffies(5000)); 7695 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { 7696 /* Need to flush page cache, and ensure no-one else opens 7697 * and writes 7698 */ 7699 mutex_lock(&mddev->open_mutex); 7700 if (mddev->pers && atomic_read(&mddev->openers) > 1) { 7701 mutex_unlock(&mddev->open_mutex); 7702 err = -EBUSY; 7703 goto out; 7704 } 7705 if (test_and_set_bit(MD_CLOSING, &mddev->flags)) { 7706 mutex_unlock(&mddev->open_mutex); 7707 err = -EBUSY; 7708 goto out; 7709 } 7710 did_set_md_closing = true; 7711 mutex_unlock(&mddev->open_mutex); 7712 sync_blockdev(bdev); 7713 } 7714 7715 if (!md_is_rdwr(mddev)) 7716 flush_work(&mddev->sync_work); 7717 7718 err = md_ioctl_need_suspend(cmd) ? 
mddev_suspend_and_lock(mddev) : 7719 mddev_lock(mddev); 7720 if (err) { 7721 pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n", 7722 err, cmd); 7723 goto out; 7724 } 7725 7726 if (cmd == SET_ARRAY_INFO) { 7727 err = __md_set_array_info(mddev, argp); 7728 goto unlock; 7729 } 7730 7731 /* 7732 * Commands querying/configuring an existing array: 7733 */ 7734 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 7735 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 7736 if ((!mddev->raid_disks && !mddev->external) 7737 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 7738 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 7739 && cmd != GET_BITMAP_FILE) { 7740 err = -ENODEV; 7741 goto unlock; 7742 } 7743 7744 /* 7745 * Commands even a read-only array can execute: 7746 */ 7747 switch (cmd) { 7748 case RESTART_ARRAY_RW: 7749 err = restart_array(mddev); 7750 goto unlock; 7751 7752 case STOP_ARRAY: 7753 err = do_md_stop(mddev, 0, bdev); 7754 goto unlock; 7755 7756 case STOP_ARRAY_RO: 7757 err = md_set_readonly(mddev, bdev); 7758 goto unlock; 7759 7760 case HOT_REMOVE_DISK: 7761 err = hot_remove_disk(mddev, new_decode_dev(arg)); 7762 goto unlock; 7763 7764 case ADD_NEW_DISK: 7765 /* We can support ADD_NEW_DISK on read-only arrays 7766 * only if we are re-adding a preexisting device. 7767 * So require mddev->pers and MD_DISK_SYNC. 7768 */ 7769 if (mddev->pers) { 7770 mdu_disk_info_t info; 7771 if (copy_from_user(&info, argp, sizeof(info))) 7772 err = -EFAULT; 7773 else if (!(info.state & (1<<MD_DISK_SYNC))) 7774 /* Need to clear read-only for this */ 7775 break; 7776 else 7777 err = md_add_new_disk(mddev, &info); 7778 goto unlock; 7779 } 7780 break; 7781 } 7782 7783 /* 7784 * The remaining ioctls are changing the state of the 7785 * superblock, so we do not allow them on read-only arrays. 7786 */ 7787 if (!md_is_rdwr(mddev) && mddev->pers) { 7788 if (mddev->ro != MD_AUTO_READ) { 7789 err = -EROFS; 7790 goto unlock; 7791 } 7792 mddev->ro = MD_RDWR; 7793 sysfs_notify_dirent_safe(mddev->sysfs_state); 7794 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7795 /* mddev_unlock will wake thread */ 7796 /* If a device failed while we were read-only, we 7797 * need to make sure the metadata is updated now. 7798 */ 7799 if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) { 7800 mddev_unlock(mddev); 7801 wait_event(mddev->sb_wait, 7802 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) && 7803 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 7804 mddev_lock_nointr(mddev); 7805 } 7806 } 7807 7808 switch (cmd) { 7809 case ADD_NEW_DISK: 7810 { 7811 mdu_disk_info_t info; 7812 if (copy_from_user(&info, argp, sizeof(info))) 7813 err = -EFAULT; 7814 else 7815 err = md_add_new_disk(mddev, &info); 7816 goto unlock; 7817 } 7818 7819 case CLUSTERED_DISK_NACK: 7820 if (mddev_is_clustered(mddev)) 7821 md_cluster_ops->new_disk_ack(mddev, false); 7822 else 7823 err = -EINVAL; 7824 goto unlock; 7825 7826 case HOT_ADD_DISK: 7827 err = hot_add_disk(mddev, new_decode_dev(arg)); 7828 goto unlock; 7829 7830 case RUN_ARRAY: 7831 err = do_md_run(mddev); 7832 goto unlock; 7833 7834 case SET_BITMAP_FILE: 7835 err = set_bitmap_file(mddev, (int)arg); 7836 goto unlock; 7837 7838 default: 7839 err = -EINVAL; 7840 goto unlock; 7841 } 7842 7843 unlock: 7844 if (mddev->hold_active == UNTIL_IOCTL && 7845 err != -EINVAL) 7846 mddev->hold_active = 0; 7847 7848 md_ioctl_need_suspend(cmd) ? 
mddev_unlock_and_resume(mddev) : 7849 mddev_unlock(mddev); 7850 7851 out: 7852 if(did_set_md_closing) 7853 clear_bit(MD_CLOSING, &mddev->flags); 7854 return err; 7855 } 7856 #ifdef CONFIG_COMPAT 7857 static int md_compat_ioctl(struct block_device *bdev, blk_mode_t mode, 7858 unsigned int cmd, unsigned long arg) 7859 { 7860 switch (cmd) { 7861 case HOT_REMOVE_DISK: 7862 case HOT_ADD_DISK: 7863 case SET_DISK_FAULTY: 7864 case SET_BITMAP_FILE: 7865 /* These take in integer arg, do not convert */ 7866 break; 7867 default: 7868 arg = (unsigned long)compat_ptr(arg); 7869 break; 7870 } 7871 7872 return md_ioctl(bdev, mode, cmd, arg); 7873 } 7874 #endif /* CONFIG_COMPAT */ 7875 7876 static int md_set_read_only(struct block_device *bdev, bool ro) 7877 { 7878 struct mddev *mddev = bdev->bd_disk->private_data; 7879 int err; 7880 7881 err = mddev_lock(mddev); 7882 if (err) 7883 return err; 7884 7885 if (!mddev->raid_disks && !mddev->external) { 7886 err = -ENODEV; 7887 goto out_unlock; 7888 } 7889 7890 /* 7891 * Transitioning to read-auto need only happen for arrays that call 7892 * md_write_start and which are not ready for writes yet. 7893 */ 7894 if (!ro && mddev->ro == MD_RDONLY && mddev->pers) { 7895 err = restart_array(mddev); 7896 if (err) 7897 goto out_unlock; 7898 mddev->ro = MD_AUTO_READ; 7899 } 7900 7901 out_unlock: 7902 mddev_unlock(mddev); 7903 return err; 7904 } 7905 7906 static int md_open(struct gendisk *disk, blk_mode_t mode) 7907 { 7908 struct mddev *mddev; 7909 int err; 7910 7911 spin_lock(&all_mddevs_lock); 7912 mddev = mddev_get(disk->private_data); 7913 spin_unlock(&all_mddevs_lock); 7914 if (!mddev) 7915 return -ENODEV; 7916 7917 err = mutex_lock_interruptible(&mddev->open_mutex); 7918 if (err) 7919 goto out; 7920 7921 err = -ENODEV; 7922 if (test_bit(MD_CLOSING, &mddev->flags)) 7923 goto out_unlock; 7924 7925 atomic_inc(&mddev->openers); 7926 mutex_unlock(&mddev->open_mutex); 7927 7928 disk_check_media_change(disk); 7929 return 0; 7930 7931 out_unlock: 7932 mutex_unlock(&mddev->open_mutex); 7933 out: 7934 mddev_put(mddev); 7935 return err; 7936 } 7937 7938 static void md_release(struct gendisk *disk) 7939 { 7940 struct mddev *mddev = disk->private_data; 7941 7942 BUG_ON(!mddev); 7943 atomic_dec(&mddev->openers); 7944 mddev_put(mddev); 7945 } 7946 7947 static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing) 7948 { 7949 struct mddev *mddev = disk->private_data; 7950 unsigned int ret = 0; 7951 7952 if (mddev->changed) 7953 ret = DISK_EVENT_MEDIA_CHANGE; 7954 mddev->changed = 0; 7955 return ret; 7956 } 7957 7958 static void md_free_disk(struct gendisk *disk) 7959 { 7960 struct mddev *mddev = disk->private_data; 7961 7962 mddev_free(mddev); 7963 } 7964 7965 const struct block_device_operations md_fops = 7966 { 7967 .owner = THIS_MODULE, 7968 .submit_bio = md_submit_bio, 7969 .open = md_open, 7970 .release = md_release, 7971 .ioctl = md_ioctl, 7972 #ifdef CONFIG_COMPAT 7973 .compat_ioctl = md_compat_ioctl, 7974 #endif 7975 .getgeo = md_getgeo, 7976 .check_events = md_check_events, 7977 .set_read_only = md_set_read_only, 7978 .free_disk = md_free_disk, 7979 }; 7980 7981 static int md_thread(void *arg) 7982 { 7983 struct md_thread *thread = arg; 7984 7985 /* 7986 * md_thread is a 'system-thread', it's priority should be very 7987 * high. We avoid resource deadlocks individually in each 7988 * raid personality. 
(RAID5 does preallocation) We also use RR and 7989 * the very same RT priority as kswapd, thus we will never get 7990 * into a priority inversion deadlock. 7991 * 7992 * we definitely have to have equal or higher priority than 7993 * bdflush, otherwise bdflush will deadlock if there are too 7994 * many dirty RAID5 blocks. 7995 */ 7996 7997 allow_signal(SIGKILL); 7998 while (!kthread_should_stop()) { 7999 8000 /* We need to wait INTERRUPTIBLE so that 8001 * we don't add to the load-average. 8002 * That means we need to be sure no signals are 8003 * pending 8004 */ 8005 if (signal_pending(current)) 8006 flush_signals(current); 8007 8008 wait_event_interruptible_timeout 8009 (thread->wqueue, 8010 test_bit(THREAD_WAKEUP, &thread->flags) 8011 || kthread_should_stop() || kthread_should_park(), 8012 thread->timeout); 8013 8014 clear_bit(THREAD_WAKEUP, &thread->flags); 8015 if (kthread_should_park()) 8016 kthread_parkme(); 8017 if (!kthread_should_stop()) 8018 thread->run(thread); 8019 } 8020 8021 return 0; 8022 } 8023 8024 static void md_wakeup_thread_directly(struct md_thread __rcu *thread) 8025 { 8026 struct md_thread *t; 8027 8028 rcu_read_lock(); 8029 t = rcu_dereference(thread); 8030 if (t) 8031 wake_up_process(t->tsk); 8032 rcu_read_unlock(); 8033 } 8034 8035 void md_wakeup_thread(struct md_thread __rcu *thread) 8036 { 8037 struct md_thread *t; 8038 8039 rcu_read_lock(); 8040 t = rcu_dereference(thread); 8041 if (t) { 8042 pr_debug("md: waking up MD thread %s.\n", t->tsk->comm); 8043 set_bit(THREAD_WAKEUP, &t->flags); 8044 wake_up(&t->wqueue); 8045 } 8046 rcu_read_unlock(); 8047 } 8048 EXPORT_SYMBOL(md_wakeup_thread); 8049 8050 struct md_thread *md_register_thread(void (*run) (struct md_thread *), 8051 struct mddev *mddev, const char *name) 8052 { 8053 struct md_thread *thread; 8054 8055 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); 8056 if (!thread) 8057 return NULL; 8058 8059 init_waitqueue_head(&thread->wqueue); 8060 8061 thread->run = run; 8062 thread->mddev = mddev; 8063 thread->timeout = MAX_SCHEDULE_TIMEOUT; 8064 thread->tsk = kthread_run(md_thread, thread, 8065 "%s_%s", 8066 mdname(thread->mddev), 8067 name); 8068 if (IS_ERR(thread->tsk)) { 8069 kfree(thread); 8070 return NULL; 8071 } 8072 return thread; 8073 } 8074 EXPORT_SYMBOL(md_register_thread); 8075 8076 void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp) 8077 { 8078 struct md_thread *thread = rcu_dereference_protected(*threadp, 8079 lockdep_is_held(&mddev->reconfig_mutex)); 8080 8081 if (!thread) 8082 return; 8083 8084 rcu_assign_pointer(*threadp, NULL); 8085 synchronize_rcu(); 8086 8087 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 8088 kthread_stop(thread->tsk); 8089 kfree(thread); 8090 } 8091 EXPORT_SYMBOL(md_unregister_thread); 8092 8093 void md_error(struct mddev *mddev, struct md_rdev *rdev) 8094 { 8095 if (!rdev || test_bit(Faulty, &rdev->flags)) 8096 return; 8097 8098 if (!mddev->pers || !mddev->pers->error_handler) 8099 return; 8100 mddev->pers->error_handler(mddev, rdev); 8101 8102 if (mddev->pers->level == 0) 8103 return; 8104 8105 if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) 8106 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8107 sysfs_notify_dirent_safe(rdev->sysfs_state); 8108 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8109 if (!test_bit(MD_BROKEN, &mddev->flags)) { 8110 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8111 md_wakeup_thread(mddev->thread); 8112 } 8113 if (mddev->event_work.func) 8114 queue_work(md_misc_wq, 
&mddev->event_work); 8115 md_new_event(); 8116 } 8117 EXPORT_SYMBOL(md_error); 8118 8119 /* seq_file implementation /proc/mdstat */ 8120 8121 static void status_unused(struct seq_file *seq) 8122 { 8123 int i = 0; 8124 struct md_rdev *rdev; 8125 8126 seq_printf(seq, "unused devices: "); 8127 8128 list_for_each_entry(rdev, &pending_raid_disks, same_set) { 8129 i++; 8130 seq_printf(seq, "%pg ", rdev->bdev); 8131 } 8132 if (!i) 8133 seq_printf(seq, "<none>"); 8134 8135 seq_printf(seq, "\n"); 8136 } 8137 8138 static int status_resync(struct seq_file *seq, struct mddev *mddev) 8139 { 8140 sector_t max_sectors, resync, res; 8141 unsigned long dt, db = 0; 8142 sector_t rt, curr_mark_cnt, resync_mark_cnt; 8143 int scale, recovery_active; 8144 unsigned int per_milli; 8145 8146 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 8147 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 8148 max_sectors = mddev->resync_max_sectors; 8149 else 8150 max_sectors = mddev->dev_sectors; 8151 8152 resync = mddev->curr_resync; 8153 if (resync < MD_RESYNC_ACTIVE) { 8154 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 8155 /* Still cleaning up */ 8156 resync = max_sectors; 8157 } else if (resync > max_sectors) { 8158 resync = max_sectors; 8159 } else { 8160 res = atomic_read(&mddev->recovery_active); 8161 /* 8162 * Resync has started, but the subtraction has overflowed or 8163 * yielded one of the special values. Force it to active to 8164 * ensure the status reports an active resync. 8165 */ 8166 if (resync < res || resync - res < MD_RESYNC_ACTIVE) 8167 resync = MD_RESYNC_ACTIVE; 8168 else 8169 resync -= res; 8170 } 8171 8172 if (resync == MD_RESYNC_NONE) { 8173 if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { 8174 struct md_rdev *rdev; 8175 8176 rdev_for_each(rdev, mddev) 8177 if (rdev->raid_disk >= 0 && 8178 !test_bit(Faulty, &rdev->flags) && 8179 rdev->recovery_offset != MaxSector && 8180 rdev->recovery_offset) { 8181 seq_printf(seq, "\trecover=REMOTE"); 8182 return 1; 8183 } 8184 if (mddev->reshape_position != MaxSector) 8185 seq_printf(seq, "\treshape=REMOTE"); 8186 else 8187 seq_printf(seq, "\tresync=REMOTE"); 8188 return 1; 8189 } 8190 if (mddev->recovery_cp < MaxSector) { 8191 seq_printf(seq, "\tresync=PENDING"); 8192 return 1; 8193 } 8194 return 0; 8195 } 8196 if (resync < MD_RESYNC_ACTIVE) { 8197 seq_printf(seq, "\tresync=DELAYED"); 8198 return 1; 8199 } 8200 8201 WARN_ON(max_sectors == 0); 8202 /* Pick 'scale' such that (resync>>scale)*1000 will fit 8203 * in a sector_t, and (max_sectors>>scale) will fit in a 8204 * u32, as those are the requirements for sector_div. 8205 * Thus 'scale' must be at least 10 8206 */ 8207 scale = 10; 8208 if (sizeof(sector_t) > sizeof(unsigned long)) { 8209 while ( max_sectors/2 > (1ULL<<(scale+32))) 8210 scale++; 8211 } 8212 res = (resync>>scale)*1000; 8213 sector_div(res, (u32)((max_sectors>>scale)+1)); 8214 8215 per_milli = res; 8216 { 8217 int i, x = per_milli/50, y = 20-x; 8218 seq_printf(seq, "["); 8219 for (i = 0; i < x; i++) 8220 seq_printf(seq, "="); 8221 seq_printf(seq, ">"); 8222 for (i = 0; i < y; i++) 8223 seq_printf(seq, "."); 8224 seq_printf(seq, "] "); 8225 } 8226 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 8227 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 8228 "reshape" : 8229 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 8230 "check" : 8231 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 
8232 "resync" : "recovery"))), 8233 per_milli/10, per_milli % 10, 8234 (unsigned long long) resync/2, 8235 (unsigned long long) max_sectors/2); 8236 8237 /* 8238 * dt: time from mark until now 8239 * db: blocks written from mark until now 8240 * rt: remaining time 8241 * 8242 * rt is a sector_t, which is always 64bit now. We are keeping 8243 * the original algorithm, but it is not really necessary. 8244 * 8245 * Original algorithm: 8246 * So we divide before multiply in case it is 32bit and close 8247 * to the limit. 8248 * We scale the divisor (db) by 32 to avoid losing precision 8249 * near the end of resync when the number of remaining sectors 8250 * is close to 'db'. 8251 * We then divide rt by 32 after multiplying by db to compensate. 8252 * The '+1' avoids division by zero if db is very small. 8253 */ 8254 dt = ((jiffies - mddev->resync_mark) / HZ); 8255 if (!dt) dt++; 8256 8257 curr_mark_cnt = mddev->curr_mark_cnt; 8258 recovery_active = atomic_read(&mddev->recovery_active); 8259 resync_mark_cnt = mddev->resync_mark_cnt; 8260 8261 if (curr_mark_cnt >= (recovery_active + resync_mark_cnt)) 8262 db = curr_mark_cnt - (recovery_active + resync_mark_cnt); 8263 8264 rt = max_sectors - resync; /* number of remaining sectors */ 8265 rt = div64_u64(rt, db/32+1); 8266 rt *= dt; 8267 rt >>= 5; 8268 8269 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, 8270 ((unsigned long)rt % 60)/6); 8271 8272 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 8273 return 1; 8274 } 8275 8276 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 8277 __acquires(&all_mddevs_lock) 8278 { 8279 struct md_personality *pers; 8280 8281 seq_puts(seq, "Personalities : "); 8282 spin_lock(&pers_lock); 8283 list_for_each_entry(pers, &pers_list, list) 8284 seq_printf(seq, "[%s] ", pers->name); 8285 8286 spin_unlock(&pers_lock); 8287 seq_puts(seq, "\n"); 8288 seq->poll_event = atomic_read(&md_event_count); 8289 8290 spin_lock(&all_mddevs_lock); 8291 8292 return seq_list_start(&all_mddevs, *pos); 8293 } 8294 8295 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 8296 { 8297 return seq_list_next(v, &all_mddevs, pos); 8298 } 8299 8300 static void md_seq_stop(struct seq_file *seq, void *v) 8301 __releases(&all_mddevs_lock) 8302 { 8303 status_unused(seq); 8304 spin_unlock(&all_mddevs_lock); 8305 } 8306 8307 static int md_seq_show(struct seq_file *seq, void *v) 8308 { 8309 struct mddev *mddev = list_entry(v, struct mddev, all_mddevs); 8310 sector_t sectors; 8311 struct md_rdev *rdev; 8312 8313 if (!mddev_get(mddev)) 8314 return 0; 8315 8316 spin_unlock(&all_mddevs_lock); 8317 spin_lock(&mddev->lock); 8318 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 8319 seq_printf(seq, "%s : %sactive", mdname(mddev), 8320 mddev->pers ? 
"" : "in"); 8321 if (mddev->pers) { 8322 if (mddev->ro == MD_RDONLY) 8323 seq_printf(seq, " (read-only)"); 8324 if (mddev->ro == MD_AUTO_READ) 8325 seq_printf(seq, " (auto-read-only)"); 8326 seq_printf(seq, " %s", mddev->pers->name); 8327 } 8328 8329 sectors = 0; 8330 rcu_read_lock(); 8331 rdev_for_each_rcu(rdev, mddev) { 8332 seq_printf(seq, " %pg[%d]", rdev->bdev, rdev->desc_nr); 8333 8334 if (test_bit(WriteMostly, &rdev->flags)) 8335 seq_printf(seq, "(W)"); 8336 if (test_bit(Journal, &rdev->flags)) 8337 seq_printf(seq, "(J)"); 8338 if (test_bit(Faulty, &rdev->flags)) { 8339 seq_printf(seq, "(F)"); 8340 continue; 8341 } 8342 if (rdev->raid_disk < 0) 8343 seq_printf(seq, "(S)"); /* spare */ 8344 if (test_bit(Replacement, &rdev->flags)) 8345 seq_printf(seq, "(R)"); 8346 sectors += rdev->sectors; 8347 } 8348 rcu_read_unlock(); 8349 8350 if (!list_empty(&mddev->disks)) { 8351 if (mddev->pers) 8352 seq_printf(seq, "\n %llu blocks", 8353 (unsigned long long) 8354 mddev->array_sectors / 2); 8355 else 8356 seq_printf(seq, "\n %llu blocks", 8357 (unsigned long long)sectors / 2); 8358 } 8359 if (mddev->persistent) { 8360 if (mddev->major_version != 0 || 8361 mddev->minor_version != 90) { 8362 seq_printf(seq," super %d.%d", 8363 mddev->major_version, 8364 mddev->minor_version); 8365 } 8366 } else if (mddev->external) 8367 seq_printf(seq, " super external:%s", 8368 mddev->metadata_type); 8369 else 8370 seq_printf(seq, " super non-persistent"); 8371 8372 if (mddev->pers) { 8373 mddev->pers->status(seq, mddev); 8374 seq_printf(seq, "\n "); 8375 if (mddev->pers->sync_request) { 8376 if (status_resync(seq, mddev)) 8377 seq_printf(seq, "\n "); 8378 } 8379 } else 8380 seq_printf(seq, "\n "); 8381 8382 md_bitmap_status(seq, mddev->bitmap); 8383 8384 seq_printf(seq, "\n"); 8385 } 8386 spin_unlock(&mddev->lock); 8387 spin_lock(&all_mddevs_lock); 8388 if (atomic_dec_and_test(&mddev->active)) 8389 __mddev_put(mddev); 8390 8391 return 0; 8392 } 8393 8394 static const struct seq_operations md_seq_ops = { 8395 .start = md_seq_start, 8396 .next = md_seq_next, 8397 .stop = md_seq_stop, 8398 .show = md_seq_show, 8399 }; 8400 8401 static int md_seq_open(struct inode *inode, struct file *file) 8402 { 8403 struct seq_file *seq; 8404 int error; 8405 8406 error = seq_open(file, &md_seq_ops); 8407 if (error) 8408 return error; 8409 8410 seq = file->private_data; 8411 seq->poll_event = atomic_read(&md_event_count); 8412 return error; 8413 } 8414 8415 static int md_unloading; 8416 static __poll_t mdstat_poll(struct file *filp, poll_table *wait) 8417 { 8418 struct seq_file *seq = filp->private_data; 8419 __poll_t mask; 8420 8421 if (md_unloading) 8422 return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; 8423 poll_wait(filp, &md_event_waiters, wait); 8424 8425 /* always allow read */ 8426 mask = EPOLLIN | EPOLLRDNORM; 8427 8428 if (seq->poll_event != atomic_read(&md_event_count)) 8429 mask |= EPOLLERR | EPOLLPRI; 8430 return mask; 8431 } 8432 8433 static const struct proc_ops mdstat_proc_ops = { 8434 .proc_open = md_seq_open, 8435 .proc_read = seq_read, 8436 .proc_lseek = seq_lseek, 8437 .proc_release = seq_release, 8438 .proc_poll = mdstat_poll, 8439 }; 8440 8441 int register_md_personality(struct md_personality *p) 8442 { 8443 pr_debug("md: %s personality registered for level %d\n", 8444 p->name, p->level); 8445 spin_lock(&pers_lock); 8446 list_add_tail(&p->list, &pers_list); 8447 spin_unlock(&pers_lock); 8448 return 0; 8449 } 8450 EXPORT_SYMBOL(register_md_personality); 8451 8452 int unregister_md_personality(struct 
md_personality *p) 8453 { 8454 pr_debug("md: %s personality unregistered\n", p->name); 8455 spin_lock(&pers_lock); 8456 list_del_init(&p->list); 8457 spin_unlock(&pers_lock); 8458 return 0; 8459 } 8460 EXPORT_SYMBOL(unregister_md_personality); 8461 8462 int register_md_cluster_operations(struct md_cluster_operations *ops, 8463 struct module *module) 8464 { 8465 int ret = 0; 8466 spin_lock(&pers_lock); 8467 if (md_cluster_ops != NULL) 8468 ret = -EALREADY; 8469 else { 8470 md_cluster_ops = ops; 8471 md_cluster_mod = module; 8472 } 8473 spin_unlock(&pers_lock); 8474 return ret; 8475 } 8476 EXPORT_SYMBOL(register_md_cluster_operations); 8477 8478 int unregister_md_cluster_operations(void) 8479 { 8480 spin_lock(&pers_lock); 8481 md_cluster_ops = NULL; 8482 spin_unlock(&pers_lock); 8483 return 0; 8484 } 8485 EXPORT_SYMBOL(unregister_md_cluster_operations); 8486 8487 int md_setup_cluster(struct mddev *mddev, int nodes) 8488 { 8489 int ret; 8490 if (!md_cluster_ops) 8491 request_module("md-cluster"); 8492 spin_lock(&pers_lock); 8493 /* ensure module won't be unloaded */ 8494 if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { 8495 pr_warn("can't find md-cluster module or get its reference.\n"); 8496 spin_unlock(&pers_lock); 8497 return -ENOENT; 8498 } 8499 spin_unlock(&pers_lock); 8500 8501 ret = md_cluster_ops->join(mddev, nodes); 8502 if (!ret) 8503 mddev->safemode_delay = 0; 8504 return ret; 8505 } 8506 8507 void md_cluster_stop(struct mddev *mddev) 8508 { 8509 if (!md_cluster_ops) 8510 return; 8511 md_cluster_ops->leave(mddev); 8512 module_put(md_cluster_mod); 8513 } 8514 8515 static int is_mddev_idle(struct mddev *mddev, int init) 8516 { 8517 struct md_rdev *rdev; 8518 int idle; 8519 int curr_events; 8520 8521 idle = 1; 8522 rcu_read_lock(); 8523 rdev_for_each_rcu(rdev, mddev) { 8524 struct gendisk *disk = rdev->bdev->bd_disk; 8525 curr_events = (int)part_stat_read_accum(disk->part0, sectors) - 8526 atomic_read(&disk->sync_io); 8527 /* sync IO will cause sync_io to increase before the disk_stats 8528 * as sync_io is counted when a request starts, and 8529 * disk_stats is counted when it completes. 8530 * So resync activity will cause curr_events to be smaller than 8531 * when there was no such activity. 8532 * non-sync IO will cause disk_stat to increase without 8533 * increasing sync_io so curr_events will (eventually) 8534 * be larger than it was before. Once it becomes 8535 * substantially larger, the test below will cause 8536 * the array to appear non-idle, and resync will slow 8537 * down. 8538 * If there is a lot of outstanding resync activity when 8539 * we set last_event to curr_events, then all that activity 8540 * completing might cause the array to appear non-idle 8541 * and resync will be slowed down even though there might 8542 * not have been non-resync activity. This will only 8543 * happen once though. 'last_events' will soon reflect 8544 * the state where there is little or no outstanding 8545 * resync requests, and further resync activity will 8546 * always make curr_events less than last_events. 
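 *
 * Worked example of this heuristic (hypothetical numbers): if the
 * part_stat sector count advanced by 2048 since the last check while
 * sync_io advanced by 2000, curr_events exceeds last_events by only
 * 48, under the threshold of 64 used below, so the disk still counts
 * as idle. A further 100 sectors of non-sync IO would push the delta
 * to 148 and mark the array non-idle, throttling resync.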
8547 * 8548 */ 8549 if (init || curr_events - rdev->last_events > 64) { 8550 rdev->last_events = curr_events; 8551 idle = 0; 8552 } 8553 } 8554 rcu_read_unlock(); 8555 return idle; 8556 } 8557 8558 void md_done_sync(struct mddev *mddev, int blocks, int ok) 8559 { 8560 /* another "blocks" (512-byte) blocks have been synced */ 8561 atomic_sub(blocks, &mddev->recovery_active); 8562 wake_up(&mddev->recovery_wait); 8563 if (!ok) { 8564 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8565 set_bit(MD_RECOVERY_ERROR, &mddev->recovery); 8566 md_wakeup_thread(mddev->thread); 8567 // stop recovery, signal do_sync .... 8568 } 8569 } 8570 EXPORT_SYMBOL(md_done_sync); 8571 8572 /* md_write_start(mddev, bi) 8573 * If we need to update some array metadata (e.g. 'active' flag 8574 * in superblock) before writing, schedule a superblock update 8575 * and wait for it to complete. 8576 * A return value of 'false' means that the write wasn't recorded 8577 * and cannot proceed as the array is being suspended. 8578 */ 8579 bool md_write_start(struct mddev *mddev, struct bio *bi) 8580 { 8581 int did_change = 0; 8582 8583 if (bio_data_dir(bi) != WRITE) 8584 return true; 8585 8586 BUG_ON(mddev->ro == MD_RDONLY); 8587 if (mddev->ro == MD_AUTO_READ) { 8588 /* need to switch to read/write */ 8589 flush_work(&mddev->sync_work); 8590 mddev->ro = MD_RDWR; 8591 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8592 md_wakeup_thread(mddev->thread); 8593 md_wakeup_thread(mddev->sync_thread); 8594 did_change = 1; 8595 } 8596 rcu_read_lock(); 8597 percpu_ref_get(&mddev->writes_pending); 8598 smp_mb(); /* Match smp_mb in set_in_sync() */ 8599 if (mddev->safemode == 1) 8600 mddev->safemode = 0; 8601 /* sync_checkers is always 0 when writes_pending is in per-cpu mode */ 8602 if (mddev->in_sync || mddev->sync_checkers) { 8603 spin_lock(&mddev->lock); 8604 if (mddev->in_sync) { 8605 mddev->in_sync = 0; 8606 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 8607 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 8608 md_wakeup_thread(mddev->thread); 8609 did_change = 1; 8610 } 8611 spin_unlock(&mddev->lock); 8612 } 8613 rcu_read_unlock(); 8614 if (did_change) 8615 sysfs_notify_dirent_safe(mddev->sysfs_state); 8616 if (!mddev->has_superblocks) 8617 return true; 8618 wait_event(mddev->sb_wait, 8619 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) || 8620 is_md_suspended(mddev)); 8621 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 8622 percpu_ref_put(&mddev->writes_pending); 8623 return false; 8624 } 8625 return true; 8626 } 8627 EXPORT_SYMBOL(md_write_start); 8628 8629 /* md_write_inc can only be called when md_write_start() has 8630 * already been called at least once for the current request. 8631 * It increments the counter and is useful when a single request 8632 * is split into several parts. Each part causes an increment and 8633 * so needs a matching md_write_end(). 8634 * Unlike md_write_start(), it is safe to call md_write_inc() inside 8635 * a spinlocked region.
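 *
 * Minimal usage sketch (illustrative only; 'split' stands for a
 * hypothetical extra bio produced by splitting the request):
 *
 *	if (!md_write_start(mddev, bio))
 *		return false;		-- suspended, write not recorded
 *	md_write_inc(mddev, split);	-- once per additional part
 *	...
 *	md_write_end(mddev);		-- one matching call per start/inc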
8636 */ 8637 void md_write_inc(struct mddev *mddev, struct bio *bi) 8638 { 8639 if (bio_data_dir(bi) != WRITE) 8640 return; 8641 WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev)); 8642 percpu_ref_get(&mddev->writes_pending); 8643 } 8644 EXPORT_SYMBOL(md_write_inc); 8645 8646 void md_write_end(struct mddev *mddev) 8647 { 8648 percpu_ref_put(&mddev->writes_pending); 8649 8650 if (mddev->safemode == 2) 8651 md_wakeup_thread(mddev->thread); 8652 else if (mddev->safemode_delay) 8653 /* The roundup() ensures this only performs locking once 8654 * every ->safemode_delay jiffies 8655 */ 8656 mod_timer(&mddev->safemode_timer, 8657 roundup(jiffies, mddev->safemode_delay) + 8658 mddev->safemode_delay); 8659 } 8660 8661 EXPORT_SYMBOL(md_write_end); 8662 8663 /* This is used by raid0 and raid10 */ 8664 void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, 8665 struct bio *bio, sector_t start, sector_t size) 8666 { 8667 struct bio *discard_bio = NULL; 8668 8669 if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO, 8670 &discard_bio) || !discard_bio) 8671 return; 8672 8673 bio_chain(discard_bio, bio); 8674 bio_clone_blkg_association(discard_bio, bio); 8675 if (mddev->gendisk) 8676 trace_block_bio_remap(discard_bio, 8677 disk_devt(mddev->gendisk), 8678 bio->bi_iter.bi_sector); 8679 submit_bio_noacct(discard_bio); 8680 } 8681 EXPORT_SYMBOL_GPL(md_submit_discard_bio); 8682 8683 static void md_end_clone_io(struct bio *bio) 8684 { 8685 struct md_io_clone *md_io_clone = bio->bi_private; 8686 struct bio *orig_bio = md_io_clone->orig_bio; 8687 struct mddev *mddev = md_io_clone->mddev; 8688 8689 if (bio->bi_status && !orig_bio->bi_status) 8690 orig_bio->bi_status = bio->bi_status; 8691 8692 if (md_io_clone->start_time) 8693 bio_end_io_acct(orig_bio, md_io_clone->start_time); 8694 8695 bio_put(bio); 8696 bio_endio(orig_bio); 8697 percpu_ref_put(&mddev->active_io); 8698 } 8699 8700 static void md_clone_bio(struct mddev *mddev, struct bio **bio) 8701 { 8702 struct block_device *bdev = (*bio)->bi_bdev; 8703 struct md_io_clone *md_io_clone; 8704 struct bio *clone = 8705 bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set); 8706 8707 md_io_clone = container_of(clone, struct md_io_clone, bio_clone); 8708 md_io_clone->orig_bio = *bio; 8709 md_io_clone->mddev = mddev; 8710 if (blk_queue_io_stat(bdev->bd_disk->queue)) 8711 md_io_clone->start_time = bio_start_io_acct(*bio); 8712 8713 clone->bi_end_io = md_end_clone_io; 8714 clone->bi_private = md_io_clone; 8715 *bio = clone; 8716 } 8717 8718 void md_account_bio(struct mddev *mddev, struct bio **bio) 8719 { 8720 percpu_ref_get(&mddev->active_io); 8721 md_clone_bio(mddev, bio); 8722 } 8723 EXPORT_SYMBOL_GPL(md_account_bio); 8724 8725 /* md_allow_write(mddev) 8726 * Calling this ensures that the array is marked 'active' so that writes 8727 * may proceed without blocking. It is important to call this before 8728 * attempting a GFP_KERNEL allocation while holding the mddev lock. 8729 * Must be called with mddev_lock held. 
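 *
 * Sketch of the intended call pattern (hypothetical caller, shown
 * only for illustration):
 *
 *	md_allow_write(mddev);			-- may block on a sb update
 *	ptr = kzalloc(size, GFP_KERNEL);	-- allocation is now safe
 *
 * The usual concern is that reclaim entered from the allocation can
 * issue a write to the array, and that write would block on the
 * 'active' metadata transition while we still hold the lock it needs.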
8730 */ 8731 void md_allow_write(struct mddev *mddev) 8732 { 8733 if (!mddev->pers) 8734 return; 8735 if (!md_is_rdwr(mddev)) 8736 return; 8737 if (!mddev->pers->sync_request) 8738 return; 8739 8740 spin_lock(&mddev->lock); 8741 if (mddev->in_sync) { 8742 mddev->in_sync = 0; 8743 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 8744 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 8745 if (mddev->safemode_delay && 8746 mddev->safemode == 0) 8747 mddev->safemode = 1; 8748 spin_unlock(&mddev->lock); 8749 md_update_sb(mddev, 0); 8750 sysfs_notify_dirent_safe(mddev->sysfs_state); 8751 /* wait for the dirty state to be recorded in the metadata */ 8752 wait_event(mddev->sb_wait, 8753 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 8754 } else 8755 spin_unlock(&mddev->lock); 8756 } 8757 EXPORT_SYMBOL_GPL(md_allow_write); 8758 8759 #define SYNC_MARKS 10 8760 #define SYNC_MARK_STEP (3*HZ) 8761 #define UPDATE_FREQUENCY (5*60*HZ) 8762 void md_do_sync(struct md_thread *thread) 8763 { 8764 struct mddev *mddev = thread->mddev; 8765 struct mddev *mddev2; 8766 unsigned int currspeed = 0, window; 8767 sector_t max_sectors,j, io_sectors, recovery_done; 8768 unsigned long mark[SYNC_MARKS]; 8769 unsigned long update_time; 8770 sector_t mark_cnt[SYNC_MARKS]; 8771 int last_mark,m; 8772 sector_t last_check; 8773 int skipped = 0; 8774 struct md_rdev *rdev; 8775 char *desc, *action = NULL; 8776 struct blk_plug plug; 8777 int ret; 8778 8779 /* just in case thread restarts... */ 8780 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 8781 test_bit(MD_RECOVERY_WAIT, &mddev->recovery)) 8782 return; 8783 if (!md_is_rdwr(mddev)) {/* never try to sync a read-only array */ 8784 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8785 return; 8786 } 8787 8788 if (mddev_is_clustered(mddev)) { 8789 ret = md_cluster_ops->resync_start(mddev); 8790 if (ret) 8791 goto skip; 8792 8793 set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags); 8794 if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 8795 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || 8796 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) 8797 && ((unsigned long long)mddev->curr_resync_completed 8798 < (unsigned long long)mddev->resync_max_sectors)) 8799 goto skip; 8800 } 8801 8802 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 8803 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { 8804 desc = "data-check"; 8805 action = "check"; 8806 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 8807 desc = "requested-resync"; 8808 action = "repair"; 8809 } else 8810 desc = "resync"; 8811 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 8812 desc = "reshape"; 8813 else 8814 desc = "recovery"; 8815 8816 mddev->last_sync_action = action ?: desc; 8817 8818 /* 8819 * Before starting a resync we must have set curr_resync to 8820 * MD_RESYNC_DELAYED (2), and then checked that every "conflicting" array has curr_resync 8821 * less than ours. When we find one that is the same or higher 8822 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 8823 * to MD_RESYNC_YIELDED (1) if we choose to yield (based arbitrarily on address of mddev structure). 8824 * This will mean we have to start checking from the beginning again.
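 *
 * Illustrative walkthrough (hypothetical arrays): if md0 and md1 share
 * a physical unit and both reach MD_RESYNC_DELAYED, the lower-addressed
 * mddev drops to MD_RESYNC_YIELDED, wakes resync_wait and sleeps; the
 * higher-addressed one sees no conflicting resync at or above its own
 * value and proceeds; the yielder restarts its scan from the top once
 * it is woken.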
8825 * 8826 */ 8827 8828 do { 8829 int mddev2_minor = -1; 8830 mddev->curr_resync = MD_RESYNC_DELAYED; 8831 8832 try_again: 8833 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 8834 goto skip; 8835 spin_lock(&all_mddevs_lock); 8836 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) { 8837 if (test_bit(MD_DELETED, &mddev2->flags)) 8838 continue; 8839 if (mddev2 == mddev) 8840 continue; 8841 if (!mddev->parallel_resync 8842 && mddev2->curr_resync 8843 && match_mddev_units(mddev, mddev2)) { 8844 DEFINE_WAIT(wq); 8845 if (mddev < mddev2 && 8846 mddev->curr_resync == MD_RESYNC_DELAYED) { 8847 /* arbitrarily yield */ 8848 mddev->curr_resync = MD_RESYNC_YIELDED; 8849 wake_up(&resync_wait); 8850 } 8851 if (mddev > mddev2 && 8852 mddev->curr_resync == MD_RESYNC_YIELDED) 8853 /* no need to wait here, we can wait the next 8854 * time 'round when curr_resync == MD_RESYNC_DELAYED 8855 */ 8856 continue; 8857 /* We need to wait 'interruptible' so as not to 8858 * contribute to the load average, and not to 8859 * be caught by 'softlockup' 8860 */ 8861 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 8862 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 8863 mddev2->curr_resync >= mddev->curr_resync) { 8864 if (mddev2_minor != mddev2->md_minor) { 8865 mddev2_minor = mddev2->md_minor; 8866 pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n", 8867 desc, mdname(mddev), 8868 mdname(mddev2)); 8869 } 8870 spin_unlock(&all_mddevs_lock); 8871 8872 if (signal_pending(current)) 8873 flush_signals(current); 8874 schedule(); 8875 finish_wait(&resync_wait, &wq); 8876 goto try_again; 8877 } 8878 finish_wait(&resync_wait, &wq); 8879 } 8880 } 8881 spin_unlock(&all_mddevs_lock); 8882 } while (mddev->curr_resync < MD_RESYNC_DELAYED); 8883 8884 j = 0; 8885 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 8886 /* resync follows the size requested by the personality, 8887 * which defaults to physical size, but can be virtual size 8888 */ 8889 max_sectors = mddev->resync_max_sectors; 8890 atomic64_set(&mddev->resync_mismatches, 0); 8891 /* we don't use the checkpoint if there's a bitmap */ 8892 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 8893 j = mddev->resync_min; 8894 else if (!mddev->bitmap) 8895 j = mddev->recovery_cp; 8896 8897 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 8898 max_sectors = mddev->resync_max_sectors; 8899 /* 8900 * If the original node aborts reshaping then we continue the 8901 * reshaping, so set j again to avoid restarting the reshape 8902 * from the very beginning 8903 */ 8904 if (mddev_is_clustered(mddev) && 8905 mddev->reshape_position != MaxSector) 8906 j = mddev->reshape_position; 8907 } else { 8908 /* recovery follows the physical size of devices */ 8909 max_sectors = mddev->dev_sectors; 8910 j = MaxSector; 8911 rcu_read_lock(); 8912 rdev_for_each_rcu(rdev, mddev) 8913 if (rdev->raid_disk >= 0 && 8914 !test_bit(Journal, &rdev->flags) && 8915 !test_bit(Faulty, &rdev->flags) && 8916 !test_bit(In_sync, &rdev->flags) && 8917 rdev->recovery_offset < j) 8918 j = rdev->recovery_offset; 8919 rcu_read_unlock(); 8920 8921 /* If there is a bitmap, we need to make sure all 8922 * writes that started before we added a spare 8923 * complete before we start doing a recovery. 8924 * Otherwise the write might complete and (via 8925 * bitmap_endwrite) set a bit in the bitmap after the 8926 * recovery has checked that bit and skipped that 8927 * region.
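 *
 * A concrete interleaving this guards against (hypothetical): write W
 * starts before the spare is added; recovery checks W's bitmap bit,
 * finds it clear and skips the region; W then completes and (via
 * bitmap_endwrite) sets the bit; the region is now marked dirty but
 * recovery has already passed it. Quiescing the array first drains W.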
8928 */ 8929 if (mddev->bitmap) { 8930 mddev->pers->quiesce(mddev, 1); 8931 mddev->pers->quiesce(mddev, 0); 8932 } 8933 } 8934 8935 pr_info("md: %s of RAID array %s\n", desc, mdname(mddev)); 8936 pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev)); 8937 pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n", 8938 speed_max(mddev), desc); 8939 8940 is_mddev_idle(mddev, 1); /* this initializes IO event counters */ 8941 8942 io_sectors = 0; 8943 for (m = 0; m < SYNC_MARKS; m++) { 8944 mark[m] = jiffies; 8945 mark_cnt[m] = io_sectors; 8946 } 8947 last_mark = 0; 8948 mddev->resync_mark = mark[last_mark]; 8949 mddev->resync_mark_cnt = mark_cnt[last_mark]; 8950 8951 /* 8952 * Tune reconstruction: 8953 */ 8954 window = 32 * (PAGE_SIZE / 512); 8955 pr_debug("md: using %dk window, over a total of %lluk.\n", 8956 window/2, (unsigned long long)max_sectors/2); 8957 8958 atomic_set(&mddev->recovery_active, 0); 8959 last_check = 0; 8960 8961 if (j >= MD_RESYNC_ACTIVE) { 8962 pr_debug("md: resuming %s of %s from checkpoint.\n", 8963 desc, mdname(mddev)); 8964 mddev->curr_resync = j; 8965 } else 8966 mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */ 8967 mddev->curr_resync_completed = j; 8968 sysfs_notify_dirent_safe(mddev->sysfs_completed); 8969 md_new_event(); 8970 update_time = jiffies; 8971 8972 blk_start_plug(&plug); 8973 while (j < max_sectors) { 8974 sector_t sectors; 8975 8976 skipped = 0; 8977 8978 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 8979 ((mddev->curr_resync > mddev->curr_resync_completed && 8980 (mddev->curr_resync - mddev->curr_resync_completed) 8981 > (max_sectors >> 4)) || 8982 time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || 8983 (j - mddev->curr_resync_completed)*2 8984 >= mddev->resync_max - mddev->curr_resync_completed || 8985 mddev->curr_resync_completed > mddev->resync_max 8986 )) { 8987 /* time to update curr_resync_completed */ 8988 wait_event(mddev->recovery_wait, 8989 atomic_read(&mddev->recovery_active) == 0); 8990 mddev->curr_resync_completed = j; 8991 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 8992 j > mddev->recovery_cp) 8993 mddev->recovery_cp = j; 8994 update_time = jiffies; 8995 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 8996 sysfs_notify_dirent_safe(mddev->sysfs_completed); 8997 } 8998 8999 while (j >= mddev->resync_max && 9000 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9001 /* As this condition is controlled by user-space, 9002 * we can block indefinitely, so use '_interruptible' 9003 * to avoid triggering warnings. 9004 */ 9005 flush_signals(current); /* just in case */ 9006 wait_event_interruptible(mddev->recovery_wait, 9007 mddev->resync_max > j 9008 || test_bit(MD_RECOVERY_INTR, 9009 &mddev->recovery)); 9010 } 9011 9012 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9013 break; 9014 9015 sectors = mddev->pers->sync_request(mddev, j, &skipped); 9016 if (sectors == 0) { 9017 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9018 break; 9019 } 9020 9021 if (!skipped) { /* actual IO requested */ 9022 io_sectors += sectors; 9023 atomic_add(sectors, &mddev->recovery_active); 9024 } 9025 9026 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9027 break; 9028 9029 j += sectors; 9030 if (j > max_sectors) 9031 /* when skipping, extra large numbers can be returned. 
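 * For example, a personality consulting the write-intent bitmap may
 * report a whole clean region as handled in a single call.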
*/ 9032 j = max_sectors; 9033 if (j >= MD_RESYNC_ACTIVE) 9034 mddev->curr_resync = j; 9035 mddev->curr_mark_cnt = io_sectors; 9036 if (last_check == 0) 9037 /* this is the earliest that rebuild will be 9038 * visible in /proc/mdstat 9039 */ 9040 md_new_event(); 9041 9042 if (last_check + window > io_sectors || j == max_sectors) 9043 continue; 9044 9045 last_check = io_sectors; 9046 repeat: 9047 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 9048 /* step marks */ 9049 int next = (last_mark+1) % SYNC_MARKS; 9050 9051 mddev->resync_mark = mark[next]; 9052 mddev->resync_mark_cnt = mark_cnt[next]; 9053 mark[next] = jiffies; 9054 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 9055 last_mark = next; 9056 } 9057 9058 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9059 break; 9060 9061 /* 9062 * this loop exits only if either we are slower than 9063 * the 'hard' speed limit, or the system was IO-idle for 9064 * a jiffy. 9065 * the system might be non-idle CPU-wise, but we only care 9066 * about not overloading the IO subsystem. (things like an 9067 * e2fsck being done on the RAID array should execute fast) 9068 */ 9069 cond_resched(); 9070 9071 recovery_done = io_sectors - atomic_read(&mddev->recovery_active); 9072 currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2 9073 /((jiffies-mddev->resync_mark)/HZ +1) +1; 9074 9075 if (currspeed > speed_min(mddev)) { 9076 if (currspeed > speed_max(mddev)) { 9077 msleep(500); 9078 goto repeat; 9079 } 9080 if (!is_mddev_idle(mddev, 0)) { 9081 /* 9082 * Give other IO more of a chance. 9083 * The faster the devices, the less we wait. 9084 */ 9085 wait_event(mddev->recovery_wait, 9086 !atomic_read(&mddev->recovery_active)); 9087 } 9088 } 9089 } 9090 pr_info("md: %s: %s %s.\n",mdname(mddev), desc, 9091 test_bit(MD_RECOVERY_INTR, &mddev->recovery) 9092 ?
"interrupted" : "done"); 9093 /* 9094 * this also signals 'finished resyncing' to md_stop 9095 */ 9096 blk_finish_plug(&plug); 9097 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 9098 9099 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9100 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9101 mddev->curr_resync >= MD_RESYNC_ACTIVE) { 9102 mddev->curr_resync_completed = mddev->curr_resync; 9103 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9104 } 9105 mddev->pers->sync_request(mddev, max_sectors, &skipped); 9106 9107 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 9108 mddev->curr_resync > MD_RESYNC_ACTIVE) { 9109 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 9110 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9111 if (mddev->curr_resync >= mddev->recovery_cp) { 9112 pr_debug("md: checkpointing %s of %s.\n", 9113 desc, mdname(mddev)); 9114 if (test_bit(MD_RECOVERY_ERROR, 9115 &mddev->recovery)) 9116 mddev->recovery_cp = 9117 mddev->curr_resync_completed; 9118 else 9119 mddev->recovery_cp = 9120 mddev->curr_resync; 9121 } 9122 } else 9123 mddev->recovery_cp = MaxSector; 9124 } else { 9125 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 9126 mddev->curr_resync = MaxSector; 9127 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9128 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { 9129 rcu_read_lock(); 9130 rdev_for_each_rcu(rdev, mddev) 9131 if (rdev->raid_disk >= 0 && 9132 mddev->delta_disks >= 0 && 9133 !test_bit(Journal, &rdev->flags) && 9134 !test_bit(Faulty, &rdev->flags) && 9135 !test_bit(In_sync, &rdev->flags) && 9136 rdev->recovery_offset < mddev->curr_resync) 9137 rdev->recovery_offset = mddev->curr_resync; 9138 rcu_read_unlock(); 9139 } 9140 } 9141 } 9142 skip: 9143 /* set CHANGE_PENDING here since maybe another update is needed, 9144 * so other nodes are informed. It should be harmless for normal 9145 * raid */ 9146 set_mask_bits(&mddev->sb_flags, 0, 9147 BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); 9148 9149 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9150 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9151 mddev->delta_disks > 0 && 9152 mddev->pers->finish_reshape && 9153 mddev->pers->size && 9154 mddev->queue) { 9155 mddev_lock_nointr(mddev); 9156 md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); 9157 mddev_unlock(mddev); 9158 if (!mddev_is_clustered(mddev)) 9159 set_capacity_and_notify(mddev->gendisk, 9160 mddev->array_sectors); 9161 } 9162 9163 spin_lock(&mddev->lock); 9164 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 9165 /* We completed so min/max setting can be forgotten if used. */ 9166 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 9167 mddev->resync_min = 0; 9168 mddev->resync_max = MaxSector; 9169 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 9170 mddev->resync_min = mddev->curr_resync_completed; 9171 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 9172 mddev->curr_resync = MD_RESYNC_NONE; 9173 spin_unlock(&mddev->lock); 9174 9175 wake_up(&resync_wait); 9176 md_wakeup_thread(mddev->thread); 9177 return; 9178 } 9179 EXPORT_SYMBOL_GPL(md_do_sync); 9180 9181 static bool rdev_removeable(struct md_rdev *rdev) 9182 { 9183 /* rdev is not used. */ 9184 if (rdev->raid_disk < 0) 9185 return false; 9186 9187 /* There are still inflight io, don't remove this rdev. */ 9188 if (atomic_read(&rdev->nr_pending)) 9189 return false; 9190 9191 /* 9192 * An error occurred but has not yet been acknowledged by the metadata 9193 * handler, don't remove this rdev. 
9194 */ 9195 if (test_bit(Blocked, &rdev->flags)) 9196 return false; 9197 9198 /* Faulty rdev is not used, it's safe to remove it. */ 9199 if (test_bit(Faulty, &rdev->flags)) 9200 return true; 9201 9202 /* Journal disk can only be removed if it's faulty. */ 9203 if (test_bit(Journal, &rdev->flags)) 9204 return false; 9205 9206 /* 9207 * 'In_sync' is cleared while 'raid_disk' is valid, which means 9208 * replacement has just become active from pers->spare_active(), and 9209 * then pers->hot_remove_disk() will replace this rdev with replacement. 9210 */ 9211 if (!test_bit(In_sync, &rdev->flags)) 9212 return true; 9213 9214 return false; 9215 } 9216 9217 static bool rdev_is_spare(struct md_rdev *rdev) 9218 { 9219 return !test_bit(Candidate, &rdev->flags) && rdev->raid_disk >= 0 && 9220 !test_bit(In_sync, &rdev->flags) && 9221 !test_bit(Journal, &rdev->flags) && 9222 !test_bit(Faulty, &rdev->flags); 9223 } 9224 9225 static bool rdev_addable(struct md_rdev *rdev) 9226 { 9227 /* rdev is already used, don't add it again. */ 9228 if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 || 9229 test_bit(Faulty, &rdev->flags)) 9230 return false; 9231 9232 /* Allow adding a journal disk. */ 9233 if (test_bit(Journal, &rdev->flags)) 9234 return true; 9235 9236 /* Allow adding if the array is read-write. */ 9237 if (md_is_rdwr(rdev->mddev)) 9238 return true; 9239 9240 /* 9241 * For a read-only array, only allow re-adding an rdev. And if a 9242 * bitmap is in use, don't allow re-adding an rdev that is too old. 9243 */ 9244 if (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags)) 9245 return true; 9246 9247 return false; 9248 } 9249 9250 static bool md_spares_need_change(struct mddev *mddev) 9251 { 9252 struct md_rdev *rdev; 9253 9254 rdev_for_each(rdev, mddev) 9255 if (rdev_removeable(rdev) || rdev_addable(rdev)) 9256 return true; 9257 return false; 9258 } 9259 9260 static int remove_and_add_spares(struct mddev *mddev, 9261 struct md_rdev *this) 9262 { 9263 struct md_rdev *rdev; 9264 int spares = 0; 9265 int removed = 0; 9266 9267 if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 9268 /* Mustn't remove devices when resync thread is running */ 9269 return 0; 9270 9271 rdev_for_each(rdev, mddev) { 9272 if ((this == NULL || rdev == this) && rdev_removeable(rdev) && 9273 !mddev->pers->hot_remove_disk(mddev, rdev)) { 9274 sysfs_unlink_rdev(mddev, rdev); 9275 rdev->saved_raid_disk = rdev->raid_disk; 9276 rdev->raid_disk = -1; 9277 removed++; 9278 } 9279 } 9280 9281 if (removed && mddev->kobj.sd) 9282 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 9283 9284 if (this && removed) 9285 goto no_add; 9286 9287 rdev_for_each(rdev, mddev) { 9288 if (this && this != rdev) 9289 continue; 9290 if (rdev_is_spare(rdev)) 9291 spares++; 9292 if (!rdev_addable(rdev)) 9293 continue; 9294 if (!test_bit(Journal, &rdev->flags)) 9295 rdev->recovery_offset = 0; 9296 if (mddev->pers->hot_add_disk(mddev, rdev) == 0) { 9297 /* failure here is OK */ 9298 sysfs_link_rdev(mddev, rdev); 9299 if (!test_bit(Journal, &rdev->flags)) 9300 spares++; 9301 md_new_event(); 9302 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9303 } 9304 } 9305 no_add: 9306 if (removed) 9307 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9308 return spares; 9309 } 9310 9311 static bool md_choose_sync_action(struct mddev *mddev, int *spares) 9312 { 9313 /* Check if reshape is in progress first.
*/ 9314 if (mddev->reshape_position != MaxSector) { 9315 if (mddev->pers->check_reshape == NULL || 9316 mddev->pers->check_reshape(mddev) != 0) 9317 return false; 9318 9319 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 9320 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9321 return true; 9322 } 9323 9324 /* 9325 * Remove any failed drives, then add spares if possible. Spares are 9326 * also removed and re-added, to allow the personality to fail the 9327 * re-add. 9328 */ 9329 *spares = remove_and_add_spares(mddev, NULL); 9330 if (*spares) { 9331 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9332 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 9333 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 9334 9335 /* Start new recovery. */ 9336 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9337 return true; 9338 } 9339 9340 /* Check if recovery is in progress. */ 9341 if (mddev->recovery_cp < MaxSector) { 9342 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9343 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9344 return true; 9345 } 9346 9347 /* Delay to choose resync/check/repair in md_do_sync(). */ 9348 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 9349 return true; 9350 9351 /* Nothing to be done */ 9352 return false; 9353 } 9354 9355 static void md_start_sync(struct work_struct *ws) 9356 { 9357 struct mddev *mddev = container_of(ws, struct mddev, sync_work); 9358 int spares = 0; 9359 bool suspend = false; 9360 9361 if (md_spares_need_change(mddev)) 9362 suspend = true; 9363 9364 suspend ? mddev_suspend_and_lock_nointr(mddev) : 9365 mddev_lock_nointr(mddev); 9366 9367 if (!md_is_rdwr(mddev)) { 9368 /* 9369 * On a read-only array we can: 9370 * - remove failed devices 9371 * - add already-in_sync devices if the array itself is in-sync. 9372 * As we only add devices that are already in-sync, we can 9373 * activate the spares immediately. 9374 */ 9375 remove_and_add_spares(mddev, NULL); 9376 goto not_running; 9377 } 9378 9379 if (!md_choose_sync_action(mddev, &spares)) 9380 goto not_running; 9381 9382 if (!mddev->pers->sync_request) 9383 goto not_running; 9384 9385 /* 9386 * We are adding a device or devices to an array which has the bitmap 9387 * stored on all devices. So make sure all bitmap pages get written. 9388 */ 9389 if (spares) 9390 md_bitmap_write_all(mddev->bitmap); 9391 9392 rcu_assign_pointer(mddev->sync_thread, 9393 md_register_thread(md_do_sync, mddev, "resync")); 9394 if (!mddev->sync_thread) { 9395 pr_warn("%s: could not start resync thread...\n", 9396 mdname(mddev)); 9397 /* leave the spares where they are, it shouldn't hurt */ 9398 goto not_running; 9399 } 9400 9401 suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); 9402 md_wakeup_thread(mddev->sync_thread); 9403 sysfs_notify_dirent_safe(mddev->sysfs_action); 9404 md_new_event(); 9405 return; 9406 9407 not_running: 9408 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9409 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 9410 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 9411 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 9412 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9413 suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev); 9414 9415 wake_up(&resync_wait); 9416 if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && 9417 mddev->sysfs_action) 9418 sysfs_notify_dirent_safe(mddev->sysfs_action); 9419 } 9420 9421 /* 9422 * This routine is regularly called by all per-raid-array threads to 9423 * deal with generic issues like resync and super-block update. 
9424 * Raid personalities that don't have a thread (linear/raid0) do not 9425 * need this as they never do any recovery or update the superblock. 9426 * 9427 * It does not do any resync itself, but rather "forks" off other threads 9428 * to do that as needed. 9429 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in 9430 * "->recovery" and create a thread at ->sync_thread. 9431 * When the thread finishes it sets MD_RECOVERY_DONE 9432 * and wakes up this thread, which will reap the thread and finish up. 9433 * This thread also removes any faulty devices (with nr_pending == 0). 9434 * 9435 * The overall approach is: 9436 * 1/ if the superblock needs updating, update it. 9437 * 2/ If a recovery thread is running, don't do anything else. 9438 * 3/ If recovery has finished, clean up, possibly marking spares active. 9439 * 4/ If there are any faulty devices, remove them. 9440 * 5/ If array is degraded, try to add spare devices 9441 * 6/ If array has spares or is not in-sync, start a resync thread. 9442 */ 9443 void md_check_recovery(struct mddev *mddev) 9444 { 9445 if (READ_ONCE(mddev->suspended)) 9446 return; 9447 9448 if (mddev->bitmap) 9449 md_bitmap_daemon_work(mddev); 9450 9451 if (signal_pending(current)) { 9452 if (mddev->pers->sync_request && !mddev->external) { 9453 pr_debug("md: %s in immediate safe mode\n", 9454 mdname(mddev)); 9455 mddev->safemode = 2; 9456 } 9457 flush_signals(current); 9458 } 9459 9460 if (!md_is_rdwr(mddev) && 9461 !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 9462 return; 9463 if ( ! ( 9464 (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) || 9465 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 9466 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 9467 (mddev->external == 0 && mddev->safemode == 1) || 9468 (mddev->safemode == 2 9469 && !mddev->in_sync && mddev->recovery_cp == MaxSector) 9470 )) 9471 return; 9472 9473 if (mddev_trylock(mddev)) { 9474 bool try_set_sync = mddev->safemode != 0; 9475 9476 if (!mddev->external && mddev->safemode == 1) 9477 mddev->safemode = 0; 9478 9479 if (!md_is_rdwr(mddev)) { 9480 struct md_rdev *rdev; 9481 9482 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 9483 /* sync_work already queued. */ 9484 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9485 goto unlock; 9486 } 9487 9488 if (!mddev->external && mddev->in_sync) 9489 /* 9490 * 'Blocked' flag not needed as failed devices 9491 * will be recorded if array switched to read/write. 9492 * Leaving it set will prevent the device 9493 * from being removed. 9494 */ 9495 rdev_for_each(rdev, mddev) 9496 clear_bit(Blocked, &rdev->flags); 9497 9498 /* 9499 * There is no thread, but we need to call 9500 * ->spare_active and clear saved_raid_disk 9501 */ 9502 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 9503 md_reap_sync_thread(mddev); 9504 9505 /* 9506 * Let md_start_sync() remove and add rdevs to the 9507 * array. 9508 */ 9509 if (md_spares_need_change(mddev)) { 9510 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9511 queue_work(md_misc_wq, &mddev->sync_work); 9512 } 9513 9514 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 9515 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9516 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 9517 9518 goto unlock; 9519 } 9520 9521 if (mddev_is_clustered(mddev)) { 9522 struct md_rdev *rdev, *tmp; 9523 /* kick the device if another node issued a 9524 * remove disk.
9525 */ 9526 rdev_for_each_safe(rdev, tmp, mddev) { 9527 if (test_and_clear_bit(ClusterRemove, &rdev->flags) && 9528 rdev->raid_disk < 0) 9529 md_kick_rdev_from_array(rdev); 9530 } 9531 } 9532 9533 if (try_set_sync && !mddev->external && !mddev->in_sync) { 9534 spin_lock(&mddev->lock); 9535 set_in_sync(mddev); 9536 spin_unlock(&mddev->lock); 9537 } 9538 9539 if (mddev->sb_flags) 9540 md_update_sb(mddev, 0); 9541 9542 /* 9543 * Never start a new sync thread if MD_RECOVERY_RUNNING is 9544 * still set. 9545 */ 9546 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 9547 if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 9548 /* resync/recovery still happening */ 9549 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9550 goto unlock; 9551 } 9552 9553 if (WARN_ON_ONCE(!mddev->sync_thread)) 9554 goto unlock; 9555 9556 md_reap_sync_thread(mddev); 9557 goto unlock; 9558 } 9559 9560 /* Set RUNNING before clearing NEEDED to avoid 9561 * any transients in the value of "sync_action". 9562 */ 9563 mddev->curr_resync_completed = 0; 9564 spin_lock(&mddev->lock); 9565 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9566 spin_unlock(&mddev->lock); 9567 /* Clear some bits that don't mean anything, but 9568 * might be left set 9569 */ 9570 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 9571 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 9572 9573 if (test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) && 9574 !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 9575 queue_work(md_misc_wq, &mddev->sync_work); 9576 } else { 9577 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9578 wake_up(&resync_wait); 9579 } 9580 9581 unlock: 9582 wake_up(&mddev->sb_wait); 9583 mddev_unlock(mddev); 9584 } 9585 } 9586 EXPORT_SYMBOL(md_check_recovery); 9587 9588 void md_reap_sync_thread(struct mddev *mddev) 9589 { 9590 struct md_rdev *rdev; 9591 sector_t old_dev_sectors = mddev->dev_sectors; 9592 bool is_reshaped = false; 9593 9594 /* resync has finished, collect result */ 9595 md_unregister_thread(mddev, &mddev->sync_thread); 9596 atomic_inc(&mddev->sync_seq); 9597 9598 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 9599 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 9600 mddev->degraded != mddev->raid_disks) { 9601 /* success...*/ 9602 /* activate any spares */ 9603 if (mddev->pers->spare_active(mddev)) { 9604 sysfs_notify_dirent_safe(mddev->sysfs_degraded); 9605 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 9606 } 9607 } 9608 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 9609 mddev->pers->finish_reshape) { 9610 mddev->pers->finish_reshape(mddev); 9611 if (mddev_is_clustered(mddev)) 9612 is_reshaped = true; 9613 } 9614 9615 /* If array is no-longer degraded, then any saved_raid_disk 9616 * information must be scrapped. 
9617 */ 9618 if (!mddev->degraded) 9619 rdev_for_each(rdev, mddev) 9620 rdev->saved_raid_disk = -1; 9621 9622 md_update_sb(mddev, 1); 9623 /* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can 9624 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by 9625 * clustered raid */ 9626 if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags)) 9627 md_cluster_ops->resync_finish(mddev); 9628 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 9629 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 9630 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 9631 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 9632 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 9633 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 9634 /* 9635 * We call md_cluster_ops->update_size here because sync_size could 9636 * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, 9637 * so it is time to update the size across the cluster. 9638 */ 9639 if (mddev_is_clustered(mddev) && is_reshaped 9640 && !test_bit(MD_CLOSING, &mddev->flags)) 9641 md_cluster_ops->update_size(mddev, old_dev_sectors); 9642 /* flag recovery needed just to double check */ 9643 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9644 sysfs_notify_dirent_safe(mddev->sysfs_completed); 9645 sysfs_notify_dirent_safe(mddev->sysfs_action); 9646 md_new_event(); 9647 if (mddev->event_work.func) 9648 queue_work(md_misc_wq, &mddev->event_work); 9649 wake_up(&resync_wait); 9650 } 9651 EXPORT_SYMBOL(md_reap_sync_thread); 9652 9653 void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) 9654 { 9655 sysfs_notify_dirent_safe(rdev->sysfs_state); 9656 wait_event_timeout(rdev->blocked_wait, 9657 !test_bit(Blocked, &rdev->flags) && 9658 !test_bit(BlockedBadBlocks, &rdev->flags), 9659 msecs_to_jiffies(5000)); 9660 rdev_dec_pending(rdev, mddev); 9661 } 9662 EXPORT_SYMBOL(md_wait_for_blocked_rdev); 9663 9664 void md_finish_reshape(struct mddev *mddev) 9665 { 9666 /* called by the personality module when reshape completes.
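 * Illustrative arithmetic: an rdev whose data_offset was 2048 and whose
 * new_data_offset is 1024 gains 1024 sectors of usable space
 * (sectors += 2048 - 1024) before data_offset is rewritten below.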
*/ 9667 struct md_rdev *rdev; 9668 9669 rdev_for_each(rdev, mddev) { 9670 if (rdev->data_offset > rdev->new_data_offset) 9671 rdev->sectors += rdev->data_offset - rdev->new_data_offset; 9672 else 9673 rdev->sectors -= rdev->new_data_offset - rdev->data_offset; 9674 rdev->data_offset = rdev->new_data_offset; 9675 } 9676 } 9677 EXPORT_SYMBOL(md_finish_reshape); 9678 9679 /* Bad block management */ 9680 9681 /* Returns 1 on success, 0 on failure */ 9682 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 9683 int is_new) 9684 { 9685 struct mddev *mddev = rdev->mddev; 9686 int rv; 9687 if (is_new) 9688 s += rdev->new_data_offset; 9689 else 9690 s += rdev->data_offset; 9691 rv = badblocks_set(&rdev->badblocks, s, sectors, 0); 9692 if (rv == 0) { 9693 /* Make sure they get written out promptly */ 9694 if (test_bit(ExternalBbl, &rdev->flags)) 9695 sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); 9696 sysfs_notify_dirent_safe(rdev->sysfs_state); 9697 set_mask_bits(&mddev->sb_flags, 0, 9698 BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); 9699 md_wakeup_thread(rdev->mddev->thread); 9700 return 1; 9701 } else 9702 return 0; 9703 } 9704 EXPORT_SYMBOL_GPL(rdev_set_badblocks); 9705 9706 int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 9707 int is_new) 9708 { 9709 int rv; 9710 if (is_new) 9711 s += rdev->new_data_offset; 9712 else 9713 s += rdev->data_offset; 9714 rv = badblocks_clear(&rdev->badblocks, s, sectors); 9715 if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags)) 9716 sysfs_notify_dirent_safe(rdev->sysfs_badblocks); 9717 return rv; 9718 } 9719 EXPORT_SYMBOL_GPL(rdev_clear_badblocks); 9720 9721 static int md_notify_reboot(struct notifier_block *this, 9722 unsigned long code, void *x) 9723 { 9724 struct mddev *mddev, *n; 9725 int need_delay = 0; 9726 9727 spin_lock(&all_mddevs_lock); 9728 list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { 9729 if (!mddev_get(mddev)) 9730 continue; 9731 spin_unlock(&all_mddevs_lock); 9732 if (mddev_trylock(mddev)) { 9733 if (mddev->pers) 9734 __md_stop_writes(mddev); 9735 if (mddev->persistent) 9736 mddev->safemode = 2; 9737 mddev_unlock(mddev); 9738 } 9739 need_delay = 1; 9740 mddev_put(mddev); 9741 spin_lock(&all_mddevs_lock); 9742 } 9743 spin_unlock(&all_mddevs_lock); 9744 9745 /* 9746 * certain more exotic SCSI devices are known to be 9747 * volatile wrt too early system reboots. While the 9748 * right place to handle this issue is the given 9749 * driver, we do want to have a safe RAID driver ... 
9750 */ 9751 if (need_delay) 9752 msleep(1000); 9753 9754 return NOTIFY_DONE; 9755 } 9756 9757 static struct notifier_block md_notifier = { 9758 .notifier_call = md_notify_reboot, 9759 .next = NULL, 9760 .priority = INT_MAX, /* before any real devices */ 9761 }; 9762 9763 static void md_geninit(void) 9764 { 9765 pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 9766 9767 proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops); 9768 } 9769 9770 static int __init md_init(void) 9771 { 9772 int ret = -ENOMEM; 9773 9774 md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); 9775 if (!md_wq) 9776 goto err_wq; 9777 9778 md_misc_wq = alloc_workqueue("md_misc", 0, 0); 9779 if (!md_misc_wq) 9780 goto err_misc_wq; 9781 9782 md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND, 9783 0); 9784 if (!md_bitmap_wq) 9785 goto err_bitmap_wq; 9786 9787 ret = __register_blkdev(MD_MAJOR, "md", md_probe); 9788 if (ret < 0) 9789 goto err_md; 9790 9791 ret = __register_blkdev(0, "mdp", md_probe); 9792 if (ret < 0) 9793 goto err_mdp; 9794 mdp_major = ret; 9795 9796 register_reboot_notifier(&md_notifier); 9797 raid_table_header = register_sysctl("dev/raid", raid_table); 9798 9799 md_geninit(); 9800 return 0; 9801 9802 err_mdp: 9803 unregister_blkdev(MD_MAJOR, "md"); 9804 err_md: 9805 destroy_workqueue(md_bitmap_wq); 9806 err_bitmap_wq: 9807 destroy_workqueue(md_misc_wq); 9808 err_misc_wq: 9809 destroy_workqueue(md_wq); 9810 err_wq: 9811 return ret; 9812 } 9813 9814 static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) 9815 { 9816 struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 9817 struct md_rdev *rdev2, *tmp; 9818 int role, ret; 9819 9820 /* 9821 * If the size was changed on another node, we need to 9822 * do the resize as well. 9823 */ 9824 if (mddev->dev_sectors != le64_to_cpu(sb->size)) { 9825 ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size)); 9826 if (ret) 9827 pr_info("md-cluster: resize failed\n"); 9828 else 9829 md_bitmap_update_sb(mddev->bitmap); 9830 } 9831 9832 /* Check for change of roles in the active devices */ 9833 rdev_for_each_safe(rdev2, tmp, mddev) { 9834 if (test_bit(Faulty, &rdev2->flags)) 9835 continue; 9836 9837 /* Check if the roles changed */ 9838 role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); 9839 9840 if (test_bit(Candidate, &rdev2->flags)) { 9841 if (role == MD_DISK_ROLE_FAULTY) { 9842 pr_info("md: Removing Candidate device %pg because add failed\n", 9843 rdev2->bdev); 9844 md_kick_rdev_from_array(rdev2); 9845 continue; 9846 } 9847 else 9848 clear_bit(Candidate, &rdev2->flags); 9849 } 9850 9851 if (role != rdev2->raid_disk) { 9852 /* 9853 * The device got activated, unless a reshape is happening. 9854 */ 9855 if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE && 9856 !(le32_to_cpu(sb->feature_map) & 9857 MD_FEATURE_RESHAPE_ACTIVE)) { 9858 rdev2->saved_raid_disk = role; 9859 ret = remove_and_add_spares(mddev, rdev2); 9860 pr_info("Activated spare: %pg\n", 9861 rdev2->bdev); 9862 /* wake up mddev->thread here, so the array can 9863 * perform a resync with the newly activated disk */ 9864 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 9865 md_wakeup_thread(mddev->thread); 9866 } 9867 /* device faulty 9868 * We just want to do the minimum to mark the disk 9869 * as faulty. The recovery is performed by the 9870 * one who initiated the error.
9871 */ 9872 if (role == MD_DISK_ROLE_FAULTY || 9873 role == MD_DISK_ROLE_JOURNAL) { 9874 md_error(mddev, rdev2); 9875 clear_bit(Blocked, &rdev2->flags); 9876 } 9877 } 9878 } 9879 9880 if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) { 9881 ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks)); 9882 if (ret) 9883 pr_warn("md: updating array disks failed. %d\n", ret); 9884 } 9885 9886 /* 9887 * Since mddev->delta_disks has already updated in update_raid_disks, 9888 * so it is time to check reshape. 9889 */ 9890 if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && 9891 (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 9892 /* 9893 * reshape is happening in the remote node, we need to 9894 * update reshape_position and call start_reshape. 9895 */ 9896 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 9897 if (mddev->pers->update_reshape_pos) 9898 mddev->pers->update_reshape_pos(mddev); 9899 if (mddev->pers->start_reshape) 9900 mddev->pers->start_reshape(mddev); 9901 } else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && 9902 mddev->reshape_position != MaxSector && 9903 !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 9904 /* reshape is just done in another node. */ 9905 mddev->reshape_position = MaxSector; 9906 if (mddev->pers->update_reshape_pos) 9907 mddev->pers->update_reshape_pos(mddev); 9908 } 9909 9910 /* Finally set the event to be up to date */ 9911 mddev->events = le64_to_cpu(sb->events); 9912 } 9913 9914 static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) 9915 { 9916 int err; 9917 struct page *swapout = rdev->sb_page; 9918 struct mdp_superblock_1 *sb; 9919 9920 /* Store the sb page of the rdev in the swapout temporary 9921 * variable in case we err in the future 9922 */ 9923 rdev->sb_page = NULL; 9924 err = alloc_disk_sb(rdev); 9925 if (err == 0) { 9926 ClearPageUptodate(rdev->sb_page); 9927 rdev->sb_loaded = 0; 9928 err = super_types[mddev->major_version]. 9929 load_super(rdev, NULL, mddev->minor_version); 9930 } 9931 if (err < 0) { 9932 pr_warn("%s: %d Could not reload rdev(%d) err: %d. 
void md_reload_sb(struct mddev *mddev, int nr)
{
	struct md_rdev *rdev = NULL, *iter;
	int err;

	/* Find the rdev */
	rdev_for_each_rcu(iter, mddev) {
		if (iter->desc_nr == nr) {
			rdev = iter;
			break;
		}
	}

	if (!rdev) {
		pr_warn("%s: %d Could not find rdev with nr %d\n",
			__func__, __LINE__, nr);
		return;
	}

	err = read_rdev(mddev, rdev);
	if (err < 0)
		return;

	check_sb_changes(mddev, rdev);

	/* Read all rdevs to update recovery_offset */
	rdev_for_each_rcu(rdev, mddev) {
		if (!test_bit(Faulty, &rdev->flags))
			read_rdev(mddev, rdev);
	}
}
EXPORT_SYMBOL(md_reload_sb);

#ifndef MODULE

/*
 * Searches all registered partitions for autorun RAID arrays
 * at boot time.
 */

static DEFINE_MUTEX(detected_devices_mutex);
static LIST_HEAD(all_detected_devices);
struct detected_devices_node {
	struct list_head list;
	dev_t dev;
};

void md_autodetect_dev(dev_t dev)
{
	struct detected_devices_node *node_detected_dev;

	node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
	if (node_detected_dev) {
		node_detected_dev->dev = dev;
		mutex_lock(&detected_devices_mutex);
		list_add_tail(&node_detected_dev->list, &all_detected_devices);
		mutex_unlock(&detected_devices_mutex);
	}
}

void md_autostart_arrays(int part)
{
	struct md_rdev *rdev;
	struct detected_devices_node *node_detected_dev;
	dev_t dev;
	int i_scanned, i_passed;

	i_scanned = 0;
	i_passed = 0;

	pr_info("md: Autodetecting RAID arrays.\n");

	mutex_lock(&detected_devices_mutex);
	while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
		i_scanned++;
		node_detected_dev = list_entry(all_detected_devices.next,
					struct detected_devices_node, list);
		list_del(&node_detected_dev->list);
		dev = node_detected_dev->dev;
		kfree(node_detected_dev);
		mutex_unlock(&detected_devices_mutex);
		rdev = md_import_device(dev, 0, 90);
		mutex_lock(&detected_devices_mutex);
		if (IS_ERR(rdev))
			continue;

		if (test_bit(Faulty, &rdev->flags))
			continue;

		set_bit(AutoDetected, &rdev->flags);
		list_add(&rdev->same_set, &pending_raid_disks);
		i_passed++;
	}
	mutex_unlock(&detected_devices_mutex);

	pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);

	autorun_devices(part);
}

#endif /* !MODULE */
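/*
 * The autodetect section above is driven from outside this file (a
 * sketch, assuming the usual boot path): the partition scanner calls
 * md_autodetect_dev() for each partition of type 0xfd (Linux raid
 * autodetect), and the md setup code later calls md_autostart_arrays()
 * to import those devices and assemble arrays via autorun_devices().
 */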
static __exit void md_exit(void)
{
	struct mddev *mddev, *n;
	int delay = 1;

	unregister_blkdev(MD_MAJOR, "md");
	unregister_blkdev(mdp_major, "mdp");
	unregister_reboot_notifier(&md_notifier);
	unregister_sysctl_table(raid_table_header);

	/*
	 * We cannot unload the module while some process is
	 * waiting for us in select() or poll() - wake them up.
	 */
	md_unloading = 1;
	while (waitqueue_active(&md_event_waiters)) {
		/* not safe to leave yet */
		wake_up(&md_event_waiters);
		msleep(delay);
		delay += delay;	/* double the wait each pass */
	}
	remove_proc_entry("mdstat", NULL);

	spin_lock(&all_mddevs_lock);
	list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
		if (!mddev_get(mddev))
			continue;
		spin_unlock(&all_mddevs_lock);
		export_array(mddev);
		mddev->ctime = 0;
		mddev->hold_active = 0;
		/*
		 * As the mddev is now fully clear, mddev_put will schedule
		 * the mddev for destruction by a workqueue, and the
		 * destroy_workqueue() below will wait for that to complete.
		 */
		mddev_put(mddev);
		spin_lock(&all_mddevs_lock);
	}
	spin_unlock(&all_mddevs_lock);

	destroy_workqueue(md_misc_wq);
	destroy_workqueue(md_bitmap_wq);
	destroy_workqueue(md_wq);
}

subsys_initcall(md_init);
module_exit(md_exit)

static int get_ro(char *buffer, const struct kernel_param *kp)
{
	return sprintf(buffer, "%d\n", start_readonly);
}

static int set_ro(const char *val, const struct kernel_param *kp)
{
	return kstrtouint(val, 10, (unsigned int *)&start_readonly);
}

module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
module_param(create_on_open, bool, S_IRUSR|S_IWUSR);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MD RAID framework");
MODULE_ALIAS("md");
MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
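/*
 * Example usage of the parameters above (illustrative; this file builds
 * into the md_mod module, so the kernel command line prefix is "md_mod."):
 *
 *	md_mod.start_ro=1		- arrays start in auto-read-only mode
 *	md_mod.start_dirty_degraded=1	- allow starting dirty degraded arrays
 */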