/*
   md.c : Multiple Devices driver for Linux
	  Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

   Errors, Warnings, etc.
   Please use:
     pr_crit() for error conditions that risk data loss
     pr_err() for error conditions that are unexpected, like an IO error
         or internal inconsistency
     pr_warn() for error conditions that could have been predicted, like
         adding a device to an array when it has incompatible metadata
     pr_info() for interesting, very rare events, like an array starting
         or stopping, or resync starting or stopping
     pr_debug() for everything else.

*/

#include <linux/sched/signal.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/badblocks.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/module.h>
#include <linux/reboot.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/slab.h>
#include <trace/events/block.h>
#include "md.h"
#include "bitmap.h"
#include "md-cluster.h"

#ifndef MODULE
static void autostart_arrays(int part);
#endif

/* pers_list is a list of registered personalities protected
 * by pers_lock.
 * pers_lock does extra service to protect accesses to
 * mddev->thread when the mutex cannot be held.
 */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

struct md_cluster_operations *md_cluster_ops;
EXPORT_SYMBOL(md_cluster_ops);
struct module *md_cluster_mod;
EXPORT_SYMBOL(md_cluster_mod);

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
static struct workqueue_struct *md_misc_wq;

static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);

/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
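 * For example, a member that had accumulated 8 corrected errors and then
 * goes two hours before the next one has its count halved twice, to 2,
 * well below the default limit of 20 defined below.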
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * You can change it via /proc/sys/dev/raid/speed_limit_min and _max,
 * or via /sys/block/mdX/md/sync_speed_{min,max}.
 */

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static inline int speed_min(struct mddev *mddev)
{
	return mddev->sync_speed_min ?
		mddev->sync_speed_min : sysctl_speed_limit_min;
}

static inline int speed_max(struct mddev *mddev)
{
	return mddev->sync_speed_max ?
		mddev->sync_speed_max : sysctl_speed_limit_max;
}

static struct ctl_table_header *raid_table_header;

static struct ctl_table raid_table[] = {
	{
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table raid_dir_table[] = {
	{
		.procname	= "raid",
		.maxlen		= 0,
		.mode		= S_IRUGO|S_IXUGO,
		.child		= raid_table,
	},
	{ }
};

static struct ctl_table raid_root_table[] = {
	{
		.procname	= "dev",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_dir_table,
	},
	{ }
};

static const struct block_device_operations md_fops;

static int start_readonly;

/* bio_clone_mddev
 * like bio_clone, but with a local bio set
 */

struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
			    struct mddev *mddev)
{
	struct bio *b;

	if (!mddev || !mddev->bio_set)
		return bio_alloc(gfp_mask, nr_iovecs);

	b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set);
	if (!b)
		return NULL;
	return b;
}
EXPORT_SYMBOL_GPL(bio_alloc_mddev);

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
void md_new_event(struct mddev *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);

/*
 * Enables iteration over all existing md arrays.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);

/*
 * iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while owning
 * a reference to the current mddev must mddev_put() it.
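 *
 * For illustration only (not code taken from this file), a scan that just
 * logs every array might look like this; note that _tmp is a plain
 * struct list_head pointer:
 *
 *	struct mddev *mddev;
 *	struct list_head *tmp;
 *
 *	for_each_mddev(mddev, tmp)
 *		pr_info("%s: %d raid disks\n", mdname(mddev), mddev->raid_disks);
 *
 * A caller that breaks out of the loop early still holds the reference
 * taken by the macro and must drop it with mddev_put().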
226 */ 227 #define for_each_mddev(_mddev,_tmp) \ 228 \ 229 for (({ spin_lock(&all_mddevs_lock); \ 230 _tmp = all_mddevs.next; \ 231 _mddev = NULL;}); \ 232 ({ if (_tmp != &all_mddevs) \ 233 mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\ 234 spin_unlock(&all_mddevs_lock); \ 235 if (_mddev) mddev_put(_mddev); \ 236 _mddev = list_entry(_tmp, struct mddev, all_mddevs); \ 237 _tmp != &all_mddevs;}); \ 238 ({ spin_lock(&all_mddevs_lock); \ 239 _tmp = _tmp->next;}) \ 240 ) 241 242 /* Rather than calling directly into the personality make_request function, 243 * IO requests come here first so that we can check if the device is 244 * being suspended pending a reconfiguration. 245 * We hold a refcount over the call to ->make_request. By the time that 246 * call has finished, the bio has been linked into some internal structure 247 * and so is visible to ->quiesce(), so we don't need the refcount any more. 248 */ 249 static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) 250 { 251 const int rw = bio_data_dir(bio); 252 struct mddev *mddev = q->queuedata; 253 unsigned int sectors; 254 int cpu; 255 256 blk_queue_split(q, &bio, q->bio_split); 257 258 if (mddev == NULL || mddev->pers == NULL) { 259 bio_io_error(bio); 260 return BLK_QC_T_NONE; 261 } 262 if (mddev->ro == 1 && unlikely(rw == WRITE)) { 263 if (bio_sectors(bio) != 0) 264 bio->bi_error = -EROFS; 265 bio_endio(bio); 266 return BLK_QC_T_NONE; 267 } 268 smp_rmb(); /* Ensure implications of 'active' are visible */ 269 rcu_read_lock(); 270 if (mddev->suspended) { 271 DEFINE_WAIT(__wait); 272 for (;;) { 273 prepare_to_wait(&mddev->sb_wait, &__wait, 274 TASK_UNINTERRUPTIBLE); 275 if (!mddev->suspended) 276 break; 277 rcu_read_unlock(); 278 schedule(); 279 rcu_read_lock(); 280 } 281 finish_wait(&mddev->sb_wait, &__wait); 282 } 283 atomic_inc(&mddev->active_io); 284 rcu_read_unlock(); 285 286 /* 287 * save the sectors now since our bio can 288 * go away inside make_request 289 */ 290 sectors = bio_sectors(bio); 291 /* bio could be mergeable after passing to underlayer */ 292 bio->bi_opf &= ~REQ_NOMERGE; 293 mddev->pers->make_request(mddev, bio); 294 295 cpu = part_stat_lock(); 296 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); 297 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors); 298 part_stat_unlock(); 299 300 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) 301 wake_up(&mddev->sb_wait); 302 303 return BLK_QC_T_NONE; 304 } 305 306 /* mddev_suspend makes sure no new requests are submitted 307 * to the device, and that any requests that have been submitted 308 * are completely handled. 309 * Once mddev_detach() is called and completes, the module will be 310 * completely unused. 
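 *
 * Illustrative sketch (not taken from this file) of the expected calling
 * pattern around a reconfiguration; apply_new_settings() is a hypothetical
 * helper standing in for whatever change must not race with in-flight IO:
 *
 *	mddev_suspend(mddev);		// blocks new IO, drains active_io
 *	apply_new_settings(mddev);
 *	mddev_resume(mddev);		// re-enables IO, wakes the md thread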
311 */ 312 void mddev_suspend(struct mddev *mddev) 313 { 314 WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk); 315 if (mddev->suspended++) 316 return; 317 synchronize_rcu(); 318 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); 319 mddev->pers->quiesce(mddev, 1); 320 321 del_timer_sync(&mddev->safemode_timer); 322 } 323 EXPORT_SYMBOL_GPL(mddev_suspend); 324 325 void mddev_resume(struct mddev *mddev) 326 { 327 if (--mddev->suspended) 328 return; 329 wake_up(&mddev->sb_wait); 330 mddev->pers->quiesce(mddev, 0); 331 332 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 333 md_wakeup_thread(mddev->thread); 334 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 335 } 336 EXPORT_SYMBOL_GPL(mddev_resume); 337 338 int mddev_congested(struct mddev *mddev, int bits) 339 { 340 struct md_personality *pers = mddev->pers; 341 int ret = 0; 342 343 rcu_read_lock(); 344 if (mddev->suspended) 345 ret = 1; 346 else if (pers && pers->congested) 347 ret = pers->congested(mddev, bits); 348 rcu_read_unlock(); 349 return ret; 350 } 351 EXPORT_SYMBOL_GPL(mddev_congested); 352 static int md_congested(void *data, int bits) 353 { 354 struct mddev *mddev = data; 355 return mddev_congested(mddev, bits); 356 } 357 358 /* 359 * Generic flush handling for md 360 */ 361 362 static void md_end_flush(struct bio *bio) 363 { 364 struct md_rdev *rdev = bio->bi_private; 365 struct mddev *mddev = rdev->mddev; 366 367 rdev_dec_pending(rdev, mddev); 368 369 if (atomic_dec_and_test(&mddev->flush_pending)) { 370 /* The pre-request flush has finished */ 371 queue_work(md_wq, &mddev->flush_work); 372 } 373 bio_put(bio); 374 } 375 376 static void md_submit_flush_data(struct work_struct *ws); 377 378 static void submit_flushes(struct work_struct *ws) 379 { 380 struct mddev *mddev = container_of(ws, struct mddev, flush_work); 381 struct md_rdev *rdev; 382 383 INIT_WORK(&mddev->flush_work, md_submit_flush_data); 384 atomic_set(&mddev->flush_pending, 1); 385 rcu_read_lock(); 386 rdev_for_each_rcu(rdev, mddev) 387 if (rdev->raid_disk >= 0 && 388 !test_bit(Faulty, &rdev->flags)) { 389 /* Take two references, one is dropped 390 * when request finishes, one after 391 * we reclaim rcu_read_lock 392 */ 393 struct bio *bi; 394 atomic_inc(&rdev->nr_pending); 395 atomic_inc(&rdev->nr_pending); 396 rcu_read_unlock(); 397 bi = bio_alloc_mddev(GFP_NOIO, 0, mddev); 398 bi->bi_end_io = md_end_flush; 399 bi->bi_private = rdev; 400 bi->bi_bdev = rdev->bdev; 401 bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; 402 atomic_inc(&mddev->flush_pending); 403 submit_bio(bi); 404 rcu_read_lock(); 405 rdev_dec_pending(rdev, mddev); 406 } 407 rcu_read_unlock(); 408 if (atomic_dec_and_test(&mddev->flush_pending)) 409 queue_work(md_wq, &mddev->flush_work); 410 } 411 412 static void md_submit_flush_data(struct work_struct *ws) 413 { 414 struct mddev *mddev = container_of(ws, struct mddev, flush_work); 415 struct bio *bio = mddev->flush_bio; 416 417 if (bio->bi_iter.bi_size == 0) 418 /* an empty barrier - all done */ 419 bio_endio(bio); 420 else { 421 bio->bi_opf &= ~REQ_PREFLUSH; 422 mddev->pers->make_request(mddev, bio); 423 } 424 425 mddev->flush_bio = NULL; 426 wake_up(&mddev->sb_wait); 427 } 428 429 void md_flush_request(struct mddev *mddev, struct bio *bio) 430 { 431 spin_lock_irq(&mddev->lock); 432 wait_event_lock_irq(mddev->sb_wait, 433 !mddev->flush_bio, 434 mddev->lock); 435 mddev->flush_bio = bio; 436 spin_unlock_irq(&mddev->lock); 437 438 INIT_WORK(&mddev->flush_work, submit_flushes); 439 queue_work(md_wq, 
&mddev->flush_work); 440 } 441 EXPORT_SYMBOL(md_flush_request); 442 443 static inline struct mddev *mddev_get(struct mddev *mddev) 444 { 445 atomic_inc(&mddev->active); 446 return mddev; 447 } 448 449 static void mddev_delayed_delete(struct work_struct *ws); 450 451 static void mddev_put(struct mddev *mddev) 452 { 453 struct bio_set *bs = NULL; 454 455 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 456 return; 457 if (!mddev->raid_disks && list_empty(&mddev->disks) && 458 mddev->ctime == 0 && !mddev->hold_active) { 459 /* Array is not configured at all, and not held active, 460 * so destroy it */ 461 list_del_init(&mddev->all_mddevs); 462 bs = mddev->bio_set; 463 mddev->bio_set = NULL; 464 if (mddev->gendisk) { 465 /* We did a probe so need to clean up. Call 466 * queue_work inside the spinlock so that 467 * flush_workqueue() after mddev_find will 468 * succeed in waiting for the work to be done. 469 */ 470 INIT_WORK(&mddev->del_work, mddev_delayed_delete); 471 queue_work(md_misc_wq, &mddev->del_work); 472 } else 473 kfree(mddev); 474 } 475 spin_unlock(&all_mddevs_lock); 476 if (bs) 477 bioset_free(bs); 478 } 479 480 static void md_safemode_timeout(unsigned long data); 481 482 void mddev_init(struct mddev *mddev) 483 { 484 mutex_init(&mddev->open_mutex); 485 mutex_init(&mddev->reconfig_mutex); 486 mutex_init(&mddev->bitmap_info.mutex); 487 INIT_LIST_HEAD(&mddev->disks); 488 INIT_LIST_HEAD(&mddev->all_mddevs); 489 setup_timer(&mddev->safemode_timer, md_safemode_timeout, 490 (unsigned long) mddev); 491 atomic_set(&mddev->active, 1); 492 atomic_set(&mddev->openers, 0); 493 atomic_set(&mddev->active_io, 0); 494 spin_lock_init(&mddev->lock); 495 atomic_set(&mddev->flush_pending, 0); 496 init_waitqueue_head(&mddev->sb_wait); 497 init_waitqueue_head(&mddev->recovery_wait); 498 mddev->reshape_position = MaxSector; 499 mddev->reshape_backwards = 0; 500 mddev->last_sync_action = "none"; 501 mddev->resync_min = 0; 502 mddev->resync_max = MaxSector; 503 mddev->level = LEVEL_NONE; 504 } 505 EXPORT_SYMBOL_GPL(mddev_init); 506 507 static struct mddev *mddev_find(dev_t unit) 508 { 509 struct mddev *mddev, *new = NULL; 510 511 if (unit && MAJOR(unit) != MD_MAJOR) 512 unit &= ~((1<<MdpMinorShift)-1); 513 514 retry: 515 spin_lock(&all_mddevs_lock); 516 517 if (unit) { 518 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 519 if (mddev->unit == unit) { 520 mddev_get(mddev); 521 spin_unlock(&all_mddevs_lock); 522 kfree(new); 523 return mddev; 524 } 525 526 if (new) { 527 list_add(&new->all_mddevs, &all_mddevs); 528 spin_unlock(&all_mddevs_lock); 529 new->hold_active = UNTIL_IOCTL; 530 return new; 531 } 532 } else if (new) { 533 /* find an unused unit number */ 534 static int next_minor = 512; 535 int start = next_minor; 536 int is_free = 0; 537 int dev = 0; 538 while (!is_free) { 539 dev = MKDEV(MD_MAJOR, next_minor); 540 next_minor++; 541 if (next_minor > MINORMASK) 542 next_minor = 0; 543 if (next_minor == start) { 544 /* Oh dear, all in use. 
*/ 545 spin_unlock(&all_mddevs_lock); 546 kfree(new); 547 return NULL; 548 } 549 550 is_free = 1; 551 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 552 if (mddev->unit == dev) { 553 is_free = 0; 554 break; 555 } 556 } 557 new->unit = dev; 558 new->md_minor = MINOR(dev); 559 new->hold_active = UNTIL_STOP; 560 list_add(&new->all_mddevs, &all_mddevs); 561 spin_unlock(&all_mddevs_lock); 562 return new; 563 } 564 spin_unlock(&all_mddevs_lock); 565 566 new = kzalloc(sizeof(*new), GFP_KERNEL); 567 if (!new) 568 return NULL; 569 570 new->unit = unit; 571 if (MAJOR(unit) == MD_MAJOR) 572 new->md_minor = MINOR(unit); 573 else 574 new->md_minor = MINOR(unit) >> MdpMinorShift; 575 576 mddev_init(new); 577 578 goto retry; 579 } 580 581 static struct attribute_group md_redundancy_group; 582 583 void mddev_unlock(struct mddev *mddev) 584 { 585 if (mddev->to_remove) { 586 /* These cannot be removed under reconfig_mutex as 587 * an access to the files will try to take reconfig_mutex 588 * while holding the file unremovable, which leads to 589 * a deadlock. 590 * So hold set sysfs_active while the remove in happeing, 591 * and anything else which might set ->to_remove or my 592 * otherwise change the sysfs namespace will fail with 593 * -EBUSY if sysfs_active is still set. 594 * We set sysfs_active under reconfig_mutex and elsewhere 595 * test it under the same mutex to ensure its correct value 596 * is seen. 597 */ 598 struct attribute_group *to_remove = mddev->to_remove; 599 mddev->to_remove = NULL; 600 mddev->sysfs_active = 1; 601 mutex_unlock(&mddev->reconfig_mutex); 602 603 if (mddev->kobj.sd) { 604 if (to_remove != &md_redundancy_group) 605 sysfs_remove_group(&mddev->kobj, to_remove); 606 if (mddev->pers == NULL || 607 mddev->pers->sync_request == NULL) { 608 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 609 if (mddev->sysfs_action) 610 sysfs_put(mddev->sysfs_action); 611 mddev->sysfs_action = NULL; 612 } 613 } 614 mddev->sysfs_active = 0; 615 } else 616 mutex_unlock(&mddev->reconfig_mutex); 617 618 /* As we've dropped the mutex we need a spinlock to 619 * make sure the thread doesn't disappear 620 */ 621 spin_lock(&pers_lock); 622 md_wakeup_thread(mddev->thread); 623 spin_unlock(&pers_lock); 624 } 625 EXPORT_SYMBOL_GPL(mddev_unlock); 626 627 struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr) 628 { 629 struct md_rdev *rdev; 630 631 rdev_for_each_rcu(rdev, mddev) 632 if (rdev->desc_nr == nr) 633 return rdev; 634 635 return NULL; 636 } 637 EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu); 638 639 static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) 640 { 641 struct md_rdev *rdev; 642 643 rdev_for_each(rdev, mddev) 644 if (rdev->bdev->bd_dev == dev) 645 return rdev; 646 647 return NULL; 648 } 649 650 static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev) 651 { 652 struct md_rdev *rdev; 653 654 rdev_for_each_rcu(rdev, mddev) 655 if (rdev->bdev->bd_dev == dev) 656 return rdev; 657 658 return NULL; 659 } 660 661 static struct md_personality *find_pers(int level, char *clevel) 662 { 663 struct md_personality *pers; 664 list_for_each_entry(pers, &pers_list, list) { 665 if (level != LEVEL_NONE && pers->level == level) 666 return pers; 667 if (strcmp(pers->name, clevel)==0) 668 return pers; 669 } 670 return NULL; 671 } 672 673 /* return the offset of the super block in 512byte sectors */ 674 static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) 675 { 676 sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512; 677 return 
MD_NEW_SIZE_SECTORS(num_sectors); 678 } 679 680 static int alloc_disk_sb(struct md_rdev *rdev) 681 { 682 rdev->sb_page = alloc_page(GFP_KERNEL); 683 if (!rdev->sb_page) 684 return -ENOMEM; 685 return 0; 686 } 687 688 void md_rdev_clear(struct md_rdev *rdev) 689 { 690 if (rdev->sb_page) { 691 put_page(rdev->sb_page); 692 rdev->sb_loaded = 0; 693 rdev->sb_page = NULL; 694 rdev->sb_start = 0; 695 rdev->sectors = 0; 696 } 697 if (rdev->bb_page) { 698 put_page(rdev->bb_page); 699 rdev->bb_page = NULL; 700 } 701 badblocks_exit(&rdev->badblocks); 702 } 703 EXPORT_SYMBOL_GPL(md_rdev_clear); 704 705 static void super_written(struct bio *bio) 706 { 707 struct md_rdev *rdev = bio->bi_private; 708 struct mddev *mddev = rdev->mddev; 709 710 if (bio->bi_error) { 711 pr_err("md: super_written gets error=%d\n", bio->bi_error); 712 md_error(mddev, rdev); 713 if (!test_bit(Faulty, &rdev->flags) 714 && (bio->bi_opf & MD_FAILFAST)) { 715 set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags); 716 set_bit(LastDev, &rdev->flags); 717 } 718 } else 719 clear_bit(LastDev, &rdev->flags); 720 721 if (atomic_dec_and_test(&mddev->pending_writes)) 722 wake_up(&mddev->sb_wait); 723 rdev_dec_pending(rdev, mddev); 724 bio_put(bio); 725 } 726 727 void md_super_write(struct mddev *mddev, struct md_rdev *rdev, 728 sector_t sector, int size, struct page *page) 729 { 730 /* write first size bytes of page to sector of rdev 731 * Increment mddev->pending_writes before returning 732 * and decrement it on completion, waking up sb_wait 733 * if zero is reached. 734 * If an error occurred, call md_error 735 */ 736 struct bio *bio; 737 int ff = 0; 738 739 if (test_bit(Faulty, &rdev->flags)) 740 return; 741 742 bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); 743 744 atomic_inc(&rdev->nr_pending); 745 746 bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev; 747 bio->bi_iter.bi_sector = sector; 748 bio_add_page(bio, page, size, 0); 749 bio->bi_private = rdev; 750 bio->bi_end_io = super_written; 751 752 if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) && 753 test_bit(FailFast, &rdev->flags) && 754 !test_bit(LastDev, &rdev->flags)) 755 ff = MD_FAILFAST; 756 bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA | ff; 757 758 atomic_inc(&mddev->pending_writes); 759 submit_bio(bio); 760 } 761 762 int md_super_wait(struct mddev *mddev) 763 { 764 /* wait for all superblock writes that were scheduled to complete */ 765 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); 766 if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags)) 767 return -EAGAIN; 768 return 0; 769 } 770 771 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, 772 struct page *page, int op, int op_flags, bool metadata_op) 773 { 774 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); 775 int ret; 776 777 bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? 
778 rdev->meta_bdev : rdev->bdev; 779 bio_set_op_attrs(bio, op, op_flags); 780 if (metadata_op) 781 bio->bi_iter.bi_sector = sector + rdev->sb_start; 782 else if (rdev->mddev->reshape_position != MaxSector && 783 (rdev->mddev->reshape_backwards == 784 (sector >= rdev->mddev->reshape_position))) 785 bio->bi_iter.bi_sector = sector + rdev->new_data_offset; 786 else 787 bio->bi_iter.bi_sector = sector + rdev->data_offset; 788 bio_add_page(bio, page, size, 0); 789 790 submit_bio_wait(bio); 791 792 ret = !bio->bi_error; 793 bio_put(bio); 794 return ret; 795 } 796 EXPORT_SYMBOL_GPL(sync_page_io); 797 798 static int read_disk_sb(struct md_rdev *rdev, int size) 799 { 800 char b[BDEVNAME_SIZE]; 801 802 if (rdev->sb_loaded) 803 return 0; 804 805 if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true)) 806 goto fail; 807 rdev->sb_loaded = 1; 808 return 0; 809 810 fail: 811 pr_err("md: disabled device %s, could not read superblock.\n", 812 bdevname(rdev->bdev,b)); 813 return -EINVAL; 814 } 815 816 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 817 { 818 return sb1->set_uuid0 == sb2->set_uuid0 && 819 sb1->set_uuid1 == sb2->set_uuid1 && 820 sb1->set_uuid2 == sb2->set_uuid2 && 821 sb1->set_uuid3 == sb2->set_uuid3; 822 } 823 824 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 825 { 826 int ret; 827 mdp_super_t *tmp1, *tmp2; 828 829 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 830 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 831 832 if (!tmp1 || !tmp2) { 833 ret = 0; 834 goto abort; 835 } 836 837 *tmp1 = *sb1; 838 *tmp2 = *sb2; 839 840 /* 841 * nr_disks is not constant 842 */ 843 tmp1->nr_disks = 0; 844 tmp2->nr_disks = 0; 845 846 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); 847 abort: 848 kfree(tmp1); 849 kfree(tmp2); 850 return ret; 851 } 852 853 static u32 md_csum_fold(u32 csum) 854 { 855 csum = (csum & 0xffff) + (csum >> 16); 856 return (csum & 0xffff) + (csum >> 16); 857 } 858 859 static unsigned int calc_sb_csum(mdp_super_t *sb) 860 { 861 u64 newcsum = 0; 862 u32 *sb32 = (u32*)sb; 863 int i; 864 unsigned int disk_csum, csum; 865 866 disk_csum = sb->sb_csum; 867 sb->sb_csum = 0; 868 869 for (i = 0; i < MD_SB_BYTES/4 ; i++) 870 newcsum += sb32[i]; 871 csum = (newcsum & 0xffffffff) + (newcsum>>32); 872 873 #ifdef CONFIG_ALPHA 874 /* This used to use csum_partial, which was wrong for several 875 * reasons including that different results are returned on 876 * different architectures. It isn't critical that we get exactly 877 * the same return value as before (we always csum_fold before 878 * testing, and that removes any differences). However as we 879 * know that csum_partial always returned a 16bit value on 880 * alphas, do a fold to maximise conformity to previous behaviour. 881 */ 882 sb->sb_csum = md_csum_fold(disk_csum); 883 #else 884 sb->sb_csum = disk_csum; 885 #endif 886 return csum; 887 } 888 889 /* 890 * Handle superblock details. 891 * We want to be able to handle multiple superblock formats 892 * so we have a common interface to them all, and an array of 893 * different handlers. 894 * We rely on user-space to write the initial superblock, and support 895 * reading and updating of superblocks. 896 * Interface methods are: 897 * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) 898 * loads and validates a superblock on dev. 
899 * if refdev != NULL, compare superblocks on both devices 900 * Return: 901 * 0 - dev has a superblock that is compatible with refdev 902 * 1 - dev has a superblock that is compatible and newer than refdev 903 * so dev should be used as the refdev in future 904 * -EINVAL superblock incompatible or invalid 905 * -othererror e.g. -EIO 906 * 907 * int validate_super(struct mddev *mddev, struct md_rdev *dev) 908 * Verify that dev is acceptable into mddev. 909 * The first time, mddev->raid_disks will be 0, and data from 910 * dev should be merged in. Subsequent calls check that dev 911 * is new enough. Return 0 or -EINVAL 912 * 913 * void sync_super(struct mddev *mddev, struct md_rdev *dev) 914 * Update the superblock for rdev with data in mddev 915 * This does not write to disc. 916 * 917 */ 918 919 struct super_type { 920 char *name; 921 struct module *owner; 922 int (*load_super)(struct md_rdev *rdev, 923 struct md_rdev *refdev, 924 int minor_version); 925 int (*validate_super)(struct mddev *mddev, 926 struct md_rdev *rdev); 927 void (*sync_super)(struct mddev *mddev, 928 struct md_rdev *rdev); 929 unsigned long long (*rdev_size_change)(struct md_rdev *rdev, 930 sector_t num_sectors); 931 int (*allow_new_offset)(struct md_rdev *rdev, 932 unsigned long long new_offset); 933 }; 934 935 /* 936 * Check that the given mddev has no bitmap. 937 * 938 * This function is called from the run method of all personalities that do not 939 * support bitmaps. It prints an error message and returns non-zero if mddev 940 * has a bitmap. Otherwise, it returns 0. 941 * 942 */ 943 int md_check_no_bitmap(struct mddev *mddev) 944 { 945 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) 946 return 0; 947 pr_warn("%s: bitmaps are not supported for %s\n", 948 mdname(mddev), mddev->pers->name); 949 return 1; 950 } 951 EXPORT_SYMBOL(md_check_no_bitmap); 952 953 /* 954 * load_super for 0.90.0 955 */ 956 static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 957 { 958 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 959 mdp_super_t *sb; 960 int ret; 961 962 /* 963 * Calculate the position of the superblock (512byte sectors), 964 * it's at the end of the disk. 965 * 966 * It also happens to be a multiple of 4Kb. 
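 * (calc_dev_sboffset() relies on MD_NEW_SIZE_SECTORS() from md_p.h, which
 * rounds the device size down to a 64 KiB boundary and then steps back one
 * 64 KiB reservation; e.g. a hypothetical 8,000,100-sector device would get
 * its 0.90 superblock at sector 7,999,872.)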
967 */ 968 rdev->sb_start = calc_dev_sboffset(rdev); 969 970 ret = read_disk_sb(rdev, MD_SB_BYTES); 971 if (ret) 972 return ret; 973 974 ret = -EINVAL; 975 976 bdevname(rdev->bdev, b); 977 sb = page_address(rdev->sb_page); 978 979 if (sb->md_magic != MD_SB_MAGIC) { 980 pr_warn("md: invalid raid superblock magic on %s\n", b); 981 goto abort; 982 } 983 984 if (sb->major_version != 0 || 985 sb->minor_version < 90 || 986 sb->minor_version > 91) { 987 pr_warn("Bad version number %d.%d on %s\n", 988 sb->major_version, sb->minor_version, b); 989 goto abort; 990 } 991 992 if (sb->raid_disks <= 0) 993 goto abort; 994 995 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { 996 pr_warn("md: invalid superblock checksum on %s\n", b); 997 goto abort; 998 } 999 1000 rdev->preferred_minor = sb->md_minor; 1001 rdev->data_offset = 0; 1002 rdev->new_data_offset = 0; 1003 rdev->sb_size = MD_SB_BYTES; 1004 rdev->badblocks.shift = -1; 1005 1006 if (sb->level == LEVEL_MULTIPATH) 1007 rdev->desc_nr = -1; 1008 else 1009 rdev->desc_nr = sb->this_disk.number; 1010 1011 if (!refdev) { 1012 ret = 1; 1013 } else { 1014 __u64 ev1, ev2; 1015 mdp_super_t *refsb = page_address(refdev->sb_page); 1016 if (!uuid_equal(refsb, sb)) { 1017 pr_warn("md: %s has different UUID to %s\n", 1018 b, bdevname(refdev->bdev,b2)); 1019 goto abort; 1020 } 1021 if (!sb_equal(refsb, sb)) { 1022 pr_warn("md: %s has same UUID but different superblock to %s\n", 1023 b, bdevname(refdev->bdev, b2)); 1024 goto abort; 1025 } 1026 ev1 = md_event(sb); 1027 ev2 = md_event(refsb); 1028 if (ev1 > ev2) 1029 ret = 1; 1030 else 1031 ret = 0; 1032 } 1033 rdev->sectors = rdev->sb_start; 1034 /* Limit to 4TB as metadata cannot record more than that. 1035 * (not needed for Linear and RAID0 as metadata doesn't 1036 * record this size) 1037 */ 1038 if (IS_ENABLED(CONFIG_LBDAF) && (u64)rdev->sectors >= (2ULL << 32) && 1039 sb->level >= 1) 1040 rdev->sectors = (sector_t)(2ULL << 32) - 2; 1041 1042 if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) 1043 /* "this cannot possibly happen" ... 
*/ 1044 ret = -EINVAL; 1045 1046 abort: 1047 return ret; 1048 } 1049 1050 /* 1051 * validate_super for 0.90.0 1052 */ 1053 static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) 1054 { 1055 mdp_disk_t *desc; 1056 mdp_super_t *sb = page_address(rdev->sb_page); 1057 __u64 ev1 = md_event(sb); 1058 1059 rdev->raid_disk = -1; 1060 clear_bit(Faulty, &rdev->flags); 1061 clear_bit(In_sync, &rdev->flags); 1062 clear_bit(Bitmap_sync, &rdev->flags); 1063 clear_bit(WriteMostly, &rdev->flags); 1064 1065 if (mddev->raid_disks == 0) { 1066 mddev->major_version = 0; 1067 mddev->minor_version = sb->minor_version; 1068 mddev->patch_version = sb->patch_version; 1069 mddev->external = 0; 1070 mddev->chunk_sectors = sb->chunk_size >> 9; 1071 mddev->ctime = sb->ctime; 1072 mddev->utime = sb->utime; 1073 mddev->level = sb->level; 1074 mddev->clevel[0] = 0; 1075 mddev->layout = sb->layout; 1076 mddev->raid_disks = sb->raid_disks; 1077 mddev->dev_sectors = ((sector_t)sb->size) * 2; 1078 mddev->events = ev1; 1079 mddev->bitmap_info.offset = 0; 1080 mddev->bitmap_info.space = 0; 1081 /* bitmap can use 60 K after the 4K superblocks */ 1082 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 1083 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 1084 mddev->reshape_backwards = 0; 1085 1086 if (mddev->minor_version >= 91) { 1087 mddev->reshape_position = sb->reshape_position; 1088 mddev->delta_disks = sb->delta_disks; 1089 mddev->new_level = sb->new_level; 1090 mddev->new_layout = sb->new_layout; 1091 mddev->new_chunk_sectors = sb->new_chunk >> 9; 1092 if (mddev->delta_disks < 0) 1093 mddev->reshape_backwards = 1; 1094 } else { 1095 mddev->reshape_position = MaxSector; 1096 mddev->delta_disks = 0; 1097 mddev->new_level = mddev->level; 1098 mddev->new_layout = mddev->layout; 1099 mddev->new_chunk_sectors = mddev->chunk_sectors; 1100 } 1101 1102 if (sb->state & (1<<MD_SB_CLEAN)) 1103 mddev->recovery_cp = MaxSector; 1104 else { 1105 if (sb->events_hi == sb->cp_events_hi && 1106 sb->events_lo == sb->cp_events_lo) { 1107 mddev->recovery_cp = sb->recovery_cp; 1108 } else 1109 mddev->recovery_cp = 0; 1110 } 1111 1112 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 1113 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 1114 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 1115 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 1116 1117 mddev->max_disks = MD_SB_DISKS; 1118 1119 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 1120 mddev->bitmap_info.file == NULL) { 1121 mddev->bitmap_info.offset = 1122 mddev->bitmap_info.default_offset; 1123 mddev->bitmap_info.space = 1124 mddev->bitmap_info.default_space; 1125 } 1126 1127 } else if (mddev->pers == NULL) { 1128 /* Insist on good event counter while assembling, except 1129 * for spares (which don't need an event count) */ 1130 ++ev1; 1131 if (sb->disks[rdev->desc_nr].state & ( 1132 (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) 1133 if (ev1 < mddev->events) 1134 return -EINVAL; 1135 } else if (mddev->bitmap) { 1136 /* if adding to array with a bitmap, then we can accept an 1137 * older device ... but not too old. 
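 * (Reading of the checks below: a device whose event count is older than
 * the bitmap's events_cleared gets no slot back - raid_disk stays -1 -
 * while one that is merely behind mddev->events keeps its slot and is
 * flagged Bitmap_sync.)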
1138 */ 1139 if (ev1 < mddev->bitmap->events_cleared) 1140 return 0; 1141 if (ev1 < mddev->events) 1142 set_bit(Bitmap_sync, &rdev->flags); 1143 } else { 1144 if (ev1 < mddev->events) 1145 /* just a hot-add of a new device, leave raid_disk at -1 */ 1146 return 0; 1147 } 1148 1149 if (mddev->level != LEVEL_MULTIPATH) { 1150 desc = sb->disks + rdev->desc_nr; 1151 1152 if (desc->state & (1<<MD_DISK_FAULTY)) 1153 set_bit(Faulty, &rdev->flags); 1154 else if (desc->state & (1<<MD_DISK_SYNC) /* && 1155 desc->raid_disk < mddev->raid_disks */) { 1156 set_bit(In_sync, &rdev->flags); 1157 rdev->raid_disk = desc->raid_disk; 1158 rdev->saved_raid_disk = desc->raid_disk; 1159 } else if (desc->state & (1<<MD_DISK_ACTIVE)) { 1160 /* active but not in sync implies recovery up to 1161 * reshape position. We don't know exactly where 1162 * that is, so set to zero for now */ 1163 if (mddev->minor_version >= 91) { 1164 rdev->recovery_offset = 0; 1165 rdev->raid_disk = desc->raid_disk; 1166 } 1167 } 1168 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 1169 set_bit(WriteMostly, &rdev->flags); 1170 if (desc->state & (1<<MD_DISK_FAILFAST)) 1171 set_bit(FailFast, &rdev->flags); 1172 } else /* MULTIPATH are always insync */ 1173 set_bit(In_sync, &rdev->flags); 1174 return 0; 1175 } 1176 1177 /* 1178 * sync_super for 0.90.0 1179 */ 1180 static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) 1181 { 1182 mdp_super_t *sb; 1183 struct md_rdev *rdev2; 1184 int next_spare = mddev->raid_disks; 1185 1186 /* make rdev->sb match mddev data.. 1187 * 1188 * 1/ zero out disks 1189 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 1190 * 3/ any empty disks < next_spare become removed 1191 * 1192 * disks[0] gets initialised to REMOVED because 1193 * we cannot be sure from other fields if it has 1194 * been initialised or not. 
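 * For illustration: in a hypothetical array with raid_disks == 4 plus one
 * spare, the in-sync members keep desc_nr 0-3 (their raid_disk), the spare
 * is handed desc_nr 4 by next_spare, and any slot below raid_disks that was
 * never filled in is marked REMOVED|FAULTY by the loop at the end.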
1195 */ 1196 int i; 1197 int active=0, working=0,failed=0,spare=0,nr_disks=0; 1198 1199 rdev->sb_size = MD_SB_BYTES; 1200 1201 sb = page_address(rdev->sb_page); 1202 1203 memset(sb, 0, sizeof(*sb)); 1204 1205 sb->md_magic = MD_SB_MAGIC; 1206 sb->major_version = mddev->major_version; 1207 sb->patch_version = mddev->patch_version; 1208 sb->gvalid_words = 0; /* ignored */ 1209 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 1210 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 1211 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 1212 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 1213 1214 sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); 1215 sb->level = mddev->level; 1216 sb->size = mddev->dev_sectors / 2; 1217 sb->raid_disks = mddev->raid_disks; 1218 sb->md_minor = mddev->md_minor; 1219 sb->not_persistent = 0; 1220 sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 1221 sb->state = 0; 1222 sb->events_hi = (mddev->events>>32); 1223 sb->events_lo = (u32)mddev->events; 1224 1225 if (mddev->reshape_position == MaxSector) 1226 sb->minor_version = 90; 1227 else { 1228 sb->minor_version = 91; 1229 sb->reshape_position = mddev->reshape_position; 1230 sb->new_level = mddev->new_level; 1231 sb->delta_disks = mddev->delta_disks; 1232 sb->new_layout = mddev->new_layout; 1233 sb->new_chunk = mddev->new_chunk_sectors << 9; 1234 } 1235 mddev->minor_version = sb->minor_version; 1236 if (mddev->in_sync) 1237 { 1238 sb->recovery_cp = mddev->recovery_cp; 1239 sb->cp_events_hi = (mddev->events>>32); 1240 sb->cp_events_lo = (u32)mddev->events; 1241 if (mddev->recovery_cp == MaxSector) 1242 sb->state = (1<< MD_SB_CLEAN); 1243 } else 1244 sb->recovery_cp = 0; 1245 1246 sb->layout = mddev->layout; 1247 sb->chunk_size = mddev->chunk_sectors << 9; 1248 1249 if (mddev->bitmap && mddev->bitmap_info.file == NULL) 1250 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 1251 1252 sb->disks[0].state = (1<<MD_DISK_REMOVED); 1253 rdev_for_each(rdev2, mddev) { 1254 mdp_disk_t *d; 1255 int desc_nr; 1256 int is_active = test_bit(In_sync, &rdev2->flags); 1257 1258 if (rdev2->raid_disk >= 0 && 1259 sb->minor_version >= 91) 1260 /* we have nowhere to store the recovery_offset, 1261 * but if it is not below the reshape_position, 1262 * we can piggy-back on that. 
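 * (So for 0.91 metadata a recovering member is recorded as ACTIVE in its
 * own slot rather than being demoted to a spare slot; this format simply
 * has no field for a per-device recovery_offset.)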
1263 */ 1264 is_active = 1; 1265 if (rdev2->raid_disk < 0 || 1266 test_bit(Faulty, &rdev2->flags)) 1267 is_active = 0; 1268 if (is_active) 1269 desc_nr = rdev2->raid_disk; 1270 else 1271 desc_nr = next_spare++; 1272 rdev2->desc_nr = desc_nr; 1273 d = &sb->disks[rdev2->desc_nr]; 1274 nr_disks++; 1275 d->number = rdev2->desc_nr; 1276 d->major = MAJOR(rdev2->bdev->bd_dev); 1277 d->minor = MINOR(rdev2->bdev->bd_dev); 1278 if (is_active) 1279 d->raid_disk = rdev2->raid_disk; 1280 else 1281 d->raid_disk = rdev2->desc_nr; /* compatibility */ 1282 if (test_bit(Faulty, &rdev2->flags)) 1283 d->state = (1<<MD_DISK_FAULTY); 1284 else if (is_active) { 1285 d->state = (1<<MD_DISK_ACTIVE); 1286 if (test_bit(In_sync, &rdev2->flags)) 1287 d->state |= (1<<MD_DISK_SYNC); 1288 active++; 1289 working++; 1290 } else { 1291 d->state = 0; 1292 spare++; 1293 working++; 1294 } 1295 if (test_bit(WriteMostly, &rdev2->flags)) 1296 d->state |= (1<<MD_DISK_WRITEMOSTLY); 1297 if (test_bit(FailFast, &rdev2->flags)) 1298 d->state |= (1<<MD_DISK_FAILFAST); 1299 } 1300 /* now set the "removed" and "faulty" bits on any missing devices */ 1301 for (i=0 ; i < mddev->raid_disks ; i++) { 1302 mdp_disk_t *d = &sb->disks[i]; 1303 if (d->state == 0 && d->number == 0) { 1304 d->number = i; 1305 d->raid_disk = i; 1306 d->state = (1<<MD_DISK_REMOVED); 1307 d->state |= (1<<MD_DISK_FAULTY); 1308 failed++; 1309 } 1310 } 1311 sb->nr_disks = nr_disks; 1312 sb->active_disks = active; 1313 sb->working_disks = working; 1314 sb->failed_disks = failed; 1315 sb->spare_disks = spare; 1316 1317 sb->this_disk = sb->disks[rdev->desc_nr]; 1318 sb->sb_csum = calc_sb_csum(sb); 1319 } 1320 1321 /* 1322 * rdev_size_change for 0.90.0 1323 */ 1324 static unsigned long long 1325 super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 1326 { 1327 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1328 return 0; /* component must fit device */ 1329 if (rdev->mddev->bitmap_info.offset) 1330 return 0; /* can't move bitmap */ 1331 rdev->sb_start = calc_dev_sboffset(rdev); 1332 if (!num_sectors || num_sectors > rdev->sb_start) 1333 num_sectors = rdev->sb_start; 1334 /* Limit to 4TB as metadata cannot record more than that. 1335 * 4TB == 2^32 KB, or 2*2^32 sectors. 
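 * ((2ULL << 32) sectors is 8,589,934,592 sectors of 512 bytes, i.e. exactly
 * 4 TiB; capping at (2ULL << 32) - 2 keeps the recorded size 1 KiB short of
 * that limit.)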
1336 */ 1337 if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) && 1338 rdev->mddev->level >= 1) 1339 num_sectors = (sector_t)(2ULL << 32) - 2; 1340 do { 1341 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1342 rdev->sb_page); 1343 } while (md_super_wait(rdev->mddev) < 0); 1344 return num_sectors; 1345 } 1346 1347 static int 1348 super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) 1349 { 1350 /* non-zero offset changes not possible with v0.90 */ 1351 return new_offset == 0; 1352 } 1353 1354 /* 1355 * version 1 superblock 1356 */ 1357 1358 static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb) 1359 { 1360 __le32 disk_csum; 1361 u32 csum; 1362 unsigned long long newcsum; 1363 int size = 256 + le32_to_cpu(sb->max_dev)*2; 1364 __le32 *isuper = (__le32*)sb; 1365 1366 disk_csum = sb->sb_csum; 1367 sb->sb_csum = 0; 1368 newcsum = 0; 1369 for (; size >= 4; size -= 4) 1370 newcsum += le32_to_cpu(*isuper++); 1371 1372 if (size == 2) 1373 newcsum += le16_to_cpu(*(__le16*) isuper); 1374 1375 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 1376 sb->sb_csum = disk_csum; 1377 return cpu_to_le32(csum); 1378 } 1379 1380 static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 1381 { 1382 struct mdp_superblock_1 *sb; 1383 int ret; 1384 sector_t sb_start; 1385 sector_t sectors; 1386 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1387 int bmask; 1388 1389 /* 1390 * Calculate the position of the superblock in 512byte sectors. 1391 * It is always aligned to a 4K boundary and 1392 * depeding on minor_version, it can be: 1393 * 0: At least 8K, but less than 12K, from end of device 1394 * 1: At start of device 1395 * 2: 4K from start of device. 1396 */ 1397 switch(minor_version) { 1398 case 0: 1399 sb_start = i_size_read(rdev->bdev->bd_inode) >> 9; 1400 sb_start -= 8*2; 1401 sb_start &= ~(sector_t)(4*2-1); 1402 break; 1403 case 1: 1404 sb_start = 0; 1405 break; 1406 case 2: 1407 sb_start = 8; 1408 break; 1409 default: 1410 return -EINVAL; 1411 } 1412 rdev->sb_start = sb_start; 1413 1414 /* superblock is rarely larger than 1K, but it can be larger, 1415 * and it is safe to read 4k, so we do that 1416 */ 1417 ret = read_disk_sb(rdev, 4096); 1418 if (ret) return ret; 1419 1420 sb = page_address(rdev->sb_page); 1421 1422 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1423 sb->major_version != cpu_to_le32(1) || 1424 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1425 le64_to_cpu(sb->super_offset) != rdev->sb_start || 1426 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1427 return -EINVAL; 1428 1429 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1430 pr_warn("md: invalid superblock checksum on %s\n", 1431 bdevname(rdev->bdev,b)); 1432 return -EINVAL; 1433 } 1434 if (le64_to_cpu(sb->data_size) < 10) { 1435 pr_warn("md: data_size too small on %s\n", 1436 bdevname(rdev->bdev,b)); 1437 return -EINVAL; 1438 } 1439 if (sb->pad0 || 1440 sb->pad3[0] || 1441 memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) 1442 /* Some padding is non-zero, might be a new feature */ 1443 return -EINVAL; 1444 1445 rdev->preferred_minor = 0xffff; 1446 rdev->data_offset = le64_to_cpu(sb->data_offset); 1447 rdev->new_data_offset = rdev->data_offset; 1448 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && 1449 (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) 1450 rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset); 1451 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1452 1453 rdev->sb_size = 
le32_to_cpu(sb->max_dev) * 2 + 256; 1454 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 1455 if (rdev->sb_size & bmask) 1456 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1457 1458 if (minor_version 1459 && rdev->data_offset < sb_start + (rdev->sb_size/512)) 1460 return -EINVAL; 1461 if (minor_version 1462 && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) 1463 return -EINVAL; 1464 1465 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) 1466 rdev->desc_nr = -1; 1467 else 1468 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1469 1470 if (!rdev->bb_page) { 1471 rdev->bb_page = alloc_page(GFP_KERNEL); 1472 if (!rdev->bb_page) 1473 return -ENOMEM; 1474 } 1475 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && 1476 rdev->badblocks.count == 0) { 1477 /* need to load the bad block list. 1478 * Currently we limit it to one page. 1479 */ 1480 s32 offset; 1481 sector_t bb_sector; 1482 u64 *bbp; 1483 int i; 1484 int sectors = le16_to_cpu(sb->bblog_size); 1485 if (sectors > (PAGE_SIZE / 512)) 1486 return -EINVAL; 1487 offset = le32_to_cpu(sb->bblog_offset); 1488 if (offset == 0) 1489 return -EINVAL; 1490 bb_sector = (long long)offset; 1491 if (!sync_page_io(rdev, bb_sector, sectors << 9, 1492 rdev->bb_page, REQ_OP_READ, 0, true)) 1493 return -EIO; 1494 bbp = (u64 *)page_address(rdev->bb_page); 1495 rdev->badblocks.shift = sb->bblog_shift; 1496 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { 1497 u64 bb = le64_to_cpu(*bbp); 1498 int count = bb & (0x3ff); 1499 u64 sector = bb >> 10; 1500 sector <<= sb->bblog_shift; 1501 count <<= sb->bblog_shift; 1502 if (bb + 1 == 0) 1503 break; 1504 if (badblocks_set(&rdev->badblocks, sector, count, 1)) 1505 return -EINVAL; 1506 } 1507 } else if (sb->bblog_offset != 0) 1508 rdev->badblocks.shift = 0; 1509 1510 if (!refdev) { 1511 ret = 1; 1512 } else { 1513 __u64 ev1, ev2; 1514 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); 1515 1516 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1517 sb->level != refsb->level || 1518 sb->layout != refsb->layout || 1519 sb->chunksize != refsb->chunksize) { 1520 pr_warn("md: %s has strangely different superblock to %s\n", 1521 bdevname(rdev->bdev,b), 1522 bdevname(refdev->bdev,b2)); 1523 return -EINVAL; 1524 } 1525 ev1 = le64_to_cpu(sb->events); 1526 ev2 = le64_to_cpu(refsb->events); 1527 1528 if (ev1 > ev2) 1529 ret = 1; 1530 else 1531 ret = 0; 1532 } 1533 if (minor_version) { 1534 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9); 1535 sectors -= rdev->data_offset; 1536 } else 1537 sectors = rdev->sb_start; 1538 if (sectors < le64_to_cpu(sb->data_size)) 1539 return -EINVAL; 1540 rdev->sectors = le64_to_cpu(sb->data_size); 1541 return ret; 1542 } 1543 1544 static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) 1545 { 1546 struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 1547 __u64 ev1 = le64_to_cpu(sb->events); 1548 1549 rdev->raid_disk = -1; 1550 clear_bit(Faulty, &rdev->flags); 1551 clear_bit(In_sync, &rdev->flags); 1552 clear_bit(Bitmap_sync, &rdev->flags); 1553 clear_bit(WriteMostly, &rdev->flags); 1554 1555 if (mddev->raid_disks == 0) { 1556 mddev->major_version = 1; 1557 mddev->patch_version = 0; 1558 mddev->external = 0; 1559 mddev->chunk_sectors = le32_to_cpu(sb->chunksize); 1560 mddev->ctime = le64_to_cpu(sb->ctime); 1561 mddev->utime = le64_to_cpu(sb->utime); 1562 mddev->level = le32_to_cpu(sb->level); 1563 mddev->clevel[0] = 0; 1564 mddev->layout = le32_to_cpu(sb->layout); 1565 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1566 
		mddev->dev_sectors = le64_to_cpu(sb->size);
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
		/* Default location for bitmap is 1K after superblock
		 * using 3K - total of 4K
		 */
		mddev->bitmap_info.default_offset = 1024 >> 9;
		mddev->bitmap_info.default_space = (4096-1024) >> 9;
		mddev->reshape_backwards = 0;

		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
		memcpy(mddev->uuid, sb->set_uuid, 16);

		mddev->max_disks = (4096-256)/2;

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
		    mddev->bitmap_info.file == NULL) {
			mddev->bitmap_info.offset =
				(__s32)le32_to_cpu(sb->bitmap_offset);
			/* Metadata doesn't record how much space is available.
			 * For 1.0, we assume we can use up to the superblock
			 * if before, else to 4K beyond superblock.
			 * For others, assume no change is possible.
			 */
			if (mddev->minor_version > 0)
				mddev->bitmap_info.space = 0;
			else if (mddev->bitmap_info.offset > 0)
				mddev->bitmap_info.space =
					8 - mddev->bitmap_info.offset;
			else
				mddev->bitmap_info.space =
					-mddev->bitmap_info.offset;
		}

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
			mddev->new_level = le32_to_cpu(sb->new_level);
			mddev->new_layout = le32_to_cpu(sb->new_layout);
			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
			if (mddev->delta_disks < 0 ||
			    (mddev->delta_disks == 0 &&
			     (le32_to_cpu(sb->feature_map)
			      & MD_FEATURE_RESHAPE_BACKWARDS)))
				mddev->reshape_backwards = 1;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}

		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
			set_bit(MD_HAS_JOURNAL, &mddev->flags);
	} else if (mddev->pers == NULL) {
		/* Insist on good event counter while assembling, except for
		 * spares (which don't need an event count) */
		++ev1;
		if (rdev->desc_nr >= 0 &&
		    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
		    (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
		     le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
			if (ev1 < mddev->events)
				return -EINVAL;
	} else if (mddev->bitmap) {
		/* If adding to array with a bitmap, then we can accept an
		 * older device, but not too old.
1635 */ 1636 if (ev1 < mddev->bitmap->events_cleared) 1637 return 0; 1638 if (ev1 < mddev->events) 1639 set_bit(Bitmap_sync, &rdev->flags); 1640 } else { 1641 if (ev1 < mddev->events) 1642 /* just a hot-add of a new device, leave raid_disk at -1 */ 1643 return 0; 1644 } 1645 if (mddev->level != LEVEL_MULTIPATH) { 1646 int role; 1647 if (rdev->desc_nr < 0 || 1648 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { 1649 role = MD_DISK_ROLE_SPARE; 1650 rdev->desc_nr = -1; 1651 } else 1652 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1653 switch(role) { 1654 case MD_DISK_ROLE_SPARE: /* spare */ 1655 break; 1656 case MD_DISK_ROLE_FAULTY: /* faulty */ 1657 set_bit(Faulty, &rdev->flags); 1658 break; 1659 case MD_DISK_ROLE_JOURNAL: /* journal device */ 1660 if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { 1661 /* journal device without journal feature */ 1662 pr_warn("md: journal device provided without journal feature, ignoring the device\n"); 1663 return -EINVAL; 1664 } 1665 set_bit(Journal, &rdev->flags); 1666 rdev->journal_tail = le64_to_cpu(sb->journal_tail); 1667 rdev->raid_disk = 0; 1668 break; 1669 default: 1670 rdev->saved_raid_disk = role; 1671 if ((le32_to_cpu(sb->feature_map) & 1672 MD_FEATURE_RECOVERY_OFFSET)) { 1673 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 1674 if (!(le32_to_cpu(sb->feature_map) & 1675 MD_FEATURE_RECOVERY_BITMAP)) 1676 rdev->saved_raid_disk = -1; 1677 } else 1678 set_bit(In_sync, &rdev->flags); 1679 rdev->raid_disk = role; 1680 break; 1681 } 1682 if (sb->devflags & WriteMostly1) 1683 set_bit(WriteMostly, &rdev->flags); 1684 if (sb->devflags & FailFast1) 1685 set_bit(FailFast, &rdev->flags); 1686 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) 1687 set_bit(Replacement, &rdev->flags); 1688 } else /* MULTIPATH are always insync */ 1689 set_bit(In_sync, &rdev->flags); 1690 1691 return 0; 1692 } 1693 1694 static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) 1695 { 1696 struct mdp_superblock_1 *sb; 1697 struct md_rdev *rdev2; 1698 int max_dev, i; 1699 /* make rdev->sb match mddev and rdev data. 
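 *
 * Like super_90_sync() this only refreshes the in-memory copy; nothing is
 * written to disc here. A simplified, illustrative sketch (not a verbatim
 * quote of this driver's md_update_sb()) of how callers push the result out
 * to every member:
 *
 *	rdev_for_each(rdev, mddev) {
 *		sync_super(mddev, rdev);	// rebuild rdev->sb_page
 *		md_super_write(mddev, rdev, rdev->sb_start,
 *			       rdev->sb_size, rdev->sb_page);
 *	}
 *	md_super_wait(mddev);			// wait for the writes to land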
*/ 1700 1701 sb = page_address(rdev->sb_page); 1702 1703 sb->feature_map = 0; 1704 sb->pad0 = 0; 1705 sb->recovery_offset = cpu_to_le64(0); 1706 memset(sb->pad3, 0, sizeof(sb->pad3)); 1707 1708 sb->utime = cpu_to_le64((__u64)mddev->utime); 1709 sb->events = cpu_to_le64(mddev->events); 1710 if (mddev->in_sync) 1711 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 1712 else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) 1713 sb->resync_offset = cpu_to_le64(MaxSector); 1714 else 1715 sb->resync_offset = cpu_to_le64(0); 1716 1717 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); 1718 1719 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 1720 sb->size = cpu_to_le64(mddev->dev_sectors); 1721 sb->chunksize = cpu_to_le32(mddev->chunk_sectors); 1722 sb->level = cpu_to_le32(mddev->level); 1723 sb->layout = cpu_to_le32(mddev->layout); 1724 if (test_bit(FailFast, &rdev->flags)) 1725 sb->devflags |= FailFast1; 1726 else 1727 sb->devflags &= ~FailFast1; 1728 1729 if (test_bit(WriteMostly, &rdev->flags)) 1730 sb->devflags |= WriteMostly1; 1731 else 1732 sb->devflags &= ~WriteMostly1; 1733 sb->data_offset = cpu_to_le64(rdev->data_offset); 1734 sb->data_size = cpu_to_le64(rdev->sectors); 1735 1736 if (mddev->bitmap && mddev->bitmap_info.file == NULL) { 1737 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); 1738 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1739 } 1740 1741 if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && 1742 !test_bit(In_sync, &rdev->flags)) { 1743 sb->feature_map |= 1744 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 1745 sb->recovery_offset = 1746 cpu_to_le64(rdev->recovery_offset); 1747 if (rdev->saved_raid_disk >= 0 && mddev->bitmap) 1748 sb->feature_map |= 1749 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); 1750 } 1751 /* Note: recovery_offset and journal_tail share space */ 1752 if (test_bit(Journal, &rdev->flags)) 1753 sb->journal_tail = cpu_to_le64(rdev->journal_tail); 1754 if (test_bit(Replacement, &rdev->flags)) 1755 sb->feature_map |= 1756 cpu_to_le32(MD_FEATURE_REPLACEMENT); 1757 1758 if (mddev->reshape_position != MaxSector) { 1759 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 1760 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 1761 sb->new_layout = cpu_to_le32(mddev->new_layout); 1762 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 1763 sb->new_level = cpu_to_le32(mddev->new_level); 1764 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 1765 if (mddev->delta_disks == 0 && 1766 mddev->reshape_backwards) 1767 sb->feature_map 1768 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); 1769 if (rdev->new_data_offset != rdev->data_offset) { 1770 sb->feature_map 1771 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); 1772 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset 1773 - rdev->data_offset)); 1774 } 1775 } 1776 1777 if (mddev_is_clustered(mddev)) 1778 sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); 1779 1780 if (rdev->badblocks.count == 0) 1781 /* Nothing to do for bad blocks*/ ; 1782 else if (sb->bblog_offset == 0) 1783 /* Cannot record bad blocks on this device */ 1784 md_error(mddev, rdev); 1785 else { 1786 struct badblocks *bb = &rdev->badblocks; 1787 u64 *bbp = (u64 *)page_address(rdev->bb_page); 1788 u64 *p = bb->page; 1789 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); 1790 if (bb->changed) { 1791 unsigned seq; 1792 1793 retry: 1794 seq = read_seqbegin(&bb->lock); 1795 1796 memset(bbp, 0xff, PAGE_SIZE); 1797 1798 for (i = 0 ; i < bb->count ; i++) { 1799 u64 internal_bb = 
p[i]; 1800 u64 store_bb = ((BB_OFFSET(internal_bb) << 10) 1801 | BB_LEN(internal_bb)); 1802 bbp[i] = cpu_to_le64(store_bb); 1803 } 1804 bb->changed = 0; 1805 if (read_seqretry(&bb->lock, seq)) 1806 goto retry; 1807 1808 bb->sector = (rdev->sb_start + 1809 (int)le32_to_cpu(sb->bblog_offset)); 1810 bb->size = le16_to_cpu(sb->bblog_size); 1811 } 1812 } 1813 1814 max_dev = 0; 1815 rdev_for_each(rdev2, mddev) 1816 if (rdev2->desc_nr+1 > max_dev) 1817 max_dev = rdev2->desc_nr+1; 1818 1819 if (max_dev > le32_to_cpu(sb->max_dev)) { 1820 int bmask; 1821 sb->max_dev = cpu_to_le32(max_dev); 1822 rdev->sb_size = max_dev * 2 + 256; 1823 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 1824 if (rdev->sb_size & bmask) 1825 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1826 } else 1827 max_dev = le32_to_cpu(sb->max_dev); 1828 1829 for (i=0; i<max_dev;i++) 1830 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); 1831 1832 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) 1833 sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); 1834 1835 rdev_for_each(rdev2, mddev) { 1836 i = rdev2->desc_nr; 1837 if (test_bit(Faulty, &rdev2->flags)) 1838 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); 1839 else if (test_bit(In_sync, &rdev2->flags)) 1840 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1841 else if (test_bit(Journal, &rdev2->flags)) 1842 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); 1843 else if (rdev2->raid_disk >= 0) 1844 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1845 else 1846 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); 1847 } 1848 1849 sb->sb_csum = calc_sb_1_csum(sb); 1850 } 1851 1852 static unsigned long long 1853 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 1854 { 1855 struct mdp_superblock_1 *sb; 1856 sector_t max_sectors; 1857 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1858 return 0; /* component must fit device */ 1859 if (rdev->data_offset != rdev->new_data_offset) 1860 return 0; /* too confusing */ 1861 if (rdev->sb_start < rdev->data_offset) { 1862 /* minor versions 1 and 2; superblock before data */ 1863 max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9; 1864 max_sectors -= rdev->data_offset; 1865 if (!num_sectors || num_sectors > max_sectors) 1866 num_sectors = max_sectors; 1867 } else if (rdev->mddev->bitmap_info.offset) { 1868 /* minor version 0 with bitmap we can't move */ 1869 return 0; 1870 } else { 1871 /* minor version 0; superblock after data */ 1872 sector_t sb_start; 1873 sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2; 1874 sb_start &= ~(sector_t)(4*2 - 1); 1875 max_sectors = rdev->sectors + sb_start - rdev->sb_start; 1876 if (!num_sectors || num_sectors > max_sectors) 1877 num_sectors = max_sectors; 1878 rdev->sb_start = sb_start; 1879 } 1880 sb = page_address(rdev->sb_page); 1881 sb->data_size = cpu_to_le64(num_sectors); 1882 sb->super_offset = cpu_to_le64(rdev->sb_start); 1883 sb->sb_csum = calc_sb_1_csum(sb); 1884 do { 1885 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1886 rdev->sb_page); 1887 } while (md_super_wait(rdev->mddev) < 0); 1888 return num_sectors; 1889 1890 } 1891 1892 static int 1893 super_1_allow_new_offset(struct md_rdev *rdev, 1894 unsigned long long new_offset) 1895 { 1896 /* All necessary checks on new >= old have been done */ 1897 struct bitmap *bitmap; 1898 if (new_offset >= rdev->data_offset) 1899 return 1; 1900 1901 /* with 1.0 metadata, there is no metadata to tread on 1902 * so we can always move back */ 1903 if (rdev->mddev->minor_version == 
0) 1904 return 1; 1905 1906 /* otherwise we must be sure not to step on 1907 * any metadata, so stay: 1908 * 36K beyond start of superblock 1909 * beyond end of badblocks 1910 * beyond write-intent bitmap 1911 */ 1912 if (rdev->sb_start + (32+4)*2 > new_offset) 1913 return 0; 1914 bitmap = rdev->mddev->bitmap; 1915 if (bitmap && !rdev->mddev->bitmap_info.file && 1916 rdev->sb_start + rdev->mddev->bitmap_info.offset + 1917 bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) 1918 return 0; 1919 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) 1920 return 0; 1921 1922 return 1; 1923 } 1924 1925 static struct super_type super_types[] = { 1926 [0] = { 1927 .name = "0.90.0", 1928 .owner = THIS_MODULE, 1929 .load_super = super_90_load, 1930 .validate_super = super_90_validate, 1931 .sync_super = super_90_sync, 1932 .rdev_size_change = super_90_rdev_size_change, 1933 .allow_new_offset = super_90_allow_new_offset, 1934 }, 1935 [1] = { 1936 .name = "md-1", 1937 .owner = THIS_MODULE, 1938 .load_super = super_1_load, 1939 .validate_super = super_1_validate, 1940 .sync_super = super_1_sync, 1941 .rdev_size_change = super_1_rdev_size_change, 1942 .allow_new_offset = super_1_allow_new_offset, 1943 }, 1944 }; 1945 1946 static void sync_super(struct mddev *mddev, struct md_rdev *rdev) 1947 { 1948 if (mddev->sync_super) { 1949 mddev->sync_super(mddev, rdev); 1950 return; 1951 } 1952 1953 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); 1954 1955 super_types[mddev->major_version].sync_super(mddev, rdev); 1956 } 1957 1958 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) 1959 { 1960 struct md_rdev *rdev, *rdev2; 1961 1962 rcu_read_lock(); 1963 rdev_for_each_rcu(rdev, mddev1) { 1964 if (test_bit(Faulty, &rdev->flags) || 1965 test_bit(Journal, &rdev->flags) || 1966 rdev->raid_disk == -1) 1967 continue; 1968 rdev_for_each_rcu(rdev2, mddev2) { 1969 if (test_bit(Faulty, &rdev2->flags) || 1970 test_bit(Journal, &rdev2->flags) || 1971 rdev2->raid_disk == -1) 1972 continue; 1973 if (rdev->bdev->bd_contains == 1974 rdev2->bdev->bd_contains) { 1975 rcu_read_unlock(); 1976 return 1; 1977 } 1978 } 1979 } 1980 rcu_read_unlock(); 1981 return 0; 1982 } 1983 1984 static LIST_HEAD(pending_raid_disks); 1985 1986 /* 1987 * Try to register data integrity profile for an mddev 1988 * 1989 * This is called when an array is started and after a disk has been kicked 1990 * from the array. It only succeeds if all working and active component devices 1991 * are integrity capable with matching profiles. 1992 */ 1993 int md_integrity_register(struct mddev *mddev) 1994 { 1995 struct md_rdev *rdev, *reference = NULL; 1996 1997 if (list_empty(&mddev->disks)) 1998 return 0; /* nothing to do */ 1999 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) 2000 return 0; /* shouldn't register, or already is */ 2001 rdev_for_each(rdev, mddev) { 2002 /* skip spares and non-functional disks */ 2003 if (test_bit(Faulty, &rdev->flags)) 2004 continue; 2005 if (rdev->raid_disk < 0) 2006 continue; 2007 if (!reference) { 2008 /* Use the first rdev as the reference */ 2009 reference = rdev; 2010 continue; 2011 } 2012 /* does this rdev's profile match the reference profile? */ 2013 if (blk_integrity_compare(reference->bdev->bd_disk, 2014 rdev->bdev->bd_disk) < 0) 2015 return -EINVAL; 2016 } 2017 if (!reference || !bdev_get_integrity(reference->bdev)) 2018 return 0; 2019 /* 2020 * All component devices are integrity capable and have matching 2021 * profiles, register the common profile for the md device. 
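 * A single working member whose profile fails blk_integrity_compare()
 * against the reference makes the loop above bail out with -EINVAL, so one
 * mismatching device is enough to veto integrity registration for the
 * whole array.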
2022 */ 2023 blk_integrity_register(mddev->gendisk, 2024 bdev_get_integrity(reference->bdev)); 2025 2026 pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); 2027 if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) { 2028 pr_err("md: failed to create integrity pool for %s\n", 2029 mdname(mddev)); 2030 return -EINVAL; 2031 } 2032 return 0; 2033 } 2034 EXPORT_SYMBOL(md_integrity_register); 2035 2036 /* 2037 * Attempt to add an rdev, but only if it is consistent with the current 2038 * integrity profile 2039 */ 2040 int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) 2041 { 2042 struct blk_integrity *bi_rdev; 2043 struct blk_integrity *bi_mddev; 2044 char name[BDEVNAME_SIZE]; 2045 2046 if (!mddev->gendisk) 2047 return 0; 2048 2049 bi_rdev = bdev_get_integrity(rdev->bdev); 2050 bi_mddev = blk_get_integrity(mddev->gendisk); 2051 2052 if (!bi_mddev) /* nothing to do */ 2053 return 0; 2054 2055 if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { 2056 pr_err("%s: incompatible integrity profile for %s\n", 2057 mdname(mddev), bdevname(rdev->bdev, name)); 2058 return -ENXIO; 2059 } 2060 2061 return 0; 2062 } 2063 EXPORT_SYMBOL(md_integrity_add_rdev); 2064 2065 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) 2066 { 2067 char b[BDEVNAME_SIZE]; 2068 struct kobject *ko; 2069 int err; 2070 2071 /* prevent duplicates */ 2072 if (find_rdev(mddev, rdev->bdev->bd_dev)) 2073 return -EEXIST; 2074 2075 /* make sure rdev->sectors exceeds mddev->dev_sectors */ 2076 if (!test_bit(Journal, &rdev->flags) && 2077 rdev->sectors && 2078 (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { 2079 if (mddev->pers) { 2080 /* Cannot change size, so fail 2081 * If mddev->level <= 0, then we don't care 2082 * about aligning sizes (e.g. linear) 2083 */ 2084 if (mddev->level > 0) 2085 return -ENOSPC; 2086 } else 2087 mddev->dev_sectors = rdev->sectors; 2088 } 2089 2090 /* Verify rdev->desc_nr is unique. 
2091 * If it is -1, assign a free number, else 2092 * check number is not in use 2093 */ 2094 rcu_read_lock(); 2095 if (rdev->desc_nr < 0) { 2096 int choice = 0; 2097 if (mddev->pers) 2098 choice = mddev->raid_disks; 2099 while (md_find_rdev_nr_rcu(mddev, choice)) 2100 choice++; 2101 rdev->desc_nr = choice; 2102 } else { 2103 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { 2104 rcu_read_unlock(); 2105 return -EBUSY; 2106 } 2107 } 2108 rcu_read_unlock(); 2109 if (!test_bit(Journal, &rdev->flags) && 2110 mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { 2111 pr_warn("md: %s: array is limited to %d devices\n", 2112 mdname(mddev), mddev->max_disks); 2113 return -EBUSY; 2114 } 2115 bdevname(rdev->bdev,b); 2116 strreplace(b, '/', '!'); 2117 2118 rdev->mddev = mddev; 2119 pr_debug("md: bind<%s>\n", b); 2120 2121 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 2122 goto fail; 2123 2124 ko = &part_to_dev(rdev->bdev->bd_part)->kobj; 2125 if (sysfs_create_link(&rdev->kobj, ko, "block")) 2126 /* failure here is OK */; 2127 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); 2128 2129 list_add_rcu(&rdev->same_set, &mddev->disks); 2130 bd_link_disk_holder(rdev->bdev, mddev->gendisk); 2131 2132 /* May as well allow recovery to be retried once */ 2133 mddev->recovery_disabled++; 2134 2135 return 0; 2136 2137 fail: 2138 pr_warn("md: failed to register dev-%s for %s\n", 2139 b, mdname(mddev)); 2140 return err; 2141 } 2142 2143 static void md_delayed_delete(struct work_struct *ws) 2144 { 2145 struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); 2146 kobject_del(&rdev->kobj); 2147 kobject_put(&rdev->kobj); 2148 } 2149 2150 static void unbind_rdev_from_array(struct md_rdev *rdev) 2151 { 2152 char b[BDEVNAME_SIZE]; 2153 2154 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); 2155 list_del_rcu(&rdev->same_set); 2156 pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b)); 2157 rdev->mddev = NULL; 2158 sysfs_remove_link(&rdev->kobj, "block"); 2159 sysfs_put(rdev->sysfs_state); 2160 rdev->sysfs_state = NULL; 2161 rdev->badblocks.count = 0; 2162 /* We need to delay this, otherwise we can deadlock when 2163 * writing to 'remove' to "dev/state". We also need 2164 * to delay it due to rcu usage. 2165 */ 2166 synchronize_rcu(); 2167 INIT_WORK(&rdev->del_work, md_delayed_delete); 2168 kobject_get(&rdev->kobj); 2169 queue_work(md_misc_wq, &rdev->del_work); 2170 } 2171 2172 /* 2173 * prevent the device from being mounted, repartitioned or 2174 * otherwise reused by a RAID array (or any other kernel 2175 * subsystem), by bd_claiming the device. 2176 */ 2177 static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared) 2178 { 2179 int err = 0; 2180 struct block_device *bdev; 2181 char b[BDEVNAME_SIZE]; 2182 2183 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, 2184 shared ? 
(struct md_rdev *)lock_rdev : rdev); 2185 if (IS_ERR(bdev)) { 2186 pr_warn("md: could not open %s.\n", __bdevname(dev, b)); 2187 return PTR_ERR(bdev); 2188 } 2189 rdev->bdev = bdev; 2190 return err; 2191 } 2192 2193 static void unlock_rdev(struct md_rdev *rdev) 2194 { 2195 struct block_device *bdev = rdev->bdev; 2196 rdev->bdev = NULL; 2197 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); 2198 } 2199 2200 void md_autodetect_dev(dev_t dev); 2201 2202 static void export_rdev(struct md_rdev *rdev) 2203 { 2204 char b[BDEVNAME_SIZE]; 2205 2206 pr_debug("md: export_rdev(%s)\n", bdevname(rdev->bdev,b)); 2207 md_rdev_clear(rdev); 2208 #ifndef MODULE 2209 if (test_bit(AutoDetected, &rdev->flags)) 2210 md_autodetect_dev(rdev->bdev->bd_dev); 2211 #endif 2212 unlock_rdev(rdev); 2213 kobject_put(&rdev->kobj); 2214 } 2215 2216 void md_kick_rdev_from_array(struct md_rdev *rdev) 2217 { 2218 unbind_rdev_from_array(rdev); 2219 export_rdev(rdev); 2220 } 2221 EXPORT_SYMBOL_GPL(md_kick_rdev_from_array); 2222 2223 static void export_array(struct mddev *mddev) 2224 { 2225 struct md_rdev *rdev; 2226 2227 while (!list_empty(&mddev->disks)) { 2228 rdev = list_first_entry(&mddev->disks, struct md_rdev, 2229 same_set); 2230 md_kick_rdev_from_array(rdev); 2231 } 2232 mddev->raid_disks = 0; 2233 mddev->major_version = 0; 2234 } 2235 2236 static void sync_sbs(struct mddev *mddev, int nospares) 2237 { 2238 /* Update each superblock (in-memory image), but 2239 * if we are allowed to, skip spares which already 2240 * have the right event counter, or have one earlier 2241 * (which would mean they aren't being marked as dirty 2242 * with the rest of the array) 2243 */ 2244 struct md_rdev *rdev; 2245 rdev_for_each(rdev, mddev) { 2246 if (rdev->sb_events == mddev->events || 2247 (nospares && 2248 rdev->raid_disk < 0 && 2249 rdev->sb_events+1 == mddev->events)) { 2250 /* Don't update this superblock */ 2251 rdev->sb_loaded = 2; 2252 } else { 2253 sync_super(mddev, rdev); 2254 rdev->sb_loaded = 1; 2255 } 2256 } 2257 } 2258 2259 static bool does_sb_need_changing(struct mddev *mddev) 2260 { 2261 struct md_rdev *rdev; 2262 struct mdp_superblock_1 *sb; 2263 int role; 2264 2265 /* Find a good rdev */ 2266 rdev_for_each(rdev, mddev) 2267 if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags)) 2268 break; 2269 2270 /* No good device found. */ 2271 if (!rdev) 2272 return false; 2273 2274 sb = page_address(rdev->sb_page); 2275 /* Check if a device has become faulty or a spare become active */ 2276 rdev_for_each(rdev, mddev) { 2277 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 2278 /* Device activated? */ 2279 if (role == 0xffff && rdev->raid_disk >=0 && 2280 !test_bit(Faulty, &rdev->flags)) 2281 return true; 2282 /* Device turned faulty? 
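 * (Role values 0xfffd and above are the special MD_DISK_ROLE_JOURNAL /
 * MD_DISK_ROLE_FAULTY / MD_DISK_ROLE_SPARE markers from md_p.h, so a
 * Faulty rdev whose recorded role is still an ordinary slot number means
 * the on-disk roles are stale.)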
*/ 2283 if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd)) 2284 return true; 2285 } 2286 2287 /* Check if any mddev parameters have changed */ 2288 if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || 2289 (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || 2290 (mddev->layout != le32_to_cpu(sb->layout)) || 2291 (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || 2292 (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) 2293 return true; 2294 2295 return false; 2296 } 2297 2298 void md_update_sb(struct mddev *mddev, int force_change) 2299 { 2300 struct md_rdev *rdev; 2301 int sync_req; 2302 int nospares = 0; 2303 int any_badblocks_changed = 0; 2304 int ret = -1; 2305 2306 if (mddev->ro) { 2307 if (force_change) 2308 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2309 return; 2310 } 2311 2312 repeat: 2313 if (mddev_is_clustered(mddev)) { 2314 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2315 force_change = 1; 2316 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 2317 nospares = 1; 2318 ret = md_cluster_ops->metadata_update_start(mddev); 2319 /* Has someone else updated the sb? */ 2320 if (!does_sb_need_changing(mddev)) { 2321 if (ret == 0) 2322 md_cluster_ops->metadata_update_cancel(mddev); 2323 bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2324 BIT(MD_SB_CHANGE_DEVS) | 2325 BIT(MD_SB_CHANGE_CLEAN)); 2326 return; 2327 } 2328 } 2329 2330 /* First make sure individual recovery_offsets are correct */ 2331 rdev_for_each(rdev, mddev) { 2332 if (rdev->raid_disk >= 0 && 2333 mddev->delta_disks >= 0 && 2334 !test_bit(Journal, &rdev->flags) && 2335 !test_bit(In_sync, &rdev->flags) && 2336 mddev->curr_resync_completed > rdev->recovery_offset) 2337 rdev->recovery_offset = mddev->curr_resync_completed; 2338 2339 } 2340 if (!mddev->persistent) { 2341 clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 2342 clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2343 if (!mddev->external) { 2344 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 2345 rdev_for_each(rdev, mddev) { 2346 if (rdev->badblocks.changed) { 2347 rdev->badblocks.changed = 0; 2348 ack_all_badblocks(&rdev->badblocks); 2349 md_error(mddev, rdev); 2350 } 2351 clear_bit(Blocked, &rdev->flags); 2352 clear_bit(BlockedBadBlocks, &rdev->flags); 2353 wake_up(&rdev->blocked_wait); 2354 } 2355 } 2356 wake_up(&mddev->sb_wait); 2357 return; 2358 } 2359 2360 spin_lock(&mddev->lock); 2361 2362 mddev->utime = ktime_get_real_seconds(); 2363 2364 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) 2365 force_change = 1; 2366 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) 2367 /* just a clean <-> dirty transition, possibly leave spares alone, 2368 * though if 'events' isn't the right even/odd, we will have to do 2369 * spares after all 2370 */ 2371 nospares = 1; 2372 if (force_change) 2373 nospares = 0; 2374 if (mddev->degraded) 2375 /* If the array is degraded, then skipping spares is both 2376 * dangerous and fairly pointless. 2377 * Dangerous because a device that was removed from the array 2378 * might have an event_count that still looks up-to-date, 2379 * so it can be re-added without a resync. 2380 * Pointless because if there are any spares to skip, 2381 * then a recovery will happen and soon that array won't 2382 * be degraded any more and the spare can go back to sleep then. 
2383 */ 2384 nospares = 0; 2385 2386 sync_req = mddev->in_sync; 2387 2388 /* If this is just a dirty<->clean transition, and the array is clean 2389 * and 'events' is odd, we can roll back to the previous clean state */ 2390 if (nospares 2391 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 2392 && mddev->can_decrease_events 2393 && mddev->events != 1) { 2394 mddev->events--; 2395 mddev->can_decrease_events = 0; 2396 } else { 2397 /* otherwise we have to go forward and ... */ 2398 mddev->events ++; 2399 mddev->can_decrease_events = nospares; 2400 } 2401 2402 /* 2403 * This 64-bit counter should never wrap. 2404 * Either we are in around ~1 trillion A.C., assuming 2405 * 1 reboot per second, or we have a bug... 2406 */ 2407 WARN_ON(mddev->events == 0); 2408 2409 rdev_for_each(rdev, mddev) { 2410 if (rdev->badblocks.changed) 2411 any_badblocks_changed++; 2412 if (test_bit(Faulty, &rdev->flags)) 2413 set_bit(FaultRecorded, &rdev->flags); 2414 } 2415 2416 sync_sbs(mddev, nospares); 2417 spin_unlock(&mddev->lock); 2418 2419 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", 2420 mdname(mddev), mddev->in_sync); 2421 2422 if (mddev->queue) 2423 blk_add_trace_msg(mddev->queue, "md md_update_sb"); 2424 rewrite: 2425 bitmap_update_sb(mddev->bitmap); 2426 rdev_for_each(rdev, mddev) { 2427 char b[BDEVNAME_SIZE]; 2428 2429 if (rdev->sb_loaded != 1) 2430 continue; /* no noise on spare devices */ 2431 2432 if (!test_bit(Faulty, &rdev->flags)) { 2433 md_super_write(mddev,rdev, 2434 rdev->sb_start, rdev->sb_size, 2435 rdev->sb_page); 2436 pr_debug("md: (write) %s's sb offset: %llu\n", 2437 bdevname(rdev->bdev, b), 2438 (unsigned long long)rdev->sb_start); 2439 rdev->sb_events = mddev->events; 2440 if (rdev->badblocks.size) { 2441 md_super_write(mddev, rdev, 2442 rdev->badblocks.sector, 2443 rdev->badblocks.size << 9, 2444 rdev->bb_page); 2445 rdev->badblocks.size = 0; 2446 } 2447 2448 } else 2449 pr_debug("md: %s (skipping faulty)\n", 2450 bdevname(rdev->bdev, b)); 2451 2452 if (mddev->level == LEVEL_MULTIPATH) 2453 /* only need to write one superblock... */ 2454 break; 2455 } 2456 if (md_super_wait(mddev) < 0) 2457 goto rewrite; 2458 /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ 2459 2460 if (mddev_is_clustered(mddev) && ret == 0) 2461 md_cluster_ops->metadata_update_finish(mddev); 2462 2463 if (mddev->in_sync != sync_req || 2464 !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), 2465 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN))) 2466 /* have to write it out again */ 2467 goto repeat; 2468 wake_up(&mddev->sb_wait); 2469 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2470 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 2471 2472 rdev_for_each(rdev, mddev) { 2473 if (test_and_clear_bit(FaultRecorded, &rdev->flags)) 2474 clear_bit(Blocked, &rdev->flags); 2475 2476 if (any_badblocks_changed) 2477 ack_all_badblocks(&rdev->badblocks); 2478 clear_bit(BlockedBadBlocks, &rdev->flags); 2479 wake_up(&rdev->blocked_wait); 2480 } 2481 } 2482 EXPORT_SYMBOL(md_update_sb); 2483 2484 static int add_bound_rdev(struct md_rdev *rdev) 2485 { 2486 struct mddev *mddev = rdev->mddev; 2487 int err = 0; 2488 bool add_journal = test_bit(Journal, &rdev->flags); 2489 2490 if (!mddev->pers->hot_remove_disk || add_journal) { 2491 /* If there is hot_add_disk but no hot_remove_disk 2492 * then added disks for geometry changes, 2493 * and should be added immediately. 2494 */ 2495 super_types[mddev->major_version]. 
2496 validate_super(mddev, rdev); 2497 if (add_journal) 2498 mddev_suspend(mddev); 2499 err = mddev->pers->hot_add_disk(mddev, rdev); 2500 if (add_journal) 2501 mddev_resume(mddev); 2502 if (err) { 2503 md_kick_rdev_from_array(rdev); 2504 return err; 2505 } 2506 } 2507 sysfs_notify_dirent_safe(rdev->sysfs_state); 2508 2509 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2510 if (mddev->degraded) 2511 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 2512 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2513 md_new_event(mddev); 2514 md_wakeup_thread(mddev->thread); 2515 return 0; 2516 } 2517 2518 /* words written to sysfs files may, or may not, be \n terminated. 2519 * We want to accept with case. For this we use cmd_match. 2520 */ 2521 static int cmd_match(const char *cmd, const char *str) 2522 { 2523 /* See if cmd, written into a sysfs file, matches 2524 * str. They must either be the same, or cmd can 2525 * have a trailing newline 2526 */ 2527 while (*cmd && *str && *cmd == *str) { 2528 cmd++; 2529 str++; 2530 } 2531 if (*cmd == '\n') 2532 cmd++; 2533 if (*str || *cmd) 2534 return 0; 2535 return 1; 2536 } 2537 2538 struct rdev_sysfs_entry { 2539 struct attribute attr; 2540 ssize_t (*show)(struct md_rdev *, char *); 2541 ssize_t (*store)(struct md_rdev *, const char *, size_t); 2542 }; 2543 2544 static ssize_t 2545 state_show(struct md_rdev *rdev, char *page) 2546 { 2547 char *sep = ","; 2548 size_t len = 0; 2549 unsigned long flags = ACCESS_ONCE(rdev->flags); 2550 2551 if (test_bit(Faulty, &flags) || 2552 (!test_bit(ExternalBbl, &flags) && 2553 rdev->badblocks.unacked_exist)) 2554 len += sprintf(page+len, "faulty%s", sep); 2555 if (test_bit(In_sync, &flags)) 2556 len += sprintf(page+len, "in_sync%s", sep); 2557 if (test_bit(Journal, &flags)) 2558 len += sprintf(page+len, "journal%s", sep); 2559 if (test_bit(WriteMostly, &flags)) 2560 len += sprintf(page+len, "write_mostly%s", sep); 2561 if (test_bit(Blocked, &flags) || 2562 (rdev->badblocks.unacked_exist 2563 && !test_bit(Faulty, &flags))) 2564 len += sprintf(page+len, "blocked%s", sep); 2565 if (!test_bit(Faulty, &flags) && 2566 !test_bit(Journal, &flags) && 2567 !test_bit(In_sync, &flags)) 2568 len += sprintf(page+len, "spare%s", sep); 2569 if (test_bit(WriteErrorSeen, &flags)) 2570 len += sprintf(page+len, "write_error%s", sep); 2571 if (test_bit(WantReplacement, &flags)) 2572 len += sprintf(page+len, "want_replacement%s", sep); 2573 if (test_bit(Replacement, &flags)) 2574 len += sprintf(page+len, "replacement%s", sep); 2575 if (test_bit(ExternalBbl, &flags)) 2576 len += sprintf(page+len, "external_bbl%s", sep); 2577 if (test_bit(FailFast, &flags)) 2578 len += sprintf(page+len, "failfast%s", sep); 2579 2580 if (len) 2581 len -= strlen(sep); 2582 2583 return len+sprintf(page+len, "\n"); 2584 } 2585 2586 static ssize_t 2587 state_store(struct md_rdev *rdev, const char *buf, size_t len) 2588 { 2589 /* can write 2590 * faulty - simulates an error 2591 * remove - disconnects the device 2592 * writemostly - sets write_mostly 2593 * -writemostly - clears write_mostly 2594 * blocked - sets the Blocked flags 2595 * -blocked - clears the Blocked and possibly simulates an error 2596 * insync - sets Insync providing device isn't active 2597 * -insync - clear Insync for a device with a slot assigned, 2598 * so that it gets rebuilt based on bitmap 2599 * write_error - sets WriteErrorSeen 2600 * -write_error - clears WriteErrorSeen 2601 * {,-}failfast - set/clear FailFast 2602 */ 2603 int err = -EINVAL; 2604 if (cmd_match(buf, "faulty") && 
rdev->mddev->pers) { 2605 md_error(rdev->mddev, rdev); 2606 if (test_bit(Faulty, &rdev->flags)) 2607 err = 0; 2608 else 2609 err = -EBUSY; 2610 } else if (cmd_match(buf, "remove")) { 2611 if (rdev->mddev->pers) { 2612 clear_bit(Blocked, &rdev->flags); 2613 remove_and_add_spares(rdev->mddev, rdev); 2614 } 2615 if (rdev->raid_disk >= 0) 2616 err = -EBUSY; 2617 else { 2618 struct mddev *mddev = rdev->mddev; 2619 err = 0; 2620 if (mddev_is_clustered(mddev)) 2621 err = md_cluster_ops->remove_disk(mddev, rdev); 2622 2623 if (err == 0) { 2624 md_kick_rdev_from_array(rdev); 2625 if (mddev->pers) { 2626 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 2627 md_wakeup_thread(mddev->thread); 2628 } 2629 md_new_event(mddev); 2630 } 2631 } 2632 } else if (cmd_match(buf, "writemostly")) { 2633 set_bit(WriteMostly, &rdev->flags); 2634 err = 0; 2635 } else if (cmd_match(buf, "-writemostly")) { 2636 clear_bit(WriteMostly, &rdev->flags); 2637 err = 0; 2638 } else if (cmd_match(buf, "blocked")) { 2639 set_bit(Blocked, &rdev->flags); 2640 err = 0; 2641 } else if (cmd_match(buf, "-blocked")) { 2642 if (!test_bit(Faulty, &rdev->flags) && 2643 !test_bit(ExternalBbl, &rdev->flags) && 2644 rdev->badblocks.unacked_exist) { 2645 /* metadata handler doesn't understand badblocks, 2646 * so we need to fail the device 2647 */ 2648 md_error(rdev->mddev, rdev); 2649 } 2650 clear_bit(Blocked, &rdev->flags); 2651 clear_bit(BlockedBadBlocks, &rdev->flags); 2652 wake_up(&rdev->blocked_wait); 2653 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2654 md_wakeup_thread(rdev->mddev->thread); 2655 2656 err = 0; 2657 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 2658 set_bit(In_sync, &rdev->flags); 2659 err = 0; 2660 } else if (cmd_match(buf, "failfast")) { 2661 set_bit(FailFast, &rdev->flags); 2662 err = 0; 2663 } else if (cmd_match(buf, "-failfast")) { 2664 clear_bit(FailFast, &rdev->flags); 2665 err = 0; 2666 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && 2667 !test_bit(Journal, &rdev->flags)) { 2668 if (rdev->mddev->pers == NULL) { 2669 clear_bit(In_sync, &rdev->flags); 2670 rdev->saved_raid_disk = rdev->raid_disk; 2671 rdev->raid_disk = -1; 2672 err = 0; 2673 } 2674 } else if (cmd_match(buf, "write_error")) { 2675 set_bit(WriteErrorSeen, &rdev->flags); 2676 err = 0; 2677 } else if (cmd_match(buf, "-write_error")) { 2678 clear_bit(WriteErrorSeen, &rdev->flags); 2679 err = 0; 2680 } else if (cmd_match(buf, "want_replacement")) { 2681 /* Any non-spare device that is not a replacement can 2682 * become want_replacement at any time, but we then need to 2683 * check if recovery is needed. 2684 */ 2685 if (rdev->raid_disk >= 0 && 2686 !test_bit(Journal, &rdev->flags) && 2687 !test_bit(Replacement, &rdev->flags)) 2688 set_bit(WantReplacement, &rdev->flags); 2689 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2690 md_wakeup_thread(rdev->mddev->thread); 2691 err = 0; 2692 } else if (cmd_match(buf, "-want_replacement")) { 2693 /* Clearing 'want_replacement' is always allowed. 2694 * Once replacements starts it is too late though. 2695 */ 2696 err = 0; 2697 clear_bit(WantReplacement, &rdev->flags); 2698 } else if (cmd_match(buf, "replacement")) { 2699 /* Can only set a device as a replacement when array has not 2700 * yet been started. Once running, replacement is automatic 2701 * from spares, or by assigning 'slot'. 
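 * Illustrative usage only (device names are placeholders): with externally
 * managed metadata one would typically do
 * "echo replacement > /sys/block/mdX/md/dev-sdY/state" before the array
 * is started.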
2702 */ 2703 if (rdev->mddev->pers) 2704 err = -EBUSY; 2705 else { 2706 set_bit(Replacement, &rdev->flags); 2707 err = 0; 2708 } 2709 } else if (cmd_match(buf, "-replacement")) { 2710 /* Similarly, can only clear Replacement before start */ 2711 if (rdev->mddev->pers) 2712 err = -EBUSY; 2713 else { 2714 clear_bit(Replacement, &rdev->flags); 2715 err = 0; 2716 } 2717 } else if (cmd_match(buf, "re-add")) { 2718 if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1)) { 2719 /* clear_bit is performed _after_ all the devices 2720 * have their local Faulty bit cleared. If any writes 2721 * happen in the meantime in the local node, they 2722 * will land in the local bitmap, which will be synced 2723 * by this node eventually 2724 */ 2725 if (!mddev_is_clustered(rdev->mddev) || 2726 (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { 2727 clear_bit(Faulty, &rdev->flags); 2728 err = add_bound_rdev(rdev); 2729 } 2730 } else 2731 err = -EBUSY; 2732 } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) { 2733 set_bit(ExternalBbl, &rdev->flags); 2734 rdev->badblocks.shift = 0; 2735 err = 0; 2736 } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) { 2737 clear_bit(ExternalBbl, &rdev->flags); 2738 err = 0; 2739 } 2740 if (!err) 2741 sysfs_notify_dirent_safe(rdev->sysfs_state); 2742 return err ? err : len; 2743 } 2744 static struct rdev_sysfs_entry rdev_state = 2745 __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store); 2746 2747 static ssize_t 2748 errors_show(struct md_rdev *rdev, char *page) 2749 { 2750 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 2751 } 2752 2753 static ssize_t 2754 errors_store(struct md_rdev *rdev, const char *buf, size_t len) 2755 { 2756 unsigned int n; 2757 int rv; 2758 2759 rv = kstrtouint(buf, 10, &n); 2760 if (rv < 0) 2761 return rv; 2762 atomic_set(&rdev->corrected_errors, n); 2763 return len; 2764 } 2765 static struct rdev_sysfs_entry rdev_errors = 2766 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 2767 2768 static ssize_t 2769 slot_show(struct md_rdev *rdev, char *page) 2770 { 2771 if (test_bit(Journal, &rdev->flags)) 2772 return sprintf(page, "journal\n"); 2773 else if (rdev->raid_disk < 0) 2774 return sprintf(page, "none\n"); 2775 else 2776 return sprintf(page, "%d\n", rdev->raid_disk); 2777 } 2778 2779 static ssize_t 2780 slot_store(struct md_rdev *rdev, const char *buf, size_t len) 2781 { 2782 int slot; 2783 int err; 2784 2785 if (test_bit(Journal, &rdev->flags)) 2786 return -EBUSY; 2787 if (strncmp(buf, "none", 4)==0) 2788 slot = -1; 2789 else { 2790 err = kstrtouint(buf, 10, (unsigned int *)&slot); 2791 if (err < 0) 2792 return err; 2793 } 2794 if (rdev->mddev->pers && slot == -1) { 2795 /* Setting 'slot' on an active array requires also 2796 * updating the 'rd%d' link, and communicating 2797 * with the personality with ->hot_*_disk. 2798 * For now we only support removing 2799 * failed/spare devices. This normally happens automatically, 2800 * but not when the metadata is externally managed. 2801 */ 2802 if (rdev->raid_disk == -1) 2803 return -EEXIST; 2804 /* personality does all needed checks */ 2805 if (rdev->mddev->pers->hot_remove_disk == NULL) 2806 return -EINVAL; 2807 clear_bit(Blocked, &rdev->flags); 2808 remove_and_add_spares(rdev->mddev, rdev); 2809 if (rdev->raid_disk >= 0) 2810 return -EBUSY; 2811 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2812 md_wakeup_thread(rdev->mddev->thread); 2813 } else if (rdev->mddev->pers) { 2814 /* Activating a spare .. 
or possibly reactivating 2815 * if we ever get bitmaps working here. 2816 */ 2817 int err; 2818 2819 if (rdev->raid_disk != -1) 2820 return -EBUSY; 2821 2822 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) 2823 return -EBUSY; 2824 2825 if (rdev->mddev->pers->hot_add_disk == NULL) 2826 return -EINVAL; 2827 2828 if (slot >= rdev->mddev->raid_disks && 2829 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 2830 return -ENOSPC; 2831 2832 rdev->raid_disk = slot; 2833 if (test_bit(In_sync, &rdev->flags)) 2834 rdev->saved_raid_disk = slot; 2835 else 2836 rdev->saved_raid_disk = -1; 2837 clear_bit(In_sync, &rdev->flags); 2838 clear_bit(Bitmap_sync, &rdev->flags); 2839 err = rdev->mddev->pers-> 2840 hot_add_disk(rdev->mddev, rdev); 2841 if (err) { 2842 rdev->raid_disk = -1; 2843 return err; 2844 } else 2845 sysfs_notify_dirent_safe(rdev->sysfs_state); 2846 if (sysfs_link_rdev(rdev->mddev, rdev)) 2847 /* failure here is OK */; 2848 /* don't wakeup anyone, leave that to userspace. */ 2849 } else { 2850 if (slot >= rdev->mddev->raid_disks && 2851 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 2852 return -ENOSPC; 2853 rdev->raid_disk = slot; 2854 /* assume it is working */ 2855 clear_bit(Faulty, &rdev->flags); 2856 clear_bit(WriteMostly, &rdev->flags); 2857 set_bit(In_sync, &rdev->flags); 2858 sysfs_notify_dirent_safe(rdev->sysfs_state); 2859 } 2860 return len; 2861 } 2862 2863 static struct rdev_sysfs_entry rdev_slot = 2864 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 2865 2866 static ssize_t 2867 offset_show(struct md_rdev *rdev, char *page) 2868 { 2869 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 2870 } 2871 2872 static ssize_t 2873 offset_store(struct md_rdev *rdev, const char *buf, size_t len) 2874 { 2875 unsigned long long offset; 2876 if (kstrtoull(buf, 10, &offset) < 0) 2877 return -EINVAL; 2878 if (rdev->mddev->pers && rdev->raid_disk >= 0) 2879 return -EBUSY; 2880 if (rdev->sectors && rdev->mddev->external) 2881 /* Must set offset before size, so overlap checks 2882 * can be sane */ 2883 return -EBUSY; 2884 rdev->data_offset = offset; 2885 rdev->new_data_offset = offset; 2886 return len; 2887 } 2888 2889 static struct rdev_sysfs_entry rdev_offset = 2890 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 2891 2892 static ssize_t new_offset_show(struct md_rdev *rdev, char *page) 2893 { 2894 return sprintf(page, "%llu\n", 2895 (unsigned long long)rdev->new_data_offset); 2896 } 2897 2898 static ssize_t new_offset_store(struct md_rdev *rdev, 2899 const char *buf, size_t len) 2900 { 2901 unsigned long long new_offset; 2902 struct mddev *mddev = rdev->mddev; 2903 2904 if (kstrtoull(buf, 10, &new_offset) < 0) 2905 return -EINVAL; 2906 2907 if (mddev->sync_thread || 2908 test_bit(MD_RECOVERY_RUNNING,&mddev->recovery)) 2909 return -EBUSY; 2910 if (new_offset == rdev->data_offset) 2911 /* reset is always permitted */ 2912 ; 2913 else if (new_offset > rdev->data_offset) { 2914 /* must not push array size beyond rdev_sectors */ 2915 if (new_offset - rdev->data_offset 2916 + mddev->dev_sectors > rdev->sectors) 2917 return -E2BIG; 2918 } 2919 /* Metadata worries about other space details. */ 2920 2921 /* decreasing the offset is inconsistent with a backwards 2922 * reshape. 2923 */ 2924 if (new_offset < rdev->data_offset && 2925 mddev->reshape_backwards) 2926 return -EINVAL; 2927 /* Increasing offset is inconsistent with forwards 2928 * reshape. reshape_direction should be set to 2929 * 'backwards' first. 
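 * (In sysfs terms that normally means writing "backwards" to the array's
 * reshape_direction attribute before writing the larger value to this
 * file; mentioned here only as illustrative usage.)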
2930 */ 2931 if (new_offset > rdev->data_offset && 2932 !mddev->reshape_backwards) 2933 return -EINVAL; 2934 2935 if (mddev->pers && mddev->persistent && 2936 !super_types[mddev->major_version] 2937 .allow_new_offset(rdev, new_offset)) 2938 return -E2BIG; 2939 rdev->new_data_offset = new_offset; 2940 if (new_offset > rdev->data_offset) 2941 mddev->reshape_backwards = 1; 2942 else if (new_offset < rdev->data_offset) 2943 mddev->reshape_backwards = 0; 2944 2945 return len; 2946 } 2947 static struct rdev_sysfs_entry rdev_new_offset = 2948 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); 2949 2950 static ssize_t 2951 rdev_size_show(struct md_rdev *rdev, char *page) 2952 { 2953 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); 2954 } 2955 2956 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) 2957 { 2958 /* check if two start/length pairs overlap */ 2959 if (s1+l1 <= s2) 2960 return 0; 2961 if (s2+l2 <= s1) 2962 return 0; 2963 return 1; 2964 } 2965 2966 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) 2967 { 2968 unsigned long long blocks; 2969 sector_t new; 2970 2971 if (kstrtoull(buf, 10, &blocks) < 0) 2972 return -EINVAL; 2973 2974 if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) 2975 return -EINVAL; /* sector conversion overflow */ 2976 2977 new = blocks * 2; 2978 if (new != blocks * 2) 2979 return -EINVAL; /* unsigned long long to sector_t overflow */ 2980 2981 *sectors = new; 2982 return 0; 2983 } 2984 2985 static ssize_t 2986 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) 2987 { 2988 struct mddev *my_mddev = rdev->mddev; 2989 sector_t oldsectors = rdev->sectors; 2990 sector_t sectors; 2991 2992 if (test_bit(Journal, &rdev->flags)) 2993 return -EBUSY; 2994 if (strict_blocks_to_sectors(buf, &sectors) < 0) 2995 return -EINVAL; 2996 if (rdev->data_offset != rdev->new_data_offset) 2997 return -EINVAL; /* too confusing */ 2998 if (my_mddev->pers && rdev->raid_disk >= 0) { 2999 if (my_mddev->persistent) { 3000 sectors = super_types[my_mddev->major_version]. 3001 rdev_size_change(rdev, sectors); 3002 if (!sectors) 3003 return -EBUSY; 3004 } else if (!sectors) 3005 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - 3006 rdev->data_offset; 3007 if (!my_mddev->pers->resize) 3008 /* Cannot change size for RAID0 or Linear etc */ 3009 return -EINVAL; 3010 } 3011 if (sectors < my_mddev->dev_sectors) 3012 return -EINVAL; /* component must fit device */ 3013 3014 rdev->sectors = sectors; 3015 if (sectors > oldsectors && my_mddev->external) { 3016 /* Need to check that all other rdevs with the same 3017 * ->bdev do not overlap. 'rcu' is sufficient to walk 3018 * the rdev lists safely. 3019 * This check does not provide a hard guarantee, it 3020 * just helps avoid dangerous mistakes. 3021 */ 3022 struct mddev *mddev; 3023 int overlap = 0; 3024 struct list_head *tmp; 3025 3026 rcu_read_lock(); 3027 for_each_mddev(mddev, tmp) { 3028 struct md_rdev *rdev2; 3029 3030 rdev_for_each(rdev2, mddev) 3031 if (rdev->bdev == rdev2->bdev && 3032 rdev != rdev2 && 3033 overlaps(rdev->data_offset, rdev->sectors, 3034 rdev2->data_offset, 3035 rdev2->sectors)) { 3036 overlap = 1; 3037 break; 3038 } 3039 if (overlap) { 3040 mddev_put(mddev); 3041 break; 3042 } 3043 } 3044 rcu_read_unlock(); 3045 if (overlap) { 3046 /* Someone else could have slipped in a size 3047 * change here, but doing so is just silly. 
3048 * We put oldsectors back because we *know* it is 3049 * safe, and trust userspace not to race with 3050 * itself 3051 */ 3052 rdev->sectors = oldsectors; 3053 return -EBUSY; 3054 } 3055 } 3056 return len; 3057 } 3058 3059 static struct rdev_sysfs_entry rdev_size = 3060 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 3061 3062 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) 3063 { 3064 unsigned long long recovery_start = rdev->recovery_offset; 3065 3066 if (test_bit(In_sync, &rdev->flags) || 3067 recovery_start == MaxSector) 3068 return sprintf(page, "none\n"); 3069 3070 return sprintf(page, "%llu\n", recovery_start); 3071 } 3072 3073 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) 3074 { 3075 unsigned long long recovery_start; 3076 3077 if (cmd_match(buf, "none")) 3078 recovery_start = MaxSector; 3079 else if (kstrtoull(buf, 10, &recovery_start)) 3080 return -EINVAL; 3081 3082 if (rdev->mddev->pers && 3083 rdev->raid_disk >= 0) 3084 return -EBUSY; 3085 3086 rdev->recovery_offset = recovery_start; 3087 if (recovery_start == MaxSector) 3088 set_bit(In_sync, &rdev->flags); 3089 else 3090 clear_bit(In_sync, &rdev->flags); 3091 return len; 3092 } 3093 3094 static struct rdev_sysfs_entry rdev_recovery_start = 3095 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); 3096 3097 /* sysfs access to bad-blocks list. 3098 * We present two files. 3099 * 'bad-blocks' lists sector numbers and lengths of ranges that 3100 * are recorded as bad. The list is truncated to fit within 3101 * the one-page limit of sysfs. 3102 * Writing "sector length" to this file adds an acknowledged 3103 * bad block list. 3104 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet 3105 * been acknowledged. Writing to this file adds bad blocks 3106 * without acknowledging them. This is largely for testing. 
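 * For example (illustrative values): "echo 2048 16 > bad_blocks" records
 * a 16-sector bad range starting at sector 2048 and marks it acknowledged,
 * while the same write to unacknowledged_bad_blocks leaves it
 * unacknowledged.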
3107 */ 3108 static ssize_t bb_show(struct md_rdev *rdev, char *page) 3109 { 3110 return badblocks_show(&rdev->badblocks, page, 0); 3111 } 3112 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) 3113 { 3114 int rv = badblocks_store(&rdev->badblocks, page, len, 0); 3115 /* Maybe that ack was all we needed */ 3116 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) 3117 wake_up(&rdev->blocked_wait); 3118 return rv; 3119 } 3120 static struct rdev_sysfs_entry rdev_bad_blocks = 3121 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); 3122 3123 static ssize_t ubb_show(struct md_rdev *rdev, char *page) 3124 { 3125 return badblocks_show(&rdev->badblocks, page, 1); 3126 } 3127 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) 3128 { 3129 return badblocks_store(&rdev->badblocks, page, len, 1); 3130 } 3131 static struct rdev_sysfs_entry rdev_unack_bad_blocks = 3132 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); 3133 3134 static struct attribute *rdev_default_attrs[] = { 3135 &rdev_state.attr, 3136 &rdev_errors.attr, 3137 &rdev_slot.attr, 3138 &rdev_offset.attr, 3139 &rdev_new_offset.attr, 3140 &rdev_size.attr, 3141 &rdev_recovery_start.attr, 3142 &rdev_bad_blocks.attr, 3143 &rdev_unack_bad_blocks.attr, 3144 NULL, 3145 }; 3146 static ssize_t 3147 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3148 { 3149 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3150 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3151 3152 if (!entry->show) 3153 return -EIO; 3154 if (!rdev->mddev) 3155 return -EBUSY; 3156 return entry->show(rdev, page); 3157 } 3158 3159 static ssize_t 3160 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 3161 const char *page, size_t length) 3162 { 3163 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3164 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3165 ssize_t rv; 3166 struct mddev *mddev = rdev->mddev; 3167 3168 if (!entry->store) 3169 return -EIO; 3170 if (!capable(CAP_SYS_ADMIN)) 3171 return -EACCES; 3172 rv = mddev ? mddev_lock(mddev): -EBUSY; 3173 if (!rv) { 3174 if (rdev->mddev == NULL) 3175 rv = -EBUSY; 3176 else 3177 rv = entry->store(rdev, page, length); 3178 mddev_unlock(mddev); 3179 } 3180 return rv; 3181 } 3182 3183 static void rdev_free(struct kobject *ko) 3184 { 3185 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); 3186 kfree(rdev); 3187 } 3188 static const struct sysfs_ops rdev_sysfs_ops = { 3189 .show = rdev_attr_show, 3190 .store = rdev_attr_store, 3191 }; 3192 static struct kobj_type rdev_ktype = { 3193 .release = rdev_free, 3194 .sysfs_ops = &rdev_sysfs_ops, 3195 .default_attrs = rdev_default_attrs, 3196 }; 3197 3198 int md_rdev_init(struct md_rdev *rdev) 3199 { 3200 rdev->desc_nr = -1; 3201 rdev->saved_raid_disk = -1; 3202 rdev->raid_disk = -1; 3203 rdev->flags = 0; 3204 rdev->data_offset = 0; 3205 rdev->new_data_offset = 0; 3206 rdev->sb_events = 0; 3207 rdev->last_read_error = 0; 3208 rdev->sb_loaded = 0; 3209 rdev->bb_page = NULL; 3210 atomic_set(&rdev->nr_pending, 0); 3211 atomic_set(&rdev->read_errors, 0); 3212 atomic_set(&rdev->corrected_errors, 0); 3213 3214 INIT_LIST_HEAD(&rdev->same_set); 3215 init_waitqueue_head(&rdev->blocked_wait); 3216 3217 /* Add space to store bad block list. 
3218 * This reserves the space even on arrays where it cannot 3219 * be used - I wonder if that matters 3220 */ 3221 return badblocks_init(&rdev->badblocks, 0); 3222 } 3223 EXPORT_SYMBOL_GPL(md_rdev_init); 3224 /* 3225 * Import a device. If 'super_format' >= 0, then sanity check the superblock 3226 * 3227 * mark the device faulty if: 3228 * 3229 * - the device is nonexistent (zero size) 3230 * - the device has no valid superblock 3231 * 3232 * a faulty rdev _never_ has rdev->sb set. 3233 */ 3234 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) 3235 { 3236 char b[BDEVNAME_SIZE]; 3237 int err; 3238 struct md_rdev *rdev; 3239 sector_t size; 3240 3241 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 3242 if (!rdev) 3243 return ERR_PTR(-ENOMEM); 3244 3245 err = md_rdev_init(rdev); 3246 if (err) 3247 goto abort_free; 3248 err = alloc_disk_sb(rdev); 3249 if (err) 3250 goto abort_free; 3251 3252 err = lock_rdev(rdev, newdev, super_format == -2); 3253 if (err) 3254 goto abort_free; 3255 3256 kobject_init(&rdev->kobj, &rdev_ktype); 3257 3258 size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS; 3259 if (!size) { 3260 pr_warn("md: %s has zero or unknown size, marking faulty!\n", 3261 bdevname(rdev->bdev,b)); 3262 err = -EINVAL; 3263 goto abort_free; 3264 } 3265 3266 if (super_format >= 0) { 3267 err = super_types[super_format]. 3268 load_super(rdev, NULL, super_minor); 3269 if (err == -EINVAL) { 3270 pr_warn("md: %s does not have a valid v%d.%d superblock, not importing!\n", 3271 bdevname(rdev->bdev,b), 3272 super_format, super_minor); 3273 goto abort_free; 3274 } 3275 if (err < 0) { 3276 pr_warn("md: could not read %s's sb, not importing!\n", 3277 bdevname(rdev->bdev,b)); 3278 goto abort_free; 3279 } 3280 } 3281 3282 return rdev; 3283 3284 abort_free: 3285 if (rdev->bdev) 3286 unlock_rdev(rdev); 3287 md_rdev_clear(rdev); 3288 kfree(rdev); 3289 return ERR_PTR(err); 3290 } 3291 3292 /* 3293 * Check a full RAID array for plausibility 3294 */ 3295 3296 static void analyze_sbs(struct mddev *mddev) 3297 { 3298 int i; 3299 struct md_rdev *rdev, *freshest, *tmp; 3300 char b[BDEVNAME_SIZE]; 3301 3302 freshest = NULL; 3303 rdev_for_each_safe(rdev, tmp, mddev) 3304 switch (super_types[mddev->major_version]. 3305 load_super(rdev, freshest, mddev->minor_version)) { 3306 case 1: 3307 freshest = rdev; 3308 break; 3309 case 0: 3310 break; 3311 default: 3312 pr_warn("md: fatal superblock inconsistency in %s -- removing from array\n", 3313 bdevname(rdev->bdev,b)); 3314 md_kick_rdev_from_array(rdev); 3315 } 3316 3317 super_types[mddev->major_version]. 3318 validate_super(mddev, freshest); 3319 3320 i = 0; 3321 rdev_for_each_safe(rdev, tmp, mddev) { 3322 if (mddev->max_disks && 3323 (rdev->desc_nr >= mddev->max_disks || 3324 i > mddev->max_disks)) { 3325 pr_warn("md: %s: %s: only %d devices permitted\n", 3326 mdname(mddev), bdevname(rdev->bdev, b), 3327 mddev->max_disks); 3328 md_kick_rdev_from_array(rdev); 3329 continue; 3330 } 3331 if (rdev != freshest) { 3332 if (super_types[mddev->major_version]. 
3333 validate_super(mddev, rdev)) { 3334 pr_warn("md: kicking non-fresh %s from array!\n", 3335 bdevname(rdev->bdev,b)); 3336 md_kick_rdev_from_array(rdev); 3337 continue; 3338 } 3339 } 3340 if (mddev->level == LEVEL_MULTIPATH) { 3341 rdev->desc_nr = i++; 3342 rdev->raid_disk = rdev->desc_nr; 3343 set_bit(In_sync, &rdev->flags); 3344 } else if (rdev->raid_disk >= 3345 (mddev->raid_disks - min(0, mddev->delta_disks)) && 3346 !test_bit(Journal, &rdev->flags)) { 3347 rdev->raid_disk = -1; 3348 clear_bit(In_sync, &rdev->flags); 3349 } 3350 } 3351 } 3352 3353 /* Read a fixed-point number. 3354 * Numbers in sysfs attributes should be in "standard" units where 3355 * possible, so time should be in seconds. 3356 * However we internally use a a much smaller unit such as 3357 * milliseconds or jiffies. 3358 * This function takes a decimal number with a possible fractional 3359 * component, and produces an integer which is the result of 3360 * multiplying that number by 10^'scale'. 3361 * all without any floating-point arithmetic. 3362 */ 3363 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) 3364 { 3365 unsigned long result = 0; 3366 long decimals = -1; 3367 while (isdigit(*cp) || (*cp == '.' && decimals < 0)) { 3368 if (*cp == '.') 3369 decimals = 0; 3370 else if (decimals < scale) { 3371 unsigned int value; 3372 value = *cp - '0'; 3373 result = result * 10 + value; 3374 if (decimals >= 0) 3375 decimals++; 3376 } 3377 cp++; 3378 } 3379 if (*cp == '\n') 3380 cp++; 3381 if (*cp) 3382 return -EINVAL; 3383 if (decimals < 0) 3384 decimals = 0; 3385 while (decimals < scale) { 3386 result *= 10; 3387 decimals ++; 3388 } 3389 *res = result; 3390 return 0; 3391 } 3392 3393 static ssize_t 3394 safe_delay_show(struct mddev *mddev, char *page) 3395 { 3396 int msec = (mddev->safemode_delay*1000)/HZ; 3397 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); 3398 } 3399 static ssize_t 3400 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) 3401 { 3402 unsigned long msec; 3403 3404 if (mddev_is_clustered(mddev)) { 3405 pr_warn("md: Safemode is disabled for clustered mode\n"); 3406 return -EINVAL; 3407 } 3408 3409 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0) 3410 return -EINVAL; 3411 if (msec == 0) 3412 mddev->safemode_delay = 0; 3413 else { 3414 unsigned long old_delay = mddev->safemode_delay; 3415 unsigned long new_delay = (msec*HZ)/1000; 3416 3417 if (new_delay == 0) 3418 new_delay = 1; 3419 mddev->safemode_delay = new_delay; 3420 if (new_delay < old_delay || old_delay == 0) 3421 mod_timer(&mddev->safemode_timer, jiffies+1); 3422 } 3423 return len; 3424 } 3425 static struct md_sysfs_entry md_safe_delay = 3426 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 3427 3428 static ssize_t 3429 level_show(struct mddev *mddev, char *page) 3430 { 3431 struct md_personality *p; 3432 int ret; 3433 spin_lock(&mddev->lock); 3434 p = mddev->pers; 3435 if (p) 3436 ret = sprintf(page, "%s\n", p->name); 3437 else if (mddev->clevel[0]) 3438 ret = sprintf(page, "%s\n", mddev->clevel); 3439 else if (mddev->level != LEVEL_NONE) 3440 ret = sprintf(page, "%d\n", mddev->level); 3441 else 3442 ret = 0; 3443 spin_unlock(&mddev->lock); 3444 return ret; 3445 } 3446 3447 static ssize_t 3448 level_store(struct mddev *mddev, const char *buf, size_t len) 3449 { 3450 char clevel[16]; 3451 ssize_t rv; 3452 size_t slen = len; 3453 struct md_personality *pers, *oldpers; 3454 long level; 3455 void *priv, *oldpriv; 3456 struct md_rdev *rdev; 3457 3458 if (slen == 0 || slen >= 
sizeof(clevel)) 3459 return -EINVAL; 3460 3461 rv = mddev_lock(mddev); 3462 if (rv) 3463 return rv; 3464 3465 if (mddev->pers == NULL) { 3466 strncpy(mddev->clevel, buf, slen); 3467 if (mddev->clevel[slen-1] == '\n') 3468 slen--; 3469 mddev->clevel[slen] = 0; 3470 mddev->level = LEVEL_NONE; 3471 rv = len; 3472 goto out_unlock; 3473 } 3474 rv = -EROFS; 3475 if (mddev->ro) 3476 goto out_unlock; 3477 3478 /* request to change the personality. Need to ensure: 3479 * - array is not engaged in resync/recovery/reshape 3480 * - old personality can be suspended 3481 * - new personality will access other array. 3482 */ 3483 3484 rv = -EBUSY; 3485 if (mddev->sync_thread || 3486 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3487 mddev->reshape_position != MaxSector || 3488 mddev->sysfs_active) 3489 goto out_unlock; 3490 3491 rv = -EINVAL; 3492 if (!mddev->pers->quiesce) { 3493 pr_warn("md: %s: %s does not support online personality change\n", 3494 mdname(mddev), mddev->pers->name); 3495 goto out_unlock; 3496 } 3497 3498 /* Now find the new personality */ 3499 strncpy(clevel, buf, slen); 3500 if (clevel[slen-1] == '\n') 3501 slen--; 3502 clevel[slen] = 0; 3503 if (kstrtol(clevel, 10, &level)) 3504 level = LEVEL_NONE; 3505 3506 if (request_module("md-%s", clevel) != 0) 3507 request_module("md-level-%s", clevel); 3508 spin_lock(&pers_lock); 3509 pers = find_pers(level, clevel); 3510 if (!pers || !try_module_get(pers->owner)) { 3511 spin_unlock(&pers_lock); 3512 pr_warn("md: personality %s not loaded\n", clevel); 3513 rv = -EINVAL; 3514 goto out_unlock; 3515 } 3516 spin_unlock(&pers_lock); 3517 3518 if (pers == mddev->pers) { 3519 /* Nothing to do! */ 3520 module_put(pers->owner); 3521 rv = len; 3522 goto out_unlock; 3523 } 3524 if (!pers->takeover) { 3525 module_put(pers->owner); 3526 pr_warn("md: %s: %s does not support personality takeover\n", 3527 mdname(mddev), clevel); 3528 rv = -EINVAL; 3529 goto out_unlock; 3530 } 3531 3532 rdev_for_each(rdev, mddev) 3533 rdev->new_raid_disk = rdev->raid_disk; 3534 3535 /* ->takeover must set new_* and/or delta_disks 3536 * if it succeeds, and may set them when it fails. 3537 */ 3538 priv = pers->takeover(mddev); 3539 if (IS_ERR(priv)) { 3540 mddev->new_level = mddev->level; 3541 mddev->new_layout = mddev->layout; 3542 mddev->new_chunk_sectors = mddev->chunk_sectors; 3543 mddev->raid_disks -= mddev->delta_disks; 3544 mddev->delta_disks = 0; 3545 mddev->reshape_backwards = 0; 3546 module_put(pers->owner); 3547 pr_warn("md: %s: %s would not accept array\n", 3548 mdname(mddev), clevel); 3549 rv = PTR_ERR(priv); 3550 goto out_unlock; 3551 } 3552 3553 /* Looks like we have a winner */ 3554 mddev_suspend(mddev); 3555 mddev_detach(mddev); 3556 3557 spin_lock(&mddev->lock); 3558 oldpers = mddev->pers; 3559 oldpriv = mddev->private; 3560 mddev->pers = pers; 3561 mddev->private = priv; 3562 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 3563 mddev->level = mddev->new_level; 3564 mddev->layout = mddev->new_layout; 3565 mddev->chunk_sectors = mddev->new_chunk_sectors; 3566 mddev->delta_disks = 0; 3567 mddev->reshape_backwards = 0; 3568 mddev->degraded = 0; 3569 spin_unlock(&mddev->lock); 3570 3571 if (oldpers->sync_request == NULL && 3572 mddev->external) { 3573 /* We are converting from a no-redundancy array 3574 * to a redundancy array and metadata is managed 3575 * externally so we need to be sure that writes 3576 * won't block due to a need to transition 3577 * clean->dirty 3578 * until external management is started. 
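 * (Hence in_sync, safemode_delay and safemode are all cleared just below.)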
3579 */ 3580 mddev->in_sync = 0; 3581 mddev->safemode_delay = 0; 3582 mddev->safemode = 0; 3583 } 3584 3585 oldpers->free(mddev, oldpriv); 3586 3587 if (oldpers->sync_request == NULL && 3588 pers->sync_request != NULL) { 3589 /* need to add the md_redundancy_group */ 3590 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 3591 pr_warn("md: cannot register extra attributes for %s\n", 3592 mdname(mddev)); 3593 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 3594 } 3595 if (oldpers->sync_request != NULL && 3596 pers->sync_request == NULL) { 3597 /* need to remove the md_redundancy_group */ 3598 if (mddev->to_remove == NULL) 3599 mddev->to_remove = &md_redundancy_group; 3600 } 3601 3602 module_put(oldpers->owner); 3603 3604 rdev_for_each(rdev, mddev) { 3605 if (rdev->raid_disk < 0) 3606 continue; 3607 if (rdev->new_raid_disk >= mddev->raid_disks) 3608 rdev->new_raid_disk = -1; 3609 if (rdev->new_raid_disk == rdev->raid_disk) 3610 continue; 3611 sysfs_unlink_rdev(mddev, rdev); 3612 } 3613 rdev_for_each(rdev, mddev) { 3614 if (rdev->raid_disk < 0) 3615 continue; 3616 if (rdev->new_raid_disk == rdev->raid_disk) 3617 continue; 3618 rdev->raid_disk = rdev->new_raid_disk; 3619 if (rdev->raid_disk < 0) 3620 clear_bit(In_sync, &rdev->flags); 3621 else { 3622 if (sysfs_link_rdev(mddev, rdev)) 3623 pr_warn("md: cannot register rd%d for %s after level change\n", 3624 rdev->raid_disk, mdname(mddev)); 3625 } 3626 } 3627 3628 if (pers->sync_request == NULL) { 3629 /* this is now an array without redundancy, so 3630 * it must always be in_sync 3631 */ 3632 mddev->in_sync = 1; 3633 del_timer_sync(&mddev->safemode_timer); 3634 } 3635 blk_set_stacking_limits(&mddev->queue->limits); 3636 pers->run(mddev); 3637 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 3638 mddev_resume(mddev); 3639 if (!mddev->thread) 3640 md_update_sb(mddev, 1); 3641 sysfs_notify(&mddev->kobj, NULL, "level"); 3642 md_new_event(mddev); 3643 rv = len; 3644 out_unlock: 3645 mddev_unlock(mddev); 3646 return rv; 3647 } 3648 3649 static struct md_sysfs_entry md_level = 3650 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 3651 3652 static ssize_t 3653 layout_show(struct mddev *mddev, char *page) 3654 { 3655 /* just a number, not meaningful for all levels */ 3656 if (mddev->reshape_position != MaxSector && 3657 mddev->layout != mddev->new_layout) 3658 return sprintf(page, "%d (%d)\n", 3659 mddev->new_layout, mddev->layout); 3660 return sprintf(page, "%d\n", mddev->layout); 3661 } 3662 3663 static ssize_t 3664 layout_store(struct mddev *mddev, const char *buf, size_t len) 3665 { 3666 unsigned int n; 3667 int err; 3668 3669 err = kstrtouint(buf, 10, &n); 3670 if (err < 0) 3671 return err; 3672 err = mddev_lock(mddev); 3673 if (err) 3674 return err; 3675 3676 if (mddev->pers) { 3677 if (mddev->pers->check_reshape == NULL) 3678 err = -EBUSY; 3679 else if (mddev->ro) 3680 err = -EROFS; 3681 else { 3682 mddev->new_layout = n; 3683 err = mddev->pers->check_reshape(mddev); 3684 if (err) 3685 mddev->new_layout = mddev->layout; 3686 } 3687 } else { 3688 mddev->new_layout = n; 3689 if (mddev->reshape_position == MaxSector) 3690 mddev->layout = n; 3691 } 3692 mddev_unlock(mddev); 3693 return err ?: len; 3694 } 3695 static struct md_sysfs_entry md_layout = 3696 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 3697 3698 static ssize_t 3699 raid_disks_show(struct mddev *mddev, char *page) 3700 { 3701 if (mddev->raid_disks == 0) 3702 return 0; 3703 if (mddev->reshape_position != MaxSector && 3704 
mddev->delta_disks != 0) 3705 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 3706 mddev->raid_disks - mddev->delta_disks); 3707 return sprintf(page, "%d\n", mddev->raid_disks); 3708 } 3709 3710 static int update_raid_disks(struct mddev *mddev, int raid_disks); 3711 3712 static ssize_t 3713 raid_disks_store(struct mddev *mddev, const char *buf, size_t len) 3714 { 3715 unsigned int n; 3716 int err; 3717 3718 err = kstrtouint(buf, 10, &n); 3719 if (err < 0) 3720 return err; 3721 3722 err = mddev_lock(mddev); 3723 if (err) 3724 return err; 3725 if (mddev->pers) 3726 err = update_raid_disks(mddev, n); 3727 else if (mddev->reshape_position != MaxSector) { 3728 struct md_rdev *rdev; 3729 int olddisks = mddev->raid_disks - mddev->delta_disks; 3730 3731 err = -EINVAL; 3732 rdev_for_each(rdev, mddev) { 3733 if (olddisks < n && 3734 rdev->data_offset < rdev->new_data_offset) 3735 goto out_unlock; 3736 if (olddisks > n && 3737 rdev->data_offset > rdev->new_data_offset) 3738 goto out_unlock; 3739 } 3740 err = 0; 3741 mddev->delta_disks = n - olddisks; 3742 mddev->raid_disks = n; 3743 mddev->reshape_backwards = (mddev->delta_disks < 0); 3744 } else 3745 mddev->raid_disks = n; 3746 out_unlock: 3747 mddev_unlock(mddev); 3748 return err ? err : len; 3749 } 3750 static struct md_sysfs_entry md_raid_disks = 3751 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 3752 3753 static ssize_t 3754 chunk_size_show(struct mddev *mddev, char *page) 3755 { 3756 if (mddev->reshape_position != MaxSector && 3757 mddev->chunk_sectors != mddev->new_chunk_sectors) 3758 return sprintf(page, "%d (%d)\n", 3759 mddev->new_chunk_sectors << 9, 3760 mddev->chunk_sectors << 9); 3761 return sprintf(page, "%d\n", mddev->chunk_sectors << 9); 3762 } 3763 3764 static ssize_t 3765 chunk_size_store(struct mddev *mddev, const char *buf, size_t len) 3766 { 3767 unsigned long n; 3768 int err; 3769 3770 err = kstrtoul(buf, 10, &n); 3771 if (err < 0) 3772 return err; 3773 3774 err = mddev_lock(mddev); 3775 if (err) 3776 return err; 3777 if (mddev->pers) { 3778 if (mddev->pers->check_reshape == NULL) 3779 err = -EBUSY; 3780 else if (mddev->ro) 3781 err = -EROFS; 3782 else { 3783 mddev->new_chunk_sectors = n >> 9; 3784 err = mddev->pers->check_reshape(mddev); 3785 if (err) 3786 mddev->new_chunk_sectors = mddev->chunk_sectors; 3787 } 3788 } else { 3789 mddev->new_chunk_sectors = n >> 9; 3790 if (mddev->reshape_position == MaxSector) 3791 mddev->chunk_sectors = n >> 9; 3792 } 3793 mddev_unlock(mddev); 3794 return err ?: len; 3795 } 3796 static struct md_sysfs_entry md_chunk_size = 3797 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 3798 3799 static ssize_t 3800 resync_start_show(struct mddev *mddev, char *page) 3801 { 3802 if (mddev->recovery_cp == MaxSector) 3803 return sprintf(page, "none\n"); 3804 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 3805 } 3806 3807 static ssize_t 3808 resync_start_store(struct mddev *mddev, const char *buf, size_t len) 3809 { 3810 unsigned long long n; 3811 int err; 3812 3813 if (cmd_match(buf, "none")) 3814 n = MaxSector; 3815 else { 3816 err = kstrtoull(buf, 10, &n); 3817 if (err < 0) 3818 return err; 3819 if (n != (sector_t)n) 3820 return -EINVAL; 3821 } 3822 3823 err = mddev_lock(mddev); 3824 if (err) 3825 return err; 3826 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 3827 err = -EBUSY; 3828 3829 if (!err) { 3830 mddev->recovery_cp = n; 3831 if (mddev->pers) 3832 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 3833 
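		/* MD_SB_CHANGE_CLEAN is consumed by md_update_sb(), which
		 * records the new checkpoint in the on-disk metadata the
		 * next time the superblock is written. */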
} 3834 mddev_unlock(mddev); 3835 return err ?: len; 3836 } 3837 static struct md_sysfs_entry md_resync_start = 3838 __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, 3839 resync_start_show, resync_start_store); 3840 3841 /* 3842 * The array state can be: 3843 * 3844 * clear 3845 * No devices, no size, no level 3846 * Equivalent to STOP_ARRAY ioctl 3847 * inactive 3848 * May have some settings, but array is not active 3849 * all IO results in error 3850 * When written, doesn't tear down array, but just stops it 3851 * suspended (not supported yet) 3852 * All IO requests will block. The array can be reconfigured. 3853 * Writing this, if accepted, will block until array is quiescent 3854 * readonly 3855 * no resync can happen. no superblocks get written. 3856 * write requests fail 3857 * read-auto 3858 * like readonly, but behaves like 'clean' on a write request. 3859 * 3860 * clean - no pending writes, but otherwise active. 3861 * When written to inactive array, starts without resync 3862 * If a write request arrives then 3863 * if metadata is known, mark 'dirty' and switch to 'active'. 3864 * if not known, block and switch to write-pending 3865 * If written to an active array that has pending writes, then fails. 3866 * active 3867 * fully active: IO and resync can be happening. 3868 * When written to inactive array, starts with resync 3869 * 3870 * write-pending 3871 * clean, but writes are blocked waiting for 'active' to be written. 3872 * 3873 * active-idle 3874 * like active, but no writes have been seen for a while (100msec). 3875 * 3876 */ 3877 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 3878 write_pending, active_idle, bad_word}; 3879 static char *array_states[] = { 3880 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 3881 "write-pending", "active-idle", NULL }; 3882 3883 static int match_word(const char *word, char **list) 3884 { 3885 int n; 3886 for (n=0; list[n]; n++) 3887 if (cmd_match(word, list[n])) 3888 break; 3889 return n; 3890 } 3891 3892 static ssize_t 3893 array_state_show(struct mddev *mddev, char *page) 3894 { 3895 enum array_state st = inactive; 3896 3897 if (mddev->pers) 3898 switch(mddev->ro) { 3899 case 1: 3900 st = readonly; 3901 break; 3902 case 2: 3903 st = read_auto; 3904 break; 3905 case 0: 3906 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 3907 st = write_pending; 3908 else if (mddev->in_sync) 3909 st = clean; 3910 else if (mddev->safemode) 3911 st = active_idle; 3912 else 3913 st = active; 3914 } 3915 else { 3916 if (list_empty(&mddev->disks) && 3917 mddev->raid_disks == 0 && 3918 mddev->dev_sectors == 0) 3919 st = clear; 3920 else 3921 st = inactive; 3922 } 3923 return sprintf(page, "%s\n", array_states[st]); 3924 } 3925 3926 static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev); 3927 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev); 3928 static int do_md_run(struct mddev *mddev); 3929 static int restart_array(struct mddev *mddev); 3930 3931 static ssize_t 3932 array_state_store(struct mddev *mddev, const char *buf, size_t len) 3933 { 3934 int err; 3935 enum array_state st = match_word(buf, array_states); 3936 3937 if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) { 3938 /* don't take reconfig_mutex when toggling between 3939 * clean and active 3940 */ 3941 spin_lock(&mddev->lock); 3942 if (st == active) { 3943 restart_array(mddev); 3944 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 3945 
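/*
 * Illustrative sketch only: this lock-free clean/active toggle is what
 * an external metadata manager exercises through sysfs.  Reading the
 * attribute returns one of the strings in array_states[]; writing
 * "active" here acknowledges a metadata update so blocked writers can
 * proceed.  A user-space fragment (md0 is an assumed array name,
 * error handling trimmed):
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int fd = open("/sys/block/md0/md/array_state", O_WRONLY);
 *	if (fd >= 0) {
 *		write(fd, "active", 6);
 *		close(fd);
 *	}
 */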
md_wakeup_thread(mddev->thread); 3946 wake_up(&mddev->sb_wait); 3947 err = 0; 3948 } else /* st == clean */ { 3949 restart_array(mddev); 3950 if (atomic_read(&mddev->writes_pending) == 0) { 3951 if (mddev->in_sync == 0) { 3952 mddev->in_sync = 1; 3953 if (mddev->safemode == 1) 3954 mddev->safemode = 0; 3955 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 3956 } 3957 err = 0; 3958 } else 3959 err = -EBUSY; 3960 } 3961 if (!err) 3962 sysfs_notify_dirent_safe(mddev->sysfs_state); 3963 spin_unlock(&mddev->lock); 3964 return err ?: len; 3965 } 3966 err = mddev_lock(mddev); 3967 if (err) 3968 return err; 3969 err = -EINVAL; 3970 switch(st) { 3971 case bad_word: 3972 break; 3973 case clear: 3974 /* stopping an active array */ 3975 err = do_md_stop(mddev, 0, NULL); 3976 break; 3977 case inactive: 3978 /* stopping an active array */ 3979 if (mddev->pers) 3980 err = do_md_stop(mddev, 2, NULL); 3981 else 3982 err = 0; /* already inactive */ 3983 break; 3984 case suspended: 3985 break; /* not supported yet */ 3986 case readonly: 3987 if (mddev->pers) 3988 err = md_set_readonly(mddev, NULL); 3989 else { 3990 mddev->ro = 1; 3991 set_disk_ro(mddev->gendisk, 1); 3992 err = do_md_run(mddev); 3993 } 3994 break; 3995 case read_auto: 3996 if (mddev->pers) { 3997 if (mddev->ro == 0) 3998 err = md_set_readonly(mddev, NULL); 3999 else if (mddev->ro == 1) 4000 err = restart_array(mddev); 4001 if (err == 0) { 4002 mddev->ro = 2; 4003 set_disk_ro(mddev->gendisk, 0); 4004 } 4005 } else { 4006 mddev->ro = 2; 4007 err = do_md_run(mddev); 4008 } 4009 break; 4010 case clean: 4011 if (mddev->pers) { 4012 err = restart_array(mddev); 4013 if (err) 4014 break; 4015 spin_lock(&mddev->lock); 4016 if (atomic_read(&mddev->writes_pending) == 0) { 4017 if (mddev->in_sync == 0) { 4018 mddev->in_sync = 1; 4019 if (mddev->safemode == 1) 4020 mddev->safemode = 0; 4021 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 4022 } 4023 err = 0; 4024 } else 4025 err = -EBUSY; 4026 spin_unlock(&mddev->lock); 4027 } else 4028 err = -EINVAL; 4029 break; 4030 case active: 4031 if (mddev->pers) { 4032 err = restart_array(mddev); 4033 if (err) 4034 break; 4035 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 4036 wake_up(&mddev->sb_wait); 4037 err = 0; 4038 } else { 4039 mddev->ro = 0; 4040 set_disk_ro(mddev->gendisk, 0); 4041 err = do_md_run(mddev); 4042 } 4043 break; 4044 case write_pending: 4045 case active_idle: 4046 /* these cannot be set */ 4047 break; 4048 } 4049 4050 if (!err) { 4051 if (mddev->hold_active == UNTIL_IOCTL) 4052 mddev->hold_active = 0; 4053 sysfs_notify_dirent_safe(mddev->sysfs_state); 4054 } 4055 mddev_unlock(mddev); 4056 return err ?: len; 4057 } 4058 static struct md_sysfs_entry md_array_state = 4059 __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 4060 4061 static ssize_t 4062 max_corrected_read_errors_show(struct mddev *mddev, char *page) { 4063 return sprintf(page, "%d\n", 4064 atomic_read(&mddev->max_corr_read_errors)); 4065 } 4066 4067 static ssize_t 4068 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) 4069 { 4070 unsigned int n; 4071 int rv; 4072 4073 rv = kstrtouint(buf, 10, &n); 4074 if (rv < 0) 4075 return rv; 4076 atomic_set(&mddev->max_corr_read_errors, n); 4077 return len; 4078 } 4079 4080 static struct md_sysfs_entry max_corr_read_errors = 4081 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, 4082 max_corrected_read_errors_store); 4083 4084 static ssize_t 4085 null_show(struct mddev *mddev, char *page) 4086 { 4087 
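/*
 * Illustrative note: null_show() is the read side of write-only
 * attributes such as new_dev and bitmap_set_bits below.  new_dev_store()
 * expects "major:minor"; a user-space sketch of hot-adding block device
 * 8:16 to an assumed md0 (device numbers are only an example, error
 * handling omitted):
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int fd = open("/sys/block/md0/md/new_dev", O_WRONLY);
 *	if (fd >= 0) {
 *		dprintf(fd, "%d:%d", 8, 16);
 *		close(fd);
 *	}
 */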
return -EINVAL; 4088 } 4089 4090 static ssize_t 4091 new_dev_store(struct mddev *mddev, const char *buf, size_t len) 4092 { 4093 /* buf must be %d:%d\n? giving major and minor numbers */ 4094 /* The new device is added to the array. 4095 * If the array has a persistent superblock, we read the 4096 * superblock to initialise info and check validity. 4097 * Otherwise, only checking done is that in bind_rdev_to_array, 4098 * which mainly checks size. 4099 */ 4100 char *e; 4101 int major = simple_strtoul(buf, &e, 10); 4102 int minor; 4103 dev_t dev; 4104 struct md_rdev *rdev; 4105 int err; 4106 4107 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 4108 return -EINVAL; 4109 minor = simple_strtoul(e+1, &e, 10); 4110 if (*e && *e != '\n') 4111 return -EINVAL; 4112 dev = MKDEV(major, minor); 4113 if (major != MAJOR(dev) || 4114 minor != MINOR(dev)) 4115 return -EOVERFLOW; 4116 4117 flush_workqueue(md_misc_wq); 4118 4119 err = mddev_lock(mddev); 4120 if (err) 4121 return err; 4122 if (mddev->persistent) { 4123 rdev = md_import_device(dev, mddev->major_version, 4124 mddev->minor_version); 4125 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 4126 struct md_rdev *rdev0 4127 = list_entry(mddev->disks.next, 4128 struct md_rdev, same_set); 4129 err = super_types[mddev->major_version] 4130 .load_super(rdev, rdev0, mddev->minor_version); 4131 if (err < 0) 4132 goto out; 4133 } 4134 } else if (mddev->external) 4135 rdev = md_import_device(dev, -2, -1); 4136 else 4137 rdev = md_import_device(dev, -1, -1); 4138 4139 if (IS_ERR(rdev)) { 4140 mddev_unlock(mddev); 4141 return PTR_ERR(rdev); 4142 } 4143 err = bind_rdev_to_array(rdev, mddev); 4144 out: 4145 if (err) 4146 export_rdev(rdev); 4147 mddev_unlock(mddev); 4148 return err ? err : len; 4149 } 4150 4151 static struct md_sysfs_entry md_new_device = 4152 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 4153 4154 static ssize_t 4155 bitmap_store(struct mddev *mddev, const char *buf, size_t len) 4156 { 4157 char *end; 4158 unsigned long chunk, end_chunk; 4159 int err; 4160 4161 err = mddev_lock(mddev); 4162 if (err) 4163 return err; 4164 if (!mddev->bitmap) 4165 goto out; 4166 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ 4167 while (*buf) { 4168 chunk = end_chunk = simple_strtoul(buf, &end, 0); 4169 if (buf == end) break; 4170 if (*end == '-') { /* range */ 4171 buf = end + 1; 4172 end_chunk = simple_strtoul(buf, &end, 0); 4173 if (buf == end) break; 4174 } 4175 if (*end && !isspace(*end)) break; 4176 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); 4177 buf = skip_spaces(end); 4178 } 4179 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ 4180 out: 4181 mddev_unlock(mddev); 4182 return len; 4183 } 4184 4185 static struct md_sysfs_entry md_bitmap = 4186 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 4187 4188 static ssize_t 4189 size_show(struct mddev *mddev, char *page) 4190 { 4191 return sprintf(page, "%llu\n", 4192 (unsigned long long)mddev->dev_sectors / 2); 4193 } 4194 4195 static int update_size(struct mddev *mddev, sector_t num_sectors); 4196 4197 static ssize_t 4198 size_store(struct mddev *mddev, const char *buf, size_t len) 4199 { 4200 /* If array is inactive, we can reduce the component size, but 4201 * not increase it (except from 0). 
4202 * If array is active, we can try an on-line resize 4203 */ 4204 sector_t sectors; 4205 int err = strict_blocks_to_sectors(buf, &sectors); 4206 4207 if (err < 0) 4208 return err; 4209 err = mddev_lock(mddev); 4210 if (err) 4211 return err; 4212 if (mddev->pers) { 4213 err = update_size(mddev, sectors); 4214 if (err == 0) 4215 md_update_sb(mddev, 1); 4216 } else { 4217 if (mddev->dev_sectors == 0 || 4218 mddev->dev_sectors > sectors) 4219 mddev->dev_sectors = sectors; 4220 else 4221 err = -ENOSPC; 4222 } 4223 mddev_unlock(mddev); 4224 return err ? err : len; 4225 } 4226 4227 static struct md_sysfs_entry md_size = 4228 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 4229 4230 /* Metadata version. 4231 * This is one of 4232 * 'none' for arrays with no metadata (good luck...) 4233 * 'external' for arrays with externally managed metadata, 4234 * or N.M for internally known formats 4235 */ 4236 static ssize_t 4237 metadata_show(struct mddev *mddev, char *page) 4238 { 4239 if (mddev->persistent) 4240 return sprintf(page, "%d.%d\n", 4241 mddev->major_version, mddev->minor_version); 4242 else if (mddev->external) 4243 return sprintf(page, "external:%s\n", mddev->metadata_type); 4244 else 4245 return sprintf(page, "none\n"); 4246 } 4247 4248 static ssize_t 4249 metadata_store(struct mddev *mddev, const char *buf, size_t len) 4250 { 4251 int major, minor; 4252 char *e; 4253 int err; 4254 /* Changing the details of 'external' metadata is 4255 * always permitted. Otherwise there must be 4256 * no devices attached to the array. 4257 */ 4258 4259 err = mddev_lock(mddev); 4260 if (err) 4261 return err; 4262 err = -EBUSY; 4263 if (mddev->external && strncmp(buf, "external:", 9) == 0) 4264 ; 4265 else if (!list_empty(&mddev->disks)) 4266 goto out_unlock; 4267 4268 err = 0; 4269 if (cmd_match(buf, "none")) { 4270 mddev->persistent = 0; 4271 mddev->external = 0; 4272 mddev->major_version = 0; 4273 mddev->minor_version = 90; 4274 goto out_unlock; 4275 } 4276 if (strncmp(buf, "external:", 9) == 0) { 4277 size_t namelen = len-9; 4278 if (namelen >= sizeof(mddev->metadata_type)) 4279 namelen = sizeof(mddev->metadata_type)-1; 4280 strncpy(mddev->metadata_type, buf+9, namelen); 4281 mddev->metadata_type[namelen] = 0; 4282 if (namelen && mddev->metadata_type[namelen-1] == '\n') 4283 mddev->metadata_type[--namelen] = 0; 4284 mddev->persistent = 0; 4285 mddev->external = 1; 4286 mddev->major_version = 0; 4287 mddev->minor_version = 90; 4288 goto out_unlock; 4289 } 4290 major = simple_strtoul(buf, &e, 10); 4291 err = -EINVAL; 4292 if (e==buf || *e != '.') 4293 goto out_unlock; 4294 buf = e+1; 4295 minor = simple_strtoul(buf, &e, 10); 4296 if (e==buf || (*e && *e != '\n') ) 4297 goto out_unlock; 4298 err = -ENOENT; 4299 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 4300 goto out_unlock; 4301 mddev->major_version = major; 4302 mddev->minor_version = minor; 4303 mddev->persistent = 1; 4304 mddev->external = 0; 4305 err = 0; 4306 out_unlock: 4307 mddev_unlock(mddev); 4308 return err ?: len; 4309 } 4310 4311 static struct md_sysfs_entry md_metadata = 4312 __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 4313 4314 static ssize_t 4315 action_show(struct mddev *mddev, char *page) 4316 { 4317 char *type = "idle"; 4318 unsigned long recovery = mddev->recovery; 4319 if (test_bit(MD_RECOVERY_FROZEN, &recovery)) 4320 type = "frozen"; 4321 else if (test_bit(MD_RECOVERY_RUNNING, &recovery) || 4322 (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
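/*
 * Illustrative sketch only: the strings reported here mirror what can
 * be written to sync_action in action_store() below - "check" and
 * "repair" start a scrub, "idle" interrupts whatever is running and
 * "frozen" keeps new actions from starting.  Kicking off a check on an
 * assumed md0 from user space could look like:
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int fd = open("/sys/block/md0/md/sync_action", O_WRONLY);
 *	if (fd >= 0) {
 *		write(fd, "check", 5);
 *		close(fd);
 *	}
 */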
4323 if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) 4324 type = "reshape"; 4325 else if (test_bit(MD_RECOVERY_SYNC, &recovery)) { 4326 if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) 4327 type = "resync"; 4328 else if (test_bit(MD_RECOVERY_CHECK, &recovery)) 4329 type = "check"; 4330 else 4331 type = "repair"; 4332 } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) 4333 type = "recover"; 4334 else if (mddev->reshape_position != MaxSector) 4335 type = "reshape"; 4336 } 4337 return sprintf(page, "%s\n", type); 4338 } 4339 4340 static ssize_t 4341 action_store(struct mddev *mddev, const char *page, size_t len) 4342 { 4343 if (!mddev->pers || !mddev->pers->sync_request) 4344 return -EINVAL; 4345 4346 4347 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { 4348 if (cmd_match(page, "frozen")) 4349 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4350 else 4351 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4352 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 4353 mddev_lock(mddev) == 0) { 4354 flush_workqueue(md_misc_wq); 4355 if (mddev->sync_thread) { 4356 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4357 md_reap_sync_thread(mddev); 4358 } 4359 mddev_unlock(mddev); 4360 } 4361 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4362 return -EBUSY; 4363 else if (cmd_match(page, "resync")) 4364 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4365 else if (cmd_match(page, "recover")) { 4366 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4367 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 4368 } else if (cmd_match(page, "reshape")) { 4369 int err; 4370 if (mddev->pers->start_reshape == NULL) 4371 return -EINVAL; 4372 err = mddev_lock(mddev); 4373 if (!err) { 4374 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4375 err = -EBUSY; 4376 else { 4377 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4378 err = mddev->pers->start_reshape(mddev); 4379 } 4380 mddev_unlock(mddev); 4381 } 4382 if (err) 4383 return err; 4384 sysfs_notify(&mddev->kobj, NULL, "degraded"); 4385 } else { 4386 if (cmd_match(page, "check")) 4387 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4388 else if (!cmd_match(page, "repair")) 4389 return -EINVAL; 4390 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4391 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 4392 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4393 } 4394 if (mddev->ro == 2) { 4395 /* A write to sync_action is enough to justify 4396 * canceling read-auto mode 4397 */ 4398 mddev->ro = 0; 4399 md_wakeup_thread(mddev->sync_thread); 4400 } 4401 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4402 md_wakeup_thread(mddev->thread); 4403 sysfs_notify_dirent_safe(mddev->sysfs_action); 4404 return len; 4405 } 4406 4407 static struct md_sysfs_entry md_scan_mode = 4408 __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 4409 4410 static ssize_t 4411 last_sync_action_show(struct mddev *mddev, char *page) 4412 { 4413 return sprintf(page, "%s\n", mddev->last_sync_action); 4414 } 4415 4416 static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); 4417 4418 static ssize_t 4419 mismatch_cnt_show(struct mddev *mddev, char *page) 4420 { 4421 return sprintf(page, "%llu\n", 4422 (unsigned long long) 4423 atomic64_read(&mddev->resync_mismatches)); 4424 } 4425 4426 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 4427 4428 static ssize_t 4429 sync_min_show(struct mddev *mddev, char *page) 4430 { 4431 return sprintf(page, "%d (%s)\n", speed_min(mddev), 4432 mddev->sync_speed_min ? 
"local": "system"); 4433 } 4434 4435 static ssize_t 4436 sync_min_store(struct mddev *mddev, const char *buf, size_t len) 4437 { 4438 unsigned int min; 4439 int rv; 4440 4441 if (strncmp(buf, "system", 6)==0) { 4442 min = 0; 4443 } else { 4444 rv = kstrtouint(buf, 10, &min); 4445 if (rv < 0) 4446 return rv; 4447 if (min == 0) 4448 return -EINVAL; 4449 } 4450 mddev->sync_speed_min = min; 4451 return len; 4452 } 4453 4454 static struct md_sysfs_entry md_sync_min = 4455 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 4456 4457 static ssize_t 4458 sync_max_show(struct mddev *mddev, char *page) 4459 { 4460 return sprintf(page, "%d (%s)\n", speed_max(mddev), 4461 mddev->sync_speed_max ? "local": "system"); 4462 } 4463 4464 static ssize_t 4465 sync_max_store(struct mddev *mddev, const char *buf, size_t len) 4466 { 4467 unsigned int max; 4468 int rv; 4469 4470 if (strncmp(buf, "system", 6)==0) { 4471 max = 0; 4472 } else { 4473 rv = kstrtouint(buf, 10, &max); 4474 if (rv < 0) 4475 return rv; 4476 if (max == 0) 4477 return -EINVAL; 4478 } 4479 mddev->sync_speed_max = max; 4480 return len; 4481 } 4482 4483 static struct md_sysfs_entry md_sync_max = 4484 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 4485 4486 static ssize_t 4487 degraded_show(struct mddev *mddev, char *page) 4488 { 4489 return sprintf(page, "%d\n", mddev->degraded); 4490 } 4491 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 4492 4493 static ssize_t 4494 sync_force_parallel_show(struct mddev *mddev, char *page) 4495 { 4496 return sprintf(page, "%d\n", mddev->parallel_resync); 4497 } 4498 4499 static ssize_t 4500 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) 4501 { 4502 long n; 4503 4504 if (kstrtol(buf, 10, &n)) 4505 return -EINVAL; 4506 4507 if (n != 0 && n != 1) 4508 return -EINVAL; 4509 4510 mddev->parallel_resync = n; 4511 4512 if (mddev->sync_thread) 4513 wake_up(&resync_wait); 4514 4515 return len; 4516 } 4517 4518 /* force parallel resync, even with shared block devices */ 4519 static struct md_sysfs_entry md_sync_force_parallel = 4520 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 4521 sync_force_parallel_show, sync_force_parallel_store); 4522 4523 static ssize_t 4524 sync_speed_show(struct mddev *mddev, char *page) 4525 { 4526 unsigned long resync, dt, db; 4527 if (mddev->curr_resync == 0) 4528 return sprintf(page, "none\n"); 4529 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 4530 dt = (jiffies - mddev->resync_mark) / HZ; 4531 if (!dt) dt++; 4532 db = resync - mddev->resync_mark_cnt; 4533 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 4534 } 4535 4536 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 4537 4538 static ssize_t 4539 sync_completed_show(struct mddev *mddev, char *page) 4540 { 4541 unsigned long long max_sectors, resync; 4542 4543 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4544 return sprintf(page, "none\n"); 4545 4546 if (mddev->curr_resync == 1 || 4547 mddev->curr_resync == 2) 4548 return sprintf(page, "delayed\n"); 4549 4550 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 4551 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 4552 max_sectors = mddev->resync_max_sectors; 4553 else 4554 max_sectors = mddev->dev_sectors; 4555 4556 resync = mddev->curr_resync_completed; 4557 return sprintf(page, "%llu / %llu\n", resync, max_sectors); 4558 } 4559 4560 static struct md_sysfs_entry md_sync_completed = 4561 __ATTR_PREALLOC(sync_completed, S_IRUGO, 
sync_completed_show, NULL); 4562 4563 static ssize_t 4564 min_sync_show(struct mddev *mddev, char *page) 4565 { 4566 return sprintf(page, "%llu\n", 4567 (unsigned long long)mddev->resync_min); 4568 } 4569 static ssize_t 4570 min_sync_store(struct mddev *mddev, const char *buf, size_t len) 4571 { 4572 unsigned long long min; 4573 int err; 4574 4575 if (kstrtoull(buf, 10, &min)) 4576 return -EINVAL; 4577 4578 spin_lock(&mddev->lock); 4579 err = -EINVAL; 4580 if (min > mddev->resync_max) 4581 goto out_unlock; 4582 4583 err = -EBUSY; 4584 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4585 goto out_unlock; 4586 4587 /* Round down to multiple of 4K for safety */ 4588 mddev->resync_min = round_down(min, 8); 4589 err = 0; 4590 4591 out_unlock: 4592 spin_unlock(&mddev->lock); 4593 return err ?: len; 4594 } 4595 4596 static struct md_sysfs_entry md_min_sync = 4597 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 4598 4599 static ssize_t 4600 max_sync_show(struct mddev *mddev, char *page) 4601 { 4602 if (mddev->resync_max == MaxSector) 4603 return sprintf(page, "max\n"); 4604 else 4605 return sprintf(page, "%llu\n", 4606 (unsigned long long)mddev->resync_max); 4607 } 4608 static ssize_t 4609 max_sync_store(struct mddev *mddev, const char *buf, size_t len) 4610 { 4611 int err; 4612 spin_lock(&mddev->lock); 4613 if (strncmp(buf, "max", 3) == 0) 4614 mddev->resync_max = MaxSector; 4615 else { 4616 unsigned long long max; 4617 int chunk; 4618 4619 err = -EINVAL; 4620 if (kstrtoull(buf, 10, &max)) 4621 goto out_unlock; 4622 if (max < mddev->resync_min) 4623 goto out_unlock; 4624 4625 err = -EBUSY; 4626 if (max < mddev->resync_max && 4627 mddev->ro == 0 && 4628 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4629 goto out_unlock; 4630 4631 /* Must be a multiple of chunk_size */ 4632 chunk = mddev->chunk_sectors; 4633 if (chunk) { 4634 sector_t temp = max; 4635 4636 err = -EINVAL; 4637 if (sector_div(temp, chunk)) 4638 goto out_unlock; 4639 } 4640 mddev->resync_max = max; 4641 } 4642 wake_up(&mddev->recovery_wait); 4643 err = 0; 4644 out_unlock: 4645 spin_unlock(&mddev->lock); 4646 return err ?: len; 4647 } 4648 4649 static struct md_sysfs_entry md_max_sync = 4650 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 4651 4652 static ssize_t 4653 suspend_lo_show(struct mddev *mddev, char *page) 4654 { 4655 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 4656 } 4657 4658 static ssize_t 4659 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) 4660 { 4661 unsigned long long old, new; 4662 int err; 4663 4664 err = kstrtoull(buf, 10, &new); 4665 if (err < 0) 4666 return err; 4667 if (new != (sector_t)new) 4668 return -EINVAL; 4669 4670 err = mddev_lock(mddev); 4671 if (err) 4672 return err; 4673 err = -EINVAL; 4674 if (mddev->pers == NULL || 4675 mddev->pers->quiesce == NULL) 4676 goto unlock; 4677 old = mddev->suspend_lo; 4678 mddev->suspend_lo = new; 4679 if (new >= old) 4680 /* Shrinking suspended region */ 4681 mddev->pers->quiesce(mddev, 2); 4682 else { 4683 /* Expanding suspended region - need to wait */ 4684 mddev->pers->quiesce(mddev, 1); 4685 mddev->pers->quiesce(mddev, 0); 4686 } 4687 err = 0; 4688 unlock: 4689 mddev_unlock(mddev); 4690 return err ?: len; 4691 } 4692 static struct md_sysfs_entry md_suspend_lo = 4693 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 4694 4695 static ssize_t 4696 suspend_hi_show(struct mddev *mddev, char *page) 4697 { 4698 return sprintf(page, "%llu\n", (unsigned long 
long)mddev->suspend_hi); 4699 } 4700 4701 static ssize_t 4702 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) 4703 { 4704 unsigned long long old, new; 4705 int err; 4706 4707 err = kstrtoull(buf, 10, &new); 4708 if (err < 0) 4709 return err; 4710 if (new != (sector_t)new) 4711 return -EINVAL; 4712 4713 err = mddev_lock(mddev); 4714 if (err) 4715 return err; 4716 err = -EINVAL; 4717 if (mddev->pers == NULL || 4718 mddev->pers->quiesce == NULL) 4719 goto unlock; 4720 old = mddev->suspend_hi; 4721 mddev->suspend_hi = new; 4722 if (new <= old) 4723 /* Shrinking suspended region */ 4724 mddev->pers->quiesce(mddev, 2); 4725 else { 4726 /* Expanding suspended region - need to wait */ 4727 mddev->pers->quiesce(mddev, 1); 4728 mddev->pers->quiesce(mddev, 0); 4729 } 4730 err = 0; 4731 unlock: 4732 mddev_unlock(mddev); 4733 return err ?: len; 4734 } 4735 static struct md_sysfs_entry md_suspend_hi = 4736 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 4737 4738 static ssize_t 4739 reshape_position_show(struct mddev *mddev, char *page) 4740 { 4741 if (mddev->reshape_position != MaxSector) 4742 return sprintf(page, "%llu\n", 4743 (unsigned long long)mddev->reshape_position); 4744 strcpy(page, "none\n"); 4745 return 5; 4746 } 4747 4748 static ssize_t 4749 reshape_position_store(struct mddev *mddev, const char *buf, size_t len) 4750 { 4751 struct md_rdev *rdev; 4752 unsigned long long new; 4753 int err; 4754 4755 err = kstrtoull(buf, 10, &new); 4756 if (err < 0) 4757 return err; 4758 if (new != (sector_t)new) 4759 return -EINVAL; 4760 err = mddev_lock(mddev); 4761 if (err) 4762 return err; 4763 err = -EBUSY; 4764 if (mddev->pers) 4765 goto unlock; 4766 mddev->reshape_position = new; 4767 mddev->delta_disks = 0; 4768 mddev->reshape_backwards = 0; 4769 mddev->new_level = mddev->level; 4770 mddev->new_layout = mddev->layout; 4771 mddev->new_chunk_sectors = mddev->chunk_sectors; 4772 rdev_for_each(rdev, mddev) 4773 rdev->new_data_offset = rdev->data_offset; 4774 err = 0; 4775 unlock: 4776 mddev_unlock(mddev); 4777 return err ?: len; 4778 } 4779 4780 static struct md_sysfs_entry md_reshape_position = 4781 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 4782 reshape_position_store); 4783 4784 static ssize_t 4785 reshape_direction_show(struct mddev *mddev, char *page) 4786 { 4787 return sprintf(page, "%s\n", 4788 mddev->reshape_backwards ?
"backwards" : "forwards"); 4789 } 4790 4791 static ssize_t 4792 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) 4793 { 4794 int backwards = 0; 4795 int err; 4796 4797 if (cmd_match(buf, "forwards")) 4798 backwards = 0; 4799 else if (cmd_match(buf, "backwards")) 4800 backwards = 1; 4801 else 4802 return -EINVAL; 4803 if (mddev->reshape_backwards == backwards) 4804 return len; 4805 4806 err = mddev_lock(mddev); 4807 if (err) 4808 return err; 4809 /* check if we are allowed to change */ 4810 if (mddev->delta_disks) 4811 err = -EBUSY; 4812 else if (mddev->persistent && 4813 mddev->major_version == 0) 4814 err = -EINVAL; 4815 else 4816 mddev->reshape_backwards = backwards; 4817 mddev_unlock(mddev); 4818 return err ?: len; 4819 } 4820 4821 static struct md_sysfs_entry md_reshape_direction = 4822 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, 4823 reshape_direction_store); 4824 4825 static ssize_t 4826 array_size_show(struct mddev *mddev, char *page) 4827 { 4828 if (mddev->external_size) 4829 return sprintf(page, "%llu\n", 4830 (unsigned long long)mddev->array_sectors/2); 4831 else 4832 return sprintf(page, "default\n"); 4833 } 4834 4835 static ssize_t 4836 array_size_store(struct mddev *mddev, const char *buf, size_t len) 4837 { 4838 sector_t sectors; 4839 int err; 4840 4841 err = mddev_lock(mddev); 4842 if (err) 4843 return err; 4844 4845 /* cluster raid doesn't support change array_sectors */ 4846 if (mddev_is_clustered(mddev)) { 4847 mddev_unlock(mddev); return -EINVAL; } 4848 4849 if (strncmp(buf, "default", 7) == 0) { 4850 if (mddev->pers) 4851 sectors = mddev->pers->size(mddev, 0, 0); 4852 else 4853 sectors = mddev->array_sectors; 4854 4855 mddev->external_size = 0; 4856 } else { 4857 if (strict_blocks_to_sectors(buf, &sectors) < 0) 4858 err = -EINVAL; 4859 else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) 4860 err = -E2BIG; 4861 else 4862 mddev->external_size = 1; 4863 } 4864 4865 if (!err) { 4866 mddev->array_sectors = sectors; 4867 if (mddev->pers) { 4868 set_capacity(mddev->gendisk, mddev->array_sectors); 4869 revalidate_disk(mddev->gendisk); 4870 } 4871 } 4872 mddev_unlock(mddev); 4873 return err ?: len; 4874 } 4875 4876 static struct md_sysfs_entry md_array_size = 4877 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, 4878 array_size_store); 4879 4880 static struct attribute *md_default_attrs[] = { 4881 &md_level.attr, 4882 &md_layout.attr, 4883 &md_raid_disks.attr, 4884 &md_chunk_size.attr, 4885 &md_size.attr, 4886 &md_resync_start.attr, 4887 &md_metadata.attr, 4888 &md_new_device.attr, 4889 &md_safe_delay.attr, 4890 &md_array_state.attr, 4891 &md_reshape_position.attr, 4892 &md_reshape_direction.attr, 4893 &md_array_size.attr, 4894 &max_corr_read_errors.attr, 4895 NULL, 4896 }; 4897 4898 static struct attribute *md_redundancy_attrs[] = { 4899 &md_scan_mode.attr, 4900 &md_last_scan_mode.attr, 4901 &md_mismatches.attr, 4902 &md_sync_min.attr, 4903 &md_sync_max.attr, 4904 &md_sync_speed.attr, 4905 &md_sync_force_parallel.attr, 4906 &md_sync_completed.attr, 4907 &md_min_sync.attr, 4908 &md_max_sync.attr, 4909 &md_suspend_lo.attr, 4910 &md_suspend_hi.attr, 4911 &md_bitmap.attr, 4912 &md_degraded.attr, 4913 NULL, 4914 }; 4915 static struct attribute_group md_redundancy_group = { 4916 .name = NULL, 4917 .attrs = md_redundancy_attrs, 4918 }; 4919 4920 static ssize_t 4921 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 4922 { 4923 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 4924 struct mddev
*mddev = container_of(kobj, struct mddev, kobj); 4925 ssize_t rv; 4926 4927 if (!entry->show) 4928 return -EIO; 4929 spin_lock(&all_mddevs_lock); 4930 if (list_empty(&mddev->all_mddevs)) { 4931 spin_unlock(&all_mddevs_lock); 4932 return -EBUSY; 4933 } 4934 mddev_get(mddev); 4935 spin_unlock(&all_mddevs_lock); 4936 4937 rv = entry->show(mddev, page); 4938 mddev_put(mddev); 4939 return rv; 4940 } 4941 4942 static ssize_t 4943 md_attr_store(struct kobject *kobj, struct attribute *attr, 4944 const char *page, size_t length) 4945 { 4946 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 4947 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 4948 ssize_t rv; 4949 4950 if (!entry->store) 4951 return -EIO; 4952 if (!capable(CAP_SYS_ADMIN)) 4953 return -EACCES; 4954 spin_lock(&all_mddevs_lock); 4955 if (list_empty(&mddev->all_mddevs)) { 4956 spin_unlock(&all_mddevs_lock); 4957 return -EBUSY; 4958 } 4959 mddev_get(mddev); 4960 spin_unlock(&all_mddevs_lock); 4961 rv = entry->store(mddev, page, length); 4962 mddev_put(mddev); 4963 return rv; 4964 } 4965 4966 static void md_free(struct kobject *ko) 4967 { 4968 struct mddev *mddev = container_of(ko, struct mddev, kobj); 4969 4970 if (mddev->sysfs_state) 4971 sysfs_put(mddev->sysfs_state); 4972 4973 if (mddev->queue) 4974 blk_cleanup_queue(mddev->queue); 4975 if (mddev->gendisk) { 4976 del_gendisk(mddev->gendisk); 4977 put_disk(mddev->gendisk); 4978 } 4979 4980 kfree(mddev); 4981 } 4982 4983 static const struct sysfs_ops md_sysfs_ops = { 4984 .show = md_attr_show, 4985 .store = md_attr_store, 4986 }; 4987 static struct kobj_type md_ktype = { 4988 .release = md_free, 4989 .sysfs_ops = &md_sysfs_ops, 4990 .default_attrs = md_default_attrs, 4991 }; 4992 4993 int mdp_major = 0; 4994 4995 static void mddev_delayed_delete(struct work_struct *ws) 4996 { 4997 struct mddev *mddev = container_of(ws, struct mddev, del_work); 4998 4999 sysfs_remove_group(&mddev->kobj, &md_bitmap_group); 5000 kobject_del(&mddev->kobj); 5001 kobject_put(&mddev->kobj); 5002 } 5003 5004 static int md_alloc(dev_t dev, char *name) 5005 { 5006 static DEFINE_MUTEX(disks_mutex); 5007 struct mddev *mddev = mddev_find(dev); 5008 struct gendisk *disk; 5009 int partitioned; 5010 int shift; 5011 int unit; 5012 int error; 5013 5014 if (!mddev) 5015 return -ENODEV; 5016 5017 partitioned = (MAJOR(mddev->unit) != MD_MAJOR); 5018 shift = partitioned ? MdpMinorShift : 0; 5019 unit = MINOR(mddev->unit) >> shift; 5020 5021 /* wait for any previous instance of this device to be 5022 * completely removed (mddev_delayed_delete). 5023 */ 5024 flush_workqueue(md_misc_wq); 5025 5026 mutex_lock(&disks_mutex); 5027 error = -EEXIST; 5028 if (mddev->gendisk) 5029 goto abort; 5030 5031 if (name) { 5032 /* Need to ensure that 'name' is not a duplicate. 
5033 */ 5034 struct mddev *mddev2; 5035 spin_lock(&all_mddevs_lock); 5036 5037 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 5038 if (mddev2->gendisk && 5039 strcmp(mddev2->gendisk->disk_name, name) == 0) { 5040 spin_unlock(&all_mddevs_lock); 5041 goto abort; 5042 } 5043 spin_unlock(&all_mddevs_lock); 5044 } 5045 5046 error = -ENOMEM; 5047 mddev->queue = blk_alloc_queue(GFP_KERNEL); 5048 if (!mddev->queue) 5049 goto abort; 5050 mddev->queue->queuedata = mddev; 5051 5052 blk_queue_make_request(mddev->queue, md_make_request); 5053 blk_set_stacking_limits(&mddev->queue->limits); 5054 5055 disk = alloc_disk(1 << shift); 5056 if (!disk) { 5057 blk_cleanup_queue(mddev->queue); 5058 mddev->queue = NULL; 5059 goto abort; 5060 } 5061 disk->major = MAJOR(mddev->unit); 5062 disk->first_minor = unit << shift; 5063 if (name) 5064 strcpy(disk->disk_name, name); 5065 else if (partitioned) 5066 sprintf(disk->disk_name, "md_d%d", unit); 5067 else 5068 sprintf(disk->disk_name, "md%d", unit); 5069 disk->fops = &md_fops; 5070 disk->private_data = mddev; 5071 disk->queue = mddev->queue; 5072 blk_queue_write_cache(mddev->queue, true, true); 5073 /* Allow extended partitions. This makes the 5074 * 'mdp' device redundant, but we can't really 5075 * remove it now. 5076 */ 5077 disk->flags |= GENHD_FL_EXT_DEVT; 5078 mddev->gendisk = disk; 5079 /* As soon as we call add_disk(), another thread could get 5080 * through to md_open, so make sure it doesn't get too far 5081 */ 5082 mutex_lock(&mddev->open_mutex); 5083 add_disk(disk); 5084 5085 error = kobject_init_and_add(&mddev->kobj, &md_ktype, 5086 &disk_to_dev(disk)->kobj, "%s", "md"); 5087 if (error) { 5088 /* This isn't possible, but as kobject_init_and_add is marked 5089 * __must_check, we must do something with the result 5090 */ 5091 pr_debug("md: cannot register %s/md - name in use\n", 5092 disk->disk_name); 5093 error = 0; 5094 } 5095 if (mddev->kobj.sd && 5096 sysfs_create_group(&mddev->kobj, &md_bitmap_group)) 5097 pr_debug("pointless warning\n"); 5098 mutex_unlock(&mddev->open_mutex); 5099 abort: 5100 mutex_unlock(&disks_mutex); 5101 if (!error && mddev->kobj.sd) { 5102 kobject_uevent(&mddev->kobj, KOBJ_ADD); 5103 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); 5104 } 5105 mddev_put(mddev); 5106 return error; 5107 } 5108 5109 static struct kobject *md_probe(dev_t dev, int *part, void *data) 5110 { 5111 md_alloc(dev, NULL); 5112 return NULL; 5113 } 5114 5115 static int add_named_array(const char *val, struct kernel_param *kp) 5116 { 5117 /* val must be "md_*" where * is not all digits. 5118 * We allocate an array with a large free minor number, and 5119 * set the name to val. val must not already be an active name. 
5120 */ 5121 int len = strlen(val); 5122 char buf[DISK_NAME_LEN]; 5123 5124 while (len && val[len-1] == '\n') 5125 len--; 5126 if (len >= DISK_NAME_LEN) 5127 return -E2BIG; 5128 strlcpy(buf, val, len+1); 5129 if (strncmp(buf, "md_", 3) != 0) 5130 return -EINVAL; 5131 return md_alloc(0, buf); 5132 } 5133 5134 static void md_safemode_timeout(unsigned long data) 5135 { 5136 struct mddev *mddev = (struct mddev *) data; 5137 5138 if (!atomic_read(&mddev->writes_pending)) { 5139 mddev->safemode = 1; 5140 if (mddev->external) 5141 sysfs_notify_dirent_safe(mddev->sysfs_state); 5142 } 5143 md_wakeup_thread(mddev->thread); 5144 } 5145 5146 static int start_dirty_degraded; 5147 5148 int md_run(struct mddev *mddev) 5149 { 5150 int err; 5151 struct md_rdev *rdev; 5152 struct md_personality *pers; 5153 5154 if (list_empty(&mddev->disks)) 5155 /* cannot run an array with no devices.. */ 5156 return -EINVAL; 5157 5158 if (mddev->pers) 5159 return -EBUSY; 5160 /* Cannot run until previous stop completes properly */ 5161 if (mddev->sysfs_active) 5162 return -EBUSY; 5163 5164 /* 5165 * Analyze all RAID superblock(s) 5166 */ 5167 if (!mddev->raid_disks) { 5168 if (!mddev->persistent) 5169 return -EINVAL; 5170 analyze_sbs(mddev); 5171 } 5172 5173 if (mddev->level != LEVEL_NONE) 5174 request_module("md-level-%d", mddev->level); 5175 else if (mddev->clevel[0]) 5176 request_module("md-%s", mddev->clevel); 5177 5178 /* 5179 * Drop all container device buffers, from now on 5180 * the only valid external interface is through the md 5181 * device. 5182 */ 5183 rdev_for_each(rdev, mddev) { 5184 if (test_bit(Faulty, &rdev->flags)) 5185 continue; 5186 sync_blockdev(rdev->bdev); 5187 invalidate_bdev(rdev->bdev); 5188 5189 /* perform some consistency tests on the device. 5190 * We don't want the data to overlap the metadata, 5191 * Internal Bitmap issues have been handled elsewhere. 5192 */ 5193 if (rdev->meta_bdev) { 5194 /* Nothing to check */; 5195 } else if (rdev->data_offset < rdev->sb_start) { 5196 if (mddev->dev_sectors && 5197 rdev->data_offset + mddev->dev_sectors 5198 > rdev->sb_start) { 5199 pr_warn("md: %s: data overlaps metadata\n", 5200 mdname(mddev)); 5201 return -EINVAL; 5202 } 5203 } else { 5204 if (rdev->sb_start + rdev->sb_size/512 5205 > rdev->data_offset) { 5206 pr_warn("md: %s: metadata overlaps data\n", 5207 mdname(mddev)); 5208 return -EINVAL; 5209 } 5210 } 5211 sysfs_notify_dirent_safe(rdev->sysfs_state); 5212 } 5213 5214 if (mddev->bio_set == NULL) { 5215 mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0); 5216 if (!mddev->bio_set) 5217 return -ENOMEM; 5218 } 5219 5220 spin_lock(&pers_lock); 5221 pers = find_pers(mddev->level, mddev->clevel); 5222 if (!pers || !try_module_get(pers->owner)) { 5223 spin_unlock(&pers_lock); 5224 if (mddev->level != LEVEL_NONE) 5225 pr_warn("md: personality for level %d is not loaded!\n", 5226 mddev->level); 5227 else 5228 pr_warn("md: personality for level %s is not loaded!\n", 5229 mddev->clevel); 5230 return -EINVAL; 5231 } 5232 spin_unlock(&pers_lock); 5233 if (mddev->level != pers->level) { 5234 mddev->level = pers->level; 5235 mddev->new_level = pers->level; 5236 } 5237 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 5238 5239 if (mddev->reshape_position != MaxSector && 5240 pers->start_reshape == NULL) { 5241 /* This personality cannot handle reshaping... */ 5242 module_put(pers->owner); 5243 return -EINVAL; 5244 } 5245 5246 if (pers->sync_request) { 5247 /* Warn if this is a potentially silly 5248 * configuration. 
5249 */ 5250 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 5251 struct md_rdev *rdev2; 5252 int warned = 0; 5253 5254 rdev_for_each(rdev, mddev) 5255 rdev_for_each(rdev2, mddev) { 5256 if (rdev < rdev2 && 5257 rdev->bdev->bd_contains == 5258 rdev2->bdev->bd_contains) { 5259 pr_warn("%s: WARNING: %s appears to be on the same physical disk as %s.\n", 5260 mdname(mddev), 5261 bdevname(rdev->bdev,b), 5262 bdevname(rdev2->bdev,b2)); 5263 warned = 1; 5264 } 5265 } 5266 5267 if (warned) 5268 pr_warn("True protection against single-disk failure might be compromised.\n"); 5269 } 5270 5271 mddev->recovery = 0; 5272 /* may be over-ridden by personality */ 5273 mddev->resync_max_sectors = mddev->dev_sectors; 5274 5275 mddev->ok_start_degraded = start_dirty_degraded; 5276 5277 if (start_readonly && mddev->ro == 0) 5278 mddev->ro = 2; /* read-only, but switch on first write */ 5279 5280 /* 5281 * NOTE: some pers->run(), for example r5l_recovery_log(), wakes 5282 * up mddev->thread. It is important to initialize critical 5283 * resources for mddev->thread BEFORE calling pers->run(). 5284 */ 5285 err = pers->run(mddev); 5286 if (err) 5287 pr_warn("md: pers->run() failed ...\n"); 5288 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { 5289 WARN_ONCE(!mddev->external_size, 5290 "%s: default size too small, but 'external_size' not in effect?\n", 5291 __func__); 5292 pr_warn("md: invalid array_size %llu > default size %llu\n", 5293 (unsigned long long)mddev->array_sectors / 2, 5294 (unsigned long long)pers->size(mddev, 0, 0) / 2); 5295 err = -EINVAL; 5296 } 5297 if (err == 0 && pers->sync_request && 5298 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { 5299 struct bitmap *bitmap; 5300 5301 bitmap = bitmap_create(mddev, -1); 5302 if (IS_ERR(bitmap)) { 5303 err = PTR_ERR(bitmap); 5304 pr_warn("%s: failed to create bitmap (%d)\n", 5305 mdname(mddev), err); 5306 } else 5307 mddev->bitmap = bitmap; 5308 5309 } 5310 if (err) { 5311 mddev_detach(mddev); 5312 if (mddev->private) 5313 pers->free(mddev, mddev->private); 5314 mddev->private = NULL; 5315 module_put(pers->owner); 5316 bitmap_destroy(mddev); 5317 return err; 5318 } 5319 if (mddev->queue) { 5320 bool nonrot = true; 5321 5322 rdev_for_each(rdev, mddev) { 5323 if (rdev->raid_disk >= 0 && 5324 !blk_queue_nonrot(bdev_get_queue(rdev->bdev))) { 5325 nonrot = false; 5326 break; 5327 } 5328 } 5329 if (mddev->degraded) 5330 nonrot = false; 5331 if (nonrot) 5332 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mddev->queue); 5333 else 5334 queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, mddev->queue); 5335 mddev->queue->backing_dev_info->congested_data = mddev; 5336 mddev->queue->backing_dev_info->congested_fn = md_congested; 5337 } 5338 if (pers->sync_request) { 5339 if (mddev->kobj.sd && 5340 sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 5341 pr_warn("md: cannot register extra attributes for %s\n", 5342 mdname(mddev)); 5343 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); 5344 } else if (mddev->ro == 2) /* auto-readonly not meaningful */ 5345 mddev->ro = 0; 5346 5347 atomic_set(&mddev->writes_pending,0); 5348 atomic_set(&mddev->max_corr_read_errors, 5349 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); 5350 mddev->safemode = 0; 5351 if (mddev_is_clustered(mddev)) 5352 mddev->safemode_delay = 0; 5353 else 5354 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 5355 mddev->in_sync = 1; 5356 smp_wmb(); 5357 spin_lock(&mddev->lock); 5358 mddev->pers = pers; 5359 spin_unlock(&mddev->lock); 5360 rdev_for_each(rdev, mddev) 5361 
if (rdev->raid_disk >= 0) 5362 if (sysfs_link_rdev(mddev, rdev)) 5363 /* failure here is OK */; 5364 5365 if (mddev->degraded && !mddev->ro) 5366 /* This ensures that recovering status is reported immediately 5367 * via sysfs - until a lack of spares is confirmed. 5368 */ 5369 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 5370 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5371 5372 if (mddev->sb_flags) 5373 md_update_sb(mddev, 0); 5374 5375 md_new_event(mddev); 5376 sysfs_notify_dirent_safe(mddev->sysfs_state); 5377 sysfs_notify_dirent_safe(mddev->sysfs_action); 5378 sysfs_notify(&mddev->kobj, NULL, "degraded"); 5379 return 0; 5380 } 5381 EXPORT_SYMBOL_GPL(md_run); 5382 5383 static int do_md_run(struct mddev *mddev) 5384 { 5385 int err; 5386 5387 err = md_run(mddev); 5388 if (err) 5389 goto out; 5390 err = bitmap_load(mddev); 5391 if (err) { 5392 bitmap_destroy(mddev); 5393 goto out; 5394 } 5395 5396 if (mddev_is_clustered(mddev)) 5397 md_allow_write(mddev); 5398 5399 md_wakeup_thread(mddev->thread); 5400 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 5401 5402 set_capacity(mddev->gendisk, mddev->array_sectors); 5403 revalidate_disk(mddev->gendisk); 5404 mddev->changed = 1; 5405 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 5406 out: 5407 return err; 5408 } 5409 5410 static int restart_array(struct mddev *mddev) 5411 { 5412 struct gendisk *disk = mddev->gendisk; 5413 5414 /* Complain if it has no devices */ 5415 if (list_empty(&mddev->disks)) 5416 return -ENXIO; 5417 if (!mddev->pers) 5418 return -EINVAL; 5419 if (!mddev->ro) 5420 return -EBUSY; 5421 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 5422 struct md_rdev *rdev; 5423 bool has_journal = false; 5424 5425 rcu_read_lock(); 5426 rdev_for_each_rcu(rdev, mddev) { 5427 if (test_bit(Journal, &rdev->flags) && 5428 !test_bit(Faulty, &rdev->flags)) { 5429 has_journal = true; 5430 break; 5431 } 5432 } 5433 rcu_read_unlock(); 5434 5435 /* Don't restart rw with journal missing/faulty */ 5436 if (!has_journal) 5437 return -EINVAL; 5438 } 5439 5440 mddev->safemode = 0; 5441 mddev->ro = 0; 5442 set_disk_ro(disk, 0); 5443 pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); 5444 /* Kick recovery or resync if necessary */ 5445 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5446 md_wakeup_thread(mddev->thread); 5447 md_wakeup_thread(mddev->sync_thread); 5448 sysfs_notify_dirent_safe(mddev->sysfs_state); 5449 return 0; 5450 } 5451 5452 static void md_clean(struct mddev *mddev) 5453 { 5454 mddev->array_sectors = 0; 5455 mddev->external_size = 0; 5456 mddev->dev_sectors = 0; 5457 mddev->raid_disks = 0; 5458 mddev->recovery_cp = 0; 5459 mddev->resync_min = 0; 5460 mddev->resync_max = MaxSector; 5461 mddev->reshape_position = MaxSector; 5462 mddev->external = 0; 5463 mddev->persistent = 0; 5464 mddev->level = LEVEL_NONE; 5465 mddev->clevel[0] = 0; 5466 mddev->flags = 0; 5467 mddev->sb_flags = 0; 5468 mddev->ro = 0; 5469 mddev->metadata_type[0] = 0; 5470 mddev->chunk_sectors = 0; 5471 mddev->ctime = mddev->utime = 0; 5472 mddev->layout = 0; 5473 mddev->max_disks = 0; 5474 mddev->events = 0; 5475 mddev->can_decrease_events = 0; 5476 mddev->delta_disks = 0; 5477 mddev->reshape_backwards = 0; 5478 mddev->new_level = LEVEL_NONE; 5479 mddev->new_layout = 0; 5480 mddev->new_chunk_sectors = 0; 5481 mddev->curr_resync = 0; 5482 atomic64_set(&mddev->resync_mismatches, 0); 5483 mddev->suspend_lo = mddev->suspend_hi = 0; 5484 mddev->sync_speed_min = mddev->sync_speed_max = 0; 5485 mddev->recovery = 0; 5486 
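/*
 * Illustrative note: md_clean() runs on a full stop (do_md_stop() with
 * mode 0, reached for instance via the STOP_ARRAY ioctl or by writing
 * "clear" to array_state) and resets the mddev so it can be reused.
 * A user-space sketch of such a stop on an assumed /dev/md0 (error
 * handling omitted):
 *
 *	#include <fcntl.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/raid/md_u.h>
 *
 *	int fd = open("/dev/md0", O_RDONLY);
 *	if (fd >= 0) {
 *		ioctl(fd, STOP_ARRAY, NULL);	// ends up in do_md_stop(..., 0, ...)
 *		close(fd);
 *	}
 */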
mddev->in_sync = 0; 5487 mddev->changed = 0; 5488 mddev->degraded = 0; 5489 mddev->safemode = 0; 5490 mddev->private = NULL; 5491 mddev->cluster_info = NULL; 5492 mddev->bitmap_info.offset = 0; 5493 mddev->bitmap_info.default_offset = 0; 5494 mddev->bitmap_info.default_space = 0; 5495 mddev->bitmap_info.chunksize = 0; 5496 mddev->bitmap_info.daemon_sleep = 0; 5497 mddev->bitmap_info.max_write_behind = 0; 5498 mddev->bitmap_info.nodes = 0; 5499 } 5500 5501 static void __md_stop_writes(struct mddev *mddev) 5502 { 5503 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5504 flush_workqueue(md_misc_wq); 5505 if (mddev->sync_thread) { 5506 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5507 md_reap_sync_thread(mddev); 5508 } 5509 5510 del_timer_sync(&mddev->safemode_timer); 5511 5512 if (mddev->pers && mddev->pers->quiesce) { 5513 mddev->pers->quiesce(mddev, 1); 5514 mddev->pers->quiesce(mddev, 0); 5515 } 5516 bitmap_flush(mddev); 5517 5518 if (mddev->ro == 0 && 5519 ((!mddev->in_sync && !mddev_is_clustered(mddev)) || 5520 mddev->sb_flags)) { 5521 /* mark array as shutdown cleanly */ 5522 if (!mddev_is_clustered(mddev)) 5523 mddev->in_sync = 1; 5524 md_update_sb(mddev, 1); 5525 } 5526 } 5527 5528 void md_stop_writes(struct mddev *mddev) 5529 { 5530 mddev_lock_nointr(mddev); 5531 __md_stop_writes(mddev); 5532 mddev_unlock(mddev); 5533 } 5534 EXPORT_SYMBOL_GPL(md_stop_writes); 5535 5536 static void mddev_detach(struct mddev *mddev) 5537 { 5538 struct bitmap *bitmap = mddev->bitmap; 5539 /* wait for behind writes to complete */ 5540 if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { 5541 pr_debug("md:%s: behind writes in progress - waiting to stop.\n", 5542 mdname(mddev)); 5543 /* need to kick something here to make sure I/O goes? */ 5544 wait_event(bitmap->behind_wait, 5545 atomic_read(&bitmap->behind_writes) == 0); 5546 } 5547 if (mddev->pers && mddev->pers->quiesce) { 5548 mddev->pers->quiesce(mddev, 1); 5549 mddev->pers->quiesce(mddev, 0); 5550 } 5551 md_unregister_thread(&mddev->thread); 5552 if (mddev->queue) 5553 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 5554 } 5555 5556 static void __md_stop(struct mddev *mddev) 5557 { 5558 struct md_personality *pers = mddev->pers; 5559 mddev_detach(mddev); 5560 /* Ensure ->event_work is done */ 5561 flush_workqueue(md_misc_wq); 5562 spin_lock(&mddev->lock); 5563 mddev->pers = NULL; 5564 spin_unlock(&mddev->lock); 5565 pers->free(mddev, mddev->private); 5566 mddev->private = NULL; 5567 if (pers->sync_request && mddev->to_remove == NULL) 5568 mddev->to_remove = &md_redundancy_group; 5569 module_put(pers->owner); 5570 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5571 } 5572 5573 void md_stop(struct mddev *mddev) 5574 { 5575 /* stop the array and free an attached data structures. 
5576 * This is called from dm-raid 5577 */ 5578 __md_stop(mddev); 5579 bitmap_destroy(mddev); 5580 if (mddev->bio_set) 5581 bioset_free(mddev->bio_set); 5582 } 5583 5584 EXPORT_SYMBOL_GPL(md_stop); 5585 5586 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) 5587 { 5588 int err = 0; 5589 int did_freeze = 0; 5590 5591 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 5592 did_freeze = 1; 5593 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5594 md_wakeup_thread(mddev->thread); 5595 } 5596 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5597 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5598 if (mddev->sync_thread) 5599 /* Thread might be blocked waiting for metadata update 5600 * which will now never happen */ 5601 wake_up_process(mddev->sync_thread->tsk); 5602 5603 if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 5604 return -EBUSY; 5605 mddev_unlock(mddev); 5606 wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, 5607 &mddev->recovery)); 5608 wait_event(mddev->sb_wait, 5609 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 5610 mddev_lock_nointr(mddev); 5611 5612 mutex_lock(&mddev->open_mutex); 5613 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || 5614 mddev->sync_thread || 5615 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 5616 pr_warn("md: %s still in use.\n",mdname(mddev)); 5617 if (did_freeze) { 5618 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5619 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5620 md_wakeup_thread(mddev->thread); 5621 } 5622 err = -EBUSY; 5623 goto out; 5624 } 5625 if (mddev->pers) { 5626 __md_stop_writes(mddev); 5627 5628 err = -ENXIO; 5629 if (mddev->ro==1) 5630 goto out; 5631 mddev->ro = 1; 5632 set_disk_ro(mddev->gendisk, 1); 5633 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5634 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5635 md_wakeup_thread(mddev->thread); 5636 sysfs_notify_dirent_safe(mddev->sysfs_state); 5637 err = 0; 5638 } 5639 out: 5640 mutex_unlock(&mddev->open_mutex); 5641 return err; 5642 } 5643 5644 /* mode: 5645 * 0 - completely stop and dis-assemble array 5646 * 2 - stop but do not disassemble array 5647 */ 5648 static int do_md_stop(struct mddev *mddev, int mode, 5649 struct block_device *bdev) 5650 { 5651 struct gendisk *disk = mddev->gendisk; 5652 struct md_rdev *rdev; 5653 int did_freeze = 0; 5654 5655 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 5656 did_freeze = 1; 5657 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5658 md_wakeup_thread(mddev->thread); 5659 } 5660 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5661 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5662 if (mddev->sync_thread) 5663 /* Thread might be blocked waiting for metadata update 5664 * which will now never happen */ 5665 wake_up_process(mddev->sync_thread->tsk); 5666 5667 mddev_unlock(mddev); 5668 wait_event(resync_wait, (mddev->sync_thread == NULL && 5669 !test_bit(MD_RECOVERY_RUNNING, 5670 &mddev->recovery))); 5671 mddev_lock_nointr(mddev); 5672 5673 mutex_lock(&mddev->open_mutex); 5674 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || 5675 mddev->sysfs_active || 5676 mddev->sync_thread || 5677 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { 5678 pr_warn("md: %s still in use.\n",mdname(mddev)); 5679 mutex_unlock(&mddev->open_mutex); 5680 if (did_freeze) { 5681 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5682 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5683 md_wakeup_thread(mddev->thread); 5684 } 5685 return -EBUSY; 5686 } 5687 if (mddev->pers) 
{ 5688 if (mddev->ro) 5689 set_disk_ro(disk, 0); 5690 5691 __md_stop_writes(mddev); 5692 __md_stop(mddev); 5693 mddev->queue->backing_dev_info->congested_fn = NULL; 5694 5695 /* tell userspace to handle 'inactive' */ 5696 sysfs_notify_dirent_safe(mddev->sysfs_state); 5697 5698 rdev_for_each(rdev, mddev) 5699 if (rdev->raid_disk >= 0) 5700 sysfs_unlink_rdev(mddev, rdev); 5701 5702 set_capacity(disk, 0); 5703 mutex_unlock(&mddev->open_mutex); 5704 mddev->changed = 1; 5705 revalidate_disk(disk); 5706 5707 if (mddev->ro) 5708 mddev->ro = 0; 5709 } else 5710 mutex_unlock(&mddev->open_mutex); 5711 /* 5712 * Free resources if final stop 5713 */ 5714 if (mode == 0) { 5715 pr_info("md: %s stopped.\n", mdname(mddev)); 5716 5717 bitmap_destroy(mddev); 5718 if (mddev->bitmap_info.file) { 5719 struct file *f = mddev->bitmap_info.file; 5720 spin_lock(&mddev->lock); 5721 mddev->bitmap_info.file = NULL; 5722 spin_unlock(&mddev->lock); 5723 fput(f); 5724 } 5725 mddev->bitmap_info.offset = 0; 5726 5727 export_array(mddev); 5728 5729 md_clean(mddev); 5730 if (mddev->hold_active == UNTIL_STOP) 5731 mddev->hold_active = 0; 5732 } 5733 md_new_event(mddev); 5734 sysfs_notify_dirent_safe(mddev->sysfs_state); 5735 return 0; 5736 } 5737 5738 #ifndef MODULE 5739 static void autorun_array(struct mddev *mddev) 5740 { 5741 struct md_rdev *rdev; 5742 int err; 5743 5744 if (list_empty(&mddev->disks)) 5745 return; 5746 5747 pr_info("md: running: "); 5748 5749 rdev_for_each(rdev, mddev) { 5750 char b[BDEVNAME_SIZE]; 5751 pr_cont("<%s>", bdevname(rdev->bdev,b)); 5752 } 5753 pr_cont("\n"); 5754 5755 err = do_md_run(mddev); 5756 if (err) { 5757 pr_warn("md: do_md_run() returned %d\n", err); 5758 do_md_stop(mddev, 0, NULL); 5759 } 5760 } 5761 5762 /* 5763 * lets try to run arrays based on all disks that have arrived 5764 * until now. (those are in pending_raid_disks) 5765 * 5766 * the method: pick the first pending disk, collect all disks with 5767 * the same UUID, remove all from the pending list and put them into 5768 * the 'same_array' list. Then order this list based on superblock 5769 * update time (freshest comes first), kick out 'old' disks and 5770 * compare superblocks. If everything's fine then run it. 5771 * 5772 * If "unit" is allocated, then bump its reference count 5773 */ 5774 static void autorun_devices(int part) 5775 { 5776 struct md_rdev *rdev0, *rdev, *tmp; 5777 struct mddev *mddev; 5778 char b[BDEVNAME_SIZE]; 5779 5780 pr_info("md: autorun ...\n"); 5781 while (!list_empty(&pending_raid_disks)) { 5782 int unit; 5783 dev_t dev; 5784 LIST_HEAD(candidates); 5785 rdev0 = list_entry(pending_raid_disks.next, 5786 struct md_rdev, same_set); 5787 5788 pr_debug("md: considering %s ...\n", bdevname(rdev0->bdev,b)); 5789 INIT_LIST_HEAD(&candidates); 5790 rdev_for_each_list(rdev, tmp, &pending_raid_disks) 5791 if (super_90_load(rdev, rdev0, 0) >= 0) { 5792 pr_debug("md: adding %s ...\n", 5793 bdevname(rdev->bdev,b)); 5794 list_move(&rdev->same_set, &candidates); 5795 } 5796 /* 5797 * now we have a set of devices, with all of them having 5798 * mostly sane superblocks. It's time to allocate the 5799 * mddev. 
5800 */ 5801 if (part) { 5802 dev = MKDEV(mdp_major, 5803 rdev0->preferred_minor << MdpMinorShift); 5804 unit = MINOR(dev) >> MdpMinorShift; 5805 } else { 5806 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 5807 unit = MINOR(dev); 5808 } 5809 if (rdev0->preferred_minor != unit) { 5810 pr_warn("md: unit number in %s is bad: %d\n", 5811 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 5812 break; 5813 } 5814 5815 md_probe(dev, NULL, NULL); 5816 mddev = mddev_find(dev); 5817 if (!mddev || !mddev->gendisk) { 5818 if (mddev) 5819 mddev_put(mddev); 5820 break; 5821 } 5822 if (mddev_lock(mddev)) 5823 pr_warn("md: %s locked, cannot run\n", mdname(mddev)); 5824 else if (mddev->raid_disks || mddev->major_version 5825 || !list_empty(&mddev->disks)) { 5826 pr_warn("md: %s already running, cannot run %s\n", 5827 mdname(mddev), bdevname(rdev0->bdev,b)); 5828 mddev_unlock(mddev); 5829 } else { 5830 pr_debug("md: created %s\n", mdname(mddev)); 5831 mddev->persistent = 1; 5832 rdev_for_each_list(rdev, tmp, &candidates) { 5833 list_del_init(&rdev->same_set); 5834 if (bind_rdev_to_array(rdev, mddev)) 5835 export_rdev(rdev); 5836 } 5837 autorun_array(mddev); 5838 mddev_unlock(mddev); 5839 } 5840 /* on success, candidates will be empty, on error 5841 * it won't... 5842 */ 5843 rdev_for_each_list(rdev, tmp, &candidates) { 5844 list_del_init(&rdev->same_set); 5845 export_rdev(rdev); 5846 } 5847 mddev_put(mddev); 5848 } 5849 pr_info("md: ... autorun DONE.\n"); 5850 } 5851 #endif /* !MODULE */ 5852 5853 static int get_version(void __user *arg) 5854 { 5855 mdu_version_t ver; 5856 5857 ver.major = MD_MAJOR_VERSION; 5858 ver.minor = MD_MINOR_VERSION; 5859 ver.patchlevel = MD_PATCHLEVEL_VERSION; 5860 5861 if (copy_to_user(arg, &ver, sizeof(ver))) 5862 return -EFAULT; 5863 5864 return 0; 5865 } 5866 5867 static int get_array_info(struct mddev *mddev, void __user *arg) 5868 { 5869 mdu_array_info_t info; 5870 int nr,working,insync,failed,spare; 5871 struct md_rdev *rdev; 5872 5873 nr = working = insync = failed = spare = 0; 5874 rcu_read_lock(); 5875 rdev_for_each_rcu(rdev, mddev) { 5876 nr++; 5877 if (test_bit(Faulty, &rdev->flags)) 5878 failed++; 5879 else { 5880 working++; 5881 if (test_bit(In_sync, &rdev->flags)) 5882 insync++; 5883 else if (test_bit(Journal, &rdev->flags)) 5884 /* TODO: add journal count to md_u.h */ 5885 ; 5886 else 5887 spare++; 5888 } 5889 } 5890 rcu_read_unlock(); 5891 5892 info.major_version = mddev->major_version; 5893 info.minor_version = mddev->minor_version; 5894 info.patch_version = MD_PATCHLEVEL_VERSION; 5895 info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); 5896 info.level = mddev->level; 5897 info.size = mddev->dev_sectors / 2; 5898 if (info.size != mddev->dev_sectors / 2) /* overflow */ 5899 info.size = -1; 5900 info.nr_disks = nr; 5901 info.raid_disks = mddev->raid_disks; 5902 info.md_minor = mddev->md_minor; 5903 info.not_persistent= !mddev->persistent; 5904 5905 info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 5906 info.state = 0; 5907 if (mddev->in_sync) 5908 info.state = (1<<MD_SB_CLEAN); 5909 if (mddev->bitmap && mddev->bitmap_info.offset) 5910 info.state |= (1<<MD_SB_BITMAP_PRESENT); 5911 if (mddev_is_clustered(mddev)) 5912 info.state |= (1<<MD_SB_CLUSTERED); 5913 info.active_disks = insync; 5914 info.working_disks = working; 5915 info.failed_disks = failed; 5916 info.spare_disks = spare; 5917 5918 info.layout = mddev->layout; 5919 info.chunk_size = mddev->chunk_sectors << 9; 5920 5921 if (copy_to_user(arg, &info, sizeof(info))) 5922 return -EFAULT; 5923 5924 
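/*
 * Illustrative sketch only: GET_ARRAY_INFO is the user-space view of
 * the structure filled in above.  A minimal consumer, assuming an
 * assembled /dev/md0 (error handling trimmed):
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/raid/md_u.h>
 *
 *	mdu_array_info_t info;
 *	int fd = open("/dev/md0", O_RDONLY);
 *	if (fd >= 0 && ioctl(fd, GET_ARRAY_INFO, &info) == 0)
 *		printf("level %d, %d raid disks, %d active\n",
 *		       info.level, info.raid_disks, info.active_disks);
 */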
return 0; 5925 } 5926 5927 static int get_bitmap_file(struct mddev *mddev, void __user * arg) 5928 { 5929 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 5930 char *ptr; 5931 int err; 5932 5933 file = kzalloc(sizeof(*file), GFP_NOIO); 5934 if (!file) 5935 return -ENOMEM; 5936 5937 err = 0; 5938 spin_lock(&mddev->lock); 5939 /* bitmap enabled */ 5940 if (mddev->bitmap_info.file) { 5941 ptr = file_path(mddev->bitmap_info.file, file->pathname, 5942 sizeof(file->pathname)); 5943 if (IS_ERR(ptr)) 5944 err = PTR_ERR(ptr); 5945 else 5946 memmove(file->pathname, ptr, 5947 sizeof(file->pathname)-(ptr-file->pathname)); 5948 } 5949 spin_unlock(&mddev->lock); 5950 5951 if (err == 0 && 5952 copy_to_user(arg, file, sizeof(*file))) 5953 err = -EFAULT; 5954 5955 kfree(file); 5956 return err; 5957 } 5958 5959 static int get_disk_info(struct mddev *mddev, void __user * arg) 5960 { 5961 mdu_disk_info_t info; 5962 struct md_rdev *rdev; 5963 5964 if (copy_from_user(&info, arg, sizeof(info))) 5965 return -EFAULT; 5966 5967 rcu_read_lock(); 5968 rdev = md_find_rdev_nr_rcu(mddev, info.number); 5969 if (rdev) { 5970 info.major = MAJOR(rdev->bdev->bd_dev); 5971 info.minor = MINOR(rdev->bdev->bd_dev); 5972 info.raid_disk = rdev->raid_disk; 5973 info.state = 0; 5974 if (test_bit(Faulty, &rdev->flags)) 5975 info.state |= (1<<MD_DISK_FAULTY); 5976 else if (test_bit(In_sync, &rdev->flags)) { 5977 info.state |= (1<<MD_DISK_ACTIVE); 5978 info.state |= (1<<MD_DISK_SYNC); 5979 } 5980 if (test_bit(Journal, &rdev->flags)) 5981 info.state |= (1<<MD_DISK_JOURNAL); 5982 if (test_bit(WriteMostly, &rdev->flags)) 5983 info.state |= (1<<MD_DISK_WRITEMOSTLY); 5984 if (test_bit(FailFast, &rdev->flags)) 5985 info.state |= (1<<MD_DISK_FAILFAST); 5986 } else { 5987 info.major = info.minor = 0; 5988 info.raid_disk = -1; 5989 info.state = (1<<MD_DISK_REMOVED); 5990 } 5991 rcu_read_unlock(); 5992 5993 if (copy_to_user(arg, &info, sizeof(info))) 5994 return -EFAULT; 5995 5996 return 0; 5997 } 5998 5999 static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) 6000 { 6001 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 6002 struct md_rdev *rdev; 6003 dev_t dev = MKDEV(info->major,info->minor); 6004 6005 if (mddev_is_clustered(mddev) && 6006 !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { 6007 pr_warn("%s: Cannot add to clustered mddev.\n", 6008 mdname(mddev)); 6009 return -EINVAL; 6010 } 6011 6012 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 6013 return -EOVERFLOW; 6014 6015 if (!mddev->raid_disks) { 6016 int err; 6017 /* expecting a device which has a superblock */ 6018 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 6019 if (IS_ERR(rdev)) { 6020 pr_warn("md: md_import_device returned %ld\n", 6021 PTR_ERR(rdev)); 6022 return PTR_ERR(rdev); 6023 } 6024 if (!list_empty(&mddev->disks)) { 6025 struct md_rdev *rdev0 6026 = list_entry(mddev->disks.next, 6027 struct md_rdev, same_set); 6028 err = super_types[mddev->major_version] 6029 .load_super(rdev, rdev0, mddev->minor_version); 6030 if (err < 0) { 6031 pr_warn("md: %s has different UUID to %s\n", 6032 bdevname(rdev->bdev,b), 6033 bdevname(rdev0->bdev,b2)); 6034 export_rdev(rdev); 6035 return -EINVAL; 6036 } 6037 } 6038 err = bind_rdev_to_array(rdev, mddev); 6039 if (err) 6040 export_rdev(rdev); 6041 return err; 6042 } 6043 6044 /* 6045 * add_new_disk can be used once the array is assembled 6046 * to add "hot spares". 
They must already have a superblock 6047 * written 6048 */ 6049 if (mddev->pers) { 6050 int err; 6051 if (!mddev->pers->hot_add_disk) { 6052 pr_warn("%s: personality does not support diskops!\n", 6053 mdname(mddev)); 6054 return -EINVAL; 6055 } 6056 if (mddev->persistent) 6057 rdev = md_import_device(dev, mddev->major_version, 6058 mddev->minor_version); 6059 else 6060 rdev = md_import_device(dev, -1, -1); 6061 if (IS_ERR(rdev)) { 6062 pr_warn("md: md_import_device returned %ld\n", 6063 PTR_ERR(rdev)); 6064 return PTR_ERR(rdev); 6065 } 6066 /* set saved_raid_disk if appropriate */ 6067 if (!mddev->persistent) { 6068 if (info->state & (1<<MD_DISK_SYNC) && 6069 info->raid_disk < mddev->raid_disks) { 6070 rdev->raid_disk = info->raid_disk; 6071 set_bit(In_sync, &rdev->flags); 6072 clear_bit(Bitmap_sync, &rdev->flags); 6073 } else 6074 rdev->raid_disk = -1; 6075 rdev->saved_raid_disk = rdev->raid_disk; 6076 } else 6077 super_types[mddev->major_version]. 6078 validate_super(mddev, rdev); 6079 if ((info->state & (1<<MD_DISK_SYNC)) && 6080 rdev->raid_disk != info->raid_disk) { 6081 /* This was a hot-add request, but events doesn't 6082 * match, so reject it. 6083 */ 6084 export_rdev(rdev); 6085 return -EINVAL; 6086 } 6087 6088 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 6089 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 6090 set_bit(WriteMostly, &rdev->flags); 6091 else 6092 clear_bit(WriteMostly, &rdev->flags); 6093 if (info->state & (1<<MD_DISK_FAILFAST)) 6094 set_bit(FailFast, &rdev->flags); 6095 else 6096 clear_bit(FailFast, &rdev->flags); 6097 6098 if (info->state & (1<<MD_DISK_JOURNAL)) { 6099 struct md_rdev *rdev2; 6100 bool has_journal = false; 6101 6102 /* make sure no existing journal disk */ 6103 rdev_for_each(rdev2, mddev) { 6104 if (test_bit(Journal, &rdev2->flags)) { 6105 has_journal = true; 6106 break; 6107 } 6108 } 6109 if (has_journal) { 6110 export_rdev(rdev); 6111 return -EBUSY; 6112 } 6113 set_bit(Journal, &rdev->flags); 6114 } 6115 /* 6116 * check whether the device shows up in other nodes 6117 */ 6118 if (mddev_is_clustered(mddev)) { 6119 if (info->state & (1 << MD_DISK_CANDIDATE)) 6120 set_bit(Candidate, &rdev->flags); 6121 else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { 6122 /* --add initiated by this node */ 6123 err = md_cluster_ops->add_new_disk(mddev, rdev); 6124 if (err) { 6125 export_rdev(rdev); 6126 return err; 6127 } 6128 } 6129 } 6130 6131 rdev->raid_disk = -1; 6132 err = bind_rdev_to_array(rdev, mddev); 6133 6134 if (err) 6135 export_rdev(rdev); 6136 6137 if (mddev_is_clustered(mddev)) { 6138 if (info->state & (1 << MD_DISK_CANDIDATE)) { 6139 if (!err) { 6140 err = md_cluster_ops->new_disk_ack(mddev, 6141 err == 0); 6142 if (err) 6143 md_kick_rdev_from_array(rdev); 6144 } 6145 } else { 6146 if (err) 6147 md_cluster_ops->add_new_disk_cancel(mddev); 6148 else 6149 err = add_bound_rdev(rdev); 6150 } 6151 6152 } else if (!err) 6153 err = add_bound_rdev(rdev); 6154 6155 return err; 6156 } 6157 6158 /* otherwise, add_new_disk is only allowed 6159 * for major_version==0 superblocks 6160 */ 6161 if (mddev->major_version != 0) { 6162 pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev)); 6163 return -EINVAL; 6164 } 6165 6166 if (!(info->state & (1<<MD_DISK_FAULTY))) { 6167 int err; 6168 rdev = md_import_device(dev, -1, 0); 6169 if (IS_ERR(rdev)) { 6170 pr_warn("md: error, md_import_device() returned %ld\n", 6171 PTR_ERR(rdev)); 6172 return PTR_ERR(rdev); 6173 } 6174 rdev->desc_nr = info->number; 6175 if (info->raid_disk < mddev->raid_disks) 6176 
rdev->raid_disk = info->raid_disk; 6177 else 6178 rdev->raid_disk = -1; 6179 6180 if (rdev->raid_disk < mddev->raid_disks) 6181 if (info->state & (1<<MD_DISK_SYNC)) 6182 set_bit(In_sync, &rdev->flags); 6183 6184 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 6185 set_bit(WriteMostly, &rdev->flags); 6186 if (info->state & (1<<MD_DISK_FAILFAST)) 6187 set_bit(FailFast, &rdev->flags); 6188 6189 if (!mddev->persistent) { 6190 pr_debug("md: nonpersistent superblock ...\n"); 6191 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; 6192 } else 6193 rdev->sb_start = calc_dev_sboffset(rdev); 6194 rdev->sectors = rdev->sb_start; 6195 6196 err = bind_rdev_to_array(rdev, mddev); 6197 if (err) { 6198 export_rdev(rdev); 6199 return err; 6200 } 6201 } 6202 6203 return 0; 6204 } 6205 6206 static int hot_remove_disk(struct mddev *mddev, dev_t dev) 6207 { 6208 char b[BDEVNAME_SIZE]; 6209 struct md_rdev *rdev; 6210 6211 rdev = find_rdev(mddev, dev); 6212 if (!rdev) 6213 return -ENXIO; 6214 6215 if (rdev->raid_disk < 0) 6216 goto kick_rdev; 6217 6218 clear_bit(Blocked, &rdev->flags); 6219 remove_and_add_spares(mddev, rdev); 6220 6221 if (rdev->raid_disk >= 0) 6222 goto busy; 6223 6224 kick_rdev: 6225 if (mddev_is_clustered(mddev)) 6226 md_cluster_ops->remove_disk(mddev, rdev); 6227 6228 md_kick_rdev_from_array(rdev); 6229 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 6230 if (mddev->thread) 6231 md_wakeup_thread(mddev->thread); 6232 else 6233 md_update_sb(mddev, 1); 6234 md_new_event(mddev); 6235 6236 return 0; 6237 busy: 6238 pr_debug("md: cannot remove active disk %s from %s ...\n", 6239 bdevname(rdev->bdev,b), mdname(mddev)); 6240 return -EBUSY; 6241 } 6242 6243 static int hot_add_disk(struct mddev *mddev, dev_t dev) 6244 { 6245 char b[BDEVNAME_SIZE]; 6246 int err; 6247 struct md_rdev *rdev; 6248 6249 if (!mddev->pers) 6250 return -ENODEV; 6251 6252 if (mddev->major_version != 0) { 6253 pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n", 6254 mdname(mddev)); 6255 return -EINVAL; 6256 } 6257 if (!mddev->pers->hot_add_disk) { 6258 pr_warn("%s: personality does not support diskops!\n", 6259 mdname(mddev)); 6260 return -EINVAL; 6261 } 6262 6263 rdev = md_import_device(dev, -1, 0); 6264 if (IS_ERR(rdev)) { 6265 pr_warn("md: error, md_import_device() returned %ld\n", 6266 PTR_ERR(rdev)); 6267 return -EINVAL; 6268 } 6269 6270 if (mddev->persistent) 6271 rdev->sb_start = calc_dev_sboffset(rdev); 6272 else 6273 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; 6274 6275 rdev->sectors = rdev->sb_start; 6276 6277 if (test_bit(Faulty, &rdev->flags)) { 6278 pr_warn("md: can not hot-add faulty %s disk to %s!\n", 6279 bdevname(rdev->bdev,b), mdname(mddev)); 6280 err = -EINVAL; 6281 goto abort_export; 6282 } 6283 6284 clear_bit(In_sync, &rdev->flags); 6285 rdev->desc_nr = -1; 6286 rdev->saved_raid_disk = -1; 6287 err = bind_rdev_to_array(rdev, mddev); 6288 if (err) 6289 goto abort_export; 6290 6291 /* 6292 * The rest should better be atomic, we can have disk failures 6293 * noticed in interrupt contexts ... 6294 */ 6295 6296 rdev->raid_disk = -1; 6297 6298 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 6299 if (!mddev->thread) 6300 md_update_sb(mddev, 1); 6301 /* 6302 * Kick recovery, maybe this spare has to be added to the 6303 * array immediately. 
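 * Setting MD_RECOVERY_NEEDED and waking mddev->thread below lets
 * md_check_recovery notice the new spare on its next pass.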
6304 */ 6305 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6306 md_wakeup_thread(mddev->thread); 6307 md_new_event(mddev); 6308 return 0; 6309 6310 abort_export: 6311 export_rdev(rdev); 6312 return err; 6313 } 6314 6315 static int set_bitmap_file(struct mddev *mddev, int fd) 6316 { 6317 int err = 0; 6318 6319 if (mddev->pers) { 6320 if (!mddev->pers->quiesce || !mddev->thread) 6321 return -EBUSY; 6322 if (mddev->recovery || mddev->sync_thread) 6323 return -EBUSY; 6324 /* we should be able to change the bitmap.. */ 6325 } 6326 6327 if (fd >= 0) { 6328 struct inode *inode; 6329 struct file *f; 6330 6331 if (mddev->bitmap || mddev->bitmap_info.file) 6332 return -EEXIST; /* cannot add when bitmap is present */ 6333 f = fget(fd); 6334 6335 if (f == NULL) { 6336 pr_warn("%s: error: failed to get bitmap file\n", 6337 mdname(mddev)); 6338 return -EBADF; 6339 } 6340 6341 inode = f->f_mapping->host; 6342 if (!S_ISREG(inode->i_mode)) { 6343 pr_warn("%s: error: bitmap file must be a regular file\n", 6344 mdname(mddev)); 6345 err = -EBADF; 6346 } else if (!(f->f_mode & FMODE_WRITE)) { 6347 pr_warn("%s: error: bitmap file must open for write\n", 6348 mdname(mddev)); 6349 err = -EBADF; 6350 } else if (atomic_read(&inode->i_writecount) != 1) { 6351 pr_warn("%s: error: bitmap file is already in use\n", 6352 mdname(mddev)); 6353 err = -EBUSY; 6354 } 6355 if (err) { 6356 fput(f); 6357 return err; 6358 } 6359 mddev->bitmap_info.file = f; 6360 mddev->bitmap_info.offset = 0; /* file overrides offset */ 6361 } else if (mddev->bitmap == NULL) 6362 return -ENOENT; /* cannot remove what isn't there */ 6363 err = 0; 6364 if (mddev->pers) { 6365 mddev->pers->quiesce(mddev, 1); 6366 if (fd >= 0) { 6367 struct bitmap *bitmap; 6368 6369 bitmap = bitmap_create(mddev, -1); 6370 if (!IS_ERR(bitmap)) { 6371 mddev->bitmap = bitmap; 6372 err = bitmap_load(mddev); 6373 } else 6374 err = PTR_ERR(bitmap); 6375 } 6376 if (fd < 0 || err) { 6377 bitmap_destroy(mddev); 6378 fd = -1; /* make sure to put the file */ 6379 } 6380 mddev->pers->quiesce(mddev, 0); 6381 } 6382 if (fd < 0) { 6383 struct file *f = mddev->bitmap_info.file; 6384 if (f) { 6385 spin_lock(&mddev->lock); 6386 mddev->bitmap_info.file = NULL; 6387 spin_unlock(&mddev->lock); 6388 fput(f); 6389 } 6390 } 6391 6392 return err; 6393 } 6394 6395 /* 6396 * set_array_info is used two different ways 6397 * The original usage is when creating a new array. 6398 * In this usage, raid_disks is > 0 and it together with 6399 * level, size, not_persistent,layout,chunksize determine the 6400 * shape of the array. 6401 * This will always create an array with a type-0.90.0 superblock. 6402 * The newer usage is when assembling an array. 6403 * In this case raid_disks will be 0, and the major_version field is 6404 * use to determine which style super-blocks are to be found on the devices. 6405 * The minor and patch _version numbers are also kept incase the 6406 * super_block handler wishes to interpret them. 6407 */ 6408 static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) 6409 { 6410 6411 if (info->raid_disks == 0) { 6412 /* just setting version number for superblock loading */ 6413 if (info->major_version < 0 || 6414 info->major_version >= ARRAY_SIZE(super_types) || 6415 super_types[info->major_version].name == NULL) { 6416 /* maybe try to auto-load a module? 
*/ 6417 pr_warn("md: superblock version %d not known\n", 6418 info->major_version); 6419 return -EINVAL; 6420 } 6421 mddev->major_version = info->major_version; 6422 mddev->minor_version = info->minor_version; 6423 mddev->patch_version = info->patch_version; 6424 mddev->persistent = !info->not_persistent; 6425 /* ensure mddev_put doesn't delete this now that there 6426 * is some minimal configuration. 6427 */ 6428 mddev->ctime = ktime_get_real_seconds(); 6429 return 0; 6430 } 6431 mddev->major_version = MD_MAJOR_VERSION; 6432 mddev->minor_version = MD_MINOR_VERSION; 6433 mddev->patch_version = MD_PATCHLEVEL_VERSION; 6434 mddev->ctime = ktime_get_real_seconds(); 6435 6436 mddev->level = info->level; 6437 mddev->clevel[0] = 0; 6438 mddev->dev_sectors = 2 * (sector_t)info->size; 6439 mddev->raid_disks = info->raid_disks; 6440 /* don't set md_minor, it is determined by which /dev/md* was 6441 * openned 6442 */ 6443 if (info->state & (1<<MD_SB_CLEAN)) 6444 mddev->recovery_cp = MaxSector; 6445 else 6446 mddev->recovery_cp = 0; 6447 mddev->persistent = ! info->not_persistent; 6448 mddev->external = 0; 6449 6450 mddev->layout = info->layout; 6451 mddev->chunk_sectors = info->chunk_size >> 9; 6452 6453 if (mddev->persistent) { 6454 mddev->max_disks = MD_SB_DISKS; 6455 mddev->flags = 0; 6456 mddev->sb_flags = 0; 6457 } 6458 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 6459 6460 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 6461 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 6462 mddev->bitmap_info.offset = 0; 6463 6464 mddev->reshape_position = MaxSector; 6465 6466 /* 6467 * Generate a 128 bit UUID 6468 */ 6469 get_random_bytes(mddev->uuid, 16); 6470 6471 mddev->new_level = mddev->level; 6472 mddev->new_chunk_sectors = mddev->chunk_sectors; 6473 mddev->new_layout = mddev->layout; 6474 mddev->delta_disks = 0; 6475 mddev->reshape_backwards = 0; 6476 6477 return 0; 6478 } 6479 6480 void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) 6481 { 6482 WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__); 6483 6484 if (mddev->external_size) 6485 return; 6486 6487 mddev->array_sectors = array_sectors; 6488 } 6489 EXPORT_SYMBOL(md_set_array_sectors); 6490 6491 static int update_size(struct mddev *mddev, sector_t num_sectors) 6492 { 6493 struct md_rdev *rdev; 6494 int rv; 6495 int fit = (num_sectors == 0); 6496 6497 /* cluster raid doesn't support update size */ 6498 if (mddev_is_clustered(mddev)) 6499 return -EINVAL; 6500 6501 if (mddev->pers->resize == NULL) 6502 return -EINVAL; 6503 /* The "num_sectors" is the number of sectors of each device that 6504 * is used. This can only make sense for arrays with redundancy. 6505 * linear and raid0 always use whatever space is available. We can only 6506 * consider changing this number if no resync or reconstruction is 6507 * happening, and if the new size is acceptable. It must fit before the 6508 * sb_start or, if that is <data_offset, it must fit before the size 6509 * of each device. If num_sectors is zero, we find the largest size 6510 * that fits. 
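 * (For example, passing num_sectors == 0 below makes 'fit' true and
 * num_sectors gets raised to the smallest rdev->sectors found, i.e. the
 * largest size every member device can accommodate.)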
6511 */ 6512 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 6513 mddev->sync_thread) 6514 return -EBUSY; 6515 if (mddev->ro) 6516 return -EROFS; 6517 6518 rdev_for_each(rdev, mddev) { 6519 sector_t avail = rdev->sectors; 6520 6521 if (fit && (num_sectors == 0 || num_sectors > avail)) 6522 num_sectors = avail; 6523 if (avail < num_sectors) 6524 return -ENOSPC; 6525 } 6526 rv = mddev->pers->resize(mddev, num_sectors); 6527 if (!rv) { 6528 if (mddev->queue) { 6529 set_capacity(mddev->gendisk, mddev->array_sectors); 6530 revalidate_disk(mddev->gendisk); 6531 } 6532 } 6533 return rv; 6534 } 6535 6536 static int update_raid_disks(struct mddev *mddev, int raid_disks) 6537 { 6538 int rv; 6539 struct md_rdev *rdev; 6540 /* change the number of raid disks */ 6541 if (mddev->pers->check_reshape == NULL) 6542 return -EINVAL; 6543 if (mddev->ro) 6544 return -EROFS; 6545 if (raid_disks <= 0 || 6546 (mddev->max_disks && raid_disks >= mddev->max_disks)) 6547 return -EINVAL; 6548 if (mddev->sync_thread || 6549 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 6550 mddev->reshape_position != MaxSector) 6551 return -EBUSY; 6552 6553 rdev_for_each(rdev, mddev) { 6554 if (mddev->raid_disks < raid_disks && 6555 rdev->data_offset < rdev->new_data_offset) 6556 return -EINVAL; 6557 if (mddev->raid_disks > raid_disks && 6558 rdev->data_offset > rdev->new_data_offset) 6559 return -EINVAL; 6560 } 6561 6562 mddev->delta_disks = raid_disks - mddev->raid_disks; 6563 if (mddev->delta_disks < 0) 6564 mddev->reshape_backwards = 1; 6565 else if (mddev->delta_disks > 0) 6566 mddev->reshape_backwards = 0; 6567 6568 rv = mddev->pers->check_reshape(mddev); 6569 if (rv < 0) { 6570 mddev->delta_disks = 0; 6571 mddev->reshape_backwards = 0; 6572 } 6573 return rv; 6574 } 6575 6576 /* 6577 * update_array_info is used to change the configuration of an 6578 * on-line array. 6579 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 6580 * fields in the info are checked against the array. 6581 * Any differences that cannot be handled will cause an error. 6582 * Normally, only one change can be managed at a time. 6583 */ 6584 static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) 6585 { 6586 int rv = 0; 6587 int cnt = 0; 6588 int state = 0; 6589 6590 /* calculate expected state,ignoring low bits */ 6591 if (mddev->bitmap && mddev->bitmap_info.offset) 6592 state |= (1 << MD_SB_BITMAP_PRESENT); 6593 6594 if (mddev->major_version != info->major_version || 6595 mddev->minor_version != info->minor_version || 6596 /* mddev->patch_version != info->patch_version || */ 6597 mddev->ctime != info->ctime || 6598 mddev->level != info->level || 6599 /* mddev->layout != info->layout || */ 6600 mddev->persistent != !info->not_persistent || 6601 mddev->chunk_sectors != info->chunk_size >> 9 || 6602 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 6603 ((state^info->state) & 0xfffffe00) 6604 ) 6605 return -EINVAL; 6606 /* Check there is only one change */ 6607 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 6608 cnt++; 6609 if (mddev->raid_disks != info->raid_disks) 6610 cnt++; 6611 if (mddev->layout != info->layout) 6612 cnt++; 6613 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) 6614 cnt++; 6615 if (cnt == 0) 6616 return 0; 6617 if (cnt > 1) 6618 return -EINVAL; 6619 6620 if (mddev->layout != info->layout) { 6621 /* Change layout 6622 * we don't need to do anything at the md level, the 6623 * personality will take care of it all. 
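 * (new_layout is set below and check_reshape() is asked to apply it;
 * if that fails, new_layout is simply reverted to the current layout.)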
6624 */ 6625 if (mddev->pers->check_reshape == NULL) 6626 return -EINVAL; 6627 else { 6628 mddev->new_layout = info->layout; 6629 rv = mddev->pers->check_reshape(mddev); 6630 if (rv) 6631 mddev->new_layout = mddev->layout; 6632 return rv; 6633 } 6634 } 6635 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 6636 rv = update_size(mddev, (sector_t)info->size * 2); 6637 6638 if (mddev->raid_disks != info->raid_disks) 6639 rv = update_raid_disks(mddev, info->raid_disks); 6640 6641 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 6642 if (mddev->pers->quiesce == NULL || mddev->thread == NULL) { 6643 rv = -EINVAL; 6644 goto err; 6645 } 6646 if (mddev->recovery || mddev->sync_thread) { 6647 rv = -EBUSY; 6648 goto err; 6649 } 6650 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 6651 struct bitmap *bitmap; 6652 /* add the bitmap */ 6653 if (mddev->bitmap) { 6654 rv = -EEXIST; 6655 goto err; 6656 } 6657 if (mddev->bitmap_info.default_offset == 0) { 6658 rv = -EINVAL; 6659 goto err; 6660 } 6661 mddev->bitmap_info.offset = 6662 mddev->bitmap_info.default_offset; 6663 mddev->bitmap_info.space = 6664 mddev->bitmap_info.default_space; 6665 mddev->pers->quiesce(mddev, 1); 6666 bitmap = bitmap_create(mddev, -1); 6667 if (!IS_ERR(bitmap)) { 6668 mddev->bitmap = bitmap; 6669 rv = bitmap_load(mddev); 6670 } else 6671 rv = PTR_ERR(bitmap); 6672 if (rv) 6673 bitmap_destroy(mddev); 6674 mddev->pers->quiesce(mddev, 0); 6675 } else { 6676 /* remove the bitmap */ 6677 if (!mddev->bitmap) { 6678 rv = -ENOENT; 6679 goto err; 6680 } 6681 if (mddev->bitmap->storage.file) { 6682 rv = -EINVAL; 6683 goto err; 6684 } 6685 if (mddev->bitmap_info.nodes) { 6686 /* hold PW on all the bitmap lock */ 6687 if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { 6688 pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n"); 6689 rv = -EPERM; 6690 md_cluster_ops->unlock_all_bitmaps(mddev); 6691 goto err; 6692 } 6693 6694 mddev->bitmap_info.nodes = 0; 6695 md_cluster_ops->leave(mddev); 6696 } 6697 mddev->pers->quiesce(mddev, 1); 6698 bitmap_destroy(mddev); 6699 mddev->pers->quiesce(mddev, 0); 6700 mddev->bitmap_info.offset = 0; 6701 } 6702 } 6703 md_update_sb(mddev, 1); 6704 return rv; 6705 err: 6706 return rv; 6707 } 6708 6709 static int set_disk_faulty(struct mddev *mddev, dev_t dev) 6710 { 6711 struct md_rdev *rdev; 6712 int err = 0; 6713 6714 if (mddev->pers == NULL) 6715 return -ENODEV; 6716 6717 rcu_read_lock(); 6718 rdev = find_rdev_rcu(mddev, dev); 6719 if (!rdev) 6720 err = -ENODEV; 6721 else { 6722 md_error(mddev, rdev); 6723 if (!test_bit(Faulty, &rdev->flags)) 6724 err = -EBUSY; 6725 } 6726 rcu_read_unlock(); 6727 return err; 6728 } 6729 6730 /* 6731 * We have a problem here : there is no easy way to give a CHS 6732 * virtual geometry. We currently pretend that we have a 2 heads 6733 * 4 sectors (with a BIG number of cylinders...). This drives 6734 * dosfs just mad... 
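 * (With 2 heads and 4 sectors per track a cylinder is 8 sectors,
 * hence the array_sectors / 8 used for the cylinder count below.)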
;-) 6735 */ 6736 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 6737 { 6738 struct mddev *mddev = bdev->bd_disk->private_data; 6739 6740 geo->heads = 2; 6741 geo->sectors = 4; 6742 geo->cylinders = mddev->array_sectors / 8; 6743 return 0; 6744 } 6745 6746 static inline bool md_ioctl_valid(unsigned int cmd) 6747 { 6748 switch (cmd) { 6749 case ADD_NEW_DISK: 6750 case BLKROSET: 6751 case GET_ARRAY_INFO: 6752 case GET_BITMAP_FILE: 6753 case GET_DISK_INFO: 6754 case HOT_ADD_DISK: 6755 case HOT_REMOVE_DISK: 6756 case RAID_AUTORUN: 6757 case RAID_VERSION: 6758 case RESTART_ARRAY_RW: 6759 case RUN_ARRAY: 6760 case SET_ARRAY_INFO: 6761 case SET_BITMAP_FILE: 6762 case SET_DISK_FAULTY: 6763 case STOP_ARRAY: 6764 case STOP_ARRAY_RO: 6765 case CLUSTERED_DISK_NACK: 6766 return true; 6767 default: 6768 return false; 6769 } 6770 } 6771 6772 static int md_ioctl(struct block_device *bdev, fmode_t mode, 6773 unsigned int cmd, unsigned long arg) 6774 { 6775 int err = 0; 6776 void __user *argp = (void __user *)arg; 6777 struct mddev *mddev = NULL; 6778 int ro; 6779 6780 if (!md_ioctl_valid(cmd)) 6781 return -ENOTTY; 6782 6783 switch (cmd) { 6784 case RAID_VERSION: 6785 case GET_ARRAY_INFO: 6786 case GET_DISK_INFO: 6787 break; 6788 default: 6789 if (!capable(CAP_SYS_ADMIN)) 6790 return -EACCES; 6791 } 6792 6793 /* 6794 * Commands dealing with the RAID driver but not any 6795 * particular array: 6796 */ 6797 switch (cmd) { 6798 case RAID_VERSION: 6799 err = get_version(argp); 6800 goto out; 6801 6802 #ifndef MODULE 6803 case RAID_AUTORUN: 6804 err = 0; 6805 autostart_arrays(arg); 6806 goto out; 6807 #endif 6808 default:; 6809 } 6810 6811 /* 6812 * Commands creating/starting a new array: 6813 */ 6814 6815 mddev = bdev->bd_disk->private_data; 6816 6817 if (!mddev) { 6818 BUG(); 6819 goto out; 6820 } 6821 6822 /* Some actions do not requires the mutex */ 6823 switch (cmd) { 6824 case GET_ARRAY_INFO: 6825 if (!mddev->raid_disks && !mddev->external) 6826 err = -ENODEV; 6827 else 6828 err = get_array_info(mddev, argp); 6829 goto out; 6830 6831 case GET_DISK_INFO: 6832 if (!mddev->raid_disks && !mddev->external) 6833 err = -ENODEV; 6834 else 6835 err = get_disk_info(mddev, argp); 6836 goto out; 6837 6838 case SET_DISK_FAULTY: 6839 err = set_disk_faulty(mddev, new_decode_dev(arg)); 6840 goto out; 6841 6842 case GET_BITMAP_FILE: 6843 err = get_bitmap_file(mddev, argp); 6844 goto out; 6845 6846 } 6847 6848 if (cmd == ADD_NEW_DISK) 6849 /* need to ensure md_delayed_delete() has completed */ 6850 flush_workqueue(md_misc_wq); 6851 6852 if (cmd == HOT_REMOVE_DISK) 6853 /* need to ensure recovery thread has run */ 6854 wait_event_interruptible_timeout(mddev->sb_wait, 6855 !test_bit(MD_RECOVERY_NEEDED, 6856 &mddev->recovery), 6857 msecs_to_jiffies(5000)); 6858 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { 6859 /* Need to flush page cache, and ensure no-one else opens 6860 * and writes 6861 */ 6862 mutex_lock(&mddev->open_mutex); 6863 if (mddev->pers && atomic_read(&mddev->openers) > 1) { 6864 mutex_unlock(&mddev->open_mutex); 6865 err = -EBUSY; 6866 goto out; 6867 } 6868 set_bit(MD_CLOSING, &mddev->flags); 6869 mutex_unlock(&mddev->open_mutex); 6870 sync_blockdev(bdev); 6871 } 6872 err = mddev_lock(mddev); 6873 if (err) { 6874 pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n", 6875 err, cmd); 6876 goto out; 6877 } 6878 6879 if (cmd == SET_ARRAY_INFO) { 6880 mdu_array_info_t info; 6881 if (!arg) 6882 memset(&info, 0, sizeof(info)); 6883 else if (copy_from_user(&info, argp, sizeof(info))) { 
6884 err = -EFAULT; 6885 goto unlock; 6886 } 6887 if (mddev->pers) { 6888 err = update_array_info(mddev, &info); 6889 if (err) { 6890 pr_warn("md: couldn't update array info. %d\n", err); 6891 goto unlock; 6892 } 6893 goto unlock; 6894 } 6895 if (!list_empty(&mddev->disks)) { 6896 pr_warn("md: array %s already has disks!\n", mdname(mddev)); 6897 err = -EBUSY; 6898 goto unlock; 6899 } 6900 if (mddev->raid_disks) { 6901 pr_warn("md: array %s already initialised!\n", mdname(mddev)); 6902 err = -EBUSY; 6903 goto unlock; 6904 } 6905 err = set_array_info(mddev, &info); 6906 if (err) { 6907 pr_warn("md: couldn't set array info. %d\n", err); 6908 goto unlock; 6909 } 6910 goto unlock; 6911 } 6912 6913 /* 6914 * Commands querying/configuring an existing array: 6915 */ 6916 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 6917 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 6918 if ((!mddev->raid_disks && !mddev->external) 6919 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 6920 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 6921 && cmd != GET_BITMAP_FILE) { 6922 err = -ENODEV; 6923 goto unlock; 6924 } 6925 6926 /* 6927 * Commands even a read-only array can execute: 6928 */ 6929 switch (cmd) { 6930 case RESTART_ARRAY_RW: 6931 err = restart_array(mddev); 6932 goto unlock; 6933 6934 case STOP_ARRAY: 6935 err = do_md_stop(mddev, 0, bdev); 6936 goto unlock; 6937 6938 case STOP_ARRAY_RO: 6939 err = md_set_readonly(mddev, bdev); 6940 goto unlock; 6941 6942 case HOT_REMOVE_DISK: 6943 err = hot_remove_disk(mddev, new_decode_dev(arg)); 6944 goto unlock; 6945 6946 case ADD_NEW_DISK: 6947 /* We can support ADD_NEW_DISK on read-only arrays 6948 * only if we are re-adding a preexisting device. 6949 * So require mddev->pers and MD_DISK_SYNC. 6950 */ 6951 if (mddev->pers) { 6952 mdu_disk_info_t info; 6953 if (copy_from_user(&info, argp, sizeof(info))) 6954 err = -EFAULT; 6955 else if (!(info.state & (1<<MD_DISK_SYNC))) 6956 /* Need to clear read-only for this */ 6957 break; 6958 else 6959 err = add_new_disk(mddev, &info); 6960 goto unlock; 6961 } 6962 break; 6963 6964 case BLKROSET: 6965 if (get_user(ro, (int __user *)(arg))) { 6966 err = -EFAULT; 6967 goto unlock; 6968 } 6969 err = -EINVAL; 6970 6971 /* if the bdev is going readonly the value of mddev->ro 6972 * does not matter, no writes are coming 6973 */ 6974 if (ro) 6975 goto unlock; 6976 6977 /* are we are already prepared for writes? */ 6978 if (mddev->ro != 1) 6979 goto unlock; 6980 6981 /* transitioning to readauto need only happen for 6982 * arrays that call md_write_start 6983 */ 6984 if (mddev->pers) { 6985 err = restart_array(mddev); 6986 if (err == 0) { 6987 mddev->ro = 2; 6988 set_disk_ro(mddev->gendisk, 0); 6989 } 6990 } 6991 goto unlock; 6992 } 6993 6994 /* 6995 * The remaining ioctls are changing the state of the 6996 * superblock, so we do not allow them on read-only arrays. 6997 */ 6998 if (mddev->ro && mddev->pers) { 6999 if (mddev->ro == 2) { 7000 mddev->ro = 0; 7001 sysfs_notify_dirent_safe(mddev->sysfs_state); 7002 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7003 /* mddev_unlock will wake thread */ 7004 /* If a device failed while we were read-only, we 7005 * need to make sure the metadata is updated now. 
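 * (so we drop the lock and wait below until both MD_SB_CHANGE_DEVS
 * and MD_SB_CHANGE_PENDING have cleared, then re-take it.)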
7006 */ 7007 if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) { 7008 mddev_unlock(mddev); 7009 wait_event(mddev->sb_wait, 7010 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) && 7011 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 7012 mddev_lock_nointr(mddev); 7013 } 7014 } else { 7015 err = -EROFS; 7016 goto unlock; 7017 } 7018 } 7019 7020 switch (cmd) { 7021 case ADD_NEW_DISK: 7022 { 7023 mdu_disk_info_t info; 7024 if (copy_from_user(&info, argp, sizeof(info))) 7025 err = -EFAULT; 7026 else 7027 err = add_new_disk(mddev, &info); 7028 goto unlock; 7029 } 7030 7031 case CLUSTERED_DISK_NACK: 7032 if (mddev_is_clustered(mddev)) 7033 md_cluster_ops->new_disk_ack(mddev, false); 7034 else 7035 err = -EINVAL; 7036 goto unlock; 7037 7038 case HOT_ADD_DISK: 7039 err = hot_add_disk(mddev, new_decode_dev(arg)); 7040 goto unlock; 7041 7042 case RUN_ARRAY: 7043 err = do_md_run(mddev); 7044 goto unlock; 7045 7046 case SET_BITMAP_FILE: 7047 err = set_bitmap_file(mddev, (int)arg); 7048 goto unlock; 7049 7050 default: 7051 err = -EINVAL; 7052 goto unlock; 7053 } 7054 7055 unlock: 7056 if (mddev->hold_active == UNTIL_IOCTL && 7057 err != -EINVAL) 7058 mddev->hold_active = 0; 7059 mddev_unlock(mddev); 7060 out: 7061 return err; 7062 } 7063 #ifdef CONFIG_COMPAT 7064 static int md_compat_ioctl(struct block_device *bdev, fmode_t mode, 7065 unsigned int cmd, unsigned long arg) 7066 { 7067 switch (cmd) { 7068 case HOT_REMOVE_DISK: 7069 case HOT_ADD_DISK: 7070 case SET_DISK_FAULTY: 7071 case SET_BITMAP_FILE: 7072 /* These take in integer arg, do not convert */ 7073 break; 7074 default: 7075 arg = (unsigned long)compat_ptr(arg); 7076 break; 7077 } 7078 7079 return md_ioctl(bdev, mode, cmd, arg); 7080 } 7081 #endif /* CONFIG_COMPAT */ 7082 7083 static int md_open(struct block_device *bdev, fmode_t mode) 7084 { 7085 /* 7086 * Succeed if we can lock the mddev, which confirms that 7087 * it isn't being stopped right now. 7088 */ 7089 struct mddev *mddev = mddev_find(bdev->bd_dev); 7090 int err; 7091 7092 if (!mddev) 7093 return -ENODEV; 7094 7095 if (mddev->gendisk != bdev->bd_disk) { 7096 /* we are racing with mddev_put which is discarding this 7097 * bd_disk. 
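 * Returning -ERESTARTSYS below, after flushing md_misc_wq so the old
 * bd_disk is definitely gone, gets the open retried from the top.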
7098 */ 7099 mddev_put(mddev); 7100 /* Wait until bdev->bd_disk is definitely gone */ 7101 flush_workqueue(md_misc_wq); 7102 /* Then retry the open from the top */ 7103 return -ERESTARTSYS; 7104 } 7105 BUG_ON(mddev != bdev->bd_disk->private_data); 7106 7107 if ((err = mutex_lock_interruptible(&mddev->open_mutex))) 7108 goto out; 7109 7110 if (test_bit(MD_CLOSING, &mddev->flags)) { 7111 mutex_unlock(&mddev->open_mutex); 7112 err = -ENODEV; 7113 goto out; 7114 } 7115 7116 err = 0; 7117 atomic_inc(&mddev->openers); 7118 mutex_unlock(&mddev->open_mutex); 7119 7120 check_disk_change(bdev); 7121 out: 7122 if (err) 7123 mddev_put(mddev); 7124 return err; 7125 } 7126 7127 static void md_release(struct gendisk *disk, fmode_t mode) 7128 { 7129 struct mddev *mddev = disk->private_data; 7130 7131 BUG_ON(!mddev); 7132 atomic_dec(&mddev->openers); 7133 mddev_put(mddev); 7134 } 7135 7136 static int md_media_changed(struct gendisk *disk) 7137 { 7138 struct mddev *mddev = disk->private_data; 7139 7140 return mddev->changed; 7141 } 7142 7143 static int md_revalidate(struct gendisk *disk) 7144 { 7145 struct mddev *mddev = disk->private_data; 7146 7147 mddev->changed = 0; 7148 return 0; 7149 } 7150 static const struct block_device_operations md_fops = 7151 { 7152 .owner = THIS_MODULE, 7153 .open = md_open, 7154 .release = md_release, 7155 .ioctl = md_ioctl, 7156 #ifdef CONFIG_COMPAT 7157 .compat_ioctl = md_compat_ioctl, 7158 #endif 7159 .getgeo = md_getgeo, 7160 .media_changed = md_media_changed, 7161 .revalidate_disk= md_revalidate, 7162 }; 7163 7164 static int md_thread(void *arg) 7165 { 7166 struct md_thread *thread = arg; 7167 7168 /* 7169 * md_thread is a 'system-thread', it's priority should be very 7170 * high. We avoid resource deadlocks individually in each 7171 * raid personality. (RAID5 does preallocation) We also use RR and 7172 * the very same RT priority as kswapd, thus we will never get 7173 * into a priority inversion deadlock. 7174 * 7175 * we definitely have to have equal or higher priority than 7176 * bdflush, otherwise bdflush will deadlock if there are too 7177 * many dirty RAID5 blocks. 7178 */ 7179 7180 allow_signal(SIGKILL); 7181 while (!kthread_should_stop()) { 7182 7183 /* We need to wait INTERRUPTIBLE so that 7184 * we don't add to the load-average. 
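 * (Only TASK_UNINTERRUPTIBLE sleepers are counted in the load
 * average; TASK_INTERRUPTIBLE sleepers are not.)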
7185 * That means we need to be sure no signals are 7186 * pending 7187 */ 7188 if (signal_pending(current)) 7189 flush_signals(current); 7190 7191 wait_event_interruptible_timeout 7192 (thread->wqueue, 7193 test_bit(THREAD_WAKEUP, &thread->flags) 7194 || kthread_should_stop() || kthread_should_park(), 7195 thread->timeout); 7196 7197 clear_bit(THREAD_WAKEUP, &thread->flags); 7198 if (kthread_should_park()) 7199 kthread_parkme(); 7200 if (!kthread_should_stop()) 7201 thread->run(thread); 7202 } 7203 7204 return 0; 7205 } 7206 7207 void md_wakeup_thread(struct md_thread *thread) 7208 { 7209 if (thread) { 7210 pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); 7211 set_bit(THREAD_WAKEUP, &thread->flags); 7212 wake_up(&thread->wqueue); 7213 } 7214 } 7215 EXPORT_SYMBOL(md_wakeup_thread); 7216 7217 struct md_thread *md_register_thread(void (*run) (struct md_thread *), 7218 struct mddev *mddev, const char *name) 7219 { 7220 struct md_thread *thread; 7221 7222 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); 7223 if (!thread) 7224 return NULL; 7225 7226 init_waitqueue_head(&thread->wqueue); 7227 7228 thread->run = run; 7229 thread->mddev = mddev; 7230 thread->timeout = MAX_SCHEDULE_TIMEOUT; 7231 thread->tsk = kthread_run(md_thread, thread, 7232 "%s_%s", 7233 mdname(thread->mddev), 7234 name); 7235 if (IS_ERR(thread->tsk)) { 7236 kfree(thread); 7237 return NULL; 7238 } 7239 return thread; 7240 } 7241 EXPORT_SYMBOL(md_register_thread); 7242 7243 void md_unregister_thread(struct md_thread **threadp) 7244 { 7245 struct md_thread *thread = *threadp; 7246 if (!thread) 7247 return; 7248 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 7249 /* Locking ensures that mddev_unlock does not wake_up a 7250 * non-existent thread 7251 */ 7252 spin_lock(&pers_lock); 7253 *threadp = NULL; 7254 spin_unlock(&pers_lock); 7255 7256 kthread_stop(thread->tsk); 7257 kfree(thread); 7258 } 7259 EXPORT_SYMBOL(md_unregister_thread); 7260 7261 void md_error(struct mddev *mddev, struct md_rdev *rdev) 7262 { 7263 if (!rdev || test_bit(Faulty, &rdev->flags)) 7264 return; 7265 7266 if (!mddev->pers || !mddev->pers->error_handler) 7267 return; 7268 mddev->pers->error_handler(mddev,rdev); 7269 if (mddev->degraded) 7270 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 7271 sysfs_notify_dirent_safe(rdev->sysfs_state); 7272 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 7273 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7274 md_wakeup_thread(mddev->thread); 7275 if (mddev->event_work.func) 7276 queue_work(md_misc_wq, &mddev->event_work); 7277 md_new_event(mddev); 7278 } 7279 EXPORT_SYMBOL(md_error); 7280 7281 /* seq_file implementation /proc/mdstat */ 7282 7283 static void status_unused(struct seq_file *seq) 7284 { 7285 int i = 0; 7286 struct md_rdev *rdev; 7287 7288 seq_printf(seq, "unused devices: "); 7289 7290 list_for_each_entry(rdev, &pending_raid_disks, same_set) { 7291 char b[BDEVNAME_SIZE]; 7292 i++; 7293 seq_printf(seq, "%s ", 7294 bdevname(rdev->bdev,b)); 7295 } 7296 if (!i) 7297 seq_printf(seq, "<none>"); 7298 7299 seq_printf(seq, "\n"); 7300 } 7301 7302 static int status_resync(struct seq_file *seq, struct mddev *mddev) 7303 { 7304 sector_t max_sectors, resync, res; 7305 unsigned long dt, db; 7306 sector_t rt; 7307 int scale; 7308 unsigned int per_milli; 7309 7310 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 7311 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 7312 max_sectors = mddev->resync_max_sectors; 7313 else 7314 max_sectors = mddev->dev_sectors; 7315 7316 resync = 
mddev->curr_resync; 7317 if (resync <= 3) { 7318 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 7319 /* Still cleaning up */ 7320 resync = max_sectors; 7321 } else 7322 resync -= atomic_read(&mddev->recovery_active); 7323 7324 if (resync == 0) { 7325 if (mddev->recovery_cp < MaxSector) { 7326 seq_printf(seq, "\tresync=PENDING"); 7327 return 1; 7328 } 7329 return 0; 7330 } 7331 if (resync < 3) { 7332 seq_printf(seq, "\tresync=DELAYED"); 7333 return 1; 7334 } 7335 7336 WARN_ON(max_sectors == 0); 7337 /* Pick 'scale' such that (resync>>scale)*1000 will fit 7338 * in a sector_t, and (max_sectors>>scale) will fit in a 7339 * u32, as those are the requirements for sector_div. 7340 * Thus 'scale' must be at least 10 7341 */ 7342 scale = 10; 7343 if (sizeof(sector_t) > sizeof(unsigned long)) { 7344 while ( max_sectors/2 > (1ULL<<(scale+32))) 7345 scale++; 7346 } 7347 res = (resync>>scale)*1000; 7348 sector_div(res, (u32)((max_sectors>>scale)+1)); 7349 7350 per_milli = res; 7351 { 7352 int i, x = per_milli/50, y = 20-x; 7353 seq_printf(seq, "["); 7354 for (i = 0; i < x; i++) 7355 seq_printf(seq, "="); 7356 seq_printf(seq, ">"); 7357 for (i = 0; i < y; i++) 7358 seq_printf(seq, "."); 7359 seq_printf(seq, "] "); 7360 } 7361 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 7362 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 7363 "reshape" : 7364 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 7365 "check" : 7366 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 7367 "resync" : "recovery"))), 7368 per_milli/10, per_milli % 10, 7369 (unsigned long long) resync/2, 7370 (unsigned long long) max_sectors/2); 7371 7372 /* 7373 * dt: time from mark until now 7374 * db: blocks written from mark until now 7375 * rt: remaining time 7376 * 7377 * rt is a sector_t, so could be 32bit or 64bit. 7378 * So we divide before multiply in case it is 32bit and close 7379 * to the limit. 7380 * We scale the divisor (db) by 32 to avoid losing precision 7381 * near the end of resync when the number of remaining sectors 7382 * is close to 'db'. 7383 * We then divide rt by 32 after multiplying by db to compensate. 7384 * The '+1' avoids division by zero if db is very small. 
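 * For example (illustrative numbers only): with db == 2048 sectors
 * written in dt == 1 second and 1,000,000 sectors still to go,
 * rt = 1000000 / (2048/32 + 1) * 1 >> 5 ~= 480, i.e. roughly the
 * 488 seconds that remaining/rate would give.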
7385 */ 7386 dt = ((jiffies - mddev->resync_mark) / HZ); 7387 if (!dt) dt++; 7388 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) 7389 - mddev->resync_mark_cnt; 7390 7391 rt = max_sectors - resync; /* number of remaining sectors */ 7392 sector_div(rt, db/32+1); 7393 rt *= dt; 7394 rt >>= 5; 7395 7396 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, 7397 ((unsigned long)rt % 60)/6); 7398 7399 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 7400 return 1; 7401 } 7402 7403 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 7404 { 7405 struct list_head *tmp; 7406 loff_t l = *pos; 7407 struct mddev *mddev; 7408 7409 if (l >= 0x10000) 7410 return NULL; 7411 if (!l--) 7412 /* header */ 7413 return (void*)1; 7414 7415 spin_lock(&all_mddevs_lock); 7416 list_for_each(tmp,&all_mddevs) 7417 if (!l--) { 7418 mddev = list_entry(tmp, struct mddev, all_mddevs); 7419 mddev_get(mddev); 7420 spin_unlock(&all_mddevs_lock); 7421 return mddev; 7422 } 7423 spin_unlock(&all_mddevs_lock); 7424 if (!l--) 7425 return (void*)2;/* tail */ 7426 return NULL; 7427 } 7428 7429 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 7430 { 7431 struct list_head *tmp; 7432 struct mddev *next_mddev, *mddev = v; 7433 7434 ++*pos; 7435 if (v == (void*)2) 7436 return NULL; 7437 7438 spin_lock(&all_mddevs_lock); 7439 if (v == (void*)1) 7440 tmp = all_mddevs.next; 7441 else 7442 tmp = mddev->all_mddevs.next; 7443 if (tmp != &all_mddevs) 7444 next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs)); 7445 else { 7446 next_mddev = (void*)2; 7447 *pos = 0x10000; 7448 } 7449 spin_unlock(&all_mddevs_lock); 7450 7451 if (v != (void*)1) 7452 mddev_put(mddev); 7453 return next_mddev; 7454 7455 } 7456 7457 static void md_seq_stop(struct seq_file *seq, void *v) 7458 { 7459 struct mddev *mddev = v; 7460 7461 if (mddev && v != (void*)1 && v != (void*)2) 7462 mddev_put(mddev); 7463 } 7464 7465 static int md_seq_show(struct seq_file *seq, void *v) 7466 { 7467 struct mddev *mddev = v; 7468 sector_t sectors; 7469 struct md_rdev *rdev; 7470 7471 if (v == (void*)1) { 7472 struct md_personality *pers; 7473 seq_printf(seq, "Personalities : "); 7474 spin_lock(&pers_lock); 7475 list_for_each_entry(pers, &pers_list, list) 7476 seq_printf(seq, "[%s] ", pers->name); 7477 7478 spin_unlock(&pers_lock); 7479 seq_printf(seq, "\n"); 7480 seq->poll_event = atomic_read(&md_event_count); 7481 return 0; 7482 } 7483 if (v == (void*)2) { 7484 status_unused(seq); 7485 return 0; 7486 } 7487 7488 spin_lock(&mddev->lock); 7489 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 7490 seq_printf(seq, "%s : %sactive", mdname(mddev), 7491 mddev->pers ? 
"" : "in"); 7492 if (mddev->pers) { 7493 if (mddev->ro==1) 7494 seq_printf(seq, " (read-only)"); 7495 if (mddev->ro==2) 7496 seq_printf(seq, " (auto-read-only)"); 7497 seq_printf(seq, " %s", mddev->pers->name); 7498 } 7499 7500 sectors = 0; 7501 rcu_read_lock(); 7502 rdev_for_each_rcu(rdev, mddev) { 7503 char b[BDEVNAME_SIZE]; 7504 seq_printf(seq, " %s[%d]", 7505 bdevname(rdev->bdev,b), rdev->desc_nr); 7506 if (test_bit(WriteMostly, &rdev->flags)) 7507 seq_printf(seq, "(W)"); 7508 if (test_bit(Journal, &rdev->flags)) 7509 seq_printf(seq, "(J)"); 7510 if (test_bit(Faulty, &rdev->flags)) { 7511 seq_printf(seq, "(F)"); 7512 continue; 7513 } 7514 if (rdev->raid_disk < 0) 7515 seq_printf(seq, "(S)"); /* spare */ 7516 if (test_bit(Replacement, &rdev->flags)) 7517 seq_printf(seq, "(R)"); 7518 sectors += rdev->sectors; 7519 } 7520 rcu_read_unlock(); 7521 7522 if (!list_empty(&mddev->disks)) { 7523 if (mddev->pers) 7524 seq_printf(seq, "\n %llu blocks", 7525 (unsigned long long) 7526 mddev->array_sectors / 2); 7527 else 7528 seq_printf(seq, "\n %llu blocks", 7529 (unsigned long long)sectors / 2); 7530 } 7531 if (mddev->persistent) { 7532 if (mddev->major_version != 0 || 7533 mddev->minor_version != 90) { 7534 seq_printf(seq," super %d.%d", 7535 mddev->major_version, 7536 mddev->minor_version); 7537 } 7538 } else if (mddev->external) 7539 seq_printf(seq, " super external:%s", 7540 mddev->metadata_type); 7541 else 7542 seq_printf(seq, " super non-persistent"); 7543 7544 if (mddev->pers) { 7545 mddev->pers->status(seq, mddev); 7546 seq_printf(seq, "\n "); 7547 if (mddev->pers->sync_request) { 7548 if (status_resync(seq, mddev)) 7549 seq_printf(seq, "\n "); 7550 } 7551 } else 7552 seq_printf(seq, "\n "); 7553 7554 bitmap_status(seq, mddev->bitmap); 7555 7556 seq_printf(seq, "\n"); 7557 } 7558 spin_unlock(&mddev->lock); 7559 7560 return 0; 7561 } 7562 7563 static const struct seq_operations md_seq_ops = { 7564 .start = md_seq_start, 7565 .next = md_seq_next, 7566 .stop = md_seq_stop, 7567 .show = md_seq_show, 7568 }; 7569 7570 static int md_seq_open(struct inode *inode, struct file *file) 7571 { 7572 struct seq_file *seq; 7573 int error; 7574 7575 error = seq_open(file, &md_seq_ops); 7576 if (error) 7577 return error; 7578 7579 seq = file->private_data; 7580 seq->poll_event = atomic_read(&md_event_count); 7581 return error; 7582 } 7583 7584 static int md_unloading; 7585 static unsigned int mdstat_poll(struct file *filp, poll_table *wait) 7586 { 7587 struct seq_file *seq = filp->private_data; 7588 int mask; 7589 7590 if (md_unloading) 7591 return POLLIN|POLLRDNORM|POLLERR|POLLPRI; 7592 poll_wait(filp, &md_event_waiters, wait); 7593 7594 /* always allow read */ 7595 mask = POLLIN | POLLRDNORM; 7596 7597 if (seq->poll_event != atomic_read(&md_event_count)) 7598 mask |= POLLERR | POLLPRI; 7599 return mask; 7600 } 7601 7602 static const struct file_operations md_seq_fops = { 7603 .owner = THIS_MODULE, 7604 .open = md_seq_open, 7605 .read = seq_read, 7606 .llseek = seq_lseek, 7607 .release = seq_release_private, 7608 .poll = mdstat_poll, 7609 }; 7610 7611 int register_md_personality(struct md_personality *p) 7612 { 7613 pr_debug("md: %s personality registered for level %d\n", 7614 p->name, p->level); 7615 spin_lock(&pers_lock); 7616 list_add_tail(&p->list, &pers_list); 7617 spin_unlock(&pers_lock); 7618 return 0; 7619 } 7620 EXPORT_SYMBOL(register_md_personality); 7621 7622 int unregister_md_personality(struct md_personality *p) 7623 { 7624 pr_debug("md: %s personality unregistered\n", p->name); 7625 
spin_lock(&pers_lock); 7626 list_del_init(&p->list); 7627 spin_unlock(&pers_lock); 7628 return 0; 7629 } 7630 EXPORT_SYMBOL(unregister_md_personality); 7631 7632 int register_md_cluster_operations(struct md_cluster_operations *ops, 7633 struct module *module) 7634 { 7635 int ret = 0; 7636 spin_lock(&pers_lock); 7637 if (md_cluster_ops != NULL) 7638 ret = -EALREADY; 7639 else { 7640 md_cluster_ops = ops; 7641 md_cluster_mod = module; 7642 } 7643 spin_unlock(&pers_lock); 7644 return ret; 7645 } 7646 EXPORT_SYMBOL(register_md_cluster_operations); 7647 7648 int unregister_md_cluster_operations(void) 7649 { 7650 spin_lock(&pers_lock); 7651 md_cluster_ops = NULL; 7652 spin_unlock(&pers_lock); 7653 return 0; 7654 } 7655 EXPORT_SYMBOL(unregister_md_cluster_operations); 7656 7657 int md_setup_cluster(struct mddev *mddev, int nodes) 7658 { 7659 if (!md_cluster_ops) 7660 request_module("md-cluster"); 7661 spin_lock(&pers_lock); 7662 /* ensure module won't be unloaded */ 7663 if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { 7664 pr_warn("can't find md-cluster module or get it's reference.\n"); 7665 spin_unlock(&pers_lock); 7666 return -ENOENT; 7667 } 7668 spin_unlock(&pers_lock); 7669 7670 return md_cluster_ops->join(mddev, nodes); 7671 } 7672 7673 void md_cluster_stop(struct mddev *mddev) 7674 { 7675 if (!md_cluster_ops) 7676 return; 7677 md_cluster_ops->leave(mddev); 7678 module_put(md_cluster_mod); 7679 } 7680 7681 static int is_mddev_idle(struct mddev *mddev, int init) 7682 { 7683 struct md_rdev *rdev; 7684 int idle; 7685 int curr_events; 7686 7687 idle = 1; 7688 rcu_read_lock(); 7689 rdev_for_each_rcu(rdev, mddev) { 7690 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 7691 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + 7692 (int)part_stat_read(&disk->part0, sectors[1]) - 7693 atomic_read(&disk->sync_io); 7694 /* sync IO will cause sync_io to increase before the disk_stats 7695 * as sync_io is counted when a request starts, and 7696 * disk_stats is counted when it completes. 7697 * So resync activity will cause curr_events to be smaller than 7698 * when there was no such activity. 7699 * non-sync IO will cause disk_stat to increase without 7700 * increasing sync_io so curr_events will (eventually) 7701 * be larger than it was before. Once it becomes 7702 * substantially larger, the test below will cause 7703 * the array to appear non-idle, and resync will slow 7704 * down. 7705 * If there is a lot of outstanding resync activity when 7706 * we set last_event to curr_events, then all that activity 7707 * completing might cause the array to appear non-idle 7708 * and resync will be slowed down even though there might 7709 * not have been non-resync activity. This will only 7710 * happen once though. 'last_events' will soon reflect 7711 * the state where there is little or no outstanding 7712 * resync requests, and further resync activity will 7713 * always make curr_events less than last_events. 
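 * The '> 64' threshold below allows 64 sectors (32KiB) of slack before
 * a disk is considered busy with non-resync IO.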
7714 * 7715 */ 7716 if (init || curr_events - rdev->last_events > 64) { 7717 rdev->last_events = curr_events; 7718 idle = 0; 7719 } 7720 } 7721 rcu_read_unlock(); 7722 return idle; 7723 } 7724 7725 void md_done_sync(struct mddev *mddev, int blocks, int ok) 7726 { 7727 /* another "blocks" (512byte) blocks have been synced */ 7728 atomic_sub(blocks, &mddev->recovery_active); 7729 wake_up(&mddev->recovery_wait); 7730 if (!ok) { 7731 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 7732 set_bit(MD_RECOVERY_ERROR, &mddev->recovery); 7733 md_wakeup_thread(mddev->thread); 7734 // stop recovery, signal do_sync .... 7735 } 7736 } 7737 EXPORT_SYMBOL(md_done_sync); 7738 7739 /* md_write_start(mddev, bi) 7740 * If we need to update some array metadata (e.g. 'active' flag 7741 * in superblock) before writing, schedule a superblock update 7742 * and wait for it to complete. 7743 */ 7744 void md_write_start(struct mddev *mddev, struct bio *bi) 7745 { 7746 int did_change = 0; 7747 if (bio_data_dir(bi) != WRITE) 7748 return; 7749 7750 BUG_ON(mddev->ro == 1); 7751 if (mddev->ro == 2) { 7752 /* need to switch to read/write */ 7753 mddev->ro = 0; 7754 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7755 md_wakeup_thread(mddev->thread); 7756 md_wakeup_thread(mddev->sync_thread); 7757 did_change = 1; 7758 } 7759 atomic_inc(&mddev->writes_pending); 7760 if (mddev->safemode == 1) 7761 mddev->safemode = 0; 7762 if (mddev->in_sync) { 7763 spin_lock(&mddev->lock); 7764 if (mddev->in_sync) { 7765 mddev->in_sync = 0; 7766 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 7767 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 7768 md_wakeup_thread(mddev->thread); 7769 did_change = 1; 7770 } 7771 spin_unlock(&mddev->lock); 7772 } 7773 if (did_change) 7774 sysfs_notify_dirent_safe(mddev->sysfs_state); 7775 wait_event(mddev->sb_wait, 7776 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 7777 } 7778 EXPORT_SYMBOL(md_write_start); 7779 7780 void md_write_end(struct mddev *mddev) 7781 { 7782 if (atomic_dec_and_test(&mddev->writes_pending)) { 7783 if (mddev->safemode == 2) 7784 md_wakeup_thread(mddev->thread); 7785 else if (mddev->safemode_delay) 7786 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); 7787 } 7788 } 7789 EXPORT_SYMBOL(md_write_end); 7790 7791 /* md_allow_write(mddev) 7792 * Calling this ensures that the array is marked 'active' so that writes 7793 * may proceed without blocking. It is important to call this before 7794 * attempting a GFP_KERNEL allocation while holding the mddev lock. 7795 * Must be called with mddev_lock held. 7796 * 7797 * In the ->external case MD_SB_CHANGE_PENDING can not be cleared until mddev->lock 7798 * is dropped, so return -EAGAIN after notifying userspace. 
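 * A possible call pattern (an illustrative sketch only, not lifted
 * from any particular personality):
 *
 *	err = md_allow_write(mddev);
 *	if (err)
 *		return err;	(propagate -EAGAIN to the caller)
 *	new = kzalloc(size, GFP_KERNEL);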
7799 */ 7800 int md_allow_write(struct mddev *mddev) 7801 { 7802 if (!mddev->pers) 7803 return 0; 7804 if (mddev->ro) 7805 return 0; 7806 if (!mddev->pers->sync_request) 7807 return 0; 7808 7809 spin_lock(&mddev->lock); 7810 if (mddev->in_sync) { 7811 mddev->in_sync = 0; 7812 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 7813 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 7814 if (mddev->safemode_delay && 7815 mddev->safemode == 0) 7816 mddev->safemode = 1; 7817 spin_unlock(&mddev->lock); 7818 md_update_sb(mddev, 0); 7819 sysfs_notify_dirent_safe(mddev->sysfs_state); 7820 } else 7821 spin_unlock(&mddev->lock); 7822 7823 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 7824 return -EAGAIN; 7825 else 7826 return 0; 7827 } 7828 EXPORT_SYMBOL_GPL(md_allow_write); 7829 7830 #define SYNC_MARKS 10 7831 #define SYNC_MARK_STEP (3*HZ) 7832 #define UPDATE_FREQUENCY (5*60*HZ) 7833 void md_do_sync(struct md_thread *thread) 7834 { 7835 struct mddev *mddev = thread->mddev; 7836 struct mddev *mddev2; 7837 unsigned int currspeed = 0, 7838 window; 7839 sector_t max_sectors,j, io_sectors, recovery_done; 7840 unsigned long mark[SYNC_MARKS]; 7841 unsigned long update_time; 7842 sector_t mark_cnt[SYNC_MARKS]; 7843 int last_mark,m; 7844 struct list_head *tmp; 7845 sector_t last_check; 7846 int skipped = 0; 7847 struct md_rdev *rdev; 7848 char *desc, *action = NULL; 7849 struct blk_plug plug; 7850 int ret; 7851 7852 /* just incase thread restarts... */ 7853 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 7854 return; 7855 if (mddev->ro) {/* never try to sync a read-only array */ 7856 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 7857 return; 7858 } 7859 7860 if (mddev_is_clustered(mddev)) { 7861 ret = md_cluster_ops->resync_start(mddev); 7862 if (ret) 7863 goto skip; 7864 7865 set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags); 7866 if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 7867 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || 7868 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) 7869 && ((unsigned long long)mddev->curr_resync_completed 7870 < (unsigned long long)mddev->resync_max_sectors)) 7871 goto skip; 7872 } 7873 7874 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 7875 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { 7876 desc = "data-check"; 7877 action = "check"; 7878 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 7879 desc = "requested-resync"; 7880 action = "repair"; 7881 } else 7882 desc = "resync"; 7883 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 7884 desc = "reshape"; 7885 else 7886 desc = "recovery"; 7887 7888 mddev->last_sync_action = action ?: desc; 7889 7890 /* we overload curr_resync somewhat here. 7891 * 0 == not engaged in resync at all 7892 * 2 == checking that there is no conflict with another sync 7893 * 1 == like 2, but have yielded to allow conflicting resync to 7894 * commense 7895 * other == active in resync - this many blocks 7896 * 7897 * Before starting a resync we must have set curr_resync to 7898 * 2, and then checked that every "conflicting" array has curr_resync 7899 * less than ours. When we find one that is the same or higher 7900 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 7901 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 7902 * This will mean we have to start checking from the beginning again. 
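 * (Concretely, in the loop below the mddev with the lower address is
 * the one that drops curr_resync from 2 to 1 and yields.)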
7903 * 7904 */ 7905 7906 do { 7907 int mddev2_minor = -1; 7908 mddev->curr_resync = 2; 7909 7910 try_again: 7911 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 7912 goto skip; 7913 for_each_mddev(mddev2, tmp) { 7914 if (mddev2 == mddev) 7915 continue; 7916 if (!mddev->parallel_resync 7917 && mddev2->curr_resync 7918 && match_mddev_units(mddev, mddev2)) { 7919 DEFINE_WAIT(wq); 7920 if (mddev < mddev2 && mddev->curr_resync == 2) { 7921 /* arbitrarily yield */ 7922 mddev->curr_resync = 1; 7923 wake_up(&resync_wait); 7924 } 7925 if (mddev > mddev2 && mddev->curr_resync == 1) 7926 /* no need to wait here, we can wait the next 7927 * time 'round when curr_resync == 2 7928 */ 7929 continue; 7930 /* We need to wait 'interruptible' so as not to 7931 * contribute to the load average, and not to 7932 * be caught by 'softlockup' 7933 */ 7934 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 7935 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 7936 mddev2->curr_resync >= mddev->curr_resync) { 7937 if (mddev2_minor != mddev2->md_minor) { 7938 mddev2_minor = mddev2->md_minor; 7939 pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n", 7940 desc, mdname(mddev), 7941 mdname(mddev2)); 7942 } 7943 mddev_put(mddev2); 7944 if (signal_pending(current)) 7945 flush_signals(current); 7946 schedule(); 7947 finish_wait(&resync_wait, &wq); 7948 goto try_again; 7949 } 7950 finish_wait(&resync_wait, &wq); 7951 } 7952 } 7953 } while (mddev->curr_resync < 2); 7954 7955 j = 0; 7956 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 7957 /* resync follows the size requested by the personality, 7958 * which defaults to physical size, but can be virtual size 7959 */ 7960 max_sectors = mddev->resync_max_sectors; 7961 atomic64_set(&mddev->resync_mismatches, 0); 7962 /* we don't use the checkpoint if there's a bitmap */ 7963 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 7964 j = mddev->resync_min; 7965 else if (!mddev->bitmap) 7966 j = mddev->recovery_cp; 7967 7968 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 7969 max_sectors = mddev->resync_max_sectors; 7970 else { 7971 /* recovery follows the physical size of devices */ 7972 max_sectors = mddev->dev_sectors; 7973 j = MaxSector; 7974 rcu_read_lock(); 7975 rdev_for_each_rcu(rdev, mddev) 7976 if (rdev->raid_disk >= 0 && 7977 !test_bit(Journal, &rdev->flags) && 7978 !test_bit(Faulty, &rdev->flags) && 7979 !test_bit(In_sync, &rdev->flags) && 7980 rdev->recovery_offset < j) 7981 j = rdev->recovery_offset; 7982 rcu_read_unlock(); 7983 7984 /* If there is a bitmap, we need to make sure all 7985 * writes that started before we added a spare 7986 * complete before we start doing a recovery. 7987 * Otherwise the write might complete and (via 7988 * bitmap_endwrite) set a bit in the bitmap after the 7989 * recovery has checked that bit and skipped that 7990 * region. 
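 * The quiesce(1)/quiesce(0) pair below simply drains all in-flight
 * writes before the recovery loop starts.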
7991 */ 7992 if (mddev->bitmap) { 7993 mddev->pers->quiesce(mddev, 1); 7994 mddev->pers->quiesce(mddev, 0); 7995 } 7996 } 7997 7998 pr_info("md: %s of RAID array %s\n", desc, mdname(mddev)); 7999 pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev)); 8000 pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n", 8001 speed_max(mddev), desc); 8002 8003 is_mddev_idle(mddev, 1); /* this initializes IO event counters */ 8004 8005 io_sectors = 0; 8006 for (m = 0; m < SYNC_MARKS; m++) { 8007 mark[m] = jiffies; 8008 mark_cnt[m] = io_sectors; 8009 } 8010 last_mark = 0; 8011 mddev->resync_mark = mark[last_mark]; 8012 mddev->resync_mark_cnt = mark_cnt[last_mark]; 8013 8014 /* 8015 * Tune reconstruction: 8016 */ 8017 window = 32*(PAGE_SIZE/512); 8018 pr_debug("md: using %dk window, over a total of %lluk.\n", 8019 window/2, (unsigned long long)max_sectors/2); 8020 8021 atomic_set(&mddev->recovery_active, 0); 8022 last_check = 0; 8023 8024 if (j>2) { 8025 pr_debug("md: resuming %s of %s from checkpoint.\n", 8026 desc, mdname(mddev)); 8027 mddev->curr_resync = j; 8028 } else 8029 mddev->curr_resync = 3; /* no longer delayed */ 8030 mddev->curr_resync_completed = j; 8031 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 8032 md_new_event(mddev); 8033 update_time = jiffies; 8034 8035 blk_start_plug(&plug); 8036 while (j < max_sectors) { 8037 sector_t sectors; 8038 8039 skipped = 0; 8040 8041 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 8042 ((mddev->curr_resync > mddev->curr_resync_completed && 8043 (mddev->curr_resync - mddev->curr_resync_completed) 8044 > (max_sectors >> 4)) || 8045 time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || 8046 (j - mddev->curr_resync_completed)*2 8047 >= mddev->resync_max - mddev->curr_resync_completed || 8048 mddev->curr_resync_completed > mddev->resync_max 8049 )) { 8050 /* time to update curr_resync_completed */ 8051 wait_event(mddev->recovery_wait, 8052 atomic_read(&mddev->recovery_active) == 0); 8053 mddev->curr_resync_completed = j; 8054 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 8055 j > mddev->recovery_cp) 8056 mddev->recovery_cp = j; 8057 update_time = jiffies; 8058 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 8059 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 8060 } 8061 8062 while (j >= mddev->resync_max && 8063 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 8064 /* As this condition is controlled by user-space, 8065 * we can block indefinitely, so use '_interruptible' 8066 * to avoid triggering warnings. 8067 */ 8068 flush_signals(current); /* just in case */ 8069 wait_event_interruptible(mddev->recovery_wait, 8070 mddev->resync_max > j 8071 || test_bit(MD_RECOVERY_INTR, 8072 &mddev->recovery)); 8073 } 8074 8075 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 8076 break; 8077 8078 sectors = mddev->pers->sync_request(mddev, j, &skipped); 8079 if (sectors == 0) { 8080 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8081 break; 8082 } 8083 8084 if (!skipped) { /* actual IO requested */ 8085 io_sectors += sectors; 8086 atomic_add(sectors, &mddev->recovery_active); 8087 } 8088 8089 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 8090 break; 8091 8092 j += sectors; 8093 if (j > max_sectors) 8094 /* when skipping, extra large numbers can be returned. 
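 * That is part of the contract this loop assumes for ->sync_request(): it
 * is called as
 *
 *	sectors = mddev->pers->sync_request(mddev, j, &skipped);
 *
 * and returns how many sectors starting at j it has dealt with.  When the
 * personality can prove a range needs no work, for instance because the
 * bitmap says it is clean, it sets *skipped and may return far more
 * sectors than it would ever submit as real IO, which is why j is clamped
 * to max_sectors just below.  Roughly, on the personality side (an
 * illustrative fragment, not copied from any personality):
 *
 *	if (region_is_clean) {
 *		*skipped = 1;
 *		return nr_clean_sectors;
 *	}
 *
 * A return of 0 means "give up" and interrupts the whole operation, as
 * handled a few lines above.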
*/ 8095 j = max_sectors; 8096 if (j > 2) 8097 mddev->curr_resync = j; 8098 mddev->curr_mark_cnt = io_sectors; 8099 if (last_check == 0) 8100 /* this is the earliest that rebuild will be 8101 * visible in /proc/mdstat 8102 */ 8103 md_new_event(mddev); 8104 8105 if (last_check + window > io_sectors || j == max_sectors) 8106 continue; 8107 8108 last_check = io_sectors; 8109 repeat: 8110 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 8111 /* step marks */ 8112 int next = (last_mark+1) % SYNC_MARKS; 8113 8114 mddev->resync_mark = mark[next]; 8115 mddev->resync_mark_cnt = mark_cnt[next]; 8116 mark[next] = jiffies; 8117 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 8118 last_mark = next; 8119 } 8120 8121 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 8122 break; 8123 8124 /* 8125 * Throttle: currspeed below is the KB/sec completed since the last mark 8126 * (sectors/2 per elapsed second, with +1 terms to avoid dividing by zero). 8127 * We only slow down while we are above the 'soft' speed limit: past the 8128 * 'hard' limit we sleep and re-measure, and if the IO subsystem is busy we 8129 * wait for our outstanding recovery IO to drain. CPU load does not matter 8130 * here, only IO load (an e2fsck running on the array should still be fast). 8131 */ 8132 cond_resched(); 8133 8134 recovery_done = io_sectors - atomic_read(&mddev->recovery_active); 8135 currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2 8136 /((jiffies-mddev->resync_mark)/HZ +1) +1; 8137 8138 if (currspeed > speed_min(mddev)) { 8139 if (currspeed > speed_max(mddev)) { 8140 msleep(500); 8141 goto repeat; 8142 } 8143 if (!is_mddev_idle(mddev, 0)) { 8144 /* 8145 * Give other IO more of a chance. 8146 * The faster the devices, the less we wait. 8147 */ 8148 wait_event(mddev->recovery_wait, 8149 !atomic_read(&mddev->recovery_active)); 8150 } 8151 } 8152 } 8153 pr_info("md: %s: %s %s.\n",mdname(mddev), desc, 8154 test_bit(MD_RECOVERY_INTR, &mddev->recovery) 8155 ?
"interrupted" : "done"); 8156 /* 8157 * this also signals 'finished resyncing' to md_stop 8158 */ 8159 blk_finish_plug(&plug); 8160 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 8161 8162 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 8163 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 8164 mddev->curr_resync > 3) { 8165 mddev->curr_resync_completed = mddev->curr_resync; 8166 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 8167 } 8168 mddev->pers->sync_request(mddev, max_sectors, &skipped); 8169 8170 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 8171 mddev->curr_resync > 3) { 8172 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 8173 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 8174 if (mddev->curr_resync >= mddev->recovery_cp) { 8175 pr_debug("md: checkpointing %s of %s.\n", 8176 desc, mdname(mddev)); 8177 if (test_bit(MD_RECOVERY_ERROR, 8178 &mddev->recovery)) 8179 mddev->recovery_cp = 8180 mddev->curr_resync_completed; 8181 else 8182 mddev->recovery_cp = 8183 mddev->curr_resync; 8184 } 8185 } else 8186 mddev->recovery_cp = MaxSector; 8187 } else { 8188 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 8189 mddev->curr_resync = MaxSector; 8190 rcu_read_lock(); 8191 rdev_for_each_rcu(rdev, mddev) 8192 if (rdev->raid_disk >= 0 && 8193 mddev->delta_disks >= 0 && 8194 !test_bit(Journal, &rdev->flags) && 8195 !test_bit(Faulty, &rdev->flags) && 8196 !test_bit(In_sync, &rdev->flags) && 8197 rdev->recovery_offset < mddev->curr_resync) 8198 rdev->recovery_offset = mddev->curr_resync; 8199 rcu_read_unlock(); 8200 } 8201 } 8202 skip: 8203 /* set CHANGE_PENDING here since maybe another update is needed, 8204 * so other nodes are informed. It should be harmless for normal 8205 * raid */ 8206 set_mask_bits(&mddev->sb_flags, 0, 8207 BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); 8208 8209 spin_lock(&mddev->lock); 8210 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 8211 /* We completed so min/max setting can be forgotten if used. */ 8212 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 8213 mddev->resync_min = 0; 8214 mddev->resync_max = MaxSector; 8215 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 8216 mddev->resync_min = mddev->curr_resync_completed; 8217 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 8218 mddev->curr_resync = 0; 8219 spin_unlock(&mddev->lock); 8220 8221 wake_up(&resync_wait); 8222 md_wakeup_thread(mddev->thread); 8223 return; 8224 } 8225 EXPORT_SYMBOL_GPL(md_do_sync); 8226 8227 static int remove_and_add_spares(struct mddev *mddev, 8228 struct md_rdev *this) 8229 { 8230 struct md_rdev *rdev; 8231 int spares = 0; 8232 int removed = 0; 8233 bool remove_some = false; 8234 8235 rdev_for_each(rdev, mddev) { 8236 if ((this == NULL || rdev == this) && 8237 rdev->raid_disk >= 0 && 8238 !test_bit(Blocked, &rdev->flags) && 8239 test_bit(Faulty, &rdev->flags) && 8240 atomic_read(&rdev->nr_pending)==0) { 8241 /* Faulty non-Blocked devices with nr_pending == 0 8242 * never get nr_pending incremented, 8243 * never get Faulty cleared, and never get Blocked set. 
8244 * So we can synchronize_rcu now rather than once per device 8245 */ 8246 remove_some = true; 8247 set_bit(RemoveSynchronized, &rdev->flags); 8248 } 8249 } 8250 8251 if (remove_some) 8252 synchronize_rcu(); 8253 rdev_for_each(rdev, mddev) { 8254 if ((this == NULL || rdev == this) && 8255 rdev->raid_disk >= 0 && 8256 !test_bit(Blocked, &rdev->flags) && 8257 ((test_bit(RemoveSynchronized, &rdev->flags) || 8258 (!test_bit(In_sync, &rdev->flags) && 8259 !test_bit(Journal, &rdev->flags))) && 8260 atomic_read(&rdev->nr_pending)==0)) { 8261 if (mddev->pers->hot_remove_disk( 8262 mddev, rdev) == 0) { 8263 sysfs_unlink_rdev(mddev, rdev); 8264 rdev->raid_disk = -1; 8265 removed++; 8266 } 8267 } 8268 if (remove_some && test_bit(RemoveSynchronized, &rdev->flags)) 8269 clear_bit(RemoveSynchronized, &rdev->flags); 8270 } 8271 8272 if (removed && mddev->kobj.sd) 8273 sysfs_notify(&mddev->kobj, NULL, "degraded"); 8274 8275 if (this && removed) 8276 goto no_add; 8277 8278 rdev_for_each(rdev, mddev) { 8279 if (this && this != rdev) 8280 continue; 8281 if (test_bit(Candidate, &rdev->flags)) 8282 continue; 8283 if (rdev->raid_disk >= 0 && 8284 !test_bit(In_sync, &rdev->flags) && 8285 !test_bit(Journal, &rdev->flags) && 8286 !test_bit(Faulty, &rdev->flags)) 8287 spares++; 8288 if (rdev->raid_disk >= 0) 8289 continue; 8290 if (test_bit(Faulty, &rdev->flags)) 8291 continue; 8292 if (!test_bit(Journal, &rdev->flags)) { 8293 if (mddev->ro && 8294 ! (rdev->saved_raid_disk >= 0 && 8295 !test_bit(Bitmap_sync, &rdev->flags))) 8296 continue; 8297 8298 rdev->recovery_offset = 0; 8299 } 8300 if (mddev->pers-> 8301 hot_add_disk(mddev, rdev) == 0) { 8302 if (sysfs_link_rdev(mddev, rdev)) 8303 /* failure here is OK */; 8304 if (!test_bit(Journal, &rdev->flags)) 8305 spares++; 8306 md_new_event(mddev); 8307 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 8308 } 8309 } 8310 no_add: 8311 if (removed) 8312 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 8313 return spares; 8314 } 8315 8316 static void md_start_sync(struct work_struct *ws) 8317 { 8318 struct mddev *mddev = container_of(ws, struct mddev, del_work); 8319 8320 mddev->sync_thread = md_register_thread(md_do_sync, 8321 mddev, 8322 "resync"); 8323 if (!mddev->sync_thread) { 8324 pr_warn("%s: could not start resync thread...\n", 8325 mdname(mddev)); 8326 /* leave the spares where they are, it shouldn't hurt */ 8327 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 8328 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 8329 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 8330 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 8331 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 8332 wake_up(&resync_wait); 8333 if (test_and_clear_bit(MD_RECOVERY_RECOVER, 8334 &mddev->recovery)) 8335 if (mddev->sysfs_action) 8336 sysfs_notify_dirent_safe(mddev->sysfs_action); 8337 } else 8338 md_wakeup_thread(mddev->sync_thread); 8339 sysfs_notify_dirent_safe(mddev->sysfs_action); 8340 md_new_event(mddev); 8341 } 8342 8343 /* 8344 * This routine is regularly called by all per-raid-array threads to 8345 * deal with generic issues like resync and super-block update. 8346 * Raid personalities that don't have a thread (linear/raid0) do not 8347 * need this as they never do any recovery or update the superblock. 8348 * 8349 * It does not do any resync itself, but rather "forks" off other threads 8350 * to do that as needed. 8351 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in 8352 * "->recovery" and create a thread at ->sync_thread. 
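 * For illustration, the callers are the personality daemon functions
 * themselves (raid1d() and friends), which invoke md_check_recovery()
 * from their main loop, so a personality thread function has roughly
 * this shape (pers_daemon() and handle_pending_retries() are made-up
 * stand-ins):
 *
 *	static void pers_daemon(struct md_thread *thread)
 *	{
 *		struct mddev *mddev = thread->mddev;
 *
 *		md_check_recovery(mddev);
 *		handle_pending_retries(mddev);
 *	}
 *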
8353 * When the thread finishes it sets MD_RECOVERY_DONE 8354 * and wakes up this thread which will reap the thread and finish up. 8355 * This thread also removes any faulty devices (with nr_pending == 0). 8356 * 8357 * The overall approach is: 8358 * 1/ If the superblock needs updating, update it. 8359 * 2/ If a recovery thread is running, don't do anything else. 8360 * 3/ If recovery has finished, clean up, possibly marking spares active. 8361 * 4/ If there are any faulty devices, remove them. 8362 * 5/ If array is degraded, try to add spare devices. 8363 * 6/ If array has spares or is not in-sync, start a resync thread. 8364 */ 8365 void md_check_recovery(struct mddev *mddev) 8366 { 8367 if (mddev->suspended) 8368 return; 8369 8370 if (mddev->bitmap) 8371 bitmap_daemon_work(mddev); 8372 8373 if (signal_pending(current)) { 8374 if (mddev->pers->sync_request && !mddev->external) { 8375 pr_debug("md: %s in immediate safe mode\n", 8376 mdname(mddev)); 8377 mddev->safemode = 2; 8378 } 8379 flush_signals(current); 8380 } 8381 8382 if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 8383 return; 8384 if ( ! ( 8385 (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) || 8386 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 8387 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 8388 test_bit(MD_RELOAD_SB, &mddev->flags) || 8389 (mddev->external == 0 && mddev->safemode == 1) || 8390 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) 8391 && !mddev->in_sync && mddev->recovery_cp == MaxSector) 8392 )) 8393 return; 8394 8395 if (mddev_trylock(mddev)) { 8396 int spares = 0; 8397 8398 if (mddev->ro) { 8399 struct md_rdev *rdev; 8400 if (!mddev->external && mddev->in_sync) 8401 /* 'Blocked' flag not needed as failed devices 8402 * will be recorded if array switched to read/write. 8403 * Leaving it set will prevent the device 8404 * from being removed. 8405 */ 8406 rdev_for_each(rdev, mddev) 8407 clear_bit(Blocked, &rdev->flags); 8408 /* On a read-only array we can: 8409 * - remove failed devices 8410 * - add already-in_sync devices if the array itself 8411 * is in-sync. 8412 * As we only add devices that are already in-sync, 8413 * we can activate the spares immediately. 8414 */ 8415 remove_and_add_spares(mddev, NULL); 8416 /* There is no thread, but we need to call 8417 * ->spare_active and clear saved_raid_disk 8418 */ 8419 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 8420 md_reap_sync_thread(mddev); 8421 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8422 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8423 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 8424 goto unlock; 8425 } 8426 8427 if (mddev_is_clustered(mddev)) { 8428 struct md_rdev *rdev; 8429 /* kick the device if another node issued a 8430 * remove disk.
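 * How that flag gets set, in rough outline: the peer's removal arrives as
 * a REMOVE message, and the md-cluster receive path sets ClusterRemove on
 * the matching rdev and wakes mddev->thread, so the next pass through this
 * function does the actual kick.  Boiled down (a paraphrase of
 * md-cluster.c, not a copy of it):
 *
 *	set_bit(ClusterRemove, &rdev->flags);
 *	md_wakeup_thread(mddev->thread);
 *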
8431 */ 8432 rdev_for_each(rdev, mddev) { 8433 if (test_and_clear_bit(ClusterRemove, &rdev->flags) && 8434 rdev->raid_disk < 0) 8435 md_kick_rdev_from_array(rdev); 8436 } 8437 8438 if (test_and_clear_bit(MD_RELOAD_SB, &mddev->flags)) 8439 md_reload_sb(mddev, mddev->good_device_nr); 8440 } 8441 8442 if (!mddev->external) { 8443 int did_change = 0; 8444 spin_lock(&mddev->lock); 8445 if (mddev->safemode && 8446 !atomic_read(&mddev->writes_pending) && 8447 !mddev->in_sync && 8448 mddev->recovery_cp == MaxSector) { 8449 mddev->in_sync = 1; 8450 did_change = 1; 8451 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 8452 } 8453 if (mddev->safemode == 1) 8454 mddev->safemode = 0; 8455 spin_unlock(&mddev->lock); 8456 if (did_change) 8457 sysfs_notify_dirent_safe(mddev->sysfs_state); 8458 } 8459 8460 if (mddev->sb_flags) 8461 md_update_sb(mddev, 0); 8462 8463 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 8464 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 8465 /* resync/recovery still happening */ 8466 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8467 goto unlock; 8468 } 8469 if (mddev->sync_thread) { 8470 md_reap_sync_thread(mddev); 8471 goto unlock; 8472 } 8473 /* Set RUNNING before clearing NEEDED to avoid 8474 * any transients in the value of "sync_action". 8475 */ 8476 mddev->curr_resync_completed = 0; 8477 spin_lock(&mddev->lock); 8478 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 8479 spin_unlock(&mddev->lock); 8480 /* Clear some bits that don't mean anything, but 8481 * might be left set 8482 */ 8483 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 8484 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 8485 8486 if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 8487 test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 8488 goto not_running; 8489 /* no recovery is running. 8490 * remove any failed drives, then 8491 * add spares if possible. 8492 * Spares are also removed and re-added, to allow 8493 * the personality to fail the re-add. 8494 */ 8495 8496 if (mddev->reshape_position != MaxSector) { 8497 if (mddev->pers->check_reshape == NULL || 8498 mddev->pers->check_reshape(mddev) != 0) 8499 /* Cannot proceed */ 8500 goto not_running; 8501 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 8502 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8503 } else if ((spares = remove_and_add_spares(mddev, NULL))) { 8504 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 8505 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 8506 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 8507 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8508 } else if (mddev->recovery_cp < MaxSector) { 8509 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 8510 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 8511 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 8512 /* nothing to be done ... */ 8513 goto not_running; 8514 8515 if (mddev->pers->sync_request) { 8516 if (spares) { 8517 /* We are adding a device or devices to an array 8518 * which has the bitmap stored on all devices. 
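 * The new member carries its own on-disk copy of the internal bitmap, and
 * that copy is stale until it has been rewritten.  Conceptually the
 * bitmap_write_all() call below just marks every bitmap page dirty,
 * roughly (a paraphrase of bitmap.c, not a copy of it):
 *
 *	for (i = 0; i < bitmap->storage.file_pages; i++)
 *		set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE);
 *
 * so that the regular bitmap writeback then pushes each page to every
 * member, including the one just added.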
8519 * So make sure all bitmap pages get written 8520 */ 8521 bitmap_write_all(mddev->bitmap); 8522 } 8523 INIT_WORK(&mddev->del_work, md_start_sync); 8524 queue_work(md_misc_wq, &mddev->del_work); 8525 goto unlock; 8526 } 8527 not_running: 8528 if (!mddev->sync_thread) { 8529 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 8530 wake_up(&resync_wait); 8531 if (test_and_clear_bit(MD_RECOVERY_RECOVER, 8532 &mddev->recovery)) 8533 if (mddev->sysfs_action) 8534 sysfs_notify_dirent_safe(mddev->sysfs_action); 8535 } 8536 unlock: 8537 wake_up(&mddev->sb_wait); 8538 mddev_unlock(mddev); 8539 } 8540 } 8541 EXPORT_SYMBOL(md_check_recovery); 8542 8543 void md_reap_sync_thread(struct mddev *mddev) 8544 { 8545 struct md_rdev *rdev; 8546 8547 /* resync has finished, collect result */ 8548 md_unregister_thread(&mddev->sync_thread); 8549 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 8550 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 8551 /* success...*/ 8552 /* activate any spares */ 8553 if (mddev->pers->spare_active(mddev)) { 8554 sysfs_notify(&mddev->kobj, NULL, 8555 "degraded"); 8556 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 8557 } 8558 } 8559 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 8560 mddev->pers->finish_reshape) 8561 mddev->pers->finish_reshape(mddev); 8562 8563 /* If array is no longer degraded, then any saved_raid_disk 8564 * information must be scrapped. 8565 */ 8566 if (!mddev->degraded) 8567 rdev_for_each(rdev, mddev) 8568 rdev->saved_raid_disk = -1; 8569 8570 md_update_sb(mddev, 1); 8571 /* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can 8572 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by 8573 * clustered raid */ 8574 if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags)) 8575 md_cluster_ops->resync_finish(mddev); 8576 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 8577 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 8578 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 8579 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 8580 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 8581 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 8582 wake_up(&resync_wait); 8583 /* flag recovery needed just to double check */ 8584 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8585 sysfs_notify_dirent_safe(mddev->sysfs_action); 8586 md_new_event(mddev); 8587 if (mddev->event_work.func) 8588 queue_work(md_misc_wq, &mddev->event_work); 8589 } 8590 EXPORT_SYMBOL(md_reap_sync_thread); 8591 8592 void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) 8593 { 8594 sysfs_notify_dirent_safe(rdev->sysfs_state); 8595 wait_event_timeout(rdev->blocked_wait, 8596 !test_bit(Blocked, &rdev->flags) && 8597 !test_bit(BlockedBadBlocks, &rdev->flags), 8598 msecs_to_jiffies(5000)); 8599 rdev_dec_pending(rdev, mddev); 8600 } 8601 EXPORT_SYMBOL(md_wait_for_blocked_rdev); 8602 8603 void md_finish_reshape(struct mddev *mddev) 8604 { 8605 /* called by the personality module when a reshape completes.
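 * It folds each device's new_data_offset into data_offset and adjusts the
 * usable size to match.  A worked example with made-up numbers: if a
 * reshape moved the data area from data_offset 2048 to new_data_offset
 * 4096, then after this call data_offset is 4096 and rdev->sectors has
 * shrunk by 2048; a reshape that moved the data the other way grows
 * rdev->sectors instead.  The expected callers are the reshape-completion
 * paths of the personalities that support reshape (raid5/raid10).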
*/ 8606 struct md_rdev *rdev; 8607 8608 rdev_for_each(rdev, mddev) { 8609 if (rdev->data_offset > rdev->new_data_offset) 8610 rdev->sectors += rdev->data_offset - rdev->new_data_offset; 8611 else 8612 rdev->sectors -= rdev->new_data_offset - rdev->data_offset; 8613 rdev->data_offset = rdev->new_data_offset; 8614 } 8615 } 8616 EXPORT_SYMBOL(md_finish_reshape); 8617 8618 /* Bad block management */ 8619 8620 /* Returns 1 on success, 0 on failure */ 8621 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 8622 int is_new) 8623 { 8624 struct mddev *mddev = rdev->mddev; 8625 int rv; 8626 if (is_new) 8627 s += rdev->new_data_offset; 8628 else 8629 s += rdev->data_offset; 8630 rv = badblocks_set(&rdev->badblocks, s, sectors, 0); 8631 if (rv == 0) { 8632 /* Make sure they get written out promptly */ 8633 if (test_bit(ExternalBbl, &rdev->flags)) 8634 sysfs_notify(&rdev->kobj, NULL, 8635 "unacknowledged_bad_blocks"); 8636 sysfs_notify_dirent_safe(rdev->sysfs_state); 8637 set_mask_bits(&mddev->sb_flags, 0, 8638 BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); 8639 md_wakeup_thread(rdev->mddev->thread); 8640 return 1; 8641 } else 8642 return 0; 8643 } 8644 EXPORT_SYMBOL_GPL(rdev_set_badblocks); 8645 8646 int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 8647 int is_new) 8648 { 8649 int rv; 8650 if (is_new) 8651 s += rdev->new_data_offset; 8652 else 8653 s += rdev->data_offset; 8654 rv = badblocks_clear(&rdev->badblocks, s, sectors); 8655 if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags)) 8656 sysfs_notify(&rdev->kobj, NULL, "bad_blocks"); 8657 return rv; 8658 } 8659 EXPORT_SYMBOL_GPL(rdev_clear_badblocks); 8660 8661 static int md_notify_reboot(struct notifier_block *this, 8662 unsigned long code, void *x) 8663 { 8664 struct list_head *tmp; 8665 struct mddev *mddev; 8666 int need_delay = 0; 8667 8668 for_each_mddev(mddev, tmp) { 8669 if (mddev_trylock(mddev)) { 8670 if (mddev->pers) 8671 __md_stop_writes(mddev); 8672 if (mddev->persistent) 8673 mddev->safemode = 2; 8674 mddev_unlock(mddev); 8675 } 8676 need_delay = 1; 8677 } 8678 /* 8679 * certain more exotic SCSI devices are known to be 8680 * volatile wrt too early system reboots. While the 8681 * right place to handle this issue is the given 8682 * driver, we do want to have a safe RAID driver ... 
8683 */ 8684 if (need_delay) 8685 mdelay(1000*1); 8686 8687 return NOTIFY_DONE; 8688 } 8689 8690 static struct notifier_block md_notifier = { 8691 .notifier_call = md_notify_reboot, 8692 .next = NULL, 8693 .priority = INT_MAX, /* before any real devices */ 8694 }; 8695 8696 static void md_geninit(void) 8697 { 8698 pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 8699 8700 proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops); 8701 } 8702 8703 static int __init md_init(void) 8704 { 8705 int ret = -ENOMEM; 8706 8707 md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); 8708 if (!md_wq) 8709 goto err_wq; 8710 8711 md_misc_wq = alloc_workqueue("md_misc", 0, 0); 8712 if (!md_misc_wq) 8713 goto err_misc_wq; 8714 8715 if ((ret = register_blkdev(MD_MAJOR, "md")) < 0) 8716 goto err_md; 8717 8718 if ((ret = register_blkdev(0, "mdp")) < 0) 8719 goto err_mdp; 8720 mdp_major = ret; 8721 8722 blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE, 8723 md_probe, NULL, NULL); 8724 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, 8725 md_probe, NULL, NULL); 8726 8727 register_reboot_notifier(&md_notifier); 8728 raid_table_header = register_sysctl_table(raid_root_table); 8729 8730 md_geninit(); 8731 return 0; 8732 8733 err_mdp: 8734 unregister_blkdev(MD_MAJOR, "md"); 8735 err_md: 8736 destroy_workqueue(md_misc_wq); 8737 err_misc_wq: 8738 destroy_workqueue(md_wq); 8739 err_wq: 8740 return ret; 8741 } 8742 8743 static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) 8744 { 8745 struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 8746 struct md_rdev *rdev2; 8747 int role, ret; 8748 char b[BDEVNAME_SIZE]; 8749 8750 /* Check for change of roles in the active devices */ 8751 rdev_for_each(rdev2, mddev) { 8752 if (test_bit(Faulty, &rdev2->flags)) 8753 continue; 8754 8755 /* Check if the roles changed */ 8756 role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); 8757 8758 if (test_bit(Candidate, &rdev2->flags)) { 8759 if (role == 0xfffe) { 8760 pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b)); 8761 md_kick_rdev_from_array(rdev2); 8762 continue; 8763 } 8764 else 8765 clear_bit(Candidate, &rdev2->flags); 8766 } 8767 8768 if (role != rdev2->raid_disk) { 8769 /* got activated */ 8770 if (rdev2->raid_disk == -1 && role != 0xffff) { 8771 rdev2->saved_raid_disk = role; 8772 ret = remove_and_add_spares(mddev, rdev2); 8773 pr_info("Activated spare: %s\n", 8774 bdevname(rdev2->bdev,b)); 8775 /* wakeup mddev->thread here, so array could 8776 * perform resync with the new activated disk */ 8777 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8778 md_wakeup_thread(mddev->thread); 8779 8780 } 8781 /* device faulty 8782 * We just want to do the minimum to mark the disk 8783 * as faulty. The recovery is performed by the 8784 * one who initiated the error. 
8785 */ 8786 if ((role == 0xfffe) || (role == 0xfffd)) { 8787 md_error(mddev, rdev2); 8788 clear_bit(Blocked, &rdev2->flags); 8789 } 8790 } 8791 } 8792 8793 if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) 8794 update_raid_disks(mddev, le32_to_cpu(sb->raid_disks)); 8795 8796 /* Finally set the event to be up to date */ 8797 mddev->events = le64_to_cpu(sb->events); 8798 } 8799 8800 static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) 8801 { 8802 int err; 8803 struct page *swapout = rdev->sb_page; 8804 struct mdp_superblock_1 *sb; 8805 8806 /* Store the sb page of the rdev in the swapout temporary 8807 * variable in case we err in the future 8808 */ 8809 rdev->sb_page = NULL; 8810 err = alloc_disk_sb(rdev); 8811 if (err == 0) { 8812 ClearPageUptodate(rdev->sb_page); 8813 rdev->sb_loaded = 0; 8814 err = super_types[mddev->major_version]. 8815 load_super(rdev, NULL, mddev->minor_version); 8816 } 8817 if (err < 0) { 8818 pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n", 8819 __func__, __LINE__, rdev->desc_nr, err); 8820 if (rdev->sb_page) 8821 put_page(rdev->sb_page); 8822 rdev->sb_page = swapout; 8823 rdev->sb_loaded = 1; 8824 return err; 8825 } 8826 8827 sb = page_address(rdev->sb_page); 8828 /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET 8829 * is not set 8830 */ 8831 8832 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET)) 8833 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 8834 8835 /* The other node finished recovery, call spare_active to set 8836 * device In_sync and mddev->degraded 8837 */ 8838 if (rdev->recovery_offset == MaxSector && 8839 !test_bit(In_sync, &rdev->flags) && 8840 mddev->pers->spare_active(mddev)) 8841 sysfs_notify(&mddev->kobj, NULL, "degraded"); 8842 8843 put_page(swapout); 8844 return 0; 8845 } 8846 8847 void md_reload_sb(struct mddev *mddev, int nr) 8848 { 8849 struct md_rdev *rdev; 8850 int err; 8851 8852 /* Find the rdev */ 8853 rdev_for_each_rcu(rdev, mddev) { 8854 if (rdev->desc_nr == nr) 8855 break; 8856 } 8857 8858 if (!rdev || rdev->desc_nr != nr) { 8859 pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr); 8860 return; 8861 } 8862 8863 err = read_rdev(mddev, rdev); 8864 if (err < 0) 8865 return; 8866 8867 check_sb_changes(mddev, rdev); 8868 8869 /* Read all rdev's to update recovery_offset */ 8870 rdev_for_each_rcu(rdev, mddev) 8871 read_rdev(mddev, rdev); 8872 } 8873 EXPORT_SYMBOL(md_reload_sb); 8874 8875 #ifndef MODULE 8876 8877 /* 8878 * Searches all registered partitions for autorun RAID arrays 8879 * at boot time. 
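 * The feeding side lives in the block layer's partition scanning: a
 * partition of type 0xfd ("Linux raid autodetect") is flagged as
 * ADDPART_FLAG_RAID and handed over roughly as
 *
 *	md_autodetect_dev(part_to_dev(part)->devt);
 *
 * (see block/partition-generic.c; the call shown is an approximation of
 * that path).  md_autodetect_dev() below only queues the dev_t; the
 * arrays themselves are assembled later by autostart_arrays().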
8880 */ 8881 8882 static DEFINE_MUTEX(detected_devices_mutex); 8883 static LIST_HEAD(all_detected_devices); 8884 struct detected_devices_node { 8885 struct list_head list; 8886 dev_t dev; 8887 }; 8888 8889 void md_autodetect_dev(dev_t dev) 8890 { 8891 struct detected_devices_node *node_detected_dev; 8892 8893 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL); 8894 if (node_detected_dev) { 8895 node_detected_dev->dev = dev; 8896 mutex_lock(&detected_devices_mutex); 8897 list_add_tail(&node_detected_dev->list, &all_detected_devices); 8898 mutex_unlock(&detected_devices_mutex); 8899 } 8900 } 8901 8902 static void autostart_arrays(int part) 8903 { 8904 struct md_rdev *rdev; 8905 struct detected_devices_node *node_detected_dev; 8906 dev_t dev; 8907 int i_scanned, i_passed; 8908 8909 i_scanned = 0; 8910 i_passed = 0; 8911 8912 pr_info("md: Autodetecting RAID arrays.\n"); 8913 8914 mutex_lock(&detected_devices_mutex); 8915 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { 8916 i_scanned++; 8917 node_detected_dev = list_entry(all_detected_devices.next, 8918 struct detected_devices_node, list); 8919 list_del(&node_detected_dev->list); 8920 dev = node_detected_dev->dev; 8921 kfree(node_detected_dev); 8922 mutex_unlock(&detected_devices_mutex); 8923 rdev = md_import_device(dev,0, 90); 8924 mutex_lock(&detected_devices_mutex); 8925 if (IS_ERR(rdev)) 8926 continue; 8927 8928 if (test_bit(Faulty, &rdev->flags)) 8929 continue; 8930 8931 set_bit(AutoDetected, &rdev->flags); 8932 list_add(&rdev->same_set, &pending_raid_disks); 8933 i_passed++; 8934 } 8935 mutex_unlock(&detected_devices_mutex); 8936 8937 pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed); 8938 8939 autorun_devices(part); 8940 } 8941 8942 #endif /* !MODULE */ 8943 8944 static __exit void md_exit(void) 8945 { 8946 struct mddev *mddev; 8947 struct list_head *tmp; 8948 int delay = 1; 8949 8950 blk_unregister_region(MKDEV(MD_MAJOR,0), 512); 8951 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); 8952 8953 unregister_blkdev(MD_MAJOR,"md"); 8954 unregister_blkdev(mdp_major, "mdp"); 8955 unregister_reboot_notifier(&md_notifier); 8956 unregister_sysctl_table(raid_table_header); 8957 8958 /* We cannot unload the modules while some process is 8959 * waiting for us in select() or poll() - wake them up 8960 */ 8961 md_unloading = 1; 8962 while (waitqueue_active(&md_event_waiters)) { 8963 /* not safe to leave yet */ 8964 wake_up(&md_event_waiters); 8965 msleep(delay); 8966 delay += delay; 8967 } 8968 remove_proc_entry("mdstat", NULL); 8969 8970 for_each_mddev(mddev, tmp) { 8971 export_array(mddev); 8972 mddev->ctime = 0; 8973 mddev->hold_active = 0; 8974 /* 8975 * for_each_mddev() will call mddev_put() at the end of each 8976 * iteration. As the mddev is now fully clear, this will 8977 * schedule the mddev for destruction by a workqueue, and the 8978 * destroy_workqueue() below will wait for that to complete. 
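 * The same reference rule applies to any user of for_each_mddev(): the
 * iterator holds a reference on the mddev it hands out and drops it when
 * it advances, so a caller that breaks out of the loop early still owns
 * that reference and must drop it itself, for example (mddev_wanted() is
 * a made-up predicate):
 *
 *	struct mddev *found = NULL;
 *
 *	for_each_mddev(mddev, tmp)
 *		if (mddev_wanted(mddev)) {
 *			found = mddev;
 *			break;
 *		}
 *	if (found)
 *		mddev_put(found);
 *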
8979 */ 8980 } 8981 destroy_workqueue(md_misc_wq); 8982 destroy_workqueue(md_wq); 8983 } 8984 8985 subsys_initcall(md_init); 8986 module_exit(md_exit) 8987 8988 static int get_ro(char *buffer, struct kernel_param *kp) 8989 { 8990 return sprintf(buffer, "%d", start_readonly); 8991 } 8992 static int set_ro(const char *val, struct kernel_param *kp) 8993 { 8994 return kstrtouint(val, 10, (unsigned int *)&start_readonly); 8995 } 8996 8997 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); 8998 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); 8999 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); 9000 9001 MODULE_LICENSE("GPL"); 9002 MODULE_DESCRIPTION("MD RAID framework"); 9003 MODULE_ALIAS("md"); 9004 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); 9005
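/*
 * Usage notes for the module parameters above (a paraphrase for
 * readability, not taken from the original sources):
 *
 *   - start_ro=1 makes newly started arrays come up in 'auto-read-only'
 *     mode until the first write arrives;
 *   - start_dirty_degraded=1 allows starting dirty degraded arrays, which
 *     is occasionally needed to boot from a degraded raid456 but risks
 *     data corruption;
 *   - new_array is write-only and is used (for example by mdadm) to
 *     pre-create an array with a given name, such as "md_home".
 *
 * start_ro can also be flipped at run time through sysfs.  A minimal
 * user-space sketch, assuming the usual "md_mod" module name and root
 * privileges:
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		int fd = open("/sys/module/md_mod/parameters/start_ro", O_WRONLY);
 *
 *		if (fd < 0 || write(fd, "1", 1) != 1)
 *			return 1;
 *		close(fd);
 *		return 0;
 *	}
 */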