/*
   md.c : Multiple Devices driver for Linux
     Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/badblocks.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/module.h>
#include <linux/reboot.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/slab.h>
#include "md.h"
#include "bitmap.h"
#include "md-cluster.h"

#ifndef MODULE
static void autostart_arrays(int part);
#endif

/* pers_list is a list of registered personalities protected
 * by pers_lock.
 * pers_lock does extra service to protect accesses to
 * mddev->thread when the mutex cannot be held.
 */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

struct md_cluster_operations *md_cluster_ops;
EXPORT_SYMBOL(md_cluster_ops);
struct module *md_cluster_mod;
EXPORT_SYMBOL(md_cluster_mod);

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
static struct workqueue_struct *md_misc_wq;

static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);

/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
 * or /sys/block/mdX/md/sync_speed_{min,max}
 */

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static inline int speed_min(struct mddev *mddev)
{
	return mddev->sync_speed_min ?
		mddev->sync_speed_min : sysctl_speed_limit_min;
}

static inline int speed_max(struct mddev *mddev)
{
	return mddev->sync_speed_max ?
		mddev->sync_speed_max : sysctl_speed_limit_max;
}

static struct ctl_table_header *raid_table_header;

static struct ctl_table raid_table[] = {
	{
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table raid_dir_table[] = {
	{
		.procname	= "raid",
		.maxlen		= 0,
		.mode		= S_IRUGO|S_IXUGO,
		.child		= raid_table,
	},
	{ }
};

static struct ctl_table raid_root_table[] = {
	{
		.procname	= "dev",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_dir_table,
	},
	{ }
};

static const struct block_device_operations md_fops;

static int start_readonly;

/* bio_clone_mddev
 * like bio_clone, but with a local bio set
 */

struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
			    struct mddev *mddev)
{
	struct bio *b;

	if (!mddev || !mddev->bio_set)
		return bio_alloc(gfp_mask, nr_iovecs);

	b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set);
	if (!b)
		return NULL;
	return b;
}
EXPORT_SYMBOL_GPL(bio_alloc_mddev);

struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
			    struct mddev *mddev)
{
	if (!mddev || !mddev->bio_set)
		return bio_clone(bio, gfp_mask);

	return bio_clone_bioset(bio, gfp_mask, mddev->bio_set);
}
EXPORT_SYMBOL_GPL(bio_clone_mddev);

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
void md_new_event(struct mddev *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);

/*
 * Enables to iterate over all existing md arrays
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);

/*
 * iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while owning
 * a reference to the current mddev must mddev_put it.
 */
#define for_each_mddev(_mddev,_tmp)					\
									\
	for (({ spin_lock(&all_mddevs_lock);				\
		_tmp = all_mddevs.next;					\
		_mddev = NULL;});					\
	     ({ if (_tmp != &all_mddevs)				\
			mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
		spin_unlock(&all_mddevs_lock);				\
		if (_mddev) mddev_put(_mddev);				\
		_mddev = list_entry(_tmp, struct mddev, all_mddevs);	\
		_tmp != &all_mddevs;});					\
	     ({ spin_lock(&all_mddevs_lock);				\
		_tmp = _tmp->next;})					\
		)
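
/*
 * Illustrative sketch (not part of the driver): a typical use of the
 * for_each_mddev() iterator above.  The helpers named in the body are
 * hypothetical; the early-exit handling is what the comment above
 * requires - a caller that breaks out still owns a reference and must
 * drop it with mddev_put().
 *
 *	struct mddev *mddev;
 *	struct list_head *tmp;
 *
 *	for_each_mddev(mddev, tmp) {
 *		if (mddev_wants_attention(mddev)) {	// hypothetical test
 *			do_something(mddev);		// hypothetical helper
 *			mddev_put(mddev);		// breaking out: drop the ref
 *			break;
 *		}
 *	}
 */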

/* Rather than calling directly into the personality make_request function,
 * IO requests come here first so that we can check if the device is
 * being suspended pending a reconfiguration.
 * We hold a refcount over the call to ->make_request.  By the time that
 * call has finished, the bio has been linked into some internal structure
 * and so is visible to ->quiesce(), so we don't need the refcount any more.
 */
static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
{
	const int rw = bio_data_dir(bio);
	struct mddev *mddev = q->queuedata;
	unsigned int sectors;
	int cpu;

	blk_queue_split(q, &bio, q->bio_split);

	if (mddev == NULL || mddev->pers == NULL) {
		bio_io_error(bio);
		return BLK_QC_T_NONE;
	}
	if (mddev->ro == 1 && unlikely(rw == WRITE)) {
		if (bio_sectors(bio) != 0)
			bio->bi_error = -EROFS;
		bio_endio(bio);
		return BLK_QC_T_NONE;
	}
	smp_rmb(); /* Ensure implications of 'active' are visible */
	rcu_read_lock();
	if (mddev->suspended) {
		DEFINE_WAIT(__wait);
		for (;;) {
			prepare_to_wait(&mddev->sb_wait, &__wait,
					TASK_UNINTERRUPTIBLE);
			if (!mddev->suspended)
				break;
			rcu_read_unlock();
			schedule();
			rcu_read_lock();
		}
		finish_wait(&mddev->sb_wait, &__wait);
	}
	atomic_inc(&mddev->active_io);
	rcu_read_unlock();

	/*
	 * save the sectors now since our bio can
	 * go away inside make_request
	 */
	sectors = bio_sectors(bio);
	mddev->pers->make_request(mddev, bio);

	cpu = part_stat_lock();
	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
	part_stat_unlock();

	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
		wake_up(&mddev->sb_wait);

	return BLK_QC_T_NONE;
}

/* mddev_suspend makes sure no new requests are submitted
 * to the device, and that any requests that have been submitted
 * are completely handled.
 * Once mddev_detach() is called and completes, the module will be
 * completely unused.
 */
void mddev_suspend(struct mddev *mddev)
{
	WARN_ON_ONCE(current == mddev->thread->tsk);
	if (mddev->suspended++)
		return;
	synchronize_rcu();
	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
	mddev->pers->quiesce(mddev, 1);

	del_timer_sync(&mddev->safemode_timer);
}
EXPORT_SYMBOL_GPL(mddev_suspend);

void mddev_resume(struct mddev *mddev)
{
	if (--mddev->suspended)
		return;
	wake_up(&mddev->sb_wait);
	mddev->pers->quiesce(mddev, 0);

	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
}
EXPORT_SYMBOL_GPL(mddev_resume);

int mddev_congested(struct mddev *mddev, int bits)
{
	struct md_personality *pers = mddev->pers;
	int ret = 0;

	rcu_read_lock();
	if (mddev->suspended)
		ret = 1;
	else if (pers && pers->congested)
		ret = pers->congested(mddev, bits);
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(mddev_congested);
static int md_congested(void *data, int bits)
{
	struct mddev *mddev = data;
	return mddev_congested(mddev, bits);
}

/*
 * Generic flush handling for md
 */

static void md_end_flush(struct bio *bio)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	rdev_dec_pending(rdev, mddev);

	if (atomic_dec_and_test(&mddev->flush_pending)) {
		/* The pre-request flush has finished */
		queue_work(md_wq, &mddev->flush_work);
	}
	bio_put(bio);
}

static void md_submit_flush_data(struct work_struct *ws);

static void submit_flushes(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct md_rdev *rdev;

	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
	atomic_set(&mddev->flush_pending, 1);
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev)
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Faulty, &rdev->flags)) {
			/* Take two references, one is dropped
			 * when the request finishes, one after
			 * we re-acquire rcu_read_lock
			 */
			struct bio *bi;
			atomic_inc(&rdev->nr_pending);
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
			bi->bi_end_io = md_end_flush;
			bi->bi_private = rdev;
			bi->bi_bdev = rdev->bdev;
			atomic_inc(&mddev->flush_pending);
			submit_bio(WRITE_FLUSH, bi);
			rcu_read_lock();
			rdev_dec_pending(rdev, mddev);
		}
	rcu_read_unlock();
	if (atomic_dec_and_test(&mddev->flush_pending))
		queue_work(md_wq, &mddev->flush_work);
}

static void md_submit_flush_data(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct bio *bio = mddev->flush_bio;

	if (bio->bi_iter.bi_size == 0)
		/* an empty barrier - all done */
		bio_endio(bio);
	else {
		bio->bi_rw &= ~REQ_FLUSH;
		mddev->pers->make_request(mddev, bio);
	}

	mddev->flush_bio = NULL;
	wake_up(&mddev->sb_wait);
}

void md_flush_request(struct mddev *mddev, struct bio *bio)
{
	spin_lock_irq(&mddev->lock);
	wait_event_lock_irq(mddev->sb_wait,
			    !mddev->flush_bio,
			    mddev->lock);
	mddev->flush_bio = bio;
	spin_unlock_irq(&mddev->lock);

	INIT_WORK(&mddev->flush_work, submit_flushes);
	queue_work(md_wq, &mddev->flush_work);
}
EXPORT_SYMBOL(md_flush_request);
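
/*
 * Illustrative sketch (not part of the driver): how a personality's
 * make_request method typically hands flush requests back to the md
 * core.  "example_make_request" is a hypothetical function name; the
 * REQ_FLUSH test mirrors the flag that md_submit_flush_data() clears
 * above before resubmitting the data part of the bio.
 *
 *	static void example_make_request(struct mddev *mddev, struct bio *bio)
 *	{
 *		if (unlikely(bio->bi_rw & REQ_FLUSH)) {
 *			md_flush_request(mddev, bio);
 *			return;
 *		}
 *		// ... normal striping/mirroring of the bio follows ...
 *	}
 */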

void md_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct mddev *mddev = cb->data;
	md_wakeup_thread(mddev->thread);
	kfree(cb);
}
EXPORT_SYMBOL(md_unplug);

static inline struct mddev *mddev_get(struct mddev *mddev)
{
	atomic_inc(&mddev->active);
	return mddev;
}

static void mddev_delayed_delete(struct work_struct *ws);

static void mddev_put(struct mddev *mddev)
{
	struct bio_set *bs = NULL;

	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
		return;
	if (!mddev->raid_disks && list_empty(&mddev->disks) &&
	    mddev->ctime == 0 && !mddev->hold_active) {
		/* Array is not configured at all, and not held active,
		 * so destroy it */
		list_del_init(&mddev->all_mddevs);
		bs = mddev->bio_set;
		mddev->bio_set = NULL;
		if (mddev->gendisk) {
			/* We did a probe so need to clean up.  Call
			 * queue_work inside the spinlock so that
			 * flush_workqueue() after mddev_find will
			 * succeed in waiting for the work to be done.
			 */
			INIT_WORK(&mddev->del_work, mddev_delayed_delete);
			queue_work(md_misc_wq, &mddev->del_work);
		} else
			kfree(mddev);
	}
	spin_unlock(&all_mddevs_lock);
	if (bs)
		bioset_free(bs);
}

static void md_safemode_timeout(unsigned long data);

void mddev_init(struct mddev *mddev)
{
	mutex_init(&mddev->open_mutex);
	mutex_init(&mddev->reconfig_mutex);
	mutex_init(&mddev->bitmap_info.mutex);
	INIT_LIST_HEAD(&mddev->disks);
	INIT_LIST_HEAD(&mddev->all_mddevs);
	setup_timer(&mddev->safemode_timer, md_safemode_timeout,
		    (unsigned long) mddev);
	atomic_set(&mddev->active, 1);
	atomic_set(&mddev->openers, 0);
	atomic_set(&mddev->active_io, 0);
	spin_lock_init(&mddev->lock);
	atomic_set(&mddev->flush_pending, 0);
	init_waitqueue_head(&mddev->sb_wait);
	init_waitqueue_head(&mddev->recovery_wait);
	mddev->reshape_position = MaxSector;
	mddev->reshape_backwards = 0;
	mddev->last_sync_action = "none";
	mddev->resync_min = 0;
	mddev->resync_max = MaxSector;
	mddev->level = LEVEL_NONE;
}
EXPORT_SYMBOL_GPL(mddev_init);

static struct mddev *mddev_find(dev_t unit)
{
	struct mddev *mddev, *new = NULL;

	if (unit && MAJOR(unit) != MD_MAJOR)
		unit &= ~((1<<MdpMinorShift)-1);

 retry:
	spin_lock(&all_mddevs_lock);

	if (unit) {
		list_for_each_entry(mddev, &all_mddevs, all_mddevs)
			if (mddev->unit == unit) {
				mddev_get(mddev);
				spin_unlock(&all_mddevs_lock);
				kfree(new);
				return mddev;
			}

		if (new) {
			list_add(&new->all_mddevs, &all_mddevs);
			spin_unlock(&all_mddevs_lock);
			new->hold_active = UNTIL_IOCTL;
			return new;
		}
	} else if (new) {
		/* find an unused unit number */
		static int next_minor = 512;
		int start = next_minor;
		int is_free = 0;
		int dev = 0;
		while (!is_free) {
			dev = MKDEV(MD_MAJOR, next_minor);
			next_minor++;
			if (next_minor > MINORMASK)
				next_minor = 0;
			if (next_minor == start) {
				/* Oh dear, all in use. */
				spin_unlock(&all_mddevs_lock);
				kfree(new);
				return NULL;
			}

			is_free = 1;
			list_for_each_entry(mddev, &all_mddevs, all_mddevs)
				if (mddev->unit == dev) {
					is_free = 0;
					break;
				}
		}
		new->unit = dev;
		new->md_minor = MINOR(dev);
		new->hold_active = UNTIL_STOP;
		list_add(&new->all_mddevs, &all_mddevs);
		spin_unlock(&all_mddevs_lock);
		return new;
	}
	spin_unlock(&all_mddevs_lock);

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return NULL;

	new->unit = unit;
	if (MAJOR(unit) == MD_MAJOR)
		new->md_minor = MINOR(unit);
	else
		new->md_minor = MINOR(unit) >> MdpMinorShift;

	mddev_init(new);

	goto retry;
}

static struct attribute_group md_redundancy_group;

void mddev_unlock(struct mddev *mddev)
{
	if (mddev->to_remove) {
		/* These cannot be removed under reconfig_mutex as
		 * an access to the files will try to take reconfig_mutex
		 * while holding the file unremovable, which leads to
		 * a deadlock.
		 * So keep sysfs_active set while the removal is happening,
		 * and anything else which might set ->to_remove or
		 * otherwise change the sysfs namespace will fail with
		 * -EBUSY if sysfs_active is still set.
		 * We set sysfs_active under reconfig_mutex and elsewhere
		 * test it under the same mutex to ensure its correct value
		 * is seen.
		 */
		struct attribute_group *to_remove = mddev->to_remove;
		mddev->to_remove = NULL;
		mddev->sysfs_active = 1;
		mutex_unlock(&mddev->reconfig_mutex);

		if (mddev->kobj.sd) {
			if (to_remove != &md_redundancy_group)
				sysfs_remove_group(&mddev->kobj, to_remove);
			if (mddev->pers == NULL ||
			    mddev->pers->sync_request == NULL) {
				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
				if (mddev->sysfs_action)
					sysfs_put(mddev->sysfs_action);
				mddev->sysfs_action = NULL;
			}
		}
		mddev->sysfs_active = 0;
	} else
		mutex_unlock(&mddev->reconfig_mutex);

	/* As we've dropped the mutex we need a spinlock to
	 * make sure the thread doesn't disappear
	 */
	spin_lock(&pers_lock);
	md_wakeup_thread(mddev->thread);
	spin_unlock(&pers_lock);
}
EXPORT_SYMBOL_GPL(mddev_unlock);

struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
{
	struct md_rdev *rdev;

	rdev_for_each_rcu(rdev, mddev)
		if (rdev->desc_nr == nr)
			return rdev;

	return NULL;
}
EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);

static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev)
		if (rdev->bdev->bd_dev == dev)
			return rdev;

	return NULL;
}

static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;

	rdev_for_each_rcu(rdev, mddev)
		if (rdev->bdev->bd_dev == dev)
			return rdev;

	return NULL;
}

static struct md_personality *find_pers(int level, char *clevel)
{
	struct md_personality *pers;
	list_for_each_entry(pers, &pers_list, list) {
		if (level != LEVEL_NONE && pers->level == level)
			return pers;
		if (strcmp(pers->name, clevel)==0)
			return pers;
	}
	return NULL;
}

/* return the offset of the super block in 512byte sectors */
static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
{
	sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
	return MD_NEW_SIZE_SECTORS(num_sectors);
}

static int alloc_disk_sb(struct md_rdev *rdev)
{
	rdev->sb_page = alloc_page(GFP_KERNEL);
	if (!rdev->sb_page) {
		printk(KERN_ALERT "md: out of memory.\n");
		return -ENOMEM;
	}

	return 0;
}

void md_rdev_clear(struct md_rdev *rdev)
{
	if (rdev->sb_page) {
		put_page(rdev->sb_page);
		rdev->sb_loaded = 0;
		rdev->sb_page = NULL;
		rdev->sb_start = 0;
		rdev->sectors = 0;
	}
	if (rdev->bb_page) {
		put_page(rdev->bb_page);
		rdev->bb_page = NULL;
	}
	badblocks_exit(&rdev->badblocks);
}
EXPORT_SYMBOL_GPL(md_rdev_clear);

static void super_written(struct bio *bio)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	if (bio->bi_error) {
		printk("md: super_written gets error=%d\n", bio->bi_error);
		md_error(mddev, rdev);
	}

	if (atomic_dec_and_test(&mddev->pending_writes))
		wake_up(&mddev->sb_wait);
	rdev_dec_pending(rdev, mddev);
	bio_put(bio);
}

void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
		    sector_t sector, int size, struct page *page)
{
	/* write first size bytes of page to sector of rdev
	 * Increment mddev->pending_writes before returning
	 * and decrement it on completion, waking up sb_wait
	 * if zero is reached.
	 * If an error occurred, call md_error
	 */
	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);

	atomic_inc(&rdev->nr_pending);

	bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
	bio->bi_iter.bi_sector = sector;
	bio_add_page(bio, page, size, 0);
	bio->bi_private = rdev;
	bio->bi_end_io = super_written;

	atomic_inc(&mddev->pending_writes);
	submit_bio(WRITE_FLUSH_FUA, bio);
}

void md_super_wait(struct mddev *mddev)
{
	/* wait for all superblock writes that were scheduled to complete */
	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
}
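
/*
 * Illustrative sketch (not part of the driver): the usual pairing of
 * md_super_write() and md_super_wait() when a caller needs the metadata
 * on stable storage before proceeding, as the rdev_size_change methods
 * further down do.  'rdev' is assumed to carry a valid, up-to-date
 * sb_page.
 *
 *	md_super_write(rdev->mddev, rdev, rdev->sb_start,
 *		       rdev->sb_size, rdev->sb_page);
 *	md_super_wait(rdev->mddev);	// block until pending_writes drops to 0
 */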

int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
		 struct page *page, int rw, bool metadata_op)
{
	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
	int ret;

	bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
		rdev->meta_bdev : rdev->bdev;
	if (metadata_op)
		bio->bi_iter.bi_sector = sector + rdev->sb_start;
	else if (rdev->mddev->reshape_position != MaxSector &&
		 (rdev->mddev->reshape_backwards ==
		  (sector >= rdev->mddev->reshape_position)))
		bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
	else
		bio->bi_iter.bi_sector = sector + rdev->data_offset;
	bio_add_page(bio, page, size, 0);
	submit_bio_wait(rw, bio);

	ret = !bio->bi_error;
	bio_put(bio);
	return ret;
}
EXPORT_SYMBOL_GPL(sync_page_io);

static int read_disk_sb(struct md_rdev *rdev, int size)
{
	char b[BDEVNAME_SIZE];

	if (rdev->sb_loaded)
		return 0;

	if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true))
		goto fail;
	rdev->sb_loaded = 1;
	return 0;

fail:
	printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
	       bdevname(rdev->bdev,b));
	return -EINVAL;
}

static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	return	sb1->set_uuid0 == sb2->set_uuid0 &&
		sb1->set_uuid1 == sb2->set_uuid1 &&
		sb1->set_uuid2 == sb2->set_uuid2 &&
		sb1->set_uuid3 == sb2->set_uuid3;
}

static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	int ret;
	mdp_super_t *tmp1, *tmp2;

	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);

	if (!tmp1 || !tmp2) {
		ret = 0;
		printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
		goto abort;
	}

	*tmp1 = *sb1;
	*tmp2 = *sb2;

	/*
	 * nr_disks is not constant
	 */
	tmp1->nr_disks = 0;
	tmp2->nr_disks = 0;

	ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
abort:
	kfree(tmp1);
	kfree(tmp2);
	return ret;
}

static u32 md_csum_fold(u32 csum)
{
	csum = (csum & 0xffff) + (csum >> 16);
	return (csum & 0xffff) + (csum >> 16);
}

static unsigned int calc_sb_csum(mdp_super_t *sb)
{
	u64 newcsum = 0;
	u32 *sb32 = (u32*)sb;
	int i;
	unsigned int disk_csum, csum;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;

	for (i = 0; i < MD_SB_BYTES/4 ; i++)
		newcsum += sb32[i];
	csum = (newcsum & 0xffffffff) + (newcsum>>32);

#ifdef CONFIG_ALPHA
	/* This used to use csum_partial, which was wrong for several
	 * reasons including that different results are returned on
	 * different architectures.  It isn't critical that we get exactly
	 * the same return value as before (we always csum_fold before
	 * testing, and that removes any differences).  However as we
	 * know that csum_partial always returned a 16bit value on
	 * alphas, do a fold to maximise conformity to previous behaviour.
	 */
	sb->sb_csum = md_csum_fold(disk_csum);
#else
	sb->sb_csum = disk_csum;
#endif
	return csum;
}

/*
 * Handle superblock details.
 * We want to be able to handle multiple superblock formats
 * so we have a common interface to them all, and an array of
 * different handlers.
 * We rely on user-space to write the initial superblock, and support
 * reading and updating of superblocks.
 * Interface methods are:
 *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
 *      loads and validates a superblock on dev.
 *      if refdev != NULL, compare superblocks on both devices
 *    Return:
 *      0 - dev has a superblock that is compatible with refdev
 *      1 - dev has a superblock that is compatible and newer than refdev
 *          so dev should be used as the refdev in future
 *     -EINVAL superblock incompatible or invalid
 *     -othererror e.g. -EIO
 *
 *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
 *      Verify that dev is acceptable into mddev.
 *       The first time, mddev->raid_disks will be 0, and data from
 *       dev should be merged in.  Subsequent calls check that dev
 *       is new enough.  Return 0 or -EINVAL
 *
 *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
 *     Update the superblock for rdev with data in mddev
 *     This does not write to disc.
 *
 */

struct super_type  {
	char		    *name;
	struct module	    *owner;
	int		    (*load_super)(struct md_rdev *rdev,
					  struct md_rdev *refdev,
					  int minor_version);
	int		    (*validate_super)(struct mddev *mddev,
					      struct md_rdev *rdev);
	void		    (*sync_super)(struct mddev *mddev,
					  struct md_rdev *rdev);
	unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
						sector_t num_sectors);
	int		    (*allow_new_offset)(struct md_rdev *rdev,
						unsigned long long new_offset);
};

/*
 * Check that the given mddev has no bitmap.
 *
 * This function is called from the run method of all personalities that do not
 * support bitmaps. It prints an error message and returns non-zero if mddev
 * has a bitmap. Otherwise, it returns 0.
 *
 */
int md_check_no_bitmap(struct mddev *mddev)
{
	if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
		return 0;
	printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
	       mdname(mddev), mddev->pers->name);
	return 1;
}
EXPORT_SYMBOL(md_check_no_bitmap);
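
/*
 * Illustrative sketch (not part of the driver): a personality that does
 * not support bitmaps typically rejects the array from its run method,
 * as the comment above describes.  "example_run" is a hypothetical
 * function name.
 *
 *	static int example_run(struct mddev *mddev)
 *	{
 *		if (md_check_no_bitmap(mddev))
 *			return -EINVAL;
 *		// ... personality-specific setup follows ...
 *		return 0;
 *	}
 */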

/*
 * load_super for 0.90.0
 */
static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	mdp_super_t *sb;
	int ret;

	/*
	 * Calculate the position of the superblock (512byte sectors),
	 * it's at the end of the disk.
	 *
	 * It also happens to be a multiple of 4Kb.
	 */
	rdev->sb_start = calc_dev_sboffset(rdev);

	ret = read_disk_sb(rdev, MD_SB_BYTES);
	if (ret) return ret;

	ret = -EINVAL;

	bdevname(rdev->bdev, b);
	sb = page_address(rdev->sb_page);

	if (sb->md_magic != MD_SB_MAGIC) {
		printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
		       b);
		goto abort;
	}

	if (sb->major_version != 0 ||
	    sb->minor_version < 90 ||
	    sb->minor_version > 91) {
		printk(KERN_WARNING "Bad version number %d.%d on %s\n",
			sb->major_version, sb->minor_version,
			b);
		goto abort;
	}

	if (sb->raid_disks <= 0)
		goto abort;

	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
		printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
			b);
		goto abort;
	}

	rdev->preferred_minor = sb->md_minor;
	rdev->data_offset = 0;
	rdev->new_data_offset = 0;
	rdev->sb_size = MD_SB_BYTES;
	rdev->badblocks.shift = -1;

	if (sb->level == LEVEL_MULTIPATH)
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = sb->this_disk.number;

	if (!refdev) {
		ret = 1;
	} else {
		__u64 ev1, ev2;
		mdp_super_t *refsb = page_address(refdev->sb_page);
		if (!uuid_equal(refsb, sb)) {
			printk(KERN_WARNING "md: %s has different UUID to %s\n",
				b, bdevname(refdev->bdev,b2));
			goto abort;
		}
		if (!sb_equal(refsb, sb)) {
			printk(KERN_WARNING "md: %s has same UUID"
			       " but different superblock to %s\n",
			       b, bdevname(refdev->bdev, b2));
			goto abort;
		}
		ev1 = md_event(sb);
		ev2 = md_event(refsb);
		if (ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	rdev->sectors = rdev->sb_start;
	/* Limit to 4TB as metadata cannot record more than that.
	 * (not needed for Linear and RAID0 as metadata doesn't
	 * record this size)
	 */
	if (IS_ENABLED(CONFIG_LBDAF) && (u64)rdev->sectors >= (2ULL << 32) &&
	    sb->level >= 1)
		rdev->sectors = (sector_t)(2ULL << 32) - 2;

	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
		/* "this cannot possibly happen" ... */
		ret = -EINVAL;

 abort:
	return ret;
}

/*
 * validate_super for 0.90.0
 */
static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
{
	mdp_disk_t *desc;
	mdp_super_t *sb = page_address(rdev->sb_page);
	__u64 ev1 = md_event(sb);

	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(Bitmap_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);

	if (mddev->raid_disks == 0) {
		mddev->major_version = 0;
		mddev->minor_version = sb->minor_version;
		mddev->patch_version = sb->patch_version;
		mddev->external = 0;
		mddev->chunk_sectors = sb->chunk_size >> 9;
		mddev->ctime = sb->ctime;
		mddev->utime = sb->utime;
		mddev->level = sb->level;
		mddev->clevel[0] = 0;
		mddev->layout = sb->layout;
		mddev->raid_disks = sb->raid_disks;
		mddev->dev_sectors = ((sector_t)sb->size) * 2;
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
		/* bitmap can use 60 K after the 4K superblocks */
		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
		mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
		mddev->reshape_backwards = 0;

		if (mddev->minor_version >= 91) {
			mddev->reshape_position = sb->reshape_position;
			mddev->delta_disks = sb->delta_disks;
			mddev->new_level = sb->new_level;
			mddev->new_layout = sb->new_layout;
			mddev->new_chunk_sectors = sb->new_chunk >> 9;
			if (mddev->delta_disks < 0)
				mddev->reshape_backwards = 1;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}

		if (sb->state & (1<<MD_SB_CLEAN))
			mddev->recovery_cp = MaxSector;
		else {
			if (sb->events_hi == sb->cp_events_hi &&
			    sb->events_lo == sb->cp_events_lo) {
				mddev->recovery_cp = sb->recovery_cp;
			} else
				mddev->recovery_cp = 0;
		}

		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);

		mddev->max_disks = MD_SB_DISKS;

		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
		    mddev->bitmap_info.file == NULL) {
			mddev->bitmap_info.offset =
				mddev->bitmap_info.default_offset;
			mddev->bitmap_info.space =
				mddev->bitmap_info.default_space;
		}

	} else if (mddev->pers == NULL) {
		/* Insist on good event counter while assembling, except
		 * for spares (which don't need an event count) */
		++ev1;
		if (sb->disks[rdev->desc_nr].state & (
			(1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
			if (ev1 < mddev->events)
				return -EINVAL;
	} else if (mddev->bitmap) {
		/* if adding to array with a bitmap, then we can accept an
		 * older device ... but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}

	if (mddev->level != LEVEL_MULTIPATH) {
		desc = sb->disks + rdev->desc_nr;

		if (desc->state & (1<<MD_DISK_FAULTY))
			set_bit(Faulty, &rdev->flags);
		else if (desc->state & (1<<MD_DISK_SYNC) /* &&
			    desc->raid_disk < mddev->raid_disks */) {
			set_bit(In_sync, &rdev->flags);
			rdev->raid_disk = desc->raid_disk;
			rdev->saved_raid_disk = desc->raid_disk;
		} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
			/* active but not in sync implies recovery up to
			 * reshape position.  We don't know exactly where
			 * that is, so set to zero for now */
			if (mddev->minor_version >= 91) {
				rdev->recovery_offset = 0;
				rdev->raid_disk = desc->raid_disk;
			}
		}
		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
	} else /* MULTIPATH are always insync */
		set_bit(In_sync, &rdev->flags);
	return 0;
}

/*
 * sync_super for 0.90.0
 */
static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
{
	mdp_super_t *sb;
	struct md_rdev *rdev2;
	int next_spare = mddev->raid_disks;

	/* make rdev->sb match mddev data..
	 *
	 * 1/ zero out disks
	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
	 * 3/ any empty disks < next_spare become removed
	 *
	 * disks[0] gets initialised to REMOVED because
	 * we cannot be sure from other fields if it has
	 * been initialised or not.
	 */
	int i;
	int active=0, working=0,failed=0,spare=0,nr_disks=0;

	rdev->sb_size = MD_SB_BYTES;

	sb = page_address(rdev->sb_page);

	memset(sb, 0, sizeof(*sb));

	sb->md_magic = MD_SB_MAGIC;
	sb->major_version = mddev->major_version;
	sb->patch_version = mddev->patch_version;
	sb->gvalid_words  = 0; /* ignored */
	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
	memcpy(&sb->set_uuid3, mddev->uuid+12,4);

	sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
	sb->level = mddev->level;
	sb->size = mddev->dev_sectors / 2;
	sb->raid_disks = mddev->raid_disks;
	sb->md_minor = mddev->md_minor;
	sb->not_persistent = 0;
	sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
	sb->state = 0;
	sb->events_hi = (mddev->events>>32);
	sb->events_lo = (u32)mddev->events;

	if (mddev->reshape_position == MaxSector)
		sb->minor_version = 90;
	else {
		sb->minor_version = 91;
		sb->reshape_position = mddev->reshape_position;
		sb->new_level = mddev->new_level;
		sb->delta_disks = mddev->delta_disks;
		sb->new_layout = mddev->new_layout;
		sb->new_chunk = mddev->new_chunk_sectors << 9;
	}
	mddev->minor_version = sb->minor_version;
	if (mddev->in_sync)
	{
		sb->recovery_cp = mddev->recovery_cp;
		sb->cp_events_hi = (mddev->events>>32);
		sb->cp_events_lo = (u32)mddev->events;
		if (mddev->recovery_cp == MaxSector)
			sb->state = (1<< MD_SB_CLEAN);
	} else
		sb->recovery_cp = 0;

	sb->layout = mddev->layout;
	sb->chunk_size = mddev->chunk_sectors << 9;
	if (mddev->bitmap && mddev->bitmap_info.file == NULL)
		sb->state |= (1<<MD_SB_BITMAP_PRESENT);

	sb->disks[0].state = (1<<MD_DISK_REMOVED);
	rdev_for_each(rdev2, mddev) {
		mdp_disk_t *d;
		int desc_nr;
		int is_active = test_bit(In_sync, &rdev2->flags);

		if (rdev2->raid_disk >= 0 &&
		    sb->minor_version >= 91)
			/* we have nowhere to store the recovery_offset,
			 * but if it is not below the reshape_position,
			 * we can piggy-back on that.
			 */
			is_active = 1;
		if (rdev2->raid_disk < 0 ||
		    test_bit(Faulty, &rdev2->flags))
			is_active = 0;
		if (is_active)
			desc_nr = rdev2->raid_disk;
		else
			desc_nr = next_spare++;
		rdev2->desc_nr = desc_nr;
		d = &sb->disks[rdev2->desc_nr];
		nr_disks++;
		d->number = rdev2->desc_nr;
		d->major = MAJOR(rdev2->bdev->bd_dev);
		d->minor = MINOR(rdev2->bdev->bd_dev);
		if (is_active)
			d->raid_disk = rdev2->raid_disk;
		else
			d->raid_disk = rdev2->desc_nr; /* compatibility */
		if (test_bit(Faulty, &rdev2->flags))
			d->state = (1<<MD_DISK_FAULTY);
		else if (is_active) {
			d->state = (1<<MD_DISK_ACTIVE);
			if (test_bit(In_sync, &rdev2->flags))
				d->state |= (1<<MD_DISK_SYNC);
			active++;
			working++;
		} else {
			d->state = 0;
			spare++;
			working++;
		}
		if (test_bit(WriteMostly, &rdev2->flags))
			d->state |= (1<<MD_DISK_WRITEMOSTLY);
	}
	/* now set the "removed" and "faulty" bits on any missing devices */
	for (i=0 ; i < mddev->raid_disks ; i++) {
		mdp_disk_t *d = &sb->disks[i];
		if (d->state == 0 && d->number == 0) {
			d->number = i;
			d->raid_disk = i;
			d->state = (1<<MD_DISK_REMOVED);
			d->state |= (1<<MD_DISK_FAULTY);
			failed++;
		}
	}
	sb->nr_disks = nr_disks;
	sb->active_disks = active;
	sb->working_disks = working;
	sb->failed_disks = failed;
	sb->spare_disks = spare;

	sb->this_disk = sb->disks[rdev->desc_nr];
	sb->sb_csum = calc_sb_csum(sb);
}
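
/*
 * Worked arithmetic (illustrative): the v0.90 superblock records sizes
 * in 32-bit KB units, so the largest component it can describe is
 *	2^32 KB = 2^42 bytes = 4 TiB = 2 * 2^32 sectors of 512 bytes,
 * which is why super_90_load() above and super_90_rdev_size_change()
 * below clamp the usable size to (2ULL << 32) - 2 sectors.
 */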

/*
 * rdev_size_change for 0.90.0
 */
static unsigned long long
super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
{
	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
		return 0; /* component must fit device */
	if (rdev->mddev->bitmap_info.offset)
		return 0; /* can't move bitmap */
	rdev->sb_start = calc_dev_sboffset(rdev);
	if (!num_sectors || num_sectors > rdev->sb_start)
		num_sectors = rdev->sb_start;
	/* Limit to 4TB as metadata cannot record more than that.
	 * 4TB == 2^32 KB, or 2*2^32 sectors.
	 */
	if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) &&
	    rdev->mddev->level >= 1)
		num_sectors = (sector_t)(2ULL << 32) - 2;
	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
		       rdev->sb_page);
	md_super_wait(rdev->mddev);
	return num_sectors;
}

static int
super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
{
	/* non-zero offset changes not possible with v0.90 */
	return new_offset == 0;
}

/*
 * version 1 superblock
 */

static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
{
	__le32 disk_csum;
	u32 csum;
	unsigned long long newcsum;
	int size = 256 + le32_to_cpu(sb->max_dev)*2;
	__le32 *isuper = (__le32*)sb;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
	newcsum = 0;
	for (; size >= 4; size -= 4)
		newcsum += le32_to_cpu(*isuper++);

	if (size == 2)
		newcsum += le16_to_cpu(*(__le16*) isuper);

	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
	sb->sb_csum = disk_csum;
	return cpu_to_le32(csum);
}

static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	struct mdp_superblock_1 *sb;
	int ret;
	sector_t sb_start;
	sector_t sectors;
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	int bmask;

	/*
	 * Calculate the position of the superblock in 512byte sectors.
	 * It is always aligned to a 4K boundary and
	 * depending on minor_version, it can be:
	 * 0: At least 8K, but less than 12K, from end of device
	 * 1: At start of device
	 * 2: 4K from start of device.
	 */
	switch(minor_version) {
	case 0:
		sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
		sb_start -= 8*2;
		sb_start &= ~(sector_t)(4*2-1);
		break;
	case 1:
		sb_start = 0;
		break;
	case 2:
		sb_start = 8;
		break;
	default:
		return -EINVAL;
	}
	rdev->sb_start = sb_start;

	/* superblock is rarely larger than 1K, but it can be larger,
	 * and it is safe to read 4k, so we do that
	 */
	ret = read_disk_sb(rdev, 4096);
	if (ret) return ret;

	sb = page_address(rdev->sb_page);

	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
	    sb->major_version != cpu_to_le32(1) ||
	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
		return -EINVAL;

	if (calc_sb_1_csum(sb) != sb->sb_csum) {
		printk("md: invalid superblock checksum on %s\n",
			bdevname(rdev->bdev,b));
		return -EINVAL;
	}
	if (le64_to_cpu(sb->data_size) < 10) {
		printk("md: data_size too small on %s\n",
		       bdevname(rdev->bdev,b));
		return -EINVAL;
	}
	if (sb->pad0 ||
	    sb->pad3[0] ||
	    memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
		/* Some padding is non-zero, might be a new feature */
		return -EINVAL;

	rdev->preferred_minor = 0xffff;
	rdev->data_offset = le64_to_cpu(sb->data_offset);
	rdev->new_data_offset = rdev->data_offset;
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
		rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
	if (rdev->sb_size & bmask)
		rdev->sb_size = (rdev->sb_size | bmask) + 1;

	if (minor_version
	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;
	if (minor_version
	    && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;

	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = le32_to_cpu(sb->dev_number);

	if (!rdev->bb_page) {
		rdev->bb_page = alloc_page(GFP_KERNEL);
		if (!rdev->bb_page)
			return -ENOMEM;
	}
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
	    rdev->badblocks.count == 0) {
		/* need to load the bad block list.
		 * Currently we limit it to one page.
		 */
		s32 offset;
		sector_t bb_sector;
		u64 *bbp;
		int i;
		int sectors = le16_to_cpu(sb->bblog_size);
		if (sectors > (PAGE_SIZE / 512))
			return -EINVAL;
		offset = le32_to_cpu(sb->bblog_offset);
		if (offset == 0)
			return -EINVAL;
		bb_sector = (long long)offset;
		if (!sync_page_io(rdev, bb_sector, sectors << 9,
				  rdev->bb_page, READ, true))
			return -EIO;
		bbp = (u64 *)page_address(rdev->bb_page);
		rdev->badblocks.shift = sb->bblog_shift;
		for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
			u64 bb = le64_to_cpu(*bbp);
			int count = bb & (0x3ff);
			u64 sector = bb >> 10;
			sector <<= sb->bblog_shift;
			count <<= sb->bblog_shift;
			if (bb + 1 == 0)
				break;
			if (badblocks_set(&rdev->badblocks, sector, count, 1))
				return -EINVAL;
		}
	} else if (sb->bblog_offset != 0)
		rdev->badblocks.shift = 0;

	if (!refdev) {
		ret = 1;
	} else {
		__u64 ev1, ev2;
		struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);

		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
		    sb->level != refsb->level ||
		    sb->layout != refsb->layout ||
		    sb->chunksize != refsb->chunksize) {
			printk(KERN_WARNING "md: %s has strangely different"
				" superblock to %s\n",
				bdevname(rdev->bdev,b),
				bdevname(refdev->bdev,b2));
			return -EINVAL;
		}
		ev1 = le64_to_cpu(sb->events);
		ev2 = le64_to_cpu(refsb->events);

		if (ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	if (minor_version) {
		sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
		sectors -= rdev->data_offset;
	} else
		sectors = rdev->sb_start;
	if (sectors < le64_to_cpu(sb->data_size))
		return -EINVAL;
	rdev->sectors = le64_to_cpu(sb->data_size);
	return ret;
}

static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	__u64 ev1 = le64_to_cpu(sb->events);

	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(Bitmap_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);

	if (mddev->raid_disks == 0) {
		mddev->major_version = 1;
		mddev->patch_version = 0;
		mddev->external = 0;
		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
		mddev->ctime = le64_to_cpu(sb->ctime);
		mddev->utime = le64_to_cpu(sb->utime);
		mddev->level = le32_to_cpu(sb->level);
		mddev->clevel[0] = 0;
		mddev->layout = le32_to_cpu(sb->layout);
		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
		mddev->dev_sectors = le64_to_cpu(sb->size);
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
		/* Default location for bitmap is 1K after superblock
		 * using 3K - total of 4K
		 */
		mddev->bitmap_info.default_offset = 1024 >> 9;
		mddev->bitmap_info.default_space = (4096-1024) >> 9;
		mddev->reshape_backwards = 0;

		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
		memcpy(mddev->uuid, sb->set_uuid, 16);

		mddev->max_disks =  (4096-256)/2;

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
		    mddev->bitmap_info.file == NULL) {
			mddev->bitmap_info.offset =
				(__s32)le32_to_cpu(sb->bitmap_offset);
			/* Metadata doesn't record how much space is available.
			 * For 1.0, we assume we can use up to the superblock
			 * if before, else to 4K beyond superblock.
			 * For others, assume no change is possible.
			 */
			if (mddev->minor_version > 0)
				mddev->bitmap_info.space = 0;
			else if (mddev->bitmap_info.offset > 0)
				mddev->bitmap_info.space =
					8 - mddev->bitmap_info.offset;
			else
				mddev->bitmap_info.space =
					-mddev->bitmap_info.offset;
		}

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
			mddev->new_level = le32_to_cpu(sb->new_level);
			mddev->new_layout = le32_to_cpu(sb->new_layout);
			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
			if (mddev->delta_disks < 0 ||
			    (mddev->delta_disks == 0 &&
			     (le32_to_cpu(sb->feature_map)
			      & MD_FEATURE_RESHAPE_BACKWARDS)))
				mddev->reshape_backwards = 1;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}

		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) {
			set_bit(MD_HAS_JOURNAL, &mddev->flags);
			if (mddev->recovery_cp == MaxSector)
				set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
		}
	} else if (mddev->pers == NULL) {
		/* Insist on good event counter while assembling, except for
		 * spares (which don't need an event count) */
		++ev1;
		if (rdev->desc_nr >= 0 &&
		    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
		    (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
		     le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
			if (ev1 < mddev->events)
				return -EINVAL;
	} else if (mddev->bitmap) {
		/* If adding to array with a bitmap, then we can accept an
		 * older device, but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}
	if (mddev->level != LEVEL_MULTIPATH) {
		int role;
		if (rdev->desc_nr < 0 ||
		    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
			role = MD_DISK_ROLE_SPARE;
			rdev->desc_nr = -1;
		} else
			role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
		switch(role) {
		case MD_DISK_ROLE_SPARE: /* spare */
			break;
		case MD_DISK_ROLE_FAULTY: /* faulty */
			set_bit(Faulty, &rdev->flags);
			break;
		case MD_DISK_ROLE_JOURNAL: /* journal device */
			if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
				/* journal device without journal feature */
				printk(KERN_WARNING
				  "md: journal device provided without journal feature, ignoring the device\n");
				return -EINVAL;
			}
			set_bit(Journal, &rdev->flags);
			rdev->journal_tail = le64_to_cpu(sb->journal_tail);
			rdev->raid_disk = 0;
			break;
		default:
			rdev->saved_raid_disk = role;
			if ((le32_to_cpu(sb->feature_map) &
			     MD_FEATURE_RECOVERY_OFFSET)) {
				rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
				if (!(le32_to_cpu(sb->feature_map) &
				      MD_FEATURE_RECOVERY_BITMAP))
					rdev->saved_raid_disk = -1;
			} else
				set_bit(In_sync, &rdev->flags);
			rdev->raid_disk = role;
			break;
		}
		if (sb->devflags & WriteMostly1)
			set_bit(WriteMostly, &rdev->flags);
		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
			set_bit(Replacement, &rdev->flags);
	} else /* MULTIPATH are always insync */
		set_bit(In_sync, &rdev->flags);

	return 0;
}

static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb;
	struct md_rdev *rdev2;
	int max_dev, i;
	/* make rdev->sb match mddev and rdev data. */

	sb = page_address(rdev->sb_page);

	sb->feature_map = 0;
	sb->pad0 = 0;
	sb->recovery_offset = cpu_to_le64(0);
	memset(sb->pad3, 0, sizeof(sb->pad3));

	sb->utime = cpu_to_le64((__u64)mddev->utime);
	sb->events = cpu_to_le64(mddev->events);
	if (mddev->in_sync)
		sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
	else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
		sb->resync_offset = cpu_to_le64(MaxSector);
	else
		sb->resync_offset = cpu_to_le64(0);

	sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));

	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
	sb->size = cpu_to_le64(mddev->dev_sectors);
	sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
	sb->level = cpu_to_le32(mddev->level);
	sb->layout = cpu_to_le32(mddev->layout);

	if (test_bit(WriteMostly, &rdev->flags))
		sb->devflags |= WriteMostly1;
	else
		sb->devflags &= ~WriteMostly1;
	sb->data_offset = cpu_to_le64(rdev->data_offset);
	sb->data_size = cpu_to_le64(rdev->sectors);

	if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
	}

	if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
	    !test_bit(In_sync, &rdev->flags)) {
		sb->feature_map |=
			cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
		sb->recovery_offset =
			cpu_to_le64(rdev->recovery_offset);
		if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
			sb->feature_map |=
				cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
	}
	/* Note: recovery_offset and journal_tail share space  */
	if (test_bit(Journal, &rdev->flags))
		sb->journal_tail = cpu_to_le64(rdev->journal_tail);
	if (test_bit(Replacement, &rdev->flags))
		sb->feature_map |=
			cpu_to_le32(MD_FEATURE_REPLACEMENT);

	if (mddev->reshape_position != MaxSector) {
		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
		sb->reshape_position = cpu_to_le64(mddev->reshape_position);
		sb->new_layout = cpu_to_le32(mddev->new_layout);
		sb->delta_disks = cpu_to_le32(mddev->delta_disks);
		sb->new_level = cpu_to_le32(mddev->new_level);
		sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
		if (mddev->delta_disks == 0 &&
		    mddev->reshape_backwards)
			sb->feature_map
				|= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
		if (rdev->new_data_offset != rdev->data_offset) {
			sb->feature_map
				|= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
			sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
							     - rdev->data_offset));
		}
	}

	if (mddev_is_clustered(mddev))
		sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);

	if (rdev->badblocks.count == 0)
		/* Nothing to do for bad blocks*/ ;
	else if (sb->bblog_offset == 0)
		/* Cannot record bad blocks on this device */
		md_error(mddev, rdev);
	else {
		struct badblocks *bb = &rdev->badblocks;
		u64 *bbp = (u64 *)page_address(rdev->bb_page);
		u64 *p = bb->page;
		sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
		if (bb->changed) {
			unsigned seq;

retry:
			seq = read_seqbegin(&bb->lock);

			memset(bbp, 0xff, PAGE_SIZE);

			for (i = 0 ; i < bb->count ; i++) {
				u64 internal_bb = p[i];
				u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
						| BB_LEN(internal_bb));
				bbp[i] = cpu_to_le64(store_bb);
			}
			bb->changed = 0;
			if (read_seqretry(&bb->lock, seq))
				goto retry;

			bb->sector = (rdev->sb_start +
				      (int)le32_to_cpu(sb->bblog_offset));
			bb->size = le16_to_cpu(sb->bblog_size);
		}
	}
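	/*
	 * Worked example (illustrative, values are hypothetical): each
	 * on-disk bad-block entry packs the start sector into the upper
	 * 54 bits and the length into the low 10 bits, both in units of
	 * 2^bblog_shift sectors.  With bblog_shift == 0, a bad range at
	 * sector 0x1000 of length 8 is stored as
	 *	(0x1000 << 10) | 8 == 0x400008,
	 * and super_1_load() above recovers it as
	 *	count  = 0x400008 & 0x3ff == 8,
	 *	sector = 0x400008 >> 10  == 0x1000.
	 * Unused slots keep the 0xff fill pattern, which the loader
	 * detects with its "bb + 1 == 0" test.
	 */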

	max_dev = 0;
	rdev_for_each(rdev2, mddev)
		if (rdev2->desc_nr+1 > max_dev)
			max_dev = rdev2->desc_nr+1;

	if (max_dev > le32_to_cpu(sb->max_dev)) {
		int bmask;
		sb->max_dev = cpu_to_le32(max_dev);
		rdev->sb_size = max_dev * 2 + 256;
		bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
		if (rdev->sb_size & bmask)
			rdev->sb_size = (rdev->sb_size | bmask) + 1;
	} else
		max_dev = le32_to_cpu(sb->max_dev);

	for (i=0; i<max_dev;i++)
		sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);

	if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
		sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);

	rdev_for_each(rdev2, mddev) {
		i = rdev2->desc_nr;
		if (test_bit(Faulty, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
		else if (test_bit(In_sync, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
		else if (test_bit(Journal, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
		else if (rdev2->raid_disk >= 0)
			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
		else
			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
	}

	sb->sb_csum = calc_sb_1_csum(sb);
}

static unsigned long long
super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
{
	struct mdp_superblock_1 *sb;
	sector_t max_sectors;
	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
		return 0; /* component must fit device */
	if (rdev->data_offset != rdev->new_data_offset)
		return 0; /* too confusing */
	if (rdev->sb_start < rdev->data_offset) {
		/* minor versions 1 and 2; superblock before data */
		max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
		max_sectors -= rdev->data_offset;
		if (!num_sectors || num_sectors > max_sectors)
			num_sectors = max_sectors;
	} else if (rdev->mddev->bitmap_info.offset) {
		/* minor version 0 with bitmap we can't move */
		return 0;
	} else {
		/* minor version 0; superblock after data */
		sector_t sb_start;
		sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
		sb_start &= ~(sector_t)(4*2 - 1);
		max_sectors = rdev->sectors + sb_start - rdev->sb_start;
		if (!num_sectors || num_sectors > max_sectors)
			num_sectors = max_sectors;
		rdev->sb_start = sb_start;
	}
	sb = page_address(rdev->sb_page);
	sb->data_size = cpu_to_le64(num_sectors);
	sb->super_offset = rdev->sb_start;
	sb->sb_csum = calc_sb_1_csum(sb);
	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
		       rdev->sb_page);
	md_super_wait(rdev->mddev);
	return num_sectors;

}

static int
super_1_allow_new_offset(struct md_rdev *rdev,
			 unsigned long long new_offset)
{
	/* All necessary checks on new >= old have been done */
	struct bitmap *bitmap;
	if (new_offset >= rdev->data_offset)
		return 1;

	/* with 1.0 metadata, there is no metadata to tread on
	 * so we can always move back */
	if (rdev->mddev->minor_version == 0)
		return 1;

	/* otherwise we must be sure not to step on
	 * any metadata, so stay:
	 * 36K beyond start of superblock
1887 * beyond end of badblocks 1888 * beyond write-intent bitmap 1889 */ 1890 if (rdev->sb_start + (32+4)*2 > new_offset) 1891 return 0; 1892 bitmap = rdev->mddev->bitmap; 1893 if (bitmap && !rdev->mddev->bitmap_info.file && 1894 rdev->sb_start + rdev->mddev->bitmap_info.offset + 1895 bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) 1896 return 0; 1897 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) 1898 return 0; 1899 1900 return 1; 1901 } 1902 1903 static struct super_type super_types[] = { 1904 [0] = { 1905 .name = "0.90.0", 1906 .owner = THIS_MODULE, 1907 .load_super = super_90_load, 1908 .validate_super = super_90_validate, 1909 .sync_super = super_90_sync, 1910 .rdev_size_change = super_90_rdev_size_change, 1911 .allow_new_offset = super_90_allow_new_offset, 1912 }, 1913 [1] = { 1914 .name = "md-1", 1915 .owner = THIS_MODULE, 1916 .load_super = super_1_load, 1917 .validate_super = super_1_validate, 1918 .sync_super = super_1_sync, 1919 .rdev_size_change = super_1_rdev_size_change, 1920 .allow_new_offset = super_1_allow_new_offset, 1921 }, 1922 }; 1923 1924 static void sync_super(struct mddev *mddev, struct md_rdev *rdev) 1925 { 1926 if (mddev->sync_super) { 1927 mddev->sync_super(mddev, rdev); 1928 return; 1929 } 1930 1931 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); 1932 1933 super_types[mddev->major_version].sync_super(mddev, rdev); 1934 } 1935 1936 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) 1937 { 1938 struct md_rdev *rdev, *rdev2; 1939 1940 rcu_read_lock(); 1941 rdev_for_each_rcu(rdev, mddev1) { 1942 if (test_bit(Faulty, &rdev->flags) || 1943 test_bit(Journal, &rdev->flags) || 1944 rdev->raid_disk == -1) 1945 continue; 1946 rdev_for_each_rcu(rdev2, mddev2) { 1947 if (test_bit(Faulty, &rdev2->flags) || 1948 test_bit(Journal, &rdev2->flags) || 1949 rdev2->raid_disk == -1) 1950 continue; 1951 if (rdev->bdev->bd_contains == 1952 rdev2->bdev->bd_contains) { 1953 rcu_read_unlock(); 1954 return 1; 1955 } 1956 } 1957 } 1958 rcu_read_unlock(); 1959 return 0; 1960 } 1961 1962 static LIST_HEAD(pending_raid_disks); 1963 1964 /* 1965 * Try to register data integrity profile for an mddev 1966 * 1967 * This is called when an array is started and after a disk has been kicked 1968 * from the array. It only succeeds if all working and active component devices 1969 * are integrity capable with matching profiles. 1970 */ 1971 int md_integrity_register(struct mddev *mddev) 1972 { 1973 struct md_rdev *rdev, *reference = NULL; 1974 1975 if (list_empty(&mddev->disks)) 1976 return 0; /* nothing to do */ 1977 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) 1978 return 0; /* shouldn't register, or already is */ 1979 rdev_for_each(rdev, mddev) { 1980 /* skip spares and non-functional disks */ 1981 if (test_bit(Faulty, &rdev->flags)) 1982 continue; 1983 if (rdev->raid_disk < 0) 1984 continue; 1985 if (!reference) { 1986 /* Use the first rdev as the reference */ 1987 reference = rdev; 1988 continue; 1989 } 1990 /* does this rdev's profile match the reference profile? */ 1991 if (blk_integrity_compare(reference->bdev->bd_disk, 1992 rdev->bdev->bd_disk) < 0) 1993 return -EINVAL; 1994 } 1995 if (!reference || !bdev_get_integrity(reference->bdev)) 1996 return 0; 1997 /* 1998 * All component devices are integrity capable and have matching 1999 * profiles, register the common profile for the md device. 
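 * (In other words: if blk_integrity_compare() above reported a mismatch for
 * any pair of members we have already returned -EINVAL, and if the reference
 * member carries no integrity profile we returned 0 without registering
 * anything.)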
2000 */ 2001 blk_integrity_register(mddev->gendisk, 2002 bdev_get_integrity(reference->bdev)); 2003 2004 printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev)); 2005 if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) { 2006 printk(KERN_ERR "md: failed to create integrity pool for %s\n", 2007 mdname(mddev)); 2008 return -EINVAL; 2009 } 2010 return 0; 2011 } 2012 EXPORT_SYMBOL(md_integrity_register); 2013 2014 /* 2015 * Attempt to add an rdev, but only if it is consistent with the current 2016 * integrity profile 2017 */ 2018 int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) 2019 { 2020 struct blk_integrity *bi_rdev; 2021 struct blk_integrity *bi_mddev; 2022 char name[BDEVNAME_SIZE]; 2023 2024 if (!mddev->gendisk) 2025 return 0; 2026 2027 bi_rdev = bdev_get_integrity(rdev->bdev); 2028 bi_mddev = blk_get_integrity(mddev->gendisk); 2029 2030 if (!bi_mddev) /* nothing to do */ 2031 return 0; 2032 2033 if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { 2034 printk(KERN_NOTICE "%s: incompatible integrity profile for %s\n", 2035 mdname(mddev), bdevname(rdev->bdev, name)); 2036 return -ENXIO; 2037 } 2038 2039 return 0; 2040 } 2041 EXPORT_SYMBOL(md_integrity_add_rdev); 2042 2043 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) 2044 { 2045 char b[BDEVNAME_SIZE]; 2046 struct kobject *ko; 2047 int err; 2048 2049 /* prevent duplicates */ 2050 if (find_rdev(mddev, rdev->bdev->bd_dev)) 2051 return -EEXIST; 2052 2053 /* make sure rdev->sectors exceeds mddev->dev_sectors */ 2054 if (!test_bit(Journal, &rdev->flags) && 2055 rdev->sectors && 2056 (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) { 2057 if (mddev->pers) { 2058 /* Cannot change size, so fail 2059 * If mddev->level <= 0, then we don't care 2060 * about aligning sizes (e.g. linear) 2061 */ 2062 if (mddev->level > 0) 2063 return -ENOSPC; 2064 } else 2065 mddev->dev_sectors = rdev->sectors; 2066 } 2067 2068 /* Verify rdev->desc_nr is unique. 
2069 * If it is -1, assign a free number, else 2070 * check number is not in use 2071 */ 2072 rcu_read_lock(); 2073 if (rdev->desc_nr < 0) { 2074 int choice = 0; 2075 if (mddev->pers) 2076 choice = mddev->raid_disks; 2077 while (md_find_rdev_nr_rcu(mddev, choice)) 2078 choice++; 2079 rdev->desc_nr = choice; 2080 } else { 2081 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) { 2082 rcu_read_unlock(); 2083 return -EBUSY; 2084 } 2085 } 2086 rcu_read_unlock(); 2087 if (!test_bit(Journal, &rdev->flags) && 2088 mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { 2089 printk(KERN_WARNING "md: %s: array is limited to %d devices\n", 2090 mdname(mddev), mddev->max_disks); 2091 return -EBUSY; 2092 } 2093 bdevname(rdev->bdev,b); 2094 strreplace(b, '/', '!'); 2095 2096 rdev->mddev = mddev; 2097 printk(KERN_INFO "md: bind<%s>\n", b); 2098 2099 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 2100 goto fail; 2101 2102 ko = &part_to_dev(rdev->bdev->bd_part)->kobj; 2103 if (sysfs_create_link(&rdev->kobj, ko, "block")) 2104 /* failure here is OK */; 2105 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); 2106 2107 list_add_rcu(&rdev->same_set, &mddev->disks); 2108 bd_link_disk_holder(rdev->bdev, mddev->gendisk); 2109 2110 /* May as well allow recovery to be retried once */ 2111 mddev->recovery_disabled++; 2112 2113 return 0; 2114 2115 fail: 2116 printk(KERN_WARNING "md: failed to register dev-%s for %s\n", 2117 b, mdname(mddev)); 2118 return err; 2119 } 2120 2121 static void md_delayed_delete(struct work_struct *ws) 2122 { 2123 struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); 2124 kobject_del(&rdev->kobj); 2125 kobject_put(&rdev->kobj); 2126 } 2127 2128 static void unbind_rdev_from_array(struct md_rdev *rdev) 2129 { 2130 char b[BDEVNAME_SIZE]; 2131 2132 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); 2133 list_del_rcu(&rdev->same_set); 2134 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 2135 rdev->mddev = NULL; 2136 sysfs_remove_link(&rdev->kobj, "block"); 2137 sysfs_put(rdev->sysfs_state); 2138 rdev->sysfs_state = NULL; 2139 rdev->badblocks.count = 0; 2140 /* We need to delay this, otherwise we can deadlock when 2141 * writing to 'remove' to "dev/state". We also need 2142 * to delay it due to rcu usage. 2143 */ 2144 synchronize_rcu(); 2145 INIT_WORK(&rdev->del_work, md_delayed_delete); 2146 kobject_get(&rdev->kobj); 2147 queue_work(md_misc_wq, &rdev->del_work); 2148 } 2149 2150 /* 2151 * prevent the device from being mounted, repartitioned or 2152 * otherwise reused by a RAID array (or any other kernel 2153 * subsystem), by bd_claiming the device. 2154 */ 2155 static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared) 2156 { 2157 int err = 0; 2158 struct block_device *bdev; 2159 char b[BDEVNAME_SIZE]; 2160 2161 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, 2162 shared ? 
(struct md_rdev *)lock_rdev : rdev); 2163 if (IS_ERR(bdev)) { 2164 printk(KERN_ERR "md: could not open %s.\n", 2165 __bdevname(dev, b)); 2166 return PTR_ERR(bdev); 2167 } 2168 rdev->bdev = bdev; 2169 return err; 2170 } 2171 2172 static void unlock_rdev(struct md_rdev *rdev) 2173 { 2174 struct block_device *bdev = rdev->bdev; 2175 rdev->bdev = NULL; 2176 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); 2177 } 2178 2179 void md_autodetect_dev(dev_t dev); 2180 2181 static void export_rdev(struct md_rdev *rdev) 2182 { 2183 char b[BDEVNAME_SIZE]; 2184 2185 printk(KERN_INFO "md: export_rdev(%s)\n", 2186 bdevname(rdev->bdev,b)); 2187 md_rdev_clear(rdev); 2188 #ifndef MODULE 2189 if (test_bit(AutoDetected, &rdev->flags)) 2190 md_autodetect_dev(rdev->bdev->bd_dev); 2191 #endif 2192 unlock_rdev(rdev); 2193 kobject_put(&rdev->kobj); 2194 } 2195 2196 void md_kick_rdev_from_array(struct md_rdev *rdev) 2197 { 2198 unbind_rdev_from_array(rdev); 2199 export_rdev(rdev); 2200 } 2201 EXPORT_SYMBOL_GPL(md_kick_rdev_from_array); 2202 2203 static void export_array(struct mddev *mddev) 2204 { 2205 struct md_rdev *rdev; 2206 2207 while (!list_empty(&mddev->disks)) { 2208 rdev = list_first_entry(&mddev->disks, struct md_rdev, 2209 same_set); 2210 md_kick_rdev_from_array(rdev); 2211 } 2212 mddev->raid_disks = 0; 2213 mddev->major_version = 0; 2214 } 2215 2216 static void sync_sbs(struct mddev *mddev, int nospares) 2217 { 2218 /* Update each superblock (in-memory image), but 2219 * if we are allowed to, skip spares which already 2220 * have the right event counter, or have one earlier 2221 * (which would mean they aren't being marked as dirty 2222 * with the rest of the array) 2223 */ 2224 struct md_rdev *rdev; 2225 rdev_for_each(rdev, mddev) { 2226 if (rdev->sb_events == mddev->events || 2227 (nospares && 2228 rdev->raid_disk < 0 && 2229 rdev->sb_events+1 == mddev->events)) { 2230 /* Don't update this superblock */ 2231 rdev->sb_loaded = 2; 2232 } else { 2233 sync_super(mddev, rdev); 2234 rdev->sb_loaded = 1; 2235 } 2236 } 2237 } 2238 2239 static bool does_sb_need_changing(struct mddev *mddev) 2240 { 2241 struct md_rdev *rdev; 2242 struct mdp_superblock_1 *sb; 2243 int role; 2244 2245 /* Find a good rdev */ 2246 rdev_for_each(rdev, mddev) 2247 if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags)) 2248 break; 2249 2250 /* No good device found. */ 2251 if (!rdev) 2252 return false; 2253 2254 sb = page_address(rdev->sb_page); 2255 /* Check if a device has become faulty or a spare become active */ 2256 rdev_for_each(rdev, mddev) { 2257 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 2258 /* Device activated? */ 2259 if (role == 0xffff && rdev->raid_disk >=0 && 2260 !test_bit(Faulty, &rdev->flags)) 2261 return true; 2262 /* Device turned faulty? 
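 * (Reference: in v1.x superblocks dev_roles[] uses MD_DISK_ROLE_SPARE
 * (0xffff), MD_DISK_ROLE_FAULTY (0xfffe) and MD_DISK_ROLE_JOURNAL (0xfffd),
 * so the 'role < 0xfffd' test below is true when the on-disk slot does not
 * already record the device as faulty, journal or spare.)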
*/
2263 if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd))
2264 return true;
2265 }
2266
2267 /* Check if any mddev parameters have changed */
2268 if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
2269 (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
2270 (mddev->layout != le64_to_cpu(sb->layout)) ||
2271 (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
2272 (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
2273 return true;
2274
2275 return false;
2276 }
2277
2278 void md_update_sb(struct mddev *mddev, int force_change)
2279 {
2280 struct md_rdev *rdev;
2281 int sync_req;
2282 int nospares = 0;
2283 int any_badblocks_changed = 0;
2284 int ret = -1;
2285
2286 if (mddev->ro) {
2287 if (force_change)
2288 set_bit(MD_CHANGE_DEVS, &mddev->flags);
2289 return;
2290 }
2291
2292 if (mddev_is_clustered(mddev)) {
2293 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
2294 force_change = 1;
2295 ret = md_cluster_ops->metadata_update_start(mddev);
2296 /* Has someone else updated the sb? */
2297 if (!does_sb_need_changing(mddev)) {
2298 if (ret == 0)
2299 md_cluster_ops->metadata_update_cancel(mddev);
2300 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2301 return;
2302 }
2303 }
2304 repeat:
2305 /* First make sure individual recovery_offsets are correct */
2306 rdev_for_each(rdev, mddev) {
2307 if (rdev->raid_disk >= 0 &&
2308 mddev->delta_disks >= 0 &&
2309 !test_bit(Journal, &rdev->flags) &&
2310 !test_bit(In_sync, &rdev->flags) &&
2311 mddev->curr_resync_completed > rdev->recovery_offset)
2312 rdev->recovery_offset = mddev->curr_resync_completed;
2313
2314 }
2315 if (!mddev->persistent) {
2316 clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
2317 clear_bit(MD_CHANGE_DEVS, &mddev->flags);
2318 if (!mddev->external) {
2319 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2320 rdev_for_each(rdev, mddev) {
2321 if (rdev->badblocks.changed) {
2322 rdev->badblocks.changed = 0;
2323 ack_all_badblocks(&rdev->badblocks);
2324 md_error(mddev, rdev);
2325 }
2326 clear_bit(Blocked, &rdev->flags);
2327 clear_bit(BlockedBadBlocks, &rdev->flags);
2328 wake_up(&rdev->blocked_wait);
2329 }
2330 }
2331 wake_up(&mddev->sb_wait);
2332 return;
2333 }
2334
2335 spin_lock(&mddev->lock);
2336
2337 mddev->utime = ktime_get_real_seconds();
2338
2339 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
2340 force_change = 1;
2341 if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
2342 /* just a clean <-> dirty transition, possibly leave spares alone,
2343 * though if events isn't the right even/odd, we will have to do
2344 * spares after all
2345 */
2346 nospares = 1;
2347 if (force_change)
2348 nospares = 0;
2349 if (mddev->degraded)
2350 /* If the array is degraded, then skipping spares is both
2351 * dangerous and fairly pointless.
2352 * Dangerous because a device that was removed from the array
2353 * might have an event_count that still looks up-to-date,
2354 * so it can be re-added without a resync.
2355 * Pointless because if there are any spares to skip,
2356 * then a recovery will happen and soon that array won't
2357 * be degraded any more and the spare can go back to sleep then.
2358 */ 2359 nospares = 0; 2360 2361 sync_req = mddev->in_sync; 2362 2363 /* If this is just a dirty<->clean transition, and the array is clean 2364 * and 'events' is odd, we can roll back to the previous clean state */ 2365 if (nospares 2366 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 2367 && mddev->can_decrease_events 2368 && mddev->events != 1) { 2369 mddev->events--; 2370 mddev->can_decrease_events = 0; 2371 } else { 2372 /* otherwise we have to go forward and ... */ 2373 mddev->events ++; 2374 mddev->can_decrease_events = nospares; 2375 } 2376 2377 /* 2378 * This 64-bit counter should never wrap. 2379 * Either we are in around ~1 trillion A.C., assuming 2380 * 1 reboot per second, or we have a bug... 2381 */ 2382 WARN_ON(mddev->events == 0); 2383 2384 rdev_for_each(rdev, mddev) { 2385 if (rdev->badblocks.changed) 2386 any_badblocks_changed++; 2387 if (test_bit(Faulty, &rdev->flags)) 2388 set_bit(FaultRecorded, &rdev->flags); 2389 } 2390 2391 sync_sbs(mddev, nospares); 2392 spin_unlock(&mddev->lock); 2393 2394 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", 2395 mdname(mddev), mddev->in_sync); 2396 2397 bitmap_update_sb(mddev->bitmap); 2398 rdev_for_each(rdev, mddev) { 2399 char b[BDEVNAME_SIZE]; 2400 2401 if (rdev->sb_loaded != 1) 2402 continue; /* no noise on spare devices */ 2403 2404 if (!test_bit(Faulty, &rdev->flags)) { 2405 md_super_write(mddev,rdev, 2406 rdev->sb_start, rdev->sb_size, 2407 rdev->sb_page); 2408 pr_debug("md: (write) %s's sb offset: %llu\n", 2409 bdevname(rdev->bdev, b), 2410 (unsigned long long)rdev->sb_start); 2411 rdev->sb_events = mddev->events; 2412 if (rdev->badblocks.size) { 2413 md_super_write(mddev, rdev, 2414 rdev->badblocks.sector, 2415 rdev->badblocks.size << 9, 2416 rdev->bb_page); 2417 rdev->badblocks.size = 0; 2418 } 2419 2420 } else 2421 pr_debug("md: %s (skipping faulty)\n", 2422 bdevname(rdev->bdev, b)); 2423 2424 if (mddev->level == LEVEL_MULTIPATH) 2425 /* only need to write one superblock... */ 2426 break; 2427 } 2428 md_super_wait(mddev); 2429 /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ 2430 2431 spin_lock(&mddev->lock); 2432 if (mddev->in_sync != sync_req || 2433 test_bit(MD_CHANGE_DEVS, &mddev->flags)) { 2434 /* have to write it out again */ 2435 spin_unlock(&mddev->lock); 2436 goto repeat; 2437 } 2438 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2439 spin_unlock(&mddev->lock); 2440 wake_up(&mddev->sb_wait); 2441 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2442 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 2443 2444 rdev_for_each(rdev, mddev) { 2445 if (test_and_clear_bit(FaultRecorded, &rdev->flags)) 2446 clear_bit(Blocked, &rdev->flags); 2447 2448 if (any_badblocks_changed) 2449 ack_all_badblocks(&rdev->badblocks); 2450 clear_bit(BlockedBadBlocks, &rdev->flags); 2451 wake_up(&rdev->blocked_wait); 2452 } 2453 2454 if (mddev_is_clustered(mddev) && ret == 0) 2455 md_cluster_ops->metadata_update_finish(mddev); 2456 } 2457 EXPORT_SYMBOL(md_update_sb); 2458 2459 static int add_bound_rdev(struct md_rdev *rdev) 2460 { 2461 struct mddev *mddev = rdev->mddev; 2462 int err = 0; 2463 bool add_journal = test_bit(Journal, &rdev->flags); 2464 2465 if (!mddev->pers->hot_remove_disk || add_journal) { 2466 /* If there is hot_add_disk but no hot_remove_disk 2467 * then added disks for geometry changes, 2468 * and should be added immediately. 2469 */ 2470 super_types[mddev->major_version]. 
2471 validate_super(mddev, rdev);
2472 if (add_journal)
2473 mddev_suspend(mddev);
2474 err = mddev->pers->hot_add_disk(mddev, rdev);
2475 if (add_journal)
2476 mddev_resume(mddev);
2477 if (err) {
2478 unbind_rdev_from_array(rdev);
2479 export_rdev(rdev);
2480 return err;
2481 }
2482 }
2483 sysfs_notify_dirent_safe(rdev->sysfs_state);
2484
2485 set_bit(MD_CHANGE_DEVS, &mddev->flags);
2486 if (mddev->degraded)
2487 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
2488 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2489 md_new_event(mddev);
2490 md_wakeup_thread(mddev->thread);
2491 return 0;
2492 }
2493
2494 /* words written to sysfs files may, or may not, be \n terminated.
2495 * We want to accept either case. For this we use cmd_match.
2496 */
2497 static int cmd_match(const char *cmd, const char *str)
2498 {
2499 /* See if cmd, written into a sysfs file, matches
2500 * str. They must either be the same, or cmd can
2501 * have a trailing newline
2502 */
2503 while (*cmd && *str && *cmd == *str) {
2504 cmd++;
2505 str++;
2506 }
2507 if (*cmd == '\n')
2508 cmd++;
2509 if (*str || *cmd)
2510 return 0;
2511 return 1;
2512 }
2513
2514 struct rdev_sysfs_entry {
2515 struct attribute attr;
2516 ssize_t (*show)(struct md_rdev *, char *);
2517 ssize_t (*store)(struct md_rdev *, const char *, size_t);
2518 };
2519
2520 static ssize_t
2521 state_show(struct md_rdev *rdev, char *page)
2522 {
2523 char *sep = "";
2524 size_t len = 0;
2525 unsigned long flags = ACCESS_ONCE(rdev->flags);
2526
2527 if (test_bit(Faulty, &flags) ||
2528 rdev->badblocks.unacked_exist) {
2529 len+= sprintf(page+len, "%sfaulty",sep);
2530 sep = ",";
2531 }
2532 if (test_bit(In_sync, &flags)) {
2533 len += sprintf(page+len, "%sin_sync",sep);
2534 sep = ",";
2535 }
2536 if (test_bit(Journal, &flags)) {
2537 len += sprintf(page+len, "%sjournal",sep);
2538 sep = ",";
2539 }
2540 if (test_bit(WriteMostly, &flags)) {
2541 len += sprintf(page+len, "%swrite_mostly",sep);
2542 sep = ",";
2543 }
2544 if (test_bit(Blocked, &flags) ||
2545 (rdev->badblocks.unacked_exist
2546 && !test_bit(Faulty, &flags))) {
2547 len += sprintf(page+len, "%sblocked", sep);
2548 sep = ",";
2549 }
2550 if (!test_bit(Faulty, &flags) &&
2551 !test_bit(Journal, &flags) &&
2552 !test_bit(In_sync, &flags)) {
2553 len += sprintf(page+len, "%sspare", sep);
2554 sep = ",";
2555 }
2556 if (test_bit(WriteErrorSeen, &flags)) {
2557 len += sprintf(page+len, "%swrite_error", sep);
2558 sep = ",";
2559 }
2560 if (test_bit(WantReplacement, &flags)) {
2561 len += sprintf(page+len, "%swant_replacement", sep);
2562 sep = ",";
2563 }
2564 if (test_bit(Replacement, &flags)) {
2565 len += sprintf(page+len, "%sreplacement", sep);
2566 sep = ",";
2567 }
2568
2569 return len+sprintf(page+len, "\n");
2570 }
2571
2572 static ssize_t
2573 state_store(struct md_rdev *rdev, const char *buf, size_t len)
2574 {
2575 /* can write
2576 * faulty - simulates an error
2577 * remove - disconnects the device
2578 * writemostly - sets write_mostly
2579 * -writemostly - clears write_mostly
2580 * blocked - sets the Blocked flag
2581 * -blocked - clears the Blocked flag and possibly simulates an error
2582 * insync - sets In_sync, provided the device isn't active
2583 * -insync - clears In_sync for a device with a slot assigned,
2584 * so that it gets rebuilt based on bitmap
2585 * write_error - sets WriteErrorSeen
2586 * -write_error - clears WriteErrorSeen
2587 */
2588 int err = -EINVAL;
2589 if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2590 md_error(rdev->mddev, rdev);
2591 if (test_bit(Faulty,
&rdev->flags)) 2592 err = 0; 2593 else 2594 err = -EBUSY; 2595 } else if (cmd_match(buf, "remove")) { 2596 if (rdev->raid_disk >= 0) 2597 err = -EBUSY; 2598 else { 2599 struct mddev *mddev = rdev->mddev; 2600 err = 0; 2601 if (mddev_is_clustered(mddev)) 2602 err = md_cluster_ops->remove_disk(mddev, rdev); 2603 2604 if (err == 0) { 2605 md_kick_rdev_from_array(rdev); 2606 if (mddev->pers) 2607 md_update_sb(mddev, 1); 2608 md_new_event(mddev); 2609 } 2610 } 2611 } else if (cmd_match(buf, "writemostly")) { 2612 set_bit(WriteMostly, &rdev->flags); 2613 err = 0; 2614 } else if (cmd_match(buf, "-writemostly")) { 2615 clear_bit(WriteMostly, &rdev->flags); 2616 err = 0; 2617 } else if (cmd_match(buf, "blocked")) { 2618 set_bit(Blocked, &rdev->flags); 2619 err = 0; 2620 } else if (cmd_match(buf, "-blocked")) { 2621 if (!test_bit(Faulty, &rdev->flags) && 2622 rdev->badblocks.unacked_exist) { 2623 /* metadata handler doesn't understand badblocks, 2624 * so we need to fail the device 2625 */ 2626 md_error(rdev->mddev, rdev); 2627 } 2628 clear_bit(Blocked, &rdev->flags); 2629 clear_bit(BlockedBadBlocks, &rdev->flags); 2630 wake_up(&rdev->blocked_wait); 2631 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2632 md_wakeup_thread(rdev->mddev->thread); 2633 2634 err = 0; 2635 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 2636 set_bit(In_sync, &rdev->flags); 2637 err = 0; 2638 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && 2639 !test_bit(Journal, &rdev->flags)) { 2640 if (rdev->mddev->pers == NULL) { 2641 clear_bit(In_sync, &rdev->flags); 2642 rdev->saved_raid_disk = rdev->raid_disk; 2643 rdev->raid_disk = -1; 2644 err = 0; 2645 } 2646 } else if (cmd_match(buf, "write_error")) { 2647 set_bit(WriteErrorSeen, &rdev->flags); 2648 err = 0; 2649 } else if (cmd_match(buf, "-write_error")) { 2650 clear_bit(WriteErrorSeen, &rdev->flags); 2651 err = 0; 2652 } else if (cmd_match(buf, "want_replacement")) { 2653 /* Any non-spare device that is not a replacement can 2654 * become want_replacement at any time, but we then need to 2655 * check if recovery is needed. 2656 */ 2657 if (rdev->raid_disk >= 0 && 2658 !test_bit(Journal, &rdev->flags) && 2659 !test_bit(Replacement, &rdev->flags)) 2660 set_bit(WantReplacement, &rdev->flags); 2661 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2662 md_wakeup_thread(rdev->mddev->thread); 2663 err = 0; 2664 } else if (cmd_match(buf, "-want_replacement")) { 2665 /* Clearing 'want_replacement' is always allowed. 2666 * Once replacements starts it is too late though. 2667 */ 2668 err = 0; 2669 clear_bit(WantReplacement, &rdev->flags); 2670 } else if (cmd_match(buf, "replacement")) { 2671 /* Can only set a device as a replacement when array has not 2672 * yet been started. Once running, replacement is automatic 2673 * from spares, or by assigning 'slot'. 2674 */ 2675 if (rdev->mddev->pers) 2676 err = -EBUSY; 2677 else { 2678 set_bit(Replacement, &rdev->flags); 2679 err = 0; 2680 } 2681 } else if (cmd_match(buf, "-replacement")) { 2682 /* Similarly, can only clear Replacement before start */ 2683 if (rdev->mddev->pers) 2684 err = -EBUSY; 2685 else { 2686 clear_bit(Replacement, &rdev->flags); 2687 err = 0; 2688 } 2689 } else if (cmd_match(buf, "re-add")) { 2690 if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1)) { 2691 /* clear_bit is performed _after_ all the devices 2692 * have their local Faulty bit cleared. 
If any writes 2693 * happen in the meantime in the local node, they 2694 * will land in the local bitmap, which will be synced 2695 * by this node eventually 2696 */ 2697 if (!mddev_is_clustered(rdev->mddev) || 2698 (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { 2699 clear_bit(Faulty, &rdev->flags); 2700 err = add_bound_rdev(rdev); 2701 } 2702 } else 2703 err = -EBUSY; 2704 } 2705 if (!err) 2706 sysfs_notify_dirent_safe(rdev->sysfs_state); 2707 return err ? err : len; 2708 } 2709 static struct rdev_sysfs_entry rdev_state = 2710 __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store); 2711 2712 static ssize_t 2713 errors_show(struct md_rdev *rdev, char *page) 2714 { 2715 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 2716 } 2717 2718 static ssize_t 2719 errors_store(struct md_rdev *rdev, const char *buf, size_t len) 2720 { 2721 unsigned int n; 2722 int rv; 2723 2724 rv = kstrtouint(buf, 10, &n); 2725 if (rv < 0) 2726 return rv; 2727 atomic_set(&rdev->corrected_errors, n); 2728 return len; 2729 } 2730 static struct rdev_sysfs_entry rdev_errors = 2731 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 2732 2733 static ssize_t 2734 slot_show(struct md_rdev *rdev, char *page) 2735 { 2736 if (test_bit(Journal, &rdev->flags)) 2737 return sprintf(page, "journal\n"); 2738 else if (rdev->raid_disk < 0) 2739 return sprintf(page, "none\n"); 2740 else 2741 return sprintf(page, "%d\n", rdev->raid_disk); 2742 } 2743 2744 static ssize_t 2745 slot_store(struct md_rdev *rdev, const char *buf, size_t len) 2746 { 2747 int slot; 2748 int err; 2749 2750 if (test_bit(Journal, &rdev->flags)) 2751 return -EBUSY; 2752 if (strncmp(buf, "none", 4)==0) 2753 slot = -1; 2754 else { 2755 err = kstrtouint(buf, 10, (unsigned int *)&slot); 2756 if (err < 0) 2757 return err; 2758 } 2759 if (rdev->mddev->pers && slot == -1) { 2760 /* Setting 'slot' on an active array requires also 2761 * updating the 'rd%d' link, and communicating 2762 * with the personality with ->hot_*_disk. 2763 * For now we only support removing 2764 * failed/spare devices. This normally happens automatically, 2765 * but not when the metadata is externally managed. 2766 */ 2767 if (rdev->raid_disk == -1) 2768 return -EEXIST; 2769 /* personality does all needed checks */ 2770 if (rdev->mddev->pers->hot_remove_disk == NULL) 2771 return -EINVAL; 2772 clear_bit(Blocked, &rdev->flags); 2773 remove_and_add_spares(rdev->mddev, rdev); 2774 if (rdev->raid_disk >= 0) 2775 return -EBUSY; 2776 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2777 md_wakeup_thread(rdev->mddev->thread); 2778 } else if (rdev->mddev->pers) { 2779 /* Activating a spare .. or possibly reactivating 2780 * if we ever get bitmaps working here. 
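 * (For example, with the array running, writing "2" to this rdev's 'slot'
 * attribute asks the personality to hot-add the device as raid_disk 2; per
 * the checks below the rdev must not already occupy a slot and no
 * resync/recovery may be running.)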
2781 */ 2782 int err; 2783 2784 if (rdev->raid_disk != -1) 2785 return -EBUSY; 2786 2787 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) 2788 return -EBUSY; 2789 2790 if (rdev->mddev->pers->hot_add_disk == NULL) 2791 return -EINVAL; 2792 2793 if (slot >= rdev->mddev->raid_disks && 2794 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 2795 return -ENOSPC; 2796 2797 rdev->raid_disk = slot; 2798 if (test_bit(In_sync, &rdev->flags)) 2799 rdev->saved_raid_disk = slot; 2800 else 2801 rdev->saved_raid_disk = -1; 2802 clear_bit(In_sync, &rdev->flags); 2803 clear_bit(Bitmap_sync, &rdev->flags); 2804 err = rdev->mddev->pers-> 2805 hot_add_disk(rdev->mddev, rdev); 2806 if (err) { 2807 rdev->raid_disk = -1; 2808 return err; 2809 } else 2810 sysfs_notify_dirent_safe(rdev->sysfs_state); 2811 if (sysfs_link_rdev(rdev->mddev, rdev)) 2812 /* failure here is OK */; 2813 /* don't wakeup anyone, leave that to userspace. */ 2814 } else { 2815 if (slot >= rdev->mddev->raid_disks && 2816 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 2817 return -ENOSPC; 2818 rdev->raid_disk = slot; 2819 /* assume it is working */ 2820 clear_bit(Faulty, &rdev->flags); 2821 clear_bit(WriteMostly, &rdev->flags); 2822 set_bit(In_sync, &rdev->flags); 2823 sysfs_notify_dirent_safe(rdev->sysfs_state); 2824 } 2825 return len; 2826 } 2827 2828 static struct rdev_sysfs_entry rdev_slot = 2829 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 2830 2831 static ssize_t 2832 offset_show(struct md_rdev *rdev, char *page) 2833 { 2834 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 2835 } 2836 2837 static ssize_t 2838 offset_store(struct md_rdev *rdev, const char *buf, size_t len) 2839 { 2840 unsigned long long offset; 2841 if (kstrtoull(buf, 10, &offset) < 0) 2842 return -EINVAL; 2843 if (rdev->mddev->pers && rdev->raid_disk >= 0) 2844 return -EBUSY; 2845 if (rdev->sectors && rdev->mddev->external) 2846 /* Must set offset before size, so overlap checks 2847 * can be sane */ 2848 return -EBUSY; 2849 rdev->data_offset = offset; 2850 rdev->new_data_offset = offset; 2851 return len; 2852 } 2853 2854 static struct rdev_sysfs_entry rdev_offset = 2855 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 2856 2857 static ssize_t new_offset_show(struct md_rdev *rdev, char *page) 2858 { 2859 return sprintf(page, "%llu\n", 2860 (unsigned long long)rdev->new_data_offset); 2861 } 2862 2863 static ssize_t new_offset_store(struct md_rdev *rdev, 2864 const char *buf, size_t len) 2865 { 2866 unsigned long long new_offset; 2867 struct mddev *mddev = rdev->mddev; 2868 2869 if (kstrtoull(buf, 10, &new_offset) < 0) 2870 return -EINVAL; 2871 2872 if (mddev->sync_thread || 2873 test_bit(MD_RECOVERY_RUNNING,&mddev->recovery)) 2874 return -EBUSY; 2875 if (new_offset == rdev->data_offset) 2876 /* reset is always permitted */ 2877 ; 2878 else if (new_offset > rdev->data_offset) { 2879 /* must not push array size beyond rdev_sectors */ 2880 if (new_offset - rdev->data_offset 2881 + mddev->dev_sectors > rdev->sectors) 2882 return -E2BIG; 2883 } 2884 /* Metadata worries about other space details. */ 2885 2886 /* decreasing the offset is inconsistent with a backwards 2887 * reshape. 2888 */ 2889 if (new_offset < rdev->data_offset && 2890 mddev->reshape_backwards) 2891 return -EINVAL; 2892 /* Increasing offset is inconsistent with forwards 2893 * reshape. reshape_direction should be set to 2894 * 'backwards' first. 
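 * (Hence, once the checks pass, a larger new_offset sets
 * mddev->reshape_backwards below and a smaller one clears it.)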
2895 */
2896 if (new_offset > rdev->data_offset &&
2897 !mddev->reshape_backwards)
2898 return -EINVAL;
2899
2900 if (mddev->pers && mddev->persistent &&
2901 !super_types[mddev->major_version]
2902 .allow_new_offset(rdev, new_offset))
2903 return -E2BIG;
2904 rdev->new_data_offset = new_offset;
2905 if (new_offset > rdev->data_offset)
2906 mddev->reshape_backwards = 1;
2907 else if (new_offset < rdev->data_offset)
2908 mddev->reshape_backwards = 0;
2909
2910 return len;
2911 }
2912 static struct rdev_sysfs_entry rdev_new_offset =
2913 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
2914
2915 static ssize_t
2916 rdev_size_show(struct md_rdev *rdev, char *page)
2917 {
2918 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
2919 }
2920
2921 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
2922 {
2923 /* check if two start/length pairs overlap */
2924 if (s1+l1 <= s2)
2925 return 0;
2926 if (s2+l2 <= s1)
2927 return 0;
2928 return 1;
2929 }
2930
2931 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
2932 {
2933 unsigned long long blocks;
2934 sector_t new;
2935
2936 if (kstrtoull(buf, 10, &blocks) < 0)
2937 return -EINVAL;
2938
2939 if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
2940 return -EINVAL; /* sector conversion overflow */
2941
2942 new = blocks * 2;
2943 if (new != blocks * 2)
2944 return -EINVAL; /* unsigned long long to sector_t overflow */
2945
2946 *sectors = new;
2947 return 0;
2948 }
2949
2950 static ssize_t
2951 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
2952 {
2953 struct mddev *my_mddev = rdev->mddev;
2954 sector_t oldsectors = rdev->sectors;
2955 sector_t sectors;
2956
2957 if (test_bit(Journal, &rdev->flags))
2958 return -EBUSY;
2959 if (strict_blocks_to_sectors(buf, &sectors) < 0)
2960 return -EINVAL;
2961 if (rdev->data_offset != rdev->new_data_offset)
2962 return -EINVAL; /* too confusing */
2963 if (my_mddev->pers && rdev->raid_disk >= 0) {
2964 if (my_mddev->persistent) {
2965 sectors = super_types[my_mddev->major_version].
2966 rdev_size_change(rdev, sectors);
2967 if (!sectors)
2968 return -EBUSY;
2969 } else if (!sectors)
2970 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
2971 rdev->data_offset;
2972 if (!my_mddev->pers->resize)
2973 /* Cannot change size for RAID0 or Linear etc */
2974 return -EINVAL;
2975 }
2976 if (sectors < my_mddev->dev_sectors)
2977 return -EINVAL; /* component must fit device */
2978
2979 rdev->sectors = sectors;
2980 if (sectors > oldsectors && my_mddev->external) {
2981 /* Need to check that all other rdevs with the same
2982 * ->bdev do not overlap. 'rcu' is sufficient to walk
2983 * the rdev lists safely.
2984 * This check does not provide a hard guarantee, it
2985 * just helps avoid dangerous mistakes.
2986 */
2987 struct mddev *mddev;
2988 int overlap = 0;
2989 struct list_head *tmp;
2990
2991 rcu_read_lock();
2992 for_each_mddev(mddev, tmp) {
2993 struct md_rdev *rdev2;
2994
2995 rdev_for_each(rdev2, mddev)
2996 if (rdev->bdev == rdev2->bdev &&
2997 rdev != rdev2 &&
2998 overlaps(rdev->data_offset, rdev->sectors,
2999 rdev2->data_offset,
3000 rdev2->sectors)) {
3001 overlap = 1;
3002 break;
3003 }
3004 if (overlap) {
3005 mddev_put(mddev);
3006 break;
3007 }
3008 }
3009 rcu_read_unlock();
3010 if (overlap) {
3011 /* Someone else could have slipped in a size
3012 * change here, but doing so is just silly.
3013 * We put oldsectors back because we *know* it is 3014 * safe, and trust userspace not to race with 3015 * itself 3016 */ 3017 rdev->sectors = oldsectors; 3018 return -EBUSY; 3019 } 3020 } 3021 return len; 3022 } 3023 3024 static struct rdev_sysfs_entry rdev_size = 3025 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 3026 3027 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) 3028 { 3029 unsigned long long recovery_start = rdev->recovery_offset; 3030 3031 if (test_bit(In_sync, &rdev->flags) || 3032 recovery_start == MaxSector) 3033 return sprintf(page, "none\n"); 3034 3035 return sprintf(page, "%llu\n", recovery_start); 3036 } 3037 3038 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) 3039 { 3040 unsigned long long recovery_start; 3041 3042 if (cmd_match(buf, "none")) 3043 recovery_start = MaxSector; 3044 else if (kstrtoull(buf, 10, &recovery_start)) 3045 return -EINVAL; 3046 3047 if (rdev->mddev->pers && 3048 rdev->raid_disk >= 0) 3049 return -EBUSY; 3050 3051 rdev->recovery_offset = recovery_start; 3052 if (recovery_start == MaxSector) 3053 set_bit(In_sync, &rdev->flags); 3054 else 3055 clear_bit(In_sync, &rdev->flags); 3056 return len; 3057 } 3058 3059 static struct rdev_sysfs_entry rdev_recovery_start = 3060 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); 3061 3062 /* sysfs access to bad-blocks list. 3063 * We present two files. 3064 * 'bad-blocks' lists sector numbers and lengths of ranges that 3065 * are recorded as bad. The list is truncated to fit within 3066 * the one-page limit of sysfs. 3067 * Writing "sector length" to this file adds an acknowledged 3068 * bad block list. 3069 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet 3070 * been acknowledged. Writing to this file adds bad blocks 3071 * without acknowledging them. This is largely for testing. 
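 * For example, writing "4096 8" to 'bad_blocks' records the 8-sector range
 * starting at sector 4096 as bad and already acknowledged, while the same
 * write to 'unacknowledged_bad_blocks' records it without acknowledgement.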
3072 */ 3073 static ssize_t bb_show(struct md_rdev *rdev, char *page) 3074 { 3075 return badblocks_show(&rdev->badblocks, page, 0); 3076 } 3077 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) 3078 { 3079 int rv = badblocks_store(&rdev->badblocks, page, len, 0); 3080 /* Maybe that ack was all we needed */ 3081 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) 3082 wake_up(&rdev->blocked_wait); 3083 return rv; 3084 } 3085 static struct rdev_sysfs_entry rdev_bad_blocks = 3086 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); 3087 3088 static ssize_t ubb_show(struct md_rdev *rdev, char *page) 3089 { 3090 return badblocks_show(&rdev->badblocks, page, 1); 3091 } 3092 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) 3093 { 3094 return badblocks_store(&rdev->badblocks, page, len, 1); 3095 } 3096 static struct rdev_sysfs_entry rdev_unack_bad_blocks = 3097 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); 3098 3099 static struct attribute *rdev_default_attrs[] = { 3100 &rdev_state.attr, 3101 &rdev_errors.attr, 3102 &rdev_slot.attr, 3103 &rdev_offset.attr, 3104 &rdev_new_offset.attr, 3105 &rdev_size.attr, 3106 &rdev_recovery_start.attr, 3107 &rdev_bad_blocks.attr, 3108 &rdev_unack_bad_blocks.attr, 3109 NULL, 3110 }; 3111 static ssize_t 3112 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3113 { 3114 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3115 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3116 3117 if (!entry->show) 3118 return -EIO; 3119 if (!rdev->mddev) 3120 return -EBUSY; 3121 return entry->show(rdev, page); 3122 } 3123 3124 static ssize_t 3125 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 3126 const char *page, size_t length) 3127 { 3128 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3129 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3130 ssize_t rv; 3131 struct mddev *mddev = rdev->mddev; 3132 3133 if (!entry->store) 3134 return -EIO; 3135 if (!capable(CAP_SYS_ADMIN)) 3136 return -EACCES; 3137 rv = mddev ? mddev_lock(mddev): -EBUSY; 3138 if (!rv) { 3139 if (rdev->mddev == NULL) 3140 rv = -EBUSY; 3141 else 3142 rv = entry->store(rdev, page, length); 3143 mddev_unlock(mddev); 3144 } 3145 return rv; 3146 } 3147 3148 static void rdev_free(struct kobject *ko) 3149 { 3150 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); 3151 kfree(rdev); 3152 } 3153 static const struct sysfs_ops rdev_sysfs_ops = { 3154 .show = rdev_attr_show, 3155 .store = rdev_attr_store, 3156 }; 3157 static struct kobj_type rdev_ktype = { 3158 .release = rdev_free, 3159 .sysfs_ops = &rdev_sysfs_ops, 3160 .default_attrs = rdev_default_attrs, 3161 }; 3162 3163 int md_rdev_init(struct md_rdev *rdev) 3164 { 3165 rdev->desc_nr = -1; 3166 rdev->saved_raid_disk = -1; 3167 rdev->raid_disk = -1; 3168 rdev->flags = 0; 3169 rdev->data_offset = 0; 3170 rdev->new_data_offset = 0; 3171 rdev->sb_events = 0; 3172 rdev->last_read_error.tv_sec = 0; 3173 rdev->last_read_error.tv_nsec = 0; 3174 rdev->sb_loaded = 0; 3175 rdev->bb_page = NULL; 3176 atomic_set(&rdev->nr_pending, 0); 3177 atomic_set(&rdev->read_errors, 0); 3178 atomic_set(&rdev->corrected_errors, 0); 3179 3180 INIT_LIST_HEAD(&rdev->same_set); 3181 init_waitqueue_head(&rdev->blocked_wait); 3182 3183 /* Add space to store bad block list. 
3184 * This reserves the space even on arrays where it cannot 3185 * be used - I wonder if that matters 3186 */ 3187 return badblocks_init(&rdev->badblocks, 0); 3188 } 3189 EXPORT_SYMBOL_GPL(md_rdev_init); 3190 /* 3191 * Import a device. If 'super_format' >= 0, then sanity check the superblock 3192 * 3193 * mark the device faulty if: 3194 * 3195 * - the device is nonexistent (zero size) 3196 * - the device has no valid superblock 3197 * 3198 * a faulty rdev _never_ has rdev->sb set. 3199 */ 3200 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) 3201 { 3202 char b[BDEVNAME_SIZE]; 3203 int err; 3204 struct md_rdev *rdev; 3205 sector_t size; 3206 3207 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 3208 if (!rdev) { 3209 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 3210 return ERR_PTR(-ENOMEM); 3211 } 3212 3213 err = md_rdev_init(rdev); 3214 if (err) 3215 goto abort_free; 3216 err = alloc_disk_sb(rdev); 3217 if (err) 3218 goto abort_free; 3219 3220 err = lock_rdev(rdev, newdev, super_format == -2); 3221 if (err) 3222 goto abort_free; 3223 3224 kobject_init(&rdev->kobj, &rdev_ktype); 3225 3226 size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS; 3227 if (!size) { 3228 printk(KERN_WARNING 3229 "md: %s has zero or unknown size, marking faulty!\n", 3230 bdevname(rdev->bdev,b)); 3231 err = -EINVAL; 3232 goto abort_free; 3233 } 3234 3235 if (super_format >= 0) { 3236 err = super_types[super_format]. 3237 load_super(rdev, NULL, super_minor); 3238 if (err == -EINVAL) { 3239 printk(KERN_WARNING 3240 "md: %s does not have a valid v%d.%d " 3241 "superblock, not importing!\n", 3242 bdevname(rdev->bdev,b), 3243 super_format, super_minor); 3244 goto abort_free; 3245 } 3246 if (err < 0) { 3247 printk(KERN_WARNING 3248 "md: could not read %s's sb, not importing!\n", 3249 bdevname(rdev->bdev,b)); 3250 goto abort_free; 3251 } 3252 } 3253 3254 return rdev; 3255 3256 abort_free: 3257 if (rdev->bdev) 3258 unlock_rdev(rdev); 3259 md_rdev_clear(rdev); 3260 kfree(rdev); 3261 return ERR_PTR(err); 3262 } 3263 3264 /* 3265 * Check a full RAID array for plausibility 3266 */ 3267 3268 static void analyze_sbs(struct mddev *mddev) 3269 { 3270 int i; 3271 struct md_rdev *rdev, *freshest, *tmp; 3272 char b[BDEVNAME_SIZE]; 3273 3274 freshest = NULL; 3275 rdev_for_each_safe(rdev, tmp, mddev) 3276 switch (super_types[mddev->major_version]. 3277 load_super(rdev, freshest, mddev->minor_version)) { 3278 case 1: 3279 freshest = rdev; 3280 break; 3281 case 0: 3282 break; 3283 default: 3284 printk( KERN_ERR \ 3285 "md: fatal superblock inconsistency in %s" 3286 " -- removing from array\n", 3287 bdevname(rdev->bdev,b)); 3288 md_kick_rdev_from_array(rdev); 3289 } 3290 3291 super_types[mddev->major_version]. 3292 validate_super(mddev, freshest); 3293 3294 i = 0; 3295 rdev_for_each_safe(rdev, tmp, mddev) { 3296 if (mddev->max_disks && 3297 (rdev->desc_nr >= mddev->max_disks || 3298 i > mddev->max_disks)) { 3299 printk(KERN_WARNING 3300 "md: %s: %s: only %d devices permitted\n", 3301 mdname(mddev), bdevname(rdev->bdev, b), 3302 mddev->max_disks); 3303 md_kick_rdev_from_array(rdev); 3304 continue; 3305 } 3306 if (rdev != freshest) { 3307 if (super_types[mddev->major_version]. 
3308 validate_super(mddev, rdev)) {
3309 printk(KERN_WARNING "md: kicking non-fresh %s"
3310 " from array!\n",
3311 bdevname(rdev->bdev,b));
3312 md_kick_rdev_from_array(rdev);
3313 continue;
3314 }
3315 }
3316 if (mddev->level == LEVEL_MULTIPATH) {
3317 rdev->desc_nr = i++;
3318 rdev->raid_disk = rdev->desc_nr;
3319 set_bit(In_sync, &rdev->flags);
3320 } else if (rdev->raid_disk >=
3321 (mddev->raid_disks - min(0, mddev->delta_disks)) &&
3322 !test_bit(Journal, &rdev->flags)) {
3323 rdev->raid_disk = -1;
3324 clear_bit(In_sync, &rdev->flags);
3325 }
3326 }
3327 }
3328
3329 /* Read a fixed-point number.
3330 * Numbers in sysfs attributes should be in "standard" units where
3331 * possible, so time should be in seconds.
3332 * However we internally use a much smaller unit such as
3333 * milliseconds or jiffies.
3334 * This function takes a decimal number with a possible fractional
3335 * component, and produces an integer which is the result of
3336 * multiplying that number by 10^'scale',
3337 * all without any floating-point arithmetic.
3338 */
3339 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
3340 {
3341 unsigned long result = 0;
3342 long decimals = -1;
3343 while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
3344 if (*cp == '.')
3345 decimals = 0;
3346 else if (decimals < scale) {
3347 unsigned int value;
3348 value = *cp - '0';
3349 result = result * 10 + value;
3350 if (decimals >= 0)
3351 decimals++;
3352 }
3353 cp++;
3354 }
3355 if (*cp == '\n')
3356 cp++;
3357 if (*cp)
3358 return -EINVAL;
3359 if (decimals < 0)
3360 decimals = 0;
3361 while (decimals < scale) {
3362 result *= 10;
3363 decimals ++;
3364 }
3365 *res = result;
3366 return 0;
3367 }
3368
3369 static ssize_t
3370 safe_delay_show(struct mddev *mddev, char *page)
3371 {
3372 int msec = (mddev->safemode_delay*1000)/HZ;
3373 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
3374 }
3375 static ssize_t
3376 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3377 {
3378 unsigned long msec;
3379
3380 if (mddev_is_clustered(mddev)) {
3381 pr_info("md: Safemode is disabled for clustered mode\n");
3382 return -EINVAL;
3383 }
3384
3385 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
3386 return -EINVAL;
3387 if (msec == 0)
3388 mddev->safemode_delay = 0;
3389 else {
3390 unsigned long old_delay = mddev->safemode_delay;
3391 unsigned long new_delay = (msec*HZ)/1000;
3392
3393 if (new_delay == 0)
3394 new_delay = 1;
3395 mddev->safemode_delay = new_delay;
3396 if (new_delay < old_delay || old_delay == 0)
3397 mod_timer(&mddev->safemode_timer, jiffies+1);
3398 }
3399 return len;
3400 }
3401 static struct md_sysfs_entry md_safe_delay =
3402 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
3403
3404 static ssize_t
3405 level_show(struct mddev *mddev, char *page)
3406 {
3407 struct md_personality *p;
3408 int ret;
3409 spin_lock(&mddev->lock);
3410 p = mddev->pers;
3411 if (p)
3412 ret = sprintf(page, "%s\n", p->name);
3413 else if (mddev->clevel[0])
3414 ret = sprintf(page, "%s\n", mddev->clevel);
3415 else if (mddev->level != LEVEL_NONE)
3416 ret = sprintf(page, "%d\n", mddev->level);
3417 else
3418 ret = 0;
3419 spin_unlock(&mddev->lock);
3420 return ret;
3421 }
3422
3423 static ssize_t
3424 level_store(struct mddev *mddev, const char *buf, size_t len)
3425 {
3426 char clevel[16];
3427 ssize_t rv;
3428 size_t slen = len;
3429 struct md_personality *pers, *oldpers;
3430 long level;
3431 void *priv, *oldpriv;
3432 struct md_rdev *rdev;
3433
3434 if
(slen == 0 || slen >= sizeof(clevel)) 3435 return -EINVAL; 3436 3437 rv = mddev_lock(mddev); 3438 if (rv) 3439 return rv; 3440 3441 if (mddev->pers == NULL) { 3442 strncpy(mddev->clevel, buf, slen); 3443 if (mddev->clevel[slen-1] == '\n') 3444 slen--; 3445 mddev->clevel[slen] = 0; 3446 mddev->level = LEVEL_NONE; 3447 rv = len; 3448 goto out_unlock; 3449 } 3450 rv = -EROFS; 3451 if (mddev->ro) 3452 goto out_unlock; 3453 3454 /* request to change the personality. Need to ensure: 3455 * - array is not engaged in resync/recovery/reshape 3456 * - old personality can be suspended 3457 * - new personality will access other array. 3458 */ 3459 3460 rv = -EBUSY; 3461 if (mddev->sync_thread || 3462 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3463 mddev->reshape_position != MaxSector || 3464 mddev->sysfs_active) 3465 goto out_unlock; 3466 3467 rv = -EINVAL; 3468 if (!mddev->pers->quiesce) { 3469 printk(KERN_WARNING "md: %s: %s does not support online personality change\n", 3470 mdname(mddev), mddev->pers->name); 3471 goto out_unlock; 3472 } 3473 3474 /* Now find the new personality */ 3475 strncpy(clevel, buf, slen); 3476 if (clevel[slen-1] == '\n') 3477 slen--; 3478 clevel[slen] = 0; 3479 if (kstrtol(clevel, 10, &level)) 3480 level = LEVEL_NONE; 3481 3482 if (request_module("md-%s", clevel) != 0) 3483 request_module("md-level-%s", clevel); 3484 spin_lock(&pers_lock); 3485 pers = find_pers(level, clevel); 3486 if (!pers || !try_module_get(pers->owner)) { 3487 spin_unlock(&pers_lock); 3488 printk(KERN_WARNING "md: personality %s not loaded\n", clevel); 3489 rv = -EINVAL; 3490 goto out_unlock; 3491 } 3492 spin_unlock(&pers_lock); 3493 3494 if (pers == mddev->pers) { 3495 /* Nothing to do! */ 3496 module_put(pers->owner); 3497 rv = len; 3498 goto out_unlock; 3499 } 3500 if (!pers->takeover) { 3501 module_put(pers->owner); 3502 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", 3503 mdname(mddev), clevel); 3504 rv = -EINVAL; 3505 goto out_unlock; 3506 } 3507 3508 rdev_for_each(rdev, mddev) 3509 rdev->new_raid_disk = rdev->raid_disk; 3510 3511 /* ->takeover must set new_* and/or delta_disks 3512 * if it succeeds, and may set them when it fails. 
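 * (That is why the failure path just below restores new_level, new_layout
 * and new_chunk_sectors and rewinds raid_disks by delta_disks before
 * giving up.)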
3513 */ 3514 priv = pers->takeover(mddev); 3515 if (IS_ERR(priv)) { 3516 mddev->new_level = mddev->level; 3517 mddev->new_layout = mddev->layout; 3518 mddev->new_chunk_sectors = mddev->chunk_sectors; 3519 mddev->raid_disks -= mddev->delta_disks; 3520 mddev->delta_disks = 0; 3521 mddev->reshape_backwards = 0; 3522 module_put(pers->owner); 3523 printk(KERN_WARNING "md: %s: %s would not accept array\n", 3524 mdname(mddev), clevel); 3525 rv = PTR_ERR(priv); 3526 goto out_unlock; 3527 } 3528 3529 /* Looks like we have a winner */ 3530 mddev_suspend(mddev); 3531 mddev_detach(mddev); 3532 3533 spin_lock(&mddev->lock); 3534 oldpers = mddev->pers; 3535 oldpriv = mddev->private; 3536 mddev->pers = pers; 3537 mddev->private = priv; 3538 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 3539 mddev->level = mddev->new_level; 3540 mddev->layout = mddev->new_layout; 3541 mddev->chunk_sectors = mddev->new_chunk_sectors; 3542 mddev->delta_disks = 0; 3543 mddev->reshape_backwards = 0; 3544 mddev->degraded = 0; 3545 spin_unlock(&mddev->lock); 3546 3547 if (oldpers->sync_request == NULL && 3548 mddev->external) { 3549 /* We are converting from a no-redundancy array 3550 * to a redundancy array and metadata is managed 3551 * externally so we need to be sure that writes 3552 * won't block due to a need to transition 3553 * clean->dirty 3554 * until external management is started. 3555 */ 3556 mddev->in_sync = 0; 3557 mddev->safemode_delay = 0; 3558 mddev->safemode = 0; 3559 } 3560 3561 oldpers->free(mddev, oldpriv); 3562 3563 if (oldpers->sync_request == NULL && 3564 pers->sync_request != NULL) { 3565 /* need to add the md_redundancy_group */ 3566 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 3567 printk(KERN_WARNING 3568 "md: cannot register extra attributes for %s\n", 3569 mdname(mddev)); 3570 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 3571 } 3572 if (oldpers->sync_request != NULL && 3573 pers->sync_request == NULL) { 3574 /* need to remove the md_redundancy_group */ 3575 if (mddev->to_remove == NULL) 3576 mddev->to_remove = &md_redundancy_group; 3577 } 3578 3579 rdev_for_each(rdev, mddev) { 3580 if (rdev->raid_disk < 0) 3581 continue; 3582 if (rdev->new_raid_disk >= mddev->raid_disks) 3583 rdev->new_raid_disk = -1; 3584 if (rdev->new_raid_disk == rdev->raid_disk) 3585 continue; 3586 sysfs_unlink_rdev(mddev, rdev); 3587 } 3588 rdev_for_each(rdev, mddev) { 3589 if (rdev->raid_disk < 0) 3590 continue; 3591 if (rdev->new_raid_disk == rdev->raid_disk) 3592 continue; 3593 rdev->raid_disk = rdev->new_raid_disk; 3594 if (rdev->raid_disk < 0) 3595 clear_bit(In_sync, &rdev->flags); 3596 else { 3597 if (sysfs_link_rdev(mddev, rdev)) 3598 printk(KERN_WARNING "md: cannot register rd%d" 3599 " for %s after level change\n", 3600 rdev->raid_disk, mdname(mddev)); 3601 } 3602 } 3603 3604 if (pers->sync_request == NULL) { 3605 /* this is now an array without redundancy, so 3606 * it must always be in_sync 3607 */ 3608 mddev->in_sync = 1; 3609 del_timer_sync(&mddev->safemode_timer); 3610 } 3611 blk_set_stacking_limits(&mddev->queue->limits); 3612 pers->run(mddev); 3613 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3614 mddev_resume(mddev); 3615 if (!mddev->thread) 3616 md_update_sb(mddev, 1); 3617 sysfs_notify(&mddev->kobj, NULL, "level"); 3618 md_new_event(mddev); 3619 rv = len; 3620 out_unlock: 3621 mddev_unlock(mddev); 3622 return rv; 3623 } 3624 3625 static struct md_sysfs_entry md_level = 3626 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 3627 3628 static ssize_t 3629 
layout_show(struct mddev *mddev, char *page) 3630 { 3631 /* just a number, not meaningful for all levels */ 3632 if (mddev->reshape_position != MaxSector && 3633 mddev->layout != mddev->new_layout) 3634 return sprintf(page, "%d (%d)\n", 3635 mddev->new_layout, mddev->layout); 3636 return sprintf(page, "%d\n", mddev->layout); 3637 } 3638 3639 static ssize_t 3640 layout_store(struct mddev *mddev, const char *buf, size_t len) 3641 { 3642 unsigned int n; 3643 int err; 3644 3645 err = kstrtouint(buf, 10, &n); 3646 if (err < 0) 3647 return err; 3648 err = mddev_lock(mddev); 3649 if (err) 3650 return err; 3651 3652 if (mddev->pers) { 3653 if (mddev->pers->check_reshape == NULL) 3654 err = -EBUSY; 3655 else if (mddev->ro) 3656 err = -EROFS; 3657 else { 3658 mddev->new_layout = n; 3659 err = mddev->pers->check_reshape(mddev); 3660 if (err) 3661 mddev->new_layout = mddev->layout; 3662 } 3663 } else { 3664 mddev->new_layout = n; 3665 if (mddev->reshape_position == MaxSector) 3666 mddev->layout = n; 3667 } 3668 mddev_unlock(mddev); 3669 return err ?: len; 3670 } 3671 static struct md_sysfs_entry md_layout = 3672 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 3673 3674 static ssize_t 3675 raid_disks_show(struct mddev *mddev, char *page) 3676 { 3677 if (mddev->raid_disks == 0) 3678 return 0; 3679 if (mddev->reshape_position != MaxSector && 3680 mddev->delta_disks != 0) 3681 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 3682 mddev->raid_disks - mddev->delta_disks); 3683 return sprintf(page, "%d\n", mddev->raid_disks); 3684 } 3685 3686 static int update_raid_disks(struct mddev *mddev, int raid_disks); 3687 3688 static ssize_t 3689 raid_disks_store(struct mddev *mddev, const char *buf, size_t len) 3690 { 3691 unsigned int n; 3692 int err; 3693 3694 err = kstrtouint(buf, 10, &n); 3695 if (err < 0) 3696 return err; 3697 3698 err = mddev_lock(mddev); 3699 if (err) 3700 return err; 3701 if (mddev->pers) 3702 err = update_raid_disks(mddev, n); 3703 else if (mddev->reshape_position != MaxSector) { 3704 struct md_rdev *rdev; 3705 int olddisks = mddev->raid_disks - mddev->delta_disks; 3706 3707 err = -EINVAL; 3708 rdev_for_each(rdev, mddev) { 3709 if (olddisks < n && 3710 rdev->data_offset < rdev->new_data_offset) 3711 goto out_unlock; 3712 if (olddisks > n && 3713 rdev->data_offset > rdev->new_data_offset) 3714 goto out_unlock; 3715 } 3716 err = 0; 3717 mddev->delta_disks = n - olddisks; 3718 mddev->raid_disks = n; 3719 mddev->reshape_backwards = (mddev->delta_disks < 0); 3720 } else 3721 mddev->raid_disks = n; 3722 out_unlock: 3723 mddev_unlock(mddev); 3724 return err ? 
err : len; 3725 } 3726 static struct md_sysfs_entry md_raid_disks = 3727 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 3728 3729 static ssize_t 3730 chunk_size_show(struct mddev *mddev, char *page) 3731 { 3732 if (mddev->reshape_position != MaxSector && 3733 mddev->chunk_sectors != mddev->new_chunk_sectors) 3734 return sprintf(page, "%d (%d)\n", 3735 mddev->new_chunk_sectors << 9, 3736 mddev->chunk_sectors << 9); 3737 return sprintf(page, "%d\n", mddev->chunk_sectors << 9); 3738 } 3739 3740 static ssize_t 3741 chunk_size_store(struct mddev *mddev, const char *buf, size_t len) 3742 { 3743 unsigned long n; 3744 int err; 3745 3746 err = kstrtoul(buf, 10, &n); 3747 if (err < 0) 3748 return err; 3749 3750 err = mddev_lock(mddev); 3751 if (err) 3752 return err; 3753 if (mddev->pers) { 3754 if (mddev->pers->check_reshape == NULL) 3755 err = -EBUSY; 3756 else if (mddev->ro) 3757 err = -EROFS; 3758 else { 3759 mddev->new_chunk_sectors = n >> 9; 3760 err = mddev->pers->check_reshape(mddev); 3761 if (err) 3762 mddev->new_chunk_sectors = mddev->chunk_sectors; 3763 } 3764 } else { 3765 mddev->new_chunk_sectors = n >> 9; 3766 if (mddev->reshape_position == MaxSector) 3767 mddev->chunk_sectors = n >> 9; 3768 } 3769 mddev_unlock(mddev); 3770 return err ?: len; 3771 } 3772 static struct md_sysfs_entry md_chunk_size = 3773 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 3774 3775 static ssize_t 3776 resync_start_show(struct mddev *mddev, char *page) 3777 { 3778 if (mddev->recovery_cp == MaxSector) 3779 return sprintf(page, "none\n"); 3780 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 3781 } 3782 3783 static ssize_t 3784 resync_start_store(struct mddev *mddev, const char *buf, size_t len) 3785 { 3786 unsigned long long n; 3787 int err; 3788 3789 if (cmd_match(buf, "none")) 3790 n = MaxSector; 3791 else { 3792 err = kstrtoull(buf, 10, &n); 3793 if (err < 0) 3794 return err; 3795 if (n != (sector_t)n) 3796 return -EINVAL; 3797 } 3798 3799 err = mddev_lock(mddev); 3800 if (err) 3801 return err; 3802 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 3803 err = -EBUSY; 3804 3805 if (!err) { 3806 mddev->recovery_cp = n; 3807 if (mddev->pers) 3808 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 3809 } 3810 mddev_unlock(mddev); 3811 return err ?: len; 3812 } 3813 static struct md_sysfs_entry md_resync_start = 3814 __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, 3815 resync_start_show, resync_start_store); 3816 3817 /* 3818 * The array state can be: 3819 * 3820 * clear 3821 * No devices, no size, no level 3822 * Equivalent to STOP_ARRAY ioctl 3823 * inactive 3824 * May have some settings, but array is not active 3825 * all IO results in error 3826 * When written, doesn't tear down array, but just stops it 3827 * suspended (not supported yet) 3828 * All IO requests will block. The array can be reconfigured. 3829 * Writing this, if accepted, will block until array is quiescent 3830 * readonly 3831 * no resync can happen. no superblocks get written. 3832 * write requests fail 3833 * read-auto 3834 * like readonly, but behaves like 'clean' on a write request. 3835 * 3836 * clean - no pending writes, but otherwise active. 3837 * When written to inactive array, starts without resync 3838 * If a write request arrives then 3839 * if metadata is known, mark 'dirty' and switch to 'active'. 3840 * if not known, block and switch to write-pending 3841 * If written to an active array that has pending writes, then fails. 
3842 * active 3843 * fully active: IO and resync can be happening. 3844 * When written to inactive array, starts with resync 3845 * 3846 * write-pending 3847 * clean, but writes are blocked waiting for 'active' to be written. 3848 * 3849 * active-idle 3850 * like active, but no writes have been seen for a while (100msec). 3851 * 3852 */ 3853 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 3854 write_pending, active_idle, bad_word}; 3855 static char *array_states[] = { 3856 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 3857 "write-pending", "active-idle", NULL }; 3858 3859 static int match_word(const char *word, char **list) 3860 { 3861 int n; 3862 for (n=0; list[n]; n++) 3863 if (cmd_match(word, list[n])) 3864 break; 3865 return n; 3866 } 3867 3868 static ssize_t 3869 array_state_show(struct mddev *mddev, char *page) 3870 { 3871 enum array_state st = inactive; 3872 3873 if (mddev->pers) 3874 switch(mddev->ro) { 3875 case 1: 3876 st = readonly; 3877 break; 3878 case 2: 3879 st = read_auto; 3880 break; 3881 case 0: 3882 if (mddev->in_sync) 3883 st = clean; 3884 else if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) 3885 st = write_pending; 3886 else if (mddev->safemode) 3887 st = active_idle; 3888 else 3889 st = active; 3890 } 3891 else { 3892 if (list_empty(&mddev->disks) && 3893 mddev->raid_disks == 0 && 3894 mddev->dev_sectors == 0) 3895 st = clear; 3896 else 3897 st = inactive; 3898 } 3899 return sprintf(page, "%s\n", array_states[st]); 3900 } 3901 3902 static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev); 3903 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev); 3904 static int do_md_run(struct mddev *mddev); 3905 static int restart_array(struct mddev *mddev); 3906 3907 static ssize_t 3908 array_state_store(struct mddev *mddev, const char *buf, size_t len) 3909 { 3910 int err; 3911 enum array_state st = match_word(buf, array_states); 3912 3913 if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) { 3914 /* don't take reconfig_mutex when toggling between 3915 * clean and active 3916 */ 3917 spin_lock(&mddev->lock); 3918 if (st == active) { 3919 restart_array(mddev); 3920 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 3921 wake_up(&mddev->sb_wait); 3922 err = 0; 3923 } else /* st == clean */ { 3924 restart_array(mddev); 3925 if (atomic_read(&mddev->writes_pending) == 0) { 3926 if (mddev->in_sync == 0) { 3927 mddev->in_sync = 1; 3928 if (mddev->safemode == 1) 3929 mddev->safemode = 0; 3930 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 3931 } 3932 err = 0; 3933 } else 3934 err = -EBUSY; 3935 } 3936 spin_unlock(&mddev->lock); 3937 return err ?: len; 3938 } 3939 err = mddev_lock(mddev); 3940 if (err) 3941 return err; 3942 err = -EINVAL; 3943 switch(st) { 3944 case bad_word: 3945 break; 3946 case clear: 3947 /* stopping an active array */ 3948 err = do_md_stop(mddev, 0, NULL); 3949 break; 3950 case inactive: 3951 /* stopping an active array */ 3952 if (mddev->pers) 3953 err = do_md_stop(mddev, 2, NULL); 3954 else 3955 err = 0; /* already inactive */ 3956 break; 3957 case suspended: 3958 break; /* not supported yet */ 3959 case readonly: 3960 if (mddev->pers) 3961 err = md_set_readonly(mddev, NULL); 3962 else { 3963 mddev->ro = 1; 3964 set_disk_ro(mddev->gendisk, 1); 3965 err = do_md_run(mddev); 3966 } 3967 break; 3968 case read_auto: 3969 if (mddev->pers) { 3970 if (mddev->ro == 0) 3971 err = md_set_readonly(mddev, NULL); 3972 else if (mddev->ro == 1) 3973 err = 
restart_array(mddev); 3974 if (err == 0) { 3975 mddev->ro = 2; 3976 set_disk_ro(mddev->gendisk, 0); 3977 } 3978 } else { 3979 mddev->ro = 2; 3980 err = do_md_run(mddev); 3981 } 3982 break; 3983 case clean: 3984 if (mddev->pers) { 3985 err = restart_array(mddev); 3986 if (err) 3987 break; 3988 spin_lock(&mddev->lock); 3989 if (atomic_read(&mddev->writes_pending) == 0) { 3990 if (mddev->in_sync == 0) { 3991 mddev->in_sync = 1; 3992 if (mddev->safemode == 1) 3993 mddev->safemode = 0; 3994 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 3995 } 3996 err = 0; 3997 } else 3998 err = -EBUSY; 3999 spin_unlock(&mddev->lock); 4000 } else 4001 err = -EINVAL; 4002 break; 4003 case active: 4004 if (mddev->pers) { 4005 err = restart_array(mddev); 4006 if (err) 4007 break; 4008 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 4009 wake_up(&mddev->sb_wait); 4010 err = 0; 4011 } else { 4012 mddev->ro = 0; 4013 set_disk_ro(mddev->gendisk, 0); 4014 err = do_md_run(mddev); 4015 } 4016 break; 4017 case write_pending: 4018 case active_idle: 4019 /* these cannot be set */ 4020 break; 4021 } 4022 4023 if (!err) { 4024 if (mddev->hold_active == UNTIL_IOCTL) 4025 mddev->hold_active = 0; 4026 sysfs_notify_dirent_safe(mddev->sysfs_state); 4027 } 4028 mddev_unlock(mddev); 4029 return err ?: len; 4030 } 4031 static struct md_sysfs_entry md_array_state = 4032 __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 4033 4034 static ssize_t 4035 max_corrected_read_errors_show(struct mddev *mddev, char *page) { 4036 return sprintf(page, "%d\n", 4037 atomic_read(&mddev->max_corr_read_errors)); 4038 } 4039 4040 static ssize_t 4041 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) 4042 { 4043 unsigned int n; 4044 int rv; 4045 4046 rv = kstrtouint(buf, 10, &n); 4047 if (rv < 0) 4048 return rv; 4049 atomic_set(&mddev->max_corr_read_errors, n); 4050 return len; 4051 } 4052 4053 static struct md_sysfs_entry max_corr_read_errors = 4054 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, 4055 max_corrected_read_errors_store); 4056 4057 static ssize_t 4058 null_show(struct mddev *mddev, char *page) 4059 { 4060 return -EINVAL; 4061 } 4062 4063 static ssize_t 4064 new_dev_store(struct mddev *mddev, const char *buf, size_t len) 4065 { 4066 /* buf must be %d:%d\n? giving major and minor numbers */ 4067 /* The new device is added to the array. 4068 * If the array has a persistent superblock, we read the 4069 * superblock to initialise info and check validity. 4070 * Otherwise, only checking done is that in bind_rdev_to_array, 4071 * which mainly checks size. 
 */
	char *e;
	int major = simple_strtoul(buf, &e, 10);
	int minor;
	dev_t dev;
	struct md_rdev *rdev;
	int err;

	if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
		return -EINVAL;
	minor = simple_strtoul(e+1, &e, 10);
	if (*e && *e != '\n')
		return -EINVAL;
	dev = MKDEV(major, minor);
	if (major != MAJOR(dev) ||
	    minor != MINOR(dev))
		return -EOVERFLOW;

	flush_workqueue(md_misc_wq);

	err = mddev_lock(mddev);
	if (err)
		return err;
	if (mddev->persistent) {
		rdev = md_import_device(dev, mddev->major_version,
					mddev->minor_version);
		if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
			struct md_rdev *rdev0
				= list_entry(mddev->disks.next,
					     struct md_rdev, same_set);
			err = super_types[mddev->major_version]
				.load_super(rdev, rdev0, mddev->minor_version);
			if (err < 0)
				goto out;
		}
	} else if (mddev->external)
		rdev = md_import_device(dev, -2, -1);
	else
		rdev = md_import_device(dev, -1, -1);

	if (IS_ERR(rdev)) {
		mddev_unlock(mddev);
		return PTR_ERR(rdev);
	}
	err = bind_rdev_to_array(rdev, mddev);
 out:
	if (err)
		export_rdev(rdev);
	mddev_unlock(mddev);
	return err ? err : len;
}

static struct md_sysfs_entry md_new_device =
__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);

static ssize_t
bitmap_store(struct mddev *mddev, const char *buf, size_t len)
{
	char *end;
	unsigned long chunk, end_chunk;
	int err;

	err = mddev_lock(mddev);
	if (err)
		return err;
	if (!mddev->bitmap)
		goto out;
	/* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
	while (*buf) {
		chunk = end_chunk = simple_strtoul(buf, &end, 0);
		if (buf == end) break;
		if (*end == '-') { /* range */
			buf = end + 1;
			end_chunk = simple_strtoul(buf, &end, 0);
			if (buf == end) break;
		}
		if (*end && !isspace(*end)) break;
		bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
		buf = skip_spaces(end);
	}
	bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
out:
	mddev_unlock(mddev);
	return len;
}

static struct md_sysfs_entry md_bitmap =
__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);

static ssize_t
size_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%llu\n",
		(unsigned long long)mddev->dev_sectors / 2);
}

static int update_size(struct mddev *mddev, sector_t num_sectors);

static ssize_t
size_store(struct mddev *mddev, const char *buf, size_t len)
{
	/* If array is inactive, we can reduce the component size, but
	 * not increase it (except from 0).
	 * If array is active, we can try an on-line resize
	 */
	sector_t sectors;
	int err = strict_blocks_to_sectors(buf, &sectors);

	if (err < 0)
		return err;
	err = mddev_lock(mddev);
	if (err)
		return err;
	if (mddev->pers) {
		err = update_size(mddev, sectors);
		md_update_sb(mddev, 1);
	} else {
		if (mddev->dev_sectors == 0 ||
		    mddev->dev_sectors > sectors)
			mddev->dev_sectors = sectors;
		else
			err = -ENOSPC;
	}
	mddev_unlock(mddev);
	return err ? err : len;
}

static struct md_sysfs_entry md_size =
__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);

/* Metadata version.
4203 * This is one of 4204 * 'none' for arrays with no metadata (good luck...) 4205 * 'external' for arrays with externally managed metadata, 4206 * or N.M for internally known formats 4207 */ 4208 static ssize_t 4209 metadata_show(struct mddev *mddev, char *page) 4210 { 4211 if (mddev->persistent) 4212 return sprintf(page, "%d.%d\n", 4213 mddev->major_version, mddev->minor_version); 4214 else if (mddev->external) 4215 return sprintf(page, "external:%s\n", mddev->metadata_type); 4216 else 4217 return sprintf(page, "none\n"); 4218 } 4219 4220 static ssize_t 4221 metadata_store(struct mddev *mddev, const char *buf, size_t len) 4222 { 4223 int major, minor; 4224 char *e; 4225 int err; 4226 /* Changing the details of 'external' metadata is 4227 * always permitted. Otherwise there must be 4228 * no devices attached to the array. 4229 */ 4230 4231 err = mddev_lock(mddev); 4232 if (err) 4233 return err; 4234 err = -EBUSY; 4235 if (mddev->external && strncmp(buf, "external:", 9) == 0) 4236 ; 4237 else if (!list_empty(&mddev->disks)) 4238 goto out_unlock; 4239 4240 err = 0; 4241 if (cmd_match(buf, "none")) { 4242 mddev->persistent = 0; 4243 mddev->external = 0; 4244 mddev->major_version = 0; 4245 mddev->minor_version = 90; 4246 goto out_unlock; 4247 } 4248 if (strncmp(buf, "external:", 9) == 0) { 4249 size_t namelen = len-9; 4250 if (namelen >= sizeof(mddev->metadata_type)) 4251 namelen = sizeof(mddev->metadata_type)-1; 4252 strncpy(mddev->metadata_type, buf+9, namelen); 4253 mddev->metadata_type[namelen] = 0; 4254 if (namelen && mddev->metadata_type[namelen-1] == '\n') 4255 mddev->metadata_type[--namelen] = 0; 4256 mddev->persistent = 0; 4257 mddev->external = 1; 4258 mddev->major_version = 0; 4259 mddev->minor_version = 90; 4260 goto out_unlock; 4261 } 4262 major = simple_strtoul(buf, &e, 10); 4263 err = -EINVAL; 4264 if (e==buf || *e != '.') 4265 goto out_unlock; 4266 buf = e+1; 4267 minor = simple_strtoul(buf, &e, 10); 4268 if (e==buf || (*e && *e != '\n') ) 4269 goto out_unlock; 4270 err = -ENOENT; 4271 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 4272 goto out_unlock; 4273 mddev->major_version = major; 4274 mddev->minor_version = minor; 4275 mddev->persistent = 1; 4276 mddev->external = 0; 4277 err = 0; 4278 out_unlock: 4279 mddev_unlock(mddev); 4280 return err ?: len; 4281 } 4282 4283 static struct md_sysfs_entry md_metadata = 4284 __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 4285 4286 static ssize_t 4287 action_show(struct mddev *mddev, char *page) 4288 { 4289 char *type = "idle"; 4290 unsigned long recovery = mddev->recovery; 4291 if (test_bit(MD_RECOVERY_FROZEN, &recovery)) 4292 type = "frozen"; 4293 else if (test_bit(MD_RECOVERY_RUNNING, &recovery) || 4294 (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) { 4295 if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) 4296 type = "reshape"; 4297 else if (test_bit(MD_RECOVERY_SYNC, &recovery)) { 4298 if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) 4299 type = "resync"; 4300 else if (test_bit(MD_RECOVERY_CHECK, &recovery)) 4301 type = "check"; 4302 else 4303 type = "repair"; 4304 } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) 4305 type = "recover"; 4306 else if (mddev->reshape_position != MaxSector) 4307 type = "reshape"; 4308 } 4309 return sprintf(page, "%s\n", type); 4310 } 4311 4312 static ssize_t 4313 action_store(struct mddev *mddev, const char *page, size_t len) 4314 { 4315 if (!mddev->pers || !mddev->pers->sync_request) 4316 return -EINVAL; 4317 4318 4319 if 
(cmd_match(page, "idle") || cmd_match(page, "frozen")) { 4320 if (cmd_match(page, "frozen")) 4321 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4322 else 4323 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4324 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 4325 mddev_lock(mddev) == 0) { 4326 flush_workqueue(md_misc_wq); 4327 if (mddev->sync_thread) { 4328 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4329 md_reap_sync_thread(mddev); 4330 } 4331 mddev_unlock(mddev); 4332 } 4333 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4334 return -EBUSY; 4335 else if (cmd_match(page, "resync")) 4336 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4337 else if (cmd_match(page, "recover")) { 4338 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4339 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 4340 } else if (cmd_match(page, "reshape")) { 4341 int err; 4342 if (mddev->pers->start_reshape == NULL) 4343 return -EINVAL; 4344 err = mddev_lock(mddev); 4345 if (!err) { 4346 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4347 err = -EBUSY; 4348 else { 4349 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4350 err = mddev->pers->start_reshape(mddev); 4351 } 4352 mddev_unlock(mddev); 4353 } 4354 if (err) 4355 return err; 4356 sysfs_notify(&mddev->kobj, NULL, "degraded"); 4357 } else { 4358 if (cmd_match(page, "check")) 4359 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4360 else if (!cmd_match(page, "repair")) 4361 return -EINVAL; 4362 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4363 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 4364 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4365 } 4366 if (mddev->ro == 2) { 4367 /* A write to sync_action is enough to justify 4368 * canceling read-auto mode 4369 */ 4370 mddev->ro = 0; 4371 md_wakeup_thread(mddev->sync_thread); 4372 } 4373 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4374 md_wakeup_thread(mddev->thread); 4375 sysfs_notify_dirent_safe(mddev->sysfs_action); 4376 return len; 4377 } 4378 4379 static struct md_sysfs_entry md_scan_mode = 4380 __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 4381 4382 static ssize_t 4383 last_sync_action_show(struct mddev *mddev, char *page) 4384 { 4385 return sprintf(page, "%s\n", mddev->last_sync_action); 4386 } 4387 4388 static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); 4389 4390 static ssize_t 4391 mismatch_cnt_show(struct mddev *mddev, char *page) 4392 { 4393 return sprintf(page, "%llu\n", 4394 (unsigned long long) 4395 atomic64_read(&mddev->resync_mismatches)); 4396 } 4397 4398 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 4399 4400 static ssize_t 4401 sync_min_show(struct mddev *mddev, char *page) 4402 { 4403 return sprintf(page, "%d (%s)\n", speed_min(mddev), 4404 mddev->sync_speed_min ? "local": "system"); 4405 } 4406 4407 static ssize_t 4408 sync_min_store(struct mddev *mddev, const char *buf, size_t len) 4409 { 4410 unsigned int min; 4411 int rv; 4412 4413 if (strncmp(buf, "system", 6)==0) { 4414 min = 0; 4415 } else { 4416 rv = kstrtouint(buf, 10, &min); 4417 if (rv < 0) 4418 return rv; 4419 if (min == 0) 4420 return -EINVAL; 4421 } 4422 mddev->sync_speed_min = min; 4423 return len; 4424 } 4425 4426 static struct md_sysfs_entry md_sync_min = 4427 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 4428 4429 static ssize_t 4430 sync_max_show(struct mddev *mddev, char *page) 4431 { 4432 return sprintf(page, "%d (%s)\n", speed_max(mddev), 4433 mddev->sync_speed_max ? 
"local": "system"); 4434 } 4435 4436 static ssize_t 4437 sync_max_store(struct mddev *mddev, const char *buf, size_t len) 4438 { 4439 unsigned int max; 4440 int rv; 4441 4442 if (strncmp(buf, "system", 6)==0) { 4443 max = 0; 4444 } else { 4445 rv = kstrtouint(buf, 10, &max); 4446 if (rv < 0) 4447 return rv; 4448 if (max == 0) 4449 return -EINVAL; 4450 } 4451 mddev->sync_speed_max = max; 4452 return len; 4453 } 4454 4455 static struct md_sysfs_entry md_sync_max = 4456 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 4457 4458 static ssize_t 4459 degraded_show(struct mddev *mddev, char *page) 4460 { 4461 return sprintf(page, "%d\n", mddev->degraded); 4462 } 4463 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 4464 4465 static ssize_t 4466 sync_force_parallel_show(struct mddev *mddev, char *page) 4467 { 4468 return sprintf(page, "%d\n", mddev->parallel_resync); 4469 } 4470 4471 static ssize_t 4472 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) 4473 { 4474 long n; 4475 4476 if (kstrtol(buf, 10, &n)) 4477 return -EINVAL; 4478 4479 if (n != 0 && n != 1) 4480 return -EINVAL; 4481 4482 mddev->parallel_resync = n; 4483 4484 if (mddev->sync_thread) 4485 wake_up(&resync_wait); 4486 4487 return len; 4488 } 4489 4490 /* force parallel resync, even with shared block devices */ 4491 static struct md_sysfs_entry md_sync_force_parallel = 4492 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 4493 sync_force_parallel_show, sync_force_parallel_store); 4494 4495 static ssize_t 4496 sync_speed_show(struct mddev *mddev, char *page) 4497 { 4498 unsigned long resync, dt, db; 4499 if (mddev->curr_resync == 0) 4500 return sprintf(page, "none\n"); 4501 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 4502 dt = (jiffies - mddev->resync_mark) / HZ; 4503 if (!dt) dt++; 4504 db = resync - mddev->resync_mark_cnt; 4505 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 4506 } 4507 4508 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 4509 4510 static ssize_t 4511 sync_completed_show(struct mddev *mddev, char *page) 4512 { 4513 unsigned long long max_sectors, resync; 4514 4515 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4516 return sprintf(page, "none\n"); 4517 4518 if (mddev->curr_resync == 1 || 4519 mddev->curr_resync == 2) 4520 return sprintf(page, "delayed\n"); 4521 4522 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 4523 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 4524 max_sectors = mddev->resync_max_sectors; 4525 else 4526 max_sectors = mddev->dev_sectors; 4527 4528 resync = mddev->curr_resync_completed; 4529 return sprintf(page, "%llu / %llu\n", resync, max_sectors); 4530 } 4531 4532 static struct md_sysfs_entry md_sync_completed = 4533 __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL); 4534 4535 static ssize_t 4536 min_sync_show(struct mddev *mddev, char *page) 4537 { 4538 return sprintf(page, "%llu\n", 4539 (unsigned long long)mddev->resync_min); 4540 } 4541 static ssize_t 4542 min_sync_store(struct mddev *mddev, const char *buf, size_t len) 4543 { 4544 unsigned long long min; 4545 int err; 4546 4547 if (kstrtoull(buf, 10, &min)) 4548 return -EINVAL; 4549 4550 spin_lock(&mddev->lock); 4551 err = -EINVAL; 4552 if (min > mddev->resync_max) 4553 goto out_unlock; 4554 4555 err = -EBUSY; 4556 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4557 goto out_unlock; 4558 4559 /* Round down to multiple of 4K for safety */ 4560 mddev->resync_min = round_down(min, 8); 4561 err = 0; 
4562 4563 out_unlock: 4564 spin_unlock(&mddev->lock); 4565 return err ?: len; 4566 } 4567 4568 static struct md_sysfs_entry md_min_sync = 4569 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 4570 4571 static ssize_t 4572 max_sync_show(struct mddev *mddev, char *page) 4573 { 4574 if (mddev->resync_max == MaxSector) 4575 return sprintf(page, "max\n"); 4576 else 4577 return sprintf(page, "%llu\n", 4578 (unsigned long long)mddev->resync_max); 4579 } 4580 static ssize_t 4581 max_sync_store(struct mddev *mddev, const char *buf, size_t len) 4582 { 4583 int err; 4584 spin_lock(&mddev->lock); 4585 if (strncmp(buf, "max", 3) == 0) 4586 mddev->resync_max = MaxSector; 4587 else { 4588 unsigned long long max; 4589 int chunk; 4590 4591 err = -EINVAL; 4592 if (kstrtoull(buf, 10, &max)) 4593 goto out_unlock; 4594 if (max < mddev->resync_min) 4595 goto out_unlock; 4596 4597 err = -EBUSY; 4598 if (max < mddev->resync_max && 4599 mddev->ro == 0 && 4600 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4601 goto out_unlock; 4602 4603 /* Must be a multiple of chunk_size */ 4604 chunk = mddev->chunk_sectors; 4605 if (chunk) { 4606 sector_t temp = max; 4607 4608 err = -EINVAL; 4609 if (sector_div(temp, chunk)) 4610 goto out_unlock; 4611 } 4612 mddev->resync_max = max; 4613 } 4614 wake_up(&mddev->recovery_wait); 4615 err = 0; 4616 out_unlock: 4617 spin_unlock(&mddev->lock); 4618 return err ?: len; 4619 } 4620 4621 static struct md_sysfs_entry md_max_sync = 4622 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 4623 4624 static ssize_t 4625 suspend_lo_show(struct mddev *mddev, char *page) 4626 { 4627 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 4628 } 4629 4630 static ssize_t 4631 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) 4632 { 4633 unsigned long long old, new; 4634 int err; 4635 4636 err = kstrtoull(buf, 10, &new); 4637 if (err < 0) 4638 return err; 4639 if (new != (sector_t)new) 4640 return -EINVAL; 4641 4642 err = mddev_lock(mddev); 4643 if (err) 4644 return err; 4645 err = -EINVAL; 4646 if (mddev->pers == NULL || 4647 mddev->pers->quiesce == NULL) 4648 goto unlock; 4649 old = mddev->suspend_lo; 4650 mddev->suspend_lo = new; 4651 if (new >= old) 4652 /* Shrinking suspended region */ 4653 mddev->pers->quiesce(mddev, 2); 4654 else { 4655 /* Expanding suspended region - need to wait */ 4656 mddev->pers->quiesce(mddev, 1); 4657 mddev->pers->quiesce(mddev, 0); 4658 } 4659 err = 0; 4660 unlock: 4661 mddev_unlock(mddev); 4662 return err ?: len; 4663 } 4664 static struct md_sysfs_entry md_suspend_lo = 4665 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 4666 4667 static ssize_t 4668 suspend_hi_show(struct mddev *mddev, char *page) 4669 { 4670 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); 4671 } 4672 4673 static ssize_t 4674 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) 4675 { 4676 unsigned long long old, new; 4677 int err; 4678 4679 err = kstrtoull(buf, 10, &new); 4680 if (err < 0) 4681 return err; 4682 if (new != (sector_t)new) 4683 return -EINVAL; 4684 4685 err = mddev_lock(mddev); 4686 if (err) 4687 return err; 4688 err = -EINVAL; 4689 if (mddev->pers == NULL || 4690 mddev->pers->quiesce == NULL) 4691 goto unlock; 4692 old = mddev->suspend_hi; 4693 mddev->suspend_hi = new; 4694 if (new <= old) 4695 /* Shrinking suspended region */ 4696 mddev->pers->quiesce(mddev, 2); 4697 else { 4698 /* Expanding suspended region - need to wait */ 4699 mddev->pers->quiesce(mddev, 
1);
		mddev->pers->quiesce(mddev, 0);
	}
	err = 0;
unlock:
	mddev_unlock(mddev);
	return err ?: len;
}
static struct md_sysfs_entry md_suspend_hi =
__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);

static ssize_t
reshape_position_show(struct mddev *mddev, char *page)
{
	if (mddev->reshape_position != MaxSector)
		return sprintf(page, "%llu\n",
			       (unsigned long long)mddev->reshape_position);
	strcpy(page, "none\n");
	return 5;
}

static ssize_t
reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
{
	struct md_rdev *rdev;
	unsigned long long new;
	int err;

	err = kstrtoull(buf, 10, &new);
	if (err < 0)
		return err;
	if (new != (sector_t)new)
		return -EINVAL;
	err = mddev_lock(mddev);
	if (err)
		return err;
	err = -EBUSY;
	if (mddev->pers)
		goto unlock;
	mddev->reshape_position = new;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;
	mddev->new_level = mddev->level;
	mddev->new_layout = mddev->layout;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
	rdev_for_each(rdev, mddev)
		rdev->new_data_offset = rdev->data_offset;
	err = 0;
unlock:
	mddev_unlock(mddev);
	return err ?: len;
}

static struct md_sysfs_entry md_reshape_position =
__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
       reshape_position_store);

static ssize_t
reshape_direction_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%s\n",
		       mddev->reshape_backwards ? "backwards" : "forwards");
}

static ssize_t
reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
{
	int backwards = 0;
	int err;

	if (cmd_match(buf, "forwards"))
		backwards = 0;
	else if (cmd_match(buf, "backwards"))
		backwards = 1;
	else
		return -EINVAL;
	if (mddev->reshape_backwards == backwards)
		return len;

	err = mddev_lock(mddev);
	if (err)
		return err;
	/* check if we are allowed to change */
	if (mddev->delta_disks)
		err = -EBUSY;
	else if (mddev->persistent &&
		 mddev->major_version == 0)
		err = -EINVAL;
	else
		mddev->reshape_backwards = backwards;
	mddev_unlock(mddev);
	return err ?: len;
}

static struct md_sysfs_entry md_reshape_direction =
__ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
       reshape_direction_store);

static ssize_t
array_size_show(struct mddev *mddev, char *page)
{
	if (mddev->external_size)
		return sprintf(page, "%llu\n",
			       (unsigned long long)mddev->array_sectors/2);
	else
		return sprintf(page, "default\n");
}

static ssize_t
array_size_store(struct mddev *mddev, const char *buf, size_t len)
{
	sector_t sectors;
	int err;

	err = mddev_lock(mddev);
	if (err)
		return err;

	if (strncmp(buf, "default", 7) == 0) {
		if (mddev->pers)
			sectors = mddev->pers->size(mddev, 0, 0);
		else
			sectors = mddev->array_sectors;

		mddev->external_size = 0;
	} else {
		if (strict_blocks_to_sectors(buf, &sectors) < 0)
			err = -EINVAL;
		else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
			err = -E2BIG;
		else
			mddev->external_size = 1;
	}

	if (!err) {
		mddev->array_sectors = sectors;
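		/*
		 * Editorial note, not in the original driver: "default"
		 * reverts to the size computed by the personality, while any
		 * other value is a size in 1K blocks parsed by
		 * strict_blocks_to_sectors() above and rejected with -E2BIG
		 * if it exceeds what the personality can provide.  When a
		 * personality is active, the set_capacity() and
		 * revalidate_disk() calls below publish the accepted size to
		 * the block layer.  A hedged userspace sketch, assuming an
		 * array named md0:
		 *
		 *	int fd = open("/sys/block/md0/md/array_size", O_WRONLY);
		 *
		 *	if (fd >= 0) {
		 *		write(fd, "default", 7);
		 *		close(fd);
		 *	}
		 */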
4835 if (mddev->pers) { 4836 set_capacity(mddev->gendisk, mddev->array_sectors); 4837 revalidate_disk(mddev->gendisk); 4838 } 4839 } 4840 mddev_unlock(mddev); 4841 return err ?: len; 4842 } 4843 4844 static struct md_sysfs_entry md_array_size = 4845 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, 4846 array_size_store); 4847 4848 static struct attribute *md_default_attrs[] = { 4849 &md_level.attr, 4850 &md_layout.attr, 4851 &md_raid_disks.attr, 4852 &md_chunk_size.attr, 4853 &md_size.attr, 4854 &md_resync_start.attr, 4855 &md_metadata.attr, 4856 &md_new_device.attr, 4857 &md_safe_delay.attr, 4858 &md_array_state.attr, 4859 &md_reshape_position.attr, 4860 &md_reshape_direction.attr, 4861 &md_array_size.attr, 4862 &max_corr_read_errors.attr, 4863 NULL, 4864 }; 4865 4866 static struct attribute *md_redundancy_attrs[] = { 4867 &md_scan_mode.attr, 4868 &md_last_scan_mode.attr, 4869 &md_mismatches.attr, 4870 &md_sync_min.attr, 4871 &md_sync_max.attr, 4872 &md_sync_speed.attr, 4873 &md_sync_force_parallel.attr, 4874 &md_sync_completed.attr, 4875 &md_min_sync.attr, 4876 &md_max_sync.attr, 4877 &md_suspend_lo.attr, 4878 &md_suspend_hi.attr, 4879 &md_bitmap.attr, 4880 &md_degraded.attr, 4881 NULL, 4882 }; 4883 static struct attribute_group md_redundancy_group = { 4884 .name = NULL, 4885 .attrs = md_redundancy_attrs, 4886 }; 4887 4888 static ssize_t 4889 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 4890 { 4891 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 4892 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 4893 ssize_t rv; 4894 4895 if (!entry->show) 4896 return -EIO; 4897 spin_lock(&all_mddevs_lock); 4898 if (list_empty(&mddev->all_mddevs)) { 4899 spin_unlock(&all_mddevs_lock); 4900 return -EBUSY; 4901 } 4902 mddev_get(mddev); 4903 spin_unlock(&all_mddevs_lock); 4904 4905 rv = entry->show(mddev, page); 4906 mddev_put(mddev); 4907 return rv; 4908 } 4909 4910 static ssize_t 4911 md_attr_store(struct kobject *kobj, struct attribute *attr, 4912 const char *page, size_t length) 4913 { 4914 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 4915 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 4916 ssize_t rv; 4917 4918 if (!entry->store) 4919 return -EIO; 4920 if (!capable(CAP_SYS_ADMIN)) 4921 return -EACCES; 4922 spin_lock(&all_mddevs_lock); 4923 if (list_empty(&mddev->all_mddevs)) { 4924 spin_unlock(&all_mddevs_lock); 4925 return -EBUSY; 4926 } 4927 mddev_get(mddev); 4928 spin_unlock(&all_mddevs_lock); 4929 rv = entry->store(mddev, page, length); 4930 mddev_put(mddev); 4931 return rv; 4932 } 4933 4934 static void md_free(struct kobject *ko) 4935 { 4936 struct mddev *mddev = container_of(ko, struct mddev, kobj); 4937 4938 if (mddev->sysfs_state) 4939 sysfs_put(mddev->sysfs_state); 4940 4941 if (mddev->queue) 4942 blk_cleanup_queue(mddev->queue); 4943 if (mddev->gendisk) { 4944 del_gendisk(mddev->gendisk); 4945 put_disk(mddev->gendisk); 4946 } 4947 4948 kfree(mddev); 4949 } 4950 4951 static const struct sysfs_ops md_sysfs_ops = { 4952 .show = md_attr_show, 4953 .store = md_attr_store, 4954 }; 4955 static struct kobj_type md_ktype = { 4956 .release = md_free, 4957 .sysfs_ops = &md_sysfs_ops, 4958 .default_attrs = md_default_attrs, 4959 }; 4960 4961 int mdp_major = 0; 4962 4963 static void mddev_delayed_delete(struct work_struct *ws) 4964 { 4965 struct mddev *mddev = container_of(ws, struct mddev, del_work); 4966 4967 sysfs_remove_group(&mddev->kobj, &md_bitmap_group); 4968 
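	/*
	 * Editorial note, not in the original driver: the bitmap attribute
	 * group registered by md_alloc() below is removed explicitly first;
	 * kobject_del() then takes down the rest of the per-array "md"
	 * sysfs directory, and kobject_put() drops a reference so that
	 * md_free() (the md_ktype release hook above) can run once the
	 * last holder goes away.
	 */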
kobject_del(&mddev->kobj); 4969 kobject_put(&mddev->kobj); 4970 } 4971 4972 static int md_alloc(dev_t dev, char *name) 4973 { 4974 static DEFINE_MUTEX(disks_mutex); 4975 struct mddev *mddev = mddev_find(dev); 4976 struct gendisk *disk; 4977 int partitioned; 4978 int shift; 4979 int unit; 4980 int error; 4981 4982 if (!mddev) 4983 return -ENODEV; 4984 4985 partitioned = (MAJOR(mddev->unit) != MD_MAJOR); 4986 shift = partitioned ? MdpMinorShift : 0; 4987 unit = MINOR(mddev->unit) >> shift; 4988 4989 /* wait for any previous instance of this device to be 4990 * completely removed (mddev_delayed_delete). 4991 */ 4992 flush_workqueue(md_misc_wq); 4993 4994 mutex_lock(&disks_mutex); 4995 error = -EEXIST; 4996 if (mddev->gendisk) 4997 goto abort; 4998 4999 if (name) { 5000 /* Need to ensure that 'name' is not a duplicate. 5001 */ 5002 struct mddev *mddev2; 5003 spin_lock(&all_mddevs_lock); 5004 5005 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 5006 if (mddev2->gendisk && 5007 strcmp(mddev2->gendisk->disk_name, name) == 0) { 5008 spin_unlock(&all_mddevs_lock); 5009 goto abort; 5010 } 5011 spin_unlock(&all_mddevs_lock); 5012 } 5013 5014 error = -ENOMEM; 5015 mddev->queue = blk_alloc_queue(GFP_KERNEL); 5016 if (!mddev->queue) 5017 goto abort; 5018 mddev->queue->queuedata = mddev; 5019 5020 blk_queue_make_request(mddev->queue, md_make_request); 5021 blk_set_stacking_limits(&mddev->queue->limits); 5022 5023 disk = alloc_disk(1 << shift); 5024 if (!disk) { 5025 blk_cleanup_queue(mddev->queue); 5026 mddev->queue = NULL; 5027 goto abort; 5028 } 5029 disk->major = MAJOR(mddev->unit); 5030 disk->first_minor = unit << shift; 5031 if (name) 5032 strcpy(disk->disk_name, name); 5033 else if (partitioned) 5034 sprintf(disk->disk_name, "md_d%d", unit); 5035 else 5036 sprintf(disk->disk_name, "md%d", unit); 5037 disk->fops = &md_fops; 5038 disk->private_data = mddev; 5039 disk->queue = mddev->queue; 5040 blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA); 5041 /* Allow extended partitions. This makes the 5042 * 'mdp' device redundant, but we can't really 5043 * remove it now. 5044 */ 5045 disk->flags |= GENHD_FL_EXT_DEVT; 5046 mddev->gendisk = disk; 5047 /* As soon as we call add_disk(), another thread could get 5048 * through to md_open, so make sure it doesn't get too far 5049 */ 5050 mutex_lock(&mddev->open_mutex); 5051 add_disk(disk); 5052 5053 error = kobject_init_and_add(&mddev->kobj, &md_ktype, 5054 &disk_to_dev(disk)->kobj, "%s", "md"); 5055 if (error) { 5056 /* This isn't possible, but as kobject_init_and_add is marked 5057 * __must_check, we must do something with the result 5058 */ 5059 printk(KERN_WARNING "md: cannot register %s/md - name in use\n", 5060 disk->disk_name); 5061 error = 0; 5062 } 5063 if (mddev->kobj.sd && 5064 sysfs_create_group(&mddev->kobj, &md_bitmap_group)) 5065 printk(KERN_DEBUG "pointless warning\n"); 5066 mutex_unlock(&mddev->open_mutex); 5067 abort: 5068 mutex_unlock(&disks_mutex); 5069 if (!error && mddev->kobj.sd) { 5070 kobject_uevent(&mddev->kobj, KOBJ_ADD); 5071 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); 5072 } 5073 mddev_put(mddev); 5074 return error; 5075 } 5076 5077 static struct kobject *md_probe(dev_t dev, int *part, void *data) 5078 { 5079 md_alloc(dev, NULL); 5080 return NULL; 5081 } 5082 5083 static int add_named_array(const char *val, struct kernel_param *kp) 5084 { 5085 /* val must be "md_*" where * is not all digits. 5086 * We allocate an array with a large free minor number, and 5087 * set the name to val. 
val must not already be an active name. 5088 */ 5089 int len = strlen(val); 5090 char buf[DISK_NAME_LEN]; 5091 5092 while (len && val[len-1] == '\n') 5093 len--; 5094 if (len >= DISK_NAME_LEN) 5095 return -E2BIG; 5096 strlcpy(buf, val, len+1); 5097 if (strncmp(buf, "md_", 3) != 0) 5098 return -EINVAL; 5099 return md_alloc(0, buf); 5100 } 5101 5102 static void md_safemode_timeout(unsigned long data) 5103 { 5104 struct mddev *mddev = (struct mddev *) data; 5105 5106 if (!atomic_read(&mddev->writes_pending)) { 5107 mddev->safemode = 1; 5108 if (mddev->external) 5109 sysfs_notify_dirent_safe(mddev->sysfs_state); 5110 } 5111 md_wakeup_thread(mddev->thread); 5112 } 5113 5114 static int start_dirty_degraded; 5115 5116 int md_run(struct mddev *mddev) 5117 { 5118 int err; 5119 struct md_rdev *rdev; 5120 struct md_personality *pers; 5121 5122 if (list_empty(&mddev->disks)) 5123 /* cannot run an array with no devices.. */ 5124 return -EINVAL; 5125 5126 if (mddev->pers) 5127 return -EBUSY; 5128 /* Cannot run until previous stop completes properly */ 5129 if (mddev->sysfs_active) 5130 return -EBUSY; 5131 5132 /* 5133 * Analyze all RAID superblock(s) 5134 */ 5135 if (!mddev->raid_disks) { 5136 if (!mddev->persistent) 5137 return -EINVAL; 5138 analyze_sbs(mddev); 5139 } 5140 5141 if (mddev->level != LEVEL_NONE) 5142 request_module("md-level-%d", mddev->level); 5143 else if (mddev->clevel[0]) 5144 request_module("md-%s", mddev->clevel); 5145 5146 /* 5147 * Drop all container device buffers, from now on 5148 * the only valid external interface is through the md 5149 * device. 5150 */ 5151 rdev_for_each(rdev, mddev) { 5152 if (test_bit(Faulty, &rdev->flags)) 5153 continue; 5154 sync_blockdev(rdev->bdev); 5155 invalidate_bdev(rdev->bdev); 5156 5157 /* perform some consistency tests on the device. 5158 * We don't want the data to overlap the metadata, 5159 * Internal Bitmap issues have been handled elsewhere. 5160 */ 5161 if (rdev->meta_bdev) { 5162 /* Nothing to check */; 5163 } else if (rdev->data_offset < rdev->sb_start) { 5164 if (mddev->dev_sectors && 5165 rdev->data_offset + mddev->dev_sectors 5166 > rdev->sb_start) { 5167 printk("md: %s: data overlaps metadata\n", 5168 mdname(mddev)); 5169 return -EINVAL; 5170 } 5171 } else { 5172 if (rdev->sb_start + rdev->sb_size/512 5173 > rdev->data_offset) { 5174 printk("md: %s: metadata overlaps data\n", 5175 mdname(mddev)); 5176 return -EINVAL; 5177 } 5178 } 5179 sysfs_notify_dirent_safe(rdev->sysfs_state); 5180 } 5181 5182 if (mddev->bio_set == NULL) 5183 mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0); 5184 5185 spin_lock(&pers_lock); 5186 pers = find_pers(mddev->level, mddev->clevel); 5187 if (!pers || !try_module_get(pers->owner)) { 5188 spin_unlock(&pers_lock); 5189 if (mddev->level != LEVEL_NONE) 5190 printk(KERN_WARNING "md: personality for level %d is not loaded!\n", 5191 mddev->level); 5192 else 5193 printk(KERN_WARNING "md: personality for level %s is not loaded!\n", 5194 mddev->clevel); 5195 return -EINVAL; 5196 } 5197 spin_unlock(&pers_lock); 5198 if (mddev->level != pers->level) { 5199 mddev->level = pers->level; 5200 mddev->new_level = pers->level; 5201 } 5202 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 5203 5204 if (mddev->reshape_position != MaxSector && 5205 pers->start_reshape == NULL) { 5206 /* This personality cannot handle reshaping... */ 5207 module_put(pers->owner); 5208 return -EINVAL; 5209 } 5210 5211 if (pers->sync_request) { 5212 /* Warn if this is a potentially silly 5213 * configuration. 
5214 */ 5215 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 5216 struct md_rdev *rdev2; 5217 int warned = 0; 5218 5219 rdev_for_each(rdev, mddev) 5220 rdev_for_each(rdev2, mddev) { 5221 if (rdev < rdev2 && 5222 rdev->bdev->bd_contains == 5223 rdev2->bdev->bd_contains) { 5224 printk(KERN_WARNING 5225 "%s: WARNING: %s appears to be" 5226 " on the same physical disk as" 5227 " %s.\n", 5228 mdname(mddev), 5229 bdevname(rdev->bdev,b), 5230 bdevname(rdev2->bdev,b2)); 5231 warned = 1; 5232 } 5233 } 5234 5235 if (warned) 5236 printk(KERN_WARNING 5237 "True protection against single-disk" 5238 " failure might be compromised.\n"); 5239 } 5240 5241 mddev->recovery = 0; 5242 /* may be over-ridden by personality */ 5243 mddev->resync_max_sectors = mddev->dev_sectors; 5244 5245 mddev->ok_start_degraded = start_dirty_degraded; 5246 5247 if (start_readonly && mddev->ro == 0) 5248 mddev->ro = 2; /* read-only, but switch on first write */ 5249 5250 err = pers->run(mddev); 5251 if (err) 5252 printk(KERN_ERR "md: pers->run() failed ...\n"); 5253 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { 5254 WARN_ONCE(!mddev->external_size, "%s: default size too small," 5255 " but 'external_size' not in effect?\n", __func__); 5256 printk(KERN_ERR 5257 "md: invalid array_size %llu > default size %llu\n", 5258 (unsigned long long)mddev->array_sectors / 2, 5259 (unsigned long long)pers->size(mddev, 0, 0) / 2); 5260 err = -EINVAL; 5261 } 5262 if (err == 0 && pers->sync_request && 5263 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { 5264 struct bitmap *bitmap; 5265 5266 bitmap = bitmap_create(mddev, -1); 5267 if (IS_ERR(bitmap)) { 5268 err = PTR_ERR(bitmap); 5269 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 5270 mdname(mddev), err); 5271 } else 5272 mddev->bitmap = bitmap; 5273 5274 } 5275 if (err) { 5276 mddev_detach(mddev); 5277 if (mddev->private) 5278 pers->free(mddev, mddev->private); 5279 mddev->private = NULL; 5280 module_put(pers->owner); 5281 bitmap_destroy(mddev); 5282 return err; 5283 } 5284 if (mddev->queue) { 5285 mddev->queue->backing_dev_info.congested_data = mddev; 5286 mddev->queue->backing_dev_info.congested_fn = md_congested; 5287 } 5288 if (pers->sync_request) { 5289 if (mddev->kobj.sd && 5290 sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 5291 printk(KERN_WARNING 5292 "md: cannot register extra attributes for %s\n", 5293 mdname(mddev)); 5294 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); 5295 } else if (mddev->ro == 2) /* auto-readonly not meaningful */ 5296 mddev->ro = 0; 5297 5298 atomic_set(&mddev->writes_pending,0); 5299 atomic_set(&mddev->max_corr_read_errors, 5300 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); 5301 mddev->safemode = 0; 5302 if (mddev_is_clustered(mddev)) 5303 mddev->safemode_delay = 0; 5304 else 5305 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 5306 mddev->in_sync = 1; 5307 smp_wmb(); 5308 spin_lock(&mddev->lock); 5309 mddev->pers = pers; 5310 spin_unlock(&mddev->lock); 5311 rdev_for_each(rdev, mddev) 5312 if (rdev->raid_disk >= 0) 5313 if (sysfs_link_rdev(mddev, rdev)) 5314 /* failure here is OK */; 5315 5316 if (mddev->degraded && !mddev->ro) 5317 /* This ensures that recovering status is reported immediately 5318 * via sysfs - until a lack of spares is confirmed. 
5319 */ 5320 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 5321 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5322 5323 if (mddev->flags & MD_UPDATE_SB_FLAGS) 5324 md_update_sb(mddev, 0); 5325 5326 md_new_event(mddev); 5327 sysfs_notify_dirent_safe(mddev->sysfs_state); 5328 sysfs_notify_dirent_safe(mddev->sysfs_action); 5329 sysfs_notify(&mddev->kobj, NULL, "degraded"); 5330 return 0; 5331 } 5332 EXPORT_SYMBOL_GPL(md_run); 5333 5334 static int do_md_run(struct mddev *mddev) 5335 { 5336 int err; 5337 5338 err = md_run(mddev); 5339 if (err) 5340 goto out; 5341 err = bitmap_load(mddev); 5342 if (err) { 5343 bitmap_destroy(mddev); 5344 goto out; 5345 } 5346 5347 if (mddev_is_clustered(mddev)) 5348 md_allow_write(mddev); 5349 5350 md_wakeup_thread(mddev->thread); 5351 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 5352 5353 set_capacity(mddev->gendisk, mddev->array_sectors); 5354 revalidate_disk(mddev->gendisk); 5355 mddev->changed = 1; 5356 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 5357 out: 5358 return err; 5359 } 5360 5361 static int restart_array(struct mddev *mddev) 5362 { 5363 struct gendisk *disk = mddev->gendisk; 5364 5365 /* Complain if it has no devices */ 5366 if (list_empty(&mddev->disks)) 5367 return -ENXIO; 5368 if (!mddev->pers) 5369 return -EINVAL; 5370 if (!mddev->ro) 5371 return -EBUSY; 5372 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 5373 struct md_rdev *rdev; 5374 bool has_journal = false; 5375 5376 rcu_read_lock(); 5377 rdev_for_each_rcu(rdev, mddev) { 5378 if (test_bit(Journal, &rdev->flags) && 5379 !test_bit(Faulty, &rdev->flags)) { 5380 has_journal = true; 5381 break; 5382 } 5383 } 5384 rcu_read_unlock(); 5385 5386 /* Don't restart rw with journal missing/faulty */ 5387 if (!has_journal) 5388 return -EINVAL; 5389 } 5390 5391 mddev->safemode = 0; 5392 mddev->ro = 0; 5393 set_disk_ro(disk, 0); 5394 printk(KERN_INFO "md: %s switched to read-write mode.\n", 5395 mdname(mddev)); 5396 /* Kick recovery or resync if necessary */ 5397 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5398 md_wakeup_thread(mddev->thread); 5399 md_wakeup_thread(mddev->sync_thread); 5400 sysfs_notify_dirent_safe(mddev->sysfs_state); 5401 return 0; 5402 } 5403 5404 static void md_clean(struct mddev *mddev) 5405 { 5406 mddev->array_sectors = 0; 5407 mddev->external_size = 0; 5408 mddev->dev_sectors = 0; 5409 mddev->raid_disks = 0; 5410 mddev->recovery_cp = 0; 5411 mddev->resync_min = 0; 5412 mddev->resync_max = MaxSector; 5413 mddev->reshape_position = MaxSector; 5414 mddev->external = 0; 5415 mddev->persistent = 0; 5416 mddev->level = LEVEL_NONE; 5417 mddev->clevel[0] = 0; 5418 mddev->flags = 0; 5419 mddev->ro = 0; 5420 mddev->metadata_type[0] = 0; 5421 mddev->chunk_sectors = 0; 5422 mddev->ctime = mddev->utime = 0; 5423 mddev->layout = 0; 5424 mddev->max_disks = 0; 5425 mddev->events = 0; 5426 mddev->can_decrease_events = 0; 5427 mddev->delta_disks = 0; 5428 mddev->reshape_backwards = 0; 5429 mddev->new_level = LEVEL_NONE; 5430 mddev->new_layout = 0; 5431 mddev->new_chunk_sectors = 0; 5432 mddev->curr_resync = 0; 5433 atomic64_set(&mddev->resync_mismatches, 0); 5434 mddev->suspend_lo = mddev->suspend_hi = 0; 5435 mddev->sync_speed_min = mddev->sync_speed_max = 0; 5436 mddev->recovery = 0; 5437 mddev->in_sync = 0; 5438 mddev->changed = 0; 5439 mddev->degraded = 0; 5440 mddev->safemode = 0; 5441 mddev->private = NULL; 5442 mddev->bitmap_info.offset = 0; 5443 mddev->bitmap_info.default_offset = 0; 5444 mddev->bitmap_info.default_space = 0; 5445 
mddev->bitmap_info.chunksize = 0; 5446 mddev->bitmap_info.daemon_sleep = 0; 5447 mddev->bitmap_info.max_write_behind = 0; 5448 } 5449 5450 static void __md_stop_writes(struct mddev *mddev) 5451 { 5452 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5453 flush_workqueue(md_misc_wq); 5454 if (mddev->sync_thread) { 5455 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5456 md_reap_sync_thread(mddev); 5457 } 5458 5459 del_timer_sync(&mddev->safemode_timer); 5460 5461 bitmap_flush(mddev); 5462 md_super_wait(mddev); 5463 5464 if (mddev->ro == 0 && 5465 ((!mddev->in_sync && !mddev_is_clustered(mddev)) || 5466 (mddev->flags & MD_UPDATE_SB_FLAGS))) { 5467 /* mark array as shutdown cleanly */ 5468 if (!mddev_is_clustered(mddev)) 5469 mddev->in_sync = 1; 5470 md_update_sb(mddev, 1); 5471 } 5472 } 5473 5474 void md_stop_writes(struct mddev *mddev) 5475 { 5476 mddev_lock_nointr(mddev); 5477 __md_stop_writes(mddev); 5478 mddev_unlock(mddev); 5479 } 5480 EXPORT_SYMBOL_GPL(md_stop_writes); 5481 5482 static void mddev_detach(struct mddev *mddev) 5483 { 5484 struct bitmap *bitmap = mddev->bitmap; 5485 /* wait for behind writes to complete */ 5486 if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { 5487 printk(KERN_INFO "md:%s: behind writes in progress - waiting to stop.\n", 5488 mdname(mddev)); 5489 /* need to kick something here to make sure I/O goes? */ 5490 wait_event(bitmap->behind_wait, 5491 atomic_read(&bitmap->behind_writes) == 0); 5492 } 5493 if (mddev->pers && mddev->pers->quiesce) { 5494 mddev->pers->quiesce(mddev, 1); 5495 mddev->pers->quiesce(mddev, 0); 5496 } 5497 md_unregister_thread(&mddev->thread); 5498 if (mddev->queue) 5499 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 5500 } 5501 5502 static void __md_stop(struct mddev *mddev) 5503 { 5504 struct md_personality *pers = mddev->pers; 5505 mddev_detach(mddev); 5506 /* Ensure ->event_work is done */ 5507 flush_workqueue(md_misc_wq); 5508 spin_lock(&mddev->lock); 5509 mddev->pers = NULL; 5510 spin_unlock(&mddev->lock); 5511 pers->free(mddev, mddev->private); 5512 mddev->private = NULL; 5513 if (pers->sync_request && mddev->to_remove == NULL) 5514 mddev->to_remove = &md_redundancy_group; 5515 module_put(pers->owner); 5516 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5517 } 5518 5519 void md_stop(struct mddev *mddev) 5520 { 5521 /* stop the array and free an attached data structures. 
5522 * This is called from dm-raid 5523 */ 5524 __md_stop(mddev); 5525 bitmap_destroy(mddev); 5526 if (mddev->bio_set) 5527 bioset_free(mddev->bio_set); 5528 } 5529 5530 EXPORT_SYMBOL_GPL(md_stop); 5531 5532 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) 5533 { 5534 int err = 0; 5535 int did_freeze = 0; 5536 5537 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 5538 did_freeze = 1; 5539 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5540 md_wakeup_thread(mddev->thread); 5541 } 5542 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5543 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5544 if (mddev->sync_thread) 5545 /* Thread might be blocked waiting for metadata update 5546 * which will now never happen */ 5547 wake_up_process(mddev->sync_thread->tsk); 5548 5549 if (mddev->external && test_bit(MD_CHANGE_PENDING, &mddev->flags)) 5550 return -EBUSY; 5551 mddev_unlock(mddev); 5552 wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, 5553 &mddev->recovery)); 5554 wait_event(mddev->sb_wait, 5555 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 5556 mddev_lock_nointr(mddev); 5557 5558 mutex_lock(&mddev->open_mutex); 5559 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || 5560 mddev->sync_thread || 5561 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 5562 (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { 5563 printk("md: %s still in use.\n",mdname(mddev)); 5564 if (did_freeze) { 5565 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5566 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5567 md_wakeup_thread(mddev->thread); 5568 } 5569 err = -EBUSY; 5570 goto out; 5571 } 5572 if (mddev->pers) { 5573 __md_stop_writes(mddev); 5574 5575 err = -ENXIO; 5576 if (mddev->ro==1) 5577 goto out; 5578 mddev->ro = 1; 5579 set_disk_ro(mddev->gendisk, 1); 5580 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5581 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5582 md_wakeup_thread(mddev->thread); 5583 sysfs_notify_dirent_safe(mddev->sysfs_state); 5584 err = 0; 5585 } 5586 out: 5587 mutex_unlock(&mddev->open_mutex); 5588 return err; 5589 } 5590 5591 /* mode: 5592 * 0 - completely stop and dis-assemble array 5593 * 2 - stop but do not disassemble array 5594 */ 5595 static int do_md_stop(struct mddev *mddev, int mode, 5596 struct block_device *bdev) 5597 { 5598 struct gendisk *disk = mddev->gendisk; 5599 struct md_rdev *rdev; 5600 int did_freeze = 0; 5601 5602 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 5603 did_freeze = 1; 5604 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5605 md_wakeup_thread(mddev->thread); 5606 } 5607 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5608 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5609 if (mddev->sync_thread) 5610 /* Thread might be blocked waiting for metadata update 5611 * which will now never happen */ 5612 wake_up_process(mddev->sync_thread->tsk); 5613 5614 mddev_unlock(mddev); 5615 wait_event(resync_wait, (mddev->sync_thread == NULL && 5616 !test_bit(MD_RECOVERY_RUNNING, 5617 &mddev->recovery))); 5618 mddev_lock_nointr(mddev); 5619 5620 mutex_lock(&mddev->open_mutex); 5621 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || 5622 mddev->sysfs_active || 5623 mddev->sync_thread || 5624 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 5625 (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { 5626 printk("md: %s still in use.\n",mdname(mddev)); 5627 mutex_unlock(&mddev->open_mutex); 5628 if (did_freeze) { 5629 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5630 set_bit(MD_RECOVERY_NEEDED, 
&mddev->recovery); 5631 md_wakeup_thread(mddev->thread); 5632 } 5633 return -EBUSY; 5634 } 5635 if (mddev->pers) { 5636 if (mddev->ro) 5637 set_disk_ro(disk, 0); 5638 5639 __md_stop_writes(mddev); 5640 __md_stop(mddev); 5641 mddev->queue->backing_dev_info.congested_fn = NULL; 5642 5643 /* tell userspace to handle 'inactive' */ 5644 sysfs_notify_dirent_safe(mddev->sysfs_state); 5645 5646 rdev_for_each(rdev, mddev) 5647 if (rdev->raid_disk >= 0) 5648 sysfs_unlink_rdev(mddev, rdev); 5649 5650 set_capacity(disk, 0); 5651 mutex_unlock(&mddev->open_mutex); 5652 mddev->changed = 1; 5653 revalidate_disk(disk); 5654 5655 if (mddev->ro) 5656 mddev->ro = 0; 5657 } else 5658 mutex_unlock(&mddev->open_mutex); 5659 /* 5660 * Free resources if final stop 5661 */ 5662 if (mode == 0) { 5663 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 5664 5665 bitmap_destroy(mddev); 5666 if (mddev->bitmap_info.file) { 5667 struct file *f = mddev->bitmap_info.file; 5668 spin_lock(&mddev->lock); 5669 mddev->bitmap_info.file = NULL; 5670 spin_unlock(&mddev->lock); 5671 fput(f); 5672 } 5673 mddev->bitmap_info.offset = 0; 5674 5675 export_array(mddev); 5676 5677 md_clean(mddev); 5678 if (mddev->hold_active == UNTIL_STOP) 5679 mddev->hold_active = 0; 5680 } 5681 md_new_event(mddev); 5682 sysfs_notify_dirent_safe(mddev->sysfs_state); 5683 return 0; 5684 } 5685 5686 #ifndef MODULE 5687 static void autorun_array(struct mddev *mddev) 5688 { 5689 struct md_rdev *rdev; 5690 int err; 5691 5692 if (list_empty(&mddev->disks)) 5693 return; 5694 5695 printk(KERN_INFO "md: running: "); 5696 5697 rdev_for_each(rdev, mddev) { 5698 char b[BDEVNAME_SIZE]; 5699 printk("<%s>", bdevname(rdev->bdev,b)); 5700 } 5701 printk("\n"); 5702 5703 err = do_md_run(mddev); 5704 if (err) { 5705 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 5706 do_md_stop(mddev, 0, NULL); 5707 } 5708 } 5709 5710 /* 5711 * lets try to run arrays based on all disks that have arrived 5712 * until now. (those are in pending_raid_disks) 5713 * 5714 * the method: pick the first pending disk, collect all disks with 5715 * the same UUID, remove all from the pending list and put them into 5716 * the 'same_array' list. Then order this list based on superblock 5717 * update time (freshest comes first), kick out 'old' disks and 5718 * compare superblocks. If everything's fine then run it. 5719 * 5720 * If "unit" is allocated, then bump its reference count 5721 */ 5722 static void autorun_devices(int part) 5723 { 5724 struct md_rdev *rdev0, *rdev, *tmp; 5725 struct mddev *mddev; 5726 char b[BDEVNAME_SIZE]; 5727 5728 printk(KERN_INFO "md: autorun ...\n"); 5729 while (!list_empty(&pending_raid_disks)) { 5730 int unit; 5731 dev_t dev; 5732 LIST_HEAD(candidates); 5733 rdev0 = list_entry(pending_raid_disks.next, 5734 struct md_rdev, same_set); 5735 5736 printk(KERN_INFO "md: considering %s ...\n", 5737 bdevname(rdev0->bdev,b)); 5738 INIT_LIST_HEAD(&candidates); 5739 rdev_for_each_list(rdev, tmp, &pending_raid_disks) 5740 if (super_90_load(rdev, rdev0, 0) >= 0) { 5741 printk(KERN_INFO "md: adding %s ...\n", 5742 bdevname(rdev->bdev,b)); 5743 list_move(&rdev->same_set, &candidates); 5744 } 5745 /* 5746 * now we have a set of devices, with all of them having 5747 * mostly sane superblocks. It's time to allocate the 5748 * mddev. 
5749 */ 5750 if (part) { 5751 dev = MKDEV(mdp_major, 5752 rdev0->preferred_minor << MdpMinorShift); 5753 unit = MINOR(dev) >> MdpMinorShift; 5754 } else { 5755 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 5756 unit = MINOR(dev); 5757 } 5758 if (rdev0->preferred_minor != unit) { 5759 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 5760 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 5761 break; 5762 } 5763 5764 md_probe(dev, NULL, NULL); 5765 mddev = mddev_find(dev); 5766 if (!mddev || !mddev->gendisk) { 5767 if (mddev) 5768 mddev_put(mddev); 5769 printk(KERN_ERR 5770 "md: cannot allocate memory for md drive.\n"); 5771 break; 5772 } 5773 if (mddev_lock(mddev)) 5774 printk(KERN_WARNING "md: %s locked, cannot run\n", 5775 mdname(mddev)); 5776 else if (mddev->raid_disks || mddev->major_version 5777 || !list_empty(&mddev->disks)) { 5778 printk(KERN_WARNING 5779 "md: %s already running, cannot run %s\n", 5780 mdname(mddev), bdevname(rdev0->bdev,b)); 5781 mddev_unlock(mddev); 5782 } else { 5783 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 5784 mddev->persistent = 1; 5785 rdev_for_each_list(rdev, tmp, &candidates) { 5786 list_del_init(&rdev->same_set); 5787 if (bind_rdev_to_array(rdev, mddev)) 5788 export_rdev(rdev); 5789 } 5790 autorun_array(mddev); 5791 mddev_unlock(mddev); 5792 } 5793 /* on success, candidates will be empty, on error 5794 * it won't... 5795 */ 5796 rdev_for_each_list(rdev, tmp, &candidates) { 5797 list_del_init(&rdev->same_set); 5798 export_rdev(rdev); 5799 } 5800 mddev_put(mddev); 5801 } 5802 printk(KERN_INFO "md: ... autorun DONE.\n"); 5803 } 5804 #endif /* !MODULE */ 5805 5806 static int get_version(void __user *arg) 5807 { 5808 mdu_version_t ver; 5809 5810 ver.major = MD_MAJOR_VERSION; 5811 ver.minor = MD_MINOR_VERSION; 5812 ver.patchlevel = MD_PATCHLEVEL_VERSION; 5813 5814 if (copy_to_user(arg, &ver, sizeof(ver))) 5815 return -EFAULT; 5816 5817 return 0; 5818 } 5819 5820 static int get_array_info(struct mddev *mddev, void __user *arg) 5821 { 5822 mdu_array_info_t info; 5823 int nr,working,insync,failed,spare; 5824 struct md_rdev *rdev; 5825 5826 nr = working = insync = failed = spare = 0; 5827 rcu_read_lock(); 5828 rdev_for_each_rcu(rdev, mddev) { 5829 nr++; 5830 if (test_bit(Faulty, &rdev->flags)) 5831 failed++; 5832 else { 5833 working++; 5834 if (test_bit(In_sync, &rdev->flags)) 5835 insync++; 5836 else 5837 spare++; 5838 } 5839 } 5840 rcu_read_unlock(); 5841 5842 info.major_version = mddev->major_version; 5843 info.minor_version = mddev->minor_version; 5844 info.patch_version = MD_PATCHLEVEL_VERSION; 5845 info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); 5846 info.level = mddev->level; 5847 info.size = mddev->dev_sectors / 2; 5848 if (info.size != mddev->dev_sectors / 2) /* overflow */ 5849 info.size = -1; 5850 info.nr_disks = nr; 5851 info.raid_disks = mddev->raid_disks; 5852 info.md_minor = mddev->md_minor; 5853 info.not_persistent= !mddev->persistent; 5854 5855 info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); 5856 info.state = 0; 5857 if (mddev->in_sync) 5858 info.state = (1<<MD_SB_CLEAN); 5859 if (mddev->bitmap && mddev->bitmap_info.offset) 5860 info.state |= (1<<MD_SB_BITMAP_PRESENT); 5861 if (mddev_is_clustered(mddev)) 5862 info.state |= (1<<MD_SB_CLUSTERED); 5863 info.active_disks = insync; 5864 info.working_disks = working; 5865 info.failed_disks = failed; 5866 info.spare_disks = spare; 5867 5868 info.layout = mddev->layout; 5869 info.chunk_size = mddev->chunk_sectors << 9; 5870 5871 if (copy_to_user(arg, &info, 
sizeof(info))) 5872 return -EFAULT; 5873 5874 return 0; 5875 } 5876 5877 static int get_bitmap_file(struct mddev *mddev, void __user * arg) 5878 { 5879 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 5880 char *ptr; 5881 int err; 5882 5883 file = kzalloc(sizeof(*file), GFP_NOIO); 5884 if (!file) 5885 return -ENOMEM; 5886 5887 err = 0; 5888 spin_lock(&mddev->lock); 5889 /* bitmap enabled */ 5890 if (mddev->bitmap_info.file) { 5891 ptr = file_path(mddev->bitmap_info.file, file->pathname, 5892 sizeof(file->pathname)); 5893 if (IS_ERR(ptr)) 5894 err = PTR_ERR(ptr); 5895 else 5896 memmove(file->pathname, ptr, 5897 sizeof(file->pathname)-(ptr-file->pathname)); 5898 } 5899 spin_unlock(&mddev->lock); 5900 5901 if (err == 0 && 5902 copy_to_user(arg, file, sizeof(*file))) 5903 err = -EFAULT; 5904 5905 kfree(file); 5906 return err; 5907 } 5908 5909 static int get_disk_info(struct mddev *mddev, void __user * arg) 5910 { 5911 mdu_disk_info_t info; 5912 struct md_rdev *rdev; 5913 5914 if (copy_from_user(&info, arg, sizeof(info))) 5915 return -EFAULT; 5916 5917 rcu_read_lock(); 5918 rdev = md_find_rdev_nr_rcu(mddev, info.number); 5919 if (rdev) { 5920 info.major = MAJOR(rdev->bdev->bd_dev); 5921 info.minor = MINOR(rdev->bdev->bd_dev); 5922 info.raid_disk = rdev->raid_disk; 5923 info.state = 0; 5924 if (test_bit(Faulty, &rdev->flags)) 5925 info.state |= (1<<MD_DISK_FAULTY); 5926 else if (test_bit(In_sync, &rdev->flags)) { 5927 info.state |= (1<<MD_DISK_ACTIVE); 5928 info.state |= (1<<MD_DISK_SYNC); 5929 } 5930 if (test_bit(Journal, &rdev->flags)) 5931 info.state |= (1<<MD_DISK_JOURNAL); 5932 if (test_bit(WriteMostly, &rdev->flags)) 5933 info.state |= (1<<MD_DISK_WRITEMOSTLY); 5934 } else { 5935 info.major = info.minor = 0; 5936 info.raid_disk = -1; 5937 info.state = (1<<MD_DISK_REMOVED); 5938 } 5939 rcu_read_unlock(); 5940 5941 if (copy_to_user(arg, &info, sizeof(info))) 5942 return -EFAULT; 5943 5944 return 0; 5945 } 5946 5947 static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) 5948 { 5949 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 5950 struct md_rdev *rdev; 5951 dev_t dev = MKDEV(info->major,info->minor); 5952 5953 if (mddev_is_clustered(mddev) && 5954 !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { 5955 pr_err("%s: Cannot add to clustered mddev.\n", 5956 mdname(mddev)); 5957 return -EINVAL; 5958 } 5959 5960 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 5961 return -EOVERFLOW; 5962 5963 if (!mddev->raid_disks) { 5964 int err; 5965 /* expecting a device which has a superblock */ 5966 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 5967 if (IS_ERR(rdev)) { 5968 printk(KERN_WARNING 5969 "md: md_import_device returned %ld\n", 5970 PTR_ERR(rdev)); 5971 return PTR_ERR(rdev); 5972 } 5973 if (!list_empty(&mddev->disks)) { 5974 struct md_rdev *rdev0 5975 = list_entry(mddev->disks.next, 5976 struct md_rdev, same_set); 5977 err = super_types[mddev->major_version] 5978 .load_super(rdev, rdev0, mddev->minor_version); 5979 if (err < 0) { 5980 printk(KERN_WARNING 5981 "md: %s has different UUID to %s\n", 5982 bdevname(rdev->bdev,b), 5983 bdevname(rdev0->bdev,b2)); 5984 export_rdev(rdev); 5985 return -EINVAL; 5986 } 5987 } 5988 err = bind_rdev_to_array(rdev, mddev); 5989 if (err) 5990 export_rdev(rdev); 5991 return err; 5992 } 5993 5994 /* 5995 * add_new_disk can be used once the array is assembled 5996 * to add "hot spares". 
They must already have a superblock 5997 * written 5998 */ 5999 if (mddev->pers) { 6000 int err; 6001 if (!mddev->pers->hot_add_disk) { 6002 printk(KERN_WARNING 6003 "%s: personality does not support diskops!\n", 6004 mdname(mddev)); 6005 return -EINVAL; 6006 } 6007 if (mddev->persistent) 6008 rdev = md_import_device(dev, mddev->major_version, 6009 mddev->minor_version); 6010 else 6011 rdev = md_import_device(dev, -1, -1); 6012 if (IS_ERR(rdev)) { 6013 printk(KERN_WARNING 6014 "md: md_import_device returned %ld\n", 6015 PTR_ERR(rdev)); 6016 return PTR_ERR(rdev); 6017 } 6018 /* set saved_raid_disk if appropriate */ 6019 if (!mddev->persistent) { 6020 if (info->state & (1<<MD_DISK_SYNC) && 6021 info->raid_disk < mddev->raid_disks) { 6022 rdev->raid_disk = info->raid_disk; 6023 set_bit(In_sync, &rdev->flags); 6024 clear_bit(Bitmap_sync, &rdev->flags); 6025 } else 6026 rdev->raid_disk = -1; 6027 rdev->saved_raid_disk = rdev->raid_disk; 6028 } else 6029 super_types[mddev->major_version]. 6030 validate_super(mddev, rdev); 6031 if ((info->state & (1<<MD_DISK_SYNC)) && 6032 rdev->raid_disk != info->raid_disk) { 6033 /* This was a hot-add request, but events doesn't 6034 * match, so reject it. 6035 */ 6036 export_rdev(rdev); 6037 return -EINVAL; 6038 } 6039 6040 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 6041 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 6042 set_bit(WriteMostly, &rdev->flags); 6043 else 6044 clear_bit(WriteMostly, &rdev->flags); 6045 6046 if (info->state & (1<<MD_DISK_JOURNAL)) { 6047 struct md_rdev *rdev2; 6048 bool has_journal = false; 6049 6050 /* make sure no existing journal disk */ 6051 rdev_for_each(rdev2, mddev) { 6052 if (test_bit(Journal, &rdev2->flags)) { 6053 has_journal = true; 6054 break; 6055 } 6056 } 6057 if (has_journal) { 6058 export_rdev(rdev); 6059 return -EBUSY; 6060 } 6061 set_bit(Journal, &rdev->flags); 6062 } 6063 /* 6064 * check whether the device shows up in other nodes 6065 */ 6066 if (mddev_is_clustered(mddev)) { 6067 if (info->state & (1 << MD_DISK_CANDIDATE)) 6068 set_bit(Candidate, &rdev->flags); 6069 else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { 6070 /* --add initiated by this node */ 6071 err = md_cluster_ops->add_new_disk(mddev, rdev); 6072 if (err) { 6073 export_rdev(rdev); 6074 return err; 6075 } 6076 } 6077 } 6078 6079 rdev->raid_disk = -1; 6080 err = bind_rdev_to_array(rdev, mddev); 6081 6082 if (err) 6083 export_rdev(rdev); 6084 6085 if (mddev_is_clustered(mddev)) { 6086 if (info->state & (1 << MD_DISK_CANDIDATE)) 6087 md_cluster_ops->new_disk_ack(mddev, (err == 0)); 6088 else { 6089 if (err) 6090 md_cluster_ops->add_new_disk_cancel(mddev); 6091 else 6092 err = add_bound_rdev(rdev); 6093 } 6094 6095 } else if (!err) 6096 err = add_bound_rdev(rdev); 6097 6098 return err; 6099 } 6100 6101 /* otherwise, add_new_disk is only allowed 6102 * for major_version==0 superblocks 6103 */ 6104 if (mddev->major_version != 0) { 6105 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", 6106 mdname(mddev)); 6107 return -EINVAL; 6108 } 6109 6110 if (!(info->state & (1<<MD_DISK_FAULTY))) { 6111 int err; 6112 rdev = md_import_device(dev, -1, 0); 6113 if (IS_ERR(rdev)) { 6114 printk(KERN_WARNING 6115 "md: error, md_import_device() returned %ld\n", 6116 PTR_ERR(rdev)); 6117 return PTR_ERR(rdev); 6118 } 6119 rdev->desc_nr = info->number; 6120 if (info->raid_disk < mddev->raid_disks) 6121 rdev->raid_disk = info->raid_disk; 6122 else 6123 rdev->raid_disk = -1; 6124 6125 if (rdev->raid_disk < mddev->raid_disks) 6126 if (info->state & 
(1<<MD_DISK_SYNC)) 6127 set_bit(In_sync, &rdev->flags); 6128 6129 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 6130 set_bit(WriteMostly, &rdev->flags); 6131 6132 if (!mddev->persistent) { 6133 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 6134 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; 6135 } else 6136 rdev->sb_start = calc_dev_sboffset(rdev); 6137 rdev->sectors = rdev->sb_start; 6138 6139 err = bind_rdev_to_array(rdev, mddev); 6140 if (err) { 6141 export_rdev(rdev); 6142 return err; 6143 } 6144 } 6145 6146 return 0; 6147 } 6148 6149 static int hot_remove_disk(struct mddev *mddev, dev_t dev) 6150 { 6151 char b[BDEVNAME_SIZE]; 6152 struct md_rdev *rdev; 6153 6154 rdev = find_rdev(mddev, dev); 6155 if (!rdev) 6156 return -ENXIO; 6157 6158 if (rdev->raid_disk < 0) 6159 goto kick_rdev; 6160 6161 clear_bit(Blocked, &rdev->flags); 6162 remove_and_add_spares(mddev, rdev); 6163 6164 if (rdev->raid_disk >= 0) 6165 goto busy; 6166 6167 kick_rdev: 6168 if (mddev_is_clustered(mddev)) 6169 md_cluster_ops->remove_disk(mddev, rdev); 6170 6171 md_kick_rdev_from_array(rdev); 6172 md_update_sb(mddev, 1); 6173 md_new_event(mddev); 6174 6175 return 0; 6176 busy: 6177 printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", 6178 bdevname(rdev->bdev,b), mdname(mddev)); 6179 return -EBUSY; 6180 } 6181 6182 static int hot_add_disk(struct mddev *mddev, dev_t dev) 6183 { 6184 char b[BDEVNAME_SIZE]; 6185 int err; 6186 struct md_rdev *rdev; 6187 6188 if (!mddev->pers) 6189 return -ENODEV; 6190 6191 if (mddev->major_version != 0) { 6192 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 6193 " version-0 superblocks.\n", 6194 mdname(mddev)); 6195 return -EINVAL; 6196 } 6197 if (!mddev->pers->hot_add_disk) { 6198 printk(KERN_WARNING 6199 "%s: personality does not support diskops!\n", 6200 mdname(mddev)); 6201 return -EINVAL; 6202 } 6203 6204 rdev = md_import_device(dev, -1, 0); 6205 if (IS_ERR(rdev)) { 6206 printk(KERN_WARNING 6207 "md: error, md_import_device() returned %ld\n", 6208 PTR_ERR(rdev)); 6209 return -EINVAL; 6210 } 6211 6212 if (mddev->persistent) 6213 rdev->sb_start = calc_dev_sboffset(rdev); 6214 else 6215 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; 6216 6217 rdev->sectors = rdev->sb_start; 6218 6219 if (test_bit(Faulty, &rdev->flags)) { 6220 printk(KERN_WARNING 6221 "md: can not hot-add faulty %s disk to %s!\n", 6222 bdevname(rdev->bdev,b), mdname(mddev)); 6223 err = -EINVAL; 6224 goto abort_export; 6225 } 6226 6227 clear_bit(In_sync, &rdev->flags); 6228 rdev->desc_nr = -1; 6229 rdev->saved_raid_disk = -1; 6230 err = bind_rdev_to_array(rdev, mddev); 6231 if (err) 6232 goto abort_export; 6233 6234 /* 6235 * The rest should better be atomic, we can have disk failures 6236 * noticed in interrupt contexts ... 6237 */ 6238 6239 rdev->raid_disk = -1; 6240 6241 md_update_sb(mddev, 1); 6242 /* 6243 * Kick recovery, maybe this spare has to be added to the 6244 * array immediately. 6245 */ 6246 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6247 md_wakeup_thread(mddev->thread); 6248 md_new_event(mddev); 6249 return 0; 6250 6251 abort_export: 6252 export_rdev(rdev); 6253 return err; 6254 } 6255 6256 static int set_bitmap_file(struct mddev *mddev, int fd) 6257 { 6258 int err = 0; 6259 6260 if (mddev->pers) { 6261 if (!mddev->pers->quiesce || !mddev->thread) 6262 return -EBUSY; 6263 if (mddev->recovery || mddev->sync_thread) 6264 return -EBUSY; 6265 /* we should be able to change the bitmap.. 
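 *
 * Editor's sketch (added, not in the original; the fd names are made up):
 * userspace drives this through the SET_BITMAP_FILE ioctl, passing an open,
 * writable regular-file descriptor to attach an external bitmap and -1 to
 * drop it again, roughly:
 *
 *	int bfd = open("/var/lib/md0.bitmap", O_RDWR);
 *	ioctl(md_fd, SET_BITMAP_FILE, bfd);	// attach file bitmap
 *	...
 *	ioctl(md_fd, SET_BITMAP_FILE, -1);	// remove it later
 *
 * The file itself is assumed to have been prepared by the caller (e.g. by
 * mdadm); the checks below only verify that it is a regular, writable file
 * that nobody else has open for write.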
*/ 6266 } 6267 6268 if (fd >= 0) { 6269 struct inode *inode; 6270 struct file *f; 6271 6272 if (mddev->bitmap || mddev->bitmap_info.file) 6273 return -EEXIST; /* cannot add when bitmap is present */ 6274 f = fget(fd); 6275 6276 if (f == NULL) { 6277 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 6278 mdname(mddev)); 6279 return -EBADF; 6280 } 6281 6282 inode = f->f_mapping->host; 6283 if (!S_ISREG(inode->i_mode)) { 6284 printk(KERN_ERR "%s: error: bitmap file must be a regular file\n", 6285 mdname(mddev)); 6286 err = -EBADF; 6287 } else if (!(f->f_mode & FMODE_WRITE)) { 6288 printk(KERN_ERR "%s: error: bitmap file must open for write\n", 6289 mdname(mddev)); 6290 err = -EBADF; 6291 } else if (atomic_read(&inode->i_writecount) != 1) { 6292 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 6293 mdname(mddev)); 6294 err = -EBUSY; 6295 } 6296 if (err) { 6297 fput(f); 6298 return err; 6299 } 6300 mddev->bitmap_info.file = f; 6301 mddev->bitmap_info.offset = 0; /* file overrides offset */ 6302 } else if (mddev->bitmap == NULL) 6303 return -ENOENT; /* cannot remove what isn't there */ 6304 err = 0; 6305 if (mddev->pers) { 6306 mddev->pers->quiesce(mddev, 1); 6307 if (fd >= 0) { 6308 struct bitmap *bitmap; 6309 6310 bitmap = bitmap_create(mddev, -1); 6311 if (!IS_ERR(bitmap)) { 6312 mddev->bitmap = bitmap; 6313 err = bitmap_load(mddev); 6314 } else 6315 err = PTR_ERR(bitmap); 6316 } 6317 if (fd < 0 || err) { 6318 bitmap_destroy(mddev); 6319 fd = -1; /* make sure to put the file */ 6320 } 6321 mddev->pers->quiesce(mddev, 0); 6322 } 6323 if (fd < 0) { 6324 struct file *f = mddev->bitmap_info.file; 6325 if (f) { 6326 spin_lock(&mddev->lock); 6327 mddev->bitmap_info.file = NULL; 6328 spin_unlock(&mddev->lock); 6329 fput(f); 6330 } 6331 } 6332 6333 return err; 6334 } 6335 6336 /* 6337 * set_array_info is used two different ways 6338 * The original usage is when creating a new array. 6339 * In this usage, raid_disks is > 0 and it together with 6340 * level, size, not_persistent,layout,chunksize determine the 6341 * shape of the array. 6342 * This will always create an array with a type-0.90.0 superblock. 6343 * The newer usage is when assembling an array. 6344 * In this case raid_disks will be 0, and the major_version field is 6345 * use to determine which style super-blocks are to be found on the devices. 6346 * The minor and patch _version numbers are also kept incase the 6347 * super_block handler wishes to interpret them. 6348 */ 6349 static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) 6350 { 6351 6352 if (info->raid_disks == 0) { 6353 /* just setting version number for superblock loading */ 6354 if (info->major_version < 0 || 6355 info->major_version >= ARRAY_SIZE(super_types) || 6356 super_types[info->major_version].name == NULL) { 6357 /* maybe try to auto-load a module? */ 6358 printk(KERN_INFO 6359 "md: superblock version %d not known\n", 6360 info->major_version); 6361 return -EINVAL; 6362 } 6363 mddev->major_version = info->major_version; 6364 mddev->minor_version = info->minor_version; 6365 mddev->patch_version = info->patch_version; 6366 mddev->persistent = !info->not_persistent; 6367 /* ensure mddev_put doesn't delete this now that there 6368 * is some minimal configuration. 
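 *
 * Editor's sketch (added; values are illustrative, not from the original):
 * for the assembly usage described in the comment above this function,
 * userspace typically fills in only the version fields, e.g.
 *
 *	mdu_array_info_t info = { 0 };
 *	info.major_version = 1;		// look for v1.x superblocks
 *	info.minor_version = 2;		// v1.2 layout
 *	ioctl(md_fd, SET_ARRAY_INFO, &info);
 *	// then ADD_NEW_DISK for each member, finally RUN_ARRAY
 *
 * and everything else about the array is read back from the superblocks.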
6369	 */
6370	mddev->ctime = ktime_get_real_seconds();
6371	return 0;
6372	}
6373	mddev->major_version = MD_MAJOR_VERSION;
6374	mddev->minor_version = MD_MINOR_VERSION;
6375	mddev->patch_version = MD_PATCHLEVEL_VERSION;
6376	mddev->ctime = ktime_get_real_seconds();
6377
6378	mddev->level = info->level;
6379	mddev->clevel[0] = 0;
6380	mddev->dev_sectors = 2 * (sector_t)info->size;
6381	mddev->raid_disks = info->raid_disks;
6382	/* don't set md_minor, it is determined by which /dev/md* was
6383	 * opened
6384	 */
6385	if (info->state & (1<<MD_SB_CLEAN))
6386		mddev->recovery_cp = MaxSector;
6387	else
6388		mddev->recovery_cp = 0;
6389	mddev->persistent = ! info->not_persistent;
6390	mddev->external = 0;
6391
6392	mddev->layout = info->layout;
6393	mddev->chunk_sectors = info->chunk_size >> 9;
6394
6395	mddev->max_disks = MD_SB_DISKS;
6396
6397	if (mddev->persistent)
6398		mddev->flags = 0;
6399	set_bit(MD_CHANGE_DEVS, &mddev->flags);
6400
6401	mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
6402	mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
6403	mddev->bitmap_info.offset = 0;
6404
6405	mddev->reshape_position = MaxSector;
6406
6407	/*
6408	 * Generate a 128 bit UUID
6409	 */
6410	get_random_bytes(mddev->uuid, 16);
6411
6412	mddev->new_level = mddev->level;
6413	mddev->new_chunk_sectors = mddev->chunk_sectors;
6414	mddev->new_layout = mddev->layout;
6415	mddev->delta_disks = 0;
6416	mddev->reshape_backwards = 0;
6417
6418	return 0;
6419 }
6420
6421 void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
6422 {
6423	WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
6424
6425	if (mddev->external_size)
6426		return;
6427
6428	mddev->array_sectors = array_sectors;
6429 }
6430 EXPORT_SYMBOL(md_set_array_sectors);
6431
6432 static int update_size(struct mddev *mddev, sector_t num_sectors)
6433 {
6434	struct md_rdev *rdev;
6435	int rv;
6436	int fit = (num_sectors == 0);
6437
6438	if (mddev->pers->resize == NULL)
6439		return -EINVAL;
6440	/* The "num_sectors" is the number of sectors of each device that
6441	 * is used. This can only make sense for arrays with redundancy.
6442	 * linear and raid0 always use whatever space is available. We can only
6443	 * consider changing this number if no resync or reconstruction is
6444	 * happening, and if the new size is acceptable. It must fit before the
6445	 * sb_start or, if that is <data_offset, it must fit before the size
6446	 * of each device. If num_sectors is zero, we find the largest size
6447	 * that fits.
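	 *
	 * Editor's worked example (added): with member devices offering
	 * 1000000, 1200000 and 1100000 usable sectors, a call with
	 * num_sectors == 0 settles on 1000000, because the loop below keeps
	 * lowering num_sectors to each smaller 'avail' it meets; an explicit
	 * request above 1000000 instead fails with -ENOSPC at the smallest
	 * device.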
6448 */ 6449 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 6450 mddev->sync_thread) 6451 return -EBUSY; 6452 if (mddev->ro) 6453 return -EROFS; 6454 6455 rdev_for_each(rdev, mddev) { 6456 sector_t avail = rdev->sectors; 6457 6458 if (fit && (num_sectors == 0 || num_sectors > avail)) 6459 num_sectors = avail; 6460 if (avail < num_sectors) 6461 return -ENOSPC; 6462 } 6463 rv = mddev->pers->resize(mddev, num_sectors); 6464 if (!rv) 6465 revalidate_disk(mddev->gendisk); 6466 return rv; 6467 } 6468 6469 static int update_raid_disks(struct mddev *mddev, int raid_disks) 6470 { 6471 int rv; 6472 struct md_rdev *rdev; 6473 /* change the number of raid disks */ 6474 if (mddev->pers->check_reshape == NULL) 6475 return -EINVAL; 6476 if (mddev->ro) 6477 return -EROFS; 6478 if (raid_disks <= 0 || 6479 (mddev->max_disks && raid_disks >= mddev->max_disks)) 6480 return -EINVAL; 6481 if (mddev->sync_thread || 6482 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 6483 mddev->reshape_position != MaxSector) 6484 return -EBUSY; 6485 6486 rdev_for_each(rdev, mddev) { 6487 if (mddev->raid_disks < raid_disks && 6488 rdev->data_offset < rdev->new_data_offset) 6489 return -EINVAL; 6490 if (mddev->raid_disks > raid_disks && 6491 rdev->data_offset > rdev->new_data_offset) 6492 return -EINVAL; 6493 } 6494 6495 mddev->delta_disks = raid_disks - mddev->raid_disks; 6496 if (mddev->delta_disks < 0) 6497 mddev->reshape_backwards = 1; 6498 else if (mddev->delta_disks > 0) 6499 mddev->reshape_backwards = 0; 6500 6501 rv = mddev->pers->check_reshape(mddev); 6502 if (rv < 0) { 6503 mddev->delta_disks = 0; 6504 mddev->reshape_backwards = 0; 6505 } 6506 return rv; 6507 } 6508 6509 /* 6510 * update_array_info is used to change the configuration of an 6511 * on-line array. 6512 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 6513 * fields in the info are checked against the array. 6514 * Any differences that cannot be handled will cause an error. 6515 * Normally, only one change can be managed at a time. 6516 */ 6517 static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) 6518 { 6519 int rv = 0; 6520 int cnt = 0; 6521 int state = 0; 6522 6523 /* calculate expected state,ignoring low bits */ 6524 if (mddev->bitmap && mddev->bitmap_info.offset) 6525 state |= (1 << MD_SB_BITMAP_PRESENT); 6526 6527 if (mddev->major_version != info->major_version || 6528 mddev->minor_version != info->minor_version || 6529 /* mddev->patch_version != info->patch_version || */ 6530 mddev->ctime != info->ctime || 6531 mddev->level != info->level || 6532 /* mddev->layout != info->layout || */ 6533 mddev->persistent != !info->not_persistent || 6534 mddev->chunk_sectors != info->chunk_size >> 9 || 6535 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 6536 ((state^info->state) & 0xfffffe00) 6537 ) 6538 return -EINVAL; 6539 /* Check there is only one change */ 6540 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 6541 cnt++; 6542 if (mddev->raid_disks != info->raid_disks) 6543 cnt++; 6544 if (mddev->layout != info->layout) 6545 cnt++; 6546 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) 6547 cnt++; 6548 if (cnt == 0) 6549 return 0; 6550 if (cnt > 1) 6551 return -EINVAL; 6552 6553 if (mddev->layout != info->layout) { 6554 /* Change layout 6555 * we don't need to do anything at the md level, the 6556 * personality will take care of it all. 
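 *
 * Editor's note (added): because of the 'cnt' check above, a tool that
 * wants to change both the layout and, say, raid_disks has to issue two
 * separate SET_ARRAY_INFO calls, each one (roughly) a copy of the current
 * GET_ARRAY_INFO data with just the one field altered.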
6557 */ 6558 if (mddev->pers->check_reshape == NULL) 6559 return -EINVAL; 6560 else { 6561 mddev->new_layout = info->layout; 6562 rv = mddev->pers->check_reshape(mddev); 6563 if (rv) 6564 mddev->new_layout = mddev->layout; 6565 return rv; 6566 } 6567 } 6568 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 6569 rv = update_size(mddev, (sector_t)info->size * 2); 6570 6571 if (mddev->raid_disks != info->raid_disks) 6572 rv = update_raid_disks(mddev, info->raid_disks); 6573 6574 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 6575 if (mddev->pers->quiesce == NULL || mddev->thread == NULL) { 6576 rv = -EINVAL; 6577 goto err; 6578 } 6579 if (mddev->recovery || mddev->sync_thread) { 6580 rv = -EBUSY; 6581 goto err; 6582 } 6583 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 6584 struct bitmap *bitmap; 6585 /* add the bitmap */ 6586 if (mddev->bitmap) { 6587 rv = -EEXIST; 6588 goto err; 6589 } 6590 if (mddev->bitmap_info.default_offset == 0) { 6591 rv = -EINVAL; 6592 goto err; 6593 } 6594 mddev->bitmap_info.offset = 6595 mddev->bitmap_info.default_offset; 6596 mddev->bitmap_info.space = 6597 mddev->bitmap_info.default_space; 6598 mddev->pers->quiesce(mddev, 1); 6599 bitmap = bitmap_create(mddev, -1); 6600 if (!IS_ERR(bitmap)) { 6601 mddev->bitmap = bitmap; 6602 rv = bitmap_load(mddev); 6603 } else 6604 rv = PTR_ERR(bitmap); 6605 if (rv) 6606 bitmap_destroy(mddev); 6607 mddev->pers->quiesce(mddev, 0); 6608 } else { 6609 /* remove the bitmap */ 6610 if (!mddev->bitmap) { 6611 rv = -ENOENT; 6612 goto err; 6613 } 6614 if (mddev->bitmap->storage.file) { 6615 rv = -EINVAL; 6616 goto err; 6617 } 6618 if (mddev->bitmap_info.nodes) { 6619 /* hold PW on all the bitmap lock */ 6620 if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { 6621 printk("md: can't change bitmap to none since the" 6622 " array is in use by more than one node\n"); 6623 rv = -EPERM; 6624 md_cluster_ops->unlock_all_bitmaps(mddev); 6625 goto err; 6626 } 6627 6628 mddev->bitmap_info.nodes = 0; 6629 md_cluster_ops->leave(mddev); 6630 } 6631 mddev->pers->quiesce(mddev, 1); 6632 bitmap_destroy(mddev); 6633 mddev->pers->quiesce(mddev, 0); 6634 mddev->bitmap_info.offset = 0; 6635 } 6636 } 6637 md_update_sb(mddev, 1); 6638 return rv; 6639 err: 6640 return rv; 6641 } 6642 6643 static int set_disk_faulty(struct mddev *mddev, dev_t dev) 6644 { 6645 struct md_rdev *rdev; 6646 int err = 0; 6647 6648 if (mddev->pers == NULL) 6649 return -ENODEV; 6650 6651 rcu_read_lock(); 6652 rdev = find_rdev_rcu(mddev, dev); 6653 if (!rdev) 6654 err = -ENODEV; 6655 else { 6656 md_error(mddev, rdev); 6657 if (!test_bit(Faulty, &rdev->flags)) 6658 err = -EBUSY; 6659 } 6660 rcu_read_unlock(); 6661 return err; 6662 } 6663 6664 /* 6665 * We have a problem here : there is no easy way to give a CHS 6666 * virtual geometry. We currently pretend that we have a 2 heads 6667 * 4 sectors (with a BIG number of cylinders...). This drives 6668 * dosfs just mad... 
;-) 6669 */ 6670 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 6671 { 6672 struct mddev *mddev = bdev->bd_disk->private_data; 6673 6674 geo->heads = 2; 6675 geo->sectors = 4; 6676 geo->cylinders = mddev->array_sectors / 8; 6677 return 0; 6678 } 6679 6680 static inline bool md_ioctl_valid(unsigned int cmd) 6681 { 6682 switch (cmd) { 6683 case ADD_NEW_DISK: 6684 case BLKROSET: 6685 case GET_ARRAY_INFO: 6686 case GET_BITMAP_FILE: 6687 case GET_DISK_INFO: 6688 case HOT_ADD_DISK: 6689 case HOT_REMOVE_DISK: 6690 case RAID_AUTORUN: 6691 case RAID_VERSION: 6692 case RESTART_ARRAY_RW: 6693 case RUN_ARRAY: 6694 case SET_ARRAY_INFO: 6695 case SET_BITMAP_FILE: 6696 case SET_DISK_FAULTY: 6697 case STOP_ARRAY: 6698 case STOP_ARRAY_RO: 6699 case CLUSTERED_DISK_NACK: 6700 return true; 6701 default: 6702 return false; 6703 } 6704 } 6705 6706 static int md_ioctl(struct block_device *bdev, fmode_t mode, 6707 unsigned int cmd, unsigned long arg) 6708 { 6709 int err = 0; 6710 void __user *argp = (void __user *)arg; 6711 struct mddev *mddev = NULL; 6712 int ro; 6713 6714 if (!md_ioctl_valid(cmd)) 6715 return -ENOTTY; 6716 6717 switch (cmd) { 6718 case RAID_VERSION: 6719 case GET_ARRAY_INFO: 6720 case GET_DISK_INFO: 6721 break; 6722 default: 6723 if (!capable(CAP_SYS_ADMIN)) 6724 return -EACCES; 6725 } 6726 6727 /* 6728 * Commands dealing with the RAID driver but not any 6729 * particular array: 6730 */ 6731 switch (cmd) { 6732 case RAID_VERSION: 6733 err = get_version(argp); 6734 goto out; 6735 6736 #ifndef MODULE 6737 case RAID_AUTORUN: 6738 err = 0; 6739 autostart_arrays(arg); 6740 goto out; 6741 #endif 6742 default:; 6743 } 6744 6745 /* 6746 * Commands creating/starting a new array: 6747 */ 6748 6749 mddev = bdev->bd_disk->private_data; 6750 6751 if (!mddev) { 6752 BUG(); 6753 goto out; 6754 } 6755 6756 /* Some actions do not requires the mutex */ 6757 switch (cmd) { 6758 case GET_ARRAY_INFO: 6759 if (!mddev->raid_disks && !mddev->external) 6760 err = -ENODEV; 6761 else 6762 err = get_array_info(mddev, argp); 6763 goto out; 6764 6765 case GET_DISK_INFO: 6766 if (!mddev->raid_disks && !mddev->external) 6767 err = -ENODEV; 6768 else 6769 err = get_disk_info(mddev, argp); 6770 goto out; 6771 6772 case SET_DISK_FAULTY: 6773 err = set_disk_faulty(mddev, new_decode_dev(arg)); 6774 goto out; 6775 6776 case GET_BITMAP_FILE: 6777 err = get_bitmap_file(mddev, argp); 6778 goto out; 6779 6780 } 6781 6782 if (cmd == ADD_NEW_DISK) 6783 /* need to ensure md_delayed_delete() has completed */ 6784 flush_workqueue(md_misc_wq); 6785 6786 if (cmd == HOT_REMOVE_DISK) 6787 /* need to ensure recovery thread has run */ 6788 wait_event_interruptible_timeout(mddev->sb_wait, 6789 !test_bit(MD_RECOVERY_NEEDED, 6790 &mddev->flags), 6791 msecs_to_jiffies(5000)); 6792 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { 6793 /* Need to flush page cache, and ensure no-one else opens 6794 * and writes 6795 */ 6796 mutex_lock(&mddev->open_mutex); 6797 if (mddev->pers && atomic_read(&mddev->openers) > 1) { 6798 mutex_unlock(&mddev->open_mutex); 6799 err = -EBUSY; 6800 goto out; 6801 } 6802 set_bit(MD_STILL_CLOSED, &mddev->flags); 6803 mutex_unlock(&mddev->open_mutex); 6804 sync_blockdev(bdev); 6805 } 6806 err = mddev_lock(mddev); 6807 if (err) { 6808 printk(KERN_INFO 6809 "md: ioctl lock interrupted, reason %d, cmd %d\n", 6810 err, cmd); 6811 goto out; 6812 } 6813 6814 if (cmd == SET_ARRAY_INFO) { 6815 mdu_array_info_t info; 6816 if (!arg) 6817 memset(&info, 0, sizeof(info)); 6818 else if (copy_from_user(&info, argp, 
sizeof(info))) { 6819 err = -EFAULT; 6820 goto unlock; 6821 } 6822 if (mddev->pers) { 6823 err = update_array_info(mddev, &info); 6824 if (err) { 6825 printk(KERN_WARNING "md: couldn't update" 6826 " array info. %d\n", err); 6827 goto unlock; 6828 } 6829 goto unlock; 6830 } 6831 if (!list_empty(&mddev->disks)) { 6832 printk(KERN_WARNING 6833 "md: array %s already has disks!\n", 6834 mdname(mddev)); 6835 err = -EBUSY; 6836 goto unlock; 6837 } 6838 if (mddev->raid_disks) { 6839 printk(KERN_WARNING 6840 "md: array %s already initialised!\n", 6841 mdname(mddev)); 6842 err = -EBUSY; 6843 goto unlock; 6844 } 6845 err = set_array_info(mddev, &info); 6846 if (err) { 6847 printk(KERN_WARNING "md: couldn't set" 6848 " array info. %d\n", err); 6849 goto unlock; 6850 } 6851 goto unlock; 6852 } 6853 6854 /* 6855 * Commands querying/configuring an existing array: 6856 */ 6857 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 6858 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 6859 if ((!mddev->raid_disks && !mddev->external) 6860 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 6861 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 6862 && cmd != GET_BITMAP_FILE) { 6863 err = -ENODEV; 6864 goto unlock; 6865 } 6866 6867 /* 6868 * Commands even a read-only array can execute: 6869 */ 6870 switch (cmd) { 6871 case RESTART_ARRAY_RW: 6872 err = restart_array(mddev); 6873 goto unlock; 6874 6875 case STOP_ARRAY: 6876 err = do_md_stop(mddev, 0, bdev); 6877 goto unlock; 6878 6879 case STOP_ARRAY_RO: 6880 err = md_set_readonly(mddev, bdev); 6881 goto unlock; 6882 6883 case HOT_REMOVE_DISK: 6884 err = hot_remove_disk(mddev, new_decode_dev(arg)); 6885 goto unlock; 6886 6887 case ADD_NEW_DISK: 6888 /* We can support ADD_NEW_DISK on read-only arrays 6889 * only if we are re-adding a preexisting device. 6890 * So require mddev->pers and MD_DISK_SYNC. 6891 */ 6892 if (mddev->pers) { 6893 mdu_disk_info_t info; 6894 if (copy_from_user(&info, argp, sizeof(info))) 6895 err = -EFAULT; 6896 else if (!(info.state & (1<<MD_DISK_SYNC))) 6897 /* Need to clear read-only for this */ 6898 break; 6899 else 6900 err = add_new_disk(mddev, &info); 6901 goto unlock; 6902 } 6903 break; 6904 6905 case BLKROSET: 6906 if (get_user(ro, (int __user *)(arg))) { 6907 err = -EFAULT; 6908 goto unlock; 6909 } 6910 err = -EINVAL; 6911 6912 /* if the bdev is going readonly the value of mddev->ro 6913 * does not matter, no writes are coming 6914 */ 6915 if (ro) 6916 goto unlock; 6917 6918 /* are we are already prepared for writes? */ 6919 if (mddev->ro != 1) 6920 goto unlock; 6921 6922 /* transitioning to readauto need only happen for 6923 * arrays that call md_write_start 6924 */ 6925 if (mddev->pers) { 6926 err = restart_array(mddev); 6927 if (err == 0) { 6928 mddev->ro = 2; 6929 set_disk_ro(mddev->gendisk, 0); 6930 } 6931 } 6932 goto unlock; 6933 } 6934 6935 /* 6936 * The remaining ioctls are changing the state of the 6937 * superblock, so we do not allow them on read-only arrays. 6938 */ 6939 if (mddev->ro && mddev->pers) { 6940 if (mddev->ro == 2) { 6941 mddev->ro = 0; 6942 sysfs_notify_dirent_safe(mddev->sysfs_state); 6943 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6944 /* mddev_unlock will wake thread */ 6945 /* If a device failed while we were read-only, we 6946 * need to make sure the metadata is updated now. 
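 *
 * Editor's note (added): ro == 2 is the "auto-read-only" state shown as
 * (auto-read-only) in /proc/mdstat: no write has hit the array yet, so it
 * may silently flip itself back to read-write here when a state-changing
 * ioctl (ADD_NEW_DISK, HOT_ADD_DISK, RUN_ARRAY, ...) arrives.  A genuinely
 * read-only array (ro == 1) instead fails such ioctls with -EROFS below.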
6947 */ 6948 if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) { 6949 mddev_unlock(mddev); 6950 wait_event(mddev->sb_wait, 6951 !test_bit(MD_CHANGE_DEVS, &mddev->flags) && 6952 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 6953 mddev_lock_nointr(mddev); 6954 } 6955 } else { 6956 err = -EROFS; 6957 goto unlock; 6958 } 6959 } 6960 6961 switch (cmd) { 6962 case ADD_NEW_DISK: 6963 { 6964 mdu_disk_info_t info; 6965 if (copy_from_user(&info, argp, sizeof(info))) 6966 err = -EFAULT; 6967 else 6968 err = add_new_disk(mddev, &info); 6969 goto unlock; 6970 } 6971 6972 case CLUSTERED_DISK_NACK: 6973 if (mddev_is_clustered(mddev)) 6974 md_cluster_ops->new_disk_ack(mddev, false); 6975 else 6976 err = -EINVAL; 6977 goto unlock; 6978 6979 case HOT_ADD_DISK: 6980 err = hot_add_disk(mddev, new_decode_dev(arg)); 6981 goto unlock; 6982 6983 case RUN_ARRAY: 6984 err = do_md_run(mddev); 6985 goto unlock; 6986 6987 case SET_BITMAP_FILE: 6988 err = set_bitmap_file(mddev, (int)arg); 6989 goto unlock; 6990 6991 default: 6992 err = -EINVAL; 6993 goto unlock; 6994 } 6995 6996 unlock: 6997 if (mddev->hold_active == UNTIL_IOCTL && 6998 err != -EINVAL) 6999 mddev->hold_active = 0; 7000 mddev_unlock(mddev); 7001 out: 7002 return err; 7003 } 7004 #ifdef CONFIG_COMPAT 7005 static int md_compat_ioctl(struct block_device *bdev, fmode_t mode, 7006 unsigned int cmd, unsigned long arg) 7007 { 7008 switch (cmd) { 7009 case HOT_REMOVE_DISK: 7010 case HOT_ADD_DISK: 7011 case SET_DISK_FAULTY: 7012 case SET_BITMAP_FILE: 7013 /* These take in integer arg, do not convert */ 7014 break; 7015 default: 7016 arg = (unsigned long)compat_ptr(arg); 7017 break; 7018 } 7019 7020 return md_ioctl(bdev, mode, cmd, arg); 7021 } 7022 #endif /* CONFIG_COMPAT */ 7023 7024 static int md_open(struct block_device *bdev, fmode_t mode) 7025 { 7026 /* 7027 * Succeed if we can lock the mddev, which confirms that 7028 * it isn't being stopped right now. 7029 */ 7030 struct mddev *mddev = mddev_find(bdev->bd_dev); 7031 int err; 7032 7033 if (!mddev) 7034 return -ENODEV; 7035 7036 if (mddev->gendisk != bdev->bd_disk) { 7037 /* we are racing with mddev_put which is discarding this 7038 * bd_disk. 
7039 */ 7040 mddev_put(mddev); 7041 /* Wait until bdev->bd_disk is definitely gone */ 7042 flush_workqueue(md_misc_wq); 7043 /* Then retry the open from the top */ 7044 return -ERESTARTSYS; 7045 } 7046 BUG_ON(mddev != bdev->bd_disk->private_data); 7047 7048 if ((err = mutex_lock_interruptible(&mddev->open_mutex))) 7049 goto out; 7050 7051 err = 0; 7052 atomic_inc(&mddev->openers); 7053 clear_bit(MD_STILL_CLOSED, &mddev->flags); 7054 mutex_unlock(&mddev->open_mutex); 7055 7056 check_disk_change(bdev); 7057 out: 7058 return err; 7059 } 7060 7061 static void md_release(struct gendisk *disk, fmode_t mode) 7062 { 7063 struct mddev *mddev = disk->private_data; 7064 7065 BUG_ON(!mddev); 7066 atomic_dec(&mddev->openers); 7067 mddev_put(mddev); 7068 } 7069 7070 static int md_media_changed(struct gendisk *disk) 7071 { 7072 struct mddev *mddev = disk->private_data; 7073 7074 return mddev->changed; 7075 } 7076 7077 static int md_revalidate(struct gendisk *disk) 7078 { 7079 struct mddev *mddev = disk->private_data; 7080 7081 mddev->changed = 0; 7082 return 0; 7083 } 7084 static const struct block_device_operations md_fops = 7085 { 7086 .owner = THIS_MODULE, 7087 .open = md_open, 7088 .release = md_release, 7089 .ioctl = md_ioctl, 7090 #ifdef CONFIG_COMPAT 7091 .compat_ioctl = md_compat_ioctl, 7092 #endif 7093 .getgeo = md_getgeo, 7094 .media_changed = md_media_changed, 7095 .revalidate_disk= md_revalidate, 7096 }; 7097 7098 static int md_thread(void *arg) 7099 { 7100 struct md_thread *thread = arg; 7101 7102 /* 7103 * md_thread is a 'system-thread', it's priority should be very 7104 * high. We avoid resource deadlocks individually in each 7105 * raid personality. (RAID5 does preallocation) We also use RR and 7106 * the very same RT priority as kswapd, thus we will never get 7107 * into a priority inversion deadlock. 7108 * 7109 * we definitely have to have equal or higher priority than 7110 * bdflush, otherwise bdflush will deadlock if there are too 7111 * many dirty RAID5 blocks. 7112 */ 7113 7114 allow_signal(SIGKILL); 7115 while (!kthread_should_stop()) { 7116 7117 /* We need to wait INTERRUPTIBLE so that 7118 * we don't add to the load-average. 
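 * (Editor's note, added: the load average counts runnable tasks plus tasks
 * sleeping in TASK_UNINTERRUPTIBLE, while TASK_INTERRUPTIBLE sleepers are
 * ignored, so an otherwise idle md thread parked uninterruptibly would pin
 * the reported load at 1 or more.)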
7119 * That means we need to be sure no signals are 7120 * pending 7121 */ 7122 if (signal_pending(current)) 7123 flush_signals(current); 7124 7125 wait_event_interruptible_timeout 7126 (thread->wqueue, 7127 test_bit(THREAD_WAKEUP, &thread->flags) 7128 || kthread_should_stop(), 7129 thread->timeout); 7130 7131 clear_bit(THREAD_WAKEUP, &thread->flags); 7132 if (!kthread_should_stop()) 7133 thread->run(thread); 7134 } 7135 7136 return 0; 7137 } 7138 7139 void md_wakeup_thread(struct md_thread *thread) 7140 { 7141 if (thread) { 7142 pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); 7143 set_bit(THREAD_WAKEUP, &thread->flags); 7144 wake_up(&thread->wqueue); 7145 } 7146 } 7147 EXPORT_SYMBOL(md_wakeup_thread); 7148 7149 struct md_thread *md_register_thread(void (*run) (struct md_thread *), 7150 struct mddev *mddev, const char *name) 7151 { 7152 struct md_thread *thread; 7153 7154 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); 7155 if (!thread) 7156 return NULL; 7157 7158 init_waitqueue_head(&thread->wqueue); 7159 7160 thread->run = run; 7161 thread->mddev = mddev; 7162 thread->timeout = MAX_SCHEDULE_TIMEOUT; 7163 thread->tsk = kthread_run(md_thread, thread, 7164 "%s_%s", 7165 mdname(thread->mddev), 7166 name); 7167 if (IS_ERR(thread->tsk)) { 7168 kfree(thread); 7169 return NULL; 7170 } 7171 return thread; 7172 } 7173 EXPORT_SYMBOL(md_register_thread); 7174 7175 void md_unregister_thread(struct md_thread **threadp) 7176 { 7177 struct md_thread *thread = *threadp; 7178 if (!thread) 7179 return; 7180 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 7181 /* Locking ensures that mddev_unlock does not wake_up a 7182 * non-existent thread 7183 */ 7184 spin_lock(&pers_lock); 7185 *threadp = NULL; 7186 spin_unlock(&pers_lock); 7187 7188 kthread_stop(thread->tsk); 7189 kfree(thread); 7190 } 7191 EXPORT_SYMBOL(md_unregister_thread); 7192 7193 void md_error(struct mddev *mddev, struct md_rdev *rdev) 7194 { 7195 if (!rdev || test_bit(Faulty, &rdev->flags)) 7196 return; 7197 7198 if (!mddev->pers || !mddev->pers->error_handler) 7199 return; 7200 mddev->pers->error_handler(mddev,rdev); 7201 if (mddev->degraded) 7202 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 7203 sysfs_notify_dirent_safe(rdev->sysfs_state); 7204 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 7205 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7206 md_wakeup_thread(mddev->thread); 7207 if (mddev->event_work.func) 7208 queue_work(md_misc_wq, &mddev->event_work); 7209 md_new_event(mddev); 7210 } 7211 EXPORT_SYMBOL(md_error); 7212 7213 /* seq_file implementation /proc/mdstat */ 7214 7215 static void status_unused(struct seq_file *seq) 7216 { 7217 int i = 0; 7218 struct md_rdev *rdev; 7219 7220 seq_printf(seq, "unused devices: "); 7221 7222 list_for_each_entry(rdev, &pending_raid_disks, same_set) { 7223 char b[BDEVNAME_SIZE]; 7224 i++; 7225 seq_printf(seq, "%s ", 7226 bdevname(rdev->bdev,b)); 7227 } 7228 if (!i) 7229 seq_printf(seq, "<none>"); 7230 7231 seq_printf(seq, "\n"); 7232 } 7233 7234 static int status_resync(struct seq_file *seq, struct mddev *mddev) 7235 { 7236 sector_t max_sectors, resync, res; 7237 unsigned long dt, db; 7238 sector_t rt; 7239 int scale; 7240 unsigned int per_milli; 7241 7242 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 7243 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 7244 max_sectors = mddev->resync_max_sectors; 7245 else 7246 max_sectors = mddev->dev_sectors; 7247 7248 resync = mddev->curr_resync; 7249 if (resync <= 3) { 7250 if (test_bit(MD_RECOVERY_DONE, 
&mddev->recovery)) 7251 /* Still cleaning up */ 7252 resync = max_sectors; 7253 } else 7254 resync -= atomic_read(&mddev->recovery_active); 7255 7256 if (resync == 0) { 7257 if (mddev->recovery_cp < MaxSector) { 7258 seq_printf(seq, "\tresync=PENDING"); 7259 return 1; 7260 } 7261 return 0; 7262 } 7263 if (resync < 3) { 7264 seq_printf(seq, "\tresync=DELAYED"); 7265 return 1; 7266 } 7267 7268 WARN_ON(max_sectors == 0); 7269 /* Pick 'scale' such that (resync>>scale)*1000 will fit 7270 * in a sector_t, and (max_sectors>>scale) will fit in a 7271 * u32, as those are the requirements for sector_div. 7272 * Thus 'scale' must be at least 10 7273 */ 7274 scale = 10; 7275 if (sizeof(sector_t) > sizeof(unsigned long)) { 7276 while ( max_sectors/2 > (1ULL<<(scale+32))) 7277 scale++; 7278 } 7279 res = (resync>>scale)*1000; 7280 sector_div(res, (u32)((max_sectors>>scale)+1)); 7281 7282 per_milli = res; 7283 { 7284 int i, x = per_milli/50, y = 20-x; 7285 seq_printf(seq, "["); 7286 for (i = 0; i < x; i++) 7287 seq_printf(seq, "="); 7288 seq_printf(seq, ">"); 7289 for (i = 0; i < y; i++) 7290 seq_printf(seq, "."); 7291 seq_printf(seq, "] "); 7292 } 7293 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 7294 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 7295 "reshape" : 7296 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 7297 "check" : 7298 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 7299 "resync" : "recovery"))), 7300 per_milli/10, per_milli % 10, 7301 (unsigned long long) resync/2, 7302 (unsigned long long) max_sectors/2); 7303 7304 /* 7305 * dt: time from mark until now 7306 * db: blocks written from mark until now 7307 * rt: remaining time 7308 * 7309 * rt is a sector_t, so could be 32bit or 64bit. 7310 * So we divide before multiply in case it is 32bit and close 7311 * to the limit. 7312 * We scale the divisor (db) by 32 to avoid losing precision 7313 * near the end of resync when the number of remaining sectors 7314 * is close to 'db'. 7315 * We then divide rt by 32 after multiplying by db to compensate. 7316 * The '+1' avoids division by zero if db is very small. 
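 *
 * Editor's worked example (added): with 1,000,000 sectors still to go and
 * db = 200,000 sectors moved in dt = 100 seconds, the steps below give
 *	rt = 1000000 / (200000/32 + 1) = 159
 *	rt = 159 * 100 >> 5            = 496 seconds
 * which is close to the exact remaining/rate = 500s and is printed as
 * "finish=8.2min"; the speed line shows db/2/dt = 1000 K/sec.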
7317 */ 7318 dt = ((jiffies - mddev->resync_mark) / HZ); 7319 if (!dt) dt++; 7320 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) 7321 - mddev->resync_mark_cnt; 7322 7323 rt = max_sectors - resync; /* number of remaining sectors */ 7324 sector_div(rt, db/32+1); 7325 rt *= dt; 7326 rt >>= 5; 7327 7328 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, 7329 ((unsigned long)rt % 60)/6); 7330 7331 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 7332 return 1; 7333 } 7334 7335 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 7336 { 7337 struct list_head *tmp; 7338 loff_t l = *pos; 7339 struct mddev *mddev; 7340 7341 if (l >= 0x10000) 7342 return NULL; 7343 if (!l--) 7344 /* header */ 7345 return (void*)1; 7346 7347 spin_lock(&all_mddevs_lock); 7348 list_for_each(tmp,&all_mddevs) 7349 if (!l--) { 7350 mddev = list_entry(tmp, struct mddev, all_mddevs); 7351 mddev_get(mddev); 7352 spin_unlock(&all_mddevs_lock); 7353 return mddev; 7354 } 7355 spin_unlock(&all_mddevs_lock); 7356 if (!l--) 7357 return (void*)2;/* tail */ 7358 return NULL; 7359 } 7360 7361 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 7362 { 7363 struct list_head *tmp; 7364 struct mddev *next_mddev, *mddev = v; 7365 7366 ++*pos; 7367 if (v == (void*)2) 7368 return NULL; 7369 7370 spin_lock(&all_mddevs_lock); 7371 if (v == (void*)1) 7372 tmp = all_mddevs.next; 7373 else 7374 tmp = mddev->all_mddevs.next; 7375 if (tmp != &all_mddevs) 7376 next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs)); 7377 else { 7378 next_mddev = (void*)2; 7379 *pos = 0x10000; 7380 } 7381 spin_unlock(&all_mddevs_lock); 7382 7383 if (v != (void*)1) 7384 mddev_put(mddev); 7385 return next_mddev; 7386 7387 } 7388 7389 static void md_seq_stop(struct seq_file *seq, void *v) 7390 { 7391 struct mddev *mddev = v; 7392 7393 if (mddev && v != (void*)1 && v != (void*)2) 7394 mddev_put(mddev); 7395 } 7396 7397 static int md_seq_show(struct seq_file *seq, void *v) 7398 { 7399 struct mddev *mddev = v; 7400 sector_t sectors; 7401 struct md_rdev *rdev; 7402 7403 if (v == (void*)1) { 7404 struct md_personality *pers; 7405 seq_printf(seq, "Personalities : "); 7406 spin_lock(&pers_lock); 7407 list_for_each_entry(pers, &pers_list, list) 7408 seq_printf(seq, "[%s] ", pers->name); 7409 7410 spin_unlock(&pers_lock); 7411 seq_printf(seq, "\n"); 7412 seq->poll_event = atomic_read(&md_event_count); 7413 return 0; 7414 } 7415 if (v == (void*)2) { 7416 status_unused(seq); 7417 return 0; 7418 } 7419 7420 spin_lock(&mddev->lock); 7421 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 7422 seq_printf(seq, "%s : %sactive", mdname(mddev), 7423 mddev->pers ? 
"" : "in"); 7424 if (mddev->pers) { 7425 if (mddev->ro==1) 7426 seq_printf(seq, " (read-only)"); 7427 if (mddev->ro==2) 7428 seq_printf(seq, " (auto-read-only)"); 7429 seq_printf(seq, " %s", mddev->pers->name); 7430 } 7431 7432 sectors = 0; 7433 rcu_read_lock(); 7434 rdev_for_each_rcu(rdev, mddev) { 7435 char b[BDEVNAME_SIZE]; 7436 seq_printf(seq, " %s[%d]", 7437 bdevname(rdev->bdev,b), rdev->desc_nr); 7438 if (test_bit(WriteMostly, &rdev->flags)) 7439 seq_printf(seq, "(W)"); 7440 if (test_bit(Journal, &rdev->flags)) 7441 seq_printf(seq, "(J)"); 7442 if (test_bit(Faulty, &rdev->flags)) { 7443 seq_printf(seq, "(F)"); 7444 continue; 7445 } 7446 if (rdev->raid_disk < 0) 7447 seq_printf(seq, "(S)"); /* spare */ 7448 if (test_bit(Replacement, &rdev->flags)) 7449 seq_printf(seq, "(R)"); 7450 sectors += rdev->sectors; 7451 } 7452 rcu_read_unlock(); 7453 7454 if (!list_empty(&mddev->disks)) { 7455 if (mddev->pers) 7456 seq_printf(seq, "\n %llu blocks", 7457 (unsigned long long) 7458 mddev->array_sectors / 2); 7459 else 7460 seq_printf(seq, "\n %llu blocks", 7461 (unsigned long long)sectors / 2); 7462 } 7463 if (mddev->persistent) { 7464 if (mddev->major_version != 0 || 7465 mddev->minor_version != 90) { 7466 seq_printf(seq," super %d.%d", 7467 mddev->major_version, 7468 mddev->minor_version); 7469 } 7470 } else if (mddev->external) 7471 seq_printf(seq, " super external:%s", 7472 mddev->metadata_type); 7473 else 7474 seq_printf(seq, " super non-persistent"); 7475 7476 if (mddev->pers) { 7477 mddev->pers->status(seq, mddev); 7478 seq_printf(seq, "\n "); 7479 if (mddev->pers->sync_request) { 7480 if (status_resync(seq, mddev)) 7481 seq_printf(seq, "\n "); 7482 } 7483 } else 7484 seq_printf(seq, "\n "); 7485 7486 bitmap_status(seq, mddev->bitmap); 7487 7488 seq_printf(seq, "\n"); 7489 } 7490 spin_unlock(&mddev->lock); 7491 7492 return 0; 7493 } 7494 7495 static const struct seq_operations md_seq_ops = { 7496 .start = md_seq_start, 7497 .next = md_seq_next, 7498 .stop = md_seq_stop, 7499 .show = md_seq_show, 7500 }; 7501 7502 static int md_seq_open(struct inode *inode, struct file *file) 7503 { 7504 struct seq_file *seq; 7505 int error; 7506 7507 error = seq_open(file, &md_seq_ops); 7508 if (error) 7509 return error; 7510 7511 seq = file->private_data; 7512 seq->poll_event = atomic_read(&md_event_count); 7513 return error; 7514 } 7515 7516 static int md_unloading; 7517 static unsigned int mdstat_poll(struct file *filp, poll_table *wait) 7518 { 7519 struct seq_file *seq = filp->private_data; 7520 int mask; 7521 7522 if (md_unloading) 7523 return POLLIN|POLLRDNORM|POLLERR|POLLPRI; 7524 poll_wait(filp, &md_event_waiters, wait); 7525 7526 /* always allow read */ 7527 mask = POLLIN | POLLRDNORM; 7528 7529 if (seq->poll_event != atomic_read(&md_event_count)) 7530 mask |= POLLERR | POLLPRI; 7531 return mask; 7532 } 7533 7534 static const struct file_operations md_seq_fops = { 7535 .owner = THIS_MODULE, 7536 .open = md_seq_open, 7537 .read = seq_read, 7538 .llseek = seq_lseek, 7539 .release = seq_release_private, 7540 .poll = mdstat_poll, 7541 }; 7542 7543 int register_md_personality(struct md_personality *p) 7544 { 7545 printk(KERN_INFO "md: %s personality registered for level %d\n", 7546 p->name, p->level); 7547 spin_lock(&pers_lock); 7548 list_add_tail(&p->list, &pers_list); 7549 spin_unlock(&pers_lock); 7550 return 0; 7551 } 7552 EXPORT_SYMBOL(register_md_personality); 7553 7554 int unregister_md_personality(struct md_personality *p) 7555 { 7556 printk(KERN_INFO "md: %s personality unregistered\n", 
p->name); 7557 spin_lock(&pers_lock); 7558 list_del_init(&p->list); 7559 spin_unlock(&pers_lock); 7560 return 0; 7561 } 7562 EXPORT_SYMBOL(unregister_md_personality); 7563 7564 int register_md_cluster_operations(struct md_cluster_operations *ops, 7565 struct module *module) 7566 { 7567 int ret = 0; 7568 spin_lock(&pers_lock); 7569 if (md_cluster_ops != NULL) 7570 ret = -EALREADY; 7571 else { 7572 md_cluster_ops = ops; 7573 md_cluster_mod = module; 7574 } 7575 spin_unlock(&pers_lock); 7576 return ret; 7577 } 7578 EXPORT_SYMBOL(register_md_cluster_operations); 7579 7580 int unregister_md_cluster_operations(void) 7581 { 7582 spin_lock(&pers_lock); 7583 md_cluster_ops = NULL; 7584 spin_unlock(&pers_lock); 7585 return 0; 7586 } 7587 EXPORT_SYMBOL(unregister_md_cluster_operations); 7588 7589 int md_setup_cluster(struct mddev *mddev, int nodes) 7590 { 7591 int err; 7592 7593 err = request_module("md-cluster"); 7594 if (err) { 7595 pr_err("md-cluster module not found.\n"); 7596 return -ENOENT; 7597 } 7598 7599 spin_lock(&pers_lock); 7600 if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { 7601 spin_unlock(&pers_lock); 7602 return -ENOENT; 7603 } 7604 spin_unlock(&pers_lock); 7605 7606 return md_cluster_ops->join(mddev, nodes); 7607 } 7608 7609 void md_cluster_stop(struct mddev *mddev) 7610 { 7611 if (!md_cluster_ops) 7612 return; 7613 md_cluster_ops->leave(mddev); 7614 module_put(md_cluster_mod); 7615 } 7616 7617 static int is_mddev_idle(struct mddev *mddev, int init) 7618 { 7619 struct md_rdev *rdev; 7620 int idle; 7621 int curr_events; 7622 7623 idle = 1; 7624 rcu_read_lock(); 7625 rdev_for_each_rcu(rdev, mddev) { 7626 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 7627 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + 7628 (int)part_stat_read(&disk->part0, sectors[1]) - 7629 atomic_read(&disk->sync_io); 7630 /* sync IO will cause sync_io to increase before the disk_stats 7631 * as sync_io is counted when a request starts, and 7632 * disk_stats is counted when it completes. 7633 * So resync activity will cause curr_events to be smaller than 7634 * when there was no such activity. 7635 * non-sync IO will cause disk_stat to increase without 7636 * increasing sync_io so curr_events will (eventually) 7637 * be larger than it was before. Once it becomes 7638 * substantially larger, the test below will cause 7639 * the array to appear non-idle, and resync will slow 7640 * down. 7641 * If there is a lot of outstanding resync activity when 7642 * we set last_event to curr_events, then all that activity 7643 * completing might cause the array to appear non-idle 7644 * and resync will be slowed down even though there might 7645 * not have been non-resync activity. This will only 7646 * happen once though. 'last_events' will soon reflect 7647 * the state where there is little or no outstanding 7648 * resync requests, and further resync activity will 7649 * always make curr_events less than last_events. 
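 *
 * Editor's worked example (added): a normal read of 256 sectors from one
 * member bumps that disk's part_stat sectors[] by 256 but leaves sync_io
 * alone, so curr_events moves ~256 ahead of last_events; that is over the
 * threshold of 64 below, last_events is brought up to date and idle drops
 * to 0, which makes md_do_sync throttle the resync.  Resync I/O increments
 * both counters by the same amount (though at slightly different times, as
 * described above), so curr_events does not drift upward and the array
 * still counts as idle.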
7650 * 7651 */ 7652 if (init || curr_events - rdev->last_events > 64) { 7653 rdev->last_events = curr_events; 7654 idle = 0; 7655 } 7656 } 7657 rcu_read_unlock(); 7658 return idle; 7659 } 7660 7661 void md_done_sync(struct mddev *mddev, int blocks, int ok) 7662 { 7663 /* another "blocks" (512byte) blocks have been synced */ 7664 atomic_sub(blocks, &mddev->recovery_active); 7665 wake_up(&mddev->recovery_wait); 7666 if (!ok) { 7667 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 7668 set_bit(MD_RECOVERY_ERROR, &mddev->recovery); 7669 md_wakeup_thread(mddev->thread); 7670 // stop recovery, signal do_sync .... 7671 } 7672 } 7673 EXPORT_SYMBOL(md_done_sync); 7674 7675 /* md_write_start(mddev, bi) 7676 * If we need to update some array metadata (e.g. 'active' flag 7677 * in superblock) before writing, schedule a superblock update 7678 * and wait for it to complete. 7679 */ 7680 void md_write_start(struct mddev *mddev, struct bio *bi) 7681 { 7682 int did_change = 0; 7683 if (bio_data_dir(bi) != WRITE) 7684 return; 7685 7686 BUG_ON(mddev->ro == 1); 7687 if (mddev->ro == 2) { 7688 /* need to switch to read/write */ 7689 mddev->ro = 0; 7690 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7691 md_wakeup_thread(mddev->thread); 7692 md_wakeup_thread(mddev->sync_thread); 7693 did_change = 1; 7694 } 7695 atomic_inc(&mddev->writes_pending); 7696 if (mddev->safemode == 1) 7697 mddev->safemode = 0; 7698 if (mddev->in_sync) { 7699 spin_lock(&mddev->lock); 7700 if (mddev->in_sync) { 7701 mddev->in_sync = 0; 7702 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 7703 set_bit(MD_CHANGE_PENDING, &mddev->flags); 7704 md_wakeup_thread(mddev->thread); 7705 did_change = 1; 7706 } 7707 spin_unlock(&mddev->lock); 7708 } 7709 if (did_change) 7710 sysfs_notify_dirent_safe(mddev->sysfs_state); 7711 wait_event(mddev->sb_wait, 7712 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 7713 } 7714 EXPORT_SYMBOL(md_write_start); 7715 7716 void md_write_end(struct mddev *mddev) 7717 { 7718 if (atomic_dec_and_test(&mddev->writes_pending)) { 7719 if (mddev->safemode == 2) 7720 md_wakeup_thread(mddev->thread); 7721 else if (mddev->safemode_delay) 7722 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); 7723 } 7724 } 7725 EXPORT_SYMBOL(md_write_end); 7726 7727 /* md_allow_write(mddev) 7728 * Calling this ensures that the array is marked 'active' so that writes 7729 * may proceed without blocking. It is important to call this before 7730 * attempting a GFP_KERNEL allocation while holding the mddev lock. 7731 * Must be called with mddev_lock held. 7732 * 7733 * In the ->external case MD_CHANGE_PENDING can not be cleared until mddev->lock 7734 * is dropped, so return -EAGAIN after notifying userspace. 
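 *
 * Editor's sketch (added; a hypothetical caller, not a specific in-tree one):
 *
 *	err = md_allow_write(mddev);
 *	gfp = err ? GFP_NOIO : GFP_KERNEL;	// -EAGAIN: metadata still dirty
 *	buf = kmalloc(len, gfp);
 *
 * i.e. on -EAGAIN the array could not yet be marked active, so the caller
 * avoids an allocation that might recurse into writeback on this array.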
7735  */
7736 int md_allow_write(struct mddev *mddev)
7737 {
7738	if (!mddev->pers)
7739		return 0;
7740	if (mddev->ro)
7741		return 0;
7742	if (!mddev->pers->sync_request)
7743		return 0;
7744
7745	spin_lock(&mddev->lock);
7746	if (mddev->in_sync) {
7747		mddev->in_sync = 0;
7748		set_bit(MD_CHANGE_CLEAN, &mddev->flags);
7749		set_bit(MD_CHANGE_PENDING, &mddev->flags);
7750		if (mddev->safemode_delay &&
7751		    mddev->safemode == 0)
7752			mddev->safemode = 1;
7753		spin_unlock(&mddev->lock);
7754		md_update_sb(mddev, 0);
7755		sysfs_notify_dirent_safe(mddev->sysfs_state);
7756	} else
7757		spin_unlock(&mddev->lock);
7758
7759	if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
7760		return -EAGAIN;
7761	else
7762		return 0;
7763 }
7764 EXPORT_SYMBOL_GPL(md_allow_write);
7765
7766 #define SYNC_MARKS	10
7767 #define SYNC_MARK_STEP	(3*HZ)
7768 #define UPDATE_FREQUENCY (5*60*HZ)
7769 void md_do_sync(struct md_thread *thread)
7770 {
7771	struct mddev *mddev = thread->mddev;
7772	struct mddev *mddev2;
7773	unsigned int currspeed = 0,
7774		 window;
7775	sector_t max_sectors,j, io_sectors, recovery_done;
7776	unsigned long mark[SYNC_MARKS];
7777	unsigned long update_time;
7778	sector_t mark_cnt[SYNC_MARKS];
7779	int last_mark,m;
7780	struct list_head *tmp;
7781	sector_t last_check;
7782	int skipped = 0;
7783	struct md_rdev *rdev;
7784	char *desc, *action = NULL;
7785	struct blk_plug plug;
7786	bool cluster_resync_finished = false;
7787
7788	/* just in case thread restarts... */
7789	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
7790		return;
7791	if (mddev->ro) {/* never try to sync a read-only array */
7792		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7793		return;
7794	}
7795
7796	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
7797		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
7798			desc = "data-check";
7799			action = "check";
7800		} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
7801			desc = "requested-resync";
7802			action = "repair";
7803		} else
7804			desc = "resync";
7805	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
7806		desc = "reshape";
7807	else
7808		desc = "recovery";
7809
7810	mddev->last_sync_action = action ?: desc;
7811
7812	/* we overload curr_resync somewhat here.
7813	 * 0 == not engaged in resync at all
7814	 * 2 == checking that there is no conflict with another sync
7815	 * 1 == like 2, but have yielded to allow conflicting resync to
7816	 *		commence
7817	 * other == active in resync - this many blocks
7818	 *
7819	 * Before starting a resync we must have set curr_resync to
7820	 * 2, and then checked that every "conflicting" array has curr_resync
7821	 * less than ours. When we find one that is the same or higher
7822	 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync
7823	 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
7824	 * This will mean we have to start checking from the beginning again.
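	 *
	 * Editor's worked scenario (added): arrays A and B share a spindle
	 * and both reach this point with curr_resync == 2.  The mddev with
	 * the lower address, say A, yields: it drops curr_resync to 1 and
	 * wakes resync_wait.  B then sees A's value (1) below its own (2)
	 * and proceeds, while A sees B's value at or above its own, logs the
	 * "md: delaying ... until ... has finished" message and sleeps;
	 * once B is done and resync_wait is woken, A restarts from try_again.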
7825 * 7826 */ 7827 7828 do { 7829 mddev->curr_resync = 2; 7830 7831 try_again: 7832 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 7833 goto skip; 7834 for_each_mddev(mddev2, tmp) { 7835 if (mddev2 == mddev) 7836 continue; 7837 if (!mddev->parallel_resync 7838 && mddev2->curr_resync 7839 && match_mddev_units(mddev, mddev2)) { 7840 DEFINE_WAIT(wq); 7841 if (mddev < mddev2 && mddev->curr_resync == 2) { 7842 /* arbitrarily yield */ 7843 mddev->curr_resync = 1; 7844 wake_up(&resync_wait); 7845 } 7846 if (mddev > mddev2 && mddev->curr_resync == 1) 7847 /* no need to wait here, we can wait the next 7848 * time 'round when curr_resync == 2 7849 */ 7850 continue; 7851 /* We need to wait 'interruptible' so as not to 7852 * contribute to the load average, and not to 7853 * be caught by 'softlockup' 7854 */ 7855 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 7856 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 7857 mddev2->curr_resync >= mddev->curr_resync) { 7858 printk(KERN_INFO "md: delaying %s of %s" 7859 " until %s has finished (they" 7860 " share one or more physical units)\n", 7861 desc, mdname(mddev), mdname(mddev2)); 7862 mddev_put(mddev2); 7863 if (signal_pending(current)) 7864 flush_signals(current); 7865 schedule(); 7866 finish_wait(&resync_wait, &wq); 7867 goto try_again; 7868 } 7869 finish_wait(&resync_wait, &wq); 7870 } 7871 } 7872 } while (mddev->curr_resync < 2); 7873 7874 j = 0; 7875 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 7876 /* resync follows the size requested by the personality, 7877 * which defaults to physical size, but can be virtual size 7878 */ 7879 max_sectors = mddev->resync_max_sectors; 7880 atomic64_set(&mddev->resync_mismatches, 0); 7881 /* we don't use the checkpoint if there's a bitmap */ 7882 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 7883 j = mddev->resync_min; 7884 else if (!mddev->bitmap) 7885 j = mddev->recovery_cp; 7886 7887 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 7888 max_sectors = mddev->resync_max_sectors; 7889 else { 7890 /* recovery follows the physical size of devices */ 7891 max_sectors = mddev->dev_sectors; 7892 j = MaxSector; 7893 rcu_read_lock(); 7894 rdev_for_each_rcu(rdev, mddev) 7895 if (rdev->raid_disk >= 0 && 7896 !test_bit(Journal, &rdev->flags) && 7897 !test_bit(Faulty, &rdev->flags) && 7898 !test_bit(In_sync, &rdev->flags) && 7899 rdev->recovery_offset < j) 7900 j = rdev->recovery_offset; 7901 rcu_read_unlock(); 7902 7903 /* If there is a bitmap, we need to make sure all 7904 * writes that started before we added a spare 7905 * complete before we start doing a recovery. 7906 * Otherwise the write might complete and (via 7907 * bitmap_endwrite) set a bit in the bitmap after the 7908 * recovery has checked that bit and skipped that 7909 * region. 
	blk_start_plug(&plug);
	while (j < max_sectors) {
		sector_t sectors;

		skipped = 0;

		if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
		    ((mddev->curr_resync > mddev->curr_resync_completed &&
		      (mddev->curr_resync - mddev->curr_resync_completed)
		      > (max_sectors >> 4)) ||
		     time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
		     (j - mddev->curr_resync_completed)*2
		     >= mddev->resync_max - mddev->curr_resync_completed ||
		     mddev->curr_resync_completed > mddev->resync_max
			    )) {
			/* time to update curr_resync_completed */
			wait_event(mddev->recovery_wait,
				   atomic_read(&mddev->recovery_active) == 0);
			mddev->curr_resync_completed = j;
			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
			    j > mddev->recovery_cp)
				mddev->recovery_cp = j;
			update_time = jiffies;
			set_bit(MD_CHANGE_CLEAN, &mddev->flags);
			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
		}

		while (j >= mddev->resync_max &&
		       !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
			/* As this condition is controlled by user-space,
			 * we can block indefinitely, so use '_interruptible'
			 * to avoid triggering warnings.
			 */
			flush_signals(current); /* just in case */
			wait_event_interruptible(mddev->recovery_wait,
						 mddev->resync_max > j
						 || test_bit(MD_RECOVERY_INTR,
							     &mddev->recovery));
		}

		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			break;

		sectors = mddev->pers->sync_request(mddev, j, &skipped);
		if (sectors == 0) {
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			break;
		}

		if (!skipped) { /* actual IO requested */
			io_sectors += sectors;
			atomic_add(sectors, &mddev->recovery_active);
		}

		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			break;

		j += sectors;
		if (j > max_sectors)
			/* when skipping, extra large numbers can be returned. */
			j = max_sectors;
		if (j > 2)
			mddev->curr_resync = j;
		mddev->curr_mark_cnt = io_sectors;
		if (last_check == 0)
			/* this is the earliest that rebuild will be
			 * visible in /proc/mdstat
			 */
			md_new_event(mddev);

		if (last_check + window > io_sectors || j == max_sectors)
			continue;

		last_check = io_sectors;
	repeat:
		if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP)) {
			/* step marks */
			int next = (last_mark+1) % SYNC_MARKS;

			mddev->resync_mark = mark[next];
			mddev->resync_mark_cnt = mark_cnt[next];
			mark[next] = jiffies;
			mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
			last_mark = next;
		}

		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			break;

		/*
		 * this loop exits only when we are slower than
		 * the 'hard' speed limit, or the system was IO-idle for
		 * a jiffy.
		 * the system might be non-idle CPU-wise, but we only care
		 * about not overloading the IO subsystem. (things like an
		 * e2fsck being done on the RAID array should execute fast)
		 */
		cond_resched();
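
		/*
		 * Estimate the recent rate in KB/sec: sectors completed
		 * since the oldest mark, halved to convert 512-byte
		 * sectors to KB, divided by the seconds elapsed since
		 * that mark (+1 avoids a zero divisor).  For example,
		 * 20480 sectors over ~9 seconds gives 10240 KB / 10,
		 * i.e. about 1024 KB/sec, which is then compared with
		 * speed_min()/speed_max() below.
		 */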
		recovery_done = io_sectors - atomic_read(&mddev->recovery_active);
		currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2
			/((jiffies-mddev->resync_mark)/HZ +1) +1;

		if (currspeed > speed_min(mddev)) {
			if (currspeed > speed_max(mddev)) {
				msleep(500);
				goto repeat;
			}
			if (!is_mddev_idle(mddev, 0)) {
				/*
				 * Give other IO more of a chance.
				 * The faster the devices, the less we wait.
				 */
				wait_event(mddev->recovery_wait,
					   !atomic_read(&mddev->recovery_active));
			}
		}
	}
	printk(KERN_INFO "md: %s: %s %s.\n", mdname(mddev), desc,
	       test_bit(MD_RECOVERY_INTR, &mddev->recovery)
	       ? "interrupted" : "done");
	/*
	 * this also signals 'finished resyncing' to md_stop
	 */
	blk_finish_plug(&plug);
	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));

	if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
	    mddev->curr_resync > 2) {
		mddev->curr_resync_completed = mddev->curr_resync;
		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
	}
	/* tell personality and other nodes that we are finished */
	if (mddev_is_clustered(mddev)) {
		md_cluster_ops->resync_finish(mddev);
		cluster_resync_finished = true;
	}
	mddev->pers->sync_request(mddev, max_sectors, &skipped);

	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
	    mddev->curr_resync > 2) {
		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
				if (mddev->curr_resync >= mddev->recovery_cp) {
					printk(KERN_INFO
					       "md: checkpointing %s of %s.\n",
					       desc, mdname(mddev));
					if (test_bit(MD_RECOVERY_ERROR,
						     &mddev->recovery))
						mddev->recovery_cp =
							mddev->curr_resync_completed;
					else
						mddev->recovery_cp =
							mddev->curr_resync;
				}
			} else
				mddev->recovery_cp = MaxSector;
		} else {
			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
				mddev->curr_resync = MaxSector;
			rcu_read_lock();
			rdev_for_each_rcu(rdev, mddev)
				if (rdev->raid_disk >= 0 &&
				    mddev->delta_disks >= 0 &&
				    !test_bit(Journal, &rdev->flags) &&
				    !test_bit(Faulty, &rdev->flags) &&
				    !test_bit(In_sync, &rdev->flags) &&
				    rdev->recovery_offset < mddev->curr_resync)
					rdev->recovery_offset = mddev->curr_resync;
			rcu_read_unlock();
		}
	}
 skip:
	set_bit(MD_CHANGE_DEVS, &mddev->flags);

	if (mddev_is_clustered(mddev) &&
	    test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
	    !cluster_resync_finished)
		md_cluster_ops->resync_finish(mddev);

	spin_lock(&mddev->lock);
	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
		/* We completed so min/max setting can be forgotten if used. */
		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
			mddev->resync_min = 0;
		mddev->resync_max = MaxSector;
	} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
		mddev->resync_min = mddev->curr_resync_completed;
	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
	mddev->curr_resync = 0;
	spin_unlock(&mddev->lock);

	wake_up(&resync_wait);
	md_wakeup_thread(mddev->thread);
	return;
}
EXPORT_SYMBOL_GPL(md_do_sync);

static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this)
{
	struct md_rdev *rdev;
	int spares = 0;
	int removed = 0;

	rdev_for_each(rdev, mddev)
		if ((this == NULL || rdev == this) &&
		    rdev->raid_disk >= 0 &&
		    !test_bit(Blocked, &rdev->flags) &&
		    (test_bit(Faulty, &rdev->flags) ||
		     (!test_bit(In_sync, &rdev->flags) &&
		      !test_bit(Journal, &rdev->flags))) &&
		    atomic_read(&rdev->nr_pending) == 0) {
			if (mddev->pers->hot_remove_disk(
				    mddev, rdev) == 0) {
				sysfs_unlink_rdev(mddev, rdev);
				rdev->raid_disk = -1;
				removed++;
			}
		}
	if (removed && mddev->kobj.sd)
		sysfs_notify(&mddev->kobj, NULL, "degraded");

	if (this && removed)
		goto no_add;

	rdev_for_each(rdev, mddev) {
		if (this && this != rdev)
			continue;
		if (test_bit(Candidate, &rdev->flags))
			continue;
		if (rdev->raid_disk >= 0 &&
		    !test_bit(In_sync, &rdev->flags) &&
		    !test_bit(Journal, &rdev->flags) &&
		    !test_bit(Faulty, &rdev->flags))
			spares++;
		if (rdev->raid_disk >= 0)
			continue;
		if (test_bit(Faulty, &rdev->flags))
			continue;
		if (!test_bit(Journal, &rdev->flags)) {
			if (mddev->ro &&
			    !(rdev->saved_raid_disk >= 0 &&
			      !test_bit(Bitmap_sync, &rdev->flags)))
				continue;

			rdev->recovery_offset = 0;
		}
		if (mddev->pers->
		    hot_add_disk(mddev, rdev) == 0) {
			if (sysfs_link_rdev(mddev, rdev))
				/* failure here is OK */;
			if (!test_bit(Journal, &rdev->flags))
				spares++;
			md_new_event(mddev);
			set_bit(MD_CHANGE_DEVS, &mddev->flags);
		}
	}
no_add:
	if (removed)
		set_bit(MD_CHANGE_DEVS, &mddev->flags);
	return spares;
}
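
/*
 * md_start_sync() runs from md_misc_wq, queued via mddev->del_work by
 * md_check_recovery().  For clustered arrays it first calls
 * md_cluster_ops->resync_start(); if that fails no sync thread is
 * registered.  Otherwise it starts the "resync" thread that runs
 * md_do_sync().  If the thread cannot be created, the MD_RECOVERY_*
 * bits are cleared so the array is left as it was.
 */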
static void md_start_sync(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, del_work);
	int ret = 0;

	if (mddev_is_clustered(mddev)) {
		ret = md_cluster_ops->resync_start(mddev);
		if (ret) {
			mddev->sync_thread = NULL;
			goto out;
		}
	}

	mddev->sync_thread = md_register_thread(md_do_sync,
						mddev,
						"resync");
out:
	if (!mddev->sync_thread) {
		if (!(mddev_is_clustered(mddev) && ret == -EAGAIN))
			printk(KERN_ERR "%s: could not start resync"
			       " thread...\n",
			       mdname(mddev));
		/* leave the spares where they are, it shouldn't hurt */
		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
		clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
		clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
		clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
		wake_up(&resync_wait);
		if (test_and_clear_bit(MD_RECOVERY_RECOVER,
				       &mddev->recovery))
			if (mddev->sysfs_action)
				sysfs_notify_dirent_safe(mddev->sysfs_action);
	} else
		md_wakeup_thread(mddev->sync_thread);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	md_new_event(mddev);
}

/*
 * This routine is regularly called by all per-raid-array threads to
 * deal with generic issues like resync and super-block update.
 * Raid personalities that don't have a thread (linear/raid0) do not
 * need this as they never do any recovery or update the superblock.
 *
 * It does not do any resync itself, but rather "forks" off other threads
 * to do that as needed.
 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
 * "->recovery" and create a thread at ->sync_thread.
 * When the thread finishes it sets MD_RECOVERY_DONE
 * and wakes up this thread which will reap the thread and finish up.
 * This thread also removes any faulty devices (with nr_pending == 0).
 *
 * The overall approach is:
 *  1/ if the superblock needs updating, update it.
 *  2/ If a recovery thread is running, don't do anything else.
 *  3/ If recovery has finished, clean up, possibly marking spares active.
 *  4/ If there are any faulty devices, remove them.
 *  5/ If array is degraded, try to add spare devices
 *  6/ If array has spares or is not in-sync, start a resync thread.
 */
void md_check_recovery(struct mddev *mddev)
{
	if (mddev->suspended)
		return;

	if (mddev->bitmap)
		bitmap_daemon_work(mddev);

	if (signal_pending(current)) {
		if (mddev->pers->sync_request && !mddev->external) {
			printk(KERN_INFO "md: %s in immediate safe mode\n",
			       mdname(mddev));
			mddev->safemode = 2;
		}
		flush_signals(current);
	}

	if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
		return;
	if (!(
		(mddev->flags & MD_UPDATE_SB_FLAGS & ~(1<<MD_CHANGE_PENDING)) ||
		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
		test_bit(MD_RELOAD_SB, &mddev->flags) ||
		(mddev->external == 0 && mddev->safemode == 1) ||
		(mddev->safemode == 2 && !atomic_read(&mddev->writes_pending)
		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
		))
		return;

	if (mddev_trylock(mddev)) {
		int spares = 0;

		if (mddev->ro) {
			struct md_rdev *rdev;
			if (!mddev->external && mddev->in_sync)
				/* 'Blocked' flag not needed as failed devices
				 * will be recorded if array switched to read/write.
				 * Leaving it set will prevent the device
				 * from being removed.
				 */
				rdev_for_each(rdev, mddev)
					clear_bit(Blocked, &rdev->flags);
			/* On a read-only array we can:
			 * - remove failed devices
			 * - add already-in_sync devices if the array itself
			 *   is in-sync.
			 * As we only add devices that are already in-sync,
			 * we can activate the spares immediately.
			 */
			remove_and_add_spares(mddev, NULL);
			/* There is no thread, but we need to call
			 * ->spare_active and clear saved_raid_disk
			 */
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			md_reap_sync_thread(mddev);
			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			clear_bit(MD_CHANGE_PENDING, &mddev->flags);
			goto unlock;
		}

		if (mddev_is_clustered(mddev)) {
			struct md_rdev *rdev;
			/* kick the device if another node issued a
			 * remove disk.
			 */
			rdev_for_each(rdev, mddev) {
				if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
				    rdev->raid_disk < 0)
					md_kick_rdev_from_array(rdev);
			}

			if (test_and_clear_bit(MD_RELOAD_SB, &mddev->flags))
				md_reload_sb(mddev, mddev->good_device_nr);
		}
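
		/*
		 * With safemode set and no writes pending, mark an array
		 * with internally-managed metadata clean so the next
		 * superblock write records it as in_sync.
		 */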
		if (!mddev->external) {
			int did_change = 0;
			spin_lock(&mddev->lock);
			if (mddev->safemode &&
			    !atomic_read(&mddev->writes_pending) &&
			    !mddev->in_sync &&
			    mddev->recovery_cp == MaxSector) {
				mddev->in_sync = 1;
				did_change = 1;
				set_bit(MD_CHANGE_CLEAN, &mddev->flags);
			}
			if (mddev->safemode == 1)
				mddev->safemode = 0;
			spin_unlock(&mddev->lock);
			if (did_change)
				sysfs_notify_dirent_safe(mddev->sysfs_state);
		}

		if (mddev->flags & MD_UPDATE_SB_FLAGS)
			md_update_sb(mddev, 0);

		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
			/* resync/recovery still happening */
			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			goto unlock;
		}
		if (mddev->sync_thread) {
			md_reap_sync_thread(mddev);
			goto unlock;
		}
		/* Set RUNNING before clearing NEEDED to avoid
		 * any transients in the value of "sync_action".
		 */
		mddev->curr_resync_completed = 0;
		spin_lock(&mddev->lock);
		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
		spin_unlock(&mddev->lock);
		/* Clear some bits that don't mean anything, but
		 * might be left set
		 */
		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);

		if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
		    test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
			goto not_running;
		/* no recovery is running.
		 * remove any failed drives, then
		 * add spares if possible.
		 * Spares are also removed and re-added, to allow
		 * the personality to fail the re-add.
		 */

		if (mddev->reshape_position != MaxSector) {
			if (mddev->pers->check_reshape == NULL ||
			    mddev->pers->check_reshape(mddev) != 0)
				/* Cannot proceed */
				goto not_running;
			set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
		} else if ((spares = remove_and_add_spares(mddev, NULL))) {
			clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
			clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
			clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
		} else if (mddev->recovery_cp < MaxSector) {
			set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
		} else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
			/* nothing to be done ... */
			goto not_running;

		if (mddev->pers->sync_request) {
			if (spares) {
				/* We are adding a device or devices to an array
				 * which has the bitmap stored on all devices.
				 * So make sure all bitmap pages get written
				 */
				bitmap_write_all(mddev->bitmap);
			}
			INIT_WORK(&mddev->del_work, md_start_sync);
			queue_work(md_misc_wq, &mddev->del_work);
			goto unlock;
		}
	not_running:
		if (!mddev->sync_thread) {
			clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
			wake_up(&resync_wait);
			if (test_and_clear_bit(MD_RECOVERY_RECOVER,
					       &mddev->recovery))
				if (mddev->sysfs_action)
					sysfs_notify_dirent_safe(mddev->sysfs_action);
		}
	unlock:
		wake_up(&mddev->sb_wait);
		mddev_unlock(mddev);
	}
}
EXPORT_SYMBOL(md_check_recovery);
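
/*
 * md_reap_sync_thread() - collect the result of a finished (or
 * interrupted) sync thread.  It unregisters ->sync_thread, activates
 * spares after a successful sync, lets the personality finish a
 * reshape, writes the superblock, clears the MD_RECOVERY_* state and
 * re-arms MD_RECOVERY_NEEDED so md_check_recovery() takes another look.
 */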
8437 * So make sure all bitmap pages get written 8438 */ 8439 bitmap_write_all(mddev->bitmap); 8440 } 8441 INIT_WORK(&mddev->del_work, md_start_sync); 8442 queue_work(md_misc_wq, &mddev->del_work); 8443 goto unlock; 8444 } 8445 not_running: 8446 if (!mddev->sync_thread) { 8447 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 8448 wake_up(&resync_wait); 8449 if (test_and_clear_bit(MD_RECOVERY_RECOVER, 8450 &mddev->recovery)) 8451 if (mddev->sysfs_action) 8452 sysfs_notify_dirent_safe(mddev->sysfs_action); 8453 } 8454 unlock: 8455 wake_up(&mddev->sb_wait); 8456 mddev_unlock(mddev); 8457 } 8458 } 8459 EXPORT_SYMBOL(md_check_recovery); 8460 8461 void md_reap_sync_thread(struct mddev *mddev) 8462 { 8463 struct md_rdev *rdev; 8464 8465 /* resync has finished, collect result */ 8466 md_unregister_thread(&mddev->sync_thread); 8467 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 8468 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 8469 /* success...*/ 8470 /* activate any spares */ 8471 if (mddev->pers->spare_active(mddev)) { 8472 sysfs_notify(&mddev->kobj, NULL, 8473 "degraded"); 8474 set_bit(MD_CHANGE_DEVS, &mddev->flags); 8475 } 8476 } 8477 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 8478 mddev->pers->finish_reshape) 8479 mddev->pers->finish_reshape(mddev); 8480 8481 /* If array is no-longer degraded, then any saved_raid_disk 8482 * information must be scrapped. 8483 */ 8484 if (!mddev->degraded) 8485 rdev_for_each(rdev, mddev) 8486 rdev->saved_raid_disk = -1; 8487 8488 md_update_sb(mddev, 1); 8489 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 8490 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 8491 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 8492 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 8493 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 8494 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 8495 wake_up(&resync_wait); 8496 /* flag recovery needed just to double check */ 8497 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8498 sysfs_notify_dirent_safe(mddev->sysfs_action); 8499 md_new_event(mddev); 8500 if (mddev->event_work.func) 8501 queue_work(md_misc_wq, &mddev->event_work); 8502 } 8503 EXPORT_SYMBOL(md_reap_sync_thread); 8504 8505 void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) 8506 { 8507 sysfs_notify_dirent_safe(rdev->sysfs_state); 8508 wait_event_timeout(rdev->blocked_wait, 8509 !test_bit(Blocked, &rdev->flags) && 8510 !test_bit(BlockedBadBlocks, &rdev->flags), 8511 msecs_to_jiffies(5000)); 8512 rdev_dec_pending(rdev, mddev); 8513 } 8514 EXPORT_SYMBOL(md_wait_for_blocked_rdev); 8515 8516 void md_finish_reshape(struct mddev *mddev) 8517 { 8518 /* called be personality module when reshape completes. 
/* Returns 1 on success, 0 on failure */
int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
		       int is_new)
{
	int rv;
	if (is_new)
		s += rdev->new_data_offset;
	else
		s += rdev->data_offset;
	rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
	if (rv == 0) {
		/* Make sure they get written out promptly */
		sysfs_notify_dirent_safe(rdev->sysfs_state);
		set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
		set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags);
		md_wakeup_thread(rdev->mddev->thread);
		return 1;
	} else
		return 0;
}
EXPORT_SYMBOL_GPL(rdev_set_badblocks);

int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
			 int is_new)
{
	if (is_new)
		s += rdev->new_data_offset;
	else
		s += rdev->data_offset;
	return badblocks_clear(&rdev->badblocks,
			       s, sectors);
}
EXPORT_SYMBOL_GPL(rdev_clear_badblocks);

static int md_notify_reboot(struct notifier_block *this,
			    unsigned long code, void *x)
{
	struct list_head *tmp;
	struct mddev *mddev;
	int need_delay = 0;

	for_each_mddev(mddev, tmp) {
		if (mddev_trylock(mddev)) {
			if (mddev->pers)
				__md_stop_writes(mddev);
			if (mddev->persistent)
				mddev->safemode = 2;
			mddev_unlock(mddev);
		}
		need_delay = 1;
	}
	/*
	 * certain more exotic SCSI devices are known to be
	 * volatile wrt too early system reboots. While the
	 * right place to handle this issue is the given
	 * driver, we do want to have a safe RAID driver ...
	 */
	if (need_delay)
		mdelay(1000*1);

	return NOTIFY_DONE;
}

static struct notifier_block md_notifier = {
	.notifier_call	= md_notify_reboot,
	.next		= NULL,
	.priority	= INT_MAX, /* before any real devices */
};

static void md_geninit(void)
{
	pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));

	proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
}
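
/*
 * md_init() sets up the module-wide infrastructure: the "md" and
 * "md_misc" workqueues, the block-device majors for md and the
 * partitionable "mdp" devices, the blk regions probed via md_probe(),
 * the reboot notifier, the raid sysctl table and /proc/mdstat.
 */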
static int __init md_init(void)
{
	int ret = -ENOMEM;

	md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
	if (!md_wq)
		goto err_wq;

	md_misc_wq = alloc_workqueue("md_misc", 0, 0);
	if (!md_misc_wq)
		goto err_misc_wq;

	if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
		goto err_md;

	if ((ret = register_blkdev(0, "mdp")) < 0)
		goto err_mdp;
	mdp_major = ret;

	blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE,
			    md_probe, NULL, NULL);
	blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
			    md_probe, NULL, NULL);

	register_reboot_notifier(&md_notifier);
	raid_table_header = register_sysctl_table(raid_root_table);

	md_geninit();
	return 0;

err_mdp:
	unregister_blkdev(MD_MAJOR, "md");
err_md:
	destroy_workqueue(md_misc_wq);
err_misc_wq:
	destroy_workqueue(md_wq);
err_wq:
	return ret;
}

static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	struct md_rdev *rdev2;
	int role, ret;
	char b[BDEVNAME_SIZE];

	/* Check for change of roles in the active devices */
	rdev_for_each(rdev2, mddev) {
		if (test_bit(Faulty, &rdev2->flags))
			continue;

		/* Check if the roles changed */
		role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);

		if (test_bit(Candidate, &rdev2->flags)) {
			if (role == 0xfffe) {
				pr_info("md: Removing Candidate device %s because add failed\n",
					bdevname(rdev2->bdev, b));
				md_kick_rdev_from_array(rdev2);
				continue;
			} else
				clear_bit(Candidate, &rdev2->flags);
		}

		if (role != rdev2->raid_disk) {
			/* got activated */
			if (rdev2->raid_disk == -1 && role != 0xffff) {
				rdev2->saved_raid_disk = role;
				ret = remove_and_add_spares(mddev, rdev2);
				pr_info("Activated spare: %s\n",
					bdevname(rdev2->bdev, b));
			}
			/* device faulty
			 * We just want to do the minimum to mark the disk
			 * as faulty. The recovery is performed by the
			 * one who initiated the error.
			 */
			if ((role == 0xfffe) || (role == 0xfffd)) {
				md_error(mddev, rdev2);
				clear_bit(Blocked, &rdev2->flags);
			}
		}
	}

	if (mddev->raid_disks != le32_to_cpu(sb->raid_disks))
		update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));

	/* Finally set the event to be up to date */
	mddev->events = le64_to_cpu(sb->events);
}
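
/*
 * read_rdev() - re-read one device's superblock after another cluster
 * node has updated it.  The current sb_page is parked in 'swapout' and
 * restored if load_super() fails, so a read error cannot leave the rdev
 * without valid metadata.
 */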
8686 */ 8687 if ((role == 0xfffe) || (role == 0xfffd)) { 8688 md_error(mddev, rdev2); 8689 clear_bit(Blocked, &rdev2->flags); 8690 } 8691 } 8692 } 8693 8694 if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) 8695 update_raid_disks(mddev, le32_to_cpu(sb->raid_disks)); 8696 8697 /* Finally set the event to be up to date */ 8698 mddev->events = le64_to_cpu(sb->events); 8699 } 8700 8701 static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) 8702 { 8703 int err; 8704 struct page *swapout = rdev->sb_page; 8705 struct mdp_superblock_1 *sb; 8706 8707 /* Store the sb page of the rdev in the swapout temporary 8708 * variable in case we err in the future 8709 */ 8710 rdev->sb_page = NULL; 8711 alloc_disk_sb(rdev); 8712 ClearPageUptodate(rdev->sb_page); 8713 rdev->sb_loaded = 0; 8714 err = super_types[mddev->major_version].load_super(rdev, NULL, mddev->minor_version); 8715 8716 if (err < 0) { 8717 pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n", 8718 __func__, __LINE__, rdev->desc_nr, err); 8719 put_page(rdev->sb_page); 8720 rdev->sb_page = swapout; 8721 rdev->sb_loaded = 1; 8722 return err; 8723 } 8724 8725 sb = page_address(rdev->sb_page); 8726 /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET 8727 * is not set 8728 */ 8729 8730 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET)) 8731 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 8732 8733 /* The other node finished recovery, call spare_active to set 8734 * device In_sync and mddev->degraded 8735 */ 8736 if (rdev->recovery_offset == MaxSector && 8737 !test_bit(In_sync, &rdev->flags) && 8738 mddev->pers->spare_active(mddev)) 8739 sysfs_notify(&mddev->kobj, NULL, "degraded"); 8740 8741 put_page(swapout); 8742 return 0; 8743 } 8744 8745 void md_reload_sb(struct mddev *mddev, int nr) 8746 { 8747 struct md_rdev *rdev; 8748 int err; 8749 8750 /* Find the rdev */ 8751 rdev_for_each_rcu(rdev, mddev) { 8752 if (rdev->desc_nr == nr) 8753 break; 8754 } 8755 8756 if (!rdev || rdev->desc_nr != nr) { 8757 pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr); 8758 return; 8759 } 8760 8761 err = read_rdev(mddev, rdev); 8762 if (err < 0) 8763 return; 8764 8765 check_sb_changes(mddev, rdev); 8766 8767 /* Read all rdev's to update recovery_offset */ 8768 rdev_for_each_rcu(rdev, mddev) 8769 read_rdev(mddev, rdev); 8770 } 8771 EXPORT_SYMBOL(md_reload_sb); 8772 8773 #ifndef MODULE 8774 8775 /* 8776 * Searches all registered partitions for autorun RAID arrays 8777 * at boot time. 
static LIST_HEAD(all_detected_devices);
struct detected_devices_node {
	struct list_head list;
	dev_t dev;
};

void md_autodetect_dev(dev_t dev)
{
	struct detected_devices_node *node_detected_dev;

	node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
	if (node_detected_dev) {
		node_detected_dev->dev = dev;
		list_add_tail(&node_detected_dev->list, &all_detected_devices);
	} else {
		printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed"
		       ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev));
	}
}

static void autostart_arrays(int part)
{
	struct md_rdev *rdev;
	struct detected_devices_node *node_detected_dev;
	dev_t dev;
	int i_scanned, i_passed;

	i_scanned = 0;
	i_passed = 0;

	printk(KERN_INFO "md: Autodetecting RAID arrays.\n");

	while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
		i_scanned++;
		node_detected_dev = list_entry(all_detected_devices.next,
					       struct detected_devices_node, list);
		list_del(&node_detected_dev->list);
		dev = node_detected_dev->dev;
		kfree(node_detected_dev);
		rdev = md_import_device(dev, 0, 90);
		if (IS_ERR(rdev))
			continue;

		if (test_bit(Faulty, &rdev->flags))
			continue;

		set_bit(AutoDetected, &rdev->flags);
		list_add(&rdev->same_set, &pending_raid_disks);
		i_passed++;
	}

	printk(KERN_INFO "md: Scanned %d and added %d devices.\n",
	       i_scanned, i_passed);

	autorun_devices(part);
}

#endif /* !MODULE */

static __exit void md_exit(void)
{
	struct mddev *mddev;
	struct list_head *tmp;
	int delay = 1;

	blk_unregister_region(MKDEV(MD_MAJOR,0), 512);
	blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);

	unregister_blkdev(MD_MAJOR,"md");
	unregister_blkdev(mdp_major, "mdp");
	unregister_reboot_notifier(&md_notifier);
	unregister_sysctl_table(raid_table_header);

	/* We cannot unload the modules while some process is
	 * waiting for us in select() or poll() - wake them up
	 */
	md_unloading = 1;
	while (waitqueue_active(&md_event_waiters)) {
		/* not safe to leave yet */
		wake_up(&md_event_waiters);
		msleep(delay);
		delay += delay;
	}
	remove_proc_entry("mdstat", NULL);

	for_each_mddev(mddev, tmp) {
		export_array(mddev);
		mddev->hold_active = 0;
	}
	destroy_workqueue(md_misc_wq);
	destroy_workqueue(md_wq);
}

subsys_initcall(md_init);
module_exit(md_exit)

static int get_ro(char *buffer, struct kernel_param *kp)
{
	return sprintf(buffer, "%d", start_readonly);
}
static int set_ro(const char *val, struct kernel_param *kp)
{
	return kstrtouint(val, 10, (unsigned int *)&start_readonly);
}

module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MD RAID framework");
MODULE_ALIAS("md");
MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);