1 /* 2 md.c : Multiple Devices driver for Linux 3 Copyright (C) 1998, 1999, 2000 Ingo Molnar 4 5 completely rewritten, based on the MD driver code from Marc Zyngier 6 7 Changes: 8 9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar 10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> 11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> 12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su> 13 - kmod support by: Cyrus Durgin 14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> 15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> 16 17 - lots of fixes and improvements to the RAID1/RAID5 and generic 18 RAID code (such as request based resynchronization): 19 20 Neil Brown <neilb@cse.unsw.edu.au>. 21 22 - persistent bitmap code 23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. 24 25 This program is free software; you can redistribute it and/or modify 26 it under the terms of the GNU General Public License as published by 27 the Free Software Foundation; either version 2, or (at your option) 28 any later version. 29 30 You should have received a copy of the GNU General Public License 31 (for example /usr/src/linux/COPYING); if not, write to the Free 32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 33 */ 34 35 #include <linux/module.h> 36 #include <linux/kernel.h> 37 #include <linux/kthread.h> 38 #include <linux/linkage.h> 39 #include <linux/raid/md.h> 40 #include <linux/raid/bitmap.h> 41 #include <linux/sysctl.h> 42 #include <linux/buffer_head.h> /* for invalidate_bdev */ 43 #include <linux/poll.h> 44 #include <linux/mutex.h> 45 #include <linux/ctype.h> 46 #include <linux/freezer.h> 47 #include <linux/init.h> 48 #include <linux/file.h> 49 #include <linux/kmod.h> 50 51 #include <asm/unaligned.h> 52 53 #define MAJOR_NR MD_MAJOR 54 #define MD_DRIVER 55 56 /* 63 partitions with the alternate major number (mdp) */ 57 #define MdpMinorShift 6 58 59 #define DEBUG 0 60 #define dprintk(x...) ((void)(DEBUG && printk(x))) 61 62 63 #ifndef MODULE 64 static void autostart_arrays (int part); 65 #endif 66 67 static LIST_HEAD(pers_list); 68 static DEFINE_SPINLOCK(pers_lock); 69 70 static void md_print_devices(void); 71 72 static DECLARE_WAIT_QUEUE_HEAD(resync_wait); 73 74 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } 75 76 /* 77 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 78 * is 1000 KB/sec, so the extra system load does not show up that much. 79 * Increase it if you want to have more _guaranteed_ speed. Note that 80 * the RAID driver will use the maximum available bandwidth if the IO 81 * subsystem is idle. There is also an 'absolute maximum' reconstruction 82 * speed limit - in case reconstruction slows down your system despite 83 * idle IO detection. 84 * 85 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. 86 * or /sys/block/mdX/md/sync_speed_{min,max} 87 */ 88 89 static int sysctl_speed_limit_min = 1000; 90 static int sysctl_speed_limit_max = 200000; 91 static inline int speed_min(mddev_t *mddev) 92 { 93 return mddev->sync_speed_min ? 94 mddev->sync_speed_min : sysctl_speed_limit_min; 95 } 96 97 static inline int speed_max(mddev_t *mddev) 98 { 99 return mddev->sync_speed_max ? 100 mddev->sync_speed_max : sysctl_speed_limit_max; 101 } 102 103 static struct ctl_table_header *raid_table_header; 104 105 static ctl_table raid_table[] = { 106 { 107 .ctl_name = DEV_RAID_SPEED_LIMIT_MIN, 108 .procname = "speed_limit_min", 109 .data = &sysctl_speed_limit_min, 110 .maxlen = sizeof(int), 111 .mode = S_IRUGO|S_IWUSR, 112 .proc_handler = &proc_dointvec, 113 }, 114 { 115 .ctl_name = DEV_RAID_SPEED_LIMIT_MAX, 116 .procname = "speed_limit_max", 117 .data = &sysctl_speed_limit_max, 118 .maxlen = sizeof(int), 119 .mode = S_IRUGO|S_IWUSR, 120 .proc_handler = &proc_dointvec, 121 }, 122 { .ctl_name = 0 } 123 }; 124 125 static ctl_table raid_dir_table[] = { 126 { 127 .ctl_name = DEV_RAID, 128 .procname = "raid", 129 .maxlen = 0, 130 .mode = S_IRUGO|S_IXUGO, 131 .child = raid_table, 132 }, 133 { .ctl_name = 0 } 134 }; 135 136 static ctl_table raid_root_table[] = { 137 { 138 .ctl_name = CTL_DEV, 139 .procname = "dev", 140 .maxlen = 0, 141 .mode = 0555, 142 .child = raid_dir_table, 143 }, 144 { .ctl_name = 0 } 145 }; 146 147 static struct block_device_operations md_fops; 148 149 static int start_readonly; 150 151 /* 152 * We have a system wide 'event count' that is incremented 153 * on any 'interesting' event, and readers of /proc/mdstat 154 * can use 'poll' or 'select' to find out when the event 155 * count increases. 156 * 157 * Events are: 158 * start array, stop array, error, add device, remove device, 159 * start build, activate spare 160 */ 161 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); 162 static atomic_t md_event_count; 163 void md_new_event(mddev_t *mddev) 164 { 165 atomic_inc(&md_event_count); 166 wake_up(&md_event_waiters); 167 } 168 EXPORT_SYMBOL_GPL(md_new_event); 169 170 /* Alternate version that can be called from interrupts 171 * when calling sysfs_notify isn't needed. 172 */ 173 static void md_new_event_inintr(mddev_t *mddev) 174 { 175 atomic_inc(&md_event_count); 176 wake_up(&md_event_waiters); 177 } 178 179 /* 180 * Enables to iterate over all existing md arrays 181 * all_mddevs_lock protects this list. 182 */ 183 static LIST_HEAD(all_mddevs); 184 static DEFINE_SPINLOCK(all_mddevs_lock); 185 186 187 /* 188 * iterates through all used mddevs in the system. 189 * We take care to grab the all_mddevs_lock whenever navigating 190 * the list, and to always hold a refcount when unlocked. 191 * Any code which breaks out of this loop while own 192 * a reference to the current mddev and must mddev_put it. 193 */ 194 #define for_each_mddev(mddev,tmp) \ 195 \ 196 for (({ spin_lock(&all_mddevs_lock); \ 197 tmp = all_mddevs.next; \ 198 mddev = NULL;}); \ 199 ({ if (tmp != &all_mddevs) \ 200 mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ 201 spin_unlock(&all_mddevs_lock); \ 202 if (mddev) mddev_put(mddev); \ 203 mddev = list_entry(tmp, mddev_t, all_mddevs); \ 204 tmp != &all_mddevs;}); \ 205 ({ spin_lock(&all_mddevs_lock); \ 206 tmp = tmp->next;}) \ 207 ) 208 209 210 static int md_fail_request (struct request_queue *q, struct bio *bio) 211 { 212 bio_io_error(bio); 213 return 0; 214 } 215 216 static inline mddev_t *mddev_get(mddev_t *mddev) 217 { 218 atomic_inc(&mddev->active); 219 return mddev; 220 } 221 222 static void mddev_put(mddev_t *mddev) 223 { 224 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 225 return; 226 if (!mddev->raid_disks && list_empty(&mddev->disks)) { 227 list_del(&mddev->all_mddevs); 228 spin_unlock(&all_mddevs_lock); 229 blk_cleanup_queue(mddev->queue); 230 kobject_put(&mddev->kobj); 231 } else 232 spin_unlock(&all_mddevs_lock); 233 } 234 235 static mddev_t * mddev_find(dev_t unit) 236 { 237 mddev_t *mddev, *new = NULL; 238 239 retry: 240 spin_lock(&all_mddevs_lock); 241 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 242 if (mddev->unit == unit) { 243 mddev_get(mddev); 244 spin_unlock(&all_mddevs_lock); 245 kfree(new); 246 return mddev; 247 } 248 249 if (new) { 250 list_add(&new->all_mddevs, &all_mddevs); 251 spin_unlock(&all_mddevs_lock); 252 return new; 253 } 254 spin_unlock(&all_mddevs_lock); 255 256 new = kzalloc(sizeof(*new), GFP_KERNEL); 257 if (!new) 258 return NULL; 259 260 new->unit = unit; 261 if (MAJOR(unit) == MD_MAJOR) 262 new->md_minor = MINOR(unit); 263 else 264 new->md_minor = MINOR(unit) >> MdpMinorShift; 265 266 mutex_init(&new->reconfig_mutex); 267 INIT_LIST_HEAD(&new->disks); 268 INIT_LIST_HEAD(&new->all_mddevs); 269 init_timer(&new->safemode_timer); 270 atomic_set(&new->active, 1); 271 atomic_set(&new->openers, 0); 272 spin_lock_init(&new->write_lock); 273 init_waitqueue_head(&new->sb_wait); 274 init_waitqueue_head(&new->recovery_wait); 275 new->reshape_position = MaxSector; 276 new->resync_min = 0; 277 new->resync_max = MaxSector; 278 new->level = LEVEL_NONE; 279 280 new->queue = blk_alloc_queue(GFP_KERNEL); 281 if (!new->queue) { 282 kfree(new); 283 return NULL; 284 } 285 /* Can be unlocked because the queue is new: no concurrency */ 286 queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, new->queue); 287 288 blk_queue_make_request(new->queue, md_fail_request); 289 290 goto retry; 291 } 292 293 static inline int mddev_lock(mddev_t * mddev) 294 { 295 return mutex_lock_interruptible(&mddev->reconfig_mutex); 296 } 297 298 static inline int mddev_trylock(mddev_t * mddev) 299 { 300 return mutex_trylock(&mddev->reconfig_mutex); 301 } 302 303 static inline void mddev_unlock(mddev_t * mddev) 304 { 305 mutex_unlock(&mddev->reconfig_mutex); 306 307 md_wakeup_thread(mddev->thread); 308 } 309 310 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) 311 { 312 mdk_rdev_t * rdev; 313 struct list_head *tmp; 314 315 rdev_for_each(rdev, tmp, mddev) { 316 if (rdev->desc_nr == nr) 317 return rdev; 318 } 319 return NULL; 320 } 321 322 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) 323 { 324 struct list_head *tmp; 325 mdk_rdev_t *rdev; 326 327 rdev_for_each(rdev, tmp, mddev) { 328 if (rdev->bdev->bd_dev == dev) 329 return rdev; 330 } 331 return NULL; 332 } 333 334 static struct mdk_personality *find_pers(int level, char *clevel) 335 { 336 struct mdk_personality *pers; 337 list_for_each_entry(pers, &pers_list, list) { 338 if (level != LEVEL_NONE && pers->level == level) 339 return pers; 340 if (strcmp(pers->name, clevel)==0) 341 return pers; 342 } 343 return NULL; 344 } 345 346 /* return the offset of the super block in 512byte sectors */ 347 static inline sector_t calc_dev_sboffset(struct block_device *bdev) 348 { 349 sector_t num_sectors = bdev->bd_inode->i_size / 512; 350 return MD_NEW_SIZE_SECTORS(num_sectors); 351 } 352 353 static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size) 354 { 355 sector_t num_sectors = rdev->sb_start; 356 357 if (chunk_size) 358 num_sectors &= ~((sector_t)chunk_size/512 - 1); 359 return num_sectors; 360 } 361 362 static int alloc_disk_sb(mdk_rdev_t * rdev) 363 { 364 if (rdev->sb_page) 365 MD_BUG(); 366 367 rdev->sb_page = alloc_page(GFP_KERNEL); 368 if (!rdev->sb_page) { 369 printk(KERN_ALERT "md: out of memory.\n"); 370 return -ENOMEM; 371 } 372 373 return 0; 374 } 375 376 static void free_disk_sb(mdk_rdev_t * rdev) 377 { 378 if (rdev->sb_page) { 379 put_page(rdev->sb_page); 380 rdev->sb_loaded = 0; 381 rdev->sb_page = NULL; 382 rdev->sb_start = 0; 383 rdev->size = 0; 384 } 385 } 386 387 388 static void super_written(struct bio *bio, int error) 389 { 390 mdk_rdev_t *rdev = bio->bi_private; 391 mddev_t *mddev = rdev->mddev; 392 393 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) { 394 printk("md: super_written gets error=%d, uptodate=%d\n", 395 error, test_bit(BIO_UPTODATE, &bio->bi_flags)); 396 WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags)); 397 md_error(mddev, rdev); 398 } 399 400 if (atomic_dec_and_test(&mddev->pending_writes)) 401 wake_up(&mddev->sb_wait); 402 bio_put(bio); 403 } 404 405 static void super_written_barrier(struct bio *bio, int error) 406 { 407 struct bio *bio2 = bio->bi_private; 408 mdk_rdev_t *rdev = bio2->bi_private; 409 mddev_t *mddev = rdev->mddev; 410 411 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && 412 error == -EOPNOTSUPP) { 413 unsigned long flags; 414 /* barriers don't appear to be supported :-( */ 415 set_bit(BarriersNotsupp, &rdev->flags); 416 mddev->barriers_work = 0; 417 spin_lock_irqsave(&mddev->write_lock, flags); 418 bio2->bi_next = mddev->biolist; 419 mddev->biolist = bio2; 420 spin_unlock_irqrestore(&mddev->write_lock, flags); 421 wake_up(&mddev->sb_wait); 422 bio_put(bio); 423 } else { 424 bio_put(bio2); 425 bio->bi_private = rdev; 426 super_written(bio, error); 427 } 428 } 429 430 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 431 sector_t sector, int size, struct page *page) 432 { 433 /* write first size bytes of page to sector of rdev 434 * Increment mddev->pending_writes before returning 435 * and decrement it on completion, waking up sb_wait 436 * if zero is reached. 437 * If an error occurred, call md_error 438 * 439 * As we might need to resubmit the request if BIO_RW_BARRIER 440 * causes ENOTSUPP, we allocate a spare bio... 441 */ 442 struct bio *bio = bio_alloc(GFP_NOIO, 1); 443 int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC); 444 445 bio->bi_bdev = rdev->bdev; 446 bio->bi_sector = sector; 447 bio_add_page(bio, page, size, 0); 448 bio->bi_private = rdev; 449 bio->bi_end_io = super_written; 450 bio->bi_rw = rw; 451 452 atomic_inc(&mddev->pending_writes); 453 if (!test_bit(BarriersNotsupp, &rdev->flags)) { 454 struct bio *rbio; 455 rw |= (1<<BIO_RW_BARRIER); 456 rbio = bio_clone(bio, GFP_NOIO); 457 rbio->bi_private = bio; 458 rbio->bi_end_io = super_written_barrier; 459 submit_bio(rw, rbio); 460 } else 461 submit_bio(rw, bio); 462 } 463 464 void md_super_wait(mddev_t *mddev) 465 { 466 /* wait for all superblock writes that were scheduled to complete. 467 * if any had to be retried (due to BARRIER problems), retry them 468 */ 469 DEFINE_WAIT(wq); 470 for(;;) { 471 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); 472 if (atomic_read(&mddev->pending_writes)==0) 473 break; 474 while (mddev->biolist) { 475 struct bio *bio; 476 spin_lock_irq(&mddev->write_lock); 477 bio = mddev->biolist; 478 mddev->biolist = bio->bi_next ; 479 bio->bi_next = NULL; 480 spin_unlock_irq(&mddev->write_lock); 481 submit_bio(bio->bi_rw, bio); 482 } 483 schedule(); 484 } 485 finish_wait(&mddev->sb_wait, &wq); 486 } 487 488 static void bi_complete(struct bio *bio, int error) 489 { 490 complete((struct completion*)bio->bi_private); 491 } 492 493 int sync_page_io(struct block_device *bdev, sector_t sector, int size, 494 struct page *page, int rw) 495 { 496 struct bio *bio = bio_alloc(GFP_NOIO, 1); 497 struct completion event; 498 int ret; 499 500 rw |= (1 << BIO_RW_SYNC); 501 502 bio->bi_bdev = bdev; 503 bio->bi_sector = sector; 504 bio_add_page(bio, page, size, 0); 505 init_completion(&event); 506 bio->bi_private = &event; 507 bio->bi_end_io = bi_complete; 508 submit_bio(rw, bio); 509 wait_for_completion(&event); 510 511 ret = test_bit(BIO_UPTODATE, &bio->bi_flags); 512 bio_put(bio); 513 return ret; 514 } 515 EXPORT_SYMBOL_GPL(sync_page_io); 516 517 static int read_disk_sb(mdk_rdev_t * rdev, int size) 518 { 519 char b[BDEVNAME_SIZE]; 520 if (!rdev->sb_page) { 521 MD_BUG(); 522 return -EINVAL; 523 } 524 if (rdev->sb_loaded) 525 return 0; 526 527 528 if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ)) 529 goto fail; 530 rdev->sb_loaded = 1; 531 return 0; 532 533 fail: 534 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", 535 bdevname(rdev->bdev,b)); 536 return -EINVAL; 537 } 538 539 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 540 { 541 return sb1->set_uuid0 == sb2->set_uuid0 && 542 sb1->set_uuid1 == sb2->set_uuid1 && 543 sb1->set_uuid2 == sb2->set_uuid2 && 544 sb1->set_uuid3 == sb2->set_uuid3; 545 } 546 547 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 548 { 549 int ret; 550 mdp_super_t *tmp1, *tmp2; 551 552 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 553 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 554 555 if (!tmp1 || !tmp2) { 556 ret = 0; 557 printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n"); 558 goto abort; 559 } 560 561 *tmp1 = *sb1; 562 *tmp2 = *sb2; 563 564 /* 565 * nr_disks is not constant 566 */ 567 tmp1->nr_disks = 0; 568 tmp2->nr_disks = 0; 569 570 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); 571 abort: 572 kfree(tmp1); 573 kfree(tmp2); 574 return ret; 575 } 576 577 578 static u32 md_csum_fold(u32 csum) 579 { 580 csum = (csum & 0xffff) + (csum >> 16); 581 return (csum & 0xffff) + (csum >> 16); 582 } 583 584 static unsigned int calc_sb_csum(mdp_super_t * sb) 585 { 586 u64 newcsum = 0; 587 u32 *sb32 = (u32*)sb; 588 int i; 589 unsigned int disk_csum, csum; 590 591 disk_csum = sb->sb_csum; 592 sb->sb_csum = 0; 593 594 for (i = 0; i < MD_SB_BYTES/4 ; i++) 595 newcsum += sb32[i]; 596 csum = (newcsum & 0xffffffff) + (newcsum>>32); 597 598 599 #ifdef CONFIG_ALPHA 600 /* This used to use csum_partial, which was wrong for several 601 * reasons including that different results are returned on 602 * different architectures. It isn't critical that we get exactly 603 * the same return value as before (we always csum_fold before 604 * testing, and that removes any differences). However as we 605 * know that csum_partial always returned a 16bit value on 606 * alphas, do a fold to maximise conformity to previous behaviour. 607 */ 608 sb->sb_csum = md_csum_fold(disk_csum); 609 #else 610 sb->sb_csum = disk_csum; 611 #endif 612 return csum; 613 } 614 615 616 /* 617 * Handle superblock details. 618 * We want to be able to handle multiple superblock formats 619 * so we have a common interface to them all, and an array of 620 * different handlers. 621 * We rely on user-space to write the initial superblock, and support 622 * reading and updating of superblocks. 623 * Interface methods are: 624 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) 625 * loads and validates a superblock on dev. 626 * if refdev != NULL, compare superblocks on both devices 627 * Return: 628 * 0 - dev has a superblock that is compatible with refdev 629 * 1 - dev has a superblock that is compatible and newer than refdev 630 * so dev should be used as the refdev in future 631 * -EINVAL superblock incompatible or invalid 632 * -othererror e.g. -EIO 633 * 634 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) 635 * Verify that dev is acceptable into mddev. 636 * The first time, mddev->raid_disks will be 0, and data from 637 * dev should be merged in. Subsequent calls check that dev 638 * is new enough. Return 0 or -EINVAL 639 * 640 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) 641 * Update the superblock for rdev with data in mddev 642 * This does not write to disc. 643 * 644 */ 645 646 struct super_type { 647 char *name; 648 struct module *owner; 649 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, 650 int minor_version); 651 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); 652 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); 653 unsigned long long (*rdev_size_change)(mdk_rdev_t *rdev, 654 sector_t num_sectors); 655 }; 656 657 /* 658 * load_super for 0.90.0 659 */ 660 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 661 { 662 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 663 mdp_super_t *sb; 664 int ret; 665 666 /* 667 * Calculate the position of the superblock (512byte sectors), 668 * it's at the end of the disk. 669 * 670 * It also happens to be a multiple of 4Kb. 671 */ 672 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 673 674 ret = read_disk_sb(rdev, MD_SB_BYTES); 675 if (ret) return ret; 676 677 ret = -EINVAL; 678 679 bdevname(rdev->bdev, b); 680 sb = (mdp_super_t*)page_address(rdev->sb_page); 681 682 if (sb->md_magic != MD_SB_MAGIC) { 683 printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 684 b); 685 goto abort; 686 } 687 688 if (sb->major_version != 0 || 689 sb->minor_version < 90 || 690 sb->minor_version > 91) { 691 printk(KERN_WARNING "Bad version number %d.%d on %s\n", 692 sb->major_version, sb->minor_version, 693 b); 694 goto abort; 695 } 696 697 if (sb->raid_disks <= 0) 698 goto abort; 699 700 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { 701 printk(KERN_WARNING "md: invalid superblock checksum on %s\n", 702 b); 703 goto abort; 704 } 705 706 rdev->preferred_minor = sb->md_minor; 707 rdev->data_offset = 0; 708 rdev->sb_size = MD_SB_BYTES; 709 710 if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) { 711 if (sb->level != 1 && sb->level != 4 712 && sb->level != 5 && sb->level != 6 713 && sb->level != 10) { 714 /* FIXME use a better test */ 715 printk(KERN_WARNING 716 "md: bitmaps not supported for this level.\n"); 717 goto abort; 718 } 719 } 720 721 if (sb->level == LEVEL_MULTIPATH) 722 rdev->desc_nr = -1; 723 else 724 rdev->desc_nr = sb->this_disk.number; 725 726 if (!refdev) { 727 ret = 1; 728 } else { 729 __u64 ev1, ev2; 730 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); 731 if (!uuid_equal(refsb, sb)) { 732 printk(KERN_WARNING "md: %s has different UUID to %s\n", 733 b, bdevname(refdev->bdev,b2)); 734 goto abort; 735 } 736 if (!sb_equal(refsb, sb)) { 737 printk(KERN_WARNING "md: %s has same UUID" 738 " but different superblock to %s\n", 739 b, bdevname(refdev->bdev, b2)); 740 goto abort; 741 } 742 ev1 = md_event(sb); 743 ev2 = md_event(refsb); 744 if (ev1 > ev2) 745 ret = 1; 746 else 747 ret = 0; 748 } 749 rdev->size = calc_num_sectors(rdev, sb->chunk_size) / 2; 750 751 if (rdev->size < sb->size && sb->level > 1) 752 /* "this cannot possibly happen" ... */ 753 ret = -EINVAL; 754 755 abort: 756 return ret; 757 } 758 759 /* 760 * validate_super for 0.90.0 761 */ 762 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) 763 { 764 mdp_disk_t *desc; 765 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 766 __u64 ev1 = md_event(sb); 767 768 rdev->raid_disk = -1; 769 clear_bit(Faulty, &rdev->flags); 770 clear_bit(In_sync, &rdev->flags); 771 clear_bit(WriteMostly, &rdev->flags); 772 clear_bit(BarriersNotsupp, &rdev->flags); 773 774 if (mddev->raid_disks == 0) { 775 mddev->major_version = 0; 776 mddev->minor_version = sb->minor_version; 777 mddev->patch_version = sb->patch_version; 778 mddev->external = 0; 779 mddev->chunk_size = sb->chunk_size; 780 mddev->ctime = sb->ctime; 781 mddev->utime = sb->utime; 782 mddev->level = sb->level; 783 mddev->clevel[0] = 0; 784 mddev->layout = sb->layout; 785 mddev->raid_disks = sb->raid_disks; 786 mddev->size = sb->size; 787 mddev->events = ev1; 788 mddev->bitmap_offset = 0; 789 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 790 791 if (mddev->minor_version >= 91) { 792 mddev->reshape_position = sb->reshape_position; 793 mddev->delta_disks = sb->delta_disks; 794 mddev->new_level = sb->new_level; 795 mddev->new_layout = sb->new_layout; 796 mddev->new_chunk = sb->new_chunk; 797 } else { 798 mddev->reshape_position = MaxSector; 799 mddev->delta_disks = 0; 800 mddev->new_level = mddev->level; 801 mddev->new_layout = mddev->layout; 802 mddev->new_chunk = mddev->chunk_size; 803 } 804 805 if (sb->state & (1<<MD_SB_CLEAN)) 806 mddev->recovery_cp = MaxSector; 807 else { 808 if (sb->events_hi == sb->cp_events_hi && 809 sb->events_lo == sb->cp_events_lo) { 810 mddev->recovery_cp = sb->recovery_cp; 811 } else 812 mddev->recovery_cp = 0; 813 } 814 815 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 816 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 817 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 818 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 819 820 mddev->max_disks = MD_SB_DISKS; 821 822 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 823 mddev->bitmap_file == NULL) 824 mddev->bitmap_offset = mddev->default_bitmap_offset; 825 826 } else if (mddev->pers == NULL) { 827 /* Insist on good event counter while assembling */ 828 ++ev1; 829 if (ev1 < mddev->events) 830 return -EINVAL; 831 } else if (mddev->bitmap) { 832 /* if adding to array with a bitmap, then we can accept an 833 * older device ... but not too old. 834 */ 835 if (ev1 < mddev->bitmap->events_cleared) 836 return 0; 837 } else { 838 if (ev1 < mddev->events) 839 /* just a hot-add of a new device, leave raid_disk at -1 */ 840 return 0; 841 } 842 843 if (mddev->level != LEVEL_MULTIPATH) { 844 desc = sb->disks + rdev->desc_nr; 845 846 if (desc->state & (1<<MD_DISK_FAULTY)) 847 set_bit(Faulty, &rdev->flags); 848 else if (desc->state & (1<<MD_DISK_SYNC) /* && 849 desc->raid_disk < mddev->raid_disks */) { 850 set_bit(In_sync, &rdev->flags); 851 rdev->raid_disk = desc->raid_disk; 852 } 853 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 854 set_bit(WriteMostly, &rdev->flags); 855 } else /* MULTIPATH are always insync */ 856 set_bit(In_sync, &rdev->flags); 857 return 0; 858 } 859 860 /* 861 * sync_super for 0.90.0 862 */ 863 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) 864 { 865 mdp_super_t *sb; 866 struct list_head *tmp; 867 mdk_rdev_t *rdev2; 868 int next_spare = mddev->raid_disks; 869 870 871 /* make rdev->sb match mddev data.. 872 * 873 * 1/ zero out disks 874 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 875 * 3/ any empty disks < next_spare become removed 876 * 877 * disks[0] gets initialised to REMOVED because 878 * we cannot be sure from other fields if it has 879 * been initialised or not. 880 */ 881 int i; 882 int active=0, working=0,failed=0,spare=0,nr_disks=0; 883 884 rdev->sb_size = MD_SB_BYTES; 885 886 sb = (mdp_super_t*)page_address(rdev->sb_page); 887 888 memset(sb, 0, sizeof(*sb)); 889 890 sb->md_magic = MD_SB_MAGIC; 891 sb->major_version = mddev->major_version; 892 sb->patch_version = mddev->patch_version; 893 sb->gvalid_words = 0; /* ignored */ 894 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 895 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 896 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 897 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 898 899 sb->ctime = mddev->ctime; 900 sb->level = mddev->level; 901 sb->size = mddev->size; 902 sb->raid_disks = mddev->raid_disks; 903 sb->md_minor = mddev->md_minor; 904 sb->not_persistent = 0; 905 sb->utime = mddev->utime; 906 sb->state = 0; 907 sb->events_hi = (mddev->events>>32); 908 sb->events_lo = (u32)mddev->events; 909 910 if (mddev->reshape_position == MaxSector) 911 sb->minor_version = 90; 912 else { 913 sb->minor_version = 91; 914 sb->reshape_position = mddev->reshape_position; 915 sb->new_level = mddev->new_level; 916 sb->delta_disks = mddev->delta_disks; 917 sb->new_layout = mddev->new_layout; 918 sb->new_chunk = mddev->new_chunk; 919 } 920 mddev->minor_version = sb->minor_version; 921 if (mddev->in_sync) 922 { 923 sb->recovery_cp = mddev->recovery_cp; 924 sb->cp_events_hi = (mddev->events>>32); 925 sb->cp_events_lo = (u32)mddev->events; 926 if (mddev->recovery_cp == MaxSector) 927 sb->state = (1<< MD_SB_CLEAN); 928 } else 929 sb->recovery_cp = 0; 930 931 sb->layout = mddev->layout; 932 sb->chunk_size = mddev->chunk_size; 933 934 if (mddev->bitmap && mddev->bitmap_file == NULL) 935 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 936 937 sb->disks[0].state = (1<<MD_DISK_REMOVED); 938 rdev_for_each(rdev2, tmp, mddev) { 939 mdp_disk_t *d; 940 int desc_nr; 941 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 942 && !test_bit(Faulty, &rdev2->flags)) 943 desc_nr = rdev2->raid_disk; 944 else 945 desc_nr = next_spare++; 946 rdev2->desc_nr = desc_nr; 947 d = &sb->disks[rdev2->desc_nr]; 948 nr_disks++; 949 d->number = rdev2->desc_nr; 950 d->major = MAJOR(rdev2->bdev->bd_dev); 951 d->minor = MINOR(rdev2->bdev->bd_dev); 952 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 953 && !test_bit(Faulty, &rdev2->flags)) 954 d->raid_disk = rdev2->raid_disk; 955 else 956 d->raid_disk = rdev2->desc_nr; /* compatibility */ 957 if (test_bit(Faulty, &rdev2->flags)) 958 d->state = (1<<MD_DISK_FAULTY); 959 else if (test_bit(In_sync, &rdev2->flags)) { 960 d->state = (1<<MD_DISK_ACTIVE); 961 d->state |= (1<<MD_DISK_SYNC); 962 active++; 963 working++; 964 } else { 965 d->state = 0; 966 spare++; 967 working++; 968 } 969 if (test_bit(WriteMostly, &rdev2->flags)) 970 d->state |= (1<<MD_DISK_WRITEMOSTLY); 971 } 972 /* now set the "removed" and "faulty" bits on any missing devices */ 973 for (i=0 ; i < mddev->raid_disks ; i++) { 974 mdp_disk_t *d = &sb->disks[i]; 975 if (d->state == 0 && d->number == 0) { 976 d->number = i; 977 d->raid_disk = i; 978 d->state = (1<<MD_DISK_REMOVED); 979 d->state |= (1<<MD_DISK_FAULTY); 980 failed++; 981 } 982 } 983 sb->nr_disks = nr_disks; 984 sb->active_disks = active; 985 sb->working_disks = working; 986 sb->failed_disks = failed; 987 sb->spare_disks = spare; 988 989 sb->this_disk = sb->disks[rdev->desc_nr]; 990 sb->sb_csum = calc_sb_csum(sb); 991 } 992 993 /* 994 * rdev_size_change for 0.90.0 995 */ 996 static unsigned long long 997 super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) 998 { 999 if (num_sectors && num_sectors < rdev->mddev->size * 2) 1000 return 0; /* component must fit device */ 1001 if (rdev->mddev->bitmap_offset) 1002 return 0; /* can't move bitmap */ 1003 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 1004 if (!num_sectors || num_sectors > rdev->sb_start) 1005 num_sectors = rdev->sb_start; 1006 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1007 rdev->sb_page); 1008 md_super_wait(rdev->mddev); 1009 return num_sectors / 2; /* kB for sysfs */ 1010 } 1011 1012 1013 /* 1014 * version 1 superblock 1015 */ 1016 1017 static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) 1018 { 1019 __le32 disk_csum; 1020 u32 csum; 1021 unsigned long long newcsum; 1022 int size = 256 + le32_to_cpu(sb->max_dev)*2; 1023 __le32 *isuper = (__le32*)sb; 1024 int i; 1025 1026 disk_csum = sb->sb_csum; 1027 sb->sb_csum = 0; 1028 newcsum = 0; 1029 for (i=0; size>=4; size -= 4 ) 1030 newcsum += le32_to_cpu(*isuper++); 1031 1032 if (size == 2) 1033 newcsum += le16_to_cpu(*(__le16*) isuper); 1034 1035 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 1036 sb->sb_csum = disk_csum; 1037 return cpu_to_le32(csum); 1038 } 1039 1040 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 1041 { 1042 struct mdp_superblock_1 *sb; 1043 int ret; 1044 sector_t sb_start; 1045 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1046 int bmask; 1047 1048 /* 1049 * Calculate the position of the superblock in 512byte sectors. 1050 * It is always aligned to a 4K boundary and 1051 * depeding on minor_version, it can be: 1052 * 0: At least 8K, but less than 12K, from end of device 1053 * 1: At start of device 1054 * 2: 4K from start of device. 1055 */ 1056 switch(minor_version) { 1057 case 0: 1058 sb_start = rdev->bdev->bd_inode->i_size >> 9; 1059 sb_start -= 8*2; 1060 sb_start &= ~(sector_t)(4*2-1); 1061 break; 1062 case 1: 1063 sb_start = 0; 1064 break; 1065 case 2: 1066 sb_start = 8; 1067 break; 1068 default: 1069 return -EINVAL; 1070 } 1071 rdev->sb_start = sb_start; 1072 1073 /* superblock is rarely larger than 1K, but it can be larger, 1074 * and it is safe to read 4k, so we do that 1075 */ 1076 ret = read_disk_sb(rdev, 4096); 1077 if (ret) return ret; 1078 1079 1080 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1081 1082 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1083 sb->major_version != cpu_to_le32(1) || 1084 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1085 le64_to_cpu(sb->super_offset) != rdev->sb_start || 1086 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1087 return -EINVAL; 1088 1089 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1090 printk("md: invalid superblock checksum on %s\n", 1091 bdevname(rdev->bdev,b)); 1092 return -EINVAL; 1093 } 1094 if (le64_to_cpu(sb->data_size) < 10) { 1095 printk("md: data_size too small on %s\n", 1096 bdevname(rdev->bdev,b)); 1097 return -EINVAL; 1098 } 1099 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) { 1100 if (sb->level != cpu_to_le32(1) && 1101 sb->level != cpu_to_le32(4) && 1102 sb->level != cpu_to_le32(5) && 1103 sb->level != cpu_to_le32(6) && 1104 sb->level != cpu_to_le32(10)) { 1105 printk(KERN_WARNING 1106 "md: bitmaps not supported for this level.\n"); 1107 return -EINVAL; 1108 } 1109 } 1110 1111 rdev->preferred_minor = 0xffff; 1112 rdev->data_offset = le64_to_cpu(sb->data_offset); 1113 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1114 1115 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1116 bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; 1117 if (rdev->sb_size & bmask) 1118 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1119 1120 if (minor_version 1121 && rdev->data_offset < sb_start + (rdev->sb_size/512)) 1122 return -EINVAL; 1123 1124 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) 1125 rdev->desc_nr = -1; 1126 else 1127 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1128 1129 if (!refdev) { 1130 ret = 1; 1131 } else { 1132 __u64 ev1, ev2; 1133 struct mdp_superblock_1 *refsb = 1134 (struct mdp_superblock_1*)page_address(refdev->sb_page); 1135 1136 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1137 sb->level != refsb->level || 1138 sb->layout != refsb->layout || 1139 sb->chunksize != refsb->chunksize) { 1140 printk(KERN_WARNING "md: %s has strangely different" 1141 " superblock to %s\n", 1142 bdevname(rdev->bdev,b), 1143 bdevname(refdev->bdev,b2)); 1144 return -EINVAL; 1145 } 1146 ev1 = le64_to_cpu(sb->events); 1147 ev2 = le64_to_cpu(refsb->events); 1148 1149 if (ev1 > ev2) 1150 ret = 1; 1151 else 1152 ret = 0; 1153 } 1154 if (minor_version) 1155 rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; 1156 else 1157 rdev->size = rdev->sb_start / 2; 1158 if (rdev->size < le64_to_cpu(sb->data_size)/2) 1159 return -EINVAL; 1160 rdev->size = le64_to_cpu(sb->data_size)/2; 1161 if (le32_to_cpu(sb->chunksize)) 1162 rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); 1163 1164 if (le64_to_cpu(sb->size) > rdev->size*2) 1165 return -EINVAL; 1166 return ret; 1167 } 1168 1169 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) 1170 { 1171 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1172 __u64 ev1 = le64_to_cpu(sb->events); 1173 1174 rdev->raid_disk = -1; 1175 clear_bit(Faulty, &rdev->flags); 1176 clear_bit(In_sync, &rdev->flags); 1177 clear_bit(WriteMostly, &rdev->flags); 1178 clear_bit(BarriersNotsupp, &rdev->flags); 1179 1180 if (mddev->raid_disks == 0) { 1181 mddev->major_version = 1; 1182 mddev->patch_version = 0; 1183 mddev->external = 0; 1184 mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; 1185 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); 1186 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); 1187 mddev->level = le32_to_cpu(sb->level); 1188 mddev->clevel[0] = 0; 1189 mddev->layout = le32_to_cpu(sb->layout); 1190 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1191 mddev->size = le64_to_cpu(sb->size)/2; 1192 mddev->events = ev1; 1193 mddev->bitmap_offset = 0; 1194 mddev->default_bitmap_offset = 1024 >> 9; 1195 1196 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 1197 memcpy(mddev->uuid, sb->set_uuid, 16); 1198 1199 mddev->max_disks = (4096-256)/2; 1200 1201 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1202 mddev->bitmap_file == NULL ) 1203 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); 1204 1205 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 1206 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 1207 mddev->delta_disks = le32_to_cpu(sb->delta_disks); 1208 mddev->new_level = le32_to_cpu(sb->new_level); 1209 mddev->new_layout = le32_to_cpu(sb->new_layout); 1210 mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9; 1211 } else { 1212 mddev->reshape_position = MaxSector; 1213 mddev->delta_disks = 0; 1214 mddev->new_level = mddev->level; 1215 mddev->new_layout = mddev->layout; 1216 mddev->new_chunk = mddev->chunk_size; 1217 } 1218 1219 } else if (mddev->pers == NULL) { 1220 /* Insist of good event counter while assembling */ 1221 ++ev1; 1222 if (ev1 < mddev->events) 1223 return -EINVAL; 1224 } else if (mddev->bitmap) { 1225 /* If adding to array with a bitmap, then we can accept an 1226 * older device, but not too old. 1227 */ 1228 if (ev1 < mddev->bitmap->events_cleared) 1229 return 0; 1230 } else { 1231 if (ev1 < mddev->events) 1232 /* just a hot-add of a new device, leave raid_disk at -1 */ 1233 return 0; 1234 } 1235 if (mddev->level != LEVEL_MULTIPATH) { 1236 int role; 1237 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1238 switch(role) { 1239 case 0xffff: /* spare */ 1240 break; 1241 case 0xfffe: /* faulty */ 1242 set_bit(Faulty, &rdev->flags); 1243 break; 1244 default: 1245 if ((le32_to_cpu(sb->feature_map) & 1246 MD_FEATURE_RECOVERY_OFFSET)) 1247 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 1248 else 1249 set_bit(In_sync, &rdev->flags); 1250 rdev->raid_disk = role; 1251 break; 1252 } 1253 if (sb->devflags & WriteMostly1) 1254 set_bit(WriteMostly, &rdev->flags); 1255 } else /* MULTIPATH are always insync */ 1256 set_bit(In_sync, &rdev->flags); 1257 1258 return 0; 1259 } 1260 1261 static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) 1262 { 1263 struct mdp_superblock_1 *sb; 1264 struct list_head *tmp; 1265 mdk_rdev_t *rdev2; 1266 int max_dev, i; 1267 /* make rdev->sb match mddev and rdev data. */ 1268 1269 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1270 1271 sb->feature_map = 0; 1272 sb->pad0 = 0; 1273 sb->recovery_offset = cpu_to_le64(0); 1274 memset(sb->pad1, 0, sizeof(sb->pad1)); 1275 memset(sb->pad2, 0, sizeof(sb->pad2)); 1276 memset(sb->pad3, 0, sizeof(sb->pad3)); 1277 1278 sb->utime = cpu_to_le64((__u64)mddev->utime); 1279 sb->events = cpu_to_le64(mddev->events); 1280 if (mddev->in_sync) 1281 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 1282 else 1283 sb->resync_offset = cpu_to_le64(0); 1284 1285 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); 1286 1287 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 1288 sb->size = cpu_to_le64(mddev->size<<1); 1289 1290 if (mddev->bitmap && mddev->bitmap_file == NULL) { 1291 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1292 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1293 } 1294 1295 if (rdev->raid_disk >= 0 && 1296 !test_bit(In_sync, &rdev->flags) && 1297 rdev->recovery_offset > 0) { 1298 sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 1299 sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); 1300 } 1301 1302 if (mddev->reshape_position != MaxSector) { 1303 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 1304 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 1305 sb->new_layout = cpu_to_le32(mddev->new_layout); 1306 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 1307 sb->new_level = cpu_to_le32(mddev->new_level); 1308 sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9); 1309 } 1310 1311 max_dev = 0; 1312 rdev_for_each(rdev2, tmp, mddev) 1313 if (rdev2->desc_nr+1 > max_dev) 1314 max_dev = rdev2->desc_nr+1; 1315 1316 if (max_dev > le32_to_cpu(sb->max_dev)) 1317 sb->max_dev = cpu_to_le32(max_dev); 1318 for (i=0; i<max_dev;i++) 1319 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1320 1321 rdev_for_each(rdev2, tmp, mddev) { 1322 i = rdev2->desc_nr; 1323 if (test_bit(Faulty, &rdev2->flags)) 1324 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1325 else if (test_bit(In_sync, &rdev2->flags)) 1326 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1327 else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0) 1328 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1329 else 1330 sb->dev_roles[i] = cpu_to_le16(0xffff); 1331 } 1332 1333 sb->sb_csum = calc_sb_1_csum(sb); 1334 } 1335 1336 static unsigned long long 1337 super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) 1338 { 1339 struct mdp_superblock_1 *sb; 1340 sector_t max_sectors; 1341 if (num_sectors && num_sectors < rdev->mddev->size * 2) 1342 return 0; /* component must fit device */ 1343 if (rdev->sb_start < rdev->data_offset) { 1344 /* minor versions 1 and 2; superblock before data */ 1345 max_sectors = rdev->bdev->bd_inode->i_size >> 9; 1346 max_sectors -= rdev->data_offset; 1347 if (!num_sectors || num_sectors > max_sectors) 1348 num_sectors = max_sectors; 1349 } else if (rdev->mddev->bitmap_offset) { 1350 /* minor version 0 with bitmap we can't move */ 1351 return 0; 1352 } else { 1353 /* minor version 0; superblock after data */ 1354 sector_t sb_start; 1355 sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2; 1356 sb_start &= ~(sector_t)(4*2 - 1); 1357 max_sectors = rdev->size * 2 + sb_start - rdev->sb_start; 1358 if (!num_sectors || num_sectors > max_sectors) 1359 num_sectors = max_sectors; 1360 rdev->sb_start = sb_start; 1361 } 1362 sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page); 1363 sb->data_size = cpu_to_le64(num_sectors); 1364 sb->super_offset = rdev->sb_start; 1365 sb->sb_csum = calc_sb_1_csum(sb); 1366 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1367 rdev->sb_page); 1368 md_super_wait(rdev->mddev); 1369 return num_sectors / 2; /* kB for sysfs */ 1370 } 1371 1372 static struct super_type super_types[] = { 1373 [0] = { 1374 .name = "0.90.0", 1375 .owner = THIS_MODULE, 1376 .load_super = super_90_load, 1377 .validate_super = super_90_validate, 1378 .sync_super = super_90_sync, 1379 .rdev_size_change = super_90_rdev_size_change, 1380 }, 1381 [1] = { 1382 .name = "md-1", 1383 .owner = THIS_MODULE, 1384 .load_super = super_1_load, 1385 .validate_super = super_1_validate, 1386 .sync_super = super_1_sync, 1387 .rdev_size_change = super_1_rdev_size_change, 1388 }, 1389 }; 1390 1391 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) 1392 { 1393 mdk_rdev_t *rdev, *rdev2; 1394 1395 rcu_read_lock(); 1396 rdev_for_each_rcu(rdev, mddev1) 1397 rdev_for_each_rcu(rdev2, mddev2) 1398 if (rdev->bdev->bd_contains == 1399 rdev2->bdev->bd_contains) { 1400 rcu_read_unlock(); 1401 return 1; 1402 } 1403 rcu_read_unlock(); 1404 return 0; 1405 } 1406 1407 static LIST_HEAD(pending_raid_disks); 1408 1409 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) 1410 { 1411 char b[BDEVNAME_SIZE]; 1412 struct kobject *ko; 1413 char *s; 1414 int err; 1415 1416 if (rdev->mddev) { 1417 MD_BUG(); 1418 return -EINVAL; 1419 } 1420 1421 /* prevent duplicates */ 1422 if (find_rdev(mddev, rdev->bdev->bd_dev)) 1423 return -EEXIST; 1424 1425 /* make sure rdev->size exceeds mddev->size */ 1426 if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) { 1427 if (mddev->pers) { 1428 /* Cannot change size, so fail 1429 * If mddev->level <= 0, then we don't care 1430 * about aligning sizes (e.g. linear) 1431 */ 1432 if (mddev->level > 0) 1433 return -ENOSPC; 1434 } else 1435 mddev->size = rdev->size; 1436 } 1437 1438 /* Verify rdev->desc_nr is unique. 1439 * If it is -1, assign a free number, else 1440 * check number is not in use 1441 */ 1442 if (rdev->desc_nr < 0) { 1443 int choice = 0; 1444 if (mddev->pers) choice = mddev->raid_disks; 1445 while (find_rdev_nr(mddev, choice)) 1446 choice++; 1447 rdev->desc_nr = choice; 1448 } else { 1449 if (find_rdev_nr(mddev, rdev->desc_nr)) 1450 return -EBUSY; 1451 } 1452 bdevname(rdev->bdev,b); 1453 while ( (s=strchr(b, '/')) != NULL) 1454 *s = '!'; 1455 1456 rdev->mddev = mddev; 1457 printk(KERN_INFO "md: bind<%s>\n", b); 1458 1459 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 1460 goto fail; 1461 1462 ko = &part_to_dev(rdev->bdev->bd_part)->kobj; 1463 if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) { 1464 kobject_del(&rdev->kobj); 1465 goto fail; 1466 } 1467 list_add_rcu(&rdev->same_set, &mddev->disks); 1468 bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); 1469 return 0; 1470 1471 fail: 1472 printk(KERN_WARNING "md: failed to register dev-%s for %s\n", 1473 b, mdname(mddev)); 1474 return err; 1475 } 1476 1477 static void md_delayed_delete(struct work_struct *ws) 1478 { 1479 mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work); 1480 kobject_del(&rdev->kobj); 1481 kobject_put(&rdev->kobj); 1482 } 1483 1484 static void unbind_rdev_from_array(mdk_rdev_t * rdev) 1485 { 1486 char b[BDEVNAME_SIZE]; 1487 if (!rdev->mddev) { 1488 MD_BUG(); 1489 return; 1490 } 1491 bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); 1492 list_del_rcu(&rdev->same_set); 1493 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 1494 rdev->mddev = NULL; 1495 sysfs_remove_link(&rdev->kobj, "block"); 1496 1497 /* We need to delay this, otherwise we can deadlock when 1498 * writing to 'remove' to "dev/state". We also need 1499 * to delay it due to rcu usage. 1500 */ 1501 synchronize_rcu(); 1502 INIT_WORK(&rdev->del_work, md_delayed_delete); 1503 kobject_get(&rdev->kobj); 1504 schedule_work(&rdev->del_work); 1505 } 1506 1507 /* 1508 * prevent the device from being mounted, repartitioned or 1509 * otherwise reused by a RAID array (or any other kernel 1510 * subsystem), by bd_claiming the device. 1511 */ 1512 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared) 1513 { 1514 int err = 0; 1515 struct block_device *bdev; 1516 char b[BDEVNAME_SIZE]; 1517 1518 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 1519 if (IS_ERR(bdev)) { 1520 printk(KERN_ERR "md: could not open %s.\n", 1521 __bdevname(dev, b)); 1522 return PTR_ERR(bdev); 1523 } 1524 err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev); 1525 if (err) { 1526 printk(KERN_ERR "md: could not bd_claim %s.\n", 1527 bdevname(bdev, b)); 1528 blkdev_put(bdev); 1529 return err; 1530 } 1531 if (!shared) 1532 set_bit(AllReserved, &rdev->flags); 1533 rdev->bdev = bdev; 1534 return err; 1535 } 1536 1537 static void unlock_rdev(mdk_rdev_t *rdev) 1538 { 1539 struct block_device *bdev = rdev->bdev; 1540 rdev->bdev = NULL; 1541 if (!bdev) 1542 MD_BUG(); 1543 bd_release(bdev); 1544 blkdev_put(bdev); 1545 } 1546 1547 void md_autodetect_dev(dev_t dev); 1548 1549 static void export_rdev(mdk_rdev_t * rdev) 1550 { 1551 char b[BDEVNAME_SIZE]; 1552 printk(KERN_INFO "md: export_rdev(%s)\n", 1553 bdevname(rdev->bdev,b)); 1554 if (rdev->mddev) 1555 MD_BUG(); 1556 free_disk_sb(rdev); 1557 #ifndef MODULE 1558 if (test_bit(AutoDetected, &rdev->flags)) 1559 md_autodetect_dev(rdev->bdev->bd_dev); 1560 #endif 1561 unlock_rdev(rdev); 1562 kobject_put(&rdev->kobj); 1563 } 1564 1565 static void kick_rdev_from_array(mdk_rdev_t * rdev) 1566 { 1567 unbind_rdev_from_array(rdev); 1568 export_rdev(rdev); 1569 } 1570 1571 static void export_array(mddev_t *mddev) 1572 { 1573 struct list_head *tmp; 1574 mdk_rdev_t *rdev; 1575 1576 rdev_for_each(rdev, tmp, mddev) { 1577 if (!rdev->mddev) { 1578 MD_BUG(); 1579 continue; 1580 } 1581 kick_rdev_from_array(rdev); 1582 } 1583 if (!list_empty(&mddev->disks)) 1584 MD_BUG(); 1585 mddev->raid_disks = 0; 1586 mddev->major_version = 0; 1587 } 1588 1589 static void print_desc(mdp_disk_t *desc) 1590 { 1591 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, 1592 desc->major,desc->minor,desc->raid_disk,desc->state); 1593 } 1594 1595 static void print_sb(mdp_super_t *sb) 1596 { 1597 int i; 1598 1599 printk(KERN_INFO 1600 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", 1601 sb->major_version, sb->minor_version, sb->patch_version, 1602 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, 1603 sb->ctime); 1604 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", 1605 sb->level, sb->size, sb->nr_disks, sb->raid_disks, 1606 sb->md_minor, sb->layout, sb->chunk_size); 1607 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" 1608 " FD:%d SD:%d CSUM:%08x E:%08lx\n", 1609 sb->utime, sb->state, sb->active_disks, sb->working_disks, 1610 sb->failed_disks, sb->spare_disks, 1611 sb->sb_csum, (unsigned long)sb->events_lo); 1612 1613 printk(KERN_INFO); 1614 for (i = 0; i < MD_SB_DISKS; i++) { 1615 mdp_disk_t *desc; 1616 1617 desc = sb->disks + i; 1618 if (desc->number || desc->major || desc->minor || 1619 desc->raid_disk || (desc->state && (desc->state != 4))) { 1620 printk(" D %2d: ", i); 1621 print_desc(desc); 1622 } 1623 } 1624 printk(KERN_INFO "md: THIS: "); 1625 print_desc(&sb->this_disk); 1626 1627 } 1628 1629 static void print_rdev(mdk_rdev_t *rdev) 1630 { 1631 char b[BDEVNAME_SIZE]; 1632 printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", 1633 bdevname(rdev->bdev,b), (unsigned long long)rdev->size, 1634 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), 1635 rdev->desc_nr); 1636 if (rdev->sb_loaded) { 1637 printk(KERN_INFO "md: rdev superblock:\n"); 1638 print_sb((mdp_super_t*)page_address(rdev->sb_page)); 1639 } else 1640 printk(KERN_INFO "md: no rdev superblock!\n"); 1641 } 1642 1643 static void md_print_devices(void) 1644 { 1645 struct list_head *tmp, *tmp2; 1646 mdk_rdev_t *rdev; 1647 mddev_t *mddev; 1648 char b[BDEVNAME_SIZE]; 1649 1650 printk("\n"); 1651 printk("md: **********************************\n"); 1652 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); 1653 printk("md: **********************************\n"); 1654 for_each_mddev(mddev, tmp) { 1655 1656 if (mddev->bitmap) 1657 bitmap_print_sb(mddev->bitmap); 1658 else 1659 printk("%s: ", mdname(mddev)); 1660 rdev_for_each(rdev, tmp2, mddev) 1661 printk("<%s>", bdevname(rdev->bdev,b)); 1662 printk("\n"); 1663 1664 rdev_for_each(rdev, tmp2, mddev) 1665 print_rdev(rdev); 1666 } 1667 printk("md: **********************************\n"); 1668 printk("\n"); 1669 } 1670 1671 1672 static void sync_sbs(mddev_t * mddev, int nospares) 1673 { 1674 /* Update each superblock (in-memory image), but 1675 * if we are allowed to, skip spares which already 1676 * have the right event counter, or have one earlier 1677 * (which would mean they aren't being marked as dirty 1678 * with the rest of the array) 1679 */ 1680 mdk_rdev_t *rdev; 1681 struct list_head *tmp; 1682 1683 rdev_for_each(rdev, tmp, mddev) { 1684 if (rdev->sb_events == mddev->events || 1685 (nospares && 1686 rdev->raid_disk < 0 && 1687 (rdev->sb_events&1)==0 && 1688 rdev->sb_events+1 == mddev->events)) { 1689 /* Don't update this superblock */ 1690 rdev->sb_loaded = 2; 1691 } else { 1692 super_types[mddev->major_version]. 1693 sync_super(mddev, rdev); 1694 rdev->sb_loaded = 1; 1695 } 1696 } 1697 } 1698 1699 static void md_update_sb(mddev_t * mddev, int force_change) 1700 { 1701 struct list_head *tmp; 1702 mdk_rdev_t *rdev; 1703 int sync_req; 1704 int nospares = 0; 1705 1706 if (mddev->external) 1707 return; 1708 repeat: 1709 spin_lock_irq(&mddev->write_lock); 1710 1711 set_bit(MD_CHANGE_PENDING, &mddev->flags); 1712 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) 1713 force_change = 1; 1714 if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) 1715 /* just a clean<-> dirty transition, possibly leave spares alone, 1716 * though if events isn't the right even/odd, we will have to do 1717 * spares after all 1718 */ 1719 nospares = 1; 1720 if (force_change) 1721 nospares = 0; 1722 if (mddev->degraded) 1723 /* If the array is degraded, then skipping spares is both 1724 * dangerous and fairly pointless. 1725 * Dangerous because a device that was removed from the array 1726 * might have a event_count that still looks up-to-date, 1727 * so it can be re-added without a resync. 1728 * Pointless because if there are any spares to skip, 1729 * then a recovery will happen and soon that array won't 1730 * be degraded any more and the spare can go back to sleep then. 1731 */ 1732 nospares = 0; 1733 1734 sync_req = mddev->in_sync; 1735 mddev->utime = get_seconds(); 1736 1737 /* If this is just a dirty<->clean transition, and the array is clean 1738 * and 'events' is odd, we can roll back to the previous clean state */ 1739 if (nospares 1740 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 1741 && (mddev->events & 1) 1742 && mddev->events != 1) 1743 mddev->events--; 1744 else { 1745 /* otherwise we have to go forward and ... */ 1746 mddev->events ++; 1747 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ 1748 /* .. if the array isn't clean, insist on an odd 'events' */ 1749 if ((mddev->events&1)==0) { 1750 mddev->events++; 1751 nospares = 0; 1752 } 1753 } else { 1754 /* otherwise insist on an even 'events' (for clean states) */ 1755 if ((mddev->events&1)) { 1756 mddev->events++; 1757 nospares = 0; 1758 } 1759 } 1760 } 1761 1762 if (!mddev->events) { 1763 /* 1764 * oops, this 64-bit counter should never wrap. 1765 * Either we are in around ~1 trillion A.C., assuming 1766 * 1 reboot per second, or we have a bug: 1767 */ 1768 MD_BUG(); 1769 mddev->events --; 1770 } 1771 1772 /* 1773 * do not write anything to disk if using 1774 * nonpersistent superblocks 1775 */ 1776 if (!mddev->persistent) { 1777 if (!mddev->external) 1778 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 1779 1780 spin_unlock_irq(&mddev->write_lock); 1781 wake_up(&mddev->sb_wait); 1782 return; 1783 } 1784 sync_sbs(mddev, nospares); 1785 spin_unlock_irq(&mddev->write_lock); 1786 1787 dprintk(KERN_INFO 1788 "md: updating %s RAID superblock on device (in sync %d)\n", 1789 mdname(mddev),mddev->in_sync); 1790 1791 bitmap_update_sb(mddev->bitmap); 1792 rdev_for_each(rdev, tmp, mddev) { 1793 char b[BDEVNAME_SIZE]; 1794 dprintk(KERN_INFO "md: "); 1795 if (rdev->sb_loaded != 1) 1796 continue; /* no noise on spare devices */ 1797 if (test_bit(Faulty, &rdev->flags)) 1798 dprintk("(skipping faulty "); 1799 1800 dprintk("%s ", bdevname(rdev->bdev,b)); 1801 if (!test_bit(Faulty, &rdev->flags)) { 1802 md_super_write(mddev,rdev, 1803 rdev->sb_start, rdev->sb_size, 1804 rdev->sb_page); 1805 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", 1806 bdevname(rdev->bdev,b), 1807 (unsigned long long)rdev->sb_start); 1808 rdev->sb_events = mddev->events; 1809 1810 } else 1811 dprintk(")\n"); 1812 if (mddev->level == LEVEL_MULTIPATH) 1813 /* only need to write one superblock... */ 1814 break; 1815 } 1816 md_super_wait(mddev); 1817 /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ 1818 1819 spin_lock_irq(&mddev->write_lock); 1820 if (mddev->in_sync != sync_req || 1821 test_bit(MD_CHANGE_DEVS, &mddev->flags)) { 1822 /* have to write it out again */ 1823 spin_unlock_irq(&mddev->write_lock); 1824 goto repeat; 1825 } 1826 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 1827 spin_unlock_irq(&mddev->write_lock); 1828 wake_up(&mddev->sb_wait); 1829 1830 } 1831 1832 /* words written to sysfs files may, or may not, be \n terminated. 1833 * We want to accept with case. For this we use cmd_match. 1834 */ 1835 static int cmd_match(const char *cmd, const char *str) 1836 { 1837 /* See if cmd, written into a sysfs file, matches 1838 * str. They must either be the same, or cmd can 1839 * have a trailing newline 1840 */ 1841 while (*cmd && *str && *cmd == *str) { 1842 cmd++; 1843 str++; 1844 } 1845 if (*cmd == '\n') 1846 cmd++; 1847 if (*str || *cmd) 1848 return 0; 1849 return 1; 1850 } 1851 1852 struct rdev_sysfs_entry { 1853 struct attribute attr; 1854 ssize_t (*show)(mdk_rdev_t *, char *); 1855 ssize_t (*store)(mdk_rdev_t *, const char *, size_t); 1856 }; 1857 1858 static ssize_t 1859 state_show(mdk_rdev_t *rdev, char *page) 1860 { 1861 char *sep = ""; 1862 size_t len = 0; 1863 1864 if (test_bit(Faulty, &rdev->flags)) { 1865 len+= sprintf(page+len, "%sfaulty",sep); 1866 sep = ","; 1867 } 1868 if (test_bit(In_sync, &rdev->flags)) { 1869 len += sprintf(page+len, "%sin_sync",sep); 1870 sep = ","; 1871 } 1872 if (test_bit(WriteMostly, &rdev->flags)) { 1873 len += sprintf(page+len, "%swrite_mostly",sep); 1874 sep = ","; 1875 } 1876 if (test_bit(Blocked, &rdev->flags)) { 1877 len += sprintf(page+len, "%sblocked", sep); 1878 sep = ","; 1879 } 1880 if (!test_bit(Faulty, &rdev->flags) && 1881 !test_bit(In_sync, &rdev->flags)) { 1882 len += sprintf(page+len, "%sspare", sep); 1883 sep = ","; 1884 } 1885 return len+sprintf(page+len, "\n"); 1886 } 1887 1888 static ssize_t 1889 state_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1890 { 1891 /* can write 1892 * faulty - simulates and error 1893 * remove - disconnects the device 1894 * writemostly - sets write_mostly 1895 * -writemostly - clears write_mostly 1896 * blocked - sets the Blocked flag 1897 * -blocked - clears the Blocked flag 1898 */ 1899 int err = -EINVAL; 1900 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 1901 md_error(rdev->mddev, rdev); 1902 err = 0; 1903 } else if (cmd_match(buf, "remove")) { 1904 if (rdev->raid_disk >= 0) 1905 err = -EBUSY; 1906 else { 1907 mddev_t *mddev = rdev->mddev; 1908 kick_rdev_from_array(rdev); 1909 if (mddev->pers) 1910 md_update_sb(mddev, 1); 1911 md_new_event(mddev); 1912 err = 0; 1913 } 1914 } else if (cmd_match(buf, "writemostly")) { 1915 set_bit(WriteMostly, &rdev->flags); 1916 err = 0; 1917 } else if (cmd_match(buf, "-writemostly")) { 1918 clear_bit(WriteMostly, &rdev->flags); 1919 err = 0; 1920 } else if (cmd_match(buf, "blocked")) { 1921 set_bit(Blocked, &rdev->flags); 1922 err = 0; 1923 } else if (cmd_match(buf, "-blocked")) { 1924 clear_bit(Blocked, &rdev->flags); 1925 wake_up(&rdev->blocked_wait); 1926 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 1927 md_wakeup_thread(rdev->mddev->thread); 1928 1929 err = 0; 1930 } 1931 if (!err) 1932 sysfs_notify(&rdev->kobj, NULL, "state"); 1933 return err ? err : len; 1934 } 1935 static struct rdev_sysfs_entry rdev_state = 1936 __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store); 1937 1938 static ssize_t 1939 errors_show(mdk_rdev_t *rdev, char *page) 1940 { 1941 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 1942 } 1943 1944 static ssize_t 1945 errors_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1946 { 1947 char *e; 1948 unsigned long n = simple_strtoul(buf, &e, 10); 1949 if (*buf && (*e == 0 || *e == '\n')) { 1950 atomic_set(&rdev->corrected_errors, n); 1951 return len; 1952 } 1953 return -EINVAL; 1954 } 1955 static struct rdev_sysfs_entry rdev_errors = 1956 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 1957 1958 static ssize_t 1959 slot_show(mdk_rdev_t *rdev, char *page) 1960 { 1961 if (rdev->raid_disk < 0) 1962 return sprintf(page, "none\n"); 1963 else 1964 return sprintf(page, "%d\n", rdev->raid_disk); 1965 } 1966 1967 static ssize_t 1968 slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1969 { 1970 char *e; 1971 int err; 1972 char nm[20]; 1973 int slot = simple_strtoul(buf, &e, 10); 1974 if (strncmp(buf, "none", 4)==0) 1975 slot = -1; 1976 else if (e==buf || (*e && *e!= '\n')) 1977 return -EINVAL; 1978 if (rdev->mddev->pers && slot == -1) { 1979 /* Setting 'slot' on an active array requires also 1980 * updating the 'rd%d' link, and communicating 1981 * with the personality with ->hot_*_disk. 1982 * For now we only support removing 1983 * failed/spare devices. This normally happens automatically, 1984 * but not when the metadata is externally managed. 1985 */ 1986 if (rdev->raid_disk == -1) 1987 return -EEXIST; 1988 /* personality does all needed checks */ 1989 if (rdev->mddev->pers->hot_add_disk == NULL) 1990 return -EINVAL; 1991 err = rdev->mddev->pers-> 1992 hot_remove_disk(rdev->mddev, rdev->raid_disk); 1993 if (err) 1994 return err; 1995 sprintf(nm, "rd%d", rdev->raid_disk); 1996 sysfs_remove_link(&rdev->mddev->kobj, nm); 1997 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 1998 md_wakeup_thread(rdev->mddev->thread); 1999 } else if (rdev->mddev->pers) { 2000 mdk_rdev_t *rdev2; 2001 struct list_head *tmp; 2002 /* Activating a spare .. or possibly reactivating 2003 * if we every get bitmaps working here. 2004 */ 2005 2006 if (rdev->raid_disk != -1) 2007 return -EBUSY; 2008 2009 if (rdev->mddev->pers->hot_add_disk == NULL) 2010 return -EINVAL; 2011 2012 rdev_for_each(rdev2, tmp, rdev->mddev) 2013 if (rdev2->raid_disk == slot) 2014 return -EEXIST; 2015 2016 rdev->raid_disk = slot; 2017 if (test_bit(In_sync, &rdev->flags)) 2018 rdev->saved_raid_disk = slot; 2019 else 2020 rdev->saved_raid_disk = -1; 2021 err = rdev->mddev->pers-> 2022 hot_add_disk(rdev->mddev, rdev); 2023 if (err) { 2024 rdev->raid_disk = -1; 2025 return err; 2026 } else 2027 sysfs_notify(&rdev->kobj, NULL, "state"); 2028 sprintf(nm, "rd%d", rdev->raid_disk); 2029 if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm)) 2030 printk(KERN_WARNING 2031 "md: cannot register " 2032 "%s for %s\n", 2033 nm, mdname(rdev->mddev)); 2034 2035 /* don't wakeup anyone, leave that to userspace. */ 2036 } else { 2037 if (slot >= rdev->mddev->raid_disks) 2038 return -ENOSPC; 2039 rdev->raid_disk = slot; 2040 /* assume it is working */ 2041 clear_bit(Faulty, &rdev->flags); 2042 clear_bit(WriteMostly, &rdev->flags); 2043 set_bit(In_sync, &rdev->flags); 2044 sysfs_notify(&rdev->kobj, NULL, "state"); 2045 } 2046 return len; 2047 } 2048 2049 2050 static struct rdev_sysfs_entry rdev_slot = 2051 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 2052 2053 static ssize_t 2054 offset_show(mdk_rdev_t *rdev, char *page) 2055 { 2056 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 2057 } 2058 2059 static ssize_t 2060 offset_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2061 { 2062 char *e; 2063 unsigned long long offset = simple_strtoull(buf, &e, 10); 2064 if (e==buf || (*e && *e != '\n')) 2065 return -EINVAL; 2066 if (rdev->mddev->pers && rdev->raid_disk >= 0) 2067 return -EBUSY; 2068 if (rdev->size && rdev->mddev->external) 2069 /* Must set offset before size, so overlap checks 2070 * can be sane */ 2071 return -EBUSY; 2072 rdev->data_offset = offset; 2073 return len; 2074 } 2075 2076 static struct rdev_sysfs_entry rdev_offset = 2077 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 2078 2079 static ssize_t 2080 rdev_size_show(mdk_rdev_t *rdev, char *page) 2081 { 2082 return sprintf(page, "%llu\n", (unsigned long long)rdev->size); 2083 } 2084 2085 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) 2086 { 2087 /* check if two start/length pairs overlap */ 2088 if (s1+l1 <= s2) 2089 return 0; 2090 if (s2+l2 <= s1) 2091 return 0; 2092 return 1; 2093 } 2094 2095 static ssize_t 2096 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2097 { 2098 unsigned long long size; 2099 unsigned long long oldsize = rdev->size; 2100 mddev_t *my_mddev = rdev->mddev; 2101 2102 if (strict_strtoull(buf, 10, &size) < 0) 2103 return -EINVAL; 2104 if (size < my_mddev->size) 2105 return -EINVAL; 2106 if (my_mddev->pers && rdev->raid_disk >= 0) { 2107 if (my_mddev->persistent) { 2108 size = super_types[my_mddev->major_version]. 2109 rdev_size_change(rdev, size * 2); 2110 if (!size) 2111 return -EBUSY; 2112 } else if (!size) { 2113 size = (rdev->bdev->bd_inode->i_size >> 10); 2114 size -= rdev->data_offset/2; 2115 } 2116 if (size < my_mddev->size) 2117 return -EINVAL; /* component must fit device */ 2118 } 2119 2120 rdev->size = size; 2121 if (size > oldsize && my_mddev->external) { 2122 /* need to check that all other rdevs with the same ->bdev 2123 * do not overlap. We need to unlock the mddev to avoid 2124 * a deadlock. We have already changed rdev->size, and if 2125 * we have to change it back, we will have the lock again. 2126 */ 2127 mddev_t *mddev; 2128 int overlap = 0; 2129 struct list_head *tmp, *tmp2; 2130 2131 mddev_unlock(my_mddev); 2132 for_each_mddev(mddev, tmp) { 2133 mdk_rdev_t *rdev2; 2134 2135 mddev_lock(mddev); 2136 rdev_for_each(rdev2, tmp2, mddev) 2137 if (test_bit(AllReserved, &rdev2->flags) || 2138 (rdev->bdev == rdev2->bdev && 2139 rdev != rdev2 && 2140 overlaps(rdev->data_offset, rdev->size * 2, 2141 rdev2->data_offset, 2142 rdev2->size * 2))) { 2143 overlap = 1; 2144 break; 2145 } 2146 mddev_unlock(mddev); 2147 if (overlap) { 2148 mddev_put(mddev); 2149 break; 2150 } 2151 } 2152 mddev_lock(my_mddev); 2153 if (overlap) { 2154 /* Someone else could have slipped in a size 2155 * change here, but doing so is just silly. 2156 * We put oldsize back because we *know* it is 2157 * safe, and trust userspace not to race with 2158 * itself 2159 */ 2160 rdev->size = oldsize; 2161 return -EBUSY; 2162 } 2163 } 2164 return len; 2165 } 2166 2167 static struct rdev_sysfs_entry rdev_size = 2168 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 2169 2170 static struct attribute *rdev_default_attrs[] = { 2171 &rdev_state.attr, 2172 &rdev_errors.attr, 2173 &rdev_slot.attr, 2174 &rdev_offset.attr, 2175 &rdev_size.attr, 2176 NULL, 2177 }; 2178 static ssize_t 2179 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 2180 { 2181 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 2182 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 2183 mddev_t *mddev = rdev->mddev; 2184 ssize_t rv; 2185 2186 if (!entry->show) 2187 return -EIO; 2188 2189 rv = mddev ? mddev_lock(mddev) : -EBUSY; 2190 if (!rv) { 2191 if (rdev->mddev == NULL) 2192 rv = -EBUSY; 2193 else 2194 rv = entry->show(rdev, page); 2195 mddev_unlock(mddev); 2196 } 2197 return rv; 2198 } 2199 2200 static ssize_t 2201 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 2202 const char *page, size_t length) 2203 { 2204 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 2205 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 2206 ssize_t rv; 2207 mddev_t *mddev = rdev->mddev; 2208 2209 if (!entry->store) 2210 return -EIO; 2211 if (!capable(CAP_SYS_ADMIN)) 2212 return -EACCES; 2213 rv = mddev ? mddev_lock(mddev): -EBUSY; 2214 if (!rv) { 2215 if (rdev->mddev == NULL) 2216 rv = -EBUSY; 2217 else 2218 rv = entry->store(rdev, page, length); 2219 mddev_unlock(mddev); 2220 } 2221 return rv; 2222 } 2223 2224 static void rdev_free(struct kobject *ko) 2225 { 2226 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); 2227 kfree(rdev); 2228 } 2229 static struct sysfs_ops rdev_sysfs_ops = { 2230 .show = rdev_attr_show, 2231 .store = rdev_attr_store, 2232 }; 2233 static struct kobj_type rdev_ktype = { 2234 .release = rdev_free, 2235 .sysfs_ops = &rdev_sysfs_ops, 2236 .default_attrs = rdev_default_attrs, 2237 }; 2238 2239 /* 2240 * Import a device. If 'super_format' >= 0, then sanity check the superblock 2241 * 2242 * mark the device faulty if: 2243 * 2244 * - the device is nonexistent (zero size) 2245 * - the device has no valid superblock 2246 * 2247 * a faulty rdev _never_ has rdev->sb set. 2248 */ 2249 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) 2250 { 2251 char b[BDEVNAME_SIZE]; 2252 int err; 2253 mdk_rdev_t *rdev; 2254 sector_t size; 2255 2256 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 2257 if (!rdev) { 2258 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 2259 return ERR_PTR(-ENOMEM); 2260 } 2261 2262 if ((err = alloc_disk_sb(rdev))) 2263 goto abort_free; 2264 2265 err = lock_rdev(rdev, newdev, super_format == -2); 2266 if (err) 2267 goto abort_free; 2268 2269 kobject_init(&rdev->kobj, &rdev_ktype); 2270 2271 rdev->desc_nr = -1; 2272 rdev->saved_raid_disk = -1; 2273 rdev->raid_disk = -1; 2274 rdev->flags = 0; 2275 rdev->data_offset = 0; 2276 rdev->sb_events = 0; 2277 atomic_set(&rdev->nr_pending, 0); 2278 atomic_set(&rdev->read_errors, 0); 2279 atomic_set(&rdev->corrected_errors, 0); 2280 2281 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2282 if (!size) { 2283 printk(KERN_WARNING 2284 "md: %s has zero or unknown size, marking faulty!\n", 2285 bdevname(rdev->bdev,b)); 2286 err = -EINVAL; 2287 goto abort_free; 2288 } 2289 2290 if (super_format >= 0) { 2291 err = super_types[super_format]. 2292 load_super(rdev, NULL, super_minor); 2293 if (err == -EINVAL) { 2294 printk(KERN_WARNING 2295 "md: %s does not have a valid v%d.%d " 2296 "superblock, not importing!\n", 2297 bdevname(rdev->bdev,b), 2298 super_format, super_minor); 2299 goto abort_free; 2300 } 2301 if (err < 0) { 2302 printk(KERN_WARNING 2303 "md: could not read %s's sb, not importing!\n", 2304 bdevname(rdev->bdev,b)); 2305 goto abort_free; 2306 } 2307 } 2308 2309 INIT_LIST_HEAD(&rdev->same_set); 2310 init_waitqueue_head(&rdev->blocked_wait); 2311 2312 return rdev; 2313 2314 abort_free: 2315 if (rdev->sb_page) { 2316 if (rdev->bdev) 2317 unlock_rdev(rdev); 2318 free_disk_sb(rdev); 2319 } 2320 kfree(rdev); 2321 return ERR_PTR(err); 2322 } 2323 2324 /* 2325 * Check a full RAID array for plausibility 2326 */ 2327 2328 2329 static void analyze_sbs(mddev_t * mddev) 2330 { 2331 int i; 2332 struct list_head *tmp; 2333 mdk_rdev_t *rdev, *freshest; 2334 char b[BDEVNAME_SIZE]; 2335 2336 freshest = NULL; 2337 rdev_for_each(rdev, tmp, mddev) 2338 switch (super_types[mddev->major_version]. 2339 load_super(rdev, freshest, mddev->minor_version)) { 2340 case 1: 2341 freshest = rdev; 2342 break; 2343 case 0: 2344 break; 2345 default: 2346 printk( KERN_ERR \ 2347 "md: fatal superblock inconsistency in %s" 2348 " -- removing from array\n", 2349 bdevname(rdev->bdev,b)); 2350 kick_rdev_from_array(rdev); 2351 } 2352 2353 2354 super_types[mddev->major_version]. 2355 validate_super(mddev, freshest); 2356 2357 i = 0; 2358 rdev_for_each(rdev, tmp, mddev) { 2359 if (rdev != freshest) 2360 if (super_types[mddev->major_version]. 2361 validate_super(mddev, rdev)) { 2362 printk(KERN_WARNING "md: kicking non-fresh %s" 2363 " from array!\n", 2364 bdevname(rdev->bdev,b)); 2365 kick_rdev_from_array(rdev); 2366 continue; 2367 } 2368 if (mddev->level == LEVEL_MULTIPATH) { 2369 rdev->desc_nr = i++; 2370 rdev->raid_disk = rdev->desc_nr; 2371 set_bit(In_sync, &rdev->flags); 2372 } else if (rdev->raid_disk >= mddev->raid_disks) { 2373 rdev->raid_disk = -1; 2374 clear_bit(In_sync, &rdev->flags); 2375 } 2376 } 2377 2378 2379 2380 if (mddev->recovery_cp != MaxSector && 2381 mddev->level >= 1) 2382 printk(KERN_ERR "md: %s: raid array is not clean" 2383 " -- starting background reconstruction\n", 2384 mdname(mddev)); 2385 2386 } 2387 2388 static void md_safemode_timeout(unsigned long data); 2389 2390 static ssize_t 2391 safe_delay_show(mddev_t *mddev, char *page) 2392 { 2393 int msec = (mddev->safemode_delay*1000)/HZ; 2394 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); 2395 } 2396 static ssize_t 2397 safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) 2398 { 2399 int scale=1; 2400 int dot=0; 2401 int i; 2402 unsigned long msec; 2403 char buf[30]; 2404 char *e; 2405 /* remove a period, and count digits after it */ 2406 if (len >= sizeof(buf)) 2407 return -EINVAL; 2408 strlcpy(buf, cbuf, len); 2409 buf[len] = 0; 2410 for (i=0; i<len; i++) { 2411 if (dot) { 2412 if (isdigit(buf[i])) { 2413 buf[i-1] = buf[i]; 2414 scale *= 10; 2415 } 2416 buf[i] = 0; 2417 } else if (buf[i] == '.') { 2418 dot=1; 2419 buf[i] = 0; 2420 } 2421 } 2422 msec = simple_strtoul(buf, &e, 10); 2423 if (e == buf || (*e && *e != '\n')) 2424 return -EINVAL; 2425 msec = (msec * 1000) / scale; 2426 if (msec == 0) 2427 mddev->safemode_delay = 0; 2428 else { 2429 unsigned long old_delay = mddev->safemode_delay; 2430 mddev->safemode_delay = (msec*HZ)/1000; 2431 if (mddev->safemode_delay == 0) 2432 mddev->safemode_delay = 1; 2433 if (mddev->safemode_delay < old_delay) 2434 md_safemode_timeout((unsigned long)mddev); 2435 } 2436 return len; 2437 } 2438 static struct md_sysfs_entry md_safe_delay = 2439 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 2440 2441 static ssize_t 2442 level_show(mddev_t *mddev, char *page) 2443 { 2444 struct mdk_personality *p = mddev->pers; 2445 if (p) 2446 return sprintf(page, "%s\n", p->name); 2447 else if (mddev->clevel[0]) 2448 return sprintf(page, "%s\n", mddev->clevel); 2449 else if (mddev->level != LEVEL_NONE) 2450 return sprintf(page, "%d\n", mddev->level); 2451 else 2452 return 0; 2453 } 2454 2455 static ssize_t 2456 level_store(mddev_t *mddev, const char *buf, size_t len) 2457 { 2458 ssize_t rv = len; 2459 if (mddev->pers) 2460 return -EBUSY; 2461 if (len == 0) 2462 return 0; 2463 if (len >= sizeof(mddev->clevel)) 2464 return -ENOSPC; 2465 strncpy(mddev->clevel, buf, len); 2466 if (mddev->clevel[len-1] == '\n') 2467 len--; 2468 mddev->clevel[len] = 0; 2469 mddev->level = LEVEL_NONE; 2470 return rv; 2471 } 2472 2473 static struct md_sysfs_entry md_level = 2474 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 2475 2476 2477 static ssize_t 2478 layout_show(mddev_t *mddev, char *page) 2479 { 2480 /* just a number, not meaningful for all levels */ 2481 if (mddev->reshape_position != MaxSector && 2482 mddev->layout != mddev->new_layout) 2483 return sprintf(page, "%d (%d)\n", 2484 mddev->new_layout, mddev->layout); 2485 return sprintf(page, "%d\n", mddev->layout); 2486 } 2487 2488 static ssize_t 2489 layout_store(mddev_t *mddev, const char *buf, size_t len) 2490 { 2491 char *e; 2492 unsigned long n = simple_strtoul(buf, &e, 10); 2493 2494 if (!*buf || (*e && *e != '\n')) 2495 return -EINVAL; 2496 2497 if (mddev->pers) 2498 return -EBUSY; 2499 if (mddev->reshape_position != MaxSector) 2500 mddev->new_layout = n; 2501 else 2502 mddev->layout = n; 2503 return len; 2504 } 2505 static struct md_sysfs_entry md_layout = 2506 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 2507 2508 2509 static ssize_t 2510 raid_disks_show(mddev_t *mddev, char *page) 2511 { 2512 if (mddev->raid_disks == 0) 2513 return 0; 2514 if (mddev->reshape_position != MaxSector && 2515 mddev->delta_disks != 0) 2516 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 2517 mddev->raid_disks - mddev->delta_disks); 2518 return sprintf(page, "%d\n", mddev->raid_disks); 2519 } 2520 2521 static int update_raid_disks(mddev_t *mddev, int raid_disks); 2522 2523 static ssize_t 2524 raid_disks_store(mddev_t *mddev, const char *buf, size_t len) 2525 { 2526 char *e; 2527 int rv = 0; 2528 unsigned long n = simple_strtoul(buf, &e, 10); 2529 2530 if (!*buf || (*e && *e != '\n')) 2531 return -EINVAL; 2532 2533 if (mddev->pers) 2534 rv = update_raid_disks(mddev, n); 2535 else if (mddev->reshape_position != MaxSector) { 2536 int olddisks = mddev->raid_disks - mddev->delta_disks; 2537 mddev->delta_disks = n - olddisks; 2538 mddev->raid_disks = n; 2539 } else 2540 mddev->raid_disks = n; 2541 return rv ? rv : len; 2542 } 2543 static struct md_sysfs_entry md_raid_disks = 2544 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 2545 2546 static ssize_t 2547 chunk_size_show(mddev_t *mddev, char *page) 2548 { 2549 if (mddev->reshape_position != MaxSector && 2550 mddev->chunk_size != mddev->new_chunk) 2551 return sprintf(page, "%d (%d)\n", mddev->new_chunk, 2552 mddev->chunk_size); 2553 return sprintf(page, "%d\n", mddev->chunk_size); 2554 } 2555 2556 static ssize_t 2557 chunk_size_store(mddev_t *mddev, const char *buf, size_t len) 2558 { 2559 /* can only set chunk_size if array is not yet active */ 2560 char *e; 2561 unsigned long n = simple_strtoul(buf, &e, 10); 2562 2563 if (!*buf || (*e && *e != '\n')) 2564 return -EINVAL; 2565 2566 if (mddev->pers) 2567 return -EBUSY; 2568 else if (mddev->reshape_position != MaxSector) 2569 mddev->new_chunk = n; 2570 else 2571 mddev->chunk_size = n; 2572 return len; 2573 } 2574 static struct md_sysfs_entry md_chunk_size = 2575 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 2576 2577 static ssize_t 2578 resync_start_show(mddev_t *mddev, char *page) 2579 { 2580 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 2581 } 2582 2583 static ssize_t 2584 resync_start_store(mddev_t *mddev, const char *buf, size_t len) 2585 { 2586 char *e; 2587 unsigned long long n = simple_strtoull(buf, &e, 10); 2588 2589 if (mddev->pers) 2590 return -EBUSY; 2591 if (!*buf || (*e && *e != '\n')) 2592 return -EINVAL; 2593 2594 mddev->recovery_cp = n; 2595 return len; 2596 } 2597 static struct md_sysfs_entry md_resync_start = 2598 __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); 2599 2600 /* 2601 * The array state can be: 2602 * 2603 * clear 2604 * No devices, no size, no level 2605 * Equivalent to STOP_ARRAY ioctl 2606 * inactive 2607 * May have some settings, but array is not active 2608 * all IO results in error 2609 * When written, doesn't tear down array, but just stops it 2610 * suspended (not supported yet) 2611 * All IO requests will block. The array can be reconfigured. 2612 * Writing this, if accepted, will block until array is quiescent 2613 * readonly 2614 * no resync can happen. no superblocks get written. 2615 * write requests fail 2616 * read-auto 2617 * like readonly, but behaves like 'clean' on a write request. 2618 * 2619 * clean - no pending writes, but otherwise active. 2620 * When written to inactive array, starts without resync 2621 * If a write request arrives then 2622 * if metadata is known, mark 'dirty' and switch to 'active'. 2623 * if not known, block and switch to write-pending 2624 * If written to an active array that has pending writes, then fails. 2625 * active 2626 * fully active: IO and resync can be happening. 2627 * When written to inactive array, starts with resync 2628 * 2629 * write-pending 2630 * clean, but writes are blocked waiting for 'active' to be written. 2631 * 2632 * active-idle 2633 * like active, but no writes have been seen for a while (100msec). 2634 * 2635 */ 2636 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 2637 write_pending, active_idle, bad_word}; 2638 static char *array_states[] = { 2639 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 2640 "write-pending", "active-idle", NULL }; 2641 2642 static int match_word(const char *word, char **list) 2643 { 2644 int n; 2645 for (n=0; list[n]; n++) 2646 if (cmd_match(word, list[n])) 2647 break; 2648 return n; 2649 } 2650 2651 static ssize_t 2652 array_state_show(mddev_t *mddev, char *page) 2653 { 2654 enum array_state st = inactive; 2655 2656 if (mddev->pers) 2657 switch(mddev->ro) { 2658 case 1: 2659 st = readonly; 2660 break; 2661 case 2: 2662 st = read_auto; 2663 break; 2664 case 0: 2665 if (mddev->in_sync) 2666 st = clean; 2667 else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) 2668 st = write_pending; 2669 else if (mddev->safemode) 2670 st = active_idle; 2671 else 2672 st = active; 2673 } 2674 else { 2675 if (list_empty(&mddev->disks) && 2676 mddev->raid_disks == 0 && 2677 mddev->size == 0) 2678 st = clear; 2679 else 2680 st = inactive; 2681 } 2682 return sprintf(page, "%s\n", array_states[st]); 2683 } 2684 2685 static int do_md_stop(mddev_t * mddev, int ro, int is_open); 2686 static int do_md_run(mddev_t * mddev); 2687 static int restart_array(mddev_t *mddev); 2688 2689 static ssize_t 2690 array_state_store(mddev_t *mddev, const char *buf, size_t len) 2691 { 2692 int err = -EINVAL; 2693 enum array_state st = match_word(buf, array_states); 2694 switch(st) { 2695 case bad_word: 2696 break; 2697 case clear: 2698 /* stopping an active array */ 2699 if (atomic_read(&mddev->openers) > 0) 2700 return -EBUSY; 2701 err = do_md_stop(mddev, 0, 0); 2702 break; 2703 case inactive: 2704 /* stopping an active array */ 2705 if (mddev->pers) { 2706 if (atomic_read(&mddev->openers) > 0) 2707 return -EBUSY; 2708 err = do_md_stop(mddev, 2, 0); 2709 } else 2710 err = 0; /* already inactive */ 2711 break; 2712 case suspended: 2713 break; /* not supported yet */ 2714 case readonly: 2715 if (mddev->pers) 2716 err = do_md_stop(mddev, 1, 0); 2717 else { 2718 mddev->ro = 1; 2719 set_disk_ro(mddev->gendisk, 1); 2720 err = do_md_run(mddev); 2721 } 2722 break; 2723 case read_auto: 2724 if (mddev->pers) { 2725 if (mddev->ro != 1) 2726 err = do_md_stop(mddev, 1, 0); 2727 else 2728 err = restart_array(mddev); 2729 if (err == 0) { 2730 mddev->ro = 2; 2731 set_disk_ro(mddev->gendisk, 0); 2732 } 2733 } else { 2734 mddev->ro = 2; 2735 err = do_md_run(mddev); 2736 } 2737 break; 2738 case clean: 2739 if (mddev->pers) { 2740 restart_array(mddev); 2741 spin_lock_irq(&mddev->write_lock); 2742 if (atomic_read(&mddev->writes_pending) == 0) { 2743 if (mddev->in_sync == 0) { 2744 mddev->in_sync = 1; 2745 if (mddev->safemode == 1) 2746 mddev->safemode = 0; 2747 if (mddev->persistent) 2748 set_bit(MD_CHANGE_CLEAN, 2749 &mddev->flags); 2750 } 2751 err = 0; 2752 } else 2753 err = -EBUSY; 2754 spin_unlock_irq(&mddev->write_lock); 2755 } else { 2756 mddev->ro = 0; 2757 mddev->recovery_cp = MaxSector; 2758 err = do_md_run(mddev); 2759 } 2760 break; 2761 case active: 2762 if (mddev->pers) { 2763 restart_array(mddev); 2764 if (mddev->external) 2765 clear_bit(MD_CHANGE_CLEAN, &mddev->flags); 2766 wake_up(&mddev->sb_wait); 2767 err = 0; 2768 } else { 2769 mddev->ro = 0; 2770 set_disk_ro(mddev->gendisk, 0); 2771 err = do_md_run(mddev); 2772 } 2773 break; 2774 case write_pending: 2775 case active_idle: 2776 /* these cannot be set */ 2777 break; 2778 } 2779 if (err) 2780 return err; 2781 else { 2782 sysfs_notify(&mddev->kobj, NULL, "array_state"); 2783 return len; 2784 } 2785 } 2786 static struct md_sysfs_entry md_array_state = 2787 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 2788 2789 static ssize_t 2790 null_show(mddev_t *mddev, char *page) 2791 { 2792 return -EINVAL; 2793 } 2794 2795 static ssize_t 2796 new_dev_store(mddev_t *mddev, const char *buf, size_t len) 2797 { 2798 /* buf must be %d:%d\n? giving major and minor numbers */ 2799 /* The new device is added to the array. 2800 * If the array has a persistent superblock, we read the 2801 * superblock to initialise info and check validity. 2802 * Otherwise, only checking done is that in bind_rdev_to_array, 2803 * which mainly checks size. 2804 */ 2805 char *e; 2806 int major = simple_strtoul(buf, &e, 10); 2807 int minor; 2808 dev_t dev; 2809 mdk_rdev_t *rdev; 2810 int err; 2811 2812 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 2813 return -EINVAL; 2814 minor = simple_strtoul(e+1, &e, 10); 2815 if (*e && *e != '\n') 2816 return -EINVAL; 2817 dev = MKDEV(major, minor); 2818 if (major != MAJOR(dev) || 2819 minor != MINOR(dev)) 2820 return -EOVERFLOW; 2821 2822 2823 if (mddev->persistent) { 2824 rdev = md_import_device(dev, mddev->major_version, 2825 mddev->minor_version); 2826 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 2827 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 2828 mdk_rdev_t, same_set); 2829 err = super_types[mddev->major_version] 2830 .load_super(rdev, rdev0, mddev->minor_version); 2831 if (err < 0) 2832 goto out; 2833 } 2834 } else if (mddev->external) 2835 rdev = md_import_device(dev, -2, -1); 2836 else 2837 rdev = md_import_device(dev, -1, -1); 2838 2839 if (IS_ERR(rdev)) 2840 return PTR_ERR(rdev); 2841 err = bind_rdev_to_array(rdev, mddev); 2842 out: 2843 if (err) 2844 export_rdev(rdev); 2845 return err ? err : len; 2846 } 2847 2848 static struct md_sysfs_entry md_new_device = 2849 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 2850 2851 static ssize_t 2852 bitmap_store(mddev_t *mddev, const char *buf, size_t len) 2853 { 2854 char *end; 2855 unsigned long chunk, end_chunk; 2856 2857 if (!mddev->bitmap) 2858 goto out; 2859 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ 2860 while (*buf) { 2861 chunk = end_chunk = simple_strtoul(buf, &end, 0); 2862 if (buf == end) break; 2863 if (*end == '-') { /* range */ 2864 buf = end + 1; 2865 end_chunk = simple_strtoul(buf, &end, 0); 2866 if (buf == end) break; 2867 } 2868 if (*end && !isspace(*end)) break; 2869 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); 2870 buf = end; 2871 while (isspace(*buf)) buf++; 2872 } 2873 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ 2874 out: 2875 return len; 2876 } 2877 2878 static struct md_sysfs_entry md_bitmap = 2879 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 2880 2881 static ssize_t 2882 size_show(mddev_t *mddev, char *page) 2883 { 2884 return sprintf(page, "%llu\n", (unsigned long long)mddev->size); 2885 } 2886 2887 static int update_size(mddev_t *mddev, sector_t num_sectors); 2888 2889 static ssize_t 2890 size_store(mddev_t *mddev, const char *buf, size_t len) 2891 { 2892 /* If array is inactive, we can reduce the component size, but 2893 * not increase it (except from 0). 2894 * If array is active, we can try an on-line resize 2895 */ 2896 char *e; 2897 int err = 0; 2898 unsigned long long size = simple_strtoull(buf, &e, 10); 2899 if (!*buf || *buf == '\n' || 2900 (*e && *e != '\n')) 2901 return -EINVAL; 2902 2903 if (mddev->pers) { 2904 err = update_size(mddev, size * 2); 2905 md_update_sb(mddev, 1); 2906 } else { 2907 if (mddev->size == 0 || 2908 mddev->size > size) 2909 mddev->size = size; 2910 else 2911 err = -ENOSPC; 2912 } 2913 return err ? err : len; 2914 } 2915 2916 static struct md_sysfs_entry md_size = 2917 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 2918 2919 2920 /* Metdata version. 2921 * This is one of 2922 * 'none' for arrays with no metadata (good luck...) 2923 * 'external' for arrays with externally managed metadata, 2924 * or N.M for internally known formats 2925 */ 2926 static ssize_t 2927 metadata_show(mddev_t *mddev, char *page) 2928 { 2929 if (mddev->persistent) 2930 return sprintf(page, "%d.%d\n", 2931 mddev->major_version, mddev->minor_version); 2932 else if (mddev->external) 2933 return sprintf(page, "external:%s\n", mddev->metadata_type); 2934 else 2935 return sprintf(page, "none\n"); 2936 } 2937 2938 static ssize_t 2939 metadata_store(mddev_t *mddev, const char *buf, size_t len) 2940 { 2941 int major, minor; 2942 char *e; 2943 if (!list_empty(&mddev->disks)) 2944 return -EBUSY; 2945 2946 if (cmd_match(buf, "none")) { 2947 mddev->persistent = 0; 2948 mddev->external = 0; 2949 mddev->major_version = 0; 2950 mddev->minor_version = 90; 2951 return len; 2952 } 2953 if (strncmp(buf, "external:", 9) == 0) { 2954 size_t namelen = len-9; 2955 if (namelen >= sizeof(mddev->metadata_type)) 2956 namelen = sizeof(mddev->metadata_type)-1; 2957 strncpy(mddev->metadata_type, buf+9, namelen); 2958 mddev->metadata_type[namelen] = 0; 2959 if (namelen && mddev->metadata_type[namelen-1] == '\n') 2960 mddev->metadata_type[--namelen] = 0; 2961 mddev->persistent = 0; 2962 mddev->external = 1; 2963 mddev->major_version = 0; 2964 mddev->minor_version = 90; 2965 return len; 2966 } 2967 major = simple_strtoul(buf, &e, 10); 2968 if (e==buf || *e != '.') 2969 return -EINVAL; 2970 buf = e+1; 2971 minor = simple_strtoul(buf, &e, 10); 2972 if (e==buf || (*e && *e != '\n') ) 2973 return -EINVAL; 2974 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 2975 return -ENOENT; 2976 mddev->major_version = major; 2977 mddev->minor_version = minor; 2978 mddev->persistent = 1; 2979 mddev->external = 0; 2980 return len; 2981 } 2982 2983 static struct md_sysfs_entry md_metadata = 2984 __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 2985 2986 static ssize_t 2987 action_show(mddev_t *mddev, char *page) 2988 { 2989 char *type = "idle"; 2990 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 2991 (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { 2992 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 2993 type = "reshape"; 2994 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 2995 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 2996 type = "resync"; 2997 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 2998 type = "check"; 2999 else 3000 type = "repair"; 3001 } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) 3002 type = "recover"; 3003 } 3004 return sprintf(page, "%s\n", type); 3005 } 3006 3007 static ssize_t 3008 action_store(mddev_t *mddev, const char *page, size_t len) 3009 { 3010 if (!mddev->pers || !mddev->pers->sync_request) 3011 return -EINVAL; 3012 3013 if (cmd_match(page, "idle")) { 3014 if (mddev->sync_thread) { 3015 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3016 md_unregister_thread(mddev->sync_thread); 3017 mddev->sync_thread = NULL; 3018 mddev->recovery = 0; 3019 } 3020 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3021 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 3022 return -EBUSY; 3023 else if (cmd_match(page, "resync")) 3024 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3025 else if (cmd_match(page, "recover")) { 3026 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 3027 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3028 } else if (cmd_match(page, "reshape")) { 3029 int err; 3030 if (mddev->pers->start_reshape == NULL) 3031 return -EINVAL; 3032 err = mddev->pers->start_reshape(mddev); 3033 if (err) 3034 return err; 3035 sysfs_notify(&mddev->kobj, NULL, "degraded"); 3036 } else { 3037 if (cmd_match(page, "check")) 3038 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 3039 else if (!cmd_match(page, "repair")) 3040 return -EINVAL; 3041 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 3042 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 3043 } 3044 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3045 md_wakeup_thread(mddev->thread); 3046 sysfs_notify(&mddev->kobj, NULL, "sync_action"); 3047 return len; 3048 } 3049 3050 static ssize_t 3051 mismatch_cnt_show(mddev_t *mddev, char *page) 3052 { 3053 return sprintf(page, "%llu\n", 3054 (unsigned long long) mddev->resync_mismatches); 3055 } 3056 3057 static struct md_sysfs_entry md_scan_mode = 3058 __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 3059 3060 3061 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 3062 3063 static ssize_t 3064 sync_min_show(mddev_t *mddev, char *page) 3065 { 3066 return sprintf(page, "%d (%s)\n", speed_min(mddev), 3067 mddev->sync_speed_min ? "local": "system"); 3068 } 3069 3070 static ssize_t 3071 sync_min_store(mddev_t *mddev, const char *buf, size_t len) 3072 { 3073 int min; 3074 char *e; 3075 if (strncmp(buf, "system", 6)==0) { 3076 mddev->sync_speed_min = 0; 3077 return len; 3078 } 3079 min = simple_strtoul(buf, &e, 10); 3080 if (buf == e || (*e && *e != '\n') || min <= 0) 3081 return -EINVAL; 3082 mddev->sync_speed_min = min; 3083 return len; 3084 } 3085 3086 static struct md_sysfs_entry md_sync_min = 3087 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 3088 3089 static ssize_t 3090 sync_max_show(mddev_t *mddev, char *page) 3091 { 3092 return sprintf(page, "%d (%s)\n", speed_max(mddev), 3093 mddev->sync_speed_max ? "local": "system"); 3094 } 3095 3096 static ssize_t 3097 sync_max_store(mddev_t *mddev, const char *buf, size_t len) 3098 { 3099 int max; 3100 char *e; 3101 if (strncmp(buf, "system", 6)==0) { 3102 mddev->sync_speed_max = 0; 3103 return len; 3104 } 3105 max = simple_strtoul(buf, &e, 10); 3106 if (buf == e || (*e && *e != '\n') || max <= 0) 3107 return -EINVAL; 3108 mddev->sync_speed_max = max; 3109 return len; 3110 } 3111 3112 static struct md_sysfs_entry md_sync_max = 3113 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 3114 3115 static ssize_t 3116 degraded_show(mddev_t *mddev, char *page) 3117 { 3118 return sprintf(page, "%d\n", mddev->degraded); 3119 } 3120 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 3121 3122 static ssize_t 3123 sync_force_parallel_show(mddev_t *mddev, char *page) 3124 { 3125 return sprintf(page, "%d\n", mddev->parallel_resync); 3126 } 3127 3128 static ssize_t 3129 sync_force_parallel_store(mddev_t *mddev, const char *buf, size_t len) 3130 { 3131 long n; 3132 3133 if (strict_strtol(buf, 10, &n)) 3134 return -EINVAL; 3135 3136 if (n != 0 && n != 1) 3137 return -EINVAL; 3138 3139 mddev->parallel_resync = n; 3140 3141 if (mddev->sync_thread) 3142 wake_up(&resync_wait); 3143 3144 return len; 3145 } 3146 3147 /* force parallel resync, even with shared block devices */ 3148 static struct md_sysfs_entry md_sync_force_parallel = 3149 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 3150 sync_force_parallel_show, sync_force_parallel_store); 3151 3152 static ssize_t 3153 sync_speed_show(mddev_t *mddev, char *page) 3154 { 3155 unsigned long resync, dt, db; 3156 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 3157 dt = (jiffies - mddev->resync_mark) / HZ; 3158 if (!dt) dt++; 3159 db = resync - mddev->resync_mark_cnt; 3160 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 3161 } 3162 3163 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 3164 3165 static ssize_t 3166 sync_completed_show(mddev_t *mddev, char *page) 3167 { 3168 unsigned long max_blocks, resync; 3169 3170 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3171 max_blocks = mddev->resync_max_sectors; 3172 else 3173 max_blocks = mddev->size << 1; 3174 3175 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active)); 3176 return sprintf(page, "%lu / %lu\n", resync, max_blocks); 3177 } 3178 3179 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); 3180 3181 static ssize_t 3182 min_sync_show(mddev_t *mddev, char *page) 3183 { 3184 return sprintf(page, "%llu\n", 3185 (unsigned long long)mddev->resync_min); 3186 } 3187 static ssize_t 3188 min_sync_store(mddev_t *mddev, const char *buf, size_t len) 3189 { 3190 unsigned long long min; 3191 if (strict_strtoull(buf, 10, &min)) 3192 return -EINVAL; 3193 if (min > mddev->resync_max) 3194 return -EINVAL; 3195 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3196 return -EBUSY; 3197 3198 /* Must be a multiple of chunk_size */ 3199 if (mddev->chunk_size) { 3200 if (min & (sector_t)((mddev->chunk_size>>9)-1)) 3201 return -EINVAL; 3202 } 3203 mddev->resync_min = min; 3204 3205 return len; 3206 } 3207 3208 static struct md_sysfs_entry md_min_sync = 3209 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 3210 3211 static ssize_t 3212 max_sync_show(mddev_t *mddev, char *page) 3213 { 3214 if (mddev->resync_max == MaxSector) 3215 return sprintf(page, "max\n"); 3216 else 3217 return sprintf(page, "%llu\n", 3218 (unsigned long long)mddev->resync_max); 3219 } 3220 static ssize_t 3221 max_sync_store(mddev_t *mddev, const char *buf, size_t len) 3222 { 3223 if (strncmp(buf, "max", 3) == 0) 3224 mddev->resync_max = MaxSector; 3225 else { 3226 unsigned long long max; 3227 if (strict_strtoull(buf, 10, &max)) 3228 return -EINVAL; 3229 if (max < mddev->resync_min) 3230 return -EINVAL; 3231 if (max < mddev->resync_max && 3232 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3233 return -EBUSY; 3234 3235 /* Must be a multiple of chunk_size */ 3236 if (mddev->chunk_size) { 3237 if (max & (sector_t)((mddev->chunk_size>>9)-1)) 3238 return -EINVAL; 3239 } 3240 mddev->resync_max = max; 3241 } 3242 wake_up(&mddev->recovery_wait); 3243 return len; 3244 } 3245 3246 static struct md_sysfs_entry md_max_sync = 3247 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 3248 3249 static ssize_t 3250 suspend_lo_show(mddev_t *mddev, char *page) 3251 { 3252 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 3253 } 3254 3255 static ssize_t 3256 suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) 3257 { 3258 char *e; 3259 unsigned long long new = simple_strtoull(buf, &e, 10); 3260 3261 if (mddev->pers->quiesce == NULL) 3262 return -EINVAL; 3263 if (buf == e || (*e && *e != '\n')) 3264 return -EINVAL; 3265 if (new >= mddev->suspend_hi || 3266 (new > mddev->suspend_lo && new < mddev->suspend_hi)) { 3267 mddev->suspend_lo = new; 3268 mddev->pers->quiesce(mddev, 2); 3269 return len; 3270 } else 3271 return -EINVAL; 3272 } 3273 static struct md_sysfs_entry md_suspend_lo = 3274 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 3275 3276 3277 static ssize_t 3278 suspend_hi_show(mddev_t *mddev, char *page) 3279 { 3280 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); 3281 } 3282 3283 static ssize_t 3284 suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) 3285 { 3286 char *e; 3287 unsigned long long new = simple_strtoull(buf, &e, 10); 3288 3289 if (mddev->pers->quiesce == NULL) 3290 return -EINVAL; 3291 if (buf == e || (*e && *e != '\n')) 3292 return -EINVAL; 3293 if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) || 3294 (new > mddev->suspend_lo && new > mddev->suspend_hi)) { 3295 mddev->suspend_hi = new; 3296 mddev->pers->quiesce(mddev, 1); 3297 mddev->pers->quiesce(mddev, 0); 3298 return len; 3299 } else 3300 return -EINVAL; 3301 } 3302 static struct md_sysfs_entry md_suspend_hi = 3303 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 3304 3305 static ssize_t 3306 reshape_position_show(mddev_t *mddev, char *page) 3307 { 3308 if (mddev->reshape_position != MaxSector) 3309 return sprintf(page, "%llu\n", 3310 (unsigned long long)mddev->reshape_position); 3311 strcpy(page, "none\n"); 3312 return 5; 3313 } 3314 3315 static ssize_t 3316 reshape_position_store(mddev_t *mddev, const char *buf, size_t len) 3317 { 3318 char *e; 3319 unsigned long long new = simple_strtoull(buf, &e, 10); 3320 if (mddev->pers) 3321 return -EBUSY; 3322 if (buf == e || (*e && *e != '\n')) 3323 return -EINVAL; 3324 mddev->reshape_position = new; 3325 mddev->delta_disks = 0; 3326 mddev->new_level = mddev->level; 3327 mddev->new_layout = mddev->layout; 3328 mddev->new_chunk = mddev->chunk_size; 3329 return len; 3330 } 3331 3332 static struct md_sysfs_entry md_reshape_position = 3333 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 3334 reshape_position_store); 3335 3336 3337 static struct attribute *md_default_attrs[] = { 3338 &md_level.attr, 3339 &md_layout.attr, 3340 &md_raid_disks.attr, 3341 &md_chunk_size.attr, 3342 &md_size.attr, 3343 &md_resync_start.attr, 3344 &md_metadata.attr, 3345 &md_new_device.attr, 3346 &md_safe_delay.attr, 3347 &md_array_state.attr, 3348 &md_reshape_position.attr, 3349 NULL, 3350 }; 3351 3352 static struct attribute *md_redundancy_attrs[] = { 3353 &md_scan_mode.attr, 3354 &md_mismatches.attr, 3355 &md_sync_min.attr, 3356 &md_sync_max.attr, 3357 &md_sync_speed.attr, 3358 &md_sync_force_parallel.attr, 3359 &md_sync_completed.attr, 3360 &md_min_sync.attr, 3361 &md_max_sync.attr, 3362 &md_suspend_lo.attr, 3363 &md_suspend_hi.attr, 3364 &md_bitmap.attr, 3365 &md_degraded.attr, 3366 NULL, 3367 }; 3368 static struct attribute_group md_redundancy_group = { 3369 .name = NULL, 3370 .attrs = md_redundancy_attrs, 3371 }; 3372 3373 3374 static ssize_t 3375 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3376 { 3377 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 3378 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 3379 ssize_t rv; 3380 3381 if (!entry->show) 3382 return -EIO; 3383 rv = mddev_lock(mddev); 3384 if (!rv) { 3385 rv = entry->show(mddev, page); 3386 mddev_unlock(mddev); 3387 } 3388 return rv; 3389 } 3390 3391 static ssize_t 3392 md_attr_store(struct kobject *kobj, struct attribute *attr, 3393 const char *page, size_t length) 3394 { 3395 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 3396 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 3397 ssize_t rv; 3398 3399 if (!entry->store) 3400 return -EIO; 3401 if (!capable(CAP_SYS_ADMIN)) 3402 return -EACCES; 3403 rv = mddev_lock(mddev); 3404 if (!rv) { 3405 rv = entry->store(mddev, page, length); 3406 mddev_unlock(mddev); 3407 } 3408 return rv; 3409 } 3410 3411 static void md_free(struct kobject *ko) 3412 { 3413 mddev_t *mddev = container_of(ko, mddev_t, kobj); 3414 kfree(mddev); 3415 } 3416 3417 static struct sysfs_ops md_sysfs_ops = { 3418 .show = md_attr_show, 3419 .store = md_attr_store, 3420 }; 3421 static struct kobj_type md_ktype = { 3422 .release = md_free, 3423 .sysfs_ops = &md_sysfs_ops, 3424 .default_attrs = md_default_attrs, 3425 }; 3426 3427 int mdp_major = 0; 3428 3429 static struct kobject *md_probe(dev_t dev, int *part, void *data) 3430 { 3431 static DEFINE_MUTEX(disks_mutex); 3432 mddev_t *mddev = mddev_find(dev); 3433 struct gendisk *disk; 3434 int partitioned = (MAJOR(dev) != MD_MAJOR); 3435 int shift = partitioned ? MdpMinorShift : 0; 3436 int unit = MINOR(dev) >> shift; 3437 int error; 3438 3439 if (!mddev) 3440 return NULL; 3441 3442 mutex_lock(&disks_mutex); 3443 if (mddev->gendisk) { 3444 mutex_unlock(&disks_mutex); 3445 mddev_put(mddev); 3446 return NULL; 3447 } 3448 disk = alloc_disk(1 << shift); 3449 if (!disk) { 3450 mutex_unlock(&disks_mutex); 3451 mddev_put(mddev); 3452 return NULL; 3453 } 3454 disk->major = MAJOR(dev); 3455 disk->first_minor = unit << shift; 3456 if (partitioned) 3457 sprintf(disk->disk_name, "md_d%d", unit); 3458 else 3459 sprintf(disk->disk_name, "md%d", unit); 3460 disk->fops = &md_fops; 3461 disk->private_data = mddev; 3462 disk->queue = mddev->queue; 3463 add_disk(disk); 3464 mddev->gendisk = disk; 3465 error = kobject_init_and_add(&mddev->kobj, &md_ktype, 3466 &disk_to_dev(disk)->kobj, "%s", "md"); 3467 mutex_unlock(&disks_mutex); 3468 if (error) 3469 printk(KERN_WARNING "md: cannot register %s/md - name in use\n", 3470 disk->disk_name); 3471 else 3472 kobject_uevent(&mddev->kobj, KOBJ_ADD); 3473 return NULL; 3474 } 3475 3476 static void md_safemode_timeout(unsigned long data) 3477 { 3478 mddev_t *mddev = (mddev_t *) data; 3479 3480 if (!atomic_read(&mddev->writes_pending)) { 3481 mddev->safemode = 1; 3482 if (mddev->external) 3483 set_bit(MD_NOTIFY_ARRAY_STATE, &mddev->flags); 3484 } 3485 md_wakeup_thread(mddev->thread); 3486 } 3487 3488 static int start_dirty_degraded; 3489 3490 static int do_md_run(mddev_t * mddev) 3491 { 3492 int err; 3493 int chunk_size; 3494 struct list_head *tmp; 3495 mdk_rdev_t *rdev; 3496 struct gendisk *disk; 3497 struct mdk_personality *pers; 3498 char b[BDEVNAME_SIZE]; 3499 3500 if (list_empty(&mddev->disks)) 3501 /* cannot run an array with no devices.. */ 3502 return -EINVAL; 3503 3504 if (mddev->pers) 3505 return -EBUSY; 3506 3507 /* 3508 * Analyze all RAID superblock(s) 3509 */ 3510 if (!mddev->raid_disks) { 3511 if (!mddev->persistent) 3512 return -EINVAL; 3513 analyze_sbs(mddev); 3514 } 3515 3516 chunk_size = mddev->chunk_size; 3517 3518 if (chunk_size) { 3519 if (chunk_size > MAX_CHUNK_SIZE) { 3520 printk(KERN_ERR "too big chunk_size: %d > %d\n", 3521 chunk_size, MAX_CHUNK_SIZE); 3522 return -EINVAL; 3523 } 3524 /* 3525 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE 3526 */ 3527 if ( (1 << ffz(~chunk_size)) != chunk_size) { 3528 printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size); 3529 return -EINVAL; 3530 } 3531 if (chunk_size < PAGE_SIZE) { 3532 printk(KERN_ERR "too small chunk_size: %d < %ld\n", 3533 chunk_size, PAGE_SIZE); 3534 return -EINVAL; 3535 } 3536 3537 /* devices must have minimum size of one chunk */ 3538 rdev_for_each(rdev, tmp, mddev) { 3539 if (test_bit(Faulty, &rdev->flags)) 3540 continue; 3541 if (rdev->size < chunk_size / 1024) { 3542 printk(KERN_WARNING 3543 "md: Dev %s smaller than chunk_size:" 3544 " %lluk < %dk\n", 3545 bdevname(rdev->bdev,b), 3546 (unsigned long long)rdev->size, 3547 chunk_size / 1024); 3548 return -EINVAL; 3549 } 3550 } 3551 } 3552 3553 if (mddev->level != LEVEL_NONE) 3554 request_module("md-level-%d", mddev->level); 3555 else if (mddev->clevel[0]) 3556 request_module("md-%s", mddev->clevel); 3557 3558 /* 3559 * Drop all container device buffers, from now on 3560 * the only valid external interface is through the md 3561 * device. 3562 */ 3563 rdev_for_each(rdev, tmp, mddev) { 3564 if (test_bit(Faulty, &rdev->flags)) 3565 continue; 3566 sync_blockdev(rdev->bdev); 3567 invalidate_bdev(rdev->bdev); 3568 3569 /* perform some consistency tests on the device. 3570 * We don't want the data to overlap the metadata, 3571 * Internal Bitmap issues has handled elsewhere. 3572 */ 3573 if (rdev->data_offset < rdev->sb_start) { 3574 if (mddev->size && 3575 rdev->data_offset + mddev->size*2 3576 > rdev->sb_start) { 3577 printk("md: %s: data overlaps metadata\n", 3578 mdname(mddev)); 3579 return -EINVAL; 3580 } 3581 } else { 3582 if (rdev->sb_start + rdev->sb_size/512 3583 > rdev->data_offset) { 3584 printk("md: %s: metadata overlaps data\n", 3585 mdname(mddev)); 3586 return -EINVAL; 3587 } 3588 } 3589 sysfs_notify(&rdev->kobj, NULL, "state"); 3590 } 3591 3592 md_probe(mddev->unit, NULL, NULL); 3593 disk = mddev->gendisk; 3594 if (!disk) 3595 return -ENOMEM; 3596 3597 spin_lock(&pers_lock); 3598 pers = find_pers(mddev->level, mddev->clevel); 3599 if (!pers || !try_module_get(pers->owner)) { 3600 spin_unlock(&pers_lock); 3601 if (mddev->level != LEVEL_NONE) 3602 printk(KERN_WARNING "md: personality for level %d is not loaded!\n", 3603 mddev->level); 3604 else 3605 printk(KERN_WARNING "md: personality for level %s is not loaded!\n", 3606 mddev->clevel); 3607 return -EINVAL; 3608 } 3609 mddev->pers = pers; 3610 spin_unlock(&pers_lock); 3611 mddev->level = pers->level; 3612 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 3613 3614 if (mddev->reshape_position != MaxSector && 3615 pers->start_reshape == NULL) { 3616 /* This personality cannot handle reshaping... */ 3617 mddev->pers = NULL; 3618 module_put(pers->owner); 3619 return -EINVAL; 3620 } 3621 3622 if (pers->sync_request) { 3623 /* Warn if this is a potentially silly 3624 * configuration. 3625 */ 3626 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 3627 mdk_rdev_t *rdev2; 3628 struct list_head *tmp2; 3629 int warned = 0; 3630 rdev_for_each(rdev, tmp, mddev) { 3631 rdev_for_each(rdev2, tmp2, mddev) { 3632 if (rdev < rdev2 && 3633 rdev->bdev->bd_contains == 3634 rdev2->bdev->bd_contains) { 3635 printk(KERN_WARNING 3636 "%s: WARNING: %s appears to be" 3637 " on the same physical disk as" 3638 " %s.\n", 3639 mdname(mddev), 3640 bdevname(rdev->bdev,b), 3641 bdevname(rdev2->bdev,b2)); 3642 warned = 1; 3643 } 3644 } 3645 } 3646 if (warned) 3647 printk(KERN_WARNING 3648 "True protection against single-disk" 3649 " failure might be compromised.\n"); 3650 } 3651 3652 mddev->recovery = 0; 3653 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ 3654 mddev->barriers_work = 1; 3655 mddev->ok_start_degraded = start_dirty_degraded; 3656 3657 if (start_readonly) 3658 mddev->ro = 2; /* read-only, but switch on first write */ 3659 3660 err = mddev->pers->run(mddev); 3661 if (err) 3662 printk(KERN_ERR "md: pers->run() failed ...\n"); 3663 else if (mddev->pers->sync_request) { 3664 err = bitmap_create(mddev); 3665 if (err) { 3666 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 3667 mdname(mddev), err); 3668 mddev->pers->stop(mddev); 3669 } 3670 } 3671 if (err) { 3672 module_put(mddev->pers->owner); 3673 mddev->pers = NULL; 3674 bitmap_destroy(mddev); 3675 return err; 3676 } 3677 if (mddev->pers->sync_request) { 3678 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 3679 printk(KERN_WARNING 3680 "md: cannot register extra attributes for %s\n", 3681 mdname(mddev)); 3682 } else if (mddev->ro == 2) /* auto-readonly not meaningful */ 3683 mddev->ro = 0; 3684 3685 atomic_set(&mddev->writes_pending,0); 3686 mddev->safemode = 0; 3687 mddev->safemode_timer.function = md_safemode_timeout; 3688 mddev->safemode_timer.data = (unsigned long) mddev; 3689 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 3690 mddev->in_sync = 1; 3691 3692 rdev_for_each(rdev, tmp, mddev) 3693 if (rdev->raid_disk >= 0) { 3694 char nm[20]; 3695 sprintf(nm, "rd%d", rdev->raid_disk); 3696 if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) 3697 printk("md: cannot register %s for %s\n", 3698 nm, mdname(mddev)); 3699 } 3700 3701 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3702 3703 if (mddev->flags) 3704 md_update_sb(mddev, 0); 3705 3706 set_capacity(disk, mddev->array_sectors); 3707 3708 /* If we call blk_queue_make_request here, it will 3709 * re-initialise max_sectors etc which may have been 3710 * refined inside -> run. So just set the bits we need to set. 3711 * Most initialisation happended when we called 3712 * blk_queue_make_request(..., md_fail_request) 3713 * earlier. 3714 */ 3715 mddev->queue->queuedata = mddev; 3716 mddev->queue->make_request_fn = mddev->pers->make_request; 3717 3718 /* If there is a partially-recovered drive we need to 3719 * start recovery here. If we leave it to md_check_recovery, 3720 * it will remove the drives and not do the right thing 3721 */ 3722 if (mddev->degraded && !mddev->sync_thread) { 3723 struct list_head *rtmp; 3724 int spares = 0; 3725 rdev_for_each(rdev, rtmp, mddev) 3726 if (rdev->raid_disk >= 0 && 3727 !test_bit(In_sync, &rdev->flags) && 3728 !test_bit(Faulty, &rdev->flags)) 3729 /* complete an interrupted recovery */ 3730 spares++; 3731 if (spares && mddev->pers->sync_request) { 3732 mddev->recovery = 0; 3733 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 3734 mddev->sync_thread = md_register_thread(md_do_sync, 3735 mddev, 3736 "%s_resync"); 3737 if (!mddev->sync_thread) { 3738 printk(KERN_ERR "%s: could not start resync" 3739 " thread...\n", 3740 mdname(mddev)); 3741 /* leave the spares where they are, it shouldn't hurt */ 3742 mddev->recovery = 0; 3743 } 3744 } 3745 } 3746 md_wakeup_thread(mddev->thread); 3747 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 3748 3749 mddev->changed = 1; 3750 md_new_event(mddev); 3751 sysfs_notify(&mddev->kobj, NULL, "array_state"); 3752 sysfs_notify(&mddev->kobj, NULL, "sync_action"); 3753 sysfs_notify(&mddev->kobj, NULL, "degraded"); 3754 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 3755 return 0; 3756 } 3757 3758 static int restart_array(mddev_t *mddev) 3759 { 3760 struct gendisk *disk = mddev->gendisk; 3761 3762 /* Complain if it has no devices */ 3763 if (list_empty(&mddev->disks)) 3764 return -ENXIO; 3765 if (!mddev->pers) 3766 return -EINVAL; 3767 if (!mddev->ro) 3768 return -EBUSY; 3769 mddev->safemode = 0; 3770 mddev->ro = 0; 3771 set_disk_ro(disk, 0); 3772 printk(KERN_INFO "md: %s switched to read-write mode.\n", 3773 mdname(mddev)); 3774 /* Kick recovery or resync if necessary */ 3775 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3776 md_wakeup_thread(mddev->thread); 3777 md_wakeup_thread(mddev->sync_thread); 3778 sysfs_notify(&mddev->kobj, NULL, "array_state"); 3779 return 0; 3780 } 3781 3782 /* similar to deny_write_access, but accounts for our holding a reference 3783 * to the file ourselves */ 3784 static int deny_bitmap_write_access(struct file * file) 3785 { 3786 struct inode *inode = file->f_mapping->host; 3787 3788 spin_lock(&inode->i_lock); 3789 if (atomic_read(&inode->i_writecount) > 1) { 3790 spin_unlock(&inode->i_lock); 3791 return -ETXTBSY; 3792 } 3793 atomic_set(&inode->i_writecount, -1); 3794 spin_unlock(&inode->i_lock); 3795 3796 return 0; 3797 } 3798 3799 static void restore_bitmap_write_access(struct file *file) 3800 { 3801 struct inode *inode = file->f_mapping->host; 3802 3803 spin_lock(&inode->i_lock); 3804 atomic_set(&inode->i_writecount, 1); 3805 spin_unlock(&inode->i_lock); 3806 } 3807 3808 /* mode: 3809 * 0 - completely stop and dis-assemble array 3810 * 1 - switch to readonly 3811 * 2 - stop but do not disassemble array 3812 */ 3813 static int do_md_stop(mddev_t * mddev, int mode, int is_open) 3814 { 3815 int err = 0; 3816 struct gendisk *disk = mddev->gendisk; 3817 3818 if (atomic_read(&mddev->openers) > is_open) { 3819 printk("md: %s still in use.\n",mdname(mddev)); 3820 return -EBUSY; 3821 } 3822 3823 if (mddev->pers) { 3824 3825 if (mddev->sync_thread) { 3826 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3827 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3828 md_unregister_thread(mddev->sync_thread); 3829 mddev->sync_thread = NULL; 3830 } 3831 3832 del_timer_sync(&mddev->safemode_timer); 3833 3834 switch(mode) { 3835 case 1: /* readonly */ 3836 err = -ENXIO; 3837 if (mddev->ro==1) 3838 goto out; 3839 mddev->ro = 1; 3840 break; 3841 case 0: /* disassemble */ 3842 case 2: /* stop */ 3843 bitmap_flush(mddev); 3844 md_super_wait(mddev); 3845 if (mddev->ro) 3846 set_disk_ro(disk, 0); 3847 blk_queue_make_request(mddev->queue, md_fail_request); 3848 mddev->pers->stop(mddev); 3849 mddev->queue->merge_bvec_fn = NULL; 3850 mddev->queue->unplug_fn = NULL; 3851 mddev->queue->backing_dev_info.congested_fn = NULL; 3852 if (mddev->pers->sync_request) 3853 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 3854 3855 module_put(mddev->pers->owner); 3856 mddev->pers = NULL; 3857 /* tell userspace to handle 'inactive' */ 3858 sysfs_notify(&mddev->kobj, NULL, "array_state"); 3859 3860 set_capacity(disk, 0); 3861 mddev->changed = 1; 3862 3863 if (mddev->ro) 3864 mddev->ro = 0; 3865 } 3866 if (!mddev->in_sync || mddev->flags) { 3867 /* mark array as shutdown cleanly */ 3868 mddev->in_sync = 1; 3869 md_update_sb(mddev, 1); 3870 } 3871 if (mode == 1) 3872 set_disk_ro(disk, 1); 3873 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3874 } 3875 3876 /* 3877 * Free resources if final stop 3878 */ 3879 if (mode == 0) { 3880 mdk_rdev_t *rdev; 3881 struct list_head *tmp; 3882 3883 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 3884 3885 bitmap_destroy(mddev); 3886 if (mddev->bitmap_file) { 3887 restore_bitmap_write_access(mddev->bitmap_file); 3888 fput(mddev->bitmap_file); 3889 mddev->bitmap_file = NULL; 3890 } 3891 mddev->bitmap_offset = 0; 3892 3893 rdev_for_each(rdev, tmp, mddev) 3894 if (rdev->raid_disk >= 0) { 3895 char nm[20]; 3896 sprintf(nm, "rd%d", rdev->raid_disk); 3897 sysfs_remove_link(&mddev->kobj, nm); 3898 } 3899 3900 /* make sure all md_delayed_delete calls have finished */ 3901 flush_scheduled_work(); 3902 3903 export_array(mddev); 3904 3905 mddev->array_sectors = 0; 3906 mddev->size = 0; 3907 mddev->raid_disks = 0; 3908 mddev->recovery_cp = 0; 3909 mddev->resync_min = 0; 3910 mddev->resync_max = MaxSector; 3911 mddev->reshape_position = MaxSector; 3912 mddev->external = 0; 3913 mddev->persistent = 0; 3914 mddev->level = LEVEL_NONE; 3915 mddev->clevel[0] = 0; 3916 mddev->flags = 0; 3917 mddev->ro = 0; 3918 mddev->metadata_type[0] = 0; 3919 mddev->chunk_size = 0; 3920 mddev->ctime = mddev->utime = 0; 3921 mddev->layout = 0; 3922 mddev->max_disks = 0; 3923 mddev->events = 0; 3924 mddev->delta_disks = 0; 3925 mddev->new_level = LEVEL_NONE; 3926 mddev->new_layout = 0; 3927 mddev->new_chunk = 0; 3928 mddev->curr_resync = 0; 3929 mddev->resync_mismatches = 0; 3930 mddev->suspend_lo = mddev->suspend_hi = 0; 3931 mddev->sync_speed_min = mddev->sync_speed_max = 0; 3932 mddev->recovery = 0; 3933 mddev->in_sync = 0; 3934 mddev->changed = 0; 3935 mddev->degraded = 0; 3936 mddev->barriers_work = 0; 3937 mddev->safemode = 0; 3938 3939 } else if (mddev->pers) 3940 printk(KERN_INFO "md: %s switched to read-only mode.\n", 3941 mdname(mddev)); 3942 err = 0; 3943 md_new_event(mddev); 3944 sysfs_notify(&mddev->kobj, NULL, "array_state"); 3945 out: 3946 return err; 3947 } 3948 3949 #ifndef MODULE 3950 static void autorun_array(mddev_t *mddev) 3951 { 3952 mdk_rdev_t *rdev; 3953 struct list_head *tmp; 3954 int err; 3955 3956 if (list_empty(&mddev->disks)) 3957 return; 3958 3959 printk(KERN_INFO "md: running: "); 3960 3961 rdev_for_each(rdev, tmp, mddev) { 3962 char b[BDEVNAME_SIZE]; 3963 printk("<%s>", bdevname(rdev->bdev,b)); 3964 } 3965 printk("\n"); 3966 3967 err = do_md_run (mddev); 3968 if (err) { 3969 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 3970 do_md_stop (mddev, 0, 0); 3971 } 3972 } 3973 3974 /* 3975 * lets try to run arrays based on all disks that have arrived 3976 * until now. (those are in pending_raid_disks) 3977 * 3978 * the method: pick the first pending disk, collect all disks with 3979 * the same UUID, remove all from the pending list and put them into 3980 * the 'same_array' list. Then order this list based on superblock 3981 * update time (freshest comes first), kick out 'old' disks and 3982 * compare superblocks. If everything's fine then run it. 3983 * 3984 * If "unit" is allocated, then bump its reference count 3985 */ 3986 static void autorun_devices(int part) 3987 { 3988 struct list_head *tmp; 3989 mdk_rdev_t *rdev0, *rdev; 3990 mddev_t *mddev; 3991 char b[BDEVNAME_SIZE]; 3992 3993 printk(KERN_INFO "md: autorun ...\n"); 3994 while (!list_empty(&pending_raid_disks)) { 3995 int unit; 3996 dev_t dev; 3997 LIST_HEAD(candidates); 3998 rdev0 = list_entry(pending_raid_disks.next, 3999 mdk_rdev_t, same_set); 4000 4001 printk(KERN_INFO "md: considering %s ...\n", 4002 bdevname(rdev0->bdev,b)); 4003 INIT_LIST_HEAD(&candidates); 4004 rdev_for_each_list(rdev, tmp, pending_raid_disks) 4005 if (super_90_load(rdev, rdev0, 0) >= 0) { 4006 printk(KERN_INFO "md: adding %s ...\n", 4007 bdevname(rdev->bdev,b)); 4008 list_move(&rdev->same_set, &candidates); 4009 } 4010 /* 4011 * now we have a set of devices, with all of them having 4012 * mostly sane superblocks. It's time to allocate the 4013 * mddev. 4014 */ 4015 if (part) { 4016 dev = MKDEV(mdp_major, 4017 rdev0->preferred_minor << MdpMinorShift); 4018 unit = MINOR(dev) >> MdpMinorShift; 4019 } else { 4020 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 4021 unit = MINOR(dev); 4022 } 4023 if (rdev0->preferred_minor != unit) { 4024 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 4025 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 4026 break; 4027 } 4028 4029 md_probe(dev, NULL, NULL); 4030 mddev = mddev_find(dev); 4031 if (!mddev || !mddev->gendisk) { 4032 if (mddev) 4033 mddev_put(mddev); 4034 printk(KERN_ERR 4035 "md: cannot allocate memory for md drive.\n"); 4036 break; 4037 } 4038 if (mddev_lock(mddev)) 4039 printk(KERN_WARNING "md: %s locked, cannot run\n", 4040 mdname(mddev)); 4041 else if (mddev->raid_disks || mddev->major_version 4042 || !list_empty(&mddev->disks)) { 4043 printk(KERN_WARNING 4044 "md: %s already running, cannot run %s\n", 4045 mdname(mddev), bdevname(rdev0->bdev,b)); 4046 mddev_unlock(mddev); 4047 } else { 4048 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 4049 mddev->persistent = 1; 4050 rdev_for_each_list(rdev, tmp, candidates) { 4051 list_del_init(&rdev->same_set); 4052 if (bind_rdev_to_array(rdev, mddev)) 4053 export_rdev(rdev); 4054 } 4055 autorun_array(mddev); 4056 mddev_unlock(mddev); 4057 } 4058 /* on success, candidates will be empty, on error 4059 * it won't... 4060 */ 4061 rdev_for_each_list(rdev, tmp, candidates) { 4062 list_del_init(&rdev->same_set); 4063 export_rdev(rdev); 4064 } 4065 mddev_put(mddev); 4066 } 4067 printk(KERN_INFO "md: ... autorun DONE.\n"); 4068 } 4069 #endif /* !MODULE */ 4070 4071 static int get_version(void __user * arg) 4072 { 4073 mdu_version_t ver; 4074 4075 ver.major = MD_MAJOR_VERSION; 4076 ver.minor = MD_MINOR_VERSION; 4077 ver.patchlevel = MD_PATCHLEVEL_VERSION; 4078 4079 if (copy_to_user(arg, &ver, sizeof(ver))) 4080 return -EFAULT; 4081 4082 return 0; 4083 } 4084 4085 static int get_array_info(mddev_t * mddev, void __user * arg) 4086 { 4087 mdu_array_info_t info; 4088 int nr,working,active,failed,spare; 4089 mdk_rdev_t *rdev; 4090 struct list_head *tmp; 4091 4092 nr=working=active=failed=spare=0; 4093 rdev_for_each(rdev, tmp, mddev) { 4094 nr++; 4095 if (test_bit(Faulty, &rdev->flags)) 4096 failed++; 4097 else { 4098 working++; 4099 if (test_bit(In_sync, &rdev->flags)) 4100 active++; 4101 else 4102 spare++; 4103 } 4104 } 4105 4106 info.major_version = mddev->major_version; 4107 info.minor_version = mddev->minor_version; 4108 info.patch_version = MD_PATCHLEVEL_VERSION; 4109 info.ctime = mddev->ctime; 4110 info.level = mddev->level; 4111 info.size = mddev->size; 4112 if (info.size != mddev->size) /* overflow */ 4113 info.size = -1; 4114 info.nr_disks = nr; 4115 info.raid_disks = mddev->raid_disks; 4116 info.md_minor = mddev->md_minor; 4117 info.not_persistent= !mddev->persistent; 4118 4119 info.utime = mddev->utime; 4120 info.state = 0; 4121 if (mddev->in_sync) 4122 info.state = (1<<MD_SB_CLEAN); 4123 if (mddev->bitmap && mddev->bitmap_offset) 4124 info.state = (1<<MD_SB_BITMAP_PRESENT); 4125 info.active_disks = active; 4126 info.working_disks = working; 4127 info.failed_disks = failed; 4128 info.spare_disks = spare; 4129 4130 info.layout = mddev->layout; 4131 info.chunk_size = mddev->chunk_size; 4132 4133 if (copy_to_user(arg, &info, sizeof(info))) 4134 return -EFAULT; 4135 4136 return 0; 4137 } 4138 4139 static int get_bitmap_file(mddev_t * mddev, void __user * arg) 4140 { 4141 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 4142 char *ptr, *buf = NULL; 4143 int err = -ENOMEM; 4144 4145 if (md_allow_write(mddev)) 4146 file = kmalloc(sizeof(*file), GFP_NOIO); 4147 else 4148 file = kmalloc(sizeof(*file), GFP_KERNEL); 4149 4150 if (!file) 4151 goto out; 4152 4153 /* bitmap disabled, zero the first byte and copy out */ 4154 if (!mddev->bitmap || !mddev->bitmap->file) { 4155 file->pathname[0] = '\0'; 4156 goto copy_out; 4157 } 4158 4159 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); 4160 if (!buf) 4161 goto out; 4162 4163 ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname)); 4164 if (IS_ERR(ptr)) 4165 goto out; 4166 4167 strcpy(file->pathname, ptr); 4168 4169 copy_out: 4170 err = 0; 4171 if (copy_to_user(arg, file, sizeof(*file))) 4172 err = -EFAULT; 4173 out: 4174 kfree(buf); 4175 kfree(file); 4176 return err; 4177 } 4178 4179 static int get_disk_info(mddev_t * mddev, void __user * arg) 4180 { 4181 mdu_disk_info_t info; 4182 mdk_rdev_t *rdev; 4183 4184 if (copy_from_user(&info, arg, sizeof(info))) 4185 return -EFAULT; 4186 4187 rdev = find_rdev_nr(mddev, info.number); 4188 if (rdev) { 4189 info.major = MAJOR(rdev->bdev->bd_dev); 4190 info.minor = MINOR(rdev->bdev->bd_dev); 4191 info.raid_disk = rdev->raid_disk; 4192 info.state = 0; 4193 if (test_bit(Faulty, &rdev->flags)) 4194 info.state |= (1<<MD_DISK_FAULTY); 4195 else if (test_bit(In_sync, &rdev->flags)) { 4196 info.state |= (1<<MD_DISK_ACTIVE); 4197 info.state |= (1<<MD_DISK_SYNC); 4198 } 4199 if (test_bit(WriteMostly, &rdev->flags)) 4200 info.state |= (1<<MD_DISK_WRITEMOSTLY); 4201 } else { 4202 info.major = info.minor = 0; 4203 info.raid_disk = -1; 4204 info.state = (1<<MD_DISK_REMOVED); 4205 } 4206 4207 if (copy_to_user(arg, &info, sizeof(info))) 4208 return -EFAULT; 4209 4210 return 0; 4211 } 4212 4213 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) 4214 { 4215 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 4216 mdk_rdev_t *rdev; 4217 dev_t dev = MKDEV(info->major,info->minor); 4218 4219 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 4220 return -EOVERFLOW; 4221 4222 if (!mddev->raid_disks) { 4223 int err; 4224 /* expecting a device which has a superblock */ 4225 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 4226 if (IS_ERR(rdev)) { 4227 printk(KERN_WARNING 4228 "md: md_import_device returned %ld\n", 4229 PTR_ERR(rdev)); 4230 return PTR_ERR(rdev); 4231 } 4232 if (!list_empty(&mddev->disks)) { 4233 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 4234 mdk_rdev_t, same_set); 4235 int err = super_types[mddev->major_version] 4236 .load_super(rdev, rdev0, mddev->minor_version); 4237 if (err < 0) { 4238 printk(KERN_WARNING 4239 "md: %s has different UUID to %s\n", 4240 bdevname(rdev->bdev,b), 4241 bdevname(rdev0->bdev,b2)); 4242 export_rdev(rdev); 4243 return -EINVAL; 4244 } 4245 } 4246 err = bind_rdev_to_array(rdev, mddev); 4247 if (err) 4248 export_rdev(rdev); 4249 return err; 4250 } 4251 4252 /* 4253 * add_new_disk can be used once the array is assembled 4254 * to add "hot spares". They must already have a superblock 4255 * written 4256 */ 4257 if (mddev->pers) { 4258 int err; 4259 if (!mddev->pers->hot_add_disk) { 4260 printk(KERN_WARNING 4261 "%s: personality does not support diskops!\n", 4262 mdname(mddev)); 4263 return -EINVAL; 4264 } 4265 if (mddev->persistent) 4266 rdev = md_import_device(dev, mddev->major_version, 4267 mddev->minor_version); 4268 else 4269 rdev = md_import_device(dev, -1, -1); 4270 if (IS_ERR(rdev)) { 4271 printk(KERN_WARNING 4272 "md: md_import_device returned %ld\n", 4273 PTR_ERR(rdev)); 4274 return PTR_ERR(rdev); 4275 } 4276 /* set save_raid_disk if appropriate */ 4277 if (!mddev->persistent) { 4278 if (info->state & (1<<MD_DISK_SYNC) && 4279 info->raid_disk < mddev->raid_disks) 4280 rdev->raid_disk = info->raid_disk; 4281 else 4282 rdev->raid_disk = -1; 4283 } else 4284 super_types[mddev->major_version]. 4285 validate_super(mddev, rdev); 4286 rdev->saved_raid_disk = rdev->raid_disk; 4287 4288 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 4289 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 4290 set_bit(WriteMostly, &rdev->flags); 4291 4292 rdev->raid_disk = -1; 4293 err = bind_rdev_to_array(rdev, mddev); 4294 if (!err && !mddev->pers->hot_remove_disk) { 4295 /* If there is hot_add_disk but no hot_remove_disk 4296 * then added disks for geometry changes, 4297 * and should be added immediately. 4298 */ 4299 super_types[mddev->major_version]. 4300 validate_super(mddev, rdev); 4301 err = mddev->pers->hot_add_disk(mddev, rdev); 4302 if (err) 4303 unbind_rdev_from_array(rdev); 4304 } 4305 if (err) 4306 export_rdev(rdev); 4307 else 4308 sysfs_notify(&rdev->kobj, NULL, "state"); 4309 4310 md_update_sb(mddev, 1); 4311 if (mddev->degraded) 4312 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 4313 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4314 md_wakeup_thread(mddev->thread); 4315 return err; 4316 } 4317 4318 /* otherwise, add_new_disk is only allowed 4319 * for major_version==0 superblocks 4320 */ 4321 if (mddev->major_version != 0) { 4322 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", 4323 mdname(mddev)); 4324 return -EINVAL; 4325 } 4326 4327 if (!(info->state & (1<<MD_DISK_FAULTY))) { 4328 int err; 4329 rdev = md_import_device (dev, -1, 0); 4330 if (IS_ERR(rdev)) { 4331 printk(KERN_WARNING 4332 "md: error, md_import_device() returned %ld\n", 4333 PTR_ERR(rdev)); 4334 return PTR_ERR(rdev); 4335 } 4336 rdev->desc_nr = info->number; 4337 if (info->raid_disk < mddev->raid_disks) 4338 rdev->raid_disk = info->raid_disk; 4339 else 4340 rdev->raid_disk = -1; 4341 4342 if (rdev->raid_disk < mddev->raid_disks) 4343 if (info->state & (1<<MD_DISK_SYNC)) 4344 set_bit(In_sync, &rdev->flags); 4345 4346 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 4347 set_bit(WriteMostly, &rdev->flags); 4348 4349 if (!mddev->persistent) { 4350 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 4351 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; 4352 } else 4353 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 4354 rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2; 4355 4356 err = bind_rdev_to_array(rdev, mddev); 4357 if (err) { 4358 export_rdev(rdev); 4359 return err; 4360 } 4361 } 4362 4363 return 0; 4364 } 4365 4366 static int hot_remove_disk(mddev_t * mddev, dev_t dev) 4367 { 4368 char b[BDEVNAME_SIZE]; 4369 mdk_rdev_t *rdev; 4370 4371 rdev = find_rdev(mddev, dev); 4372 if (!rdev) 4373 return -ENXIO; 4374 4375 if (rdev->raid_disk >= 0) 4376 goto busy; 4377 4378 kick_rdev_from_array(rdev); 4379 md_update_sb(mddev, 1); 4380 md_new_event(mddev); 4381 4382 return 0; 4383 busy: 4384 printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", 4385 bdevname(rdev->bdev,b), mdname(mddev)); 4386 return -EBUSY; 4387 } 4388 4389 static int hot_add_disk(mddev_t * mddev, dev_t dev) 4390 { 4391 char b[BDEVNAME_SIZE]; 4392 int err; 4393 mdk_rdev_t *rdev; 4394 4395 if (!mddev->pers) 4396 return -ENODEV; 4397 4398 if (mddev->major_version != 0) { 4399 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 4400 " version-0 superblocks.\n", 4401 mdname(mddev)); 4402 return -EINVAL; 4403 } 4404 if (!mddev->pers->hot_add_disk) { 4405 printk(KERN_WARNING 4406 "%s: personality does not support diskops!\n", 4407 mdname(mddev)); 4408 return -EINVAL; 4409 } 4410 4411 rdev = md_import_device (dev, -1, 0); 4412 if (IS_ERR(rdev)) { 4413 printk(KERN_WARNING 4414 "md: error, md_import_device() returned %ld\n", 4415 PTR_ERR(rdev)); 4416 return -EINVAL; 4417 } 4418 4419 if (mddev->persistent) 4420 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 4421 else 4422 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; 4423 4424 rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2; 4425 4426 if (test_bit(Faulty, &rdev->flags)) { 4427 printk(KERN_WARNING 4428 "md: can not hot-add faulty %s disk to %s!\n", 4429 bdevname(rdev->bdev,b), mdname(mddev)); 4430 err = -EINVAL; 4431 goto abort_export; 4432 } 4433 clear_bit(In_sync, &rdev->flags); 4434 rdev->desc_nr = -1; 4435 rdev->saved_raid_disk = -1; 4436 err = bind_rdev_to_array(rdev, mddev); 4437 if (err) 4438 goto abort_export; 4439 4440 /* 4441 * The rest should better be atomic, we can have disk failures 4442 * noticed in interrupt contexts ... 4443 */ 4444 4445 if (rdev->desc_nr == mddev->max_disks) { 4446 printk(KERN_WARNING "%s: can not hot-add to full array!\n", 4447 mdname(mddev)); 4448 err = -EBUSY; 4449 goto abort_unbind_export; 4450 } 4451 4452 rdev->raid_disk = -1; 4453 4454 md_update_sb(mddev, 1); 4455 4456 /* 4457 * Kick recovery, maybe this spare has to be added to the 4458 * array immediately. 4459 */ 4460 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4461 md_wakeup_thread(mddev->thread); 4462 md_new_event(mddev); 4463 return 0; 4464 4465 abort_unbind_export: 4466 unbind_rdev_from_array(rdev); 4467 4468 abort_export: 4469 export_rdev(rdev); 4470 return err; 4471 } 4472 4473 static int set_bitmap_file(mddev_t *mddev, int fd) 4474 { 4475 int err; 4476 4477 if (mddev->pers) { 4478 if (!mddev->pers->quiesce) 4479 return -EBUSY; 4480 if (mddev->recovery || mddev->sync_thread) 4481 return -EBUSY; 4482 /* we should be able to change the bitmap.. */ 4483 } 4484 4485 4486 if (fd >= 0) { 4487 if (mddev->bitmap) 4488 return -EEXIST; /* cannot add when bitmap is present */ 4489 mddev->bitmap_file = fget(fd); 4490 4491 if (mddev->bitmap_file == NULL) { 4492 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 4493 mdname(mddev)); 4494 return -EBADF; 4495 } 4496 4497 err = deny_bitmap_write_access(mddev->bitmap_file); 4498 if (err) { 4499 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 4500 mdname(mddev)); 4501 fput(mddev->bitmap_file); 4502 mddev->bitmap_file = NULL; 4503 return err; 4504 } 4505 mddev->bitmap_offset = 0; /* file overrides offset */ 4506 } else if (mddev->bitmap == NULL) 4507 return -ENOENT; /* cannot remove what isn't there */ 4508 err = 0; 4509 if (mddev->pers) { 4510 mddev->pers->quiesce(mddev, 1); 4511 if (fd >= 0) 4512 err = bitmap_create(mddev); 4513 if (fd < 0 || err) { 4514 bitmap_destroy(mddev); 4515 fd = -1; /* make sure to put the file */ 4516 } 4517 mddev->pers->quiesce(mddev, 0); 4518 } 4519 if (fd < 0) { 4520 if (mddev->bitmap_file) { 4521 restore_bitmap_write_access(mddev->bitmap_file); 4522 fput(mddev->bitmap_file); 4523 } 4524 mddev->bitmap_file = NULL; 4525 } 4526 4527 return err; 4528 } 4529 4530 /* 4531 * set_array_info is used two different ways 4532 * The original usage is when creating a new array. 4533 * In this usage, raid_disks is > 0 and it together with 4534 * level, size, not_persistent,layout,chunksize determine the 4535 * shape of the array. 4536 * This will always create an array with a type-0.90.0 superblock. 4537 * The newer usage is when assembling an array. 4538 * In this case raid_disks will be 0, and the major_version field is 4539 * use to determine which style super-blocks are to be found on the devices. 4540 * The minor and patch _version numbers are also kept incase the 4541 * super_block handler wishes to interpret them. 4542 */ 4543 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) 4544 { 4545 4546 if (info->raid_disks == 0) { 4547 /* just setting version number for superblock loading */ 4548 if (info->major_version < 0 || 4549 info->major_version >= ARRAY_SIZE(super_types) || 4550 super_types[info->major_version].name == NULL) { 4551 /* maybe try to auto-load a module? */ 4552 printk(KERN_INFO 4553 "md: superblock version %d not known\n", 4554 info->major_version); 4555 return -EINVAL; 4556 } 4557 mddev->major_version = info->major_version; 4558 mddev->minor_version = info->minor_version; 4559 mddev->patch_version = info->patch_version; 4560 mddev->persistent = !info->not_persistent; 4561 return 0; 4562 } 4563 mddev->major_version = MD_MAJOR_VERSION; 4564 mddev->minor_version = MD_MINOR_VERSION; 4565 mddev->patch_version = MD_PATCHLEVEL_VERSION; 4566 mddev->ctime = get_seconds(); 4567 4568 mddev->level = info->level; 4569 mddev->clevel[0] = 0; 4570 mddev->size = info->size; 4571 mddev->raid_disks = info->raid_disks; 4572 /* don't set md_minor, it is determined by which /dev/md* was 4573 * openned 4574 */ 4575 if (info->state & (1<<MD_SB_CLEAN)) 4576 mddev->recovery_cp = MaxSector; 4577 else 4578 mddev->recovery_cp = 0; 4579 mddev->persistent = ! info->not_persistent; 4580 mddev->external = 0; 4581 4582 mddev->layout = info->layout; 4583 mddev->chunk_size = info->chunk_size; 4584 4585 mddev->max_disks = MD_SB_DISKS; 4586 4587 if (mddev->persistent) 4588 mddev->flags = 0; 4589 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4590 4591 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 4592 mddev->bitmap_offset = 0; 4593 4594 mddev->reshape_position = MaxSector; 4595 4596 /* 4597 * Generate a 128 bit UUID 4598 */ 4599 get_random_bytes(mddev->uuid, 16); 4600 4601 mddev->new_level = mddev->level; 4602 mddev->new_chunk = mddev->chunk_size; 4603 mddev->new_layout = mddev->layout; 4604 mddev->delta_disks = 0; 4605 4606 return 0; 4607 } 4608 4609 static int update_size(mddev_t *mddev, sector_t num_sectors) 4610 { 4611 mdk_rdev_t * rdev; 4612 int rv; 4613 struct list_head *tmp; 4614 int fit = (num_sectors == 0); 4615 4616 if (mddev->pers->resize == NULL) 4617 return -EINVAL; 4618 /* The "num_sectors" is the number of sectors of each device that 4619 * is used. This can only make sense for arrays with redundancy. 4620 * linear and raid0 always use whatever space is available. We can only 4621 * consider changing this number if no resync or reconstruction is 4622 * happening, and if the new size is acceptable. It must fit before the 4623 * sb_start or, if that is <data_offset, it must fit before the size 4624 * of each device. If num_sectors is zero, we find the largest size 4625 * that fits. 4626 4627 */ 4628 if (mddev->sync_thread) 4629 return -EBUSY; 4630 if (mddev->bitmap) 4631 /* Sorry, cannot grow a bitmap yet, just remove it, 4632 * grow, and re-add. 4633 */ 4634 return -EBUSY; 4635 rdev_for_each(rdev, tmp, mddev) { 4636 sector_t avail; 4637 avail = rdev->size * 2; 4638 4639 if (fit && (num_sectors == 0 || num_sectors > avail)) 4640 num_sectors = avail; 4641 if (avail < num_sectors) 4642 return -ENOSPC; 4643 } 4644 rv = mddev->pers->resize(mddev, num_sectors); 4645 if (!rv) { 4646 struct block_device *bdev; 4647 4648 bdev = bdget_disk(mddev->gendisk, 0); 4649 if (bdev) { 4650 mutex_lock(&bdev->bd_inode->i_mutex); 4651 i_size_write(bdev->bd_inode, 4652 (loff_t)mddev->array_sectors << 9); 4653 mutex_unlock(&bdev->bd_inode->i_mutex); 4654 bdput(bdev); 4655 } 4656 } 4657 return rv; 4658 } 4659 4660 static int update_raid_disks(mddev_t *mddev, int raid_disks) 4661 { 4662 int rv; 4663 /* change the number of raid disks */ 4664 if (mddev->pers->check_reshape == NULL) 4665 return -EINVAL; 4666 if (raid_disks <= 0 || 4667 raid_disks >= mddev->max_disks) 4668 return -EINVAL; 4669 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 4670 return -EBUSY; 4671 mddev->delta_disks = raid_disks - mddev->raid_disks; 4672 4673 rv = mddev->pers->check_reshape(mddev); 4674 return rv; 4675 } 4676 4677 4678 /* 4679 * update_array_info is used to change the configuration of an 4680 * on-line array. 4681 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 4682 * fields in the info are checked against the array. 4683 * Any differences that cannot be handled will cause an error. 4684 * Normally, only one change can be managed at a time. 4685 */ 4686 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) 4687 { 4688 int rv = 0; 4689 int cnt = 0; 4690 int state = 0; 4691 4692 /* calculate expected state,ignoring low bits */ 4693 if (mddev->bitmap && mddev->bitmap_offset) 4694 state |= (1 << MD_SB_BITMAP_PRESENT); 4695 4696 if (mddev->major_version != info->major_version || 4697 mddev->minor_version != info->minor_version || 4698 /* mddev->patch_version != info->patch_version || */ 4699 mddev->ctime != info->ctime || 4700 mddev->level != info->level || 4701 /* mddev->layout != info->layout || */ 4702 !mddev->persistent != info->not_persistent|| 4703 mddev->chunk_size != info->chunk_size || 4704 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 4705 ((state^info->state) & 0xfffffe00) 4706 ) 4707 return -EINVAL; 4708 /* Check there is only one change */ 4709 if (info->size >= 0 && mddev->size != info->size) cnt++; 4710 if (mddev->raid_disks != info->raid_disks) cnt++; 4711 if (mddev->layout != info->layout) cnt++; 4712 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++; 4713 if (cnt == 0) return 0; 4714 if (cnt > 1) return -EINVAL; 4715 4716 if (mddev->layout != info->layout) { 4717 /* Change layout 4718 * we don't need to do anything at the md level, the 4719 * personality will take care of it all. 4720 */ 4721 if (mddev->pers->reconfig == NULL) 4722 return -EINVAL; 4723 else 4724 return mddev->pers->reconfig(mddev, info->layout, -1); 4725 } 4726 if (info->size >= 0 && mddev->size != info->size) 4727 rv = update_size(mddev, (sector_t)info->size * 2); 4728 4729 if (mddev->raid_disks != info->raid_disks) 4730 rv = update_raid_disks(mddev, info->raid_disks); 4731 4732 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 4733 if (mddev->pers->quiesce == NULL) 4734 return -EINVAL; 4735 if (mddev->recovery || mddev->sync_thread) 4736 return -EBUSY; 4737 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 4738 /* add the bitmap */ 4739 if (mddev->bitmap) 4740 return -EEXIST; 4741 if (mddev->default_bitmap_offset == 0) 4742 return -EINVAL; 4743 mddev->bitmap_offset = mddev->default_bitmap_offset; 4744 mddev->pers->quiesce(mddev, 1); 4745 rv = bitmap_create(mddev); 4746 if (rv) 4747 bitmap_destroy(mddev); 4748 mddev->pers->quiesce(mddev, 0); 4749 } else { 4750 /* remove the bitmap */ 4751 if (!mddev->bitmap) 4752 return -ENOENT; 4753 if (mddev->bitmap->file) 4754 return -EINVAL; 4755 mddev->pers->quiesce(mddev, 1); 4756 bitmap_destroy(mddev); 4757 mddev->pers->quiesce(mddev, 0); 4758 mddev->bitmap_offset = 0; 4759 } 4760 } 4761 md_update_sb(mddev, 1); 4762 return rv; 4763 } 4764 4765 static int set_disk_faulty(mddev_t *mddev, dev_t dev) 4766 { 4767 mdk_rdev_t *rdev; 4768 4769 if (mddev->pers == NULL) 4770 return -ENODEV; 4771 4772 rdev = find_rdev(mddev, dev); 4773 if (!rdev) 4774 return -ENODEV; 4775 4776 md_error(mddev, rdev); 4777 return 0; 4778 } 4779 4780 /* 4781 * We have a problem here : there is no easy way to give a CHS 4782 * virtual geometry. We currently pretend that we have a 2 heads 4783 * 4 sectors (with a BIG number of cylinders...). This drives 4784 * dosfs just mad... ;-) 4785 */ 4786 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 4787 { 4788 mddev_t *mddev = bdev->bd_disk->private_data; 4789 4790 geo->heads = 2; 4791 geo->sectors = 4; 4792 geo->cylinders = get_capacity(mddev->gendisk) / 8; 4793 return 0; 4794 } 4795 4796 static int md_ioctl(struct inode *inode, struct file *file, 4797 unsigned int cmd, unsigned long arg) 4798 { 4799 int err = 0; 4800 void __user *argp = (void __user *)arg; 4801 mddev_t *mddev = NULL; 4802 4803 if (!capable(CAP_SYS_ADMIN)) 4804 return -EACCES; 4805 4806 /* 4807 * Commands dealing with the RAID driver but not any 4808 * particular array: 4809 */ 4810 switch (cmd) 4811 { 4812 case RAID_VERSION: 4813 err = get_version(argp); 4814 goto done; 4815 4816 case PRINT_RAID_DEBUG: 4817 err = 0; 4818 md_print_devices(); 4819 goto done; 4820 4821 #ifndef MODULE 4822 case RAID_AUTORUN: 4823 err = 0; 4824 autostart_arrays(arg); 4825 goto done; 4826 #endif 4827 default:; 4828 } 4829 4830 /* 4831 * Commands creating/starting a new array: 4832 */ 4833 4834 mddev = inode->i_bdev->bd_disk->private_data; 4835 4836 if (!mddev) { 4837 BUG(); 4838 goto abort; 4839 } 4840 4841 err = mddev_lock(mddev); 4842 if (err) { 4843 printk(KERN_INFO 4844 "md: ioctl lock interrupted, reason %d, cmd %d\n", 4845 err, cmd); 4846 goto abort; 4847 } 4848 4849 switch (cmd) 4850 { 4851 case SET_ARRAY_INFO: 4852 { 4853 mdu_array_info_t info; 4854 if (!arg) 4855 memset(&info, 0, sizeof(info)); 4856 else if (copy_from_user(&info, argp, sizeof(info))) { 4857 err = -EFAULT; 4858 goto abort_unlock; 4859 } 4860 if (mddev->pers) { 4861 err = update_array_info(mddev, &info); 4862 if (err) { 4863 printk(KERN_WARNING "md: couldn't update" 4864 " array info. %d\n", err); 4865 goto abort_unlock; 4866 } 4867 goto done_unlock; 4868 } 4869 if (!list_empty(&mddev->disks)) { 4870 printk(KERN_WARNING 4871 "md: array %s already has disks!\n", 4872 mdname(mddev)); 4873 err = -EBUSY; 4874 goto abort_unlock; 4875 } 4876 if (mddev->raid_disks) { 4877 printk(KERN_WARNING 4878 "md: array %s already initialised!\n", 4879 mdname(mddev)); 4880 err = -EBUSY; 4881 goto abort_unlock; 4882 } 4883 err = set_array_info(mddev, &info); 4884 if (err) { 4885 printk(KERN_WARNING "md: couldn't set" 4886 " array info. %d\n", err); 4887 goto abort_unlock; 4888 } 4889 } 4890 goto done_unlock; 4891 4892 default:; 4893 } 4894 4895 /* 4896 * Commands querying/configuring an existing array: 4897 */ 4898 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 4899 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 4900 if ((!mddev->raid_disks && !mddev->external) 4901 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 4902 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 4903 && cmd != GET_BITMAP_FILE) { 4904 err = -ENODEV; 4905 goto abort_unlock; 4906 } 4907 4908 /* 4909 * Commands even a read-only array can execute: 4910 */ 4911 switch (cmd) 4912 { 4913 case GET_ARRAY_INFO: 4914 err = get_array_info(mddev, argp); 4915 goto done_unlock; 4916 4917 case GET_BITMAP_FILE: 4918 err = get_bitmap_file(mddev, argp); 4919 goto done_unlock; 4920 4921 case GET_DISK_INFO: 4922 err = get_disk_info(mddev, argp); 4923 goto done_unlock; 4924 4925 case RESTART_ARRAY_RW: 4926 err = restart_array(mddev); 4927 goto done_unlock; 4928 4929 case STOP_ARRAY: 4930 err = do_md_stop (mddev, 0, 1); 4931 goto done_unlock; 4932 4933 case STOP_ARRAY_RO: 4934 err = do_md_stop (mddev, 1, 1); 4935 goto done_unlock; 4936 4937 } 4938 4939 /* 4940 * The remaining ioctls are changing the state of the 4941 * superblock, so we do not allow them on read-only arrays. 4942 * However non-MD ioctls (e.g. get-size) will still come through 4943 * here and hit the 'default' below, so only disallow 4944 * 'md' ioctls, and switch to rw mode if started auto-readonly. 4945 */ 4946 if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) { 4947 if (mddev->ro == 2) { 4948 mddev->ro = 0; 4949 sysfs_notify(&mddev->kobj, NULL, "array_state"); 4950 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4951 md_wakeup_thread(mddev->thread); 4952 } else { 4953 err = -EROFS; 4954 goto abort_unlock; 4955 } 4956 } 4957 4958 switch (cmd) 4959 { 4960 case ADD_NEW_DISK: 4961 { 4962 mdu_disk_info_t info; 4963 if (copy_from_user(&info, argp, sizeof(info))) 4964 err = -EFAULT; 4965 else 4966 err = add_new_disk(mddev, &info); 4967 goto done_unlock; 4968 } 4969 4970 case HOT_REMOVE_DISK: 4971 err = hot_remove_disk(mddev, new_decode_dev(arg)); 4972 goto done_unlock; 4973 4974 case HOT_ADD_DISK: 4975 err = hot_add_disk(mddev, new_decode_dev(arg)); 4976 goto done_unlock; 4977 4978 case SET_DISK_FAULTY: 4979 err = set_disk_faulty(mddev, new_decode_dev(arg)); 4980 goto done_unlock; 4981 4982 case RUN_ARRAY: 4983 err = do_md_run (mddev); 4984 goto done_unlock; 4985 4986 case SET_BITMAP_FILE: 4987 err = set_bitmap_file(mddev, (int)arg); 4988 goto done_unlock; 4989 4990 default: 4991 err = -EINVAL; 4992 goto abort_unlock; 4993 } 4994 4995 done_unlock: 4996 abort_unlock: 4997 mddev_unlock(mddev); 4998 4999 return err; 5000 done: 5001 if (err) 5002 MD_BUG(); 5003 abort: 5004 return err; 5005 } 5006 5007 static int md_open(struct inode *inode, struct file *file) 5008 { 5009 /* 5010 * Succeed if we can lock the mddev, which confirms that 5011 * it isn't being stopped right now. 5012 */ 5013 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 5014 int err; 5015 5016 if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1))) 5017 goto out; 5018 5019 err = 0; 5020 mddev_get(mddev); 5021 atomic_inc(&mddev->openers); 5022 mddev_unlock(mddev); 5023 5024 check_disk_change(inode->i_bdev); 5025 out: 5026 return err; 5027 } 5028 5029 static int md_release(struct inode *inode, struct file * file) 5030 { 5031 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 5032 5033 BUG_ON(!mddev); 5034 atomic_dec(&mddev->openers); 5035 mddev_put(mddev); 5036 5037 return 0; 5038 } 5039 5040 static int md_media_changed(struct gendisk *disk) 5041 { 5042 mddev_t *mddev = disk->private_data; 5043 5044 return mddev->changed; 5045 } 5046 5047 static int md_revalidate(struct gendisk *disk) 5048 { 5049 mddev_t *mddev = disk->private_data; 5050 5051 mddev->changed = 0; 5052 return 0; 5053 } 5054 static struct block_device_operations md_fops = 5055 { 5056 .owner = THIS_MODULE, 5057 .open = md_open, 5058 .release = md_release, 5059 .ioctl = md_ioctl, 5060 .getgeo = md_getgeo, 5061 .media_changed = md_media_changed, 5062 .revalidate_disk= md_revalidate, 5063 }; 5064 5065 static int md_thread(void * arg) 5066 { 5067 mdk_thread_t *thread = arg; 5068 5069 /* 5070 * md_thread is a 'system-thread', it's priority should be very 5071 * high. We avoid resource deadlocks individually in each 5072 * raid personality. (RAID5 does preallocation) We also use RR and 5073 * the very same RT priority as kswapd, thus we will never get 5074 * into a priority inversion deadlock. 5075 * 5076 * we definitely have to have equal or higher priority than 5077 * bdflush, otherwise bdflush will deadlock if there are too 5078 * many dirty RAID5 blocks. 5079 */ 5080 5081 allow_signal(SIGKILL); 5082 while (!kthread_should_stop()) { 5083 5084 /* We need to wait INTERRUPTIBLE so that 5085 * we don't add to the load-average. 5086 * That means we need to be sure no signals are 5087 * pending 5088 */ 5089 if (signal_pending(current)) 5090 flush_signals(current); 5091 5092 wait_event_interruptible_timeout 5093 (thread->wqueue, 5094 test_bit(THREAD_WAKEUP, &thread->flags) 5095 || kthread_should_stop(), 5096 thread->timeout); 5097 5098 clear_bit(THREAD_WAKEUP, &thread->flags); 5099 5100 thread->run(thread->mddev); 5101 } 5102 5103 return 0; 5104 } 5105 5106 void md_wakeup_thread(mdk_thread_t *thread) 5107 { 5108 if (thread) { 5109 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm); 5110 set_bit(THREAD_WAKEUP, &thread->flags); 5111 wake_up(&thread->wqueue); 5112 } 5113 } 5114 5115 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, 5116 const char *name) 5117 { 5118 mdk_thread_t *thread; 5119 5120 thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL); 5121 if (!thread) 5122 return NULL; 5123 5124 init_waitqueue_head(&thread->wqueue); 5125 5126 thread->run = run; 5127 thread->mddev = mddev; 5128 thread->timeout = MAX_SCHEDULE_TIMEOUT; 5129 thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev)); 5130 if (IS_ERR(thread->tsk)) { 5131 kfree(thread); 5132 return NULL; 5133 } 5134 return thread; 5135 } 5136 5137 void md_unregister_thread(mdk_thread_t *thread) 5138 { 5139 dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 5140 5141 kthread_stop(thread->tsk); 5142 kfree(thread); 5143 } 5144 5145 void md_error(mddev_t *mddev, mdk_rdev_t *rdev) 5146 { 5147 if (!mddev) { 5148 MD_BUG(); 5149 return; 5150 } 5151 5152 if (!rdev || test_bit(Faulty, &rdev->flags)) 5153 return; 5154 5155 if (mddev->external) 5156 set_bit(Blocked, &rdev->flags); 5157 /* 5158 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 5159 mdname(mddev), 5160 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), 5161 __builtin_return_address(0),__builtin_return_address(1), 5162 __builtin_return_address(2),__builtin_return_address(3)); 5163 */ 5164 if (!mddev->pers) 5165 return; 5166 if (!mddev->pers->error_handler) 5167 return; 5168 mddev->pers->error_handler(mddev,rdev); 5169 if (mddev->degraded) 5170 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 5171 set_bit(StateChanged, &rdev->flags); 5172 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5173 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5174 md_wakeup_thread(mddev->thread); 5175 md_new_event_inintr(mddev); 5176 } 5177 5178 /* seq_file implementation /proc/mdstat */ 5179 5180 static void status_unused(struct seq_file *seq) 5181 { 5182 int i = 0; 5183 mdk_rdev_t *rdev; 5184 struct list_head *tmp; 5185 5186 seq_printf(seq, "unused devices: "); 5187 5188 rdev_for_each_list(rdev, tmp, pending_raid_disks) { 5189 char b[BDEVNAME_SIZE]; 5190 i++; 5191 seq_printf(seq, "%s ", 5192 bdevname(rdev->bdev,b)); 5193 } 5194 if (!i) 5195 seq_printf(seq, "<none>"); 5196 5197 seq_printf(seq, "\n"); 5198 } 5199 5200 5201 static void status_resync(struct seq_file *seq, mddev_t * mddev) 5202 { 5203 sector_t max_blocks, resync, res; 5204 unsigned long dt, db, rt; 5205 int scale; 5206 unsigned int per_milli; 5207 5208 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; 5209 5210 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 5211 max_blocks = mddev->resync_max_sectors >> 1; 5212 else 5213 max_blocks = mddev->size; 5214 5215 /* 5216 * Should not happen. 5217 */ 5218 if (!max_blocks) { 5219 MD_BUG(); 5220 return; 5221 } 5222 /* Pick 'scale' such that (resync>>scale)*1000 will fit 5223 * in a sector_t, and (max_blocks>>scale) will fit in a 5224 * u32, as those are the requirements for sector_div. 5225 * Thus 'scale' must be at least 10 5226 */ 5227 scale = 10; 5228 if (sizeof(sector_t) > sizeof(unsigned long)) { 5229 while ( max_blocks/2 > (1ULL<<(scale+32))) 5230 scale++; 5231 } 5232 res = (resync>>scale)*1000; 5233 sector_div(res, (u32)((max_blocks>>scale)+1)); 5234 5235 per_milli = res; 5236 { 5237 int i, x = per_milli/50, y = 20-x; 5238 seq_printf(seq, "["); 5239 for (i = 0; i < x; i++) 5240 seq_printf(seq, "="); 5241 seq_printf(seq, ">"); 5242 for (i = 0; i < y; i++) 5243 seq_printf(seq, "."); 5244 seq_printf(seq, "] "); 5245 } 5246 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 5247 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 5248 "reshape" : 5249 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 5250 "check" : 5251 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 5252 "resync" : "recovery"))), 5253 per_milli/10, per_milli % 10, 5254 (unsigned long long) resync, 5255 (unsigned long long) max_blocks); 5256 5257 /* 5258 * We do not want to overflow, so the order of operands and 5259 * the * 100 / 100 trick are important. We do a +1 to be 5260 * safe against division by zero. We only estimate anyway. 5261 * 5262 * dt: time from mark until now 5263 * db: blocks written from mark until now 5264 * rt: remaining time 5265 */ 5266 dt = ((jiffies - mddev->resync_mark) / HZ); 5267 if (!dt) dt++; 5268 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) 5269 - mddev->resync_mark_cnt; 5270 rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100; 5271 5272 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); 5273 5274 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 5275 } 5276 5277 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 5278 { 5279 struct list_head *tmp; 5280 loff_t l = *pos; 5281 mddev_t *mddev; 5282 5283 if (l >= 0x10000) 5284 return NULL; 5285 if (!l--) 5286 /* header */ 5287 return (void*)1; 5288 5289 spin_lock(&all_mddevs_lock); 5290 list_for_each(tmp,&all_mddevs) 5291 if (!l--) { 5292 mddev = list_entry(tmp, mddev_t, all_mddevs); 5293 mddev_get(mddev); 5294 spin_unlock(&all_mddevs_lock); 5295 return mddev; 5296 } 5297 spin_unlock(&all_mddevs_lock); 5298 if (!l--) 5299 return (void*)2;/* tail */ 5300 return NULL; 5301 } 5302 5303 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 5304 { 5305 struct list_head *tmp; 5306 mddev_t *next_mddev, *mddev = v; 5307 5308 ++*pos; 5309 if (v == (void*)2) 5310 return NULL; 5311 5312 spin_lock(&all_mddevs_lock); 5313 if (v == (void*)1) 5314 tmp = all_mddevs.next; 5315 else 5316 tmp = mddev->all_mddevs.next; 5317 if (tmp != &all_mddevs) 5318 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); 5319 else { 5320 next_mddev = (void*)2; 5321 *pos = 0x10000; 5322 } 5323 spin_unlock(&all_mddevs_lock); 5324 5325 if (v != (void*)1) 5326 mddev_put(mddev); 5327 return next_mddev; 5328 5329 } 5330 5331 static void md_seq_stop(struct seq_file *seq, void *v) 5332 { 5333 mddev_t *mddev = v; 5334 5335 if (mddev && v != (void*)1 && v != (void*)2) 5336 mddev_put(mddev); 5337 } 5338 5339 struct mdstat_info { 5340 int event; 5341 }; 5342 5343 static int md_seq_show(struct seq_file *seq, void *v) 5344 { 5345 mddev_t *mddev = v; 5346 sector_t size; 5347 struct list_head *tmp2; 5348 mdk_rdev_t *rdev; 5349 struct mdstat_info *mi = seq->private; 5350 struct bitmap *bitmap; 5351 5352 if (v == (void*)1) { 5353 struct mdk_personality *pers; 5354 seq_printf(seq, "Personalities : "); 5355 spin_lock(&pers_lock); 5356 list_for_each_entry(pers, &pers_list, list) 5357 seq_printf(seq, "[%s] ", pers->name); 5358 5359 spin_unlock(&pers_lock); 5360 seq_printf(seq, "\n"); 5361 mi->event = atomic_read(&md_event_count); 5362 return 0; 5363 } 5364 if (v == (void*)2) { 5365 status_unused(seq); 5366 return 0; 5367 } 5368 5369 if (mddev_lock(mddev) < 0) 5370 return -EINTR; 5371 5372 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 5373 seq_printf(seq, "%s : %sactive", mdname(mddev), 5374 mddev->pers ? "" : "in"); 5375 if (mddev->pers) { 5376 if (mddev->ro==1) 5377 seq_printf(seq, " (read-only)"); 5378 if (mddev->ro==2) 5379 seq_printf(seq, " (auto-read-only)"); 5380 seq_printf(seq, " %s", mddev->pers->name); 5381 } 5382 5383 size = 0; 5384 rdev_for_each(rdev, tmp2, mddev) { 5385 char b[BDEVNAME_SIZE]; 5386 seq_printf(seq, " %s[%d]", 5387 bdevname(rdev->bdev,b), rdev->desc_nr); 5388 if (test_bit(WriteMostly, &rdev->flags)) 5389 seq_printf(seq, "(W)"); 5390 if (test_bit(Faulty, &rdev->flags)) { 5391 seq_printf(seq, "(F)"); 5392 continue; 5393 } else if (rdev->raid_disk < 0) 5394 seq_printf(seq, "(S)"); /* spare */ 5395 size += rdev->size; 5396 } 5397 5398 if (!list_empty(&mddev->disks)) { 5399 if (mddev->pers) 5400 seq_printf(seq, "\n %llu blocks", 5401 (unsigned long long) 5402 mddev->array_sectors / 2); 5403 else 5404 seq_printf(seq, "\n %llu blocks", 5405 (unsigned long long)size); 5406 } 5407 if (mddev->persistent) { 5408 if (mddev->major_version != 0 || 5409 mddev->minor_version != 90) { 5410 seq_printf(seq," super %d.%d", 5411 mddev->major_version, 5412 mddev->minor_version); 5413 } 5414 } else if (mddev->external) 5415 seq_printf(seq, " super external:%s", 5416 mddev->metadata_type); 5417 else 5418 seq_printf(seq, " super non-persistent"); 5419 5420 if (mddev->pers) { 5421 mddev->pers->status (seq, mddev); 5422 seq_printf(seq, "\n "); 5423 if (mddev->pers->sync_request) { 5424 if (mddev->curr_resync > 2) { 5425 status_resync (seq, mddev); 5426 seq_printf(seq, "\n "); 5427 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 5428 seq_printf(seq, "\tresync=DELAYED\n "); 5429 else if (mddev->recovery_cp < MaxSector) 5430 seq_printf(seq, "\tresync=PENDING\n "); 5431 } 5432 } else 5433 seq_printf(seq, "\n "); 5434 5435 if ((bitmap = mddev->bitmap)) { 5436 unsigned long chunk_kb; 5437 unsigned long flags; 5438 spin_lock_irqsave(&bitmap->lock, flags); 5439 chunk_kb = bitmap->chunksize >> 10; 5440 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " 5441 "%lu%s chunk", 5442 bitmap->pages - bitmap->missing_pages, 5443 bitmap->pages, 5444 (bitmap->pages - bitmap->missing_pages) 5445 << (PAGE_SHIFT - 10), 5446 chunk_kb ? chunk_kb : bitmap->chunksize, 5447 chunk_kb ? "KB" : "B"); 5448 if (bitmap->file) { 5449 seq_printf(seq, ", file: "); 5450 seq_path(seq, &bitmap->file->f_path, " \t\n"); 5451 } 5452 5453 seq_printf(seq, "\n"); 5454 spin_unlock_irqrestore(&bitmap->lock, flags); 5455 } 5456 5457 seq_printf(seq, "\n"); 5458 } 5459 mddev_unlock(mddev); 5460 5461 return 0; 5462 } 5463 5464 static struct seq_operations md_seq_ops = { 5465 .start = md_seq_start, 5466 .next = md_seq_next, 5467 .stop = md_seq_stop, 5468 .show = md_seq_show, 5469 }; 5470 5471 static int md_seq_open(struct inode *inode, struct file *file) 5472 { 5473 int error; 5474 struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL); 5475 if (mi == NULL) 5476 return -ENOMEM; 5477 5478 error = seq_open(file, &md_seq_ops); 5479 if (error) 5480 kfree(mi); 5481 else { 5482 struct seq_file *p = file->private_data; 5483 p->private = mi; 5484 mi->event = atomic_read(&md_event_count); 5485 } 5486 return error; 5487 } 5488 5489 static unsigned int mdstat_poll(struct file *filp, poll_table *wait) 5490 { 5491 struct seq_file *m = filp->private_data; 5492 struct mdstat_info *mi = m->private; 5493 int mask; 5494 5495 poll_wait(filp, &md_event_waiters, wait); 5496 5497 /* always allow read */ 5498 mask = POLLIN | POLLRDNORM; 5499 5500 if (mi->event != atomic_read(&md_event_count)) 5501 mask |= POLLERR | POLLPRI; 5502 return mask; 5503 } 5504 5505 static const struct file_operations md_seq_fops = { 5506 .owner = THIS_MODULE, 5507 .open = md_seq_open, 5508 .read = seq_read, 5509 .llseek = seq_lseek, 5510 .release = seq_release_private, 5511 .poll = mdstat_poll, 5512 }; 5513 5514 int register_md_personality(struct mdk_personality *p) 5515 { 5516 spin_lock(&pers_lock); 5517 list_add_tail(&p->list, &pers_list); 5518 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level); 5519 spin_unlock(&pers_lock); 5520 return 0; 5521 } 5522 5523 int unregister_md_personality(struct mdk_personality *p) 5524 { 5525 printk(KERN_INFO "md: %s personality unregistered\n", p->name); 5526 spin_lock(&pers_lock); 5527 list_del_init(&p->list); 5528 spin_unlock(&pers_lock); 5529 return 0; 5530 } 5531 5532 static int is_mddev_idle(mddev_t *mddev) 5533 { 5534 mdk_rdev_t * rdev; 5535 int idle; 5536 long curr_events; 5537 5538 idle = 1; 5539 rcu_read_lock(); 5540 rdev_for_each_rcu(rdev, mddev) { 5541 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 5542 curr_events = part_stat_read(&disk->part0, sectors[0]) + 5543 part_stat_read(&disk->part0, sectors[1]) - 5544 atomic_read(&disk->sync_io); 5545 /* sync IO will cause sync_io to increase before the disk_stats 5546 * as sync_io is counted when a request starts, and 5547 * disk_stats is counted when it completes. 5548 * So resync activity will cause curr_events to be smaller than 5549 * when there was no such activity. 5550 * non-sync IO will cause disk_stat to increase without 5551 * increasing sync_io so curr_events will (eventually) 5552 * be larger than it was before. Once it becomes 5553 * substantially larger, the test below will cause 5554 * the array to appear non-idle, and resync will slow 5555 * down. 5556 * If there is a lot of outstanding resync activity when 5557 * we set last_event to curr_events, then all that activity 5558 * completing might cause the array to appear non-idle 5559 * and resync will be slowed down even though there might 5560 * not have been non-resync activity. This will only 5561 * happen once though. 'last_events' will soon reflect 5562 * the state where there is little or no outstanding 5563 * resync requests, and further resync activity will 5564 * always make curr_events less than last_events. 5565 * 5566 */ 5567 if (curr_events - rdev->last_events > 4096) { 5568 rdev->last_events = curr_events; 5569 idle = 0; 5570 } 5571 } 5572 rcu_read_unlock(); 5573 return idle; 5574 } 5575 5576 void md_done_sync(mddev_t *mddev, int blocks, int ok) 5577 { 5578 /* another "blocks" (512byte) blocks have been synced */ 5579 atomic_sub(blocks, &mddev->recovery_active); 5580 wake_up(&mddev->recovery_wait); 5581 if (!ok) { 5582 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5583 md_wakeup_thread(mddev->thread); 5584 // stop recovery, signal do_sync .... 5585 } 5586 } 5587 5588 5589 /* md_write_start(mddev, bi) 5590 * If we need to update some array metadata (e.g. 'active' flag 5591 * in superblock) before writing, schedule a superblock update 5592 * and wait for it to complete. 5593 */ 5594 void md_write_start(mddev_t *mddev, struct bio *bi) 5595 { 5596 int did_change = 0; 5597 if (bio_data_dir(bi) != WRITE) 5598 return; 5599 5600 BUG_ON(mddev->ro == 1); 5601 if (mddev->ro == 2) { 5602 /* need to switch to read/write */ 5603 mddev->ro = 0; 5604 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5605 md_wakeup_thread(mddev->thread); 5606 md_wakeup_thread(mddev->sync_thread); 5607 did_change = 1; 5608 } 5609 atomic_inc(&mddev->writes_pending); 5610 if (mddev->safemode == 1) 5611 mddev->safemode = 0; 5612 if (mddev->in_sync) { 5613 spin_lock_irq(&mddev->write_lock); 5614 if (mddev->in_sync) { 5615 mddev->in_sync = 0; 5616 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 5617 md_wakeup_thread(mddev->thread); 5618 did_change = 1; 5619 } 5620 spin_unlock_irq(&mddev->write_lock); 5621 } 5622 if (did_change) 5623 sysfs_notify(&mddev->kobj, NULL, "array_state"); 5624 wait_event(mddev->sb_wait, 5625 !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && 5626 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 5627 } 5628 5629 void md_write_end(mddev_t *mddev) 5630 { 5631 if (atomic_dec_and_test(&mddev->writes_pending)) { 5632 if (mddev->safemode == 2) 5633 md_wakeup_thread(mddev->thread); 5634 else if (mddev->safemode_delay) 5635 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); 5636 } 5637 } 5638 5639 /* md_allow_write(mddev) 5640 * Calling this ensures that the array is marked 'active' so that writes 5641 * may proceed without blocking. It is important to call this before 5642 * attempting a GFP_KERNEL allocation while holding the mddev lock. 5643 * Must be called with mddev_lock held. 5644 * 5645 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock 5646 * is dropped, so return -EAGAIN after notifying userspace. 5647 */ 5648 int md_allow_write(mddev_t *mddev) 5649 { 5650 if (!mddev->pers) 5651 return 0; 5652 if (mddev->ro) 5653 return 0; 5654 if (!mddev->pers->sync_request) 5655 return 0; 5656 5657 spin_lock_irq(&mddev->write_lock); 5658 if (mddev->in_sync) { 5659 mddev->in_sync = 0; 5660 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 5661 if (mddev->safemode_delay && 5662 mddev->safemode == 0) 5663 mddev->safemode = 1; 5664 spin_unlock_irq(&mddev->write_lock); 5665 md_update_sb(mddev, 0); 5666 sysfs_notify(&mddev->kobj, NULL, "array_state"); 5667 } else 5668 spin_unlock_irq(&mddev->write_lock); 5669 5670 if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) 5671 return -EAGAIN; 5672 else 5673 return 0; 5674 } 5675 EXPORT_SYMBOL_GPL(md_allow_write); 5676 5677 #define SYNC_MARKS 10 5678 #define SYNC_MARK_STEP (3*HZ) 5679 void md_do_sync(mddev_t *mddev) 5680 { 5681 mddev_t *mddev2; 5682 unsigned int currspeed = 0, 5683 window; 5684 sector_t max_sectors,j, io_sectors; 5685 unsigned long mark[SYNC_MARKS]; 5686 sector_t mark_cnt[SYNC_MARKS]; 5687 int last_mark,m; 5688 struct list_head *tmp; 5689 sector_t last_check; 5690 int skipped = 0; 5691 struct list_head *rtmp; 5692 mdk_rdev_t *rdev; 5693 char *desc; 5694 5695 /* just incase thread restarts... */ 5696 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 5697 return; 5698 if (mddev->ro) /* never try to sync a read-only array */ 5699 return; 5700 5701 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5702 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 5703 desc = "data-check"; 5704 else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 5705 desc = "requested-resync"; 5706 else 5707 desc = "resync"; 5708 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5709 desc = "reshape"; 5710 else 5711 desc = "recovery"; 5712 5713 /* we overload curr_resync somewhat here. 5714 * 0 == not engaged in resync at all 5715 * 2 == checking that there is no conflict with another sync 5716 * 1 == like 2, but have yielded to allow conflicting resync to 5717 * commense 5718 * other == active in resync - this many blocks 5719 * 5720 * Before starting a resync we must have set curr_resync to 5721 * 2, and then checked that every "conflicting" array has curr_resync 5722 * less than ours. When we find one that is the same or higher 5723 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 5724 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 5725 * This will mean we have to start checking from the beginning again. 5726 * 5727 */ 5728 5729 do { 5730 mddev->curr_resync = 2; 5731 5732 try_again: 5733 if (kthread_should_stop()) { 5734 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5735 goto skip; 5736 } 5737 for_each_mddev(mddev2, tmp) { 5738 if (mddev2 == mddev) 5739 continue; 5740 if (!mddev->parallel_resync 5741 && mddev2->curr_resync 5742 && match_mddev_units(mddev, mddev2)) { 5743 DEFINE_WAIT(wq); 5744 if (mddev < mddev2 && mddev->curr_resync == 2) { 5745 /* arbitrarily yield */ 5746 mddev->curr_resync = 1; 5747 wake_up(&resync_wait); 5748 } 5749 if (mddev > mddev2 && mddev->curr_resync == 1) 5750 /* no need to wait here, we can wait the next 5751 * time 'round when curr_resync == 2 5752 */ 5753 continue; 5754 /* We need to wait 'interruptible' so as not to 5755 * contribute to the load average, and not to 5756 * be caught by 'softlockup' 5757 */ 5758 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 5759 if (!kthread_should_stop() && 5760 mddev2->curr_resync >= mddev->curr_resync) { 5761 printk(KERN_INFO "md: delaying %s of %s" 5762 " until %s has finished (they" 5763 " share one or more physical units)\n", 5764 desc, mdname(mddev), mdname(mddev2)); 5765 mddev_put(mddev2); 5766 if (signal_pending(current)) 5767 flush_signals(current); 5768 schedule(); 5769 finish_wait(&resync_wait, &wq); 5770 goto try_again; 5771 } 5772 finish_wait(&resync_wait, &wq); 5773 } 5774 } 5775 } while (mddev->curr_resync < 2); 5776 5777 j = 0; 5778 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5779 /* resync follows the size requested by the personality, 5780 * which defaults to physical size, but can be virtual size 5781 */ 5782 max_sectors = mddev->resync_max_sectors; 5783 mddev->resync_mismatches = 0; 5784 /* we don't use the checkpoint if there's a bitmap */ 5785 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 5786 j = mddev->resync_min; 5787 else if (!mddev->bitmap) 5788 j = mddev->recovery_cp; 5789 5790 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5791 max_sectors = mddev->size << 1; 5792 else { 5793 /* recovery follows the physical size of devices */ 5794 max_sectors = mddev->size << 1; 5795 j = MaxSector; 5796 rdev_for_each(rdev, rtmp, mddev) 5797 if (rdev->raid_disk >= 0 && 5798 !test_bit(Faulty, &rdev->flags) && 5799 !test_bit(In_sync, &rdev->flags) && 5800 rdev->recovery_offset < j) 5801 j = rdev->recovery_offset; 5802 } 5803 5804 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); 5805 printk(KERN_INFO "md: minimum _guaranteed_ speed:" 5806 " %d KB/sec/disk.\n", speed_min(mddev)); 5807 printk(KERN_INFO "md: using maximum available idle IO bandwidth " 5808 "(but not more than %d KB/sec) for %s.\n", 5809 speed_max(mddev), desc); 5810 5811 is_mddev_idle(mddev); /* this also initializes IO event counters */ 5812 5813 io_sectors = 0; 5814 for (m = 0; m < SYNC_MARKS; m++) { 5815 mark[m] = jiffies; 5816 mark_cnt[m] = io_sectors; 5817 } 5818 last_mark = 0; 5819 mddev->resync_mark = mark[last_mark]; 5820 mddev->resync_mark_cnt = mark_cnt[last_mark]; 5821 5822 /* 5823 * Tune reconstruction: 5824 */ 5825 window = 32*(PAGE_SIZE/512); 5826 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", 5827 window/2,(unsigned long long) max_sectors/2); 5828 5829 atomic_set(&mddev->recovery_active, 0); 5830 last_check = 0; 5831 5832 if (j>2) { 5833 printk(KERN_INFO 5834 "md: resuming %s of %s from checkpoint.\n", 5835 desc, mdname(mddev)); 5836 mddev->curr_resync = j; 5837 } 5838 5839 while (j < max_sectors) { 5840 sector_t sectors; 5841 5842 skipped = 0; 5843 if (j >= mddev->resync_max) { 5844 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5845 wait_event(mddev->recovery_wait, 5846 mddev->resync_max > j 5847 || kthread_should_stop()); 5848 } 5849 if (kthread_should_stop()) 5850 goto interrupted; 5851 sectors = mddev->pers->sync_request(mddev, j, &skipped, 5852 currspeed < speed_min(mddev)); 5853 if (sectors == 0) { 5854 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5855 goto out; 5856 } 5857 5858 if (!skipped) { /* actual IO requested */ 5859 io_sectors += sectors; 5860 atomic_add(sectors, &mddev->recovery_active); 5861 } 5862 5863 j += sectors; 5864 if (j>1) mddev->curr_resync = j; 5865 mddev->curr_mark_cnt = io_sectors; 5866 if (last_check == 0) 5867 /* this is the earliers that rebuilt will be 5868 * visible in /proc/mdstat 5869 */ 5870 md_new_event(mddev); 5871 5872 if (last_check + window > io_sectors || j == max_sectors) 5873 continue; 5874 5875 last_check = io_sectors; 5876 5877 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5878 break; 5879 5880 repeat: 5881 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 5882 /* step marks */ 5883 int next = (last_mark+1) % SYNC_MARKS; 5884 5885 mddev->resync_mark = mark[next]; 5886 mddev->resync_mark_cnt = mark_cnt[next]; 5887 mark[next] = jiffies; 5888 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 5889 last_mark = next; 5890 } 5891 5892 5893 if (kthread_should_stop()) 5894 goto interrupted; 5895 5896 5897 /* 5898 * this loop exits only if either when we are slower than 5899 * the 'hard' speed limit, or the system was IO-idle for 5900 * a jiffy. 5901 * the system might be non-idle CPU-wise, but we only care 5902 * about not overloading the IO subsystem. (things like an 5903 * e2fsck being done on the RAID array should execute fast) 5904 */ 5905 blk_unplug(mddev->queue); 5906 cond_resched(); 5907 5908 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 5909 /((jiffies-mddev->resync_mark)/HZ +1) +1; 5910 5911 if (currspeed > speed_min(mddev)) { 5912 if ((currspeed > speed_max(mddev)) || 5913 !is_mddev_idle(mddev)) { 5914 msleep(500); 5915 goto repeat; 5916 } 5917 } 5918 } 5919 printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); 5920 /* 5921 * this also signals 'finished resyncing' to md_stop 5922 */ 5923 out: 5924 blk_unplug(mddev->queue); 5925 5926 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 5927 5928 /* tell personality that we are finished */ 5929 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); 5930 5931 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 5932 mddev->curr_resync > 2) { 5933 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5934 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 5935 if (mddev->curr_resync >= mddev->recovery_cp) { 5936 printk(KERN_INFO 5937 "md: checkpointing %s of %s.\n", 5938 desc, mdname(mddev)); 5939 mddev->recovery_cp = mddev->curr_resync; 5940 } 5941 } else 5942 mddev->recovery_cp = MaxSector; 5943 } else { 5944 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5945 mddev->curr_resync = MaxSector; 5946 rdev_for_each(rdev, rtmp, mddev) 5947 if (rdev->raid_disk >= 0 && 5948 !test_bit(Faulty, &rdev->flags) && 5949 !test_bit(In_sync, &rdev->flags) && 5950 rdev->recovery_offset < mddev->curr_resync) 5951 rdev->recovery_offset = mddev->curr_resync; 5952 } 5953 } 5954 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5955 5956 skip: 5957 mddev->curr_resync = 0; 5958 mddev->resync_min = 0; 5959 mddev->resync_max = MaxSector; 5960 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5961 wake_up(&resync_wait); 5962 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 5963 md_wakeup_thread(mddev->thread); 5964 return; 5965 5966 interrupted: 5967 /* 5968 * got a signal, exit. 5969 */ 5970 printk(KERN_INFO 5971 "md: md_do_sync() got signal ... exiting\n"); 5972 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5973 goto out; 5974 5975 } 5976 EXPORT_SYMBOL_GPL(md_do_sync); 5977 5978 5979 static int remove_and_add_spares(mddev_t *mddev) 5980 { 5981 mdk_rdev_t *rdev; 5982 struct list_head *rtmp; 5983 int spares = 0; 5984 5985 rdev_for_each(rdev, rtmp, mddev) 5986 if (rdev->raid_disk >= 0 && 5987 !test_bit(Blocked, &rdev->flags) && 5988 (test_bit(Faulty, &rdev->flags) || 5989 ! test_bit(In_sync, &rdev->flags)) && 5990 atomic_read(&rdev->nr_pending)==0) { 5991 if (mddev->pers->hot_remove_disk( 5992 mddev, rdev->raid_disk)==0) { 5993 char nm[20]; 5994 sprintf(nm,"rd%d", rdev->raid_disk); 5995 sysfs_remove_link(&mddev->kobj, nm); 5996 rdev->raid_disk = -1; 5997 } 5998 } 5999 6000 if (mddev->degraded && ! mddev->ro) { 6001 rdev_for_each(rdev, rtmp, mddev) { 6002 if (rdev->raid_disk >= 0 && 6003 !test_bit(In_sync, &rdev->flags) && 6004 !test_bit(Blocked, &rdev->flags)) 6005 spares++; 6006 if (rdev->raid_disk < 0 6007 && !test_bit(Faulty, &rdev->flags)) { 6008 rdev->recovery_offset = 0; 6009 if (mddev->pers-> 6010 hot_add_disk(mddev, rdev) == 0) { 6011 char nm[20]; 6012 sprintf(nm, "rd%d", rdev->raid_disk); 6013 if (sysfs_create_link(&mddev->kobj, 6014 &rdev->kobj, nm)) 6015 printk(KERN_WARNING 6016 "md: cannot register " 6017 "%s for %s\n", 6018 nm, mdname(mddev)); 6019 spares++; 6020 md_new_event(mddev); 6021 } else 6022 break; 6023 } 6024 } 6025 } 6026 return spares; 6027 } 6028 /* 6029 * This routine is regularly called by all per-raid-array threads to 6030 * deal with generic issues like resync and super-block update. 6031 * Raid personalities that don't have a thread (linear/raid0) do not 6032 * need this as they never do any recovery or update the superblock. 6033 * 6034 * It does not do any resync itself, but rather "forks" off other threads 6035 * to do that as needed. 6036 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in 6037 * "->recovery" and create a thread at ->sync_thread. 6038 * When the thread finishes it sets MD_RECOVERY_DONE 6039 * and wakeups up this thread which will reap the thread and finish up. 6040 * This thread also removes any faulty devices (with nr_pending == 0). 6041 * 6042 * The overall approach is: 6043 * 1/ if the superblock needs updating, update it. 6044 * 2/ If a recovery thread is running, don't do anything else. 6045 * 3/ If recovery has finished, clean up, possibly marking spares active. 6046 * 4/ If there are any faulty devices, remove them. 6047 * 5/ If array is degraded, try to add spares devices 6048 * 6/ If array has spares or is not in-sync, start a resync thread. 6049 */ 6050 void md_check_recovery(mddev_t *mddev) 6051 { 6052 mdk_rdev_t *rdev; 6053 struct list_head *rtmp; 6054 6055 6056 if (mddev->bitmap) 6057 bitmap_daemon_work(mddev->bitmap); 6058 6059 if (test_and_clear_bit(MD_NOTIFY_ARRAY_STATE, &mddev->flags)) 6060 sysfs_notify(&mddev->kobj, NULL, "array_state"); 6061 6062 if (mddev->ro) 6063 return; 6064 6065 if (signal_pending(current)) { 6066 if (mddev->pers->sync_request && !mddev->external) { 6067 printk(KERN_INFO "md: %s in immediate safe mode\n", 6068 mdname(mddev)); 6069 mddev->safemode = 2; 6070 } 6071 flush_signals(current); 6072 } 6073 6074 if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 6075 return; 6076 if ( ! ( 6077 (mddev->flags && !mddev->external) || 6078 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 6079 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 6080 (mddev->external == 0 && mddev->safemode == 1) || 6081 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) 6082 && !mddev->in_sync && mddev->recovery_cp == MaxSector) 6083 )) 6084 return; 6085 6086 if (mddev_trylock(mddev)) { 6087 int spares = 0; 6088 6089 if (mddev->ro) { 6090 /* Only thing we do on a ro array is remove 6091 * failed devices. 6092 */ 6093 remove_and_add_spares(mddev); 6094 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6095 goto unlock; 6096 } 6097 6098 if (!mddev->external) { 6099 int did_change = 0; 6100 spin_lock_irq(&mddev->write_lock); 6101 if (mddev->safemode && 6102 !atomic_read(&mddev->writes_pending) && 6103 !mddev->in_sync && 6104 mddev->recovery_cp == MaxSector) { 6105 mddev->in_sync = 1; 6106 did_change = 1; 6107 if (mddev->persistent) 6108 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 6109 } 6110 if (mddev->safemode == 1) 6111 mddev->safemode = 0; 6112 spin_unlock_irq(&mddev->write_lock); 6113 if (did_change) 6114 sysfs_notify(&mddev->kobj, NULL, "array_state"); 6115 } 6116 6117 if (mddev->flags) 6118 md_update_sb(mddev, 0); 6119 6120 rdev_for_each(rdev, rtmp, mddev) 6121 if (test_and_clear_bit(StateChanged, &rdev->flags)) 6122 sysfs_notify(&rdev->kobj, NULL, "state"); 6123 6124 6125 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 6126 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 6127 /* resync/recovery still happening */ 6128 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6129 goto unlock; 6130 } 6131 if (mddev->sync_thread) { 6132 /* resync has finished, collect result */ 6133 md_unregister_thread(mddev->sync_thread); 6134 mddev->sync_thread = NULL; 6135 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 6136 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 6137 /* success...*/ 6138 /* activate any spares */ 6139 if (mddev->pers->spare_active(mddev)) 6140 sysfs_notify(&mddev->kobj, NULL, 6141 "degraded"); 6142 } 6143 md_update_sb(mddev, 1); 6144 6145 /* if array is no-longer degraded, then any saved_raid_disk 6146 * information must be scrapped 6147 */ 6148 if (!mddev->degraded) 6149 rdev_for_each(rdev, rtmp, mddev) 6150 rdev->saved_raid_disk = -1; 6151 6152 mddev->recovery = 0; 6153 /* flag recovery needed just to double check */ 6154 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6155 sysfs_notify(&mddev->kobj, NULL, "sync_action"); 6156 md_new_event(mddev); 6157 goto unlock; 6158 } 6159 /* Set RUNNING before clearing NEEDED to avoid 6160 * any transients in the value of "sync_action". 6161 */ 6162 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 6163 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6164 /* Clear some bits that don't mean anything, but 6165 * might be left set 6166 */ 6167 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 6168 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 6169 6170 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 6171 goto unlock; 6172 /* no recovery is running. 6173 * remove any failed drives, then 6174 * add spares if possible. 6175 * Spare are also removed and re-added, to allow 6176 * the personality to fail the re-add. 6177 */ 6178 6179 if (mddev->reshape_position != MaxSector) { 6180 if (mddev->pers->check_reshape(mddev) != 0) 6181 /* Cannot proceed */ 6182 goto unlock; 6183 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 6184 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6185 } else if ((spares = remove_and_add_spares(mddev))) { 6186 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 6187 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 6188 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 6189 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6190 } else if (mddev->recovery_cp < MaxSector) { 6191 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 6192 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6193 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 6194 /* nothing to be done ... */ 6195 goto unlock; 6196 6197 if (mddev->pers->sync_request) { 6198 if (spares && mddev->bitmap && ! mddev->bitmap->file) { 6199 /* We are adding a device or devices to an array 6200 * which has the bitmap stored on all devices. 6201 * So make sure all bitmap pages get written 6202 */ 6203 bitmap_write_all(mddev->bitmap); 6204 } 6205 mddev->sync_thread = md_register_thread(md_do_sync, 6206 mddev, 6207 "%s_resync"); 6208 if (!mddev->sync_thread) { 6209 printk(KERN_ERR "%s: could not start resync" 6210 " thread...\n", 6211 mdname(mddev)); 6212 /* leave the spares where they are, it shouldn't hurt */ 6213 mddev->recovery = 0; 6214 } else 6215 md_wakeup_thread(mddev->sync_thread); 6216 sysfs_notify(&mddev->kobj, NULL, "sync_action"); 6217 md_new_event(mddev); 6218 } 6219 unlock: 6220 if (!mddev->sync_thread) { 6221 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 6222 if (test_and_clear_bit(MD_RECOVERY_RECOVER, 6223 &mddev->recovery)) 6224 sysfs_notify(&mddev->kobj, NULL, "sync_action"); 6225 } 6226 mddev_unlock(mddev); 6227 } 6228 } 6229 6230 void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev) 6231 { 6232 sysfs_notify(&rdev->kobj, NULL, "state"); 6233 wait_event_timeout(rdev->blocked_wait, 6234 !test_bit(Blocked, &rdev->flags), 6235 msecs_to_jiffies(5000)); 6236 rdev_dec_pending(rdev, mddev); 6237 } 6238 EXPORT_SYMBOL(md_wait_for_blocked_rdev); 6239 6240 static int md_notify_reboot(struct notifier_block *this, 6241 unsigned long code, void *x) 6242 { 6243 struct list_head *tmp; 6244 mddev_t *mddev; 6245 6246 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { 6247 6248 printk(KERN_INFO "md: stopping all md devices.\n"); 6249 6250 for_each_mddev(mddev, tmp) 6251 if (mddev_trylock(mddev)) { 6252 /* Force a switch to readonly even array 6253 * appears to still be in use. Hence 6254 * the '100'. 6255 */ 6256 do_md_stop (mddev, 1, 100); 6257 mddev_unlock(mddev); 6258 } 6259 /* 6260 * certain more exotic SCSI devices are known to be 6261 * volatile wrt too early system reboots. While the 6262 * right place to handle this issue is the given 6263 * driver, we do want to have a safe RAID driver ... 6264 */ 6265 mdelay(1000*1); 6266 } 6267 return NOTIFY_DONE; 6268 } 6269 6270 static struct notifier_block md_notifier = { 6271 .notifier_call = md_notify_reboot, 6272 .next = NULL, 6273 .priority = INT_MAX, /* before any real devices */ 6274 }; 6275 6276 static void md_geninit(void) 6277 { 6278 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 6279 6280 proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops); 6281 } 6282 6283 static int __init md_init(void) 6284 { 6285 if (register_blkdev(MAJOR_NR, "md")) 6286 return -1; 6287 if ((mdp_major=register_blkdev(0, "mdp"))<=0) { 6288 unregister_blkdev(MAJOR_NR, "md"); 6289 return -1; 6290 } 6291 blk_register_region(MKDEV(MAJOR_NR, 0), 1UL<<MINORBITS, THIS_MODULE, 6292 md_probe, NULL, NULL); 6293 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, 6294 md_probe, NULL, NULL); 6295 6296 register_reboot_notifier(&md_notifier); 6297 raid_table_header = register_sysctl_table(raid_root_table); 6298 6299 md_geninit(); 6300 return (0); 6301 } 6302 6303 6304 #ifndef MODULE 6305 6306 /* 6307 * Searches all registered partitions for autorun RAID arrays 6308 * at boot time. 6309 */ 6310 6311 static LIST_HEAD(all_detected_devices); 6312 struct detected_devices_node { 6313 struct list_head list; 6314 dev_t dev; 6315 }; 6316 6317 void md_autodetect_dev(dev_t dev) 6318 { 6319 struct detected_devices_node *node_detected_dev; 6320 6321 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL); 6322 if (node_detected_dev) { 6323 node_detected_dev->dev = dev; 6324 list_add_tail(&node_detected_dev->list, &all_detected_devices); 6325 } else { 6326 printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed" 6327 ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev)); 6328 } 6329 } 6330 6331 6332 static void autostart_arrays(int part) 6333 { 6334 mdk_rdev_t *rdev; 6335 struct detected_devices_node *node_detected_dev; 6336 dev_t dev; 6337 int i_scanned, i_passed; 6338 6339 i_scanned = 0; 6340 i_passed = 0; 6341 6342 printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 6343 6344 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { 6345 i_scanned++; 6346 node_detected_dev = list_entry(all_detected_devices.next, 6347 struct detected_devices_node, list); 6348 list_del(&node_detected_dev->list); 6349 dev = node_detected_dev->dev; 6350 kfree(node_detected_dev); 6351 rdev = md_import_device(dev,0, 90); 6352 if (IS_ERR(rdev)) 6353 continue; 6354 6355 if (test_bit(Faulty, &rdev->flags)) { 6356 MD_BUG(); 6357 continue; 6358 } 6359 set_bit(AutoDetected, &rdev->flags); 6360 list_add(&rdev->same_set, &pending_raid_disks); 6361 i_passed++; 6362 } 6363 6364 printk(KERN_INFO "md: Scanned %d and added %d devices.\n", 6365 i_scanned, i_passed); 6366 6367 autorun_devices(part); 6368 } 6369 6370 #endif /* !MODULE */ 6371 6372 static __exit void md_exit(void) 6373 { 6374 mddev_t *mddev; 6375 struct list_head *tmp; 6376 6377 blk_unregister_region(MKDEV(MAJOR_NR,0), 1U << MINORBITS); 6378 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); 6379 6380 unregister_blkdev(MAJOR_NR,"md"); 6381 unregister_blkdev(mdp_major, "mdp"); 6382 unregister_reboot_notifier(&md_notifier); 6383 unregister_sysctl_table(raid_table_header); 6384 remove_proc_entry("mdstat", NULL); 6385 for_each_mddev(mddev, tmp) { 6386 struct gendisk *disk = mddev->gendisk; 6387 if (!disk) 6388 continue; 6389 export_array(mddev); 6390 del_gendisk(disk); 6391 put_disk(disk); 6392 mddev->gendisk = NULL; 6393 mddev_put(mddev); 6394 } 6395 } 6396 6397 subsys_initcall(md_init); 6398 module_exit(md_exit) 6399 6400 static int get_ro(char *buffer, struct kernel_param *kp) 6401 { 6402 return sprintf(buffer, "%d", start_readonly); 6403 } 6404 static int set_ro(const char *val, struct kernel_param *kp) 6405 { 6406 char *e; 6407 int num = simple_strtoul(val, &e, 10); 6408 if (*val && (*e == '\0' || *e == '\n')) { 6409 start_readonly = num; 6410 return 0; 6411 } 6412 return -EINVAL; 6413 } 6414 6415 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); 6416 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); 6417 6418 6419 EXPORT_SYMBOL(register_md_personality); 6420 EXPORT_SYMBOL(unregister_md_personality); 6421 EXPORT_SYMBOL(md_error); 6422 EXPORT_SYMBOL(md_done_sync); 6423 EXPORT_SYMBOL(md_write_start); 6424 EXPORT_SYMBOL(md_write_end); 6425 EXPORT_SYMBOL(md_register_thread); 6426 EXPORT_SYMBOL(md_unregister_thread); 6427 EXPORT_SYMBOL(md_wakeup_thread); 6428 EXPORT_SYMBOL(md_check_recovery); 6429 MODULE_LICENSE("GPL"); 6430 MODULE_ALIAS("md"); 6431 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); 6432