/*
   md.c : Multiple Devices driver for Linux
	  Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/linkage.h>
#include <linux/raid/md.h>
#include <linux/raid/bitmap.h>
#include <linux/sysctl.h>
#include <linux/buffer_head.h> /* for invalidate_bdev */
#include <linux/poll.h>
#include <linux/mutex.h>
#include <linux/ctype.h>
#include <linux/freezer.h>

#include <linux/init.h>

#include <linux/file.h>

#ifdef CONFIG_KMOD
#include <linux/kmod.h>
#endif

#include <asm/unaligned.h>

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER

/* 63 partitions with the alternate major number (mdp) */
#define MdpMinorShift 6

#define DEBUG 0
#define dprintk(x...) ((void)(DEBUG && printk(x)))


#ifndef MODULE
static void autostart_arrays (int part);
#endif

static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

static void md_print_devices(void);

#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }

/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
 * or /sys/block/mdX/md/sync_speed_{min,max}
 */

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static inline int speed_min(mddev_t *mddev)
{
	return mddev->sync_speed_min ?
		mddev->sync_speed_min : sysctl_speed_limit_min;
}

static inline int speed_max(mddev_t *mddev)
{
	return mddev->sync_speed_max ?
		mddev->sync_speed_max : sysctl_speed_limit_max;
}

static struct ctl_table_header *raid_table_header;

static ctl_table raid_table[] = {
	{
		.ctl_name	= DEV_RAID_SPEED_LIMIT_MIN,
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= DEV_RAID_SPEED_LIMIT_MAX,
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= &proc_dointvec,
	},
	{ .ctl_name = 0 }
};

static ctl_table raid_dir_table[] = {
	{
		.ctl_name	= DEV_RAID,
		.procname	= "raid",
		.maxlen		= 0,
		.mode		= S_IRUGO|S_IXUGO,
		.child		= raid_table,
	},
	{ .ctl_name = 0 }
};

static ctl_table raid_root_table[] = {
	{
		.ctl_name	= CTL_DEV,
		.procname	= "dev",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_dir_table,
	},
	{ .ctl_name = 0 }
};

static struct block_device_operations md_fops;

static int start_readonly;

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
void md_new_event(mddev_t *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
	sysfs_notify(&mddev->kobj, NULL, "sync_action");
}
EXPORT_SYMBOL_GPL(md_new_event);

/* Alternate version that can be called from interrupts
 * when calling sysfs_notify isn't needed.
 */
static void md_new_event_inintr(mddev_t *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}

/*
 * Allows iteration over all existing md arrays;
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);


/*
 * iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while owning
 * a reference to the current mddev must mddev_put it.
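 *
 * A minimal usage sketch (hypothetical caller; process_array() is only
 * a stand-in name, not a function defined in this file):
 *
 *	mddev_t *mddev;
 *	struct list_head *tmp;
 *
 *	ITERATE_MDDEV(mddev, tmp)
 *		process_array(mddev);
 *
 * Each pass drops the reference taken for the previous array and takes
 * one on the next, so a caller that breaks out early is left holding a
 * reference that it must release with mddev_put().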
197 */ 198 #define ITERATE_MDDEV(mddev,tmp) \ 199 \ 200 for (({ spin_lock(&all_mddevs_lock); \ 201 tmp = all_mddevs.next; \ 202 mddev = NULL;}); \ 203 ({ if (tmp != &all_mddevs) \ 204 mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ 205 spin_unlock(&all_mddevs_lock); \ 206 if (mddev) mddev_put(mddev); \ 207 mddev = list_entry(tmp, mddev_t, all_mddevs); \ 208 tmp != &all_mddevs;}); \ 209 ({ spin_lock(&all_mddevs_lock); \ 210 tmp = tmp->next;}) \ 211 ) 212 213 214 static int md_fail_request (request_queue_t *q, struct bio *bio) 215 { 216 bio_io_error(bio, bio->bi_size); 217 return 0; 218 } 219 220 static inline mddev_t *mddev_get(mddev_t *mddev) 221 { 222 atomic_inc(&mddev->active); 223 return mddev; 224 } 225 226 static void mddev_put(mddev_t *mddev) 227 { 228 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 229 return; 230 if (!mddev->raid_disks && list_empty(&mddev->disks)) { 231 list_del(&mddev->all_mddevs); 232 spin_unlock(&all_mddevs_lock); 233 blk_cleanup_queue(mddev->queue); 234 kobject_unregister(&mddev->kobj); 235 } else 236 spin_unlock(&all_mddevs_lock); 237 } 238 239 static mddev_t * mddev_find(dev_t unit) 240 { 241 mddev_t *mddev, *new = NULL; 242 243 retry: 244 spin_lock(&all_mddevs_lock); 245 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 246 if (mddev->unit == unit) { 247 mddev_get(mddev); 248 spin_unlock(&all_mddevs_lock); 249 kfree(new); 250 return mddev; 251 } 252 253 if (new) { 254 list_add(&new->all_mddevs, &all_mddevs); 255 spin_unlock(&all_mddevs_lock); 256 return new; 257 } 258 spin_unlock(&all_mddevs_lock); 259 260 new = kzalloc(sizeof(*new), GFP_KERNEL); 261 if (!new) 262 return NULL; 263 264 new->unit = unit; 265 if (MAJOR(unit) == MD_MAJOR) 266 new->md_minor = MINOR(unit); 267 else 268 new->md_minor = MINOR(unit) >> MdpMinorShift; 269 270 mutex_init(&new->reconfig_mutex); 271 INIT_LIST_HEAD(&new->disks); 272 INIT_LIST_HEAD(&new->all_mddevs); 273 init_timer(&new->safemode_timer); 274 atomic_set(&new->active, 1); 275 spin_lock_init(&new->write_lock); 276 init_waitqueue_head(&new->sb_wait); 277 new->reshape_position = MaxSector; 278 279 new->queue = blk_alloc_queue(GFP_KERNEL); 280 if (!new->queue) { 281 kfree(new); 282 return NULL; 283 } 284 set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags); 285 286 blk_queue_make_request(new->queue, md_fail_request); 287 288 goto retry; 289 } 290 291 static inline int mddev_lock(mddev_t * mddev) 292 { 293 return mutex_lock_interruptible(&mddev->reconfig_mutex); 294 } 295 296 static inline int mddev_trylock(mddev_t * mddev) 297 { 298 return mutex_trylock(&mddev->reconfig_mutex); 299 } 300 301 static inline void mddev_unlock(mddev_t * mddev) 302 { 303 mutex_unlock(&mddev->reconfig_mutex); 304 305 md_wakeup_thread(mddev->thread); 306 } 307 308 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) 309 { 310 mdk_rdev_t * rdev; 311 struct list_head *tmp; 312 313 ITERATE_RDEV(mddev,rdev,tmp) { 314 if (rdev->desc_nr == nr) 315 return rdev; 316 } 317 return NULL; 318 } 319 320 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) 321 { 322 struct list_head *tmp; 323 mdk_rdev_t *rdev; 324 325 ITERATE_RDEV(mddev,rdev,tmp) { 326 if (rdev->bdev->bd_dev == dev) 327 return rdev; 328 } 329 return NULL; 330 } 331 332 static struct mdk_personality *find_pers(int level, char *clevel) 333 { 334 struct mdk_personality *pers; 335 list_for_each_entry(pers, &pers_list, list) { 336 if (level != LEVEL_NONE && pers->level == level) 337 return pers; 338 if (strcmp(pers->name, clevel)==0) 339 return pers; 340 } 341 return NULL; 342 
} 343 344 static inline sector_t calc_dev_sboffset(struct block_device *bdev) 345 { 346 sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 347 return MD_NEW_SIZE_BLOCKS(size); 348 } 349 350 static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size) 351 { 352 sector_t size; 353 354 size = rdev->sb_offset; 355 356 if (chunk_size) 357 size &= ~((sector_t)chunk_size/1024 - 1); 358 return size; 359 } 360 361 static int alloc_disk_sb(mdk_rdev_t * rdev) 362 { 363 if (rdev->sb_page) 364 MD_BUG(); 365 366 rdev->sb_page = alloc_page(GFP_KERNEL); 367 if (!rdev->sb_page) { 368 printk(KERN_ALERT "md: out of memory.\n"); 369 return -EINVAL; 370 } 371 372 return 0; 373 } 374 375 static void free_disk_sb(mdk_rdev_t * rdev) 376 { 377 if (rdev->sb_page) { 378 put_page(rdev->sb_page); 379 rdev->sb_loaded = 0; 380 rdev->sb_page = NULL; 381 rdev->sb_offset = 0; 382 rdev->size = 0; 383 } 384 } 385 386 387 static int super_written(struct bio *bio, unsigned int bytes_done, int error) 388 { 389 mdk_rdev_t *rdev = bio->bi_private; 390 mddev_t *mddev = rdev->mddev; 391 if (bio->bi_size) 392 return 1; 393 394 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) { 395 printk("md: super_written gets error=%d, uptodate=%d\n", 396 error, test_bit(BIO_UPTODATE, &bio->bi_flags)); 397 WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags)); 398 md_error(mddev, rdev); 399 } 400 401 if (atomic_dec_and_test(&mddev->pending_writes)) 402 wake_up(&mddev->sb_wait); 403 bio_put(bio); 404 return 0; 405 } 406 407 static int super_written_barrier(struct bio *bio, unsigned int bytes_done, int error) 408 { 409 struct bio *bio2 = bio->bi_private; 410 mdk_rdev_t *rdev = bio2->bi_private; 411 mddev_t *mddev = rdev->mddev; 412 if (bio->bi_size) 413 return 1; 414 415 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && 416 error == -EOPNOTSUPP) { 417 unsigned long flags; 418 /* barriers don't appear to be supported :-( */ 419 set_bit(BarriersNotsupp, &rdev->flags); 420 mddev->barriers_work = 0; 421 spin_lock_irqsave(&mddev->write_lock, flags); 422 bio2->bi_next = mddev->biolist; 423 mddev->biolist = bio2; 424 spin_unlock_irqrestore(&mddev->write_lock, flags); 425 wake_up(&mddev->sb_wait); 426 bio_put(bio); 427 return 0; 428 } 429 bio_put(bio2); 430 bio->bi_private = rdev; 431 return super_written(bio, bytes_done, error); 432 } 433 434 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 435 sector_t sector, int size, struct page *page) 436 { 437 /* write first size bytes of page to sector of rdev 438 * Increment mddev->pending_writes before returning 439 * and decrement it on completion, waking up sb_wait 440 * if zero is reached. 441 * If an error occurred, call md_error 442 * 443 * As we might need to resubmit the request if BIO_RW_BARRIER 444 * causes ENOTSUPP, we allocate a spare bio... 445 */ 446 struct bio *bio = bio_alloc(GFP_NOIO, 1); 447 int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC); 448 449 bio->bi_bdev = rdev->bdev; 450 bio->bi_sector = sector; 451 bio_add_page(bio, page, size, 0); 452 bio->bi_private = rdev; 453 bio->bi_end_io = super_written; 454 bio->bi_rw = rw; 455 456 atomic_inc(&mddev->pending_writes); 457 if (!test_bit(BarriersNotsupp, &rdev->flags)) { 458 struct bio *rbio; 459 rw |= (1<<BIO_RW_BARRIER); 460 rbio = bio_clone(bio, GFP_NOIO); 461 rbio->bi_private = bio; 462 rbio->bi_end_io = super_written_barrier; 463 submit_bio(rw, rbio); 464 } else 465 submit_bio(rw, bio); 466 } 467 468 void md_super_wait(mddev_t *mddev) 469 { 470 /* wait for all superblock writes that were scheduled to complete. 
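	 * The usual pattern, as in md_update_sb() below, is to queue one
	 * md_super_write() per device and then call md_super_wait() once
	 * to drain mddev->pending_writes;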
471 * if any had to be retried (due to BARRIER problems), retry them 472 */ 473 DEFINE_WAIT(wq); 474 for(;;) { 475 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); 476 if (atomic_read(&mddev->pending_writes)==0) 477 break; 478 while (mddev->biolist) { 479 struct bio *bio; 480 spin_lock_irq(&mddev->write_lock); 481 bio = mddev->biolist; 482 mddev->biolist = bio->bi_next ; 483 bio->bi_next = NULL; 484 spin_unlock_irq(&mddev->write_lock); 485 submit_bio(bio->bi_rw, bio); 486 } 487 schedule(); 488 } 489 finish_wait(&mddev->sb_wait, &wq); 490 } 491 492 static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) 493 { 494 if (bio->bi_size) 495 return 1; 496 497 complete((struct completion*)bio->bi_private); 498 return 0; 499 } 500 501 int sync_page_io(struct block_device *bdev, sector_t sector, int size, 502 struct page *page, int rw) 503 { 504 struct bio *bio = bio_alloc(GFP_NOIO, 1); 505 struct completion event; 506 int ret; 507 508 rw |= (1 << BIO_RW_SYNC); 509 510 bio->bi_bdev = bdev; 511 bio->bi_sector = sector; 512 bio_add_page(bio, page, size, 0); 513 init_completion(&event); 514 bio->bi_private = &event; 515 bio->bi_end_io = bi_complete; 516 submit_bio(rw, bio); 517 wait_for_completion(&event); 518 519 ret = test_bit(BIO_UPTODATE, &bio->bi_flags); 520 bio_put(bio); 521 return ret; 522 } 523 EXPORT_SYMBOL_GPL(sync_page_io); 524 525 static int read_disk_sb(mdk_rdev_t * rdev, int size) 526 { 527 char b[BDEVNAME_SIZE]; 528 if (!rdev->sb_page) { 529 MD_BUG(); 530 return -EINVAL; 531 } 532 if (rdev->sb_loaded) 533 return 0; 534 535 536 if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ)) 537 goto fail; 538 rdev->sb_loaded = 1; 539 return 0; 540 541 fail: 542 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", 543 bdevname(rdev->bdev,b)); 544 return -EINVAL; 545 } 546 547 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 548 { 549 if ( (sb1->set_uuid0 == sb2->set_uuid0) && 550 (sb1->set_uuid1 == sb2->set_uuid1) && 551 (sb1->set_uuid2 == sb2->set_uuid2) && 552 (sb1->set_uuid3 == sb2->set_uuid3)) 553 554 return 1; 555 556 return 0; 557 } 558 559 560 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 561 { 562 int ret; 563 mdp_super_t *tmp1, *tmp2; 564 565 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 566 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 567 568 if (!tmp1 || !tmp2) { 569 ret = 0; 570 printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); 571 goto abort; 572 } 573 574 *tmp1 = *sb1; 575 *tmp2 = *sb2; 576 577 /* 578 * nr_disks is not constant 579 */ 580 tmp1->nr_disks = 0; 581 tmp2->nr_disks = 0; 582 583 if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) 584 ret = 0; 585 else 586 ret = 1; 587 588 abort: 589 kfree(tmp1); 590 kfree(tmp2); 591 return ret; 592 } 593 594 595 static u32 md_csum_fold(u32 csum) 596 { 597 csum = (csum & 0xffff) + (csum >> 16); 598 return (csum & 0xffff) + (csum >> 16); 599 } 600 601 static unsigned int calc_sb_csum(mdp_super_t * sb) 602 { 603 u64 newcsum = 0; 604 u32 *sb32 = (u32*)sb; 605 int i; 606 unsigned int disk_csum, csum; 607 608 disk_csum = sb->sb_csum; 609 sb->sb_csum = 0; 610 611 for (i = 0; i < MD_SB_BYTES/4 ; i++) 612 newcsum += sb32[i]; 613 csum = (newcsum & 0xffffffff) + (newcsum>>32); 614 615 616 #ifdef CONFIG_ALPHA 617 /* This used to use csum_partial, which was wrong for several 618 * reasons including that different results are returned on 619 * different architectures. 
It isn't critical that we get exactly 620 * the same return value as before (we always csum_fold before 621 * testing, and that removes any differences). However as we 622 * know that csum_partial always returned a 16bit value on 623 * alphas, do a fold to maximise conformity to previous behaviour. 624 */ 625 sb->sb_csum = md_csum_fold(disk_csum); 626 #else 627 sb->sb_csum = disk_csum; 628 #endif 629 return csum; 630 } 631 632 633 /* 634 * Handle superblock details. 635 * We want to be able to handle multiple superblock formats 636 * so we have a common interface to them all, and an array of 637 * different handlers. 638 * We rely on user-space to write the initial superblock, and support 639 * reading and updating of superblocks. 640 * Interface methods are: 641 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) 642 * loads and validates a superblock on dev. 643 * if refdev != NULL, compare superblocks on both devices 644 * Return: 645 * 0 - dev has a superblock that is compatible with refdev 646 * 1 - dev has a superblock that is compatible and newer than refdev 647 * so dev should be used as the refdev in future 648 * -EINVAL superblock incompatible or invalid 649 * -othererror e.g. -EIO 650 * 651 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) 652 * Verify that dev is acceptable into mddev. 653 * The first time, mddev->raid_disks will be 0, and data from 654 * dev should be merged in. Subsequent calls check that dev 655 * is new enough. Return 0 or -EINVAL 656 * 657 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) 658 * Update the superblock for rdev with data in mddev 659 * This does not write to disc. 660 * 661 */ 662 663 struct super_type { 664 char *name; 665 struct module *owner; 666 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); 667 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); 668 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); 669 }; 670 671 /* 672 * load_super for 0.90.0 673 */ 674 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 675 { 676 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 677 mdp_super_t *sb; 678 int ret; 679 sector_t sb_offset; 680 681 /* 682 * Calculate the position of the superblock, 683 * it's at the end of the disk. 684 * 685 * It also happens to be a multiple of 4Kb. 
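	 *
	 * Concretely, calc_dev_sboffset() above rounds the device size
	 * (in 1K blocks) down to a 64K boundary and steps back one 64K
	 * reserved area (assuming the usual MD_NEW_SIZE_BLOCKS()
	 * definition from md_p.h): a 1000000 KB component device would
	 * put the superblock at offset 999936 KB.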
686 */ 687 sb_offset = calc_dev_sboffset(rdev->bdev); 688 rdev->sb_offset = sb_offset; 689 690 ret = read_disk_sb(rdev, MD_SB_BYTES); 691 if (ret) return ret; 692 693 ret = -EINVAL; 694 695 bdevname(rdev->bdev, b); 696 sb = (mdp_super_t*)page_address(rdev->sb_page); 697 698 if (sb->md_magic != MD_SB_MAGIC) { 699 printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 700 b); 701 goto abort; 702 } 703 704 if (sb->major_version != 0 || 705 sb->minor_version < 90 || 706 sb->minor_version > 91) { 707 printk(KERN_WARNING "Bad version number %d.%d on %s\n", 708 sb->major_version, sb->minor_version, 709 b); 710 goto abort; 711 } 712 713 if (sb->raid_disks <= 0) 714 goto abort; 715 716 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { 717 printk(KERN_WARNING "md: invalid superblock checksum on %s\n", 718 b); 719 goto abort; 720 } 721 722 rdev->preferred_minor = sb->md_minor; 723 rdev->data_offset = 0; 724 rdev->sb_size = MD_SB_BYTES; 725 726 if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) { 727 if (sb->level != 1 && sb->level != 4 728 && sb->level != 5 && sb->level != 6 729 && sb->level != 10) { 730 /* FIXME use a better test */ 731 printk(KERN_WARNING 732 "md: bitmaps not supported for this level.\n"); 733 goto abort; 734 } 735 } 736 737 if (sb->level == LEVEL_MULTIPATH) 738 rdev->desc_nr = -1; 739 else 740 rdev->desc_nr = sb->this_disk.number; 741 742 if (refdev == 0) 743 ret = 1; 744 else { 745 __u64 ev1, ev2; 746 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); 747 if (!uuid_equal(refsb, sb)) { 748 printk(KERN_WARNING "md: %s has different UUID to %s\n", 749 b, bdevname(refdev->bdev,b2)); 750 goto abort; 751 } 752 if (!sb_equal(refsb, sb)) { 753 printk(KERN_WARNING "md: %s has same UUID" 754 " but different superblock to %s\n", 755 b, bdevname(refdev->bdev, b2)); 756 goto abort; 757 } 758 ev1 = md_event(sb); 759 ev2 = md_event(refsb); 760 if (ev1 > ev2) 761 ret = 1; 762 else 763 ret = 0; 764 } 765 rdev->size = calc_dev_size(rdev, sb->chunk_size); 766 767 if (rdev->size < sb->size && sb->level > 1) 768 /* "this cannot possibly happen" ... */ 769 ret = -EINVAL; 770 771 abort: 772 return ret; 773 } 774 775 /* 776 * validate_super for 0.90.0 777 */ 778 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) 779 { 780 mdp_disk_t *desc; 781 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 782 __u64 ev1 = md_event(sb); 783 784 rdev->raid_disk = -1; 785 rdev->flags = 0; 786 if (mddev->raid_disks == 0) { 787 mddev->major_version = 0; 788 mddev->minor_version = sb->minor_version; 789 mddev->patch_version = sb->patch_version; 790 mddev->persistent = ! 
sb->not_persistent; 791 mddev->chunk_size = sb->chunk_size; 792 mddev->ctime = sb->ctime; 793 mddev->utime = sb->utime; 794 mddev->level = sb->level; 795 mddev->clevel[0] = 0; 796 mddev->layout = sb->layout; 797 mddev->raid_disks = sb->raid_disks; 798 mddev->size = sb->size; 799 mddev->events = ev1; 800 mddev->bitmap_offset = 0; 801 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 802 803 if (mddev->minor_version >= 91) { 804 mddev->reshape_position = sb->reshape_position; 805 mddev->delta_disks = sb->delta_disks; 806 mddev->new_level = sb->new_level; 807 mddev->new_layout = sb->new_layout; 808 mddev->new_chunk = sb->new_chunk; 809 } else { 810 mddev->reshape_position = MaxSector; 811 mddev->delta_disks = 0; 812 mddev->new_level = mddev->level; 813 mddev->new_layout = mddev->layout; 814 mddev->new_chunk = mddev->chunk_size; 815 } 816 817 if (sb->state & (1<<MD_SB_CLEAN)) 818 mddev->recovery_cp = MaxSector; 819 else { 820 if (sb->events_hi == sb->cp_events_hi && 821 sb->events_lo == sb->cp_events_lo) { 822 mddev->recovery_cp = sb->recovery_cp; 823 } else 824 mddev->recovery_cp = 0; 825 } 826 827 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 828 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 829 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 830 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 831 832 mddev->max_disks = MD_SB_DISKS; 833 834 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 835 mddev->bitmap_file == NULL) 836 mddev->bitmap_offset = mddev->default_bitmap_offset; 837 838 } else if (mddev->pers == NULL) { 839 /* Insist on good event counter while assembling */ 840 ++ev1; 841 if (ev1 < mddev->events) 842 return -EINVAL; 843 } else if (mddev->bitmap) { 844 /* if adding to array with a bitmap, then we can accept an 845 * older device ... but not too old. 846 */ 847 if (ev1 < mddev->bitmap->events_cleared) 848 return 0; 849 } else { 850 if (ev1 < mddev->events) 851 /* just a hot-add of a new device, leave raid_disk at -1 */ 852 return 0; 853 } 854 855 if (mddev->level != LEVEL_MULTIPATH) { 856 desc = sb->disks + rdev->desc_nr; 857 858 if (desc->state & (1<<MD_DISK_FAULTY)) 859 set_bit(Faulty, &rdev->flags); 860 else if (desc->state & (1<<MD_DISK_SYNC) /* && 861 desc->raid_disk < mddev->raid_disks */) { 862 set_bit(In_sync, &rdev->flags); 863 rdev->raid_disk = desc->raid_disk; 864 } 865 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 866 set_bit(WriteMostly, &rdev->flags); 867 } else /* MULTIPATH are always insync */ 868 set_bit(In_sync, &rdev->flags); 869 return 0; 870 } 871 872 /* 873 * sync_super for 0.90.0 874 */ 875 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) 876 { 877 mdp_super_t *sb; 878 struct list_head *tmp; 879 mdk_rdev_t *rdev2; 880 int next_spare = mddev->raid_disks; 881 882 883 /* make rdev->sb match mddev data.. 884 * 885 * 1/ zero out disks 886 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 887 * 3/ any empty disks < next_spare become removed 888 * 889 * disks[0] gets initialised to REMOVED because 890 * we cannot be sure from other fields if it has 891 * been initialised or not. 
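	 *
	 * As a hypothetical example: a 3-disk RAID1 with one spare ends
	 * up with desc_nr 0-2 holding the in-sync members (desc_nr equal
	 * to raid_disk) and the spare at desc_nr 3, the first next_spare
	 * slot; any slot below raid_disks that no device claims is then
	 * marked REMOVED and FAULTY by the loop at the end of this
	 * function.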
892 */ 893 int i; 894 int active=0, working=0,failed=0,spare=0,nr_disks=0; 895 896 rdev->sb_size = MD_SB_BYTES; 897 898 sb = (mdp_super_t*)page_address(rdev->sb_page); 899 900 memset(sb, 0, sizeof(*sb)); 901 902 sb->md_magic = MD_SB_MAGIC; 903 sb->major_version = mddev->major_version; 904 sb->patch_version = mddev->patch_version; 905 sb->gvalid_words = 0; /* ignored */ 906 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 907 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 908 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 909 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 910 911 sb->ctime = mddev->ctime; 912 sb->level = mddev->level; 913 sb->size = mddev->size; 914 sb->raid_disks = mddev->raid_disks; 915 sb->md_minor = mddev->md_minor; 916 sb->not_persistent = !mddev->persistent; 917 sb->utime = mddev->utime; 918 sb->state = 0; 919 sb->events_hi = (mddev->events>>32); 920 sb->events_lo = (u32)mddev->events; 921 922 if (mddev->reshape_position == MaxSector) 923 sb->minor_version = 90; 924 else { 925 sb->minor_version = 91; 926 sb->reshape_position = mddev->reshape_position; 927 sb->new_level = mddev->new_level; 928 sb->delta_disks = mddev->delta_disks; 929 sb->new_layout = mddev->new_layout; 930 sb->new_chunk = mddev->new_chunk; 931 } 932 mddev->minor_version = sb->minor_version; 933 if (mddev->in_sync) 934 { 935 sb->recovery_cp = mddev->recovery_cp; 936 sb->cp_events_hi = (mddev->events>>32); 937 sb->cp_events_lo = (u32)mddev->events; 938 if (mddev->recovery_cp == MaxSector) 939 sb->state = (1<< MD_SB_CLEAN); 940 } else 941 sb->recovery_cp = 0; 942 943 sb->layout = mddev->layout; 944 sb->chunk_size = mddev->chunk_size; 945 946 if (mddev->bitmap && mddev->bitmap_file == NULL) 947 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 948 949 sb->disks[0].state = (1<<MD_DISK_REMOVED); 950 ITERATE_RDEV(mddev,rdev2,tmp) { 951 mdp_disk_t *d; 952 int desc_nr; 953 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 954 && !test_bit(Faulty, &rdev2->flags)) 955 desc_nr = rdev2->raid_disk; 956 else 957 desc_nr = next_spare++; 958 rdev2->desc_nr = desc_nr; 959 d = &sb->disks[rdev2->desc_nr]; 960 nr_disks++; 961 d->number = rdev2->desc_nr; 962 d->major = MAJOR(rdev2->bdev->bd_dev); 963 d->minor = MINOR(rdev2->bdev->bd_dev); 964 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 965 && !test_bit(Faulty, &rdev2->flags)) 966 d->raid_disk = rdev2->raid_disk; 967 else 968 d->raid_disk = rdev2->desc_nr; /* compatibility */ 969 if (test_bit(Faulty, &rdev2->flags)) 970 d->state = (1<<MD_DISK_FAULTY); 971 else if (test_bit(In_sync, &rdev2->flags)) { 972 d->state = (1<<MD_DISK_ACTIVE); 973 d->state |= (1<<MD_DISK_SYNC); 974 active++; 975 working++; 976 } else { 977 d->state = 0; 978 spare++; 979 working++; 980 } 981 if (test_bit(WriteMostly, &rdev2->flags)) 982 d->state |= (1<<MD_DISK_WRITEMOSTLY); 983 } 984 /* now set the "removed" and "faulty" bits on any missing devices */ 985 for (i=0 ; i < mddev->raid_disks ; i++) { 986 mdp_disk_t *d = &sb->disks[i]; 987 if (d->state == 0 && d->number == 0) { 988 d->number = i; 989 d->raid_disk = i; 990 d->state = (1<<MD_DISK_REMOVED); 991 d->state |= (1<<MD_DISK_FAULTY); 992 failed++; 993 } 994 } 995 sb->nr_disks = nr_disks; 996 sb->active_disks = active; 997 sb->working_disks = working; 998 sb->failed_disks = failed; 999 sb->spare_disks = spare; 1000 1001 sb->this_disk = sb->disks[rdev->desc_nr]; 1002 sb->sb_csum = calc_sb_csum(sb); 1003 } 1004 1005 /* 1006 * version 1 superblock 1007 */ 1008 1009 static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) 1010 { 1011 __le32 disk_csum; 
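	/* The v1 checksum covers the 256-byte fixed superblock plus the
	 * two-byte role entry for each possible device (the 'size'
	 * computed below), summed as little-endian 32-bit words and
	 * folded from 64 to 32 bits; sb_csum itself is zeroed for the
	 * sum and restored afterwards.
	 */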
1012 u32 csum; 1013 unsigned long long newcsum; 1014 int size = 256 + le32_to_cpu(sb->max_dev)*2; 1015 __le32 *isuper = (__le32*)sb; 1016 int i; 1017 1018 disk_csum = sb->sb_csum; 1019 sb->sb_csum = 0; 1020 newcsum = 0; 1021 for (i=0; size>=4; size -= 4 ) 1022 newcsum += le32_to_cpu(*isuper++); 1023 1024 if (size == 2) 1025 newcsum += le16_to_cpu(*(__le16*) isuper); 1026 1027 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 1028 sb->sb_csum = disk_csum; 1029 return cpu_to_le32(csum); 1030 } 1031 1032 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 1033 { 1034 struct mdp_superblock_1 *sb; 1035 int ret; 1036 sector_t sb_offset; 1037 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1038 int bmask; 1039 1040 /* 1041 * Calculate the position of the superblock. 1042 * It is always aligned to a 4K boundary and 1043 * depeding on minor_version, it can be: 1044 * 0: At least 8K, but less than 12K, from end of device 1045 * 1: At start of device 1046 * 2: 4K from start of device. 1047 */ 1048 switch(minor_version) { 1049 case 0: 1050 sb_offset = rdev->bdev->bd_inode->i_size >> 9; 1051 sb_offset -= 8*2; 1052 sb_offset &= ~(sector_t)(4*2-1); 1053 /* convert from sectors to K */ 1054 sb_offset /= 2; 1055 break; 1056 case 1: 1057 sb_offset = 0; 1058 break; 1059 case 2: 1060 sb_offset = 4; 1061 break; 1062 default: 1063 return -EINVAL; 1064 } 1065 rdev->sb_offset = sb_offset; 1066 1067 /* superblock is rarely larger than 1K, but it can be larger, 1068 * and it is safe to read 4k, so we do that 1069 */ 1070 ret = read_disk_sb(rdev, 4096); 1071 if (ret) return ret; 1072 1073 1074 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1075 1076 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1077 sb->major_version != cpu_to_le32(1) || 1078 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1079 le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || 1080 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1081 return -EINVAL; 1082 1083 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1084 printk("md: invalid superblock checksum on %s\n", 1085 bdevname(rdev->bdev,b)); 1086 return -EINVAL; 1087 } 1088 if (le64_to_cpu(sb->data_size) < 10) { 1089 printk("md: data_size too small on %s\n", 1090 bdevname(rdev->bdev,b)); 1091 return -EINVAL; 1092 } 1093 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) { 1094 if (sb->level != cpu_to_le32(1) && 1095 sb->level != cpu_to_le32(4) && 1096 sb->level != cpu_to_le32(5) && 1097 sb->level != cpu_to_le32(6) && 1098 sb->level != cpu_to_le32(10)) { 1099 printk(KERN_WARNING 1100 "md: bitmaps not supported for this level.\n"); 1101 return -EINVAL; 1102 } 1103 } 1104 1105 rdev->preferred_minor = 0xffff; 1106 rdev->data_offset = le64_to_cpu(sb->data_offset); 1107 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1108 1109 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1110 bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; 1111 if (rdev->sb_size & bmask) 1112 rdev-> sb_size = (rdev->sb_size | bmask)+1; 1113 1114 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) 1115 rdev->desc_nr = -1; 1116 else 1117 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1118 1119 if (refdev == 0) 1120 ret = 1; 1121 else { 1122 __u64 ev1, ev2; 1123 struct mdp_superblock_1 *refsb = 1124 (struct mdp_superblock_1*)page_address(refdev->sb_page); 1125 1126 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1127 sb->level != refsb->level || 1128 sb->layout != refsb->layout || 1129 sb->chunksize != refsb->chunksize) { 1130 printk(KERN_WARNING "md: %s 
has strangely different"
				" superblock to %s\n",
				bdevname(rdev->bdev,b),
				bdevname(refdev->bdev,b2));
			return -EINVAL;
		}
		ev1 = le64_to_cpu(sb->events);
		ev2 = le64_to_cpu(refsb->events);

		if (ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	if (minor_version)
		rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
	else
		rdev->size = rdev->sb_offset;
	if (rdev->size < le64_to_cpu(sb->data_size)/2)
		return -EINVAL;
	rdev->size = le64_to_cpu(sb->data_size)/2;
	if (le32_to_cpu(sb->chunksize))
		rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);

	if (le64_to_cpu(sb->size) > rdev->size*2)
		return -EINVAL;
	return ret;
}

static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
	struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
	__u64 ev1 = le64_to_cpu(sb->events);

	rdev->raid_disk = -1;
	rdev->flags = 0;
	if (mddev->raid_disks == 0) {
		mddev->major_version = 1;
		mddev->patch_version = 0;
		mddev->persistent = 1;
		mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
		mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
		mddev->level = le32_to_cpu(sb->level);
		mddev->clevel[0] = 0;
		mddev->layout = le32_to_cpu(sb->layout);
		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
		mddev->size = le64_to_cpu(sb->size)/2;
		mddev->events = ev1;
		mddev->bitmap_offset = 0;
		mddev->default_bitmap_offset = 1024 >> 9;

		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
		memcpy(mddev->uuid, sb->set_uuid, 16);

		mddev->max_disks = (4096-256)/2;

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
		    mddev->bitmap_file == NULL )
			mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
			mddev->new_level = le32_to_cpu(sb->new_level);
			mddev->new_layout = le32_to_cpu(sb->new_layout);
			mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk = mddev->chunk_size;
		}

	} else if (mddev->pers == NULL) {
		/* Insist on good event counter while assembling */
		++ev1;
		if (ev1 < mddev->events)
			return -EINVAL;
	} else if (mddev->bitmap) {
		/* If adding to array with a bitmap, then we can accept an
		 * older device, but not too old.
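		 * "Too old" is judged against bitmap->events_cleared just
		 * below: a device at least that recent falls through to the
		 * role assignment and can come back with only the regions
		 * still marked dirty in the bitmap needing a resync, while
		 * an older one returns here with raid_disk left at -1 and
		 * is treated like a brand new device.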
1213 */ 1214 if (ev1 < mddev->bitmap->events_cleared) 1215 return 0; 1216 } else { 1217 if (ev1 < mddev->events) 1218 /* just a hot-add of a new device, leave raid_disk at -1 */ 1219 return 0; 1220 } 1221 if (mddev->level != LEVEL_MULTIPATH) { 1222 int role; 1223 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1224 switch(role) { 1225 case 0xffff: /* spare */ 1226 break; 1227 case 0xfffe: /* faulty */ 1228 set_bit(Faulty, &rdev->flags); 1229 break; 1230 default: 1231 if ((le32_to_cpu(sb->feature_map) & 1232 MD_FEATURE_RECOVERY_OFFSET)) 1233 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 1234 else 1235 set_bit(In_sync, &rdev->flags); 1236 rdev->raid_disk = role; 1237 break; 1238 } 1239 if (sb->devflags & WriteMostly1) 1240 set_bit(WriteMostly, &rdev->flags); 1241 } else /* MULTIPATH are always insync */ 1242 set_bit(In_sync, &rdev->flags); 1243 1244 return 0; 1245 } 1246 1247 static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) 1248 { 1249 struct mdp_superblock_1 *sb; 1250 struct list_head *tmp; 1251 mdk_rdev_t *rdev2; 1252 int max_dev, i; 1253 /* make rdev->sb match mddev and rdev data. */ 1254 1255 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1256 1257 sb->feature_map = 0; 1258 sb->pad0 = 0; 1259 sb->recovery_offset = cpu_to_le64(0); 1260 memset(sb->pad1, 0, sizeof(sb->pad1)); 1261 memset(sb->pad2, 0, sizeof(sb->pad2)); 1262 memset(sb->pad3, 0, sizeof(sb->pad3)); 1263 1264 sb->utime = cpu_to_le64((__u64)mddev->utime); 1265 sb->events = cpu_to_le64(mddev->events); 1266 if (mddev->in_sync) 1267 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 1268 else 1269 sb->resync_offset = cpu_to_le64(0); 1270 1271 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); 1272 1273 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 1274 sb->size = cpu_to_le64(mddev->size<<1); 1275 1276 if (mddev->bitmap && mddev->bitmap_file == NULL) { 1277 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1278 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1279 } 1280 1281 if (rdev->raid_disk >= 0 && 1282 !test_bit(In_sync, &rdev->flags) && 1283 rdev->recovery_offset > 0) { 1284 sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 1285 sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); 1286 } 1287 1288 if (mddev->reshape_position != MaxSector) { 1289 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 1290 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 1291 sb->new_layout = cpu_to_le32(mddev->new_layout); 1292 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 1293 sb->new_level = cpu_to_le32(mddev->new_level); 1294 sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9); 1295 } 1296 1297 max_dev = 0; 1298 ITERATE_RDEV(mddev,rdev2,tmp) 1299 if (rdev2->desc_nr+1 > max_dev) 1300 max_dev = rdev2->desc_nr+1; 1301 1302 if (max_dev > le32_to_cpu(sb->max_dev)) 1303 sb->max_dev = cpu_to_le32(max_dev); 1304 for (i=0; i<max_dev;i++) 1305 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1306 1307 ITERATE_RDEV(mddev,rdev2,tmp) { 1308 i = rdev2->desc_nr; 1309 if (test_bit(Faulty, &rdev2->flags)) 1310 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1311 else if (test_bit(In_sync, &rdev2->flags)) 1312 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1313 else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0) 1314 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1315 else 1316 sb->dev_roles[i] = cpu_to_le16(0xffff); 1317 } 1318 1319 sb->sb_csum = calc_sb_1_csum(sb); 1320 } 1321 1322 1323 static struct super_type super_types[] = { 1324 [0] = 
{ 1325 .name = "0.90.0", 1326 .owner = THIS_MODULE, 1327 .load_super = super_90_load, 1328 .validate_super = super_90_validate, 1329 .sync_super = super_90_sync, 1330 }, 1331 [1] = { 1332 .name = "md-1", 1333 .owner = THIS_MODULE, 1334 .load_super = super_1_load, 1335 .validate_super = super_1_validate, 1336 .sync_super = super_1_sync, 1337 }, 1338 }; 1339 1340 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) 1341 { 1342 struct list_head *tmp, *tmp2; 1343 mdk_rdev_t *rdev, *rdev2; 1344 1345 ITERATE_RDEV(mddev1,rdev,tmp) 1346 ITERATE_RDEV(mddev2, rdev2, tmp2) 1347 if (rdev->bdev->bd_contains == 1348 rdev2->bdev->bd_contains) 1349 return 1; 1350 1351 return 0; 1352 } 1353 1354 static LIST_HEAD(pending_raid_disks); 1355 1356 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) 1357 { 1358 char b[BDEVNAME_SIZE]; 1359 struct kobject *ko; 1360 char *s; 1361 int err; 1362 1363 if (rdev->mddev) { 1364 MD_BUG(); 1365 return -EINVAL; 1366 } 1367 /* make sure rdev->size exceeds mddev->size */ 1368 if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) { 1369 if (mddev->pers) { 1370 /* Cannot change size, so fail 1371 * If mddev->level <= 0, then we don't care 1372 * about aligning sizes (e.g. linear) 1373 */ 1374 if (mddev->level > 0) 1375 return -ENOSPC; 1376 } else 1377 mddev->size = rdev->size; 1378 } 1379 1380 /* Verify rdev->desc_nr is unique. 1381 * If it is -1, assign a free number, else 1382 * check number is not in use 1383 */ 1384 if (rdev->desc_nr < 0) { 1385 int choice = 0; 1386 if (mddev->pers) choice = mddev->raid_disks; 1387 while (find_rdev_nr(mddev, choice)) 1388 choice++; 1389 rdev->desc_nr = choice; 1390 } else { 1391 if (find_rdev_nr(mddev, rdev->desc_nr)) 1392 return -EBUSY; 1393 } 1394 bdevname(rdev->bdev,b); 1395 if (kobject_set_name(&rdev->kobj, "dev-%s", b) < 0) 1396 return -ENOMEM; 1397 while ( (s=strchr(rdev->kobj.k_name, '/')) != NULL) 1398 *s = '!'; 1399 1400 rdev->mddev = mddev; 1401 printk(KERN_INFO "md: bind<%s>\n", b); 1402 1403 rdev->kobj.parent = &mddev->kobj; 1404 if ((err = kobject_add(&rdev->kobj))) 1405 goto fail; 1406 1407 if (rdev->bdev->bd_part) 1408 ko = &rdev->bdev->bd_part->kobj; 1409 else 1410 ko = &rdev->bdev->bd_disk->kobj; 1411 if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) { 1412 kobject_del(&rdev->kobj); 1413 goto fail; 1414 } 1415 list_add(&rdev->same_set, &mddev->disks); 1416 bd_claim_by_disk(rdev->bdev, rdev, mddev->gendisk); 1417 return 0; 1418 1419 fail: 1420 printk(KERN_WARNING "md: failed to register dev-%s for %s\n", 1421 b, mdname(mddev)); 1422 return err; 1423 } 1424 1425 static void delayed_delete(struct work_struct *ws) 1426 { 1427 mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work); 1428 kobject_del(&rdev->kobj); 1429 } 1430 1431 static void unbind_rdev_from_array(mdk_rdev_t * rdev) 1432 { 1433 char b[BDEVNAME_SIZE]; 1434 if (!rdev->mddev) { 1435 MD_BUG(); 1436 return; 1437 } 1438 bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); 1439 list_del_init(&rdev->same_set); 1440 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 1441 rdev->mddev = NULL; 1442 sysfs_remove_link(&rdev->kobj, "block"); 1443 1444 /* We need to delay this, otherwise we can deadlock when 1445 * writing to 'remove' to "dev/state" 1446 */ 1447 INIT_WORK(&rdev->del_work, delayed_delete); 1448 schedule_work(&rdev->del_work); 1449 } 1450 1451 /* 1452 * prevent the device from being mounted, repartitioned or 1453 * otherwise reused by a RAID array (or any other kernel 1454 * subsystem), by bd_claiming the 
device. 1455 */ 1456 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) 1457 { 1458 int err = 0; 1459 struct block_device *bdev; 1460 char b[BDEVNAME_SIZE]; 1461 1462 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 1463 if (IS_ERR(bdev)) { 1464 printk(KERN_ERR "md: could not open %s.\n", 1465 __bdevname(dev, b)); 1466 return PTR_ERR(bdev); 1467 } 1468 err = bd_claim(bdev, rdev); 1469 if (err) { 1470 printk(KERN_ERR "md: could not bd_claim %s.\n", 1471 bdevname(bdev, b)); 1472 blkdev_put(bdev); 1473 return err; 1474 } 1475 rdev->bdev = bdev; 1476 return err; 1477 } 1478 1479 static void unlock_rdev(mdk_rdev_t *rdev) 1480 { 1481 struct block_device *bdev = rdev->bdev; 1482 rdev->bdev = NULL; 1483 if (!bdev) 1484 MD_BUG(); 1485 bd_release(bdev); 1486 blkdev_put(bdev); 1487 } 1488 1489 void md_autodetect_dev(dev_t dev); 1490 1491 static void export_rdev(mdk_rdev_t * rdev) 1492 { 1493 char b[BDEVNAME_SIZE]; 1494 printk(KERN_INFO "md: export_rdev(%s)\n", 1495 bdevname(rdev->bdev,b)); 1496 if (rdev->mddev) 1497 MD_BUG(); 1498 free_disk_sb(rdev); 1499 list_del_init(&rdev->same_set); 1500 #ifndef MODULE 1501 md_autodetect_dev(rdev->bdev->bd_dev); 1502 #endif 1503 unlock_rdev(rdev); 1504 kobject_put(&rdev->kobj); 1505 } 1506 1507 static void kick_rdev_from_array(mdk_rdev_t * rdev) 1508 { 1509 unbind_rdev_from_array(rdev); 1510 export_rdev(rdev); 1511 } 1512 1513 static void export_array(mddev_t *mddev) 1514 { 1515 struct list_head *tmp; 1516 mdk_rdev_t *rdev; 1517 1518 ITERATE_RDEV(mddev,rdev,tmp) { 1519 if (!rdev->mddev) { 1520 MD_BUG(); 1521 continue; 1522 } 1523 kick_rdev_from_array(rdev); 1524 } 1525 if (!list_empty(&mddev->disks)) 1526 MD_BUG(); 1527 mddev->raid_disks = 0; 1528 mddev->major_version = 0; 1529 } 1530 1531 static void print_desc(mdp_disk_t *desc) 1532 { 1533 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, 1534 desc->major,desc->minor,desc->raid_disk,desc->state); 1535 } 1536 1537 static void print_sb(mdp_super_t *sb) 1538 { 1539 int i; 1540 1541 printk(KERN_INFO 1542 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", 1543 sb->major_version, sb->minor_version, sb->patch_version, 1544 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, 1545 sb->ctime); 1546 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", 1547 sb->level, sb->size, sb->nr_disks, sb->raid_disks, 1548 sb->md_minor, sb->layout, sb->chunk_size); 1549 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" 1550 " FD:%d SD:%d CSUM:%08x E:%08lx\n", 1551 sb->utime, sb->state, sb->active_disks, sb->working_disks, 1552 sb->failed_disks, sb->spare_disks, 1553 sb->sb_csum, (unsigned long)sb->events_lo); 1554 1555 printk(KERN_INFO); 1556 for (i = 0; i < MD_SB_DISKS; i++) { 1557 mdp_disk_t *desc; 1558 1559 desc = sb->disks + i; 1560 if (desc->number || desc->major || desc->minor || 1561 desc->raid_disk || (desc->state && (desc->state != 4))) { 1562 printk(" D %2d: ", i); 1563 print_desc(desc); 1564 } 1565 } 1566 printk(KERN_INFO "md: THIS: "); 1567 print_desc(&sb->this_disk); 1568 1569 } 1570 1571 static void print_rdev(mdk_rdev_t *rdev) 1572 { 1573 char b[BDEVNAME_SIZE]; 1574 printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", 1575 bdevname(rdev->bdev,b), (unsigned long long)rdev->size, 1576 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), 1577 rdev->desc_nr); 1578 if (rdev->sb_loaded) { 1579 printk(KERN_INFO "md: rdev superblock:\n"); 1580 print_sb((mdp_super_t*)page_address(rdev->sb_page)); 1581 } else 1582 printk(KERN_INFO "md: no rdev superblock!\n"); 1583 } 1584 
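
/*
 * Dump the state of every array and every component device.  This is
 * what the MD_BUG() macro near the top of the file calls after printing
 * the offending file and line, so the full RAID state lands in the log
 * next to the bug report.
 */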
1585 static void md_print_devices(void) 1586 { 1587 struct list_head *tmp, *tmp2; 1588 mdk_rdev_t *rdev; 1589 mddev_t *mddev; 1590 char b[BDEVNAME_SIZE]; 1591 1592 printk("\n"); 1593 printk("md: **********************************\n"); 1594 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); 1595 printk("md: **********************************\n"); 1596 ITERATE_MDDEV(mddev,tmp) { 1597 1598 if (mddev->bitmap) 1599 bitmap_print_sb(mddev->bitmap); 1600 else 1601 printk("%s: ", mdname(mddev)); 1602 ITERATE_RDEV(mddev,rdev,tmp2) 1603 printk("<%s>", bdevname(rdev->bdev,b)); 1604 printk("\n"); 1605 1606 ITERATE_RDEV(mddev,rdev,tmp2) 1607 print_rdev(rdev); 1608 } 1609 printk("md: **********************************\n"); 1610 printk("\n"); 1611 } 1612 1613 1614 static void sync_sbs(mddev_t * mddev, int nospares) 1615 { 1616 /* Update each superblock (in-memory image), but 1617 * if we are allowed to, skip spares which already 1618 * have the right event counter, or have one earlier 1619 * (which would mean they aren't being marked as dirty 1620 * with the rest of the array) 1621 */ 1622 mdk_rdev_t *rdev; 1623 struct list_head *tmp; 1624 1625 ITERATE_RDEV(mddev,rdev,tmp) { 1626 if (rdev->sb_events == mddev->events || 1627 (nospares && 1628 rdev->raid_disk < 0 && 1629 (rdev->sb_events&1)==0 && 1630 rdev->sb_events+1 == mddev->events)) { 1631 /* Don't update this superblock */ 1632 rdev->sb_loaded = 2; 1633 } else { 1634 super_types[mddev->major_version]. 1635 sync_super(mddev, rdev); 1636 rdev->sb_loaded = 1; 1637 } 1638 } 1639 } 1640 1641 static void md_update_sb(mddev_t * mddev, int force_change) 1642 { 1643 int err; 1644 struct list_head *tmp; 1645 mdk_rdev_t *rdev; 1646 int sync_req; 1647 int nospares = 0; 1648 1649 repeat: 1650 spin_lock_irq(&mddev->write_lock); 1651 1652 set_bit(MD_CHANGE_PENDING, &mddev->flags); 1653 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) 1654 force_change = 1; 1655 if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) 1656 /* just a clean<-> dirty transition, possibly leave spares alone, 1657 * though if events isn't the right even/odd, we will have to do 1658 * spares after all 1659 */ 1660 nospares = 1; 1661 if (force_change) 1662 nospares = 0; 1663 if (mddev->degraded) 1664 /* If the array is degraded, then skipping spares is both 1665 * dangerous and fairly pointless. 1666 * Dangerous because a device that was removed from the array 1667 * might have a event_count that still looks up-to-date, 1668 * so it can be re-added without a resync. 1669 * Pointless because if there are any spares to skip, 1670 * then a recovery will happen and soon that array won't 1671 * be degraded any more and the spare can go back to sleep then. 1672 */ 1673 nospares = 0; 1674 1675 sync_req = mddev->in_sync; 1676 mddev->utime = get_seconds(); 1677 1678 /* If this is just a dirty<->clean transition, and the array is clean 1679 * and 'events' is odd, we can roll back to the previous clean state */ 1680 if (nospares 1681 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 1682 && (mddev->events & 1) 1683 && mddev->events != 1) 1684 mddev->events--; 1685 else { 1686 /* otherwise we have to go forward and ... */ 1687 mddev->events ++; 1688 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ 1689 /* .. 
if the array isn't clean, insist on an odd 'events' */
			if ((mddev->events&1)==0) {
				mddev->events++;
				nospares = 0;
			}
		} else {
			/* otherwise insist on an even 'events' (for clean states) */
			if ((mddev->events&1)) {
				mddev->events++;
				nospares = 0;
			}
		}
	}

	if (!mddev->events) {
		/*
		 * oops, this 64-bit counter should never wrap.
		 * Either we are in around ~1 trillion A.C., assuming
		 * 1 reboot per second, or we have a bug:
		 */
		MD_BUG();
		mddev->events --;
	}
	sync_sbs(mddev, nospares);

	/*
	 * do not write anything to disk if using
	 * nonpersistent superblocks
	 */
	if (!mddev->persistent) {
		clear_bit(MD_CHANGE_PENDING, &mddev->flags);
		spin_unlock_irq(&mddev->write_lock);
		wake_up(&mddev->sb_wait);
		return;
	}
	spin_unlock_irq(&mddev->write_lock);

	dprintk(KERN_INFO
		"md: updating %s RAID superblock on device (in sync %d)\n",
		mdname(mddev),mddev->in_sync);

	err = bitmap_update_sb(mddev->bitmap);
	ITERATE_RDEV(mddev,rdev,tmp) {
		char b[BDEVNAME_SIZE];
		dprintk(KERN_INFO "md: ");
		if (rdev->sb_loaded != 1)
			continue; /* no noise on spare devices */
		if (test_bit(Faulty, &rdev->flags))
			dprintk("(skipping faulty ");

		dprintk("%s ", bdevname(rdev->bdev,b));
		if (!test_bit(Faulty, &rdev->flags)) {
			md_super_write(mddev,rdev,
				       rdev->sb_offset<<1, rdev->sb_size,
				       rdev->sb_page);
			dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
				bdevname(rdev->bdev,b),
				(unsigned long long)rdev->sb_offset);
			rdev->sb_events = mddev->events;

		} else
			dprintk(")\n");
		if (mddev->level == LEVEL_MULTIPATH)
			/* only need to write one superblock... */
			break;
	}
	md_super_wait(mddev);
	/* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */

	spin_lock_irq(&mddev->write_lock);
	if (mddev->in_sync != sync_req ||
	    test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
		/* have to write it out again */
		spin_unlock_irq(&mddev->write_lock);
		goto repeat;
	}
	clear_bit(MD_CHANGE_PENDING, &mddev->flags);
	spin_unlock_irq(&mddev->write_lock);
	wake_up(&mddev->sb_wait);

}

/* words written to sysfs files may, or may not, be \n terminated.
 * We want to accept either case, with or without a trailing newline.
 * For this we use cmd_match.
 */
static int cmd_match(const char *cmd, const char *str)
{
	/* See if cmd, written into a sysfs file, matches
	 * str.  They must either be the same, or cmd can
	 * have a trailing newline
	 */
	while (*cmd && *str && *cmd == *str) {
		cmd++;
		str++;
	}
	if (*cmd == '\n')
		cmd++;
	if (*str || *cmd)
		return 0;
	return 1;
}

struct rdev_sysfs_entry {
	struct attribute attr;
	ssize_t (*show)(mdk_rdev_t *, char *);
	ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
};

static ssize_t
state_show(mdk_rdev_t *rdev, char *page)
{
	char *sep = "";
	int len=0;

	if (test_bit(Faulty, &rdev->flags)) {
		len+= sprintf(page+len, "%sfaulty",sep);
		sep = ",";
	}
	if (test_bit(In_sync, &rdev->flags)) {
		len += sprintf(page+len, "%sin_sync",sep);
		sep = ",";
	}
	if (test_bit(WriteMostly, &rdev->flags)) {
		len += sprintf(page+len, "%swrite_mostly",sep);
		sep = ",";
	}
	if (!test_bit(Faulty, &rdev->flags) &&
	    !test_bit(In_sync, &rdev->flags)) {
		len += sprintf(page+len, "%sspare", sep);
		sep = ",";
	}
	return len+sprintf(page+len, "\n");
}

static ssize_t
state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
{
	/* can write
	 *  faulty  - simulates an error
	 *  remove  - disconnects the device
	 *  writemostly - sets write_mostly
	 *  -writemostly - clears write_mostly
	 */
	int err = -EINVAL;
	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
		md_error(rdev->mddev, rdev);
		err = 0;
	} else if (cmd_match(buf, "remove")) {
		if (rdev->raid_disk >= 0)
			err = -EBUSY;
		else {
			mddev_t *mddev = rdev->mddev;
			kick_rdev_from_array(rdev);
			if (mddev->pers)
				md_update_sb(mddev, 1);
			md_new_event(mddev);
			err = 0;
		}
	} else if (cmd_match(buf, "writemostly")) {
		set_bit(WriteMostly, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "-writemostly")) {
		clear_bit(WriteMostly, &rdev->flags);
		err = 0;
	}
	return err ?
err : len; 1855 } 1856 static struct rdev_sysfs_entry rdev_state = 1857 __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store); 1858 1859 static ssize_t 1860 super_show(mdk_rdev_t *rdev, char *page) 1861 { 1862 if (rdev->sb_loaded && rdev->sb_size) { 1863 memcpy(page, page_address(rdev->sb_page), rdev->sb_size); 1864 return rdev->sb_size; 1865 } else 1866 return 0; 1867 } 1868 static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super); 1869 1870 static ssize_t 1871 errors_show(mdk_rdev_t *rdev, char *page) 1872 { 1873 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 1874 } 1875 1876 static ssize_t 1877 errors_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1878 { 1879 char *e; 1880 unsigned long n = simple_strtoul(buf, &e, 10); 1881 if (*buf && (*e == 0 || *e == '\n')) { 1882 atomic_set(&rdev->corrected_errors, n); 1883 return len; 1884 } 1885 return -EINVAL; 1886 } 1887 static struct rdev_sysfs_entry rdev_errors = 1888 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 1889 1890 static ssize_t 1891 slot_show(mdk_rdev_t *rdev, char *page) 1892 { 1893 if (rdev->raid_disk < 0) 1894 return sprintf(page, "none\n"); 1895 else 1896 return sprintf(page, "%d\n", rdev->raid_disk); 1897 } 1898 1899 static ssize_t 1900 slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1901 { 1902 char *e; 1903 int slot = simple_strtoul(buf, &e, 10); 1904 if (strncmp(buf, "none", 4)==0) 1905 slot = -1; 1906 else if (e==buf || (*e && *e!= '\n')) 1907 return -EINVAL; 1908 if (rdev->mddev->pers) 1909 /* Cannot set slot in active array (yet) */ 1910 return -EBUSY; 1911 if (slot >= rdev->mddev->raid_disks) 1912 return -ENOSPC; 1913 rdev->raid_disk = slot; 1914 /* assume it is working */ 1915 rdev->flags = 0; 1916 set_bit(In_sync, &rdev->flags); 1917 return len; 1918 } 1919 1920 1921 static struct rdev_sysfs_entry rdev_slot = 1922 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 1923 1924 static ssize_t 1925 offset_show(mdk_rdev_t *rdev, char *page) 1926 { 1927 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 1928 } 1929 1930 static ssize_t 1931 offset_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1932 { 1933 char *e; 1934 unsigned long long offset = simple_strtoull(buf, &e, 10); 1935 if (e==buf || (*e && *e != '\n')) 1936 return -EINVAL; 1937 if (rdev->mddev->pers) 1938 return -EBUSY; 1939 rdev->data_offset = offset; 1940 return len; 1941 } 1942 1943 static struct rdev_sysfs_entry rdev_offset = 1944 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 1945 1946 static ssize_t 1947 rdev_size_show(mdk_rdev_t *rdev, char *page) 1948 { 1949 return sprintf(page, "%llu\n", (unsigned long long)rdev->size); 1950 } 1951 1952 static ssize_t 1953 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1954 { 1955 char *e; 1956 unsigned long long size = simple_strtoull(buf, &e, 10); 1957 if (e==buf || (*e && *e != '\n')) 1958 return -EINVAL; 1959 if (rdev->mddev->pers) 1960 return -EBUSY; 1961 rdev->size = size; 1962 if (size < rdev->mddev->size || rdev->mddev->size == 0) 1963 rdev->mddev->size = size; 1964 return len; 1965 } 1966 1967 static struct rdev_sysfs_entry rdev_size = 1968 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 1969 1970 static struct attribute *rdev_default_attrs[] = { 1971 &rdev_state.attr, 1972 &rdev_super.attr, 1973 &rdev_errors.attr, 1974 &rdev_slot.attr, 1975 &rdev_offset.attr, 1976 &rdev_size.attr, 1977 NULL, 1978 }; 1979 static ssize_t 1980 rdev_attr_show(struct kobject *kobj, struct attribute *attr, 
char *page) 1981 { 1982 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 1983 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 1984 1985 if (!entry->show) 1986 return -EIO; 1987 return entry->show(rdev, page); 1988 } 1989 1990 static ssize_t 1991 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 1992 const char *page, size_t length) 1993 { 1994 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 1995 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 1996 1997 if (!entry->store) 1998 return -EIO; 1999 if (!capable(CAP_SYS_ADMIN)) 2000 return -EACCES; 2001 return entry->store(rdev, page, length); 2002 } 2003 2004 static void rdev_free(struct kobject *ko) 2005 { 2006 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); 2007 kfree(rdev); 2008 } 2009 static struct sysfs_ops rdev_sysfs_ops = { 2010 .show = rdev_attr_show, 2011 .store = rdev_attr_store, 2012 }; 2013 static struct kobj_type rdev_ktype = { 2014 .release = rdev_free, 2015 .sysfs_ops = &rdev_sysfs_ops, 2016 .default_attrs = rdev_default_attrs, 2017 }; 2018 2019 /* 2020 * Import a device. If 'super_format' >= 0, then sanity check the superblock 2021 * 2022 * mark the device faulty if: 2023 * 2024 * - the device is nonexistent (zero size) 2025 * - the device has no valid superblock 2026 * 2027 * a faulty rdev _never_ has rdev->sb set. 2028 */ 2029 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) 2030 { 2031 char b[BDEVNAME_SIZE]; 2032 int err; 2033 mdk_rdev_t *rdev; 2034 sector_t size; 2035 2036 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 2037 if (!rdev) { 2038 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 2039 return ERR_PTR(-ENOMEM); 2040 } 2041 2042 if ((err = alloc_disk_sb(rdev))) 2043 goto abort_free; 2044 2045 err = lock_rdev(rdev, newdev); 2046 if (err) 2047 goto abort_free; 2048 2049 rdev->kobj.parent = NULL; 2050 rdev->kobj.ktype = &rdev_ktype; 2051 kobject_init(&rdev->kobj); 2052 2053 rdev->desc_nr = -1; 2054 rdev->saved_raid_disk = -1; 2055 rdev->raid_disk = -1; 2056 rdev->flags = 0; 2057 rdev->data_offset = 0; 2058 rdev->sb_events = 0; 2059 atomic_set(&rdev->nr_pending, 0); 2060 atomic_set(&rdev->read_errors, 0); 2061 atomic_set(&rdev->corrected_errors, 0); 2062 2063 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2064 if (!size) { 2065 printk(KERN_WARNING 2066 "md: %s has zero or unknown size, marking faulty!\n", 2067 bdevname(rdev->bdev,b)); 2068 err = -EINVAL; 2069 goto abort_free; 2070 } 2071 2072 if (super_format >= 0) { 2073 err = super_types[super_format]. 
2074 load_super(rdev, NULL, super_minor); 2075 if (err == -EINVAL) { 2076 printk(KERN_WARNING 2077 "md: %s has invalid sb, not importing!\n", 2078 bdevname(rdev->bdev,b)); 2079 goto abort_free; 2080 } 2081 if (err < 0) { 2082 printk(KERN_WARNING 2083 "md: could not read %s's sb, not importing!\n", 2084 bdevname(rdev->bdev,b)); 2085 goto abort_free; 2086 } 2087 } 2088 INIT_LIST_HEAD(&rdev->same_set); 2089 2090 return rdev; 2091 2092 abort_free: 2093 if (rdev->sb_page) { 2094 if (rdev->bdev) 2095 unlock_rdev(rdev); 2096 free_disk_sb(rdev); 2097 } 2098 kfree(rdev); 2099 return ERR_PTR(err); 2100 } 2101 2102 /* 2103 * Check a full RAID array for plausibility 2104 */ 2105 2106 2107 static void analyze_sbs(mddev_t * mddev) 2108 { 2109 int i; 2110 struct list_head *tmp; 2111 mdk_rdev_t *rdev, *freshest; 2112 char b[BDEVNAME_SIZE]; 2113 2114 freshest = NULL; 2115 ITERATE_RDEV(mddev,rdev,tmp) 2116 switch (super_types[mddev->major_version]. 2117 load_super(rdev, freshest, mddev->minor_version)) { 2118 case 1: 2119 freshest = rdev; 2120 break; 2121 case 0: 2122 break; 2123 default: 2124 printk( KERN_ERR \ 2125 "md: fatal superblock inconsistency in %s" 2126 " -- removing from array\n", 2127 bdevname(rdev->bdev,b)); 2128 kick_rdev_from_array(rdev); 2129 } 2130 2131 2132 super_types[mddev->major_version]. 2133 validate_super(mddev, freshest); 2134 2135 i = 0; 2136 ITERATE_RDEV(mddev,rdev,tmp) { 2137 if (rdev != freshest) 2138 if (super_types[mddev->major_version]. 2139 validate_super(mddev, rdev)) { 2140 printk(KERN_WARNING "md: kicking non-fresh %s" 2141 " from array!\n", 2142 bdevname(rdev->bdev,b)); 2143 kick_rdev_from_array(rdev); 2144 continue; 2145 } 2146 if (mddev->level == LEVEL_MULTIPATH) { 2147 rdev->desc_nr = i++; 2148 rdev->raid_disk = rdev->desc_nr; 2149 set_bit(In_sync, &rdev->flags); 2150 } else if (rdev->raid_disk >= mddev->raid_disks) { 2151 rdev->raid_disk = -1; 2152 clear_bit(In_sync, &rdev->flags); 2153 } 2154 } 2155 2156 2157 2158 if (mddev->recovery_cp != MaxSector && 2159 mddev->level >= 1) 2160 printk(KERN_ERR "md: %s: raid array is not clean" 2161 " -- starting background reconstruction\n", 2162 mdname(mddev)); 2163 2164 } 2165 2166 static ssize_t 2167 safe_delay_show(mddev_t *mddev, char *page) 2168 { 2169 int msec = (mddev->safemode_delay*1000)/HZ; 2170 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); 2171 } 2172 static ssize_t 2173 safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) 2174 { 2175 int scale=1; 2176 int dot=0; 2177 int i; 2178 unsigned long msec; 2179 char buf[30]; 2180 char *e; 2181 /* remove a period, and count digits after it */ 2182 if (len >= sizeof(buf)) 2183 return -EINVAL; 2184 strlcpy(buf, cbuf, len); 2185 buf[len] = 0; 2186 for (i=0; i<len; i++) { 2187 if (dot) { 2188 if (isdigit(buf[i])) { 2189 buf[i-1] = buf[i]; 2190 scale *= 10; 2191 } 2192 buf[i] = 0; 2193 } else if (buf[i] == '.') { 2194 dot=1; 2195 buf[i] = 0; 2196 } 2197 } 2198 msec = simple_strtoul(buf, &e, 10); 2199 if (e == buf || (*e && *e != '\n')) 2200 return -EINVAL; 2201 msec = (msec * 1000) / scale; 2202 if (msec == 0) 2203 mddev->safemode_delay = 0; 2204 else { 2205 mddev->safemode_delay = (msec*HZ)/1000; 2206 if (mddev->safemode_delay == 0) 2207 mddev->safemode_delay = 1; 2208 } 2209 return len; 2210 } 2211 static struct md_sysfs_entry md_safe_delay = 2212 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 2213 2214 static ssize_t 2215 level_show(mddev_t *mddev, char *page) 2216 { 2217 struct mdk_personality *p = mddev->pers; 2218 if (p) 
2219 return sprintf(page, "%s\n", p->name); 2220 else if (mddev->clevel[0]) 2221 return sprintf(page, "%s\n", mddev->clevel); 2222 else if (mddev->level != LEVEL_NONE) 2223 return sprintf(page, "%d\n", mddev->level); 2224 else 2225 return 0; 2226 } 2227 2228 static ssize_t 2229 level_store(mddev_t *mddev, const char *buf, size_t len) 2230 { 2231 int rv = len; 2232 if (mddev->pers) 2233 return -EBUSY; 2234 if (len == 0) 2235 return 0; 2236 if (len >= sizeof(mddev->clevel)) 2237 return -ENOSPC; 2238 strncpy(mddev->clevel, buf, len); 2239 if (mddev->clevel[len-1] == '\n') 2240 len--; 2241 mddev->clevel[len] = 0; 2242 mddev->level = LEVEL_NONE; 2243 return rv; 2244 } 2245 2246 static struct md_sysfs_entry md_level = 2247 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 2248 2249 2250 static ssize_t 2251 layout_show(mddev_t *mddev, char *page) 2252 { 2253 /* just a number, not meaningful for all levels */ 2254 if (mddev->reshape_position != MaxSector && 2255 mddev->layout != mddev->new_layout) 2256 return sprintf(page, "%d (%d)\n", 2257 mddev->new_layout, mddev->layout); 2258 return sprintf(page, "%d\n", mddev->layout); 2259 } 2260 2261 static ssize_t 2262 layout_store(mddev_t *mddev, const char *buf, size_t len) 2263 { 2264 char *e; 2265 unsigned long n = simple_strtoul(buf, &e, 10); 2266 2267 if (!*buf || (*e && *e != '\n')) 2268 return -EINVAL; 2269 2270 if (mddev->pers) 2271 return -EBUSY; 2272 if (mddev->reshape_position != MaxSector) 2273 mddev->new_layout = n; 2274 else 2275 mddev->layout = n; 2276 return len; 2277 } 2278 static struct md_sysfs_entry md_layout = 2279 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 2280 2281 2282 static ssize_t 2283 raid_disks_show(mddev_t *mddev, char *page) 2284 { 2285 if (mddev->raid_disks == 0) 2286 return 0; 2287 if (mddev->reshape_position != MaxSector && 2288 mddev->delta_disks != 0) 2289 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 2290 mddev->raid_disks - mddev->delta_disks); 2291 return sprintf(page, "%d\n", mddev->raid_disks); 2292 } 2293 2294 static int update_raid_disks(mddev_t *mddev, int raid_disks); 2295 2296 static ssize_t 2297 raid_disks_store(mddev_t *mddev, const char *buf, size_t len) 2298 { 2299 char *e; 2300 int rv = 0; 2301 unsigned long n = simple_strtoul(buf, &e, 10); 2302 2303 if (!*buf || (*e && *e != '\n')) 2304 return -EINVAL; 2305 2306 if (mddev->pers) 2307 rv = update_raid_disks(mddev, n); 2308 else if (mddev->reshape_position != MaxSector) { 2309 int olddisks = mddev->raid_disks - mddev->delta_disks; 2310 mddev->delta_disks = n - olddisks; 2311 mddev->raid_disks = n; 2312 } else 2313 mddev->raid_disks = n; 2314 return rv ? 
rv : len; 2315 } 2316 static struct md_sysfs_entry md_raid_disks = 2317 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 2318 2319 static ssize_t 2320 chunk_size_show(mddev_t *mddev, char *page) 2321 { 2322 if (mddev->reshape_position != MaxSector && 2323 mddev->chunk_size != mddev->new_chunk) 2324 return sprintf(page, "%d (%d)\n", mddev->new_chunk, 2325 mddev->chunk_size); 2326 return sprintf(page, "%d\n", mddev->chunk_size); 2327 } 2328 2329 static ssize_t 2330 chunk_size_store(mddev_t *mddev, const char *buf, size_t len) 2331 { 2332 /* can only set chunk_size if array is not yet active */ 2333 char *e; 2334 unsigned long n = simple_strtoul(buf, &e, 10); 2335 2336 if (!*buf || (*e && *e != '\n')) 2337 return -EINVAL; 2338 2339 if (mddev->pers) 2340 return -EBUSY; 2341 else if (mddev->reshape_position != MaxSector) 2342 mddev->new_chunk = n; 2343 else 2344 mddev->chunk_size = n; 2345 return len; 2346 } 2347 static struct md_sysfs_entry md_chunk_size = 2348 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 2349 2350 static ssize_t 2351 resync_start_show(mddev_t *mddev, char *page) 2352 { 2353 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 2354 } 2355 2356 static ssize_t 2357 resync_start_store(mddev_t *mddev, const char *buf, size_t len) 2358 { 2359 /* can only set resync_start if array is not yet active */ 2360 char *e; 2361 unsigned long long n = simple_strtoull(buf, &e, 10); 2362 2363 if (mddev->pers) 2364 return -EBUSY; 2365 if (!*buf || (*e && *e != '\n')) 2366 return -EINVAL; 2367 2368 mddev->recovery_cp = n; 2369 return len; 2370 } 2371 static struct md_sysfs_entry md_resync_start = 2372 __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); 2373 2374 /* 2375 * The array state can be: 2376 * 2377 * clear 2378 * No devices, no size, no level 2379 * Equivalent to STOP_ARRAY ioctl 2380 * inactive 2381 * May have some settings, but array is not active 2382 * all IO results in error 2383 * When written, doesn't tear down array, but just stops it 2384 * suspended (not supported yet) 2385 * All IO requests will block. The array can be reconfigured. 2386 * Writing this, if accepted, will block until array is quiescent 2387 * readonly 2388 * no resync can happen. no superblocks get written. 2389 * write requests fail 2390 * read-auto 2391 * like readonly, but behaves like 'clean' on a write request. 2392 * 2393 * clean - no pending writes, but otherwise active. 2394 * When written to inactive array, starts without resync 2395 * If a write request arrives then 2396 * if metadata is known, mark 'dirty' and switch to 'active'. 2397 * if not known, block and switch to write-pending 2398 * If written to an active array that has pending writes, then fails. 2399 * active 2400 * fully active: IO and resync can be happening. 2401 * When written to inactive array, starts with resync 2402 * 2403 * write-pending 2404 * clean, but writes are blocked waiting for 'active' to be written. 2405 * 2406 * active-idle 2407 * like active, but no writes have been seen for a while (100msec).
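 *
 * Reading 'array_state' returns one of the words above; writing one of the
 * settable words (e.g. 'clean', 'active', 'readonly') requests that
 * transition, as handled by array_state_store() below. Informal usage
 * example, assuming the usual sysfs layout registered under the array's
 * gendisk and an array named md0:
 *   echo readonly > /sys/block/md0/md/array_state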
2408 * 2409 */ 2410 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 2411 write_pending, active_idle, bad_word}; 2412 static char *array_states[] = { 2413 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 2414 "write-pending", "active-idle", NULL }; 2415 2416 static int match_word(const char *word, char **list) 2417 { 2418 int n; 2419 for (n=0; list[n]; n++) 2420 if (cmd_match(word, list[n])) 2421 break; 2422 return n; 2423 } 2424 2425 static ssize_t 2426 array_state_show(mddev_t *mddev, char *page) 2427 { 2428 enum array_state st = inactive; 2429 2430 if (mddev->pers) 2431 switch(mddev->ro) { 2432 case 1: 2433 st = readonly; 2434 break; 2435 case 2: 2436 st = read_auto; 2437 break; 2438 case 0: 2439 if (mddev->in_sync) 2440 st = clean; 2441 else if (mddev->safemode) 2442 st = active_idle; 2443 else 2444 st = active; 2445 } 2446 else { 2447 if (list_empty(&mddev->disks) && 2448 mddev->raid_disks == 0 && 2449 mddev->size == 0) 2450 st = clear; 2451 else 2452 st = inactive; 2453 } 2454 return sprintf(page, "%s\n", array_states[st]); 2455 } 2456 2457 static int do_md_stop(mddev_t * mddev, int ro); 2458 static int do_md_run(mddev_t * mddev); 2459 static int restart_array(mddev_t *mddev); 2460 2461 static ssize_t 2462 array_state_store(mddev_t *mddev, const char *buf, size_t len) 2463 { 2464 int err = -EINVAL; 2465 enum array_state st = match_word(buf, array_states); 2466 switch(st) { 2467 case bad_word: 2468 break; 2469 case clear: 2470 /* stopping an active array */ 2471 if (mddev->pers) { 2472 if (atomic_read(&mddev->active) > 1) 2473 return -EBUSY; 2474 err = do_md_stop(mddev, 0); 2475 } 2476 break; 2477 case inactive: 2478 /* stopping an active array */ 2479 if (mddev->pers) { 2480 if (atomic_read(&mddev->active) > 1) 2481 return -EBUSY; 2482 err = do_md_stop(mddev, 2); 2483 } 2484 break; 2485 case suspended: 2486 break; /* not supported yet */ 2487 case readonly: 2488 if (mddev->pers) 2489 err = do_md_stop(mddev, 1); 2490 else { 2491 mddev->ro = 1; 2492 err = do_md_run(mddev); 2493 } 2494 break; 2495 case read_auto: 2496 /* stopping an active array */ 2497 if (mddev->pers) { 2498 err = do_md_stop(mddev, 1); 2499 if (err == 0) 2500 mddev->ro = 2; /* FIXME mark devices writable */ 2501 } else { 2502 mddev->ro = 2; 2503 err = do_md_run(mddev); 2504 } 2505 break; 2506 case clean: 2507 if (mddev->pers) { 2508 restart_array(mddev); 2509 spin_lock_irq(&mddev->write_lock); 2510 if (atomic_read(&mddev->writes_pending) == 0) { 2511 mddev->in_sync = 1; 2512 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 2513 } 2514 spin_unlock_irq(&mddev->write_lock); 2515 } else { 2516 mddev->ro = 0; 2517 mddev->recovery_cp = MaxSector; 2518 err = do_md_run(mddev); 2519 } 2520 break; 2521 case active: 2522 if (mddev->pers) { 2523 restart_array(mddev); 2524 clear_bit(MD_CHANGE_CLEAN, &mddev->flags); 2525 wake_up(&mddev->sb_wait); 2526 err = 0; 2527 } else { 2528 mddev->ro = 0; 2529 err = do_md_run(mddev); 2530 } 2531 break; 2532 case write_pending: 2533 case active_idle: 2534 /* these cannot be set */ 2535 break; 2536 } 2537 if (err) 2538 return err; 2539 else 2540 return len; 2541 } 2542 static struct md_sysfs_entry md_array_state = 2543 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 2544 2545 static ssize_t 2546 null_show(mddev_t *mddev, char *page) 2547 { 2548 return -EINVAL; 2549 } 2550 2551 static ssize_t 2552 new_dev_store(mddev_t *mddev, const char *buf, size_t len) 2553 { 2554 /* buf must be %d:%d\n? 
giving major and minor numbers */ 2555 /* The new device is added to the array. 2556 * If the array has a persistent superblock, we read the 2557 * superblock to initialise info and check validity. 2558 * Otherwise, the only checking done is that in bind_rdev_to_array, 2559 * which mainly checks size. 2560 */ 2561 char *e; 2562 int major = simple_strtoul(buf, &e, 10); 2563 int minor; 2564 dev_t dev; 2565 mdk_rdev_t *rdev; 2566 int err; 2567 2568 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 2569 return -EINVAL; 2570 minor = simple_strtoul(e+1, &e, 10); 2571 if (*e && *e != '\n') 2572 return -EINVAL; 2573 dev = MKDEV(major, minor); 2574 if (major != MAJOR(dev) || 2575 minor != MINOR(dev)) 2576 return -EOVERFLOW; 2577 2578 2579 if (mddev->persistent) { 2580 rdev = md_import_device(dev, mddev->major_version, 2581 mddev->minor_version); 2582 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 2583 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 2584 mdk_rdev_t, same_set); 2585 err = super_types[mddev->major_version] 2586 .load_super(rdev, rdev0, mddev->minor_version); 2587 if (err < 0) 2588 goto out; 2589 } 2590 } else 2591 rdev = md_import_device(dev, -1, -1); 2592 2593 if (IS_ERR(rdev)) 2594 return PTR_ERR(rdev); 2595 err = bind_rdev_to_array(rdev, mddev); 2596 out: 2597 if (err) 2598 export_rdev(rdev); 2599 return err ? err : len; 2600 } 2601 2602 static struct md_sysfs_entry md_new_device = 2603 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 2604 2605 static ssize_t 2606 bitmap_store(mddev_t *mddev, const char *buf, size_t len) 2607 { 2608 char *end; 2609 unsigned long chunk, end_chunk; 2610 2611 if (!mddev->bitmap) 2612 goto out; 2613 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ 2614 while (*buf) { 2615 chunk = end_chunk = simple_strtoul(buf, &end, 0); 2616 if (buf == end) break; 2617 if (*end == '-') { /* range */ 2618 buf = end + 1; 2619 end_chunk = simple_strtoul(buf, &end, 0); 2620 if (buf == end) break; 2621 } 2622 if (*end && !isspace(*end)) break; 2623 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); 2624 buf = end; 2625 while (isspace(*buf)) buf++; 2626 } 2627 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ 2628 out: 2629 return len; 2630 } 2631 2632 static struct md_sysfs_entry md_bitmap = 2633 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 2634 2635 static ssize_t 2636 size_show(mddev_t *mddev, char *page) 2637 { 2638 return sprintf(page, "%llu\n", (unsigned long long)mddev->size); 2639 } 2640 2641 static int update_size(mddev_t *mddev, unsigned long size); 2642 2643 static ssize_t 2644 size_store(mddev_t *mddev, const char *buf, size_t len) 2645 { 2646 /* If array is inactive, we can reduce the component size, but 2647 * not increase it (except from 0). 2648 * If array is active, we can try an on-line resize 2649 */ 2650 char *e; 2651 int err = 0; 2652 unsigned long long size = simple_strtoull(buf, &e, 10); 2653 if (!*buf || *buf == '\n' || 2654 (*e && *e != '\n')) 2655 return -EINVAL; 2656 2657 if (mddev->pers) { 2658 err = update_size(mddev, size); 2659 md_update_sb(mddev, 1); 2660 } else { 2661 if (mddev->size == 0 || 2662 mddev->size > size) 2663 mddev->size = size; 2664 else 2665 err = -ENOSPC; 2666 } 2667 return err ? err : len; 2668 } 2669 2670 static struct md_sysfs_entry md_size = 2671 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 2672 2673 2674 /* Metadata version.
2675 * This is either 'none' for arrays with externally managed metadata, 2676 * or N.M for internally known formats 2677 */ 2678 static ssize_t 2679 metadata_show(mddev_t *mddev, char *page) 2680 { 2681 if (mddev->persistent) 2682 return sprintf(page, "%d.%d\n", 2683 mddev->major_version, mddev->minor_version); 2684 else 2685 return sprintf(page, "none\n"); 2686 } 2687 2688 static ssize_t 2689 metadata_store(mddev_t *mddev, const char *buf, size_t len) 2690 { 2691 int major, minor; 2692 char *e; 2693 if (!list_empty(&mddev->disks)) 2694 return -EBUSY; 2695 2696 if (cmd_match(buf, "none")) { 2697 mddev->persistent = 0; 2698 mddev->major_version = 0; 2699 mddev->minor_version = 90; 2700 return len; 2701 } 2702 major = simple_strtoul(buf, &e, 10); 2703 if (e==buf || *e != '.') 2704 return -EINVAL; 2705 buf = e+1; 2706 minor = simple_strtoul(buf, &e, 10); 2707 if (e==buf || (*e && *e != '\n') ) 2708 return -EINVAL; 2709 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 2710 return -ENOENT; 2711 mddev->major_version = major; 2712 mddev->minor_version = minor; 2713 mddev->persistent = 1; 2714 return len; 2715 } 2716 2717 static struct md_sysfs_entry md_metadata = 2718 __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 2719 2720 static ssize_t 2721 action_show(mddev_t *mddev, char *page) 2722 { 2723 char *type = "idle"; 2724 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 2725 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) { 2726 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 2727 type = "reshape"; 2728 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 2729 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 2730 type = "resync"; 2731 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 2732 type = "check"; 2733 else 2734 type = "repair"; 2735 } else 2736 type = "recover"; 2737 } 2738 return sprintf(page, "%s\n", type); 2739 } 2740 2741 static ssize_t 2742 action_store(mddev_t *mddev, const char *page, size_t len) 2743 { 2744 if (!mddev->pers || !mddev->pers->sync_request) 2745 return -EINVAL; 2746 2747 if (cmd_match(page, "idle")) { 2748 if (mddev->sync_thread) { 2749 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2750 md_unregister_thread(mddev->sync_thread); 2751 mddev->sync_thread = NULL; 2752 mddev->recovery = 0; 2753 } 2754 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 2755 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 2756 return -EBUSY; 2757 else if (cmd_match(page, "resync") || cmd_match(page, "recover")) 2758 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2759 else if (cmd_match(page, "reshape")) { 2760 int err; 2761 if (mddev->pers->start_reshape == NULL) 2762 return -EINVAL; 2763 err = mddev->pers->start_reshape(mddev); 2764 if (err) 2765 return err; 2766 } else { 2767 if (cmd_match(page, "check")) 2768 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 2769 else if (!cmd_match(page, "repair")) 2770 return -EINVAL; 2771 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 2772 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 2773 } 2774 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2775 md_wakeup_thread(mddev->thread); 2776 return len; 2777 } 2778 2779 static ssize_t 2780 mismatch_cnt_show(mddev_t *mddev, char *page) 2781 { 2782 return sprintf(page, "%llu\n", 2783 (unsigned long long) mddev->resync_mismatches); 2784 } 2785 2786 static struct md_sysfs_entry md_scan_mode = 2787 __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 2788 2789 2790 static struct md_sysfs_entry md_mismatches = 
__ATTR_RO(mismatch_cnt); 2791 2792 static ssize_t 2793 sync_min_show(mddev_t *mddev, char *page) 2794 { 2795 return sprintf(page, "%d (%s)\n", speed_min(mddev), 2796 mddev->sync_speed_min ? "local": "system"); 2797 } 2798 2799 static ssize_t 2800 sync_min_store(mddev_t *mddev, const char *buf, size_t len) 2801 { 2802 int min; 2803 char *e; 2804 if (strncmp(buf, "system", 6)==0) { 2805 mddev->sync_speed_min = 0; 2806 return len; 2807 } 2808 min = simple_strtoul(buf, &e, 10); 2809 if (buf == e || (*e && *e != '\n') || min <= 0) 2810 return -EINVAL; 2811 mddev->sync_speed_min = min; 2812 return len; 2813 } 2814 2815 static struct md_sysfs_entry md_sync_min = 2816 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 2817 2818 static ssize_t 2819 sync_max_show(mddev_t *mddev, char *page) 2820 { 2821 return sprintf(page, "%d (%s)\n", speed_max(mddev), 2822 mddev->sync_speed_max ? "local": "system"); 2823 } 2824 2825 static ssize_t 2826 sync_max_store(mddev_t *mddev, const char *buf, size_t len) 2827 { 2828 int max; 2829 char *e; 2830 if (strncmp(buf, "system", 6)==0) { 2831 mddev->sync_speed_max = 0; 2832 return len; 2833 } 2834 max = simple_strtoul(buf, &e, 10); 2835 if (buf == e || (*e && *e != '\n') || max <= 0) 2836 return -EINVAL; 2837 mddev->sync_speed_max = max; 2838 return len; 2839 } 2840 2841 static struct md_sysfs_entry md_sync_max = 2842 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 2843 2844 2845 static ssize_t 2846 sync_speed_show(mddev_t *mddev, char *page) 2847 { 2848 unsigned long resync, dt, db; 2849 resync = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)); 2850 dt = ((jiffies - mddev->resync_mark) / HZ); 2851 if (!dt) dt++; 2852 db = resync - (mddev->resync_mark_cnt); 2853 return sprintf(page, "%ld\n", db/dt/2); /* K/sec */ 2854 } 2855 2856 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 2857 2858 static ssize_t 2859 sync_completed_show(mddev_t *mddev, char *page) 2860 { 2861 unsigned long max_blocks, resync; 2862 2863 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 2864 max_blocks = mddev->resync_max_sectors; 2865 else 2866 max_blocks = mddev->size << 1; 2867 2868 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active)); 2869 return sprintf(page, "%lu / %lu\n", resync, max_blocks); 2870 } 2871 2872 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); 2873 2874 static ssize_t 2875 suspend_lo_show(mddev_t *mddev, char *page) 2876 { 2877 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 2878 } 2879 2880 static ssize_t 2881 suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) 2882 { 2883 char *e; 2884 unsigned long long new = simple_strtoull(buf, &e, 10); 2885 2886 if (mddev->pers->quiesce == NULL) 2887 return -EINVAL; 2888 if (buf == e || (*e && *e != '\n')) 2889 return -EINVAL; 2890 if (new >= mddev->suspend_hi || 2891 (new > mddev->suspend_lo && new < mddev->suspend_hi)) { 2892 mddev->suspend_lo = new; 2893 mddev->pers->quiesce(mddev, 2); 2894 return len; 2895 } else 2896 return -EINVAL; 2897 } 2898 static struct md_sysfs_entry md_suspend_lo = 2899 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 2900 2901 2902 static ssize_t 2903 suspend_hi_show(mddev_t *mddev, char *page) 2904 { 2905 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); 2906 } 2907 2908 static ssize_t 2909 suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) 2910 { 2911 char *e; 2912 unsigned long long new = 
simple_strtoull(buf, &e, 10); 2913 2914 if (mddev->pers->quiesce == NULL) 2915 return -EINVAL; 2916 if (buf == e || (*e && *e != '\n')) 2917 return -EINVAL; 2918 if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) || 2919 (new > mddev->suspend_lo && new > mddev->suspend_hi)) { 2920 mddev->suspend_hi = new; 2921 mddev->pers->quiesce(mddev, 1); 2922 mddev->pers->quiesce(mddev, 0); 2923 return len; 2924 } else 2925 return -EINVAL; 2926 } 2927 static struct md_sysfs_entry md_suspend_hi = 2928 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 2929 2930 static ssize_t 2931 reshape_position_show(mddev_t *mddev, char *page) 2932 { 2933 if (mddev->reshape_position != MaxSector) 2934 return sprintf(page, "%llu\n", 2935 (unsigned long long)mddev->reshape_position); 2936 strcpy(page, "none\n"); 2937 return 5; 2938 } 2939 2940 static ssize_t 2941 reshape_position_store(mddev_t *mddev, const char *buf, size_t len) 2942 { 2943 char *e; 2944 unsigned long long new = simple_strtoull(buf, &e, 10); 2945 if (mddev->pers) 2946 return -EBUSY; 2947 if (buf == e || (*e && *e != '\n')) 2948 return -EINVAL; 2949 mddev->reshape_position = new; 2950 mddev->delta_disks = 0; 2951 mddev->new_level = mddev->level; 2952 mddev->new_layout = mddev->layout; 2953 mddev->new_chunk = mddev->chunk_size; 2954 return len; 2955 } 2956 2957 static struct md_sysfs_entry md_reshape_position = 2958 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 2959 reshape_position_store); 2960 2961 2962 static struct attribute *md_default_attrs[] = { 2963 &md_level.attr, 2964 &md_layout.attr, 2965 &md_raid_disks.attr, 2966 &md_chunk_size.attr, 2967 &md_size.attr, 2968 &md_resync_start.attr, 2969 &md_metadata.attr, 2970 &md_new_device.attr, 2971 &md_safe_delay.attr, 2972 &md_array_state.attr, 2973 &md_reshape_position.attr, 2974 NULL, 2975 }; 2976 2977 static struct attribute *md_redundancy_attrs[] = { 2978 &md_scan_mode.attr, 2979 &md_mismatches.attr, 2980 &md_sync_min.attr, 2981 &md_sync_max.attr, 2982 &md_sync_speed.attr, 2983 &md_sync_completed.attr, 2984 &md_suspend_lo.attr, 2985 &md_suspend_hi.attr, 2986 &md_bitmap.attr, 2987 NULL, 2988 }; 2989 static struct attribute_group md_redundancy_group = { 2990 .name = NULL, 2991 .attrs = md_redundancy_attrs, 2992 }; 2993 2994 2995 static ssize_t 2996 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 2997 { 2998 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 2999 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 3000 ssize_t rv; 3001 3002 if (!entry->show) 3003 return -EIO; 3004 rv = mddev_lock(mddev); 3005 if (!rv) { 3006 rv = entry->show(mddev, page); 3007 mddev_unlock(mddev); 3008 } 3009 return rv; 3010 } 3011 3012 static ssize_t 3013 md_attr_store(struct kobject *kobj, struct attribute *attr, 3014 const char *page, size_t length) 3015 { 3016 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 3017 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 3018 ssize_t rv; 3019 3020 if (!entry->store) 3021 return -EIO; 3022 if (!capable(CAP_SYS_ADMIN)) 3023 return -EACCES; 3024 rv = mddev_lock(mddev); 3025 if (!rv) { 3026 rv = entry->store(mddev, page, length); 3027 mddev_unlock(mddev); 3028 } 3029 return rv; 3030 } 3031 3032 static void md_free(struct kobject *ko) 3033 { 3034 mddev_t *mddev = container_of(ko, mddev_t, kobj); 3035 kfree(mddev); 3036 } 3037 3038 static struct sysfs_ops md_sysfs_ops = { 3039 .show = md_attr_show, 3040 .store = 
md_attr_store, 3041 }; 3042 static struct kobj_type md_ktype = { 3043 .release = md_free, 3044 .sysfs_ops = &md_sysfs_ops, 3045 .default_attrs = md_default_attrs, 3046 }; 3047 3048 int mdp_major = 0; 3049 3050 static struct kobject *md_probe(dev_t dev, int *part, void *data) 3051 { 3052 static DEFINE_MUTEX(disks_mutex); 3053 mddev_t *mddev = mddev_find(dev); 3054 struct gendisk *disk; 3055 int partitioned = (MAJOR(dev) != MD_MAJOR); 3056 int shift = partitioned ? MdpMinorShift : 0; 3057 int unit = MINOR(dev) >> shift; 3058 3059 if (!mddev) 3060 return NULL; 3061 3062 mutex_lock(&disks_mutex); 3063 if (mddev->gendisk) { 3064 mutex_unlock(&disks_mutex); 3065 mddev_put(mddev); 3066 return NULL; 3067 } 3068 disk = alloc_disk(1 << shift); 3069 if (!disk) { 3070 mutex_unlock(&disks_mutex); 3071 mddev_put(mddev); 3072 return NULL; 3073 } 3074 disk->major = MAJOR(dev); 3075 disk->first_minor = unit << shift; 3076 if (partitioned) 3077 sprintf(disk->disk_name, "md_d%d", unit); 3078 else 3079 sprintf(disk->disk_name, "md%d", unit); 3080 disk->fops = &md_fops; 3081 disk->private_data = mddev; 3082 disk->queue = mddev->queue; 3083 add_disk(disk); 3084 mddev->gendisk = disk; 3085 mutex_unlock(&disks_mutex); 3086 mddev->kobj.parent = &disk->kobj; 3087 mddev->kobj.k_name = NULL; 3088 snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md"); 3089 mddev->kobj.ktype = &md_ktype; 3090 if (kobject_register(&mddev->kobj)) 3091 printk(KERN_WARNING "md: cannot register %s/md - name in use\n", 3092 disk->disk_name); 3093 return NULL; 3094 } 3095 3096 static void md_safemode_timeout(unsigned long data) 3097 { 3098 mddev_t *mddev = (mddev_t *) data; 3099 3100 mddev->safemode = 1; 3101 md_wakeup_thread(mddev->thread); 3102 } 3103 3104 static int start_dirty_degraded; 3105 3106 static int do_md_run(mddev_t * mddev) 3107 { 3108 int err; 3109 int chunk_size; 3110 struct list_head *tmp; 3111 mdk_rdev_t *rdev; 3112 struct gendisk *disk; 3113 struct mdk_personality *pers; 3114 char b[BDEVNAME_SIZE]; 3115 3116 if (list_empty(&mddev->disks)) 3117 /* cannot run an array with no devices.. 
*/ 3118 return -EINVAL; 3119 3120 if (mddev->pers) 3121 return -EBUSY; 3122 3123 /* 3124 * Analyze all RAID superblock(s) 3125 */ 3126 if (!mddev->raid_disks) 3127 analyze_sbs(mddev); 3128 3129 chunk_size = mddev->chunk_size; 3130 3131 if (chunk_size) { 3132 if (chunk_size > MAX_CHUNK_SIZE) { 3133 printk(KERN_ERR "too big chunk_size: %d > %d\n", 3134 chunk_size, MAX_CHUNK_SIZE); 3135 return -EINVAL; 3136 } 3137 /* 3138 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE 3139 */ 3140 if ( (1 << ffz(~chunk_size)) != chunk_size) { 3141 printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size); 3142 return -EINVAL; 3143 } 3144 if (chunk_size < PAGE_SIZE) { 3145 printk(KERN_ERR "too small chunk_size: %d < %ld\n", 3146 chunk_size, PAGE_SIZE); 3147 return -EINVAL; 3148 } 3149 3150 /* devices must have minimum size of one chunk */ 3151 ITERATE_RDEV(mddev,rdev,tmp) { 3152 if (test_bit(Faulty, &rdev->flags)) 3153 continue; 3154 if (rdev->size < chunk_size / 1024) { 3155 printk(KERN_WARNING 3156 "md: Dev %s smaller than chunk_size:" 3157 " %lluk < %dk\n", 3158 bdevname(rdev->bdev,b), 3159 (unsigned long long)rdev->size, 3160 chunk_size / 1024); 3161 return -EINVAL; 3162 } 3163 } 3164 } 3165 3166 #ifdef CONFIG_KMOD 3167 if (mddev->level != LEVEL_NONE) 3168 request_module("md-level-%d", mddev->level); 3169 else if (mddev->clevel[0]) 3170 request_module("md-%s", mddev->clevel); 3171 #endif 3172 3173 /* 3174 * Drop all container device buffers, from now on 3175 * the only valid external interface is through the md 3176 * device. 3177 * Also find largest hardsector size 3178 */ 3179 ITERATE_RDEV(mddev,rdev,tmp) { 3180 if (test_bit(Faulty, &rdev->flags)) 3181 continue; 3182 sync_blockdev(rdev->bdev); 3183 invalidate_bdev(rdev->bdev); 3184 } 3185 3186 md_probe(mddev->unit, NULL, NULL); 3187 disk = mddev->gendisk; 3188 if (!disk) 3189 return -ENOMEM; 3190 3191 spin_lock(&pers_lock); 3192 pers = find_pers(mddev->level, mddev->clevel); 3193 if (!pers || !try_module_get(pers->owner)) { 3194 spin_unlock(&pers_lock); 3195 if (mddev->level != LEVEL_NONE) 3196 printk(KERN_WARNING "md: personality for level %d is not loaded!\n", 3197 mddev->level); 3198 else 3199 printk(KERN_WARNING "md: personality for level %s is not loaded!\n", 3200 mddev->clevel); 3201 return -EINVAL; 3202 } 3203 mddev->pers = pers; 3204 spin_unlock(&pers_lock); 3205 mddev->level = pers->level; 3206 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 3207 3208 if (mddev->reshape_position != MaxSector && 3209 pers->start_reshape == NULL) { 3210 /* This personality cannot handle reshaping... */ 3211 mddev->pers = NULL; 3212 module_put(pers->owner); 3213 return -EINVAL; 3214 } 3215 3216 if (pers->sync_request) { 3217 /* Warn if this is a potentially silly 3218 * configuration. 
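 * Here 'silly' means two member devices that live on the same underlying
 * physical disk (the bd_contains comparison below), which would undermine
 * the redundancy a sync_request personality is meant to provide.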
3219 */ 3220 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 3221 mdk_rdev_t *rdev2; 3222 struct list_head *tmp2; 3223 int warned = 0; 3224 ITERATE_RDEV(mddev, rdev, tmp) { 3225 ITERATE_RDEV(mddev, rdev2, tmp2) { 3226 if (rdev < rdev2 && 3227 rdev->bdev->bd_contains == 3228 rdev2->bdev->bd_contains) { 3229 printk(KERN_WARNING 3230 "%s: WARNING: %s appears to be" 3231 " on the same physical disk as" 3232 " %s.\n", 3233 mdname(mddev), 3234 bdevname(rdev->bdev,b), 3235 bdevname(rdev2->bdev,b2)); 3236 warned = 1; 3237 } 3238 } 3239 } 3240 if (warned) 3241 printk(KERN_WARNING 3242 "True protection against single-disk" 3243 " failure might be compromised.\n"); 3244 } 3245 3246 mddev->recovery = 0; 3247 mddev->resync_max_sectors = mddev->size << 1; /* may be overridden by personality */ 3248 mddev->barriers_work = 1; 3249 mddev->ok_start_degraded = start_dirty_degraded; 3250 3251 if (start_readonly) 3252 mddev->ro = 2; /* read-only, but switch on first write */ 3253 3254 err = mddev->pers->run(mddev); 3255 if (!err && mddev->pers->sync_request) { 3256 err = bitmap_create(mddev); 3257 if (err) { 3258 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 3259 mdname(mddev), err); 3260 mddev->pers->stop(mddev); 3261 } 3262 } 3263 if (err) { 3264 printk(KERN_ERR "md: pers->run() failed ...\n"); 3265 module_put(mddev->pers->owner); 3266 mddev->pers = NULL; 3267 bitmap_destroy(mddev); 3268 return err; 3269 } 3270 if (mddev->pers->sync_request) { 3271 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 3272 printk(KERN_WARNING 3273 "md: cannot register extra attributes for %s\n", 3274 mdname(mddev)); 3275 } else if (mddev->ro == 2) /* auto-readonly not meaningful */ 3276 mddev->ro = 0; 3277 3278 atomic_set(&mddev->writes_pending,0); 3279 mddev->safemode = 0; 3280 mddev->safemode_timer.function = md_safemode_timeout; 3281 mddev->safemode_timer.data = (unsigned long) mddev; 3282 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 3283 mddev->in_sync = 1; 3284 3285 ITERATE_RDEV(mddev,rdev,tmp) 3286 if (rdev->raid_disk >= 0) { 3287 char nm[20]; 3288 sprintf(nm, "rd%d", rdev->raid_disk); 3289 if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) 3290 printk("md: cannot register %s for %s\n", 3291 nm, mdname(mddev)); 3292 } 3293 3294 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3295 3296 if (mddev->flags) 3297 md_update_sb(mddev, 0); 3298 3299 set_capacity(disk, mddev->array_size<<1); 3300 3301 /* If we call blk_queue_make_request here, it will 3302 * re-initialise max_sectors etc which may have been 3303 * refined inside -> run. So just set the bits we need to set. 3304 * Most initialisation happened when we called 3305 * blk_queue_make_request(..., md_fail_request) 3306 * earlier. 3307 */ 3308 mddev->queue->queuedata = mddev; 3309 mddev->queue->make_request_fn = mddev->pers->make_request; 3310 3311 /* If there is a partially-recovered drive we need to 3312 * start recovery here.
If we leave it to md_check_recovery, 3313 * it will remove the drives and not do the right thing 3314 */ 3315 if (mddev->degraded && !mddev->sync_thread) { 3316 struct list_head *rtmp; 3317 int spares = 0; 3318 ITERATE_RDEV(mddev,rdev,rtmp) 3319 if (rdev->raid_disk >= 0 && 3320 !test_bit(In_sync, &rdev->flags) && 3321 !test_bit(Faulty, &rdev->flags)) 3322 /* complete an interrupted recovery */ 3323 spares++; 3324 if (spares && mddev->pers->sync_request) { 3325 mddev->recovery = 0; 3326 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 3327 mddev->sync_thread = md_register_thread(md_do_sync, 3328 mddev, 3329 "%s_resync"); 3330 if (!mddev->sync_thread) { 3331 printk(KERN_ERR "%s: could not start resync" 3332 " thread...\n", 3333 mdname(mddev)); 3334 /* leave the spares where they are, it shouldn't hurt */ 3335 mddev->recovery = 0; 3336 } 3337 } 3338 } 3339 md_wakeup_thread(mddev->thread); 3340 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 3341 3342 mddev->changed = 1; 3343 md_new_event(mddev); 3344 kobject_uevent(&mddev->gendisk->kobj, KOBJ_CHANGE); 3345 return 0; 3346 } 3347 3348 static int restart_array(mddev_t *mddev) 3349 { 3350 struct gendisk *disk = mddev->gendisk; 3351 int err; 3352 3353 /* 3354 * Complain if it has no devices 3355 */ 3356 err = -ENXIO; 3357 if (list_empty(&mddev->disks)) 3358 goto out; 3359 3360 if (mddev->pers) { 3361 err = -EBUSY; 3362 if (!mddev->ro) 3363 goto out; 3364 3365 mddev->safemode = 0; 3366 mddev->ro = 0; 3367 set_disk_ro(disk, 0); 3368 3369 printk(KERN_INFO "md: %s switched to read-write mode.\n", 3370 mdname(mddev)); 3371 /* 3372 * Kick recovery or resync if necessary 3373 */ 3374 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3375 md_wakeup_thread(mddev->thread); 3376 md_wakeup_thread(mddev->sync_thread); 3377 err = 0; 3378 } else 3379 err = -EINVAL; 3380 3381 out: 3382 return err; 3383 } 3384 3385 /* similar to deny_write_access, but accounts for our holding a reference 3386 * to the file ourselves */ 3387 static int deny_bitmap_write_access(struct file * file) 3388 { 3389 struct inode *inode = file->f_mapping->host; 3390 3391 spin_lock(&inode->i_lock); 3392 if (atomic_read(&inode->i_writecount) > 1) { 3393 spin_unlock(&inode->i_lock); 3394 return -ETXTBSY; 3395 } 3396 atomic_set(&inode->i_writecount, -1); 3397 spin_unlock(&inode->i_lock); 3398 3399 return 0; 3400 } 3401 3402 static void restore_bitmap_write_access(struct file *file) 3403 { 3404 struct inode *inode = file->f_mapping->host; 3405 3406 spin_lock(&inode->i_lock); 3407 atomic_set(&inode->i_writecount, 1); 3408 spin_unlock(&inode->i_lock); 3409 } 3410 3411 /* mode: 3412 * 0 - completely stop and dis-assemble array 3413 * 1 - switch to readonly 3414 * 2 - stop but do not disassemble array 3415 */ 3416 static int do_md_stop(mddev_t * mddev, int mode) 3417 { 3418 int err = 0; 3419 struct gendisk *disk = mddev->gendisk; 3420 3421 if (mddev->pers) { 3422 if (atomic_read(&mddev->active)>2) { 3423 printk("md: %s still in use.\n",mdname(mddev)); 3424 return -EBUSY; 3425 } 3426 3427 if (mddev->sync_thread) { 3428 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3429 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3430 md_unregister_thread(mddev->sync_thread); 3431 mddev->sync_thread = NULL; 3432 } 3433 3434 del_timer_sync(&mddev->safemode_timer); 3435 3436 invalidate_partition(disk, 0); 3437 3438 switch(mode) { 3439 case 1: /* readonly */ 3440 err = -ENXIO; 3441 if (mddev->ro==1) 3442 goto out; 3443 mddev->ro = 1; 3444 break; 3445 case 0: /* disassemble */ 3446 case 2: /* stop */ 
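/* modes 0 (disassemble) and 2 (stop) fall through to the same teardown;
 * only mode 0 goes on to free the remaining resources further below. */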
3447 bitmap_flush(mddev); 3448 md_super_wait(mddev); 3449 if (mddev->ro) 3450 set_disk_ro(disk, 0); 3451 blk_queue_make_request(mddev->queue, md_fail_request); 3452 mddev->pers->stop(mddev); 3453 mddev->queue->merge_bvec_fn = NULL; 3454 mddev->queue->unplug_fn = NULL; 3455 mddev->queue->issue_flush_fn = NULL; 3456 mddev->queue->backing_dev_info.congested_fn = NULL; 3457 if (mddev->pers->sync_request) 3458 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 3459 3460 module_put(mddev->pers->owner); 3461 mddev->pers = NULL; 3462 3463 set_capacity(disk, 0); 3464 mddev->changed = 1; 3465 3466 if (mddev->ro) 3467 mddev->ro = 0; 3468 } 3469 if (!mddev->in_sync || mddev->flags) { 3470 /* mark array as shutdown cleanly */ 3471 mddev->in_sync = 1; 3472 md_update_sb(mddev, 1); 3473 } 3474 if (mode == 1) 3475 set_disk_ro(disk, 1); 3476 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3477 } 3478 3479 /* 3480 * Free resources if final stop 3481 */ 3482 if (mode == 0) { 3483 mdk_rdev_t *rdev; 3484 struct list_head *tmp; 3485 3486 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 3487 3488 bitmap_destroy(mddev); 3489 if (mddev->bitmap_file) { 3490 restore_bitmap_write_access(mddev->bitmap_file); 3491 fput(mddev->bitmap_file); 3492 mddev->bitmap_file = NULL; 3493 } 3494 mddev->bitmap_offset = 0; 3495 3496 ITERATE_RDEV(mddev,rdev,tmp) 3497 if (rdev->raid_disk >= 0) { 3498 char nm[20]; 3499 sprintf(nm, "rd%d", rdev->raid_disk); 3500 sysfs_remove_link(&mddev->kobj, nm); 3501 } 3502 3503 /* make sure all delayed_delete calls have finished */ 3504 flush_scheduled_work(); 3505 3506 export_array(mddev); 3507 3508 mddev->array_size = 0; 3509 mddev->size = 0; 3510 mddev->raid_disks = 0; 3511 mddev->recovery_cp = 0; 3512 mddev->reshape_position = MaxSector; 3513 3514 } else if (mddev->pers) 3515 printk(KERN_INFO "md: %s switched to read-only mode.\n", 3516 mdname(mddev)); 3517 err = 0; 3518 md_new_event(mddev); 3519 out: 3520 return err; 3521 } 3522 3523 #ifndef MODULE 3524 static void autorun_array(mddev_t *mddev) 3525 { 3526 mdk_rdev_t *rdev; 3527 struct list_head *tmp; 3528 int err; 3529 3530 if (list_empty(&mddev->disks)) 3531 return; 3532 3533 printk(KERN_INFO "md: running: "); 3534 3535 ITERATE_RDEV(mddev,rdev,tmp) { 3536 char b[BDEVNAME_SIZE]; 3537 printk("<%s>", bdevname(rdev->bdev,b)); 3538 } 3539 printk("\n"); 3540 3541 err = do_md_run (mddev); 3542 if (err) { 3543 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 3544 do_md_stop (mddev, 0); 3545 } 3546 } 3547 3548 /* 3549 * lets try to run arrays based on all disks that have arrived 3550 * until now. (those are in pending_raid_disks) 3551 * 3552 * the method: pick the first pending disk, collect all disks with 3553 * the same UUID, remove all from the pending list and put them into 3554 * the 'same_array' list. Then order this list based on superblock 3555 * update time (freshest comes first), kick out 'old' disks and 3556 * compare superblocks. If everything's fine then run it. 
3557 * 3558 * If "unit" is allocated, then bump its reference count 3559 */ 3560 static void autorun_devices(int part) 3561 { 3562 struct list_head *tmp; 3563 mdk_rdev_t *rdev0, *rdev; 3564 mddev_t *mddev; 3565 char b[BDEVNAME_SIZE]; 3566 3567 printk(KERN_INFO "md: autorun ...\n"); 3568 while (!list_empty(&pending_raid_disks)) { 3569 int unit; 3570 dev_t dev; 3571 LIST_HEAD(candidates); 3572 rdev0 = list_entry(pending_raid_disks.next, 3573 mdk_rdev_t, same_set); 3574 3575 printk(KERN_INFO "md: considering %s ...\n", 3576 bdevname(rdev0->bdev,b)); 3577 INIT_LIST_HEAD(&candidates); 3578 ITERATE_RDEV_PENDING(rdev,tmp) 3579 if (super_90_load(rdev, rdev0, 0) >= 0) { 3580 printk(KERN_INFO "md: adding %s ...\n", 3581 bdevname(rdev->bdev,b)); 3582 list_move(&rdev->same_set, &candidates); 3583 } 3584 /* 3585 * now we have a set of devices, with all of them having 3586 * mostly sane superblocks. It's time to allocate the 3587 * mddev. 3588 */ 3589 if (part) { 3590 dev = MKDEV(mdp_major, 3591 rdev0->preferred_minor << MdpMinorShift); 3592 unit = MINOR(dev) >> MdpMinorShift; 3593 } else { 3594 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 3595 unit = MINOR(dev); 3596 } 3597 if (rdev0->preferred_minor != unit) { 3598 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 3599 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 3600 break; 3601 } 3602 3603 md_probe(dev, NULL, NULL); 3604 mddev = mddev_find(dev); 3605 if (!mddev) { 3606 printk(KERN_ERR 3607 "md: cannot allocate memory for md drive.\n"); 3608 break; 3609 } 3610 if (mddev_lock(mddev)) 3611 printk(KERN_WARNING "md: %s locked, cannot run\n", 3612 mdname(mddev)); 3613 else if (mddev->raid_disks || mddev->major_version 3614 || !list_empty(&mddev->disks)) { 3615 printk(KERN_WARNING 3616 "md: %s already running, cannot run %s\n", 3617 mdname(mddev), bdevname(rdev0->bdev,b)); 3618 mddev_unlock(mddev); 3619 } else { 3620 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 3621 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { 3622 list_del_init(&rdev->same_set); 3623 if (bind_rdev_to_array(rdev, mddev)) 3624 export_rdev(rdev); 3625 } 3626 autorun_array(mddev); 3627 mddev_unlock(mddev); 3628 } 3629 /* on success, candidates will be empty, on error 3630 * it won't... 3631 */ 3632 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) 3633 export_rdev(rdev); 3634 mddev_put(mddev); 3635 } 3636 printk(KERN_INFO "md: ... 
autorun DONE.\n"); 3637 } 3638 #endif /* !MODULE */ 3639 3640 static int get_version(void __user * arg) 3641 { 3642 mdu_version_t ver; 3643 3644 ver.major = MD_MAJOR_VERSION; 3645 ver.minor = MD_MINOR_VERSION; 3646 ver.patchlevel = MD_PATCHLEVEL_VERSION; 3647 3648 if (copy_to_user(arg, &ver, sizeof(ver))) 3649 return -EFAULT; 3650 3651 return 0; 3652 } 3653 3654 static int get_array_info(mddev_t * mddev, void __user * arg) 3655 { 3656 mdu_array_info_t info; 3657 int nr,working,active,failed,spare; 3658 mdk_rdev_t *rdev; 3659 struct list_head *tmp; 3660 3661 nr=working=active=failed=spare=0; 3662 ITERATE_RDEV(mddev,rdev,tmp) { 3663 nr++; 3664 if (test_bit(Faulty, &rdev->flags)) 3665 failed++; 3666 else { 3667 working++; 3668 if (test_bit(In_sync, &rdev->flags)) 3669 active++; 3670 else 3671 spare++; 3672 } 3673 } 3674 3675 info.major_version = mddev->major_version; 3676 info.minor_version = mddev->minor_version; 3677 info.patch_version = MD_PATCHLEVEL_VERSION; 3678 info.ctime = mddev->ctime; 3679 info.level = mddev->level; 3680 info.size = mddev->size; 3681 if (info.size != mddev->size) /* overflow */ 3682 info.size = -1; 3683 info.nr_disks = nr; 3684 info.raid_disks = mddev->raid_disks; 3685 info.md_minor = mddev->md_minor; 3686 info.not_persistent= !mddev->persistent; 3687 3688 info.utime = mddev->utime; 3689 info.state = 0; 3690 if (mddev->in_sync) 3691 info.state = (1<<MD_SB_CLEAN); 3692 if (mddev->bitmap && mddev->bitmap_offset) 3693 info.state = (1<<MD_SB_BITMAP_PRESENT); 3694 info.active_disks = active; 3695 info.working_disks = working; 3696 info.failed_disks = failed; 3697 info.spare_disks = spare; 3698 3699 info.layout = mddev->layout; 3700 info.chunk_size = mddev->chunk_size; 3701 3702 if (copy_to_user(arg, &info, sizeof(info))) 3703 return -EFAULT; 3704 3705 return 0; 3706 } 3707 3708 static int get_bitmap_file(mddev_t * mddev, void __user * arg) 3709 { 3710 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 3711 char *ptr, *buf = NULL; 3712 int err = -ENOMEM; 3713 3714 md_allow_write(mddev); 3715 3716 file = kmalloc(sizeof(*file), GFP_KERNEL); 3717 if (!file) 3718 goto out; 3719 3720 /* bitmap disabled, zero the first byte and copy out */ 3721 if (!mddev->bitmap || !mddev->bitmap->file) { 3722 file->pathname[0] = '\0'; 3723 goto copy_out; 3724 } 3725 3726 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); 3727 if (!buf) 3728 goto out; 3729 3730 ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname)); 3731 if (!ptr) 3732 goto out; 3733 3734 strcpy(file->pathname, ptr); 3735 3736 copy_out: 3737 err = 0; 3738 if (copy_to_user(arg, file, sizeof(*file))) 3739 err = -EFAULT; 3740 out: 3741 kfree(buf); 3742 kfree(file); 3743 return err; 3744 } 3745 3746 static int get_disk_info(mddev_t * mddev, void __user * arg) 3747 { 3748 mdu_disk_info_t info; 3749 unsigned int nr; 3750 mdk_rdev_t *rdev; 3751 3752 if (copy_from_user(&info, arg, sizeof(info))) 3753 return -EFAULT; 3754 3755 nr = info.number; 3756 3757 rdev = find_rdev_nr(mddev, nr); 3758 if (rdev) { 3759 info.major = MAJOR(rdev->bdev->bd_dev); 3760 info.minor = MINOR(rdev->bdev->bd_dev); 3761 info.raid_disk = rdev->raid_disk; 3762 info.state = 0; 3763 if (test_bit(Faulty, &rdev->flags)) 3764 info.state |= (1<<MD_DISK_FAULTY); 3765 else if (test_bit(In_sync, &rdev->flags)) { 3766 info.state |= (1<<MD_DISK_ACTIVE); 3767 info.state |= (1<<MD_DISK_SYNC); 3768 } 3769 if (test_bit(WriteMostly, &rdev->flags)) 3770 info.state |= (1<<MD_DISK_WRITEMOSTLY); 3771 } else { 3772 info.major = info.minor = 0; 3773 
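/* no rdev with that number is bound to this array: report the slot as removed */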
info.raid_disk = -1; 3774 info.state = (1<<MD_DISK_REMOVED); 3775 } 3776 3777 if (copy_to_user(arg, &info, sizeof(info))) 3778 return -EFAULT; 3779 3780 return 0; 3781 } 3782 3783 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) 3784 { 3785 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 3786 mdk_rdev_t *rdev; 3787 dev_t dev = MKDEV(info->major,info->minor); 3788 3789 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 3790 return -EOVERFLOW; 3791 3792 if (!mddev->raid_disks) { 3793 int err; 3794 /* expecting a device which has a superblock */ 3795 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 3796 if (IS_ERR(rdev)) { 3797 printk(KERN_WARNING 3798 "md: md_import_device returned %ld\n", 3799 PTR_ERR(rdev)); 3800 return PTR_ERR(rdev); 3801 } 3802 if (!list_empty(&mddev->disks)) { 3803 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 3804 mdk_rdev_t, same_set); 3805 int err = super_types[mddev->major_version] 3806 .load_super(rdev, rdev0, mddev->minor_version); 3807 if (err < 0) { 3808 printk(KERN_WARNING 3809 "md: %s has different UUID to %s\n", 3810 bdevname(rdev->bdev,b), 3811 bdevname(rdev0->bdev,b2)); 3812 export_rdev(rdev); 3813 return -EINVAL; 3814 } 3815 } 3816 err = bind_rdev_to_array(rdev, mddev); 3817 if (err) 3818 export_rdev(rdev); 3819 return err; 3820 } 3821 3822 /* 3823 * add_new_disk can be used once the array is assembled 3824 * to add "hot spares". They must already have a superblock 3825 * written 3826 */ 3827 if (mddev->pers) { 3828 int err; 3829 if (!mddev->pers->hot_add_disk) { 3830 printk(KERN_WARNING 3831 "%s: personality does not support diskops!\n", 3832 mdname(mddev)); 3833 return -EINVAL; 3834 } 3835 if (mddev->persistent) 3836 rdev = md_import_device(dev, mddev->major_version, 3837 mddev->minor_version); 3838 else 3839 rdev = md_import_device(dev, -1, -1); 3840 if (IS_ERR(rdev)) { 3841 printk(KERN_WARNING 3842 "md: md_import_device returned %ld\n", 3843 PTR_ERR(rdev)); 3844 return PTR_ERR(rdev); 3845 } 3846 /* set saved_raid_disk if appropriate */ 3847 if (!mddev->persistent) { 3848 if (info->state & (1<<MD_DISK_SYNC) && 3849 info->raid_disk < mddev->raid_disks) 3850 rdev->raid_disk = info->raid_disk; 3851 else 3852 rdev->raid_disk = -1; 3853 } else 3854 super_types[mddev->major_version]. 3855 validate_super(mddev, rdev); 3856 rdev->saved_raid_disk = rdev->raid_disk; 3857 3858 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 3859 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 3860 set_bit(WriteMostly, &rdev->flags); 3861 3862 rdev->raid_disk = -1; 3863 err = bind_rdev_to_array(rdev, mddev); 3864 if (!err && !mddev->pers->hot_remove_disk) { 3865 /* If there is hot_add_disk but no hot_remove_disk 3866 * then added disks are for geometry changes, 3867 * and should be added immediately. 3868 */ 3869 super_types[mddev->major_version].
3870 validate_super(mddev, rdev); 3871 err = mddev->pers->hot_add_disk(mddev, rdev); 3872 if (err) 3873 unbind_rdev_from_array(rdev); 3874 } 3875 if (err) 3876 export_rdev(rdev); 3877 3878 md_update_sb(mddev, 1); 3879 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3880 md_wakeup_thread(mddev->thread); 3881 return err; 3882 } 3883 3884 /* otherwise, add_new_disk is only allowed 3885 * for major_version==0 superblocks 3886 */ 3887 if (mddev->major_version != 0) { 3888 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", 3889 mdname(mddev)); 3890 return -EINVAL; 3891 } 3892 3893 if (!(info->state & (1<<MD_DISK_FAULTY))) { 3894 int err; 3895 rdev = md_import_device (dev, -1, 0); 3896 if (IS_ERR(rdev)) { 3897 printk(KERN_WARNING 3898 "md: error, md_import_device() returned %ld\n", 3899 PTR_ERR(rdev)); 3900 return PTR_ERR(rdev); 3901 } 3902 rdev->desc_nr = info->number; 3903 if (info->raid_disk < mddev->raid_disks) 3904 rdev->raid_disk = info->raid_disk; 3905 else 3906 rdev->raid_disk = -1; 3907 3908 rdev->flags = 0; 3909 3910 if (rdev->raid_disk < mddev->raid_disks) 3911 if (info->state & (1<<MD_DISK_SYNC)) 3912 set_bit(In_sync, &rdev->flags); 3913 3914 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 3915 set_bit(WriteMostly, &rdev->flags); 3916 3917 if (!mddev->persistent) { 3918 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 3919 rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 3920 } else 3921 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 3922 rdev->size = calc_dev_size(rdev, mddev->chunk_size); 3923 3924 err = bind_rdev_to_array(rdev, mddev); 3925 if (err) { 3926 export_rdev(rdev); 3927 return err; 3928 } 3929 } 3930 3931 return 0; 3932 } 3933 3934 static int hot_remove_disk(mddev_t * mddev, dev_t dev) 3935 { 3936 char b[BDEVNAME_SIZE]; 3937 mdk_rdev_t *rdev; 3938 3939 if (!mddev->pers) 3940 return -ENODEV; 3941 3942 rdev = find_rdev(mddev, dev); 3943 if (!rdev) 3944 return -ENXIO; 3945 3946 if (rdev->raid_disk >= 0) 3947 goto busy; 3948 3949 kick_rdev_from_array(rdev); 3950 md_update_sb(mddev, 1); 3951 md_new_event(mddev); 3952 3953 return 0; 3954 busy: 3955 printk(KERN_WARNING "md: cannot remove active disk %s from %s ... 
\n", 3956 bdevname(rdev->bdev,b), mdname(mddev)); 3957 return -EBUSY; 3958 } 3959 3960 static int hot_add_disk(mddev_t * mddev, dev_t dev) 3961 { 3962 char b[BDEVNAME_SIZE]; 3963 int err; 3964 unsigned int size; 3965 mdk_rdev_t *rdev; 3966 3967 if (!mddev->pers) 3968 return -ENODEV; 3969 3970 if (mddev->major_version != 0) { 3971 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 3972 " version-0 superblocks.\n", 3973 mdname(mddev)); 3974 return -EINVAL; 3975 } 3976 if (!mddev->pers->hot_add_disk) { 3977 printk(KERN_WARNING 3978 "%s: personality does not support diskops!\n", 3979 mdname(mddev)); 3980 return -EINVAL; 3981 } 3982 3983 rdev = md_import_device (dev, -1, 0); 3984 if (IS_ERR(rdev)) { 3985 printk(KERN_WARNING 3986 "md: error, md_import_device() returned %ld\n", 3987 PTR_ERR(rdev)); 3988 return -EINVAL; 3989 } 3990 3991 if (mddev->persistent) 3992 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 3993 else 3994 rdev->sb_offset = 3995 rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 3996 3997 size = calc_dev_size(rdev, mddev->chunk_size); 3998 rdev->size = size; 3999 4000 if (test_bit(Faulty, &rdev->flags)) { 4001 printk(KERN_WARNING 4002 "md: can not hot-add faulty %s disk to %s!\n", 4003 bdevname(rdev->bdev,b), mdname(mddev)); 4004 err = -EINVAL; 4005 goto abort_export; 4006 } 4007 clear_bit(In_sync, &rdev->flags); 4008 rdev->desc_nr = -1; 4009 rdev->saved_raid_disk = -1; 4010 err = bind_rdev_to_array(rdev, mddev); 4011 if (err) 4012 goto abort_export; 4013 4014 /* 4015 * The rest should better be atomic, we can have disk failures 4016 * noticed in interrupt contexts ... 4017 */ 4018 4019 if (rdev->desc_nr == mddev->max_disks) { 4020 printk(KERN_WARNING "%s: can not hot-add to full array!\n", 4021 mdname(mddev)); 4022 err = -EBUSY; 4023 goto abort_unbind_export; 4024 } 4025 4026 rdev->raid_disk = -1; 4027 4028 md_update_sb(mddev, 1); 4029 4030 /* 4031 * Kick recovery, maybe this spare has to be added to the 4032 * array immediately. 4033 */ 4034 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4035 md_wakeup_thread(mddev->thread); 4036 md_new_event(mddev); 4037 return 0; 4038 4039 abort_unbind_export: 4040 unbind_rdev_from_array(rdev); 4041 4042 abort_export: 4043 export_rdev(rdev); 4044 return err; 4045 } 4046 4047 static int set_bitmap_file(mddev_t *mddev, int fd) 4048 { 4049 int err; 4050 4051 if (mddev->pers) { 4052 if (!mddev->pers->quiesce) 4053 return -EBUSY; 4054 if (mddev->recovery || mddev->sync_thread) 4055 return -EBUSY; 4056 /* we should be able to change the bitmap.. 
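 * (the personality is quiesced below around bitmap_create()/bitmap_destroy(),
 * so nothing beyond ->quiesce support is required of it)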
*/ 4057 } 4058 4059 4060 if (fd >= 0) { 4061 if (mddev->bitmap) 4062 return -EEXIST; /* cannot add when bitmap is present */ 4063 mddev->bitmap_file = fget(fd); 4064 4065 if (mddev->bitmap_file == NULL) { 4066 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 4067 mdname(mddev)); 4068 return -EBADF; 4069 } 4070 4071 err = deny_bitmap_write_access(mddev->bitmap_file); 4072 if (err) { 4073 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 4074 mdname(mddev)); 4075 fput(mddev->bitmap_file); 4076 mddev->bitmap_file = NULL; 4077 return err; 4078 } 4079 mddev->bitmap_offset = 0; /* file overrides offset */ 4080 } else if (mddev->bitmap == NULL) 4081 return -ENOENT; /* cannot remove what isn't there */ 4082 err = 0; 4083 if (mddev->pers) { 4084 mddev->pers->quiesce(mddev, 1); 4085 if (fd >= 0) 4086 err = bitmap_create(mddev); 4087 if (fd < 0 || err) { 4088 bitmap_destroy(mddev); 4089 fd = -1; /* make sure to put the file */ 4090 } 4091 mddev->pers->quiesce(mddev, 0); 4092 } 4093 if (fd < 0) { 4094 if (mddev->bitmap_file) { 4095 restore_bitmap_write_access(mddev->bitmap_file); 4096 fput(mddev->bitmap_file); 4097 } 4098 mddev->bitmap_file = NULL; 4099 } 4100 4101 return err; 4102 } 4103 4104 /* 4105 * set_array_info is used in two different ways. 4106 * The original usage is when creating a new array. 4107 * In this usage, raid_disks is > 0 and it, together with 4108 * level, size, not_persistent, layout and chunksize, determines the 4109 * shape of the array. 4110 * This will always create an array with a type-0.90.0 superblock. 4111 * The newer usage is when assembling an array. 4112 * In this case raid_disks will be 0, and the major_version field is 4113 * used to determine which style super-blocks are to be found on the devices. 4114 * The minor and patch _version numbers are also kept in case the 4115 * super_block handler wishes to interpret them. 4116 */ 4117 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) 4118 { 4119 4120 if (info->raid_disks == 0) { 4121 /* just setting version number for superblock loading */ 4122 if (info->major_version < 0 || 4123 info->major_version >= ARRAY_SIZE(super_types) || 4124 super_types[info->major_version].name == NULL) { 4125 /* maybe try to auto-load a module? */ 4126 printk(KERN_INFO 4127 "md: superblock version %d not known\n", 4128 info->major_version); 4129 return -EINVAL; 4130 } 4131 mddev->major_version = info->major_version; 4132 mddev->minor_version = info->minor_version; 4133 mddev->patch_version = info->patch_version; 4134 mddev->persistent = !info->not_persistent; 4135 return 0; 4136 } 4137 mddev->major_version = MD_MAJOR_VERSION; 4138 mddev->minor_version = MD_MINOR_VERSION; 4139 mddev->patch_version = MD_PATCHLEVEL_VERSION; 4140 mddev->ctime = get_seconds(); 4141 4142 mddev->level = info->level; 4143 mddev->clevel[0] = 0; 4144 mddev->size = info->size; 4145 mddev->raid_disks = info->raid_disks; 4146 /* don't set md_minor, it is determined by which /dev/md* was 4147 * opened 4148 */ 4149 if (info->state & (1<<MD_SB_CLEAN)) 4150 mddev->recovery_cp = MaxSector; 4151 else 4152 mddev->recovery_cp = 0; 4153 mddev->persistent = !
info->not_persistent; 4154 4155 mddev->layout = info->layout; 4156 mddev->chunk_size = info->chunk_size; 4157 4158 mddev->max_disks = MD_SB_DISKS; 4159 4160 mddev->flags = 0; 4161 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4162 4163 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 4164 mddev->bitmap_offset = 0; 4165 4166 mddev->reshape_position = MaxSector; 4167 4168 /* 4169 * Generate a 128 bit UUID 4170 */ 4171 get_random_bytes(mddev->uuid, 16); 4172 4173 mddev->new_level = mddev->level; 4174 mddev->new_chunk = mddev->chunk_size; 4175 mddev->new_layout = mddev->layout; 4176 mddev->delta_disks = 0; 4177 4178 return 0; 4179 } 4180 4181 static int update_size(mddev_t *mddev, unsigned long size) 4182 { 4183 mdk_rdev_t * rdev; 4184 int rv; 4185 struct list_head *tmp; 4186 int fit = (size == 0); 4187 4188 if (mddev->pers->resize == NULL) 4189 return -EINVAL; 4190 /* The "size" is the amount of each device that is used. 4191 * This can only make sense for arrays with redundancy. 4192 * linear and raid0 always use whatever space is available 4193 * We can only consider changing the size if no resync 4194 * or reconstruction is happening, and if the new size 4195 * is acceptable. It must fit before the sb_offset or, 4196 * if that is <data_offset, it must fit before the 4197 * size of each device. 4198 * If size is zero, we find the largest size that fits. 4199 */ 4200 if (mddev->sync_thread) 4201 return -EBUSY; 4202 ITERATE_RDEV(mddev,rdev,tmp) { 4203 sector_t avail; 4204 avail = rdev->size * 2; 4205 4206 if (fit && (size == 0 || size > avail/2)) 4207 size = avail/2; 4208 if (avail < ((sector_t)size << 1)) 4209 return -ENOSPC; 4210 } 4211 rv = mddev->pers->resize(mddev, (sector_t)size *2); 4212 if (!rv) { 4213 struct block_device *bdev; 4214 4215 bdev = bdget_disk(mddev->gendisk, 0); 4216 if (bdev) { 4217 mutex_lock(&bdev->bd_inode->i_mutex); 4218 i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10); 4219 mutex_unlock(&bdev->bd_inode->i_mutex); 4220 bdput(bdev); 4221 } 4222 } 4223 return rv; 4224 } 4225 4226 static int update_raid_disks(mddev_t *mddev, int raid_disks) 4227 { 4228 int rv; 4229 /* change the number of raid disks */ 4230 if (mddev->pers->check_reshape == NULL) 4231 return -EINVAL; 4232 if (raid_disks <= 0 || 4233 raid_disks >= mddev->max_disks) 4234 return -EINVAL; 4235 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 4236 return -EBUSY; 4237 mddev->delta_disks = raid_disks - mddev->raid_disks; 4238 4239 rv = mddev->pers->check_reshape(mddev); 4240 return rv; 4241 } 4242 4243 4244 /* 4245 * update_array_info is used to change the configuration of an 4246 * on-line array. 4247 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 4248 * fields in the info are checked against the array. 4249 * Any differences that cannot be handled will cause an error. 4250 * Normally, only one change can be managed at a time. 
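 *
 * Illustrative userspace sketch (an assumption added for documentation,
 * not something this file defines; 'fd' is an open descriptor on the md
 * device): a management tool would typically read the current shape first
 * and then submit exactly one change, for example asking for an internal
 * bitmap:
 *
 *	mdu_array_info_t info;
 *	ioctl(fd, GET_ARRAY_INFO, &info);
 *	info.state |= (1 << MD_SB_BITMAP_PRESENT);
 *	ioctl(fd, SET_ARRAY_INFO, &info);
 *
 * The remaining fields are checked against the running array, and at most
 * one of size, raid_disks, layout or the bitmap state bit may differ per
 * call.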
4251 */ 4252 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) 4253 { 4254 int rv = 0; 4255 int cnt = 0; 4256 int state = 0; 4257 4258 /* calculate expected state,ignoring low bits */ 4259 if (mddev->bitmap && mddev->bitmap_offset) 4260 state |= (1 << MD_SB_BITMAP_PRESENT); 4261 4262 if (mddev->major_version != info->major_version || 4263 mddev->minor_version != info->minor_version || 4264 /* mddev->patch_version != info->patch_version || */ 4265 mddev->ctime != info->ctime || 4266 mddev->level != info->level || 4267 /* mddev->layout != info->layout || */ 4268 !mddev->persistent != info->not_persistent|| 4269 mddev->chunk_size != info->chunk_size || 4270 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 4271 ((state^info->state) & 0xfffffe00) 4272 ) 4273 return -EINVAL; 4274 /* Check there is only one change */ 4275 if (info->size >= 0 && mddev->size != info->size) cnt++; 4276 if (mddev->raid_disks != info->raid_disks) cnt++; 4277 if (mddev->layout != info->layout) cnt++; 4278 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++; 4279 if (cnt == 0) return 0; 4280 if (cnt > 1) return -EINVAL; 4281 4282 if (mddev->layout != info->layout) { 4283 /* Change layout 4284 * we don't need to do anything at the md level, the 4285 * personality will take care of it all. 4286 */ 4287 if (mddev->pers->reconfig == NULL) 4288 return -EINVAL; 4289 else 4290 return mddev->pers->reconfig(mddev, info->layout, -1); 4291 } 4292 if (info->size >= 0 && mddev->size != info->size) 4293 rv = update_size(mddev, info->size); 4294 4295 if (mddev->raid_disks != info->raid_disks) 4296 rv = update_raid_disks(mddev, info->raid_disks); 4297 4298 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 4299 if (mddev->pers->quiesce == NULL) 4300 return -EINVAL; 4301 if (mddev->recovery || mddev->sync_thread) 4302 return -EBUSY; 4303 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 4304 /* add the bitmap */ 4305 if (mddev->bitmap) 4306 return -EEXIST; 4307 if (mddev->default_bitmap_offset == 0) 4308 return -EINVAL; 4309 mddev->bitmap_offset = mddev->default_bitmap_offset; 4310 mddev->pers->quiesce(mddev, 1); 4311 rv = bitmap_create(mddev); 4312 if (rv) 4313 bitmap_destroy(mddev); 4314 mddev->pers->quiesce(mddev, 0); 4315 } else { 4316 /* remove the bitmap */ 4317 if (!mddev->bitmap) 4318 return -ENOENT; 4319 if (mddev->bitmap->file) 4320 return -EINVAL; 4321 mddev->pers->quiesce(mddev, 1); 4322 bitmap_destroy(mddev); 4323 mddev->pers->quiesce(mddev, 0); 4324 mddev->bitmap_offset = 0; 4325 } 4326 } 4327 md_update_sb(mddev, 1); 4328 return rv; 4329 } 4330 4331 static int set_disk_faulty(mddev_t *mddev, dev_t dev) 4332 { 4333 mdk_rdev_t *rdev; 4334 4335 if (mddev->pers == NULL) 4336 return -ENODEV; 4337 4338 rdev = find_rdev(mddev, dev); 4339 if (!rdev) 4340 return -ENODEV; 4341 4342 md_error(mddev, rdev); 4343 return 0; 4344 } 4345 4346 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 4347 { 4348 mddev_t *mddev = bdev->bd_disk->private_data; 4349 4350 geo->heads = 2; 4351 geo->sectors = 4; 4352 geo->cylinders = get_capacity(mddev->gendisk) / 8; 4353 return 0; 4354 } 4355 4356 static int md_ioctl(struct inode *inode, struct file *file, 4357 unsigned int cmd, unsigned long arg) 4358 { 4359 int err = 0; 4360 void __user *argp = (void __user *)arg; 4361 mddev_t *mddev = NULL; 4362 4363 if (!capable(CAP_SYS_ADMIN)) 4364 return -EACCES; 4365 4366 /* 4367 * Commands dealing with the RAID driver but not any 4368 * particular array: 4369 */ 4370 switch (cmd) 4371 
{ 4372 case RAID_VERSION: 4373 err = get_version(argp); 4374 goto done; 4375 4376 case PRINT_RAID_DEBUG: 4377 err = 0; 4378 md_print_devices(); 4379 goto done; 4380 4381 #ifndef MODULE 4382 case RAID_AUTORUN: 4383 err = 0; 4384 autostart_arrays(arg); 4385 goto done; 4386 #endif 4387 default:; 4388 } 4389 4390 /* 4391 * Commands creating/starting a new array: 4392 */ 4393 4394 mddev = inode->i_bdev->bd_disk->private_data; 4395 4396 if (!mddev) { 4397 BUG(); 4398 goto abort; 4399 } 4400 4401 err = mddev_lock(mddev); 4402 if (err) { 4403 printk(KERN_INFO 4404 "md: ioctl lock interrupted, reason %d, cmd %d\n", 4405 err, cmd); 4406 goto abort; 4407 } 4408 4409 switch (cmd) 4410 { 4411 case SET_ARRAY_INFO: 4412 { 4413 mdu_array_info_t info; 4414 if (!arg) 4415 memset(&info, 0, sizeof(info)); 4416 else if (copy_from_user(&info, argp, sizeof(info))) { 4417 err = -EFAULT; 4418 goto abort_unlock; 4419 } 4420 if (mddev->pers) { 4421 err = update_array_info(mddev, &info); 4422 if (err) { 4423 printk(KERN_WARNING "md: couldn't update" 4424 " array info. %d\n", err); 4425 goto abort_unlock; 4426 } 4427 goto done_unlock; 4428 } 4429 if (!list_empty(&mddev->disks)) { 4430 printk(KERN_WARNING 4431 "md: array %s already has disks!\n", 4432 mdname(mddev)); 4433 err = -EBUSY; 4434 goto abort_unlock; 4435 } 4436 if (mddev->raid_disks) { 4437 printk(KERN_WARNING 4438 "md: array %s already initialised!\n", 4439 mdname(mddev)); 4440 err = -EBUSY; 4441 goto abort_unlock; 4442 } 4443 err = set_array_info(mddev, &info); 4444 if (err) { 4445 printk(KERN_WARNING "md: couldn't set" 4446 " array info. %d\n", err); 4447 goto abort_unlock; 4448 } 4449 } 4450 goto done_unlock; 4451 4452 default:; 4453 } 4454 4455 /* 4456 * Commands querying/configuring an existing array: 4457 */ 4458 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 4459 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 4460 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 4461 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 4462 && cmd != GET_BITMAP_FILE) { 4463 err = -ENODEV; 4464 goto abort_unlock; 4465 } 4466 4467 /* 4468 * Commands even a read-only array can execute: 4469 */ 4470 switch (cmd) 4471 { 4472 case GET_ARRAY_INFO: 4473 err = get_array_info(mddev, argp); 4474 goto done_unlock; 4475 4476 case GET_BITMAP_FILE: 4477 err = get_bitmap_file(mddev, argp); 4478 goto done_unlock; 4479 4480 case GET_DISK_INFO: 4481 err = get_disk_info(mddev, argp); 4482 goto done_unlock; 4483 4484 case RESTART_ARRAY_RW: 4485 err = restart_array(mddev); 4486 goto done_unlock; 4487 4488 case STOP_ARRAY: 4489 err = do_md_stop (mddev, 0); 4490 goto done_unlock; 4491 4492 case STOP_ARRAY_RO: 4493 err = do_md_stop (mddev, 1); 4494 goto done_unlock; 4495 4496 /* 4497 * We have a problem here : there is no easy way to give a CHS 4498 * virtual geometry. We currently pretend that we have a 2 heads 4499 * 4 sectors (with a BIG number of cylinders...). This drives 4500 * dosfs just mad... ;-) 4501 */ 4502 } 4503 4504 /* 4505 * The remaining ioctls are changing the state of the 4506 * superblock, so we do not allow them on read-only arrays. 4507 * However non-MD ioctls (e.g. get-size) will still come through 4508 * here and hit the 'default' below, so only disallow 4509 * 'md' ioctls, and switch to rw mode if started auto-readonly. 
4510 */ 4511 if (_IOC_TYPE(cmd) == MD_MAJOR && 4512 mddev->ro && mddev->pers) { 4513 if (mddev->ro == 2) { 4514 mddev->ro = 0; 4515 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4516 md_wakeup_thread(mddev->thread); 4517 4518 } else { 4519 err = -EROFS; 4520 goto abort_unlock; 4521 } 4522 } 4523 4524 switch (cmd) 4525 { 4526 case ADD_NEW_DISK: 4527 { 4528 mdu_disk_info_t info; 4529 if (copy_from_user(&info, argp, sizeof(info))) 4530 err = -EFAULT; 4531 else 4532 err = add_new_disk(mddev, &info); 4533 goto done_unlock; 4534 } 4535 4536 case HOT_REMOVE_DISK: 4537 err = hot_remove_disk(mddev, new_decode_dev(arg)); 4538 goto done_unlock; 4539 4540 case HOT_ADD_DISK: 4541 err = hot_add_disk(mddev, new_decode_dev(arg)); 4542 goto done_unlock; 4543 4544 case SET_DISK_FAULTY: 4545 err = set_disk_faulty(mddev, new_decode_dev(arg)); 4546 goto done_unlock; 4547 4548 case RUN_ARRAY: 4549 err = do_md_run (mddev); 4550 goto done_unlock; 4551 4552 case SET_BITMAP_FILE: 4553 err = set_bitmap_file(mddev, (int)arg); 4554 goto done_unlock; 4555 4556 default: 4557 err = -EINVAL; 4558 goto abort_unlock; 4559 } 4560 4561 done_unlock: 4562 abort_unlock: 4563 mddev_unlock(mddev); 4564 4565 return err; 4566 done: 4567 if (err) 4568 MD_BUG(); 4569 abort: 4570 return err; 4571 } 4572 4573 static int md_open(struct inode *inode, struct file *file) 4574 { 4575 /* 4576 * Succeed if we can lock the mddev, which confirms that 4577 * it isn't being stopped right now. 4578 */ 4579 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 4580 int err; 4581 4582 if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1))) 4583 goto out; 4584 4585 err = 0; 4586 mddev_get(mddev); 4587 mddev_unlock(mddev); 4588 4589 check_disk_change(inode->i_bdev); 4590 out: 4591 return err; 4592 } 4593 4594 static int md_release(struct inode *inode, struct file * file) 4595 { 4596 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 4597 4598 BUG_ON(!mddev); 4599 mddev_put(mddev); 4600 4601 return 0; 4602 } 4603 4604 static int md_media_changed(struct gendisk *disk) 4605 { 4606 mddev_t *mddev = disk->private_data; 4607 4608 return mddev->changed; 4609 } 4610 4611 static int md_revalidate(struct gendisk *disk) 4612 { 4613 mddev_t *mddev = disk->private_data; 4614 4615 mddev->changed = 0; 4616 return 0; 4617 } 4618 static struct block_device_operations md_fops = 4619 { 4620 .owner = THIS_MODULE, 4621 .open = md_open, 4622 .release = md_release, 4623 .ioctl = md_ioctl, 4624 .getgeo = md_getgeo, 4625 .media_changed = md_media_changed, 4626 .revalidate_disk= md_revalidate, 4627 }; 4628 4629 static int md_thread(void * arg) 4630 { 4631 mdk_thread_t *thread = arg; 4632 4633 /* 4634 * md_thread is a 'system-thread', it's priority should be very 4635 * high. We avoid resource deadlocks individually in each 4636 * raid personality. (RAID5 does preallocation) We also use RR and 4637 * the very same RT priority as kswapd, thus we will never get 4638 * into a priority inversion deadlock. 4639 * 4640 * we definitely have to have equal or higher priority than 4641 * bdflush, otherwise bdflush will deadlock if there are too 4642 * many dirty RAID5 blocks. 4643 */ 4644 4645 current->flags |= PF_NOFREEZE; 4646 allow_signal(SIGKILL); 4647 while (!kthread_should_stop()) { 4648 4649 /* We need to wait INTERRUPTIBLE so that 4650 * we don't add to the load-average. 
4651 * That means we need to be sure no signals are 4652 * pending 4653 */ 4654 if (signal_pending(current)) 4655 flush_signals(current); 4656 4657 wait_event_interruptible_timeout 4658 (thread->wqueue, 4659 test_bit(THREAD_WAKEUP, &thread->flags) 4660 || kthread_should_stop(), 4661 thread->timeout); 4662 4663 clear_bit(THREAD_WAKEUP, &thread->flags); 4664 4665 thread->run(thread->mddev); 4666 } 4667 4668 return 0; 4669 } 4670 4671 void md_wakeup_thread(mdk_thread_t *thread) 4672 { 4673 if (thread) { 4674 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm); 4675 set_bit(THREAD_WAKEUP, &thread->flags); 4676 wake_up(&thread->wqueue); 4677 } 4678 } 4679 4680 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, 4681 const char *name) 4682 { 4683 mdk_thread_t *thread; 4684 4685 thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL); 4686 if (!thread) 4687 return NULL; 4688 4689 init_waitqueue_head(&thread->wqueue); 4690 4691 thread->run = run; 4692 thread->mddev = mddev; 4693 thread->timeout = MAX_SCHEDULE_TIMEOUT; 4694 thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev)); 4695 if (IS_ERR(thread->tsk)) { 4696 kfree(thread); 4697 return NULL; 4698 } 4699 return thread; 4700 } 4701 4702 void md_unregister_thread(mdk_thread_t *thread) 4703 { 4704 dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); 4705 4706 kthread_stop(thread->tsk); 4707 kfree(thread); 4708 } 4709 4710 void md_error(mddev_t *mddev, mdk_rdev_t *rdev) 4711 { 4712 if (!mddev) { 4713 MD_BUG(); 4714 return; 4715 } 4716 4717 if (!rdev || test_bit(Faulty, &rdev->flags)) 4718 return; 4719 /* 4720 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 4721 mdname(mddev), 4722 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), 4723 __builtin_return_address(0),__builtin_return_address(1), 4724 __builtin_return_address(2),__builtin_return_address(3)); 4725 */ 4726 if (!mddev->pers) 4727 return; 4728 if (!mddev->pers->error_handler) 4729 return; 4730 mddev->pers->error_handler(mddev,rdev); 4731 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4732 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4733 md_wakeup_thread(mddev->thread); 4734 md_new_event_inintr(mddev); 4735 } 4736 4737 /* seq_file implementation /proc/mdstat */ 4738 4739 static void status_unused(struct seq_file *seq) 4740 { 4741 int i = 0; 4742 mdk_rdev_t *rdev; 4743 struct list_head *tmp; 4744 4745 seq_printf(seq, "unused devices: "); 4746 4747 ITERATE_RDEV_PENDING(rdev,tmp) { 4748 char b[BDEVNAME_SIZE]; 4749 i++; 4750 seq_printf(seq, "%s ", 4751 bdevname(rdev->bdev,b)); 4752 } 4753 if (!i) 4754 seq_printf(seq, "<none>"); 4755 4756 seq_printf(seq, "\n"); 4757 } 4758 4759 4760 static void status_resync(struct seq_file *seq, mddev_t * mddev) 4761 { 4762 sector_t max_blocks, resync, res; 4763 unsigned long dt, db, rt; 4764 int scale; 4765 unsigned int per_milli; 4766 4767 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; 4768 4769 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 4770 max_blocks = mddev->resync_max_sectors >> 1; 4771 else 4772 max_blocks = mddev->size; 4773 4774 /* 4775 * Should not happen. 4776 */ 4777 if (!max_blocks) { 4778 MD_BUG(); 4779 return; 4780 } 4781 /* Pick 'scale' such that (resync>>scale)*1000 will fit 4782 * in a sector_t, and (max_blocks>>scale) will fit in a 4783 * u32, as those are the requirements for sector_div. 
4784 * Thus 'scale' must be at least 10 4785 */ 4786 scale = 10; 4787 if (sizeof(sector_t) > sizeof(unsigned long)) { 4788 while ( max_blocks/2 > (1ULL<<(scale+32))) 4789 scale++; 4790 } 4791 res = (resync>>scale)*1000; 4792 sector_div(res, (u32)((max_blocks>>scale)+1)); 4793 4794 per_milli = res; 4795 { 4796 int i, x = per_milli/50, y = 20-x; 4797 seq_printf(seq, "["); 4798 for (i = 0; i < x; i++) 4799 seq_printf(seq, "="); 4800 seq_printf(seq, ">"); 4801 for (i = 0; i < y; i++) 4802 seq_printf(seq, "."); 4803 seq_printf(seq, "] "); 4804 } 4805 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 4806 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 4807 "reshape" : 4808 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 4809 "check" : 4810 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 4811 "resync" : "recovery"))), 4812 per_milli/10, per_milli % 10, 4813 (unsigned long long) resync, 4814 (unsigned long long) max_blocks); 4815 4816 /* 4817 * We do not want to overflow, so the order of operands and 4818 * the * 100 / 100 trick are important. We do a +1 to be 4819 * safe against division by zero. We only estimate anyway. 4820 * 4821 * dt: time from mark until now 4822 * db: blocks written from mark until now 4823 * rt: remaining time 4824 */ 4825 dt = ((jiffies - mddev->resync_mark) / HZ); 4826 if (!dt) dt++; 4827 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) 4828 - mddev->resync_mark_cnt; 4829 rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100; 4830 4831 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); 4832 4833 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 4834 } 4835 4836 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 4837 { 4838 struct list_head *tmp; 4839 loff_t l = *pos; 4840 mddev_t *mddev; 4841 4842 if (l >= 0x10000) 4843 return NULL; 4844 if (!l--) 4845 /* header */ 4846 return (void*)1; 4847 4848 spin_lock(&all_mddevs_lock); 4849 list_for_each(tmp,&all_mddevs) 4850 if (!l--) { 4851 mddev = list_entry(tmp, mddev_t, all_mddevs); 4852 mddev_get(mddev); 4853 spin_unlock(&all_mddevs_lock); 4854 return mddev; 4855 } 4856 spin_unlock(&all_mddevs_lock); 4857 if (!l--) 4858 return (void*)2;/* tail */ 4859 return NULL; 4860 } 4861 4862 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 4863 { 4864 struct list_head *tmp; 4865 mddev_t *next_mddev, *mddev = v; 4866 4867 ++*pos; 4868 if (v == (void*)2) 4869 return NULL; 4870 4871 spin_lock(&all_mddevs_lock); 4872 if (v == (void*)1) 4873 tmp = all_mddevs.next; 4874 else 4875 tmp = mddev->all_mddevs.next; 4876 if (tmp != &all_mddevs) 4877 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); 4878 else { 4879 next_mddev = (void*)2; 4880 *pos = 0x10000; 4881 } 4882 spin_unlock(&all_mddevs_lock); 4883 4884 if (v != (void*)1) 4885 mddev_put(mddev); 4886 return next_mddev; 4887 4888 } 4889 4890 static void md_seq_stop(struct seq_file *seq, void *v) 4891 { 4892 mddev_t *mddev = v; 4893 4894 if (mddev && v != (void*)1 && v != (void*)2) 4895 mddev_put(mddev); 4896 } 4897 4898 struct mdstat_info { 4899 int event; 4900 }; 4901 4902 static int md_seq_show(struct seq_file *seq, void *v) 4903 { 4904 mddev_t *mddev = v; 4905 sector_t size; 4906 struct list_head *tmp2; 4907 mdk_rdev_t *rdev; 4908 struct mdstat_info *mi = seq->private; 4909 struct bitmap *bitmap; 4910 4911 if (v == (void*)1) { 4912 struct mdk_personality *pers; 4913 seq_printf(seq, "Personalities : "); 4914 spin_lock(&pers_lock); 4915 list_for_each_entry(pers, &pers_list, list) 4916 seq_printf(seq, "[%s] ", 
pers->name); 4917 4918 spin_unlock(&pers_lock); 4919 seq_printf(seq, "\n"); 4920 mi->event = atomic_read(&md_event_count); 4921 return 0; 4922 } 4923 if (v == (void*)2) { 4924 status_unused(seq); 4925 return 0; 4926 } 4927 4928 if (mddev_lock(mddev) < 0) 4929 return -EINTR; 4930 4931 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 4932 seq_printf(seq, "%s : %sactive", mdname(mddev), 4933 mddev->pers ? "" : "in"); 4934 if (mddev->pers) { 4935 if (mddev->ro==1) 4936 seq_printf(seq, " (read-only)"); 4937 if (mddev->ro==2) 4938 seq_printf(seq, "(auto-read-only)"); 4939 seq_printf(seq, " %s", mddev->pers->name); 4940 } 4941 4942 size = 0; 4943 ITERATE_RDEV(mddev,rdev,tmp2) { 4944 char b[BDEVNAME_SIZE]; 4945 seq_printf(seq, " %s[%d]", 4946 bdevname(rdev->bdev,b), rdev->desc_nr); 4947 if (test_bit(WriteMostly, &rdev->flags)) 4948 seq_printf(seq, "(W)"); 4949 if (test_bit(Faulty, &rdev->flags)) { 4950 seq_printf(seq, "(F)"); 4951 continue; 4952 } else if (rdev->raid_disk < 0) 4953 seq_printf(seq, "(S)"); /* spare */ 4954 size += rdev->size; 4955 } 4956 4957 if (!list_empty(&mddev->disks)) { 4958 if (mddev->pers) 4959 seq_printf(seq, "\n %llu blocks", 4960 (unsigned long long)mddev->array_size); 4961 else 4962 seq_printf(seq, "\n %llu blocks", 4963 (unsigned long long)size); 4964 } 4965 if (mddev->persistent) { 4966 if (mddev->major_version != 0 || 4967 mddev->minor_version != 90) { 4968 seq_printf(seq," super %d.%d", 4969 mddev->major_version, 4970 mddev->minor_version); 4971 } 4972 } else 4973 seq_printf(seq, " super non-persistent"); 4974 4975 if (mddev->pers) { 4976 mddev->pers->status (seq, mddev); 4977 seq_printf(seq, "\n "); 4978 if (mddev->pers->sync_request) { 4979 if (mddev->curr_resync > 2) { 4980 status_resync (seq, mddev); 4981 seq_printf(seq, "\n "); 4982 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 4983 seq_printf(seq, "\tresync=DELAYED\n "); 4984 else if (mddev->recovery_cp < MaxSector) 4985 seq_printf(seq, "\tresync=PENDING\n "); 4986 } 4987 } else 4988 seq_printf(seq, "\n "); 4989 4990 if ((bitmap = mddev->bitmap)) { 4991 unsigned long chunk_kb; 4992 unsigned long flags; 4993 spin_lock_irqsave(&bitmap->lock, flags); 4994 chunk_kb = bitmap->chunksize >> 10; 4995 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " 4996 "%lu%s chunk", 4997 bitmap->pages - bitmap->missing_pages, 4998 bitmap->pages, 4999 (bitmap->pages - bitmap->missing_pages) 5000 << (PAGE_SHIFT - 10), 5001 chunk_kb ? chunk_kb : bitmap->chunksize, 5002 chunk_kb ? 
"KB" : "B"); 5003 if (bitmap->file) { 5004 seq_printf(seq, ", file: "); 5005 seq_path(seq, bitmap->file->f_path.mnt, 5006 bitmap->file->f_path.dentry," \t\n"); 5007 } 5008 5009 seq_printf(seq, "\n"); 5010 spin_unlock_irqrestore(&bitmap->lock, flags); 5011 } 5012 5013 seq_printf(seq, "\n"); 5014 } 5015 mddev_unlock(mddev); 5016 5017 return 0; 5018 } 5019 5020 static struct seq_operations md_seq_ops = { 5021 .start = md_seq_start, 5022 .next = md_seq_next, 5023 .stop = md_seq_stop, 5024 .show = md_seq_show, 5025 }; 5026 5027 static int md_seq_open(struct inode *inode, struct file *file) 5028 { 5029 int error; 5030 struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL); 5031 if (mi == NULL) 5032 return -ENOMEM; 5033 5034 error = seq_open(file, &md_seq_ops); 5035 if (error) 5036 kfree(mi); 5037 else { 5038 struct seq_file *p = file->private_data; 5039 p->private = mi; 5040 mi->event = atomic_read(&md_event_count); 5041 } 5042 return error; 5043 } 5044 5045 static unsigned int mdstat_poll(struct file *filp, poll_table *wait) 5046 { 5047 struct seq_file *m = filp->private_data; 5048 struct mdstat_info *mi = m->private; 5049 int mask; 5050 5051 poll_wait(filp, &md_event_waiters, wait); 5052 5053 /* always allow read */ 5054 mask = POLLIN | POLLRDNORM; 5055 5056 if (mi->event != atomic_read(&md_event_count)) 5057 mask |= POLLERR | POLLPRI; 5058 return mask; 5059 } 5060 5061 static const struct file_operations md_seq_fops = { 5062 .owner = THIS_MODULE, 5063 .open = md_seq_open, 5064 .read = seq_read, 5065 .llseek = seq_lseek, 5066 .release = seq_release_private, 5067 .poll = mdstat_poll, 5068 }; 5069 5070 int register_md_personality(struct mdk_personality *p) 5071 { 5072 spin_lock(&pers_lock); 5073 list_add_tail(&p->list, &pers_list); 5074 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level); 5075 spin_unlock(&pers_lock); 5076 return 0; 5077 } 5078 5079 int unregister_md_personality(struct mdk_personality *p) 5080 { 5081 printk(KERN_INFO "md: %s personality unregistered\n", p->name); 5082 spin_lock(&pers_lock); 5083 list_del_init(&p->list); 5084 spin_unlock(&pers_lock); 5085 return 0; 5086 } 5087 5088 static int is_mddev_idle(mddev_t *mddev) 5089 { 5090 mdk_rdev_t * rdev; 5091 struct list_head *tmp; 5092 int idle; 5093 unsigned long curr_events; 5094 5095 idle = 1; 5096 ITERATE_RDEV(mddev,rdev,tmp) { 5097 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 5098 curr_events = disk_stat_read(disk, sectors[0]) + 5099 disk_stat_read(disk, sectors[1]) - 5100 atomic_read(&disk->sync_io); 5101 /* The difference between curr_events and last_events 5102 * will be affected by any new non-sync IO (making 5103 * curr_events bigger) and any difference in the amount of 5104 * in-flight syncio (making current_events bigger or smaller) 5105 * The amount in-flight is currently limited to 5106 * 32*64K in raid1/10 and 256*PAGE_SIZE in raid5/6 5107 * which is at most 4096 sectors. 5108 * These numbers are fairly fragile and should be made 5109 * more robust, probably by enforcing the 5110 * 'window size' that md_do_sync sort-of uses. 5111 * 5112 * Note: the following is an unsigned comparison. 
5113 */ 5114 if ((long)curr_events - (long)rdev->last_events > 4096) { 5115 rdev->last_events = curr_events; 5116 idle = 0; 5117 } 5118 } 5119 return idle; 5120 } 5121 5122 void md_done_sync(mddev_t *mddev, int blocks, int ok) 5123 { 5124 /* another "blocks" (512byte) blocks have been synced */ 5125 atomic_sub(blocks, &mddev->recovery_active); 5126 wake_up(&mddev->recovery_wait); 5127 if (!ok) { 5128 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 5129 md_wakeup_thread(mddev->thread); 5130 // stop recovery, signal do_sync .... 5131 } 5132 } 5133 5134 5135 /* md_write_start(mddev, bi) 5136 * If we need to update some array metadata (e.g. 'active' flag 5137 * in superblock) before writing, schedule a superblock update 5138 * and wait for it to complete. 5139 */ 5140 void md_write_start(mddev_t *mddev, struct bio *bi) 5141 { 5142 if (bio_data_dir(bi) != WRITE) 5143 return; 5144 5145 BUG_ON(mddev->ro == 1); 5146 if (mddev->ro == 2) { 5147 /* need to switch to read/write */ 5148 mddev->ro = 0; 5149 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5150 md_wakeup_thread(mddev->thread); 5151 } 5152 atomic_inc(&mddev->writes_pending); 5153 if (mddev->in_sync) { 5154 spin_lock_irq(&mddev->write_lock); 5155 if (mddev->in_sync) { 5156 mddev->in_sync = 0; 5157 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 5158 md_wakeup_thread(mddev->thread); 5159 } 5160 spin_unlock_irq(&mddev->write_lock); 5161 } 5162 wait_event(mddev->sb_wait, mddev->flags==0); 5163 } 5164 5165 void md_write_end(mddev_t *mddev) 5166 { 5167 if (atomic_dec_and_test(&mddev->writes_pending)) { 5168 if (mddev->safemode == 2) 5169 md_wakeup_thread(mddev->thread); 5170 else if (mddev->safemode_delay) 5171 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); 5172 } 5173 } 5174 5175 /* md_allow_write(mddev) 5176 * Calling this ensures that the array is marked 'active' so that writes 5177 * may proceed without blocking. It is important to call this before 5178 * attempting a GFP_KERNEL allocation while holding the mddev lock. 5179 * Must be called with mddev_lock held. 5180 */ 5181 void md_allow_write(mddev_t *mddev) 5182 { 5183 if (!mddev->pers) 5184 return; 5185 if (mddev->ro) 5186 return; 5187 5188 spin_lock_irq(&mddev->write_lock); 5189 if (mddev->in_sync) { 5190 mddev->in_sync = 0; 5191 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 5192 if (mddev->safemode_delay && 5193 mddev->safemode == 0) 5194 mddev->safemode = 1; 5195 spin_unlock_irq(&mddev->write_lock); 5196 md_update_sb(mddev, 0); 5197 } else 5198 spin_unlock_irq(&mddev->write_lock); 5199 } 5200 EXPORT_SYMBOL_GPL(md_allow_write); 5201 5202 static DECLARE_WAIT_QUEUE_HEAD(resync_wait); 5203 5204 #define SYNC_MARKS 10 5205 #define SYNC_MARK_STEP (3*HZ) 5206 void md_do_sync(mddev_t *mddev) 5207 { 5208 mddev_t *mddev2; 5209 unsigned int currspeed = 0, 5210 window; 5211 sector_t max_sectors,j, io_sectors; 5212 unsigned long mark[SYNC_MARKS]; 5213 sector_t mark_cnt[SYNC_MARKS]; 5214 int last_mark,m; 5215 struct list_head *tmp; 5216 sector_t last_check; 5217 int skipped = 0; 5218 struct list_head *rtmp; 5219 mdk_rdev_t *rdev; 5220 char *desc; 5221 5222 /* just incase thread restarts... 
*/ 5223 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 5224 return; 5225 if (mddev->ro) /* never try to sync a read-only array */ 5226 return; 5227 5228 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5229 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 5230 desc = "data-check"; 5231 else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 5232 desc = "requested-resync"; 5233 else 5234 desc = "resync"; 5235 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5236 desc = "reshape"; 5237 else 5238 desc = "recovery"; 5239 5240 /* we overload curr_resync somewhat here. 5241 * 0 == not engaged in resync at all 5242 * 2 == checking that there is no conflict with another sync 5243 * 1 == like 2, but have yielded to allow conflicting resync to 5244 * commense 5245 * other == active in resync - this many blocks 5246 * 5247 * Before starting a resync we must have set curr_resync to 5248 * 2, and then checked that every "conflicting" array has curr_resync 5249 * less than ours. When we find one that is the same or higher 5250 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 5251 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 5252 * This will mean we have to start checking from the beginning again. 5253 * 5254 */ 5255 5256 do { 5257 mddev->curr_resync = 2; 5258 5259 try_again: 5260 if (kthread_should_stop()) { 5261 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5262 goto skip; 5263 } 5264 ITERATE_MDDEV(mddev2,tmp) { 5265 if (mddev2 == mddev) 5266 continue; 5267 if (mddev2->curr_resync && 5268 match_mddev_units(mddev,mddev2)) { 5269 DEFINE_WAIT(wq); 5270 if (mddev < mddev2 && mddev->curr_resync == 2) { 5271 /* arbitrarily yield */ 5272 mddev->curr_resync = 1; 5273 wake_up(&resync_wait); 5274 } 5275 if (mddev > mddev2 && mddev->curr_resync == 1) 5276 /* no need to wait here, we can wait the next 5277 * time 'round when curr_resync == 2 5278 */ 5279 continue; 5280 prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE); 5281 if (!kthread_should_stop() && 5282 mddev2->curr_resync >= mddev->curr_resync) { 5283 printk(KERN_INFO "md: delaying %s of %s" 5284 " until %s has finished (they" 5285 " share one or more physical units)\n", 5286 desc, mdname(mddev), mdname(mddev2)); 5287 mddev_put(mddev2); 5288 schedule(); 5289 finish_wait(&resync_wait, &wq); 5290 goto try_again; 5291 } 5292 finish_wait(&resync_wait, &wq); 5293 } 5294 } 5295 } while (mddev->curr_resync < 2); 5296 5297 j = 0; 5298 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5299 /* resync follows the size requested by the personality, 5300 * which defaults to physical size, but can be virtual size 5301 */ 5302 max_sectors = mddev->resync_max_sectors; 5303 mddev->resync_mismatches = 0; 5304 /* we don't use the checkpoint if there's a bitmap */ 5305 if (!mddev->bitmap && 5306 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 5307 j = mddev->recovery_cp; 5308 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5309 max_sectors = mddev->size << 1; 5310 else { 5311 /* recovery follows the physical size of devices */ 5312 max_sectors = mddev->size << 1; 5313 j = MaxSector; 5314 ITERATE_RDEV(mddev,rdev,rtmp) 5315 if (rdev->raid_disk >= 0 && 5316 !test_bit(Faulty, &rdev->flags) && 5317 !test_bit(In_sync, &rdev->flags) && 5318 rdev->recovery_offset < j) 5319 j = rdev->recovery_offset; 5320 } 5321 5322 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); 5323 printk(KERN_INFO "md: minimum _guaranteed_ speed:" 5324 " %d KB/sec/disk.\n", speed_min(mddev)); 5325 
printk(KERN_INFO "md: using maximum available idle IO bandwidth " 5326 "(but not more than %d KB/sec) for %s.\n", 5327 speed_max(mddev), desc); 5328 5329 is_mddev_idle(mddev); /* this also initializes IO event counters */ 5330 5331 io_sectors = 0; 5332 for (m = 0; m < SYNC_MARKS; m++) { 5333 mark[m] = jiffies; 5334 mark_cnt[m] = io_sectors; 5335 } 5336 last_mark = 0; 5337 mddev->resync_mark = mark[last_mark]; 5338 mddev->resync_mark_cnt = mark_cnt[last_mark]; 5339 5340 /* 5341 * Tune reconstruction: 5342 */ 5343 window = 32*(PAGE_SIZE/512); 5344 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", 5345 window/2,(unsigned long long) max_sectors/2); 5346 5347 atomic_set(&mddev->recovery_active, 0); 5348 init_waitqueue_head(&mddev->recovery_wait); 5349 last_check = 0; 5350 5351 if (j>2) { 5352 printk(KERN_INFO 5353 "md: resuming %s of %s from checkpoint.\n", 5354 desc, mdname(mddev)); 5355 mddev->curr_resync = j; 5356 } 5357 5358 while (j < max_sectors) { 5359 sector_t sectors; 5360 5361 skipped = 0; 5362 sectors = mddev->pers->sync_request(mddev, j, &skipped, 5363 currspeed < speed_min(mddev)); 5364 if (sectors == 0) { 5365 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 5366 goto out; 5367 } 5368 5369 if (!skipped) { /* actual IO requested */ 5370 io_sectors += sectors; 5371 atomic_add(sectors, &mddev->recovery_active); 5372 } 5373 5374 j += sectors; 5375 if (j>1) mddev->curr_resync = j; 5376 mddev->curr_mark_cnt = io_sectors; 5377 if (last_check == 0) 5378 /* this is the earliers that rebuilt will be 5379 * visible in /proc/mdstat 5380 */ 5381 md_new_event(mddev); 5382 5383 if (last_check + window > io_sectors || j == max_sectors) 5384 continue; 5385 5386 last_check = io_sectors; 5387 5388 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || 5389 test_bit(MD_RECOVERY_ERR, &mddev->recovery)) 5390 break; 5391 5392 repeat: 5393 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 5394 /* step marks */ 5395 int next = (last_mark+1) % SYNC_MARKS; 5396 5397 mddev->resync_mark = mark[next]; 5398 mddev->resync_mark_cnt = mark_cnt[next]; 5399 mark[next] = jiffies; 5400 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 5401 last_mark = next; 5402 } 5403 5404 5405 if (kthread_should_stop()) { 5406 /* 5407 * got a signal, exit. 5408 */ 5409 printk(KERN_INFO 5410 "md: md_do_sync() got signal ... exiting\n"); 5411 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5412 goto out; 5413 } 5414 5415 /* 5416 * this loop exits only if either when we are slower than 5417 * the 'hard' speed limit, or the system was IO-idle for 5418 * a jiffy. 5419 * the system might be non-idle CPU-wise, but we only care 5420 * about not overloading the IO subsystem. 
(things like an 5421 * e2fsck being done on the RAID array should execute fast) 5422 */ 5423 mddev->queue->unplug_fn(mddev->queue); 5424 cond_resched(); 5425 5426 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 5427 /((jiffies-mddev->resync_mark)/HZ +1) +1; 5428 5429 if (currspeed > speed_min(mddev)) { 5430 if ((currspeed > speed_max(mddev)) || 5431 !is_mddev_idle(mddev)) { 5432 msleep(500); 5433 goto repeat; 5434 } 5435 } 5436 } 5437 printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); 5438 /* 5439 * this also signals 'finished resyncing' to md_stop 5440 */ 5441 out: 5442 mddev->queue->unplug_fn(mddev->queue); 5443 5444 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 5445 5446 /* tell personality that we are finished */ 5447 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); 5448 5449 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 5450 !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 5451 mddev->curr_resync > 2) { 5452 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5453 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 5454 if (mddev->curr_resync >= mddev->recovery_cp) { 5455 printk(KERN_INFO 5456 "md: checkpointing %s of %s.\n", 5457 desc, mdname(mddev)); 5458 mddev->recovery_cp = mddev->curr_resync; 5459 } 5460 } else 5461 mddev->recovery_cp = MaxSector; 5462 } else { 5463 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5464 mddev->curr_resync = MaxSector; 5465 ITERATE_RDEV(mddev,rdev,rtmp) 5466 if (rdev->raid_disk >= 0 && 5467 !test_bit(Faulty, &rdev->flags) && 5468 !test_bit(In_sync, &rdev->flags) && 5469 rdev->recovery_offset < mddev->curr_resync) 5470 rdev->recovery_offset = mddev->curr_resync; 5471 } 5472 } 5473 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5474 5475 skip: 5476 mddev->curr_resync = 0; 5477 wake_up(&resync_wait); 5478 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 5479 md_wakeup_thread(mddev->thread); 5480 } 5481 EXPORT_SYMBOL_GPL(md_do_sync); 5482 5483 5484 static int remove_and_add_spares(mddev_t *mddev) 5485 { 5486 mdk_rdev_t *rdev; 5487 struct list_head *rtmp; 5488 int spares = 0; 5489 5490 ITERATE_RDEV(mddev,rdev,rtmp) 5491 if (rdev->raid_disk >= 0 && 5492 (test_bit(Faulty, &rdev->flags) || 5493 ! test_bit(In_sync, &rdev->flags)) && 5494 atomic_read(&rdev->nr_pending)==0) { 5495 if (mddev->pers->hot_remove_disk( 5496 mddev, rdev->raid_disk)==0) { 5497 char nm[20]; 5498 sprintf(nm,"rd%d", rdev->raid_disk); 5499 sysfs_remove_link(&mddev->kobj, nm); 5500 rdev->raid_disk = -1; 5501 } 5502 } 5503 5504 if (mddev->degraded) { 5505 ITERATE_RDEV(mddev,rdev,rtmp) 5506 if (rdev->raid_disk < 0 5507 && !test_bit(Faulty, &rdev->flags)) { 5508 rdev->recovery_offset = 0; 5509 if (mddev->pers->hot_add_disk(mddev,rdev)) { 5510 char nm[20]; 5511 sprintf(nm, "rd%d", rdev->raid_disk); 5512 if (sysfs_create_link(&mddev->kobj, 5513 &rdev->kobj, nm)) 5514 printk(KERN_WARNING 5515 "md: cannot register " 5516 "%s for %s\n", 5517 nm, mdname(mddev)); 5518 spares++; 5519 md_new_event(mddev); 5520 } else 5521 break; 5522 } 5523 } 5524 return spares; 5525 } 5526 /* 5527 * This routine is regularly called by all per-raid-array threads to 5528 * deal with generic issues like resync and super-block update. 5529 * Raid personalities that don't have a thread (linear/raid0) do not 5530 * need this as they never do any recovery or update the superblock. 5531 * 5532 * It does not do any resync itself, but rather "forks" off other threads 5533 * to do that as needed. 
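 *
 * (As an illustration, not defined in this file: the per-personality
 * daemon threads, e.g. raid1d() and raid5d(), are the "per-raid-array
 * threads" meant here, and they invoke md_check_recovery(mddev) each
 * time they wake up before handling their own work.)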
5534 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
5535 * "->recovery" and create a thread at ->sync_thread.
5536 * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
5537 * and wakes up this thread which will reap the thread and finish up.
5538 * This thread also removes any faulty devices (with nr_pending == 0).
5539 *
5540 * The overall approach is:
5541 * 1/ if the superblock needs updating, update it.
5542 * 2/ If a recovery thread is running, don't do anything else.
5543 * 3/ If recovery has finished, clean up, possibly marking spares active.
5544 * 4/ If there are any faulty devices, remove them.
5545 * 5/ If array is degraded, try to add spare devices
5546 * 6/ If array has spares or is not in-sync, start a resync thread.
5547 */
5548 void md_check_recovery(mddev_t *mddev)
5549 {
5550 mdk_rdev_t *rdev;
5551 struct list_head *rtmp;
5552
5553
5554 if (mddev->bitmap)
5555 bitmap_daemon_work(mddev->bitmap);
5556
5557 if (mddev->ro)
5558 return;
5559
5560 if (signal_pending(current)) {
5561 if (mddev->pers->sync_request) {
5562 printk(KERN_INFO "md: %s in immediate safe mode\n",
5563 mdname(mddev));
5564 mddev->safemode = 2;
5565 }
5566 flush_signals(current);
5567 }
5568
5569 if ( ! (
5570 mddev->flags ||
5571 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
5572 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
5573 (mddev->safemode == 1) ||
5574 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
5575 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
5576 ))
5577 return;
5578
5579 if (mddev_trylock(mddev)) {
5580 int spares = 0;
5581
5582 spin_lock_irq(&mddev->write_lock);
5583 if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
5584 !mddev->in_sync && mddev->recovery_cp == MaxSector) {
5585 mddev->in_sync = 1;
5586 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
5587 }
5588 if (mddev->safemode == 1)
5589 mddev->safemode = 0;
5590 spin_unlock_irq(&mddev->write_lock);
5591
5592 if (mddev->flags)
5593 md_update_sb(mddev, 0);
5594
5595
5596 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
5597 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
5598 /* resync/recovery still happening */
5599 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5600 goto unlock;
5601 }
5602 if (mddev->sync_thread) {
5603 /* resync has finished, collect result */
5604 md_unregister_thread(mddev->sync_thread);
5605 mddev->sync_thread = NULL;
5606 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
5607 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5608 /* success...*/
5609 /* activate any spares */
5610 mddev->pers->spare_active(mddev);
5611 }
5612 md_update_sb(mddev, 1);
5613
5614 /* if array is no longer degraded, then any saved_raid_disk
5615 * information must be scrapped
5616 */
5617 if (!mddev->degraded)
5618 ITERATE_RDEV(mddev,rdev,rtmp)
5619 rdev->saved_raid_disk = -1;
5620
5621 mddev->recovery = 0;
5622 /* flag recovery needed just to double check */
5623 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5624 md_new_event(mddev);
5625 goto unlock;
5626 }
5627 /* Clear some bits that don't mean anything, but
5628 * might be left set
5629 */
5630 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5631 clear_bit(MD_RECOVERY_ERR, &mddev->recovery);
5632 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
5633 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
5634
5635 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
5636 goto unlock;
5637 /* no recovery is running.
5638 * remove any failed drives, then 5639 * add spares if possible. 5640 * Spare are also removed and re-added, to allow 5641 * the personality to fail the re-add. 5642 */ 5643 5644 if (mddev->reshape_position != MaxSector) { 5645 if (mddev->pers->check_reshape(mddev) != 0) 5646 /* Cannot proceed */ 5647 goto unlock; 5648 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 5649 } else if ((spares = remove_and_add_spares(mddev))) { 5650 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5651 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5652 } else if (mddev->recovery_cp < MaxSector) { 5653 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5654 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 5655 /* nothing to be done ... */ 5656 goto unlock; 5657 5658 if (mddev->pers->sync_request) { 5659 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 5660 if (spares && mddev->bitmap && ! mddev->bitmap->file) { 5661 /* We are adding a device or devices to an array 5662 * which has the bitmap stored on all devices. 5663 * So make sure all bitmap pages get written 5664 */ 5665 bitmap_write_all(mddev->bitmap); 5666 } 5667 mddev->sync_thread = md_register_thread(md_do_sync, 5668 mddev, 5669 "%s_resync"); 5670 if (!mddev->sync_thread) { 5671 printk(KERN_ERR "%s: could not start resync" 5672 " thread...\n", 5673 mdname(mddev)); 5674 /* leave the spares where they are, it shouldn't hurt */ 5675 mddev->recovery = 0; 5676 } else 5677 md_wakeup_thread(mddev->sync_thread); 5678 md_new_event(mddev); 5679 } 5680 unlock: 5681 mddev_unlock(mddev); 5682 } 5683 } 5684 5685 static int md_notify_reboot(struct notifier_block *this, 5686 unsigned long code, void *x) 5687 { 5688 struct list_head *tmp; 5689 mddev_t *mddev; 5690 5691 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { 5692 5693 printk(KERN_INFO "md: stopping all md devices.\n"); 5694 5695 ITERATE_MDDEV(mddev,tmp) 5696 if (mddev_trylock(mddev)) { 5697 do_md_stop (mddev, 1); 5698 mddev_unlock(mddev); 5699 } 5700 /* 5701 * certain more exotic SCSI devices are known to be 5702 * volatile wrt too early system reboots. While the 5703 * right place to handle this issue is the given 5704 * driver, we do want to have a safe RAID driver ... 5705 */ 5706 mdelay(1000*1); 5707 } 5708 return NOTIFY_DONE; 5709 } 5710 5711 static struct notifier_block md_notifier = { 5712 .notifier_call = md_notify_reboot, 5713 .next = NULL, 5714 .priority = INT_MAX, /* before any real devices */ 5715 }; 5716 5717 static void md_geninit(void) 5718 { 5719 struct proc_dir_entry *p; 5720 5721 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 5722 5723 p = create_proc_entry("mdstat", S_IRUGO, NULL); 5724 if (p) 5725 p->proc_fops = &md_seq_fops; 5726 } 5727 5728 static int __init md_init(void) 5729 { 5730 if (register_blkdev(MAJOR_NR, "md")) 5731 return -1; 5732 if ((mdp_major=register_blkdev(0, "mdp"))<=0) { 5733 unregister_blkdev(MAJOR_NR, "md"); 5734 return -1; 5735 } 5736 blk_register_region(MKDEV(MAJOR_NR, 0), 1UL<<MINORBITS, THIS_MODULE, 5737 md_probe, NULL, NULL); 5738 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, 5739 md_probe, NULL, NULL); 5740 5741 register_reboot_notifier(&md_notifier); 5742 raid_table_header = register_sysctl_table(raid_root_table); 5743 5744 md_geninit(); 5745 return (0); 5746 } 5747 5748 5749 #ifndef MODULE 5750 5751 /* 5752 * Searches all registered partitions for autorun RAID arrays 5753 * at boot time. 
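 *
 * (The detected_devices[] table below is filled by md_autodetect_dev(),
 * which the partition-scanning code is expected to call for partitions
 * marked for RAID autodetection - conventionally type 0xfd on msdos
 * partition tables.  autostart_arrays() then imports each device and
 * hands the lot to autorun_devices() for assembly.)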
5754 */ 5755 static dev_t detected_devices[128]; 5756 static int dev_cnt; 5757 5758 void md_autodetect_dev(dev_t dev) 5759 { 5760 if (dev_cnt >= 0 && dev_cnt < 127) 5761 detected_devices[dev_cnt++] = dev; 5762 } 5763 5764 5765 static void autostart_arrays(int part) 5766 { 5767 mdk_rdev_t *rdev; 5768 int i; 5769 5770 printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 5771 5772 for (i = 0; i < dev_cnt; i++) { 5773 dev_t dev = detected_devices[i]; 5774 5775 rdev = md_import_device(dev,0, 0); 5776 if (IS_ERR(rdev)) 5777 continue; 5778 5779 if (test_bit(Faulty, &rdev->flags)) { 5780 MD_BUG(); 5781 continue; 5782 } 5783 list_add(&rdev->same_set, &pending_raid_disks); 5784 } 5785 dev_cnt = 0; 5786 5787 autorun_devices(part); 5788 } 5789 5790 #endif /* !MODULE */ 5791 5792 static __exit void md_exit(void) 5793 { 5794 mddev_t *mddev; 5795 struct list_head *tmp; 5796 5797 blk_unregister_region(MKDEV(MAJOR_NR,0), 1U << MINORBITS); 5798 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); 5799 5800 unregister_blkdev(MAJOR_NR,"md"); 5801 unregister_blkdev(mdp_major, "mdp"); 5802 unregister_reboot_notifier(&md_notifier); 5803 unregister_sysctl_table(raid_table_header); 5804 remove_proc_entry("mdstat", NULL); 5805 ITERATE_MDDEV(mddev,tmp) { 5806 struct gendisk *disk = mddev->gendisk; 5807 if (!disk) 5808 continue; 5809 export_array(mddev); 5810 del_gendisk(disk); 5811 put_disk(disk); 5812 mddev->gendisk = NULL; 5813 mddev_put(mddev); 5814 } 5815 } 5816 5817 module_init(md_init) 5818 module_exit(md_exit) 5819 5820 static int get_ro(char *buffer, struct kernel_param *kp) 5821 { 5822 return sprintf(buffer, "%d", start_readonly); 5823 } 5824 static int set_ro(const char *val, struct kernel_param *kp) 5825 { 5826 char *e; 5827 int num = simple_strtoul(val, &e, 10); 5828 if (*val && (*e == '\0' || *e == '\n')) { 5829 start_readonly = num; 5830 return 0; 5831 } 5832 return -EINVAL; 5833 } 5834 5835 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); 5836 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); 5837 5838 5839 EXPORT_SYMBOL(register_md_personality); 5840 EXPORT_SYMBOL(unregister_md_personality); 5841 EXPORT_SYMBOL(md_error); 5842 EXPORT_SYMBOL(md_done_sync); 5843 EXPORT_SYMBOL(md_write_start); 5844 EXPORT_SYMBOL(md_write_end); 5845 EXPORT_SYMBOL(md_register_thread); 5846 EXPORT_SYMBOL(md_unregister_thread); 5847 EXPORT_SYMBOL(md_wakeup_thread); 5848 EXPORT_SYMBOL(md_check_recovery); 5849 MODULE_LICENSE("GPL"); 5850 MODULE_ALIAS("md"); 5851 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); 5852