1 /* 2 md.c : Multiple Devices driver for Linux 3 Copyright (C) 1998, 1999, 2000 Ingo Molnar 4 5 completely rewritten, based on the MD driver code from Marc Zyngier 6 7 Changes: 8 9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar 10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> 11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> 12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su> 13 - kmod support by: Cyrus Durgin 14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> 15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> 16 17 - lots of fixes and improvements to the RAID1/RAID5 and generic 18 RAID code (such as request based resynchronization): 19 20 Neil Brown <neilb@cse.unsw.edu.au>. 21 22 - persistent bitmap code 23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. 24 25 This program is free software; you can redistribute it and/or modify 26 it under the terms of the GNU General Public License as published by 27 the Free Software Foundation; either version 2, or (at your option) 28 any later version. 29 30 You should have received a copy of the GNU General Public License 31 (for example /usr/src/linux/COPYING); if not, write to the Free 32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 33 */ 34 35 #include <linux/kthread.h> 36 #include <linux/raid/md.h> 37 #include <linux/raid/bitmap.h> 38 #include <linux/sysctl.h> 39 #include <linux/buffer_head.h> /* for invalidate_bdev */ 40 #include <linux/poll.h> 41 #include <linux/ctype.h> 42 #include <linux/hdreg.h> 43 #include <linux/proc_fs.h> 44 #include <linux/random.h> 45 #include <linux/reboot.h> 46 #include <linux/file.h> 47 #include <linux/delay.h> 48 49 #define MAJOR_NR MD_MAJOR 50 51 /* 63 partitions with the alternate major number (mdp) */ 52 #define MdpMinorShift 6 53 54 #define DEBUG 0 55 #define dprintk(x...) ((void)(DEBUG && printk(x))) 56 57 58 #ifndef MODULE 59 static void autostart_arrays(int part); 60 #endif 61 62 static LIST_HEAD(pers_list); 63 static DEFINE_SPINLOCK(pers_lock); 64 65 static void md_print_devices(void); 66 67 static DECLARE_WAIT_QUEUE_HEAD(resync_wait); 68 69 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } 70 71 /* 72 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 73 * is 1000 KB/sec, so the extra system load does not show up that much. 74 * Increase it if you want to have more _guaranteed_ speed. Note that 75 * the RAID driver will use the maximum available bandwidth if the IO 76 * subsystem is idle. There is also an 'absolute maximum' reconstruction 77 * speed limit - in case reconstruction slows down your system despite 78 * idle IO detection. 79 * 80 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. 81 * or /sys/block/mdX/md/sync_speed_{min,max} 82 */ 83 84 static int sysctl_speed_limit_min = 1000; 85 static int sysctl_speed_limit_max = 200000; 86 static inline int speed_min(mddev_t *mddev) 87 { 88 return mddev->sync_speed_min ? 89 mddev->sync_speed_min : sysctl_speed_limit_min; 90 } 91 92 static inline int speed_max(mddev_t *mddev) 93 { 94 return mddev->sync_speed_max ? 
95 mddev->sync_speed_max : sysctl_speed_limit_max; 96 } 97 98 static struct ctl_table_header *raid_table_header; 99 100 static ctl_table raid_table[] = { 101 { 102 .ctl_name = DEV_RAID_SPEED_LIMIT_MIN, 103 .procname = "speed_limit_min", 104 .data = &sysctl_speed_limit_min, 105 .maxlen = sizeof(int), 106 .mode = S_IRUGO|S_IWUSR, 107 .proc_handler = &proc_dointvec, 108 }, 109 { 110 .ctl_name = DEV_RAID_SPEED_LIMIT_MAX, 111 .procname = "speed_limit_max", 112 .data = &sysctl_speed_limit_max, 113 .maxlen = sizeof(int), 114 .mode = S_IRUGO|S_IWUSR, 115 .proc_handler = &proc_dointvec, 116 }, 117 { .ctl_name = 0 } 118 }; 119 120 static ctl_table raid_dir_table[] = { 121 { 122 .ctl_name = DEV_RAID, 123 .procname = "raid", 124 .maxlen = 0, 125 .mode = S_IRUGO|S_IXUGO, 126 .child = raid_table, 127 }, 128 { .ctl_name = 0 } 129 }; 130 131 static ctl_table raid_root_table[] = { 132 { 133 .ctl_name = CTL_DEV, 134 .procname = "dev", 135 .maxlen = 0, 136 .mode = 0555, 137 .child = raid_dir_table, 138 }, 139 { .ctl_name = 0 } 140 }; 141 142 static struct block_device_operations md_fops; 143 144 static int start_readonly; 145 146 /* 147 * We have a system wide 'event count' that is incremented 148 * on any 'interesting' event, and readers of /proc/mdstat 149 * can use 'poll' or 'select' to find out when the event 150 * count increases. 151 * 152 * Events are: 153 * start array, stop array, error, add device, remove device, 154 * start build, activate spare 155 */ 156 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); 157 static atomic_t md_event_count; 158 void md_new_event(mddev_t *mddev) 159 { 160 atomic_inc(&md_event_count); 161 wake_up(&md_event_waiters); 162 } 163 EXPORT_SYMBOL_GPL(md_new_event); 164 165 /* Alternate version that can be called from interrupts 166 * when calling sysfs_notify isn't needed. 167 */ 168 static void md_new_event_inintr(mddev_t *mddev) 169 { 170 atomic_inc(&md_event_count); 171 wake_up(&md_event_waiters); 172 } 173 174 /* 175 * Enables to iterate over all existing md arrays 176 * all_mddevs_lock protects this list. 177 */ 178 static LIST_HEAD(all_mddevs); 179 static DEFINE_SPINLOCK(all_mddevs_lock); 180 181 182 /* 183 * iterates through all used mddevs in the system. 184 * We take care to grab the all_mddevs_lock whenever navigating 185 * the list, and to always hold a refcount when unlocked. 186 * Any code which breaks out of this loop while own 187 * a reference to the current mddev and must mddev_put it. 
188 */ 189 #define for_each_mddev(mddev,tmp) \ 190 \ 191 for (({ spin_lock(&all_mddevs_lock); \ 192 tmp = all_mddevs.next; \ 193 mddev = NULL;}); \ 194 ({ if (tmp != &all_mddevs) \ 195 mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ 196 spin_unlock(&all_mddevs_lock); \ 197 if (mddev) mddev_put(mddev); \ 198 mddev = list_entry(tmp, mddev_t, all_mddevs); \ 199 tmp != &all_mddevs;}); \ 200 ({ spin_lock(&all_mddevs_lock); \ 201 tmp = tmp->next;}) \ 202 ) 203 204 205 static int md_fail_request(struct request_queue *q, struct bio *bio) 206 { 207 bio_io_error(bio); 208 return 0; 209 } 210 211 static inline mddev_t *mddev_get(mddev_t *mddev) 212 { 213 atomic_inc(&mddev->active); 214 return mddev; 215 } 216 217 static void mddev_put(mddev_t *mddev) 218 { 219 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 220 return; 221 if (!mddev->raid_disks && list_empty(&mddev->disks)) { 222 list_del(&mddev->all_mddevs); 223 spin_unlock(&all_mddevs_lock); 224 blk_cleanup_queue(mddev->queue); 225 kobject_put(&mddev->kobj); 226 } else 227 spin_unlock(&all_mddevs_lock); 228 } 229 230 static mddev_t * mddev_find(dev_t unit) 231 { 232 mddev_t *mddev, *new = NULL; 233 234 retry: 235 spin_lock(&all_mddevs_lock); 236 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 237 if (mddev->unit == unit) { 238 mddev_get(mddev); 239 spin_unlock(&all_mddevs_lock); 240 kfree(new); 241 return mddev; 242 } 243 244 if (new) { 245 list_add(&new->all_mddevs, &all_mddevs); 246 spin_unlock(&all_mddevs_lock); 247 return new; 248 } 249 spin_unlock(&all_mddevs_lock); 250 251 new = kzalloc(sizeof(*new), GFP_KERNEL); 252 if (!new) 253 return NULL; 254 255 new->unit = unit; 256 if (MAJOR(unit) == MD_MAJOR) 257 new->md_minor = MINOR(unit); 258 else 259 new->md_minor = MINOR(unit) >> MdpMinorShift; 260 261 mutex_init(&new->reconfig_mutex); 262 INIT_LIST_HEAD(&new->disks); 263 INIT_LIST_HEAD(&new->all_mddevs); 264 init_timer(&new->safemode_timer); 265 atomic_set(&new->active, 1); 266 atomic_set(&new->openers, 0); 267 spin_lock_init(&new->write_lock); 268 init_waitqueue_head(&new->sb_wait); 269 init_waitqueue_head(&new->recovery_wait); 270 new->reshape_position = MaxSector; 271 new->resync_min = 0; 272 new->resync_max = MaxSector; 273 new->level = LEVEL_NONE; 274 275 new->queue = blk_alloc_queue(GFP_KERNEL); 276 if (!new->queue) { 277 kfree(new); 278 return NULL; 279 } 280 /* Can be unlocked because the queue is new: no concurrency */ 281 queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, new->queue); 282 283 blk_queue_make_request(new->queue, md_fail_request); 284 285 goto retry; 286 } 287 288 static inline int mddev_lock(mddev_t * mddev) 289 { 290 return mutex_lock_interruptible(&mddev->reconfig_mutex); 291 } 292 293 static inline int mddev_trylock(mddev_t * mddev) 294 { 295 return mutex_trylock(&mddev->reconfig_mutex); 296 } 297 298 static inline void mddev_unlock(mddev_t * mddev) 299 { 300 mutex_unlock(&mddev->reconfig_mutex); 301 302 md_wakeup_thread(mddev->thread); 303 } 304 305 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) 306 { 307 mdk_rdev_t * rdev; 308 struct list_head *tmp; 309 310 rdev_for_each(rdev, tmp, mddev) { 311 if (rdev->desc_nr == nr) 312 return rdev; 313 } 314 return NULL; 315 } 316 317 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) 318 { 319 struct list_head *tmp; 320 mdk_rdev_t *rdev; 321 322 rdev_for_each(rdev, tmp, mddev) { 323 if (rdev->bdev->bd_dev == dev) 324 return rdev; 325 } 326 return NULL; 327 } 328 329 static struct mdk_personality *find_pers(int level, char *clevel) 330 { 331 struct 
mdk_personality *pers; 332 list_for_each_entry(pers, &pers_list, list) { 333 if (level != LEVEL_NONE && pers->level == level) 334 return pers; 335 if (strcmp(pers->name, clevel)==0) 336 return pers; 337 } 338 return NULL; 339 } 340 341 /* return the offset of the super block in 512byte sectors */ 342 static inline sector_t calc_dev_sboffset(struct block_device *bdev) 343 { 344 sector_t num_sectors = bdev->bd_inode->i_size / 512; 345 return MD_NEW_SIZE_SECTORS(num_sectors); 346 } 347 348 static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size) 349 { 350 sector_t num_sectors = rdev->sb_start; 351 352 if (chunk_size) 353 num_sectors &= ~((sector_t)chunk_size/512 - 1); 354 return num_sectors; 355 } 356 357 static int alloc_disk_sb(mdk_rdev_t * rdev) 358 { 359 if (rdev->sb_page) 360 MD_BUG(); 361 362 rdev->sb_page = alloc_page(GFP_KERNEL); 363 if (!rdev->sb_page) { 364 printk(KERN_ALERT "md: out of memory.\n"); 365 return -ENOMEM; 366 } 367 368 return 0; 369 } 370 371 static void free_disk_sb(mdk_rdev_t * rdev) 372 { 373 if (rdev->sb_page) { 374 put_page(rdev->sb_page); 375 rdev->sb_loaded = 0; 376 rdev->sb_page = NULL; 377 rdev->sb_start = 0; 378 rdev->size = 0; 379 } 380 } 381 382 383 static void super_written(struct bio *bio, int error) 384 { 385 mdk_rdev_t *rdev = bio->bi_private; 386 mddev_t *mddev = rdev->mddev; 387 388 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) { 389 printk("md: super_written gets error=%d, uptodate=%d\n", 390 error, test_bit(BIO_UPTODATE, &bio->bi_flags)); 391 WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags)); 392 md_error(mddev, rdev); 393 } 394 395 if (atomic_dec_and_test(&mddev->pending_writes)) 396 wake_up(&mddev->sb_wait); 397 bio_put(bio); 398 } 399 400 static void super_written_barrier(struct bio *bio, int error) 401 { 402 struct bio *bio2 = bio->bi_private; 403 mdk_rdev_t *rdev = bio2->bi_private; 404 mddev_t *mddev = rdev->mddev; 405 406 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && 407 error == -EOPNOTSUPP) { 408 unsigned long flags; 409 /* barriers don't appear to be supported :-( */ 410 set_bit(BarriersNotsupp, &rdev->flags); 411 mddev->barriers_work = 0; 412 spin_lock_irqsave(&mddev->write_lock, flags); 413 bio2->bi_next = mddev->biolist; 414 mddev->biolist = bio2; 415 spin_unlock_irqrestore(&mddev->write_lock, flags); 416 wake_up(&mddev->sb_wait); 417 bio_put(bio); 418 } else { 419 bio_put(bio2); 420 bio->bi_private = rdev; 421 super_written(bio, error); 422 } 423 } 424 425 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 426 sector_t sector, int size, struct page *page) 427 { 428 /* write first size bytes of page to sector of rdev 429 * Increment mddev->pending_writes before returning 430 * and decrement it on completion, waking up sb_wait 431 * if zero is reached. 432 * If an error occurred, call md_error 433 * 434 * As we might need to resubmit the request if BIO_RW_BARRIER 435 * causes ENOTSUPP, we allocate a spare bio... 
436 */ 437 struct bio *bio = bio_alloc(GFP_NOIO, 1); 438 int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC); 439 440 bio->bi_bdev = rdev->bdev; 441 bio->bi_sector = sector; 442 bio_add_page(bio, page, size, 0); 443 bio->bi_private = rdev; 444 bio->bi_end_io = super_written; 445 bio->bi_rw = rw; 446 447 atomic_inc(&mddev->pending_writes); 448 if (!test_bit(BarriersNotsupp, &rdev->flags)) { 449 struct bio *rbio; 450 rw |= (1<<BIO_RW_BARRIER); 451 rbio = bio_clone(bio, GFP_NOIO); 452 rbio->bi_private = bio; 453 rbio->bi_end_io = super_written_barrier; 454 submit_bio(rw, rbio); 455 } else 456 submit_bio(rw, bio); 457 } 458 459 void md_super_wait(mddev_t *mddev) 460 { 461 /* wait for all superblock writes that were scheduled to complete. 462 * if any had to be retried (due to BARRIER problems), retry them 463 */ 464 DEFINE_WAIT(wq); 465 for(;;) { 466 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); 467 if (atomic_read(&mddev->pending_writes)==0) 468 break; 469 while (mddev->biolist) { 470 struct bio *bio; 471 spin_lock_irq(&mddev->write_lock); 472 bio = mddev->biolist; 473 mddev->biolist = bio->bi_next ; 474 bio->bi_next = NULL; 475 spin_unlock_irq(&mddev->write_lock); 476 submit_bio(bio->bi_rw, bio); 477 } 478 schedule(); 479 } 480 finish_wait(&mddev->sb_wait, &wq); 481 } 482 483 static void bi_complete(struct bio *bio, int error) 484 { 485 complete((struct completion*)bio->bi_private); 486 } 487 488 int sync_page_io(struct block_device *bdev, sector_t sector, int size, 489 struct page *page, int rw) 490 { 491 struct bio *bio = bio_alloc(GFP_NOIO, 1); 492 struct completion event; 493 int ret; 494 495 rw |= (1 << BIO_RW_SYNC); 496 497 bio->bi_bdev = bdev; 498 bio->bi_sector = sector; 499 bio_add_page(bio, page, size, 0); 500 init_completion(&event); 501 bio->bi_private = &event; 502 bio->bi_end_io = bi_complete; 503 submit_bio(rw, bio); 504 wait_for_completion(&event); 505 506 ret = test_bit(BIO_UPTODATE, &bio->bi_flags); 507 bio_put(bio); 508 return ret; 509 } 510 EXPORT_SYMBOL_GPL(sync_page_io); 511 512 static int read_disk_sb(mdk_rdev_t * rdev, int size) 513 { 514 char b[BDEVNAME_SIZE]; 515 if (!rdev->sb_page) { 516 MD_BUG(); 517 return -EINVAL; 518 } 519 if (rdev->sb_loaded) 520 return 0; 521 522 523 if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ)) 524 goto fail; 525 rdev->sb_loaded = 1; 526 return 0; 527 528 fail: 529 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", 530 bdevname(rdev->bdev,b)); 531 return -EINVAL; 532 } 533 534 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 535 { 536 return sb1->set_uuid0 == sb2->set_uuid0 && 537 sb1->set_uuid1 == sb2->set_uuid1 && 538 sb1->set_uuid2 == sb2->set_uuid2 && 539 sb1->set_uuid3 == sb2->set_uuid3; 540 } 541 542 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 543 { 544 int ret; 545 mdp_super_t *tmp1, *tmp2; 546 547 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 548 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 549 550 if (!tmp1 || !tmp2) { 551 ret = 0; 552 printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n"); 553 goto abort; 554 } 555 556 *tmp1 = *sb1; 557 *tmp2 = *sb2; 558 559 /* 560 * nr_disks is not constant 561 */ 562 tmp1->nr_disks = 0; 563 tmp2->nr_disks = 0; 564 565 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); 566 abort: 567 kfree(tmp1); 568 kfree(tmp2); 569 return ret; 570 } 571 572 573 static u32 md_csum_fold(u32 csum) 574 { 575 csum = (csum & 0xffff) + (csum >> 16); 576 return (csum & 0xffff) + (csum >> 16); 577 } 578 579 static unsigned 
int calc_sb_csum(mdp_super_t * sb) 580 { 581 u64 newcsum = 0; 582 u32 *sb32 = (u32*)sb; 583 int i; 584 unsigned int disk_csum, csum; 585 586 disk_csum = sb->sb_csum; 587 sb->sb_csum = 0; 588 589 for (i = 0; i < MD_SB_BYTES/4 ; i++) 590 newcsum += sb32[i]; 591 csum = (newcsum & 0xffffffff) + (newcsum>>32); 592 593 594 #ifdef CONFIG_ALPHA 595 /* This used to use csum_partial, which was wrong for several 596 * reasons including that different results are returned on 597 * different architectures. It isn't critical that we get exactly 598 * the same return value as before (we always csum_fold before 599 * testing, and that removes any differences). However as we 600 * know that csum_partial always returned a 16bit value on 601 * alphas, do a fold to maximise conformity to previous behaviour. 602 */ 603 sb->sb_csum = md_csum_fold(disk_csum); 604 #else 605 sb->sb_csum = disk_csum; 606 #endif 607 return csum; 608 } 609 610 611 /* 612 * Handle superblock details. 613 * We want to be able to handle multiple superblock formats 614 * so we have a common interface to them all, and an array of 615 * different handlers. 616 * We rely on user-space to write the initial superblock, and support 617 * reading and updating of superblocks. 618 * Interface methods are: 619 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) 620 * loads and validates a superblock on dev. 621 * if refdev != NULL, compare superblocks on both devices 622 * Return: 623 * 0 - dev has a superblock that is compatible with refdev 624 * 1 - dev has a superblock that is compatible and newer than refdev 625 * so dev should be used as the refdev in future 626 * -EINVAL superblock incompatible or invalid 627 * -othererror e.g. -EIO 628 * 629 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) 630 * Verify that dev is acceptable into mddev. 631 * The first time, mddev->raid_disks will be 0, and data from 632 * dev should be merged in. Subsequent calls check that dev 633 * is new enough. Return 0 or -EINVAL 634 * 635 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) 636 * Update the superblock for rdev with data in mddev 637 * This does not write to disc. 638 * 639 */ 640 641 struct super_type { 642 char *name; 643 struct module *owner; 644 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, 645 int minor_version); 646 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); 647 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); 648 unsigned long long (*rdev_size_change)(mdk_rdev_t *rdev, 649 sector_t num_sectors); 650 }; 651 652 /* 653 * load_super for 0.90.0 654 */ 655 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 656 { 657 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 658 mdp_super_t *sb; 659 int ret; 660 661 /* 662 * Calculate the position of the superblock (512byte sectors), 663 * it's at the end of the disk. 664 * 665 * It also happens to be a multiple of 4Kb. 
666 */ 667 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 668 669 ret = read_disk_sb(rdev, MD_SB_BYTES); 670 if (ret) return ret; 671 672 ret = -EINVAL; 673 674 bdevname(rdev->bdev, b); 675 sb = (mdp_super_t*)page_address(rdev->sb_page); 676 677 if (sb->md_magic != MD_SB_MAGIC) { 678 printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 679 b); 680 goto abort; 681 } 682 683 if (sb->major_version != 0 || 684 sb->minor_version < 90 || 685 sb->minor_version > 91) { 686 printk(KERN_WARNING "Bad version number %d.%d on %s\n", 687 sb->major_version, sb->minor_version, 688 b); 689 goto abort; 690 } 691 692 if (sb->raid_disks <= 0) 693 goto abort; 694 695 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { 696 printk(KERN_WARNING "md: invalid superblock checksum on %s\n", 697 b); 698 goto abort; 699 } 700 701 rdev->preferred_minor = sb->md_minor; 702 rdev->data_offset = 0; 703 rdev->sb_size = MD_SB_BYTES; 704 705 if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) { 706 if (sb->level != 1 && sb->level != 4 707 && sb->level != 5 && sb->level != 6 708 && sb->level != 10) { 709 /* FIXME use a better test */ 710 printk(KERN_WARNING 711 "md: bitmaps not supported for this level.\n"); 712 goto abort; 713 } 714 } 715 716 if (sb->level == LEVEL_MULTIPATH) 717 rdev->desc_nr = -1; 718 else 719 rdev->desc_nr = sb->this_disk.number; 720 721 if (!refdev) { 722 ret = 1; 723 } else { 724 __u64 ev1, ev2; 725 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); 726 if (!uuid_equal(refsb, sb)) { 727 printk(KERN_WARNING "md: %s has different UUID to %s\n", 728 b, bdevname(refdev->bdev,b2)); 729 goto abort; 730 } 731 if (!sb_equal(refsb, sb)) { 732 printk(KERN_WARNING "md: %s has same UUID" 733 " but different superblock to %s\n", 734 b, bdevname(refdev->bdev, b2)); 735 goto abort; 736 } 737 ev1 = md_event(sb); 738 ev2 = md_event(refsb); 739 if (ev1 > ev2) 740 ret = 1; 741 else 742 ret = 0; 743 } 744 rdev->size = calc_num_sectors(rdev, sb->chunk_size) / 2; 745 746 if (rdev->size < sb->size && sb->level > 1) 747 /* "this cannot possibly happen" ... 
*/ 748 ret = -EINVAL; 749 750 abort: 751 return ret; 752 } 753 754 /* 755 * validate_super for 0.90.0 756 */ 757 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) 758 { 759 mdp_disk_t *desc; 760 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 761 __u64 ev1 = md_event(sb); 762 763 rdev->raid_disk = -1; 764 clear_bit(Faulty, &rdev->flags); 765 clear_bit(In_sync, &rdev->flags); 766 clear_bit(WriteMostly, &rdev->flags); 767 clear_bit(BarriersNotsupp, &rdev->flags); 768 769 if (mddev->raid_disks == 0) { 770 mddev->major_version = 0; 771 mddev->minor_version = sb->minor_version; 772 mddev->patch_version = sb->patch_version; 773 mddev->external = 0; 774 mddev->chunk_size = sb->chunk_size; 775 mddev->ctime = sb->ctime; 776 mddev->utime = sb->utime; 777 mddev->level = sb->level; 778 mddev->clevel[0] = 0; 779 mddev->layout = sb->layout; 780 mddev->raid_disks = sb->raid_disks; 781 mddev->size = sb->size; 782 mddev->events = ev1; 783 mddev->bitmap_offset = 0; 784 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 785 786 if (mddev->minor_version >= 91) { 787 mddev->reshape_position = sb->reshape_position; 788 mddev->delta_disks = sb->delta_disks; 789 mddev->new_level = sb->new_level; 790 mddev->new_layout = sb->new_layout; 791 mddev->new_chunk = sb->new_chunk; 792 } else { 793 mddev->reshape_position = MaxSector; 794 mddev->delta_disks = 0; 795 mddev->new_level = mddev->level; 796 mddev->new_layout = mddev->layout; 797 mddev->new_chunk = mddev->chunk_size; 798 } 799 800 if (sb->state & (1<<MD_SB_CLEAN)) 801 mddev->recovery_cp = MaxSector; 802 else { 803 if (sb->events_hi == sb->cp_events_hi && 804 sb->events_lo == sb->cp_events_lo) { 805 mddev->recovery_cp = sb->recovery_cp; 806 } else 807 mddev->recovery_cp = 0; 808 } 809 810 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 811 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 812 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 813 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 814 815 mddev->max_disks = MD_SB_DISKS; 816 817 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 818 mddev->bitmap_file == NULL) 819 mddev->bitmap_offset = mddev->default_bitmap_offset; 820 821 } else if (mddev->pers == NULL) { 822 /* Insist on good event counter while assembling */ 823 ++ev1; 824 if (ev1 < mddev->events) 825 return -EINVAL; 826 } else if (mddev->bitmap) { 827 /* if adding to array with a bitmap, then we can accept an 828 * older device ... but not too old. 829 */ 830 if (ev1 < mddev->bitmap->events_cleared) 831 return 0; 832 } else { 833 if (ev1 < mddev->events) 834 /* just a hot-add of a new device, leave raid_disk at -1 */ 835 return 0; 836 } 837 838 if (mddev->level != LEVEL_MULTIPATH) { 839 desc = sb->disks + rdev->desc_nr; 840 841 if (desc->state & (1<<MD_DISK_FAULTY)) 842 set_bit(Faulty, &rdev->flags); 843 else if (desc->state & (1<<MD_DISK_SYNC) /* && 844 desc->raid_disk < mddev->raid_disks */) { 845 set_bit(In_sync, &rdev->flags); 846 rdev->raid_disk = desc->raid_disk; 847 } 848 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 849 set_bit(WriteMostly, &rdev->flags); 850 } else /* MULTIPATH are always insync */ 851 set_bit(In_sync, &rdev->flags); 852 return 0; 853 } 854 855 /* 856 * sync_super for 0.90.0 857 */ 858 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) 859 { 860 mdp_super_t *sb; 861 struct list_head *tmp; 862 mdk_rdev_t *rdev2; 863 int next_spare = mddev->raid_disks; 864 865 866 /* make rdev->sb match mddev data.. 
867 * 868 * 1/ zero out disks 869 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 870 * 3/ any empty disks < next_spare become removed 871 * 872 * disks[0] gets initialised to REMOVED because 873 * we cannot be sure from other fields if it has 874 * been initialised or not. 875 */ 876 int i; 877 int active=0, working=0,failed=0,spare=0,nr_disks=0; 878 879 rdev->sb_size = MD_SB_BYTES; 880 881 sb = (mdp_super_t*)page_address(rdev->sb_page); 882 883 memset(sb, 0, sizeof(*sb)); 884 885 sb->md_magic = MD_SB_MAGIC; 886 sb->major_version = mddev->major_version; 887 sb->patch_version = mddev->patch_version; 888 sb->gvalid_words = 0; /* ignored */ 889 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 890 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 891 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 892 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 893 894 sb->ctime = mddev->ctime; 895 sb->level = mddev->level; 896 sb->size = mddev->size; 897 sb->raid_disks = mddev->raid_disks; 898 sb->md_minor = mddev->md_minor; 899 sb->not_persistent = 0; 900 sb->utime = mddev->utime; 901 sb->state = 0; 902 sb->events_hi = (mddev->events>>32); 903 sb->events_lo = (u32)mddev->events; 904 905 if (mddev->reshape_position == MaxSector) 906 sb->minor_version = 90; 907 else { 908 sb->minor_version = 91; 909 sb->reshape_position = mddev->reshape_position; 910 sb->new_level = mddev->new_level; 911 sb->delta_disks = mddev->delta_disks; 912 sb->new_layout = mddev->new_layout; 913 sb->new_chunk = mddev->new_chunk; 914 } 915 mddev->minor_version = sb->minor_version; 916 if (mddev->in_sync) 917 { 918 sb->recovery_cp = mddev->recovery_cp; 919 sb->cp_events_hi = (mddev->events>>32); 920 sb->cp_events_lo = (u32)mddev->events; 921 if (mddev->recovery_cp == MaxSector) 922 sb->state = (1<< MD_SB_CLEAN); 923 } else 924 sb->recovery_cp = 0; 925 926 sb->layout = mddev->layout; 927 sb->chunk_size = mddev->chunk_size; 928 929 if (mddev->bitmap && mddev->bitmap_file == NULL) 930 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 931 932 sb->disks[0].state = (1<<MD_DISK_REMOVED); 933 rdev_for_each(rdev2, tmp, mddev) { 934 mdp_disk_t *d; 935 int desc_nr; 936 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 937 && !test_bit(Faulty, &rdev2->flags)) 938 desc_nr = rdev2->raid_disk; 939 else 940 desc_nr = next_spare++; 941 rdev2->desc_nr = desc_nr; 942 d = &sb->disks[rdev2->desc_nr]; 943 nr_disks++; 944 d->number = rdev2->desc_nr; 945 d->major = MAJOR(rdev2->bdev->bd_dev); 946 d->minor = MINOR(rdev2->bdev->bd_dev); 947 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 948 && !test_bit(Faulty, &rdev2->flags)) 949 d->raid_disk = rdev2->raid_disk; 950 else 951 d->raid_disk = rdev2->desc_nr; /* compatibility */ 952 if (test_bit(Faulty, &rdev2->flags)) 953 d->state = (1<<MD_DISK_FAULTY); 954 else if (test_bit(In_sync, &rdev2->flags)) { 955 d->state = (1<<MD_DISK_ACTIVE); 956 d->state |= (1<<MD_DISK_SYNC); 957 active++; 958 working++; 959 } else { 960 d->state = 0; 961 spare++; 962 working++; 963 } 964 if (test_bit(WriteMostly, &rdev2->flags)) 965 d->state |= (1<<MD_DISK_WRITEMOSTLY); 966 } 967 /* now set the "removed" and "faulty" bits on any missing devices */ 968 for (i=0 ; i < mddev->raid_disks ; i++) { 969 mdp_disk_t *d = &sb->disks[i]; 970 if (d->state == 0 && d->number == 0) { 971 d->number = i; 972 d->raid_disk = i; 973 d->state = (1<<MD_DISK_REMOVED); 974 d->state |= (1<<MD_DISK_FAULTY); 975 failed++; 976 } 977 } 978 sb->nr_disks = nr_disks; 979 sb->active_disks = active; 980 sb->working_disks = working; 981 
sb->failed_disks = failed; 982 sb->spare_disks = spare; 983 984 sb->this_disk = sb->disks[rdev->desc_nr]; 985 sb->sb_csum = calc_sb_csum(sb); 986 } 987 988 /* 989 * rdev_size_change for 0.90.0 990 */ 991 static unsigned long long 992 super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) 993 { 994 if (num_sectors && num_sectors < rdev->mddev->size * 2) 995 return 0; /* component must fit device */ 996 if (rdev->mddev->bitmap_offset) 997 return 0; /* can't move bitmap */ 998 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 999 if (!num_sectors || num_sectors > rdev->sb_start) 1000 num_sectors = rdev->sb_start; 1001 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1002 rdev->sb_page); 1003 md_super_wait(rdev->mddev); 1004 return num_sectors / 2; /* kB for sysfs */ 1005 } 1006 1007 1008 /* 1009 * version 1 superblock 1010 */ 1011 1012 static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) 1013 { 1014 __le32 disk_csum; 1015 u32 csum; 1016 unsigned long long newcsum; 1017 int size = 256 + le32_to_cpu(sb->max_dev)*2; 1018 __le32 *isuper = (__le32*)sb; 1019 int i; 1020 1021 disk_csum = sb->sb_csum; 1022 sb->sb_csum = 0; 1023 newcsum = 0; 1024 for (i=0; size>=4; size -= 4 ) 1025 newcsum += le32_to_cpu(*isuper++); 1026 1027 if (size == 2) 1028 newcsum += le16_to_cpu(*(__le16*) isuper); 1029 1030 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 1031 sb->sb_csum = disk_csum; 1032 return cpu_to_le32(csum); 1033 } 1034 1035 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 1036 { 1037 struct mdp_superblock_1 *sb; 1038 int ret; 1039 sector_t sb_start; 1040 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1041 int bmask; 1042 1043 /* 1044 * Calculate the position of the superblock in 512byte sectors. 1045 * It is always aligned to a 4K boundary and 1046 * depeding on minor_version, it can be: 1047 * 0: At least 8K, but less than 12K, from end of device 1048 * 1: At start of device 1049 * 2: 4K from start of device. 
1050 */ 1051 switch(minor_version) { 1052 case 0: 1053 sb_start = rdev->bdev->bd_inode->i_size >> 9; 1054 sb_start -= 8*2; 1055 sb_start &= ~(sector_t)(4*2-1); 1056 break; 1057 case 1: 1058 sb_start = 0; 1059 break; 1060 case 2: 1061 sb_start = 8; 1062 break; 1063 default: 1064 return -EINVAL; 1065 } 1066 rdev->sb_start = sb_start; 1067 1068 /* superblock is rarely larger than 1K, but it can be larger, 1069 * and it is safe to read 4k, so we do that 1070 */ 1071 ret = read_disk_sb(rdev, 4096); 1072 if (ret) return ret; 1073 1074 1075 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1076 1077 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1078 sb->major_version != cpu_to_le32(1) || 1079 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1080 le64_to_cpu(sb->super_offset) != rdev->sb_start || 1081 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1082 return -EINVAL; 1083 1084 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1085 printk("md: invalid superblock checksum on %s\n", 1086 bdevname(rdev->bdev,b)); 1087 return -EINVAL; 1088 } 1089 if (le64_to_cpu(sb->data_size) < 10) { 1090 printk("md: data_size too small on %s\n", 1091 bdevname(rdev->bdev,b)); 1092 return -EINVAL; 1093 } 1094 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) { 1095 if (sb->level != cpu_to_le32(1) && 1096 sb->level != cpu_to_le32(4) && 1097 sb->level != cpu_to_le32(5) && 1098 sb->level != cpu_to_le32(6) && 1099 sb->level != cpu_to_le32(10)) { 1100 printk(KERN_WARNING 1101 "md: bitmaps not supported for this level.\n"); 1102 return -EINVAL; 1103 } 1104 } 1105 1106 rdev->preferred_minor = 0xffff; 1107 rdev->data_offset = le64_to_cpu(sb->data_offset); 1108 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1109 1110 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1111 bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; 1112 if (rdev->sb_size & bmask) 1113 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1114 1115 if (minor_version 1116 && rdev->data_offset < sb_start + (rdev->sb_size/512)) 1117 return -EINVAL; 1118 1119 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) 1120 rdev->desc_nr = -1; 1121 else 1122 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1123 1124 if (!refdev) { 1125 ret = 1; 1126 } else { 1127 __u64 ev1, ev2; 1128 struct mdp_superblock_1 *refsb = 1129 (struct mdp_superblock_1*)page_address(refdev->sb_page); 1130 1131 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1132 sb->level != refsb->level || 1133 sb->layout != refsb->layout || 1134 sb->chunksize != refsb->chunksize) { 1135 printk(KERN_WARNING "md: %s has strangely different" 1136 " superblock to %s\n", 1137 bdevname(rdev->bdev,b), 1138 bdevname(refdev->bdev,b2)); 1139 return -EINVAL; 1140 } 1141 ev1 = le64_to_cpu(sb->events); 1142 ev2 = le64_to_cpu(refsb->events); 1143 1144 if (ev1 > ev2) 1145 ret = 1; 1146 else 1147 ret = 0; 1148 } 1149 if (minor_version) 1150 rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; 1151 else 1152 rdev->size = rdev->sb_start / 2; 1153 if (rdev->size < le64_to_cpu(sb->data_size)/2) 1154 return -EINVAL; 1155 rdev->size = le64_to_cpu(sb->data_size)/2; 1156 if (le32_to_cpu(sb->chunksize)) 1157 rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); 1158 1159 if (le64_to_cpu(sb->size) > rdev->size*2) 1160 return -EINVAL; 1161 return ret; 1162 } 1163 1164 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) 1165 { 1166 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1167 __u64 ev1 = 
le64_to_cpu(sb->events); 1168 1169 rdev->raid_disk = -1; 1170 clear_bit(Faulty, &rdev->flags); 1171 clear_bit(In_sync, &rdev->flags); 1172 clear_bit(WriteMostly, &rdev->flags); 1173 clear_bit(BarriersNotsupp, &rdev->flags); 1174 1175 if (mddev->raid_disks == 0) { 1176 mddev->major_version = 1; 1177 mddev->patch_version = 0; 1178 mddev->external = 0; 1179 mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; 1180 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); 1181 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); 1182 mddev->level = le32_to_cpu(sb->level); 1183 mddev->clevel[0] = 0; 1184 mddev->layout = le32_to_cpu(sb->layout); 1185 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1186 mddev->size = le64_to_cpu(sb->size)/2; 1187 mddev->events = ev1; 1188 mddev->bitmap_offset = 0; 1189 mddev->default_bitmap_offset = 1024 >> 9; 1190 1191 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 1192 memcpy(mddev->uuid, sb->set_uuid, 16); 1193 1194 mddev->max_disks = (4096-256)/2; 1195 1196 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1197 mddev->bitmap_file == NULL ) 1198 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); 1199 1200 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 1201 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 1202 mddev->delta_disks = le32_to_cpu(sb->delta_disks); 1203 mddev->new_level = le32_to_cpu(sb->new_level); 1204 mddev->new_layout = le32_to_cpu(sb->new_layout); 1205 mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9; 1206 } else { 1207 mddev->reshape_position = MaxSector; 1208 mddev->delta_disks = 0; 1209 mddev->new_level = mddev->level; 1210 mddev->new_layout = mddev->layout; 1211 mddev->new_chunk = mddev->chunk_size; 1212 } 1213 1214 } else if (mddev->pers == NULL) { 1215 /* Insist of good event counter while assembling */ 1216 ++ev1; 1217 if (ev1 < mddev->events) 1218 return -EINVAL; 1219 } else if (mddev->bitmap) { 1220 /* If adding to array with a bitmap, then we can accept an 1221 * older device, but not too old. 1222 */ 1223 if (ev1 < mddev->bitmap->events_cleared) 1224 return 0; 1225 } else { 1226 if (ev1 < mddev->events) 1227 /* just a hot-add of a new device, leave raid_disk at -1 */ 1228 return 0; 1229 } 1230 if (mddev->level != LEVEL_MULTIPATH) { 1231 int role; 1232 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1233 switch(role) { 1234 case 0xffff: /* spare */ 1235 break; 1236 case 0xfffe: /* faulty */ 1237 set_bit(Faulty, &rdev->flags); 1238 break; 1239 default: 1240 if ((le32_to_cpu(sb->feature_map) & 1241 MD_FEATURE_RECOVERY_OFFSET)) 1242 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 1243 else 1244 set_bit(In_sync, &rdev->flags); 1245 rdev->raid_disk = role; 1246 break; 1247 } 1248 if (sb->devflags & WriteMostly1) 1249 set_bit(WriteMostly, &rdev->flags); 1250 } else /* MULTIPATH are always insync */ 1251 set_bit(In_sync, &rdev->flags); 1252 1253 return 0; 1254 } 1255 1256 static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) 1257 { 1258 struct mdp_superblock_1 *sb; 1259 struct list_head *tmp; 1260 mdk_rdev_t *rdev2; 1261 int max_dev, i; 1262 /* make rdev->sb match mddev and rdev data. 
*/ 1263 1264 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1265 1266 sb->feature_map = 0; 1267 sb->pad0 = 0; 1268 sb->recovery_offset = cpu_to_le64(0); 1269 memset(sb->pad1, 0, sizeof(sb->pad1)); 1270 memset(sb->pad2, 0, sizeof(sb->pad2)); 1271 memset(sb->pad3, 0, sizeof(sb->pad3)); 1272 1273 sb->utime = cpu_to_le64((__u64)mddev->utime); 1274 sb->events = cpu_to_le64(mddev->events); 1275 if (mddev->in_sync) 1276 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 1277 else 1278 sb->resync_offset = cpu_to_le64(0); 1279 1280 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); 1281 1282 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 1283 sb->size = cpu_to_le64(mddev->size<<1); 1284 1285 if (mddev->bitmap && mddev->bitmap_file == NULL) { 1286 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1287 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1288 } 1289 1290 if (rdev->raid_disk >= 0 && 1291 !test_bit(In_sync, &rdev->flags) && 1292 rdev->recovery_offset > 0) { 1293 sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 1294 sb->recovery_offset = cpu_to_le64(rdev->recovery_offset); 1295 } 1296 1297 if (mddev->reshape_position != MaxSector) { 1298 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 1299 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 1300 sb->new_layout = cpu_to_le32(mddev->new_layout); 1301 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 1302 sb->new_level = cpu_to_le32(mddev->new_level); 1303 sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9); 1304 } 1305 1306 max_dev = 0; 1307 rdev_for_each(rdev2, tmp, mddev) 1308 if (rdev2->desc_nr+1 > max_dev) 1309 max_dev = rdev2->desc_nr+1; 1310 1311 if (max_dev > le32_to_cpu(sb->max_dev)) 1312 sb->max_dev = cpu_to_le32(max_dev); 1313 for (i=0; i<max_dev;i++) 1314 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1315 1316 rdev_for_each(rdev2, tmp, mddev) { 1317 i = rdev2->desc_nr; 1318 if (test_bit(Faulty, &rdev2->flags)) 1319 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1320 else if (test_bit(In_sync, &rdev2->flags)) 1321 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1322 else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0) 1323 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1324 else 1325 sb->dev_roles[i] = cpu_to_le16(0xffff); 1326 } 1327 1328 sb->sb_csum = calc_sb_1_csum(sb); 1329 } 1330 1331 static unsigned long long 1332 super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) 1333 { 1334 struct mdp_superblock_1 *sb; 1335 sector_t max_sectors; 1336 if (num_sectors && num_sectors < rdev->mddev->size * 2) 1337 return 0; /* component must fit device */ 1338 if (rdev->sb_start < rdev->data_offset) { 1339 /* minor versions 1 and 2; superblock before data */ 1340 max_sectors = rdev->bdev->bd_inode->i_size >> 9; 1341 max_sectors -= rdev->data_offset; 1342 if (!num_sectors || num_sectors > max_sectors) 1343 num_sectors = max_sectors; 1344 } else if (rdev->mddev->bitmap_offset) { 1345 /* minor version 0 with bitmap we can't move */ 1346 return 0; 1347 } else { 1348 /* minor version 0; superblock after data */ 1349 sector_t sb_start; 1350 sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2; 1351 sb_start &= ~(sector_t)(4*2 - 1); 1352 max_sectors = rdev->size * 2 + sb_start - rdev->sb_start; 1353 if (!num_sectors || num_sectors > max_sectors) 1354 num_sectors = max_sectors; 1355 rdev->sb_start = sb_start; 1356 } 1357 sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page); 1358 sb->data_size = cpu_to_le64(num_sectors); 1359 sb->super_offset 
= rdev->sb_start; 1360 sb->sb_csum = calc_sb_1_csum(sb); 1361 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1362 rdev->sb_page); 1363 md_super_wait(rdev->mddev); 1364 return num_sectors / 2; /* kB for sysfs */ 1365 } 1366 1367 static struct super_type super_types[] = { 1368 [0] = { 1369 .name = "0.90.0", 1370 .owner = THIS_MODULE, 1371 .load_super = super_90_load, 1372 .validate_super = super_90_validate, 1373 .sync_super = super_90_sync, 1374 .rdev_size_change = super_90_rdev_size_change, 1375 }, 1376 [1] = { 1377 .name = "md-1", 1378 .owner = THIS_MODULE, 1379 .load_super = super_1_load, 1380 .validate_super = super_1_validate, 1381 .sync_super = super_1_sync, 1382 .rdev_size_change = super_1_rdev_size_change, 1383 }, 1384 }; 1385 1386 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) 1387 { 1388 mdk_rdev_t *rdev, *rdev2; 1389 1390 rcu_read_lock(); 1391 rdev_for_each_rcu(rdev, mddev1) 1392 rdev_for_each_rcu(rdev2, mddev2) 1393 if (rdev->bdev->bd_contains == 1394 rdev2->bdev->bd_contains) { 1395 rcu_read_unlock(); 1396 return 1; 1397 } 1398 rcu_read_unlock(); 1399 return 0; 1400 } 1401 1402 static LIST_HEAD(pending_raid_disks); 1403 1404 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) 1405 { 1406 char b[BDEVNAME_SIZE]; 1407 struct kobject *ko; 1408 char *s; 1409 int err; 1410 1411 if (rdev->mddev) { 1412 MD_BUG(); 1413 return -EINVAL; 1414 } 1415 1416 /* prevent duplicates */ 1417 if (find_rdev(mddev, rdev->bdev->bd_dev)) 1418 return -EEXIST; 1419 1420 /* make sure rdev->size exceeds mddev->size */ 1421 if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) { 1422 if (mddev->pers) { 1423 /* Cannot change size, so fail 1424 * If mddev->level <= 0, then we don't care 1425 * about aligning sizes (e.g. linear) 1426 */ 1427 if (mddev->level > 0) 1428 return -ENOSPC; 1429 } else 1430 mddev->size = rdev->size; 1431 } 1432 1433 /* Verify rdev->desc_nr is unique. 
1434 * If it is -1, assign a free number, else 1435 * check number is not in use 1436 */ 1437 if (rdev->desc_nr < 0) { 1438 int choice = 0; 1439 if (mddev->pers) choice = mddev->raid_disks; 1440 while (find_rdev_nr(mddev, choice)) 1441 choice++; 1442 rdev->desc_nr = choice; 1443 } else { 1444 if (find_rdev_nr(mddev, rdev->desc_nr)) 1445 return -EBUSY; 1446 } 1447 bdevname(rdev->bdev,b); 1448 while ( (s=strchr(b, '/')) != NULL) 1449 *s = '!'; 1450 1451 rdev->mddev = mddev; 1452 printk(KERN_INFO "md: bind<%s>\n", b); 1453 1454 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 1455 goto fail; 1456 1457 ko = &part_to_dev(rdev->bdev->bd_part)->kobj; 1458 if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) { 1459 kobject_del(&rdev->kobj); 1460 goto fail; 1461 } 1462 list_add_rcu(&rdev->same_set, &mddev->disks); 1463 bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); 1464 return 0; 1465 1466 fail: 1467 printk(KERN_WARNING "md: failed to register dev-%s for %s\n", 1468 b, mdname(mddev)); 1469 return err; 1470 } 1471 1472 static void md_delayed_delete(struct work_struct *ws) 1473 { 1474 mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work); 1475 kobject_del(&rdev->kobj); 1476 kobject_put(&rdev->kobj); 1477 } 1478 1479 static void unbind_rdev_from_array(mdk_rdev_t * rdev) 1480 { 1481 char b[BDEVNAME_SIZE]; 1482 if (!rdev->mddev) { 1483 MD_BUG(); 1484 return; 1485 } 1486 bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); 1487 list_del_rcu(&rdev->same_set); 1488 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 1489 rdev->mddev = NULL; 1490 sysfs_remove_link(&rdev->kobj, "block"); 1491 1492 /* We need to delay this, otherwise we can deadlock when 1493 * writing to 'remove' to "dev/state". We also need 1494 * to delay it due to rcu usage. 1495 */ 1496 synchronize_rcu(); 1497 INIT_WORK(&rdev->del_work, md_delayed_delete); 1498 kobject_get(&rdev->kobj); 1499 schedule_work(&rdev->del_work); 1500 } 1501 1502 /* 1503 * prevent the device from being mounted, repartitioned or 1504 * otherwise reused by a RAID array (or any other kernel 1505 * subsystem), by bd_claiming the device. 1506 */ 1507 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared) 1508 { 1509 int err = 0; 1510 struct block_device *bdev; 1511 char b[BDEVNAME_SIZE]; 1512 1513 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 1514 if (IS_ERR(bdev)) { 1515 printk(KERN_ERR "md: could not open %s.\n", 1516 __bdevname(dev, b)); 1517 return PTR_ERR(bdev); 1518 } 1519 err = bd_claim(bdev, shared ? 
(mdk_rdev_t *)lock_rdev : rdev); 1520 if (err) { 1521 printk(KERN_ERR "md: could not bd_claim %s.\n", 1522 bdevname(bdev, b)); 1523 blkdev_put(bdev); 1524 return err; 1525 } 1526 if (!shared) 1527 set_bit(AllReserved, &rdev->flags); 1528 rdev->bdev = bdev; 1529 return err; 1530 } 1531 1532 static void unlock_rdev(mdk_rdev_t *rdev) 1533 { 1534 struct block_device *bdev = rdev->bdev; 1535 rdev->bdev = NULL; 1536 if (!bdev) 1537 MD_BUG(); 1538 bd_release(bdev); 1539 blkdev_put(bdev); 1540 } 1541 1542 void md_autodetect_dev(dev_t dev); 1543 1544 static void export_rdev(mdk_rdev_t * rdev) 1545 { 1546 char b[BDEVNAME_SIZE]; 1547 printk(KERN_INFO "md: export_rdev(%s)\n", 1548 bdevname(rdev->bdev,b)); 1549 if (rdev->mddev) 1550 MD_BUG(); 1551 free_disk_sb(rdev); 1552 #ifndef MODULE 1553 if (test_bit(AutoDetected, &rdev->flags)) 1554 md_autodetect_dev(rdev->bdev->bd_dev); 1555 #endif 1556 unlock_rdev(rdev); 1557 kobject_put(&rdev->kobj); 1558 } 1559 1560 static void kick_rdev_from_array(mdk_rdev_t * rdev) 1561 { 1562 unbind_rdev_from_array(rdev); 1563 export_rdev(rdev); 1564 } 1565 1566 static void export_array(mddev_t *mddev) 1567 { 1568 struct list_head *tmp; 1569 mdk_rdev_t *rdev; 1570 1571 rdev_for_each(rdev, tmp, mddev) { 1572 if (!rdev->mddev) { 1573 MD_BUG(); 1574 continue; 1575 } 1576 kick_rdev_from_array(rdev); 1577 } 1578 if (!list_empty(&mddev->disks)) 1579 MD_BUG(); 1580 mddev->raid_disks = 0; 1581 mddev->major_version = 0; 1582 } 1583 1584 static void print_desc(mdp_disk_t *desc) 1585 { 1586 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, 1587 desc->major,desc->minor,desc->raid_disk,desc->state); 1588 } 1589 1590 static void print_sb(mdp_super_t *sb) 1591 { 1592 int i; 1593 1594 printk(KERN_INFO 1595 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", 1596 sb->major_version, sb->minor_version, sb->patch_version, 1597 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, 1598 sb->ctime); 1599 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", 1600 sb->level, sb->size, sb->nr_disks, sb->raid_disks, 1601 sb->md_minor, sb->layout, sb->chunk_size); 1602 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" 1603 " FD:%d SD:%d CSUM:%08x E:%08lx\n", 1604 sb->utime, sb->state, sb->active_disks, sb->working_disks, 1605 sb->failed_disks, sb->spare_disks, 1606 sb->sb_csum, (unsigned long)sb->events_lo); 1607 1608 printk(KERN_INFO); 1609 for (i = 0; i < MD_SB_DISKS; i++) { 1610 mdp_disk_t *desc; 1611 1612 desc = sb->disks + i; 1613 if (desc->number || desc->major || desc->minor || 1614 desc->raid_disk || (desc->state && (desc->state != 4))) { 1615 printk(" D %2d: ", i); 1616 print_desc(desc); 1617 } 1618 } 1619 printk(KERN_INFO "md: THIS: "); 1620 print_desc(&sb->this_disk); 1621 1622 } 1623 1624 static void print_rdev(mdk_rdev_t *rdev) 1625 { 1626 char b[BDEVNAME_SIZE]; 1627 printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", 1628 bdevname(rdev->bdev,b), (unsigned long long)rdev->size, 1629 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), 1630 rdev->desc_nr); 1631 if (rdev->sb_loaded) { 1632 printk(KERN_INFO "md: rdev superblock:\n"); 1633 print_sb((mdp_super_t*)page_address(rdev->sb_page)); 1634 } else 1635 printk(KERN_INFO "md: no rdev superblock!\n"); 1636 } 1637 1638 static void md_print_devices(void) 1639 { 1640 struct list_head *tmp, *tmp2; 1641 mdk_rdev_t *rdev; 1642 mddev_t *mddev; 1643 char b[BDEVNAME_SIZE]; 1644 1645 printk("\n"); 1646 printk("md: **********************************\n"); 1647 printk("md: * <COMPLETE RAID STATE 
PRINTOUT> *\n"); 1648 printk("md: **********************************\n"); 1649 for_each_mddev(mddev, tmp) { 1650 1651 if (mddev->bitmap) 1652 bitmap_print_sb(mddev->bitmap); 1653 else 1654 printk("%s: ", mdname(mddev)); 1655 rdev_for_each(rdev, tmp2, mddev) 1656 printk("<%s>", bdevname(rdev->bdev,b)); 1657 printk("\n"); 1658 1659 rdev_for_each(rdev, tmp2, mddev) 1660 print_rdev(rdev); 1661 } 1662 printk("md: **********************************\n"); 1663 printk("\n"); 1664 } 1665 1666 1667 static void sync_sbs(mddev_t * mddev, int nospares) 1668 { 1669 /* Update each superblock (in-memory image), but 1670 * if we are allowed to, skip spares which already 1671 * have the right event counter, or have one earlier 1672 * (which would mean they aren't being marked as dirty 1673 * with the rest of the array) 1674 */ 1675 mdk_rdev_t *rdev; 1676 struct list_head *tmp; 1677 1678 rdev_for_each(rdev, tmp, mddev) { 1679 if (rdev->sb_events == mddev->events || 1680 (nospares && 1681 rdev->raid_disk < 0 && 1682 (rdev->sb_events&1)==0 && 1683 rdev->sb_events+1 == mddev->events)) { 1684 /* Don't update this superblock */ 1685 rdev->sb_loaded = 2; 1686 } else { 1687 super_types[mddev->major_version]. 1688 sync_super(mddev, rdev); 1689 rdev->sb_loaded = 1; 1690 } 1691 } 1692 } 1693 1694 static void md_update_sb(mddev_t * mddev, int force_change) 1695 { 1696 struct list_head *tmp; 1697 mdk_rdev_t *rdev; 1698 int sync_req; 1699 int nospares = 0; 1700 1701 if (mddev->external) 1702 return; 1703 repeat: 1704 spin_lock_irq(&mddev->write_lock); 1705 1706 set_bit(MD_CHANGE_PENDING, &mddev->flags); 1707 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) 1708 force_change = 1; 1709 if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) 1710 /* just a clean<-> dirty transition, possibly leave spares alone, 1711 * though if events isn't the right even/odd, we will have to do 1712 * spares after all 1713 */ 1714 nospares = 1; 1715 if (force_change) 1716 nospares = 0; 1717 if (mddev->degraded) 1718 /* If the array is degraded, then skipping spares is both 1719 * dangerous and fairly pointless. 1720 * Dangerous because a device that was removed from the array 1721 * might have a event_count that still looks up-to-date, 1722 * so it can be re-added without a resync. 1723 * Pointless because if there are any spares to skip, 1724 * then a recovery will happen and soon that array won't 1725 * be degraded any more and the spare can go back to sleep then. 1726 */ 1727 nospares = 0; 1728 1729 sync_req = mddev->in_sync; 1730 mddev->utime = get_seconds(); 1731 1732 /* If this is just a dirty<->clean transition, and the array is clean 1733 * and 'events' is odd, we can roll back to the previous clean state */ 1734 if (nospares 1735 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 1736 && (mddev->events & 1) 1737 && mddev->events != 1) 1738 mddev->events--; 1739 else { 1740 /* otherwise we have to go forward and ... */ 1741 mddev->events ++; 1742 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ 1743 /* .. if the array isn't clean, insist on an odd 'events' */ 1744 if ((mddev->events&1)==0) { 1745 mddev->events++; 1746 nospares = 0; 1747 } 1748 } else { 1749 /* otherwise insist on an even 'events' (for clean states) */ 1750 if ((mddev->events&1)) { 1751 mddev->events++; 1752 nospares = 0; 1753 } 1754 } 1755 } 1756 1757 if (!mddev->events) { 1758 /* 1759 * oops, this 64-bit counter should never wrap. 
1760 * Either we are in around ~1 trillion A.C., assuming 1761 * 1 reboot per second, or we have a bug: 1762 */ 1763 MD_BUG(); 1764 mddev->events --; 1765 } 1766 1767 /* 1768 * do not write anything to disk if using 1769 * nonpersistent superblocks 1770 */ 1771 if (!mddev->persistent) { 1772 if (!mddev->external) 1773 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 1774 1775 spin_unlock_irq(&mddev->write_lock); 1776 wake_up(&mddev->sb_wait); 1777 return; 1778 } 1779 sync_sbs(mddev, nospares); 1780 spin_unlock_irq(&mddev->write_lock); 1781 1782 dprintk(KERN_INFO 1783 "md: updating %s RAID superblock on device (in sync %d)\n", 1784 mdname(mddev),mddev->in_sync); 1785 1786 bitmap_update_sb(mddev->bitmap); 1787 rdev_for_each(rdev, tmp, mddev) { 1788 char b[BDEVNAME_SIZE]; 1789 dprintk(KERN_INFO "md: "); 1790 if (rdev->sb_loaded != 1) 1791 continue; /* no noise on spare devices */ 1792 if (test_bit(Faulty, &rdev->flags)) 1793 dprintk("(skipping faulty "); 1794 1795 dprintk("%s ", bdevname(rdev->bdev,b)); 1796 if (!test_bit(Faulty, &rdev->flags)) { 1797 md_super_write(mddev,rdev, 1798 rdev->sb_start, rdev->sb_size, 1799 rdev->sb_page); 1800 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", 1801 bdevname(rdev->bdev,b), 1802 (unsigned long long)rdev->sb_start); 1803 rdev->sb_events = mddev->events; 1804 1805 } else 1806 dprintk(")\n"); 1807 if (mddev->level == LEVEL_MULTIPATH) 1808 /* only need to write one superblock... */ 1809 break; 1810 } 1811 md_super_wait(mddev); 1812 /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ 1813 1814 spin_lock_irq(&mddev->write_lock); 1815 if (mddev->in_sync != sync_req || 1816 test_bit(MD_CHANGE_DEVS, &mddev->flags)) { 1817 /* have to write it out again */ 1818 spin_unlock_irq(&mddev->write_lock); 1819 goto repeat; 1820 } 1821 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 1822 spin_unlock_irq(&mddev->write_lock); 1823 wake_up(&mddev->sb_wait); 1824 1825 } 1826 1827 /* words written to sysfs files may, or may not, be \n terminated. 1828 * We want to accept with case. For this we use cmd_match. 1829 */ 1830 static int cmd_match(const char *cmd, const char *str) 1831 { 1832 /* See if cmd, written into a sysfs file, matches 1833 * str. 
They must either be the same, or cmd can 1834 * have a trailing newline 1835 */ 1836 while (*cmd && *str && *cmd == *str) { 1837 cmd++; 1838 str++; 1839 } 1840 if (*cmd == '\n') 1841 cmd++; 1842 if (*str || *cmd) 1843 return 0; 1844 return 1; 1845 } 1846 1847 struct rdev_sysfs_entry { 1848 struct attribute attr; 1849 ssize_t (*show)(mdk_rdev_t *, char *); 1850 ssize_t (*store)(mdk_rdev_t *, const char *, size_t); 1851 }; 1852 1853 static ssize_t 1854 state_show(mdk_rdev_t *rdev, char *page) 1855 { 1856 char *sep = ""; 1857 size_t len = 0; 1858 1859 if (test_bit(Faulty, &rdev->flags)) { 1860 len+= sprintf(page+len, "%sfaulty",sep); 1861 sep = ","; 1862 } 1863 if (test_bit(In_sync, &rdev->flags)) { 1864 len += sprintf(page+len, "%sin_sync",sep); 1865 sep = ","; 1866 } 1867 if (test_bit(WriteMostly, &rdev->flags)) { 1868 len += sprintf(page+len, "%swrite_mostly",sep); 1869 sep = ","; 1870 } 1871 if (test_bit(Blocked, &rdev->flags)) { 1872 len += sprintf(page+len, "%sblocked", sep); 1873 sep = ","; 1874 } 1875 if (!test_bit(Faulty, &rdev->flags) && 1876 !test_bit(In_sync, &rdev->flags)) { 1877 len += sprintf(page+len, "%sspare", sep); 1878 sep = ","; 1879 } 1880 return len+sprintf(page+len, "\n"); 1881 } 1882 1883 static ssize_t 1884 state_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1885 { 1886 /* can write 1887 * faulty - simulates and error 1888 * remove - disconnects the device 1889 * writemostly - sets write_mostly 1890 * -writemostly - clears write_mostly 1891 * blocked - sets the Blocked flag 1892 * -blocked - clears the Blocked flag 1893 */ 1894 int err = -EINVAL; 1895 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 1896 md_error(rdev->mddev, rdev); 1897 err = 0; 1898 } else if (cmd_match(buf, "remove")) { 1899 if (rdev->raid_disk >= 0) 1900 err = -EBUSY; 1901 else { 1902 mddev_t *mddev = rdev->mddev; 1903 kick_rdev_from_array(rdev); 1904 if (mddev->pers) 1905 md_update_sb(mddev, 1); 1906 md_new_event(mddev); 1907 err = 0; 1908 } 1909 } else if (cmd_match(buf, "writemostly")) { 1910 set_bit(WriteMostly, &rdev->flags); 1911 err = 0; 1912 } else if (cmd_match(buf, "-writemostly")) { 1913 clear_bit(WriteMostly, &rdev->flags); 1914 err = 0; 1915 } else if (cmd_match(buf, "blocked")) { 1916 set_bit(Blocked, &rdev->flags); 1917 err = 0; 1918 } else if (cmd_match(buf, "-blocked")) { 1919 clear_bit(Blocked, &rdev->flags); 1920 wake_up(&rdev->blocked_wait); 1921 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 1922 md_wakeup_thread(rdev->mddev->thread); 1923 1924 err = 0; 1925 } 1926 if (!err) 1927 sysfs_notify(&rdev->kobj, NULL, "state"); 1928 return err ? 
err : len; 1929 } 1930 static struct rdev_sysfs_entry rdev_state = 1931 __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store); 1932 1933 static ssize_t 1934 errors_show(mdk_rdev_t *rdev, char *page) 1935 { 1936 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 1937 } 1938 1939 static ssize_t 1940 errors_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1941 { 1942 char *e; 1943 unsigned long n = simple_strtoul(buf, &e, 10); 1944 if (*buf && (*e == 0 || *e == '\n')) { 1945 atomic_set(&rdev->corrected_errors, n); 1946 return len; 1947 } 1948 return -EINVAL; 1949 } 1950 static struct rdev_sysfs_entry rdev_errors = 1951 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 1952 1953 static ssize_t 1954 slot_show(mdk_rdev_t *rdev, char *page) 1955 { 1956 if (rdev->raid_disk < 0) 1957 return sprintf(page, "none\n"); 1958 else 1959 return sprintf(page, "%d\n", rdev->raid_disk); 1960 } 1961 1962 static ssize_t 1963 slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) 1964 { 1965 char *e; 1966 int err; 1967 char nm[20]; 1968 int slot = simple_strtoul(buf, &e, 10); 1969 if (strncmp(buf, "none", 4)==0) 1970 slot = -1; 1971 else if (e==buf || (*e && *e!= '\n')) 1972 return -EINVAL; 1973 if (rdev->mddev->pers && slot == -1) { 1974 /* Setting 'slot' on an active array requires also 1975 * updating the 'rd%d' link, and communicating 1976 * with the personality with ->hot_*_disk. 1977 * For now we only support removing 1978 * failed/spare devices. This normally happens automatically, 1979 * but not when the metadata is externally managed. 1980 */ 1981 if (rdev->raid_disk == -1) 1982 return -EEXIST; 1983 /* personality does all needed checks */ 1984 if (rdev->mddev->pers->hot_add_disk == NULL) 1985 return -EINVAL; 1986 err = rdev->mddev->pers-> 1987 hot_remove_disk(rdev->mddev, rdev->raid_disk); 1988 if (err) 1989 return err; 1990 sprintf(nm, "rd%d", rdev->raid_disk); 1991 sysfs_remove_link(&rdev->mddev->kobj, nm); 1992 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 1993 md_wakeup_thread(rdev->mddev->thread); 1994 } else if (rdev->mddev->pers) { 1995 mdk_rdev_t *rdev2; 1996 struct list_head *tmp; 1997 /* Activating a spare .. or possibly reactivating 1998 * if we every get bitmaps working here. 1999 */ 2000 2001 if (rdev->raid_disk != -1) 2002 return -EBUSY; 2003 2004 if (rdev->mddev->pers->hot_add_disk == NULL) 2005 return -EINVAL; 2006 2007 rdev_for_each(rdev2, tmp, rdev->mddev) 2008 if (rdev2->raid_disk == slot) 2009 return -EEXIST; 2010 2011 rdev->raid_disk = slot; 2012 if (test_bit(In_sync, &rdev->flags)) 2013 rdev->saved_raid_disk = slot; 2014 else 2015 rdev->saved_raid_disk = -1; 2016 err = rdev->mddev->pers-> 2017 hot_add_disk(rdev->mddev, rdev); 2018 if (err) { 2019 rdev->raid_disk = -1; 2020 return err; 2021 } else 2022 sysfs_notify(&rdev->kobj, NULL, "state"); 2023 sprintf(nm, "rd%d", rdev->raid_disk); 2024 if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm)) 2025 printk(KERN_WARNING 2026 "md: cannot register " 2027 "%s for %s\n", 2028 nm, mdname(rdev->mddev)); 2029 2030 /* don't wakeup anyone, leave that to userspace. 
*/ 2031 } else { 2032 if (slot >= rdev->mddev->raid_disks) 2033 return -ENOSPC; 2034 rdev->raid_disk = slot; 2035 /* assume it is working */ 2036 clear_bit(Faulty, &rdev->flags); 2037 clear_bit(WriteMostly, &rdev->flags); 2038 set_bit(In_sync, &rdev->flags); 2039 sysfs_notify(&rdev->kobj, NULL, "state"); 2040 } 2041 return len; 2042 } 2043 2044 2045 static struct rdev_sysfs_entry rdev_slot = 2046 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 2047 2048 static ssize_t 2049 offset_show(mdk_rdev_t *rdev, char *page) 2050 { 2051 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 2052 } 2053 2054 static ssize_t 2055 offset_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2056 { 2057 char *e; 2058 unsigned long long offset = simple_strtoull(buf, &e, 10); 2059 if (e==buf || (*e && *e != '\n')) 2060 return -EINVAL; 2061 if (rdev->mddev->pers && rdev->raid_disk >= 0) 2062 return -EBUSY; 2063 if (rdev->size && rdev->mddev->external) 2064 /* Must set offset before size, so overlap checks 2065 * can be sane */ 2066 return -EBUSY; 2067 rdev->data_offset = offset; 2068 return len; 2069 } 2070 2071 static struct rdev_sysfs_entry rdev_offset = 2072 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 2073 2074 static ssize_t 2075 rdev_size_show(mdk_rdev_t *rdev, char *page) 2076 { 2077 return sprintf(page, "%llu\n", (unsigned long long)rdev->size); 2078 } 2079 2080 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) 2081 { 2082 /* check if two start/length pairs overlap */ 2083 if (s1+l1 <= s2) 2084 return 0; 2085 if (s2+l2 <= s1) 2086 return 0; 2087 return 1; 2088 } 2089 2090 static ssize_t 2091 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2092 { 2093 unsigned long long size; 2094 unsigned long long oldsize = rdev->size; 2095 mddev_t *my_mddev = rdev->mddev; 2096 2097 if (strict_strtoull(buf, 10, &size) < 0) 2098 return -EINVAL; 2099 if (my_mddev->pers && rdev->raid_disk >= 0) { 2100 if (my_mddev->persistent) { 2101 size = super_types[my_mddev->major_version]. 2102 rdev_size_change(rdev, size * 2); 2103 if (!size) 2104 return -EBUSY; 2105 } else if (!size) { 2106 size = (rdev->bdev->bd_inode->i_size >> 10); 2107 size -= rdev->data_offset/2; 2108 } 2109 } 2110 if (size < my_mddev->size) 2111 return -EINVAL; /* component must fit device */ 2112 2113 rdev->size = size; 2114 if (size > oldsize && my_mddev->external) { 2115 /* need to check that all other rdevs with the same ->bdev 2116 * do not overlap. We need to unlock the mddev to avoid 2117 * a deadlock. We have already changed rdev->size, and if 2118 * we have to change it back, we will have the lock again. 2119 */ 2120 mddev_t *mddev; 2121 int overlap = 0; 2122 struct list_head *tmp, *tmp2; 2123 2124 mddev_unlock(my_mddev); 2125 for_each_mddev(mddev, tmp) { 2126 mdk_rdev_t *rdev2; 2127 2128 mddev_lock(mddev); 2129 rdev_for_each(rdev2, tmp2, mddev) 2130 if (test_bit(AllReserved, &rdev2->flags) || 2131 (rdev->bdev == rdev2->bdev && 2132 rdev != rdev2 && 2133 overlaps(rdev->data_offset, rdev->size * 2, 2134 rdev2->data_offset, 2135 rdev2->size * 2))) { 2136 overlap = 1; 2137 break; 2138 } 2139 mddev_unlock(mddev); 2140 if (overlap) { 2141 mddev_put(mddev); 2142 break; 2143 } 2144 } 2145 mddev_lock(my_mddev); 2146 if (overlap) { 2147 /* Someone else could have slipped in a size 2148 * change here, but doing so is just silly. 
2149 * We put oldsize back because we *know* it is 2150 * safe, and trust userspace not to race with 2151 * itself 2152 */ 2153 rdev->size = oldsize; 2154 return -EBUSY; 2155 } 2156 } 2157 return len; 2158 } 2159 2160 static struct rdev_sysfs_entry rdev_size = 2161 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 2162 2163 static struct attribute *rdev_default_attrs[] = { 2164 &rdev_state.attr, 2165 &rdev_errors.attr, 2166 &rdev_slot.attr, 2167 &rdev_offset.attr, 2168 &rdev_size.attr, 2169 NULL, 2170 }; 2171 static ssize_t 2172 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 2173 { 2174 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 2175 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 2176 mddev_t *mddev = rdev->mddev; 2177 ssize_t rv; 2178 2179 if (!entry->show) 2180 return -EIO; 2181 2182 rv = mddev ? mddev_lock(mddev) : -EBUSY; 2183 if (!rv) { 2184 if (rdev->mddev == NULL) 2185 rv = -EBUSY; 2186 else 2187 rv = entry->show(rdev, page); 2188 mddev_unlock(mddev); 2189 } 2190 return rv; 2191 } 2192 2193 static ssize_t 2194 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 2195 const char *page, size_t length) 2196 { 2197 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 2198 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 2199 ssize_t rv; 2200 mddev_t *mddev = rdev->mddev; 2201 2202 if (!entry->store) 2203 return -EIO; 2204 if (!capable(CAP_SYS_ADMIN)) 2205 return -EACCES; 2206 rv = mddev ? mddev_lock(mddev): -EBUSY; 2207 if (!rv) { 2208 if (rdev->mddev == NULL) 2209 rv = -EBUSY; 2210 else 2211 rv = entry->store(rdev, page, length); 2212 mddev_unlock(mddev); 2213 } 2214 return rv; 2215 } 2216 2217 static void rdev_free(struct kobject *ko) 2218 { 2219 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); 2220 kfree(rdev); 2221 } 2222 static struct sysfs_ops rdev_sysfs_ops = { 2223 .show = rdev_attr_show, 2224 .store = rdev_attr_store, 2225 }; 2226 static struct kobj_type rdev_ktype = { 2227 .release = rdev_free, 2228 .sysfs_ops = &rdev_sysfs_ops, 2229 .default_attrs = rdev_default_attrs, 2230 }; 2231 2232 /* 2233 * Import a device. If 'super_format' >= 0, then sanity check the superblock 2234 * 2235 * mark the device faulty if: 2236 * 2237 * - the device is nonexistent (zero size) 2238 * - the device has no valid superblock 2239 * 2240 * a faulty rdev _never_ has rdev->sb set. 
2241 */ 2242 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) 2243 { 2244 char b[BDEVNAME_SIZE]; 2245 int err; 2246 mdk_rdev_t *rdev; 2247 sector_t size; 2248 2249 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 2250 if (!rdev) { 2251 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 2252 return ERR_PTR(-ENOMEM); 2253 } 2254 2255 if ((err = alloc_disk_sb(rdev))) 2256 goto abort_free; 2257 2258 err = lock_rdev(rdev, newdev, super_format == -2); 2259 if (err) 2260 goto abort_free; 2261 2262 kobject_init(&rdev->kobj, &rdev_ktype); 2263 2264 rdev->desc_nr = -1; 2265 rdev->saved_raid_disk = -1; 2266 rdev->raid_disk = -1; 2267 rdev->flags = 0; 2268 rdev->data_offset = 0; 2269 rdev->sb_events = 0; 2270 atomic_set(&rdev->nr_pending, 0); 2271 atomic_set(&rdev->read_errors, 0); 2272 atomic_set(&rdev->corrected_errors, 0); 2273 2274 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2275 if (!size) { 2276 printk(KERN_WARNING 2277 "md: %s has zero or unknown size, marking faulty!\n", 2278 bdevname(rdev->bdev,b)); 2279 err = -EINVAL; 2280 goto abort_free; 2281 } 2282 2283 if (super_format >= 0) { 2284 err = super_types[super_format]. 2285 load_super(rdev, NULL, super_minor); 2286 if (err == -EINVAL) { 2287 printk(KERN_WARNING 2288 "md: %s does not have a valid v%d.%d " 2289 "superblock, not importing!\n", 2290 bdevname(rdev->bdev,b), 2291 super_format, super_minor); 2292 goto abort_free; 2293 } 2294 if (err < 0) { 2295 printk(KERN_WARNING 2296 "md: could not read %s's sb, not importing!\n", 2297 bdevname(rdev->bdev,b)); 2298 goto abort_free; 2299 } 2300 } 2301 2302 INIT_LIST_HEAD(&rdev->same_set); 2303 init_waitqueue_head(&rdev->blocked_wait); 2304 2305 return rdev; 2306 2307 abort_free: 2308 if (rdev->sb_page) { 2309 if (rdev->bdev) 2310 unlock_rdev(rdev); 2311 free_disk_sb(rdev); 2312 } 2313 kfree(rdev); 2314 return ERR_PTR(err); 2315 } 2316 2317 /* 2318 * Check a full RAID array for plausibility 2319 */ 2320 2321 2322 static void analyze_sbs(mddev_t * mddev) 2323 { 2324 int i; 2325 struct list_head *tmp; 2326 mdk_rdev_t *rdev, *freshest; 2327 char b[BDEVNAME_SIZE]; 2328 2329 freshest = NULL; 2330 rdev_for_each(rdev, tmp, mddev) 2331 switch (super_types[mddev->major_version]. 2332 load_super(rdev, freshest, mddev->minor_version)) { 2333 case 1: 2334 freshest = rdev; 2335 break; 2336 case 0: 2337 break; 2338 default: 2339 printk( KERN_ERR \ 2340 "md: fatal superblock inconsistency in %s" 2341 " -- removing from array\n", 2342 bdevname(rdev->bdev,b)); 2343 kick_rdev_from_array(rdev); 2344 } 2345 2346 2347 super_types[mddev->major_version]. 2348 validate_super(mddev, freshest); 2349 2350 i = 0; 2351 rdev_for_each(rdev, tmp, mddev) { 2352 if (rdev != freshest) 2353 if (super_types[mddev->major_version]. 
2354 validate_super(mddev, rdev)) { 2355 printk(KERN_WARNING "md: kicking non-fresh %s" 2356 " from array!\n", 2357 bdevname(rdev->bdev,b)); 2358 kick_rdev_from_array(rdev); 2359 continue; 2360 } 2361 if (mddev->level == LEVEL_MULTIPATH) { 2362 rdev->desc_nr = i++; 2363 rdev->raid_disk = rdev->desc_nr; 2364 set_bit(In_sync, &rdev->flags); 2365 } else if (rdev->raid_disk >= mddev->raid_disks) { 2366 rdev->raid_disk = -1; 2367 clear_bit(In_sync, &rdev->flags); 2368 } 2369 } 2370 2371 2372 2373 if (mddev->recovery_cp != MaxSector && 2374 mddev->level >= 1) 2375 printk(KERN_ERR "md: %s: raid array is not clean" 2376 " -- starting background reconstruction\n", 2377 mdname(mddev)); 2378 2379 } 2380 2381 static void md_safemode_timeout(unsigned long data); 2382 2383 static ssize_t 2384 safe_delay_show(mddev_t *mddev, char *page) 2385 { 2386 int msec = (mddev->safemode_delay*1000)/HZ; 2387 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); 2388 } 2389 static ssize_t 2390 safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) 2391 { 2392 int scale=1; 2393 int dot=0; 2394 int i; 2395 unsigned long msec; 2396 char buf[30]; 2397 2398 /* remove a period, and count digits after it */ 2399 if (len >= sizeof(buf)) 2400 return -EINVAL; 2401 strlcpy(buf, cbuf, sizeof(buf)); 2402 for (i=0; i<len; i++) { 2403 if (dot) { 2404 if (isdigit(buf[i])) { 2405 buf[i-1] = buf[i]; 2406 scale *= 10; 2407 } 2408 buf[i] = 0; 2409 } else if (buf[i] == '.') { 2410 dot=1; 2411 buf[i] = 0; 2412 } 2413 } 2414 if (strict_strtoul(buf, 10, &msec) < 0) 2415 return -EINVAL; 2416 msec = (msec * 1000) / scale; 2417 if (msec == 0) 2418 mddev->safemode_delay = 0; 2419 else { 2420 unsigned long old_delay = mddev->safemode_delay; 2421 mddev->safemode_delay = (msec*HZ)/1000; 2422 if (mddev->safemode_delay == 0) 2423 mddev->safemode_delay = 1; 2424 if (mddev->safemode_delay < old_delay) 2425 md_safemode_timeout((unsigned long)mddev); 2426 } 2427 return len; 2428 } 2429 static struct md_sysfs_entry md_safe_delay = 2430 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 2431 2432 static ssize_t 2433 level_show(mddev_t *mddev, char *page) 2434 { 2435 struct mdk_personality *p = mddev->pers; 2436 if (p) 2437 return sprintf(page, "%s\n", p->name); 2438 else if (mddev->clevel[0]) 2439 return sprintf(page, "%s\n", mddev->clevel); 2440 else if (mddev->level != LEVEL_NONE) 2441 return sprintf(page, "%d\n", mddev->level); 2442 else 2443 return 0; 2444 } 2445 2446 static ssize_t 2447 level_store(mddev_t *mddev, const char *buf, size_t len) 2448 { 2449 ssize_t rv = len; 2450 if (mddev->pers) 2451 return -EBUSY; 2452 if (len == 0) 2453 return 0; 2454 if (len >= sizeof(mddev->clevel)) 2455 return -ENOSPC; 2456 strncpy(mddev->clevel, buf, len); 2457 if (mddev->clevel[len-1] == '\n') 2458 len--; 2459 mddev->clevel[len] = 0; 2460 mddev->level = LEVEL_NONE; 2461 return rv; 2462 } 2463 2464 static struct md_sysfs_entry md_level = 2465 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 2466 2467 2468 static ssize_t 2469 layout_show(mddev_t *mddev, char *page) 2470 { 2471 /* just a number, not meaningful for all levels */ 2472 if (mddev->reshape_position != MaxSector && 2473 mddev->layout != mddev->new_layout) 2474 return sprintf(page, "%d (%d)\n", 2475 mddev->new_layout, mddev->layout); 2476 return sprintf(page, "%d\n", mddev->layout); 2477 } 2478 2479 static ssize_t 2480 layout_store(mddev_t *mddev, const char *buf, size_t len) 2481 { 2482 char *e; 2483 unsigned long n = simple_strtoul(buf, &e, 10); 2484 2485 if 
(!*buf || (*e && *e != '\n')) 2486 return -EINVAL; 2487 2488 if (mddev->pers) 2489 return -EBUSY; 2490 if (mddev->reshape_position != MaxSector) 2491 mddev->new_layout = n; 2492 else 2493 mddev->layout = n; 2494 return len; 2495 } 2496 static struct md_sysfs_entry md_layout = 2497 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 2498 2499 2500 static ssize_t 2501 raid_disks_show(mddev_t *mddev, char *page) 2502 { 2503 if (mddev->raid_disks == 0) 2504 return 0; 2505 if (mddev->reshape_position != MaxSector && 2506 mddev->delta_disks != 0) 2507 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 2508 mddev->raid_disks - mddev->delta_disks); 2509 return sprintf(page, "%d\n", mddev->raid_disks); 2510 } 2511 2512 static int update_raid_disks(mddev_t *mddev, int raid_disks); 2513 2514 static ssize_t 2515 raid_disks_store(mddev_t *mddev, const char *buf, size_t len) 2516 { 2517 char *e; 2518 int rv = 0; 2519 unsigned long n = simple_strtoul(buf, &e, 10); 2520 2521 if (!*buf || (*e && *e != '\n')) 2522 return -EINVAL; 2523 2524 if (mddev->pers) 2525 rv = update_raid_disks(mddev, n); 2526 else if (mddev->reshape_position != MaxSector) { 2527 int olddisks = mddev->raid_disks - mddev->delta_disks; 2528 mddev->delta_disks = n - olddisks; 2529 mddev->raid_disks = n; 2530 } else 2531 mddev->raid_disks = n; 2532 return rv ? rv : len; 2533 } 2534 static struct md_sysfs_entry md_raid_disks = 2535 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 2536 2537 static ssize_t 2538 chunk_size_show(mddev_t *mddev, char *page) 2539 { 2540 if (mddev->reshape_position != MaxSector && 2541 mddev->chunk_size != mddev->new_chunk) 2542 return sprintf(page, "%d (%d)\n", mddev->new_chunk, 2543 mddev->chunk_size); 2544 return sprintf(page, "%d\n", mddev->chunk_size); 2545 } 2546 2547 static ssize_t 2548 chunk_size_store(mddev_t *mddev, const char *buf, size_t len) 2549 { 2550 /* can only set chunk_size if array is not yet active */ 2551 char *e; 2552 unsigned long n = simple_strtoul(buf, &e, 10); 2553 2554 if (!*buf || (*e && *e != '\n')) 2555 return -EINVAL; 2556 2557 if (mddev->pers) 2558 return -EBUSY; 2559 else if (mddev->reshape_position != MaxSector) 2560 mddev->new_chunk = n; 2561 else 2562 mddev->chunk_size = n; 2563 return len; 2564 } 2565 static struct md_sysfs_entry md_chunk_size = 2566 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 2567 2568 static ssize_t 2569 resync_start_show(mddev_t *mddev, char *page) 2570 { 2571 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 2572 } 2573 2574 static ssize_t 2575 resync_start_store(mddev_t *mddev, const char *buf, size_t len) 2576 { 2577 char *e; 2578 unsigned long long n = simple_strtoull(buf, &e, 10); 2579 2580 if (mddev->pers) 2581 return -EBUSY; 2582 if (!*buf || (*e && *e != '\n')) 2583 return -EINVAL; 2584 2585 mddev->recovery_cp = n; 2586 return len; 2587 } 2588 static struct md_sysfs_entry md_resync_start = 2589 __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); 2590 2591 /* 2592 * The array state can be: 2593 * 2594 * clear 2595 * No devices, no size, no level 2596 * Equivalent to STOP_ARRAY ioctl 2597 * inactive 2598 * May have some settings, but array is not active 2599 * all IO results in error 2600 * When written, doesn't tear down array, but just stops it 2601 * suspended (not supported yet) 2602 * All IO requests will block. The array can be reconfigured. 
2603 * Writing this, if accepted, will block until array is quiescent 2604 * readonly 2605 * no resync can happen. no superblocks get written. 2606 * write requests fail 2607 * read-auto 2608 * like readonly, but behaves like 'clean' on a write request. 2609 * 2610 * clean - no pending writes, but otherwise active. 2611 * When written to inactive array, starts without resync 2612 * If a write request arrives then 2613 * if metadata is known, mark 'dirty' and switch to 'active'. 2614 * if not known, block and switch to write-pending 2615 * If written to an active array that has pending writes, then fails. 2616 * active 2617 * fully active: IO and resync can be happening. 2618 * When written to inactive array, starts with resync 2619 * 2620 * write-pending 2621 * clean, but writes are blocked waiting for 'active' to be written. 2622 * 2623 * active-idle 2624 * like active, but no writes have been seen for a while (100msec). 2625 * 2626 */ 2627 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 2628 write_pending, active_idle, bad_word}; 2629 static char *array_states[] = { 2630 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 2631 "write-pending", "active-idle", NULL }; 2632 2633 static int match_word(const char *word, char **list) 2634 { 2635 int n; 2636 for (n=0; list[n]; n++) 2637 if (cmd_match(word, list[n])) 2638 break; 2639 return n; 2640 } 2641 2642 static ssize_t 2643 array_state_show(mddev_t *mddev, char *page) 2644 { 2645 enum array_state st = inactive; 2646 2647 if (mddev->pers) 2648 switch(mddev->ro) { 2649 case 1: 2650 st = readonly; 2651 break; 2652 case 2: 2653 st = read_auto; 2654 break; 2655 case 0: 2656 if (mddev->in_sync) 2657 st = clean; 2658 else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) 2659 st = write_pending; 2660 else if (mddev->safemode) 2661 st = active_idle; 2662 else 2663 st = active; 2664 } 2665 else { 2666 if (list_empty(&mddev->disks) && 2667 mddev->raid_disks == 0 && 2668 mddev->size == 0) 2669 st = clear; 2670 else 2671 st = inactive; 2672 } 2673 return sprintf(page, "%s\n", array_states[st]); 2674 } 2675 2676 static int do_md_stop(mddev_t * mddev, int ro, int is_open); 2677 static int do_md_run(mddev_t * mddev); 2678 static int restart_array(mddev_t *mddev); 2679 2680 static ssize_t 2681 array_state_store(mddev_t *mddev, const char *buf, size_t len) 2682 { 2683 int err = -EINVAL; 2684 enum array_state st = match_word(buf, array_states); 2685 switch(st) { 2686 case bad_word: 2687 break; 2688 case clear: 2689 /* stopping an active array */ 2690 if (atomic_read(&mddev->openers) > 0) 2691 return -EBUSY; 2692 err = do_md_stop(mddev, 0, 0); 2693 break; 2694 case inactive: 2695 /* stopping an active array */ 2696 if (mddev->pers) { 2697 if (atomic_read(&mddev->openers) > 0) 2698 return -EBUSY; 2699 err = do_md_stop(mddev, 2, 0); 2700 } else 2701 err = 0; /* already inactive */ 2702 break; 2703 case suspended: 2704 break; /* not supported yet */ 2705 case readonly: 2706 if (mddev->pers) 2707 err = do_md_stop(mddev, 1, 0); 2708 else { 2709 mddev->ro = 1; 2710 set_disk_ro(mddev->gendisk, 1); 2711 err = do_md_run(mddev); 2712 } 2713 break; 2714 case read_auto: 2715 if (mddev->pers) { 2716 if (mddev->ro == 0) 2717 err = do_md_stop(mddev, 1, 0); 2718 else if (mddev->ro == 1) 2719 err = restart_array(mddev); 2720 if (err == 0) { 2721 mddev->ro = 2; 2722 set_disk_ro(mddev->gendisk, 0); 2723 } 2724 } else { 2725 mddev->ro = 2; 2726 err = do_md_run(mddev); 2727 } 2728 break; 2729 case clean: 2730 if 
(mddev->pers) { 2731 restart_array(mddev); 2732 spin_lock_irq(&mddev->write_lock); 2733 if (atomic_read(&mddev->writes_pending) == 0) { 2734 if (mddev->in_sync == 0) { 2735 mddev->in_sync = 1; 2736 if (mddev->safemode == 1) 2737 mddev->safemode = 0; 2738 if (mddev->persistent) 2739 set_bit(MD_CHANGE_CLEAN, 2740 &mddev->flags); 2741 } 2742 err = 0; 2743 } else 2744 err = -EBUSY; 2745 spin_unlock_irq(&mddev->write_lock); 2746 } else { 2747 mddev->ro = 0; 2748 mddev->recovery_cp = MaxSector; 2749 err = do_md_run(mddev); 2750 } 2751 break; 2752 case active: 2753 if (mddev->pers) { 2754 restart_array(mddev); 2755 if (mddev->external) 2756 clear_bit(MD_CHANGE_CLEAN, &mddev->flags); 2757 wake_up(&mddev->sb_wait); 2758 err = 0; 2759 } else { 2760 mddev->ro = 0; 2761 set_disk_ro(mddev->gendisk, 0); 2762 err = do_md_run(mddev); 2763 } 2764 break; 2765 case write_pending: 2766 case active_idle: 2767 /* these cannot be set */ 2768 break; 2769 } 2770 if (err) 2771 return err; 2772 else { 2773 sysfs_notify(&mddev->kobj, NULL, "array_state"); 2774 return len; 2775 } 2776 } 2777 static struct md_sysfs_entry md_array_state = 2778 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 2779 2780 static ssize_t 2781 null_show(mddev_t *mddev, char *page) 2782 { 2783 return -EINVAL; 2784 } 2785 2786 static ssize_t 2787 new_dev_store(mddev_t *mddev, const char *buf, size_t len) 2788 { 2789 /* buf must be %d:%d\n? giving major and minor numbers */ 2790 /* The new device is added to the array. 2791 * If the array has a persistent superblock, we read the 2792 * superblock to initialise info and check validity. 2793 * Otherwise, only checking done is that in bind_rdev_to_array, 2794 * which mainly checks size. 2795 */ 2796 char *e; 2797 int major = simple_strtoul(buf, &e, 10); 2798 int minor; 2799 dev_t dev; 2800 mdk_rdev_t *rdev; 2801 int err; 2802 2803 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 2804 return -EINVAL; 2805 minor = simple_strtoul(e+1, &e, 10); 2806 if (*e && *e != '\n') 2807 return -EINVAL; 2808 dev = MKDEV(major, minor); 2809 if (major != MAJOR(dev) || 2810 minor != MINOR(dev)) 2811 return -EOVERFLOW; 2812 2813 2814 if (mddev->persistent) { 2815 rdev = md_import_device(dev, mddev->major_version, 2816 mddev->minor_version); 2817 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 2818 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 2819 mdk_rdev_t, same_set); 2820 err = super_types[mddev->major_version] 2821 .load_super(rdev, rdev0, mddev->minor_version); 2822 if (err < 0) 2823 goto out; 2824 } 2825 } else if (mddev->external) 2826 rdev = md_import_device(dev, -2, -1); 2827 else 2828 rdev = md_import_device(dev, -1, -1); 2829 2830 if (IS_ERR(rdev)) 2831 return PTR_ERR(rdev); 2832 err = bind_rdev_to_array(rdev, mddev); 2833 out: 2834 if (err) 2835 export_rdev(rdev); 2836 return err ? err : len; 2837 } 2838 2839 static struct md_sysfs_entry md_new_device = 2840 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 2841 2842 static ssize_t 2843 bitmap_store(mddev_t *mddev, const char *buf, size_t len) 2844 { 2845 char *end; 2846 unsigned long chunk, end_chunk; 2847 2848 if (!mddev->bitmap) 2849 goto out; 2850 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... 
(range) */ 2851 while (*buf) { 2852 chunk = end_chunk = simple_strtoul(buf, &end, 0); 2853 if (buf == end) break; 2854 if (*end == '-') { /* range */ 2855 buf = end + 1; 2856 end_chunk = simple_strtoul(buf, &end, 0); 2857 if (buf == end) break; 2858 } 2859 if (*end && !isspace(*end)) break; 2860 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); 2861 buf = end; 2862 while (isspace(*buf)) buf++; 2863 } 2864 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ 2865 out: 2866 return len; 2867 } 2868 2869 static struct md_sysfs_entry md_bitmap = 2870 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 2871 2872 static ssize_t 2873 size_show(mddev_t *mddev, char *page) 2874 { 2875 return sprintf(page, "%llu\n", (unsigned long long)mddev->size); 2876 } 2877 2878 static int update_size(mddev_t *mddev, sector_t num_sectors); 2879 2880 static ssize_t 2881 size_store(mddev_t *mddev, const char *buf, size_t len) 2882 { 2883 /* If array is inactive, we can reduce the component size, but 2884 * not increase it (except from 0). 2885 * If array is active, we can try an on-line resize 2886 */ 2887 char *e; 2888 int err = 0; 2889 unsigned long long size = simple_strtoull(buf, &e, 10); 2890 if (!*buf || *buf == '\n' || 2891 (*e && *e != '\n')) 2892 return -EINVAL; 2893 2894 if (mddev->pers) { 2895 err = update_size(mddev, size * 2); 2896 md_update_sb(mddev, 1); 2897 } else { 2898 if (mddev->size == 0 || 2899 mddev->size > size) 2900 mddev->size = size; 2901 else 2902 err = -ENOSPC; 2903 } 2904 return err ? err : len; 2905 } 2906 2907 static struct md_sysfs_entry md_size = 2908 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 2909 2910 2911 /* Metadata version. 2912 * This is one of 2913 * 'none' for arrays with no metadata (good luck...) 2914 * 'external' for arrays with externally managed metadata, 2915 * or N.M for internally known formats 2916 */ 2917 static ssize_t 2918 metadata_show(mddev_t *mddev, char *page) 2919 { 2920 if (mddev->persistent) 2921 return sprintf(page, "%d.%d\n", 2922 mddev->major_version, mddev->minor_version); 2923 else if (mddev->external) 2924 return sprintf(page, "external:%s\n", mddev->metadata_type); 2925 else 2926 return sprintf(page, "none\n"); 2927 } 2928 2929 static ssize_t 2930 metadata_store(mddev_t *mddev, const char *buf, size_t len) 2931 { 2932 int major, minor; 2933 char *e; 2934 /* Changing the details of 'external' metadata is 2935 * always permitted. Otherwise there must be 2936 * no devices attached to the array.
2937 */ 2938 if (mddev->external && strncmp(buf, "external:", 9) == 0) 2939 ; 2940 else if (!list_empty(&mddev->disks)) 2941 return -EBUSY; 2942 2943 if (cmd_match(buf, "none")) { 2944 mddev->persistent = 0; 2945 mddev->external = 0; 2946 mddev->major_version = 0; 2947 mddev->minor_version = 90; 2948 return len; 2949 } 2950 if (strncmp(buf, "external:", 9) == 0) { 2951 size_t namelen = len-9; 2952 if (namelen >= sizeof(mddev->metadata_type)) 2953 namelen = sizeof(mddev->metadata_type)-1; 2954 strncpy(mddev->metadata_type, buf+9, namelen); 2955 mddev->metadata_type[namelen] = 0; 2956 if (namelen && mddev->metadata_type[namelen-1] == '\n') 2957 mddev->metadata_type[--namelen] = 0; 2958 mddev->persistent = 0; 2959 mddev->external = 1; 2960 mddev->major_version = 0; 2961 mddev->minor_version = 90; 2962 return len; 2963 } 2964 major = simple_strtoul(buf, &e, 10); 2965 if (e==buf || *e != '.') 2966 return -EINVAL; 2967 buf = e+1; 2968 minor = simple_strtoul(buf, &e, 10); 2969 if (e==buf || (*e && *e != '\n') ) 2970 return -EINVAL; 2971 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 2972 return -ENOENT; 2973 mddev->major_version = major; 2974 mddev->minor_version = minor; 2975 mddev->persistent = 1; 2976 mddev->external = 0; 2977 return len; 2978 } 2979 2980 static struct md_sysfs_entry md_metadata = 2981 __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 2982 2983 static ssize_t 2984 action_show(mddev_t *mddev, char *page) 2985 { 2986 char *type = "idle"; 2987 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 2988 (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { 2989 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 2990 type = "reshape"; 2991 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 2992 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 2993 type = "resync"; 2994 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 2995 type = "check"; 2996 else 2997 type = "repair"; 2998 } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) 2999 type = "recover"; 3000 } 3001 return sprintf(page, "%s\n", type); 3002 } 3003 3004 static ssize_t 3005 action_store(mddev_t *mddev, const char *page, size_t len) 3006 { 3007 if (!mddev->pers || !mddev->pers->sync_request) 3008 return -EINVAL; 3009 3010 if (cmd_match(page, "idle")) { 3011 if (mddev->sync_thread) { 3012 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3013 md_unregister_thread(mddev->sync_thread); 3014 mddev->sync_thread = NULL; 3015 mddev->recovery = 0; 3016 } 3017 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3018 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 3019 return -EBUSY; 3020 else if (cmd_match(page, "resync")) 3021 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3022 else if (cmd_match(page, "recover")) { 3023 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 3024 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3025 } else if (cmd_match(page, "reshape")) { 3026 int err; 3027 if (mddev->pers->start_reshape == NULL) 3028 return -EINVAL; 3029 err = mddev->pers->start_reshape(mddev); 3030 if (err) 3031 return err; 3032 sysfs_notify(&mddev->kobj, NULL, "degraded"); 3033 } else { 3034 if (cmd_match(page, "check")) 3035 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 3036 else if (!cmd_match(page, "repair")) 3037 return -EINVAL; 3038 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 3039 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 3040 } 3041 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3042 md_wakeup_thread(mddev->thread); 3043 
sysfs_notify(&mddev->kobj, NULL, "sync_action"); 3044 return len; 3045 } 3046 3047 static ssize_t 3048 mismatch_cnt_show(mddev_t *mddev, char *page) 3049 { 3050 return sprintf(page, "%llu\n", 3051 (unsigned long long) mddev->resync_mismatches); 3052 } 3053 3054 static struct md_sysfs_entry md_scan_mode = 3055 __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 3056 3057 3058 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 3059 3060 static ssize_t 3061 sync_min_show(mddev_t *mddev, char *page) 3062 { 3063 return sprintf(page, "%d (%s)\n", speed_min(mddev), 3064 mddev->sync_speed_min ? "local": "system"); 3065 } 3066 3067 static ssize_t 3068 sync_min_store(mddev_t *mddev, const char *buf, size_t len) 3069 { 3070 int min; 3071 char *e; 3072 if (strncmp(buf, "system", 6)==0) { 3073 mddev->sync_speed_min = 0; 3074 return len; 3075 } 3076 min = simple_strtoul(buf, &e, 10); 3077 if (buf == e || (*e && *e != '\n') || min <= 0) 3078 return -EINVAL; 3079 mddev->sync_speed_min = min; 3080 return len; 3081 } 3082 3083 static struct md_sysfs_entry md_sync_min = 3084 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 3085 3086 static ssize_t 3087 sync_max_show(mddev_t *mddev, char *page) 3088 { 3089 return sprintf(page, "%d (%s)\n", speed_max(mddev), 3090 mddev->sync_speed_max ? "local": "system"); 3091 } 3092 3093 static ssize_t 3094 sync_max_store(mddev_t *mddev, const char *buf, size_t len) 3095 { 3096 int max; 3097 char *e; 3098 if (strncmp(buf, "system", 6)==0) { 3099 mddev->sync_speed_max = 0; 3100 return len; 3101 } 3102 max = simple_strtoul(buf, &e, 10); 3103 if (buf == e || (*e && *e != '\n') || max <= 0) 3104 return -EINVAL; 3105 mddev->sync_speed_max = max; 3106 return len; 3107 } 3108 3109 static struct md_sysfs_entry md_sync_max = 3110 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 3111 3112 static ssize_t 3113 degraded_show(mddev_t *mddev, char *page) 3114 { 3115 return sprintf(page, "%d\n", mddev->degraded); 3116 } 3117 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 3118 3119 static ssize_t 3120 sync_force_parallel_show(mddev_t *mddev, char *page) 3121 { 3122 return sprintf(page, "%d\n", mddev->parallel_resync); 3123 } 3124 3125 static ssize_t 3126 sync_force_parallel_store(mddev_t *mddev, const char *buf, size_t len) 3127 { 3128 long n; 3129 3130 if (strict_strtol(buf, 10, &n)) 3131 return -EINVAL; 3132 3133 if (n != 0 && n != 1) 3134 return -EINVAL; 3135 3136 mddev->parallel_resync = n; 3137 3138 if (mddev->sync_thread) 3139 wake_up(&resync_wait); 3140 3141 return len; 3142 } 3143 3144 /* force parallel resync, even with shared block devices */ 3145 static struct md_sysfs_entry md_sync_force_parallel = 3146 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 3147 sync_force_parallel_show, sync_force_parallel_store); 3148 3149 static ssize_t 3150 sync_speed_show(mddev_t *mddev, char *page) 3151 { 3152 unsigned long resync, dt, db; 3153 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 3154 dt = (jiffies - mddev->resync_mark) / HZ; 3155 if (!dt) dt++; 3156 db = resync - mddev->resync_mark_cnt; 3157 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 3158 } 3159 3160 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 3161 3162 static ssize_t 3163 sync_completed_show(mddev_t *mddev, char *page) 3164 { 3165 unsigned long max_blocks, resync; 3166 3167 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3168 max_blocks = mddev->resync_max_sectors; 3169 else 3170 
max_blocks = mddev->size << 1; 3171 3172 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active)); 3173 return sprintf(page, "%lu / %lu\n", resync, max_blocks); 3174 } 3175 3176 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); 3177 3178 static ssize_t 3179 min_sync_show(mddev_t *mddev, char *page) 3180 { 3181 return sprintf(page, "%llu\n", 3182 (unsigned long long)mddev->resync_min); 3183 } 3184 static ssize_t 3185 min_sync_store(mddev_t *mddev, const char *buf, size_t len) 3186 { 3187 unsigned long long min; 3188 if (strict_strtoull(buf, 10, &min)) 3189 return -EINVAL; 3190 if (min > mddev->resync_max) 3191 return -EINVAL; 3192 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3193 return -EBUSY; 3194 3195 /* Must be a multiple of chunk_size */ 3196 if (mddev->chunk_size) { 3197 if (min & (sector_t)((mddev->chunk_size>>9)-1)) 3198 return -EINVAL; 3199 } 3200 mddev->resync_min = min; 3201 3202 return len; 3203 } 3204 3205 static struct md_sysfs_entry md_min_sync = 3206 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 3207 3208 static ssize_t 3209 max_sync_show(mddev_t *mddev, char *page) 3210 { 3211 if (mddev->resync_max == MaxSector) 3212 return sprintf(page, "max\n"); 3213 else 3214 return sprintf(page, "%llu\n", 3215 (unsigned long long)mddev->resync_max); 3216 } 3217 static ssize_t 3218 max_sync_store(mddev_t *mddev, const char *buf, size_t len) 3219 { 3220 if (strncmp(buf, "max", 3) == 0) 3221 mddev->resync_max = MaxSector; 3222 else { 3223 unsigned long long max; 3224 if (strict_strtoull(buf, 10, &max)) 3225 return -EINVAL; 3226 if (max < mddev->resync_min) 3227 return -EINVAL; 3228 if (max < mddev->resync_max && 3229 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3230 return -EBUSY; 3231 3232 /* Must be a multiple of chunk_size */ 3233 if (mddev->chunk_size) { 3234 if (max & (sector_t)((mddev->chunk_size>>9)-1)) 3235 return -EINVAL; 3236 } 3237 mddev->resync_max = max; 3238 } 3239 wake_up(&mddev->recovery_wait); 3240 return len; 3241 } 3242 3243 static struct md_sysfs_entry md_max_sync = 3244 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 3245 3246 static ssize_t 3247 suspend_lo_show(mddev_t *mddev, char *page) 3248 { 3249 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 3250 } 3251 3252 static ssize_t 3253 suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) 3254 { 3255 char *e; 3256 unsigned long long new = simple_strtoull(buf, &e, 10); 3257 3258 if (mddev->pers->quiesce == NULL) 3259 return -EINVAL; 3260 if (buf == e || (*e && *e != '\n')) 3261 return -EINVAL; 3262 if (new >= mddev->suspend_hi || 3263 (new > mddev->suspend_lo && new < mddev->suspend_hi)) { 3264 mddev->suspend_lo = new; 3265 mddev->pers->quiesce(mddev, 2); 3266 return len; 3267 } else 3268 return -EINVAL; 3269 } 3270 static struct md_sysfs_entry md_suspend_lo = 3271 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 3272 3273 3274 static ssize_t 3275 suspend_hi_show(mddev_t *mddev, char *page) 3276 { 3277 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); 3278 } 3279 3280 static ssize_t 3281 suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) 3282 { 3283 char *e; 3284 unsigned long long new = simple_strtoull(buf, &e, 10); 3285 3286 if (mddev->pers->quiesce == NULL) 3287 return -EINVAL; 3288 if (buf == e || (*e && *e != '\n')) 3289 return -EINVAL; 3290 if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) || 3291 (new > mddev->suspend_lo 
&& new > mddev->suspend_hi)) { 3292 mddev->suspend_hi = new; 3293 mddev->pers->quiesce(mddev, 1); 3294 mddev->pers->quiesce(mddev, 0); 3295 return len; 3296 } else 3297 return -EINVAL; 3298 } 3299 static struct md_sysfs_entry md_suspend_hi = 3300 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 3301 3302 static ssize_t 3303 reshape_position_show(mddev_t *mddev, char *page) 3304 { 3305 if (mddev->reshape_position != MaxSector) 3306 return sprintf(page, "%llu\n", 3307 (unsigned long long)mddev->reshape_position); 3308 strcpy(page, "none\n"); 3309 return 5; 3310 } 3311 3312 static ssize_t 3313 reshape_position_store(mddev_t *mddev, const char *buf, size_t len) 3314 { 3315 char *e; 3316 unsigned long long new = simple_strtoull(buf, &e, 10); 3317 if (mddev->pers) 3318 return -EBUSY; 3319 if (buf == e || (*e && *e != '\n')) 3320 return -EINVAL; 3321 mddev->reshape_position = new; 3322 mddev->delta_disks = 0; 3323 mddev->new_level = mddev->level; 3324 mddev->new_layout = mddev->layout; 3325 mddev->new_chunk = mddev->chunk_size; 3326 return len; 3327 } 3328 3329 static struct md_sysfs_entry md_reshape_position = 3330 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 3331 reshape_position_store); 3332 3333 3334 static struct attribute *md_default_attrs[] = { 3335 &md_level.attr, 3336 &md_layout.attr, 3337 &md_raid_disks.attr, 3338 &md_chunk_size.attr, 3339 &md_size.attr, 3340 &md_resync_start.attr, 3341 &md_metadata.attr, 3342 &md_new_device.attr, 3343 &md_safe_delay.attr, 3344 &md_array_state.attr, 3345 &md_reshape_position.attr, 3346 NULL, 3347 }; 3348 3349 static struct attribute *md_redundancy_attrs[] = { 3350 &md_scan_mode.attr, 3351 &md_mismatches.attr, 3352 &md_sync_min.attr, 3353 &md_sync_max.attr, 3354 &md_sync_speed.attr, 3355 &md_sync_force_parallel.attr, 3356 &md_sync_completed.attr, 3357 &md_min_sync.attr, 3358 &md_max_sync.attr, 3359 &md_suspend_lo.attr, 3360 &md_suspend_hi.attr, 3361 &md_bitmap.attr, 3362 &md_degraded.attr, 3363 NULL, 3364 }; 3365 static struct attribute_group md_redundancy_group = { 3366 .name = NULL, 3367 .attrs = md_redundancy_attrs, 3368 }; 3369 3370 3371 static ssize_t 3372 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3373 { 3374 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 3375 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 3376 ssize_t rv; 3377 3378 if (!entry->show) 3379 return -EIO; 3380 rv = mddev_lock(mddev); 3381 if (!rv) { 3382 rv = entry->show(mddev, page); 3383 mddev_unlock(mddev); 3384 } 3385 return rv; 3386 } 3387 3388 static ssize_t 3389 md_attr_store(struct kobject *kobj, struct attribute *attr, 3390 const char *page, size_t length) 3391 { 3392 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 3393 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 3394 ssize_t rv; 3395 3396 if (!entry->store) 3397 return -EIO; 3398 if (!capable(CAP_SYS_ADMIN)) 3399 return -EACCES; 3400 rv = mddev_lock(mddev); 3401 if (!rv) { 3402 rv = entry->store(mddev, page, length); 3403 mddev_unlock(mddev); 3404 } 3405 return rv; 3406 } 3407 3408 static void md_free(struct kobject *ko) 3409 { 3410 mddev_t *mddev = container_of(ko, mddev_t, kobj); 3411 kfree(mddev); 3412 } 3413 3414 static struct sysfs_ops md_sysfs_ops = { 3415 .show = md_attr_show, 3416 .store = md_attr_store, 3417 }; 3418 static struct kobj_type md_ktype = { 3419 .release = md_free, 3420 .sysfs_ops = &md_sysfs_ops, 3421 .default_attrs = md_default_attrs, 3422 }; 
3423 3424 int mdp_major = 0; 3425 3426 static struct kobject *md_probe(dev_t dev, int *part, void *data) 3427 { 3428 static DEFINE_MUTEX(disks_mutex); 3429 mddev_t *mddev = mddev_find(dev); 3430 struct gendisk *disk; 3431 int partitioned = (MAJOR(dev) != MD_MAJOR); 3432 int shift = partitioned ? MdpMinorShift : 0; 3433 int unit = MINOR(dev) >> shift; 3434 int error; 3435 3436 if (!mddev) 3437 return NULL; 3438 3439 mutex_lock(&disks_mutex); 3440 if (mddev->gendisk) { 3441 mutex_unlock(&disks_mutex); 3442 mddev_put(mddev); 3443 return NULL; 3444 } 3445 disk = alloc_disk(1 << shift); 3446 if (!disk) { 3447 mutex_unlock(&disks_mutex); 3448 mddev_put(mddev); 3449 return NULL; 3450 } 3451 disk->major = MAJOR(dev); 3452 disk->first_minor = unit << shift; 3453 if (partitioned) 3454 sprintf(disk->disk_name, "md_d%d", unit); 3455 else 3456 sprintf(disk->disk_name, "md%d", unit); 3457 disk->fops = &md_fops; 3458 disk->private_data = mddev; 3459 disk->queue = mddev->queue; 3460 add_disk(disk); 3461 mddev->gendisk = disk; 3462 error = kobject_init_and_add(&mddev->kobj, &md_ktype, 3463 &disk_to_dev(disk)->kobj, "%s", "md"); 3464 mutex_unlock(&disks_mutex); 3465 if (error) 3466 printk(KERN_WARNING "md: cannot register %s/md - name in use\n", 3467 disk->disk_name); 3468 else 3469 kobject_uevent(&mddev->kobj, KOBJ_ADD); 3470 return NULL; 3471 } 3472 3473 static void md_safemode_timeout(unsigned long data) 3474 { 3475 mddev_t *mddev = (mddev_t *) data; 3476 3477 if (!atomic_read(&mddev->writes_pending)) { 3478 mddev->safemode = 1; 3479 if (mddev->external) 3480 set_bit(MD_NOTIFY_ARRAY_STATE, &mddev->flags); 3481 } 3482 md_wakeup_thread(mddev->thread); 3483 } 3484 3485 static int start_dirty_degraded; 3486 3487 static int do_md_run(mddev_t * mddev) 3488 { 3489 int err; 3490 int chunk_size; 3491 struct list_head *tmp; 3492 mdk_rdev_t *rdev; 3493 struct gendisk *disk; 3494 struct mdk_personality *pers; 3495 char b[BDEVNAME_SIZE]; 3496 3497 if (list_empty(&mddev->disks)) 3498 /* cannot run an array with no devices.. */ 3499 return -EINVAL; 3500 3501 if (mddev->pers) 3502 return -EBUSY; 3503 3504 /* 3505 * Analyze all RAID superblock(s) 3506 */ 3507 if (!mddev->raid_disks) { 3508 if (!mddev->persistent) 3509 return -EINVAL; 3510 analyze_sbs(mddev); 3511 } 3512 3513 chunk_size = mddev->chunk_size; 3514 3515 if (chunk_size) { 3516 if (chunk_size > MAX_CHUNK_SIZE) { 3517 printk(KERN_ERR "too big chunk_size: %d > %d\n", 3518 chunk_size, MAX_CHUNK_SIZE); 3519 return -EINVAL; 3520 } 3521 /* 3522 * chunk-size has to be a power of 2 3523 */ 3524 if ( (1 << ffz(~chunk_size)) != chunk_size) { 3525 printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size); 3526 return -EINVAL; 3527 } 3528 3529 /* devices must have minimum size of one chunk */ 3530 rdev_for_each(rdev, tmp, mddev) { 3531 if (test_bit(Faulty, &rdev->flags)) 3532 continue; 3533 if (rdev->size < chunk_size / 1024) { 3534 printk(KERN_WARNING 3535 "md: Dev %s smaller than chunk_size:" 3536 " %lluk < %dk\n", 3537 bdevname(rdev->bdev,b), 3538 (unsigned long long)rdev->size, 3539 chunk_size / 1024); 3540 return -EINVAL; 3541 } 3542 } 3543 } 3544 3545 if (mddev->level != LEVEL_NONE) 3546 request_module("md-level-%d", mddev->level); 3547 else if (mddev->clevel[0]) 3548 request_module("md-%s", mddev->clevel); 3549 3550 /* 3551 * Drop all container device buffers, from now on 3552 * the only valid external interface is through the md 3553 * device. 
3554 */ 3555 rdev_for_each(rdev, tmp, mddev) { 3556 if (test_bit(Faulty, &rdev->flags)) 3557 continue; 3558 sync_blockdev(rdev->bdev); 3559 invalidate_bdev(rdev->bdev); 3560 3561 /* perform some consistency tests on the device. 3562 * We don't want the data to overlap the metadata; 3563 * Internal Bitmap issues have been handled elsewhere. 3564 */ 3565 if (rdev->data_offset < rdev->sb_start) { 3566 if (mddev->size && 3567 rdev->data_offset + mddev->size*2 3568 > rdev->sb_start) { 3569 printk("md: %s: data overlaps metadata\n", 3570 mdname(mddev)); 3571 return -EINVAL; 3572 } 3573 } else { 3574 if (rdev->sb_start + rdev->sb_size/512 3575 > rdev->data_offset) { 3576 printk("md: %s: metadata overlaps data\n", 3577 mdname(mddev)); 3578 return -EINVAL; 3579 } 3580 } 3581 sysfs_notify(&rdev->kobj, NULL, "state"); 3582 } 3583 3584 md_probe(mddev->unit, NULL, NULL); 3585 disk = mddev->gendisk; 3586 if (!disk) 3587 return -ENOMEM; 3588 3589 spin_lock(&pers_lock); 3590 pers = find_pers(mddev->level, mddev->clevel); 3591 if (!pers || !try_module_get(pers->owner)) { 3592 spin_unlock(&pers_lock); 3593 if (mddev->level != LEVEL_NONE) 3594 printk(KERN_WARNING "md: personality for level %d is not loaded!\n", 3595 mddev->level); 3596 else 3597 printk(KERN_WARNING "md: personality for level %s is not loaded!\n", 3598 mddev->clevel); 3599 return -EINVAL; 3600 } 3601 mddev->pers = pers; 3602 spin_unlock(&pers_lock); 3603 mddev->level = pers->level; 3604 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 3605 3606 if (mddev->reshape_position != MaxSector && 3607 pers->start_reshape == NULL) { 3608 /* This personality cannot handle reshaping... */ 3609 mddev->pers = NULL; 3610 module_put(pers->owner); 3611 return -EINVAL; 3612 } 3613 3614 if (pers->sync_request) { 3615 /* Warn if this is a potentially silly 3616 * configuration.
3617 */ 3618 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 3619 mdk_rdev_t *rdev2; 3620 struct list_head *tmp2; 3621 int warned = 0; 3622 rdev_for_each(rdev, tmp, mddev) { 3623 rdev_for_each(rdev2, tmp2, mddev) { 3624 if (rdev < rdev2 && 3625 rdev->bdev->bd_contains == 3626 rdev2->bdev->bd_contains) { 3627 printk(KERN_WARNING 3628 "%s: WARNING: %s appears to be" 3629 " on the same physical disk as" 3630 " %s.\n", 3631 mdname(mddev), 3632 bdevname(rdev->bdev,b), 3633 bdevname(rdev2->bdev,b2)); 3634 warned = 1; 3635 } 3636 } 3637 } 3638 if (warned) 3639 printk(KERN_WARNING 3640 "True protection against single-disk" 3641 " failure might be compromised.\n"); 3642 } 3643 3644 mddev->recovery = 0; 3645 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ 3646 mddev->barriers_work = 1; 3647 mddev->ok_start_degraded = start_dirty_degraded; 3648 3649 if (start_readonly) 3650 mddev->ro = 2; /* read-only, but switch on first write */ 3651 3652 err = mddev->pers->run(mddev); 3653 if (err) 3654 printk(KERN_ERR "md: pers->run() failed ...\n"); 3655 else if (mddev->pers->sync_request) { 3656 err = bitmap_create(mddev); 3657 if (err) { 3658 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 3659 mdname(mddev), err); 3660 mddev->pers->stop(mddev); 3661 } 3662 } 3663 if (err) { 3664 module_put(mddev->pers->owner); 3665 mddev->pers = NULL; 3666 bitmap_destroy(mddev); 3667 return err; 3668 } 3669 if (mddev->pers->sync_request) { 3670 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 3671 printk(KERN_WARNING 3672 "md: cannot register extra attributes for %s\n", 3673 mdname(mddev)); 3674 } else if (mddev->ro == 2) /* auto-readonly not meaningful */ 3675 mddev->ro = 0; 3676 3677 atomic_set(&mddev->writes_pending,0); 3678 mddev->safemode = 0; 3679 mddev->safemode_timer.function = md_safemode_timeout; 3680 mddev->safemode_timer.data = (unsigned long) mddev; 3681 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 3682 mddev->in_sync = 1; 3683 3684 rdev_for_each(rdev, tmp, mddev) 3685 if (rdev->raid_disk >= 0) { 3686 char nm[20]; 3687 sprintf(nm, "rd%d", rdev->raid_disk); 3688 if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) 3689 printk("md: cannot register %s for %s\n", 3690 nm, mdname(mddev)); 3691 } 3692 3693 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3694 3695 if (mddev->flags) 3696 md_update_sb(mddev, 0); 3697 3698 set_capacity(disk, mddev->array_sectors); 3699 3700 /* If we call blk_queue_make_request here, it will 3701 * re-initialise max_sectors etc which may have been 3702 * refined inside -> run. So just set the bits we need to set. 3703 * Most initialisation happened when we called 3704 * blk_queue_make_request(..., md_fail_request) 3705 * earlier. 3706 */ 3707 mddev->queue->queuedata = mddev; 3708 mddev->queue->make_request_fn = mddev->pers->make_request; 3709 3710 /* If there is a partially-recovered drive we need to 3711 * start recovery here.
If we leave it to md_check_recovery, 3712 * it will remove the drives and not do the right thing 3713 */ 3714 if (mddev->degraded && !mddev->sync_thread) { 3715 struct list_head *rtmp; 3716 int spares = 0; 3717 rdev_for_each(rdev, rtmp, mddev) 3718 if (rdev->raid_disk >= 0 && 3719 !test_bit(In_sync, &rdev->flags) && 3720 !test_bit(Faulty, &rdev->flags)) 3721 /* complete an interrupted recovery */ 3722 spares++; 3723 if (spares && mddev->pers->sync_request) { 3724 mddev->recovery = 0; 3725 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 3726 mddev->sync_thread = md_register_thread(md_do_sync, 3727 mddev, 3728 "%s_resync"); 3729 if (!mddev->sync_thread) { 3730 printk(KERN_ERR "%s: could not start resync" 3731 " thread...\n", 3732 mdname(mddev)); 3733 /* leave the spares where they are, it shouldn't hurt */ 3734 mddev->recovery = 0; 3735 } 3736 } 3737 } 3738 md_wakeup_thread(mddev->thread); 3739 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 3740 3741 mddev->changed = 1; 3742 md_new_event(mddev); 3743 sysfs_notify(&mddev->kobj, NULL, "array_state"); 3744 sysfs_notify(&mddev->kobj, NULL, "sync_action"); 3745 sysfs_notify(&mddev->kobj, NULL, "degraded"); 3746 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 3747 return 0; 3748 } 3749 3750 static int restart_array(mddev_t *mddev) 3751 { 3752 struct gendisk *disk = mddev->gendisk; 3753 3754 /* Complain if it has no devices */ 3755 if (list_empty(&mddev->disks)) 3756 return -ENXIO; 3757 if (!mddev->pers) 3758 return -EINVAL; 3759 if (!mddev->ro) 3760 return -EBUSY; 3761 mddev->safemode = 0; 3762 mddev->ro = 0; 3763 set_disk_ro(disk, 0); 3764 printk(KERN_INFO "md: %s switched to read-write mode.\n", 3765 mdname(mddev)); 3766 /* Kick recovery or resync if necessary */ 3767 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3768 md_wakeup_thread(mddev->thread); 3769 md_wakeup_thread(mddev->sync_thread); 3770 sysfs_notify(&mddev->kobj, NULL, "array_state"); 3771 return 0; 3772 } 3773 3774 /* similar to deny_write_access, but accounts for our holding a reference 3775 * to the file ourselves */ 3776 static int deny_bitmap_write_access(struct file * file) 3777 { 3778 struct inode *inode = file->f_mapping->host; 3779 3780 spin_lock(&inode->i_lock); 3781 if (atomic_read(&inode->i_writecount) > 1) { 3782 spin_unlock(&inode->i_lock); 3783 return -ETXTBSY; 3784 } 3785 atomic_set(&inode->i_writecount, -1); 3786 spin_unlock(&inode->i_lock); 3787 3788 return 0; 3789 } 3790 3791 static void restore_bitmap_write_access(struct file *file) 3792 { 3793 struct inode *inode = file->f_mapping->host; 3794 3795 spin_lock(&inode->i_lock); 3796 atomic_set(&inode->i_writecount, 1); 3797 spin_unlock(&inode->i_lock); 3798 } 3799 3800 /* mode: 3801 * 0 - completely stop and dis-assemble array 3802 * 1 - switch to readonly 3803 * 2 - stop but do not disassemble array 3804 */ 3805 static int do_md_stop(mddev_t * mddev, int mode, int is_open) 3806 { 3807 int err = 0; 3808 struct gendisk *disk = mddev->gendisk; 3809 3810 if (atomic_read(&mddev->openers) > is_open) { 3811 printk("md: %s still in use.\n",mdname(mddev)); 3812 return -EBUSY; 3813 } 3814 3815 if (mddev->pers) { 3816 3817 if (mddev->sync_thread) { 3818 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3819 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3820 md_unregister_thread(mddev->sync_thread); 3821 mddev->sync_thread = NULL; 3822 } 3823 3824 del_timer_sync(&mddev->safemode_timer); 3825 3826 switch(mode) { 3827 case 1: /* readonly */ 3828 err = -ENXIO; 3829 if (mddev->ro==1) 3830 goto 
out; 3831 mddev->ro = 1; 3832 break; 3833 case 0: /* disassemble */ 3834 case 2: /* stop */ 3835 bitmap_flush(mddev); 3836 md_super_wait(mddev); 3837 if (mddev->ro) 3838 set_disk_ro(disk, 0); 3839 blk_queue_make_request(mddev->queue, md_fail_request); 3840 mddev->pers->stop(mddev); 3841 mddev->queue->merge_bvec_fn = NULL; 3842 mddev->queue->unplug_fn = NULL; 3843 mddev->queue->backing_dev_info.congested_fn = NULL; 3844 if (mddev->pers->sync_request) 3845 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 3846 3847 module_put(mddev->pers->owner); 3848 mddev->pers = NULL; 3849 /* tell userspace to handle 'inactive' */ 3850 sysfs_notify(&mddev->kobj, NULL, "array_state"); 3851 3852 set_capacity(disk, 0); 3853 mddev->changed = 1; 3854 3855 if (mddev->ro) 3856 mddev->ro = 0; 3857 } 3858 if (!mddev->in_sync || mddev->flags) { 3859 /* mark array as shutdown cleanly */ 3860 mddev->in_sync = 1; 3861 md_update_sb(mddev, 1); 3862 } 3863 if (mode == 1) 3864 set_disk_ro(disk, 1); 3865 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3866 } 3867 3868 /* 3869 * Free resources if final stop 3870 */ 3871 if (mode == 0) { 3872 mdk_rdev_t *rdev; 3873 struct list_head *tmp; 3874 3875 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 3876 3877 bitmap_destroy(mddev); 3878 if (mddev->bitmap_file) { 3879 restore_bitmap_write_access(mddev->bitmap_file); 3880 fput(mddev->bitmap_file); 3881 mddev->bitmap_file = NULL; 3882 } 3883 mddev->bitmap_offset = 0; 3884 3885 rdev_for_each(rdev, tmp, mddev) 3886 if (rdev->raid_disk >= 0) { 3887 char nm[20]; 3888 sprintf(nm, "rd%d", rdev->raid_disk); 3889 sysfs_remove_link(&mddev->kobj, nm); 3890 } 3891 3892 /* make sure all md_delayed_delete calls have finished */ 3893 flush_scheduled_work(); 3894 3895 export_array(mddev); 3896 3897 mddev->array_sectors = 0; 3898 mddev->size = 0; 3899 mddev->raid_disks = 0; 3900 mddev->recovery_cp = 0; 3901 mddev->resync_min = 0; 3902 mddev->resync_max = MaxSector; 3903 mddev->reshape_position = MaxSector; 3904 mddev->external = 0; 3905 mddev->persistent = 0; 3906 mddev->level = LEVEL_NONE; 3907 mddev->clevel[0] = 0; 3908 mddev->flags = 0; 3909 mddev->ro = 0; 3910 mddev->metadata_type[0] = 0; 3911 mddev->chunk_size = 0; 3912 mddev->ctime = mddev->utime = 0; 3913 mddev->layout = 0; 3914 mddev->max_disks = 0; 3915 mddev->events = 0; 3916 mddev->delta_disks = 0; 3917 mddev->new_level = LEVEL_NONE; 3918 mddev->new_layout = 0; 3919 mddev->new_chunk = 0; 3920 mddev->curr_resync = 0; 3921 mddev->resync_mismatches = 0; 3922 mddev->suspend_lo = mddev->suspend_hi = 0; 3923 mddev->sync_speed_min = mddev->sync_speed_max = 0; 3924 mddev->recovery = 0; 3925 mddev->in_sync = 0; 3926 mddev->changed = 0; 3927 mddev->degraded = 0; 3928 mddev->barriers_work = 0; 3929 mddev->safemode = 0; 3930 3931 } else if (mddev->pers) 3932 printk(KERN_INFO "md: %s switched to read-only mode.\n", 3933 mdname(mddev)); 3934 err = 0; 3935 md_new_event(mddev); 3936 sysfs_notify(&mddev->kobj, NULL, "array_state"); 3937 out: 3938 return err; 3939 } 3940 3941 #ifndef MODULE 3942 static void autorun_array(mddev_t *mddev) 3943 { 3944 mdk_rdev_t *rdev; 3945 struct list_head *tmp; 3946 int err; 3947 3948 if (list_empty(&mddev->disks)) 3949 return; 3950 3951 printk(KERN_INFO "md: running: "); 3952 3953 rdev_for_each(rdev, tmp, mddev) { 3954 char b[BDEVNAME_SIZE]; 3955 printk("<%s>", bdevname(rdev->bdev,b)); 3956 } 3957 printk("\n"); 3958 3959 err = do_md_run(mddev); 3960 if (err) { 3961 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 3962 do_md_stop(mddev, 0, 0); 3963 } 
3964 } 3965 3966 /* 3967 * lets try to run arrays based on all disks that have arrived 3968 * until now. (those are in pending_raid_disks) 3969 * 3970 * the method: pick the first pending disk, collect all disks with 3971 * the same UUID, remove all from the pending list and put them into 3972 * the 'same_array' list. Then order this list based on superblock 3973 * update time (freshest comes first), kick out 'old' disks and 3974 * compare superblocks. If everything's fine then run it. 3975 * 3976 * If "unit" is allocated, then bump its reference count 3977 */ 3978 static void autorun_devices(int part) 3979 { 3980 struct list_head *tmp; 3981 mdk_rdev_t *rdev0, *rdev; 3982 mddev_t *mddev; 3983 char b[BDEVNAME_SIZE]; 3984 3985 printk(KERN_INFO "md: autorun ...\n"); 3986 while (!list_empty(&pending_raid_disks)) { 3987 int unit; 3988 dev_t dev; 3989 LIST_HEAD(candidates); 3990 rdev0 = list_entry(pending_raid_disks.next, 3991 mdk_rdev_t, same_set); 3992 3993 printk(KERN_INFO "md: considering %s ...\n", 3994 bdevname(rdev0->bdev,b)); 3995 INIT_LIST_HEAD(&candidates); 3996 rdev_for_each_list(rdev, tmp, pending_raid_disks) 3997 if (super_90_load(rdev, rdev0, 0) >= 0) { 3998 printk(KERN_INFO "md: adding %s ...\n", 3999 bdevname(rdev->bdev,b)); 4000 list_move(&rdev->same_set, &candidates); 4001 } 4002 /* 4003 * now we have a set of devices, with all of them having 4004 * mostly sane superblocks. It's time to allocate the 4005 * mddev. 4006 */ 4007 if (part) { 4008 dev = MKDEV(mdp_major, 4009 rdev0->preferred_minor << MdpMinorShift); 4010 unit = MINOR(dev) >> MdpMinorShift; 4011 } else { 4012 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 4013 unit = MINOR(dev); 4014 } 4015 if (rdev0->preferred_minor != unit) { 4016 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 4017 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 4018 break; 4019 } 4020 4021 md_probe(dev, NULL, NULL); 4022 mddev = mddev_find(dev); 4023 if (!mddev || !mddev->gendisk) { 4024 if (mddev) 4025 mddev_put(mddev); 4026 printk(KERN_ERR 4027 "md: cannot allocate memory for md drive.\n"); 4028 break; 4029 } 4030 if (mddev_lock(mddev)) 4031 printk(KERN_WARNING "md: %s locked, cannot run\n", 4032 mdname(mddev)); 4033 else if (mddev->raid_disks || mddev->major_version 4034 || !list_empty(&mddev->disks)) { 4035 printk(KERN_WARNING 4036 "md: %s already running, cannot run %s\n", 4037 mdname(mddev), bdevname(rdev0->bdev,b)); 4038 mddev_unlock(mddev); 4039 } else { 4040 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 4041 mddev->persistent = 1; 4042 rdev_for_each_list(rdev, tmp, candidates) { 4043 list_del_init(&rdev->same_set); 4044 if (bind_rdev_to_array(rdev, mddev)) 4045 export_rdev(rdev); 4046 } 4047 autorun_array(mddev); 4048 mddev_unlock(mddev); 4049 } 4050 /* on success, candidates will be empty, on error 4051 * it won't... 4052 */ 4053 rdev_for_each_list(rdev, tmp, candidates) { 4054 list_del_init(&rdev->same_set); 4055 export_rdev(rdev); 4056 } 4057 mddev_put(mddev); 4058 } 4059 printk(KERN_INFO "md: ... 
autorun DONE.\n"); 4060 } 4061 #endif /* !MODULE */ 4062 4063 static int get_version(void __user * arg) 4064 { 4065 mdu_version_t ver; 4066 4067 ver.major = MD_MAJOR_VERSION; 4068 ver.minor = MD_MINOR_VERSION; 4069 ver.patchlevel = MD_PATCHLEVEL_VERSION; 4070 4071 if (copy_to_user(arg, &ver, sizeof(ver))) 4072 return -EFAULT; 4073 4074 return 0; 4075 } 4076 4077 static int get_array_info(mddev_t * mddev, void __user * arg) 4078 { 4079 mdu_array_info_t info; 4080 int nr,working,active,failed,spare; 4081 mdk_rdev_t *rdev; 4082 struct list_head *tmp; 4083 4084 nr=working=active=failed=spare=0; 4085 rdev_for_each(rdev, tmp, mddev) { 4086 nr++; 4087 if (test_bit(Faulty, &rdev->flags)) 4088 failed++; 4089 else { 4090 working++; 4091 if (test_bit(In_sync, &rdev->flags)) 4092 active++; 4093 else 4094 spare++; 4095 } 4096 } 4097 4098 info.major_version = mddev->major_version; 4099 info.minor_version = mddev->minor_version; 4100 info.patch_version = MD_PATCHLEVEL_VERSION; 4101 info.ctime = mddev->ctime; 4102 info.level = mddev->level; 4103 info.size = mddev->size; 4104 if (info.size != mddev->size) /* overflow */ 4105 info.size = -1; 4106 info.nr_disks = nr; 4107 info.raid_disks = mddev->raid_disks; 4108 info.md_minor = mddev->md_minor; 4109 info.not_persistent= !mddev->persistent; 4110 4111 info.utime = mddev->utime; 4112 info.state = 0; 4113 if (mddev->in_sync) 4114 info.state = (1<<MD_SB_CLEAN); 4115 if (mddev->bitmap && mddev->bitmap_offset) 4116 info.state = (1<<MD_SB_BITMAP_PRESENT); 4117 info.active_disks = active; 4118 info.working_disks = working; 4119 info.failed_disks = failed; 4120 info.spare_disks = spare; 4121 4122 info.layout = mddev->layout; 4123 info.chunk_size = mddev->chunk_size; 4124 4125 if (copy_to_user(arg, &info, sizeof(info))) 4126 return -EFAULT; 4127 4128 return 0; 4129 } 4130 4131 static int get_bitmap_file(mddev_t * mddev, void __user * arg) 4132 { 4133 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 4134 char *ptr, *buf = NULL; 4135 int err = -ENOMEM; 4136 4137 if (md_allow_write(mddev)) 4138 file = kmalloc(sizeof(*file), GFP_NOIO); 4139 else 4140 file = kmalloc(sizeof(*file), GFP_KERNEL); 4141 4142 if (!file) 4143 goto out; 4144 4145 /* bitmap disabled, zero the first byte and copy out */ 4146 if (!mddev->bitmap || !mddev->bitmap->file) { 4147 file->pathname[0] = '\0'; 4148 goto copy_out; 4149 } 4150 4151 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); 4152 if (!buf) 4153 goto out; 4154 4155 ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname)); 4156 if (IS_ERR(ptr)) 4157 goto out; 4158 4159 strcpy(file->pathname, ptr); 4160 4161 copy_out: 4162 err = 0; 4163 if (copy_to_user(arg, file, sizeof(*file))) 4164 err = -EFAULT; 4165 out: 4166 kfree(buf); 4167 kfree(file); 4168 return err; 4169 } 4170 4171 static int get_disk_info(mddev_t * mddev, void __user * arg) 4172 { 4173 mdu_disk_info_t info; 4174 mdk_rdev_t *rdev; 4175 4176 if (copy_from_user(&info, arg, sizeof(info))) 4177 return -EFAULT; 4178 4179 rdev = find_rdev_nr(mddev, info.number); 4180 if (rdev) { 4181 info.major = MAJOR(rdev->bdev->bd_dev); 4182 info.minor = MINOR(rdev->bdev->bd_dev); 4183 info.raid_disk = rdev->raid_disk; 4184 info.state = 0; 4185 if (test_bit(Faulty, &rdev->flags)) 4186 info.state |= (1<<MD_DISK_FAULTY); 4187 else if (test_bit(In_sync, &rdev->flags)) { 4188 info.state |= (1<<MD_DISK_ACTIVE); 4189 info.state |= (1<<MD_DISK_SYNC); 4190 } 4191 if (test_bit(WriteMostly, &rdev->flags)) 4192 info.state |= (1<<MD_DISK_WRITEMOSTLY); 4193 } else { 4194 
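		/* No rdev has this ->desc_nr: report the slot as empty, so a
		 * caller that simply iterates over disk numbers sees it as a
		 * removed device rather than getting an error.
		 */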
info.major = info.minor = 0; 4195 info.raid_disk = -1; 4196 info.state = (1<<MD_DISK_REMOVED); 4197 } 4198 4199 if (copy_to_user(arg, &info, sizeof(info))) 4200 return -EFAULT; 4201 4202 return 0; 4203 } 4204 4205 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) 4206 { 4207 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 4208 mdk_rdev_t *rdev; 4209 dev_t dev = MKDEV(info->major,info->minor); 4210 4211 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 4212 return -EOVERFLOW; 4213 4214 if (!mddev->raid_disks) { 4215 int err; 4216 /* expecting a device which has a superblock */ 4217 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 4218 if (IS_ERR(rdev)) { 4219 printk(KERN_WARNING 4220 "md: md_import_device returned %ld\n", 4221 PTR_ERR(rdev)); 4222 return PTR_ERR(rdev); 4223 } 4224 if (!list_empty(&mddev->disks)) { 4225 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 4226 mdk_rdev_t, same_set); 4227 int err = super_types[mddev->major_version] 4228 .load_super(rdev, rdev0, mddev->minor_version); 4229 if (err < 0) { 4230 printk(KERN_WARNING 4231 "md: %s has different UUID to %s\n", 4232 bdevname(rdev->bdev,b), 4233 bdevname(rdev0->bdev,b2)); 4234 export_rdev(rdev); 4235 return -EINVAL; 4236 } 4237 } 4238 err = bind_rdev_to_array(rdev, mddev); 4239 if (err) 4240 export_rdev(rdev); 4241 return err; 4242 } 4243 4244 /* 4245 * add_new_disk can be used once the array is assembled 4246 * to add "hot spares". They must already have a superblock 4247 * written 4248 */ 4249 if (mddev->pers) { 4250 int err; 4251 if (!mddev->pers->hot_add_disk) { 4252 printk(KERN_WARNING 4253 "%s: personality does not support diskops!\n", 4254 mdname(mddev)); 4255 return -EINVAL; 4256 } 4257 if (mddev->persistent) 4258 rdev = md_import_device(dev, mddev->major_version, 4259 mddev->minor_version); 4260 else 4261 rdev = md_import_device(dev, -1, -1); 4262 if (IS_ERR(rdev)) { 4263 printk(KERN_WARNING 4264 "md: md_import_device returned %ld\n", 4265 PTR_ERR(rdev)); 4266 return PTR_ERR(rdev); 4267 } 4268 /* set save_raid_disk if appropriate */ 4269 if (!mddev->persistent) { 4270 if (info->state & (1<<MD_DISK_SYNC) && 4271 info->raid_disk < mddev->raid_disks) 4272 rdev->raid_disk = info->raid_disk; 4273 else 4274 rdev->raid_disk = -1; 4275 } else 4276 super_types[mddev->major_version]. 4277 validate_super(mddev, rdev); 4278 rdev->saved_raid_disk = rdev->raid_disk; 4279 4280 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 4281 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 4282 set_bit(WriteMostly, &rdev->flags); 4283 4284 rdev->raid_disk = -1; 4285 err = bind_rdev_to_array(rdev, mddev); 4286 if (!err && !mddev->pers->hot_remove_disk) { 4287 /* If there is hot_add_disk but no hot_remove_disk 4288 * then added disks for geometry changes, 4289 * and should be added immediately. 4290 */ 4291 super_types[mddev->major_version]. 
4292 validate_super(mddev, rdev); 4293 err = mddev->pers->hot_add_disk(mddev, rdev); 4294 if (err) 4295 unbind_rdev_from_array(rdev); 4296 } 4297 if (err) 4298 export_rdev(rdev); 4299 else 4300 sysfs_notify(&rdev->kobj, NULL, "state"); 4301 4302 md_update_sb(mddev, 1); 4303 if (mddev->degraded) 4304 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 4305 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4306 md_wakeup_thread(mddev->thread); 4307 return err; 4308 } 4309 4310 /* otherwise, add_new_disk is only allowed 4311 * for major_version==0 superblocks 4312 */ 4313 if (mddev->major_version != 0) { 4314 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", 4315 mdname(mddev)); 4316 return -EINVAL; 4317 } 4318 4319 if (!(info->state & (1<<MD_DISK_FAULTY))) { 4320 int err; 4321 rdev = md_import_device(dev, -1, 0); 4322 if (IS_ERR(rdev)) { 4323 printk(KERN_WARNING 4324 "md: error, md_import_device() returned %ld\n", 4325 PTR_ERR(rdev)); 4326 return PTR_ERR(rdev); 4327 } 4328 rdev->desc_nr = info->number; 4329 if (info->raid_disk < mddev->raid_disks) 4330 rdev->raid_disk = info->raid_disk; 4331 else 4332 rdev->raid_disk = -1; 4333 4334 if (rdev->raid_disk < mddev->raid_disks) 4335 if (info->state & (1<<MD_DISK_SYNC)) 4336 set_bit(In_sync, &rdev->flags); 4337 4338 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 4339 set_bit(WriteMostly, &rdev->flags); 4340 4341 if (!mddev->persistent) { 4342 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 4343 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; 4344 } else 4345 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 4346 rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2; 4347 4348 err = bind_rdev_to_array(rdev, mddev); 4349 if (err) { 4350 export_rdev(rdev); 4351 return err; 4352 } 4353 } 4354 4355 return 0; 4356 } 4357 4358 static int hot_remove_disk(mddev_t * mddev, dev_t dev) 4359 { 4360 char b[BDEVNAME_SIZE]; 4361 mdk_rdev_t *rdev; 4362 4363 rdev = find_rdev(mddev, dev); 4364 if (!rdev) 4365 return -ENXIO; 4366 4367 if (rdev->raid_disk >= 0) 4368 goto busy; 4369 4370 kick_rdev_from_array(rdev); 4371 md_update_sb(mddev, 1); 4372 md_new_event(mddev); 4373 4374 return 0; 4375 busy: 4376 printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", 4377 bdevname(rdev->bdev,b), mdname(mddev)); 4378 return -EBUSY; 4379 } 4380 4381 static int hot_add_disk(mddev_t * mddev, dev_t dev) 4382 { 4383 char b[BDEVNAME_SIZE]; 4384 int err; 4385 mdk_rdev_t *rdev; 4386 4387 if (!mddev->pers) 4388 return -ENODEV; 4389 4390 if (mddev->major_version != 0) { 4391 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 4392 " version-0 superblocks.\n", 4393 mdname(mddev)); 4394 return -EINVAL; 4395 } 4396 if (!mddev->pers->hot_add_disk) { 4397 printk(KERN_WARNING 4398 "%s: personality does not support diskops!\n", 4399 mdname(mddev)); 4400 return -EINVAL; 4401 } 4402 4403 rdev = md_import_device(dev, -1, 0); 4404 if (IS_ERR(rdev)) { 4405 printk(KERN_WARNING 4406 "md: error, md_import_device() returned %ld\n", 4407 PTR_ERR(rdev)); 4408 return -EINVAL; 4409 } 4410 4411 if (mddev->persistent) 4412 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 4413 else 4414 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; 4415 4416 rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2; 4417 4418 if (test_bit(Faulty, &rdev->flags)) { 4419 printk(KERN_WARNING 4420 "md: can not hot-add faulty %s disk to %s!\n", 4421 bdevname(rdev->bdev,b), mdname(mddev)); 4422 err = -EINVAL; 4423 goto abort_export; 4424 } 4425 clear_bit(In_sync, &rdev->flags); 4426 rdev->desc_nr 
= -1; 4427 rdev->saved_raid_disk = -1; 4428 err = bind_rdev_to_array(rdev, mddev); 4429 if (err) 4430 goto abort_export; 4431 4432 /* 4433 * The rest should better be atomic, we can have disk failures 4434 * noticed in interrupt contexts ... 4435 */ 4436 4437 if (rdev->desc_nr == mddev->max_disks) { 4438 printk(KERN_WARNING "%s: can not hot-add to full array!\n", 4439 mdname(mddev)); 4440 err = -EBUSY; 4441 goto abort_unbind_export; 4442 } 4443 4444 rdev->raid_disk = -1; 4445 4446 md_update_sb(mddev, 1); 4447 4448 /* 4449 * Kick recovery, maybe this spare has to be added to the 4450 * array immediately. 4451 */ 4452 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4453 md_wakeup_thread(mddev->thread); 4454 md_new_event(mddev); 4455 return 0; 4456 4457 abort_unbind_export: 4458 unbind_rdev_from_array(rdev); 4459 4460 abort_export: 4461 export_rdev(rdev); 4462 return err; 4463 } 4464 4465 static int set_bitmap_file(mddev_t *mddev, int fd) 4466 { 4467 int err; 4468 4469 if (mddev->pers) { 4470 if (!mddev->pers->quiesce) 4471 return -EBUSY; 4472 if (mddev->recovery || mddev->sync_thread) 4473 return -EBUSY; 4474 /* we should be able to change the bitmap.. */ 4475 } 4476 4477 4478 if (fd >= 0) { 4479 if (mddev->bitmap) 4480 return -EEXIST; /* cannot add when bitmap is present */ 4481 mddev->bitmap_file = fget(fd); 4482 4483 if (mddev->bitmap_file == NULL) { 4484 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 4485 mdname(mddev)); 4486 return -EBADF; 4487 } 4488 4489 err = deny_bitmap_write_access(mddev->bitmap_file); 4490 if (err) { 4491 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 4492 mdname(mddev)); 4493 fput(mddev->bitmap_file); 4494 mddev->bitmap_file = NULL; 4495 return err; 4496 } 4497 mddev->bitmap_offset = 0; /* file overrides offset */ 4498 } else if (mddev->bitmap == NULL) 4499 return -ENOENT; /* cannot remove what isn't there */ 4500 err = 0; 4501 if (mddev->pers) { 4502 mddev->pers->quiesce(mddev, 1); 4503 if (fd >= 0) 4504 err = bitmap_create(mddev); 4505 if (fd < 0 || err) { 4506 bitmap_destroy(mddev); 4507 fd = -1; /* make sure to put the file */ 4508 } 4509 mddev->pers->quiesce(mddev, 0); 4510 } 4511 if (fd < 0) { 4512 if (mddev->bitmap_file) { 4513 restore_bitmap_write_access(mddev->bitmap_file); 4514 fput(mddev->bitmap_file); 4515 } 4516 mddev->bitmap_file = NULL; 4517 } 4518 4519 return err; 4520 } 4521 4522 /* 4523 * set_array_info is used two different ways 4524 * The original usage is when creating a new array. 4525 * In this usage, raid_disks is > 0 and it together with 4526 * level, size, not_persistent,layout,chunksize determine the 4527 * shape of the array. 4528 * This will always create an array with a type-0.90.0 superblock. 4529 * The newer usage is when assembling an array. 4530 * In this case raid_disks will be 0, and the major_version field is 4531 * use to determine which style super-blocks are to be found on the devices. 4532 * The minor and patch _version numbers are also kept incase the 4533 * super_block handler wishes to interpret them. 4534 */ 4535 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) 4536 { 4537 4538 if (info->raid_disks == 0) { 4539 /* just setting version number for superblock loading */ 4540 if (info->major_version < 0 || 4541 info->major_version >= ARRAY_SIZE(super_types) || 4542 super_types[info->major_version].name == NULL) { 4543 /* maybe try to auto-load a module? 
*/ 4544 printk(KERN_INFO 4545 "md: superblock version %d not known\n", 4546 info->major_version); 4547 return -EINVAL; 4548 } 4549 mddev->major_version = info->major_version; 4550 mddev->minor_version = info->minor_version; 4551 mddev->patch_version = info->patch_version; 4552 mddev->persistent = !info->not_persistent; 4553 return 0; 4554 } 4555 mddev->major_version = MD_MAJOR_VERSION; 4556 mddev->minor_version = MD_MINOR_VERSION; 4557 mddev->patch_version = MD_PATCHLEVEL_VERSION; 4558 mddev->ctime = get_seconds(); 4559 4560 mddev->level = info->level; 4561 mddev->clevel[0] = 0; 4562 mddev->size = info->size; 4563 mddev->raid_disks = info->raid_disks; 4564 /* don't set md_minor, it is determined by which /dev/md* was 4565 * openned 4566 */ 4567 if (info->state & (1<<MD_SB_CLEAN)) 4568 mddev->recovery_cp = MaxSector; 4569 else 4570 mddev->recovery_cp = 0; 4571 mddev->persistent = ! info->not_persistent; 4572 mddev->external = 0; 4573 4574 mddev->layout = info->layout; 4575 mddev->chunk_size = info->chunk_size; 4576 4577 mddev->max_disks = MD_SB_DISKS; 4578 4579 if (mddev->persistent) 4580 mddev->flags = 0; 4581 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4582 4583 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 4584 mddev->bitmap_offset = 0; 4585 4586 mddev->reshape_position = MaxSector; 4587 4588 /* 4589 * Generate a 128 bit UUID 4590 */ 4591 get_random_bytes(mddev->uuid, 16); 4592 4593 mddev->new_level = mddev->level; 4594 mddev->new_chunk = mddev->chunk_size; 4595 mddev->new_layout = mddev->layout; 4596 mddev->delta_disks = 0; 4597 4598 return 0; 4599 } 4600 4601 static int update_size(mddev_t *mddev, sector_t num_sectors) 4602 { 4603 mdk_rdev_t * rdev; 4604 int rv; 4605 struct list_head *tmp; 4606 int fit = (num_sectors == 0); 4607 4608 if (mddev->pers->resize == NULL) 4609 return -EINVAL; 4610 /* The "num_sectors" is the number of sectors of each device that 4611 * is used. This can only make sense for arrays with redundancy. 4612 * linear and raid0 always use whatever space is available. We can only 4613 * consider changing this number if no resync or reconstruction is 4614 * happening, and if the new size is acceptable. It must fit before the 4615 * sb_start or, if that is <data_offset, it must fit before the size 4616 * of each device. If num_sectors is zero, we find the largest size 4617 * that fits. 4618 4619 */ 4620 if (mddev->sync_thread) 4621 return -EBUSY; 4622 if (mddev->bitmap) 4623 /* Sorry, cannot grow a bitmap yet, just remove it, 4624 * grow, and re-add. 
4625 */ 4626 return -EBUSY; 4627 rdev_for_each(rdev, tmp, mddev) { 4628 sector_t avail; 4629 avail = rdev->size * 2; 4630 4631 if (fit && (num_sectors == 0 || num_sectors > avail)) 4632 num_sectors = avail; 4633 if (avail < num_sectors) 4634 return -ENOSPC; 4635 } 4636 rv = mddev->pers->resize(mddev, num_sectors); 4637 if (!rv) { 4638 struct block_device *bdev; 4639 4640 bdev = bdget_disk(mddev->gendisk, 0); 4641 if (bdev) { 4642 mutex_lock(&bdev->bd_inode->i_mutex); 4643 i_size_write(bdev->bd_inode, 4644 (loff_t)mddev->array_sectors << 9); 4645 mutex_unlock(&bdev->bd_inode->i_mutex); 4646 bdput(bdev); 4647 } 4648 } 4649 return rv; 4650 } 4651 4652 static int update_raid_disks(mddev_t *mddev, int raid_disks) 4653 { 4654 int rv; 4655 /* change the number of raid disks */ 4656 if (mddev->pers->check_reshape == NULL) 4657 return -EINVAL; 4658 if (raid_disks <= 0 || 4659 raid_disks >= mddev->max_disks) 4660 return -EINVAL; 4661 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 4662 return -EBUSY; 4663 mddev->delta_disks = raid_disks - mddev->raid_disks; 4664 4665 rv = mddev->pers->check_reshape(mddev); 4666 return rv; 4667 } 4668 4669 4670 /* 4671 * update_array_info is used to change the configuration of an 4672 * on-line array. 4673 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 4674 * fields in the info are checked against the array. 4675 * Any differences that cannot be handled will cause an error. 4676 * Normally, only one change can be managed at a time. 4677 */ 4678 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) 4679 { 4680 int rv = 0; 4681 int cnt = 0; 4682 int state = 0; 4683 4684 /* calculate expected state,ignoring low bits */ 4685 if (mddev->bitmap && mddev->bitmap_offset) 4686 state |= (1 << MD_SB_BITMAP_PRESENT); 4687 4688 if (mddev->major_version != info->major_version || 4689 mddev->minor_version != info->minor_version || 4690 /* mddev->patch_version != info->patch_version || */ 4691 mddev->ctime != info->ctime || 4692 mddev->level != info->level || 4693 /* mddev->layout != info->layout || */ 4694 !mddev->persistent != info->not_persistent|| 4695 mddev->chunk_size != info->chunk_size || 4696 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 4697 ((state^info->state) & 0xfffffe00) 4698 ) 4699 return -EINVAL; 4700 /* Check there is only one change */ 4701 if (info->size >= 0 && mddev->size != info->size) cnt++; 4702 if (mddev->raid_disks != info->raid_disks) cnt++; 4703 if (mddev->layout != info->layout) cnt++; 4704 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++; 4705 if (cnt == 0) return 0; 4706 if (cnt > 1) return -EINVAL; 4707 4708 if (mddev->layout != info->layout) { 4709 /* Change layout 4710 * we don't need to do anything at the md level, the 4711 * personality will take care of it all. 
4712 */ 4713 if (mddev->pers->reconfig == NULL) 4714 return -EINVAL; 4715 else 4716 return mddev->pers->reconfig(mddev, info->layout, -1); 4717 } 4718 if (info->size >= 0 && mddev->size != info->size) 4719 rv = update_size(mddev, (sector_t)info->size * 2); 4720 4721 if (mddev->raid_disks != info->raid_disks) 4722 rv = update_raid_disks(mddev, info->raid_disks); 4723 4724 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 4725 if (mddev->pers->quiesce == NULL) 4726 return -EINVAL; 4727 if (mddev->recovery || mddev->sync_thread) 4728 return -EBUSY; 4729 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 4730 /* add the bitmap */ 4731 if (mddev->bitmap) 4732 return -EEXIST; 4733 if (mddev->default_bitmap_offset == 0) 4734 return -EINVAL; 4735 mddev->bitmap_offset = mddev->default_bitmap_offset; 4736 mddev->pers->quiesce(mddev, 1); 4737 rv = bitmap_create(mddev); 4738 if (rv) 4739 bitmap_destroy(mddev); 4740 mddev->pers->quiesce(mddev, 0); 4741 } else { 4742 /* remove the bitmap */ 4743 if (!mddev->bitmap) 4744 return -ENOENT; 4745 if (mddev->bitmap->file) 4746 return -EINVAL; 4747 mddev->pers->quiesce(mddev, 1); 4748 bitmap_destroy(mddev); 4749 mddev->pers->quiesce(mddev, 0); 4750 mddev->bitmap_offset = 0; 4751 } 4752 } 4753 md_update_sb(mddev, 1); 4754 return rv; 4755 } 4756 4757 static int set_disk_faulty(mddev_t *mddev, dev_t dev) 4758 { 4759 mdk_rdev_t *rdev; 4760 4761 if (mddev->pers == NULL) 4762 return -ENODEV; 4763 4764 rdev = find_rdev(mddev, dev); 4765 if (!rdev) 4766 return -ENODEV; 4767 4768 md_error(mddev, rdev); 4769 return 0; 4770 } 4771 4772 /* 4773 * We have a problem here : there is no easy way to give a CHS 4774 * virtual geometry. We currently pretend that we have a 2 heads 4775 * 4 sectors (with a BIG number of cylinders...). This drives 4776 * dosfs just mad... ;-) 4777 */ 4778 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 4779 { 4780 mddev_t *mddev = bdev->bd_disk->private_data; 4781 4782 geo->heads = 2; 4783 geo->sectors = 4; 4784 geo->cylinders = get_capacity(mddev->gendisk) / 8; 4785 return 0; 4786 } 4787 4788 static int md_ioctl(struct inode *inode, struct file *file, 4789 unsigned int cmd, unsigned long arg) 4790 { 4791 int err = 0; 4792 void __user *argp = (void __user *)arg; 4793 mddev_t *mddev = NULL; 4794 4795 if (!capable(CAP_SYS_ADMIN)) 4796 return -EACCES; 4797 4798 /* 4799 * Commands dealing with the RAID driver but not any 4800 * particular array: 4801 */ 4802 switch (cmd) 4803 { 4804 case RAID_VERSION: 4805 err = get_version(argp); 4806 goto done; 4807 4808 case PRINT_RAID_DEBUG: 4809 err = 0; 4810 md_print_devices(); 4811 goto done; 4812 4813 #ifndef MODULE 4814 case RAID_AUTORUN: 4815 err = 0; 4816 autostart_arrays(arg); 4817 goto done; 4818 #endif 4819 default:; 4820 } 4821 4822 /* 4823 * Commands creating/starting a new array: 4824 */ 4825 4826 mddev = inode->i_bdev->bd_disk->private_data; 4827 4828 if (!mddev) { 4829 BUG(); 4830 goto abort; 4831 } 4832 4833 err = mddev_lock(mddev); 4834 if (err) { 4835 printk(KERN_INFO 4836 "md: ioctl lock interrupted, reason %d, cmd %d\n", 4837 err, cmd); 4838 goto abort; 4839 } 4840 4841 switch (cmd) 4842 { 4843 case SET_ARRAY_INFO: 4844 { 4845 mdu_array_info_t info; 4846 if (!arg) 4847 memset(&info, 0, sizeof(info)); 4848 else if (copy_from_user(&info, argp, sizeof(info))) { 4849 err = -EFAULT; 4850 goto abort_unlock; 4851 } 4852 if (mddev->pers) { 4853 err = update_array_info(mddev, &info); 4854 if (err) { 4855 printk(KERN_WARNING "md: couldn't update" 4856 " array info. 
%d\n", err); 4857 goto abort_unlock; 4858 } 4859 goto done_unlock; 4860 } 4861 if (!list_empty(&mddev->disks)) { 4862 printk(KERN_WARNING 4863 "md: array %s already has disks!\n", 4864 mdname(mddev)); 4865 err = -EBUSY; 4866 goto abort_unlock; 4867 } 4868 if (mddev->raid_disks) { 4869 printk(KERN_WARNING 4870 "md: array %s already initialised!\n", 4871 mdname(mddev)); 4872 err = -EBUSY; 4873 goto abort_unlock; 4874 } 4875 err = set_array_info(mddev, &info); 4876 if (err) { 4877 printk(KERN_WARNING "md: couldn't set" 4878 " array info. %d\n", err); 4879 goto abort_unlock; 4880 } 4881 } 4882 goto done_unlock; 4883 4884 default:; 4885 } 4886 4887 /* 4888 * Commands querying/configuring an existing array: 4889 */ 4890 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 4891 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 4892 if ((!mddev->raid_disks && !mddev->external) 4893 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 4894 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 4895 && cmd != GET_BITMAP_FILE) { 4896 err = -ENODEV; 4897 goto abort_unlock; 4898 } 4899 4900 /* 4901 * Commands even a read-only array can execute: 4902 */ 4903 switch (cmd) 4904 { 4905 case GET_ARRAY_INFO: 4906 err = get_array_info(mddev, argp); 4907 goto done_unlock; 4908 4909 case GET_BITMAP_FILE: 4910 err = get_bitmap_file(mddev, argp); 4911 goto done_unlock; 4912 4913 case GET_DISK_INFO: 4914 err = get_disk_info(mddev, argp); 4915 goto done_unlock; 4916 4917 case RESTART_ARRAY_RW: 4918 err = restart_array(mddev); 4919 goto done_unlock; 4920 4921 case STOP_ARRAY: 4922 err = do_md_stop(mddev, 0, 1); 4923 goto done_unlock; 4924 4925 case STOP_ARRAY_RO: 4926 err = do_md_stop(mddev, 1, 1); 4927 goto done_unlock; 4928 4929 } 4930 4931 /* 4932 * The remaining ioctls are changing the state of the 4933 * superblock, so we do not allow them on read-only arrays. 4934 * However non-MD ioctls (e.g. get-size) will still come through 4935 * here and hit the 'default' below, so only disallow 4936 * 'md' ioctls, and switch to rw mode if started auto-readonly. 
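	 * ("Auto-readonly" is the mddev->ro == 2 state: the array was started
	 *  read-only and is promoted to read-write by its first write or, as
	 *  below, by the first state-changing md ioctl.)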
4937 */ 4938 if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) { 4939 if (mddev->ro == 2) { 4940 mddev->ro = 0; 4941 sysfs_notify(&mddev->kobj, NULL, "array_state"); 4942 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4943 md_wakeup_thread(mddev->thread); 4944 } else { 4945 err = -EROFS; 4946 goto abort_unlock; 4947 } 4948 } 4949 4950 switch (cmd) 4951 { 4952 case ADD_NEW_DISK: 4953 { 4954 mdu_disk_info_t info; 4955 if (copy_from_user(&info, argp, sizeof(info))) 4956 err = -EFAULT; 4957 else 4958 err = add_new_disk(mddev, &info); 4959 goto done_unlock; 4960 } 4961 4962 case HOT_REMOVE_DISK: 4963 err = hot_remove_disk(mddev, new_decode_dev(arg)); 4964 goto done_unlock; 4965 4966 case HOT_ADD_DISK: 4967 err = hot_add_disk(mddev, new_decode_dev(arg)); 4968 goto done_unlock; 4969 4970 case SET_DISK_FAULTY: 4971 err = set_disk_faulty(mddev, new_decode_dev(arg)); 4972 goto done_unlock; 4973 4974 case RUN_ARRAY: 4975 err = do_md_run(mddev); 4976 goto done_unlock; 4977 4978 case SET_BITMAP_FILE: 4979 err = set_bitmap_file(mddev, (int)arg); 4980 goto done_unlock; 4981 4982 default: 4983 err = -EINVAL; 4984 goto abort_unlock; 4985 } 4986 4987 done_unlock: 4988 abort_unlock: 4989 mddev_unlock(mddev); 4990 4991 return err; 4992 done: 4993 if (err) 4994 MD_BUG(); 4995 abort: 4996 return err; 4997 } 4998 4999 static int md_open(struct inode *inode, struct file *file) 5000 { 5001 /* 5002 * Succeed if we can lock the mddev, which confirms that 5003 * it isn't being stopped right now. 5004 */ 5005 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 5006 int err; 5007 5008 if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1))) 5009 goto out; 5010 5011 err = 0; 5012 mddev_get(mddev); 5013 atomic_inc(&mddev->openers); 5014 mddev_unlock(mddev); 5015 5016 check_disk_change(inode->i_bdev); 5017 out: 5018 return err; 5019 } 5020 5021 static int md_release(struct inode *inode, struct file * file) 5022 { 5023 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 5024 5025 BUG_ON(!mddev); 5026 atomic_dec(&mddev->openers); 5027 mddev_put(mddev); 5028 5029 return 0; 5030 } 5031 5032 static int md_media_changed(struct gendisk *disk) 5033 { 5034 mddev_t *mddev = disk->private_data; 5035 5036 return mddev->changed; 5037 } 5038 5039 static int md_revalidate(struct gendisk *disk) 5040 { 5041 mddev_t *mddev = disk->private_data; 5042 5043 mddev->changed = 0; 5044 return 0; 5045 } 5046 static struct block_device_operations md_fops = 5047 { 5048 .owner = THIS_MODULE, 5049 .open = md_open, 5050 .release = md_release, 5051 .ioctl = md_ioctl, 5052 .getgeo = md_getgeo, 5053 .media_changed = md_media_changed, 5054 .revalidate_disk= md_revalidate, 5055 }; 5056 5057 static int md_thread(void * arg) 5058 { 5059 mdk_thread_t *thread = arg; 5060 5061 /* 5062 * md_thread is a 'system-thread', it's priority should be very 5063 * high. We avoid resource deadlocks individually in each 5064 * raid personality. (RAID5 does preallocation) We also use RR and 5065 * the very same RT priority as kswapd, thus we will never get 5066 * into a priority inversion deadlock. 5067 * 5068 * we definitely have to have equal or higher priority than 5069 * bdflush, otherwise bdflush will deadlock if there are too 5070 * many dirty RAID5 blocks. 5071 */ 5072 5073 allow_signal(SIGKILL); 5074 while (!kthread_should_stop()) { 5075 5076 /* We need to wait INTERRUPTIBLE so that 5077 * we don't add to the load-average. 
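		 * (Sleepers in TASK_UNINTERRUPTIBLE state are counted as
		 *  runnable by the load-average calculation; interruptible
		 *  sleepers are not.)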
5078 * That means we need to be sure no signals are 5079 * pending 5080 */ 5081 if (signal_pending(current)) 5082 flush_signals(current); 5083 5084 wait_event_interruptible_timeout 5085 (thread->wqueue, 5086 test_bit(THREAD_WAKEUP, &thread->flags) 5087 || kthread_should_stop(), 5088 thread->timeout); 5089 5090 clear_bit(THREAD_WAKEUP, &thread->flags); 5091 5092 thread->run(thread->mddev); 5093 } 5094 5095 return 0; 5096 } 5097 5098 void md_wakeup_thread(mdk_thread_t *thread) 5099 { 5100 if (thread) { 5101 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm); 5102 set_bit(THREAD_WAKEUP, &thread->flags); 5103 wake_up(&thread->wqueue); 5104 } 5105 } 5106 5107 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, 5108 const char *name) 5109 { 5110 mdk_thread_t *thread; 5111 5112 thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL); 5113 if (!thread) 5114 return NULL; 5115 5116 init_waitqueue_head(&thread->wqueue); 5117 5118 thread->run = run; 5119 thread->mddev = mddev; 5120 thread->timeout = MAX_SCHEDULE_TIMEOUT; 5121 thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev)); 5122 if (IS_ERR(thread->tsk)) { 5123 kfree(thread); 5124 return NULL; 5125 } 5126 return thread; 5127 } 5128 5129 void md_unregister_thread(mdk_thread_t *thread) 5130 { 5131 dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 5132 5133 kthread_stop(thread->tsk); 5134 kfree(thread); 5135 } 5136 5137 void md_error(mddev_t *mddev, mdk_rdev_t *rdev) 5138 { 5139 if (!mddev) { 5140 MD_BUG(); 5141 return; 5142 } 5143 5144 if (!rdev || test_bit(Faulty, &rdev->flags)) 5145 return; 5146 5147 if (mddev->external) 5148 set_bit(Blocked, &rdev->flags); 5149 /* 5150 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 5151 mdname(mddev), 5152 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), 5153 __builtin_return_address(0),__builtin_return_address(1), 5154 __builtin_return_address(2),__builtin_return_address(3)); 5155 */ 5156 if (!mddev->pers) 5157 return; 5158 if (!mddev->pers->error_handler) 5159 return; 5160 mddev->pers->error_handler(mddev,rdev); 5161 if (mddev->degraded) 5162 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 5163 set_bit(StateChanged, &rdev->flags); 5164 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5165 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5166 md_wakeup_thread(mddev->thread); 5167 md_new_event_inintr(mddev); 5168 } 5169 5170 /* seq_file implementation /proc/mdstat */ 5171 5172 static void status_unused(struct seq_file *seq) 5173 { 5174 int i = 0; 5175 mdk_rdev_t *rdev; 5176 struct list_head *tmp; 5177 5178 seq_printf(seq, "unused devices: "); 5179 5180 rdev_for_each_list(rdev, tmp, pending_raid_disks) { 5181 char b[BDEVNAME_SIZE]; 5182 i++; 5183 seq_printf(seq, "%s ", 5184 bdevname(rdev->bdev,b)); 5185 } 5186 if (!i) 5187 seq_printf(seq, "<none>"); 5188 5189 seq_printf(seq, "\n"); 5190 } 5191 5192 5193 static void status_resync(struct seq_file *seq, mddev_t * mddev) 5194 { 5195 sector_t max_blocks, resync, res; 5196 unsigned long dt, db, rt; 5197 int scale; 5198 unsigned int per_milli; 5199 5200 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; 5201 5202 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 5203 max_blocks = mddev->resync_max_sectors >> 1; 5204 else 5205 max_blocks = mddev->size; 5206 5207 /* 5208 * Should not happen. 
5209 */ 5210 if (!max_blocks) { 5211 MD_BUG(); 5212 return; 5213 } 5214 /* Pick 'scale' such that (resync>>scale)*1000 will fit 5215 * in a sector_t, and (max_blocks>>scale) will fit in a 5216 * u32, as those are the requirements for sector_div. 5217 * Thus 'scale' must be at least 10 5218 */ 5219 scale = 10; 5220 if (sizeof(sector_t) > sizeof(unsigned long)) { 5221 while ( max_blocks/2 > (1ULL<<(scale+32))) 5222 scale++; 5223 } 5224 res = (resync>>scale)*1000; 5225 sector_div(res, (u32)((max_blocks>>scale)+1)); 5226 5227 per_milli = res; 5228 { 5229 int i, x = per_milli/50, y = 20-x; 5230 seq_printf(seq, "["); 5231 for (i = 0; i < x; i++) 5232 seq_printf(seq, "="); 5233 seq_printf(seq, ">"); 5234 for (i = 0; i < y; i++) 5235 seq_printf(seq, "."); 5236 seq_printf(seq, "] "); 5237 } 5238 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 5239 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 5240 "reshape" : 5241 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 5242 "check" : 5243 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 5244 "resync" : "recovery"))), 5245 per_milli/10, per_milli % 10, 5246 (unsigned long long) resync, 5247 (unsigned long long) max_blocks); 5248 5249 /* 5250 * We do not want to overflow, so the order of operands and 5251 * the * 100 / 100 trick are important. We do a +1 to be 5252 * safe against division by zero. We only estimate anyway. 5253 * 5254 * dt: time from mark until now 5255 * db: blocks written from mark until now 5256 * rt: remaining time 5257 */ 5258 dt = ((jiffies - mddev->resync_mark) / HZ); 5259 if (!dt) dt++; 5260 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) 5261 - mddev->resync_mark_cnt; 5262 rt = (dt * ((unsigned long)(max_blocks-resync) / (db/2/100+1)))/100; 5263 5264 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); 5265 5266 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 5267 } 5268 5269 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 5270 { 5271 struct list_head *tmp; 5272 loff_t l = *pos; 5273 mddev_t *mddev; 5274 5275 if (l >= 0x10000) 5276 return NULL; 5277 if (!l--) 5278 /* header */ 5279 return (void*)1; 5280 5281 spin_lock(&all_mddevs_lock); 5282 list_for_each(tmp,&all_mddevs) 5283 if (!l--) { 5284 mddev = list_entry(tmp, mddev_t, all_mddevs); 5285 mddev_get(mddev); 5286 spin_unlock(&all_mddevs_lock); 5287 return mddev; 5288 } 5289 spin_unlock(&all_mddevs_lock); 5290 if (!l--) 5291 return (void*)2;/* tail */ 5292 return NULL; 5293 } 5294 5295 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 5296 { 5297 struct list_head *tmp; 5298 mddev_t *next_mddev, *mddev = v; 5299 5300 ++*pos; 5301 if (v == (void*)2) 5302 return NULL; 5303 5304 spin_lock(&all_mddevs_lock); 5305 if (v == (void*)1) 5306 tmp = all_mddevs.next; 5307 else 5308 tmp = mddev->all_mddevs.next; 5309 if (tmp != &all_mddevs) 5310 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); 5311 else { 5312 next_mddev = (void*)2; 5313 *pos = 0x10000; 5314 } 5315 spin_unlock(&all_mddevs_lock); 5316 5317 if (v != (void*)1) 5318 mddev_put(mddev); 5319 return next_mddev; 5320 5321 } 5322 5323 static void md_seq_stop(struct seq_file *seq, void *v) 5324 { 5325 mddev_t *mddev = v; 5326 5327 if (mddev && v != (void*)1 && v != (void*)2) 5328 mddev_put(mddev); 5329 } 5330 5331 struct mdstat_info { 5332 int event; 5333 }; 5334 5335 static int md_seq_show(struct seq_file *seq, void *v) 5336 { 5337 mddev_t *mddev = v; 5338 sector_t size; 5339 struct list_head *tmp2; 5340 mdk_rdev_t *rdev; 5341 struct mdstat_info *mi = seq->private; 5342 
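	/* md_seq_start()/md_seq_next() above pass two magic cookies in
	 * addition to real mddev pointers: (void*)1 requests the
	 * "Personalities" header line and (void*)2 the trailing
	 * "unused devices" line.
	 */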
struct bitmap *bitmap; 5343 5344 if (v == (void*)1) { 5345 struct mdk_personality *pers; 5346 seq_printf(seq, "Personalities : "); 5347 spin_lock(&pers_lock); 5348 list_for_each_entry(pers, &pers_list, list) 5349 seq_printf(seq, "[%s] ", pers->name); 5350 5351 spin_unlock(&pers_lock); 5352 seq_printf(seq, "\n"); 5353 mi->event = atomic_read(&md_event_count); 5354 return 0; 5355 } 5356 if (v == (void*)2) { 5357 status_unused(seq); 5358 return 0; 5359 } 5360 5361 if (mddev_lock(mddev) < 0) 5362 return -EINTR; 5363 5364 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 5365 seq_printf(seq, "%s : %sactive", mdname(mddev), 5366 mddev->pers ? "" : "in"); 5367 if (mddev->pers) { 5368 if (mddev->ro==1) 5369 seq_printf(seq, " (read-only)"); 5370 if (mddev->ro==2) 5371 seq_printf(seq, " (auto-read-only)"); 5372 seq_printf(seq, " %s", mddev->pers->name); 5373 } 5374 5375 size = 0; 5376 rdev_for_each(rdev, tmp2, mddev) { 5377 char b[BDEVNAME_SIZE]; 5378 seq_printf(seq, " %s[%d]", 5379 bdevname(rdev->bdev,b), rdev->desc_nr); 5380 if (test_bit(WriteMostly, &rdev->flags)) 5381 seq_printf(seq, "(W)"); 5382 if (test_bit(Faulty, &rdev->flags)) { 5383 seq_printf(seq, "(F)"); 5384 continue; 5385 } else if (rdev->raid_disk < 0) 5386 seq_printf(seq, "(S)"); /* spare */ 5387 size += rdev->size; 5388 } 5389 5390 if (!list_empty(&mddev->disks)) { 5391 if (mddev->pers) 5392 seq_printf(seq, "\n %llu blocks", 5393 (unsigned long long) 5394 mddev->array_sectors / 2); 5395 else 5396 seq_printf(seq, "\n %llu blocks", 5397 (unsigned long long)size); 5398 } 5399 if (mddev->persistent) { 5400 if (mddev->major_version != 0 || 5401 mddev->minor_version != 90) { 5402 seq_printf(seq," super %d.%d", 5403 mddev->major_version, 5404 mddev->minor_version); 5405 } 5406 } else if (mddev->external) 5407 seq_printf(seq, " super external:%s", 5408 mddev->metadata_type); 5409 else 5410 seq_printf(seq, " super non-persistent"); 5411 5412 if (mddev->pers) { 5413 mddev->pers->status(seq, mddev); 5414 seq_printf(seq, "\n "); 5415 if (mddev->pers->sync_request) { 5416 if (mddev->curr_resync > 2) { 5417 status_resync(seq, mddev); 5418 seq_printf(seq, "\n "); 5419 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 5420 seq_printf(seq, "\tresync=DELAYED\n "); 5421 else if (mddev->recovery_cp < MaxSector) 5422 seq_printf(seq, "\tresync=PENDING\n "); 5423 } 5424 } else 5425 seq_printf(seq, "\n "); 5426 5427 if ((bitmap = mddev->bitmap)) { 5428 unsigned long chunk_kb; 5429 unsigned long flags; 5430 spin_lock_irqsave(&bitmap->lock, flags); 5431 chunk_kb = bitmap->chunksize >> 10; 5432 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " 5433 "%lu%s chunk", 5434 bitmap->pages - bitmap->missing_pages, 5435 bitmap->pages, 5436 (bitmap->pages - bitmap->missing_pages) 5437 << (PAGE_SHIFT - 10), 5438 chunk_kb ? chunk_kb : bitmap->chunksize, 5439 chunk_kb ? 
"KB" : "B"); 5440 if (bitmap->file) { 5441 seq_printf(seq, ", file: "); 5442 seq_path(seq, &bitmap->file->f_path, " \t\n"); 5443 } 5444 5445 seq_printf(seq, "\n"); 5446 spin_unlock_irqrestore(&bitmap->lock, flags); 5447 } 5448 5449 seq_printf(seq, "\n"); 5450 } 5451 mddev_unlock(mddev); 5452 5453 return 0; 5454 } 5455 5456 static struct seq_operations md_seq_ops = { 5457 .start = md_seq_start, 5458 .next = md_seq_next, 5459 .stop = md_seq_stop, 5460 .show = md_seq_show, 5461 }; 5462 5463 static int md_seq_open(struct inode *inode, struct file *file) 5464 { 5465 int error; 5466 struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL); 5467 if (mi == NULL) 5468 return -ENOMEM; 5469 5470 error = seq_open(file, &md_seq_ops); 5471 if (error) 5472 kfree(mi); 5473 else { 5474 struct seq_file *p = file->private_data; 5475 p->private = mi; 5476 mi->event = atomic_read(&md_event_count); 5477 } 5478 return error; 5479 } 5480 5481 static unsigned int mdstat_poll(struct file *filp, poll_table *wait) 5482 { 5483 struct seq_file *m = filp->private_data; 5484 struct mdstat_info *mi = m->private; 5485 int mask; 5486 5487 poll_wait(filp, &md_event_waiters, wait); 5488 5489 /* always allow read */ 5490 mask = POLLIN | POLLRDNORM; 5491 5492 if (mi->event != atomic_read(&md_event_count)) 5493 mask |= POLLERR | POLLPRI; 5494 return mask; 5495 } 5496 5497 static const struct file_operations md_seq_fops = { 5498 .owner = THIS_MODULE, 5499 .open = md_seq_open, 5500 .read = seq_read, 5501 .llseek = seq_lseek, 5502 .release = seq_release_private, 5503 .poll = mdstat_poll, 5504 }; 5505 5506 int register_md_personality(struct mdk_personality *p) 5507 { 5508 spin_lock(&pers_lock); 5509 list_add_tail(&p->list, &pers_list); 5510 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level); 5511 spin_unlock(&pers_lock); 5512 return 0; 5513 } 5514 5515 int unregister_md_personality(struct mdk_personality *p) 5516 { 5517 printk(KERN_INFO "md: %s personality unregistered\n", p->name); 5518 spin_lock(&pers_lock); 5519 list_del_init(&p->list); 5520 spin_unlock(&pers_lock); 5521 return 0; 5522 } 5523 5524 static int is_mddev_idle(mddev_t *mddev) 5525 { 5526 mdk_rdev_t * rdev; 5527 int idle; 5528 long curr_events; 5529 5530 idle = 1; 5531 rcu_read_lock(); 5532 rdev_for_each_rcu(rdev, mddev) { 5533 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 5534 curr_events = part_stat_read(&disk->part0, sectors[0]) + 5535 part_stat_read(&disk->part0, sectors[1]) - 5536 atomic_read(&disk->sync_io); 5537 /* sync IO will cause sync_io to increase before the disk_stats 5538 * as sync_io is counted when a request starts, and 5539 * disk_stats is counted when it completes. 5540 * So resync activity will cause curr_events to be smaller than 5541 * when there was no such activity. 5542 * non-sync IO will cause disk_stat to increase without 5543 * increasing sync_io so curr_events will (eventually) 5544 * be larger than it was before. Once it becomes 5545 * substantially larger, the test below will cause 5546 * the array to appear non-idle, and resync will slow 5547 * down. 5548 * If there is a lot of outstanding resync activity when 5549 * we set last_event to curr_events, then all that activity 5550 * completing might cause the array to appear non-idle 5551 * and resync will be slowed down even though there might 5552 * not have been non-resync activity. This will only 5553 * happen once though. 
'last_events' will soon reflect 5554 * the state where there is little or no outstanding 5555 * resync requests, and further resync activity will 5556 * always make curr_events less than last_events. 5557 * 5558 */ 5559 if (curr_events - rdev->last_events > 4096) { 5560 rdev->last_events = curr_events; 5561 idle = 0; 5562 } 5563 } 5564 rcu_read_unlock(); 5565 return idle; 5566 } 5567 5568 void md_done_sync(mddev_t *mddev, int blocks, int ok) 5569 { 5570 /* another "blocks" (512byte) blocks have been synced */ 5571 atomic_sub(blocks, &mddev->recovery_active); 5572 wake_up(&mddev->recovery_wait); 5573 if (!ok) { 5574 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5575 md_wakeup_thread(mddev->thread); 5576 // stop recovery, signal do_sync .... 5577 } 5578 } 5579 5580 5581 /* md_write_start(mddev, bi) 5582 * If we need to update some array metadata (e.g. 'active' flag 5583 * in superblock) before writing, schedule a superblock update 5584 * and wait for it to complete. 5585 */ 5586 void md_write_start(mddev_t *mddev, struct bio *bi) 5587 { 5588 int did_change = 0; 5589 if (bio_data_dir(bi) != WRITE) 5590 return; 5591 5592 BUG_ON(mddev->ro == 1); 5593 if (mddev->ro == 2) { 5594 /* need to switch to read/write */ 5595 mddev->ro = 0; 5596 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5597 md_wakeup_thread(mddev->thread); 5598 md_wakeup_thread(mddev->sync_thread); 5599 did_change = 1; 5600 } 5601 atomic_inc(&mddev->writes_pending); 5602 if (mddev->safemode == 1) 5603 mddev->safemode = 0; 5604 if (mddev->in_sync) { 5605 spin_lock_irq(&mddev->write_lock); 5606 if (mddev->in_sync) { 5607 mddev->in_sync = 0; 5608 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 5609 md_wakeup_thread(mddev->thread); 5610 did_change = 1; 5611 } 5612 spin_unlock_irq(&mddev->write_lock); 5613 } 5614 if (did_change) 5615 sysfs_notify(&mddev->kobj, NULL, "array_state"); 5616 wait_event(mddev->sb_wait, 5617 !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && 5618 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 5619 } 5620 5621 void md_write_end(mddev_t *mddev) 5622 { 5623 if (atomic_dec_and_test(&mddev->writes_pending)) { 5624 if (mddev->safemode == 2) 5625 md_wakeup_thread(mddev->thread); 5626 else if (mddev->safemode_delay) 5627 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); 5628 } 5629 } 5630 5631 /* md_allow_write(mddev) 5632 * Calling this ensures that the array is marked 'active' so that writes 5633 * may proceed without blocking. It is important to call this before 5634 * attempting a GFP_KERNEL allocation while holding the mddev lock. 5635 * Must be called with mddev_lock held. 5636 * 5637 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock 5638 * is dropped, so return -EAGAIN after notifying userspace. 
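 *
 * A minimal sketch of the expected caller pattern (cf. get_bitmap_file()
 * above; 'buf' and 'len' are illustrative only):
 *
 *	if (md_allow_write(mddev))
 *		buf = kmalloc(len, GFP_NOIO);
 *	else
 *		buf = kmalloc(len, GFP_KERNEL);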
5639 */ 5640 int md_allow_write(mddev_t *mddev) 5641 { 5642 if (!mddev->pers) 5643 return 0; 5644 if (mddev->ro) 5645 return 0; 5646 if (!mddev->pers->sync_request) 5647 return 0; 5648 5649 spin_lock_irq(&mddev->write_lock); 5650 if (mddev->in_sync) { 5651 mddev->in_sync = 0; 5652 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 5653 if (mddev->safemode_delay && 5654 mddev->safemode == 0) 5655 mddev->safemode = 1; 5656 spin_unlock_irq(&mddev->write_lock); 5657 md_update_sb(mddev, 0); 5658 sysfs_notify(&mddev->kobj, NULL, "array_state"); 5659 } else 5660 spin_unlock_irq(&mddev->write_lock); 5661 5662 if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) 5663 return -EAGAIN; 5664 else 5665 return 0; 5666 } 5667 EXPORT_SYMBOL_GPL(md_allow_write); 5668 5669 #define SYNC_MARKS 10 5670 #define SYNC_MARK_STEP (3*HZ) 5671 void md_do_sync(mddev_t *mddev) 5672 { 5673 mddev_t *mddev2; 5674 unsigned int currspeed = 0, 5675 window; 5676 sector_t max_sectors,j, io_sectors; 5677 unsigned long mark[SYNC_MARKS]; 5678 sector_t mark_cnt[SYNC_MARKS]; 5679 int last_mark,m; 5680 struct list_head *tmp; 5681 sector_t last_check; 5682 int skipped = 0; 5683 struct list_head *rtmp; 5684 mdk_rdev_t *rdev; 5685 char *desc; 5686 5687 /* just incase thread restarts... */ 5688 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 5689 return; 5690 if (mddev->ro) /* never try to sync a read-only array */ 5691 return; 5692 5693 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5694 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 5695 desc = "data-check"; 5696 else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 5697 desc = "requested-resync"; 5698 else 5699 desc = "resync"; 5700 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5701 desc = "reshape"; 5702 else 5703 desc = "recovery"; 5704 5705 /* we overload curr_resync somewhat here. 5706 * 0 == not engaged in resync at all 5707 * 2 == checking that there is no conflict with another sync 5708 * 1 == like 2, but have yielded to allow conflicting resync to 5709 * commense 5710 * other == active in resync - this many blocks 5711 * 5712 * Before starting a resync we must have set curr_resync to 5713 * 2, and then checked that every "conflicting" array has curr_resync 5714 * less than ours. When we find one that is the same or higher 5715 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 5716 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 5717 * This will mean we have to start checking from the beginning again. 
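	 * (The yield direction is decided purely by pointer comparison: of two
	 *  arrays that share a device, the one with the lower mddev address is
	 *  the one that backs off to curr_resync == 1.)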
5718 * 5719 */ 5720 5721 do { 5722 mddev->curr_resync = 2; 5723 5724 try_again: 5725 if (kthread_should_stop()) { 5726 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5727 goto skip; 5728 } 5729 for_each_mddev(mddev2, tmp) { 5730 if (mddev2 == mddev) 5731 continue; 5732 if (!mddev->parallel_resync 5733 && mddev2->curr_resync 5734 && match_mddev_units(mddev, mddev2)) { 5735 DEFINE_WAIT(wq); 5736 if (mddev < mddev2 && mddev->curr_resync == 2) { 5737 /* arbitrarily yield */ 5738 mddev->curr_resync = 1; 5739 wake_up(&resync_wait); 5740 } 5741 if (mddev > mddev2 && mddev->curr_resync == 1) 5742 /* no need to wait here, we can wait the next 5743 * time 'round when curr_resync == 2 5744 */ 5745 continue; 5746 /* We need to wait 'interruptible' so as not to 5747 * contribute to the load average, and not to 5748 * be caught by 'softlockup' 5749 */ 5750 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 5751 if (!kthread_should_stop() && 5752 mddev2->curr_resync >= mddev->curr_resync) { 5753 printk(KERN_INFO "md: delaying %s of %s" 5754 " until %s has finished (they" 5755 " share one or more physical units)\n", 5756 desc, mdname(mddev), mdname(mddev2)); 5757 mddev_put(mddev2); 5758 if (signal_pending(current)) 5759 flush_signals(current); 5760 schedule(); 5761 finish_wait(&resync_wait, &wq); 5762 goto try_again; 5763 } 5764 finish_wait(&resync_wait, &wq); 5765 } 5766 } 5767 } while (mddev->curr_resync < 2); 5768 5769 j = 0; 5770 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5771 /* resync follows the size requested by the personality, 5772 * which defaults to physical size, but can be virtual size 5773 */ 5774 max_sectors = mddev->resync_max_sectors; 5775 mddev->resync_mismatches = 0; 5776 /* we don't use the checkpoint if there's a bitmap */ 5777 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 5778 j = mddev->resync_min; 5779 else if (!mddev->bitmap) 5780 j = mddev->recovery_cp; 5781 5782 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 5783 max_sectors = mddev->size << 1; 5784 else { 5785 /* recovery follows the physical size of devices */ 5786 max_sectors = mddev->size << 1; 5787 j = MaxSector; 5788 rdev_for_each(rdev, rtmp, mddev) 5789 if (rdev->raid_disk >= 0 && 5790 !test_bit(Faulty, &rdev->flags) && 5791 !test_bit(In_sync, &rdev->flags) && 5792 rdev->recovery_offset < j) 5793 j = rdev->recovery_offset; 5794 } 5795 5796 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); 5797 printk(KERN_INFO "md: minimum _guaranteed_ speed:" 5798 " %d KB/sec/disk.\n", speed_min(mddev)); 5799 printk(KERN_INFO "md: using maximum available idle IO bandwidth " 5800 "(but not more than %d KB/sec) for %s.\n", 5801 speed_max(mddev), desc); 5802 5803 is_mddev_idle(mddev); /* this also initializes IO event counters */ 5804 5805 io_sectors = 0; 5806 for (m = 0; m < SYNC_MARKS; m++) { 5807 mark[m] = jiffies; 5808 mark_cnt[m] = io_sectors; 5809 } 5810 last_mark = 0; 5811 mddev->resync_mark = mark[last_mark]; 5812 mddev->resync_mark_cnt = mark_cnt[last_mark]; 5813 5814 /* 5815 * Tune reconstruction: 5816 */ 5817 window = 32*(PAGE_SIZE/512); 5818 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", 5819 window/2,(unsigned long long) max_sectors/2); 5820 5821 atomic_set(&mddev->recovery_active, 0); 5822 last_check = 0; 5823 5824 if (j>2) { 5825 printk(KERN_INFO 5826 "md: resuming %s of %s from checkpoint.\n", 5827 desc, mdname(mddev)); 5828 mddev->curr_resync = j; 5829 } 5830 5831 while (j < max_sectors) { 5832 sector_t sectors; 5833 5834 skipped = 0; 5835 if (j 
>= mddev->resync_max) { 5836 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 5837 wait_event(mddev->recovery_wait, 5838 mddev->resync_max > j 5839 || kthread_should_stop()); 5840 } 5841 if (kthread_should_stop()) 5842 goto interrupted; 5843 sectors = mddev->pers->sync_request(mddev, j, &skipped, 5844 currspeed < speed_min(mddev)); 5845 if (sectors == 0) { 5846 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5847 goto out; 5848 } 5849 5850 if (!skipped) { /* actual IO requested */ 5851 io_sectors += sectors; 5852 atomic_add(sectors, &mddev->recovery_active); 5853 } 5854 5855 j += sectors; 5856 if (j>1) mddev->curr_resync = j; 5857 mddev->curr_mark_cnt = io_sectors; 5858 if (last_check == 0) 5859 /* this is the earliers that rebuilt will be 5860 * visible in /proc/mdstat 5861 */ 5862 md_new_event(mddev); 5863 5864 if (last_check + window > io_sectors || j == max_sectors) 5865 continue; 5866 5867 last_check = io_sectors; 5868 5869 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5870 break; 5871 5872 repeat: 5873 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 5874 /* step marks */ 5875 int next = (last_mark+1) % SYNC_MARKS; 5876 5877 mddev->resync_mark = mark[next]; 5878 mddev->resync_mark_cnt = mark_cnt[next]; 5879 mark[next] = jiffies; 5880 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 5881 last_mark = next; 5882 } 5883 5884 5885 if (kthread_should_stop()) 5886 goto interrupted; 5887 5888 5889 /* 5890 * this loop exits only if either when we are slower than 5891 * the 'hard' speed limit, or the system was IO-idle for 5892 * a jiffy. 5893 * the system might be non-idle CPU-wise, but we only care 5894 * about not overloading the IO subsystem. (things like an 5895 * e2fsck being done on the RAID array should execute fast) 5896 */ 5897 blk_unplug(mddev->queue); 5898 cond_resched(); 5899 5900 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 5901 /((jiffies-mddev->resync_mark)/HZ +1) +1; 5902 5903 if (currspeed > speed_min(mddev)) { 5904 if ((currspeed > speed_max(mddev)) || 5905 !is_mddev_idle(mddev)) { 5906 msleep(500); 5907 goto repeat; 5908 } 5909 } 5910 } 5911 printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); 5912 /* 5913 * this also signals 'finished resyncing' to md_stop 5914 */ 5915 out: 5916 blk_unplug(mddev->queue); 5917 5918 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 5919 5920 /* tell personality that we are finished */ 5921 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); 5922 5923 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 5924 mddev->curr_resync > 2) { 5925 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 5926 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 5927 if (mddev->curr_resync >= mddev->recovery_cp) { 5928 printk(KERN_INFO 5929 "md: checkpointing %s of %s.\n", 5930 desc, mdname(mddev)); 5931 mddev->recovery_cp = mddev->curr_resync; 5932 } 5933 } else 5934 mddev->recovery_cp = MaxSector; 5935 } else { 5936 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 5937 mddev->curr_resync = MaxSector; 5938 rdev_for_each(rdev, rtmp, mddev) 5939 if (rdev->raid_disk >= 0 && 5940 !test_bit(Faulty, &rdev->flags) && 5941 !test_bit(In_sync, &rdev->flags) && 5942 rdev->recovery_offset < mddev->curr_resync) 5943 rdev->recovery_offset = mddev->curr_resync; 5944 } 5945 } 5946 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5947 5948 skip: 5949 mddev->curr_resync = 0; 5950 mddev->resync_min = 0; 5951 mddev->resync_max = MaxSector; 5952 sysfs_notify(&mddev->kobj, NULL, 
"sync_completed"); 5953 wake_up(&resync_wait); 5954 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 5955 md_wakeup_thread(mddev->thread); 5956 return; 5957 5958 interrupted: 5959 /* 5960 * got a signal, exit. 5961 */ 5962 printk(KERN_INFO 5963 "md: md_do_sync() got signal ... exiting\n"); 5964 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5965 goto out; 5966 5967 } 5968 EXPORT_SYMBOL_GPL(md_do_sync); 5969 5970 5971 static int remove_and_add_spares(mddev_t *mddev) 5972 { 5973 mdk_rdev_t *rdev; 5974 struct list_head *rtmp; 5975 int spares = 0; 5976 5977 rdev_for_each(rdev, rtmp, mddev) 5978 if (rdev->raid_disk >= 0 && 5979 !test_bit(Blocked, &rdev->flags) && 5980 (test_bit(Faulty, &rdev->flags) || 5981 ! test_bit(In_sync, &rdev->flags)) && 5982 atomic_read(&rdev->nr_pending)==0) { 5983 if (mddev->pers->hot_remove_disk( 5984 mddev, rdev->raid_disk)==0) { 5985 char nm[20]; 5986 sprintf(nm,"rd%d", rdev->raid_disk); 5987 sysfs_remove_link(&mddev->kobj, nm); 5988 rdev->raid_disk = -1; 5989 } 5990 } 5991 5992 if (mddev->degraded && ! mddev->ro) { 5993 rdev_for_each(rdev, rtmp, mddev) { 5994 if (rdev->raid_disk >= 0 && 5995 !test_bit(In_sync, &rdev->flags) && 5996 !test_bit(Blocked, &rdev->flags)) 5997 spares++; 5998 if (rdev->raid_disk < 0 5999 && !test_bit(Faulty, &rdev->flags)) { 6000 rdev->recovery_offset = 0; 6001 if (mddev->pers-> 6002 hot_add_disk(mddev, rdev) == 0) { 6003 char nm[20]; 6004 sprintf(nm, "rd%d", rdev->raid_disk); 6005 if (sysfs_create_link(&mddev->kobj, 6006 &rdev->kobj, nm)) 6007 printk(KERN_WARNING 6008 "md: cannot register " 6009 "%s for %s\n", 6010 nm, mdname(mddev)); 6011 spares++; 6012 md_new_event(mddev); 6013 } else 6014 break; 6015 } 6016 } 6017 } 6018 return spares; 6019 } 6020 /* 6021 * This routine is regularly called by all per-raid-array threads to 6022 * deal with generic issues like resync and super-block update. 6023 * Raid personalities that don't have a thread (linear/raid0) do not 6024 * need this as they never do any recovery or update the superblock. 6025 * 6026 * It does not do any resync itself, but rather "forks" off other threads 6027 * to do that as needed. 6028 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in 6029 * "->recovery" and create a thread at ->sync_thread. 6030 * When the thread finishes it sets MD_RECOVERY_DONE 6031 * and wakeups up this thread which will reap the thread and finish up. 6032 * This thread also removes any faulty devices (with nr_pending == 0). 6033 * 6034 * The overall approach is: 6035 * 1/ if the superblock needs updating, update it. 6036 * 2/ If a recovery thread is running, don't do anything else. 6037 * 3/ If recovery has finished, clean up, possibly marking spares active. 6038 * 4/ If there are any faulty devices, remove them. 6039 * 5/ If array is degraded, try to add spares devices 6040 * 6/ If array has spares or is not in-sync, start a resync thread. 
6041 */ 6042 void md_check_recovery(mddev_t *mddev) 6043 { 6044 mdk_rdev_t *rdev; 6045 struct list_head *rtmp; 6046 6047 6048 if (mddev->bitmap) 6049 bitmap_daemon_work(mddev->bitmap); 6050 6051 if (test_and_clear_bit(MD_NOTIFY_ARRAY_STATE, &mddev->flags)) 6052 sysfs_notify(&mddev->kobj, NULL, "array_state"); 6053 6054 if (mddev->ro) 6055 return; 6056 6057 if (signal_pending(current)) { 6058 if (mddev->pers->sync_request && !mddev->external) { 6059 printk(KERN_INFO "md: %s in immediate safe mode\n", 6060 mdname(mddev)); 6061 mddev->safemode = 2; 6062 } 6063 flush_signals(current); 6064 } 6065 6066 if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 6067 return; 6068 if ( ! ( 6069 (mddev->flags && !mddev->external) || 6070 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 6071 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 6072 (mddev->external == 0 && mddev->safemode == 1) || 6073 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) 6074 && !mddev->in_sync && mddev->recovery_cp == MaxSector) 6075 )) 6076 return; 6077 6078 if (mddev_trylock(mddev)) { 6079 int spares = 0; 6080 6081 if (mddev->ro) { 6082 /* Only thing we do on a ro array is remove 6083 * failed devices. 6084 */ 6085 remove_and_add_spares(mddev); 6086 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6087 goto unlock; 6088 } 6089 6090 if (!mddev->external) { 6091 int did_change = 0; 6092 spin_lock_irq(&mddev->write_lock); 6093 if (mddev->safemode && 6094 !atomic_read(&mddev->writes_pending) && 6095 !mddev->in_sync && 6096 mddev->recovery_cp == MaxSector) { 6097 mddev->in_sync = 1; 6098 did_change = 1; 6099 if (mddev->persistent) 6100 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 6101 } 6102 if (mddev->safemode == 1) 6103 mddev->safemode = 0; 6104 spin_unlock_irq(&mddev->write_lock); 6105 if (did_change) 6106 sysfs_notify(&mddev->kobj, NULL, "array_state"); 6107 } 6108 6109 if (mddev->flags) 6110 md_update_sb(mddev, 0); 6111 6112 rdev_for_each(rdev, rtmp, mddev) 6113 if (test_and_clear_bit(StateChanged, &rdev->flags)) 6114 sysfs_notify(&rdev->kobj, NULL, "state"); 6115 6116 6117 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 6118 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 6119 /* resync/recovery still happening */ 6120 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6121 goto unlock; 6122 } 6123 if (mddev->sync_thread) { 6124 /* resync has finished, collect result */ 6125 md_unregister_thread(mddev->sync_thread); 6126 mddev->sync_thread = NULL; 6127 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 6128 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 6129 /* success...*/ 6130 /* activate any spares */ 6131 if (mddev->pers->spare_active(mddev)) 6132 sysfs_notify(&mddev->kobj, NULL, 6133 "degraded"); 6134 } 6135 md_update_sb(mddev, 1); 6136 6137 /* if array is no-longer degraded, then any saved_raid_disk 6138 * information must be scrapped 6139 */ 6140 if (!mddev->degraded) 6141 rdev_for_each(rdev, rtmp, mddev) 6142 rdev->saved_raid_disk = -1; 6143 6144 mddev->recovery = 0; 6145 /* flag recovery needed just to double check */ 6146 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6147 sysfs_notify(&mddev->kobj, NULL, "sync_action"); 6148 md_new_event(mddev); 6149 goto unlock; 6150 } 6151 /* Set RUNNING before clearing NEEDED to avoid 6152 * any transients in the value of "sync_action". 
void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
{
	sysfs_notify(&rdev->kobj, NULL, "state");
	wait_event_timeout(rdev->blocked_wait,
			   !test_bit(Blocked, &rdev->flags),
			   msecs_to_jiffies(5000));
	rdev_dec_pending(rdev, mddev);
}
EXPORT_SYMBOL(md_wait_for_blocked_rdev);

static int md_notify_reboot(struct notifier_block *this,
			    unsigned long code, void *x)
{
	struct list_head *tmp;
	mddev_t *mddev;

	if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {

		printk(KERN_INFO "md: stopping all md devices.\n");

		for_each_mddev(mddev, tmp)
			if (mddev_trylock(mddev)) {
				/* Force a switch to readonly even if the
				 * array appears to still be in use.  Hence
				 * the '100'.
				 */
				do_md_stop(mddev, 1, 100);
				mddev_unlock(mddev);
			}
		/*
		 * certain more exotic SCSI devices are known to be
		 * volatile with respect to too-early system reboots.
		 * While the right place to handle this issue is the
		 * individual driver, we do want to have a safe RAID
		 * driver ...
		 */
		mdelay(1000*1);
	}
	return NOTIFY_DONE;
}

static struct notifier_block md_notifier = {
	.notifier_call	= md_notify_reboot,
	.next		= NULL,
	.priority	= INT_MAX, /* before any real devices */
};
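/*
 * Illustrative sketch only, not part of the driver: the reboot-notifier
 * pattern used by md_notifier above, as it might appear in another,
 * hypothetical module.  register_reboot_notifier(), struct notifier_block
 * and the SYS_ / NOTIFY_ constants are real kernel interfaces; every
 * "sketch_" name is made up.
 */
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/reboot.h>

static int sketch_reboot_event(struct notifier_block *nb,
			       unsigned long code, void *unused)
{
	/* only act on the events that mean the machine is going down */
	if (code == SYS_DOWN || code == SYS_HALT || code == SYS_POWER_OFF)
		printk(KERN_INFO "sketch: flushing state before shutdown\n");
	return NOTIFY_DONE;
}

static struct notifier_block sketch_reboot_notifier = {
	.notifier_call	= sketch_reboot_event,
	.priority	= 0,	/* default priority, runs after md's INT_MAX */
};

static int __init sketch_reboot_init(void)
{
	register_reboot_notifier(&sketch_reboot_notifier);
	return 0;
}

static void __exit sketch_reboot_exit(void)
{
	unregister_reboot_notifier(&sketch_reboot_notifier);
}

module_init(sketch_reboot_init);
module_exit(sketch_reboot_exit);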
static void md_geninit(void)
{
	dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));

	proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
}
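/*
 * md_init() below claims two block majors: MD_MAJOR for the classic
 * /dev/mdN devices and a dynamically allocated major (mdp_major) for
 * the partitionable "mdp" devices.  The blk_register_region() calls map
 * the whole minor range of each major to md_probe(), so an array's
 * gendisk is only created on demand, the first time its device number
 * is looked up (for example when /dev/md3 is first opened).  The reboot
 * notifier, the dev.raid sysctl table and /proc/mdstat (via md_geninit)
 * are registered last.
 */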
static int __init md_init(void)
{
	if (register_blkdev(MAJOR_NR, "md"))
		return -1;
	if ((mdp_major = register_blkdev(0, "mdp")) <= 0) {
		unregister_blkdev(MAJOR_NR, "md");
		return -1;
	}
	blk_register_region(MKDEV(MAJOR_NR, 0), 1UL << MINORBITS, THIS_MODULE,
			    md_probe, NULL, NULL);
	blk_register_region(MKDEV(mdp_major, 0), 1UL << MINORBITS, THIS_MODULE,
			    md_probe, NULL, NULL);

	register_reboot_notifier(&md_notifier);
	raid_table_header = register_sysctl_table(raid_root_table);

	md_geninit();
	return 0;
}


#ifndef MODULE

/*
 * Searches all registered partitions for autorun RAID arrays
 * at boot time.
 */

static LIST_HEAD(all_detected_devices);
struct detected_devices_node {
	struct list_head list;
	dev_t dev;
};

void md_autodetect_dev(dev_t dev)
{
	struct detected_devices_node *node_detected_dev;

	node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
	if (node_detected_dev) {
		node_detected_dev->dev = dev;
		list_add_tail(&node_detected_dev->list, &all_detected_devices);
	} else {
		printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed"
			", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev));
	}
}


static void autostart_arrays(int part)
{
	mdk_rdev_t *rdev;
	struct detected_devices_node *node_detected_dev;
	dev_t dev;
	int i_scanned, i_passed;

	i_scanned = 0;
	i_passed = 0;

	printk(KERN_INFO "md: Autodetecting RAID arrays.\n");

	while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
		i_scanned++;
		node_detected_dev = list_entry(all_detected_devices.next,
					       struct detected_devices_node, list);
		list_del(&node_detected_dev->list);
		dev = node_detected_dev->dev;
		kfree(node_detected_dev);
		rdev = md_import_device(dev, 0, 90);
		if (IS_ERR(rdev))
			continue;

		if (test_bit(Faulty, &rdev->flags)) {
			MD_BUG();
			continue;
		}
		set_bit(AutoDetected, &rdev->flags);
		list_add(&rdev->same_set, &pending_raid_disks);
		i_passed++;
	}

	printk(KERN_INFO "md: Scanned %d and added %d devices.\n",
	       i_scanned, i_passed);

	autorun_devices(part);
}

#endif /* !MODULE */

static __exit void md_exit(void)
{
	mddev_t *mddev;
	struct list_head *tmp;

	blk_unregister_region(MKDEV(MAJOR_NR, 0), 1U << MINORBITS);
	blk_unregister_region(MKDEV(mdp_major, 0), 1U << MINORBITS);

	unregister_blkdev(MAJOR_NR, "md");
	unregister_blkdev(mdp_major, "mdp");
	unregister_reboot_notifier(&md_notifier);
	unregister_sysctl_table(raid_table_header);
	remove_proc_entry("mdstat", NULL);
	for_each_mddev(mddev, tmp) {
		struct gendisk *disk = mddev->gendisk;
		if (!disk)
			continue;
		export_array(mddev);
		del_gendisk(disk);
		put_disk(disk);
		mddev->gendisk = NULL;
		mddev_put(mddev);
	}
}

subsys_initcall(md_init);
module_exit(md_exit)

static int get_ro(char *buffer, struct kernel_param *kp)
{
	return sprintf(buffer, "%d", start_readonly);
}
static int set_ro(const char *val, struct kernel_param *kp)
{
	char *e;
	int num = simple_strtoul(val, &e, 10);
	if (*val && (*e == '\0' || *e == '\n')) {
		start_readonly = num;
		return 0;
	}
	return -EINVAL;
}

module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);


EXPORT_SYMBOL(register_md_personality);
EXPORT_SYMBOL(unregister_md_personality);
EXPORT_SYMBOL(md_error);
EXPORT_SYMBOL(md_done_sync);
EXPORT_SYMBOL(md_write_start);
EXPORT_SYMBOL(md_write_end);
EXPORT_SYMBOL(md_register_thread);
EXPORT_SYMBOL(md_unregister_thread);
EXPORT_SYMBOL(md_wakeup_thread);
EXPORT_SYMBOL(md_check_recovery);
MODULE_LICENSE("GPL");
MODULE_ALIAS("md");
MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
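/*
 * Illustrative sketch only, not part of the driver: the same
 * module_param_call() get/set pattern used for "start_ro" above,
 * applied to a hypothetical bounded parameter.  Every "sketch_" name
 * is made up; for a real module the value would show up under
 * /sys/module/.../parameters/ with the permissions given below.
 */
static int sketch_level;	/* hypothetical tunable, kept in 0..10 */

static int sketch_level_get(char *buffer, struct kernel_param *kp)
{
	return sprintf(buffer, "%d", sketch_level);
}

static int sketch_level_set(const char *val, struct kernel_param *kp)
{
	char *e;
	long num = simple_strtol(val, &e, 10);

	/* accept "N" or "N\n" and keep the value in range */
	if (!*val || (*e != '\0' && *e != '\n') || num < 0 || num > 10)
		return -EINVAL;
	sketch_level = num;
	return 0;
}

module_param_call(sketch_level, sketch_level_set, sketch_level_get,
		  NULL, S_IRUGO|S_IWUSR);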