1 /* 2 md.c : Multiple Devices driver for Linux 3 Copyright (C) 1998, 1999, 2000 Ingo Molnar 4 5 completely rewritten, based on the MD driver code from Marc Zyngier 6 7 Changes: 8 9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar 10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> 11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> 12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su> 13 - kmod support by: Cyrus Durgin 14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> 15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> 16 17 - lots of fixes and improvements to the RAID1/RAID5 and generic 18 RAID code (such as request based resynchronization): 19 20 Neil Brown <neilb@cse.unsw.edu.au>. 21 22 - persistent bitmap code 23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. 24 25 This program is free software; you can redistribute it and/or modify 26 it under the terms of the GNU General Public License as published by 27 the Free Software Foundation; either version 2, or (at your option) 28 any later version. 29 30 You should have received a copy of the GNU General Public License 31 (for example /usr/src/linux/COPYING); if not, write to the Free 32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 33 */ 34 35 #include <linux/module.h> 36 #include <linux/config.h> 37 #include <linux/linkage.h> 38 #include <linux/raid/md.h> 39 #include <linux/raid/bitmap.h> 40 #include <linux/sysctl.h> 41 #include <linux/devfs_fs_kernel.h> 42 #include <linux/buffer_head.h> /* for invalidate_bdev */ 43 #include <linux/suspend.h> 44 45 #include <linux/init.h> 46 47 #include <linux/file.h> 48 49 #ifdef CONFIG_KMOD 50 #include <linux/kmod.h> 51 #endif 52 53 #include <asm/unaligned.h> 54 55 #define MAJOR_NR MD_MAJOR 56 #define MD_DRIVER 57 58 /* 63 partitions with the alternate major number (mdp) */ 59 #define MdpMinorShift 6 60 61 #define DEBUG 0 62 #define dprintk(x...) ((void)(DEBUG && printk(x))) 63 64 65 #ifndef MODULE 66 static void autostart_arrays (int part); 67 #endif 68 69 static mdk_personality_t *pers[MAX_PERSONALITY]; 70 static DEFINE_SPINLOCK(pers_lock); 71 72 /* 73 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 74 * is 1000 KB/sec, so the extra system load does not show up that much. 75 * Increase it if you want to have more _guaranteed_ speed. Note that 76 * the RAID driver will use the maximum available bandwith if the IO 77 * subsystem is idle. There is also an 'absolute maximum' reconstruction 78 * speed limit - in case reconstruction slows down your system despite 79 * idle IO detection. 80 * 81 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. 
82 */ 83 84 static int sysctl_speed_limit_min = 1000; 85 static int sysctl_speed_limit_max = 200000; 86 87 static struct ctl_table_header *raid_table_header; 88 89 static ctl_table raid_table[] = { 90 { 91 .ctl_name = DEV_RAID_SPEED_LIMIT_MIN, 92 .procname = "speed_limit_min", 93 .data = &sysctl_speed_limit_min, 94 .maxlen = sizeof(int), 95 .mode = 0644, 96 .proc_handler = &proc_dointvec, 97 }, 98 { 99 .ctl_name = DEV_RAID_SPEED_LIMIT_MAX, 100 .procname = "speed_limit_max", 101 .data = &sysctl_speed_limit_max, 102 .maxlen = sizeof(int), 103 .mode = 0644, 104 .proc_handler = &proc_dointvec, 105 }, 106 { .ctl_name = 0 } 107 }; 108 109 static ctl_table raid_dir_table[] = { 110 { 111 .ctl_name = DEV_RAID, 112 .procname = "raid", 113 .maxlen = 0, 114 .mode = 0555, 115 .child = raid_table, 116 }, 117 { .ctl_name = 0 } 118 }; 119 120 static ctl_table raid_root_table[] = { 121 { 122 .ctl_name = CTL_DEV, 123 .procname = "dev", 124 .maxlen = 0, 125 .mode = 0555, 126 .child = raid_dir_table, 127 }, 128 { .ctl_name = 0 } 129 }; 130 131 static struct block_device_operations md_fops; 132 133 /* 134 * Enables to iterate over all existing md arrays 135 * all_mddevs_lock protects this list. 136 */ 137 static LIST_HEAD(all_mddevs); 138 static DEFINE_SPINLOCK(all_mddevs_lock); 139 140 141 /* 142 * iterates through all used mddevs in the system. 143 * We take care to grab the all_mddevs_lock whenever navigating 144 * the list, and to always hold a refcount when unlocked. 145 * Any code which breaks out of this loop while own 146 * a reference to the current mddev and must mddev_put it. 147 */ 148 #define ITERATE_MDDEV(mddev,tmp) \ 149 \ 150 for (({ spin_lock(&all_mddevs_lock); \ 151 tmp = all_mddevs.next; \ 152 mddev = NULL;}); \ 153 ({ if (tmp != &all_mddevs) \ 154 mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ 155 spin_unlock(&all_mddevs_lock); \ 156 if (mddev) mddev_put(mddev); \ 157 mddev = list_entry(tmp, mddev_t, all_mddevs); \ 158 tmp != &all_mddevs;}); \ 159 ({ spin_lock(&all_mddevs_lock); \ 160 tmp = tmp->next;}) \ 161 ) 162 163 164 static int md_fail_request (request_queue_t *q, struct bio *bio) 165 { 166 bio_io_error(bio, bio->bi_size); 167 return 0; 168 } 169 170 static inline mddev_t *mddev_get(mddev_t *mddev) 171 { 172 atomic_inc(&mddev->active); 173 return mddev; 174 } 175 176 static void mddev_put(mddev_t *mddev) 177 { 178 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 179 return; 180 if (!mddev->raid_disks && list_empty(&mddev->disks)) { 181 list_del(&mddev->all_mddevs); 182 blk_put_queue(mddev->queue); 183 kfree(mddev); 184 } 185 spin_unlock(&all_mddevs_lock); 186 } 187 188 static mddev_t * mddev_find(dev_t unit) 189 { 190 mddev_t *mddev, *new = NULL; 191 192 retry: 193 spin_lock(&all_mddevs_lock); 194 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 195 if (mddev->unit == unit) { 196 mddev_get(mddev); 197 spin_unlock(&all_mddevs_lock); 198 kfree(new); 199 return mddev; 200 } 201 202 if (new) { 203 list_add(&new->all_mddevs, &all_mddevs); 204 spin_unlock(&all_mddevs_lock); 205 return new; 206 } 207 spin_unlock(&all_mddevs_lock); 208 209 new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL); 210 if (!new) 211 return NULL; 212 213 memset(new, 0, sizeof(*new)); 214 215 new->unit = unit; 216 if (MAJOR(unit) == MD_MAJOR) 217 new->md_minor = MINOR(unit); 218 else 219 new->md_minor = MINOR(unit) >> MdpMinorShift; 220 221 init_MUTEX(&new->reconfig_sem); 222 INIT_LIST_HEAD(&new->disks); 223 INIT_LIST_HEAD(&new->all_mddevs); 224 init_timer(&new->safemode_timer); 225 
atomic_set(&new->active, 1); 226 spin_lock_init(&new->write_lock); 227 init_waitqueue_head(&new->sb_wait); 228 229 new->queue = blk_alloc_queue(GFP_KERNEL); 230 if (!new->queue) { 231 kfree(new); 232 return NULL; 233 } 234 235 blk_queue_make_request(new->queue, md_fail_request); 236 237 goto retry; 238 } 239 240 static inline int mddev_lock(mddev_t * mddev) 241 { 242 return down_interruptible(&mddev->reconfig_sem); 243 } 244 245 static inline void mddev_lock_uninterruptible(mddev_t * mddev) 246 { 247 down(&mddev->reconfig_sem); 248 } 249 250 static inline int mddev_trylock(mddev_t * mddev) 251 { 252 return down_trylock(&mddev->reconfig_sem); 253 } 254 255 static inline void mddev_unlock(mddev_t * mddev) 256 { 257 up(&mddev->reconfig_sem); 258 259 if (mddev->thread) 260 md_wakeup_thread(mddev->thread); 261 } 262 263 mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) 264 { 265 mdk_rdev_t * rdev; 266 struct list_head *tmp; 267 268 ITERATE_RDEV(mddev,rdev,tmp) { 269 if (rdev->desc_nr == nr) 270 return rdev; 271 } 272 return NULL; 273 } 274 275 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) 276 { 277 struct list_head *tmp; 278 mdk_rdev_t *rdev; 279 280 ITERATE_RDEV(mddev,rdev,tmp) { 281 if (rdev->bdev->bd_dev == dev) 282 return rdev; 283 } 284 return NULL; 285 } 286 287 inline static sector_t calc_dev_sboffset(struct block_device *bdev) 288 { 289 sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 290 return MD_NEW_SIZE_BLOCKS(size); 291 } 292 293 static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size) 294 { 295 sector_t size; 296 297 size = rdev->sb_offset; 298 299 if (chunk_size) 300 size &= ~((sector_t)chunk_size/1024 - 1); 301 return size; 302 } 303 304 static int alloc_disk_sb(mdk_rdev_t * rdev) 305 { 306 if (rdev->sb_page) 307 MD_BUG(); 308 309 rdev->sb_page = alloc_page(GFP_KERNEL); 310 if (!rdev->sb_page) { 311 printk(KERN_ALERT "md: out of memory.\n"); 312 return -EINVAL; 313 } 314 315 return 0; 316 } 317 318 static void free_disk_sb(mdk_rdev_t * rdev) 319 { 320 if (rdev->sb_page) { 321 page_cache_release(rdev->sb_page); 322 rdev->sb_loaded = 0; 323 rdev->sb_page = NULL; 324 rdev->sb_offset = 0; 325 rdev->size = 0; 326 } 327 } 328 329 330 static int super_written(struct bio *bio, unsigned int bytes_done, int error) 331 { 332 mdk_rdev_t *rdev = bio->bi_private; 333 if (bio->bi_size) 334 return 1; 335 336 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) 337 md_error(rdev->mddev, rdev); 338 339 if (atomic_dec_and_test(&rdev->mddev->pending_writes)) 340 wake_up(&rdev->mddev->sb_wait); 341 bio_put(bio); 342 return 0; 343 } 344 345 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 346 sector_t sector, int size, struct page *page) 347 { 348 /* write first size bytes of page to sector of rdev 349 * Increment mddev->pending_writes before returning 350 * and decrement it on completion, waking up sb_wait 351 * if zero is reached. 
352 * If an error occurred, call md_error 353 */ 354 struct bio *bio = bio_alloc(GFP_NOIO, 1); 355 356 bio->bi_bdev = rdev->bdev; 357 bio->bi_sector = sector; 358 bio_add_page(bio, page, size, 0); 359 bio->bi_private = rdev; 360 bio->bi_end_io = super_written; 361 atomic_inc(&mddev->pending_writes); 362 submit_bio((1<<BIO_RW)|(1<<BIO_RW_SYNC), bio); 363 } 364 365 static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) 366 { 367 if (bio->bi_size) 368 return 1; 369 370 complete((struct completion*)bio->bi_private); 371 return 0; 372 } 373 374 int sync_page_io(struct block_device *bdev, sector_t sector, int size, 375 struct page *page, int rw) 376 { 377 struct bio *bio = bio_alloc(GFP_NOIO, 1); 378 struct completion event; 379 int ret; 380 381 rw |= (1 << BIO_RW_SYNC); 382 383 bio->bi_bdev = bdev; 384 bio->bi_sector = sector; 385 bio_add_page(bio, page, size, 0); 386 init_completion(&event); 387 bio->bi_private = &event; 388 bio->bi_end_io = bi_complete; 389 submit_bio(rw, bio); 390 wait_for_completion(&event); 391 392 ret = test_bit(BIO_UPTODATE, &bio->bi_flags); 393 bio_put(bio); 394 return ret; 395 } 396 397 static int read_disk_sb(mdk_rdev_t * rdev) 398 { 399 char b[BDEVNAME_SIZE]; 400 if (!rdev->sb_page) { 401 MD_BUG(); 402 return -EINVAL; 403 } 404 if (rdev->sb_loaded) 405 return 0; 406 407 408 if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) 409 goto fail; 410 rdev->sb_loaded = 1; 411 return 0; 412 413 fail: 414 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", 415 bdevname(rdev->bdev,b)); 416 return -EINVAL; 417 } 418 419 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 420 { 421 if ( (sb1->set_uuid0 == sb2->set_uuid0) && 422 (sb1->set_uuid1 == sb2->set_uuid1) && 423 (sb1->set_uuid2 == sb2->set_uuid2) && 424 (sb1->set_uuid3 == sb2->set_uuid3)) 425 426 return 1; 427 428 return 0; 429 } 430 431 432 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 433 { 434 int ret; 435 mdp_super_t *tmp1, *tmp2; 436 437 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 438 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 439 440 if (!tmp1 || !tmp2) { 441 ret = 0; 442 printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); 443 goto abort; 444 } 445 446 *tmp1 = *sb1; 447 *tmp2 = *sb2; 448 449 /* 450 * nr_disks is not constant 451 */ 452 tmp1->nr_disks = 0; 453 tmp2->nr_disks = 0; 454 455 if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) 456 ret = 0; 457 else 458 ret = 1; 459 460 abort: 461 kfree(tmp1); 462 kfree(tmp2); 463 return ret; 464 } 465 466 static unsigned int calc_sb_csum(mdp_super_t * sb) 467 { 468 unsigned int disk_csum, csum; 469 470 disk_csum = sb->sb_csum; 471 sb->sb_csum = 0; 472 csum = csum_partial((void *)sb, MD_SB_BYTES, 0); 473 sb->sb_csum = disk_csum; 474 return csum; 475 } 476 477 478 /* 479 * Handle superblock details. 480 * We want to be able to handle multiple superblock formats 481 * so we have a common interface to them all, and an array of 482 * different handlers. 483 * We rely on user-space to write the initial superblock, and support 484 * reading and updating of superblocks. 485 * Interface methods are: 486 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) 487 * loads and validates a superblock on dev. 
488 * if refdev != NULL, compare superblocks on both devices 489 * Return: 490 * 0 - dev has a superblock that is compatible with refdev 491 * 1 - dev has a superblock that is compatible and newer than refdev 492 * so dev should be used as the refdev in future 493 * -EINVAL superblock incompatible or invalid 494 * -othererror e.g. -EIO 495 * 496 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) 497 * Verify that dev is acceptable into mddev. 498 * The first time, mddev->raid_disks will be 0, and data from 499 * dev should be merged in. Subsequent calls check that dev 500 * is new enough. Return 0 or -EINVAL 501 * 502 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) 503 * Update the superblock for rdev with data in mddev 504 * This does not write to disc. 505 * 506 */ 507 508 struct super_type { 509 char *name; 510 struct module *owner; 511 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version); 512 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); 513 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); 514 }; 515 516 /* 517 * load_super for 0.90.0 518 */ 519 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 520 { 521 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 522 mdp_super_t *sb; 523 int ret; 524 sector_t sb_offset; 525 526 /* 527 * Calculate the position of the superblock, 528 * it's at the end of the disk. 529 * 530 * It also happens to be a multiple of 4Kb. 531 */ 532 sb_offset = calc_dev_sboffset(rdev->bdev); 533 rdev->sb_offset = sb_offset; 534 535 ret = read_disk_sb(rdev); 536 if (ret) return ret; 537 538 ret = -EINVAL; 539 540 bdevname(rdev->bdev, b); 541 sb = (mdp_super_t*)page_address(rdev->sb_page); 542 543 if (sb->md_magic != MD_SB_MAGIC) { 544 printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 545 b); 546 goto abort; 547 } 548 549 if (sb->major_version != 0 || 550 sb->minor_version != 90) { 551 printk(KERN_WARNING "Bad version number %d.%d on %s\n", 552 sb->major_version, sb->minor_version, 553 b); 554 goto abort; 555 } 556 557 if (sb->raid_disks <= 0) 558 goto abort; 559 560 if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) { 561 printk(KERN_WARNING "md: invalid superblock checksum on %s\n", 562 b); 563 goto abort; 564 } 565 566 rdev->preferred_minor = sb->md_minor; 567 rdev->data_offset = 0; 568 569 if (sb->level == LEVEL_MULTIPATH) 570 rdev->desc_nr = -1; 571 else 572 rdev->desc_nr = sb->this_disk.number; 573 574 if (refdev == 0) 575 ret = 1; 576 else { 577 __u64 ev1, ev2; 578 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); 579 if (!uuid_equal(refsb, sb)) { 580 printk(KERN_WARNING "md: %s has different UUID to %s\n", 581 b, bdevname(refdev->bdev,b2)); 582 goto abort; 583 } 584 if (!sb_equal(refsb, sb)) { 585 printk(KERN_WARNING "md: %s has same UUID" 586 " but different superblock to %s\n", 587 b, bdevname(refdev->bdev, b2)); 588 goto abort; 589 } 590 ev1 = md_event(sb); 591 ev2 = md_event(refsb); 592 if (ev1 > ev2) 593 ret = 1; 594 else 595 ret = 0; 596 } 597 rdev->size = calc_dev_size(rdev, sb->chunk_size); 598 599 abort: 600 return ret; 601 } 602 603 /* 604 * validate_super for 0.90.0 605 */ 606 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) 607 { 608 mdp_disk_t *desc; 609 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 610 611 rdev->raid_disk = -1; 612 rdev->in_sync = 0; 613 if (mddev->raid_disks == 0) { 614 mddev->major_version = 0; 615 mddev->minor_version = sb->minor_version; 616 mddev->patch_version = sb->patch_version; 617 
mddev->persistent = ! sb->not_persistent; 618 mddev->chunk_size = sb->chunk_size; 619 mddev->ctime = sb->ctime; 620 mddev->utime = sb->utime; 621 mddev->level = sb->level; 622 mddev->layout = sb->layout; 623 mddev->raid_disks = sb->raid_disks; 624 mddev->size = sb->size; 625 mddev->events = md_event(sb); 626 627 if (sb->state & (1<<MD_SB_CLEAN)) 628 mddev->recovery_cp = MaxSector; 629 else { 630 if (sb->events_hi == sb->cp_events_hi && 631 sb->events_lo == sb->cp_events_lo) { 632 mddev->recovery_cp = sb->recovery_cp; 633 } else 634 mddev->recovery_cp = 0; 635 } 636 637 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 638 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 639 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 640 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 641 642 mddev->max_disks = MD_SB_DISKS; 643 644 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 645 mddev->bitmap_file == NULL) { 646 if (mddev->level != 1) { 647 /* FIXME use a better test */ 648 printk(KERN_WARNING "md: bitmaps only support for raid1\n"); 649 return -EINVAL; 650 } 651 mddev->bitmap_offset = (MD_SB_BYTES >> 9); 652 } 653 654 } else if (mddev->pers == NULL) { 655 /* Insist on good event counter while assembling */ 656 __u64 ev1 = md_event(sb); 657 ++ev1; 658 if (ev1 < mddev->events) 659 return -EINVAL; 660 } else if (mddev->bitmap) { 661 /* if adding to array with a bitmap, then we can accept an 662 * older device ... but not too old. 663 */ 664 __u64 ev1 = md_event(sb); 665 if (ev1 < mddev->bitmap->events_cleared) 666 return 0; 667 } else /* just a hot-add of a new device, leave raid_disk at -1 */ 668 return 0; 669 670 if (mddev->level != LEVEL_MULTIPATH) { 671 rdev->faulty = 0; 672 desc = sb->disks + rdev->desc_nr; 673 674 if (desc->state & (1<<MD_DISK_FAULTY)) 675 rdev->faulty = 1; 676 else if (desc->state & (1<<MD_DISK_SYNC) && 677 desc->raid_disk < mddev->raid_disks) { 678 rdev->in_sync = 1; 679 rdev->raid_disk = desc->raid_disk; 680 } 681 } else /* MULTIPATH are always insync */ 682 rdev->in_sync = 1; 683 return 0; 684 } 685 686 /* 687 * sync_super for 0.90.0 688 */ 689 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) 690 { 691 mdp_super_t *sb; 692 struct list_head *tmp; 693 mdk_rdev_t *rdev2; 694 int next_spare = mddev->raid_disks; 695 696 /* make rdev->sb match mddev data.. 697 * 698 * 1/ zero out disks 699 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 700 * 3/ any empty disks < next_spare become removed 701 * 702 * disks[0] gets initialised to REMOVED because 703 * we cannot be sure from other fields if it has 704 * been initialised or not. 
705 */ 706 int i; 707 int active=0, working=0,failed=0,spare=0,nr_disks=0; 708 709 sb = (mdp_super_t*)page_address(rdev->sb_page); 710 711 memset(sb, 0, sizeof(*sb)); 712 713 sb->md_magic = MD_SB_MAGIC; 714 sb->major_version = mddev->major_version; 715 sb->minor_version = mddev->minor_version; 716 sb->patch_version = mddev->patch_version; 717 sb->gvalid_words = 0; /* ignored */ 718 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 719 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 720 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 721 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 722 723 sb->ctime = mddev->ctime; 724 sb->level = mddev->level; 725 sb->size = mddev->size; 726 sb->raid_disks = mddev->raid_disks; 727 sb->md_minor = mddev->md_minor; 728 sb->not_persistent = !mddev->persistent; 729 sb->utime = mddev->utime; 730 sb->state = 0; 731 sb->events_hi = (mddev->events>>32); 732 sb->events_lo = (u32)mddev->events; 733 734 if (mddev->in_sync) 735 { 736 sb->recovery_cp = mddev->recovery_cp; 737 sb->cp_events_hi = (mddev->events>>32); 738 sb->cp_events_lo = (u32)mddev->events; 739 if (mddev->recovery_cp == MaxSector) 740 sb->state = (1<< MD_SB_CLEAN); 741 } else 742 sb->recovery_cp = 0; 743 744 sb->layout = mddev->layout; 745 sb->chunk_size = mddev->chunk_size; 746 747 if (mddev->bitmap && mddev->bitmap_file == NULL) 748 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 749 750 sb->disks[0].state = (1<<MD_DISK_REMOVED); 751 ITERATE_RDEV(mddev,rdev2,tmp) { 752 mdp_disk_t *d; 753 if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty) 754 rdev2->desc_nr = rdev2->raid_disk; 755 else 756 rdev2->desc_nr = next_spare++; 757 d = &sb->disks[rdev2->desc_nr]; 758 nr_disks++; 759 d->number = rdev2->desc_nr; 760 d->major = MAJOR(rdev2->bdev->bd_dev); 761 d->minor = MINOR(rdev2->bdev->bd_dev); 762 if (rdev2->raid_disk >= 0 && rdev->in_sync && !rdev2->faulty) 763 d->raid_disk = rdev2->raid_disk; 764 else 765 d->raid_disk = rdev2->desc_nr; /* compatibility */ 766 if (rdev2->faulty) { 767 d->state = (1<<MD_DISK_FAULTY); 768 failed++; 769 } else if (rdev2->in_sync) { 770 d->state = (1<<MD_DISK_ACTIVE); 771 d->state |= (1<<MD_DISK_SYNC); 772 active++; 773 working++; 774 } else { 775 d->state = 0; 776 spare++; 777 working++; 778 } 779 } 780 781 /* now set the "removed" and "faulty" bits on any missing devices */ 782 for (i=0 ; i < mddev->raid_disks ; i++) { 783 mdp_disk_t *d = &sb->disks[i]; 784 if (d->state == 0 && d->number == 0) { 785 d->number = i; 786 d->raid_disk = i; 787 d->state = (1<<MD_DISK_REMOVED); 788 d->state |= (1<<MD_DISK_FAULTY); 789 failed++; 790 } 791 } 792 sb->nr_disks = nr_disks; 793 sb->active_disks = active; 794 sb->working_disks = working; 795 sb->failed_disks = failed; 796 sb->spare_disks = spare; 797 798 sb->this_disk = sb->disks[rdev->desc_nr]; 799 sb->sb_csum = calc_sb_csum(sb); 800 } 801 802 /* 803 * version 1 superblock 804 */ 805 806 static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) 807 { 808 unsigned int disk_csum, csum; 809 unsigned long long newcsum; 810 int size = 256 + le32_to_cpu(sb->max_dev)*2; 811 unsigned int *isuper = (unsigned int*)sb; 812 int i; 813 814 disk_csum = sb->sb_csum; 815 sb->sb_csum = 0; 816 newcsum = 0; 817 for (i=0; size>=4; size -= 4 ) 818 newcsum += le32_to_cpu(*isuper++); 819 820 if (size == 2) 821 newcsum += le16_to_cpu(*(unsigned short*) isuper); 822 823 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 824 sb->sb_csum = disk_csum; 825 return cpu_to_le32(csum); 826 } 827 828 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 829 { 
830 struct mdp_superblock_1 *sb; 831 int ret; 832 sector_t sb_offset; 833 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 834 835 /* 836 * Calculate the position of the superblock. 837 * It is always aligned to a 4K boundary and 838 * depeding on minor_version, it can be: 839 * 0: At least 8K, but less than 12K, from end of device 840 * 1: At start of device 841 * 2: 4K from start of device. 842 */ 843 switch(minor_version) { 844 case 0: 845 sb_offset = rdev->bdev->bd_inode->i_size >> 9; 846 sb_offset -= 8*2; 847 sb_offset &= ~(sector_t)(4*2-1); 848 /* convert from sectors to K */ 849 sb_offset /= 2; 850 break; 851 case 1: 852 sb_offset = 0; 853 break; 854 case 2: 855 sb_offset = 4; 856 break; 857 default: 858 return -EINVAL; 859 } 860 rdev->sb_offset = sb_offset; 861 862 ret = read_disk_sb(rdev); 863 if (ret) return ret; 864 865 866 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 867 868 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 869 sb->major_version != cpu_to_le32(1) || 870 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 871 le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || 872 sb->feature_map != 0) 873 return -EINVAL; 874 875 if (calc_sb_1_csum(sb) != sb->sb_csum) { 876 printk("md: invalid superblock checksum on %s\n", 877 bdevname(rdev->bdev,b)); 878 return -EINVAL; 879 } 880 if (le64_to_cpu(sb->data_size) < 10) { 881 printk("md: data_size too small on %s\n", 882 bdevname(rdev->bdev,b)); 883 return -EINVAL; 884 } 885 rdev->preferred_minor = 0xffff; 886 rdev->data_offset = le64_to_cpu(sb->data_offset); 887 888 if (refdev == 0) 889 return 1; 890 else { 891 __u64 ev1, ev2; 892 struct mdp_superblock_1 *refsb = 893 (struct mdp_superblock_1*)page_address(refdev->sb_page); 894 895 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 896 sb->level != refsb->level || 897 sb->layout != refsb->layout || 898 sb->chunksize != refsb->chunksize) { 899 printk(KERN_WARNING "md: %s has strangely different" 900 " superblock to %s\n", 901 bdevname(rdev->bdev,b), 902 bdevname(refdev->bdev,b2)); 903 return -EINVAL; 904 } 905 ev1 = le64_to_cpu(sb->events); 906 ev2 = le64_to_cpu(refsb->events); 907 908 if (ev1 > ev2) 909 return 1; 910 } 911 if (minor_version) 912 rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2; 913 else 914 rdev->size = rdev->sb_offset; 915 if (rdev->size < le64_to_cpu(sb->data_size)/2) 916 return -EINVAL; 917 rdev->size = le64_to_cpu(sb->data_size)/2; 918 if (le32_to_cpu(sb->chunksize)) 919 rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); 920 return 0; 921 } 922 923 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) 924 { 925 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 926 927 rdev->raid_disk = -1; 928 rdev->in_sync = 0; 929 if (mddev->raid_disks == 0) { 930 mddev->major_version = 1; 931 mddev->patch_version = 0; 932 mddev->persistent = 1; 933 mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9; 934 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); 935 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); 936 mddev->level = le32_to_cpu(sb->level); 937 mddev->layout = le32_to_cpu(sb->layout); 938 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 939 mddev->size = le64_to_cpu(sb->size)/2; 940 mddev->events = le64_to_cpu(sb->events); 941 942 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 943 memcpy(mddev->uuid, sb->set_uuid, 16); 944 945 mddev->max_disks = (4096-256)/2; 946 947 if ((le32_to_cpu(sb->feature_map) & 1) && 948 mddev->bitmap_file == NULL ) { 949 if 
(mddev->level != 1) { 950 printk(KERN_WARNING "md: bitmaps only supported for raid1\n"); 951 return -EINVAL; 952 } 953 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); 954 } 955 } else if (mddev->pers == NULL) { 956 /* Insist of good event counter while assembling */ 957 __u64 ev1 = le64_to_cpu(sb->events); 958 ++ev1; 959 if (ev1 < mddev->events) 960 return -EINVAL; 961 } else if (mddev->bitmap) { 962 /* If adding to array with a bitmap, then we can accept an 963 * older device, but not too old. 964 */ 965 __u64 ev1 = le64_to_cpu(sb->events); 966 if (ev1 < mddev->bitmap->events_cleared) 967 return 0; 968 } else /* just a hot-add of a new device, leave raid_disk at -1 */ 969 return 0; 970 971 if (mddev->level != LEVEL_MULTIPATH) { 972 int role; 973 rdev->desc_nr = le32_to_cpu(sb->dev_number); 974 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 975 switch(role) { 976 case 0xffff: /* spare */ 977 rdev->faulty = 0; 978 break; 979 case 0xfffe: /* faulty */ 980 rdev->faulty = 1; 981 break; 982 default: 983 rdev->in_sync = 1; 984 rdev->faulty = 0; 985 rdev->raid_disk = role; 986 break; 987 } 988 } else /* MULTIPATH are always insync */ 989 rdev->in_sync = 1; 990 991 return 0; 992 } 993 994 static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) 995 { 996 struct mdp_superblock_1 *sb; 997 struct list_head *tmp; 998 mdk_rdev_t *rdev2; 999 int max_dev, i; 1000 /* make rdev->sb match mddev and rdev data. */ 1001 1002 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1003 1004 sb->feature_map = 0; 1005 sb->pad0 = 0; 1006 memset(sb->pad1, 0, sizeof(sb->pad1)); 1007 memset(sb->pad2, 0, sizeof(sb->pad2)); 1008 memset(sb->pad3, 0, sizeof(sb->pad3)); 1009 1010 sb->utime = cpu_to_le64((__u64)mddev->utime); 1011 sb->events = cpu_to_le64(mddev->events); 1012 if (mddev->in_sync) 1013 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 1014 else 1015 sb->resync_offset = cpu_to_le64(0); 1016 1017 if (mddev->bitmap && mddev->bitmap_file == NULL) { 1018 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1019 sb->feature_map = cpu_to_le32(1); 1020 } 1021 1022 max_dev = 0; 1023 ITERATE_RDEV(mddev,rdev2,tmp) 1024 if (rdev2->desc_nr+1 > max_dev) 1025 max_dev = rdev2->desc_nr+1; 1026 1027 sb->max_dev = cpu_to_le32(max_dev); 1028 for (i=0; i<max_dev;i++) 1029 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1030 1031 ITERATE_RDEV(mddev,rdev2,tmp) { 1032 i = rdev2->desc_nr; 1033 if (rdev2->faulty) 1034 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1035 else if (rdev2->in_sync) 1036 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1037 else 1038 sb->dev_roles[i] = cpu_to_le16(0xffff); 1039 } 1040 1041 sb->recovery_offset = cpu_to_le64(0); /* not supported yet */ 1042 sb->sb_csum = calc_sb_1_csum(sb); 1043 } 1044 1045 1046 static struct super_type super_types[] = { 1047 [0] = { 1048 .name = "0.90.0", 1049 .owner = THIS_MODULE, 1050 .load_super = super_90_load, 1051 .validate_super = super_90_validate, 1052 .sync_super = super_90_sync, 1053 }, 1054 [1] = { 1055 .name = "md-1", 1056 .owner = THIS_MODULE, 1057 .load_super = super_1_load, 1058 .validate_super = super_1_validate, 1059 .sync_super = super_1_sync, 1060 }, 1061 }; 1062 1063 static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev) 1064 { 1065 struct list_head *tmp; 1066 mdk_rdev_t *rdev; 1067 1068 ITERATE_RDEV(mddev,rdev,tmp) 1069 if (rdev->bdev->bd_contains == dev->bdev->bd_contains) 1070 return rdev; 1071 1072 return NULL; 1073 } 1074 1075 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) 1076 { 1077 struct 
list_head *tmp; 1078 mdk_rdev_t *rdev; 1079 1080 ITERATE_RDEV(mddev1,rdev,tmp) 1081 if (match_dev_unit(mddev2, rdev)) 1082 return 1; 1083 1084 return 0; 1085 } 1086 1087 static LIST_HEAD(pending_raid_disks); 1088 1089 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) 1090 { 1091 mdk_rdev_t *same_pdev; 1092 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1093 1094 if (rdev->mddev) { 1095 MD_BUG(); 1096 return -EINVAL; 1097 } 1098 same_pdev = match_dev_unit(mddev, rdev); 1099 if (same_pdev) 1100 printk(KERN_WARNING 1101 "%s: WARNING: %s appears to be on the same physical" 1102 " disk as %s. True\n protection against single-disk" 1103 " failure might be compromised.\n", 1104 mdname(mddev), bdevname(rdev->bdev,b), 1105 bdevname(same_pdev->bdev,b2)); 1106 1107 /* Verify rdev->desc_nr is unique. 1108 * If it is -1, assign a free number, else 1109 * check number is not in use 1110 */ 1111 if (rdev->desc_nr < 0) { 1112 int choice = 0; 1113 if (mddev->pers) choice = mddev->raid_disks; 1114 while (find_rdev_nr(mddev, choice)) 1115 choice++; 1116 rdev->desc_nr = choice; 1117 } else { 1118 if (find_rdev_nr(mddev, rdev->desc_nr)) 1119 return -EBUSY; 1120 } 1121 1122 list_add(&rdev->same_set, &mddev->disks); 1123 rdev->mddev = mddev; 1124 printk(KERN_INFO "md: bind<%s>\n", bdevname(rdev->bdev,b)); 1125 return 0; 1126 } 1127 1128 static void unbind_rdev_from_array(mdk_rdev_t * rdev) 1129 { 1130 char b[BDEVNAME_SIZE]; 1131 if (!rdev->mddev) { 1132 MD_BUG(); 1133 return; 1134 } 1135 list_del_init(&rdev->same_set); 1136 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 1137 rdev->mddev = NULL; 1138 } 1139 1140 /* 1141 * prevent the device from being mounted, repartitioned or 1142 * otherwise reused by a RAID array (or any other kernel 1143 * subsystem), by bd_claiming the device. 
1144 */ 1145 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev) 1146 { 1147 int err = 0; 1148 struct block_device *bdev; 1149 char b[BDEVNAME_SIZE]; 1150 1151 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 1152 if (IS_ERR(bdev)) { 1153 printk(KERN_ERR "md: could not open %s.\n", 1154 __bdevname(dev, b)); 1155 return PTR_ERR(bdev); 1156 } 1157 err = bd_claim(bdev, rdev); 1158 if (err) { 1159 printk(KERN_ERR "md: could not bd_claim %s.\n", 1160 bdevname(bdev, b)); 1161 blkdev_put(bdev); 1162 return err; 1163 } 1164 rdev->bdev = bdev; 1165 return err; 1166 } 1167 1168 static void unlock_rdev(mdk_rdev_t *rdev) 1169 { 1170 struct block_device *bdev = rdev->bdev; 1171 rdev->bdev = NULL; 1172 if (!bdev) 1173 MD_BUG(); 1174 bd_release(bdev); 1175 blkdev_put(bdev); 1176 } 1177 1178 void md_autodetect_dev(dev_t dev); 1179 1180 static void export_rdev(mdk_rdev_t * rdev) 1181 { 1182 char b[BDEVNAME_SIZE]; 1183 printk(KERN_INFO "md: export_rdev(%s)\n", 1184 bdevname(rdev->bdev,b)); 1185 if (rdev->mddev) 1186 MD_BUG(); 1187 free_disk_sb(rdev); 1188 list_del_init(&rdev->same_set); 1189 #ifndef MODULE 1190 md_autodetect_dev(rdev->bdev->bd_dev); 1191 #endif 1192 unlock_rdev(rdev); 1193 kfree(rdev); 1194 } 1195 1196 static void kick_rdev_from_array(mdk_rdev_t * rdev) 1197 { 1198 unbind_rdev_from_array(rdev); 1199 export_rdev(rdev); 1200 } 1201 1202 static void export_array(mddev_t *mddev) 1203 { 1204 struct list_head *tmp; 1205 mdk_rdev_t *rdev; 1206 1207 ITERATE_RDEV(mddev,rdev,tmp) { 1208 if (!rdev->mddev) { 1209 MD_BUG(); 1210 continue; 1211 } 1212 kick_rdev_from_array(rdev); 1213 } 1214 if (!list_empty(&mddev->disks)) 1215 MD_BUG(); 1216 mddev->raid_disks = 0; 1217 mddev->major_version = 0; 1218 } 1219 1220 static void print_desc(mdp_disk_t *desc) 1221 { 1222 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, 1223 desc->major,desc->minor,desc->raid_disk,desc->state); 1224 } 1225 1226 static void print_sb(mdp_super_t *sb) 1227 { 1228 int i; 1229 1230 printk(KERN_INFO 1231 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", 1232 sb->major_version, sb->minor_version, sb->patch_version, 1233 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, 1234 sb->ctime); 1235 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", 1236 sb->level, sb->size, sb->nr_disks, sb->raid_disks, 1237 sb->md_minor, sb->layout, sb->chunk_size); 1238 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" 1239 " FD:%d SD:%d CSUM:%08x E:%08lx\n", 1240 sb->utime, sb->state, sb->active_disks, sb->working_disks, 1241 sb->failed_disks, sb->spare_disks, 1242 sb->sb_csum, (unsigned long)sb->events_lo); 1243 1244 printk(KERN_INFO); 1245 for (i = 0; i < MD_SB_DISKS; i++) { 1246 mdp_disk_t *desc; 1247 1248 desc = sb->disks + i; 1249 if (desc->number || desc->major || desc->minor || 1250 desc->raid_disk || (desc->state && (desc->state != 4))) { 1251 printk(" D %2d: ", i); 1252 print_desc(desc); 1253 } 1254 } 1255 printk(KERN_INFO "md: THIS: "); 1256 print_desc(&sb->this_disk); 1257 1258 } 1259 1260 static void print_rdev(mdk_rdev_t *rdev) 1261 { 1262 char b[BDEVNAME_SIZE]; 1263 printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", 1264 bdevname(rdev->bdev,b), (unsigned long long)rdev->size, 1265 rdev->faulty, rdev->in_sync, rdev->desc_nr); 1266 if (rdev->sb_loaded) { 1267 printk(KERN_INFO "md: rdev superblock:\n"); 1268 print_sb((mdp_super_t*)page_address(rdev->sb_page)); 1269 } else 1270 printk(KERN_INFO "md: no rdev superblock!\n"); 1271 } 1272 1273 void md_print_devices(void) 1274 { 1275 struct list_head 
*tmp, *tmp2; 1276 mdk_rdev_t *rdev; 1277 mddev_t *mddev; 1278 char b[BDEVNAME_SIZE]; 1279 1280 printk("\n"); 1281 printk("md: **********************************\n"); 1282 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); 1283 printk("md: **********************************\n"); 1284 ITERATE_MDDEV(mddev,tmp) { 1285 1286 if (mddev->bitmap) 1287 bitmap_print_sb(mddev->bitmap); 1288 else 1289 printk("%s: ", mdname(mddev)); 1290 ITERATE_RDEV(mddev,rdev,tmp2) 1291 printk("<%s>", bdevname(rdev->bdev,b)); 1292 printk("\n"); 1293 1294 ITERATE_RDEV(mddev,rdev,tmp2) 1295 print_rdev(rdev); 1296 } 1297 printk("md: **********************************\n"); 1298 printk("\n"); 1299 } 1300 1301 1302 static void sync_sbs(mddev_t * mddev) 1303 { 1304 mdk_rdev_t *rdev; 1305 struct list_head *tmp; 1306 1307 ITERATE_RDEV(mddev,rdev,tmp) { 1308 super_types[mddev->major_version]. 1309 sync_super(mddev, rdev); 1310 rdev->sb_loaded = 1; 1311 } 1312 } 1313 1314 static void md_update_sb(mddev_t * mddev) 1315 { 1316 int err; 1317 struct list_head *tmp; 1318 mdk_rdev_t *rdev; 1319 int sync_req; 1320 1321 repeat: 1322 spin_lock(&mddev->write_lock); 1323 sync_req = mddev->in_sync; 1324 mddev->utime = get_seconds(); 1325 mddev->events ++; 1326 1327 if (!mddev->events) { 1328 /* 1329 * oops, this 64-bit counter should never wrap. 1330 * Either we are in around ~1 trillion A.C., assuming 1331 * 1 reboot per second, or we have a bug: 1332 */ 1333 MD_BUG(); 1334 mddev->events --; 1335 } 1336 mddev->sb_dirty = 2; 1337 sync_sbs(mddev); 1338 1339 /* 1340 * do not write anything to disk if using 1341 * nonpersistent superblocks 1342 */ 1343 if (!mddev->persistent) { 1344 mddev->sb_dirty = 0; 1345 spin_unlock(&mddev->write_lock); 1346 wake_up(&mddev->sb_wait); 1347 return; 1348 } 1349 spin_unlock(&mddev->write_lock); 1350 1351 dprintk(KERN_INFO 1352 "md: updating %s RAID superblock on device (in sync %d)\n", 1353 mdname(mddev),mddev->in_sync); 1354 1355 err = bitmap_update_sb(mddev->bitmap); 1356 ITERATE_RDEV(mddev,rdev,tmp) { 1357 char b[BDEVNAME_SIZE]; 1358 dprintk(KERN_INFO "md: "); 1359 if (rdev->faulty) 1360 dprintk("(skipping faulty "); 1361 1362 dprintk("%s ", bdevname(rdev->bdev,b)); 1363 if (!rdev->faulty) { 1364 md_super_write(mddev,rdev, 1365 rdev->sb_offset<<1, MD_SB_BYTES, 1366 rdev->sb_page); 1367 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", 1368 bdevname(rdev->bdev,b), 1369 (unsigned long long)rdev->sb_offset); 1370 1371 } else 1372 dprintk(")\n"); 1373 if (mddev->level == LEVEL_MULTIPATH) 1374 /* only need to write one superblock... */ 1375 break; 1376 } 1377 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); 1378 /* if there was a failure, sb_dirty was set to 1, and we re-write super */ 1379 1380 spin_lock(&mddev->write_lock); 1381 if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) { 1382 /* have to write it out again */ 1383 spin_unlock(&mddev->write_lock); 1384 goto repeat; 1385 } 1386 mddev->sb_dirty = 0; 1387 spin_unlock(&mddev->write_lock); 1388 wake_up(&mddev->sb_wait); 1389 1390 } 1391 1392 /* 1393 * Import a device. If 'super_format' >= 0, then sanity check the superblock 1394 * 1395 * mark the device faulty if: 1396 * 1397 * - the device is nonexistent (zero size) 1398 * - the device has no valid superblock 1399 * 1400 * a faulty rdev _never_ has rdev->sb set. 
1401 */ 1402 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) 1403 { 1404 char b[BDEVNAME_SIZE]; 1405 int err; 1406 mdk_rdev_t *rdev; 1407 sector_t size; 1408 1409 rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); 1410 if (!rdev) { 1411 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 1412 return ERR_PTR(-ENOMEM); 1413 } 1414 memset(rdev, 0, sizeof(*rdev)); 1415 1416 if ((err = alloc_disk_sb(rdev))) 1417 goto abort_free; 1418 1419 err = lock_rdev(rdev, newdev); 1420 if (err) 1421 goto abort_free; 1422 1423 rdev->desc_nr = -1; 1424 rdev->faulty = 0; 1425 rdev->in_sync = 0; 1426 rdev->data_offset = 0; 1427 atomic_set(&rdev->nr_pending, 0); 1428 1429 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 1430 if (!size) { 1431 printk(KERN_WARNING 1432 "md: %s has zero or unknown size, marking faulty!\n", 1433 bdevname(rdev->bdev,b)); 1434 err = -EINVAL; 1435 goto abort_free; 1436 } 1437 1438 if (super_format >= 0) { 1439 err = super_types[super_format]. 1440 load_super(rdev, NULL, super_minor); 1441 if (err == -EINVAL) { 1442 printk(KERN_WARNING 1443 "md: %s has invalid sb, not importing!\n", 1444 bdevname(rdev->bdev,b)); 1445 goto abort_free; 1446 } 1447 if (err < 0) { 1448 printk(KERN_WARNING 1449 "md: could not read %s's sb, not importing!\n", 1450 bdevname(rdev->bdev,b)); 1451 goto abort_free; 1452 } 1453 } 1454 INIT_LIST_HEAD(&rdev->same_set); 1455 1456 return rdev; 1457 1458 abort_free: 1459 if (rdev->sb_page) { 1460 if (rdev->bdev) 1461 unlock_rdev(rdev); 1462 free_disk_sb(rdev); 1463 } 1464 kfree(rdev); 1465 return ERR_PTR(err); 1466 } 1467 1468 /* 1469 * Check a full RAID array for plausibility 1470 */ 1471 1472 1473 static void analyze_sbs(mddev_t * mddev) 1474 { 1475 int i; 1476 struct list_head *tmp; 1477 mdk_rdev_t *rdev, *freshest; 1478 char b[BDEVNAME_SIZE]; 1479 1480 freshest = NULL; 1481 ITERATE_RDEV(mddev,rdev,tmp) 1482 switch (super_types[mddev->major_version]. 1483 load_super(rdev, freshest, mddev->minor_version)) { 1484 case 1: 1485 freshest = rdev; 1486 break; 1487 case 0: 1488 break; 1489 default: 1490 printk( KERN_ERR \ 1491 "md: fatal superblock inconsistency in %s" 1492 " -- removing from array\n", 1493 bdevname(rdev->bdev,b)); 1494 kick_rdev_from_array(rdev); 1495 } 1496 1497 1498 super_types[mddev->major_version]. 1499 validate_super(mddev, freshest); 1500 1501 i = 0; 1502 ITERATE_RDEV(mddev,rdev,tmp) { 1503 if (rdev != freshest) 1504 if (super_types[mddev->major_version]. 1505 validate_super(mddev, rdev)) { 1506 printk(KERN_WARNING "md: kicking non-fresh %s" 1507 " from array!\n", 1508 bdevname(rdev->bdev,b)); 1509 kick_rdev_from_array(rdev); 1510 continue; 1511 } 1512 if (mddev->level == LEVEL_MULTIPATH) { 1513 rdev->desc_nr = i++; 1514 rdev->raid_disk = rdev->desc_nr; 1515 rdev->in_sync = 1; 1516 } 1517 } 1518 1519 1520 1521 if (mddev->recovery_cp != MaxSector && 1522 mddev->level >= 1) 1523 printk(KERN_ERR "md: %s: raid array is not clean" 1524 " -- starting background reconstruction\n", 1525 mdname(mddev)); 1526 1527 } 1528 1529 int mdp_major = 0; 1530 1531 static struct kobject *md_probe(dev_t dev, int *part, void *data) 1532 { 1533 static DECLARE_MUTEX(disks_sem); 1534 mddev_t *mddev = mddev_find(dev); 1535 struct gendisk *disk; 1536 int partitioned = (MAJOR(dev) != MD_MAJOR); 1537 int shift = partitioned ? 
MdpMinorShift : 0; 1538 int unit = MINOR(dev) >> shift; 1539 1540 if (!mddev) 1541 return NULL; 1542 1543 down(&disks_sem); 1544 if (mddev->gendisk) { 1545 up(&disks_sem); 1546 mddev_put(mddev); 1547 return NULL; 1548 } 1549 disk = alloc_disk(1 << shift); 1550 if (!disk) { 1551 up(&disks_sem); 1552 mddev_put(mddev); 1553 return NULL; 1554 } 1555 disk->major = MAJOR(dev); 1556 disk->first_minor = unit << shift; 1557 if (partitioned) { 1558 sprintf(disk->disk_name, "md_d%d", unit); 1559 sprintf(disk->devfs_name, "md/d%d", unit); 1560 } else { 1561 sprintf(disk->disk_name, "md%d", unit); 1562 sprintf(disk->devfs_name, "md/%d", unit); 1563 } 1564 disk->fops = &md_fops; 1565 disk->private_data = mddev; 1566 disk->queue = mddev->queue; 1567 add_disk(disk); 1568 mddev->gendisk = disk; 1569 up(&disks_sem); 1570 return NULL; 1571 } 1572 1573 void md_wakeup_thread(mdk_thread_t *thread); 1574 1575 static void md_safemode_timeout(unsigned long data) 1576 { 1577 mddev_t *mddev = (mddev_t *) data; 1578 1579 mddev->safemode = 1; 1580 md_wakeup_thread(mddev->thread); 1581 } 1582 1583 1584 static int do_md_run(mddev_t * mddev) 1585 { 1586 int pnum, err; 1587 int chunk_size; 1588 struct list_head *tmp; 1589 mdk_rdev_t *rdev; 1590 struct gendisk *disk; 1591 char b[BDEVNAME_SIZE]; 1592 1593 if (list_empty(&mddev->disks)) 1594 /* cannot run an array with no devices.. */ 1595 return -EINVAL; 1596 1597 if (mddev->pers) 1598 return -EBUSY; 1599 1600 /* 1601 * Analyze all RAID superblock(s) 1602 */ 1603 if (!mddev->raid_disks) 1604 analyze_sbs(mddev); 1605 1606 chunk_size = mddev->chunk_size; 1607 pnum = level_to_pers(mddev->level); 1608 1609 if ((pnum != MULTIPATH) && (pnum != RAID1)) { 1610 if (!chunk_size) { 1611 /* 1612 * 'default chunksize' in the old md code used to 1613 * be PAGE_SIZE, baaad. 1614 * we abort here to be on the safe side. We don't 1615 * want to continue the bad practice. 1616 */ 1617 printk(KERN_ERR 1618 "no chunksize specified, see 'man raidtab'\n"); 1619 return -EINVAL; 1620 } 1621 if (chunk_size > MAX_CHUNK_SIZE) { 1622 printk(KERN_ERR "too big chunk_size: %d > %d\n", 1623 chunk_size, MAX_CHUNK_SIZE); 1624 return -EINVAL; 1625 } 1626 /* 1627 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE 1628 */ 1629 if ( (1 << ffz(~chunk_size)) != chunk_size) { 1630 printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size); 1631 return -EINVAL; 1632 } 1633 if (chunk_size < PAGE_SIZE) { 1634 printk(KERN_ERR "too small chunk_size: %d < %ld\n", 1635 chunk_size, PAGE_SIZE); 1636 return -EINVAL; 1637 } 1638 1639 /* devices must have minimum size of one chunk */ 1640 ITERATE_RDEV(mddev,rdev,tmp) { 1641 if (rdev->faulty) 1642 continue; 1643 if (rdev->size < chunk_size / 1024) { 1644 printk(KERN_WARNING 1645 "md: Dev %s smaller than chunk_size:" 1646 " %lluk < %dk\n", 1647 bdevname(rdev->bdev,b), 1648 (unsigned long long)rdev->size, 1649 chunk_size / 1024); 1650 return -EINVAL; 1651 } 1652 } 1653 } 1654 1655 #ifdef CONFIG_KMOD 1656 if (!pers[pnum]) 1657 { 1658 request_module("md-personality-%d", pnum); 1659 } 1660 #endif 1661 1662 /* 1663 * Drop all container device buffers, from now on 1664 * the only valid external interface is through the md 1665 * device. 
1666 * Also find largest hardsector size 1667 */ 1668 ITERATE_RDEV(mddev,rdev,tmp) { 1669 if (rdev->faulty) 1670 continue; 1671 sync_blockdev(rdev->bdev); 1672 invalidate_bdev(rdev->bdev, 0); 1673 } 1674 1675 md_probe(mddev->unit, NULL, NULL); 1676 disk = mddev->gendisk; 1677 if (!disk) 1678 return -ENOMEM; 1679 1680 spin_lock(&pers_lock); 1681 if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) { 1682 spin_unlock(&pers_lock); 1683 printk(KERN_WARNING "md: personality %d is not loaded!\n", 1684 pnum); 1685 return -EINVAL; 1686 } 1687 1688 mddev->pers = pers[pnum]; 1689 spin_unlock(&pers_lock); 1690 1691 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ 1692 1693 /* before we start the array running, initialise the bitmap */ 1694 err = bitmap_create(mddev); 1695 if (err) 1696 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 1697 mdname(mddev), err); 1698 else 1699 err = mddev->pers->run(mddev); 1700 if (err) { 1701 printk(KERN_ERR "md: pers->run() failed ...\n"); 1702 module_put(mddev->pers->owner); 1703 mddev->pers = NULL; 1704 bitmap_destroy(mddev); 1705 return err; 1706 } 1707 atomic_set(&mddev->writes_pending,0); 1708 mddev->safemode = 0; 1709 mddev->safemode_timer.function = md_safemode_timeout; 1710 mddev->safemode_timer.data = (unsigned long) mddev; 1711 mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */ 1712 mddev->in_sync = 1; 1713 1714 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 1715 1716 if (mddev->sb_dirty) 1717 md_update_sb(mddev); 1718 1719 set_capacity(disk, mddev->array_size<<1); 1720 1721 /* If we call blk_queue_make_request here, it will 1722 * re-initialise max_sectors etc which may have been 1723 * refined inside -> run. So just set the bits we need to set. 1724 * Most initialisation happended when we called 1725 * blk_queue_make_request(..., md_fail_request) 1726 * earlier. 
1727 */ 1728 mddev->queue->queuedata = mddev; 1729 mddev->queue->make_request_fn = mddev->pers->make_request; 1730 1731 mddev->changed = 1; 1732 return 0; 1733 } 1734 1735 static int restart_array(mddev_t *mddev) 1736 { 1737 struct gendisk *disk = mddev->gendisk; 1738 int err; 1739 1740 /* 1741 * Complain if it has no devices 1742 */ 1743 err = -ENXIO; 1744 if (list_empty(&mddev->disks)) 1745 goto out; 1746 1747 if (mddev->pers) { 1748 err = -EBUSY; 1749 if (!mddev->ro) 1750 goto out; 1751 1752 mddev->safemode = 0; 1753 mddev->ro = 0; 1754 set_disk_ro(disk, 0); 1755 1756 printk(KERN_INFO "md: %s switched to read-write mode.\n", 1757 mdname(mddev)); 1758 /* 1759 * Kick recovery or resync if necessary 1760 */ 1761 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 1762 md_wakeup_thread(mddev->thread); 1763 err = 0; 1764 } else { 1765 printk(KERN_ERR "md: %s has no personality assigned.\n", 1766 mdname(mddev)); 1767 err = -EINVAL; 1768 } 1769 1770 out: 1771 return err; 1772 } 1773 1774 static int do_md_stop(mddev_t * mddev, int ro) 1775 { 1776 int err = 0; 1777 struct gendisk *disk = mddev->gendisk; 1778 1779 if (mddev->pers) { 1780 if (atomic_read(&mddev->active)>2) { 1781 printk("md: %s still in use.\n",mdname(mddev)); 1782 return -EBUSY; 1783 } 1784 1785 if (mddev->sync_thread) { 1786 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1787 md_unregister_thread(mddev->sync_thread); 1788 mddev->sync_thread = NULL; 1789 } 1790 1791 del_timer_sync(&mddev->safemode_timer); 1792 1793 invalidate_partition(disk, 0); 1794 1795 if (ro) { 1796 err = -ENXIO; 1797 if (mddev->ro) 1798 goto out; 1799 mddev->ro = 1; 1800 } else { 1801 if (mddev->ro) 1802 set_disk_ro(disk, 0); 1803 blk_queue_make_request(mddev->queue, md_fail_request); 1804 mddev->pers->stop(mddev); 1805 module_put(mddev->pers->owner); 1806 mddev->pers = NULL; 1807 if (mddev->ro) 1808 mddev->ro = 0; 1809 } 1810 if (!mddev->in_sync) { 1811 /* mark array as shutdown cleanly */ 1812 mddev->in_sync = 1; 1813 md_update_sb(mddev); 1814 } 1815 if (ro) 1816 set_disk_ro(disk, 1); 1817 } 1818 1819 bitmap_destroy(mddev); 1820 if (mddev->bitmap_file) { 1821 atomic_set(&mddev->bitmap_file->f_dentry->d_inode->i_writecount, 1); 1822 fput(mddev->bitmap_file); 1823 mddev->bitmap_file = NULL; 1824 } 1825 1826 /* 1827 * Free resources if final stop 1828 */ 1829 if (!ro) { 1830 struct gendisk *disk; 1831 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 1832 1833 export_array(mddev); 1834 1835 mddev->array_size = 0; 1836 disk = mddev->gendisk; 1837 if (disk) 1838 set_capacity(disk, 0); 1839 mddev->changed = 1; 1840 } else 1841 printk(KERN_INFO "md: %s switched to read-only mode.\n", 1842 mdname(mddev)); 1843 err = 0; 1844 out: 1845 return err; 1846 } 1847 1848 static void autorun_array(mddev_t *mddev) 1849 { 1850 mdk_rdev_t *rdev; 1851 struct list_head *tmp; 1852 int err; 1853 1854 if (list_empty(&mddev->disks)) 1855 return; 1856 1857 printk(KERN_INFO "md: running: "); 1858 1859 ITERATE_RDEV(mddev,rdev,tmp) { 1860 char b[BDEVNAME_SIZE]; 1861 printk("<%s>", bdevname(rdev->bdev,b)); 1862 } 1863 printk("\n"); 1864 1865 err = do_md_run (mddev); 1866 if (err) { 1867 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 1868 do_md_stop (mddev, 0); 1869 } 1870 } 1871 1872 /* 1873 * lets try to run arrays based on all disks that have arrived 1874 * until now. 
(those are in pending_raid_disks) 1875 * 1876 * the method: pick the first pending disk, collect all disks with 1877 * the same UUID, remove all from the pending list and put them into 1878 * the 'same_array' list. Then order this list based on superblock 1879 * update time (freshest comes first), kick out 'old' disks and 1880 * compare superblocks. If everything's fine then run it. 1881 * 1882 * If "unit" is allocated, then bump its reference count 1883 */ 1884 static void autorun_devices(int part) 1885 { 1886 struct list_head candidates; 1887 struct list_head *tmp; 1888 mdk_rdev_t *rdev0, *rdev; 1889 mddev_t *mddev; 1890 char b[BDEVNAME_SIZE]; 1891 1892 printk(KERN_INFO "md: autorun ...\n"); 1893 while (!list_empty(&pending_raid_disks)) { 1894 dev_t dev; 1895 rdev0 = list_entry(pending_raid_disks.next, 1896 mdk_rdev_t, same_set); 1897 1898 printk(KERN_INFO "md: considering %s ...\n", 1899 bdevname(rdev0->bdev,b)); 1900 INIT_LIST_HEAD(&candidates); 1901 ITERATE_RDEV_PENDING(rdev,tmp) 1902 if (super_90_load(rdev, rdev0, 0) >= 0) { 1903 printk(KERN_INFO "md: adding %s ...\n", 1904 bdevname(rdev->bdev,b)); 1905 list_move(&rdev->same_set, &candidates); 1906 } 1907 /* 1908 * now we have a set of devices, with all of them having 1909 * mostly sane superblocks. It's time to allocate the 1910 * mddev. 1911 */ 1912 if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) { 1913 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 1914 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 1915 break; 1916 } 1917 if (part) 1918 dev = MKDEV(mdp_major, 1919 rdev0->preferred_minor << MdpMinorShift); 1920 else 1921 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 1922 1923 md_probe(dev, NULL, NULL); 1924 mddev = mddev_find(dev); 1925 if (!mddev) { 1926 printk(KERN_ERR 1927 "md: cannot allocate memory for md drive.\n"); 1928 break; 1929 } 1930 if (mddev_lock(mddev)) 1931 printk(KERN_WARNING "md: %s locked, cannot run\n", 1932 mdname(mddev)); 1933 else if (mddev->raid_disks || mddev->major_version 1934 || !list_empty(&mddev->disks)) { 1935 printk(KERN_WARNING 1936 "md: %s already running, cannot run %s\n", 1937 mdname(mddev), bdevname(rdev0->bdev,b)); 1938 mddev_unlock(mddev); 1939 } else { 1940 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 1941 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) { 1942 list_del_init(&rdev->same_set); 1943 if (bind_rdev_to_array(rdev, mddev)) 1944 export_rdev(rdev); 1945 } 1946 autorun_array(mddev); 1947 mddev_unlock(mddev); 1948 } 1949 /* on success, candidates will be empty, on error 1950 * it won't... 1951 */ 1952 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) 1953 export_rdev(rdev); 1954 mddev_put(mddev); 1955 } 1956 printk(KERN_INFO "md: ... autorun DONE.\n"); 1957 } 1958 1959 /* 1960 * import RAID devices based on one partition 1961 * if possible, the array gets run as well. 
1962 */ 1963 1964 static int autostart_array(dev_t startdev) 1965 { 1966 char b[BDEVNAME_SIZE]; 1967 int err = -EINVAL, i; 1968 mdp_super_t *sb = NULL; 1969 mdk_rdev_t *start_rdev = NULL, *rdev; 1970 1971 start_rdev = md_import_device(startdev, 0, 0); 1972 if (IS_ERR(start_rdev)) 1973 return err; 1974 1975 1976 /* NOTE: this can only work for 0.90.0 superblocks */ 1977 sb = (mdp_super_t*)page_address(start_rdev->sb_page); 1978 if (sb->major_version != 0 || 1979 sb->minor_version != 90 ) { 1980 printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n"); 1981 export_rdev(start_rdev); 1982 return err; 1983 } 1984 1985 if (start_rdev->faulty) { 1986 printk(KERN_WARNING 1987 "md: can not autostart based on faulty %s!\n", 1988 bdevname(start_rdev->bdev,b)); 1989 export_rdev(start_rdev); 1990 return err; 1991 } 1992 list_add(&start_rdev->same_set, &pending_raid_disks); 1993 1994 for (i = 0; i < MD_SB_DISKS; i++) { 1995 mdp_disk_t *desc = sb->disks + i; 1996 dev_t dev = MKDEV(desc->major, desc->minor); 1997 1998 if (!dev) 1999 continue; 2000 if (dev == startdev) 2001 continue; 2002 if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor) 2003 continue; 2004 rdev = md_import_device(dev, 0, 0); 2005 if (IS_ERR(rdev)) 2006 continue; 2007 2008 list_add(&rdev->same_set, &pending_raid_disks); 2009 } 2010 2011 /* 2012 * possibly return codes 2013 */ 2014 autorun_devices(0); 2015 return 0; 2016 2017 } 2018 2019 2020 static int get_version(void __user * arg) 2021 { 2022 mdu_version_t ver; 2023 2024 ver.major = MD_MAJOR_VERSION; 2025 ver.minor = MD_MINOR_VERSION; 2026 ver.patchlevel = MD_PATCHLEVEL_VERSION; 2027 2028 if (copy_to_user(arg, &ver, sizeof(ver))) 2029 return -EFAULT; 2030 2031 return 0; 2032 } 2033 2034 static int get_array_info(mddev_t * mddev, void __user * arg) 2035 { 2036 mdu_array_info_t info; 2037 int nr,working,active,failed,spare; 2038 mdk_rdev_t *rdev; 2039 struct list_head *tmp; 2040 2041 nr=working=active=failed=spare=0; 2042 ITERATE_RDEV(mddev,rdev,tmp) { 2043 nr++; 2044 if (rdev->faulty) 2045 failed++; 2046 else { 2047 working++; 2048 if (rdev->in_sync) 2049 active++; 2050 else 2051 spare++; 2052 } 2053 } 2054 2055 info.major_version = mddev->major_version; 2056 info.minor_version = mddev->minor_version; 2057 info.patch_version = MD_PATCHLEVEL_VERSION; 2058 info.ctime = mddev->ctime; 2059 info.level = mddev->level; 2060 info.size = mddev->size; 2061 info.nr_disks = nr; 2062 info.raid_disks = mddev->raid_disks; 2063 info.md_minor = mddev->md_minor; 2064 info.not_persistent= !mddev->persistent; 2065 2066 info.utime = mddev->utime; 2067 info.state = 0; 2068 if (mddev->in_sync) 2069 info.state = (1<<MD_SB_CLEAN); 2070 info.active_disks = active; 2071 info.working_disks = working; 2072 info.failed_disks = failed; 2073 info.spare_disks = spare; 2074 2075 info.layout = mddev->layout; 2076 info.chunk_size = mddev->chunk_size; 2077 2078 if (copy_to_user(arg, &info, sizeof(info))) 2079 return -EFAULT; 2080 2081 return 0; 2082 } 2083 2084 static int get_bitmap_file(mddev_t * mddev, void * arg) 2085 { 2086 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 2087 char *ptr, *buf = NULL; 2088 int err = -ENOMEM; 2089 2090 file = kmalloc(sizeof(*file), GFP_KERNEL); 2091 if (!file) 2092 goto out; 2093 2094 /* bitmap disabled, zero the first byte and copy out */ 2095 if (!mddev->bitmap || !mddev->bitmap->file) { 2096 file->pathname[0] = '\0'; 2097 goto copy_out; 2098 } 2099 2100 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); 2101 if (!buf) 2102 goto out; 2103 2104 ptr = 
file_path(mddev->bitmap->file, buf, sizeof(file->pathname)); 2105 if (!ptr) 2106 goto out; 2107 2108 strcpy(file->pathname, ptr); 2109 2110 copy_out: 2111 err = 0; 2112 if (copy_to_user(arg, file, sizeof(*file))) 2113 err = -EFAULT; 2114 out: 2115 kfree(buf); 2116 kfree(file); 2117 return err; 2118 } 2119 2120 static int get_disk_info(mddev_t * mddev, void __user * arg) 2121 { 2122 mdu_disk_info_t info; 2123 unsigned int nr; 2124 mdk_rdev_t *rdev; 2125 2126 if (copy_from_user(&info, arg, sizeof(info))) 2127 return -EFAULT; 2128 2129 nr = info.number; 2130 2131 rdev = find_rdev_nr(mddev, nr); 2132 if (rdev) { 2133 info.major = MAJOR(rdev->bdev->bd_dev); 2134 info.minor = MINOR(rdev->bdev->bd_dev); 2135 info.raid_disk = rdev->raid_disk; 2136 info.state = 0; 2137 if (rdev->faulty) 2138 info.state |= (1<<MD_DISK_FAULTY); 2139 else if (rdev->in_sync) { 2140 info.state |= (1<<MD_DISK_ACTIVE); 2141 info.state |= (1<<MD_DISK_SYNC); 2142 } 2143 } else { 2144 info.major = info.minor = 0; 2145 info.raid_disk = -1; 2146 info.state = (1<<MD_DISK_REMOVED); 2147 } 2148 2149 if (copy_to_user(arg, &info, sizeof(info))) 2150 return -EFAULT; 2151 2152 return 0; 2153 } 2154 2155 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) 2156 { 2157 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 2158 mdk_rdev_t *rdev; 2159 dev_t dev = MKDEV(info->major,info->minor); 2160 2161 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 2162 return -EOVERFLOW; 2163 2164 if (!mddev->raid_disks) { 2165 int err; 2166 /* expecting a device which has a superblock */ 2167 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 2168 if (IS_ERR(rdev)) { 2169 printk(KERN_WARNING 2170 "md: md_import_device returned %ld\n", 2171 PTR_ERR(rdev)); 2172 return PTR_ERR(rdev); 2173 } 2174 if (!list_empty(&mddev->disks)) { 2175 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 2176 mdk_rdev_t, same_set); 2177 int err = super_types[mddev->major_version] 2178 .load_super(rdev, rdev0, mddev->minor_version); 2179 if (err < 0) { 2180 printk(KERN_WARNING 2181 "md: %s has different UUID to %s\n", 2182 bdevname(rdev->bdev,b), 2183 bdevname(rdev0->bdev,b2)); 2184 export_rdev(rdev); 2185 return -EINVAL; 2186 } 2187 } 2188 err = bind_rdev_to_array(rdev, mddev); 2189 if (err) 2190 export_rdev(rdev); 2191 return err; 2192 } 2193 2194 /* 2195 * add_new_disk can be used once the array is assembled 2196 * to add "hot spares". They must already have a superblock 2197 * written 2198 */ 2199 if (mddev->pers) { 2200 int err; 2201 if (!mddev->pers->hot_add_disk) { 2202 printk(KERN_WARNING 2203 "%s: personality does not support diskops!\n", 2204 mdname(mddev)); 2205 return -EINVAL; 2206 } 2207 rdev = md_import_device(dev, mddev->major_version, 2208 mddev->minor_version); 2209 if (IS_ERR(rdev)) { 2210 printk(KERN_WARNING 2211 "md: md_import_device returned %ld\n", 2212 PTR_ERR(rdev)); 2213 return PTR_ERR(rdev); 2214 } 2215 /* set save_raid_disk if appropriate */ 2216 if (!mddev->persistent) { 2217 if (info->state & (1<<MD_DISK_SYNC) && 2218 info->raid_disk < mddev->raid_disks) 2219 rdev->raid_disk = info->raid_disk; 2220 else 2221 rdev->raid_disk = -1; 2222 } else 2223 super_types[mddev->major_version]. 
2224 validate_super(mddev, rdev); 2225 rdev->saved_raid_disk = rdev->raid_disk; 2226 2227 rdev->in_sync = 0; /* just to be sure */ 2228 rdev->raid_disk = -1; 2229 err = bind_rdev_to_array(rdev, mddev); 2230 if (err) 2231 export_rdev(rdev); 2232 2233 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2234 if (mddev->thread) 2235 md_wakeup_thread(mddev->thread); 2236 return err; 2237 } 2238 2239 /* otherwise, add_new_disk is only allowed 2240 * for major_version==0 superblocks 2241 */ 2242 if (mddev->major_version != 0) { 2243 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", 2244 mdname(mddev)); 2245 return -EINVAL; 2246 } 2247 2248 if (!(info->state & (1<<MD_DISK_FAULTY))) { 2249 int err; 2250 rdev = md_import_device (dev, -1, 0); 2251 if (IS_ERR(rdev)) { 2252 printk(KERN_WARNING 2253 "md: error, md_import_device() returned %ld\n", 2254 PTR_ERR(rdev)); 2255 return PTR_ERR(rdev); 2256 } 2257 rdev->desc_nr = info->number; 2258 if (info->raid_disk < mddev->raid_disks) 2259 rdev->raid_disk = info->raid_disk; 2260 else 2261 rdev->raid_disk = -1; 2262 2263 rdev->faulty = 0; 2264 if (rdev->raid_disk < mddev->raid_disks) 2265 rdev->in_sync = (info->state & (1<<MD_DISK_SYNC)); 2266 else 2267 rdev->in_sync = 0; 2268 2269 err = bind_rdev_to_array(rdev, mddev); 2270 if (err) { 2271 export_rdev(rdev); 2272 return err; 2273 } 2274 2275 if (!mddev->persistent) { 2276 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 2277 rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2278 } else 2279 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 2280 rdev->size = calc_dev_size(rdev, mddev->chunk_size); 2281 2282 if (!mddev->size || (mddev->size > rdev->size)) 2283 mddev->size = rdev->size; 2284 } 2285 2286 return 0; 2287 } 2288 2289 static int hot_remove_disk(mddev_t * mddev, dev_t dev) 2290 { 2291 char b[BDEVNAME_SIZE]; 2292 mdk_rdev_t *rdev; 2293 2294 if (!mddev->pers) 2295 return -ENODEV; 2296 2297 rdev = find_rdev(mddev, dev); 2298 if (!rdev) 2299 return -ENXIO; 2300 2301 if (rdev->raid_disk >= 0) 2302 goto busy; 2303 2304 kick_rdev_from_array(rdev); 2305 md_update_sb(mddev); 2306 2307 return 0; 2308 busy: 2309 printk(KERN_WARNING "md: cannot remove active disk %s from %s ... 
\n", 2310 bdevname(rdev->bdev,b), mdname(mddev)); 2311 return -EBUSY; 2312 } 2313 2314 static int hot_add_disk(mddev_t * mddev, dev_t dev) 2315 { 2316 char b[BDEVNAME_SIZE]; 2317 int err; 2318 unsigned int size; 2319 mdk_rdev_t *rdev; 2320 2321 if (!mddev->pers) 2322 return -ENODEV; 2323 2324 if (mddev->major_version != 0) { 2325 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 2326 " version-0 superblocks.\n", 2327 mdname(mddev)); 2328 return -EINVAL; 2329 } 2330 if (!mddev->pers->hot_add_disk) { 2331 printk(KERN_WARNING 2332 "%s: personality does not support diskops!\n", 2333 mdname(mddev)); 2334 return -EINVAL; 2335 } 2336 2337 rdev = md_import_device (dev, -1, 0); 2338 if (IS_ERR(rdev)) { 2339 printk(KERN_WARNING 2340 "md: error, md_import_device() returned %ld\n", 2341 PTR_ERR(rdev)); 2342 return -EINVAL; 2343 } 2344 2345 if (mddev->persistent) 2346 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 2347 else 2348 rdev->sb_offset = 2349 rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2350 2351 size = calc_dev_size(rdev, mddev->chunk_size); 2352 rdev->size = size; 2353 2354 if (size < mddev->size) { 2355 printk(KERN_WARNING 2356 "%s: disk size %llu blocks < array size %llu\n", 2357 mdname(mddev), (unsigned long long)size, 2358 (unsigned long long)mddev->size); 2359 err = -ENOSPC; 2360 goto abort_export; 2361 } 2362 2363 if (rdev->faulty) { 2364 printk(KERN_WARNING 2365 "md: can not hot-add faulty %s disk to %s!\n", 2366 bdevname(rdev->bdev,b), mdname(mddev)); 2367 err = -EINVAL; 2368 goto abort_export; 2369 } 2370 rdev->in_sync = 0; 2371 rdev->desc_nr = -1; 2372 bind_rdev_to_array(rdev, mddev); 2373 2374 /* 2375 * The rest should better be atomic, we can have disk failures 2376 * noticed in interrupt contexts ... 2377 */ 2378 2379 if (rdev->desc_nr == mddev->max_disks) { 2380 printk(KERN_WARNING "%s: can not hot-add to full array!\n", 2381 mdname(mddev)); 2382 err = -EBUSY; 2383 goto abort_unbind_export; 2384 } 2385 2386 rdev->raid_disk = -1; 2387 2388 md_update_sb(mddev); 2389 2390 /* 2391 * Kick recovery, maybe this spare has to be added to the 2392 * array immediately. 
2393 */
2394 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2395 md_wakeup_thread(mddev->thread);
2396
2397 return 0;
2398
2399 abort_unbind_export:
2400 unbind_rdev_from_array(rdev);
2401
2402 abort_export:
2403 export_rdev(rdev);
2404 return err;
2405 }
2406
2407 /* similar to deny_write_access, but accounts for our holding a reference
2408 * to the file ourselves */
2409 static int deny_bitmap_write_access(struct file * file)
2410 {
2411 struct inode *inode = file->f_mapping->host;
2412
2413 spin_lock(&inode->i_lock);
2414 if (atomic_read(&inode->i_writecount) > 1) {
2415 spin_unlock(&inode->i_lock);
2416 return -ETXTBSY;
2417 }
2418 atomic_set(&inode->i_writecount, -1);
2419 spin_unlock(&inode->i_lock);
2420
2421 return 0;
2422 }
2423
2424 static int set_bitmap_file(mddev_t *mddev, int fd)
2425 {
2426 int err;
2427
2428 if (mddev->pers)
2429 return -EBUSY;
2430
2431 mddev->bitmap_file = fget(fd);
2432
2433 if (mddev->bitmap_file == NULL) {
2434 printk(KERN_ERR "%s: error: failed to get bitmap file\n",
2435 mdname(mddev));
2436 return -EBADF;
2437 }
2438
2439 err = deny_bitmap_write_access(mddev->bitmap_file);
2440 if (err) {
2441 printk(KERN_ERR "%s: error: bitmap file is already in use\n",
2442 mdname(mddev));
2443 fput(mddev->bitmap_file);
2444 mddev->bitmap_file = NULL;
2445 } else
2446 mddev->bitmap_offset = 0; /* file overrides offset */
2447 return err;
2448 }
2449
2450 /*
2451 * set_array_info is used in two different ways.
2452 * The original usage is when creating a new array.
2453 * In this usage, raid_disks is > 0 and it together with
2454 * level, size, not_persistent,layout,chunksize determine the
2455 * shape of the array.
2456 * This will always create an array with a type-0.90.0 superblock.
2457 * The newer usage is when assembling an array.
2458 * In this case raid_disks will be 0, and the major_version field is
2459 * used to determine which style super-blocks are to be found on the devices.
2460 * The minor and patch _version numbers are also kept in case the
2461 * super_block handler wishes to interpret them.
2462 */
2463 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
2464 {
2465
2466 if (info->raid_disks == 0) {
2467 /* just setting version number for superblock loading */
2468 if (info->major_version < 0 ||
2469 info->major_version >= sizeof(super_types)/sizeof(super_types[0]) ||
2470 super_types[info->major_version].name == NULL) {
2471 /* maybe try to auto-load a module? */
2472 printk(KERN_INFO
2473 "md: superblock version %d not known\n",
2474 info->major_version);
2475 return -EINVAL;
2476 }
2477 mddev->major_version = info->major_version;
2478 mddev->minor_version = info->minor_version;
2479 mddev->patch_version = info->patch_version;
2480 return 0;
2481 }
2482 mddev->major_version = MD_MAJOR_VERSION;
2483 mddev->minor_version = MD_MINOR_VERSION;
2484 mddev->patch_version = MD_PATCHLEVEL_VERSION;
2485 mddev->ctime = get_seconds();
2486
2487 mddev->level = info->level;
2488 mddev->size = info->size;
2489 mddev->raid_disks = info->raid_disks;
2490 /* don't set md_minor, it is determined by which /dev/md* was
2491 * opened
2492 */
2493 if (info->state & (1<<MD_SB_CLEAN))
2494 mddev->recovery_cp = MaxSector;
2495 else
2496 mddev->recovery_cp = 0;
2497 mddev->persistent = !
info->not_persistent; 2498 2499 mddev->layout = info->layout; 2500 mddev->chunk_size = info->chunk_size; 2501 2502 mddev->max_disks = MD_SB_DISKS; 2503 2504 mddev->sb_dirty = 1; 2505 2506 /* 2507 * Generate a 128 bit UUID 2508 */ 2509 get_random_bytes(mddev->uuid, 16); 2510 2511 return 0; 2512 } 2513 2514 /* 2515 * update_array_info is used to change the configuration of an 2516 * on-line array. 2517 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 2518 * fields in the info are checked against the array. 2519 * Any differences that cannot be handled will cause an error. 2520 * Normally, only one change can be managed at a time. 2521 */ 2522 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) 2523 { 2524 int rv = 0; 2525 int cnt = 0; 2526 2527 if (mddev->major_version != info->major_version || 2528 mddev->minor_version != info->minor_version || 2529 /* mddev->patch_version != info->patch_version || */ 2530 mddev->ctime != info->ctime || 2531 mddev->level != info->level || 2532 /* mddev->layout != info->layout || */ 2533 !mddev->persistent != info->not_persistent|| 2534 mddev->chunk_size != info->chunk_size ) 2535 return -EINVAL; 2536 /* Check there is only one change */ 2537 if (mddev->size != info->size) cnt++; 2538 if (mddev->raid_disks != info->raid_disks) cnt++; 2539 if (mddev->layout != info->layout) cnt++; 2540 if (cnt == 0) return 0; 2541 if (cnt > 1) return -EINVAL; 2542 2543 if (mddev->layout != info->layout) { 2544 /* Change layout 2545 * we don't need to do anything at the md level, the 2546 * personality will take care of it all. 2547 */ 2548 if (mddev->pers->reconfig == NULL) 2549 return -EINVAL; 2550 else 2551 return mddev->pers->reconfig(mddev, info->layout, -1); 2552 } 2553 if (mddev->size != info->size) { 2554 mdk_rdev_t * rdev; 2555 struct list_head *tmp; 2556 if (mddev->pers->resize == NULL) 2557 return -EINVAL; 2558 /* The "size" is the amount of each device that is used. 2559 * This can only make sense for arrays with redundancy. 2560 * linear and raid0 always use whatever space is available 2561 * We can only consider changing the size if no resync 2562 * or reconstruction is happening, and if the new size 2563 * is acceptable. It must fit before the sb_offset or, 2564 * if that is <data_offset, it must fit before the 2565 * size of each device. 2566 * If size is zero, we find the largest size that fits. 
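 *
 * (Note on units in the check that follows: rdev->sb_offset is counted
 *  in 1K blocks, while rdev->data_offset, avail and get_capacity() are
 *  in 512-byte sectors, and info->size is in KB - hence the *2 and /2
 *  conversions.)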
2567 */ 2568 if (mddev->sync_thread) 2569 return -EBUSY; 2570 ITERATE_RDEV(mddev,rdev,tmp) { 2571 sector_t avail; 2572 int fit = (info->size == 0); 2573 if (rdev->sb_offset > rdev->data_offset) 2574 avail = (rdev->sb_offset*2) - rdev->data_offset; 2575 else 2576 avail = get_capacity(rdev->bdev->bd_disk) 2577 - rdev->data_offset; 2578 if (fit && (info->size == 0 || info->size > avail/2)) 2579 info->size = avail/2; 2580 if (avail < ((sector_t)info->size << 1)) 2581 return -ENOSPC; 2582 } 2583 rv = mddev->pers->resize(mddev, (sector_t)info->size *2); 2584 if (!rv) { 2585 struct block_device *bdev; 2586 2587 bdev = bdget_disk(mddev->gendisk, 0); 2588 if (bdev) { 2589 down(&bdev->bd_inode->i_sem); 2590 i_size_write(bdev->bd_inode, mddev->array_size << 10); 2591 up(&bdev->bd_inode->i_sem); 2592 bdput(bdev); 2593 } 2594 } 2595 } 2596 if (mddev->raid_disks != info->raid_disks) { 2597 /* change the number of raid disks */ 2598 if (mddev->pers->reshape == NULL) 2599 return -EINVAL; 2600 if (info->raid_disks <= 0 || 2601 info->raid_disks >= mddev->max_disks) 2602 return -EINVAL; 2603 if (mddev->sync_thread) 2604 return -EBUSY; 2605 rv = mddev->pers->reshape(mddev, info->raid_disks); 2606 if (!rv) { 2607 struct block_device *bdev; 2608 2609 bdev = bdget_disk(mddev->gendisk, 0); 2610 if (bdev) { 2611 down(&bdev->bd_inode->i_sem); 2612 i_size_write(bdev->bd_inode, mddev->array_size << 10); 2613 up(&bdev->bd_inode->i_sem); 2614 bdput(bdev); 2615 } 2616 } 2617 } 2618 md_update_sb(mddev); 2619 return rv; 2620 } 2621 2622 static int set_disk_faulty(mddev_t *mddev, dev_t dev) 2623 { 2624 mdk_rdev_t *rdev; 2625 2626 if (mddev->pers == NULL) 2627 return -ENODEV; 2628 2629 rdev = find_rdev(mddev, dev); 2630 if (!rdev) 2631 return -ENODEV; 2632 2633 md_error(mddev, rdev); 2634 return 0; 2635 } 2636 2637 static int md_ioctl(struct inode *inode, struct file *file, 2638 unsigned int cmd, unsigned long arg) 2639 { 2640 int err = 0; 2641 void __user *argp = (void __user *)arg; 2642 struct hd_geometry __user *loc = argp; 2643 mddev_t *mddev = NULL; 2644 2645 if (!capable(CAP_SYS_ADMIN)) 2646 return -EACCES; 2647 2648 /* 2649 * Commands dealing with the RAID driver but not any 2650 * particular array: 2651 */ 2652 switch (cmd) 2653 { 2654 case RAID_VERSION: 2655 err = get_version(argp); 2656 goto done; 2657 2658 case PRINT_RAID_DEBUG: 2659 err = 0; 2660 md_print_devices(); 2661 goto done; 2662 2663 #ifndef MODULE 2664 case RAID_AUTORUN: 2665 err = 0; 2666 autostart_arrays(arg); 2667 goto done; 2668 #endif 2669 default:; 2670 } 2671 2672 /* 2673 * Commands creating/starting a new array: 2674 */ 2675 2676 mddev = inode->i_bdev->bd_disk->private_data; 2677 2678 if (!mddev) { 2679 BUG(); 2680 goto abort; 2681 } 2682 2683 2684 if (cmd == START_ARRAY) { 2685 /* START_ARRAY doesn't need to lock the array as autostart_array 2686 * does the locking, and it could even be a different array 2687 */ 2688 static int cnt = 3; 2689 if (cnt > 0 ) { 2690 printk(KERN_WARNING 2691 "md: %s(pid %d) used deprecated START_ARRAY ioctl. 
" 2692 "This will not be supported beyond 2.6\n", 2693 current->comm, current->pid); 2694 cnt--; 2695 } 2696 err = autostart_array(new_decode_dev(arg)); 2697 if (err) { 2698 printk(KERN_WARNING "md: autostart failed!\n"); 2699 goto abort; 2700 } 2701 goto done; 2702 } 2703 2704 err = mddev_lock(mddev); 2705 if (err) { 2706 printk(KERN_INFO 2707 "md: ioctl lock interrupted, reason %d, cmd %d\n", 2708 err, cmd); 2709 goto abort; 2710 } 2711 2712 switch (cmd) 2713 { 2714 case SET_ARRAY_INFO: 2715 { 2716 mdu_array_info_t info; 2717 if (!arg) 2718 memset(&info, 0, sizeof(info)); 2719 else if (copy_from_user(&info, argp, sizeof(info))) { 2720 err = -EFAULT; 2721 goto abort_unlock; 2722 } 2723 if (mddev->pers) { 2724 err = update_array_info(mddev, &info); 2725 if (err) { 2726 printk(KERN_WARNING "md: couldn't update" 2727 " array info. %d\n", err); 2728 goto abort_unlock; 2729 } 2730 goto done_unlock; 2731 } 2732 if (!list_empty(&mddev->disks)) { 2733 printk(KERN_WARNING 2734 "md: array %s already has disks!\n", 2735 mdname(mddev)); 2736 err = -EBUSY; 2737 goto abort_unlock; 2738 } 2739 if (mddev->raid_disks) { 2740 printk(KERN_WARNING 2741 "md: array %s already initialised!\n", 2742 mdname(mddev)); 2743 err = -EBUSY; 2744 goto abort_unlock; 2745 } 2746 err = set_array_info(mddev, &info); 2747 if (err) { 2748 printk(KERN_WARNING "md: couldn't set" 2749 " array info. %d\n", err); 2750 goto abort_unlock; 2751 } 2752 } 2753 goto done_unlock; 2754 2755 default:; 2756 } 2757 2758 /* 2759 * Commands querying/configuring an existing array: 2760 */ 2761 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 2762 * RUN_ARRAY, and SET_BITMAP_FILE are allowed */ 2763 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 2764 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) { 2765 err = -ENODEV; 2766 goto abort_unlock; 2767 } 2768 2769 /* 2770 * Commands even a read-only array can execute: 2771 */ 2772 switch (cmd) 2773 { 2774 case GET_ARRAY_INFO: 2775 err = get_array_info(mddev, argp); 2776 goto done_unlock; 2777 2778 case GET_BITMAP_FILE: 2779 err = get_bitmap_file(mddev, (void *)arg); 2780 goto done_unlock; 2781 2782 case GET_DISK_INFO: 2783 err = get_disk_info(mddev, argp); 2784 goto done_unlock; 2785 2786 case RESTART_ARRAY_RW: 2787 err = restart_array(mddev); 2788 goto done_unlock; 2789 2790 case STOP_ARRAY: 2791 err = do_md_stop (mddev, 0); 2792 goto done_unlock; 2793 2794 case STOP_ARRAY_RO: 2795 err = do_md_stop (mddev, 1); 2796 goto done_unlock; 2797 2798 /* 2799 * We have a problem here : there is no easy way to give a CHS 2800 * virtual geometry. We currently pretend that we have a 2 heads 2801 * 4 sectors (with a BIG number of cylinders...). This drives 2802 * dosfs just mad... 
;-) 2803 */ 2804 case HDIO_GETGEO: 2805 if (!loc) { 2806 err = -EINVAL; 2807 goto abort_unlock; 2808 } 2809 err = put_user (2, (char __user *) &loc->heads); 2810 if (err) 2811 goto abort_unlock; 2812 err = put_user (4, (char __user *) &loc->sectors); 2813 if (err) 2814 goto abort_unlock; 2815 err = put_user(get_capacity(mddev->gendisk)/8, 2816 (short __user *) &loc->cylinders); 2817 if (err) 2818 goto abort_unlock; 2819 err = put_user (get_start_sect(inode->i_bdev), 2820 (long __user *) &loc->start); 2821 goto done_unlock; 2822 } 2823 2824 /* 2825 * The remaining ioctls are changing the state of the 2826 * superblock, so we do not allow read-only arrays 2827 * here: 2828 */ 2829 if (mddev->ro) { 2830 err = -EROFS; 2831 goto abort_unlock; 2832 } 2833 2834 switch (cmd) 2835 { 2836 case ADD_NEW_DISK: 2837 { 2838 mdu_disk_info_t info; 2839 if (copy_from_user(&info, argp, sizeof(info))) 2840 err = -EFAULT; 2841 else 2842 err = add_new_disk(mddev, &info); 2843 goto done_unlock; 2844 } 2845 2846 case HOT_REMOVE_DISK: 2847 err = hot_remove_disk(mddev, new_decode_dev(arg)); 2848 goto done_unlock; 2849 2850 case HOT_ADD_DISK: 2851 err = hot_add_disk(mddev, new_decode_dev(arg)); 2852 goto done_unlock; 2853 2854 case SET_DISK_FAULTY: 2855 err = set_disk_faulty(mddev, new_decode_dev(arg)); 2856 goto done_unlock; 2857 2858 case RUN_ARRAY: 2859 err = do_md_run (mddev); 2860 goto done_unlock; 2861 2862 case SET_BITMAP_FILE: 2863 err = set_bitmap_file(mddev, (int)arg); 2864 goto done_unlock; 2865 2866 default: 2867 if (_IOC_TYPE(cmd) == MD_MAJOR) 2868 printk(KERN_WARNING "md: %s(pid %d) used" 2869 " obsolete MD ioctl, upgrade your" 2870 " software to use new ictls.\n", 2871 current->comm, current->pid); 2872 err = -EINVAL; 2873 goto abort_unlock; 2874 } 2875 2876 done_unlock: 2877 abort_unlock: 2878 mddev_unlock(mddev); 2879 2880 return err; 2881 done: 2882 if (err) 2883 MD_BUG(); 2884 abort: 2885 return err; 2886 } 2887 2888 static int md_open(struct inode *inode, struct file *file) 2889 { 2890 /* 2891 * Succeed if we can lock the mddev, which confirms that 2892 * it isn't being stopped right now. 
2893 */ 2894 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 2895 int err; 2896 2897 if ((err = mddev_lock(mddev))) 2898 goto out; 2899 2900 err = 0; 2901 mddev_get(mddev); 2902 mddev_unlock(mddev); 2903 2904 check_disk_change(inode->i_bdev); 2905 out: 2906 return err; 2907 } 2908 2909 static int md_release(struct inode *inode, struct file * file) 2910 { 2911 mddev_t *mddev = inode->i_bdev->bd_disk->private_data; 2912 2913 if (!mddev) 2914 BUG(); 2915 mddev_put(mddev); 2916 2917 return 0; 2918 } 2919 2920 static int md_media_changed(struct gendisk *disk) 2921 { 2922 mddev_t *mddev = disk->private_data; 2923 2924 return mddev->changed; 2925 } 2926 2927 static int md_revalidate(struct gendisk *disk) 2928 { 2929 mddev_t *mddev = disk->private_data; 2930 2931 mddev->changed = 0; 2932 return 0; 2933 } 2934 static struct block_device_operations md_fops = 2935 { 2936 .owner = THIS_MODULE, 2937 .open = md_open, 2938 .release = md_release, 2939 .ioctl = md_ioctl, 2940 .media_changed = md_media_changed, 2941 .revalidate_disk= md_revalidate, 2942 }; 2943 2944 static int md_thread(void * arg) 2945 { 2946 mdk_thread_t *thread = arg; 2947 2948 lock_kernel(); 2949 2950 /* 2951 * Detach thread 2952 */ 2953 2954 daemonize(thread->name, mdname(thread->mddev)); 2955 2956 current->exit_signal = SIGCHLD; 2957 allow_signal(SIGKILL); 2958 thread->tsk = current; 2959 2960 /* 2961 * md_thread is a 'system-thread', it's priority should be very 2962 * high. We avoid resource deadlocks individually in each 2963 * raid personality. (RAID5 does preallocation) We also use RR and 2964 * the very same RT priority as kswapd, thus we will never get 2965 * into a priority inversion deadlock. 2966 * 2967 * we definitely have to have equal or higher priority than 2968 * bdflush, otherwise bdflush will deadlock if there are too 2969 * many dirty RAID5 blocks. 
2970 */ 2971 unlock_kernel(); 2972 2973 complete(thread->event); 2974 while (thread->run) { 2975 void (*run)(mddev_t *); 2976 2977 wait_event_interruptible_timeout(thread->wqueue, 2978 test_bit(THREAD_WAKEUP, &thread->flags), 2979 thread->timeout); 2980 try_to_freeze(); 2981 2982 clear_bit(THREAD_WAKEUP, &thread->flags); 2983 2984 run = thread->run; 2985 if (run) 2986 run(thread->mddev); 2987 2988 if (signal_pending(current)) 2989 flush_signals(current); 2990 } 2991 complete(thread->event); 2992 return 0; 2993 } 2994 2995 void md_wakeup_thread(mdk_thread_t *thread) 2996 { 2997 if (thread) { 2998 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm); 2999 set_bit(THREAD_WAKEUP, &thread->flags); 3000 wake_up(&thread->wqueue); 3001 } 3002 } 3003 3004 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, 3005 const char *name) 3006 { 3007 mdk_thread_t *thread; 3008 int ret; 3009 struct completion event; 3010 3011 thread = (mdk_thread_t *) kmalloc 3012 (sizeof(mdk_thread_t), GFP_KERNEL); 3013 if (!thread) 3014 return NULL; 3015 3016 memset(thread, 0, sizeof(mdk_thread_t)); 3017 init_waitqueue_head(&thread->wqueue); 3018 3019 init_completion(&event); 3020 thread->event = &event; 3021 thread->run = run; 3022 thread->mddev = mddev; 3023 thread->name = name; 3024 thread->timeout = MAX_SCHEDULE_TIMEOUT; 3025 ret = kernel_thread(md_thread, thread, 0); 3026 if (ret < 0) { 3027 kfree(thread); 3028 return NULL; 3029 } 3030 wait_for_completion(&event); 3031 return thread; 3032 } 3033 3034 void md_unregister_thread(mdk_thread_t *thread) 3035 { 3036 struct completion event; 3037 3038 init_completion(&event); 3039 3040 thread->event = &event; 3041 3042 /* As soon as ->run is set to NULL, the task could disappear, 3043 * so we need to hold tasklist_lock until we have sent the signal 3044 */ 3045 dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid); 3046 read_lock(&tasklist_lock); 3047 thread->run = NULL; 3048 send_sig(SIGKILL, thread->tsk, 1); 3049 read_unlock(&tasklist_lock); 3050 wait_for_completion(&event); 3051 kfree(thread); 3052 } 3053 3054 void md_error(mddev_t *mddev, mdk_rdev_t *rdev) 3055 { 3056 if (!mddev) { 3057 MD_BUG(); 3058 return; 3059 } 3060 3061 if (!rdev || rdev->faulty) 3062 return; 3063 /* 3064 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 3065 mdname(mddev), 3066 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), 3067 __builtin_return_address(0),__builtin_return_address(1), 3068 __builtin_return_address(2),__builtin_return_address(3)); 3069 */ 3070 if (!mddev->pers->error_handler) 3071 return; 3072 mddev->pers->error_handler(mddev,rdev); 3073 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3074 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3075 md_wakeup_thread(mddev->thread); 3076 } 3077 3078 /* seq_file implementation /proc/mdstat */ 3079 3080 static void status_unused(struct seq_file *seq) 3081 { 3082 int i = 0; 3083 mdk_rdev_t *rdev; 3084 struct list_head *tmp; 3085 3086 seq_printf(seq, "unused devices: "); 3087 3088 ITERATE_RDEV_PENDING(rdev,tmp) { 3089 char b[BDEVNAME_SIZE]; 3090 i++; 3091 seq_printf(seq, "%s ", 3092 bdevname(rdev->bdev,b)); 3093 } 3094 if (!i) 3095 seq_printf(seq, "<none>"); 3096 3097 seq_printf(seq, "\n"); 3098 } 3099 3100 3101 static void status_resync(struct seq_file *seq, mddev_t * mddev) 3102 { 3103 unsigned long max_blocks, resync, res, dt, db, rt; 3104 3105 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; 3106 3107 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3108 
max_blocks = mddev->resync_max_sectors >> 1; 3109 else 3110 max_blocks = mddev->size; 3111 3112 /* 3113 * Should not happen. 3114 */ 3115 if (!max_blocks) { 3116 MD_BUG(); 3117 return; 3118 } 3119 res = (resync/1024)*1000/(max_blocks/1024 + 1); 3120 { 3121 int i, x = res/50, y = 20-x; 3122 seq_printf(seq, "["); 3123 for (i = 0; i < x; i++) 3124 seq_printf(seq, "="); 3125 seq_printf(seq, ">"); 3126 for (i = 0; i < y; i++) 3127 seq_printf(seq, "."); 3128 seq_printf(seq, "] "); 3129 } 3130 seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)", 3131 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 3132 "resync" : "recovery"), 3133 res/10, res % 10, resync, max_blocks); 3134 3135 /* 3136 * We do not want to overflow, so the order of operands and 3137 * the * 100 / 100 trick are important. We do a +1 to be 3138 * safe against division by zero. We only estimate anyway. 3139 * 3140 * dt: time from mark until now 3141 * db: blocks written from mark until now 3142 * rt: remaining time 3143 */ 3144 dt = ((jiffies - mddev->resync_mark) / HZ); 3145 if (!dt) dt++; 3146 db = resync - (mddev->resync_mark_cnt/2); 3147 rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; 3148 3149 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); 3150 3151 seq_printf(seq, " speed=%ldK/sec", db/dt); 3152 } 3153 3154 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 3155 { 3156 struct list_head *tmp; 3157 loff_t l = *pos; 3158 mddev_t *mddev; 3159 3160 if (l >= 0x10000) 3161 return NULL; 3162 if (!l--) 3163 /* header */ 3164 return (void*)1; 3165 3166 spin_lock(&all_mddevs_lock); 3167 list_for_each(tmp,&all_mddevs) 3168 if (!l--) { 3169 mddev = list_entry(tmp, mddev_t, all_mddevs); 3170 mddev_get(mddev); 3171 spin_unlock(&all_mddevs_lock); 3172 return mddev; 3173 } 3174 spin_unlock(&all_mddevs_lock); 3175 if (!l--) 3176 return (void*)2;/* tail */ 3177 return NULL; 3178 } 3179 3180 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3181 { 3182 struct list_head *tmp; 3183 mddev_t *next_mddev, *mddev = v; 3184 3185 ++*pos; 3186 if (v == (void*)2) 3187 return NULL; 3188 3189 spin_lock(&all_mddevs_lock); 3190 if (v == (void*)1) 3191 tmp = all_mddevs.next; 3192 else 3193 tmp = mddev->all_mddevs.next; 3194 if (tmp != &all_mddevs) 3195 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); 3196 else { 3197 next_mddev = (void*)2; 3198 *pos = 0x10000; 3199 } 3200 spin_unlock(&all_mddevs_lock); 3201 3202 if (v != (void*)1) 3203 mddev_put(mddev); 3204 return next_mddev; 3205 3206 } 3207 3208 static void md_seq_stop(struct seq_file *seq, void *v) 3209 { 3210 mddev_t *mddev = v; 3211 3212 if (mddev && v != (void*)1 && v != (void*)2) 3213 mddev_put(mddev); 3214 } 3215 3216 static int md_seq_show(struct seq_file *seq, void *v) 3217 { 3218 mddev_t *mddev = v; 3219 sector_t size; 3220 struct list_head *tmp2; 3221 mdk_rdev_t *rdev; 3222 int i; 3223 struct bitmap *bitmap; 3224 3225 if (v == (void*)1) { 3226 seq_printf(seq, "Personalities : "); 3227 spin_lock(&pers_lock); 3228 for (i = 0; i < MAX_PERSONALITY; i++) 3229 if (pers[i]) 3230 seq_printf(seq, "[%s] ", pers[i]->name); 3231 3232 spin_unlock(&pers_lock); 3233 seq_printf(seq, "\n"); 3234 return 0; 3235 } 3236 if (v == (void*)2) { 3237 status_unused(seq); 3238 return 0; 3239 } 3240 3241 if (mddev_lock(mddev)!=0) 3242 return -EINTR; 3243 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 3244 seq_printf(seq, "%s : %sactive", mdname(mddev), 3245 mddev->pers ? 
"" : "in"); 3246 if (mddev->pers) { 3247 if (mddev->ro) 3248 seq_printf(seq, " (read-only)"); 3249 seq_printf(seq, " %s", mddev->pers->name); 3250 } 3251 3252 size = 0; 3253 ITERATE_RDEV(mddev,rdev,tmp2) { 3254 char b[BDEVNAME_SIZE]; 3255 seq_printf(seq, " %s[%d]", 3256 bdevname(rdev->bdev,b), rdev->desc_nr); 3257 if (rdev->faulty) { 3258 seq_printf(seq, "(F)"); 3259 continue; 3260 } 3261 size += rdev->size; 3262 } 3263 3264 if (!list_empty(&mddev->disks)) { 3265 if (mddev->pers) 3266 seq_printf(seq, "\n %llu blocks", 3267 (unsigned long long)mddev->array_size); 3268 else 3269 seq_printf(seq, "\n %llu blocks", 3270 (unsigned long long)size); 3271 } 3272 3273 if (mddev->pers) { 3274 mddev->pers->status (seq, mddev); 3275 seq_printf(seq, "\n "); 3276 if (mddev->curr_resync > 2) { 3277 status_resync (seq, mddev); 3278 seq_printf(seq, "\n "); 3279 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 3280 seq_printf(seq, " resync=DELAYED\n "); 3281 } else 3282 seq_printf(seq, "\n "); 3283 3284 if ((bitmap = mddev->bitmap)) { 3285 unsigned long chunk_kb; 3286 unsigned long flags; 3287 spin_lock_irqsave(&bitmap->lock, flags); 3288 chunk_kb = bitmap->chunksize >> 10; 3289 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " 3290 "%lu%s chunk", 3291 bitmap->pages - bitmap->missing_pages, 3292 bitmap->pages, 3293 (bitmap->pages - bitmap->missing_pages) 3294 << (PAGE_SHIFT - 10), 3295 chunk_kb ? chunk_kb : bitmap->chunksize, 3296 chunk_kb ? "KB" : "B"); 3297 if (bitmap->file) { 3298 seq_printf(seq, ", file: "); 3299 seq_path(seq, bitmap->file->f_vfsmnt, 3300 bitmap->file->f_dentry," \t\n"); 3301 } 3302 3303 seq_printf(seq, "\n"); 3304 spin_unlock_irqrestore(&bitmap->lock, flags); 3305 } 3306 3307 seq_printf(seq, "\n"); 3308 } 3309 mddev_unlock(mddev); 3310 3311 return 0; 3312 } 3313 3314 static struct seq_operations md_seq_ops = { 3315 .start = md_seq_start, 3316 .next = md_seq_next, 3317 .stop = md_seq_stop, 3318 .show = md_seq_show, 3319 }; 3320 3321 static int md_seq_open(struct inode *inode, struct file *file) 3322 { 3323 int error; 3324 3325 error = seq_open(file, &md_seq_ops); 3326 return error; 3327 } 3328 3329 static struct file_operations md_seq_fops = { 3330 .open = md_seq_open, 3331 .read = seq_read, 3332 .llseek = seq_lseek, 3333 .release = seq_release, 3334 }; 3335 3336 int register_md_personality(int pnum, mdk_personality_t *p) 3337 { 3338 if (pnum >= MAX_PERSONALITY) { 3339 printk(KERN_ERR 3340 "md: tried to install personality %s as nr %d, but max is %lu\n", 3341 p->name, pnum, MAX_PERSONALITY-1); 3342 return -EINVAL; 3343 } 3344 3345 spin_lock(&pers_lock); 3346 if (pers[pnum]) { 3347 spin_unlock(&pers_lock); 3348 return -EBUSY; 3349 } 3350 3351 pers[pnum] = p; 3352 printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum); 3353 spin_unlock(&pers_lock); 3354 return 0; 3355 } 3356 3357 int unregister_md_personality(int pnum) 3358 { 3359 if (pnum >= MAX_PERSONALITY) 3360 return -EINVAL; 3361 3362 printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name); 3363 spin_lock(&pers_lock); 3364 pers[pnum] = NULL; 3365 spin_unlock(&pers_lock); 3366 return 0; 3367 } 3368 3369 static int is_mddev_idle(mddev_t *mddev) 3370 { 3371 mdk_rdev_t * rdev; 3372 struct list_head *tmp; 3373 int idle; 3374 unsigned long curr_events; 3375 3376 idle = 1; 3377 ITERATE_RDEV(mddev,rdev,tmp) { 3378 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 3379 curr_events = disk_stat_read(disk, read_sectors) + 3380 disk_stat_read(disk, write_sectors) - 3381 
atomic_read(&disk->sync_io);
3382 /* Allow some slack between value of curr_events and last_events,
3383 * as there are some uninteresting races.
3384 * Note: the following is an unsigned comparison.
3385 */
3386 if ((curr_events - rdev->last_events + 32) > 64) {
3387 rdev->last_events = curr_events;
3388 idle = 0;
3389 }
3390 }
3391 return idle;
3392 }
3393
3394 void md_done_sync(mddev_t *mddev, int blocks, int ok)
3395 {
3396 /* another "blocks" (512byte) blocks have been synced */
3397 atomic_sub(blocks, &mddev->recovery_active);
3398 wake_up(&mddev->recovery_wait);
3399 if (!ok) {
3400 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
3401 md_wakeup_thread(mddev->thread);
3402 // stop recovery, signal do_sync ....
3403 }
3404 }
3405
3406
3407 /* md_write_start(mddev, bi)
3408 * If we need to update some array metadata (e.g. 'active' flag
3409 * in superblock) before writing, schedule a superblock update
3410 * and wait for it to complete.
3411 */
3412 void md_write_start(mddev_t *mddev, struct bio *bi)
3413 {
3414 DEFINE_WAIT(w);
3415 if (bio_data_dir(bi) != WRITE)
3416 return;
3417
3418 atomic_inc(&mddev->writes_pending);
3419 if (mddev->in_sync) {
3420 spin_lock(&mddev->write_lock);
3421 if (mddev->in_sync) {
3422 mddev->in_sync = 0;
3423 mddev->sb_dirty = 1;
3424 md_wakeup_thread(mddev->thread);
3425 }
3426 spin_unlock(&mddev->write_lock);
3427 }
3428 wait_event(mddev->sb_wait, mddev->sb_dirty==0);
3429 }
3430
3431 void md_write_end(mddev_t *mddev)
3432 {
3433 if (atomic_dec_and_test(&mddev->writes_pending)) {
3434 if (mddev->safemode == 2)
3435 md_wakeup_thread(mddev->thread);
3436 else
3437 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
3438 }
3439 }
3440
3441 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
3442
3443 #define SYNC_MARKS 10
3444 #define SYNC_MARK_STEP (3*HZ)
3445 static void md_do_sync(mddev_t *mddev)
3446 {
3447 mddev_t *mddev2;
3448 unsigned int currspeed = 0,
3449 window;
3450 sector_t max_sectors,j, io_sectors;
3451 unsigned long mark[SYNC_MARKS];
3452 sector_t mark_cnt[SYNC_MARKS];
3453 int last_mark,m;
3454 struct list_head *tmp;
3455 sector_t last_check;
3456 int skipped = 0;
3457
3458 /* just in case thread restarts... */
3459 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
3460 return;
3461
3462 /* we overload curr_resync somewhat here.
3463 * 0 == not engaged in resync at all
3464 * 2 == checking that there is no conflict with another sync
3465 * 1 == like 2, but have yielded to allow conflicting resync to
3466 * commence
3467 * other == active in resync - this many blocks
3468 *
3469 * Before starting a resync we must have set curr_resync to
3470 * 2, and then checked that every "conflicting" array has curr_resync
3471 * less than ours. When we find one that is the same or higher
3472 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync
3473 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
3474 * This will mean we have to start checking from the beginning again.
3475 * 3476 */ 3477 3478 do { 3479 mddev->curr_resync = 2; 3480 3481 try_again: 3482 if (signal_pending(current)) { 3483 flush_signals(current); 3484 goto skip; 3485 } 3486 ITERATE_MDDEV(mddev2,tmp) { 3487 printk("."); 3488 if (mddev2 == mddev) 3489 continue; 3490 if (mddev2->curr_resync && 3491 match_mddev_units(mddev,mddev2)) { 3492 DEFINE_WAIT(wq); 3493 if (mddev < mddev2 && mddev->curr_resync == 2) { 3494 /* arbitrarily yield */ 3495 mddev->curr_resync = 1; 3496 wake_up(&resync_wait); 3497 } 3498 if (mddev > mddev2 && mddev->curr_resync == 1) 3499 /* no need to wait here, we can wait the next 3500 * time 'round when curr_resync == 2 3501 */ 3502 continue; 3503 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 3504 if (!signal_pending(current) 3505 && mddev2->curr_resync >= mddev->curr_resync) { 3506 printk(KERN_INFO "md: delaying resync of %s" 3507 " until %s has finished resync (they" 3508 " share one or more physical units)\n", 3509 mdname(mddev), mdname(mddev2)); 3510 mddev_put(mddev2); 3511 schedule(); 3512 finish_wait(&resync_wait, &wq); 3513 goto try_again; 3514 } 3515 finish_wait(&resync_wait, &wq); 3516 } 3517 } 3518 } while (mddev->curr_resync < 2); 3519 3520 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3521 /* resync follows the size requested by the personality, 3522 * which defaults to physical size, but can be virtual size 3523 */ 3524 max_sectors = mddev->resync_max_sectors; 3525 else 3526 /* recovery follows the physical size of devices */ 3527 max_sectors = mddev->size << 1; 3528 3529 printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); 3530 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" 3531 " %d KB/sec/disc.\n", sysctl_speed_limit_min); 3532 printk(KERN_INFO "md: using maximum available idle IO bandwith " 3533 "(but not more than %d KB/sec) for reconstruction.\n", 3534 sysctl_speed_limit_max); 3535 3536 is_mddev_idle(mddev); /* this also initializes IO event counters */ 3537 /* we don't use the checkpoint if there's a bitmap */ 3538 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap) 3539 j = mddev->recovery_cp; 3540 else 3541 j = 0; 3542 io_sectors = 0; 3543 for (m = 0; m < SYNC_MARKS; m++) { 3544 mark[m] = jiffies; 3545 mark_cnt[m] = io_sectors; 3546 } 3547 last_mark = 0; 3548 mddev->resync_mark = mark[last_mark]; 3549 mddev->resync_mark_cnt = mark_cnt[last_mark]; 3550 3551 /* 3552 * Tune reconstruction: 3553 */ 3554 window = 32*(PAGE_SIZE/512); 3555 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", 3556 window/2,(unsigned long long) max_sectors/2); 3557 3558 atomic_set(&mddev->recovery_active, 0); 3559 init_waitqueue_head(&mddev->recovery_wait); 3560 last_check = 0; 3561 3562 if (j>2) { 3563 printk(KERN_INFO 3564 "md: resuming recovery of %s from checkpoint.\n", 3565 mdname(mddev)); 3566 mddev->curr_resync = j; 3567 } 3568 3569 while (j < max_sectors) { 3570 sector_t sectors; 3571 3572 skipped = 0; 3573 sectors = mddev->pers->sync_request(mddev, j, &skipped, 3574 currspeed < sysctl_speed_limit_min); 3575 if (sectors == 0) { 3576 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 3577 goto out; 3578 } 3579 3580 if (!skipped) { /* actual IO requested */ 3581 io_sectors += sectors; 3582 atomic_add(sectors, &mddev->recovery_active); 3583 } 3584 3585 j += sectors; 3586 if (j>1) mddev->curr_resync = j; 3587 3588 3589 if (last_check + window > io_sectors || j == max_sectors) 3590 continue; 3591 3592 last_check = io_sectors; 3593 3594 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || 3595 
test_bit(MD_RECOVERY_ERR, &mddev->recovery))
3596 break;
3597
3598 repeat:
3599 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
3600 /* step marks */
3601 int next = (last_mark+1) % SYNC_MARKS;
3602
3603 mddev->resync_mark = mark[next];
3604 mddev->resync_mark_cnt = mark_cnt[next];
3605 mark[next] = jiffies;
3606 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
3607 last_mark = next;
3608 }
3609
3610
3611 if (signal_pending(current)) {
3612 /*
3613 * got a signal, exit.
3614 */
3615 printk(KERN_INFO
3616 "md: md_do_sync() got signal ... exiting\n");
3617 flush_signals(current);
3618 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3619 goto out;
3620 }
3621
3622 /*
3623 * this loop exits only when we are slower than
3624 * the 'hard' speed limit, or the system was IO-idle for
3625 * a jiffy.
3626 * the system might be non-idle CPU-wise, but we only care
3627 * about not overloading the IO subsystem. (things like an
3628 * e2fsck being done on the RAID array should execute fast)
3629 */
3630 mddev->queue->unplug_fn(mddev->queue);
3631 cond_resched();
3632
3633 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
3634 /((jiffies-mddev->resync_mark)/HZ +1) +1;
3635
3636 if (currspeed > sysctl_speed_limit_min) {
3637 if ((currspeed > sysctl_speed_limit_max) ||
3638 !is_mddev_idle(mddev)) {
3639 msleep_interruptible(250);
3640 goto repeat;
3641 }
3642 }
3643 }
3644 printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev));
3645 /*
3646 * this also signals 'finished resyncing' to md_stop
3647 */
3648 out:
3649 mddev->queue->unplug_fn(mddev->queue);
3650
3651 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
3652
3653 /* tell personality that we are finished */
3654 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
3655
3656 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
3657 mddev->curr_resync > 2 &&
3658 mddev->curr_resync >= mddev->recovery_cp) {
3659 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
3660 printk(KERN_INFO
3661 "md: checkpointing recovery of %s.\n",
3662 mdname(mddev));
3663 mddev->recovery_cp = mddev->curr_resync;
3664 } else
3665 mddev->recovery_cp = MaxSector;
3666 }
3667
3668 skip:
3669 mddev->curr_resync = 0;
3670 wake_up(&resync_wait);
3671 set_bit(MD_RECOVERY_DONE, &mddev->recovery);
3672 md_wakeup_thread(mddev->thread);
3673 }
3674
3675
3676 /*
3677 * This routine is regularly called by all per-raid-array threads to
3678 * deal with generic issues like resync and super-block update.
3679 * Raid personalities that don't have a thread (linear/raid0) do not
3680 * need this as they never do any recovery or update the superblock.
3681 *
3682 * It does not do any resync itself, but rather "forks" off other threads
3683 * to do that as needed.
3684 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
3685 * "->recovery" and create a thread at ->sync_thread.
3686 * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
3687 * and wakes up this thread which will reap the thread and finish up.
3688 * This thread also removes any faulty devices (with nr_pending == 0).
3689 *
3690 * The overall approach is:
3691 * 1/ if the superblock needs updating, update it.
3692 * 2/ If a recovery thread is running, don't do anything else.
3693 * 3/ If recovery has finished, clean up, possibly marking spares active.
3694 * 4/ If there are any faulty devices, remove them.
3695 * 5/ If array is degraded, try to add spare devices
3696 * 6/ If array has spares or is not in-sync, start a resync thread.
3697 */
3698 void md_check_recovery(mddev_t *mddev)
3699 {
3700 mdk_rdev_t *rdev;
3701 struct list_head *rtmp;
3702
3703
3704 if (mddev->bitmap)
3705 bitmap_daemon_work(mddev->bitmap);
3706
3707 if (mddev->ro)
3708 return;
3709
3710 if (signal_pending(current)) {
3711 if (mddev->pers->sync_request) {
3712 printk(KERN_INFO "md: %s in immediate safe mode\n",
3713 mdname(mddev));
3714 mddev->safemode = 2;
3715 }
3716 flush_signals(current);
3717 }
3718
3719 if ( ! (
3720 mddev->sb_dirty ||
3721 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
3722 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
3723 (mddev->safemode == 1) ||
3724 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
3725 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
3726 ))
3727 return;
3728
3729 if (mddev_trylock(mddev)==0) {
3730 int spares =0;
3731
3732 spin_lock(&mddev->write_lock);
3733 if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
3734 !mddev->in_sync && mddev->recovery_cp == MaxSector) {
3735 mddev->in_sync = 1;
3736 mddev->sb_dirty = 1;
3737 }
3738 if (mddev->safemode == 1)
3739 mddev->safemode = 0;
3740 spin_unlock(&mddev->write_lock);
3741
3742 if (mddev->sb_dirty)
3743 md_update_sb(mddev);
3744
3745
3746 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
3747 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
3748 /* resync/recovery still happening */
3749 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3750 goto unlock;
3751 }
3752 if (mddev->sync_thread) {
3753 /* resync has finished, collect result */
3754 md_unregister_thread(mddev->sync_thread);
3755 mddev->sync_thread = NULL;
3756 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
3757 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
3758 /* success...*/
3759 /* activate any spares */
3760 mddev->pers->spare_active(mddev);
3761 }
3762 md_update_sb(mddev);
3763
3764 /* if array is no longer degraded, then any saved_raid_disk
3765 * information must be scrapped
3766 */
3767 if (!mddev->degraded)
3768 ITERATE_RDEV(mddev,rdev,rtmp)
3769 rdev->saved_raid_disk = -1;
3770
3771 mddev->recovery = 0;
3772 /* flag recovery needed just to double check */
3773 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3774 goto unlock;
3775 }
3776 if (mddev->recovery)
3777 /* probably just the RECOVERY_NEEDED flag */
3778 mddev->recovery = 0;
3779
3780 /* no recovery is running.
3781 * remove any failed drives, then
3782 * add spares if possible.
3783 * Spares are also removed and re-added, to allow
3784 * the personality to fail the re-add.
3785 */
3786 ITERATE_RDEV(mddev,rdev,rtmp)
3787 if (rdev->raid_disk >= 0 &&
3788 (rdev->faulty || ! rdev->in_sync) &&
3789 atomic_read(&rdev->nr_pending)==0) {
3790 if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0)
3791 rdev->raid_disk = -1;
3792 }
3793
3794 if (mddev->degraded) {
3795 ITERATE_RDEV(mddev,rdev,rtmp)
3796 if (rdev->raid_disk < 0
3797 && !rdev->faulty) {
3798 if (mddev->pers->hot_add_disk(mddev,rdev))
3799 spares++;
3800 else
3801 break;
3802 }
3803 }
3804
3805 if (!spares && (mddev->recovery_cp == MaxSector )) {
3806 /* nothing we can do ... */
3807 goto unlock;
3808 }
3809 if (mddev->pers->sync_request) {
3810 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3811 if (!spares)
3812 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3813 if (spares && mddev->bitmap && !
mddev->bitmap->file) { 3814 /* We are adding a device or devices to an array 3815 * which has the bitmap stored on all devices. 3816 * So make sure all bitmap pages get written 3817 */ 3818 bitmap_write_all(mddev->bitmap); 3819 } 3820 mddev->sync_thread = md_register_thread(md_do_sync, 3821 mddev, 3822 "%s_resync"); 3823 if (!mddev->sync_thread) { 3824 printk(KERN_ERR "%s: could not start resync" 3825 " thread...\n", 3826 mdname(mddev)); 3827 /* leave the spares where they are, it shouldn't hurt */ 3828 mddev->recovery = 0; 3829 } else { 3830 md_wakeup_thread(mddev->sync_thread); 3831 } 3832 } 3833 unlock: 3834 mddev_unlock(mddev); 3835 } 3836 } 3837 3838 static int md_notify_reboot(struct notifier_block *this, 3839 unsigned long code, void *x) 3840 { 3841 struct list_head *tmp; 3842 mddev_t *mddev; 3843 3844 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { 3845 3846 printk(KERN_INFO "md: stopping all md devices.\n"); 3847 3848 ITERATE_MDDEV(mddev,tmp) 3849 if (mddev_trylock(mddev)==0) 3850 do_md_stop (mddev, 1); 3851 /* 3852 * certain more exotic SCSI devices are known to be 3853 * volatile wrt too early system reboots. While the 3854 * right place to handle this issue is the given 3855 * driver, we do want to have a safe RAID driver ... 3856 */ 3857 mdelay(1000*1); 3858 } 3859 return NOTIFY_DONE; 3860 } 3861 3862 static struct notifier_block md_notifier = { 3863 .notifier_call = md_notify_reboot, 3864 .next = NULL, 3865 .priority = INT_MAX, /* before any real devices */ 3866 }; 3867 3868 static void md_geninit(void) 3869 { 3870 struct proc_dir_entry *p; 3871 3872 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 3873 3874 p = create_proc_entry("mdstat", S_IRUGO, NULL); 3875 if (p) 3876 p->proc_fops = &md_seq_fops; 3877 } 3878 3879 static int __init md_init(void) 3880 { 3881 int minor; 3882 3883 printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d," 3884 " MD_SB_DISKS=%d\n", 3885 MD_MAJOR_VERSION, MD_MINOR_VERSION, 3886 MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); 3887 printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR, 3888 BITMAP_MINOR); 3889 3890 if (register_blkdev(MAJOR_NR, "md")) 3891 return -1; 3892 if ((mdp_major=register_blkdev(0, "mdp"))<=0) { 3893 unregister_blkdev(MAJOR_NR, "md"); 3894 return -1; 3895 } 3896 devfs_mk_dir("md"); 3897 blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE, 3898 md_probe, NULL, NULL); 3899 blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE, 3900 md_probe, NULL, NULL); 3901 3902 for (minor=0; minor < MAX_MD_DEVS; ++minor) 3903 devfs_mk_bdev(MKDEV(MAJOR_NR, minor), 3904 S_IFBLK|S_IRUSR|S_IWUSR, 3905 "md/%d", minor); 3906 3907 for (minor=0; minor < MAX_MD_DEVS; ++minor) 3908 devfs_mk_bdev(MKDEV(mdp_major, minor<<MdpMinorShift), 3909 S_IFBLK|S_IRUSR|S_IWUSR, 3910 "md/mdp%d", minor); 3911 3912 3913 register_reboot_notifier(&md_notifier); 3914 raid_table_header = register_sysctl_table(raid_root_table, 1); 3915 3916 md_geninit(); 3917 return (0); 3918 } 3919 3920 3921 #ifndef MODULE 3922 3923 /* 3924 * Searches all registered partitions for autorun RAID arrays 3925 * at boot time. 
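 *
 * (Devices normally end up in detected_devices[] below via
 *  md_autodetect_dev(), which the partition-scanning code is expected
 *  to call for partitions marked with the Linux raid autodetect type.)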
3926 */ 3927 static dev_t detected_devices[128]; 3928 static int dev_cnt; 3929 3930 void md_autodetect_dev(dev_t dev) 3931 { 3932 if (dev_cnt >= 0 && dev_cnt < 127) 3933 detected_devices[dev_cnt++] = dev; 3934 } 3935 3936 3937 static void autostart_arrays(int part) 3938 { 3939 mdk_rdev_t *rdev; 3940 int i; 3941 3942 printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 3943 3944 for (i = 0; i < dev_cnt; i++) { 3945 dev_t dev = detected_devices[i]; 3946 3947 rdev = md_import_device(dev,0, 0); 3948 if (IS_ERR(rdev)) 3949 continue; 3950 3951 if (rdev->faulty) { 3952 MD_BUG(); 3953 continue; 3954 } 3955 list_add(&rdev->same_set, &pending_raid_disks); 3956 } 3957 dev_cnt = 0; 3958 3959 autorun_devices(part); 3960 } 3961 3962 #endif 3963 3964 static __exit void md_exit(void) 3965 { 3966 mddev_t *mddev; 3967 struct list_head *tmp; 3968 int i; 3969 blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS); 3970 blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift); 3971 for (i=0; i < MAX_MD_DEVS; i++) 3972 devfs_remove("md/%d", i); 3973 for (i=0; i < MAX_MD_DEVS; i++) 3974 devfs_remove("md/d%d", i); 3975 3976 devfs_remove("md"); 3977 3978 unregister_blkdev(MAJOR_NR,"md"); 3979 unregister_blkdev(mdp_major, "mdp"); 3980 unregister_reboot_notifier(&md_notifier); 3981 unregister_sysctl_table(raid_table_header); 3982 remove_proc_entry("mdstat", NULL); 3983 ITERATE_MDDEV(mddev,tmp) { 3984 struct gendisk *disk = mddev->gendisk; 3985 if (!disk) 3986 continue; 3987 export_array(mddev); 3988 del_gendisk(disk); 3989 put_disk(disk); 3990 mddev->gendisk = NULL; 3991 mddev_put(mddev); 3992 } 3993 } 3994 3995 module_init(md_init) 3996 module_exit(md_exit) 3997 3998 EXPORT_SYMBOL(register_md_personality); 3999 EXPORT_SYMBOL(unregister_md_personality); 4000 EXPORT_SYMBOL(md_error); 4001 EXPORT_SYMBOL(md_done_sync); 4002 EXPORT_SYMBOL(md_write_start); 4003 EXPORT_SYMBOL(md_write_end); 4004 EXPORT_SYMBOL(md_register_thread); 4005 EXPORT_SYMBOL(md_unregister_thread); 4006 EXPORT_SYMBOL(md_wakeup_thread); 4007 EXPORT_SYMBOL(md_print_devices); 4008 EXPORT_SYMBOL(md_check_recovery); 4009 MODULE_LICENSE("GPL"); 4010
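/*
 * Illustrative sketch only, not part of the driver: a minimal
 * user-space program exercising the query ioctls handled by md_ioctl()
 * above.  It assumes an already-assembled array at /dev/md0, a caller
 * with CAP_SYS_ADMIN, and the mdu_* structures and ioctl numbers from
 * <linux/raid/md_u.h>; error handling is abbreviated.
 */
#if 0
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/raid/md_u.h>

int main(void)
{
	mdu_version_t ver;
	mdu_array_info_t array;
	int fd = open("/dev/md0", O_RDONLY);

	if (fd < 0)
		return 1;

	/* RAID_VERSION is answered by get_version() */
	if (ioctl(fd, RAID_VERSION, &ver) == 0)
		printf("md driver %d.%d.%d\n",
		       ver.major, ver.minor, ver.patchlevel);

	/* GET_ARRAY_INFO is answered by get_array_info() */
	if (ioctl(fd, GET_ARRAY_INFO, &array) == 0)
		printf("level %d: %d raid disks, %d active, %d failed\n",
		       array.level, array.raid_disks,
		       array.active_disks, array.failed_disks);

	close(fd);
	return 0;
}
#endif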