/*
 * Copyright (C) 2010-2011 Neil Brown
 * Copyright (C) 2010-2011 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include <linux/slab.h>

#include "md.h"
#include "raid1.h"
#include "raid5.h"
#include "bitmap.h"

#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "raid"

/*
 * The following flags are used by dm-raid.c to set up the array state.
 * They must be cleared before md_run is called.
 */
#define FirstUse 10		/* rdev flag */

struct raid_dev {
	/*
	 * Two DM devices, one to hold metadata and one to hold the
	 * actual data/parity.  The reason for this is to not confuse
	 * ti->len and give more flexibility in altering size and
	 * characteristics.
	 *
	 * While it is possible for this device to be associated
	 * with a different physical device than the data_dev, it
	 * is intended for it to be the same.
	 *    |--------- Physical Device ---------|
	 *    |- meta_dev -|------ data_dev ------|
	 */
	struct dm_dev *meta_dev;
	struct dm_dev *data_dev;
	struct mdk_rdev_s rdev;
};

/*
 * Flags for rs->print_flags field.
 */
#define DMPF_SYNC              0x1
#define DMPF_NOSYNC            0x2
#define DMPF_REBUILD           0x4
#define DMPF_DAEMON_SLEEP      0x8
#define DMPF_MIN_RECOVERY_RATE 0x10
#define DMPF_MAX_RECOVERY_RATE 0x20
#define DMPF_MAX_WRITE_BEHIND  0x40
#define DMPF_STRIPE_CACHE      0x80
#define DMPF_REGION_SIZE       0x100

struct raid_set {
	struct dm_target *ti;

	uint64_t print_flags;

	struct mddev_s md;
	struct raid_type *raid_type;
	struct dm_target_callbacks callbacks;

	struct raid_dev dev[0];
};
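
/*
 * Note: dev[] is a flexible array member; context_alloc() below sizes the
 * allocation as sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), so the
 * per-device slots live in the same allocation as the raid_set itself.
 */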

/* Supported raid types and properties. */
static struct raid_type {
	const char *name;		/* RAID type name. */
	const char *descr;		/* Descriptor text for logging. */
	const unsigned parity_devs;	/* # of parity devices. */
	const unsigned minimal_devs;	/* minimal # of devices in set. */
	const unsigned level;		/* RAID level. */
	const unsigned algorithm;	/* RAID algorithm. */
} raid_types[] = {
	{"raid1",    "RAID1 (mirroring)",		0, 2, 1, 0 /* NONE */},
	{"raid4",    "RAID4 (dedicated parity disk)",	1, 2, 5, ALGORITHM_PARITY_0},
	{"raid5_la", "RAID5 (left asymmetric)",		1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
	{"raid5_ra", "RAID5 (right asymmetric)",	1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
	{"raid5_ls", "RAID5 (left symmetric)",		1, 2, 5, ALGORITHM_LEFT_SYMMETRIC},
	{"raid5_rs", "RAID5 (right symmetric)",		1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC},
	{"raid6_zr", "RAID6 (zero restart)",		2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART},
	{"raid6_nr", "RAID6 (N restart)",		2, 4, 6, ALGORITHM_ROTATING_N_RESTART},
	{"raid6_nc", "RAID6 (N continue)",		2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
};

static struct raid_type *get_raid_type(char *name)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(raid_types); i++)
		if (!strcmp(raid_types[i].name, name))
			return &raid_types[i];

	return NULL;
}

static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs)
{
	unsigned i;
	struct raid_set *rs;
	sector_t sectors_per_dev;

	if (raid_devs <= raid_type->parity_devs) {
		ti->error = "Insufficient number of devices";
		return ERR_PTR(-EINVAL);
	}

	sectors_per_dev = ti->len;
	if ((raid_type->level > 1) &&
	    sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
		ti->error = "Target length not divisible by number of data devices";
		return ERR_PTR(-EINVAL);
	}

	rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL);
	if (!rs) {
		ti->error = "Cannot allocate raid context";
		return ERR_PTR(-ENOMEM);
	}

	mddev_init(&rs->md);

	rs->ti = ti;
	rs->raid_type = raid_type;
	rs->md.raid_disks = raid_devs;
	rs->md.level = raid_type->level;
	rs->md.new_level = rs->md.level;
	rs->md.dev_sectors = sectors_per_dev;
	rs->md.layout = raid_type->algorithm;
	rs->md.new_layout = rs->md.layout;
	rs->md.delta_disks = 0;
	rs->md.recovery_cp = 0;

	for (i = 0; i < raid_devs; i++)
		md_rdev_init(&rs->dev[i].rdev);

	/*
	 * Remaining items to be initialized by further RAID params:
	 *  rs->md.persistent
	 *  rs->md.external
	 *  rs->md.chunk_sectors
	 *  rs->md.new_chunk_sectors
	 */

	return rs;
}

static void context_free(struct raid_set *rs)
{
	int i;

	for (i = 0; i < rs->md.raid_disks; i++) {
		if (rs->dev[i].meta_dev)
			dm_put_device(rs->ti, rs->dev[i].meta_dev);
		if (rs->dev[i].rdev.sb_page)
			put_page(rs->dev[i].rdev.sb_page);
		rs->dev[i].rdev.sb_page = NULL;
		rs->dev[i].rdev.sb_loaded = 0;
		if (rs->dev[i].data_dev)
			dm_put_device(rs->ti, rs->dev[i].data_dev);
	}

	kfree(rs);
}

/*
 * For every device we have two words
 *  <meta_dev>: meta device name or '-' if missing
 *  <data_dev>: data device name or '-' if missing
 *
 * The following are permitted:
 *    - -
 *    - <data_dev>
 *    <meta_dev> <data_dev>
 *
 * The following is not allowed:
 *    <meta_dev> -
 *
 * This code parses those words.  If there is a failure,
 * the caller must use context_free to unwind the operations.
 */
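/*
 * For example (hypothetical device names), a three-drive set where the
 * first slot has both devices, the second has data only and the third is
 * missing entirely would be given as:
 *
 *    /dev/vg/meta0 /dev/vg/data0 - /dev/vg/data1 - -
 */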
static int dev_parms(struct raid_set *rs, char **argv)
{
	int i;
	int rebuild = 0;
	int metadata_available = 0;
	int ret = 0;

	for (i = 0; i < rs->md.raid_disks; i++, argv += 2) {
		rs->dev[i].rdev.raid_disk = i;

		rs->dev[i].meta_dev = NULL;
		rs->dev[i].data_dev = NULL;

		/*
		 * There are no offsets, since there is a separate device
		 * for data and metadata.
		 */
		rs->dev[i].rdev.data_offset = 0;
		rs->dev[i].rdev.mddev = &rs->md;

		if (strcmp(argv[0], "-")) {
			ret = dm_get_device(rs->ti, argv[0],
					    dm_table_get_mode(rs->ti->table),
					    &rs->dev[i].meta_dev);
			if (ret) {
				rs->ti->error = "RAID metadata device lookup failure";
				return ret;
			}

			rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL);
			if (!rs->dev[i].rdev.sb_page)
				return -ENOMEM;
		}

		if (!strcmp(argv[1], "-")) {
			if (!test_bit(In_sync, &rs->dev[i].rdev.flags) &&
			    (!rs->dev[i].rdev.recovery_offset)) {
				rs->ti->error = "Drive designated for rebuild not specified";
				return -EINVAL;
			}

			if (rs->dev[i].meta_dev) {
				rs->ti->error = "No data device supplied with metadata device";
				return -EINVAL;
			}

			continue;
		}

		ret = dm_get_device(rs->ti, argv[1],
				    dm_table_get_mode(rs->ti->table),
				    &rs->dev[i].data_dev);
		if (ret) {
			rs->ti->error = "RAID device lookup failure";
			return ret;
		}

		if (rs->dev[i].meta_dev) {
			metadata_available = 1;
			rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev;
		}
		rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
		list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
		if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
			rebuild++;
	}

	if (metadata_available) {
		rs->md.external = 0;
		rs->md.persistent = 1;
		rs->md.major_version = 2;
	} else if (rebuild && !rs->md.recovery_cp) {
		/*
		 * Without metadata, we will not be able to tell if the array
		 * is in-sync or not - we must assume it is not.  Therefore,
		 * it is impossible to rebuild a drive.
		 *
		 * Even if there is metadata, the on-disk information may
		 * indicate that the array is not in-sync and it will then
		 * fail at that time.
		 *
		 * User could specify 'nosync' option if desperate.
		 */
		DMERR("Unable to rebuild drive while array is not in-sync");
		rs->ti->error = "Unable to rebuild drive while array is not in-sync";
		return -EINVAL;
	}

	return 0;
}

/*
 * validate_region_size
 * @rs
 * @region_size:  region size in sectors.  If 0, pick a size (4MiB default).
 *
 * Set rs->md.bitmap_info.chunksize (which really refers to 'region size').
 * Ensure that (ti->len/region_size < 2^21) - required by MD bitmap.
 *
 * Returns: 0 on success, -EINVAL on failure.
 */
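/*
 * Worked example (illustrative figures only): a 1 TiB target is
 * 2147483648 sectors, so min_region_size = 2147483648 >> 21 = 1024
 * sectors (512 KiB).  Since that is below the 8192-sector (4 MiB)
 * default, the default wins; only targets larger than 8 TiB push the
 * default region size upwards.
 */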
static int validate_region_size(struct raid_set *rs, unsigned long region_size)
{
	unsigned long min_region_size = rs->ti->len / (1 << 21);

	if (!region_size) {
		/*
		 * Choose a reasonable default.  All figures in sectors.
		 */
		if (min_region_size > (1 << 13)) {
			region_size = min_region_size;
			DMINFO("Choosing default region size of %lu sectors",
			       region_size);
		} else {
			DMINFO("Choosing default region size of 4MiB");
			region_size = 1 << 13; /* sectors */
		}
	} else {
		/*
		 * Validate user-supplied value.
		 */
		if (region_size > rs->ti->len) {
			rs->ti->error = "Supplied region size is too large";
			return -EINVAL;
		}

		if (region_size < min_region_size) {
			DMERR("Supplied region_size (%lu sectors) below minimum (%lu)",
			      region_size, min_region_size);
			rs->ti->error = "Supplied region size is too small";
			return -EINVAL;
		}

		if (!is_power_of_2(region_size)) {
			rs->ti->error = "Region size is not a power of 2";
			return -EINVAL;
		}

		if (region_size < rs->md.chunk_sectors) {
			rs->ti->error = "Region size is smaller than the chunk size";
			return -EINVAL;
		}
	}

	/*
	 * Convert sectors to bytes.
	 */
	rs->md.bitmap_info.chunksize = (region_size << 9);

	return 0;
}

/*
 * Possible arguments are...
 *	<chunk_size> [optional_args]
 *
 * Argument definitions
 *    <chunk_size>			The number of sectors per disk that
 *					will form the "stripe"
 *    [[no]sync]			Force or prevent recovery of the
 *					entire array
 *    [rebuild <idx>]			Rebuild the drive indicated by the index
 *    [daemon_sleep <ms>]		Time between bitmap daemon work to
 *					clear bits
 *    [min_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
 *    [max_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
 *    [write_mostly <idx>]		Indicate a write mostly drive via index
 *    [max_write_behind <sectors>]	See '-write-behind=' (man mdadm)
 *    [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
 *    [region_size <sectors>]		Defines granularity of bitmap
 */
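/*
 * Example mapping (hypothetical devices): a three-drive raid5_ls set with
 * a 64-sector chunk, an explicit 8192-sector region size and superblocks
 * kept on separate metadata volumes:
 *
 *    dmsetup create r5 --table "0 976562176 raid raid5_ls 3 64 \
 *        region_size 8192 3 /dev/vg/meta0 /dev/vg/data0 \
 *        /dev/vg/meta1 /dev/vg/data1 /dev/vg/meta2 /dev/vg/data2"
 */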
static int parse_raid_params(struct raid_set *rs, char **argv,
			     unsigned num_raid_params)
{
	unsigned i, rebuild_cnt = 0;
	unsigned long value, region_size = 0;
	char *key;

	/*
	 * First, parse the in-order required arguments
	 * "chunk_size" is the only argument of this type.
	 */
	if ((strict_strtoul(argv[0], 10, &value) < 0)) {
		rs->ti->error = "Bad chunk size";
		return -EINVAL;
	} else if (rs->raid_type->level == 1) {
		if (value)
			DMERR("Ignoring chunk size parameter for RAID 1");
		value = 0;
	} else if (!is_power_of_2(value)) {
		rs->ti->error = "Chunk size must be a power of 2";
		return -EINVAL;
	} else if (value < 8) {
		rs->ti->error = "Chunk size value is too small";
		return -EINVAL;
	}

	rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
	argv++;
	num_raid_params--;

	/*
	 * We set each individual device as In_sync with a completed
	 * 'recovery_offset'.  If there has been a device failure or
	 * replacement then one of the following cases applies:
	 *
	 *   1) User specifies 'rebuild'.
	 *      - Device is reset when param is read.
	 *   2) A new device is supplied.
	 *      - No matching superblock found, resets device.
	 *   3) Device failure was transient and returns on reload.
	 *      - Failure noticed, resets device for bitmap replay.
	 *   4) Device hadn't completed recovery after previous failure.
	 *      - Superblock is read and overrides recovery_offset.
	 *
	 * What is found in the superblocks of the devices is always
	 * authoritative, unless 'rebuild' or '[no]sync' was specified.
	 */
	for (i = 0; i < rs->md.raid_disks; i++) {
		set_bit(In_sync, &rs->dev[i].rdev.flags);
		rs->dev[i].rdev.recovery_offset = MaxSector;
	}

	/*
	 * Second, parse the unordered optional arguments
	 */
	for (i = 0; i < num_raid_params; i++) {
		if (!strcasecmp(argv[i], "nosync")) {
			rs->md.recovery_cp = MaxSector;
			rs->print_flags |= DMPF_NOSYNC;
			continue;
		}
		if (!strcasecmp(argv[i], "sync")) {
			rs->md.recovery_cp = 0;
			rs->print_flags |= DMPF_SYNC;
			continue;
		}

		/* The rest of the optional arguments come in key/value pairs */
		if ((i + 1) >= num_raid_params) {
			rs->ti->error = "Wrong number of raid parameters given";
			return -EINVAL;
		}

		key = argv[i++];
		if (strict_strtoul(argv[i], 10, &value) < 0) {
			rs->ti->error = "Bad numerical argument given in raid params";
			return -EINVAL;
		}

		if (!strcasecmp(key, "rebuild")) {
			rebuild_cnt++;
			if (((rs->raid_type->level != 1) &&
			     (rebuild_cnt > rs->raid_type->parity_devs)) ||
			    ((rs->raid_type->level == 1) &&
			     (rebuild_cnt > (rs->md.raid_disks - 1)))) {
				rs->ti->error = "Too many rebuild devices specified for given RAID type";
				return -EINVAL;
			}
			if (value >= rs->md.raid_disks) {
				rs->ti->error = "Invalid rebuild index given";
				return -EINVAL;
			}
			clear_bit(In_sync, &rs->dev[value].rdev.flags);
			rs->dev[value].rdev.recovery_offset = 0;
			rs->print_flags |= DMPF_REBUILD;
		} else if (!strcasecmp(key, "write_mostly")) {
			if (rs->raid_type->level != 1) {
				rs->ti->error = "write_mostly option is only valid for RAID1";
				return -EINVAL;
			}
			if (value >= rs->md.raid_disks) {
				rs->ti->error = "Invalid write_mostly drive index given";
				return -EINVAL;
			}
			set_bit(WriteMostly, &rs->dev[value].rdev.flags);
		} else if (!strcasecmp(key, "max_write_behind")) {
			if (rs->raid_type->level != 1) {
				rs->ti->error = "max_write_behind option is only valid for RAID1";
				return -EINVAL;
			}
			rs->print_flags |= DMPF_MAX_WRITE_BEHIND;

			/*
			 * In device-mapper, we specify things in sectors, but
			 * MD records this value in kB
			 */
			value /= 2;
			if (value > COUNTER_MAX) {
				rs->ti->error = "Max write-behind limit out of range";
				return -EINVAL;
			}
			rs->md.bitmap_info.max_write_behind = value;
		} else if (!strcasecmp(key, "daemon_sleep")) {
			rs->print_flags |= DMPF_DAEMON_SLEEP;
			if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
				rs->ti->error = "daemon sleep period out of range";
				return -EINVAL;
			}
			rs->md.bitmap_info.daemon_sleep = value;
		} else if (!strcasecmp(key, "stripe_cache")) {
			rs->print_flags |= DMPF_STRIPE_CACHE;

			/*
			 * In device-mapper, we specify things in sectors, but
			 * MD records this value in kB
			 */
			value /= 2;

			if (rs->raid_type->level < 5) {
				rs->ti->error = "Inappropriate argument: stripe_cache";
				return -EINVAL;
			}
			if (raid5_set_cache_size(&rs->md, (int)value)) {
				rs->ti->error = "Bad stripe_cache size";
				return -EINVAL;
			}
		} else if (!strcasecmp(key, "min_recovery_rate")) {
			rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
			if (value > INT_MAX) {
				rs->ti->error = "min_recovery_rate out of range";
				return -EINVAL;
			}
			rs->md.sync_speed_min = (int)value;
		} else if (!strcasecmp(key, "max_recovery_rate")) {
			rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
			if (value > INT_MAX) {
				rs->ti->error = "max_recovery_rate out of range";
				return -EINVAL;
			}
			rs->md.sync_speed_max = (int)value;
		} else if (!strcasecmp(key, "region_size")) {
			rs->print_flags |= DMPF_REGION_SIZE;
			region_size = value;
		} else {
			DMERR("Unable to parse RAID parameter: %s", key);
			rs->ti->error = "Unable to parse RAID parameters";
			return -EINVAL;
		}
	}

	if (validate_region_size(rs, region_size))
		return -EINVAL;

	if (rs->md.chunk_sectors)
		rs->ti->split_io = rs->md.chunk_sectors;
	else
		rs->ti->split_io = region_size;

	/* Assume there are no metadata devices until the drives are parsed */
	rs->md.persistent = 0;
	rs->md.external = 1;

	return 0;
}
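
/*
 * For instance (illustrative only), a raid1 parameter string of
 *     "0 nosync max_write_behind 256"
 * is num_raid_params == 4: the chunk size of 0 is accepted for RAID1,
 * recovery_cp is forced to MaxSector, and write-behind is capped at
 * 256 sectors (recorded as 128 kB after the sector-to-kB halving above).
 */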

static void do_table_event(struct work_struct *ws)
{
	struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);

	dm_table_event(rs->ti->table);
}

static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
{
	struct raid_set *rs = container_of(cb, struct raid_set, callbacks);

	if (rs->raid_type->level == 1)
		return md_raid1_congested(&rs->md, bits);

	return md_raid5_congested(&rs->md, bits);
}

/*
 * This structure is never routinely used by userspace, unlike md superblocks.
 * Devices with this superblock should only ever be accessed via device-mapper.
 */
#define DM_RAID_MAGIC 0x64526D44
struct dm_raid_superblock {
	__le32 magic;		/* "DmRd" */
	__le32 features;	/* Used to indicate possible future changes */

	__le32 num_devices;	/* Number of devices in this array. (Max 64) */
	__le32 array_position;	/* The position of this drive in the array */

	__le64 events;		/* Incremented by md when superblock updated */
	__le64 failed_devices;	/* Bit field of devices to indicate failures */

	/*
	 * This offset tracks the progress of the repair or replacement of
	 * an individual drive.
	 */
	__le64 disk_recovery_offset;

	/*
	 * This offset tracks the progress of the initial array
	 * synchronisation/parity calculation.
	 */
	__le64 array_resync_offset;

	/*
	 * RAID characteristics
	 */
	__le32 level;
	__le32 layout;
	__le32 stripe_sectors;

	__u8 pad[452];		/* Round struct to 512 bytes. */
				/* Always set to 0 when writing. */
} __packed;
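
/*
 * Size check: the four leading __le32 fields, four __le64 offsets and
 * counters, and three trailing __le32 fields total 60 bytes, so pad[452]
 * rounds the superblock to exactly one 512-byte sector.
 */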

static int read_disk_sb(mdk_rdev_t *rdev, int size)
{
	BUG_ON(!rdev->sb_page);

	if (rdev->sb_loaded)
		return 0;

	if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) {
		DMERR("Failed to read device superblock");
		return -EINVAL;
	}

	rdev->sb_loaded = 1;

	return 0;
}

static void super_sync(mddev_t *mddev, mdk_rdev_t *rdev)
{
	mdk_rdev_t *r, *t;
	uint64_t failed_devices;
	struct dm_raid_superblock *sb;

	sb = page_address(rdev->sb_page);
	failed_devices = le64_to_cpu(sb->failed_devices);

	rdev_for_each(r, t, mddev)
		if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags))
			failed_devices |= (1ULL << r->raid_disk);

	memset(sb, 0, sizeof(*sb));

	sb->magic = cpu_to_le32(DM_RAID_MAGIC);
	sb->features = cpu_to_le32(0);	/* No features yet */

	sb->num_devices = cpu_to_le32(mddev->raid_disks);
	sb->array_position = cpu_to_le32(rdev->raid_disk);

	sb->events = cpu_to_le64(mddev->events);
	sb->failed_devices = cpu_to_le64(failed_devices);

	sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset);
	sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp);

	sb->level = cpu_to_le32(mddev->level);
	sb->layout = cpu_to_le32(mddev->layout);
	sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors);
}

/*
 * super_load
 *
 * This function creates a superblock if one is not found on the device
 * and will decide which superblock to use if there's a choice.
 *
 * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise
 */
static int super_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev)
{
	int ret;
	struct dm_raid_superblock *sb;
	struct dm_raid_superblock *refsb;
	uint64_t events_sb, events_refsb;

	rdev->sb_start = 0;
	rdev->sb_size = sizeof(*sb);

	ret = read_disk_sb(rdev, rdev->sb_size);
	if (ret)
		return ret;

	sb = page_address(rdev->sb_page);
	if (sb->magic != cpu_to_le32(DM_RAID_MAGIC)) {
		super_sync(rdev->mddev, rdev);

		set_bit(FirstUse, &rdev->flags);

		/* Force writing of superblocks to disk */
		set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);

		/* Any superblock is better than none, choose that if given */
		return refdev ? 0 : 1;
	}

	if (!refdev)
		return 1;

	events_sb = le64_to_cpu(sb->events);

	refsb = page_address(refdev->sb_page);
	events_refsb = le64_to_cpu(refsb->events);

	return (events_sb > events_refsb) ? 1 : 0;
}
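
/*
 * e.g. after one leg fails transiently, the surviving legs carry a higher
 * event count, so super_load() keeps a surviving leg as the reference and
 * the stale, returning leg is dealt with by super_init_validation() below.
 */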

static int super_init_validation(mddev_t *mddev, mdk_rdev_t *rdev)
{
	int role;
	struct raid_set *rs = container_of(mddev, struct raid_set, md);
	uint64_t events_sb;
	uint64_t failed_devices;
	struct dm_raid_superblock *sb;
	uint32_t new_devs = 0;
	uint32_t rebuilds = 0;
	mdk_rdev_t *r, *t;
	struct dm_raid_superblock *sb2;

	sb = page_address(rdev->sb_page);
	events_sb = le64_to_cpu(sb->events);
	failed_devices = le64_to_cpu(sb->failed_devices);

	/*
	 * Initialise to 1 if this is a new superblock.
	 */
	mddev->events = events_sb ? : 1;

	/*
	 * Reshaping is not currently allowed
	 */
	if ((le32_to_cpu(sb->level) != mddev->level) ||
	    (le32_to_cpu(sb->layout) != mddev->layout) ||
	    (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) {
		DMERR("Reshaping arrays not yet supported.");
		return -EINVAL;
	}

	/* We can only change the number of devices in RAID1 right now */
	if ((rs->raid_type->level != 1) &&
	    (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
		DMERR("Reshaping arrays not yet supported.");
		return -EINVAL;
	}

	if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)))
		mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);

	/*
	 * During load, we set FirstUse if a new superblock was written.
	 * There are two reasons we might not have a superblock:
	 * 1) The array is brand new - in which case, all of the
	 *    devices must have their In_sync bit set.  Also,
	 *    recovery_cp must be 0, unless forced.
	 * 2) This is a new device being added to an old array
	 *    and the new device needs to be rebuilt - in which
	 *    case the In_sync bit will /not/ be set and
	 *    recovery_cp must be MaxSector.
	 */
	rdev_for_each(r, t, mddev) {
		if (!test_bit(In_sync, &r->flags)) {
			if (!test_bit(FirstUse, &r->flags))
				DMERR("Superblock area of "
				      "rebuild device %d should have been "
				      "cleared.", r->raid_disk);
			set_bit(FirstUse, &r->flags);
			rebuilds++;
		} else if (test_bit(FirstUse, &r->flags))
			new_devs++;
	}
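
	/*
	 * Summary of the classification above (added note):
	 *
	 *   In_sync  FirstUse   counted as
	 *   ------------------------------------------
	 *      0        any     rebuild
	 *      1         1      new device
	 *      1         0      healthy existing device
	 */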

	if (!rebuilds) {
		if (new_devs == mddev->raid_disks) {
			DMINFO("Superblocks created for new array");
			set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
		} else if (new_devs) {
			DMERR("New device injected "
			      "into existing array without 'rebuild' "
			      "parameter specified");
			return -EINVAL;
		}
	} else if (new_devs) {
		DMERR("'rebuild' devices cannot be "
		      "injected into an array with other first-time devices");
		return -EINVAL;
	} else if (mddev->recovery_cp != MaxSector) {
		DMERR("'rebuild' specified while array is not in-sync");
		return -EINVAL;
	}

	/*
	 * Now we set the Faulty bit for those devices that are
	 * recorded in the superblock as failed.
	 */
	rdev_for_each(r, t, mddev) {
		if (!r->sb_page)
			continue;
		sb2 = page_address(r->sb_page);
		sb2->failed_devices = 0;

		/*
		 * Check for any device re-ordering.
		 */
		if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) {
			role = le32_to_cpu(sb2->array_position);
			if (role != r->raid_disk) {
				if (rs->raid_type->level != 1) {
					rs->ti->error = "Cannot change device "
							"positions in RAID array";
					return -EINVAL;
				}
				DMINFO("RAID1 device #%d now at position #%d",
				       role, r->raid_disk);
			}

			/*
			 * Partial recovery is performed on
			 * returning failed devices.
			 */
			if (failed_devices & (1ULL << role))
				set_bit(Faulty, &r->flags);
		}
	}

	return 0;
}

static int super_validate(mddev_t *mddev, mdk_rdev_t *rdev)
{
	struct dm_raid_superblock *sb = page_address(rdev->sb_page);

	/*
	 * If mddev->events is not set, we know we have not yet initialized
	 * the array.
	 */
	if (!mddev->events && super_init_validation(mddev, rdev))
		return -EINVAL;

	mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */
	rdev->mddev->bitmap_info.default_offset = 4096 >> 9;
	if (!test_bit(FirstUse, &rdev->flags)) {
		rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
		if (rdev->recovery_offset != MaxSector)
			clear_bit(In_sync, &rdev->flags);
	}

	/*
	 * If a device comes back, set it as not In_sync and no longer faulty.
	 */
	if (test_bit(Faulty, &rdev->flags)) {
		clear_bit(Faulty, &rdev->flags);
		clear_bit(In_sync, &rdev->flags);
		rdev->saved_raid_disk = rdev->raid_disk;
		rdev->recovery_offset = 0;
	}

	clear_bit(FirstUse, &rdev->flags);

	return 0;
}

/*
 * Analyse superblocks and select the freshest.
 */
static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
{
	int ret;
	mdk_rdev_t *rdev, *freshest, *tmp;
	mddev_t *mddev = &rs->md;

	freshest = NULL;
	rdev_for_each(rdev, tmp, mddev) {
		if (!rdev->meta_bdev)
			continue;

		ret = super_load(rdev, freshest);

		switch (ret) {
		case 1:
			freshest = rdev;
			break;
		case 0:
			break;
		default:
			ti->error = "Failed to load superblock";
			return ret;
		}
	}

	if (!freshest)
		return 0;

	/*
	 * Validation of the freshest device provides the source of
	 * validation for the remaining devices.
	 */
	ti->error = "Unable to assemble array: Invalid superblocks";
	if (super_validate(mddev, freshest))
		return -EINVAL;

	rdev_for_each(rdev, tmp, mddev)
		if ((rdev != freshest) && super_validate(mddev, rdev))
			return -EINVAL;

	return 0;
}

/*
 * Construct a RAID1/4/5/6 mapping:
 * Args:
 *	<raid_type> <#raid_params> <raid_params>		\
 *	<#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> }
 *
 * <raid_params> varies by <raid_type>.  See 'parse_raid_params' for
 * details on possible <raid_params>.
 */
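/*
 * Argument bookkeeping example (hypothetical devices): the table line
 *     "raid1 1 0 2 - /dev/vg/a - /dev/vg/b"
 * is argc == 8: raid_type, #raid_params, one raid param (chunk size 0),
 * #raid_devs, then two <meta_dev> <data_dev> pairs with no metadata
 * devices given.
 */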
static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	int ret;
	struct raid_type *rt;
	unsigned long num_raid_params, num_raid_devs;
	struct raid_set *rs = NULL;

	/* Must have at least <raid_type> <#raid_params> */
	if (argc < 2) {
		ti->error = "Too few arguments";
		return -EINVAL;
	}

	/* raid type */
	rt = get_raid_type(argv[0]);
	if (!rt) {
		ti->error = "Unrecognised raid_type";
		return -EINVAL;
	}
	argc--;
	argv++;

	/* number of RAID parameters */
	if (strict_strtoul(argv[0], 10, &num_raid_params) < 0) {
		ti->error = "Cannot understand number of RAID parameters";
		return -EINVAL;
	}
	argc--;
	argv++;

	/* Skip over RAID params for now and find out # of devices */
	if (num_raid_params + 1 > argc) {
		ti->error = "Arguments do not agree with counts given";
		return -EINVAL;
	}

	if ((strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
	    (num_raid_devs >= INT_MAX)) {
		ti->error = "Cannot understand number of raid devices";
		return -EINVAL;
	}

	rs = context_alloc(ti, rt, (unsigned)num_raid_devs);
	if (IS_ERR(rs))
		return PTR_ERR(rs);

	ret = parse_raid_params(rs, argv, (unsigned)num_raid_params);
	if (ret)
		goto bad;

	ret = -EINVAL;

	argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */
	argv += num_raid_params + 1;

	if (argc != (num_raid_devs * 2)) {
		ti->error = "Number of supplied RAID devices does not match the count given";
		goto bad;
	}

	ret = dev_parms(rs, argv);
	if (ret)
		goto bad;

	rs->md.sync_super = super_sync;
	ret = analyse_superblocks(ti, rs);
	if (ret)
		goto bad;

	INIT_WORK(&rs->md.event_work, do_table_event);
	ti->private = rs;

	mutex_lock(&rs->md.reconfig_mutex);
	ret = md_run(&rs->md);
	rs->md.in_sync = 0; /* Assume already marked dirty */
	mutex_unlock(&rs->md.reconfig_mutex);

	if (ret) {
		ti->error = "Failed to run raid array";
		goto bad;
	}

	rs->callbacks.congested_fn = raid_is_congested;
	dm_table_add_target_callbacks(ti->table, &rs->callbacks);

	mddev_suspend(&rs->md);
	return 0;

bad:
	context_free(rs);

	return ret;
}

static void raid_dtr(struct dm_target *ti)
{
	struct raid_set *rs = ti->private;

	list_del_init(&rs->callbacks.list);
	md_stop(&rs->md);
	context_free(rs);
}

static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_context)
{
	struct raid_set *rs = ti->private;
	mddev_t *mddev = &rs->md;

	mddev->pers->make_request(mddev, bio);

	return DM_MAPIO_SUBMITTED;
}

static int raid_status(struct dm_target *ti, status_type_t type,
		       char *result, unsigned maxlen)
{
	struct raid_set *rs = ti->private;
	unsigned raid_param_cnt = 1; /* at least 1 for chunksize */
	unsigned sz = 0;
	int i;
	sector_t sync;

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);

		for (i = 0; i < rs->md.raid_disks; i++) {
			if (test_bit(Faulty, &rs->dev[i].rdev.flags))
				DMEMIT("D");
			else if (test_bit(In_sync, &rs->dev[i].rdev.flags))
				DMEMIT("A");
			else
				DMEMIT("a");
		}

		if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
			sync = rs->md.curr_resync_completed;
		else
			sync = rs->md.recovery_cp;

		if (sync > rs->md.resync_max_sectors)
			sync = rs->md.resync_max_sectors;

		DMEMIT(" %llu/%llu",
		       (unsigned long long) sync,
		       (unsigned long long) rs->md.resync_max_sectors);

		break;
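
	/*
	 * e.g. (illustrative) a healthy three-drive raid5 that has finished
	 * its initial resync reports:
	 *
	 *     raid5_ls 3 AAA 488281088/488281088
	 */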
	case STATUSTYPE_TABLE:
		/* The string you would use to construct this array */
		for (i = 0; i < rs->md.raid_disks; i++) {
			if ((rs->print_flags & DMPF_REBUILD) &&
			    rs->dev[i].data_dev &&
			    !test_bit(In_sync, &rs->dev[i].rdev.flags))
				raid_param_cnt += 2; /* for rebuilds */
			if (rs->dev[i].data_dev &&
			    test_bit(WriteMostly, &rs->dev[i].rdev.flags))
				raid_param_cnt += 2;
		}

		raid_param_cnt += (hweight64(rs->print_flags & ~DMPF_REBUILD) * 2);
		if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
			raid_param_cnt--;

		DMEMIT("%s %u %u", rs->raid_type->name,
		       raid_param_cnt, rs->md.chunk_sectors);

		if ((rs->print_flags & DMPF_SYNC) &&
		    (rs->md.recovery_cp == MaxSector))
			DMEMIT(" sync");
		if (rs->print_flags & DMPF_NOSYNC)
			DMEMIT(" nosync");

		for (i = 0; i < rs->md.raid_disks; i++)
			if ((rs->print_flags & DMPF_REBUILD) &&
			    rs->dev[i].data_dev &&
			    !test_bit(In_sync, &rs->dev[i].rdev.flags))
				DMEMIT(" rebuild %u", i);

		if (rs->print_flags & DMPF_DAEMON_SLEEP)
			DMEMIT(" daemon_sleep %lu",
			       rs->md.bitmap_info.daemon_sleep);

		if (rs->print_flags & DMPF_MIN_RECOVERY_RATE)
			DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min);

		if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
			DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);

		for (i = 0; i < rs->md.raid_disks; i++)
			if (rs->dev[i].data_dev &&
			    test_bit(WriteMostly, &rs->dev[i].rdev.flags))
				DMEMIT(" write_mostly %u", i);

		if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
			DMEMIT(" max_write_behind %lu",
			       rs->md.bitmap_info.max_write_behind);

		if (rs->print_flags & DMPF_STRIPE_CACHE) {
			raid5_conf_t *conf = rs->md.private;

			/* convert from kiB to sectors */
			DMEMIT(" stripe_cache %d",
			       conf ? conf->max_nr_stripes * 2 : 0);
		}

		if (rs->print_flags & DMPF_REGION_SIZE)
			DMEMIT(" region_size %lu",
			       rs->md.bitmap_info.chunksize >> 9);

		DMEMIT(" %d", rs->md.raid_disks);
		for (i = 0; i < rs->md.raid_disks; i++) {
			if (rs->dev[i].meta_dev)
				DMEMIT(" %s", rs->dev[i].meta_dev->name);
			else
				DMEMIT(" -");

			if (rs->dev[i].data_dev)
				DMEMIT(" %s", rs->dev[i].data_dev->name);
			else
				DMEMIT(" -");
		}
	}

	return 0;
}

static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
{
	struct raid_set *rs = ti->private;
	unsigned i;
	int ret = 0;

	for (i = 0; !ret && i < rs->md.raid_disks; i++)
		if (rs->dev[i].data_dev)
			ret = fn(ti,
				 rs->dev[i].data_dev,
				 0, /* No offset on data devs */
				 rs->md.dev_sectors,
				 data);

	return ret;
}

static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct raid_set *rs = ti->private;
	unsigned chunk_size = rs->md.chunk_sectors << 9;
	raid5_conf_t *conf = rs->md.private;

	blk_limits_io_min(limits, chunk_size);
	blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded));
}

static void raid_presuspend(struct dm_target *ti)
{
	struct raid_set *rs = ti->private;

	md_stop_writes(&rs->md);
}

static void raid_postsuspend(struct dm_target *ti)
{
	struct raid_set *rs = ti->private;

	mddev_suspend(&rs->md);
}

static void raid_resume(struct dm_target *ti)
{
	struct raid_set *rs = ti->private;

	bitmap_load(&rs->md);
	mddev_resume(&rs->md);
}

static struct target_type raid_target = {
	.name = "raid",
	.version = {1, 1, 0},
	.module = THIS_MODULE,
	.ctr = raid_ctr,
	.dtr = raid_dtr,
	.map = raid_map,
	.status = raid_status,
	.iterate_devices = raid_iterate_devices,
	.io_hints = raid_io_hints,
	.presuspend = raid_presuspend,
	.postsuspend = raid_postsuspend,
	.resume = raid_resume,
};

static int __init dm_raid_init(void)
{
	return dm_register_target(&raid_target);
}

static void __exit dm_raid_exit(void)
{
	dm_unregister_target(&raid_target);
}

module_init(dm_raid_init);
module_exit(dm_raid_exit);

MODULE_DESCRIPTION(DM_NAME " raid1/4/5/6 target");
MODULE_ALIAS("dm-raid4");
MODULE_ALIAS("dm-raid5");
MODULE_ALIAS("dm-raid6");
MODULE_AUTHOR("Neil Brown <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");