// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2016, 2017 Intel Corporation.
 * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
 */

/*
 * Functions to convert between a list of vdevs and an nvlist representing the
 * configuration.  Each entry in the list can be one of:
 *
 *	Device vdevs
 *		disk=(path=..., devid=...)
 *		file=(path=...)
 *
 *	Group vdevs
 *		raidz[1|2]=(...)
 *		mirror=(...)
 *
 *	Hot spares
 *
 * While the underlying implementation supports it, group vdevs cannot contain
 * other group vdevs.  All userland verification of devices is contained within
 * this file.  If successful, the nvlist returned can be passed directly to the
 * kernel; we've done as much verification as possible in userland.
 *
 * Hot spares are a special case, and passed down as an array of disk vdevs, at
 * the same level as the root of the vdev tree.
 *
 * The only function exported by this file is 'make_root_vdev'.
The 53 * function performs several passes: 54 * 55 * 1. Construct the vdev specification. Performs syntax validation and 56 * makes sure each device is valid. 57 * 2. Check for devices in use. Using libblkid to make sure that no 58 * devices are also in use. Some can be overridden using the 'force' 59 * flag, others cannot. 60 * 3. Check for replication errors if the 'force' flag is not specified. 61 * validates that the replication level is consistent across the 62 * entire pool. 63 * 4. Call libzfs to label any whole disks with an EFI label. 64 */ 65 66 #include <assert.h> 67 #include <ctype.h> 68 #include <errno.h> 69 #include <fcntl.h> 70 #include <libintl.h> 71 #include <libnvpair.h> 72 #include <libzutil.h> 73 #include <limits.h> 74 #include <sys/spa.h> 75 #include <stdio.h> 76 #include <string.h> 77 #include <unistd.h> 78 #include "zpool_util.h" 79 #include <sys/zfs_context.h> 80 #include <sys/stat.h> 81 82 /* 83 * For any given vdev specification, we can have multiple errors. The 84 * vdev_error() function keeps track of whether we have seen an error yet, and 85 * prints out a header if its the first error we've seen. 86 */ 87 boolean_t error_seen; 88 boolean_t is_force; 89 90 void 91 vdev_error(const char *fmt, ...) 92 { 93 va_list ap; 94 95 if (!error_seen) { 96 (void) fprintf(stderr, gettext("invalid vdev specification\n")); 97 if (!is_force) 98 (void) fprintf(stderr, gettext("use '-f' to override " 99 "the following errors:\n")); 100 else 101 (void) fprintf(stderr, gettext("the following errors " 102 "must be manually repaired:\n")); 103 error_seen = B_TRUE; 104 } 105 106 va_start(ap, fmt); 107 (void) vfprintf(stderr, fmt, ap); 108 va_end(ap); 109 } 110 111 /* 112 * Check that a file is valid. All we can do in this case is check that it's 113 * not in use by another pool, and not in use by swap. 
114 */ 115 int 116 check_file_generic(const char *file, boolean_t force, boolean_t isspare) 117 { 118 char *name; 119 int fd; 120 int ret = 0; 121 pool_state_t state; 122 boolean_t inuse; 123 124 if ((fd = open(file, O_RDONLY)) < 0) 125 return (0); 126 127 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) { 128 const char *desc; 129 130 switch (state) { 131 case POOL_STATE_ACTIVE: 132 desc = gettext("active"); 133 break; 134 135 case POOL_STATE_EXPORTED: 136 desc = gettext("exported"); 137 break; 138 139 case POOL_STATE_POTENTIALLY_ACTIVE: 140 desc = gettext("potentially active"); 141 break; 142 143 default: 144 desc = gettext("unknown"); 145 break; 146 } 147 148 /* 149 * Allow hot spares to be shared between pools. 150 */ 151 if (state == POOL_STATE_SPARE && isspare) { 152 free(name); 153 (void) close(fd); 154 return (0); 155 } 156 157 if (state == POOL_STATE_ACTIVE || 158 state == POOL_STATE_SPARE || !force) { 159 switch (state) { 160 case POOL_STATE_SPARE: 161 vdev_error(gettext("%s is reserved as a hot " 162 "spare for pool %s\n"), file, name); 163 break; 164 default: 165 vdev_error(gettext("%s is part of %s pool " 166 "'%s'\n"), file, desc, name); 167 break; 168 } 169 ret = -1; 170 } 171 172 free(name); 173 } 174 175 (void) close(fd); 176 return (ret); 177 } 178 179 /* 180 * This may be a shorthand device path or it could be total gibberish. 181 * Check to see if it is a known device available in zfs_vdev_paths. 182 * As part of this check, see if we've been given an entire disk 183 * (minus the slice number). 
184 */ 185 static int 186 is_shorthand_path(const char *arg, char *path, size_t path_size, 187 struct stat64 *statbuf, boolean_t *wholedisk) 188 { 189 int error; 190 191 error = zfs_resolve_shortname(arg, path, path_size); 192 if (error == 0) { 193 *wholedisk = zfs_dev_is_whole_disk(path); 194 if (*wholedisk || (stat64(path, statbuf) == 0)) 195 return (0); 196 } 197 198 strlcpy(path, arg, path_size); 199 memset(statbuf, 0, sizeof (*statbuf)); 200 *wholedisk = B_FALSE; 201 202 return (error); 203 } 204 205 /* 206 * Determine if the given path is a hot spare within the given configuration. 207 * If no configuration is given we rely solely on the label. 208 */ 209 static boolean_t 210 is_spare(nvlist_t *config, const char *path) 211 { 212 int fd; 213 pool_state_t state; 214 char *name = NULL; 215 nvlist_t *label; 216 uint64_t guid, spareguid; 217 nvlist_t *nvroot; 218 nvlist_t **spares; 219 uint_t i, nspares; 220 boolean_t inuse; 221 222 if (zpool_is_draid_spare(path)) 223 return (B_TRUE); 224 225 if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0) 226 return (B_FALSE); 227 228 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 || 229 !inuse || 230 state != POOL_STATE_SPARE || 231 zpool_read_label(fd, &label, NULL) != 0) { 232 free(name); 233 (void) close(fd); 234 return (B_FALSE); 235 } 236 free(name); 237 (void) close(fd); 238 239 if (config == NULL) { 240 nvlist_free(label); 241 return (B_TRUE); 242 } 243 244 verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0); 245 nvlist_free(label); 246 247 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 248 &nvroot) == 0); 249 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 250 &spares, &nspares) == 0) { 251 for (i = 0; i < nspares; i++) { 252 verify(nvlist_lookup_uint64(spares[i], 253 ZPOOL_CONFIG_GUID, &spareguid) == 0); 254 if (spareguid == guid) 255 return (B_TRUE); 256 } 257 } 258 259 return (B_FALSE); 260 } 261 262 /* 263 * Create a leaf vdev. Determine if this is a file or a device. 
If it's a 264 * device, fill in the device id to make a complete nvlist. Valid forms for a 265 * leaf vdev are: 266 * 267 * /dev/xxx Complete disk path 268 * /xxx Full path to file 269 * xxx Shorthand for <zfs_vdev_paths>/xxx 270 * draid* Virtual dRAID spare 271 */ 272 static nvlist_t * 273 make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary) 274 { 275 char path[MAXPATHLEN]; 276 struct stat64 statbuf; 277 nvlist_t *vdev = NULL; 278 const char *type = NULL; 279 boolean_t wholedisk = B_FALSE; 280 uint64_t ashift = 0; 281 int err; 282 283 /* 284 * Determine what type of vdev this is, and put the full path into 285 * 'path'. We detect whether this is a device of file afterwards by 286 * checking the st_mode of the file. 287 */ 288 if (arg[0] == '/') { 289 /* 290 * Complete device or file path. Exact type is determined by 291 * examining the file descriptor afterwards. Symbolic links 292 * are resolved to their real paths to determine whole disk 293 * and S_ISBLK/S_ISREG type checks. However, we are careful 294 * to store the given path as ZPOOL_CONFIG_PATH to ensure we 295 * can leverage udev's persistent device labels. 
296 */ 297 if (realpath(arg, path) == NULL) { 298 (void) fprintf(stderr, 299 gettext("cannot resolve path '%s'\n"), arg); 300 return (NULL); 301 } 302 303 wholedisk = zfs_dev_is_whole_disk(path); 304 if (!wholedisk && (stat64(path, &statbuf) != 0)) { 305 (void) fprintf(stderr, 306 gettext("cannot open '%s': %s\n"), 307 path, strerror(errno)); 308 return (NULL); 309 } 310 311 /* After whole disk check restore original passed path */ 312 strlcpy(path, arg, sizeof (path)); 313 } else if (zpool_is_draid_spare(arg)) { 314 if (!is_primary) { 315 (void) fprintf(stderr, 316 gettext("cannot open '%s': dRAID spares can only " 317 "be used to replace primary vdevs\n"), arg); 318 return (NULL); 319 } 320 321 wholedisk = B_TRUE; 322 strlcpy(path, arg, sizeof (path)); 323 type = VDEV_TYPE_DRAID_SPARE; 324 } else { 325 err = is_shorthand_path(arg, path, sizeof (path), 326 &statbuf, &wholedisk); 327 if (err != 0) { 328 /* 329 * If we got ENOENT, then the user gave us 330 * gibberish, so try to direct them with a 331 * reasonable error message. Otherwise, 332 * regurgitate strerror() since it's the best we 333 * can do. 334 */ 335 if (err == ENOENT) { 336 (void) fprintf(stderr, 337 gettext("cannot open '%s': no such " 338 "device in %s\n"), arg, DISK_ROOT); 339 (void) fprintf(stderr, 340 gettext("must be a full path or " 341 "shorthand device name\n")); 342 return (NULL); 343 } else { 344 (void) fprintf(stderr, 345 gettext("cannot open '%s': %s\n"), 346 path, strerror(errno)); 347 return (NULL); 348 } 349 } 350 } 351 352 if (type == NULL) { 353 /* 354 * Determine whether this is a device or a file. 
355 */ 356 if (wholedisk || S_ISBLK(statbuf.st_mode)) { 357 type = VDEV_TYPE_DISK; 358 } else if (S_ISREG(statbuf.st_mode)) { 359 type = VDEV_TYPE_FILE; 360 } else { 361 fprintf(stderr, gettext("cannot use '%s': must " 362 "be a block device or regular file\n"), path); 363 return (NULL); 364 } 365 } 366 367 /* 368 * Finally, we have the complete device or file, and we know that it is 369 * acceptable to use. Construct the nvlist to describe this vdev. All 370 * vdevs have a 'path' element, and devices also have a 'devid' element. 371 */ 372 verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); 373 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); 374 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); 375 376 /* Lookup and add the enclosure sysfs path (if exists) */ 377 update_vdev_config_dev_sysfs_path(vdev, path, 378 ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); 379 380 if (strcmp(type, VDEV_TYPE_DISK) == 0) 381 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, 382 (uint64_t)wholedisk) == 0); 383 384 /* 385 * Override defaults if custom properties are provided. 386 */ 387 if (props != NULL) { 388 const char *value = NULL; 389 390 if (nvlist_lookup_string(props, 391 zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) { 392 if (zfs_nicestrtonum(NULL, value, &ashift) != 0) { 393 (void) fprintf(stderr, 394 gettext("ashift must be a number.\n")); 395 return (NULL); 396 } 397 if (ashift != 0 && 398 (ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) { 399 (void) fprintf(stderr, 400 gettext("invalid 'ashift=%" PRIu64 "' " 401 "property: only values between %" PRId32 " " 402 "and %" PRId32 " are allowed.\n"), 403 ashift, ASHIFT_MIN, ASHIFT_MAX); 404 return (NULL); 405 } 406 } 407 } 408 409 /* 410 * If the device is known to incorrectly report its physical sector 411 * size explicitly provide the known correct value. 
412 */ 413 if (ashift == 0) { 414 int sector_size; 415 416 if (check_sector_size_database(path, §or_size) == B_TRUE) 417 ashift = highbit64(sector_size) - 1; 418 } 419 420 if (ashift > 0) 421 (void) nvlist_add_uint64(vdev, ZPOOL_CONFIG_ASHIFT, ashift); 422 423 return (vdev); 424 } 425 426 /* 427 * Go through and verify the replication level of the pool is consistent. 428 * Performs the following checks: 429 * 430 * For the new spec, verifies that devices in mirrors and raidz are the 431 * same size. 432 * 433 * If the current configuration already has inconsistent replication 434 * levels, ignore any other potential problems in the new spec. 435 * 436 * Otherwise, make sure that the current spec (if there is one) and the new 437 * spec have consistent replication levels. 438 * 439 * If there is no current spec (create), make sure new spec has at least 440 * one general purpose vdev. 441 */ 442 typedef struct replication_level { 443 const char *zprl_type; 444 uint64_t zprl_children; 445 uint64_t zprl_parity; 446 } replication_level_t; 447 448 #define ZPOOL_FUZZ (16 * 1024 * 1024) 449 450 /* 451 * N.B. For the purposes of comparing replication levels dRAID can be 452 * considered functionally equivalent to raidz. 453 */ 454 static boolean_t 455 is_raidz_mirror(replication_level_t *a, replication_level_t *b, 456 replication_level_t **raidz, replication_level_t **mirror) 457 { 458 if ((strcmp(a->zprl_type, "raidz") == 0 || 459 strcmp(a->zprl_type, "draid") == 0) && 460 strcmp(b->zprl_type, "mirror") == 0) { 461 *raidz = a; 462 *mirror = b; 463 return (B_TRUE); 464 } 465 return (B_FALSE); 466 } 467 468 /* 469 * Comparison for determining if dRAID and raidz where passed in either order. 
 *
 * Returns B_TRUE when both replication levels are of type raidz or draid,
 * in any combination.
 */
static boolean_t
is_raidz_draid(replication_level_t *a, replication_level_t *b)
{
	if ((strcmp(a->zprl_type, "raidz") == 0 ||
	    strcmp(a->zprl_type, "draid") == 0) &&
	    (strcmp(b->zprl_type, "raidz") == 0 ||
	    strcmp(b->zprl_type, "draid") == 0)) {
		return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * Given a list of toplevel vdevs, return the current replication level.  If
 * the config is inconsistent, then NULL is returned.  If 'fatal' is set, then
 * an error message will be displayed for each self-inconsistent vdev.
 *
 * When 'fatal' is clear the function returns NULL at the first inconsistency
 * it finds; when 'fatal' is set it keeps scanning so that every problem is
 * reported through vdev_error(), and returns NULL at the end.
 *
 * The returned structure is allocated with safe_malloc() and must be freed
 * by the caller.
 */
static replication_level_t *
get_replication(nvlist_t *nvroot, boolean_t fatal)
{
	nvlist_t **top;
	uint_t t, toplevels;
	nvlist_t **child;
	uint_t c, children;
	nvlist_t *nv;
	const char *type;
	replication_level_t lastrep = {0};
	/*
	 * NOTE(review): 'rep' is only assigned inside the loop; if every
	 * toplevel vdev is skipped it is copied into '*ret' uninitialized.
	 * Confirm callers never pass such a tree.
	 */
	replication_level_t rep;
	replication_level_t *ret;
	replication_level_t *raidz, *mirror;
	boolean_t dontreport;

	ret = safe_malloc(sizeof (replication_level_t));

	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    &top, &toplevels) == 0);

	for (t = 0; t < toplevels; t++) {
		uint64_t is_log = B_FALSE;

		nv = top[t];

		/*
		 * For separate logs we ignore the top level vdev replication
		 * constraints.
		 */
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log);
		if (is_log)
			continue;

		/*
		 * Ignore holes introduced by removing aux devices, along
		 * with indirect vdevs introduced by previously removed
		 * vdevs.
		 */
		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
		if (strcmp(type, VDEV_TYPE_HOLE) == 0 ||
		    strcmp(type, VDEV_TYPE_INDIRECT) == 0)
			continue;

		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
		    &child, &children) != 0) {
			/*
			 * This is a 'file' or 'disk' vdev.
			 */
			rep.zprl_type = type;
			rep.zprl_children = 1;
			rep.zprl_parity = 0;
		} else {
			int64_t vdev_size;

			/*
			 * This is a mirror or RAID-Z vdev.  Go through and make
			 * sure the contents are all the same (files vs. disks),
			 * keeping track of the number of elements in the
			 * process.
			 *
			 * We also check that the size of each vdev (if it can
			 * be determined) is the same.
			 */
			rep.zprl_type = type;
			rep.zprl_children = 0;

			if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 ||
			    strcmp(type, VDEV_TYPE_DRAID) == 0) {
				verify(nvlist_lookup_uint64(nv,
				    ZPOOL_CONFIG_NPARITY,
				    &rep.zprl_parity) == 0);
				assert(rep.zprl_parity != 0);
			} else {
				rep.zprl_parity = 0;
			}

			/*
			 * The 'dontreport' variable indicates that we've
			 * already reported an error for this spec, so don't
			 * bother doing it again.
			 *
			 * From here on 'type' is reused to remember the type
			 * of the previously examined child, and 'vdev_size'
			 * its size (-1 until one has been measured).
			 */
			type = NULL;
			dontreport = 0;
			vdev_size = -1LL;
			for (c = 0; c < children; c++) {
				nvlist_t *cnv = child[c];
				const char *path;
				struct stat64 statbuf;
				const char *childtype;
				int fd, err;

				rep.zprl_children++;

				verify(nvlist_lookup_string(cnv,
				    ZPOOL_CONFIG_TYPE, &childtype) == 0);

				/*
				 * If this is a replacing or spare vdev, then
				 * get the real first child of the vdev: do this
				 * in a loop because replacing and spare vdevs
				 * can be nested.
				 */
				while (strcmp(childtype,
				    VDEV_TYPE_REPLACING) == 0 ||
				    strcmp(childtype, VDEV_TYPE_SPARE) == 0) {
					nvlist_t **rchild;
					uint_t rchildren;

					verify(nvlist_lookup_nvlist_array(cnv,
					    ZPOOL_CONFIG_CHILDREN, &rchild,
					    &rchildren) == 0);
					assert(rchildren == 2);
					cnv = rchild[0];

					verify(nvlist_lookup_string(cnv,
					    ZPOOL_CONFIG_TYPE,
					    &childtype) == 0);
				}

				verify(nvlist_lookup_string(cnv,
				    ZPOOL_CONFIG_PATH, &path) == 0);

				/*
				 * Skip active spares they should never cause
				 * the pool to be evaluated as inconsistent.
				 * The child has already been counted above.
				 */
				if (is_spare(NULL, path))
					continue;

				/*
				 * If we have a raidz/mirror that combines disks
				 * with files, only report it as an error when
				 * fatal is set to ensure all the replication
				 * checks aren't skipped in check_replication().
				 */
				if (fatal && !dontreport && type != NULL &&
				    strcmp(type, childtype) != 0) {
					if (ret != NULL)
						free(ret);
					ret = NULL;
					vdev_error(gettext(
					    "mismatched replication "
					    "level: %s contains both "
					    "files and devices\n"),
					    rep.zprl_type);
					dontreport = B_TRUE;
				}

				/*
				 * According to stat(2), the value of 'st_size'
				 * is undefined for block devices and character
				 * devices.  But there is no effective way to
				 * determine the real size in userland.
				 *
				 * Instead, we'll take advantage of an
				 * implementation detail of spec_size().  If the
				 * device is currently open, then we (should)
				 * return a valid size.
				 *
				 * If we still don't get a valid size (indicated
				 * by a size of 0 or MAXOFFSET_T), then ignore
				 * this device altogether.
				 */
				if ((fd = open(path, O_RDONLY)) >= 0) {
					err = fstat64_blk(fd, &statbuf);
					(void) close(fd);
				} else {
					err = stat64(path, &statbuf);
				}

				if (err != 0 ||
				    statbuf.st_size == 0 ||
				    statbuf.st_size == MAXOFFSET_T)
					continue;

				int64_t size = statbuf.st_size;

				/*
				 * Also make sure that devices and
				 * slices have a consistent size.  If
				 * they differ by a significant amount
				 * (~16MB) then report an error.
				 */
				if (!dontreport &&
				    (vdev_size != -1LL &&
				    (llabs(size - vdev_size) >
				    ZPOOL_FUZZ))) {
					if (ret != NULL)
						free(ret);
					ret = NULL;
					if (fatal)
						vdev_error(gettext(
						    "%s contains devices of "
						    "different sizes\n"),
						    rep.zprl_type);
					else
						return (NULL);
					dontreport = B_TRUE;
				}

				type = childtype;
				vdev_size = size;
			}
		}

		/*
		 * At this point, we have the replication of the last toplevel
		 * vdev in 'rep'.  Compare it to 'lastrep' to see if it is
		 * different.
		 */
		if (lastrep.zprl_type != NULL) {
			if (is_raidz_mirror(&lastrep, &rep, &raidz, &mirror) ||
			    is_raidz_mirror(&rep, &lastrep, &raidz, &mirror)) {
				/*
				 * Accepted raidz and mirror when they can
				 * handle the same number of disk failures.
				 */
				if (raidz->zprl_parity !=
				    mirror->zprl_children - 1) {
					if (ret != NULL)
						free(ret);
					ret = NULL;
					if (fatal)
						vdev_error(gettext(
						    "mismatched replication "
						    "level: "
						    "%s and %s vdevs with "
						    "different redundancy, "
						    "%llu vs. %llu (%llu-way) "
						    "are present\n"),
						    raidz->zprl_type,
						    mirror->zprl_type,
						    (u_longlong_t)
						    raidz->zprl_parity,
						    (u_longlong_t)
						    mirror->zprl_children - 1,
						    (u_longlong_t)
						    mirror->zprl_children);
					else
						return (NULL);
				}
			} else if (is_raidz_draid(&lastrep, &rep)) {
				/*
				 * Accepted raidz and draid when they can
				 * handle the same number of disk failures.
				 */
				if (lastrep.zprl_parity != rep.zprl_parity) {
					if (ret != NULL)
						free(ret);
					ret = NULL;
					if (fatal)
						vdev_error(gettext(
						    "mismatched replication "
						    "level: %s and %s vdevs "
						    "with different "
						    "redundancy, %llu vs. "
						    "%llu are present\n"),
						    lastrep.zprl_type,
						    rep.zprl_type,
						    (u_longlong_t)
						    lastrep.zprl_parity,
						    (u_longlong_t)
						    rep.zprl_parity);
					else
						return (NULL);
				}
			} else if (strcmp(lastrep.zprl_type, rep.zprl_type) !=
			    0) {
				/* Different vdev types altogether. */
				if (ret != NULL)
					free(ret);
				ret = NULL;
				if (fatal)
					vdev_error(gettext(
					    "mismatched replication level: "
					    "both %s and %s vdevs are "
					    "present\n"),
					    lastrep.zprl_type, rep.zprl_type);
				else
					return (NULL);
			} else if (lastrep.zprl_parity != rep.zprl_parity) {
				/* Same type, different parity. */
				if (ret)
					free(ret);
				ret = NULL;
				if (fatal)
					vdev_error(gettext(
					    "mismatched replication level: "
					    "both %llu and %llu device parity "
					    "%s vdevs are present\n"),
					    (u_longlong_t)
					    lastrep.zprl_parity,
					    (u_longlong_t)rep.zprl_parity,
					    rep.zprl_type);
				else
					return (NULL);
			} else if (lastrep.zprl_children != rep.zprl_children) {
				/* Same type and parity, different width. */
				if (ret)
					free(ret);
				ret = NULL;
				if (fatal)
					vdev_error(gettext(
					    "mismatched replication level: "
					    "both %llu-way and %llu-way %s "
					    "vdevs are present\n"),
					    (u_longlong_t)
					    lastrep.zprl_children,
					    (u_longlong_t)
					    rep.zprl_children,
					    rep.zprl_type);
				else
					return (NULL);
			}
		}
		lastrep = rep;
	}

	/* 'ret' is NULL here if any inconsistency was reported above. */
	if (ret != NULL)
		*ret = rep;

	return (ret);
}

/*
 * Check the replication level of the vdev spec against the current pool.  Calls
 * get_replication() to make sure the new spec is self-consistent.  If the pool
 * has a consistent replication level, then we ignore any errors.  Otherwise,
 * report any difference between the two.
815 */ 816 static int 817 check_replication(nvlist_t *config, nvlist_t *newroot) 818 { 819 nvlist_t **child; 820 uint_t children; 821 replication_level_t *current = NULL, *new; 822 replication_level_t *raidz, *mirror; 823 int ret; 824 825 /* 826 * If we have a current pool configuration, check to see if it's 827 * self-consistent. If not, simply return success. 828 */ 829 if (config != NULL) { 830 nvlist_t *nvroot; 831 832 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 833 &nvroot) == 0); 834 if ((current = get_replication(nvroot, B_FALSE)) == NULL) 835 return (0); 836 } 837 /* 838 * for spares there may be no children, and therefore no 839 * replication level to check 840 */ 841 if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN, 842 &child, &children) != 0) || (children == 0)) { 843 free(current); 844 return (0); 845 } 846 847 /* 848 * If all we have is logs then there's no replication level to check. 849 */ 850 if (num_logs(newroot) == children) { 851 free(current); 852 return (0); 853 } 854 855 /* 856 * Get the replication level of the new vdev spec, reporting any 857 * inconsistencies found. 858 */ 859 if ((new = get_replication(newroot, B_TRUE)) == NULL) { 860 free(current); 861 return (-1); 862 } 863 864 /* 865 * Check to see if the new vdev spec matches the replication level of 866 * the current pool. 867 */ 868 ret = 0; 869 if (current != NULL) { 870 if (is_raidz_mirror(current, new, &raidz, &mirror) || 871 is_raidz_mirror(new, current, &raidz, &mirror)) { 872 if (raidz->zprl_parity != mirror->zprl_children - 1) { 873 vdev_error(gettext( 874 "mismatched replication level: pool and " 875 "new vdev with different redundancy, %s " 876 "and %s vdevs, %llu vs. 
%llu (%llu-way)\n"), 877 raidz->zprl_type, 878 mirror->zprl_type, 879 (u_longlong_t)raidz->zprl_parity, 880 (u_longlong_t)mirror->zprl_children - 1, 881 (u_longlong_t)mirror->zprl_children); 882 ret = -1; 883 } 884 } else if (is_raidz_draid(current, new)) { 885 if (current->zprl_parity != new->zprl_parity) { 886 vdev_error(gettext( 887 "mismatched replication level: pool and " 888 "new vdev with different redundancy, %s " 889 "and %s vdevs, %llu vs. %llu\n"), 890 current->zprl_type, 891 new->zprl_type, 892 (u_longlong_t)current->zprl_parity, 893 (u_longlong_t)new->zprl_parity); 894 ret = -1; 895 } 896 } else if (strcmp(current->zprl_type, new->zprl_type) != 0) { 897 vdev_error(gettext( 898 "mismatched replication level: pool uses %s " 899 "and new vdev is %s\n"), 900 current->zprl_type, new->zprl_type); 901 ret = -1; 902 } else if (current->zprl_parity != new->zprl_parity) { 903 vdev_error(gettext( 904 "mismatched replication level: pool uses %llu " 905 "device parity and new vdev uses %llu\n"), 906 (u_longlong_t)current->zprl_parity, 907 (u_longlong_t)new->zprl_parity); 908 ret = -1; 909 } else if (current->zprl_children != new->zprl_children) { 910 vdev_error(gettext( 911 "mismatched replication level: pool uses %llu-way " 912 "%s and new vdev uses %llu-way %s\n"), 913 (u_longlong_t)current->zprl_children, 914 current->zprl_type, 915 (u_longlong_t)new->zprl_children, 916 new->zprl_type); 917 ret = -1; 918 } 919 } 920 921 free(new); 922 if (current != NULL) 923 free(current); 924 925 return (ret); 926 } 927 928 static int 929 zero_label(const char *path) 930 { 931 const int size = 4096; 932 char buf[size]; 933 int err, fd; 934 935 if ((fd = open(path, O_WRONLY|O_EXCL)) < 0) { 936 (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), 937 path, strerror(errno)); 938 return (-1); 939 } 940 941 memset(buf, 0, size); 942 err = write(fd, buf, size); 943 (void) fdatasync(fd); 944 (void) close(fd); 945 946 if (err == -1) { 947 (void) fprintf(stderr, gettext("cannot 
zero first %d bytes " 948 "of '%s': %s\n"), size, path, strerror(errno)); 949 return (-1); 950 } 951 952 if (err != size) { 953 (void) fprintf(stderr, gettext("could only zero %d/%d bytes " 954 "of '%s'\n"), err, size, path); 955 return (-1); 956 } 957 958 return (0); 959 } 960 961 static void 962 lines_to_stderr(char *lines[], int lines_cnt) 963 { 964 int i; 965 for (i = 0; i < lines_cnt; i++) { 966 fprintf(stderr, "%s\n", lines[i]); 967 } 968 } 969 970 /* 971 * Go through and find any whole disks in the vdev specification, labelling them 972 * as appropriate. When constructing the vdev spec, we were unable to open this 973 * device in order to provide a devid. Now that we have labelled the disk and 974 * know that slice 0 is valid, we can construct the devid now. 975 * 976 * If the disk was already labeled with an EFI label, we will have gotten the 977 * devid already (because we were able to open the whole disk). Otherwise, we 978 * need to get the devid after we label the disk. 979 */ 980 static int 981 make_disks(zpool_handle_t *zhp, nvlist_t *nv, boolean_t replacing) 982 { 983 nvlist_t **child; 984 uint_t c, children; 985 const char *type, *path; 986 char devpath[MAXPATHLEN]; 987 char udevpath[MAXPATHLEN]; 988 uint64_t wholedisk; 989 struct stat64 statbuf; 990 int is_exclusive = 0; 991 int fd; 992 int ret; 993 994 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 995 996 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 997 &child, &children) != 0) { 998 999 if (strcmp(type, VDEV_TYPE_DISK) != 0) 1000 return (0); 1001 1002 /* 1003 * We have a disk device. If this is a whole disk write 1004 * out the efi partition table, otherwise write zero's to 1005 * the first 4k of the partition. This is to ensure that 1006 * libblkid will not misidentify the partition due to a 1007 * magic value left by the previous filesystem. 
1008 */ 1009 verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path)); 1010 verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 1011 &wholedisk)); 1012 1013 if (!wholedisk) { 1014 /* 1015 * Update device id string for mpath nodes (Linux only) 1016 */ 1017 if (is_mpath_whole_disk(path)) 1018 update_vdev_config_dev_strs(nv); 1019 1020 if (!is_spare(NULL, path)) 1021 (void) zero_label(path); 1022 return (0); 1023 } 1024 1025 if (realpath(path, devpath) == NULL) { 1026 ret = errno; 1027 (void) fprintf(stderr, 1028 gettext("cannot resolve path '%s'\n"), path); 1029 return (ret); 1030 } 1031 1032 /* 1033 * Remove any previously existing symlink from a udev path to 1034 * the device before labeling the disk. This ensures that 1035 * only newly created links are used. Otherwise there is a 1036 * window between when udev deletes and recreates the link 1037 * during which access attempts will fail with ENOENT. 1038 */ 1039 strlcpy(udevpath, path, MAXPATHLEN); 1040 (void) zfs_append_partition(udevpath, MAXPATHLEN); 1041 1042 fd = open(devpath, O_RDWR|O_EXCL); 1043 if (fd == -1) { 1044 if (errno == EBUSY) 1045 is_exclusive = 1; 1046 #ifdef __FreeBSD__ 1047 if (errno == EPERM) 1048 is_exclusive = 1; 1049 #endif 1050 } else { 1051 (void) close(fd); 1052 } 1053 1054 /* 1055 * If the partition exists, contains a valid spare label, 1056 * and is opened exclusively there is no need to partition 1057 * it. Hot spares have already been partitioned and are 1058 * held open exclusively by the kernel as a safety measure. 1059 * 1060 * If the provided path is for a /dev/disk/ device its 1061 * symbolic link will be removed, partition table created, 1062 * and then block until udev creates the new link. 
1063 */ 1064 if (!is_exclusive && !is_spare(NULL, udevpath)) { 1065 char *devnode = strrchr(devpath, '/') + 1; 1066 char **lines = NULL; 1067 int lines_cnt = 0; 1068 1069 ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT)); 1070 if (ret == 0) { 1071 ret = lstat64(udevpath, &statbuf); 1072 if (ret == 0 && S_ISLNK(statbuf.st_mode)) 1073 (void) unlink(udevpath); 1074 } 1075 1076 /* 1077 * When labeling a pool the raw device node name 1078 * is provided as it appears under /dev/. 1079 * 1080 * Note that 'zhp' will be NULL when we're creating a 1081 * pool. 1082 */ 1083 if (zpool_prepare_and_label_disk(g_zfs, zhp, devnode, 1084 nv, zhp == NULL ? "create" : 1085 replacing ? "replace" : "add", &lines, 1086 &lines_cnt) != 0) { 1087 (void) fprintf(stderr, 1088 gettext( 1089 "Error preparing/labeling disk.\n")); 1090 if (lines_cnt > 0) { 1091 (void) fprintf(stderr, 1092 gettext("zfs_prepare_disk output:\n")); 1093 lines_to_stderr(lines, lines_cnt); 1094 } 1095 1096 libzfs_free_str_array(lines, lines_cnt); 1097 return (-1); 1098 } 1099 libzfs_free_str_array(lines, lines_cnt); 1100 1101 /* 1102 * Wait for udev to signal the device is available 1103 * by the provided path. 1104 */ 1105 ret = zpool_label_disk_wait(udevpath, DISK_LABEL_WAIT); 1106 if (ret) { 1107 (void) fprintf(stderr, 1108 gettext("missing link: %s was " 1109 "partitioned but %s is missing\n"), 1110 devnode, udevpath); 1111 return (ret); 1112 } 1113 1114 ret = zero_label(udevpath); 1115 if (ret) 1116 return (ret); 1117 } 1118 1119 /* 1120 * Update the path to refer to the partition. The presence of 1121 * the 'whole_disk' field indicates to the CLI that we should 1122 * chop off the partition number when displaying the device in 1123 * future output. 
1124 */ 1125 verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, udevpath) == 0); 1126 1127 /* 1128 * Update device id strings for whole disks (Linux only) 1129 */ 1130 update_vdev_config_dev_strs(nv); 1131 1132 return (0); 1133 } 1134 1135 for (c = 0; c < children; c++) 1136 if ((ret = make_disks(zhp, child[c], replacing)) != 0) 1137 return (ret); 1138 1139 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, 1140 &child, &children) == 0) 1141 for (c = 0; c < children; c++) 1142 if ((ret = make_disks(zhp, child[c], replacing)) != 0) 1143 return (ret); 1144 1145 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, 1146 &child, &children) == 0) 1147 for (c = 0; c < children; c++) 1148 if ((ret = make_disks(zhp, child[c], replacing)) != 0) 1149 return (ret); 1150 1151 return (0); 1152 } 1153 1154 /* 1155 * Go through and find any devices that are in use. We rely on libdiskmgt for 1156 * the majority of this task. 1157 */ 1158 static boolean_t 1159 is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, 1160 boolean_t replacing, boolean_t isspare) 1161 { 1162 nvlist_t **child; 1163 uint_t c, children; 1164 const char *type, *path; 1165 int ret = 0; 1166 char buf[MAXPATHLEN]; 1167 uint64_t wholedisk = B_FALSE; 1168 boolean_t anyinuse = B_FALSE; 1169 1170 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 1171 1172 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1173 &child, &children) != 0) { 1174 1175 verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path)); 1176 if (strcmp(type, VDEV_TYPE_DISK) == 0) 1177 verify(!nvlist_lookup_uint64(nv, 1178 ZPOOL_CONFIG_WHOLE_DISK, &wholedisk)); 1179 1180 /* 1181 * As a generic check, we look to see if this is a replace of a 1182 * hot spare within the same pool. If so, we allow it 1183 * regardless of what libblkid or zpool_in_use() says. 
1184 */ 1185 if (replacing) { 1186 (void) strlcpy(buf, path, sizeof (buf)); 1187 if (wholedisk) { 1188 ret = zfs_append_partition(buf, sizeof (buf)); 1189 if (ret == -1) 1190 return (-1); 1191 } 1192 1193 if (is_spare(config, buf)) 1194 return (B_FALSE); 1195 } 1196 1197 if (strcmp(type, VDEV_TYPE_DISK) == 0) 1198 ret = check_device(path, force, isspare, wholedisk); 1199 1200 else if (strcmp(type, VDEV_TYPE_FILE) == 0) 1201 ret = check_file(path, force, isspare); 1202 1203 return (ret != 0); 1204 } 1205 1206 for (c = 0; c < children; c++) 1207 if (is_device_in_use(config, child[c], force, replacing, 1208 B_FALSE)) 1209 anyinuse = B_TRUE; 1210 1211 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, 1212 &child, &children) == 0) 1213 for (c = 0; c < children; c++) 1214 if (is_device_in_use(config, child[c], force, replacing, 1215 B_TRUE)) 1216 anyinuse = B_TRUE; 1217 1218 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, 1219 &child, &children) == 0) 1220 for (c = 0; c < children; c++) 1221 if (is_device_in_use(config, child[c], force, replacing, 1222 B_FALSE)) 1223 anyinuse = B_TRUE; 1224 1225 return (anyinuse); 1226 } 1227 1228 /* 1229 * Returns the parity level extracted from a raidz or draid type. 1230 * If the parity cannot be determined zero is returned. 
1231 */ 1232 static int 1233 get_parity(const char *type) 1234 { 1235 long parity = 0; 1236 const char *p; 1237 1238 if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0) { 1239 p = type + strlen(VDEV_TYPE_RAIDZ); 1240 1241 if (*p == '\0') { 1242 /* when unspecified default to single parity */ 1243 return (1); 1244 } else if (*p == '0') { 1245 /* no zero prefixes allowed */ 1246 return (0); 1247 } else { 1248 /* 0-3, no suffixes allowed */ 1249 char *end; 1250 errno = 0; 1251 parity = strtol(p, &end, 10); 1252 if (errno != 0 || *end != '\0' || 1253 parity < 1 || parity > VDEV_RAIDZ_MAXPARITY) { 1254 return (0); 1255 } 1256 } 1257 } else if (strncmp(type, VDEV_TYPE_DRAID, 1258 strlen(VDEV_TYPE_DRAID)) == 0) { 1259 p = type + strlen(VDEV_TYPE_DRAID); 1260 1261 if (*p == '\0' || *p == ':') { 1262 /* when unspecified default to single parity */ 1263 return (1); 1264 } else if (*p == '0') { 1265 /* no zero prefixes allowed */ 1266 return (0); 1267 } else { 1268 /* 0-3, allowed suffixes: '\0' or ':' */ 1269 char *end; 1270 errno = 0; 1271 parity = strtol(p, &end, 10); 1272 if (errno != 0 || 1273 parity < 1 || parity > VDEV_DRAID_MAXPARITY || 1274 (*end != '\0' && *end != ':')) { 1275 return (0); 1276 } 1277 } 1278 } 1279 1280 return ((int)parity); 1281 } 1282 1283 /* 1284 * Assign the minimum and maximum number of devices allowed for 1285 * the specified type. On error NULL is returned, otherwise the 1286 * type prefix is returned (raidz, mirror, etc). 
1287 */ 1288 static const char * 1289 is_grouping(const char *type, int *mindev, int *maxdev) 1290 { 1291 int nparity; 1292 1293 if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 || 1294 strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0) { 1295 nparity = get_parity(type); 1296 if (nparity == 0) 1297 return (NULL); 1298 if (mindev != NULL) 1299 *mindev = nparity + 1; 1300 if (maxdev != NULL) 1301 *maxdev = 255; 1302 1303 if (strncmp(type, VDEV_TYPE_RAIDZ, 1304 strlen(VDEV_TYPE_RAIDZ)) == 0) { 1305 return (VDEV_TYPE_RAIDZ); 1306 } else { 1307 return (VDEV_TYPE_DRAID); 1308 } 1309 } 1310 1311 if (maxdev != NULL) 1312 *maxdev = INT_MAX; 1313 1314 if (strcmp(type, "mirror") == 0) { 1315 if (mindev != NULL) 1316 *mindev = 2; 1317 return (VDEV_TYPE_MIRROR); 1318 } 1319 1320 if (strcmp(type, "spare") == 0) { 1321 if (mindev != NULL) 1322 *mindev = 1; 1323 return (VDEV_TYPE_SPARE); 1324 } 1325 1326 if (strcmp(type, "log") == 0) { 1327 if (mindev != NULL) 1328 *mindev = 1; 1329 return (VDEV_TYPE_LOG); 1330 } 1331 1332 if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0 || 1333 strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { 1334 if (mindev != NULL) 1335 *mindev = 1; 1336 return (type); 1337 } 1338 1339 if (strcmp(type, "cache") == 0) { 1340 if (mindev != NULL) 1341 *mindev = 1; 1342 return (VDEV_TYPE_L2CACHE); 1343 } 1344 1345 return (NULL); 1346 } 1347 1348 /* 1349 * Extract the configuration parameters encoded in the dRAID type and 1350 * use them to generate a dRAID configuration. The expected format is: 1351 * 1352 * draid[<parity>][:<data><d|D>][:<children><c|C>][:<spares><s|S>] 1353 * 1354 * The intent is to be able to generate a good configuration when no 1355 * additional information is provided. The only mandatory component 1356 * of the 'type' is the 'draid' prefix. If a value is not provided 1357 * then reasonable defaults are used. The optional components may 1358 * appear in any order but the d/s/c suffix is required. 
1359 * 1360 * Valid inputs: 1361 * - data: number of data devices per group (1-255) 1362 * - parity: number of parity blocks per group (1-3) 1363 * - spares: number of distributed spare (0-100) 1364 * - children: total number of devices (1-255) 1365 * 1366 * Examples: 1367 * - zpool create tank draid <devices...> 1368 * - zpool create tank draid2:8d:51c:2s <devices...> 1369 */ 1370 static int 1371 draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children) 1372 { 1373 uint64_t nparity; 1374 uint64_t nspares = 0; 1375 uint64_t ndata = UINT64_MAX; 1376 uint64_t ngroups = 1; 1377 long value; 1378 1379 if (strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) != 0) 1380 return (EINVAL); 1381 1382 nparity = (uint64_t)get_parity(type); 1383 if (nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) { 1384 fprintf(stderr, 1385 gettext("invalid dRAID parity level %llu; must be " 1386 "between 1 and %d\n"), (u_longlong_t)nparity, 1387 VDEV_DRAID_MAXPARITY); 1388 return (EINVAL); 1389 } 1390 1391 char *p = (char *)type; 1392 while ((p = strchr(p, ':')) != NULL) { 1393 char *end; 1394 1395 p = p + 1; 1396 errno = 0; 1397 1398 if (!isdigit(p[0])) { 1399 (void) fprintf(stderr, gettext("invalid dRAID " 1400 "syntax; expected [:<number><c|d|s>] not '%s'\n"), 1401 type); 1402 return (EINVAL); 1403 } 1404 1405 /* Expected non-zero value with c/d/s suffix */ 1406 value = strtol(p, &end, 10); 1407 char suffix = tolower(*end); 1408 if (errno != 0 || 1409 (suffix != 'c' && suffix != 'd' && suffix != 's')) { 1410 (void) fprintf(stderr, gettext("invalid dRAID " 1411 "syntax; expected [:<number><c|d|s>] not '%s'\n"), 1412 type); 1413 return (EINVAL); 1414 } 1415 1416 if (suffix == 'c') { 1417 if ((uint64_t)value != children) { 1418 fprintf(stderr, 1419 gettext("invalid number of dRAID children; " 1420 "%llu required but %llu provided\n"), 1421 (u_longlong_t)value, 1422 (u_longlong_t)children); 1423 return (EINVAL); 1424 } 1425 } else if (suffix == 'd') { 1426 ndata = 
(uint64_t)value; 1427 } else if (suffix == 's') { 1428 nspares = (uint64_t)value; 1429 } else { 1430 verify(0); /* Unreachable */ 1431 } 1432 } 1433 1434 /* 1435 * When a specific number of data disks is not provided limit a 1436 * redundancy group to 8 data disks. This value was selected to 1437 * provide a reasonable tradeoff between capacity and performance. 1438 */ 1439 if (ndata == UINT64_MAX) { 1440 if (children > nspares + nparity) { 1441 ndata = MIN(children - nspares - nparity, 8); 1442 } else { 1443 fprintf(stderr, gettext("request number of " 1444 "distributed spares %llu and parity level %llu\n" 1445 "leaves no disks available for data\n"), 1446 (u_longlong_t)nspares, (u_longlong_t)nparity); 1447 return (EINVAL); 1448 } 1449 } 1450 1451 /* Verify the maximum allowed group size is never exceeded. */ 1452 if (ndata == 0 || (ndata + nparity > children - nspares)) { 1453 fprintf(stderr, gettext("requested number of dRAID data " 1454 "disks per group %llu is too high,\nat most %llu disks " 1455 "are available for data\n"), (u_longlong_t)ndata, 1456 (u_longlong_t)(children - nspares - nparity)); 1457 return (EINVAL); 1458 } 1459 1460 /* 1461 * Verify the requested number of spares can be satisfied. 1462 * An arbitrary limit of 100 distributed spares is applied. 1463 */ 1464 if (nspares > 100 || nspares > (children - (ndata + nparity))) { 1465 fprintf(stderr, 1466 gettext("invalid number of dRAID spares %llu; additional " 1467 "disks would be required\n"), (u_longlong_t)nspares); 1468 return (EINVAL); 1469 } 1470 1471 /* Verify the requested number children is sufficient. 
*/ 1472 if (children < (ndata + nparity + nspares)) { 1473 fprintf(stderr, gettext("%llu disks were provided, but at " 1474 "least %llu disks are required for this config\n"), 1475 (u_longlong_t)children, 1476 (u_longlong_t)(ndata + nparity + nspares)); 1477 } 1478 1479 if (children > VDEV_DRAID_MAX_CHILDREN) { 1480 fprintf(stderr, gettext("%llu disks were provided, but " 1481 "dRAID only supports up to %u disks"), 1482 (u_longlong_t)children, VDEV_DRAID_MAX_CHILDREN); 1483 } 1484 1485 /* 1486 * Calculate the minimum number of groups required to fill a slice. 1487 * This is the LCM of the stripe width (ndata + nparity) and the 1488 * number of data drives (children - nspares). 1489 */ 1490 while (ngroups * (ndata + nparity) % (children - nspares) != 0) 1491 ngroups++; 1492 1493 /* Store the basic dRAID configuration. */ 1494 fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity); 1495 fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, ndata); 1496 fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, nspares); 1497 fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); 1498 1499 return (0); 1500 } 1501 1502 /* 1503 * Construct a syntactically valid vdev specification, 1504 * and ensure that all devices and files exist and can be opened. 1505 * Note: we don't bother freeing anything in the error paths 1506 * because the program is just going to exit anyway. 
1507 */ 1508 static nvlist_t * 1509 construct_spec(nvlist_t *props, int argc, char **argv) 1510 { 1511 nvlist_t *nvroot, *nv, **top, **spares, **l2cache; 1512 int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache; 1513 const char *type, *fulltype; 1514 boolean_t is_log, is_special, is_dedup, is_spare; 1515 boolean_t seen_logs; 1516 1517 top = NULL; 1518 toplevels = 0; 1519 spares = NULL; 1520 l2cache = NULL; 1521 nspares = 0; 1522 nlogs = 0; 1523 nl2cache = 0; 1524 is_log = is_special = is_dedup = is_spare = B_FALSE; 1525 seen_logs = B_FALSE; 1526 nvroot = NULL; 1527 1528 while (argc > 0) { 1529 fulltype = argv[0]; 1530 nv = NULL; 1531 1532 /* 1533 * If it's a mirror, raidz, or draid the subsequent arguments 1534 * are its leaves -- until we encounter the next mirror, 1535 * raidz or draid. 1536 */ 1537 if ((type = is_grouping(fulltype, &mindev, &maxdev)) != NULL) { 1538 nvlist_t **child = NULL; 1539 int c, children = 0; 1540 1541 if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1542 if (spares != NULL) { 1543 (void) fprintf(stderr, 1544 gettext("invalid vdev " 1545 "specification: 'spare' can be " 1546 "specified only once\n")); 1547 goto spec_out; 1548 } 1549 is_spare = B_TRUE; 1550 is_log = is_special = is_dedup = B_FALSE; 1551 } 1552 1553 if (strcmp(type, VDEV_TYPE_LOG) == 0) { 1554 if (seen_logs) { 1555 (void) fprintf(stderr, 1556 gettext("invalid vdev " 1557 "specification: 'log' can be " 1558 "specified only once\n")); 1559 goto spec_out; 1560 } 1561 seen_logs = B_TRUE; 1562 is_log = B_TRUE; 1563 is_special = is_dedup = is_spare = B_FALSE; 1564 argc--; 1565 argv++; 1566 /* 1567 * A log is not a real grouping device. 1568 * We just set is_log and continue. 
1569 */ 1570 continue; 1571 } 1572 1573 if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) { 1574 is_special = B_TRUE; 1575 is_log = is_dedup = is_spare = B_FALSE; 1576 argc--; 1577 argv++; 1578 continue; 1579 } 1580 1581 if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { 1582 is_dedup = B_TRUE; 1583 is_log = is_special = is_spare = B_FALSE; 1584 argc--; 1585 argv++; 1586 continue; 1587 } 1588 1589 if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { 1590 if (l2cache != NULL) { 1591 (void) fprintf(stderr, 1592 gettext("invalid vdev " 1593 "specification: 'cache' can be " 1594 "specified only once\n")); 1595 goto spec_out; 1596 } 1597 is_log = is_special = B_FALSE; 1598 is_dedup = is_spare = B_FALSE; 1599 } 1600 1601 if (is_log) { 1602 if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { 1603 (void) fprintf(stderr, 1604 gettext("invalid vdev " 1605 "specification: unsupported 'log' " 1606 "device: %s\n"), type); 1607 goto spec_out; 1608 } 1609 nlogs++; 1610 } 1611 1612 for (c = 1; c < argc; c++) { 1613 if (is_grouping(argv[c], NULL, NULL) != NULL) 1614 break; 1615 1616 children++; 1617 child = realloc(child, 1618 children * sizeof (nvlist_t *)); 1619 if (child == NULL) 1620 zpool_no_memory(); 1621 if ((nv = make_leaf_vdev(props, argv[c], 1622 !(is_log || is_special || is_dedup || 1623 is_spare))) == NULL) { 1624 for (c = 0; c < children - 1; c++) 1625 nvlist_free(child[c]); 1626 free(child); 1627 goto spec_out; 1628 } 1629 1630 child[children - 1] = nv; 1631 } 1632 1633 if (children < mindev) { 1634 (void) fprintf(stderr, gettext("invalid vdev " 1635 "specification: %s requires at least %d " 1636 "devices\n"), argv[0], mindev); 1637 for (c = 0; c < children; c++) 1638 nvlist_free(child[c]); 1639 free(child); 1640 goto spec_out; 1641 } 1642 1643 if (children > maxdev) { 1644 (void) fprintf(stderr, gettext("invalid vdev " 1645 "specification: %s supports no more than " 1646 "%d devices\n"), argv[0], maxdev); 1647 for (c = 0; c < children; c++) 1648 nvlist_free(child[c]); 1649 free(child); 
1650 goto spec_out; 1651 } 1652 1653 argc -= c; 1654 argv += c; 1655 1656 if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1657 spares = child; 1658 nspares = children; 1659 continue; 1660 } else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { 1661 l2cache = child; 1662 nl2cache = children; 1663 continue; 1664 } else { 1665 /* create a top-level vdev with children */ 1666 verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, 1667 0) == 0); 1668 verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, 1669 type) == 0); 1670 verify(nvlist_add_uint64(nv, 1671 ZPOOL_CONFIG_IS_LOG, is_log) == 0); 1672 if (is_log) { 1673 verify(nvlist_add_string(nv, 1674 ZPOOL_CONFIG_ALLOCATION_BIAS, 1675 VDEV_ALLOC_BIAS_LOG) == 0); 1676 } 1677 if (is_special) { 1678 verify(nvlist_add_string(nv, 1679 ZPOOL_CONFIG_ALLOCATION_BIAS, 1680 VDEV_ALLOC_BIAS_SPECIAL) == 0); 1681 } 1682 if (is_dedup) { 1683 verify(nvlist_add_string(nv, 1684 ZPOOL_CONFIG_ALLOCATION_BIAS, 1685 VDEV_ALLOC_BIAS_DEDUP) == 0); 1686 } 1687 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 1688 verify(nvlist_add_uint64(nv, 1689 ZPOOL_CONFIG_NPARITY, 1690 mindev - 1) == 0); 1691 } 1692 if (strcmp(type, VDEV_TYPE_DRAID) == 0) { 1693 if (draid_config_by_type(nv, 1694 fulltype, children) != 0) { 1695 for (c = 0; c < children; c++) 1696 nvlist_free(child[c]); 1697 free(child); 1698 goto spec_out; 1699 } 1700 } 1701 verify(nvlist_add_nvlist_array(nv, 1702 ZPOOL_CONFIG_CHILDREN, 1703 (const nvlist_t **)child, children) == 0); 1704 1705 for (c = 0; c < children; c++) 1706 nvlist_free(child[c]); 1707 free(child); 1708 } 1709 } else { 1710 /* 1711 * We have a device. Pass off to make_leaf_vdev() to 1712 * construct the appropriate nvlist describing the vdev. 
1713 */ 1714 if ((nv = make_leaf_vdev(props, argv[0], !(is_log || 1715 is_special || is_dedup || is_spare))) == NULL) 1716 goto spec_out; 1717 1718 verify(nvlist_add_uint64(nv, 1719 ZPOOL_CONFIG_IS_LOG, is_log) == 0); 1720 if (is_log) { 1721 verify(nvlist_add_string(nv, 1722 ZPOOL_CONFIG_ALLOCATION_BIAS, 1723 VDEV_ALLOC_BIAS_LOG) == 0); 1724 nlogs++; 1725 } 1726 1727 if (is_special) { 1728 verify(nvlist_add_string(nv, 1729 ZPOOL_CONFIG_ALLOCATION_BIAS, 1730 VDEV_ALLOC_BIAS_SPECIAL) == 0); 1731 } 1732 if (is_dedup) { 1733 verify(nvlist_add_string(nv, 1734 ZPOOL_CONFIG_ALLOCATION_BIAS, 1735 VDEV_ALLOC_BIAS_DEDUP) == 0); 1736 } 1737 argc--; 1738 argv++; 1739 } 1740 1741 toplevels++; 1742 top = realloc(top, toplevels * sizeof (nvlist_t *)); 1743 if (top == NULL) 1744 zpool_no_memory(); 1745 top[toplevels - 1] = nv; 1746 } 1747 1748 if (toplevels == 0 && nspares == 0 && nl2cache == 0) { 1749 (void) fprintf(stderr, gettext("invalid vdev " 1750 "specification: at least one toplevel vdev must be " 1751 "specified\n")); 1752 goto spec_out; 1753 } 1754 1755 if (seen_logs && nlogs == 0) { 1756 (void) fprintf(stderr, gettext("invalid vdev specification: " 1757 "log requires at least 1 device\n")); 1758 goto spec_out; 1759 } 1760 1761 /* 1762 * Finally, create nvroot and add all top-level vdevs to it. 
1763 */ 1764 verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0); 1765 verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 1766 VDEV_TYPE_ROOT) == 0); 1767 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1768 (const nvlist_t **)top, toplevels) == 0); 1769 if (nspares != 0) 1770 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1771 (const nvlist_t **)spares, nspares) == 0); 1772 if (nl2cache != 0) 1773 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 1774 (const nvlist_t **)l2cache, nl2cache) == 0); 1775 1776 spec_out: 1777 for (t = 0; t < toplevels; t++) 1778 nvlist_free(top[t]); 1779 for (t = 0; t < nspares; t++) 1780 nvlist_free(spares[t]); 1781 for (t = 0; t < nl2cache; t++) 1782 nvlist_free(l2cache[t]); 1783 1784 free(spares); 1785 free(l2cache); 1786 free(top); 1787 1788 return (nvroot); 1789 } 1790 1791 nvlist_t * 1792 split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props, 1793 splitflags_t flags, int argc, char **argv) 1794 { 1795 nvlist_t *newroot = NULL, **child; 1796 uint_t c, children; 1797 1798 if (argc > 0) { 1799 if ((newroot = construct_spec(props, argc, argv)) == NULL) { 1800 (void) fprintf(stderr, gettext("Unable to build a " 1801 "pool from the specified devices\n")); 1802 return (NULL); 1803 } 1804 1805 if (!flags.dryrun && make_disks(zhp, newroot, B_FALSE) != 0) { 1806 nvlist_free(newroot); 1807 return (NULL); 1808 } 1809 1810 /* avoid any tricks in the spec */ 1811 verify(nvlist_lookup_nvlist_array(newroot, 1812 ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); 1813 for (c = 0; c < children; c++) { 1814 const char *path; 1815 const char *type; 1816 int min, max; 1817 1818 verify(nvlist_lookup_string(child[c], 1819 ZPOOL_CONFIG_PATH, &path) == 0); 1820 if ((type = is_grouping(path, &min, &max)) != NULL) { 1821 (void) fprintf(stderr, gettext("Cannot use " 1822 "'%s' as a device for splitting\n"), type); 1823 nvlist_free(newroot); 1824 return (NULL); 1825 } 1826 } 1827 } 1828 1829 if 
(zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) { 1830 nvlist_free(newroot); 1831 return (NULL); 1832 } 1833 1834 return (newroot); 1835 } 1836 1837 static int 1838 num_normal_vdevs(nvlist_t *nvroot) 1839 { 1840 nvlist_t **top; 1841 uint_t t, toplevels, normal = 0; 1842 1843 verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1844 &top, &toplevels) == 0); 1845 1846 for (t = 0; t < toplevels; t++) { 1847 uint64_t log = B_FALSE; 1848 1849 (void) nvlist_lookup_uint64(top[t], ZPOOL_CONFIG_IS_LOG, &log); 1850 if (log) 1851 continue; 1852 if (nvlist_exists(top[t], ZPOOL_CONFIG_ALLOCATION_BIAS)) 1853 continue; 1854 1855 normal++; 1856 } 1857 1858 return (normal); 1859 } 1860 1861 /* 1862 * Get and validate the contents of the given vdev specification. This ensures 1863 * that the nvlist returned is well-formed, that all the devices exist, and that 1864 * they are not currently in use by any other known consumer. The 'poolconfig' 1865 * parameter is the current configuration of the pool when adding devices 1866 * existing pool, and is used to perform additional checks, such as changing the 1867 * replication level of the pool. It can be 'NULL' to indicate that this is a 1868 * new pool. The 'force' flag controls whether devices should be forcefully 1869 * added, even if they appear in use. 1870 */ 1871 nvlist_t * 1872 make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep, 1873 boolean_t replacing, boolean_t dryrun, int argc, char **argv) 1874 { 1875 nvlist_t *newroot; 1876 nvlist_t *poolconfig = NULL; 1877 is_force = force; 1878 1879 /* 1880 * Construct the vdev specification. If this is successful, we know 1881 * that we have a valid specification, and that all devices can be 1882 * opened. 
1883 */ 1884 if ((newroot = construct_spec(props, argc, argv)) == NULL) 1885 return (NULL); 1886 1887 if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL)) { 1888 nvlist_free(newroot); 1889 return (NULL); 1890 } 1891 1892 /* 1893 * Validate each device to make sure that it's not shared with another 1894 * subsystem. We do this even if 'force' is set, because there are some 1895 * uses (such as a dedicated dump device) that even '-f' cannot 1896 * override. 1897 */ 1898 if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) { 1899 nvlist_free(newroot); 1900 return (NULL); 1901 } 1902 1903 /* 1904 * Check the replication level of the given vdevs and report any errors 1905 * found. We include the existing pool spec, if any, as we need to 1906 * catch changes against the existing replication level. 1907 */ 1908 if (check_rep && check_replication(poolconfig, newroot) != 0) { 1909 nvlist_free(newroot); 1910 return (NULL); 1911 } 1912 1913 /* 1914 * On pool create the new vdev spec must have one normal vdev. 1915 */ 1916 if (poolconfig == NULL && num_normal_vdevs(newroot) == 0) { 1917 vdev_error(gettext("at least one general top-level vdev must " 1918 "be specified\n")); 1919 nvlist_free(newroot); 1920 return (NULL); 1921 } 1922 1923 /* 1924 * Run through the vdev specification and label any whole disks found. 1925 */ 1926 if (!dryrun && make_disks(zhp, newroot, replacing) != 0) { 1927 nvlist_free(newroot); 1928 return (NULL); 1929 } 1930 1931 return (newroot); 1932 } 1933