1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 23 /* 24 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 25 * Copyright (c) 2013, 2018 by Delphix. All rights reserved. 26 * Copyright (c) 2016, 2017 Intel Corporation. 27 * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>. 28 */ 29 30 /* 31 * Functions to convert between a list of vdevs and an nvlist representing the 32 * configuration. Each entry in the list can be one of: 33 * 34 * Device vdevs 35 * disk=(path=..., devid=...) 36 * file=(path=...) 37 * 38 * Group vdevs 39 * raidz[1|2]=(...) 40 * mirror=(...) 41 * 42 * Hot spares 43 * 44 * While the underlying implementation supports it, group vdevs cannot contain 45 * other group vdevs. All userland verification of devices is contained within 46 * this file. If successful, the nvlist returned can be passed directly to the 47 * kernel; we've done as much verification as possible in userland. 48 * 49 * Hot spares are a special case, and passed down as an array of disk vdevs, at 50 * the same level as the root of the vdev tree. 51 * 52 * The only function exported by this file is 'make_root_vdev'. The 53 * function performs several passes: 54 * 55 * 1. Construct the vdev specification. Performs syntax validation and 56 * makes sure each device is valid. 57 * 2. Check for devices in use. Using libblkid to make sure that no 58 * devices are also in use. Some can be overridden using the 'force' 59 * flag, others cannot. 60 * 3. Check for replication errors if the 'force' flag is not specified. 61 * validates that the replication level is consistent across the 62 * entire pool. 63 * 4. Call libzfs to label any whole disks with an EFI label. 64 */ 65 66 #include <assert.h> 67 #include <ctype.h> 68 #include <errno.h> 69 #include <fcntl.h> 70 #include <libintl.h> 71 #include <libnvpair.h> 72 #include <libzutil.h> 73 #include <limits.h> 74 #include <sys/spa.h> 75 #include <stdio.h> 76 #include <string.h> 77 #include <unistd.h> 78 #include "zpool_util.h" 79 #include <sys/zfs_context.h> 80 #include <sys/stat.h> 81 82 /* 83 * For any given vdev specification, we can have multiple errors. The 84 * vdev_error() function keeps track of whether we have seen an error yet, and 85 * prints out a header if its the first error we've seen. 86 */ 87 boolean_t error_seen; 88 boolean_t is_force; 89 90 void 91 vdev_error(const char *fmt, ...) 92 { 93 va_list ap; 94 95 if (!error_seen) { 96 (void) fprintf(stderr, gettext("invalid vdev specification\n")); 97 if (!is_force) 98 (void) fprintf(stderr, gettext("use '-f' to override " 99 "the following errors:\n")); 100 else 101 (void) fprintf(stderr, gettext("the following errors " 102 "must be manually repaired:\n")); 103 error_seen = B_TRUE; 104 } 105 106 va_start(ap, fmt); 107 (void) vfprintf(stderr, fmt, ap); 108 va_end(ap); 109 } 110 111 /* 112 * Check that a file is valid. All we can do in this case is check that it's 113 * not in use by another pool, and not in use by swap. 114 */ 115 int 116 check_file_generic(const char *file, boolean_t force, boolean_t isspare) 117 { 118 char *name; 119 int fd; 120 int ret = 0; 121 pool_state_t state; 122 boolean_t inuse; 123 124 if ((fd = open(file, O_RDONLY)) < 0) 125 return (0); 126 127 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) { 128 const char *desc; 129 130 switch (state) { 131 case POOL_STATE_ACTIVE: 132 desc = gettext("active"); 133 break; 134 135 case POOL_STATE_EXPORTED: 136 desc = gettext("exported"); 137 break; 138 139 case POOL_STATE_POTENTIALLY_ACTIVE: 140 desc = gettext("potentially active"); 141 break; 142 143 default: 144 desc = gettext("unknown"); 145 break; 146 } 147 148 /* 149 * Allow hot spares to be shared between pools. 150 */ 151 if (state == POOL_STATE_SPARE && isspare) { 152 free(name); 153 (void) close(fd); 154 return (0); 155 } 156 157 if (state == POOL_STATE_ACTIVE || 158 state == POOL_STATE_SPARE || !force) { 159 switch (state) { 160 case POOL_STATE_SPARE: 161 vdev_error(gettext("%s is reserved as a hot " 162 "spare for pool %s\n"), file, name); 163 break; 164 default: 165 vdev_error(gettext("%s is part of %s pool " 166 "'%s'\n"), file, desc, name); 167 break; 168 } 169 ret = -1; 170 } 171 172 free(name); 173 } 174 175 (void) close(fd); 176 return (ret); 177 } 178 179 /* 180 * This may be a shorthand device path or it could be total gibberish. 181 * Check to see if it is a known device available in zfs_vdev_paths. 182 * As part of this check, see if we've been given an entire disk 183 * (minus the slice number). 184 */ 185 static int 186 is_shorthand_path(const char *arg, char *path, size_t path_size, 187 struct stat64 *statbuf, boolean_t *wholedisk) 188 { 189 int error; 190 191 error = zfs_resolve_shortname(arg, path, path_size); 192 if (error == 0) { 193 *wholedisk = zfs_dev_is_whole_disk(path); 194 if (*wholedisk || (stat64(path, statbuf) == 0)) 195 return (0); 196 } 197 198 strlcpy(path, arg, path_size); 199 memset(statbuf, 0, sizeof (*statbuf)); 200 *wholedisk = B_FALSE; 201 202 return (error); 203 } 204 205 /* 206 * Determine if the given path is a hot spare within the given configuration. 207 * If no configuration is given we rely solely on the label. 208 */ 209 static boolean_t 210 is_spare(nvlist_t *config, const char *path) 211 { 212 int fd; 213 pool_state_t state; 214 char *name = NULL; 215 nvlist_t *label; 216 uint64_t guid, spareguid; 217 nvlist_t *nvroot; 218 nvlist_t **spares; 219 uint_t i, nspares; 220 boolean_t inuse; 221 222 if (zpool_is_draid_spare(path)) 223 return (B_TRUE); 224 225 if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0) 226 return (B_FALSE); 227 228 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 || 229 !inuse || 230 state != POOL_STATE_SPARE || 231 zpool_read_label(fd, &label, NULL) != 0) { 232 free(name); 233 (void) close(fd); 234 return (B_FALSE); 235 } 236 free(name); 237 (void) close(fd); 238 239 if (config == NULL) { 240 nvlist_free(label); 241 return (B_TRUE); 242 } 243 244 verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0); 245 nvlist_free(label); 246 247 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 248 &nvroot) == 0); 249 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 250 &spares, &nspares) == 0) { 251 for (i = 0; i < nspares; i++) { 252 verify(nvlist_lookup_uint64(spares[i], 253 ZPOOL_CONFIG_GUID, &spareguid) == 0); 254 if (spareguid == guid) 255 return (B_TRUE); 256 } 257 } 258 259 return (B_FALSE); 260 } 261 262 /* 263 * Create a leaf vdev. Determine if this is a file or a device. If it's a 264 * device, fill in the device id to make a complete nvlist. Valid forms for a 265 * leaf vdev are: 266 * 267 * /dev/xxx Complete disk path 268 * /xxx Full path to file 269 * xxx Shorthand for <zfs_vdev_paths>/xxx 270 * draid* Virtual dRAID spare 271 */ 272 static nvlist_t * 273 make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary) 274 { 275 char path[MAXPATHLEN]; 276 struct stat64 statbuf; 277 nvlist_t *vdev = NULL; 278 const char *type = NULL; 279 boolean_t wholedisk = B_FALSE; 280 uint64_t ashift = 0; 281 int err; 282 283 /* 284 * Determine what type of vdev this is, and put the full path into 285 * 'path'. We detect whether this is a device of file afterwards by 286 * checking the st_mode of the file. 287 */ 288 if (arg[0] == '/') { 289 /* 290 * Complete device or file path. Exact type is determined by 291 * examining the file descriptor afterwards. Symbolic links 292 * are resolved to their real paths to determine whole disk 293 * and S_ISBLK/S_ISREG type checks. However, we are careful 294 * to store the given path as ZPOOL_CONFIG_PATH to ensure we 295 * can leverage udev's persistent device labels. 296 */ 297 if (realpath(arg, path) == NULL) { 298 (void) fprintf(stderr, 299 gettext("cannot resolve path '%s'\n"), arg); 300 return (NULL); 301 } 302 303 wholedisk = zfs_dev_is_whole_disk(path); 304 if (!wholedisk && (stat64(path, &statbuf) != 0)) { 305 (void) fprintf(stderr, 306 gettext("cannot open '%s': %s\n"), 307 path, strerror(errno)); 308 return (NULL); 309 } 310 311 /* After whole disk check restore original passed path */ 312 strlcpy(path, arg, sizeof (path)); 313 } else if (zpool_is_draid_spare(arg)) { 314 if (!is_primary) { 315 (void) fprintf(stderr, 316 gettext("cannot open '%s': dRAID spares can only " 317 "be used to replace primary vdevs\n"), arg); 318 return (NULL); 319 } 320 321 wholedisk = B_TRUE; 322 strlcpy(path, arg, sizeof (path)); 323 type = VDEV_TYPE_DRAID_SPARE; 324 } else { 325 err = is_shorthand_path(arg, path, sizeof (path), 326 &statbuf, &wholedisk); 327 if (err != 0) { 328 /* 329 * If we got ENOENT, then the user gave us 330 * gibberish, so try to direct them with a 331 * reasonable error message. Otherwise, 332 * regurgitate strerror() since it's the best we 333 * can do. 334 */ 335 if (err == ENOENT) { 336 (void) fprintf(stderr, 337 gettext("cannot open '%s': no such " 338 "device in %s\n"), arg, DISK_ROOT); 339 (void) fprintf(stderr, 340 gettext("must be a full path or " 341 "shorthand device name\n")); 342 return (NULL); 343 } else { 344 (void) fprintf(stderr, 345 gettext("cannot open '%s': %s\n"), 346 path, strerror(errno)); 347 return (NULL); 348 } 349 } 350 } 351 352 if (type == NULL) { 353 /* 354 * Determine whether this is a device or a file. 355 */ 356 if (wholedisk || S_ISBLK(statbuf.st_mode)) { 357 type = VDEV_TYPE_DISK; 358 } else if (S_ISREG(statbuf.st_mode)) { 359 type = VDEV_TYPE_FILE; 360 } else { 361 fprintf(stderr, gettext("cannot use '%s': must " 362 "be a block device or regular file\n"), path); 363 return (NULL); 364 } 365 } 366 367 /* 368 * Finally, we have the complete device or file, and we know that it is 369 * acceptable to use. Construct the nvlist to describe this vdev. All 370 * vdevs have a 'path' element, and devices also have a 'devid' element. 371 */ 372 verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); 373 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); 374 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); 375 376 /* Lookup and add the enclosure sysfs path (if exists) */ 377 update_vdev_config_dev_sysfs_path(vdev, path, 378 ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); 379 380 if (strcmp(type, VDEV_TYPE_DISK) == 0) 381 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, 382 (uint64_t)wholedisk) == 0); 383 384 /* 385 * Override defaults if custom properties are provided. 386 */ 387 if (props != NULL) { 388 const char *value = NULL; 389 390 if (nvlist_lookup_string(props, 391 zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) { 392 if (zfs_nicestrtonum(NULL, value, &ashift) != 0) { 393 (void) fprintf(stderr, 394 gettext("ashift must be a number.\n")); 395 return (NULL); 396 } 397 if (ashift != 0 && 398 (ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) { 399 (void) fprintf(stderr, 400 gettext("invalid 'ashift=%" PRIu64 "' " 401 "property: only values between %" PRId32 " " 402 "and %" PRId32 " are allowed.\n"), 403 ashift, ASHIFT_MIN, ASHIFT_MAX); 404 return (NULL); 405 } 406 } 407 } 408 409 /* 410 * If the device is known to incorrectly report its physical sector 411 * size explicitly provide the known correct value. 412 */ 413 if (ashift == 0) { 414 int sector_size; 415 416 if (check_sector_size_database(path, §or_size) == B_TRUE) 417 ashift = highbit64(sector_size) - 1; 418 } 419 420 if (ashift > 0) 421 (void) nvlist_add_uint64(vdev, ZPOOL_CONFIG_ASHIFT, ashift); 422 423 return (vdev); 424 } 425 426 /* 427 * Go through and verify the replication level of the pool is consistent. 428 * Performs the following checks: 429 * 430 * For the new spec, verifies that devices in mirrors and raidz are the 431 * same size. 432 * 433 * If the current configuration already has inconsistent replication 434 * levels, ignore any other potential problems in the new spec. 435 * 436 * Otherwise, make sure that the current spec (if there is one) and the new 437 * spec have consistent replication levels. 438 * 439 * If there is no current spec (create), make sure new spec has at least 440 * one general purpose vdev. 441 */ 442 typedef struct replication_level { 443 const char *zprl_type; 444 uint64_t zprl_children; 445 uint64_t zprl_parity; 446 } replication_level_t; 447 448 #define ZPOOL_FUZZ (16 * 1024 * 1024) 449 450 /* 451 * N.B. For the purposes of comparing replication levels dRAID can be 452 * considered functionally equivalent to raidz. 453 */ 454 static boolean_t 455 is_raidz_mirror(replication_level_t *a, replication_level_t *b, 456 replication_level_t **raidz, replication_level_t **mirror) 457 { 458 if ((strcmp(a->zprl_type, "raidz") == 0 || 459 strcmp(a->zprl_type, "draid") == 0) && 460 strcmp(b->zprl_type, "mirror") == 0) { 461 *raidz = a; 462 *mirror = b; 463 return (B_TRUE); 464 } 465 return (B_FALSE); 466 } 467 468 /* 469 * Comparison for determining if dRAID and raidz where passed in either order. 470 */ 471 static boolean_t 472 is_raidz_draid(replication_level_t *a, replication_level_t *b) 473 { 474 if ((strcmp(a->zprl_type, "raidz") == 0 || 475 strcmp(a->zprl_type, "draid") == 0) && 476 (strcmp(b->zprl_type, "raidz") == 0 || 477 strcmp(b->zprl_type, "draid") == 0)) { 478 return (B_TRUE); 479 } 480 481 return (B_FALSE); 482 } 483 484 /* 485 * Given a list of toplevel vdevs, return the current replication level. If 486 * the config is inconsistent, then NULL is returned. If 'fatal' is set, then 487 * an error message will be displayed for each self-inconsistent vdev. 488 */ 489 static replication_level_t * 490 get_replication(nvlist_t *nvroot, boolean_t fatal) 491 { 492 nvlist_t **top; 493 uint_t t, toplevels; 494 nvlist_t **child; 495 uint_t c, children; 496 nvlist_t *nv; 497 const char *type; 498 replication_level_t lastrep = {0}; 499 replication_level_t rep; 500 replication_level_t *ret; 501 replication_level_t *raidz, *mirror; 502 boolean_t dontreport; 503 504 ret = safe_malloc(sizeof (replication_level_t)); 505 506 verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 507 &top, &toplevels) == 0); 508 509 for (t = 0; t < toplevels; t++) { 510 uint64_t is_log = B_FALSE; 511 512 nv = top[t]; 513 514 /* 515 * For separate logs we ignore the top level vdev replication 516 * constraints. 517 */ 518 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log); 519 if (is_log) 520 continue; 521 522 /* 523 * Ignore holes introduced by removing aux devices, along 524 * with indirect vdevs introduced by previously removed 525 * vdevs. 526 */ 527 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 528 if (strcmp(type, VDEV_TYPE_HOLE) == 0 || 529 strcmp(type, VDEV_TYPE_INDIRECT) == 0) 530 continue; 531 532 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 533 &child, &children) != 0) { 534 /* 535 * This is a 'file' or 'disk' vdev. 536 */ 537 rep.zprl_type = type; 538 rep.zprl_children = 1; 539 rep.zprl_parity = 0; 540 } else { 541 int64_t vdev_size; 542 543 /* 544 * This is a mirror or RAID-Z vdev. Go through and make 545 * sure the contents are all the same (files vs. disks), 546 * keeping track of the number of elements in the 547 * process. 548 * 549 * We also check that the size of each vdev (if it can 550 * be determined) is the same. 551 */ 552 rep.zprl_type = type; 553 rep.zprl_children = 0; 554 555 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 || 556 strcmp(type, VDEV_TYPE_DRAID) == 0) { 557 verify(nvlist_lookup_uint64(nv, 558 ZPOOL_CONFIG_NPARITY, 559 &rep.zprl_parity) == 0); 560 assert(rep.zprl_parity != 0); 561 } else { 562 rep.zprl_parity = 0; 563 } 564 565 /* 566 * The 'dontreport' variable indicates that we've 567 * already reported an error for this spec, so don't 568 * bother doing it again. 569 */ 570 type = NULL; 571 dontreport = 0; 572 vdev_size = -1LL; 573 for (c = 0; c < children; c++) { 574 nvlist_t *cnv = child[c]; 575 const char *path; 576 struct stat64 statbuf; 577 const char *childtype; 578 int fd, err; 579 580 rep.zprl_children++; 581 582 verify(nvlist_lookup_string(cnv, 583 ZPOOL_CONFIG_TYPE, &childtype) == 0); 584 585 /* 586 * If this is a replacing or spare vdev, then 587 * get the real first child of the vdev: do this 588 * in a loop because replacing and spare vdevs 589 * can be nested. 590 */ 591 while (strcmp(childtype, 592 VDEV_TYPE_REPLACING) == 0 || 593 strcmp(childtype, VDEV_TYPE_SPARE) == 0) { 594 nvlist_t **rchild; 595 uint_t rchildren; 596 597 verify(nvlist_lookup_nvlist_array(cnv, 598 ZPOOL_CONFIG_CHILDREN, &rchild, 599 &rchildren) == 0); 600 assert(rchildren == 2); 601 cnv = rchild[0]; 602 603 verify(nvlist_lookup_string(cnv, 604 ZPOOL_CONFIG_TYPE, 605 &childtype) == 0); 606 } 607 608 verify(nvlist_lookup_string(cnv, 609 ZPOOL_CONFIG_PATH, &path) == 0); 610 611 /* 612 * If we have a raidz/mirror that combines disks 613 * with files, report it as an error. 614 */ 615 if (!dontreport && type != NULL && 616 strcmp(type, childtype) != 0) { 617 if (ret != NULL) 618 free(ret); 619 ret = NULL; 620 if (fatal) 621 vdev_error(gettext( 622 "mismatched replication " 623 "level: %s contains both " 624 "files and devices\n"), 625 rep.zprl_type); 626 else 627 return (NULL); 628 dontreport = B_TRUE; 629 } 630 631 /* 632 * According to stat(2), the value of 'st_size' 633 * is undefined for block devices and character 634 * devices. But there is no effective way to 635 * determine the real size in userland. 636 * 637 * Instead, we'll take advantage of an 638 * implementation detail of spec_size(). If the 639 * device is currently open, then we (should) 640 * return a valid size. 641 * 642 * If we still don't get a valid size (indicated 643 * by a size of 0 or MAXOFFSET_T), then ignore 644 * this device altogether. 645 */ 646 if ((fd = open(path, O_RDONLY)) >= 0) { 647 err = fstat64_blk(fd, &statbuf); 648 (void) close(fd); 649 } else { 650 err = stat64(path, &statbuf); 651 } 652 653 if (err != 0 || 654 statbuf.st_size == 0 || 655 statbuf.st_size == MAXOFFSET_T) 656 continue; 657 658 int64_t size = statbuf.st_size; 659 660 /* 661 * Also make sure that devices and 662 * slices have a consistent size. If 663 * they differ by a significant amount 664 * (~16MB) then report an error. 665 */ 666 if (!dontreport && 667 (vdev_size != -1LL && 668 (llabs(size - vdev_size) > 669 ZPOOL_FUZZ))) { 670 if (ret != NULL) 671 free(ret); 672 ret = NULL; 673 if (fatal) 674 vdev_error(gettext( 675 "%s contains devices of " 676 "different sizes\n"), 677 rep.zprl_type); 678 else 679 return (NULL); 680 dontreport = B_TRUE; 681 } 682 683 type = childtype; 684 vdev_size = size; 685 } 686 } 687 688 /* 689 * At this point, we have the replication of the last toplevel 690 * vdev in 'rep'. Compare it to 'lastrep' to see if it is 691 * different. 692 */ 693 if (lastrep.zprl_type != NULL) { 694 if (is_raidz_mirror(&lastrep, &rep, &raidz, &mirror) || 695 is_raidz_mirror(&rep, &lastrep, &raidz, &mirror)) { 696 /* 697 * Accepted raidz and mirror when they can 698 * handle the same number of disk failures. 699 */ 700 if (raidz->zprl_parity != 701 mirror->zprl_children - 1) { 702 if (ret != NULL) 703 free(ret); 704 ret = NULL; 705 if (fatal) 706 vdev_error(gettext( 707 "mismatched replication " 708 "level: " 709 "%s and %s vdevs with " 710 "different redundancy, " 711 "%llu vs. %llu (%llu-way) " 712 "are present\n"), 713 raidz->zprl_type, 714 mirror->zprl_type, 715 (u_longlong_t) 716 raidz->zprl_parity, 717 (u_longlong_t) 718 mirror->zprl_children - 1, 719 (u_longlong_t) 720 mirror->zprl_children); 721 else 722 return (NULL); 723 } 724 } else if (is_raidz_draid(&lastrep, &rep)) { 725 /* 726 * Accepted raidz and draid when they can 727 * handle the same number of disk failures. 728 */ 729 if (lastrep.zprl_parity != rep.zprl_parity) { 730 if (ret != NULL) 731 free(ret); 732 ret = NULL; 733 if (fatal) 734 vdev_error(gettext( 735 "mismatched replication " 736 "level: %s and %s vdevs " 737 "with different " 738 "redundancy, %llu vs. " 739 "%llu are present\n"), 740 lastrep.zprl_type, 741 rep.zprl_type, 742 (u_longlong_t) 743 lastrep.zprl_parity, 744 (u_longlong_t) 745 rep.zprl_parity); 746 else 747 return (NULL); 748 } 749 } else if (strcmp(lastrep.zprl_type, rep.zprl_type) != 750 0) { 751 if (ret != NULL) 752 free(ret); 753 ret = NULL; 754 if (fatal) 755 vdev_error(gettext( 756 "mismatched replication level: " 757 "both %s and %s vdevs are " 758 "present\n"), 759 lastrep.zprl_type, rep.zprl_type); 760 else 761 return (NULL); 762 } else if (lastrep.zprl_parity != rep.zprl_parity) { 763 if (ret) 764 free(ret); 765 ret = NULL; 766 if (fatal) 767 vdev_error(gettext( 768 "mismatched replication level: " 769 "both %llu and %llu device parity " 770 "%s vdevs are present\n"), 771 (u_longlong_t) 772 lastrep.zprl_parity, 773 (u_longlong_t)rep.zprl_parity, 774 rep.zprl_type); 775 else 776 return (NULL); 777 } else if (lastrep.zprl_children != rep.zprl_children) { 778 if (ret) 779 free(ret); 780 ret = NULL; 781 if (fatal) 782 vdev_error(gettext( 783 "mismatched replication level: " 784 "both %llu-way and %llu-way %s " 785 "vdevs are present\n"), 786 (u_longlong_t) 787 lastrep.zprl_children, 788 (u_longlong_t) 789 rep.zprl_children, 790 rep.zprl_type); 791 else 792 return (NULL); 793 } 794 } 795 lastrep = rep; 796 } 797 798 if (ret != NULL) 799 *ret = rep; 800 801 return (ret); 802 } 803 804 /* 805 * Check the replication level of the vdev spec against the current pool. Calls 806 * get_replication() to make sure the new spec is self-consistent. If the pool 807 * has a consistent replication level, then we ignore any errors. Otherwise, 808 * report any difference between the two. 809 */ 810 static int 811 check_replication(nvlist_t *config, nvlist_t *newroot) 812 { 813 nvlist_t **child; 814 uint_t children; 815 replication_level_t *current = NULL, *new; 816 replication_level_t *raidz, *mirror; 817 int ret; 818 819 /* 820 * If we have a current pool configuration, check to see if it's 821 * self-consistent. If not, simply return success. 822 */ 823 if (config != NULL) { 824 nvlist_t *nvroot; 825 826 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 827 &nvroot) == 0); 828 if ((current = get_replication(nvroot, B_FALSE)) == NULL) 829 return (0); 830 } 831 /* 832 * for spares there may be no children, and therefore no 833 * replication level to check 834 */ 835 if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN, 836 &child, &children) != 0) || (children == 0)) { 837 free(current); 838 return (0); 839 } 840 841 /* 842 * If all we have is logs then there's no replication level to check. 843 */ 844 if (num_logs(newroot) == children) { 845 free(current); 846 return (0); 847 } 848 849 /* 850 * Get the replication level of the new vdev spec, reporting any 851 * inconsistencies found. 852 */ 853 if ((new = get_replication(newroot, B_TRUE)) == NULL) { 854 free(current); 855 return (-1); 856 } 857 858 /* 859 * Check to see if the new vdev spec matches the replication level of 860 * the current pool. 861 */ 862 ret = 0; 863 if (current != NULL) { 864 if (is_raidz_mirror(current, new, &raidz, &mirror) || 865 is_raidz_mirror(new, current, &raidz, &mirror)) { 866 if (raidz->zprl_parity != mirror->zprl_children - 1) { 867 vdev_error(gettext( 868 "mismatched replication level: pool and " 869 "new vdev with different redundancy, %s " 870 "and %s vdevs, %llu vs. %llu (%llu-way)\n"), 871 raidz->zprl_type, 872 mirror->zprl_type, 873 (u_longlong_t)raidz->zprl_parity, 874 (u_longlong_t)mirror->zprl_children - 1, 875 (u_longlong_t)mirror->zprl_children); 876 ret = -1; 877 } 878 } else if (is_raidz_draid(current, new)) { 879 if (current->zprl_parity != new->zprl_parity) { 880 vdev_error(gettext( 881 "mismatched replication level: pool and " 882 "new vdev with different redundancy, %s " 883 "and %s vdevs, %llu vs. %llu\n"), 884 current->zprl_type, 885 new->zprl_type, 886 (u_longlong_t)current->zprl_parity, 887 (u_longlong_t)new->zprl_parity); 888 ret = -1; 889 } 890 } else if (strcmp(current->zprl_type, new->zprl_type) != 0) { 891 vdev_error(gettext( 892 "mismatched replication level: pool uses %s " 893 "and new vdev is %s\n"), 894 current->zprl_type, new->zprl_type); 895 ret = -1; 896 } else if (current->zprl_parity != new->zprl_parity) { 897 vdev_error(gettext( 898 "mismatched replication level: pool uses %llu " 899 "device parity and new vdev uses %llu\n"), 900 (u_longlong_t)current->zprl_parity, 901 (u_longlong_t)new->zprl_parity); 902 ret = -1; 903 } else if (current->zprl_children != new->zprl_children) { 904 vdev_error(gettext( 905 "mismatched replication level: pool uses %llu-way " 906 "%s and new vdev uses %llu-way %s\n"), 907 (u_longlong_t)current->zprl_children, 908 current->zprl_type, 909 (u_longlong_t)new->zprl_children, 910 new->zprl_type); 911 ret = -1; 912 } 913 } 914 915 free(new); 916 if (current != NULL) 917 free(current); 918 919 return (ret); 920 } 921 922 static int 923 zero_label(const char *path) 924 { 925 const int size = 4096; 926 char buf[size]; 927 int err, fd; 928 929 if ((fd = open(path, O_WRONLY|O_EXCL)) < 0) { 930 (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), 931 path, strerror(errno)); 932 return (-1); 933 } 934 935 memset(buf, 0, size); 936 err = write(fd, buf, size); 937 (void) fdatasync(fd); 938 (void) close(fd); 939 940 if (err == -1) { 941 (void) fprintf(stderr, gettext("cannot zero first %d bytes " 942 "of '%s': %s\n"), size, path, strerror(errno)); 943 return (-1); 944 } 945 946 if (err != size) { 947 (void) fprintf(stderr, gettext("could only zero %d/%d bytes " 948 "of '%s'\n"), err, size, path); 949 return (-1); 950 } 951 952 return (0); 953 } 954 955 static void 956 lines_to_stderr(char *lines[], int lines_cnt) 957 { 958 int i; 959 for (i = 0; i < lines_cnt; i++) { 960 fprintf(stderr, "%s\n", lines[i]); 961 } 962 } 963 964 /* 965 * Go through and find any whole disks in the vdev specification, labelling them 966 * as appropriate. When constructing the vdev spec, we were unable to open this 967 * device in order to provide a devid. Now that we have labelled the disk and 968 * know that slice 0 is valid, we can construct the devid now. 969 * 970 * If the disk was already labeled with an EFI label, we will have gotten the 971 * devid already (because we were able to open the whole disk). Otherwise, we 972 * need to get the devid after we label the disk. 973 */ 974 static int 975 make_disks(zpool_handle_t *zhp, nvlist_t *nv, boolean_t replacing) 976 { 977 nvlist_t **child; 978 uint_t c, children; 979 const char *type, *path; 980 char devpath[MAXPATHLEN]; 981 char udevpath[MAXPATHLEN]; 982 uint64_t wholedisk; 983 struct stat64 statbuf; 984 int is_exclusive = 0; 985 int fd; 986 int ret; 987 988 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 989 990 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 991 &child, &children) != 0) { 992 993 if (strcmp(type, VDEV_TYPE_DISK) != 0) 994 return (0); 995 996 /* 997 * We have a disk device. If this is a whole disk write 998 * out the efi partition table, otherwise write zero's to 999 * the first 4k of the partition. This is to ensure that 1000 * libblkid will not misidentify the partition due to a 1001 * magic value left by the previous filesystem. 1002 */ 1003 verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path)); 1004 verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 1005 &wholedisk)); 1006 1007 if (!wholedisk) { 1008 /* 1009 * Update device id string for mpath nodes (Linux only) 1010 */ 1011 if (is_mpath_whole_disk(path)) 1012 update_vdev_config_dev_strs(nv); 1013 1014 if (!is_spare(NULL, path)) 1015 (void) zero_label(path); 1016 return (0); 1017 } 1018 1019 if (realpath(path, devpath) == NULL) { 1020 ret = errno; 1021 (void) fprintf(stderr, 1022 gettext("cannot resolve path '%s'\n"), path); 1023 return (ret); 1024 } 1025 1026 /* 1027 * Remove any previously existing symlink from a udev path to 1028 * the device before labeling the disk. This ensures that 1029 * only newly created links are used. Otherwise there is a 1030 * window between when udev deletes and recreates the link 1031 * during which access attempts will fail with ENOENT. 1032 */ 1033 strlcpy(udevpath, path, MAXPATHLEN); 1034 (void) zfs_append_partition(udevpath, MAXPATHLEN); 1035 1036 fd = open(devpath, O_RDWR|O_EXCL); 1037 if (fd == -1) { 1038 if (errno == EBUSY) 1039 is_exclusive = 1; 1040 #ifdef __FreeBSD__ 1041 if (errno == EPERM) 1042 is_exclusive = 1; 1043 #endif 1044 } else { 1045 (void) close(fd); 1046 } 1047 1048 /* 1049 * If the partition exists, contains a valid spare label, 1050 * and is opened exclusively there is no need to partition 1051 * it. Hot spares have already been partitioned and are 1052 * held open exclusively by the kernel as a safety measure. 1053 * 1054 * If the provided path is for a /dev/disk/ device its 1055 * symbolic link will be removed, partition table created, 1056 * and then block until udev creates the new link. 1057 */ 1058 if (!is_exclusive && !is_spare(NULL, udevpath)) { 1059 char *devnode = strrchr(devpath, '/') + 1; 1060 char **lines = NULL; 1061 int lines_cnt = 0; 1062 1063 ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT)); 1064 if (ret == 0) { 1065 ret = lstat64(udevpath, &statbuf); 1066 if (ret == 0 && S_ISLNK(statbuf.st_mode)) 1067 (void) unlink(udevpath); 1068 } 1069 1070 /* 1071 * When labeling a pool the raw device node name 1072 * is provided as it appears under /dev/. 1073 * 1074 * Note that 'zhp' will be NULL when we're creating a 1075 * pool. 1076 */ 1077 if (zpool_prepare_and_label_disk(g_zfs, zhp, devnode, 1078 nv, zhp == NULL ? "create" : 1079 replacing ? "replace" : "add", &lines, 1080 &lines_cnt) != 0) { 1081 (void) fprintf(stderr, 1082 gettext( 1083 "Error preparing/labeling disk.\n")); 1084 if (lines_cnt > 0) { 1085 (void) fprintf(stderr, 1086 gettext("zfs_prepare_disk output:\n")); 1087 lines_to_stderr(lines, lines_cnt); 1088 } 1089 1090 libzfs_free_str_array(lines, lines_cnt); 1091 return (-1); 1092 } 1093 libzfs_free_str_array(lines, lines_cnt); 1094 1095 /* 1096 * Wait for udev to signal the device is available 1097 * by the provided path. 1098 */ 1099 ret = zpool_label_disk_wait(udevpath, DISK_LABEL_WAIT); 1100 if (ret) { 1101 (void) fprintf(stderr, 1102 gettext("missing link: %s was " 1103 "partitioned but %s is missing\n"), 1104 devnode, udevpath); 1105 return (ret); 1106 } 1107 1108 ret = zero_label(udevpath); 1109 if (ret) 1110 return (ret); 1111 } 1112 1113 /* 1114 * Update the path to refer to the partition. The presence of 1115 * the 'whole_disk' field indicates to the CLI that we should 1116 * chop off the partition number when displaying the device in 1117 * future output. 1118 */ 1119 verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, udevpath) == 0); 1120 1121 /* 1122 * Update device id strings for whole disks (Linux only) 1123 */ 1124 update_vdev_config_dev_strs(nv); 1125 1126 return (0); 1127 } 1128 1129 for (c = 0; c < children; c++) 1130 if ((ret = make_disks(zhp, child[c], replacing)) != 0) 1131 return (ret); 1132 1133 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, 1134 &child, &children) == 0) 1135 for (c = 0; c < children; c++) 1136 if ((ret = make_disks(zhp, child[c], replacing)) != 0) 1137 return (ret); 1138 1139 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, 1140 &child, &children) == 0) 1141 for (c = 0; c < children; c++) 1142 if ((ret = make_disks(zhp, child[c], replacing)) != 0) 1143 return (ret); 1144 1145 return (0); 1146 } 1147 1148 /* 1149 * Go through and find any devices that are in use. We rely on libdiskmgt for 1150 * the majority of this task. 1151 */ 1152 static boolean_t 1153 is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, 1154 boolean_t replacing, boolean_t isspare) 1155 { 1156 nvlist_t **child; 1157 uint_t c, children; 1158 const char *type, *path; 1159 int ret = 0; 1160 char buf[MAXPATHLEN]; 1161 uint64_t wholedisk = B_FALSE; 1162 boolean_t anyinuse = B_FALSE; 1163 1164 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 1165 1166 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1167 &child, &children) != 0) { 1168 1169 verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path)); 1170 if (strcmp(type, VDEV_TYPE_DISK) == 0) 1171 verify(!nvlist_lookup_uint64(nv, 1172 ZPOOL_CONFIG_WHOLE_DISK, &wholedisk)); 1173 1174 /* 1175 * As a generic check, we look to see if this is a replace of a 1176 * hot spare within the same pool. If so, we allow it 1177 * regardless of what libblkid or zpool_in_use() says. 1178 */ 1179 if (replacing) { 1180 (void) strlcpy(buf, path, sizeof (buf)); 1181 if (wholedisk) { 1182 ret = zfs_append_partition(buf, sizeof (buf)); 1183 if (ret == -1) 1184 return (-1); 1185 } 1186 1187 if (is_spare(config, buf)) 1188 return (B_FALSE); 1189 } 1190 1191 if (strcmp(type, VDEV_TYPE_DISK) == 0) 1192 ret = check_device(path, force, isspare, wholedisk); 1193 1194 else if (strcmp(type, VDEV_TYPE_FILE) == 0) 1195 ret = check_file(path, force, isspare); 1196 1197 return (ret != 0); 1198 } 1199 1200 for (c = 0; c < children; c++) 1201 if (is_device_in_use(config, child[c], force, replacing, 1202 B_FALSE)) 1203 anyinuse = B_TRUE; 1204 1205 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, 1206 &child, &children) == 0) 1207 for (c = 0; c < children; c++) 1208 if (is_device_in_use(config, child[c], force, replacing, 1209 B_TRUE)) 1210 anyinuse = B_TRUE; 1211 1212 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, 1213 &child, &children) == 0) 1214 for (c = 0; c < children; c++) 1215 if (is_device_in_use(config, child[c], force, replacing, 1216 B_FALSE)) 1217 anyinuse = B_TRUE; 1218 1219 return (anyinuse); 1220 } 1221 1222 /* 1223 * Returns the parity level extracted from a raidz or draid type. 1224 * If the parity cannot be determined zero is returned. 1225 */ 1226 static int 1227 get_parity(const char *type) 1228 { 1229 long parity = 0; 1230 const char *p; 1231 1232 if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0) { 1233 p = type + strlen(VDEV_TYPE_RAIDZ); 1234 1235 if (*p == '\0') { 1236 /* when unspecified default to single parity */ 1237 return (1); 1238 } else if (*p == '0') { 1239 /* no zero prefixes allowed */ 1240 return (0); 1241 } else { 1242 /* 0-3, no suffixes allowed */ 1243 char *end; 1244 errno = 0; 1245 parity = strtol(p, &end, 10); 1246 if (errno != 0 || *end != '\0' || 1247 parity < 1 || parity > VDEV_RAIDZ_MAXPARITY) { 1248 return (0); 1249 } 1250 } 1251 } else if (strncmp(type, VDEV_TYPE_DRAID, 1252 strlen(VDEV_TYPE_DRAID)) == 0) { 1253 p = type + strlen(VDEV_TYPE_DRAID); 1254 1255 if (*p == '\0' || *p == ':') { 1256 /* when unspecified default to single parity */ 1257 return (1); 1258 } else if (*p == '0') { 1259 /* no zero prefixes allowed */ 1260 return (0); 1261 } else { 1262 /* 0-3, allowed suffixes: '\0' or ':' */ 1263 char *end; 1264 errno = 0; 1265 parity = strtol(p, &end, 10); 1266 if (errno != 0 || 1267 parity < 1 || parity > VDEV_DRAID_MAXPARITY || 1268 (*end != '\0' && *end != ':')) { 1269 return (0); 1270 } 1271 } 1272 } 1273 1274 return ((int)parity); 1275 } 1276 1277 /* 1278 * Assign the minimum and maximum number of devices allowed for 1279 * the specified type. On error NULL is returned, otherwise the 1280 * type prefix is returned (raidz, mirror, etc). 1281 */ 1282 static const char * 1283 is_grouping(const char *type, int *mindev, int *maxdev) 1284 { 1285 int nparity; 1286 1287 if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 || 1288 strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0) { 1289 nparity = get_parity(type); 1290 if (nparity == 0) 1291 return (NULL); 1292 if (mindev != NULL) 1293 *mindev = nparity + 1; 1294 if (maxdev != NULL) 1295 *maxdev = 255; 1296 1297 if (strncmp(type, VDEV_TYPE_RAIDZ, 1298 strlen(VDEV_TYPE_RAIDZ)) == 0) { 1299 return (VDEV_TYPE_RAIDZ); 1300 } else { 1301 return (VDEV_TYPE_DRAID); 1302 } 1303 } 1304 1305 if (maxdev != NULL) 1306 *maxdev = INT_MAX; 1307 1308 if (strcmp(type, "mirror") == 0) { 1309 if (mindev != NULL) 1310 *mindev = 2; 1311 return (VDEV_TYPE_MIRROR); 1312 } 1313 1314 if (strcmp(type, "spare") == 0) { 1315 if (mindev != NULL) 1316 *mindev = 1; 1317 return (VDEV_TYPE_SPARE); 1318 } 1319 1320 if (strcmp(type, "log") == 0) { 1321 if (mindev != NULL) 1322 *mindev = 1; 1323 return (VDEV_TYPE_LOG); 1324 } 1325 1326 if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0 || 1327 strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { 1328 if (mindev != NULL) 1329 *mindev = 1; 1330 return (type); 1331 } 1332 1333 if (strcmp(type, "cache") == 0) { 1334 if (mindev != NULL) 1335 *mindev = 1; 1336 return (VDEV_TYPE_L2CACHE); 1337 } 1338 1339 return (NULL); 1340 } 1341 1342 /* 1343 * Extract the configuration parameters encoded in the dRAID type and 1344 * use them to generate a dRAID configuration. The expected format is: 1345 * 1346 * draid[<parity>][:<data><d|D>][:<children><c|C>][:<spares><s|S>] 1347 * 1348 * The intent is to be able to generate a good configuration when no 1349 * additional information is provided. The only mandatory component 1350 * of the 'type' is the 'draid' prefix. If a value is not provided 1351 * then reasonable defaults are used. The optional components may 1352 * appear in any order but the d/s/c suffix is required. 1353 * 1354 * Valid inputs: 1355 * - data: number of data devices per group (1-255) 1356 * - parity: number of parity blocks per group (1-3) 1357 * - spares: number of distributed spare (0-100) 1358 * - children: total number of devices (1-255) 1359 * 1360 * Examples: 1361 * - zpool create tank draid <devices...> 1362 * - zpool create tank draid2:8d:51c:2s <devices...> 1363 */ 1364 static int 1365 draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children) 1366 { 1367 uint64_t nparity; 1368 uint64_t nspares = 0; 1369 uint64_t ndata = UINT64_MAX; 1370 uint64_t ngroups = 1; 1371 long value; 1372 1373 if (strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) != 0) 1374 return (EINVAL); 1375 1376 nparity = (uint64_t)get_parity(type); 1377 if (nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) { 1378 fprintf(stderr, 1379 gettext("invalid dRAID parity level %llu; must be " 1380 "between 1 and %d\n"), (u_longlong_t)nparity, 1381 VDEV_DRAID_MAXPARITY); 1382 return (EINVAL); 1383 } 1384 1385 char *p = (char *)type; 1386 while ((p = strchr(p, ':')) != NULL) { 1387 char *end; 1388 1389 p = p + 1; 1390 errno = 0; 1391 1392 if (!isdigit(p[0])) { 1393 (void) fprintf(stderr, gettext("invalid dRAID " 1394 "syntax; expected [:<number><c|d|s>] not '%s'\n"), 1395 type); 1396 return (EINVAL); 1397 } 1398 1399 /* Expected non-zero value with c/d/s suffix */ 1400 value = strtol(p, &end, 10); 1401 char suffix = tolower(*end); 1402 if (errno != 0 || 1403 (suffix != 'c' && suffix != 'd' && suffix != 's')) { 1404 (void) fprintf(stderr, gettext("invalid dRAID " 1405 "syntax; expected [:<number><c|d|s>] not '%s'\n"), 1406 type); 1407 return (EINVAL); 1408 } 1409 1410 if (suffix == 'c') { 1411 if ((uint64_t)value != children) { 1412 fprintf(stderr, 1413 gettext("invalid number of dRAID children; " 1414 "%llu required but %llu provided\n"), 1415 (u_longlong_t)value, 1416 (u_longlong_t)children); 1417 return (EINVAL); 1418 } 1419 } else if (suffix == 'd') { 1420 ndata = (uint64_t)value; 1421 } else if (suffix == 's') { 1422 nspares = (uint64_t)value; 1423 } else { 1424 verify(0); /* Unreachable */ 1425 } 1426 } 1427 1428 /* 1429 * When a specific number of data disks is not provided limit a 1430 * redundancy group to 8 data disks. This value was selected to 1431 * provide a reasonable tradeoff between capacity and performance. 1432 */ 1433 if (ndata == UINT64_MAX) { 1434 if (children > nspares + nparity) { 1435 ndata = MIN(children - nspares - nparity, 8); 1436 } else { 1437 fprintf(stderr, gettext("request number of " 1438 "distributed spares %llu and parity level %llu\n" 1439 "leaves no disks available for data\n"), 1440 (u_longlong_t)nspares, (u_longlong_t)nparity); 1441 return (EINVAL); 1442 } 1443 } 1444 1445 /* Verify the maximum allowed group size is never exceeded. */ 1446 if (ndata == 0 || (ndata + nparity > children - nspares)) { 1447 fprintf(stderr, gettext("requested number of dRAID data " 1448 "disks per group %llu is too high,\nat most %llu disks " 1449 "are available for data\n"), (u_longlong_t)ndata, 1450 (u_longlong_t)(children - nspares - nparity)); 1451 return (EINVAL); 1452 } 1453 1454 /* 1455 * Verify the requested number of spares can be satisfied. 1456 * An arbitrary limit of 100 distributed spares is applied. 1457 */ 1458 if (nspares > 100 || nspares > (children - (ndata + nparity))) { 1459 fprintf(stderr, 1460 gettext("invalid number of dRAID spares %llu; additional " 1461 "disks would be required\n"), (u_longlong_t)nspares); 1462 return (EINVAL); 1463 } 1464 1465 /* Verify the requested number children is sufficient. */ 1466 if (children < (ndata + nparity + nspares)) { 1467 fprintf(stderr, gettext("%llu disks were provided, but at " 1468 "least %llu disks are required for this config\n"), 1469 (u_longlong_t)children, 1470 (u_longlong_t)(ndata + nparity + nspares)); 1471 } 1472 1473 if (children > VDEV_DRAID_MAX_CHILDREN) { 1474 fprintf(stderr, gettext("%llu disks were provided, but " 1475 "dRAID only supports up to %u disks"), 1476 (u_longlong_t)children, VDEV_DRAID_MAX_CHILDREN); 1477 } 1478 1479 /* 1480 * Calculate the minimum number of groups required to fill a slice. 1481 * This is the LCM of the stripe width (ndata + nparity) and the 1482 * number of data drives (children - nspares). 1483 */ 1484 while (ngroups * (ndata + nparity) % (children - nspares) != 0) 1485 ngroups++; 1486 1487 /* Store the basic dRAID configuration. */ 1488 fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity); 1489 fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, ndata); 1490 fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, nspares); 1491 fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); 1492 1493 return (0); 1494 } 1495 1496 /* 1497 * Construct a syntactically valid vdev specification, 1498 * and ensure that all devices and files exist and can be opened. 1499 * Note: we don't bother freeing anything in the error paths 1500 * because the program is just going to exit anyway. 1501 */ 1502 static nvlist_t * 1503 construct_spec(nvlist_t *props, int argc, char **argv) 1504 { 1505 nvlist_t *nvroot, *nv, **top, **spares, **l2cache; 1506 int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache; 1507 const char *type, *fulltype; 1508 boolean_t is_log, is_special, is_dedup, is_spare; 1509 boolean_t seen_logs; 1510 1511 top = NULL; 1512 toplevels = 0; 1513 spares = NULL; 1514 l2cache = NULL; 1515 nspares = 0; 1516 nlogs = 0; 1517 nl2cache = 0; 1518 is_log = is_special = is_dedup = is_spare = B_FALSE; 1519 seen_logs = B_FALSE; 1520 nvroot = NULL; 1521 1522 while (argc > 0) { 1523 fulltype = argv[0]; 1524 nv = NULL; 1525 1526 /* 1527 * If it's a mirror, raidz, or draid the subsequent arguments 1528 * are its leaves -- until we encounter the next mirror, 1529 * raidz or draid. 1530 */ 1531 if ((type = is_grouping(fulltype, &mindev, &maxdev)) != NULL) { 1532 nvlist_t **child = NULL; 1533 int c, children = 0; 1534 1535 if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1536 if (spares != NULL) { 1537 (void) fprintf(stderr, 1538 gettext("invalid vdev " 1539 "specification: 'spare' can be " 1540 "specified only once\n")); 1541 goto spec_out; 1542 } 1543 is_spare = B_TRUE; 1544 is_log = is_special = is_dedup = B_FALSE; 1545 } 1546 1547 if (strcmp(type, VDEV_TYPE_LOG) == 0) { 1548 if (seen_logs) { 1549 (void) fprintf(stderr, 1550 gettext("invalid vdev " 1551 "specification: 'log' can be " 1552 "specified only once\n")); 1553 goto spec_out; 1554 } 1555 seen_logs = B_TRUE; 1556 is_log = B_TRUE; 1557 is_special = is_dedup = is_spare = B_FALSE; 1558 argc--; 1559 argv++; 1560 /* 1561 * A log is not a real grouping device. 1562 * We just set is_log and continue. 1563 */ 1564 continue; 1565 } 1566 1567 if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) { 1568 is_special = B_TRUE; 1569 is_log = is_dedup = is_spare = B_FALSE; 1570 argc--; 1571 argv++; 1572 continue; 1573 } 1574 1575 if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { 1576 is_dedup = B_TRUE; 1577 is_log = is_special = is_spare = B_FALSE; 1578 argc--; 1579 argv++; 1580 continue; 1581 } 1582 1583 if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { 1584 if (l2cache != NULL) { 1585 (void) fprintf(stderr, 1586 gettext("invalid vdev " 1587 "specification: 'cache' can be " 1588 "specified only once\n")); 1589 goto spec_out; 1590 } 1591 is_log = is_special = B_FALSE; 1592 is_dedup = is_spare = B_FALSE; 1593 } 1594 1595 if (is_log) { 1596 if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { 1597 (void) fprintf(stderr, 1598 gettext("invalid vdev " 1599 "specification: unsupported 'log' " 1600 "device: %s\n"), type); 1601 goto spec_out; 1602 } 1603 nlogs++; 1604 } 1605 1606 for (c = 1; c < argc; c++) { 1607 if (is_grouping(argv[c], NULL, NULL) != NULL) 1608 break; 1609 1610 children++; 1611 child = realloc(child, 1612 children * sizeof (nvlist_t *)); 1613 if (child == NULL) 1614 zpool_no_memory(); 1615 if ((nv = make_leaf_vdev(props, argv[c], 1616 !(is_log || is_special || is_dedup || 1617 is_spare))) == NULL) { 1618 for (c = 0; c < children - 1; c++) 1619 nvlist_free(child[c]); 1620 free(child); 1621 goto spec_out; 1622 } 1623 1624 child[children - 1] = nv; 1625 } 1626 1627 if (children < mindev) { 1628 (void) fprintf(stderr, gettext("invalid vdev " 1629 "specification: %s requires at least %d " 1630 "devices\n"), argv[0], mindev); 1631 for (c = 0; c < children; c++) 1632 nvlist_free(child[c]); 1633 free(child); 1634 goto spec_out; 1635 } 1636 1637 if (children > maxdev) { 1638 (void) fprintf(stderr, gettext("invalid vdev " 1639 "specification: %s supports no more than " 1640 "%d devices\n"), argv[0], maxdev); 1641 for (c = 0; c < children; c++) 1642 nvlist_free(child[c]); 1643 free(child); 1644 goto spec_out; 1645 } 1646 1647 argc -= c; 1648 argv += c; 1649 1650 if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1651 spares = child; 1652 nspares = children; 1653 continue; 1654 } else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { 1655 l2cache = child; 1656 nl2cache = children; 1657 continue; 1658 } else { 1659 /* create a top-level vdev with children */ 1660 verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, 1661 0) == 0); 1662 verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, 1663 type) == 0); 1664 verify(nvlist_add_uint64(nv, 1665 ZPOOL_CONFIG_IS_LOG, is_log) == 0); 1666 if (is_log) { 1667 verify(nvlist_add_string(nv, 1668 ZPOOL_CONFIG_ALLOCATION_BIAS, 1669 VDEV_ALLOC_BIAS_LOG) == 0); 1670 } 1671 if (is_special) { 1672 verify(nvlist_add_string(nv, 1673 ZPOOL_CONFIG_ALLOCATION_BIAS, 1674 VDEV_ALLOC_BIAS_SPECIAL) == 0); 1675 } 1676 if (is_dedup) { 1677 verify(nvlist_add_string(nv, 1678 ZPOOL_CONFIG_ALLOCATION_BIAS, 1679 VDEV_ALLOC_BIAS_DEDUP) == 0); 1680 } 1681 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 1682 verify(nvlist_add_uint64(nv, 1683 ZPOOL_CONFIG_NPARITY, 1684 mindev - 1) == 0); 1685 } 1686 if (strcmp(type, VDEV_TYPE_DRAID) == 0) { 1687 if (draid_config_by_type(nv, 1688 fulltype, children) != 0) { 1689 for (c = 0; c < children; c++) 1690 nvlist_free(child[c]); 1691 free(child); 1692 goto spec_out; 1693 } 1694 } 1695 verify(nvlist_add_nvlist_array(nv, 1696 ZPOOL_CONFIG_CHILDREN, 1697 (const nvlist_t **)child, children) == 0); 1698 1699 for (c = 0; c < children; c++) 1700 nvlist_free(child[c]); 1701 free(child); 1702 } 1703 } else { 1704 /* 1705 * We have a device. Pass off to make_leaf_vdev() to 1706 * construct the appropriate nvlist describing the vdev. 1707 */ 1708 if ((nv = make_leaf_vdev(props, argv[0], !(is_log || 1709 is_special || is_dedup || is_spare))) == NULL) 1710 goto spec_out; 1711 1712 verify(nvlist_add_uint64(nv, 1713 ZPOOL_CONFIG_IS_LOG, is_log) == 0); 1714 if (is_log) { 1715 verify(nvlist_add_string(nv, 1716 ZPOOL_CONFIG_ALLOCATION_BIAS, 1717 VDEV_ALLOC_BIAS_LOG) == 0); 1718 nlogs++; 1719 } 1720 1721 if (is_special) { 1722 verify(nvlist_add_string(nv, 1723 ZPOOL_CONFIG_ALLOCATION_BIAS, 1724 VDEV_ALLOC_BIAS_SPECIAL) == 0); 1725 } 1726 if (is_dedup) { 1727 verify(nvlist_add_string(nv, 1728 ZPOOL_CONFIG_ALLOCATION_BIAS, 1729 VDEV_ALLOC_BIAS_DEDUP) == 0); 1730 } 1731 argc--; 1732 argv++; 1733 } 1734 1735 toplevels++; 1736 top = realloc(top, toplevels * sizeof (nvlist_t *)); 1737 if (top == NULL) 1738 zpool_no_memory(); 1739 top[toplevels - 1] = nv; 1740 } 1741 1742 if (toplevels == 0 && nspares == 0 && nl2cache == 0) { 1743 (void) fprintf(stderr, gettext("invalid vdev " 1744 "specification: at least one toplevel vdev must be " 1745 "specified\n")); 1746 goto spec_out; 1747 } 1748 1749 if (seen_logs && nlogs == 0) { 1750 (void) fprintf(stderr, gettext("invalid vdev specification: " 1751 "log requires at least 1 device\n")); 1752 goto spec_out; 1753 } 1754 1755 /* 1756 * Finally, create nvroot and add all top-level vdevs to it. 1757 */ 1758 verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0); 1759 verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 1760 VDEV_TYPE_ROOT) == 0); 1761 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1762 (const nvlist_t **)top, toplevels) == 0); 1763 if (nspares != 0) 1764 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1765 (const nvlist_t **)spares, nspares) == 0); 1766 if (nl2cache != 0) 1767 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 1768 (const nvlist_t **)l2cache, nl2cache) == 0); 1769 1770 spec_out: 1771 for (t = 0; t < toplevels; t++) 1772 nvlist_free(top[t]); 1773 for (t = 0; t < nspares; t++) 1774 nvlist_free(spares[t]); 1775 for (t = 0; t < nl2cache; t++) 1776 nvlist_free(l2cache[t]); 1777 1778 free(spares); 1779 free(l2cache); 1780 free(top); 1781 1782 return (nvroot); 1783 } 1784 1785 nvlist_t * 1786 split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props, 1787 splitflags_t flags, int argc, char **argv) 1788 { 1789 nvlist_t *newroot = NULL, **child; 1790 uint_t c, children; 1791 1792 if (argc > 0) { 1793 if ((newroot = construct_spec(props, argc, argv)) == NULL) { 1794 (void) fprintf(stderr, gettext("Unable to build a " 1795 "pool from the specified devices\n")); 1796 return (NULL); 1797 } 1798 1799 if (!flags.dryrun && make_disks(zhp, newroot, B_FALSE) != 0) { 1800 nvlist_free(newroot); 1801 return (NULL); 1802 } 1803 1804 /* avoid any tricks in the spec */ 1805 verify(nvlist_lookup_nvlist_array(newroot, 1806 ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); 1807 for (c = 0; c < children; c++) { 1808 const char *path; 1809 const char *type; 1810 int min, max; 1811 1812 verify(nvlist_lookup_string(child[c], 1813 ZPOOL_CONFIG_PATH, &path) == 0); 1814 if ((type = is_grouping(path, &min, &max)) != NULL) { 1815 (void) fprintf(stderr, gettext("Cannot use " 1816 "'%s' as a device for splitting\n"), type); 1817 nvlist_free(newroot); 1818 return (NULL); 1819 } 1820 } 1821 } 1822 1823 if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) { 1824 nvlist_free(newroot); 1825 return (NULL); 1826 } 1827 1828 return (newroot); 1829 } 1830 1831 static int 1832 num_normal_vdevs(nvlist_t *nvroot) 1833 { 1834 nvlist_t **top; 1835 uint_t t, toplevels, normal = 0; 1836 1837 verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1838 &top, &toplevels) == 0); 1839 1840 for (t = 0; t < toplevels; t++) { 1841 uint64_t log = B_FALSE; 1842 1843 (void) nvlist_lookup_uint64(top[t], ZPOOL_CONFIG_IS_LOG, &log); 1844 if (log) 1845 continue; 1846 if (nvlist_exists(top[t], ZPOOL_CONFIG_ALLOCATION_BIAS)) 1847 continue; 1848 1849 normal++; 1850 } 1851 1852 return (normal); 1853 } 1854 1855 /* 1856 * Get and validate the contents of the given vdev specification. This ensures 1857 * that the nvlist returned is well-formed, that all the devices exist, and that 1858 * they are not currently in use by any other known consumer. The 'poolconfig' 1859 * parameter is the current configuration of the pool when adding devices 1860 * existing pool, and is used to perform additional checks, such as changing the 1861 * replication level of the pool. It can be 'NULL' to indicate that this is a 1862 * new pool. The 'force' flag controls whether devices should be forcefully 1863 * added, even if they appear in use. 1864 */ 1865 nvlist_t * 1866 make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep, 1867 boolean_t replacing, boolean_t dryrun, int argc, char **argv) 1868 { 1869 nvlist_t *newroot; 1870 nvlist_t *poolconfig = NULL; 1871 is_force = force; 1872 1873 /* 1874 * Construct the vdev specification. If this is successful, we know 1875 * that we have a valid specification, and that all devices can be 1876 * opened. 1877 */ 1878 if ((newroot = construct_spec(props, argc, argv)) == NULL) 1879 return (NULL); 1880 1881 if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL)) { 1882 nvlist_free(newroot); 1883 return (NULL); 1884 } 1885 1886 /* 1887 * Validate each device to make sure that it's not shared with another 1888 * subsystem. We do this even if 'force' is set, because there are some 1889 * uses (such as a dedicated dump device) that even '-f' cannot 1890 * override. 1891 */ 1892 if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) { 1893 nvlist_free(newroot); 1894 return (NULL); 1895 } 1896 1897 /* 1898 * Check the replication level of the given vdevs and report any errors 1899 * found. We include the existing pool spec, if any, as we need to 1900 * catch changes against the existing replication level. 1901 */ 1902 if (check_rep && check_replication(poolconfig, newroot) != 0) { 1903 nvlist_free(newroot); 1904 return (NULL); 1905 } 1906 1907 /* 1908 * On pool create the new vdev spec must have one normal vdev. 1909 */ 1910 if (poolconfig == NULL && num_normal_vdevs(newroot) == 0) { 1911 vdev_error(gettext("at least one general top-level vdev must " 1912 "be specified\n")); 1913 nvlist_free(newroot); 1914 return (NULL); 1915 } 1916 1917 /* 1918 * Run through the vdev specification and label any whole disks found. 1919 */ 1920 if (!dryrun && make_disks(zhp, newroot, replacing) != 0) { 1921 nvlist_free(newroot); 1922 return (NULL); 1923 } 1924 1925 return (newroot); 1926 } 1927