1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 23 /* 24 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 25 * Copyright (c) 2013, 2018 by Delphix. All rights reserved. 26 * Copyright (c) 2016, 2017 Intel Corporation. 27 * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>. 28 */ 29 30 /* 31 * Functions to convert between a list of vdevs and an nvlist representing the 32 * configuration. Each entry in the list can be one of: 33 * 34 * Device vdevs 35 * disk=(path=..., devid=...) 36 * file=(path=...) 37 * 38 * Group vdevs 39 * raidz[1|2]=(...) 40 * mirror=(...) 41 * 42 * Hot spares 43 * 44 * While the underlying implementation supports it, group vdevs cannot contain 45 * other group vdevs. All userland verification of devices is contained within 46 * this file. If successful, the nvlist returned can be passed directly to the 47 * kernel; we've done as much verification as possible in userland. 48 * 49 * Hot spares are a special case, and passed down as an array of disk vdevs, at 50 * the same level as the root of the vdev tree. 51 * 52 * The only function exported by this file is 'make_root_vdev'. 
The 53 * function performs several passes: 54 * 55 * 1. Construct the vdev specification. Performs syntax validation and 56 * makes sure each device is valid. 57 * 2. Check for devices in use. Using libblkid to make sure that no 58 * devices are also in use. Some can be overridden using the 'force' 59 * flag, others cannot. 60 * 3. Check for replication errors if the 'force' flag is not specified. 61 * validates that the replication level is consistent across the 62 * entire pool. 63 * 4. Call libzfs to label any whole disks with an EFI label. 64 */ 65 66 #include <assert.h> 67 #include <ctype.h> 68 #include <errno.h> 69 #include <fcntl.h> 70 #include <libintl.h> 71 #include <libnvpair.h> 72 #include <libzutil.h> 73 #include <limits.h> 74 #include <sys/spa.h> 75 #include <stdio.h> 76 #include <string.h> 77 #include <unistd.h> 78 #include "zpool_util.h" 79 #include <sys/zfs_context.h> 80 #include <sys/stat.h> 81 82 /* 83 * For any given vdev specification, we can have multiple errors. The 84 * vdev_error() function keeps track of whether we have seen an error yet, and 85 * prints out a header if its the first error we've seen. 86 */ 87 boolean_t error_seen; 88 boolean_t is_force; 89 90 void 91 vdev_error(const char *fmt, ...) 92 { 93 va_list ap; 94 95 if (!error_seen) { 96 (void) fprintf(stderr, gettext("invalid vdev specification\n")); 97 if (!is_force) 98 (void) fprintf(stderr, gettext("use '-f' to override " 99 "the following errors:\n")); 100 else 101 (void) fprintf(stderr, gettext("the following errors " 102 "must be manually repaired:\n")); 103 error_seen = B_TRUE; 104 } 105 106 va_start(ap, fmt); 107 (void) vfprintf(stderr, fmt, ap); 108 va_end(ap); 109 } 110 111 /* 112 * Check that a file is valid. All we can do in this case is check that it's 113 * not in use by another pool, and not in use by swap. 
114 */ 115 int 116 check_file_generic(const char *file, boolean_t force, boolean_t isspare) 117 { 118 char *name; 119 int fd; 120 int ret = 0; 121 pool_state_t state; 122 boolean_t inuse; 123 124 if ((fd = open(file, O_RDONLY)) < 0) 125 return (0); 126 127 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) { 128 const char *desc; 129 130 switch (state) { 131 case POOL_STATE_ACTIVE: 132 desc = gettext("active"); 133 break; 134 135 case POOL_STATE_EXPORTED: 136 desc = gettext("exported"); 137 break; 138 139 case POOL_STATE_POTENTIALLY_ACTIVE: 140 desc = gettext("potentially active"); 141 break; 142 143 default: 144 desc = gettext("unknown"); 145 break; 146 } 147 148 /* 149 * Allow hot spares to be shared between pools. 150 */ 151 if (state == POOL_STATE_SPARE && isspare) { 152 free(name); 153 (void) close(fd); 154 return (0); 155 } 156 157 if (state == POOL_STATE_ACTIVE || 158 state == POOL_STATE_SPARE || !force) { 159 switch (state) { 160 case POOL_STATE_SPARE: 161 vdev_error(gettext("%s is reserved as a hot " 162 "spare for pool %s\n"), file, name); 163 break; 164 default: 165 vdev_error(gettext("%s is part of %s pool " 166 "'%s'\n"), file, desc, name); 167 break; 168 } 169 ret = -1; 170 } 171 172 free(name); 173 } 174 175 (void) close(fd); 176 return (ret); 177 } 178 179 /* 180 * This may be a shorthand device path or it could be total gibberish. 181 * Check to see if it is a known device available in zfs_vdev_paths. 182 * As part of this check, see if we've been given an entire disk 183 * (minus the slice number). 
184 */ 185 static int 186 is_shorthand_path(const char *arg, char *path, size_t path_size, 187 struct stat64 *statbuf, boolean_t *wholedisk) 188 { 189 int error; 190 191 error = zfs_resolve_shortname(arg, path, path_size); 192 if (error == 0) { 193 *wholedisk = zfs_dev_is_whole_disk(path); 194 if (*wholedisk || (stat64(path, statbuf) == 0)) 195 return (0); 196 } 197 198 strlcpy(path, arg, path_size); 199 memset(statbuf, 0, sizeof (*statbuf)); 200 *wholedisk = B_FALSE; 201 202 return (error); 203 } 204 205 /* 206 * Determine if the given path is a hot spare within the given configuration. 207 * If no configuration is given we rely solely on the label. 208 */ 209 static boolean_t 210 is_spare(nvlist_t *config, const char *path) 211 { 212 int fd; 213 pool_state_t state; 214 char *name = NULL; 215 nvlist_t *label; 216 uint64_t guid, spareguid; 217 nvlist_t *nvroot; 218 nvlist_t **spares; 219 uint_t i, nspares; 220 boolean_t inuse; 221 222 if (zpool_is_draid_spare(path)) 223 return (B_TRUE); 224 225 if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0) 226 return (B_FALSE); 227 228 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 || 229 !inuse || 230 state != POOL_STATE_SPARE || 231 zpool_read_label(fd, &label, NULL) != 0) { 232 free(name); 233 (void) close(fd); 234 return (B_FALSE); 235 } 236 free(name); 237 (void) close(fd); 238 239 if (config == NULL) { 240 nvlist_free(label); 241 return (B_TRUE); 242 } 243 244 verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0); 245 nvlist_free(label); 246 247 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 248 &nvroot) == 0); 249 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 250 &spares, &nspares) == 0) { 251 for (i = 0; i < nspares; i++) { 252 verify(nvlist_lookup_uint64(spares[i], 253 ZPOOL_CONFIG_GUID, &spareguid) == 0); 254 if (spareguid == guid) 255 return (B_TRUE); 256 } 257 } 258 259 return (B_FALSE); 260 } 261 262 /* 263 * Create a leaf vdev. Determine if this is a file or a device. 
If it's a 264 * device, fill in the device id to make a complete nvlist. Valid forms for a 265 * leaf vdev are: 266 * 267 * /dev/xxx Complete disk path 268 * /xxx Full path to file 269 * xxx Shorthand for <zfs_vdev_paths>/xxx 270 * draid* Virtual dRAID spare 271 */ 272 static nvlist_t * 273 make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary) 274 { 275 char path[MAXPATHLEN]; 276 struct stat64 statbuf; 277 nvlist_t *vdev = NULL; 278 const char *type = NULL; 279 boolean_t wholedisk = B_FALSE; 280 uint64_t ashift = 0; 281 int err; 282 283 /* 284 * Determine what type of vdev this is, and put the full path into 285 * 'path'. We detect whether this is a device of file afterwards by 286 * checking the st_mode of the file. 287 */ 288 if (arg[0] == '/') { 289 /* 290 * Complete device or file path. Exact type is determined by 291 * examining the file descriptor afterwards. Symbolic links 292 * are resolved to their real paths to determine whole disk 293 * and S_ISBLK/S_ISREG type checks. However, we are careful 294 * to store the given path as ZPOOL_CONFIG_PATH to ensure we 295 * can leverage udev's persistent device labels. 
296 */ 297 if (realpath(arg, path) == NULL) { 298 (void) fprintf(stderr, 299 gettext("cannot resolve path '%s'\n"), arg); 300 return (NULL); 301 } 302 303 wholedisk = zfs_dev_is_whole_disk(path); 304 if (!wholedisk && (stat64(path, &statbuf) != 0)) { 305 (void) fprintf(stderr, 306 gettext("cannot open '%s': %s\n"), 307 path, strerror(errno)); 308 return (NULL); 309 } 310 311 /* After whole disk check restore original passed path */ 312 strlcpy(path, arg, sizeof (path)); 313 } else if (zpool_is_draid_spare(arg)) { 314 if (!is_primary) { 315 (void) fprintf(stderr, 316 gettext("cannot open '%s': dRAID spares can only " 317 "be used to replace primary vdevs\n"), arg); 318 return (NULL); 319 } 320 321 wholedisk = B_TRUE; 322 strlcpy(path, arg, sizeof (path)); 323 type = VDEV_TYPE_DRAID_SPARE; 324 } else { 325 err = is_shorthand_path(arg, path, sizeof (path), 326 &statbuf, &wholedisk); 327 if (err != 0) { 328 /* 329 * If we got ENOENT, then the user gave us 330 * gibberish, so try to direct them with a 331 * reasonable error message. Otherwise, 332 * regurgitate strerror() since it's the best we 333 * can do. 334 */ 335 if (err == ENOENT) { 336 (void) fprintf(stderr, 337 gettext("cannot open '%s': no such " 338 "device in %s\n"), arg, DISK_ROOT); 339 (void) fprintf(stderr, 340 gettext("must be a full path or " 341 "shorthand device name\n")); 342 return (NULL); 343 } else { 344 (void) fprintf(stderr, 345 gettext("cannot open '%s': %s\n"), 346 path, strerror(errno)); 347 return (NULL); 348 } 349 } 350 } 351 352 if (type == NULL) { 353 /* 354 * Determine whether this is a device or a file. 
355 */ 356 if (wholedisk || S_ISBLK(statbuf.st_mode)) { 357 type = VDEV_TYPE_DISK; 358 } else if (S_ISREG(statbuf.st_mode)) { 359 type = VDEV_TYPE_FILE; 360 } else { 361 fprintf(stderr, gettext("cannot use '%s': must " 362 "be a block device or regular file\n"), path); 363 return (NULL); 364 } 365 } 366 367 /* 368 * Finally, we have the complete device or file, and we know that it is 369 * acceptable to use. Construct the nvlist to describe this vdev. All 370 * vdevs have a 'path' element, and devices also have a 'devid' element. 371 */ 372 verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); 373 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); 374 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); 375 376 /* Lookup and add the enclosure sysfs path (if exists) */ 377 update_vdev_config_dev_sysfs_path(vdev, path, 378 ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); 379 380 if (strcmp(type, VDEV_TYPE_DISK) == 0) 381 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, 382 (uint64_t)wholedisk) == 0); 383 384 /* 385 * Override defaults if custom properties are provided. 386 */ 387 if (props != NULL) { 388 const char *value = NULL; 389 390 if (nvlist_lookup_string(props, 391 zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) { 392 if (zfs_nicestrtonum(NULL, value, &ashift) != 0) { 393 (void) fprintf(stderr, 394 gettext("ashift must be a number.\n")); 395 return (NULL); 396 } 397 if (ashift != 0 && 398 (ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) { 399 (void) fprintf(stderr, 400 gettext("invalid 'ashift=%" PRIu64 "' " 401 "property: only values between %" PRId32 " " 402 "and %" PRId32 " are allowed.\n"), 403 ashift, ASHIFT_MIN, ASHIFT_MAX); 404 return (NULL); 405 } 406 } 407 } 408 409 /* 410 * If the device is known to incorrectly report its physical sector 411 * size explicitly provide the known correct value. 
412 */ 413 if (ashift == 0) { 414 int sector_size; 415 416 if (check_sector_size_database(path, §or_size) == B_TRUE) 417 ashift = highbit64(sector_size) - 1; 418 } 419 420 if (ashift > 0) 421 (void) nvlist_add_uint64(vdev, ZPOOL_CONFIG_ASHIFT, ashift); 422 423 return (vdev); 424 } 425 426 /* 427 * Go through and verify the replication level of the pool is consistent. 428 * Performs the following checks: 429 * 430 * For the new spec, verifies that devices in mirrors and raidz are the 431 * same size. 432 * 433 * If the current configuration already has inconsistent replication 434 * levels, ignore any other potential problems in the new spec. 435 * 436 * Otherwise, make sure that the current spec (if there is one) and the new 437 * spec have consistent replication levels. 438 * 439 * If there is no current spec (create), make sure new spec has at least 440 * one general purpose vdev. 441 */ 442 typedef struct replication_level { 443 const char *zprl_type; 444 uint64_t zprl_children; 445 uint64_t zprl_parity; 446 } replication_level_t; 447 448 #define ZPOOL_FUZZ (16 * 1024 * 1024) 449 450 /* 451 * N.B. For the purposes of comparing replication levels dRAID can be 452 * considered functionally equivalent to raidz. 453 */ 454 static boolean_t 455 is_raidz_mirror(replication_level_t *a, replication_level_t *b, 456 replication_level_t **raidz, replication_level_t **mirror) 457 { 458 if ((strcmp(a->zprl_type, "raidz") == 0 || 459 strcmp(a->zprl_type, "draid") == 0) && 460 strcmp(b->zprl_type, "mirror") == 0) { 461 *raidz = a; 462 *mirror = b; 463 return (B_TRUE); 464 } 465 return (B_FALSE); 466 } 467 468 /* 469 * Comparison for determining if dRAID and raidz where passed in either order. 
470 */ 471 static boolean_t 472 is_raidz_draid(replication_level_t *a, replication_level_t *b) 473 { 474 if ((strcmp(a->zprl_type, "raidz") == 0 || 475 strcmp(a->zprl_type, "draid") == 0) && 476 (strcmp(b->zprl_type, "raidz") == 0 || 477 strcmp(b->zprl_type, "draid") == 0)) { 478 return (B_TRUE); 479 } 480 481 return (B_FALSE); 482 } 483 484 /* 485 * Given a list of toplevel vdevs, return the current replication level. If 486 * the config is inconsistent, then NULL is returned. If 'fatal' is set, then 487 * an error message will be displayed for each self-inconsistent vdev. 488 */ 489 static replication_level_t * 490 get_replication(nvlist_t *nvroot, boolean_t fatal) 491 { 492 nvlist_t **top; 493 uint_t t, toplevels; 494 nvlist_t **child; 495 uint_t c, children; 496 nvlist_t *nv; 497 const char *type; 498 replication_level_t lastrep = {0}; 499 replication_level_t rep; 500 replication_level_t *ret; 501 replication_level_t *raidz, *mirror; 502 boolean_t dontreport; 503 504 ret = safe_malloc(sizeof (replication_level_t)); 505 506 verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 507 &top, &toplevels) == 0); 508 509 for (t = 0; t < toplevels; t++) { 510 uint64_t is_log = B_FALSE; 511 512 nv = top[t]; 513 514 /* 515 * For separate logs we ignore the top level vdev replication 516 * constraints. 517 */ 518 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log); 519 if (is_log) 520 continue; 521 522 /* 523 * Ignore holes introduced by removing aux devices, along 524 * with indirect vdevs introduced by previously removed 525 * vdevs. 526 */ 527 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 528 if (strcmp(type, VDEV_TYPE_HOLE) == 0 || 529 strcmp(type, VDEV_TYPE_INDIRECT) == 0) 530 continue; 531 532 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 533 &child, &children) != 0) { 534 /* 535 * This is a 'file' or 'disk' vdev. 
536 */ 537 rep.zprl_type = type; 538 rep.zprl_children = 1; 539 rep.zprl_parity = 0; 540 } else { 541 int64_t vdev_size; 542 543 /* 544 * This is a mirror or RAID-Z vdev. Go through and make 545 * sure the contents are all the same (files vs. disks), 546 * keeping track of the number of elements in the 547 * process. 548 * 549 * We also check that the size of each vdev (if it can 550 * be determined) is the same. 551 */ 552 rep.zprl_type = type; 553 rep.zprl_children = 0; 554 555 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 || 556 strcmp(type, VDEV_TYPE_DRAID) == 0) { 557 verify(nvlist_lookup_uint64(nv, 558 ZPOOL_CONFIG_NPARITY, 559 &rep.zprl_parity) == 0); 560 assert(rep.zprl_parity != 0); 561 } else { 562 rep.zprl_parity = 0; 563 } 564 565 /* 566 * The 'dontreport' variable indicates that we've 567 * already reported an error for this spec, so don't 568 * bother doing it again. 569 */ 570 type = NULL; 571 dontreport = 0; 572 vdev_size = -1LL; 573 for (c = 0; c < children; c++) { 574 nvlist_t *cnv = child[c]; 575 const char *path; 576 struct stat64 statbuf; 577 int64_t size = -1LL; 578 const char *childtype; 579 int fd, err; 580 581 rep.zprl_children++; 582 583 verify(nvlist_lookup_string(cnv, 584 ZPOOL_CONFIG_TYPE, &childtype) == 0); 585 586 /* 587 * If this is a replacing or spare vdev, then 588 * get the real first child of the vdev: do this 589 * in a loop because replacing and spare vdevs 590 * can be nested. 
591 */ 592 while (strcmp(childtype, 593 VDEV_TYPE_REPLACING) == 0 || 594 strcmp(childtype, VDEV_TYPE_SPARE) == 0) { 595 nvlist_t **rchild; 596 uint_t rchildren; 597 598 verify(nvlist_lookup_nvlist_array(cnv, 599 ZPOOL_CONFIG_CHILDREN, &rchild, 600 &rchildren) == 0); 601 assert(rchildren == 2); 602 cnv = rchild[0]; 603 604 verify(nvlist_lookup_string(cnv, 605 ZPOOL_CONFIG_TYPE, 606 &childtype) == 0); 607 } 608 609 verify(nvlist_lookup_string(cnv, 610 ZPOOL_CONFIG_PATH, &path) == 0); 611 612 /* 613 * If we have a raidz/mirror that combines disks 614 * with files, report it as an error. 615 */ 616 if (!dontreport && type != NULL && 617 strcmp(type, childtype) != 0) { 618 if (ret != NULL) 619 free(ret); 620 ret = NULL; 621 if (fatal) 622 vdev_error(gettext( 623 "mismatched replication " 624 "level: %s contains both " 625 "files and devices\n"), 626 rep.zprl_type); 627 else 628 return (NULL); 629 dontreport = B_TRUE; 630 } 631 632 /* 633 * According to stat(2), the value of 'st_size' 634 * is undefined for block devices and character 635 * devices. But there is no effective way to 636 * determine the real size in userland. 637 * 638 * Instead, we'll take advantage of an 639 * implementation detail of spec_size(). If the 640 * device is currently open, then we (should) 641 * return a valid size. 642 * 643 * If we still don't get a valid size (indicated 644 * by a size of 0 or MAXOFFSET_T), then ignore 645 * this device altogether. 646 */ 647 if ((fd = open(path, O_RDONLY)) >= 0) { 648 err = fstat64_blk(fd, &statbuf); 649 (void) close(fd); 650 } else { 651 err = stat64(path, &statbuf); 652 } 653 654 if (err != 0 || 655 statbuf.st_size == 0 || 656 statbuf.st_size == MAXOFFSET_T) 657 continue; 658 659 size = statbuf.st_size; 660 661 /* 662 * Also make sure that devices and 663 * slices have a consistent size. If 664 * they differ by a significant amount 665 * (~16MB) then report an error. 
666 */ 667 if (!dontreport && 668 (vdev_size != -1LL && 669 (llabs(size - vdev_size) > 670 ZPOOL_FUZZ))) { 671 if (ret != NULL) 672 free(ret); 673 ret = NULL; 674 if (fatal) 675 vdev_error(gettext( 676 "%s contains devices of " 677 "different sizes\n"), 678 rep.zprl_type); 679 else 680 return (NULL); 681 dontreport = B_TRUE; 682 } 683 684 type = childtype; 685 vdev_size = size; 686 } 687 } 688 689 /* 690 * At this point, we have the replication of the last toplevel 691 * vdev in 'rep'. Compare it to 'lastrep' to see if it is 692 * different. 693 */ 694 if (lastrep.zprl_type != NULL) { 695 if (is_raidz_mirror(&lastrep, &rep, &raidz, &mirror) || 696 is_raidz_mirror(&rep, &lastrep, &raidz, &mirror)) { 697 /* 698 * Accepted raidz and mirror when they can 699 * handle the same number of disk failures. 700 */ 701 if (raidz->zprl_parity != 702 mirror->zprl_children - 1) { 703 if (ret != NULL) 704 free(ret); 705 ret = NULL; 706 if (fatal) 707 vdev_error(gettext( 708 "mismatched replication " 709 "level: " 710 "%s and %s vdevs with " 711 "different redundancy, " 712 "%llu vs. %llu (%llu-way) " 713 "are present\n"), 714 raidz->zprl_type, 715 mirror->zprl_type, 716 (u_longlong_t) 717 raidz->zprl_parity, 718 (u_longlong_t) 719 mirror->zprl_children - 1, 720 (u_longlong_t) 721 mirror->zprl_children); 722 else 723 return (NULL); 724 } 725 } else if (is_raidz_draid(&lastrep, &rep)) { 726 /* 727 * Accepted raidz and draid when they can 728 * handle the same number of disk failures. 729 */ 730 if (lastrep.zprl_parity != rep.zprl_parity) { 731 if (ret != NULL) 732 free(ret); 733 ret = NULL; 734 if (fatal) 735 vdev_error(gettext( 736 "mismatched replication " 737 "level: %s and %s vdevs " 738 "with different " 739 "redundancy, %llu vs. 
" 740 "%llu are present\n"), 741 lastrep.zprl_type, 742 rep.zprl_type, 743 (u_longlong_t) 744 lastrep.zprl_parity, 745 (u_longlong_t) 746 rep.zprl_parity); 747 else 748 return (NULL); 749 } 750 } else if (strcmp(lastrep.zprl_type, rep.zprl_type) != 751 0) { 752 if (ret != NULL) 753 free(ret); 754 ret = NULL; 755 if (fatal) 756 vdev_error(gettext( 757 "mismatched replication level: " 758 "both %s and %s vdevs are " 759 "present\n"), 760 lastrep.zprl_type, rep.zprl_type); 761 else 762 return (NULL); 763 } else if (lastrep.zprl_parity != rep.zprl_parity) { 764 if (ret) 765 free(ret); 766 ret = NULL; 767 if (fatal) 768 vdev_error(gettext( 769 "mismatched replication level: " 770 "both %llu and %llu device parity " 771 "%s vdevs are present\n"), 772 (u_longlong_t) 773 lastrep.zprl_parity, 774 (u_longlong_t)rep.zprl_parity, 775 rep.zprl_type); 776 else 777 return (NULL); 778 } else if (lastrep.zprl_children != rep.zprl_children) { 779 if (ret) 780 free(ret); 781 ret = NULL; 782 if (fatal) 783 vdev_error(gettext( 784 "mismatched replication level: " 785 "both %llu-way and %llu-way %s " 786 "vdevs are present\n"), 787 (u_longlong_t) 788 lastrep.zprl_children, 789 (u_longlong_t) 790 rep.zprl_children, 791 rep.zprl_type); 792 else 793 return (NULL); 794 } 795 } 796 lastrep = rep; 797 } 798 799 if (ret != NULL) 800 *ret = rep; 801 802 return (ret); 803 } 804 805 /* 806 * Check the replication level of the vdev spec against the current pool. Calls 807 * get_replication() to make sure the new spec is self-consistent. If the pool 808 * has a consistent replication level, then we ignore any errors. Otherwise, 809 * report any difference between the two. 810 */ 811 static int 812 check_replication(nvlist_t *config, nvlist_t *newroot) 813 { 814 nvlist_t **child; 815 uint_t children; 816 replication_level_t *current = NULL, *new; 817 replication_level_t *raidz, *mirror; 818 int ret; 819 820 /* 821 * If we have a current pool configuration, check to see if it's 822 * self-consistent. 
If not, simply return success. 823 */ 824 if (config != NULL) { 825 nvlist_t *nvroot; 826 827 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 828 &nvroot) == 0); 829 if ((current = get_replication(nvroot, B_FALSE)) == NULL) 830 return (0); 831 } 832 /* 833 * for spares there may be no children, and therefore no 834 * replication level to check 835 */ 836 if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN, 837 &child, &children) != 0) || (children == 0)) { 838 free(current); 839 return (0); 840 } 841 842 /* 843 * If all we have is logs then there's no replication level to check. 844 */ 845 if (num_logs(newroot) == children) { 846 free(current); 847 return (0); 848 } 849 850 /* 851 * Get the replication level of the new vdev spec, reporting any 852 * inconsistencies found. 853 */ 854 if ((new = get_replication(newroot, B_TRUE)) == NULL) { 855 free(current); 856 return (-1); 857 } 858 859 /* 860 * Check to see if the new vdev spec matches the replication level of 861 * the current pool. 862 */ 863 ret = 0; 864 if (current != NULL) { 865 if (is_raidz_mirror(current, new, &raidz, &mirror) || 866 is_raidz_mirror(new, current, &raidz, &mirror)) { 867 if (raidz->zprl_parity != mirror->zprl_children - 1) { 868 vdev_error(gettext( 869 "mismatched replication level: pool and " 870 "new vdev with different redundancy, %s " 871 "and %s vdevs, %llu vs. 
%llu (%llu-way)\n"), 872 raidz->zprl_type, 873 mirror->zprl_type, 874 (u_longlong_t)raidz->zprl_parity, 875 (u_longlong_t)mirror->zprl_children - 1, 876 (u_longlong_t)mirror->zprl_children); 877 ret = -1; 878 } 879 } else if (strcmp(current->zprl_type, new->zprl_type) != 0) { 880 vdev_error(gettext( 881 "mismatched replication level: pool uses %s " 882 "and new vdev is %s\n"), 883 current->zprl_type, new->zprl_type); 884 ret = -1; 885 } else if (current->zprl_parity != new->zprl_parity) { 886 vdev_error(gettext( 887 "mismatched replication level: pool uses %llu " 888 "device parity and new vdev uses %llu\n"), 889 (u_longlong_t)current->zprl_parity, 890 (u_longlong_t)new->zprl_parity); 891 ret = -1; 892 } else if (current->zprl_children != new->zprl_children) { 893 vdev_error(gettext( 894 "mismatched replication level: pool uses %llu-way " 895 "%s and new vdev uses %llu-way %s\n"), 896 (u_longlong_t)current->zprl_children, 897 current->zprl_type, 898 (u_longlong_t)new->zprl_children, 899 new->zprl_type); 900 ret = -1; 901 } 902 } 903 904 free(new); 905 if (current != NULL) 906 free(current); 907 908 return (ret); 909 } 910 911 static int 912 zero_label(const char *path) 913 { 914 const int size = 4096; 915 char buf[size]; 916 int err, fd; 917 918 if ((fd = open(path, O_WRONLY|O_EXCL)) < 0) { 919 (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), 920 path, strerror(errno)); 921 return (-1); 922 } 923 924 memset(buf, 0, size); 925 err = write(fd, buf, size); 926 (void) fdatasync(fd); 927 (void) close(fd); 928 929 if (err == -1) { 930 (void) fprintf(stderr, gettext("cannot zero first %d bytes " 931 "of '%s': %s\n"), size, path, strerror(errno)); 932 return (-1); 933 } 934 935 if (err != size) { 936 (void) fprintf(stderr, gettext("could only zero %d/%d bytes " 937 "of '%s'\n"), err, size, path); 938 return (-1); 939 } 940 941 return (0); 942 } 943 944 static void 945 lines_to_stderr(char *lines[], int lines_cnt) 946 { 947 int i; 948 for (i = 0; i < lines_cnt; 
i++) {
		fprintf(stderr, "%s\n", lines[i]);
	}
}

/*
 * Go through and find any whole disks in the vdev specification, labelling them
 * as appropriate.  When constructing the vdev spec, we were unable to open this
 * device in order to provide a devid.  Now that we have labelled the disk and
 * know that slice 0 is valid, we can construct the devid now.
 *
 * If the disk was already labeled with an EFI label, we will have gotten the
 * devid already (because we were able to open the whole disk).  Otherwise, we
 * need to get the devid after we label the disk.
 *
 *	zhp		pool handle; NULL when a pool is being created
 *	nv		vdev (sub)tree to process; recursed into through its
 *			children, spares and l2cache arrays
 *	replacing	selects the "replace" rather than the "add" label
 *			operation when 'zhp' is not NULL
 *
 * Returns 0 on success.  On failure the value is whichever of these was
 * hit first: the errno from realpath(), -1 from a labelling failure, or
 * the non-zero result of zpool_label_disk_wait() or zero_label().
 */
static int
make_disks(zpool_handle_t *zhp, nvlist_t *nv, boolean_t replacing)
{
	nvlist_t **child;
	uint_t c, children;
	const char *type, *path;
	char devpath[MAXPATHLEN];
	char udevpath[MAXPATHLEN];
	uint64_t wholedisk;
	struct stat64 statbuf;
	int is_exclusive = 0;
	int fd;
	int ret;

	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);

	/* A vdev without a children array is a leaf. */
	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {

		/* Only disk leaves are processed here. */
		if (strcmp(type, VDEV_TYPE_DISK) != 0)
			return (0);

		/*
		 * We have a disk device.  If this is a whole disk write
		 * out the efi partition table, otherwise write zero's to
		 * the first 4k of the partition.  This is to ensure that
		 * libblkid will not misidentify the partition due to a
		 * magic value left by the previous filesystem.
		 */
		verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));
		verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
		    &wholedisk));

		if (!wholedisk) {
			/*
			 * Update device id string for mpath nodes (Linux only)
			 */
			if (is_mpath_whole_disk(path))
				update_vdev_config_dev_strs(nv);

			/*
			 * The result of zero_label() is deliberately
			 * ignored on this path.
			 */
			if (!is_spare(NULL, path))
				(void) zero_label(path);
			return (0);
		}

		if (realpath(path, devpath) == NULL) {
			/* Save errno before fprintf() can change it. */
			ret = errno;
			(void) fprintf(stderr,
			    gettext("cannot resolve path '%s'\n"), path);
			return (ret);
		}

		/*
		 * Remove any previously existing symlink from a udev path to
		 * the device before labeling the disk.  This ensures that
		 * only newly created links are used.  Otherwise there is a
		 * window between when udev deletes and recreates the link
		 * during which access attempts will fail with ENOENT.
		 */
		strlcpy(udevpath, path, MAXPATHLEN);
		(void) zfs_append_partition(udevpath, MAXPATHLEN);

		/*
		 * Probe for an existing exclusive holder: the open is only
		 * attempted for its errno, and the descriptor is closed
		 * straight away when it succeeds.
		 */
		fd = open(devpath, O_RDWR|O_EXCL);
		if (fd == -1) {
			if (errno == EBUSY)
				is_exclusive = 1;
#ifdef __FreeBSD__
			if (errno == EPERM)
				is_exclusive = 1;
#endif
		} else {
			(void) close(fd);
		}

		/*
		 * If the partition exists, contains a valid spare label,
		 * and is opened exclusively there is no need to partition
		 * it.  Hot spares have already been partitioned and are
		 * held open exclusively by the kernel as a safety measure.
		 *
		 * If the provided path is for a /dev/disk/ device its
		 * symbolic link will be removed, partition table created,
		 * and then block until udev creates the new link.
		 */
		if (!is_exclusive && !is_spare(NULL, udevpath)) {
			/* Last path component of the resolved device path. */
			char *devnode = strrchr(devpath, '/') + 1;
			char **lines = NULL;
			int lines_cnt = 0;

			ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT));
			if (ret == 0) {
				ret = lstat64(udevpath, &statbuf);
				if (ret == 0 && S_ISLNK(statbuf.st_mode))
					(void) unlink(udevpath);
			}

			/*
			 * When labeling a pool the raw device node name
			 * is provided as it appears under /dev/.
			 *
			 * Note that 'zhp' will be NULL when we're creating a
			 * pool.
			 */
			if (zpool_prepare_and_label_disk(g_zfs, zhp, devnode,
			    nv, zhp == NULL ? "create" :
			    replacing ? "replace" : "add", &lines,
			    &lines_cnt) != 0) {
				(void) fprintf(stderr,
				    gettext(
				    "Error preparing/labeling disk.\n"));
				if (lines_cnt > 0) {
					(void) fprintf(stderr,
					gettext("zfs_prepare_disk output:\n"));
					lines_to_stderr(lines, lines_cnt);
				}

				libzfs_free_str_array(lines, lines_cnt);
				return (-1);
			}
			libzfs_free_str_array(lines, lines_cnt);

			/*
			 * Wait for udev to signal the device is available
			 * by the provided path.
			 */
			ret = zpool_label_disk_wait(udevpath, DISK_LABEL_WAIT);
			if (ret) {
				(void) fprintf(stderr,
				    gettext("missing link: %s was "
				    "partitioned but %s is missing\n"),
				    devnode, udevpath);
				return (ret);
			}

			ret = zero_label(udevpath);
			if (ret)
				return (ret);
		}

		/*
		 * Update the path to refer to the partition.  The presence of
		 * the 'whole_disk' field indicates to the CLI that we should
		 * chop off the partition number when displaying the device in
		 * future output.
		 */
		verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, udevpath) == 0);

		/*
		 * Update device id strings for whole disks (Linux only)
		 */
		update_vdev_config_dev_strs(nv);

		return (0);
	}

	/* Interior vdev: recurse, stopping at the first failure. */
	for (c = 0; c < children; c++)
		if ((ret = make_disks(zhp, child[c], replacing)) != 0)
			return (ret);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
	    &child, &children) == 0)
		for (c = 0; c < children; c++)
			if ((ret = make_disks(zhp, child[c], replacing)) != 0)
				return (ret);

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
	    &child, &children) == 0)
		for (c = 0; c < children; c++)
			if ((ret = make_disks(zhp, child[c], replacing)) != 0)
				return (ret);

	return (0);
}

/*
 * Go through and find any devices that are in use.  We rely on libdiskmgt for
 * the majority of this task.
 *
 * NOTE(review): the file overview and the comment inside this function both
 * name libblkid rather than libdiskmgt; the actual checks are performed by
 * check_device() and check_file(), which are defined elsewhere - confirm
 * which library is meant.
 *
 *	config		current pool configuration, or NULL; only used to
 *			recognise a spare of the same pool during a replace
 *	nv		vdev (sub)tree to inspect
 *	force		passed through to check_device()/check_file()
 *	replacing	set when this is a replace operation
 *	isspare		set for leaves reached through the spares array
 *
 * Returns non-zero when any leaf is in use.  Unlike make_disks() the
 * recursion does not stop at the first hit, so every leaf is examined.
 * Note that the zfs_append_partition() failure path returns -1 from this
 * boolean_t function.
 */
static boolean_t
is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
    boolean_t replacing, boolean_t isspare)
{
	nvlist_t **child;
	uint_t c, children;
	const char *type, *path;
	int ret = 0;
	char buf[MAXPATHLEN];
	uint64_t wholedisk = B_FALSE;
	boolean_t anyinuse = B_FALSE;

	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);

	/* A vdev without a children array is a leaf. */
	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0) {

		verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path));
		if (strcmp(type, VDEV_TYPE_DISK) == 0)
			verify(!nvlist_lookup_uint64(nv,
			    ZPOOL_CONFIG_WHOLE_DISK, &wholedisk));

		/*
		 * As a generic check, we look to see if this is a replace of a
		 * hot spare within the same pool.  If so, we allow it
		 * regardless of what libblkid or zpool_in_use() says.
		 */
		if (replacing) {
			(void) strlcpy(buf, path, sizeof (buf));
			if (wholedisk) {
				ret = zfs_append_partition(buf, sizeof (buf));
				if (ret == -1)
					return (-1);
			}

			if (is_spare(config, buf))
				return (B_FALSE);
		}

		/* Leaves of any other type leave 'ret' at 0 (not in use). */
		if (strcmp(type, VDEV_TYPE_DISK) == 0)
			ret = check_device(path, force, isspare, wholedisk);

		else if (strcmp(type, VDEV_TYPE_FILE) == 0)
			ret = check_file(path, force, isspare);

		return (ret != 0);
	}

	/* Regular children are never treated as spares. */
	for (c = 0; c < children; c++)
		if (is_device_in_use(config, child[c], force, replacing,
		    B_FALSE))
			anyinuse = B_TRUE;

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
	    &child, &children) == 0)
		for (c = 0; c < children; c++)
			if (is_device_in_use(config, child[c], force, replacing,
			    B_TRUE))
				anyinuse = B_TRUE;

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
	    &child, &children) == 0)
		for (c = 0; c < children; c++)
			if (is_device_in_use(config, child[c], force, replacing,
			    B_FALSE))
				anyinuse = B_TRUE;

	return (anyinuse);
}

/*
 * Returns the parity level extracted from a raidz or draid type.
 * If the parity cannot be determined zero is returned.
1214 */ 1215 static int 1216 get_parity(const char *type) 1217 { 1218 long parity = 0; 1219 const char *p; 1220 1221 if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0) { 1222 p = type + strlen(VDEV_TYPE_RAIDZ); 1223 1224 if (*p == '\0') { 1225 /* when unspecified default to single parity */ 1226 return (1); 1227 } else if (*p == '0') { 1228 /* no zero prefixes allowed */ 1229 return (0); 1230 } else { 1231 /* 0-3, no suffixes allowed */ 1232 char *end; 1233 errno = 0; 1234 parity = strtol(p, &end, 10); 1235 if (errno != 0 || *end != '\0' || 1236 parity < 1 || parity > VDEV_RAIDZ_MAXPARITY) { 1237 return (0); 1238 } 1239 } 1240 } else if (strncmp(type, VDEV_TYPE_DRAID, 1241 strlen(VDEV_TYPE_DRAID)) == 0) { 1242 p = type + strlen(VDEV_TYPE_DRAID); 1243 1244 if (*p == '\0' || *p == ':') { 1245 /* when unspecified default to single parity */ 1246 return (1); 1247 } else if (*p == '0') { 1248 /* no zero prefixes allowed */ 1249 return (0); 1250 } else { 1251 /* 0-3, allowed suffixes: '\0' or ':' */ 1252 char *end; 1253 errno = 0; 1254 parity = strtol(p, &end, 10); 1255 if (errno != 0 || 1256 parity < 1 || parity > VDEV_DRAID_MAXPARITY || 1257 (*end != '\0' && *end != ':')) { 1258 return (0); 1259 } 1260 } 1261 } 1262 1263 return ((int)parity); 1264 } 1265 1266 /* 1267 * Assign the minimum and maximum number of devices allowed for 1268 * the specified type. On error NULL is returned, otherwise the 1269 * type prefix is returned (raidz, mirror, etc). 
1270 */ 1271 static const char * 1272 is_grouping(const char *type, int *mindev, int *maxdev) 1273 { 1274 int nparity; 1275 1276 if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 || 1277 strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0) { 1278 nparity = get_parity(type); 1279 if (nparity == 0) 1280 return (NULL); 1281 if (mindev != NULL) 1282 *mindev = nparity + 1; 1283 if (maxdev != NULL) 1284 *maxdev = 255; 1285 1286 if (strncmp(type, VDEV_TYPE_RAIDZ, 1287 strlen(VDEV_TYPE_RAIDZ)) == 0) { 1288 return (VDEV_TYPE_RAIDZ); 1289 } else { 1290 return (VDEV_TYPE_DRAID); 1291 } 1292 } 1293 1294 if (maxdev != NULL) 1295 *maxdev = INT_MAX; 1296 1297 if (strcmp(type, "mirror") == 0) { 1298 if (mindev != NULL) 1299 *mindev = 2; 1300 return (VDEV_TYPE_MIRROR); 1301 } 1302 1303 if (strcmp(type, "spare") == 0) { 1304 if (mindev != NULL) 1305 *mindev = 1; 1306 return (VDEV_TYPE_SPARE); 1307 } 1308 1309 if (strcmp(type, "log") == 0) { 1310 if (mindev != NULL) 1311 *mindev = 1; 1312 return (VDEV_TYPE_LOG); 1313 } 1314 1315 if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0 || 1316 strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { 1317 if (mindev != NULL) 1318 *mindev = 1; 1319 return (type); 1320 } 1321 1322 if (strcmp(type, "cache") == 0) { 1323 if (mindev != NULL) 1324 *mindev = 1; 1325 return (VDEV_TYPE_L2CACHE); 1326 } 1327 1328 return (NULL); 1329 } 1330 1331 /* 1332 * Extract the configuration parameters encoded in the dRAID type and 1333 * use them to generate a dRAID configuration. The expected format is: 1334 * 1335 * draid[<parity>][:<data><d|D>][:<children><c|C>][:<spares><s|S>] 1336 * 1337 * The intent is to be able to generate a good configuration when no 1338 * additional information is provided. The only mandatory component 1339 * of the 'type' is the 'draid' prefix. If a value is not provided 1340 * then reasonable defaults are used. The optional components may 1341 * appear in any order but the d/s/c suffix is required. 
1342 * 1343 * Valid inputs: 1344 * - data: number of data devices per group (1-255) 1345 * - parity: number of parity blocks per group (1-3) 1346 * - spares: number of distributed spare (0-100) 1347 * - children: total number of devices (1-255) 1348 * 1349 * Examples: 1350 * - zpool create tank draid <devices...> 1351 * - zpool create tank draid2:8d:51c:2s <devices...> 1352 */ 1353 static int 1354 draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children) 1355 { 1356 uint64_t nparity = 1; 1357 uint64_t nspares = 0; 1358 uint64_t ndata = UINT64_MAX; 1359 uint64_t ngroups = 1; 1360 long value; 1361 1362 if (strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) != 0) 1363 return (EINVAL); 1364 1365 nparity = (uint64_t)get_parity(type); 1366 if (nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) { 1367 fprintf(stderr, 1368 gettext("invalid dRAID parity level %llu; must be " 1369 "between 1 and %d\n"), (u_longlong_t)nparity, 1370 VDEV_DRAID_MAXPARITY); 1371 return (EINVAL); 1372 } 1373 1374 char *p = (char *)type; 1375 while ((p = strchr(p, ':')) != NULL) { 1376 char *end; 1377 1378 p = p + 1; 1379 errno = 0; 1380 1381 if (!isdigit(p[0])) { 1382 (void) fprintf(stderr, gettext("invalid dRAID " 1383 "syntax; expected [:<number><c|d|s>] not '%s'\n"), 1384 type); 1385 return (EINVAL); 1386 } 1387 1388 /* Expected non-zero value with c/d/s suffix */ 1389 value = strtol(p, &end, 10); 1390 char suffix = tolower(*end); 1391 if (errno != 0 || 1392 (suffix != 'c' && suffix != 'd' && suffix != 's')) { 1393 (void) fprintf(stderr, gettext("invalid dRAID " 1394 "syntax; expected [:<number><c|d|s>] not '%s'\n"), 1395 type); 1396 return (EINVAL); 1397 } 1398 1399 if (suffix == 'c') { 1400 if ((uint64_t)value != children) { 1401 fprintf(stderr, 1402 gettext("invalid number of dRAID children; " 1403 "%llu required but %llu provided\n"), 1404 (u_longlong_t)value, 1405 (u_longlong_t)children); 1406 return (EINVAL); 1407 } 1408 } else if (suffix == 'd') { 1409 ndata = 
(uint64_t)value; 1410 } else if (suffix == 's') { 1411 nspares = (uint64_t)value; 1412 } else { 1413 verify(0); /* Unreachable */ 1414 } 1415 } 1416 1417 /* 1418 * When a specific number of data disks is not provided limit a 1419 * redundancy group to 8 data disks. This value was selected to 1420 * provide a reasonable tradeoff between capacity and performance. 1421 */ 1422 if (ndata == UINT64_MAX) { 1423 if (children > nspares + nparity) { 1424 ndata = MIN(children - nspares - nparity, 8); 1425 } else { 1426 fprintf(stderr, gettext("request number of " 1427 "distributed spares %llu and parity level %llu\n" 1428 "leaves no disks available for data\n"), 1429 (u_longlong_t)nspares, (u_longlong_t)nparity); 1430 return (EINVAL); 1431 } 1432 } 1433 1434 /* Verify the maximum allowed group size is never exceeded. */ 1435 if (ndata == 0 || (ndata + nparity > children - nspares)) { 1436 fprintf(stderr, gettext("requested number of dRAID data " 1437 "disks per group %llu is too high,\nat most %llu disks " 1438 "are available for data\n"), (u_longlong_t)ndata, 1439 (u_longlong_t)(children - nspares - nparity)); 1440 return (EINVAL); 1441 } 1442 1443 /* 1444 * Verify the requested number of spares can be satisfied. 1445 * An arbitrary limit of 100 distributed spares is applied. 1446 */ 1447 if (nspares > 100 || nspares > (children - (ndata + nparity))) { 1448 fprintf(stderr, 1449 gettext("invalid number of dRAID spares %llu; additional " 1450 "disks would be required\n"), (u_longlong_t)nspares); 1451 return (EINVAL); 1452 } 1453 1454 /* Verify the requested number children is sufficient. 
*/ 1455 if (children < (ndata + nparity + nspares)) { 1456 fprintf(stderr, gettext("%llu disks were provided, but at " 1457 "least %llu disks are required for this config\n"), 1458 (u_longlong_t)children, 1459 (u_longlong_t)(ndata + nparity + nspares)); 1460 } 1461 1462 if (children > VDEV_DRAID_MAX_CHILDREN) { 1463 fprintf(stderr, gettext("%llu disks were provided, but " 1464 "dRAID only supports up to %u disks"), 1465 (u_longlong_t)children, VDEV_DRAID_MAX_CHILDREN); 1466 } 1467 1468 /* 1469 * Calculate the minimum number of groups required to fill a slice. 1470 * This is the LCM of the stripe width (ndata + nparity) and the 1471 * number of data drives (children - nspares). 1472 */ 1473 while (ngroups * (ndata + nparity) % (children - nspares) != 0) 1474 ngroups++; 1475 1476 /* Store the basic dRAID configuration. */ 1477 fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity); 1478 fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, ndata); 1479 fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, nspares); 1480 fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); 1481 1482 return (0); 1483 } 1484 1485 /* 1486 * Construct a syntactically valid vdev specification, 1487 * and ensure that all devices and files exist and can be opened. 1488 * Note: we don't bother freeing anything in the error paths 1489 * because the program is just going to exit anyway. 
1490 */ 1491 static nvlist_t * 1492 construct_spec(nvlist_t *props, int argc, char **argv) 1493 { 1494 nvlist_t *nvroot, *nv, **top, **spares, **l2cache; 1495 int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache; 1496 const char *type, *fulltype; 1497 boolean_t is_log, is_special, is_dedup, is_spare; 1498 boolean_t seen_logs; 1499 1500 top = NULL; 1501 toplevels = 0; 1502 spares = NULL; 1503 l2cache = NULL; 1504 nspares = 0; 1505 nlogs = 0; 1506 nl2cache = 0; 1507 is_log = is_special = is_dedup = is_spare = B_FALSE; 1508 seen_logs = B_FALSE; 1509 nvroot = NULL; 1510 1511 while (argc > 0) { 1512 fulltype = argv[0]; 1513 nv = NULL; 1514 1515 /* 1516 * If it's a mirror, raidz, or draid the subsequent arguments 1517 * are its leaves -- until we encounter the next mirror, 1518 * raidz or draid. 1519 */ 1520 if ((type = is_grouping(fulltype, &mindev, &maxdev)) != NULL) { 1521 nvlist_t **child = NULL; 1522 int c, children = 0; 1523 1524 if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1525 if (spares != NULL) { 1526 (void) fprintf(stderr, 1527 gettext("invalid vdev " 1528 "specification: 'spare' can be " 1529 "specified only once\n")); 1530 goto spec_out; 1531 } 1532 is_spare = B_TRUE; 1533 is_log = is_special = is_dedup = B_FALSE; 1534 } 1535 1536 if (strcmp(type, VDEV_TYPE_LOG) == 0) { 1537 if (seen_logs) { 1538 (void) fprintf(stderr, 1539 gettext("invalid vdev " 1540 "specification: 'log' can be " 1541 "specified only once\n")); 1542 goto spec_out; 1543 } 1544 seen_logs = B_TRUE; 1545 is_log = B_TRUE; 1546 is_special = is_dedup = is_spare = B_FALSE; 1547 argc--; 1548 argv++; 1549 /* 1550 * A log is not a real grouping device. 1551 * We just set is_log and continue. 
1552 */ 1553 continue; 1554 } 1555 1556 if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) { 1557 is_special = B_TRUE; 1558 is_log = is_dedup = is_spare = B_FALSE; 1559 argc--; 1560 argv++; 1561 continue; 1562 } 1563 1564 if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { 1565 is_dedup = B_TRUE; 1566 is_log = is_special = is_spare = B_FALSE; 1567 argc--; 1568 argv++; 1569 continue; 1570 } 1571 1572 if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { 1573 if (l2cache != NULL) { 1574 (void) fprintf(stderr, 1575 gettext("invalid vdev " 1576 "specification: 'cache' can be " 1577 "specified only once\n")); 1578 goto spec_out; 1579 } 1580 is_log = is_special = B_FALSE; 1581 is_dedup = is_spare = B_FALSE; 1582 } 1583 1584 if (is_log || is_special || is_dedup) { 1585 if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { 1586 (void) fprintf(stderr, 1587 gettext("invalid vdev " 1588 "specification: unsupported '%s' " 1589 "device: %s\n"), is_log ? "log" : 1590 "special", type); 1591 goto spec_out; 1592 } 1593 nlogs++; 1594 } 1595 1596 for (c = 1; c < argc; c++) { 1597 if (is_grouping(argv[c], NULL, NULL) != NULL) 1598 break; 1599 1600 children++; 1601 child = realloc(child, 1602 children * sizeof (nvlist_t *)); 1603 if (child == NULL) 1604 zpool_no_memory(); 1605 if ((nv = make_leaf_vdev(props, argv[c], 1606 !(is_log || is_special || is_dedup || 1607 is_spare))) == NULL) { 1608 for (c = 0; c < children - 1; c++) 1609 nvlist_free(child[c]); 1610 free(child); 1611 goto spec_out; 1612 } 1613 1614 child[children - 1] = nv; 1615 } 1616 1617 if (children < mindev) { 1618 (void) fprintf(stderr, gettext("invalid vdev " 1619 "specification: %s requires at least %d " 1620 "devices\n"), argv[0], mindev); 1621 for (c = 0; c < children; c++) 1622 nvlist_free(child[c]); 1623 free(child); 1624 goto spec_out; 1625 } 1626 1627 if (children > maxdev) { 1628 (void) fprintf(stderr, gettext("invalid vdev " 1629 "specification: %s supports no more than " 1630 "%d devices\n"), argv[0], maxdev); 1631 for (c = 0; c < 
children; c++) 1632 nvlist_free(child[c]); 1633 free(child); 1634 goto spec_out; 1635 } 1636 1637 argc -= c; 1638 argv += c; 1639 1640 if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1641 spares = child; 1642 nspares = children; 1643 continue; 1644 } else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { 1645 l2cache = child; 1646 nl2cache = children; 1647 continue; 1648 } else { 1649 /* create a top-level vdev with children */ 1650 verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, 1651 0) == 0); 1652 verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, 1653 type) == 0); 1654 verify(nvlist_add_uint64(nv, 1655 ZPOOL_CONFIG_IS_LOG, is_log) == 0); 1656 if (is_log) { 1657 verify(nvlist_add_string(nv, 1658 ZPOOL_CONFIG_ALLOCATION_BIAS, 1659 VDEV_ALLOC_BIAS_LOG) == 0); 1660 } 1661 if (is_special) { 1662 verify(nvlist_add_string(nv, 1663 ZPOOL_CONFIG_ALLOCATION_BIAS, 1664 VDEV_ALLOC_BIAS_SPECIAL) == 0); 1665 } 1666 if (is_dedup) { 1667 verify(nvlist_add_string(nv, 1668 ZPOOL_CONFIG_ALLOCATION_BIAS, 1669 VDEV_ALLOC_BIAS_DEDUP) == 0); 1670 } 1671 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 1672 verify(nvlist_add_uint64(nv, 1673 ZPOOL_CONFIG_NPARITY, 1674 mindev - 1) == 0); 1675 } 1676 if (strcmp(type, VDEV_TYPE_DRAID) == 0) { 1677 if (draid_config_by_type(nv, 1678 fulltype, children) != 0) { 1679 for (c = 0; c < children; c++) 1680 nvlist_free(child[c]); 1681 free(child); 1682 goto spec_out; 1683 } 1684 } 1685 verify(nvlist_add_nvlist_array(nv, 1686 ZPOOL_CONFIG_CHILDREN, 1687 (const nvlist_t **)child, children) == 0); 1688 1689 for (c = 0; c < children; c++) 1690 nvlist_free(child[c]); 1691 free(child); 1692 } 1693 } else { 1694 /* 1695 * We have a device. Pass off to make_leaf_vdev() to 1696 * construct the appropriate nvlist describing the vdev. 
1697 */ 1698 if ((nv = make_leaf_vdev(props, argv[0], !(is_log || 1699 is_special || is_dedup || is_spare))) == NULL) 1700 goto spec_out; 1701 1702 verify(nvlist_add_uint64(nv, 1703 ZPOOL_CONFIG_IS_LOG, is_log) == 0); 1704 if (is_log) { 1705 verify(nvlist_add_string(nv, 1706 ZPOOL_CONFIG_ALLOCATION_BIAS, 1707 VDEV_ALLOC_BIAS_LOG) == 0); 1708 nlogs++; 1709 } 1710 1711 if (is_special) { 1712 verify(nvlist_add_string(nv, 1713 ZPOOL_CONFIG_ALLOCATION_BIAS, 1714 VDEV_ALLOC_BIAS_SPECIAL) == 0); 1715 } 1716 if (is_dedup) { 1717 verify(nvlist_add_string(nv, 1718 ZPOOL_CONFIG_ALLOCATION_BIAS, 1719 VDEV_ALLOC_BIAS_DEDUP) == 0); 1720 } 1721 argc--; 1722 argv++; 1723 } 1724 1725 toplevels++; 1726 top = realloc(top, toplevels * sizeof (nvlist_t *)); 1727 if (top == NULL) 1728 zpool_no_memory(); 1729 top[toplevels - 1] = nv; 1730 } 1731 1732 if (toplevels == 0 && nspares == 0 && nl2cache == 0) { 1733 (void) fprintf(stderr, gettext("invalid vdev " 1734 "specification: at least one toplevel vdev must be " 1735 "specified\n")); 1736 goto spec_out; 1737 } 1738 1739 if (seen_logs && nlogs == 0) { 1740 (void) fprintf(stderr, gettext("invalid vdev specification: " 1741 "log requires at least 1 device\n")); 1742 goto spec_out; 1743 } 1744 1745 /* 1746 * Finally, create nvroot and add all top-level vdevs to it. 
1747 */ 1748 verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0); 1749 verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 1750 VDEV_TYPE_ROOT) == 0); 1751 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1752 (const nvlist_t **)top, toplevels) == 0); 1753 if (nspares != 0) 1754 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1755 (const nvlist_t **)spares, nspares) == 0); 1756 if (nl2cache != 0) 1757 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 1758 (const nvlist_t **)l2cache, nl2cache) == 0); 1759 1760 spec_out: 1761 for (t = 0; t < toplevels; t++) 1762 nvlist_free(top[t]); 1763 for (t = 0; t < nspares; t++) 1764 nvlist_free(spares[t]); 1765 for (t = 0; t < nl2cache; t++) 1766 nvlist_free(l2cache[t]); 1767 1768 free(spares); 1769 free(l2cache); 1770 free(top); 1771 1772 return (nvroot); 1773 } 1774 1775 nvlist_t * 1776 split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props, 1777 splitflags_t flags, int argc, char **argv) 1778 { 1779 nvlist_t *newroot = NULL, **child; 1780 uint_t c, children; 1781 1782 if (argc > 0) { 1783 if ((newroot = construct_spec(props, argc, argv)) == NULL) { 1784 (void) fprintf(stderr, gettext("Unable to build a " 1785 "pool from the specified devices\n")); 1786 return (NULL); 1787 } 1788 1789 if (!flags.dryrun && make_disks(zhp, newroot, B_FALSE) != 0) { 1790 nvlist_free(newroot); 1791 return (NULL); 1792 } 1793 1794 /* avoid any tricks in the spec */ 1795 verify(nvlist_lookup_nvlist_array(newroot, 1796 ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); 1797 for (c = 0; c < children; c++) { 1798 const char *path; 1799 const char *type; 1800 int min, max; 1801 1802 verify(nvlist_lookup_string(child[c], 1803 ZPOOL_CONFIG_PATH, &path) == 0); 1804 if ((type = is_grouping(path, &min, &max)) != NULL) { 1805 (void) fprintf(stderr, gettext("Cannot use " 1806 "'%s' as a device for splitting\n"), type); 1807 nvlist_free(newroot); 1808 return (NULL); 1809 } 1810 } 1811 } 1812 1813 if 
(zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) { 1814 nvlist_free(newroot); 1815 return (NULL); 1816 } 1817 1818 return (newroot); 1819 } 1820 1821 static int 1822 num_normal_vdevs(nvlist_t *nvroot) 1823 { 1824 nvlist_t **top; 1825 uint_t t, toplevels, normal = 0; 1826 1827 verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1828 &top, &toplevels) == 0); 1829 1830 for (t = 0; t < toplevels; t++) { 1831 uint64_t log = B_FALSE; 1832 1833 (void) nvlist_lookup_uint64(top[t], ZPOOL_CONFIG_IS_LOG, &log); 1834 if (log) 1835 continue; 1836 if (nvlist_exists(top[t], ZPOOL_CONFIG_ALLOCATION_BIAS)) 1837 continue; 1838 1839 normal++; 1840 } 1841 1842 return (normal); 1843 } 1844 1845 /* 1846 * Get and validate the contents of the given vdev specification. This ensures 1847 * that the nvlist returned is well-formed, that all the devices exist, and that 1848 * they are not currently in use by any other known consumer. The 'poolconfig' 1849 * parameter is the current configuration of the pool when adding devices 1850 * existing pool, and is used to perform additional checks, such as changing the 1851 * replication level of the pool. It can be 'NULL' to indicate that this is a 1852 * new pool. The 'force' flag controls whether devices should be forcefully 1853 * added, even if they appear in use. 1854 */ 1855 nvlist_t * 1856 make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep, 1857 boolean_t replacing, boolean_t dryrun, int argc, char **argv) 1858 { 1859 nvlist_t *newroot; 1860 nvlist_t *poolconfig = NULL; 1861 is_force = force; 1862 1863 /* 1864 * Construct the vdev specification. If this is successful, we know 1865 * that we have a valid specification, and that all devices can be 1866 * opened. 
1867 */ 1868 if ((newroot = construct_spec(props, argc, argv)) == NULL) 1869 return (NULL); 1870 1871 if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL)) { 1872 nvlist_free(newroot); 1873 return (NULL); 1874 } 1875 1876 /* 1877 * Validate each device to make sure that it's not shared with another 1878 * subsystem. We do this even if 'force' is set, because there are some 1879 * uses (such as a dedicated dump device) that even '-f' cannot 1880 * override. 1881 */ 1882 if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) { 1883 nvlist_free(newroot); 1884 return (NULL); 1885 } 1886 1887 /* 1888 * Check the replication level of the given vdevs and report any errors 1889 * found. We include the existing pool spec, if any, as we need to 1890 * catch changes against the existing replication level. 1891 */ 1892 if (check_rep && check_replication(poolconfig, newroot) != 0) { 1893 nvlist_free(newroot); 1894 return (NULL); 1895 } 1896 1897 /* 1898 * On pool create the new vdev spec must have one normal vdev. 1899 */ 1900 if (poolconfig == NULL && num_normal_vdevs(newroot) == 0) { 1901 vdev_error(gettext("at least one general top-level vdev must " 1902 "be specified\n")); 1903 nvlist_free(newroot); 1904 return (NULL); 1905 } 1906 1907 /* 1908 * Run through the vdev specification and label any whole disks found. 1909 */ 1910 if (!dryrun && make_disks(zhp, newroot, replacing) != 0) { 1911 nvlist_free(newroot); 1912 return (NULL); 1913 } 1914 1915 return (newroot); 1916 } 1917