1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2013, 2018 by Delphix. All rights reserved. 25 * Copyright (c) 2016, 2017 Intel Corporation. 26 * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>. 27 */ 28 29 /* 30 * Functions to convert between a list of vdevs and an nvlist representing the 31 * configuration. Each entry in the list can be one of: 32 * 33 * Device vdevs 34 * disk=(path=..., devid=...) 35 * file=(path=...) 36 * 37 * Group vdevs 38 * raidz[1|2]=(...) 39 * mirror=(...) 40 * 41 * Hot spares 42 * 43 * While the underlying implementation supports it, group vdevs cannot contain 44 * other group vdevs. All userland verification of devices is contained within 45 * this file. If successful, the nvlist returned can be passed directly to the 46 * kernel; we've done as much verification as possible in userland. 47 * 48 * Hot spares are a special case, and passed down as an array of disk vdevs, at 49 * the same level as the root of the vdev tree. 50 * 51 * The only function exported by this file is 'make_root_vdev'. The 52 * function performs several passes: 53 * 54 * 1. Construct the vdev specification. Performs syntax validation and 55 * makes sure each device is valid. 56 * 2. Check for devices in use. Using libblkid to make sure that no 57 * devices are also in use. Some can be overridden using the 'force' 58 * flag, others cannot. 59 * 3. Check for replication errors if the 'force' flag is not specified. 60 * validates that the replication level is consistent across the 61 * entire pool. 62 * 4. Call libzfs to label any whole disks with an EFI label. 63 */ 64 65 #include <assert.h> 66 #include <ctype.h> 67 #include <errno.h> 68 #include <fcntl.h> 69 #include <libintl.h> 70 #include <libnvpair.h> 71 #include <libzutil.h> 72 #include <limits.h> 73 #include <sys/spa.h> 74 #include <stdio.h> 75 #include <string.h> 76 #include <unistd.h> 77 #include "zpool_util.h" 78 #include <sys/zfs_context.h> 79 #include <sys/stat.h> 80 81 /* 82 * For any given vdev specification, we can have multiple errors. The 83 * vdev_error() function keeps track of whether we have seen an error yet, and 84 * prints out a header if its the first error we've seen. 85 */ 86 boolean_t error_seen; 87 boolean_t is_force; 88 89 /*PRINTFLIKE1*/ 90 void 91 vdev_error(const char *fmt, ...) 92 { 93 va_list ap; 94 95 if (!error_seen) { 96 (void) fprintf(stderr, gettext("invalid vdev specification\n")); 97 if (!is_force) 98 (void) fprintf(stderr, gettext("use '-f' to override " 99 "the following errors:\n")); 100 else 101 (void) fprintf(stderr, gettext("the following errors " 102 "must be manually repaired:\n")); 103 error_seen = B_TRUE; 104 } 105 106 va_start(ap, fmt); 107 (void) vfprintf(stderr, fmt, ap); 108 va_end(ap); 109 } 110 111 /* 112 * Check that a file is valid. All we can do in this case is check that it's 113 * not in use by another pool, and not in use by swap. 114 */ 115 int 116 check_file(const char *file, boolean_t force, boolean_t isspare) 117 { 118 char *name; 119 int fd; 120 int ret = 0; 121 pool_state_t state; 122 boolean_t inuse; 123 124 if ((fd = open(file, O_RDONLY)) < 0) 125 return (0); 126 127 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) { 128 const char *desc; 129 130 switch (state) { 131 case POOL_STATE_ACTIVE: 132 desc = gettext("active"); 133 break; 134 135 case POOL_STATE_EXPORTED: 136 desc = gettext("exported"); 137 break; 138 139 case POOL_STATE_POTENTIALLY_ACTIVE: 140 desc = gettext("potentially active"); 141 break; 142 143 default: 144 desc = gettext("unknown"); 145 break; 146 } 147 148 /* 149 * Allow hot spares to be shared between pools. 150 */ 151 if (state == POOL_STATE_SPARE && isspare) { 152 free(name); 153 (void) close(fd); 154 return (0); 155 } 156 157 if (state == POOL_STATE_ACTIVE || 158 state == POOL_STATE_SPARE || !force) { 159 switch (state) { 160 case POOL_STATE_SPARE: 161 vdev_error(gettext("%s is reserved as a hot " 162 "spare for pool %s\n"), file, name); 163 break; 164 default: 165 vdev_error(gettext("%s is part of %s pool " 166 "'%s'\n"), file, desc, name); 167 break; 168 } 169 ret = -1; 170 } 171 172 free(name); 173 } 174 175 (void) close(fd); 176 return (ret); 177 } 178 179 /* 180 * This may be a shorthand device path or it could be total gibberish. 181 * Check to see if it is a known device available in zfs_vdev_paths. 182 * As part of this check, see if we've been given an entire disk 183 * (minus the slice number). 184 */ 185 static int 186 is_shorthand_path(const char *arg, char *path, size_t path_size, 187 struct stat64 *statbuf, boolean_t *wholedisk) 188 { 189 int error; 190 191 error = zfs_resolve_shortname(arg, path, path_size); 192 if (error == 0) { 193 *wholedisk = zfs_dev_is_whole_disk(path); 194 if (*wholedisk || (stat64(path, statbuf) == 0)) 195 return (0); 196 } 197 198 strlcpy(path, arg, path_size); 199 memset(statbuf, 0, sizeof (*statbuf)); 200 *wholedisk = B_FALSE; 201 202 return (error); 203 } 204 205 /* 206 * Determine if the given path is a hot spare within the given configuration. 207 * If no configuration is given we rely solely on the label. 208 */ 209 static boolean_t 210 is_spare(nvlist_t *config, const char *path) 211 { 212 int fd; 213 pool_state_t state; 214 char *name = NULL; 215 nvlist_t *label; 216 uint64_t guid, spareguid; 217 nvlist_t *nvroot; 218 nvlist_t **spares; 219 uint_t i, nspares; 220 boolean_t inuse; 221 222 if (zpool_is_draid_spare(path)) 223 return (B_TRUE); 224 225 if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0) 226 return (B_FALSE); 227 228 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 || 229 !inuse || 230 state != POOL_STATE_SPARE || 231 zpool_read_label(fd, &label, NULL) != 0) { 232 free(name); 233 (void) close(fd); 234 return (B_FALSE); 235 } 236 free(name); 237 (void) close(fd); 238 239 if (config == NULL) { 240 nvlist_free(label); 241 return (B_TRUE); 242 } 243 244 verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0); 245 nvlist_free(label); 246 247 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 248 &nvroot) == 0); 249 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 250 &spares, &nspares) == 0) { 251 for (i = 0; i < nspares; i++) { 252 verify(nvlist_lookup_uint64(spares[i], 253 ZPOOL_CONFIG_GUID, &spareguid) == 0); 254 if (spareguid == guid) 255 return (B_TRUE); 256 } 257 } 258 259 return (B_FALSE); 260 } 261 262 /* 263 * Create a leaf vdev. Determine if this is a file or a device. If it's a 264 * device, fill in the device id to make a complete nvlist. Valid forms for a 265 * leaf vdev are: 266 * 267 * /dev/xxx Complete disk path 268 * /xxx Full path to file 269 * xxx Shorthand for <zfs_vdev_paths>/xxx 270 * draid* Virtual dRAID spare 271 */ 272 static nvlist_t * 273 make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary) 274 { 275 char path[MAXPATHLEN]; 276 struct stat64 statbuf; 277 nvlist_t *vdev = NULL; 278 char *type = NULL; 279 boolean_t wholedisk = B_FALSE; 280 uint64_t ashift = 0; 281 int err; 282 283 /* 284 * Determine what type of vdev this is, and put the full path into 285 * 'path'. We detect whether this is a device of file afterwards by 286 * checking the st_mode of the file. 287 */ 288 if (arg[0] == '/') { 289 /* 290 * Complete device or file path. Exact type is determined by 291 * examining the file descriptor afterwards. Symbolic links 292 * are resolved to their real paths to determine whole disk 293 * and S_ISBLK/S_ISREG type checks. However, we are careful 294 * to store the given path as ZPOOL_CONFIG_PATH to ensure we 295 * can leverage udev's persistent device labels. 296 */ 297 if (realpath(arg, path) == NULL) { 298 (void) fprintf(stderr, 299 gettext("cannot resolve path '%s'\n"), arg); 300 return (NULL); 301 } 302 303 wholedisk = zfs_dev_is_whole_disk(path); 304 if (!wholedisk && (stat64(path, &statbuf) != 0)) { 305 (void) fprintf(stderr, 306 gettext("cannot open '%s': %s\n"), 307 path, strerror(errno)); 308 return (NULL); 309 } 310 311 /* After whole disk check restore original passed path */ 312 strlcpy(path, arg, sizeof (path)); 313 } else if (zpool_is_draid_spare(arg)) { 314 if (!is_primary) { 315 (void) fprintf(stderr, 316 gettext("cannot open '%s': dRAID spares can only " 317 "be used to replace primary vdevs\n"), arg); 318 return (NULL); 319 } 320 321 wholedisk = B_TRUE; 322 strlcpy(path, arg, sizeof (path)); 323 type = VDEV_TYPE_DRAID_SPARE; 324 } else { 325 err = is_shorthand_path(arg, path, sizeof (path), 326 &statbuf, &wholedisk); 327 if (err != 0) { 328 /* 329 * If we got ENOENT, then the user gave us 330 * gibberish, so try to direct them with a 331 * reasonable error message. Otherwise, 332 * regurgitate strerror() since it's the best we 333 * can do. 334 */ 335 if (err == ENOENT) { 336 (void) fprintf(stderr, 337 gettext("cannot open '%s': no such " 338 "device in %s\n"), arg, DISK_ROOT); 339 (void) fprintf(stderr, 340 gettext("must be a full path or " 341 "shorthand device name\n")); 342 return (NULL); 343 } else { 344 (void) fprintf(stderr, 345 gettext("cannot open '%s': %s\n"), 346 path, strerror(errno)); 347 return (NULL); 348 } 349 } 350 } 351 352 if (type == NULL) { 353 /* 354 * Determine whether this is a device or a file. 355 */ 356 if (wholedisk || S_ISBLK(statbuf.st_mode)) { 357 type = VDEV_TYPE_DISK; 358 } else if (S_ISREG(statbuf.st_mode)) { 359 type = VDEV_TYPE_FILE; 360 } else { 361 fprintf(stderr, gettext("cannot use '%s': must " 362 "be a block device or regular file\n"), path); 363 return (NULL); 364 } 365 } 366 367 /* 368 * Finally, we have the complete device or file, and we know that it is 369 * acceptable to use. Construct the nvlist to describe this vdev. All 370 * vdevs have a 'path' element, and devices also have a 'devid' element. 371 */ 372 verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); 373 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); 374 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); 375 376 if (strcmp(type, VDEV_TYPE_DISK) == 0) 377 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, 378 (uint64_t)wholedisk) == 0); 379 380 /* 381 * Override defaults if custom properties are provided. 382 */ 383 if (props != NULL) { 384 char *value = NULL; 385 386 if (nvlist_lookup_string(props, 387 zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) { 388 if (zfs_nicestrtonum(NULL, value, &ashift) != 0) { 389 (void) fprintf(stderr, 390 gettext("ashift must be a number.\n")); 391 return (NULL); 392 } 393 if (ashift != 0 && 394 (ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) { 395 (void) fprintf(stderr, 396 gettext("invalid 'ashift=%" PRIu64 "' " 397 "property: only values between %" PRId32 " " 398 "and %" PRId32 " are allowed.\n"), 399 ashift, ASHIFT_MIN, ASHIFT_MAX); 400 return (NULL); 401 } 402 } 403 } 404 405 /* 406 * If the device is known to incorrectly report its physical sector 407 * size explicitly provide the known correct value. 408 */ 409 if (ashift == 0) { 410 int sector_size; 411 412 if (check_sector_size_database(path, §or_size) == B_TRUE) 413 ashift = highbit64(sector_size) - 1; 414 } 415 416 if (ashift > 0) 417 (void) nvlist_add_uint64(vdev, ZPOOL_CONFIG_ASHIFT, ashift); 418 419 return (vdev); 420 } 421 422 /* 423 * Go through and verify the replication level of the pool is consistent. 424 * Performs the following checks: 425 * 426 * For the new spec, verifies that devices in mirrors and raidz are the 427 * same size. 428 * 429 * If the current configuration already has inconsistent replication 430 * levels, ignore any other potential problems in the new spec. 431 * 432 * Otherwise, make sure that the current spec (if there is one) and the new 433 * spec have consistent replication levels. 434 * 435 * If there is no current spec (create), make sure new spec has at least 436 * one general purpose vdev. 437 */ 438 typedef struct replication_level { 439 char *zprl_type; 440 uint64_t zprl_children; 441 uint64_t zprl_parity; 442 } replication_level_t; 443 444 #define ZPOOL_FUZZ (16 * 1024 * 1024) 445 446 /* 447 * N.B. For the purposes of comparing replication levels dRAID can be 448 * considered functionally equivalent to raidz. 449 */ 450 static boolean_t 451 is_raidz_mirror(replication_level_t *a, replication_level_t *b, 452 replication_level_t **raidz, replication_level_t **mirror) 453 { 454 if ((strcmp(a->zprl_type, "raidz") == 0 || 455 strcmp(a->zprl_type, "draid") == 0) && 456 strcmp(b->zprl_type, "mirror") == 0) { 457 *raidz = a; 458 *mirror = b; 459 return (B_TRUE); 460 } 461 return (B_FALSE); 462 } 463 464 /* 465 * Comparison for determining if dRAID and raidz where passed in either order. 466 */ 467 static boolean_t 468 is_raidz_draid(replication_level_t *a, replication_level_t *b) 469 { 470 if ((strcmp(a->zprl_type, "raidz") == 0 || 471 strcmp(a->zprl_type, "draid") == 0) && 472 (strcmp(b->zprl_type, "raidz") == 0 || 473 strcmp(b->zprl_type, "draid") == 0)) { 474 return (B_TRUE); 475 } 476 477 return (B_FALSE); 478 } 479 480 /* 481 * Given a list of toplevel vdevs, return the current replication level. If 482 * the config is inconsistent, then NULL is returned. If 'fatal' is set, then 483 * an error message will be displayed for each self-inconsistent vdev. 484 */ 485 static replication_level_t * 486 get_replication(nvlist_t *nvroot, boolean_t fatal) 487 { 488 nvlist_t **top; 489 uint_t t, toplevels; 490 nvlist_t **child; 491 uint_t c, children; 492 nvlist_t *nv; 493 char *type; 494 replication_level_t lastrep = {0}; 495 replication_level_t rep; 496 replication_level_t *ret; 497 replication_level_t *raidz, *mirror; 498 boolean_t dontreport; 499 500 ret = safe_malloc(sizeof (replication_level_t)); 501 502 verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 503 &top, &toplevels) == 0); 504 505 for (t = 0; t < toplevels; t++) { 506 uint64_t is_log = B_FALSE; 507 508 nv = top[t]; 509 510 /* 511 * For separate logs we ignore the top level vdev replication 512 * constraints. 513 */ 514 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log); 515 if (is_log) 516 continue; 517 518 /* Ignore holes introduced by removing aux devices */ 519 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 520 if (strcmp(type, VDEV_TYPE_HOLE) == 0) 521 continue; 522 523 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 524 &child, &children) != 0) { 525 /* 526 * This is a 'file' or 'disk' vdev. 527 */ 528 rep.zprl_type = type; 529 rep.zprl_children = 1; 530 rep.zprl_parity = 0; 531 } else { 532 int64_t vdev_size; 533 534 /* 535 * This is a mirror or RAID-Z vdev. Go through and make 536 * sure the contents are all the same (files vs. disks), 537 * keeping track of the number of elements in the 538 * process. 539 * 540 * We also check that the size of each vdev (if it can 541 * be determined) is the same. 542 */ 543 rep.zprl_type = type; 544 rep.zprl_children = 0; 545 546 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 || 547 strcmp(type, VDEV_TYPE_DRAID) == 0) { 548 verify(nvlist_lookup_uint64(nv, 549 ZPOOL_CONFIG_NPARITY, 550 &rep.zprl_parity) == 0); 551 assert(rep.zprl_parity != 0); 552 } else { 553 rep.zprl_parity = 0; 554 } 555 556 /* 557 * The 'dontreport' variable indicates that we've 558 * already reported an error for this spec, so don't 559 * bother doing it again. 560 */ 561 type = NULL; 562 dontreport = 0; 563 vdev_size = -1LL; 564 for (c = 0; c < children; c++) { 565 nvlist_t *cnv = child[c]; 566 char *path; 567 struct stat64 statbuf; 568 int64_t size = -1LL; 569 char *childtype; 570 int fd, err; 571 572 rep.zprl_children++; 573 574 verify(nvlist_lookup_string(cnv, 575 ZPOOL_CONFIG_TYPE, &childtype) == 0); 576 577 /* 578 * If this is a replacing or spare vdev, then 579 * get the real first child of the vdev: do this 580 * in a loop because replacing and spare vdevs 581 * can be nested. 582 */ 583 while (strcmp(childtype, 584 VDEV_TYPE_REPLACING) == 0 || 585 strcmp(childtype, VDEV_TYPE_SPARE) == 0) { 586 nvlist_t **rchild; 587 uint_t rchildren; 588 589 verify(nvlist_lookup_nvlist_array(cnv, 590 ZPOOL_CONFIG_CHILDREN, &rchild, 591 &rchildren) == 0); 592 assert(rchildren == 2); 593 cnv = rchild[0]; 594 595 verify(nvlist_lookup_string(cnv, 596 ZPOOL_CONFIG_TYPE, 597 &childtype) == 0); 598 } 599 600 verify(nvlist_lookup_string(cnv, 601 ZPOOL_CONFIG_PATH, &path) == 0); 602 603 /* 604 * If we have a raidz/mirror that combines disks 605 * with files, report it as an error. 606 */ 607 if (!dontreport && type != NULL && 608 strcmp(type, childtype) != 0) { 609 if (ret != NULL) 610 free(ret); 611 ret = NULL; 612 if (fatal) 613 vdev_error(gettext( 614 "mismatched replication " 615 "level: %s contains both " 616 "files and devices\n"), 617 rep.zprl_type); 618 else 619 return (NULL); 620 dontreport = B_TRUE; 621 } 622 623 /* 624 * According to stat(2), the value of 'st_size' 625 * is undefined for block devices and character 626 * devices. But there is no effective way to 627 * determine the real size in userland. 628 * 629 * Instead, we'll take advantage of an 630 * implementation detail of spec_size(). If the 631 * device is currently open, then we (should) 632 * return a valid size. 633 * 634 * If we still don't get a valid size (indicated 635 * by a size of 0 or MAXOFFSET_T), then ignore 636 * this device altogether. 637 */ 638 if ((fd = open(path, O_RDONLY)) >= 0) { 639 err = fstat64_blk(fd, &statbuf); 640 (void) close(fd); 641 } else { 642 err = stat64(path, &statbuf); 643 } 644 645 if (err != 0 || 646 statbuf.st_size == 0 || 647 statbuf.st_size == MAXOFFSET_T) 648 continue; 649 650 size = statbuf.st_size; 651 652 /* 653 * Also make sure that devices and 654 * slices have a consistent size. If 655 * they differ by a significant amount 656 * (~16MB) then report an error. 657 */ 658 if (!dontreport && 659 (vdev_size != -1LL && 660 (llabs(size - vdev_size) > 661 ZPOOL_FUZZ))) { 662 if (ret != NULL) 663 free(ret); 664 ret = NULL; 665 if (fatal) 666 vdev_error(gettext( 667 "%s contains devices of " 668 "different sizes\n"), 669 rep.zprl_type); 670 else 671 return (NULL); 672 dontreport = B_TRUE; 673 } 674 675 type = childtype; 676 vdev_size = size; 677 } 678 } 679 680 /* 681 * At this point, we have the replication of the last toplevel 682 * vdev in 'rep'. Compare it to 'lastrep' to see if it is 683 * different. 684 */ 685 if (lastrep.zprl_type != NULL) { 686 if (is_raidz_mirror(&lastrep, &rep, &raidz, &mirror) || 687 is_raidz_mirror(&rep, &lastrep, &raidz, &mirror)) { 688 /* 689 * Accepted raidz and mirror when they can 690 * handle the same number of disk failures. 691 */ 692 if (raidz->zprl_parity != 693 mirror->zprl_children - 1) { 694 if (ret != NULL) 695 free(ret); 696 ret = NULL; 697 if (fatal) 698 vdev_error(gettext( 699 "mismatched replication " 700 "level: " 701 "%s and %s vdevs with " 702 "different redundancy, " 703 "%llu vs. %llu (%llu-way) " 704 "are present\n"), 705 raidz->zprl_type, 706 mirror->zprl_type, 707 raidz->zprl_parity, 708 mirror->zprl_children - 1, 709 mirror->zprl_children); 710 else 711 return (NULL); 712 } 713 } else if (is_raidz_draid(&lastrep, &rep)) { 714 /* 715 * Accepted raidz and draid when they can 716 * handle the same number of disk failures. 717 */ 718 if (lastrep.zprl_parity != rep.zprl_parity) { 719 if (ret != NULL) 720 free(ret); 721 ret = NULL; 722 if (fatal) 723 vdev_error(gettext( 724 "mismatched replication " 725 "level: %s and %s vdevs " 726 "with different " 727 "redundancy, %llu vs. " 728 "%llu are present\n"), 729 lastrep.zprl_type, 730 rep.zprl_type, 731 lastrep.zprl_parity, 732 rep.zprl_parity); 733 else 734 return (NULL); 735 } 736 } else if (strcmp(lastrep.zprl_type, rep.zprl_type) != 737 0) { 738 if (ret != NULL) 739 free(ret); 740 ret = NULL; 741 if (fatal) 742 vdev_error(gettext( 743 "mismatched replication level: " 744 "both %s and %s vdevs are " 745 "present\n"), 746 lastrep.zprl_type, rep.zprl_type); 747 else 748 return (NULL); 749 } else if (lastrep.zprl_parity != rep.zprl_parity) { 750 if (ret) 751 free(ret); 752 ret = NULL; 753 if (fatal) 754 vdev_error(gettext( 755 "mismatched replication level: " 756 "both %llu and %llu device parity " 757 "%s vdevs are present\n"), 758 lastrep.zprl_parity, 759 rep.zprl_parity, 760 rep.zprl_type); 761 else 762 return (NULL); 763 } else if (lastrep.zprl_children != rep.zprl_children) { 764 if (ret) 765 free(ret); 766 ret = NULL; 767 if (fatal) 768 vdev_error(gettext( 769 "mismatched replication level: " 770 "both %llu-way and %llu-way %s " 771 "vdevs are present\n"), 772 lastrep.zprl_children, 773 rep.zprl_children, 774 rep.zprl_type); 775 else 776 return (NULL); 777 } 778 } 779 lastrep = rep; 780 } 781 782 if (ret != NULL) 783 *ret = rep; 784 785 return (ret); 786 } 787 788 /* 789 * Check the replication level of the vdev spec against the current pool. Calls 790 * get_replication() to make sure the new spec is self-consistent. If the pool 791 * has a consistent replication level, then we ignore any errors. Otherwise, 792 * report any difference between the two. 793 */ 794 static int 795 check_replication(nvlist_t *config, nvlist_t *newroot) 796 { 797 nvlist_t **child; 798 uint_t children; 799 replication_level_t *current = NULL, *new; 800 replication_level_t *raidz, *mirror; 801 int ret; 802 803 /* 804 * If we have a current pool configuration, check to see if it's 805 * self-consistent. If not, simply return success. 806 */ 807 if (config != NULL) { 808 nvlist_t *nvroot; 809 810 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 811 &nvroot) == 0); 812 if ((current = get_replication(nvroot, B_FALSE)) == NULL) 813 return (0); 814 } 815 /* 816 * for spares there may be no children, and therefore no 817 * replication level to check 818 */ 819 if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN, 820 &child, &children) != 0) || (children == 0)) { 821 free(current); 822 return (0); 823 } 824 825 /* 826 * If all we have is logs then there's no replication level to check. 827 */ 828 if (num_logs(newroot) == children) { 829 free(current); 830 return (0); 831 } 832 833 /* 834 * Get the replication level of the new vdev spec, reporting any 835 * inconsistencies found. 836 */ 837 if ((new = get_replication(newroot, B_TRUE)) == NULL) { 838 free(current); 839 return (-1); 840 } 841 842 /* 843 * Check to see if the new vdev spec matches the replication level of 844 * the current pool. 845 */ 846 ret = 0; 847 if (current != NULL) { 848 if (is_raidz_mirror(current, new, &raidz, &mirror) || 849 is_raidz_mirror(new, current, &raidz, &mirror)) { 850 if (raidz->zprl_parity != mirror->zprl_children - 1) { 851 vdev_error(gettext( 852 "mismatched replication level: pool and " 853 "new vdev with different redundancy, %s " 854 "and %s vdevs, %llu vs. %llu (%llu-way)\n"), 855 raidz->zprl_type, 856 mirror->zprl_type, 857 raidz->zprl_parity, 858 mirror->zprl_children - 1, 859 mirror->zprl_children); 860 ret = -1; 861 } 862 } else if (strcmp(current->zprl_type, new->zprl_type) != 0) { 863 vdev_error(gettext( 864 "mismatched replication level: pool uses %s " 865 "and new vdev is %s\n"), 866 current->zprl_type, new->zprl_type); 867 ret = -1; 868 } else if (current->zprl_parity != new->zprl_parity) { 869 vdev_error(gettext( 870 "mismatched replication level: pool uses %llu " 871 "device parity and new vdev uses %llu\n"), 872 current->zprl_parity, new->zprl_parity); 873 ret = -1; 874 } else if (current->zprl_children != new->zprl_children) { 875 vdev_error(gettext( 876 "mismatched replication level: pool uses %llu-way " 877 "%s and new vdev uses %llu-way %s\n"), 878 current->zprl_children, current->zprl_type, 879 new->zprl_children, new->zprl_type); 880 ret = -1; 881 } 882 } 883 884 free(new); 885 if (current != NULL) 886 free(current); 887 888 return (ret); 889 } 890 891 static int 892 zero_label(char *path) 893 { 894 const int size = 4096; 895 char buf[size]; 896 int err, fd; 897 898 if ((fd = open(path, O_WRONLY|O_EXCL)) < 0) { 899 (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), 900 path, strerror(errno)); 901 return (-1); 902 } 903 904 memset(buf, 0, size); 905 err = write(fd, buf, size); 906 (void) fdatasync(fd); 907 (void) close(fd); 908 909 if (err == -1) { 910 (void) fprintf(stderr, gettext("cannot zero first %d bytes " 911 "of '%s': %s\n"), size, path, strerror(errno)); 912 return (-1); 913 } 914 915 if (err != size) { 916 (void) fprintf(stderr, gettext("could only zero %d/%d bytes " 917 "of '%s'\n"), err, size, path); 918 return (-1); 919 } 920 921 return (0); 922 } 923 924 /* 925 * Go through and find any whole disks in the vdev specification, labelling them 926 * as appropriate. When constructing the vdev spec, we were unable to open this 927 * device in order to provide a devid. Now that we have labelled the disk and 928 * know that slice 0 is valid, we can construct the devid now. 929 * 930 * If the disk was already labeled with an EFI label, we will have gotten the 931 * devid already (because we were able to open the whole disk). Otherwise, we 932 * need to get the devid after we label the disk. 933 */ 934 static int 935 make_disks(zpool_handle_t *zhp, nvlist_t *nv) 936 { 937 nvlist_t **child; 938 uint_t c, children; 939 char *type, *path; 940 char devpath[MAXPATHLEN]; 941 char udevpath[MAXPATHLEN]; 942 uint64_t wholedisk; 943 struct stat64 statbuf; 944 int is_exclusive = 0; 945 int fd; 946 int ret; 947 948 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 949 950 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 951 &child, &children) != 0) { 952 953 if (strcmp(type, VDEV_TYPE_DISK) != 0) 954 return (0); 955 956 /* 957 * We have a disk device. If this is a whole disk write 958 * out the efi partition table, otherwise write zero's to 959 * the first 4k of the partition. This is to ensure that 960 * libblkid will not misidentify the partition due to a 961 * magic value left by the previous filesystem. 962 */ 963 verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path)); 964 verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 965 &wholedisk)); 966 967 if (!wholedisk) { 968 /* 969 * Update device id string for mpath nodes (Linux only) 970 */ 971 if (is_mpath_whole_disk(path)) 972 update_vdev_config_dev_strs(nv); 973 974 if (!is_spare(NULL, path)) 975 (void) zero_label(path); 976 return (0); 977 } 978 979 if (realpath(path, devpath) == NULL) { 980 ret = errno; 981 (void) fprintf(stderr, 982 gettext("cannot resolve path '%s'\n"), path); 983 return (ret); 984 } 985 986 /* 987 * Remove any previously existing symlink from a udev path to 988 * the device before labeling the disk. This ensures that 989 * only newly created links are used. Otherwise there is a 990 * window between when udev deletes and recreates the link 991 * during which access attempts will fail with ENOENT. 992 */ 993 strlcpy(udevpath, path, MAXPATHLEN); 994 (void) zfs_append_partition(udevpath, MAXPATHLEN); 995 996 fd = open(devpath, O_RDWR|O_EXCL); 997 if (fd == -1) { 998 if (errno == EBUSY) 999 is_exclusive = 1; 1000 #ifdef __FreeBSD__ 1001 if (errno == EPERM) 1002 is_exclusive = 1; 1003 #endif 1004 } else { 1005 (void) close(fd); 1006 } 1007 1008 /* 1009 * If the partition exists, contains a valid spare label, 1010 * and is opened exclusively there is no need to partition 1011 * it. Hot spares have already been partitioned and are 1012 * held open exclusively by the kernel as a safety measure. 1013 * 1014 * If the provided path is for a /dev/disk/ device its 1015 * symbolic link will be removed, partition table created, 1016 * and then block until udev creates the new link. 1017 */ 1018 if (!is_exclusive && !is_spare(NULL, udevpath)) { 1019 char *devnode = strrchr(devpath, '/') + 1; 1020 1021 ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT)); 1022 if (ret == 0) { 1023 ret = lstat64(udevpath, &statbuf); 1024 if (ret == 0 && S_ISLNK(statbuf.st_mode)) 1025 (void) unlink(udevpath); 1026 } 1027 1028 /* 1029 * When labeling a pool the raw device node name 1030 * is provided as it appears under /dev/. 1031 */ 1032 if (zpool_label_disk(g_zfs, zhp, devnode) == -1) 1033 return (-1); 1034 1035 /* 1036 * Wait for udev to signal the device is available 1037 * by the provided path. 1038 */ 1039 ret = zpool_label_disk_wait(udevpath, DISK_LABEL_WAIT); 1040 if (ret) { 1041 (void) fprintf(stderr, 1042 gettext("missing link: %s was " 1043 "partitioned but %s is missing\n"), 1044 devnode, udevpath); 1045 return (ret); 1046 } 1047 1048 ret = zero_label(udevpath); 1049 if (ret) 1050 return (ret); 1051 } 1052 1053 /* 1054 * Update the path to refer to the partition. The presence of 1055 * the 'whole_disk' field indicates to the CLI that we should 1056 * chop off the partition number when displaying the device in 1057 * future output. 1058 */ 1059 verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, udevpath) == 0); 1060 1061 /* 1062 * Update device id strings for whole disks (Linux only) 1063 */ 1064 update_vdev_config_dev_strs(nv); 1065 1066 return (0); 1067 } 1068 1069 for (c = 0; c < children; c++) 1070 if ((ret = make_disks(zhp, child[c])) != 0) 1071 return (ret); 1072 1073 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, 1074 &child, &children) == 0) 1075 for (c = 0; c < children; c++) 1076 if ((ret = make_disks(zhp, child[c])) != 0) 1077 return (ret); 1078 1079 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, 1080 &child, &children) == 0) 1081 for (c = 0; c < children; c++) 1082 if ((ret = make_disks(zhp, child[c])) != 0) 1083 return (ret); 1084 1085 return (0); 1086 } 1087 1088 /* 1089 * Go through and find any devices that are in use. We rely on libdiskmgt for 1090 * the majority of this task. 1091 */ 1092 static boolean_t 1093 is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, 1094 boolean_t replacing, boolean_t isspare) 1095 { 1096 nvlist_t **child; 1097 uint_t c, children; 1098 char *type, *path; 1099 int ret = 0; 1100 char buf[MAXPATHLEN]; 1101 uint64_t wholedisk = B_FALSE; 1102 boolean_t anyinuse = B_FALSE; 1103 1104 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 1105 1106 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1107 &child, &children) != 0) { 1108 1109 verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path)); 1110 if (strcmp(type, VDEV_TYPE_DISK) == 0) 1111 verify(!nvlist_lookup_uint64(nv, 1112 ZPOOL_CONFIG_WHOLE_DISK, &wholedisk)); 1113 1114 /* 1115 * As a generic check, we look to see if this is a replace of a 1116 * hot spare within the same pool. If so, we allow it 1117 * regardless of what libblkid or zpool_in_use() says. 1118 */ 1119 if (replacing) { 1120 (void) strlcpy(buf, path, sizeof (buf)); 1121 if (wholedisk) { 1122 ret = zfs_append_partition(buf, sizeof (buf)); 1123 if (ret == -1) 1124 return (-1); 1125 } 1126 1127 if (is_spare(config, buf)) 1128 return (B_FALSE); 1129 } 1130 1131 if (strcmp(type, VDEV_TYPE_DISK) == 0) 1132 ret = check_device(path, force, isspare, wholedisk); 1133 1134 else if (strcmp(type, VDEV_TYPE_FILE) == 0) 1135 ret = check_file(path, force, isspare); 1136 1137 return (ret != 0); 1138 } 1139 1140 for (c = 0; c < children; c++) 1141 if (is_device_in_use(config, child[c], force, replacing, 1142 B_FALSE)) 1143 anyinuse = B_TRUE; 1144 1145 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, 1146 &child, &children) == 0) 1147 for (c = 0; c < children; c++) 1148 if (is_device_in_use(config, child[c], force, replacing, 1149 B_TRUE)) 1150 anyinuse = B_TRUE; 1151 1152 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, 1153 &child, &children) == 0) 1154 for (c = 0; c < children; c++) 1155 if (is_device_in_use(config, child[c], force, replacing, 1156 B_FALSE)) 1157 anyinuse = B_TRUE; 1158 1159 return (anyinuse); 1160 } 1161 1162 /* 1163 * Returns the parity level extracted from a raidz or draid type. 1164 * If the parity cannot be determined zero is returned. 1165 */ 1166 static int 1167 get_parity(const char *type) 1168 { 1169 long parity = 0; 1170 const char *p; 1171 1172 if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0) { 1173 p = type + strlen(VDEV_TYPE_RAIDZ); 1174 1175 if (*p == '\0') { 1176 /* when unspecified default to single parity */ 1177 return (1); 1178 } else if (*p == '0') { 1179 /* no zero prefixes allowed */ 1180 return (0); 1181 } else { 1182 /* 0-3, no suffixes allowed */ 1183 char *end; 1184 errno = 0; 1185 parity = strtol(p, &end, 10); 1186 if (errno != 0 || *end != '\0' || 1187 parity < 1 || parity > VDEV_RAIDZ_MAXPARITY) { 1188 return (0); 1189 } 1190 } 1191 } else if (strncmp(type, VDEV_TYPE_DRAID, 1192 strlen(VDEV_TYPE_DRAID)) == 0) { 1193 p = type + strlen(VDEV_TYPE_DRAID); 1194 1195 if (*p == '\0' || *p == ':') { 1196 /* when unspecified default to single parity */ 1197 return (1); 1198 } else if (*p == '0') { 1199 /* no zero prefixes allowed */ 1200 return (0); 1201 } else { 1202 /* 0-3, allowed suffixes: '\0' or ':' */ 1203 char *end; 1204 errno = 0; 1205 parity = strtol(p, &end, 10); 1206 if (errno != 0 || 1207 parity < 1 || parity > VDEV_DRAID_MAXPARITY || 1208 (*end != '\0' && *end != ':')) { 1209 return (0); 1210 } 1211 } 1212 } 1213 1214 return ((int)parity); 1215 } 1216 1217 /* 1218 * Assign the minimum and maximum number of devices allowed for 1219 * the specified type. On error NULL is returned, otherwise the 1220 * type prefix is returned (raidz, mirror, etc). 1221 */ 1222 static const char * 1223 is_grouping(const char *type, int *mindev, int *maxdev) 1224 { 1225 int nparity; 1226 1227 if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 || 1228 strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0) { 1229 nparity = get_parity(type); 1230 if (nparity == 0) 1231 return (NULL); 1232 if (mindev != NULL) 1233 *mindev = nparity + 1; 1234 if (maxdev != NULL) 1235 *maxdev = 255; 1236 1237 if (strncmp(type, VDEV_TYPE_RAIDZ, 1238 strlen(VDEV_TYPE_RAIDZ)) == 0) { 1239 return (VDEV_TYPE_RAIDZ); 1240 } else { 1241 return (VDEV_TYPE_DRAID); 1242 } 1243 } 1244 1245 if (maxdev != NULL) 1246 *maxdev = INT_MAX; 1247 1248 if (strcmp(type, "mirror") == 0) { 1249 if (mindev != NULL) 1250 *mindev = 2; 1251 return (VDEV_TYPE_MIRROR); 1252 } 1253 1254 if (strcmp(type, "spare") == 0) { 1255 if (mindev != NULL) 1256 *mindev = 1; 1257 return (VDEV_TYPE_SPARE); 1258 } 1259 1260 if (strcmp(type, "log") == 0) { 1261 if (mindev != NULL) 1262 *mindev = 1; 1263 return (VDEV_TYPE_LOG); 1264 } 1265 1266 if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0 || 1267 strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { 1268 if (mindev != NULL) 1269 *mindev = 1; 1270 return (type); 1271 } 1272 1273 if (strcmp(type, "cache") == 0) { 1274 if (mindev != NULL) 1275 *mindev = 1; 1276 return (VDEV_TYPE_L2CACHE); 1277 } 1278 1279 return (NULL); 1280 } 1281 1282 /* 1283 * Extract the configuration parameters encoded in the dRAID type and 1284 * use them to generate a dRAID configuration. The expected format is: 1285 * 1286 * draid[<parity>][:<data><d|D>][:<children><c|C>][:<spares><s|S>] 1287 * 1288 * The intent is to be able to generate a good configuration when no 1289 * additional information is provided. The only mandatory component 1290 * of the 'type' is the 'draid' prefix. If a value is not provided 1291 * then reasonable defaults are used. The optional components may 1292 * appear in any order but the d/s/c suffix is required. 1293 * 1294 * Valid inputs: 1295 * - data: number of data devices per group (1-255) 1296 * - parity: number of parity blocks per group (1-3) 1297 * - spares: number of distributed spare (0-100) 1298 * - children: total number of devices (1-255) 1299 * 1300 * Examples: 1301 * - zpool create tank draid <devices...> 1302 * - zpool create tank draid2:8d:51c:2s <devices...> 1303 */ 1304 static int 1305 draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children) 1306 { 1307 uint64_t nparity = 1; 1308 uint64_t nspares = 0; 1309 uint64_t ndata = UINT64_MAX; 1310 uint64_t ngroups = 1; 1311 long value; 1312 1313 if (strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) != 0) 1314 return (EINVAL); 1315 1316 nparity = (uint64_t)get_parity(type); 1317 if (nparity == 0) 1318 return (EINVAL); 1319 1320 char *p = (char *)type; 1321 while ((p = strchr(p, ':')) != NULL) { 1322 char *end; 1323 1324 p = p + 1; 1325 errno = 0; 1326 1327 if (!isdigit(p[0])) { 1328 (void) fprintf(stderr, gettext("invalid dRAID " 1329 "syntax; expected [:<number><c|d|s>] not '%s'\n"), 1330 type); 1331 return (EINVAL); 1332 } 1333 1334 /* Expected non-zero value with c/d/s suffix */ 1335 value = strtol(p, &end, 10); 1336 char suffix = tolower(*end); 1337 if (errno != 0 || 1338 (suffix != 'c' && suffix != 'd' && suffix != 's')) { 1339 (void) fprintf(stderr, gettext("invalid dRAID " 1340 "syntax; expected [:<number><c|d|s>] not '%s'\n"), 1341 type); 1342 return (EINVAL); 1343 } 1344 1345 if (suffix == 'c') { 1346 if ((uint64_t)value != children) { 1347 fprintf(stderr, 1348 gettext("invalid number of dRAID children; " 1349 "%llu required but %llu provided\n"), 1350 (u_longlong_t)value, 1351 (u_longlong_t)children); 1352 return (EINVAL); 1353 } 1354 } else if (suffix == 'd') { 1355 ndata = (uint64_t)value; 1356 } else if (suffix == 's') { 1357 nspares = (uint64_t)value; 1358 } else { 1359 verify(0); /* Unreachable */ 1360 } 1361 } 1362 1363 /* 1364 * When a specific number of data disks is not provided limit a 1365 * redundancy group to 8 data disks. This value was selected to 1366 * provide a reasonable tradeoff between capacity and performance. 1367 */ 1368 if (ndata == UINT64_MAX) { 1369 if (children > nspares + nparity) { 1370 ndata = MIN(children - nspares - nparity, 8); 1371 } else { 1372 fprintf(stderr, gettext("request number of " 1373 "distributed spares %llu and parity level %llu\n" 1374 "leaves no disks available for data\n"), 1375 (u_longlong_t)nspares, (u_longlong_t)nparity); 1376 return (EINVAL); 1377 } 1378 } 1379 1380 /* Verify the maximum allowed group size is never exceeded. */ 1381 if (ndata == 0 || (ndata + nparity > children - nspares)) { 1382 fprintf(stderr, gettext("requested number of dRAID data " 1383 "disks per group %llu is too high,\nat most %llu disks " 1384 "are available for data\n"), (u_longlong_t)ndata, 1385 (u_longlong_t)(children - nspares - nparity)); 1386 return (EINVAL); 1387 } 1388 1389 if (nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) { 1390 fprintf(stderr, 1391 gettext("invalid dRAID parity level %llu; must be " 1392 "between 1 and %d\n"), (u_longlong_t)nparity, 1393 VDEV_DRAID_MAXPARITY); 1394 return (EINVAL); 1395 } 1396 1397 /* 1398 * Verify the requested number of spares can be satisfied. 1399 * An arbitrary limit of 100 distributed spares is applied. 1400 */ 1401 if (nspares > 100 || nspares > (children - (ndata + nparity))) { 1402 fprintf(stderr, 1403 gettext("invalid number of dRAID spares %llu; additional " 1404 "disks would be required\n"), (u_longlong_t)nspares); 1405 return (EINVAL); 1406 } 1407 1408 /* Verify the requested number children is sufficient. */ 1409 if (children < (ndata + nparity + nspares)) { 1410 fprintf(stderr, gettext("%llu disks were provided, but at " 1411 "least %llu disks are required for this config\n"), 1412 (u_longlong_t)children, 1413 (u_longlong_t)(ndata + nparity + nspares)); 1414 } 1415 1416 if (children > VDEV_DRAID_MAX_CHILDREN) { 1417 fprintf(stderr, gettext("%llu disks were provided, but " 1418 "dRAID only supports up to %u disks"), 1419 (u_longlong_t)children, VDEV_DRAID_MAX_CHILDREN); 1420 } 1421 1422 /* 1423 * Calculate the minimum number of groups required to fill a slice. 1424 * This is the LCM of the stripe width (ndata + nparity) and the 1425 * number of data drives (children - nspares). 1426 */ 1427 while (ngroups * (ndata + nparity) % (children - nspares) != 0) 1428 ngroups++; 1429 1430 /* Store the basic dRAID configuration. */ 1431 fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity); 1432 fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, ndata); 1433 fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, nspares); 1434 fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); 1435 1436 return (0); 1437 } 1438 1439 /* 1440 * Construct a syntactically valid vdev specification, 1441 * and ensure that all devices and files exist and can be opened. 1442 * Note: we don't bother freeing anything in the error paths 1443 * because the program is just going to exit anyway. 1444 */ 1445 static nvlist_t * 1446 construct_spec(nvlist_t *props, int argc, char **argv) 1447 { 1448 nvlist_t *nvroot, *nv, **top, **spares, **l2cache; 1449 int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache; 1450 const char *type, *fulltype; 1451 boolean_t is_log, is_special, is_dedup, is_spare; 1452 boolean_t seen_logs; 1453 1454 top = NULL; 1455 toplevels = 0; 1456 spares = NULL; 1457 l2cache = NULL; 1458 nspares = 0; 1459 nlogs = 0; 1460 nl2cache = 0; 1461 is_log = is_special = is_dedup = is_spare = B_FALSE; 1462 seen_logs = B_FALSE; 1463 nvroot = NULL; 1464 1465 while (argc > 0) { 1466 fulltype = argv[0]; 1467 nv = NULL; 1468 1469 /* 1470 * If it's a mirror, raidz, or draid the subsequent arguments 1471 * are its leaves -- until we encounter the next mirror, 1472 * raidz or draid. 1473 */ 1474 if ((type = is_grouping(fulltype, &mindev, &maxdev)) != NULL) { 1475 nvlist_t **child = NULL; 1476 int c, children = 0; 1477 1478 if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1479 if (spares != NULL) { 1480 (void) fprintf(stderr, 1481 gettext("invalid vdev " 1482 "specification: 'spare' can be " 1483 "specified only once\n")); 1484 goto spec_out; 1485 } 1486 is_spare = B_TRUE; 1487 is_log = is_special = is_dedup = B_FALSE; 1488 } 1489 1490 if (strcmp(type, VDEV_TYPE_LOG) == 0) { 1491 if (seen_logs) { 1492 (void) fprintf(stderr, 1493 gettext("invalid vdev " 1494 "specification: 'log' can be " 1495 "specified only once\n")); 1496 goto spec_out; 1497 } 1498 seen_logs = B_TRUE; 1499 is_log = B_TRUE; 1500 is_special = is_dedup = is_spare = B_FALSE; 1501 argc--; 1502 argv++; 1503 /* 1504 * A log is not a real grouping device. 1505 * We just set is_log and continue. 1506 */ 1507 continue; 1508 } 1509 1510 if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) { 1511 is_special = B_TRUE; 1512 is_log = is_dedup = is_spare = B_FALSE; 1513 argc--; 1514 argv++; 1515 continue; 1516 } 1517 1518 if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { 1519 is_dedup = B_TRUE; 1520 is_log = is_special = is_spare = B_FALSE; 1521 argc--; 1522 argv++; 1523 continue; 1524 } 1525 1526 if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { 1527 if (l2cache != NULL) { 1528 (void) fprintf(stderr, 1529 gettext("invalid vdev " 1530 "specification: 'cache' can be " 1531 "specified only once\n")); 1532 goto spec_out; 1533 } 1534 is_log = is_special = B_FALSE; 1535 is_dedup = is_spare = B_FALSE; 1536 } 1537 1538 if (is_log || is_special || is_dedup) { 1539 if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { 1540 (void) fprintf(stderr, 1541 gettext("invalid vdev " 1542 "specification: unsupported '%s' " 1543 "device: %s\n"), is_log ? "log" : 1544 "special", type); 1545 goto spec_out; 1546 } 1547 nlogs++; 1548 } 1549 1550 for (c = 1; c < argc; c++) { 1551 if (is_grouping(argv[c], NULL, NULL) != NULL) 1552 break; 1553 1554 children++; 1555 child = realloc(child, 1556 children * sizeof (nvlist_t *)); 1557 if (child == NULL) 1558 zpool_no_memory(); 1559 if ((nv = make_leaf_vdev(props, argv[c], 1560 !(is_log || is_special || is_dedup || 1561 is_spare))) == NULL) { 1562 for (c = 0; c < children - 1; c++) 1563 nvlist_free(child[c]); 1564 free(child); 1565 goto spec_out; 1566 } 1567 1568 child[children - 1] = nv; 1569 } 1570 1571 if (children < mindev) { 1572 (void) fprintf(stderr, gettext("invalid vdev " 1573 "specification: %s requires at least %d " 1574 "devices\n"), argv[0], mindev); 1575 for (c = 0; c < children; c++) 1576 nvlist_free(child[c]); 1577 free(child); 1578 goto spec_out; 1579 } 1580 1581 if (children > maxdev) { 1582 (void) fprintf(stderr, gettext("invalid vdev " 1583 "specification: %s supports no more than " 1584 "%d devices\n"), argv[0], maxdev); 1585 for (c = 0; c < children; c++) 1586 nvlist_free(child[c]); 1587 free(child); 1588 goto spec_out; 1589 } 1590 1591 argc -= c; 1592 argv += c; 1593 1594 if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1595 spares = child; 1596 nspares = children; 1597 continue; 1598 } else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { 1599 l2cache = child; 1600 nl2cache = children; 1601 continue; 1602 } else { 1603 /* create a top-level vdev with children */ 1604 verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, 1605 0) == 0); 1606 verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, 1607 type) == 0); 1608 verify(nvlist_add_uint64(nv, 1609 ZPOOL_CONFIG_IS_LOG, is_log) == 0); 1610 if (is_log) { 1611 verify(nvlist_add_string(nv, 1612 ZPOOL_CONFIG_ALLOCATION_BIAS, 1613 VDEV_ALLOC_BIAS_LOG) == 0); 1614 } 1615 if (is_special) { 1616 verify(nvlist_add_string(nv, 1617 ZPOOL_CONFIG_ALLOCATION_BIAS, 1618 VDEV_ALLOC_BIAS_SPECIAL) == 0); 1619 } 1620 if (is_dedup) { 1621 verify(nvlist_add_string(nv, 1622 ZPOOL_CONFIG_ALLOCATION_BIAS, 1623 VDEV_ALLOC_BIAS_DEDUP) == 0); 1624 } 1625 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 1626 verify(nvlist_add_uint64(nv, 1627 ZPOOL_CONFIG_NPARITY, 1628 mindev - 1) == 0); 1629 } 1630 if (strcmp(type, VDEV_TYPE_DRAID) == 0) { 1631 if (draid_config_by_type(nv, 1632 fulltype, children) != 0) { 1633 for (c = 0; c < children; c++) 1634 nvlist_free(child[c]); 1635 free(child); 1636 goto spec_out; 1637 } 1638 } 1639 verify(nvlist_add_nvlist_array(nv, 1640 ZPOOL_CONFIG_CHILDREN, child, 1641 children) == 0); 1642 1643 for (c = 0; c < children; c++) 1644 nvlist_free(child[c]); 1645 free(child); 1646 } 1647 } else { 1648 /* 1649 * We have a device. Pass off to make_leaf_vdev() to 1650 * construct the appropriate nvlist describing the vdev. 1651 */ 1652 if ((nv = make_leaf_vdev(props, argv[0], !(is_log || 1653 is_special || is_dedup || is_spare))) == NULL) 1654 goto spec_out; 1655 1656 verify(nvlist_add_uint64(nv, 1657 ZPOOL_CONFIG_IS_LOG, is_log) == 0); 1658 if (is_log) { 1659 verify(nvlist_add_string(nv, 1660 ZPOOL_CONFIG_ALLOCATION_BIAS, 1661 VDEV_ALLOC_BIAS_LOG) == 0); 1662 nlogs++; 1663 } 1664 1665 if (is_special) { 1666 verify(nvlist_add_string(nv, 1667 ZPOOL_CONFIG_ALLOCATION_BIAS, 1668 VDEV_ALLOC_BIAS_SPECIAL) == 0); 1669 } 1670 if (is_dedup) { 1671 verify(nvlist_add_string(nv, 1672 ZPOOL_CONFIG_ALLOCATION_BIAS, 1673 VDEV_ALLOC_BIAS_DEDUP) == 0); 1674 } 1675 argc--; 1676 argv++; 1677 } 1678 1679 toplevels++; 1680 top = realloc(top, toplevels * sizeof (nvlist_t *)); 1681 if (top == NULL) 1682 zpool_no_memory(); 1683 top[toplevels - 1] = nv; 1684 } 1685 1686 if (toplevels == 0 && nspares == 0 && nl2cache == 0) { 1687 (void) fprintf(stderr, gettext("invalid vdev " 1688 "specification: at least one toplevel vdev must be " 1689 "specified\n")); 1690 goto spec_out; 1691 } 1692 1693 if (seen_logs && nlogs == 0) { 1694 (void) fprintf(stderr, gettext("invalid vdev specification: " 1695 "log requires at least 1 device\n")); 1696 goto spec_out; 1697 } 1698 1699 /* 1700 * Finally, create nvroot and add all top-level vdevs to it. 1701 */ 1702 verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0); 1703 verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 1704 VDEV_TYPE_ROOT) == 0); 1705 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1706 top, toplevels) == 0); 1707 if (nspares != 0) 1708 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1709 spares, nspares) == 0); 1710 if (nl2cache != 0) 1711 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 1712 l2cache, nl2cache) == 0); 1713 1714 spec_out: 1715 for (t = 0; t < toplevels; t++) 1716 nvlist_free(top[t]); 1717 for (t = 0; t < nspares; t++) 1718 nvlist_free(spares[t]); 1719 for (t = 0; t < nl2cache; t++) 1720 nvlist_free(l2cache[t]); 1721 1722 free(spares); 1723 free(l2cache); 1724 free(top); 1725 1726 return (nvroot); 1727 } 1728 1729 nvlist_t * 1730 split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props, 1731 splitflags_t flags, int argc, char **argv) 1732 { 1733 nvlist_t *newroot = NULL, **child; 1734 uint_t c, children; 1735 1736 if (argc > 0) { 1737 if ((newroot = construct_spec(props, argc, argv)) == NULL) { 1738 (void) fprintf(stderr, gettext("Unable to build a " 1739 "pool from the specified devices\n")); 1740 return (NULL); 1741 } 1742 1743 if (!flags.dryrun && make_disks(zhp, newroot) != 0) { 1744 nvlist_free(newroot); 1745 return (NULL); 1746 } 1747 1748 /* avoid any tricks in the spec */ 1749 verify(nvlist_lookup_nvlist_array(newroot, 1750 ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); 1751 for (c = 0; c < children; c++) { 1752 char *path; 1753 const char *type; 1754 int min, max; 1755 1756 verify(nvlist_lookup_string(child[c], 1757 ZPOOL_CONFIG_PATH, &path) == 0); 1758 if ((type = is_grouping(path, &min, &max)) != NULL) { 1759 (void) fprintf(stderr, gettext("Cannot use " 1760 "'%s' as a device for splitting\n"), type); 1761 nvlist_free(newroot); 1762 return (NULL); 1763 } 1764 } 1765 } 1766 1767 if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) { 1768 nvlist_free(newroot); 1769 return (NULL); 1770 } 1771 1772 return (newroot); 1773 } 1774 1775 static int 1776 num_normal_vdevs(nvlist_t *nvroot) 1777 { 1778 nvlist_t **top; 1779 uint_t t, toplevels, normal = 0; 1780 1781 verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1782 &top, &toplevels) == 0); 1783 1784 for (t = 0; t < toplevels; t++) { 1785 uint64_t log = B_FALSE; 1786 1787 (void) nvlist_lookup_uint64(top[t], ZPOOL_CONFIG_IS_LOG, &log); 1788 if (log) 1789 continue; 1790 if (nvlist_exists(top[t], ZPOOL_CONFIG_ALLOCATION_BIAS)) 1791 continue; 1792 1793 normal++; 1794 } 1795 1796 return (normal); 1797 } 1798 1799 /* 1800 * Get and validate the contents of the given vdev specification. This ensures 1801 * that the nvlist returned is well-formed, that all the devices exist, and that 1802 * they are not currently in use by any other known consumer. The 'poolconfig' 1803 * parameter is the current configuration of the pool when adding devices 1804 * existing pool, and is used to perform additional checks, such as changing the 1805 * replication level of the pool. It can be 'NULL' to indicate that this is a 1806 * new pool. The 'force' flag controls whether devices should be forcefully 1807 * added, even if they appear in use. 1808 */ 1809 nvlist_t * 1810 make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep, 1811 boolean_t replacing, boolean_t dryrun, int argc, char **argv) 1812 { 1813 nvlist_t *newroot; 1814 nvlist_t *poolconfig = NULL; 1815 is_force = force; 1816 1817 /* 1818 * Construct the vdev specification. If this is successful, we know 1819 * that we have a valid specification, and that all devices can be 1820 * opened. 1821 */ 1822 if ((newroot = construct_spec(props, argc, argv)) == NULL) 1823 return (NULL); 1824 1825 if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL)) { 1826 nvlist_free(newroot); 1827 return (NULL); 1828 } 1829 1830 /* 1831 * Validate each device to make sure that it's not shared with another 1832 * subsystem. We do this even if 'force' is set, because there are some 1833 * uses (such as a dedicated dump device) that even '-f' cannot 1834 * override. 1835 */ 1836 if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) { 1837 nvlist_free(newroot); 1838 return (NULL); 1839 } 1840 1841 /* 1842 * Check the replication level of the given vdevs and report any errors 1843 * found. We include the existing pool spec, if any, as we need to 1844 * catch changes against the existing replication level. 1845 */ 1846 if (check_rep && check_replication(poolconfig, newroot) != 0) { 1847 nvlist_free(newroot); 1848 return (NULL); 1849 } 1850 1851 /* 1852 * On pool create the new vdev spec must have one normal vdev. 1853 */ 1854 if (poolconfig == NULL && num_normal_vdevs(newroot) == 0) { 1855 vdev_error(gettext("at least one general top-level vdev must " 1856 "be specified\n")); 1857 nvlist_free(newroot); 1858 return (NULL); 1859 } 1860 1861 /* 1862 * Run through the vdev specification and label any whole disks found. 1863 */ 1864 if (!dryrun && make_disks(zhp, newroot) != 0) { 1865 nvlist_free(newroot); 1866 return (NULL); 1867 } 1868 1869 return (newroot); 1870 } 1871