1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 23 /* 24 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 25 * Copyright (c) 2013, 2018 by Delphix. All rights reserved. 26 * Copyright (c) 2016, 2017 Intel Corporation. 27 * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>. 28 */ 29 30 /* 31 * Functions to convert between a list of vdevs and an nvlist representing the 32 * configuration. Each entry in the list can be one of: 33 * 34 * Device vdevs 35 * disk=(path=..., devid=...) 36 * file=(path=...) 37 * 38 * Group vdevs 39 * raidz[1|2]=(...) 40 * mirror=(...) 41 * 42 * Hot spares 43 * 44 * While the underlying implementation supports it, group vdevs cannot contain 45 * other group vdevs. All userland verification of devices is contained within 46 * this file. If successful, the nvlist returned can be passed directly to the 47 * kernel; we've done as much verification as possible in userland. 48 * 49 * Hot spares are a special case, and passed down as an array of disk vdevs, at 50 * the same level as the root of the vdev tree. 51 * 52 * The only function exported by this file is 'make_root_vdev'. The 53 * function performs several passes: 54 * 55 * 1. Construct the vdev specification. Performs syntax validation and 56 * makes sure each device is valid. 57 * 2. Check for devices in use. Using libblkid to make sure that no 58 * devices are also in use. Some can be overridden using the 'force' 59 * flag, others cannot. 60 * 3. Check for replication errors if the 'force' flag is not specified. 61 * validates that the replication level is consistent across the 62 * entire pool. 63 * 4. Call libzfs to label any whole disks with an EFI label. 64 */ 65 66 #include <assert.h> 67 #include <ctype.h> 68 #include <errno.h> 69 #include <fcntl.h> 70 #include <libintl.h> 71 #include <libnvpair.h> 72 #include <libzutil.h> 73 #include <limits.h> 74 #include <sys/spa.h> 75 #include <stdio.h> 76 #include <string.h> 77 #include <unistd.h> 78 #include "zpool_util.h" 79 #include <sys/zfs_context.h> 80 #include <sys/stat.h> 81 82 /* 83 * For any given vdev specification, we can have multiple errors. The 84 * vdev_error() function keeps track of whether we have seen an error yet, and 85 * prints out a header if its the first error we've seen. 86 */ 87 boolean_t error_seen; 88 boolean_t is_force; 89 90 void 91 vdev_error(const char *fmt, ...) 92 { 93 va_list ap; 94 95 if (!error_seen) { 96 (void) fprintf(stderr, gettext("invalid vdev specification\n")); 97 if (!is_force) 98 (void) fprintf(stderr, gettext("use '-f' to override " 99 "the following errors:\n")); 100 else 101 (void) fprintf(stderr, gettext("the following errors " 102 "must be manually repaired:\n")); 103 error_seen = B_TRUE; 104 } 105 106 va_start(ap, fmt); 107 (void) vfprintf(stderr, fmt, ap); 108 va_end(ap); 109 } 110 111 /* 112 * Check that a file is valid. All we can do in this case is check that it's 113 * not in use by another pool, and not in use by swap. 114 */ 115 int 116 check_file_generic(const char *file, boolean_t force, boolean_t isspare) 117 { 118 char *name; 119 int fd; 120 int ret = 0; 121 pool_state_t state; 122 boolean_t inuse; 123 124 if ((fd = open(file, O_RDONLY)) < 0) 125 return (0); 126 127 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) { 128 const char *desc; 129 130 switch (state) { 131 case POOL_STATE_ACTIVE: 132 desc = gettext("active"); 133 break; 134 135 case POOL_STATE_EXPORTED: 136 desc = gettext("exported"); 137 break; 138 139 case POOL_STATE_POTENTIALLY_ACTIVE: 140 desc = gettext("potentially active"); 141 break; 142 143 default: 144 desc = gettext("unknown"); 145 break; 146 } 147 148 /* 149 * Allow hot spares to be shared between pools. 150 */ 151 if (state == POOL_STATE_SPARE && isspare) { 152 free(name); 153 (void) close(fd); 154 return (0); 155 } 156 157 if (state == POOL_STATE_ACTIVE || 158 state == POOL_STATE_SPARE || !force) { 159 switch (state) { 160 case POOL_STATE_SPARE: 161 vdev_error(gettext("%s is reserved as a hot " 162 "spare for pool %s\n"), file, name); 163 break; 164 default: 165 vdev_error(gettext("%s is part of %s pool " 166 "'%s'\n"), file, desc, name); 167 break; 168 } 169 ret = -1; 170 } 171 172 free(name); 173 } 174 175 (void) close(fd); 176 return (ret); 177 } 178 179 /* 180 * This may be a shorthand device path or it could be total gibberish. 181 * Check to see if it is a known device available in zfs_vdev_paths. 182 * As part of this check, see if we've been given an entire disk 183 * (minus the slice number). 184 */ 185 static int 186 is_shorthand_path(const char *arg, char *path, size_t path_size, 187 struct stat64 *statbuf, boolean_t *wholedisk) 188 { 189 int error; 190 191 error = zfs_resolve_shortname(arg, path, path_size); 192 if (error == 0) { 193 *wholedisk = zfs_dev_is_whole_disk(path); 194 if (*wholedisk || (stat64(path, statbuf) == 0)) 195 return (0); 196 } 197 198 strlcpy(path, arg, path_size); 199 memset(statbuf, 0, sizeof (*statbuf)); 200 *wholedisk = B_FALSE; 201 202 return (error); 203 } 204 205 /* 206 * Determine if the given path is a hot spare within the given configuration. 207 * If no configuration is given we rely solely on the label. 208 */ 209 static boolean_t 210 is_spare(nvlist_t *config, const char *path) 211 { 212 int fd; 213 pool_state_t state; 214 char *name = NULL; 215 nvlist_t *label; 216 uint64_t guid, spareguid; 217 nvlist_t *nvroot; 218 nvlist_t **spares; 219 uint_t i, nspares; 220 boolean_t inuse; 221 222 if (zpool_is_draid_spare(path)) 223 return (B_TRUE); 224 225 if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0) 226 return (B_FALSE); 227 228 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 || 229 !inuse || 230 state != POOL_STATE_SPARE || 231 zpool_read_label(fd, &label, NULL) != 0) { 232 free(name); 233 (void) close(fd); 234 return (B_FALSE); 235 } 236 free(name); 237 (void) close(fd); 238 239 if (config == NULL) { 240 nvlist_free(label); 241 return (B_TRUE); 242 } 243 244 verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0); 245 nvlist_free(label); 246 247 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 248 &nvroot) == 0); 249 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 250 &spares, &nspares) == 0) { 251 for (i = 0; i < nspares; i++) { 252 verify(nvlist_lookup_uint64(spares[i], 253 ZPOOL_CONFIG_GUID, &spareguid) == 0); 254 if (spareguid == guid) 255 return (B_TRUE); 256 } 257 } 258 259 return (B_FALSE); 260 } 261 262 /* 263 * Create a leaf vdev. Determine if this is a file or a device. If it's a 264 * device, fill in the device id to make a complete nvlist. Valid forms for a 265 * leaf vdev are: 266 * 267 * /dev/xxx Complete disk path 268 * /xxx Full path to file 269 * xxx Shorthand for <zfs_vdev_paths>/xxx 270 * draid* Virtual dRAID spare 271 */ 272 static nvlist_t * 273 make_leaf_vdev(const char *arg, boolean_t is_primary, uint64_t ashift) 274 { 275 char path[MAXPATHLEN]; 276 struct stat64 statbuf; 277 nvlist_t *vdev = NULL; 278 const char *type = NULL; 279 boolean_t wholedisk = B_FALSE; 280 int err; 281 282 /* 283 * Determine what type of vdev this is, and put the full path into 284 * 'path'. We detect whether this is a device of file afterwards by 285 * checking the st_mode of the file. 286 */ 287 if (arg[0] == '/') { 288 /* 289 * Complete device or file path. Exact type is determined by 290 * examining the file descriptor afterwards. Symbolic links 291 * are resolved to their real paths to determine whole disk 292 * and S_ISBLK/S_ISREG type checks. However, we are careful 293 * to store the given path as ZPOOL_CONFIG_PATH to ensure we 294 * can leverage udev's persistent device labels. 295 */ 296 if (realpath(arg, path) == NULL) { 297 (void) fprintf(stderr, 298 gettext("cannot resolve path '%s'\n"), arg); 299 return (NULL); 300 } 301 302 wholedisk = zfs_dev_is_whole_disk(path); 303 if (!wholedisk && (stat64(path, &statbuf) != 0)) { 304 (void) fprintf(stderr, 305 gettext("cannot open '%s': %s\n"), 306 path, strerror(errno)); 307 return (NULL); 308 } 309 310 /* After whole disk check restore original passed path */ 311 strlcpy(path, arg, sizeof (path)); 312 } else if (zpool_is_draid_spare(arg)) { 313 if (!is_primary) { 314 (void) fprintf(stderr, 315 gettext("cannot open '%s': dRAID spares can only " 316 "be used to replace primary vdevs\n"), arg); 317 return (NULL); 318 } 319 320 wholedisk = B_TRUE; 321 strlcpy(path, arg, sizeof (path)); 322 type = VDEV_TYPE_DRAID_SPARE; 323 } else { 324 err = is_shorthand_path(arg, path, sizeof (path), 325 &statbuf, &wholedisk); 326 if (err != 0) { 327 /* 328 * If we got ENOENT, then the user gave us 329 * gibberish, so try to direct them with a 330 * reasonable error message. Otherwise, 331 * regurgitate strerror() since it's the best we 332 * can do. 333 */ 334 if (err == ENOENT) { 335 (void) fprintf(stderr, 336 gettext("cannot open '%s': no such " 337 "device in %s\n"), arg, DISK_ROOT); 338 (void) fprintf(stderr, 339 gettext("must be a full path or " 340 "shorthand device name\n")); 341 return (NULL); 342 } else { 343 (void) fprintf(stderr, 344 gettext("cannot open '%s': %s\n"), 345 path, strerror(errno)); 346 return (NULL); 347 } 348 } 349 } 350 351 if (type == NULL) { 352 /* 353 * Determine whether this is a device or a file. 354 */ 355 if (wholedisk || S_ISBLK(statbuf.st_mode)) { 356 type = VDEV_TYPE_DISK; 357 } else if (S_ISREG(statbuf.st_mode)) { 358 type = VDEV_TYPE_FILE; 359 } else { 360 fprintf(stderr, gettext("cannot use '%s': must " 361 "be a block device or regular file\n"), path); 362 return (NULL); 363 } 364 } 365 366 /* 367 * Finally, we have the complete device or file, and we know that it is 368 * acceptable to use. Construct the nvlist to describe this vdev. All 369 * vdevs have a 'path' element, and devices also have a 'devid' element. 370 */ 371 verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); 372 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); 373 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); 374 375 /* Lookup and add the enclosure sysfs path (if exists) */ 376 update_vdev_config_dev_sysfs_path(vdev, path, 377 ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); 378 379 if (strcmp(type, VDEV_TYPE_DISK) == 0) 380 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, 381 (uint64_t)wholedisk) == 0); 382 383 /* 384 * If the device is known to incorrectly report its physical sector 385 * size explicitly provide the known correct value. 386 */ 387 if (ashift == 0) { 388 int sector_size; 389 390 if (check_sector_size_database(path, §or_size) == B_TRUE) 391 ashift = highbit64(sector_size) - 1; 392 } 393 394 if (ashift > 0) 395 (void) nvlist_add_uint64(vdev, ZPOOL_CONFIG_ASHIFT, ashift); 396 397 return (vdev); 398 } 399 400 /* 401 * Go through and verify the replication level of the pool is consistent. 402 * Performs the following checks: 403 * 404 * For the new spec, verifies that devices in mirrors and raidz are the 405 * same size. 406 * 407 * If the current configuration already has inconsistent replication 408 * levels, ignore any other potential problems in the new spec. 409 * 410 * Otherwise, make sure that the current spec (if there is one) and the new 411 * spec have consistent replication levels. 412 * 413 * If there is no current spec (create), make sure new spec has at least 414 * one general purpose vdev. 415 */ 416 typedef struct replication_level { 417 const char *zprl_type; 418 uint64_t zprl_children; 419 uint64_t zprl_parity; 420 } replication_level_t; 421 422 #define ZPOOL_FUZZ (16 * 1024 * 1024) 423 424 /* 425 * N.B. For the purposes of comparing replication levels dRAID can be 426 * considered functionally equivalent to raidz. 427 */ 428 static boolean_t 429 is_raidz_mirror(replication_level_t *a, replication_level_t *b, 430 replication_level_t **raidz, replication_level_t **mirror) 431 { 432 if ((strcmp(a->zprl_type, "raidz") == 0 || 433 strcmp(a->zprl_type, "draid") == 0) && 434 strcmp(b->zprl_type, "mirror") == 0) { 435 *raidz = a; 436 *mirror = b; 437 return (B_TRUE); 438 } 439 return (B_FALSE); 440 } 441 442 /* 443 * Comparison for determining if dRAID and raidz where passed in either order. 444 */ 445 static boolean_t 446 is_raidz_draid(replication_level_t *a, replication_level_t *b) 447 { 448 if ((strcmp(a->zprl_type, "raidz") == 0 || 449 strcmp(a->zprl_type, "draid") == 0) && 450 (strcmp(b->zprl_type, "raidz") == 0 || 451 strcmp(b->zprl_type, "draid") == 0)) { 452 return (B_TRUE); 453 } 454 455 return (B_FALSE); 456 } 457 458 /* 459 * Given a list of toplevel vdevs, return the current replication level. If 460 * the config is inconsistent, then NULL is returned. If 'fatal' is set, then 461 * an error message will be displayed for each self-inconsistent vdev. 462 */ 463 static replication_level_t * 464 get_replication(nvlist_t *nvroot, boolean_t fatal) 465 { 466 nvlist_t **top; 467 uint_t t, toplevels; 468 nvlist_t **child; 469 uint_t c, children; 470 nvlist_t *nv; 471 const char *type; 472 replication_level_t lastrep = {0}; 473 replication_level_t rep; 474 replication_level_t *ret; 475 replication_level_t *raidz, *mirror; 476 boolean_t dontreport; 477 478 ret = safe_malloc(sizeof (replication_level_t)); 479 480 verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 481 &top, &toplevels) == 0); 482 483 for (t = 0; t < toplevels; t++) { 484 uint64_t is_log = B_FALSE; 485 486 nv = top[t]; 487 488 /* 489 * For separate logs we ignore the top level vdev replication 490 * constraints. 491 */ 492 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log); 493 if (is_log) 494 continue; 495 496 /* 497 * Ignore holes introduced by removing aux devices, along 498 * with indirect vdevs introduced by previously removed 499 * vdevs. 500 */ 501 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 502 if (strcmp(type, VDEV_TYPE_HOLE) == 0 || 503 strcmp(type, VDEV_TYPE_INDIRECT) == 0) 504 continue; 505 506 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 507 &child, &children) != 0) { 508 /* 509 * This is a 'file' or 'disk' vdev. 510 */ 511 rep.zprl_type = type; 512 rep.zprl_children = 1; 513 rep.zprl_parity = 0; 514 } else { 515 int64_t vdev_size; 516 517 /* 518 * This is a mirror or RAID-Z vdev. Go through and make 519 * sure the contents are all the same (files vs. disks), 520 * keeping track of the number of elements in the 521 * process. 522 * 523 * We also check that the size of each vdev (if it can 524 * be determined) is the same. 525 */ 526 rep.zprl_type = type; 527 rep.zprl_children = 0; 528 529 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 || 530 strcmp(type, VDEV_TYPE_DRAID) == 0) { 531 verify(nvlist_lookup_uint64(nv, 532 ZPOOL_CONFIG_NPARITY, 533 &rep.zprl_parity) == 0); 534 assert(rep.zprl_parity != 0); 535 } else { 536 rep.zprl_parity = 0; 537 } 538 539 /* 540 * The 'dontreport' variable indicates that we've 541 * already reported an error for this spec, so don't 542 * bother doing it again. 543 */ 544 type = NULL; 545 dontreport = 0; 546 vdev_size = -1LL; 547 for (c = 0; c < children; c++) { 548 nvlist_t *cnv = child[c]; 549 const char *path; 550 struct stat64 statbuf; 551 const char *childtype; 552 int fd, err; 553 554 rep.zprl_children++; 555 556 verify(nvlist_lookup_string(cnv, 557 ZPOOL_CONFIG_TYPE, &childtype) == 0); 558 559 /* 560 * If this is a replacing or spare vdev, then 561 * get the real first child of the vdev: do this 562 * in a loop because replacing and spare vdevs 563 * can be nested. 564 */ 565 while (strcmp(childtype, 566 VDEV_TYPE_REPLACING) == 0 || 567 strcmp(childtype, VDEV_TYPE_SPARE) == 0) { 568 nvlist_t **rchild; 569 uint_t rchildren; 570 571 verify(nvlist_lookup_nvlist_array(cnv, 572 ZPOOL_CONFIG_CHILDREN, &rchild, 573 &rchildren) == 0); 574 assert(rchildren == 2); 575 cnv = rchild[0]; 576 577 verify(nvlist_lookup_string(cnv, 578 ZPOOL_CONFIG_TYPE, 579 &childtype) == 0); 580 } 581 582 verify(nvlist_lookup_string(cnv, 583 ZPOOL_CONFIG_PATH, &path) == 0); 584 585 /* 586 * Skip active spares they should never cause 587 * the pool to be evaluated as inconsistent. 588 */ 589 if (is_spare(NULL, path)) 590 continue; 591 592 /* 593 * If we have a raidz/mirror that combines disks 594 * with files, only report it as an error when 595 * fatal is set to ensure all the replication 596 * checks aren't skipped in check_replication(). 597 */ 598 if (fatal && !dontreport && type != NULL && 599 strcmp(type, childtype) != 0) { 600 if (ret != NULL) 601 free(ret); 602 ret = NULL; 603 vdev_error(gettext( 604 "mismatched replication " 605 "level: %s contains both " 606 "files and devices\n"), 607 rep.zprl_type); 608 dontreport = B_TRUE; 609 } 610 611 /* 612 * According to stat(2), the value of 'st_size' 613 * is undefined for block devices and character 614 * devices. But there is no effective way to 615 * determine the real size in userland. 616 * 617 * Instead, we'll take advantage of an 618 * implementation detail of spec_size(). If the 619 * device is currently open, then we (should) 620 * return a valid size. 621 * 622 * If we still don't get a valid size (indicated 623 * by a size of 0 or MAXOFFSET_T), then ignore 624 * this device altogether. 625 */ 626 if ((fd = open(path, O_RDONLY)) >= 0) { 627 err = fstat64_blk(fd, &statbuf); 628 (void) close(fd); 629 } else { 630 err = stat64(path, &statbuf); 631 } 632 633 if (err != 0 || 634 statbuf.st_size == 0 || 635 statbuf.st_size == MAXOFFSET_T) 636 continue; 637 638 int64_t size = statbuf.st_size; 639 640 /* 641 * Also make sure that devices and 642 * slices have a consistent size. If 643 * they differ by a significant amount 644 * (~16MB) then report an error. 645 */ 646 if (!dontreport && 647 (vdev_size != -1LL && 648 (llabs(size - vdev_size) > 649 ZPOOL_FUZZ))) { 650 if (ret != NULL) 651 free(ret); 652 ret = NULL; 653 if (fatal) 654 vdev_error(gettext( 655 "%s contains devices of " 656 "different sizes\n"), 657 rep.zprl_type); 658 else 659 return (NULL); 660 dontreport = B_TRUE; 661 } 662 663 type = childtype; 664 vdev_size = size; 665 } 666 } 667 668 /* 669 * At this point, we have the replication of the last toplevel 670 * vdev in 'rep'. Compare it to 'lastrep' to see if it is 671 * different. 672 */ 673 if (lastrep.zprl_type != NULL) { 674 if (is_raidz_mirror(&lastrep, &rep, &raidz, &mirror) || 675 is_raidz_mirror(&rep, &lastrep, &raidz, &mirror)) { 676 /* 677 * Accepted raidz and mirror when they can 678 * handle the same number of disk failures. 679 */ 680 if (raidz->zprl_parity != 681 mirror->zprl_children - 1) { 682 if (ret != NULL) 683 free(ret); 684 ret = NULL; 685 if (fatal) 686 vdev_error(gettext( 687 "mismatched replication " 688 "level: " 689 "%s and %s vdevs with " 690 "different redundancy, " 691 "%llu vs. %llu (%llu-way) " 692 "are present\n"), 693 raidz->zprl_type, 694 mirror->zprl_type, 695 (u_longlong_t) 696 raidz->zprl_parity, 697 (u_longlong_t) 698 mirror->zprl_children - 1, 699 (u_longlong_t) 700 mirror->zprl_children); 701 else 702 return (NULL); 703 } 704 } else if (is_raidz_draid(&lastrep, &rep)) { 705 /* 706 * Accepted raidz and draid when they can 707 * handle the same number of disk failures. 708 */ 709 if (lastrep.zprl_parity != rep.zprl_parity) { 710 if (ret != NULL) 711 free(ret); 712 ret = NULL; 713 if (fatal) 714 vdev_error(gettext( 715 "mismatched replication " 716 "level: %s and %s vdevs " 717 "with different " 718 "redundancy, %llu vs. " 719 "%llu are present\n"), 720 lastrep.zprl_type, 721 rep.zprl_type, 722 (u_longlong_t) 723 lastrep.zprl_parity, 724 (u_longlong_t) 725 rep.zprl_parity); 726 else 727 return (NULL); 728 } 729 } else if (strcmp(lastrep.zprl_type, rep.zprl_type) != 730 0) { 731 if (ret != NULL) 732 free(ret); 733 ret = NULL; 734 if (fatal) 735 vdev_error(gettext( 736 "mismatched replication level: " 737 "both %s and %s vdevs are " 738 "present\n"), 739 lastrep.zprl_type, rep.zprl_type); 740 else 741 return (NULL); 742 } else if (lastrep.zprl_parity != rep.zprl_parity) { 743 if (ret) 744 free(ret); 745 ret = NULL; 746 if (fatal) 747 vdev_error(gettext( 748 "mismatched replication level: " 749 "both %llu and %llu device parity " 750 "%s vdevs are present\n"), 751 (u_longlong_t) 752 lastrep.zprl_parity, 753 (u_longlong_t)rep.zprl_parity, 754 rep.zprl_type); 755 else 756 return (NULL); 757 } else if (lastrep.zprl_children != rep.zprl_children) { 758 if (ret) 759 free(ret); 760 ret = NULL; 761 if (fatal) 762 vdev_error(gettext( 763 "mismatched replication level: " 764 "both %llu-way and %llu-way %s " 765 "vdevs are present\n"), 766 (u_longlong_t) 767 lastrep.zprl_children, 768 (u_longlong_t) 769 rep.zprl_children, 770 rep.zprl_type); 771 else 772 return (NULL); 773 } 774 } 775 lastrep = rep; 776 } 777 778 if (ret != NULL) 779 *ret = rep; 780 781 return (ret); 782 } 783 784 /* 785 * Check the replication level of the vdev spec against the current pool. Calls 786 * get_replication() to make sure the new spec is self-consistent. If the pool 787 * has a consistent replication level, then we ignore any errors. Otherwise, 788 * report any difference between the two. 789 */ 790 static int 791 check_replication(nvlist_t *config, nvlist_t *newroot) 792 { 793 nvlist_t **child; 794 uint_t children; 795 replication_level_t *current = NULL, *new; 796 replication_level_t *raidz, *mirror; 797 int ret; 798 799 /* 800 * If we have a current pool configuration, check to see if it's 801 * self-consistent. If not, simply return success. 802 */ 803 if (config != NULL) { 804 nvlist_t *nvroot; 805 806 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 807 &nvroot) == 0); 808 if ((current = get_replication(nvroot, B_FALSE)) == NULL) 809 return (0); 810 } 811 /* 812 * for spares there may be no children, and therefore no 813 * replication level to check 814 */ 815 if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN, 816 &child, &children) != 0) || (children == 0)) { 817 free(current); 818 return (0); 819 } 820 821 /* 822 * If all we have is logs then there's no replication level to check. 823 */ 824 if (num_logs(newroot) == children) { 825 free(current); 826 return (0); 827 } 828 829 /* 830 * Get the replication level of the new vdev spec, reporting any 831 * inconsistencies found. 832 */ 833 if ((new = get_replication(newroot, B_TRUE)) == NULL) { 834 free(current); 835 return (-1); 836 } 837 838 /* 839 * Check to see if the new vdev spec matches the replication level of 840 * the current pool. 841 */ 842 ret = 0; 843 if (current != NULL) { 844 if (is_raidz_mirror(current, new, &raidz, &mirror) || 845 is_raidz_mirror(new, current, &raidz, &mirror)) { 846 if (raidz->zprl_parity != mirror->zprl_children - 1) { 847 vdev_error(gettext( 848 "mismatched replication level: pool and " 849 "new vdev with different redundancy, %s " 850 "and %s vdevs, %llu vs. %llu (%llu-way)\n"), 851 raidz->zprl_type, 852 mirror->zprl_type, 853 (u_longlong_t)raidz->zprl_parity, 854 (u_longlong_t)mirror->zprl_children - 1, 855 (u_longlong_t)mirror->zprl_children); 856 ret = -1; 857 } 858 } else if (is_raidz_draid(current, new)) { 859 if (current->zprl_parity != new->zprl_parity) { 860 vdev_error(gettext( 861 "mismatched replication level: pool and " 862 "new vdev with different redundancy, %s " 863 "and %s vdevs, %llu vs. %llu\n"), 864 current->zprl_type, 865 new->zprl_type, 866 (u_longlong_t)current->zprl_parity, 867 (u_longlong_t)new->zprl_parity); 868 ret = -1; 869 } 870 } else if (strcmp(current->zprl_type, new->zprl_type) != 0) { 871 vdev_error(gettext( 872 "mismatched replication level: pool uses %s " 873 "and new vdev is %s\n"), 874 current->zprl_type, new->zprl_type); 875 ret = -1; 876 } else if (current->zprl_parity != new->zprl_parity) { 877 vdev_error(gettext( 878 "mismatched replication level: pool uses %llu " 879 "device parity and new vdev uses %llu\n"), 880 (u_longlong_t)current->zprl_parity, 881 (u_longlong_t)new->zprl_parity); 882 ret = -1; 883 } else if (current->zprl_children != new->zprl_children) { 884 vdev_error(gettext( 885 "mismatched replication level: pool uses %llu-way " 886 "%s and new vdev uses %llu-way %s\n"), 887 (u_longlong_t)current->zprl_children, 888 current->zprl_type, 889 (u_longlong_t)new->zprl_children, 890 new->zprl_type); 891 ret = -1; 892 } 893 } 894 895 free(new); 896 if (current != NULL) 897 free(current); 898 899 return (ret); 900 } 901 902 static int 903 zero_label(const char *path) 904 { 905 const int size = 4096; 906 char buf[size]; 907 int err, fd; 908 909 if ((fd = open(path, O_WRONLY|O_EXCL)) < 0) { 910 (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), 911 path, strerror(errno)); 912 return (-1); 913 } 914 915 memset(buf, 0, size); 916 err = write(fd, buf, size); 917 (void) fdatasync(fd); 918 (void) close(fd); 919 920 if (err == -1) { 921 (void) fprintf(stderr, gettext("cannot zero first %d bytes " 922 "of '%s': %s\n"), size, path, strerror(errno)); 923 return (-1); 924 } 925 926 if (err != size) { 927 (void) fprintf(stderr, gettext("could only zero %d/%d bytes " 928 "of '%s'\n"), err, size, path); 929 return (-1); 930 } 931 932 return (0); 933 } 934 935 static void 936 lines_to_stderr(char *lines[], int lines_cnt) 937 { 938 int i; 939 for (i = 0; i < lines_cnt; i++) { 940 fprintf(stderr, "%s\n", lines[i]); 941 } 942 } 943 944 /* 945 * Go through and find any whole disks in the vdev specification, labelling them 946 * as appropriate. When constructing the vdev spec, we were unable to open this 947 * device in order to provide a devid. Now that we have labelled the disk and 948 * know that slice 0 is valid, we can construct the devid now. 949 * 950 * If the disk was already labeled with an EFI label, we will have gotten the 951 * devid already (because we were able to open the whole disk). Otherwise, we 952 * need to get the devid after we label the disk. 953 */ 954 static int 955 make_disks(zpool_handle_t *zhp, nvlist_t *nv, boolean_t replacing) 956 { 957 nvlist_t **child; 958 uint_t c, children; 959 const char *type, *path; 960 char devpath[MAXPATHLEN]; 961 char udevpath[MAXPATHLEN]; 962 uint64_t wholedisk; 963 struct stat64 statbuf; 964 int is_exclusive = 0; 965 int fd; 966 int ret; 967 968 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 969 970 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 971 &child, &children) != 0) { 972 973 if (strcmp(type, VDEV_TYPE_DISK) != 0) 974 return (0); 975 976 /* 977 * We have a disk device. If this is a whole disk write 978 * out the efi partition table, otherwise write zero's to 979 * the first 4k of the partition. This is to ensure that 980 * libblkid will not misidentify the partition due to a 981 * magic value left by the previous filesystem. 982 */ 983 verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path)); 984 verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 985 &wholedisk)); 986 987 if (!wholedisk) { 988 /* 989 * Update device id string for mpath nodes (Linux only) 990 */ 991 if (is_mpath_whole_disk(path)) 992 update_vdev_config_dev_strs(nv); 993 994 if (!is_spare(NULL, path)) 995 (void) zero_label(path); 996 return (0); 997 } 998 999 if (realpath(path, devpath) == NULL) { 1000 ret = errno; 1001 (void) fprintf(stderr, 1002 gettext("cannot resolve path '%s'\n"), path); 1003 return (ret); 1004 } 1005 1006 /* 1007 * Remove any previously existing symlink from a udev path to 1008 * the device before labeling the disk. This ensures that 1009 * only newly created links are used. Otherwise there is a 1010 * window between when udev deletes and recreates the link 1011 * during which access attempts will fail with ENOENT. 1012 */ 1013 strlcpy(udevpath, path, MAXPATHLEN); 1014 (void) zfs_append_partition(udevpath, MAXPATHLEN); 1015 1016 fd = open(devpath, O_RDWR|O_EXCL); 1017 if (fd == -1) { 1018 if (errno == EBUSY) 1019 is_exclusive = 1; 1020 #ifdef __FreeBSD__ 1021 if (errno == EPERM) 1022 is_exclusive = 1; 1023 #endif 1024 } else { 1025 (void) close(fd); 1026 } 1027 1028 /* 1029 * If the partition exists, contains a valid spare label, 1030 * and is opened exclusively there is no need to partition 1031 * it. Hot spares have already been partitioned and are 1032 * held open exclusively by the kernel as a safety measure. 1033 * 1034 * If the provided path is for a /dev/disk/ device its 1035 * symbolic link will be removed, partition table created, 1036 * and then block until udev creates the new link. 1037 */ 1038 if (!is_exclusive && !is_spare(NULL, udevpath)) { 1039 char *devnode = strrchr(devpath, '/') + 1; 1040 char **lines = NULL; 1041 int lines_cnt = 0; 1042 1043 ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT)); 1044 if (ret == 0) { 1045 ret = lstat64(udevpath, &statbuf); 1046 if (ret == 0 && S_ISLNK(statbuf.st_mode)) 1047 (void) unlink(udevpath); 1048 } 1049 1050 /* 1051 * When labeling a pool the raw device node name 1052 * is provided as it appears under /dev/. 1053 * 1054 * Note that 'zhp' will be NULL when we're creating a 1055 * pool. 1056 */ 1057 if (zpool_prepare_and_label_disk(g_zfs, zhp, devnode, 1058 nv, zhp == NULL ? "create" : 1059 replacing ? "replace" : "add", &lines, 1060 &lines_cnt) != 0) { 1061 (void) fprintf(stderr, 1062 gettext( 1063 "Error preparing/labeling disk.\n")); 1064 if (lines_cnt > 0) { 1065 (void) fprintf(stderr, 1066 gettext("zfs_prepare_disk output:\n")); 1067 lines_to_stderr(lines, lines_cnt); 1068 } 1069 1070 libzfs_free_str_array(lines, lines_cnt); 1071 return (-1); 1072 } 1073 libzfs_free_str_array(lines, lines_cnt); 1074 1075 /* 1076 * Wait for udev to signal the device is available 1077 * by the provided path. 1078 */ 1079 ret = zpool_label_disk_wait(udevpath, DISK_LABEL_WAIT); 1080 if (ret) { 1081 (void) fprintf(stderr, 1082 gettext("missing link: %s was " 1083 "partitioned but %s is missing\n"), 1084 devnode, udevpath); 1085 return (ret); 1086 } 1087 1088 ret = zero_label(udevpath); 1089 if (ret) 1090 return (ret); 1091 } 1092 1093 /* 1094 * Update the path to refer to the partition. The presence of 1095 * the 'whole_disk' field indicates to the CLI that we should 1096 * chop off the partition number when displaying the device in 1097 * future output. 1098 */ 1099 verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, udevpath) == 0); 1100 1101 /* 1102 * Update device id strings for whole disks (Linux only) 1103 */ 1104 update_vdev_config_dev_strs(nv); 1105 1106 return (0); 1107 } 1108 1109 for (c = 0; c < children; c++) 1110 if ((ret = make_disks(zhp, child[c], replacing)) != 0) 1111 return (ret); 1112 1113 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, 1114 &child, &children) == 0) 1115 for (c = 0; c < children; c++) 1116 if ((ret = make_disks(zhp, child[c], replacing)) != 0) 1117 return (ret); 1118 1119 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, 1120 &child, &children) == 0) 1121 for (c = 0; c < children; c++) 1122 if ((ret = make_disks(zhp, child[c], replacing)) != 0) 1123 return (ret); 1124 1125 return (0); 1126 } 1127 1128 /* 1129 * Go through and find any devices that are in use. We rely on libdiskmgt for 1130 * the majority of this task. 1131 */ 1132 static boolean_t 1133 is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, 1134 boolean_t replacing, boolean_t isspare) 1135 { 1136 nvlist_t **child; 1137 uint_t c, children; 1138 const char *type, *path; 1139 int ret = 0; 1140 char buf[MAXPATHLEN]; 1141 uint64_t wholedisk = B_FALSE; 1142 boolean_t anyinuse = B_FALSE; 1143 1144 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 1145 1146 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1147 &child, &children) != 0) { 1148 1149 verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path)); 1150 if (strcmp(type, VDEV_TYPE_DISK) == 0) 1151 verify(!nvlist_lookup_uint64(nv, 1152 ZPOOL_CONFIG_WHOLE_DISK, &wholedisk)); 1153 1154 /* 1155 * As a generic check, we look to see if this is a replace of a 1156 * hot spare within the same pool. If so, we allow it 1157 * regardless of what libblkid or zpool_in_use() says. 1158 */ 1159 if (replacing) { 1160 (void) strlcpy(buf, path, sizeof (buf)); 1161 if (wholedisk) { 1162 ret = zfs_append_partition(buf, sizeof (buf)); 1163 if (ret == -1) 1164 return (-1); 1165 } 1166 1167 if (is_spare(config, buf)) 1168 return (B_FALSE); 1169 } 1170 1171 if (strcmp(type, VDEV_TYPE_DISK) == 0) 1172 ret = check_device(path, force, isspare, wholedisk); 1173 1174 else if (strcmp(type, VDEV_TYPE_FILE) == 0) 1175 ret = check_file(path, force, isspare); 1176 1177 return (ret != 0); 1178 } 1179 1180 for (c = 0; c < children; c++) 1181 if (is_device_in_use(config, child[c], force, replacing, 1182 B_FALSE)) 1183 anyinuse = B_TRUE; 1184 1185 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, 1186 &child, &children) == 0) 1187 for (c = 0; c < children; c++) 1188 if (is_device_in_use(config, child[c], force, replacing, 1189 B_TRUE)) 1190 anyinuse = B_TRUE; 1191 1192 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, 1193 &child, &children) == 0) 1194 for (c = 0; c < children; c++) 1195 if (is_device_in_use(config, child[c], force, replacing, 1196 B_FALSE)) 1197 anyinuse = B_TRUE; 1198 1199 return (anyinuse); 1200 } 1201 1202 /* 1203 * Returns the parity level extracted from a raidz or draid type. 1204 * If the parity cannot be determined zero is returned. 1205 */ 1206 static int 1207 get_parity(const char *type) 1208 { 1209 long parity = 0; 1210 const char *p; 1211 1212 if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0) { 1213 p = type + strlen(VDEV_TYPE_RAIDZ); 1214 1215 if (*p == '\0') { 1216 /* when unspecified default to single parity */ 1217 return (1); 1218 } else if (*p == '0') { 1219 /* no zero prefixes allowed */ 1220 return (0); 1221 } else { 1222 /* 0-3, no suffixes allowed */ 1223 char *end; 1224 errno = 0; 1225 parity = strtol(p, &end, 10); 1226 if (errno != 0 || *end != '\0' || 1227 parity < 1 || parity > VDEV_RAIDZ_MAXPARITY) { 1228 return (0); 1229 } 1230 } 1231 } else if (strncmp(type, VDEV_TYPE_DRAID, 1232 strlen(VDEV_TYPE_DRAID)) == 0) { 1233 p = type + strlen(VDEV_TYPE_DRAID); 1234 1235 if (*p == '\0' || *p == ':') { 1236 /* when unspecified default to single parity */ 1237 return (1); 1238 } else if (*p == '0') { 1239 /* no zero prefixes allowed */ 1240 return (0); 1241 } else { 1242 /* 0-3, allowed suffixes: '\0' or ':' */ 1243 char *end; 1244 errno = 0; 1245 parity = strtol(p, &end, 10); 1246 if (errno != 0 || 1247 parity < 1 || parity > VDEV_DRAID_MAXPARITY || 1248 (*end != '\0' && *end != ':')) { 1249 return (0); 1250 } 1251 } 1252 } 1253 1254 return ((int)parity); 1255 } 1256 1257 /* 1258 * Assign the minimum and maximum number of devices allowed for 1259 * the specified type. On error NULL is returned, otherwise the 1260 * type prefix is returned (raidz, mirror, etc). 1261 */ 1262 static const char * 1263 is_grouping(const char *type, int *mindev, int *maxdev) 1264 { 1265 int nparity; 1266 1267 if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 || 1268 strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0) { 1269 nparity = get_parity(type); 1270 if (nparity == 0) 1271 return (NULL); 1272 if (mindev != NULL) 1273 *mindev = nparity + 1; 1274 if (maxdev != NULL) 1275 *maxdev = 255; 1276 1277 if (strncmp(type, VDEV_TYPE_RAIDZ, 1278 strlen(VDEV_TYPE_RAIDZ)) == 0) { 1279 return (VDEV_TYPE_RAIDZ); 1280 } else { 1281 return (VDEV_TYPE_DRAID); 1282 } 1283 } 1284 1285 if (maxdev != NULL) 1286 *maxdev = INT_MAX; 1287 1288 if (strcmp(type, "mirror") == 0) { 1289 if (mindev != NULL) 1290 *mindev = 2; 1291 return (VDEV_TYPE_MIRROR); 1292 } 1293 1294 if (strcmp(type, "spare") == 0) { 1295 if (mindev != NULL) 1296 *mindev = 1; 1297 return (VDEV_TYPE_SPARE); 1298 } 1299 1300 if (strcmp(type, "log") == 0) { 1301 if (mindev != NULL) 1302 *mindev = 1; 1303 return (VDEV_TYPE_LOG); 1304 } 1305 1306 if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0 || 1307 strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { 1308 if (mindev != NULL) 1309 *mindev = 1; 1310 return (type); 1311 } 1312 1313 if (strcmp(type, "cache") == 0) { 1314 if (mindev != NULL) 1315 *mindev = 1; 1316 return (VDEV_TYPE_L2CACHE); 1317 } 1318 1319 return (NULL); 1320 } 1321 1322 /* 1323 * Extract the configuration parameters encoded in the dRAID type and 1324 * use them to generate a dRAID configuration. The expected format is: 1325 * 1326 * draid[<parity>][:<data><d|D>][:<children><c|C>][:<spares><s|S>] 1327 * 1328 * The intent is to be able to generate a good configuration when no 1329 * additional information is provided. The only mandatory component 1330 * of the 'type' is the 'draid' prefix. If a value is not provided 1331 * then reasonable defaults are used. The optional components may 1332 * appear in any order but the d/s/c suffix is required. 1333 * 1334 * Valid inputs: 1335 * - data: number of data devices per group (1-255) 1336 * - parity: number of parity blocks per group (1-3) 1337 * - spares: number of distributed spare (0-100) 1338 * - children: total number of devices (1-255) 1339 * 1340 * Examples: 1341 * - zpool create tank draid <devices...> 1342 * - zpool create tank draid2:8d:51c:2s <devices...> 1343 */ 1344 static int 1345 draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children) 1346 { 1347 uint64_t nparity; 1348 uint64_t nspares = 0; 1349 uint64_t ndata = UINT64_MAX; 1350 uint64_t ngroups = 1; 1351 long value; 1352 1353 if (strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) != 0) 1354 return (EINVAL); 1355 1356 nparity = (uint64_t)get_parity(type); 1357 if (nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) { 1358 fprintf(stderr, 1359 gettext("invalid dRAID parity level %llu; must be " 1360 "between 1 and %d\n"), (u_longlong_t)nparity, 1361 VDEV_DRAID_MAXPARITY); 1362 return (EINVAL); 1363 } 1364 1365 char *p = (char *)type; 1366 while ((p = strchr(p, ':')) != NULL) { 1367 char *end; 1368 1369 p = p + 1; 1370 errno = 0; 1371 1372 if (!isdigit(p[0])) { 1373 (void) fprintf(stderr, gettext("invalid dRAID " 1374 "syntax; expected [:<number><c|d|s>] not '%s'\n"), 1375 type); 1376 return (EINVAL); 1377 } 1378 1379 /* Expected non-zero value with c/d/s suffix */ 1380 value = strtol(p, &end, 10); 1381 char suffix = tolower(*end); 1382 if (errno != 0 || 1383 (suffix != 'c' && suffix != 'd' && suffix != 's')) { 1384 (void) fprintf(stderr, gettext("invalid dRAID " 1385 "syntax; expected [:<number><c|d|s>] not '%s'\n"), 1386 type); 1387 return (EINVAL); 1388 } 1389 1390 if (suffix == 'c') { 1391 if ((uint64_t)value != children) { 1392 fprintf(stderr, 1393 gettext("invalid number of dRAID children; " 1394 "%llu required but %llu provided\n"), 1395 (u_longlong_t)value, 1396 (u_longlong_t)children); 1397 return (EINVAL); 1398 } 1399 } else if (suffix == 'd') { 1400 ndata = (uint64_t)value; 1401 } else if (suffix == 's') { 1402 nspares = (uint64_t)value; 1403 } else { 1404 verify(0); /* Unreachable */ 1405 } 1406 } 1407 1408 /* 1409 * When a specific number of data disks is not provided limit a 1410 * redundancy group to 8 data disks. This value was selected to 1411 * provide a reasonable tradeoff between capacity and performance. 1412 */ 1413 if (ndata == UINT64_MAX) { 1414 if (children > nspares + nparity) { 1415 ndata = MIN(children - nspares - nparity, 8); 1416 } else { 1417 fprintf(stderr, gettext("request number of " 1418 "distributed spares %llu and parity level %llu\n" 1419 "leaves no disks available for data\n"), 1420 (u_longlong_t)nspares, (u_longlong_t)nparity); 1421 return (EINVAL); 1422 } 1423 } 1424 1425 /* Verify the maximum allowed group size is never exceeded. */ 1426 if (ndata == 0 || (ndata + nparity > children - nspares)) { 1427 fprintf(stderr, gettext("requested number of dRAID data " 1428 "disks per group %llu is too high,\nat most %llu disks " 1429 "are available for data\n"), (u_longlong_t)ndata, 1430 (u_longlong_t)(children - nspares - nparity)); 1431 return (EINVAL); 1432 } 1433 1434 /* 1435 * Verify the requested number of spares can be satisfied. 1436 * An arbitrary limit of 100 distributed spares is applied. 1437 */ 1438 if (nspares > 100 || nspares > (children - (ndata + nparity))) { 1439 fprintf(stderr, 1440 gettext("invalid number of dRAID spares %llu; additional " 1441 "disks would be required\n"), (u_longlong_t)nspares); 1442 return (EINVAL); 1443 } 1444 1445 /* Verify the requested number children is sufficient. */ 1446 if (children < (ndata + nparity + nspares)) { 1447 fprintf(stderr, gettext("%llu disks were provided, but at " 1448 "least %llu disks are required for this config\n"), 1449 (u_longlong_t)children, 1450 (u_longlong_t)(ndata + nparity + nspares)); 1451 } 1452 1453 if (children > VDEV_DRAID_MAX_CHILDREN) { 1454 fprintf(stderr, gettext("%llu disks were provided, but " 1455 "dRAID only supports up to %u disks"), 1456 (u_longlong_t)children, VDEV_DRAID_MAX_CHILDREN); 1457 } 1458 1459 /* 1460 * Calculate the minimum number of groups required to fill a slice. 1461 * This is the LCM of the stripe width (ndata + nparity) and the 1462 * number of data drives (children - nspares). 1463 */ 1464 while (ngroups * (ndata + nparity) % (children - nspares) != 0) 1465 ngroups++; 1466 1467 /* Store the basic dRAID configuration. */ 1468 fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity); 1469 fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, ndata); 1470 fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, nspares); 1471 fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); 1472 1473 return (0); 1474 } 1475 1476 /* 1477 * Construct a syntactically valid vdev specification, 1478 * and ensure that all devices and files exist and can be opened. 1479 * Note: we don't bother freeing anything in the error paths 1480 * because the program is just going to exit anyway. 1481 */ 1482 static nvlist_t * 1483 construct_spec(nvlist_t *props, int argc, char **argv) 1484 { 1485 nvlist_t *nvroot, *nv, **top, **spares, **l2cache; 1486 int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache; 1487 const char *type, *fulltype; 1488 boolean_t is_log, is_special, is_dedup, is_spare; 1489 boolean_t seen_logs; 1490 uint64_t ashift = 0; 1491 1492 if (props != NULL) { 1493 const char *value = NULL; 1494 1495 if (nvlist_lookup_string(props, 1496 zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) { 1497 if (zfs_nicestrtonum(NULL, value, &ashift) != 0) { 1498 (void) fprintf(stderr, 1499 gettext("ashift must be a number.\n")); 1500 return (NULL); 1501 } 1502 if (ashift != 0 && 1503 (ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) { 1504 (void) fprintf(stderr, 1505 gettext("invalid 'ashift=%" PRIu64 "' " 1506 "property: only values between %" PRId32 " " 1507 "and %" PRId32 " are allowed.\n"), 1508 ashift, ASHIFT_MIN, ASHIFT_MAX); 1509 return (NULL); 1510 } 1511 } 1512 } 1513 1514 top = NULL; 1515 toplevels = 0; 1516 spares = NULL; 1517 l2cache = NULL; 1518 nspares = 0; 1519 nlogs = 0; 1520 nl2cache = 0; 1521 is_log = is_special = is_dedup = is_spare = B_FALSE; 1522 seen_logs = B_FALSE; 1523 nvroot = NULL; 1524 1525 while (argc > 0) { 1526 fulltype = argv[0]; 1527 nv = NULL; 1528 1529 /* 1530 * If it's a mirror, raidz, or draid the subsequent arguments 1531 * are its leaves -- until we encounter the next mirror, 1532 * raidz or draid. 1533 */ 1534 if ((type = is_grouping(fulltype, &mindev, &maxdev)) != NULL) { 1535 nvlist_t **child = NULL; 1536 int c, children = 0; 1537 1538 if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1539 if (spares != NULL) { 1540 (void) fprintf(stderr, 1541 gettext("invalid vdev " 1542 "specification: 'spare' can be " 1543 "specified only once\n")); 1544 goto spec_out; 1545 } 1546 is_spare = B_TRUE; 1547 is_log = is_special = is_dedup = B_FALSE; 1548 } 1549 1550 if (strcmp(type, VDEV_TYPE_LOG) == 0) { 1551 if (seen_logs) { 1552 (void) fprintf(stderr, 1553 gettext("invalid vdev " 1554 "specification: 'log' can be " 1555 "specified only once\n")); 1556 goto spec_out; 1557 } 1558 seen_logs = B_TRUE; 1559 is_log = B_TRUE; 1560 is_special = is_dedup = is_spare = B_FALSE; 1561 argc--; 1562 argv++; 1563 /* 1564 * A log is not a real grouping device. 1565 * We just set is_log and continue. 1566 */ 1567 continue; 1568 } 1569 1570 if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) { 1571 is_special = B_TRUE; 1572 is_log = is_dedup = is_spare = B_FALSE; 1573 argc--; 1574 argv++; 1575 continue; 1576 } 1577 1578 if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { 1579 is_dedup = B_TRUE; 1580 is_log = is_special = is_spare = B_FALSE; 1581 argc--; 1582 argv++; 1583 continue; 1584 } 1585 1586 if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { 1587 if (l2cache != NULL) { 1588 (void) fprintf(stderr, 1589 gettext("invalid vdev " 1590 "specification: 'cache' can be " 1591 "specified only once\n")); 1592 goto spec_out; 1593 } 1594 is_log = is_special = B_FALSE; 1595 is_dedup = is_spare = B_FALSE; 1596 } 1597 1598 if (is_log) { 1599 if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { 1600 (void) fprintf(stderr, 1601 gettext("invalid vdev " 1602 "specification: unsupported 'log' " 1603 "device: %s\n"), type); 1604 goto spec_out; 1605 } 1606 nlogs++; 1607 } 1608 1609 for (c = 1; c < argc; c++) { 1610 if (is_grouping(argv[c], NULL, NULL) != NULL) 1611 break; 1612 1613 children++; 1614 child = realloc(child, 1615 children * sizeof (nvlist_t *)); 1616 if (child == NULL) 1617 zpool_no_memory(); 1618 if ((nv = make_leaf_vdev(argv[c], 1619 !(is_log || is_special || is_dedup || 1620 is_spare), ashift)) == NULL) { 1621 for (c = 0; c < children - 1; c++) 1622 nvlist_free(child[c]); 1623 free(child); 1624 goto spec_out; 1625 } 1626 1627 child[children - 1] = nv; 1628 } 1629 1630 if (children < mindev) { 1631 (void) fprintf(stderr, gettext("invalid vdev " 1632 "specification: %s requires at least %d " 1633 "devices\n"), argv[0], mindev); 1634 for (c = 0; c < children; c++) 1635 nvlist_free(child[c]); 1636 free(child); 1637 goto spec_out; 1638 } 1639 1640 if (children > maxdev) { 1641 (void) fprintf(stderr, gettext("invalid vdev " 1642 "specification: %s supports no more than " 1643 "%d devices\n"), argv[0], maxdev); 1644 for (c = 0; c < children; c++) 1645 nvlist_free(child[c]); 1646 free(child); 1647 goto spec_out; 1648 } 1649 1650 argc -= c; 1651 argv += c; 1652 1653 if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1654 spares = child; 1655 nspares = children; 1656 continue; 1657 } else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { 1658 l2cache = child; 1659 nl2cache = children; 1660 continue; 1661 } else { 1662 /* create a top-level vdev with children */ 1663 verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, 1664 0) == 0); 1665 verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, 1666 type) == 0); 1667 verify(nvlist_add_uint64(nv, 1668 ZPOOL_CONFIG_IS_LOG, is_log) == 0); 1669 if (is_log) { 1670 verify(nvlist_add_string(nv, 1671 ZPOOL_CONFIG_ALLOCATION_BIAS, 1672 VDEV_ALLOC_BIAS_LOG) == 0); 1673 } 1674 if (is_special) { 1675 verify(nvlist_add_string(nv, 1676 ZPOOL_CONFIG_ALLOCATION_BIAS, 1677 VDEV_ALLOC_BIAS_SPECIAL) == 0); 1678 } 1679 if (is_dedup) { 1680 verify(nvlist_add_string(nv, 1681 ZPOOL_CONFIG_ALLOCATION_BIAS, 1682 VDEV_ALLOC_BIAS_DEDUP) == 0); 1683 } 1684 if (ashift > 0) { 1685 fnvlist_add_uint64(nv, 1686 ZPOOL_CONFIG_ASHIFT, ashift); 1687 } 1688 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 1689 verify(nvlist_add_uint64(nv, 1690 ZPOOL_CONFIG_NPARITY, 1691 mindev - 1) == 0); 1692 } 1693 if (strcmp(type, VDEV_TYPE_DRAID) == 0) { 1694 if (draid_config_by_type(nv, 1695 fulltype, children) != 0) { 1696 for (c = 0; c < children; c++) 1697 nvlist_free(child[c]); 1698 free(child); 1699 goto spec_out; 1700 } 1701 } 1702 verify(nvlist_add_nvlist_array(nv, 1703 ZPOOL_CONFIG_CHILDREN, 1704 (const nvlist_t **)child, children) == 0); 1705 1706 for (c = 0; c < children; c++) 1707 nvlist_free(child[c]); 1708 free(child); 1709 } 1710 } else { 1711 /* 1712 * We have a device. Pass off to make_leaf_vdev() to 1713 * construct the appropriate nvlist describing the vdev. 1714 */ 1715 if ((nv = make_leaf_vdev(argv[0], !(is_log || 1716 is_special || is_dedup || is_spare), 1717 ashift)) == NULL) 1718 goto spec_out; 1719 1720 verify(nvlist_add_uint64(nv, 1721 ZPOOL_CONFIG_IS_LOG, is_log) == 0); 1722 if (is_log) { 1723 verify(nvlist_add_string(nv, 1724 ZPOOL_CONFIG_ALLOCATION_BIAS, 1725 VDEV_ALLOC_BIAS_LOG) == 0); 1726 nlogs++; 1727 } 1728 1729 if (is_special) { 1730 verify(nvlist_add_string(nv, 1731 ZPOOL_CONFIG_ALLOCATION_BIAS, 1732 VDEV_ALLOC_BIAS_SPECIAL) == 0); 1733 } 1734 if (is_dedup) { 1735 verify(nvlist_add_string(nv, 1736 ZPOOL_CONFIG_ALLOCATION_BIAS, 1737 VDEV_ALLOC_BIAS_DEDUP) == 0); 1738 } 1739 argc--; 1740 argv++; 1741 } 1742 1743 toplevels++; 1744 top = realloc(top, toplevels * sizeof (nvlist_t *)); 1745 if (top == NULL) 1746 zpool_no_memory(); 1747 top[toplevels - 1] = nv; 1748 } 1749 1750 if (toplevels == 0 && nspares == 0 && nl2cache == 0) { 1751 (void) fprintf(stderr, gettext("invalid vdev " 1752 "specification: at least one toplevel vdev must be " 1753 "specified\n")); 1754 goto spec_out; 1755 } 1756 1757 if (seen_logs && nlogs == 0) { 1758 (void) fprintf(stderr, gettext("invalid vdev specification: " 1759 "log requires at least 1 device\n")); 1760 goto spec_out; 1761 } 1762 1763 /* 1764 * Finally, create nvroot and add all top-level vdevs to it. 1765 */ 1766 verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0); 1767 verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 1768 VDEV_TYPE_ROOT) == 0); 1769 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1770 (const nvlist_t **)top, toplevels) == 0); 1771 if (nspares != 0) 1772 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1773 (const nvlist_t **)spares, nspares) == 0); 1774 if (nl2cache != 0) 1775 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 1776 (const nvlist_t **)l2cache, nl2cache) == 0); 1777 1778 spec_out: 1779 for (t = 0; t < toplevels; t++) 1780 nvlist_free(top[t]); 1781 for (t = 0; t < nspares; t++) 1782 nvlist_free(spares[t]); 1783 for (t = 0; t < nl2cache; t++) 1784 nvlist_free(l2cache[t]); 1785 1786 free(spares); 1787 free(l2cache); 1788 free(top); 1789 1790 return (nvroot); 1791 } 1792 1793 nvlist_t * 1794 split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props, 1795 splitflags_t flags, int argc, char **argv) 1796 { 1797 nvlist_t *newroot = NULL, **child; 1798 uint_t c, children; 1799 1800 if (argc > 0) { 1801 if ((newroot = construct_spec(props, argc, argv)) == NULL) { 1802 (void) fprintf(stderr, gettext("Unable to build a " 1803 "pool from the specified devices\n")); 1804 return (NULL); 1805 } 1806 1807 if (!flags.dryrun && make_disks(zhp, newroot, B_FALSE) != 0) { 1808 nvlist_free(newroot); 1809 return (NULL); 1810 } 1811 1812 /* avoid any tricks in the spec */ 1813 verify(nvlist_lookup_nvlist_array(newroot, 1814 ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); 1815 for (c = 0; c < children; c++) { 1816 const char *path; 1817 const char *type; 1818 int min, max; 1819 1820 verify(nvlist_lookup_string(child[c], 1821 ZPOOL_CONFIG_PATH, &path) == 0); 1822 if ((type = is_grouping(path, &min, &max)) != NULL) { 1823 (void) fprintf(stderr, gettext("Cannot use " 1824 "'%s' as a device for splitting\n"), type); 1825 nvlist_free(newroot); 1826 return (NULL); 1827 } 1828 } 1829 } 1830 1831 if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) { 1832 nvlist_free(newroot); 1833 return (NULL); 1834 } 1835 1836 return (newroot); 1837 } 1838 1839 static int 1840 num_normal_vdevs(nvlist_t *nvroot) 1841 { 1842 nvlist_t **top; 1843 uint_t t, toplevels, normal = 0; 1844 1845 verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1846 &top, &toplevels) == 0); 1847 1848 for (t = 0; t < toplevels; t++) { 1849 uint64_t log = B_FALSE; 1850 1851 (void) nvlist_lookup_uint64(top[t], ZPOOL_CONFIG_IS_LOG, &log); 1852 if (log) 1853 continue; 1854 if (nvlist_exists(top[t], ZPOOL_CONFIG_ALLOCATION_BIAS)) 1855 continue; 1856 1857 normal++; 1858 } 1859 1860 return (normal); 1861 } 1862 1863 /* 1864 * Get and validate the contents of the given vdev specification. This ensures 1865 * that the nvlist returned is well-formed, that all the devices exist, and that 1866 * they are not currently in use by any other known consumer. The 'poolconfig' 1867 * parameter is the current configuration of the pool when adding devices 1868 * existing pool, and is used to perform additional checks, such as changing the 1869 * replication level of the pool. It can be 'NULL' to indicate that this is a 1870 * new pool. The 'force' flag controls whether devices should be forcefully 1871 * added, even if they appear in use. 1872 */ 1873 nvlist_t * 1874 make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep, 1875 boolean_t replacing, boolean_t dryrun, int argc, char **argv) 1876 { 1877 nvlist_t *newroot; 1878 nvlist_t *poolconfig = NULL; 1879 is_force = force; 1880 1881 /* 1882 * Construct the vdev specification. If this is successful, we know 1883 * that we have a valid specification, and that all devices can be 1884 * opened. 1885 */ 1886 if ((newroot = construct_spec(props, argc, argv)) == NULL) 1887 return (NULL); 1888 1889 if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL)) { 1890 nvlist_free(newroot); 1891 return (NULL); 1892 } 1893 1894 /* 1895 * Validate each device to make sure that it's not shared with another 1896 * subsystem. We do this even if 'force' is set, because there are some 1897 * uses (such as a dedicated dump device) that even '-f' cannot 1898 * override. 1899 */ 1900 if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) { 1901 nvlist_free(newroot); 1902 return (NULL); 1903 } 1904 1905 /* 1906 * Check the replication level of the given vdevs and report any errors 1907 * found. We include the existing pool spec, if any, as we need to 1908 * catch changes against the existing replication level. 1909 */ 1910 if (check_rep && check_replication(poolconfig, newroot) != 0) { 1911 nvlist_free(newroot); 1912 return (NULL); 1913 } 1914 1915 /* 1916 * On pool create the new vdev spec must have one normal vdev. 1917 */ 1918 if (poolconfig == NULL && num_normal_vdevs(newroot) == 0) { 1919 vdev_error(gettext("at least one general top-level vdev must " 1920 "be specified\n")); 1921 nvlist_free(newroot); 1922 return (NULL); 1923 } 1924 1925 /* 1926 * Run through the vdev specification and label any whole disks found. 1927 */ 1928 if (!dryrun && make_disks(zhp, newroot, replacing) != 0) { 1929 nvlist_free(newroot); 1930 return (NULL); 1931 } 1932 1933 return (newroot); 1934 } 1935