1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * Functions to convert between a list of vdevs and an nvlist representing the 31 * configuration. Each entry in the list can be one of: 32 * 33 * Device vdevs 34 * disk=(path=..., devid=...) 35 * file=(path=...) 36 * 37 * Group vdevs 38 * raidz[1|2]=(...) 39 * mirror=(...) 40 * 41 * Hot spares 42 * 43 * While the underlying implementation supports it, group vdevs cannot contain 44 * other group vdevs. All userland verification of devices is contained within 45 * this file. If successful, the nvlist returned can be passed directly to the 46 * kernel; we've done as much verification as possible in userland. 47 * 48 * Hot spares are a special case, and passed down as an array of disk vdevs, at 49 * the same level as the root of the vdev tree. 50 * 51 * The only function exported by this file is 'get_vdev_spec'. The function 52 * performs several passes: 53 * 54 * 1. Construct the vdev specification. 
Performs syntax validation and 55 * makes sure each device is valid. 56 * 2. Check for devices in use. Using libdiskmgt, makes sure that no 57 * devices are also in use. Some can be overridden using the 'force' 58 * flag, others cannot. 59 * 3. Check for replication errors if the 'force' flag is not specified. 60 * validates that the replication level is consistent across the 61 * entire pool. 62 * 4. Label any whole disks with an EFI label. 63 */ 64 65 #include <assert.h> 66 #include <devid.h> 67 #include <errno.h> 68 #include <fcntl.h> 69 #include <libdiskmgt.h> 70 #include <libintl.h> 71 #include <libnvpair.h> 72 #include <stdio.h> 73 #include <string.h> 74 #include <unistd.h> 75 #include <sys/efi_partition.h> 76 #include <sys/stat.h> 77 #include <sys/vtoc.h> 78 #include <sys/mntent.h> 79 80 #include <libzfs.h> 81 82 #include "zpool_util.h" 83 84 #define DISK_ROOT "/dev/dsk" 85 #define RDISK_ROOT "/dev/rdsk" 86 #define BACKUP_SLICE "s2" 87 88 /* 89 * For any given vdev specification, we can have multiple errors. The 90 * vdev_error() function keeps track of whether we have seen an error yet, and 91 * prints out a header if its the first error we've seen. 92 */ 93 boolean_t error_seen; 94 boolean_t is_force; 95 96 /*PRINTFLIKE1*/ 97 static void 98 vdev_error(const char *fmt, ...) 99 { 100 va_list ap; 101 102 if (!error_seen) { 103 (void) fprintf(stderr, gettext("invalid vdev specification\n")); 104 if (!is_force) 105 (void) fprintf(stderr, gettext("use '-f' to override " 106 "the following errors:\n")); 107 else 108 (void) fprintf(stderr, gettext("the following errors " 109 "must be manually repaired:\n")); 110 error_seen = B_TRUE; 111 } 112 113 va_start(ap, fmt); 114 (void) vfprintf(stderr, fmt, ap); 115 va_end(ap); 116 } 117 118 static void 119 libdiskmgt_error(int error) 120 { 121 /* 122 * ENXIO/ENODEV is a valid error message if the device doesn't live in 123 * /dev/dsk. Don't bother printing an error message in this case. 
124 */ 125 if (error == ENXIO || error == ENODEV) 126 return; 127 128 (void) fprintf(stderr, gettext("warning: device in use checking " 129 "failed: %s\n"), strerror(error)); 130 } 131 132 /* 133 * Validate a device, passing the bulk of the work off to libdiskmgt. 134 */ 135 int 136 check_slice(const char *path, int force, boolean_t wholedisk, boolean_t isspare) 137 { 138 char *msg; 139 int error = 0; 140 int ret = 0; 141 142 if (dm_inuse((char *)path, &msg, 143 force ? DM_WHO_ZPOOL_FORCE : DM_WHO_ZPOOL, &error) || error) { 144 if (error != 0) { 145 libdiskmgt_error(error); 146 return (0); 147 } else if (!isspare || 148 strstr(msg, gettext("hot spare")) == NULL) { 149 /* 150 * The above check is a rather severe hack. It would 151 * probably make more sense to have DM_WHO_ZPOOL_SPARE 152 * instead. 153 */ 154 vdev_error("%s", msg); 155 free(msg); 156 ret = -1; 157 } 158 159 } 160 161 /* 162 * If we're given a whole disk, ignore overlapping slices since we're 163 * about to label it anyway. 164 */ 165 error = 0; 166 if (!wholedisk && !force && 167 (dm_isoverlapping((char *)path, &msg, &error) || error)) { 168 if (error != 0) { 169 libdiskmgt_error(error); 170 return (0); 171 } else { 172 vdev_error("%s overlaps with %s\n", path, msg); 173 free(msg); 174 } 175 176 ret = -1; 177 } 178 179 return (ret); 180 } 181 182 /* 183 * Validate a whole disk. Iterate over all slices on the disk and make sure 184 * that none is in use by calling check_slice(). 185 */ 186 /* ARGSUSED */ 187 int 188 check_disk(const char *name, dm_descriptor_t disk, int force, int isspare) 189 { 190 dm_descriptor_t *drive, *media, *slice; 191 int err = 0; 192 int i; 193 int ret; 194 195 /* 196 * Get the drive associated with this disk. This should never fail, 197 * because we already have an alias handle open for the device. 
198 */ 199 if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE, 200 &err)) == NULL || *drive == NULL) { 201 if (err) 202 libdiskmgt_error(err); 203 return (0); 204 } 205 206 if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA, 207 &err)) == NULL) { 208 dm_free_descriptors(drive); 209 if (err) 210 libdiskmgt_error(err); 211 return (0); 212 } 213 214 dm_free_descriptors(drive); 215 216 /* 217 * It is possible that the user has specified a removable media drive, 218 * and the media is not present. 219 */ 220 if (*media == NULL) { 221 dm_free_descriptors(media); 222 vdev_error(gettext("'%s' has no media in drive\n"), name); 223 return (-1); 224 } 225 226 if ((slice = dm_get_associated_descriptors(*media, DM_SLICE, 227 &err)) == NULL) { 228 dm_free_descriptors(media); 229 if (err) 230 libdiskmgt_error(err); 231 return (0); 232 } 233 234 dm_free_descriptors(media); 235 236 ret = 0; 237 238 /* 239 * Iterate over all slices and report any errors. We don't care about 240 * overlapping slices because we are using the whole disk. 241 */ 242 for (i = 0; slice[i] != NULL; i++) { 243 char *name = dm_get_name(slice[i], &err); 244 245 if (check_slice(name, force, B_TRUE, isspare) != 0) 246 ret = -1; 247 248 dm_free_name(name); 249 } 250 251 dm_free_descriptors(slice); 252 return (ret); 253 } 254 255 /* 256 * Validate a device. 257 */ 258 int 259 check_device(const char *path, boolean_t force, boolean_t isspare) 260 { 261 dm_descriptor_t desc; 262 int err; 263 char *dev; 264 265 /* 266 * For whole disks, libdiskmgt does not include the leading dev path. 267 */ 268 dev = strrchr(path, '/'); 269 assert(dev != NULL); 270 dev++; 271 if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != NULL) { 272 err = check_disk(path, desc, force, isspare); 273 dm_free_descriptor(desc); 274 return (err); 275 } 276 277 return (check_slice(path, force, B_FALSE, isspare)); 278 } 279 280 /* 281 * Check that a file is valid. 
All we can do in this case is check that it's 282 * not in use by another pool. 283 */ 284 int 285 check_file(const char *file, boolean_t force, boolean_t isspare) 286 { 287 char *name; 288 int fd; 289 int ret = 0; 290 pool_state_t state; 291 boolean_t inuse; 292 293 if ((fd = open(file, O_RDONLY)) < 0) 294 return (0); 295 296 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) { 297 const char *desc; 298 299 switch (state) { 300 case POOL_STATE_ACTIVE: 301 desc = gettext("active"); 302 break; 303 304 case POOL_STATE_EXPORTED: 305 desc = gettext("exported"); 306 break; 307 308 case POOL_STATE_POTENTIALLY_ACTIVE: 309 desc = gettext("potentially active"); 310 break; 311 312 default: 313 desc = gettext("unknown"); 314 break; 315 } 316 317 /* 318 * Allow hot spares to be shared between pools. 319 */ 320 if (state == POOL_STATE_SPARE && isspare) 321 return (0); 322 323 if (state == POOL_STATE_ACTIVE || 324 state == POOL_STATE_SPARE || !force) { 325 switch (state) { 326 case POOL_STATE_SPARE: 327 vdev_error(gettext("%s is reserved as a hot " 328 "spare for pool %s\n"), file, name); 329 break; 330 default: 331 vdev_error(gettext("%s is part of %s pool " 332 "'%s'\n"), file, desc, name); 333 break; 334 } 335 ret = -1; 336 } 337 338 free(name); 339 } 340 341 (void) close(fd); 342 return (ret); 343 } 344 345 static boolean_t 346 is_whole_disk(const char *arg, struct stat64 *statbuf) 347 { 348 char path[MAXPATHLEN]; 349 350 (void) snprintf(path, sizeof (path), "%s%s", arg, BACKUP_SLICE); 351 if (stat64(path, statbuf) == 0) 352 return (B_TRUE); 353 354 return (B_FALSE); 355 } 356 357 /* 358 * Create a leaf vdev. Determine if this is a file or a device. If it's a 359 * device, fill in the device id to make a complete nvlist. 
Valid forms for a 360 * leaf vdev are: 361 * 362 * /dev/dsk/xxx Complete disk path 363 * /xxx Full path to file 364 * xxx Shorthand for /dev/dsk/xxx 365 */ 366 nvlist_t * 367 make_leaf_vdev(const char *arg) 368 { 369 char path[MAXPATHLEN]; 370 struct stat64 statbuf; 371 nvlist_t *vdev = NULL; 372 char *type = NULL; 373 boolean_t wholedisk = B_FALSE; 374 375 /* 376 * Determine what type of vdev this is, and put the full path into 377 * 'path'. We detect whether this is a device of file afterwards by 378 * checking the st_mode of the file. 379 */ 380 if (arg[0] == '/') { 381 /* 382 * Complete device or file path. Exact type is determined by 383 * examining the file descriptor afterwards. 384 */ 385 if (is_whole_disk(arg, &statbuf)) { 386 wholedisk = B_TRUE; 387 } else if (stat64(arg, &statbuf) != 0) { 388 (void) fprintf(stderr, 389 gettext("cannot open '%s': %s\n"), 390 arg, strerror(errno)); 391 return (NULL); 392 } 393 394 (void) strlcpy(path, arg, sizeof (path)); 395 } else { 396 /* 397 * This may be a short path for a device, or it could be total 398 * gibberish. Check to see if it's a known device in 399 * /dev/dsk/. As part of this check, see if we've been given a 400 * an entire disk (minus the slice number). 401 */ 402 (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, 403 arg); 404 if (is_whole_disk(path, &statbuf)) { 405 wholedisk = B_TRUE; 406 } else if (stat64(path, &statbuf) != 0) { 407 /* 408 * If we got ENOENT, then the user gave us 409 * gibberish, so try to direct them with a 410 * reasonable error message. Otherwise, 411 * regurgitate strerror() since it's the best we 412 * can do. 
413 */ 414 if (errno == ENOENT) { 415 (void) fprintf(stderr, 416 gettext("cannot open '%s': no such " 417 "device in %s\n"), arg, DISK_ROOT); 418 (void) fprintf(stderr, 419 gettext("must be a full path or " 420 "shorthand device name\n")); 421 return (NULL); 422 } else { 423 (void) fprintf(stderr, 424 gettext("cannot open '%s': %s\n"), 425 path, strerror(errno)); 426 return (NULL); 427 } 428 } 429 } 430 431 /* 432 * Determine whether this is a device or a file. 433 */ 434 if (S_ISBLK(statbuf.st_mode)) { 435 type = VDEV_TYPE_DISK; 436 } else if (S_ISREG(statbuf.st_mode)) { 437 type = VDEV_TYPE_FILE; 438 } else { 439 (void) fprintf(stderr, gettext("cannot use '%s': must be a " 440 "block device or regular file\n"), path); 441 return (NULL); 442 } 443 444 /* 445 * Finally, we have the complete device or file, and we know that it is 446 * acceptable to use. Construct the nvlist to describe this vdev. All 447 * vdevs have a 'path' element, and devices also have a 'devid' element. 448 */ 449 verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); 450 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); 451 verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); 452 if (strcmp(type, VDEV_TYPE_DISK) == 0) 453 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, 454 (uint64_t)wholedisk) == 0); 455 456 /* 457 * For a whole disk, defer getting its devid until after labeling it. 458 */ 459 if (S_ISBLK(statbuf.st_mode) && !wholedisk) { 460 /* 461 * Get the devid for the device. 
462 */ 463 int fd; 464 ddi_devid_t devid; 465 char *minor = NULL, *devid_str = NULL; 466 467 if ((fd = open(path, O_RDONLY)) < 0) { 468 (void) fprintf(stderr, gettext("cannot open '%s': " 469 "%s\n"), path, strerror(errno)); 470 nvlist_free(vdev); 471 return (NULL); 472 } 473 474 if (devid_get(fd, &devid) == 0) { 475 if (devid_get_minor_name(fd, &minor) == 0 && 476 (devid_str = devid_str_encode(devid, minor)) != 477 NULL) { 478 verify(nvlist_add_string(vdev, 479 ZPOOL_CONFIG_DEVID, devid_str) == 0); 480 } 481 if (devid_str != NULL) 482 devid_str_free(devid_str); 483 if (minor != NULL) 484 devid_str_free(minor); 485 devid_free(devid); 486 } 487 488 (void) close(fd); 489 } 490 491 return (vdev); 492 } 493 494 /* 495 * Go through and verify the replication level of the pool is consistent. 496 * Performs the following checks: 497 * 498 * For the new spec, verifies that devices in mirrors and raidz are the 499 * same size. 500 * 501 * If the current configuration already has inconsistent replication 502 * levels, ignore any other potential problems in the new spec. 503 * 504 * Otherwise, make sure that the current spec (if there is one) and the new 505 * spec have consistent replication levels. 506 */ 507 typedef struct replication_level { 508 char *zprl_type; 509 uint64_t zprl_children; 510 uint64_t zprl_parity; 511 } replication_level_t; 512 513 /* 514 * Given a list of toplevel vdevs, return the current replication level. If 515 * the config is inconsistent, then NULL is returned. If 'fatal' is set, then 516 * an error message will be displayed for each self-inconsistent vdev. 
517 */ 518 replication_level_t * 519 get_replication(nvlist_t *nvroot, boolean_t fatal) 520 { 521 nvlist_t **top; 522 uint_t t, toplevels; 523 nvlist_t **child; 524 uint_t c, children; 525 nvlist_t *nv; 526 char *type; 527 replication_level_t lastrep, rep, *ret; 528 boolean_t dontreport; 529 530 ret = safe_malloc(sizeof (replication_level_t)); 531 532 verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 533 &top, &toplevels) == 0); 534 535 lastrep.zprl_type = NULL; 536 for (t = 0; t < toplevels; t++) { 537 nv = top[t]; 538 539 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 540 541 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 542 &child, &children) != 0) { 543 /* 544 * This is a 'file' or 'disk' vdev. 545 */ 546 rep.zprl_type = type; 547 rep.zprl_children = 1; 548 rep.zprl_parity = 0; 549 } else { 550 uint64_t vdev_size; 551 552 /* 553 * This is a mirror or RAID-Z vdev. Go through and make 554 * sure the contents are all the same (files vs. disks), 555 * keeping track of the number of elements in the 556 * process. 557 * 558 * We also check that the size of each vdev (if it can 559 * be determined) is the same. 560 */ 561 rep.zprl_type = type; 562 rep.zprl_children = 0; 563 564 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 565 verify(nvlist_lookup_uint64(nv, 566 ZPOOL_CONFIG_NPARITY, 567 &rep.zprl_parity) == 0); 568 assert(rep.zprl_parity != 0); 569 } else { 570 rep.zprl_parity = 0; 571 } 572 573 /* 574 * The 'dontreport' variable indicatest that we've 575 * already reported an error for this spec, so don't 576 * bother doing it again. 
577 */ 578 type = NULL; 579 dontreport = 0; 580 vdev_size = -1ULL; 581 for (c = 0; c < children; c++) { 582 nvlist_t *cnv = child[c]; 583 char *path; 584 struct stat64 statbuf; 585 uint64_t size = -1ULL; 586 char *childtype; 587 int fd, err; 588 589 rep.zprl_children++; 590 591 verify(nvlist_lookup_string(cnv, 592 ZPOOL_CONFIG_TYPE, &childtype) == 0); 593 594 /* 595 * If this is a a replacing or spare vdev, then 596 * get the real first child of the vdev. 597 */ 598 if (strcmp(childtype, 599 VDEV_TYPE_REPLACING) == 0 || 600 strcmp(childtype, VDEV_TYPE_SPARE) == 0) { 601 nvlist_t **rchild; 602 uint_t rchildren; 603 604 verify(nvlist_lookup_nvlist_array(cnv, 605 ZPOOL_CONFIG_CHILDREN, &rchild, 606 &rchildren) == 0); 607 assert(rchildren == 2); 608 cnv = rchild[0]; 609 610 verify(nvlist_lookup_string(cnv, 611 ZPOOL_CONFIG_TYPE, 612 &childtype) == 0); 613 } 614 615 verify(nvlist_lookup_string(cnv, 616 ZPOOL_CONFIG_PATH, &path) == 0); 617 618 /* 619 * If we have a raidz/mirror that combines disks 620 * with files, report it as an error. 621 */ 622 if (!dontreport && type != NULL && 623 strcmp(type, childtype) != 0) { 624 if (ret != NULL) 625 free(ret); 626 ret = NULL; 627 if (fatal) 628 vdev_error(gettext( 629 "mismatched replication " 630 "level: %s contains both " 631 "files and devices\n"), 632 rep.zprl_type); 633 else 634 return (NULL); 635 dontreport = B_TRUE; 636 } 637 638 /* 639 * According to stat(2), the value of 'st_size' 640 * is undefined for block devices and character 641 * devices. But there is no effective way to 642 * determine the real size in userland. 643 * 644 * Instead, we'll take advantage of an 645 * implementation detail of spec_size(). If the 646 * device is currently open, then we (should) 647 * return a valid size. 648 * 649 * If we still don't get a valid size (indicated 650 * by a size of 0 or MAXOFFSET_T), then ignore 651 * this device altogether. 
652 */ 653 if ((fd = open(path, O_RDONLY)) >= 0) { 654 err = fstat64(fd, &statbuf); 655 (void) close(fd); 656 } else { 657 err = stat64(path, &statbuf); 658 } 659 660 if (err != 0 || 661 statbuf.st_size == 0 || 662 statbuf.st_size == MAXOFFSET_T) 663 continue; 664 665 size = statbuf.st_size; 666 667 /* 668 * Also check the size of each device. If they 669 * differ, then report an error. 670 */ 671 if (!dontreport && vdev_size != -1ULL && 672 size != vdev_size) { 673 if (ret != NULL) 674 free(ret); 675 ret = NULL; 676 if (fatal) 677 vdev_error(gettext( 678 "%s contains devices of " 679 "different sizes\n"), 680 rep.zprl_type); 681 else 682 return (NULL); 683 dontreport = B_TRUE; 684 } 685 686 type = childtype; 687 vdev_size = size; 688 } 689 } 690 691 /* 692 * At this point, we have the replication of the last toplevel 693 * vdev in 'rep'. Compare it to 'lastrep' to see if its 694 * different. 695 */ 696 if (lastrep.zprl_type != NULL) { 697 if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) { 698 if (ret != NULL) 699 free(ret); 700 ret = NULL; 701 if (fatal) 702 vdev_error(gettext( 703 "mismatched replication level: " 704 "both %s and %s vdevs are " 705 "present\n"), 706 lastrep.zprl_type, rep.zprl_type); 707 else 708 return (NULL); 709 } else if (lastrep.zprl_parity != rep.zprl_parity) { 710 if (ret) 711 free(ret); 712 ret = NULL; 713 if (fatal) 714 vdev_error(gettext( 715 "mismatched replication level: " 716 "both %llu and %llu device parity " 717 "%s vdevs are present\n"), 718 lastrep.zprl_parity, 719 rep.zprl_parity, 720 rep.zprl_type); 721 else 722 return (NULL); 723 } else if (lastrep.zprl_children != rep.zprl_children) { 724 if (ret) 725 free(ret); 726 ret = NULL; 727 if (fatal) 728 vdev_error(gettext( 729 "mismatched replication level: " 730 "both %llu-way and %llu-way %s " 731 "vdevs are present\n"), 732 lastrep.zprl_children, 733 rep.zprl_children, 734 rep.zprl_type); 735 else 736 return (NULL); 737 } 738 } 739 lastrep = rep; 740 } 741 742 if (ret != NULL) 
743 *ret = rep; 744 745 return (ret); 746 } 747 748 /* 749 * Check the replication level of the vdev spec against the current pool. Calls 750 * get_replication() to make sure the new spec is self-consistent. If the pool 751 * has a consistent replication level, then we ignore any errors. Otherwise, 752 * report any difference between the two. 753 */ 754 int 755 check_replication(nvlist_t *config, nvlist_t *newroot) 756 { 757 replication_level_t *current = NULL, *new; 758 int ret; 759 760 /* 761 * If we have a current pool configuration, check to see if it's 762 * self-consistent. If not, simply return success. 763 */ 764 if (config != NULL) { 765 nvlist_t *nvroot; 766 767 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 768 &nvroot) == 0); 769 if ((current = get_replication(nvroot, B_FALSE)) == NULL) 770 return (0); 771 } 772 773 /* 774 * Get the replication level of the new vdev spec, reporting any 775 * inconsistencies found. 776 */ 777 if ((new = get_replication(newroot, B_TRUE)) == NULL) { 778 free(current); 779 return (-1); 780 } 781 782 /* 783 * Check to see if the new vdev spec matches the replication level of 784 * the current pool. 
785 */ 786 ret = 0; 787 if (current != NULL) { 788 if (strcmp(current->zprl_type, new->zprl_type) != 0) { 789 vdev_error(gettext( 790 "mismatched replication level: pool uses %s " 791 "and new vdev is %s\n"), 792 current->zprl_type, new->zprl_type); 793 ret = -1; 794 } else if (current->zprl_parity != new->zprl_parity) { 795 vdev_error(gettext( 796 "mismatched replication level: pool uses %llu " 797 "device parity and new vdev uses %llu\n"), 798 current->zprl_parity, new->zprl_parity); 799 ret = -1; 800 } else if (current->zprl_children != new->zprl_children) { 801 vdev_error(gettext( 802 "mismatched replication level: pool uses %llu-way " 803 "%s and new vdev uses %llu-way %s\n"), 804 current->zprl_children, current->zprl_type, 805 new->zprl_children, new->zprl_type); 806 ret = -1; 807 } 808 } 809 810 free(new); 811 if (current != NULL) 812 free(current); 813 814 return (ret); 815 } 816 817 /* 818 * Label an individual disk. The name provided is the short name, stripped of 819 * any leading /dev path. 820 */ 821 int 822 label_disk(char *name) 823 { 824 char path[MAXPATHLEN]; 825 struct dk_gpt *vtoc; 826 int fd; 827 size_t resv = 16384; 828 829 (void) snprintf(path, sizeof (path), "%s/%s%s", RDISK_ROOT, name, 830 BACKUP_SLICE); 831 832 if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) { 833 /* 834 * This shouldn't happen. We've long since verified that this 835 * is a valid device. 836 */ 837 (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), 838 path, strerror(errno)); 839 return (-1); 840 } 841 842 843 if (efi_alloc_and_init(fd, 9, &vtoc) != 0) { 844 /* 845 * The only way this can fail is if we run out of memory, or we 846 * were unable to read the disk geometry. 
847 */ 848 if (errno == ENOMEM) 849 no_memory(); 850 851 (void) fprintf(stderr, gettext("cannot label '%s': unable to " 852 "read disk geometry\n"), name); 853 (void) close(fd); 854 return (-1); 855 } 856 857 vtoc->efi_parts[0].p_start = vtoc->efi_first_u_lba; 858 vtoc->efi_parts[0].p_size = vtoc->efi_last_u_lba + 1 - 859 vtoc->efi_first_u_lba - resv; 860 861 /* 862 * Why we use V_USR: V_BACKUP confuses users, and is considered 863 * disposable by some EFI utilities (since EFI doesn't have a backup 864 * slice). V_UNASSIGNED is supposed to be used only for zero size 865 * partitions, and efi_write() will fail if we use it. V_ROOT, V_BOOT, 866 * etc. were all pretty specific. V_USR is as close to reality as we 867 * can get, in the absence of V_OTHER. 868 */ 869 vtoc->efi_parts[0].p_tag = V_USR; 870 (void) strcpy(vtoc->efi_parts[0].p_name, "zfs"); 871 872 vtoc->efi_parts[8].p_start = vtoc->efi_last_u_lba + 1 - resv; 873 vtoc->efi_parts[8].p_size = resv; 874 vtoc->efi_parts[8].p_tag = V_RESERVED; 875 876 if (efi_write(fd, vtoc) != 0) { 877 /* 878 * Currently, EFI labels are not supported for IDE disks, and it 879 * is likely that they will not be supported on other drives for 880 * some time. Print out a helpful error message directing the 881 * user to manually label the disk and give a specific slice. 882 */ 883 (void) fprintf(stderr, gettext("cannot label '%s': failed to " 884 "write EFI label\n"), name); 885 (void) fprintf(stderr, gettext("use fdisk(1M) to partition " 886 "the disk, and provide a specific slice\n")); 887 (void) close(fd); 888 efi_free(vtoc); 889 return (-1); 890 } 891 892 (void) close(fd); 893 efi_free(vtoc); 894 return (0); 895 } 896 897 /* 898 * Go through and find any whole disks in the vdev specification, labelling them 899 * as appropriate. When constructing the vdev spec, we were unable to open this 900 * device in order to provide a devid. 
Now that we have labelled the disk and
 * know that slice 0 is valid, we can construct the devid now.
 *
 * If the disk was already labelled with an EFI label, we will have gotten the
 * devid already (because we were able to open the whole disk).  Otherwise, we
 * need to get the devid after we label the disk.
 *
 * Returns 0 on success, or the first non-zero result encountered.
 */
int
make_disks(nvlist_t *nv)
{
	nvlist_t **kids;
	uint_t i, nkids;
	char *type, *path, *diskname;
	char slice0[MAXPATHLEN];
	uint64_t wholedisk;
	int fd;
	int err;
	ddi_devid_t devid;
	char *minor = NULL, *devid_str = NULL;

	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);

	/*
	 * Interior vdev: recurse into the children first, then into any
	 * spares hanging off this node, stopping at the first failure.
	 */
	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &kids, &nkids) == 0) {
		for (i = 0; i < nkids; i++) {
			err = make_disks(kids[i]);
			if (err != 0)
				return (err);
		}

		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
		    &kids, &nkids) == 0) {
			for (i = 0; i < nkids; i++) {
				err = make_disks(kids[i]);
				if (err != 0)
					return (err);
			}
		}

		return (0);
	}

	/*
	 * Leaf vdev.  Only disks recorded as whole disks need any work.
	 */
	if (strcmp(type, VDEV_TYPE_DISK) != 0)
		return (0);

	verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);

	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
	    &wholedisk) != 0 || !wholedisk)
		return (0);

	/* label_disk() wants the short name, without the directory */
	diskname = strrchr(path, '/');
	assert(diskname != NULL);
	diskname++;
	if (label_disk(diskname) != 0)
		return (-1);

	/*
	 * Fill in the devid, now that we've labeled the disk.
	 */
	(void) snprintf(slice0, sizeof (slice0), "%ss0", path);
	fd = open(slice0, O_RDONLY);
	if (fd < 0) {
		(void) fprintf(stderr,
		    gettext("cannot open '%s': %s\n"),
		    slice0, strerror(errno));
		return (-1);
	}

	if (devid_get(fd, &devid) == 0) {
		if (devid_get_minor_name(fd, &minor) == 0)
			devid_str = devid_str_encode(devid, minor);

		if (devid_str != NULL) {
			verify(nvlist_add_string(nv,
			    ZPOOL_CONFIG_DEVID, devid_str) == 0);
			devid_str_free(devid_str);
		}
		if (minor != NULL)
			devid_str_free(minor);
		devid_free(devid);
	}

	/*
	 * Update the path to refer to the 's0' slice.  The presence of
	 * the 'whole_disk' field indicates to the CLI that we should
	 * chop off the slice number when displaying the device in
	 * future output.
	 */
	verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, slice0) == 0);

	(void) close(fd);

	return (0);
}

/*
 * Determine if the given path is a hot spare within the given configuration.
998 */ 999 static boolean_t 1000 is_spare(nvlist_t *config, const char *path) 1001 { 1002 int fd; 1003 pool_state_t state; 1004 char *name; 1005 nvlist_t *label; 1006 uint64_t guid, spareguid; 1007 nvlist_t *nvroot; 1008 nvlist_t **spares; 1009 uint_t i, nspares; 1010 boolean_t inuse; 1011 1012 if ((fd = open(path, O_RDONLY)) < 0) 1013 return (B_FALSE); 1014 1015 if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 || 1016 !inuse || 1017 state != POOL_STATE_SPARE || 1018 zpool_read_label(fd, &label) != 0) { 1019 (void) close(fd); 1020 return (B_FALSE); 1021 } 1022 1023 (void) close(fd); 1024 verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0); 1025 nvlist_free(label); 1026 1027 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 1028 &nvroot) == 0); 1029 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1030 &spares, &nspares) == 0) { 1031 for (i = 0; i < nspares; i++) { 1032 verify(nvlist_lookup_uint64(spares[i], 1033 ZPOOL_CONFIG_GUID, &spareguid) == 0); 1034 if (spareguid == guid) 1035 return (B_TRUE); 1036 } 1037 } 1038 1039 return (B_FALSE); 1040 } 1041 1042 /* 1043 * Go through and find any devices that are in use. We rely on libdiskmgt for 1044 * the majority of this task. 1045 */ 1046 int 1047 check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing, 1048 int isspare) 1049 { 1050 nvlist_t **child; 1051 uint_t c, children; 1052 char *type, *path; 1053 int ret; 1054 char buf[MAXPATHLEN]; 1055 uint64_t wholedisk; 1056 1057 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); 1058 1059 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1060 &child, &children) != 0) { 1061 1062 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); 1063 1064 /* 1065 * As a generic check, we look to see if this is a replace of a 1066 * hot spare within the same pool. If so, we allow it 1067 * regardless of what libdiskmgt or zpool_in_use() says. 
1068 */ 1069 if (isreplacing) { 1070 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 1071 &wholedisk) == 0 && wholedisk) 1072 (void) snprintf(buf, sizeof (buf), "%ss0", 1073 path); 1074 else 1075 (void) strlcpy(buf, path, sizeof (buf)); 1076 if (is_spare(config, buf)) 1077 return (0); 1078 } 1079 1080 if (strcmp(type, VDEV_TYPE_DISK) == 0) 1081 ret = check_device(path, force, isspare); 1082 1083 if (strcmp(type, VDEV_TYPE_FILE) == 0) 1084 ret = check_file(path, force, isspare); 1085 1086 return (ret); 1087 } 1088 1089 for (c = 0; c < children; c++) 1090 if ((ret = check_in_use(config, child[c], force, 1091 isreplacing, B_FALSE)) != 0) 1092 return (ret); 1093 1094 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, 1095 &child, &children) == 0) 1096 for (c = 0; c < children; c++) 1097 if ((ret = check_in_use(config, child[c], force, 1098 isreplacing, B_TRUE)) != 0) 1099 return (ret); 1100 1101 return (0); 1102 } 1103 1104 const char * 1105 is_grouping(const char *type, int *mindev) 1106 { 1107 if (strcmp(type, "raidz") == 0 || strcmp(type, "raidz1") == 0) { 1108 if (mindev != NULL) 1109 *mindev = 2; 1110 return (VDEV_TYPE_RAIDZ); 1111 } 1112 1113 if (strcmp(type, "raidz2") == 0) { 1114 if (mindev != NULL) 1115 *mindev = 3; 1116 return (VDEV_TYPE_RAIDZ); 1117 } 1118 1119 if (strcmp(type, "mirror") == 0) { 1120 if (mindev != NULL) 1121 *mindev = 2; 1122 return (VDEV_TYPE_MIRROR); 1123 } 1124 1125 if (strcmp(type, "spare") == 0) { 1126 if (mindev != NULL) 1127 *mindev = 1; 1128 return (VDEV_TYPE_SPARE); 1129 } 1130 1131 return (NULL); 1132 } 1133 1134 /* 1135 * Construct a syntactically valid vdev specification, 1136 * and ensure that all devices and files exist and can be opened. 1137 * Note: we don't bother freeing anything in the error paths 1138 * because the program is just going to exit anyway. 
1139 */ 1140 nvlist_t * 1141 construct_spec(int argc, char **argv) 1142 { 1143 nvlist_t *nvroot, *nv, **top, **spares; 1144 int t, toplevels, mindev, nspares; 1145 const char *type; 1146 1147 top = NULL; 1148 toplevels = 0; 1149 spares = NULL; 1150 nspares = 0; 1151 1152 while (argc > 0) { 1153 nv = NULL; 1154 1155 /* 1156 * If it's a mirror or raidz, the subsequent arguments are 1157 * its leaves -- until we encounter the next mirror or raidz. 1158 */ 1159 if ((type = is_grouping(argv[0], &mindev)) != NULL) { 1160 nvlist_t **child = NULL; 1161 int c, children = 0; 1162 1163 if (strcmp(type, VDEV_TYPE_SPARE) == 0 && 1164 spares != NULL) { 1165 (void) fprintf(stderr, gettext("invalid vdev " 1166 "specification: 'spare' can be " 1167 "specified only once\n")); 1168 return (NULL); 1169 } 1170 1171 for (c = 1; c < argc; c++) { 1172 if (is_grouping(argv[c], NULL) != NULL) 1173 break; 1174 children++; 1175 child = realloc(child, 1176 children * sizeof (nvlist_t *)); 1177 if (child == NULL) 1178 no_memory(); 1179 if ((nv = make_leaf_vdev(argv[c])) == NULL) 1180 return (NULL); 1181 child[children - 1] = nv; 1182 } 1183 1184 if (children < mindev) { 1185 (void) fprintf(stderr, gettext("invalid vdev " 1186 "specification: %s requires at least %d " 1187 "devices\n"), argv[0], mindev); 1188 return (NULL); 1189 } 1190 1191 argc -= c; 1192 argv += c; 1193 1194 if (strcmp(type, VDEV_TYPE_SPARE) == 0) { 1195 spares = child; 1196 nspares = children; 1197 continue; 1198 } else { 1199 verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, 1200 0) == 0); 1201 verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, 1202 type) == 0); 1203 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { 1204 verify(nvlist_add_uint64(nv, 1205 ZPOOL_CONFIG_NPARITY, 1206 mindev - 1) == 0); 1207 } 1208 verify(nvlist_add_nvlist_array(nv, 1209 ZPOOL_CONFIG_CHILDREN, child, 1210 children) == 0); 1211 1212 for (c = 0; c < children; c++) 1213 nvlist_free(child[c]); 1214 free(child); 1215 } 1216 } else { 1217 /* 1218 * We have a device. 
Pass off to make_leaf_vdev() to 1219 * construct the appropriate nvlist describing the vdev. 1220 */ 1221 if ((nv = make_leaf_vdev(argv[0])) == NULL) 1222 return (NULL); 1223 argc--; 1224 argv++; 1225 } 1226 1227 toplevels++; 1228 top = realloc(top, toplevels * sizeof (nvlist_t *)); 1229 if (top == NULL) 1230 no_memory(); 1231 top[toplevels - 1] = nv; 1232 } 1233 1234 if (toplevels == 0 && nspares == 0) { 1235 (void) fprintf(stderr, gettext("invalid vdev " 1236 "specification: at least one toplevel vdev must be " 1237 "specified\n")); 1238 return (NULL); 1239 } 1240 1241 /* 1242 * Finally, create nvroot and add all top-level vdevs to it. 1243 */ 1244 verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0); 1245 verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 1246 VDEV_TYPE_ROOT) == 0); 1247 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 1248 top, toplevels) == 0); 1249 if (nspares != 0) 1250 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 1251 spares, nspares) == 0); 1252 1253 for (t = 0; t < toplevels; t++) 1254 nvlist_free(top[t]); 1255 for (t = 0; t < nspares; t++) 1256 nvlist_free(spares[t]); 1257 if (spares) 1258 free(spares); 1259 free(top); 1260 1261 return (nvroot); 1262 } 1263 1264 /* 1265 * Get and validate the contents of the given vdev specification. This ensures 1266 * that the nvlist returned is well-formed, that all the devices exist, and that 1267 * they are not currently in use by any other known consumer. The 'poolconfig' 1268 * parameter is the current configuration of the pool when adding devices 1269 * existing pool, and is used to perform additional checks, such as changing the 1270 * replication level of the pool. It can be 'NULL' to indicate that this is a 1271 * new pool. The 'force' flag controls whether devices should be forcefully 1272 * added, even if they appear in use. 
1273 */ 1274 nvlist_t * 1275 make_root_vdev(nvlist_t *poolconfig, int force, int check_rep, 1276 boolean_t isreplacing, int argc, char **argv) 1277 { 1278 nvlist_t *newroot; 1279 1280 is_force = force; 1281 1282 /* 1283 * Construct the vdev specification. If this is successful, we know 1284 * that we have a valid specification, and that all devices can be 1285 * opened. 1286 */ 1287 if ((newroot = construct_spec(argc, argv)) == NULL) 1288 return (NULL); 1289 1290 /* 1291 * Validate each device to make sure that its not shared with another 1292 * subsystem. We do this even if 'force' is set, because there are some 1293 * uses (such as a dedicated dump device) that even '-f' cannot 1294 * override. 1295 */ 1296 if (check_in_use(poolconfig, newroot, force, isreplacing, 1297 B_FALSE) != 0) { 1298 nvlist_free(newroot); 1299 return (NULL); 1300 } 1301 1302 /* 1303 * Check the replication level of the given vdevs and report any errors 1304 * found. We include the existing pool spec, if any, as we need to 1305 * catch changes against the existing replication level. 1306 */ 1307 if (check_rep && check_replication(poolconfig, newroot) != 0) { 1308 nvlist_free(newroot); 1309 return (NULL); 1310 } 1311 1312 /* 1313 * Run through the vdev specification and label any whole disks found. 1314 */ 1315 if (make_disks(newroot) != 0) { 1316 nvlist_free(newroot); 1317 return (NULL); 1318 } 1319 1320 return (newroot); 1321 } 1322