/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 * Copyright 2015 RackTop Systems.
 * Copyright (c) 2016, Intel Corporation.
 * Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
 */

/*
 * Pool import support functions.
 *
 * Used by zpool, ztest, zdb, and zhack to locate importable configs. Since
 * these commands are expected to run in the global zone, we can assume
 * that the devices are all readable when called.
 *
 * To import a pool, we rely on reading the configuration information from the
 * ZFS label of each device. If we successfully read the label, then we
 * organize the configuration information in the following hierarchy:
 *
 *	pool guid -> toplevel vdev guid -> label txg
 *
 * Duplicate entries matching this same tuple will be discarded. Once we have
 * examined every device, we pick the best label txg config for each toplevel
 * vdev. We then arrange these toplevel vdevs into a complete pool config, and
 * update any paths that have changed. Finally, we attempt to import the pool
 * using our derived config, and record the results.
 */

#ifdef HAVE_AIO_H
#include <aio.h>
#endif
#include <ctype.h>
#include <dirent.h>
#include <errno.h>
#include <libintl.h>
#include <libgen.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/dktp/fdisk.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>

#include <thread_pool.h>
#include <libzutil.h>
#include <libnvpair.h>

#include "zutil_import.h"

const char *
libpc_error_description(libpc_handle_t *hdl)
{
	if (hdl->lpc_desc[0] != '\0')
		return (hdl->lpc_desc);

	switch (hdl->lpc_error) {
	case LPC_BADCACHE:
		return (dgettext(TEXT_DOMAIN, "invalid or missing cache file"));
	case LPC_BADPATH:
		return (dgettext(TEXT_DOMAIN, "must be an absolute path"));
	case LPC_NOMEM:
		return (dgettext(TEXT_DOMAIN, "out of memory"));
	case LPC_EACCESS:
		return (dgettext(TEXT_DOMAIN, "some devices require root "
		    "privileges"));
	case LPC_UNKNOWN:
		return (dgettext(TEXT_DOMAIN, "unknown error"));
	default:
		assert(hdl->lpc_error == 0);
		return (dgettext(TEXT_DOMAIN, "no error"));
	}
}

static __attribute__((format(printf, 2, 3))) void
zutil_error_aux(libpc_handle_t *hdl, const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);

	(void) vsnprintf(hdl->lpc_desc, sizeof (hdl->lpc_desc), fmt, ap);
	hdl->lpc_desc_active = B_TRUE;

	va_end(ap);
}

static void
zutil_verror(libpc_handle_t *hdl, lpc_error_t error, const char *fmt,
    va_list ap)
{
	char action[1024];

	(void) vsnprintf(action, sizeof (action), fmt, ap);
	hdl->lpc_error = error;

	if (hdl->lpc_desc_active)
		hdl->lpc_desc_active = B_FALSE;
	else
		hdl->lpc_desc[0] = '\0';

	if (hdl->lpc_printerr)
		(void) fprintf(stderr, "%s: %s\n", action,
		    libpc_error_description(hdl));
}

static __attribute__((format(printf, 3, 4))) int
zutil_error_fmt(libpc_handle_t *hdl, lpc_error_t error,
    const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);

	zutil_verror(hdl, error, fmt, ap);

	va_end(ap);

	return (-1);
}

static int
zutil_error(libpc_handle_t *hdl, lpc_error_t error, const char *msg)
{
	return (zutil_error_fmt(hdl, error, "%s", msg));
}

static int
zutil_no_memory(libpc_handle_t *hdl)
{
	zutil_error(hdl, LPC_NOMEM, "internal error");
	exit(1);
}
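/*
 * Usage sketch for the error helpers above: callers typically record the
 * low-level detail with zutil_error_aux() first, then classify the failure
 * with zutil_error() or zutil_error_fmt(), leaving both strings available
 * through libpc_error_description().  For example (mirroring the pattern
 * used later in this file):
 *
 *	zutil_error_aux(hdl, "%s", zfs_strerror(errno));
 *	(void) zutil_error_fmt(hdl, LPC_BADPATH,
 *	    dgettext(TEXT_DOMAIN, "cannot open '%s'"), path);
 */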
void *
zutil_alloc(libpc_handle_t *hdl, size_t size)
{
	void *data;

	if ((data = calloc(1, size)) == NULL)
		(void) zutil_no_memory(hdl);

	return (data);
}

char *
zutil_strdup(libpc_handle_t *hdl, const char *str)
{
	char *ret;

	if ((ret = strdup(str)) == NULL)
		(void) zutil_no_memory(hdl);

	return (ret);
}

static char *
zutil_strndup(libpc_handle_t *hdl, const char *str, size_t n)
{
	char *ret;

	if ((ret = strndup(str, n)) == NULL)
		(void) zutil_no_memory(hdl);

	return (ret);
}

/*
 * Intermediate structures used to gather configuration information.
 */
typedef struct config_entry {
	uint64_t		ce_txg;
	nvlist_t		*ce_config;
	struct config_entry	*ce_next;
} config_entry_t;

typedef struct vdev_entry {
	uint64_t		ve_guid;
	config_entry_t		*ve_configs;
	struct vdev_entry	*ve_next;
} vdev_entry_t;

typedef struct pool_entry {
	uint64_t		pe_guid;
	vdev_entry_t		*pe_vdevs;
	struct pool_entry	*pe_next;
} pool_entry_t;

typedef struct name_entry {
	char			*ne_name;
	uint64_t		ne_guid;
	uint64_t		ne_order;
	uint64_t		ne_num_labels;
	struct name_entry	*ne_next;
} name_entry_t;

typedef struct pool_list {
	pool_entry_t		*pools;
	name_entry_t		*names;
} pool_list_t;
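/*
 * Illustrative sketch of how the structures above nest: every
 * (pool guid, top-level vdev guid, label txg) tuple gathered from the
 * labels appears exactly once in this three-level list, so a full walk
 * looks like:
 *
 *	for (pe = pl->pools; pe != NULL; pe = pe->pe_next)
 *		for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next)
 *			for (ce = ve->ve_configs; ce != NULL; ce = ce->ce_next)
 *				use(pe->pe_guid, ve->ve_guid, ce->ce_txg);
 *
 * (use() is a stand-in for whatever the caller does with each tuple.)
 */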
/*
 * Go through and fix up any path and/or devid information for the given vdev
 * configuration.
 */
static int
fix_paths(libpc_handle_t *hdl, nvlist_t *nv, name_entry_t *names)
{
	nvlist_t **child;
	uint_t c, children;
	uint64_t guid;
	name_entry_t *ne, *best;
	const char *path;

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) == 0) {
		for (c = 0; c < children; c++)
			if (fix_paths(hdl, child[c], names) != 0)
				return (-1);
		return (0);
	}

	/*
	 * This is a leaf (file or disk) vdev. In either case, go through
	 * the name list and see if we find a matching guid. If so, replace
	 * the path and see if we can calculate a new devid.
	 *
	 * There may be multiple names associated with a particular guid, in
	 * which case we have overlapping partitions or multiple paths to the
	 * same disk. In this case we prefer to use the path name which
	 * matches the ZPOOL_CONFIG_PATH. If no matching entry is found we
	 * use the lowest order device which corresponds to the first match
	 * while traversing the ZPOOL_IMPORT_PATH search path.
	 */
	verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
		path = NULL;

	best = NULL;
	for (ne = names; ne != NULL; ne = ne->ne_next) {
		if (ne->ne_guid == guid) {
			if (path == NULL) {
				best = ne;
				break;
			}

			if ((strlen(path) == strlen(ne->ne_name)) &&
			    strncmp(path, ne->ne_name, strlen(path)) == 0) {
				best = ne;
				break;
			}

			if (best == NULL) {
				best = ne;
				continue;
			}

			/* Prefer paths with more vdev labels. */
			if (ne->ne_num_labels > best->ne_num_labels) {
				best = ne;
				continue;
			}

			/* Prefer paths earlier in the search order. */
			if (ne->ne_num_labels == best->ne_num_labels &&
			    ne->ne_order < best->ne_order) {
				best = ne;
				continue;
			}
		}
	}

	if (best == NULL)
		return (0);

	if (nvlist_add_string(nv, ZPOOL_CONFIG_PATH, best->ne_name) != 0)
		return (-1);

	update_vdev_config_dev_strs(nv);

	return (0);
}
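/*
 * Worked example for the selection policy above, with a hypothetical
 * names list in which three entries share the leaf's guid:
 *
 *	{ "/dev/disk/by-id/ata-X-part1", order 0, 4 labels }
 *	{ "/dev/sdb1",                   order 1, 4 labels }
 *	{ "/dev/sdj1",                   order 2, 2 labels }
 *
 * If the label's ZPOOL_CONFIG_PATH is "/dev/sdb1", that exact match wins
 * immediately.  Otherwise the by-id entry wins: it ties on label count
 * (4 vs. 4) with the second entry but has the lower search order, and it
 * beats the third entry outright on label count.
 */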
/*
 * Add the given configuration to the list of known devices.
 */
static int
add_config(libpc_handle_t *hdl, pool_list_t *pl, const char *path,
    int order, int num_labels, nvlist_t *config)
{
	uint64_t pool_guid, vdev_guid, top_guid, txg, state;
	pool_entry_t *pe;
	vdev_entry_t *ve;
	config_entry_t *ce;
	name_entry_t *ne;

	/*
	 * If this is a hot spare not currently in use or level 2 cache
	 * device, add it to the list of names to translate, but don't do
	 * anything else.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
	    &state) == 0 &&
	    (state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE) &&
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) == 0) {
		if ((ne = zutil_alloc(hdl, sizeof (name_entry_t))) == NULL)
			return (-1);

		if ((ne->ne_name = zutil_strdup(hdl, path)) == NULL) {
			free(ne);
			return (-1);
		}
		ne->ne_guid = vdev_guid;
		ne->ne_order = order;
		ne->ne_num_labels = num_labels;
		ne->ne_next = pl->names;
		pl->names = ne;

		return (0);
	}

	/*
	 * If we have a valid config but cannot read any of these fields, then
	 * it means we have a half-initialized label. In vdev_label_init()
	 * we write a label with txg == 0 so that we can identify the device
	 * in case the user refers to the same disk later on. If we fail to
	 * create the pool, we'll be left with a label in this state
	 * which should not be considered part of a valid pool.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
	    &pool_guid) != 0 ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID,
	    &vdev_guid) != 0 ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID,
	    &top_guid) != 0 ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &txg) != 0 || txg == 0) {
		return (0);
	}

	/*
	 * First, see if we know about this pool. If not, then add it to the
	 * list of known pools.
	 */
	for (pe = pl->pools; pe != NULL; pe = pe->pe_next) {
		if (pe->pe_guid == pool_guid)
			break;
	}

	if (pe == NULL) {
		if ((pe = zutil_alloc(hdl, sizeof (pool_entry_t))) == NULL) {
			return (-1);
		}
		pe->pe_guid = pool_guid;
		pe->pe_next = pl->pools;
		pl->pools = pe;
	}

	/*
	 * Second, see if we know about this toplevel vdev. Add it if it's
	 * missing.
	 */
	for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) {
		if (ve->ve_guid == top_guid)
			break;
	}

	if (ve == NULL) {
		if ((ve = zutil_alloc(hdl, sizeof (vdev_entry_t))) == NULL) {
			return (-1);
		}
		ve->ve_guid = top_guid;
		ve->ve_next = pe->pe_vdevs;
		pe->pe_vdevs = ve;
	}

	/*
	 * Third, see if we have a config with a matching transaction group. If
	 * so, then we do nothing. Otherwise, add it to the list of known
	 * configs.
	 */
	for (ce = ve->ve_configs; ce != NULL; ce = ce->ce_next) {
		if (ce->ce_txg == txg)
			break;
	}

	if (ce == NULL) {
		if ((ce = zutil_alloc(hdl, sizeof (config_entry_t))) == NULL) {
			return (-1);
		}
		ce->ce_txg = txg;
		ce->ce_config = fnvlist_dup(config);
		ce->ce_next = ve->ve_configs;
		ve->ve_configs = ce;
	}

	/*
	 * At this point we've successfully added our config to the list of
	 * known configs. The last thing to do is add the vdev guid -> path
	 * mappings so that we can fix up the configuration as necessary before
	 * doing the import.
	 */
	if ((ne = zutil_alloc(hdl, sizeof (name_entry_t))) == NULL)
		return (-1);

	if ((ne->ne_name = zutil_strdup(hdl, path)) == NULL) {
		free(ne);
		return (-1);
	}

	ne->ne_guid = vdev_guid;
	ne->ne_order = order;
	ne->ne_num_labels = num_labels;
	ne->ne_next = pl->names;
	pl->names = ne;

	return (0);
}

static int
zutil_pool_active(libpc_handle_t *hdl, const char *name, uint64_t guid,
    boolean_t *isactive)
{
	ASSERT(hdl->lpc_ops->pco_pool_active != NULL);

	int error = hdl->lpc_ops->pco_pool_active(hdl->lpc_lib_handle, name,
	    guid, isactive);

	return (error);
}

static nvlist_t *
zutil_refresh_config(libpc_handle_t *hdl, nvlist_t *tryconfig)
{
	ASSERT(hdl->lpc_ops->pco_refresh_config != NULL);

	return (hdl->lpc_ops->pco_refresh_config(hdl->lpc_lib_handle,
	    tryconfig));
}

/*
 * Determine if the vdev id is a hole in the namespace.
 */
static boolean_t
vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id)
{
	int c;

	for (c = 0; c < holes; c++) {

		/* Top-level is a hole */
		if (hole_array[c] == id)
			return (B_TRUE);
	}
	return (B_FALSE);
}
/*
 * Convert our list of pools into the definitive set of configurations. We
 * start by picking the best config for each toplevel vdev. Once that's done,
 * we assemble the toplevel vdevs into a full config for the pool. We make a
 * pass to fix up any incorrect paths, and then add it to the main list to
 * return to the user.
 */
static nvlist_t *
get_configs(libpc_handle_t *hdl, pool_list_t *pl, boolean_t active_ok,
    nvlist_t *policy)
{
	pool_entry_t *pe;
	vdev_entry_t *ve;
	config_entry_t *ce;
	nvlist_t *ret = NULL, *config = NULL, *tmp = NULL, *nvtop, *nvroot;
	nvlist_t **spares, **l2cache;
	uint_t i, nspares, nl2cache;
	boolean_t config_seen;
	uint64_t best_txg;
	const char *name, *hostname = NULL;
	uint64_t guid;
	uint_t children = 0;
	nvlist_t **child = NULL;
	uint64_t *hole_array, max_id;
	uint_t c;
	boolean_t isactive;
	nvlist_t *nvl;
	boolean_t valid_top_config = B_FALSE;

	if (nvlist_alloc(&ret, 0, 0) != 0)
		goto nomem;

	for (pe = pl->pools; pe != NULL; pe = pe->pe_next) {
		uint64_t id, max_txg = 0, hostid = 0;
		uint_t holes = 0;

		if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0)
			goto nomem;
		config_seen = B_FALSE;

		/*
		 * Iterate over all toplevel vdevs. Grab the pool configuration
		 * from the first one we find, and then go through the rest and
		 * add them as necessary to the 'vdevs' member of the config.
		 */
		for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) {

			/*
			 * Determine the best configuration for this vdev by
			 * selecting the config with the latest transaction
			 * group.
			 */
			best_txg = 0;
			for (ce = ve->ve_configs; ce != NULL;
			    ce = ce->ce_next) {

				if (ce->ce_txg > best_txg) {
					tmp = ce->ce_config;
					best_txg = ce->ce_txg;
				}
			}

			/*
			 * We rely on the fact that the max txg for the
			 * pool will contain the most up-to-date information
			 * about the valid top-levels in the vdev namespace.
			 */
			if (best_txg > max_txg) {
				(void) nvlist_remove(config,
				    ZPOOL_CONFIG_VDEV_CHILDREN,
				    DATA_TYPE_UINT64);
				(void) nvlist_remove(config,
				    ZPOOL_CONFIG_HOLE_ARRAY,
				    DATA_TYPE_UINT64_ARRAY);

				max_txg = best_txg;
				hole_array = NULL;
				holes = 0;
				max_id = 0;
				valid_top_config = B_FALSE;

				if (nvlist_lookup_uint64(tmp,
				    ZPOOL_CONFIG_VDEV_CHILDREN, &max_id) == 0) {
					verify(nvlist_add_uint64(config,
					    ZPOOL_CONFIG_VDEV_CHILDREN,
					    max_id) == 0);
					valid_top_config = B_TRUE;
				}

				if (nvlist_lookup_uint64_array(tmp,
				    ZPOOL_CONFIG_HOLE_ARRAY, &hole_array,
				    &holes) == 0) {
					verify(nvlist_add_uint64_array(config,
					    ZPOOL_CONFIG_HOLE_ARRAY,
					    hole_array, holes) == 0);
				}
			}

			if (!config_seen) {
				/*
				 * Copy the relevant pieces of data to the pool
				 * configuration:
				 *
				 *	version
				 *	pool guid
				 *	name
				 *	comment (if available)
				 *	compatibility features (if available)
				 *	pool state
				 *	hostid (if available)
				 *	hostname (if available)
				 */
				uint64_t state, version;
				const char *comment = NULL;
				const char *compatibility = NULL;

				version = fnvlist_lookup_uint64(tmp,
				    ZPOOL_CONFIG_VERSION);
				fnvlist_add_uint64(config,
				    ZPOOL_CONFIG_VERSION, version);
				guid = fnvlist_lookup_uint64(tmp,
				    ZPOOL_CONFIG_POOL_GUID);
				fnvlist_add_uint64(config,
				    ZPOOL_CONFIG_POOL_GUID, guid);
				name = fnvlist_lookup_string(tmp,
				    ZPOOL_CONFIG_POOL_NAME);
				fnvlist_add_string(config,
				    ZPOOL_CONFIG_POOL_NAME, name);

				if (nvlist_lookup_string(tmp,
				    ZPOOL_CONFIG_COMMENT, &comment) == 0)
					fnvlist_add_string(config,
					    ZPOOL_CONFIG_COMMENT, comment);

				if (nvlist_lookup_string(tmp,
				    ZPOOL_CONFIG_COMPATIBILITY,
				    &compatibility) == 0)
					fnvlist_add_string(config,
					    ZPOOL_CONFIG_COMPATIBILITY,
					    compatibility);

				state = fnvlist_lookup_uint64(tmp,
				    ZPOOL_CONFIG_POOL_STATE);
				fnvlist_add_uint64(config,
				    ZPOOL_CONFIG_POOL_STATE, state);

				hostid = 0;
				if (nvlist_lookup_uint64(tmp,
				    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
					fnvlist_add_uint64(config,
					    ZPOOL_CONFIG_HOSTID, hostid);
					hostname = fnvlist_lookup_string(tmp,
					    ZPOOL_CONFIG_HOSTNAME);
					fnvlist_add_string(config,
					    ZPOOL_CONFIG_HOSTNAME, hostname);
				}

				config_seen = B_TRUE;
			}
			/*
			 * Add this top-level vdev to the child array.
			 */
			verify(nvlist_lookup_nvlist(tmp,
			    ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0);
			verify(nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_ID,
			    &id) == 0);

			if (id >= children) {
				nvlist_t **newchild;

				newchild = zutil_alloc(hdl, (id + 1) *
				    sizeof (nvlist_t *));
				if (newchild == NULL)
					goto nomem;

				for (c = 0; c < children; c++)
					newchild[c] = child[c];

				free(child);
				child = newchild;
				children = id + 1;
			}
			if (nvlist_dup(nvtop, &child[id], 0) != 0)
				goto nomem;

		}

		/*
		 * If we have information about all the top-levels then
		 * clean up the nvlist which we've constructed. This
		 * means removing any extraneous devices that are
		 * beyond the valid range or adding devices to the end
		 * of our array which appear to be missing.
		 */
		if (valid_top_config) {
			if (max_id < children) {
				for (c = max_id; c < children; c++)
					nvlist_free(child[c]);
				children = max_id;
			} else if (max_id > children) {
				nvlist_t **newchild;

				newchild = zutil_alloc(hdl, (max_id) *
				    sizeof (nvlist_t *));
				if (newchild == NULL)
					goto nomem;

				for (c = 0; c < children; c++)
					newchild[c] = child[c];

				free(child);
				child = newchild;
				children = max_id;
			}
		}

		verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
		    &guid) == 0);

		/*
		 * The vdev namespace may contain holes as a result of
		 * device removal. We must add them back into the vdev
		 * tree before we process any missing devices.
		 */
		if (holes > 0) {
			ASSERT(valid_top_config);

			for (c = 0; c < children; c++) {
				nvlist_t *holey;

				if (child[c] != NULL ||
				    !vdev_is_hole(hole_array, holes, c))
					continue;

				if (nvlist_alloc(&holey, NV_UNIQUE_NAME,
				    0) != 0)
					goto nomem;

				/*
				 * Holes in the namespace are treated as
				 * "hole" top-level vdevs and have a
				 * special flag set on them.
				 */
				if (nvlist_add_string(holey,
				    ZPOOL_CONFIG_TYPE,
				    VDEV_TYPE_HOLE) != 0 ||
				    nvlist_add_uint64(holey,
				    ZPOOL_CONFIG_ID, c) != 0 ||
				    nvlist_add_uint64(holey,
				    ZPOOL_CONFIG_GUID, 0ULL) != 0) {
					nvlist_free(holey);
					goto nomem;
				}
				child[c] = holey;
			}
		}
		/*
		 * Look for any missing top-level vdevs. If this is the case,
		 * create a faked up 'missing' vdev as a placeholder. We cannot
		 * simply compress the child array, because the kernel performs
		 * certain checks to make sure the vdev IDs match their location
		 * in the configuration.
		 */
		for (c = 0; c < children; c++) {
			if (child[c] == NULL) {
				nvlist_t *missing;
				if (nvlist_alloc(&missing, NV_UNIQUE_NAME,
				    0) != 0)
					goto nomem;
				if (nvlist_add_string(missing,
				    ZPOOL_CONFIG_TYPE,
				    VDEV_TYPE_MISSING) != 0 ||
				    nvlist_add_uint64(missing,
				    ZPOOL_CONFIG_ID, c) != 0 ||
				    nvlist_add_uint64(missing,
				    ZPOOL_CONFIG_GUID, 0ULL) != 0) {
					nvlist_free(missing);
					goto nomem;
				}
				child[c] = missing;
			}
		}

		/*
		 * Put all of this pool's top-level vdevs into a root vdev.
		 */
		if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0)
			goto nomem;
		if (nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
		    VDEV_TYPE_ROOT) != 0 ||
		    nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) != 0 ||
		    nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, guid) != 0 ||
		    nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
		    (const nvlist_t **)child, children) != 0) {
			nvlist_free(nvroot);
			goto nomem;
		}

		for (c = 0; c < children; c++)
			nvlist_free(child[c]);
		free(child);
		children = 0;
		child = NULL;

		/*
		 * Go through and fix up any paths and/or devids based on our
		 * known list of vdev GUID -> path mappings.
		 */
		if (fix_paths(hdl, nvroot, pl->names) != 0) {
			nvlist_free(nvroot);
			goto nomem;
		}

		/*
		 * Add the root vdev to this pool's configuration.
		 */
		if (nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
		    nvroot) != 0) {
			nvlist_free(nvroot);
			goto nomem;
		}
		nvlist_free(nvroot);

		/*
		 * zdb uses this path to report on active pools that were
		 * imported or created using -R.
		 */
		if (active_ok)
			goto add_pool;

		/*
		 * Determine if this pool is currently active, in which case we
		 * can't actually import it.
		 */
		verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
		    &name) == 0);
		verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
		    &guid) == 0);

		if (zutil_pool_active(hdl, name, guid, &isactive) != 0)
			goto error;

		if (isactive) {
			nvlist_free(config);
			config = NULL;
			continue;
		}

		if (policy != NULL) {
			if (nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY,
			    policy) != 0)
				goto nomem;
		}

		if ((nvl = zutil_refresh_config(hdl, config)) == NULL) {
			nvlist_free(config);
			config = NULL;
			continue;
		}

		nvlist_free(config);
		config = nvl;

		/*
		 * Go through and update the paths for spares, now that we have
		 * them.
		 */
		verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
		    &nvroot) == 0);
		if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
		    &spares, &nspares) == 0) {
			for (i = 0; i < nspares; i++) {
				if (fix_paths(hdl, spares[i], pl->names) != 0)
					goto nomem;
			}
		}

		/*
		 * Update the paths for l2cache devices.
		 */
		if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
		    &l2cache, &nl2cache) == 0) {
			for (i = 0; i < nl2cache; i++) {
				if (fix_paths(hdl, l2cache[i], pl->names) != 0)
					goto nomem;
			}
		}

		/*
		 * Restore the original information read from the actual label.
		 */
		(void) nvlist_remove(config, ZPOOL_CONFIG_HOSTID,
		    DATA_TYPE_UINT64);
		(void) nvlist_remove(config, ZPOOL_CONFIG_HOSTNAME,
		    DATA_TYPE_STRING);
		if (hostid != 0) {
			verify(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID,
			    hostid) == 0);
			verify(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME,
			    hostname) == 0);
		}

add_pool:
		/*
		 * Add this pool to the list of configs.
		 */
		verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
		    &name) == 0);

		if (nvlist_add_nvlist(ret, name, config) != 0)
			goto nomem;

		nvlist_free(config);
		config = NULL;
	}

	return (ret);

nomem:
	(void) zutil_no_memory(hdl);
error:
	nvlist_free(config);
	nvlist_free(ret);
	for (c = 0; c < children; c++)
		nvlist_free(child[c]);
	free(child);

	return (NULL);
}
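/*
 * Example of the txg selection performed above: if one top-level vdev
 * contributed configs with txgs 118, 121 and 120 (say, read from three
 * paths to the same disk), the txg 121 config is chosen for that vdev.
 * If 121 is also the pool-wide maximum, that config additionally supplies
 * ZPOOL_CONFIG_VDEV_CHILDREN and ZPOOL_CONFIG_HOLE_ARRAY for the pool.
 */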
/*
 * Return the offset of the given label.
 */
static uint64_t
label_offset(uint64_t size, int l)
{
	ASSERT(P2PHASE_TYPED(size, sizeof (vdev_label_t), uint64_t) == 0);
	return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
	    0 : size - VDEV_LABELS * sizeof (vdev_label_t)));
}
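/*
 * Worked example: sizeof (vdev_label_t) is 256 KiB and VDEV_LABELS is 4,
 * so for a device whose aligned size is 1 GiB the labels sit at:
 *
 *	label 0:	0
 *	label 1:	256 KiB
 *	label 2:	1 GiB - 512 KiB
 *	label 3:	1 GiB - 256 KiB
 *
 * i.e. two copies at the front of the device and two at the back.
 */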
/*
 * The same description as for zpool_read_label() below applies here,
 * except that we read the labels without AIO, presumably because an AIO
 * call failed in a way that falling back to synchronous reads may
 * circumvent.
 */
static int
zpool_read_label_slow(int fd, nvlist_t **config, int *num_labels)
{
	struct stat64 statbuf;
	int l, count = 0;
	vdev_phys_t *label;
	nvlist_t *expected_config = NULL;
	uint64_t expected_guid = 0, size;

	*config = NULL;

	if (fstat64_blk(fd, &statbuf) == -1)
		return (0);
	size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);

	label = (vdev_phys_t *)umem_alloc_aligned(sizeof (*label), PAGESIZE,
	    UMEM_DEFAULT);
	if (label == NULL)
		return (-1);

	for (l = 0; l < VDEV_LABELS; l++) {
		uint64_t state, guid, txg;
		off_t offset = label_offset(size, l) + VDEV_SKIP_SIZE;

		if (pread64(fd, label, sizeof (vdev_phys_t),
		    offset) != sizeof (vdev_phys_t))
			continue;

		if (nvlist_unpack(label->vp_nvlist,
		    sizeof (label->vp_nvlist), config, 0) != 0)
			continue;

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_GUID,
		    &guid) != 0 || guid == 0) {
			nvlist_free(*config);
			continue;
		}

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 || state > POOL_STATE_L2CACHE) {
			nvlist_free(*config);
			continue;
		}

		if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
		    (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0 || txg == 0)) {
			nvlist_free(*config);
			continue;
		}

		if (expected_guid) {
			if (expected_guid == guid)
				count++;

			nvlist_free(*config);
		} else {
			expected_config = *config;
			expected_guid = guid;
			count++;
		}
	}

	if (num_labels != NULL)
		*num_labels = count;

	umem_free_aligned(label, sizeof (*label));
	*config = expected_config;

	return (0);
}

/*
 * Given a file descriptor, read the label information and return an nvlist
 * describing the configuration, if there is one. The number of valid
 * labels found will be returned in num_labels when non-NULL.
 */
int
zpool_read_label(int fd, nvlist_t **config, int *num_labels)
{
#ifndef HAVE_AIO_H
	return (zpool_read_label_slow(fd, config, num_labels));
#else
	struct stat64 statbuf;
	struct aiocb aiocbs[VDEV_LABELS];
	struct aiocb *aiocbps[VDEV_LABELS];
	vdev_phys_t *labels;
	nvlist_t *expected_config = NULL;
	uint64_t expected_guid = 0, size;
	int error, l, count = 0;

	*config = NULL;

	if (fstat64_blk(fd, &statbuf) == -1)
		return (0);
	size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);

	labels = (vdev_phys_t *)umem_alloc_aligned(
	    VDEV_LABELS * sizeof (*labels), PAGESIZE, UMEM_DEFAULT);
	if (labels == NULL)
		return (-1);

	memset(aiocbs, 0, sizeof (aiocbs));
	for (l = 0; l < VDEV_LABELS; l++) {
		off_t offset = label_offset(size, l) + VDEV_SKIP_SIZE;

		aiocbs[l].aio_fildes = fd;
		aiocbs[l].aio_offset = offset;
		aiocbs[l].aio_buf = &labels[l];
		aiocbs[l].aio_nbytes = sizeof (vdev_phys_t);
		aiocbs[l].aio_lio_opcode = LIO_READ;
		aiocbps[l] = &aiocbs[l];
	}

	if (lio_listio(LIO_WAIT, aiocbps, VDEV_LABELS, NULL) != 0) {
		int saved_errno = errno;
		boolean_t do_slow = B_FALSE;
		error = -1;

		if (errno == EAGAIN || errno == EINTR || errno == EIO) {
			/*
			 * A portion of the requests may have been submitted.
			 * Clean them up.
			 */
			for (l = 0; l < VDEV_LABELS; l++) {
				errno = 0;
				switch (aio_error(&aiocbs[l])) {
				case EINVAL:
					break;
				case EINPROGRESS:
					/*
					 * This shouldn't be possible to
					 * encounter, die if we do.
					 */
					ASSERT(B_FALSE);
					zfs_fallthrough;
				case EREMOTEIO:
					/*
					 * May be returned by an NVMe device
					 * which is visible in /dev/ but, due
					 * to a low-level format change or
					 * other error, needs to be rescanned.
					 * Try the slow method.
					 */
					zfs_fallthrough;
				case EAGAIN:
				case EOPNOTSUPP:
				case ENOSYS:
					do_slow = B_TRUE;
					zfs_fallthrough;
				case 0:
				default:
					(void) aio_return(&aiocbs[l]);
				}
			}
		}
		if (do_slow) {
			/*
			 * At least some of the I/O involved files that are
			 * unsafe for AIO. Try again without AIO this time.
			 */
			error = zpool_read_label_slow(fd, config, num_labels);
			saved_errno = errno;
		}
		umem_free_aligned(labels, VDEV_LABELS * sizeof (*labels));
		errno = saved_errno;
		return (error);
	}
	for (l = 0; l < VDEV_LABELS; l++) {
		uint64_t state, guid, txg;

		if (aio_return(&aiocbs[l]) != sizeof (vdev_phys_t))
			continue;

		if (nvlist_unpack(labels[l].vp_nvlist,
		    sizeof (labels[l].vp_nvlist), config, 0) != 0)
			continue;

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_GUID,
		    &guid) != 0 || guid == 0) {
			nvlist_free(*config);
			continue;
		}

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 || state > POOL_STATE_L2CACHE) {
			nvlist_free(*config);
			continue;
		}

		if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
		    (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0 || txg == 0)) {
			nvlist_free(*config);
			continue;
		}

		if (expected_guid) {
			if (expected_guid == guid)
				count++;

			nvlist_free(*config);
		} else {
			expected_config = *config;
			expected_guid = guid;
			count++;
		}
	}

	if (num_labels != NULL)
		*num_labels = count;

	umem_free_aligned(labels, VDEV_LABELS * sizeof (*labels));
	*config = expected_config;

	return (0);
#endif
}
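/*
 * Usage sketch for zpool_read_label() (the device path is hypothetical):
 *
 *	nvlist_t *config = NULL;
 *	int num_labels = 0;
 *	int fd = open("/dev/sda1", O_RDONLY | O_CLOEXEC);
 *
 *	if (fd >= 0) {
 *		if (zpool_read_label(fd, &config, &num_labels) == 0 &&
 *		    config != NULL) {
 *			(inspect config, e.g. ZPOOL_CONFIG_POOL_NAME)
 *			nvlist_free(config);
 *		}
 *		(void) close(fd);
 *	}
 *
 * A return of 0 with *config == NULL simply means no valid label was
 * found; -1 indicates an allocation or I/O submission failure.
 */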
/*
 * Sorted by full path and then vdev guid to allow for multiple entries with
 * the same full path name. This is required because it's possible to
 * have multiple block devices with labels that refer to the same
 * ZPOOL_CONFIG_PATH yet have different vdev guids. In this case both
 * entries need to be added to the cache. Scenarios where this can occur
 * include overwritten pool labels, devices which are visible from multiple
 * hosts and multipath devices.
 */
int
slice_cache_compare(const void *arg1, const void *arg2)
{
	const char *nm1 = ((rdsk_node_t *)arg1)->rn_name;
	const char *nm2 = ((rdsk_node_t *)arg2)->rn_name;
	uint64_t guid1 = ((rdsk_node_t *)arg1)->rn_vdev_guid;
	uint64_t guid2 = ((rdsk_node_t *)arg2)->rn_vdev_guid;
	int rv;

	rv = TREE_ISIGN(strcmp(nm1, nm2));
	if (rv)
		return (rv);

	return (TREE_CMP(guid1, guid2));
}

static int
label_paths_impl(libpc_handle_t *hdl, nvlist_t *nvroot, uint64_t pool_guid,
    uint64_t vdev_guid, const char **path, const char **devid)
{
	nvlist_t **child;
	uint_t c, children;
	uint64_t guid;
	const char *val;
	int error;

	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) == 0) {
		for (c = 0; c < children; c++) {
			error = label_paths_impl(hdl, child[c],
			    pool_guid, vdev_guid, path, devid);
			if (error)
				return (error);
		}
		return (0);
	}

	if (nvroot == NULL)
		return (0);

	error = nvlist_lookup_uint64(nvroot, ZPOOL_CONFIG_GUID, &guid);
	if ((error != 0) || (guid != vdev_guid))
		return (0);

	error = nvlist_lookup_string(nvroot, ZPOOL_CONFIG_PATH, &val);
	if (error == 0)
		*path = val;

	error = nvlist_lookup_string(nvroot, ZPOOL_CONFIG_DEVID, &val);
	if (error == 0)
		*devid = val;

	return (0);
}
/*
 * Given a disk label fetch the ZPOOL_CONFIG_PATH and ZPOOL_CONFIG_DEVID
 * and store these strings in the path and devid arguments respectively.
 * The returned pointers are only valid as long as label remains valid.
 */
int
label_paths(libpc_handle_t *hdl, nvlist_t *label, const char **path,
    const char **devid)
{
	nvlist_t *nvroot;
	uint64_t pool_guid;
	uint64_t vdev_guid;
	uint64_t state;

	*path = NULL;
	*devid = NULL;
	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &vdev_guid) != 0)
		return (ENOENT);

	/*
	 * In case of spare or l2cache, we directly return path/devid from the
	 * label.
	 */
	if (!(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state)) &&
	    (state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE)) {
		(void) nvlist_lookup_string(label, ZPOOL_CONFIG_PATH, path);
		(void) nvlist_lookup_string(label, ZPOOL_CONFIG_DEVID, devid);
		return (0);
	}

	if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
		return (ENOENT);

	return (label_paths_impl(hdl, nvroot, pool_guid, vdev_guid, path,
	    devid));
}
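/*
 * Usage sketch (the label nvlist would typically come from
 * zpool_read_label() above):
 *
 *	const char *path = NULL, *devid = NULL;
 *
 *	if (label_paths(hdl, label, &path, &devid) == 0 && path != NULL)
 *		name = zutil_strdup(hdl, path);
 *	nvlist_free(label);	(pointers into the label are now stale)
 *
 * Note the copy: because the returned strings point into the label
 * nvlist, they must be duplicated before the label is freed.
 */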
static void
zpool_find_import_scan_add_slice(libpc_handle_t *hdl, pthread_mutex_t *lock,
    avl_tree_t *cache, const char *path, const char *name, int order)
{
	avl_index_t where;
	rdsk_node_t *slice;

	slice = zutil_alloc(hdl, sizeof (rdsk_node_t));
	if (asprintf(&slice->rn_name, "%s/%s", path, name) == -1) {
		free(slice);
		return;
	}
	slice->rn_vdev_guid = 0;
	slice->rn_lock = lock;
	slice->rn_avl = cache;
	slice->rn_hdl = hdl;
	slice->rn_order = order + IMPORT_ORDER_SCAN_OFFSET;
	slice->rn_labelpaths = B_FALSE;

	pthread_mutex_lock(lock);
	if (avl_find(cache, slice, &where)) {
		free(slice->rn_name);
		free(slice);
	} else {
		avl_insert(cache, slice, where);
	}
	pthread_mutex_unlock(lock);
}

static int
zpool_find_import_scan_dir(libpc_handle_t *hdl, pthread_mutex_t *lock,
    avl_tree_t *cache, const char *dir, int order)
{
	int error;
	char path[MAXPATHLEN];
	struct dirent64 *dp;
	DIR *dirp;

	if (realpath(dir, path) == NULL) {
		error = errno;
		if (error == ENOENT)
			return (0);

		zutil_error_aux(hdl, "%s", zfs_strerror(error));
		(void) zutil_error_fmt(hdl, LPC_BADPATH, dgettext(TEXT_DOMAIN,
		    "cannot resolve path '%s'"), dir);
		return (error);
	}

	dirp = opendir(path);
	if (dirp == NULL) {
		error = errno;
		zutil_error_aux(hdl, "%s", zfs_strerror(error));
		(void) zutil_error_fmt(hdl, LPC_BADPATH, dgettext(TEXT_DOMAIN,
		    "cannot open '%s'"), path);
		return (error);
	}

	while ((dp = readdir64(dirp)) != NULL) {
		const char *name = dp->d_name;
		if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
			continue;

		switch (dp->d_type) {
		case DT_UNKNOWN:
		case DT_BLK:
		case DT_LNK:
#ifdef __FreeBSD__
		case DT_CHR:
#endif
		case DT_REG:
			break;
		default:
			continue;
		}

		zpool_find_import_scan_add_slice(hdl, lock, cache, path, name,
		    order);
	}

	(void) closedir(dirp);
	return (0);
}

static int
zpool_find_import_scan_path(libpc_handle_t *hdl, pthread_mutex_t *lock,
    avl_tree_t *cache, const char *dir, int order)
{
	int error = 0;
	char path[MAXPATHLEN];
	char *d = NULL;
	ssize_t dl;
	const char *dpath, *name;

	/*
	 * Separate the directory and the basename.
	 * We do this so that we can get the realpath of
	 * the directory. We don't take the realpath of the
	 * whole path because, if it's a symlink, we want the
	 * path of the symlink itself, not of its target.
	 */
	name = zfs_basename(dir);
	if ((dl = zfs_dirnamelen(dir)) == -1)
		dpath = ".";
	else
		dpath = d = zutil_strndup(hdl, dir, dl);

	if (realpath(dpath, path) == NULL) {
		error = errno;
		if (error == ENOENT) {
			error = 0;
			goto out;
		}

		zutil_error_aux(hdl, "%s", zfs_strerror(error));
		(void) zutil_error_fmt(hdl, LPC_BADPATH, dgettext(TEXT_DOMAIN,
		    "cannot resolve path '%s'"), dir);
		goto out;
	}

	zpool_find_import_scan_add_slice(hdl, lock, cache, path, name, order);

out:
	free(d);
	return (error);
}

/*
 * Scan a list of directories for zfs devices.
 */
static int
zpool_find_import_scan(libpc_handle_t *hdl, pthread_mutex_t *lock,
    avl_tree_t **slice_cache, const char * const *dir, size_t dirs)
{
	avl_tree_t *cache;
	rdsk_node_t *slice;
	void *cookie;
	int i, error;

	*slice_cache = NULL;
	cache = zutil_alloc(hdl, sizeof (avl_tree_t));
	avl_create(cache, slice_cache_compare, sizeof (rdsk_node_t),
	    offsetof(rdsk_node_t, rn_node));

	for (i = 0; i < dirs; i++) {
		struct stat sbuf;

		if (stat(dir[i], &sbuf) != 0) {
			error = errno;
			if (error == ENOENT)
				continue;

			zutil_error_aux(hdl, "%s", zfs_strerror(error));
			(void) zutil_error_fmt(hdl, LPC_BADPATH, dgettext(
			    TEXT_DOMAIN, "cannot resolve path '%s'"), dir[i]);
			goto error;
		}

		/*
		 * If dir[i] is a directory, we walk through it and add all
		 * the entries to the cache. If it's not a directory, we just
		 * add it to the cache.
		 */
		if (S_ISDIR(sbuf.st_mode)) {
			if ((error = zpool_find_import_scan_dir(hdl, lock,
			    cache, dir[i], i)) != 0)
				goto error;
		} else {
			if ((error = zpool_find_import_scan_path(hdl, lock,
			    cache, dir[i], i)) != 0)
				goto error;
		}
	}

	*slice_cache = cache;
	return (0);

error:
	cookie = NULL;
	while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) {
		free(slice->rn_name);
		free(slice);
	}
	free(cache);

	return (error);
}
/*
 * Given a list of directories to search, find all pools stored on disk. This
 * includes partial pools which are not available to import. If no args are
 * given (argc is 0), then the default directory (/dev/dsk) is searched.
 * poolname or guid (but not both) are provided by the caller when trying
 * to import a specific pool.
 */
static nvlist_t *
zpool_find_import_impl(libpc_handle_t *hdl, importargs_t *iarg,
    pthread_mutex_t *lock, avl_tree_t *cache)
{
	(void) lock;
	nvlist_t *ret = NULL;
	pool_list_t pools = { 0 };
	pool_entry_t *pe, *penext;
	vdev_entry_t *ve, *venext;
	config_entry_t *ce, *cenext;
	name_entry_t *ne, *nenext;
	rdsk_node_t *slice;
	void *cookie;
	tpool_t *t;

	verify(iarg->poolname == NULL || iarg->guid == 0);

	/*
	 * Create a thread pool to parallelize the process of reading and
	 * validating labels; a large number of threads can be used because
	 * contention is minimal.
	 */
	long threads = 2 * sysconf(_SC_NPROCESSORS_ONLN);
#ifdef HAVE_AIO_H
	long am;
#ifdef _SC_AIO_LISTIO_MAX
	am = sysconf(_SC_AIO_LISTIO_MAX);
	if (am >= VDEV_LABELS)
		threads = MIN(threads, am / VDEV_LABELS);
#endif
#ifdef _SC_AIO_MAX
	am = sysconf(_SC_AIO_MAX);
	if (am >= VDEV_LABELS)
		threads = MIN(threads, am / VDEV_LABELS);
#endif
#endif
	t = tpool_create(1, threads, 0, NULL);
	for (slice = avl_first(cache); slice;
	    (slice = avl_walk(cache, slice, AVL_AFTER)))
		(void) tpool_dispatch(t, zpool_open_func, slice);

	tpool_wait(t);
	tpool_destroy(t);

	/*
	 * Process the cache, filtering out any entries which are not
	 * for the specified pool then adding matching label configs.
	 */
	cookie = NULL;
	while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) {
		if (slice->rn_config != NULL) {
			nvlist_t *config = slice->rn_config;
			boolean_t matched = B_TRUE;
			boolean_t aux = B_FALSE;
			int fd;

			/*
			 * Check if it's a spare or l2cache device. If it is,
			 * we need to skip the name and guid check since they
			 * don't exist on aux device labels.
			 */
			if (iarg->poolname != NULL || iarg->guid != 0) {
				uint64_t state;
				aux = nvlist_lookup_uint64(config,
				    ZPOOL_CONFIG_POOL_STATE, &state) == 0 &&
				    (state == POOL_STATE_SPARE ||
				    state == POOL_STATE_L2CACHE);
			}

			if (iarg->poolname != NULL && !aux) {
				const char *pname;

				matched = nvlist_lookup_string(config,
				    ZPOOL_CONFIG_POOL_NAME, &pname) == 0 &&
				    strcmp(iarg->poolname, pname) == 0;
			} else if (iarg->guid != 0 && !aux) {
				uint64_t this_guid;

				matched = nvlist_lookup_uint64(config,
				    ZPOOL_CONFIG_POOL_GUID, &this_guid) == 0 &&
				    iarg->guid == this_guid;
			}
			if (matched) {
				/*
				 * Verify all remaining entries can be opened
				 * exclusively. This will prune all underlying
				 * multipath devices which otherwise could
				 * result in the vdev appearing as UNAVAIL.
				 *
				 * Under zdb, this step isn't required and
				 * would prevent a zdb -e of active pools with
				 * no cachefile.
				 */
				fd = open(slice->rn_name,
				    O_RDONLY | O_EXCL | O_CLOEXEC);
				if (fd >= 0 || iarg->can_be_active) {
					if (fd >= 0)
						close(fd);
					add_config(hdl, &pools,
					    slice->rn_name, slice->rn_order,
					    slice->rn_num_labels, config);
				}
			}
			nvlist_free(config);
		}
		free(slice->rn_name);
		free(slice);
	}
	avl_destroy(cache);
	free(cache);

	ret = get_configs(hdl, &pools, iarg->can_be_active, iarg->policy);

	for (pe = pools.pools; pe != NULL; pe = penext) {
		penext = pe->pe_next;
		for (ve = pe->pe_vdevs; ve != NULL; ve = venext) {
			venext = ve->ve_next;
			for (ce = ve->ve_configs; ce != NULL; ce = cenext) {
				cenext = ce->ce_next;
				nvlist_free(ce->ce_config);
				free(ce);
			}
			free(ve);
		}
		free(pe);
	}

	for (ne = pools.names; ne != NULL; ne = nenext) {
		nenext = ne->ne_next;
		free(ne->ne_name);
		free(ne);
	}

	return (ret);
}
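/*
 * Worked example for the thread-pool sizing in zpool_find_import_impl():
 * on a 16-CPU machine with _SC_AIO_LISTIO_MAX = 64 and _SC_AIO_MAX = 1024,
 * the candidates are 2 * 16 = 32, 64 / 4 = 16 and 1024 / 4 = 256, so 16
 * threads are used; each thread reads one device's VDEV_LABELS labels at a
 * time, keeping the process within the system-wide AIO limits.
 */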
/*
 * Given a config, discover the paths for the devices which
 * exist in the config.
 */
static int
discover_cached_paths(libpc_handle_t *hdl, nvlist_t *nv,
    avl_tree_t *cache, pthread_mutex_t *lock)
{
	const char *path = NULL;
	ssize_t dl;
	uint_t children;
	nvlist_t **child;

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) == 0) {
		for (int c = 0; c < children; c++) {
			discover_cached_paths(hdl, child[c], cache, lock);
		}
	}

	/*
	 * Once we have the path, we need to add the directory to
	 * our directory cache.
	 */
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) {
		int ret;
		char c = '\0';
		if ((dl = zfs_dirnamelen(path)) == -1) {
			path = ".";
		} else {
			c = path[dl];
			((char *)path)[dl] = '\0';
		}
		ret = zpool_find_import_scan_dir(hdl, lock, cache,
		    path, 0);
		if (c != '\0')
			((char *)path)[dl] = c;

		return (ret);
	}
	return (0);
}
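/*
 * Example: a cached vdev path of /dev/disk/by-id/ata-DISK-part1 causes
 * the whole /dev/disk/by-id directory to be added to the scan cache, so
 * the pool can still be found if the device now appears under a
 * different name in that directory.  (The path shown is hypothetical.)
 */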
/*
 * Given a cache file, return the contents as a list of importable pools.
 * poolname or guid (but not both) are provided by the caller when trying
 * to import a specific pool.
 */
static nvlist_t *
zpool_find_import_cached(libpc_handle_t *hdl, importargs_t *iarg)
{
	char *buf;
	int fd;
	struct stat64 statbuf;
	nvlist_t *raw, *src, *dst;
	nvlist_t *pools;
	nvpair_t *elem;
	const char *name;
	uint64_t this_guid;
	boolean_t active;

	verify(iarg->poolname == NULL || iarg->guid == 0);

	if ((fd = open(iarg->cachefile, O_RDONLY | O_CLOEXEC)) < 0) {
		zutil_error_aux(hdl, "%s", zfs_strerror(errno));
		(void) zutil_error(hdl, LPC_BADCACHE, dgettext(TEXT_DOMAIN,
		    "failed to open cache file"));
		return (NULL);
	}

	if (fstat64(fd, &statbuf) != 0) {
		zutil_error_aux(hdl, "%s", zfs_strerror(errno));
		(void) close(fd);
		(void) zutil_error(hdl, LPC_BADCACHE, dgettext(TEXT_DOMAIN,
		    "failed to get size of cache file"));
		return (NULL);
	}

	if ((buf = zutil_alloc(hdl, statbuf.st_size)) == NULL) {
		(void) close(fd);
		return (NULL);
	}

	if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
		(void) close(fd);
		free(buf);
		(void) zutil_error(hdl, LPC_BADCACHE, dgettext(TEXT_DOMAIN,
		    "failed to read cache file contents"));
		return (NULL);
	}

	(void) close(fd);

	if (nvlist_unpack(buf, statbuf.st_size, &raw, 0) != 0) {
		free(buf);
		(void) zutil_error(hdl, LPC_BADCACHE, dgettext(TEXT_DOMAIN,
		    "invalid or corrupt cache file contents"));
		return (NULL);
	}

	free(buf);

	/*
	 * Go through and get the current state of the pools and refresh their
	 * state.
	 */
	if (nvlist_alloc(&pools, 0, 0) != 0) {
		(void) zutil_no_memory(hdl);
		nvlist_free(raw);
		return (NULL);
	}

	elem = NULL;
	while ((elem = nvlist_next_nvpair(raw, elem)) != NULL) {
		src = fnvpair_value_nvlist(elem);

		name = fnvlist_lookup_string(src, ZPOOL_CONFIG_POOL_NAME);
		if (iarg->poolname != NULL && strcmp(iarg->poolname, name) != 0)
			continue;

		this_guid = fnvlist_lookup_uint64(src, ZPOOL_CONFIG_POOL_GUID);
		if (iarg->guid != 0 && iarg->guid != this_guid)
			continue;

		if (zutil_pool_active(hdl, name, this_guid, &active) != 0) {
			nvlist_free(raw);
			nvlist_free(pools);
			return (NULL);
		}

		if (active)
			continue;

		if (iarg->scan) {
			uint64_t saved_guid = iarg->guid;
			const char *saved_poolname = iarg->poolname;
			pthread_mutex_t lock;

			/*
			 * Create the device cache that will hold the
			 * devices we will scan based on the cachefile.
			 * This will get destroyed and freed by
			 * zpool_find_import_impl.
			 */
			avl_tree_t *cache = zutil_alloc(hdl,
			    sizeof (avl_tree_t));
			avl_create(cache, slice_cache_compare,
			    sizeof (rdsk_node_t),
			    offsetof(rdsk_node_t, rn_node));
			nvlist_t *nvroot = fnvlist_lookup_nvlist(src,
			    ZPOOL_CONFIG_VDEV_TREE);

			/*
			 * We only want to find the pool with this_guid.
			 * We will reset these values back later.
			 */
			iarg->guid = this_guid;
			iarg->poolname = NULL;

			/*
			 * We need to build up a cache of devices that exists
			 * in the paths pointed to by the cachefile. This allows
			 * us to preserve the device namespace that was
			 * originally specified by the user but also lets us
			 * scan devices in those directories in case they had
			 * been renamed.
			 */
			pthread_mutex_init(&lock, NULL);
			discover_cached_paths(hdl, nvroot, cache, &lock);
			nvlist_t *nv = zpool_find_import_impl(hdl, iarg,
			    &lock, cache);
			pthread_mutex_destroy(&lock);

			/*
			 * zpool_find_import_impl will return back
			 * a list of pools that it found based on the
			 * device cache. There should only be one pool
			 * since we're looking for a specific guid.
			 * We will use that pool to build up the final
			 * pool nvlist which is returned back to the
			 * caller.
			 */
			nvpair_t *pair = nvlist_next_nvpair(nv, NULL);
			if (pair == NULL)
				continue;
			fnvlist_add_nvlist(pools, nvpair_name(pair),
			    fnvpair_value_nvlist(pair));

			VERIFY3P(nvlist_next_nvpair(nv, pair), ==, NULL);

			iarg->guid = saved_guid;
			iarg->poolname = saved_poolname;
			continue;
		}

		if (nvlist_add_string(src, ZPOOL_CONFIG_CACHEFILE,
		    iarg->cachefile) != 0) {
			(void) zutil_no_memory(hdl);
			nvlist_free(raw);
			nvlist_free(pools);
			return (NULL);
		}

		update_vdevs_config_dev_sysfs_path(src);

		if ((dst = zutil_refresh_config(hdl, src)) == NULL) {
			nvlist_free(raw);
			nvlist_free(pools);
			return (NULL);
		}

		if (nvlist_add_nvlist(pools, nvpair_name(elem), dst) != 0) {
			(void) zutil_no_memory(hdl);
			nvlist_free(dst);
			nvlist_free(raw);
			nvlist_free(pools);
			return (NULL);
		}
		nvlist_free(dst);
	}
	nvlist_free(raw);
	return (pools);
}

static nvlist_t *
zpool_find_import(libpc_handle_t *hdl, importargs_t *iarg)
{
	pthread_mutex_t lock;
	avl_tree_t *cache;
	nvlist_t *pools = NULL;

	verify(iarg->poolname == NULL || iarg->guid == 0);
	pthread_mutex_init(&lock, NULL);

	/*
	 * Locate pool member vdevs by blkid or by directory scanning.
	 * On success a newly allocated AVL tree which is populated with an
	 * entry for each discovered vdev will be returned in the cache.
	 * It's the caller's responsibility to consume and destroy this tree.
	 */
	if (iarg->scan || iarg->paths != 0) {
		size_t dirs = iarg->paths;
		const char * const *dir = (const char * const *)iarg->path;

		if (dirs == 0)
			dir = zpool_default_search_paths(&dirs);

		if (zpool_find_import_scan(hdl, &lock, &cache,
		    dir, dirs) != 0) {
			pthread_mutex_destroy(&lock);
			return (NULL);
		}
	} else {
		if (zpool_find_import_blkid(hdl, &lock, &cache) != 0) {
			pthread_mutex_destroy(&lock);
			return (NULL);
		}
	}

	pools = zpool_find_import_impl(hdl, iarg, &lock, cache);
	pthread_mutex_destroy(&lock);
	return (pools);
}


nvlist_t *
zpool_search_import(libpc_handle_t *hdl, importargs_t *import)
{
	nvlist_t *pools = NULL;

	verify(import->poolname == NULL || import->guid == 0);

	if (import->cachefile != NULL)
		pools = zpool_find_import_cached(hdl, import);
	else
		pools = zpool_find_import(hdl, import);

	if ((pools == NULL || nvlist_empty(pools)) &&
	    hdl->lpc_open_access_error && geteuid() != 0) {
		(void) zutil_error(hdl, LPC_EACCESS, dgettext(TEXT_DOMAIN,
		    "no pools found"));
	}

	return (pools);
}
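/*
 * Usage sketch for zpool_search_import(): enumerate importable pools by
 * scanning the default search paths (handle setup is library-specific
 * and omitted here):
 *
 *	importargs_t args = { 0 };
 *	nvpair_t *elem = NULL;
 *	nvlist_t *pools;
 *
 *	args.scan = B_TRUE;
 *	pools = zpool_search_import(hdl, &args);
 *	while (pools != NULL &&
 *	    (elem = nvlist_next_nvpair(pools, elem)) != NULL)
 *		(void) printf("importable: %s\n", nvpair_name(elem));
 *	nvlist_free(pools);
 */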
static boolean_t
pool_match(nvlist_t *cfg, const char *tgt)
{
	uint64_t v, guid = strtoull(tgt, NULL, 0);
	const char *s;

	if (guid != 0) {
		if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0)
			return (v == guid);
	} else {
		if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0)
			return (strcmp(s, tgt) == 0);
	}
	return (B_FALSE);
}

int
zpool_find_config(libpc_handle_t *hdl, const char *target, nvlist_t **configp,
    importargs_t *args)
{
	nvlist_t *pools;
	nvlist_t *match = NULL;
	nvlist_t *config = NULL;
	char *sepp = NULL;
	int count = 0;
	char *targetdup = strdup(target);

	if (targetdup == NULL)
		return (ENOMEM);

	*configp = NULL;

	if ((sepp = strpbrk(targetdup, "/@")) != NULL)
		*sepp = '\0';

	pools = zpool_search_import(hdl, args);

	if (pools != NULL) {
		nvpair_t *elem = NULL;
		while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) {
			VERIFY0(nvpair_value_nvlist(elem, &config));
			if (pool_match(config, targetdup)) {
				count++;
				if (match != NULL) {
					/* multiple matches found */
					continue;
				} else {
					match = fnvlist_dup(config);
				}
			}
		}
		fnvlist_free(pools);
	}

	if (count == 0) {
		free(targetdup);
		return (ENOENT);
	}

	if (count > 1) {
		free(targetdup);
		fnvlist_free(match);
		return (EINVAL);
	}

	*configp = match;
	free(targetdup);

	return (0);
}
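/*
 * Usage sketch for zpool_find_config(): look up a single pool by name or
 * guid.  A dataset or snapshot suffix in the target ("tank/fs@snap") is
 * trimmed at the first '/' or '@' before matching:
 *
 *	nvlist_t *config = NULL;
 *	importargs_t args = { 0 };
 *
 *	if (zpool_find_config(hdl, "tank", &config, &args) == 0) {
 *		(config is a private copy owned by the caller)
 *		fnvlist_free(config);
 *	}
 *
 * ENOENT means no pool matched; EINVAL means the target was ambiguous
 * (more than one pool matched).
 */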
/* Return whether a vdev is a leaf vdev. Note: draid spares are leaf vdevs. */
static boolean_t
vdev_is_leaf(nvlist_t *nv)
{
	uint_t children = 0;
	nvlist_t **child;

	(void) nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	return (children == 0);
}

/* Return whether a vdev is a leaf vdev and a real device (disk or file) */
static boolean_t
vdev_is_real_leaf(nvlist_t *nv)
{
	const char *type = NULL;
	if (!vdev_is_leaf(nv))
		return (B_FALSE);

	(void) nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type);

	/* Guard against a malformed vdev nvlist with no type. */
	if (type == NULL)
		return (B_FALSE);

	if ((strcmp(type, VDEV_TYPE_DISK) == 0) ||
	    (strcmp(type, VDEV_TYPE_FILE) == 0)) {
		return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * This function is called by our FOR_EACH_VDEV() macros.
 *
 * state:	State machine status (stored inside of a (nvlist_t *))
 * nv:		The current vdev nvlist_t we are iterating over.
 * last_nv:	The previous vdev nvlist_t we returned to the user in
 *		the last iteration of FOR_EACH_VDEV(). We use it
 *		to find the next vdev nvlist_t we should return.
 * real_leaves_only: Only return leaf vdevs.
 *
 * Returns 1 if we found the next vdev nvlist_t for this iteration. 0 if
 * we're still searching for it.
 */
static int
__for_each_vdev_macro_helper_func(void *state, nvlist_t *nv, void *last_nv,
    boolean_t real_leaves_only)
{
	enum {FIRST_NV = 0, NEXT_IS_MATCH = 1, STOP_LOOKING = 2};

	/* The very first entry in the NV list is a special case */
	if (*((nvlist_t **)state) == (nvlist_t *)FIRST_NV) {
		if (real_leaves_only && !vdev_is_real_leaf(nv))
			return (0);

		*((nvlist_t **)last_nv) = nv;
		*((nvlist_t **)state) = (nvlist_t *)STOP_LOOKING;
		return (1);
	}

	/*
	 * We came across our last_nv, meaning the next one is the one we
	 * want.
	 */
	if (nv == *((nvlist_t **)last_nv)) {
		/* Next iteration of this function will return the nvlist_t */
		*((nvlist_t **)state) = (nvlist_t *)NEXT_IS_MATCH;
		return (0);
	}

	/*
	 * We marked NEXT_IS_MATCH on the previous iteration, so this is the
	 * one we want.
	 */
	if (*(nvlist_t **)state == (nvlist_t *)NEXT_IS_MATCH) {
		if (real_leaves_only && !vdev_is_real_leaf(nv))
			return (0);

		*((nvlist_t **)last_nv) = nv;
		*((nvlist_t **)state) = (nvlist_t *)STOP_LOOKING;
		return (1);
	}

	return (0);
}

int
for_each_vdev_macro_helper_func(void *state, nvlist_t *nv, void *last_nv)
{
	return (__for_each_vdev_macro_helper_func(state, nv, last_nv, B_FALSE));
}

int
for_each_real_leaf_vdev_macro_helper_func(void *state, nvlist_t *nv,
    void *last_nv)
{
	return (__for_each_vdev_macro_helper_func(state, nv, last_nv, B_TRUE));
}

/*
 * Internal function for iterating over the vdevs.
 *
 * For each vdev, func() will be called and will be passed 'zhp' (which is
 * typically the zpool_handle_t cast as a void pointer), the vdev's nvlist,
 * and a user-defined data pointer.
 *
 * The return values from all the func() calls will be OR'd together and
 * returned.
 */
int
for_each_vdev_cb(void *zhp, nvlist_t *nv, pool_vdev_iter_f func,
    void *data)
{
	nvlist_t **child;
	uint_t c, children;
	int ret = 0;
	int i;
	const char *type;

	const char *list[] = {
	    ZPOOL_CONFIG_SPARES,
	    ZPOOL_CONFIG_L2CACHE,
	    ZPOOL_CONFIG_CHILDREN
	};

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
		return (ret);

	/* Don't run our function on indirect vdevs */
	if (strcmp(type, VDEV_TYPE_INDIRECT) != 0) {
		ret |= func(zhp, nv, data);
	}

	for (i = 0; i < ARRAY_SIZE(list); i++) {
		if (nvlist_lookup_nvlist_array(nv, list[i], &child,
		    &children) == 0) {
			for (c = 0; c < children; c++) {
				uint64_t ishole = 0;

				(void) nvlist_lookup_uint64(child[c],
				    ZPOOL_CONFIG_IS_HOLE, &ishole);

				if (ishole)
					continue;

				ret |= for_each_vdev_cb(zhp, child[c],
				    func, data);
			}
		}
	}

	return (ret);
}

/*
 * Given a ZPOOL_CONFIG_VDEV_TREE nvpair, iterate over all the vdevs, calling
 * func() for each one. func() is passed the vdev's nvlist and an optional
 * user-defined 'data' pointer.
 */
int
for_each_vdev_in_nvlist(nvlist_t *nvroot, pool_vdev_iter_f func, void *data)
{
	return (for_each_vdev_cb(NULL, nvroot, func, data));
}
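/*
 * Usage sketch for for_each_vdev_in_nvlist(): count the leaf vdevs in a
 * configuration with a small callback (the callback type is
 * pool_vdev_iter_f, and the return values are OR'd together):
 *
 *	static int
 *	count_leaves_cb(void *zhp, nvlist_t *nv, void *data)
 *	{
 *		(void) zhp;
 *		if (vdev_is_leaf(nv))
 *			(*(int *)data)++;
 *		return (0);
 *	}
 *
 *	...
 *	int leaves = 0;
 *	nvlist_t *nvroot = fnvlist_lookup_nvlist(config,
 *	    ZPOOL_CONFIG_VDEV_TREE);
 *	(void) for_each_vdev_in_nvlist(nvroot, count_leaves_cb, &leaves);
 */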