// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 * Copyright 2015 RackTop Systems.
 * Copyright (c) 2016, Intel Corporation.
 * Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
 */

/*
 * Pool import support functions.
 *
 * Used by zpool, ztest, zdb, and zhack to locate importable configs. Since
 * these commands are expected to run in the global zone, we can assume
 * that the devices are all readable when called.
 *
 * To import a pool, we rely on reading the configuration information from the
 * ZFS label of each device. If we successfully read the label, then we
 * organize the configuration information in the following hierarchy:
 *
 *	pool guid -> toplevel vdev guid -> label txg
 *
 * Duplicate entries matching this same tuple will be discarded. Once we have
 * examined every device, we pick the best label txg config for each toplevel
 * vdev. We then arrange these toplevel vdevs into a complete pool config, and
 * update any paths that have changed. Finally, we attempt to import the pool
 * using our derived config, and record the results.
 */

#ifdef HAVE_AIO_H
#include <aio.h>
#endif
#include <ctype.h>
#include <dirent.h>
#include <errno.h>
#include <libintl.h>
#include <libgen.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/dktp/fdisk.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>

#include <thread_pool.h>
#include <libzutil.h>
#include <libnvpair.h>

#include "zutil_import.h"

const char *
libpc_error_description(libpc_handle_t *hdl)
{
	if (hdl->lpc_desc[0] != '\0')
		return (hdl->lpc_desc);

	switch (hdl->lpc_error) {
	case LPC_BADCACHE:
		return (dgettext(TEXT_DOMAIN, "invalid or missing cache file"));
	case LPC_BADPATH:
		return (dgettext(TEXT_DOMAIN, "must be an absolute path"));
	case LPC_NOMEM:
		return (dgettext(TEXT_DOMAIN, "out of memory"));
	case LPC_EACCESS:
		return (dgettext(TEXT_DOMAIN, "some devices require root "
		    "privileges"));
	case LPC_UNKNOWN:
		return (dgettext(TEXT_DOMAIN, "unknown error"));
	default:
		assert(hdl->lpc_error == 0);
		return (dgettext(TEXT_DOMAIN, "no error"));
	}
}
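/*
 * The error helpers below are used in pairs throughout this file:
 * zutil_error_aux() records detail text (e.g. strerror output) in lpc_desc,
 * and a following zutil_error_fmt()/zutil_error() sets the lpc_error code
 * and prints "<action>: <detail>" when lpc_printerr is set. For example
 * (taken from the directory-scanning code later in this file):
 *
 *	zutil_error_aux(hdl, "%s", zfs_strerror(error));
 *	(void) zutil_error_fmt(hdl, LPC_BADPATH,
 *	    dgettext(TEXT_DOMAIN, "cannot resolve path '%s'"), dir);
 */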
static __attribute__((format(printf, 2, 3))) void
zutil_error_aux(libpc_handle_t *hdl, const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);

	(void) vsnprintf(hdl->lpc_desc, sizeof (hdl->lpc_desc), fmt, ap);
	hdl->lpc_desc_active = B_TRUE;

	va_end(ap);
}

static void
zutil_verror(libpc_handle_t *hdl, lpc_error_t error, const char *fmt,
    va_list ap)
{
	char action[1024];

	(void) vsnprintf(action, sizeof (action), fmt, ap);
	hdl->lpc_error = error;

	if (hdl->lpc_desc_active)
		hdl->lpc_desc_active = B_FALSE;
	else
		hdl->lpc_desc[0] = '\0';

	if (hdl->lpc_printerr)
		(void) fprintf(stderr, "%s: %s\n", action,
		    libpc_error_description(hdl));
}

static __attribute__((format(printf, 3, 4))) int
zutil_error_fmt(libpc_handle_t *hdl, lpc_error_t error,
    const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);

	zutil_verror(hdl, error, fmt, ap);

	va_end(ap);

	return (-1);
}

static int
zutil_error(libpc_handle_t *hdl, lpc_error_t error, const char *msg)
{
	return (zutil_error_fmt(hdl, error, "%s", msg));
}

static int
zutil_no_memory(libpc_handle_t *hdl)
{
	zutil_error(hdl, LPC_NOMEM, "internal error");
	exit(1);
}

void *
zutil_alloc(libpc_handle_t *hdl, size_t size)
{
	void *data;

	if ((data = calloc(1, size)) == NULL)
		(void) zutil_no_memory(hdl);

	return (data);
}

char *
zutil_strdup(libpc_handle_t *hdl, const char *str)
{
	char *ret;

	if ((ret = strdup(str)) == NULL)
		(void) zutil_no_memory(hdl);

	return (ret);
}

static char *
zutil_strndup(libpc_handle_t *hdl, const char *str, size_t n)
{
	char *ret;

	if ((ret = strndup(str, n)) == NULL)
		(void) zutil_no_memory(hdl);

	return (ret);
}

/*
 * Intermediate structures used to gather configuration information.
 */
typedef struct config_entry {
	uint64_t	ce_txg;
	nvlist_t	*ce_config;
	struct config_entry *ce_next;
} config_entry_t;

typedef struct vdev_entry {
	uint64_t	ve_guid;
	config_entry_t	*ve_configs;
	struct vdev_entry *ve_next;
} vdev_entry_t;

typedef struct pool_entry {
	uint64_t	pe_guid;
	vdev_entry_t	*pe_vdevs;
	struct pool_entry *pe_next;
} pool_entry_t;

typedef struct name_entry {
	char		*ne_name;
	uint64_t	ne_guid;
	uint64_t	ne_order;
	uint64_t	ne_num_labels;
	struct name_entry *ne_next;
} name_entry_t;

typedef struct pool_list {
	pool_entry_t	*pools;
	name_entry_t	*names;
} pool_list_t;
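/*
 * For illustration, the lists above mirror the hierarchy described at the
 * top of this file (guids hypothetical): a pool_entry_t for pool guid
 * 0x1111 holds vdev_entry_t nodes for toplevel vdev guids 0x2222 and
 * 0x3333, and each of those holds one config_entry_t per distinct label
 * txg observed for that toplevel vdev. The name_entry_t list is a flat
 * vdev guid -> path mapping used later to fix up device paths.
 */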
/*
 * Go through and fix up any path and/or devid information for the given vdev
 * configuration.
 */
static int
fix_paths(libpc_handle_t *hdl, nvlist_t *nv, name_entry_t *names)
{
	nvlist_t **child;
	uint_t c, children;
	uint64_t guid;
	name_entry_t *ne, *best;
	const char *path;

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) == 0) {
		for (c = 0; c < children; c++)
			if (fix_paths(hdl, child[c], names) != 0)
				return (-1);
		return (0);
	}

	/*
	 * This is a leaf (file or disk) vdev. In either case, go through
	 * the name list and see if we find a matching guid. If so, replace
	 * the path and see if we can calculate a new devid.
	 *
	 * There may be multiple names associated with a particular guid, in
	 * which case we have overlapping partitions or multiple paths to the
	 * same disk. In this case we prefer to use the path name which
	 * matches the ZPOOL_CONFIG_PATH. If no matching entry is found we
	 * use the lowest order device which corresponds to the first match
	 * while traversing the ZPOOL_IMPORT_PATH search path.
	 */
	verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
		path = NULL;

	best = NULL;
	for (ne = names; ne != NULL; ne = ne->ne_next) {
		if (ne->ne_guid == guid) {
			if (path == NULL) {
				best = ne;
				break;
			}

			if ((strlen(path) == strlen(ne->ne_name)) &&
			    strncmp(path, ne->ne_name, strlen(path)) == 0) {
				best = ne;
				break;
			}

			if (best == NULL) {
				best = ne;
				continue;
			}

			/* Prefer paths with more vdev labels. */
			if (ne->ne_num_labels > best->ne_num_labels) {
				best = ne;
				continue;
			}

			/* Prefer paths earlier in the search order. */
			if (ne->ne_num_labels == best->ne_num_labels &&
			    ne->ne_order < best->ne_order) {
				best = ne;
				continue;
			}
		}
	}

	if (best == NULL)
		return (0);

	if (nvlist_add_string(nv, ZPOOL_CONFIG_PATH, best->ne_name) != 0)
		return (-1);

	update_vdev_config_dev_strs(nv);

	return (0);
}
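/*
 * For illustration (devices hypothetical): if the name list holds
 * "/dev/sdb1" (2 readable labels, order 0) and "/dev/mapper/mpatha"
 * (4 readable labels, order 1) for the same guid, and neither matches the
 * label's ZPOOL_CONFIG_PATH, fix_paths() above picks "/dev/mapper/mpatha":
 * more readable labels win, and search order only breaks ties.
 */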
/*
 * Add the given configuration to the list of known devices.
 */
static int
add_config(libpc_handle_t *hdl, pool_list_t *pl, const char *path,
    int order, int num_labels, nvlist_t *config)
{
	uint64_t pool_guid, vdev_guid, top_guid, txg, state;
	pool_entry_t *pe;
	vdev_entry_t *ve;
	config_entry_t *ce;
	name_entry_t *ne;

	/*
	 * If this is a hot spare not currently in use or level 2 cache
	 * device, add it to the list of names to translate, but don't do
	 * anything else.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
	    &state) == 0 &&
	    (state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE) &&
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) == 0) {
		if ((ne = zutil_alloc(hdl, sizeof (name_entry_t))) == NULL)
			return (-1);

		if ((ne->ne_name = zutil_strdup(hdl, path)) == NULL) {
			free(ne);
			return (-1);
		}
		ne->ne_guid = vdev_guid;
		ne->ne_order = order;
		ne->ne_num_labels = num_labels;
		ne->ne_next = pl->names;
		pl->names = ne;

		return (0);
	}

	/*
	 * If we have a valid config but cannot read any of these fields, then
	 * it means we have a half-initialized label. In vdev_label_init()
	 * we write a label with txg == 0 so that we can identify the device
	 * in case the user refers to the same disk later on. If we fail to
	 * create the pool, we'll be left with a label in this state
	 * which should not be considered part of a valid pool.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
	    &pool_guid) != 0 ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID,
	    &vdev_guid) != 0 ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID,
	    &top_guid) != 0 ||
	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &txg) != 0 || txg == 0) {
		return (0);
	}

	/*
	 * First, see if we know about this pool. If not, then add it to the
	 * list of known pools.
	 */
	for (pe = pl->pools; pe != NULL; pe = pe->pe_next) {
		if (pe->pe_guid == pool_guid)
			break;
	}

	if (pe == NULL) {
		if ((pe = zutil_alloc(hdl, sizeof (pool_entry_t))) == NULL) {
			return (-1);
		}
		pe->pe_guid = pool_guid;
		pe->pe_next = pl->pools;
		pl->pools = pe;
	}

	/*
	 * Second, see if we know about this toplevel vdev. Add it if it's
	 * missing.
	 */
	for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) {
		if (ve->ve_guid == top_guid)
			break;
	}

	if (ve == NULL) {
		if ((ve = zutil_alloc(hdl, sizeof (vdev_entry_t))) == NULL) {
			return (-1);
		}
		ve->ve_guid = top_guid;
		ve->ve_next = pe->pe_vdevs;
		pe->pe_vdevs = ve;
	}

	/*
	 * Third, see if we have a config with a matching transaction group. If
	 * so, then we do nothing. Otherwise, add it to the list of known
	 * configs.
	 */
	for (ce = ve->ve_configs; ce != NULL; ce = ce->ce_next) {
		if (ce->ce_txg == txg)
			break;
	}

	if (ce == NULL) {
		if ((ce = zutil_alloc(hdl, sizeof (config_entry_t))) == NULL) {
			return (-1);
		}
		ce->ce_txg = txg;
		ce->ce_config = fnvlist_dup(config);
		ce->ce_next = ve->ve_configs;
		ve->ve_configs = ce;
	}

	/*
	 * At this point we've successfully added our config to the list of
	 * known configs. The last thing to do is add the vdev guid -> path
	 * mappings so that we can fix up the configuration as necessary before
	 * doing the import.
	 */
	if ((ne = zutil_alloc(hdl, sizeof (name_entry_t))) == NULL)
		return (-1);

	if ((ne->ne_name = zutil_strdup(hdl, path)) == NULL) {
		free(ne);
		return (-1);
	}

	ne->ne_guid = vdev_guid;
	ne->ne_order = order;
	ne->ne_num_labels = num_labels;
	ne->ne_next = pl->names;
	pl->names = ne;

	return (0);
}

static int
zutil_pool_active(libpc_handle_t *hdl, const char *name, uint64_t guid,
    boolean_t *isactive)
{
	ASSERT(hdl->lpc_ops->pco_pool_active != NULL);

	int error = hdl->lpc_ops->pco_pool_active(hdl->lpc_lib_handle, name,
	    guid, isactive);

	return (error);
}

static nvlist_t *
zutil_refresh_config(libpc_handle_t *hdl, nvlist_t *tryconfig)
{
	ASSERT(hdl->lpc_ops->pco_refresh_config != NULL);

	return (hdl->lpc_ops->pco_refresh_config(hdl->lpc_lib_handle,
	    tryconfig));
}

/*
 * Determine if the vdev id is a hole in the namespace.
 */
static boolean_t
vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id)
{
	int c;

	for (c = 0; c < holes; c++) {

		/* Top-level is a hole */
		if (hole_array[c] == id)
			return (B_TRUE);
	}
	return (B_FALSE);
}
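/*
 * For illustration of the per-vdev selection in get_configs() below (txg
 * values hypothetical): if labels for one toplevel vdev were found with
 * txgs 95, 96, and 97, the txg-97 config represents that vdev. And since
 * the pool-wide maximum txg carries the authoritative
 * ZPOOL_CONFIG_VDEV_CHILDREN and ZPOOL_CONFIG_HOLE_ARRAY values, those
 * are taken from that config as well.
 */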
/*
 * Convert our list of pools into the definitive set of configurations. We
 * start by picking the best config for each toplevel vdev. Once that's done,
 * we assemble the toplevel vdevs into a full config for the pool. We make a
 * pass to fix up any incorrect paths, and then add it to the main list to
 * return to the user.
 */
static nvlist_t *
get_configs(libpc_handle_t *hdl, pool_list_t *pl, boolean_t active_ok,
    nvlist_t *policy)
{
	pool_entry_t *pe;
	vdev_entry_t *ve;
	config_entry_t *ce;
	nvlist_t *ret = NULL, *config = NULL, *tmp = NULL, *nvtop, *nvroot;
	nvlist_t **spares, **l2cache;
	uint_t i, nspares, nl2cache;
	boolean_t config_seen;
	uint64_t best_txg;
	const char *name, *hostname = NULL;
	uint64_t guid;
	uint_t children = 0;
	nvlist_t **child = NULL;
	uint64_t *hole_array, max_id;
	uint_t c;
	boolean_t isactive;
	nvlist_t *nvl;
	boolean_t valid_top_config = B_FALSE;

	if (nvlist_alloc(&ret, 0, 0) != 0)
		goto nomem;

	for (pe = pl->pools; pe != NULL; pe = pe->pe_next) {
		uint64_t id, max_txg = 0, hostid = 0;
		uint_t holes = 0;

		if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0)
			goto nomem;
		config_seen = B_FALSE;

		/*
		 * Iterate over all toplevel vdevs. Grab the pool configuration
		 * from the first one we find, and then go through the rest and
		 * add them as necessary to the 'vdevs' member of the config.
		 */
		for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) {

			/*
			 * Determine the best configuration for this vdev by
			 * selecting the config with the latest transaction
			 * group.
			 */
			best_txg = 0;
			for (ce = ve->ve_configs; ce != NULL;
			    ce = ce->ce_next) {

				if (ce->ce_txg > best_txg) {
					tmp = ce->ce_config;
					best_txg = ce->ce_txg;
				}
			}

			/*
			 * We rely on the fact that the max txg for the
			 * pool will contain the most up-to-date information
			 * about the valid top-levels in the vdev namespace.
			 */
			if (best_txg > max_txg) {
				(void) nvlist_remove(config,
				    ZPOOL_CONFIG_VDEV_CHILDREN,
				    DATA_TYPE_UINT64);
				(void) nvlist_remove(config,
				    ZPOOL_CONFIG_HOLE_ARRAY,
				    DATA_TYPE_UINT64_ARRAY);

				max_txg = best_txg;
				hole_array = NULL;
				holes = 0;
				max_id = 0;
				valid_top_config = B_FALSE;

				if (nvlist_lookup_uint64(tmp,
				    ZPOOL_CONFIG_VDEV_CHILDREN, &max_id) == 0) {
					verify(nvlist_add_uint64(config,
					    ZPOOL_CONFIG_VDEV_CHILDREN,
					    max_id) == 0);
					valid_top_config = B_TRUE;
				}

				if (nvlist_lookup_uint64_array(tmp,
				    ZPOOL_CONFIG_HOLE_ARRAY, &hole_array,
				    &holes) == 0) {
					verify(nvlist_add_uint64_array(config,
					    ZPOOL_CONFIG_HOLE_ARRAY,
					    hole_array, holes) == 0);
				}
			}

			if (!config_seen) {
				/*
				 * Copy the relevant pieces of data to the pool
				 * configuration:
				 *
				 *	version
				 *	pool guid
				 *	name
				 *	comment (if available)
				 *	compatibility features (if available)
				 *	pool state
				 *	hostid (if available)
				 *	hostname (if available)
				 */
				uint64_t state, version;
				const char *comment = NULL;
				const char *compatibility = NULL;

				version = fnvlist_lookup_uint64(tmp,
				    ZPOOL_CONFIG_VERSION);
				fnvlist_add_uint64(config,
				    ZPOOL_CONFIG_VERSION, version);
				guid = fnvlist_lookup_uint64(tmp,
				    ZPOOL_CONFIG_POOL_GUID);
				fnvlist_add_uint64(config,
				    ZPOOL_CONFIG_POOL_GUID, guid);
				name = fnvlist_lookup_string(tmp,
				    ZPOOL_CONFIG_POOL_NAME);
				fnvlist_add_string(config,
				    ZPOOL_CONFIG_POOL_NAME, name);

				if (nvlist_lookup_string(tmp,
				    ZPOOL_CONFIG_COMMENT, &comment) == 0)
					fnvlist_add_string(config,
					    ZPOOL_CONFIG_COMMENT, comment);

				if (nvlist_lookup_string(tmp,
				    ZPOOL_CONFIG_COMPATIBILITY,
				    &compatibility) == 0)
					fnvlist_add_string(config,
					    ZPOOL_CONFIG_COMPATIBILITY,
					    compatibility);
				state = fnvlist_lookup_uint64(tmp,
				    ZPOOL_CONFIG_POOL_STATE);
				fnvlist_add_uint64(config,
				    ZPOOL_CONFIG_POOL_STATE, state);

				hostid = 0;
				if (nvlist_lookup_uint64(tmp,
				    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
					fnvlist_add_uint64(config,
					    ZPOOL_CONFIG_HOSTID, hostid);
					hostname = fnvlist_lookup_string(tmp,
					    ZPOOL_CONFIG_HOSTNAME);
					fnvlist_add_string(config,
					    ZPOOL_CONFIG_HOSTNAME, hostname);
				}

				config_seen = B_TRUE;
			}

			/*
			 * Add this top-level vdev to the child array.
			 */
			verify(nvlist_lookup_nvlist(tmp,
			    ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0);
			verify(nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_ID,
			    &id) == 0);

			if (id >= children) {
				nvlist_t **newchild;

				newchild = zutil_alloc(hdl, (id + 1) *
				    sizeof (nvlist_t *));
				if (newchild == NULL)
					goto nomem;

				for (c = 0; c < children; c++)
					newchild[c] = child[c];

				free(child);
				child = newchild;
				children = id + 1;
			}
			if (nvlist_dup(nvtop, &child[id], 0) != 0)
				goto nomem;

		}

		/*
		 * If we have information about all the top-levels then
		 * clean up the nvlist which we've constructed. This
		 * means removing any extraneous devices that are
		 * beyond the valid range or adding devices to the end
		 * of our array which appear to be missing.
		 */
		if (valid_top_config) {
			if (max_id < children) {
				for (c = max_id; c < children; c++)
					nvlist_free(child[c]);
				children = max_id;
			} else if (max_id > children) {
				nvlist_t **newchild;

				newchild = zutil_alloc(hdl, (max_id) *
				    sizeof (nvlist_t *));
				if (newchild == NULL)
					goto nomem;

				for (c = 0; c < children; c++)
					newchild[c] = child[c];

				free(child);
				child = newchild;
				children = max_id;
			}
		}

		verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
		    &guid) == 0);

		/*
		 * The vdev namespace may contain holes as a result of
		 * device removal. We must add them back into the vdev
		 * tree before we process any missing devices.
		 */
		if (holes > 0) {
			ASSERT(valid_top_config);

			for (c = 0; c < children; c++) {
				nvlist_t *holey;

				if (child[c] != NULL ||
				    !vdev_is_hole(hole_array, holes, c))
					continue;

				if (nvlist_alloc(&holey, NV_UNIQUE_NAME,
				    0) != 0)
					goto nomem;

				/*
				 * Holes in the namespace are treated as
				 * "hole" top-level vdevs and have a
				 * special flag set on them.
				 */
				if (nvlist_add_string(holey,
				    ZPOOL_CONFIG_TYPE,
				    VDEV_TYPE_HOLE) != 0 ||
				    nvlist_add_uint64(holey,
				    ZPOOL_CONFIG_ID, c) != 0 ||
				    nvlist_add_uint64(holey,
				    ZPOOL_CONFIG_GUID, 0ULL) != 0) {
					nvlist_free(holey);
					goto nomem;
				}
				child[c] = holey;
			}
		}

		/*
		 * Look for any missing top-level vdevs. If this is the case,
		 * create a faked up 'missing' vdev as a placeholder. We cannot
		 * simply compress the child array, because the kernel performs
		 * certain checks to make sure the vdev IDs match their
		 * location in the configuration.
		 */
		for (c = 0; c < children; c++) {
			if (child[c] == NULL) {
				nvlist_t *missing;
				if (nvlist_alloc(&missing, NV_UNIQUE_NAME,
				    0) != 0)
					goto nomem;
				if (nvlist_add_string(missing,
				    ZPOOL_CONFIG_TYPE,
				    VDEV_TYPE_MISSING) != 0 ||
				    nvlist_add_uint64(missing,
				    ZPOOL_CONFIG_ID, c) != 0 ||
				    nvlist_add_uint64(missing,
				    ZPOOL_CONFIG_GUID, 0ULL) != 0) {
					nvlist_free(missing);
					goto nomem;
				}
				child[c] = missing;
			}
		}

		/*
		 * Put all of this pool's top-level vdevs into a root vdev.
		 */
		if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0)
			goto nomem;
		if (nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
		    VDEV_TYPE_ROOT) != 0 ||
		    nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) != 0 ||
		    nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, guid) != 0 ||
		    nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
		    (const nvlist_t **)child, children) != 0) {
			nvlist_free(nvroot);
			goto nomem;
		}

		for (c = 0; c < children; c++)
			nvlist_free(child[c]);
		free(child);
		children = 0;
		child = NULL;

		/*
		 * Go through and fix up any paths and/or devids based on our
		 * known list of vdev GUID -> path mappings.
		 */
		if (fix_paths(hdl, nvroot, pl->names) != 0) {
			nvlist_free(nvroot);
			goto nomem;
		}

		/*
		 * Add the root vdev to this pool's configuration.
		 */
		if (nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
		    nvroot) != 0) {
			nvlist_free(nvroot);
			goto nomem;
		}
		nvlist_free(nvroot);

		/*
		 * zdb uses this path to report on active pools that were
		 * imported or created using -R.
		 */
		if (active_ok)
			goto add_pool;

		/*
		 * Determine if this pool is currently active, in which case we
		 * can't actually import it.
		 */
		verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
		    &name) == 0);
		verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
		    &guid) == 0);

		if (zutil_pool_active(hdl, name, guid, &isactive) != 0)
			goto error;

		if (isactive) {
			nvlist_free(config);
			config = NULL;
			continue;
		}

		if (policy != NULL) {
			if (nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY,
			    policy) != 0)
				goto nomem;
		}

		if ((nvl = zutil_refresh_config(hdl, config)) == NULL) {
			nvlist_free(config);
			config = NULL;
			continue;
		}

		nvlist_free(config);
		config = nvl;

		/*
		 * Go through and update the paths for spares, now that we have
		 * them.
		 */
		verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
		    &nvroot) == 0);
		if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
		    &spares, &nspares) == 0) {
			for (i = 0; i < nspares; i++) {
				if (fix_paths(hdl, spares[i], pl->names) != 0)
					goto nomem;
			}
		}

		/*
		 * Update the paths for l2cache devices.
		 */
		if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
		    &l2cache, &nl2cache) == 0) {
			for (i = 0; i < nl2cache; i++) {
				if (fix_paths(hdl, l2cache[i], pl->names) != 0)
					goto nomem;
			}
		}

		/*
		 * Restore the original information read from the actual label.
		 */
		(void) nvlist_remove(config, ZPOOL_CONFIG_HOSTID,
		    DATA_TYPE_UINT64);
		(void) nvlist_remove(config, ZPOOL_CONFIG_HOSTNAME,
		    DATA_TYPE_STRING);
		if (hostid != 0) {
			verify(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID,
			    hostid) == 0);
			verify(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME,
			    hostname) == 0);
		}

add_pool:
		/*
		 * Add this pool to the list of configs.
		 */
		verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
		    &name) == 0);

		if (nvlist_add_nvlist(ret, name, config) != 0)
			goto nomem;

		nvlist_free(config);
		config = NULL;
	}

	return (ret);

nomem:
	(void) zutil_no_memory(hdl);
error:
	nvlist_free(config);
	nvlist_free(ret);
	for (c = 0; c < children; c++)
		nvlist_free(child[c]);
	free(child);

	return (NULL);
}
/*
 * Return the offset of the given label.
 */
static uint64_t
label_offset(uint64_t size, int l)
{
	ASSERT(P2PHASE_TYPED(size, sizeof (vdev_label_t), uint64_t) == 0);
	return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
	    0 : size - VDEV_LABELS * sizeof (vdev_label_t)));
}

/*
 * This does the same job as zpool_read_label() below, but without AIO.
 * It is used as a fallback when an AIO request fails in a way that a
 * plain synchronous read may avoid.
 */
static int
zpool_read_label_slow(int fd, nvlist_t **config, int *num_labels)
{
	struct stat64 statbuf;
	int l, count = 0;
	vdev_phys_t *label;
	nvlist_t *expected_config = NULL;
	uint64_t expected_guid = 0, size;

	*config = NULL;

	if (fstat64_blk(fd, &statbuf) == -1)
		return (0);
	size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);

	label = (vdev_phys_t *)umem_alloc_aligned(sizeof (*label), PAGESIZE,
	    UMEM_DEFAULT);
	if (label == NULL)
		return (-1);

	for (l = 0; l < VDEV_LABELS; l++) {
		uint64_t state, guid, txg;
		off_t offset = label_offset(size, l) + VDEV_SKIP_SIZE;

		if (pread64(fd, label, sizeof (vdev_phys_t),
		    offset) != sizeof (vdev_phys_t))
			continue;

		if (nvlist_unpack(label->vp_nvlist,
		    sizeof (label->vp_nvlist), config, 0) != 0)
			continue;

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_GUID,
		    &guid) != 0 || guid == 0) {
			nvlist_free(*config);
			continue;
		}

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 || state > POOL_STATE_L2CACHE) {
			nvlist_free(*config);
			continue;
		}

		if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
		    (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0 || txg == 0)) {
			nvlist_free(*config);
			continue;
		}

		if (expected_guid) {
			if (expected_guid == guid)
				count++;

			nvlist_free(*config);
		} else {
			expected_config = *config;
			expected_guid = guid;
			count++;
		}
	}

	if (num_labels != NULL)
		*num_labels = count;

	umem_free_aligned(label, sizeof (*label));
	*config = expected_config;

	return (0);
}
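/*
 * Worked example for label_offset() above, assuming the standard 256 KiB
 * on-disk label (sizeof (vdev_label_t)) and VDEV_LABELS == 4: for an
 * aligned device size S, labels 0 and 1 sit at offsets 0 and 256 KiB from
 * the front of the device, while labels 2 and 3 sit at S - 512 KiB and
 * S - 256 KiB, i.e. in the last megabyte. This front/back redundancy is
 * why a partially overwritten device can still yield some valid labels.
 */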
/*
 * Given a file descriptor, read the label information and return an nvlist
 * describing the configuration, if there is one. The number of valid
 * labels found will be returned in num_labels when non-NULL.
 */
int
zpool_read_label(int fd, nvlist_t **config, int *num_labels)
{
#ifndef HAVE_AIO_H
	return (zpool_read_label_slow(fd, config, num_labels));
#else
	struct stat64 statbuf;
	struct aiocb aiocbs[VDEV_LABELS];
	struct aiocb *aiocbps[VDEV_LABELS];
	vdev_phys_t *labels;
	nvlist_t *expected_config = NULL;
	uint64_t expected_guid = 0, size;
	int error, l, count = 0;

	*config = NULL;

	if (fstat64_blk(fd, &statbuf) == -1)
		return (0);
	size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t);

	labels = (vdev_phys_t *)umem_alloc_aligned(
	    VDEV_LABELS * sizeof (*labels), PAGESIZE, UMEM_DEFAULT);
	if (labels == NULL)
		return (-1);

	memset(aiocbs, 0, sizeof (aiocbs));
	for (l = 0; l < VDEV_LABELS; l++) {
		off_t offset = label_offset(size, l) + VDEV_SKIP_SIZE;

		aiocbs[l].aio_fildes = fd;
		aiocbs[l].aio_offset = offset;
		aiocbs[l].aio_buf = &labels[l];
		aiocbs[l].aio_nbytes = sizeof (vdev_phys_t);
		aiocbs[l].aio_lio_opcode = LIO_READ;
		aiocbps[l] = &aiocbs[l];
	}

	if (lio_listio(LIO_WAIT, aiocbps, VDEV_LABELS, NULL) != 0) {
		int saved_errno = errno;
		boolean_t do_slow = B_FALSE;
		error = -1;

		if (errno == EAGAIN || errno == EINTR || errno == EIO) {
			/*
			 * A portion of the requests may have been submitted.
			 * Clean them up.
			 */
			for (l = 0; l < VDEV_LABELS; l++) {
				errno = 0;
				switch (aio_error(&aiocbs[l])) {
				case EINVAL:
					break;
				case EINPROGRESS:
					/*
					 * This shouldn't be possible to
					 * encounter, die if we do.
					 */
					ASSERT(B_FALSE);
					zfs_fallthrough;
				case EREMOTEIO:
					/*
					 * May be returned by an NVMe device
					 * which is visible in /dev/ but due
					 * to a low-level format change, or
					 * other error, needs to be rescanned.
					 * Try the slow method.
					 */
					zfs_fallthrough;
				case EAGAIN:
				case EOPNOTSUPP:
				case ENOSYS:
					do_slow = B_TRUE;
					zfs_fallthrough;
				case 0:
				default:
					(void) aio_return(&aiocbs[l]);
				}
			}
		}
		if (do_slow) {
			/*
			 * At least some of the I/O involved files that are
			 * unsafe for AIO. Let's try again, without AIO this
			 * time.
			 */
			error = zpool_read_label_slow(fd, config, num_labels);
			saved_errno = errno;
		}
		umem_free_aligned(labels, VDEV_LABELS * sizeof (*labels));
		errno = saved_errno;
		return (error);
	}

	for (l = 0; l < VDEV_LABELS; l++) {
		uint64_t state, guid, txg;

		if (aio_return(&aiocbs[l]) != sizeof (vdev_phys_t))
			continue;

		if (nvlist_unpack(labels[l].vp_nvlist,
		    sizeof (labels[l].vp_nvlist), config, 0) != 0)
			continue;

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_GUID,
		    &guid) != 0 || guid == 0) {
			nvlist_free(*config);
			continue;
		}

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 || state > POOL_STATE_L2CACHE) {
			nvlist_free(*config);
			continue;
		}

		if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
		    (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0 || txg == 0)) {
			nvlist_free(*config);
			continue;
		}

		if (expected_guid) {
			if (expected_guid == guid)
				count++;

			nvlist_free(*config);
		} else {
			expected_config = *config;
			expected_guid = guid;
			count++;
		}
	}

	if (num_labels != NULL)
		*num_labels = count;

	umem_free_aligned(labels, VDEV_LABELS * sizeof (*labels));
	*config = expected_config;

	return (0);
#endif
}
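/*
 * For illustration only, a hypothetical caller of zpool_read_label()
 * (the device path is made up):
 *
 *	nvlist_t *config = NULL;
 *	int num_labels = 0;
 *	int fd = open("/dev/sda1", O_RDONLY | O_CLOEXEC);
 *	if (fd >= 0 &&
 *	    zpool_read_label(fd, &config, &num_labels) == 0 &&
 *	    config != NULL) {
 *		(process config; num_labels of the VDEV_LABELS label
 *		slots agreed on the guid of this config)
 *		nvlist_free(config);
 *	}
 *
 * Note that a return of 0 with *config == NULL simply means no valid
 * label was found; -1 is only returned on allocation failure.
 */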
/*
 * Sorted by full path and then vdev guid to allow for multiple entries with
 * the same full path name. This is required because it's possible to
 * have multiple block devices with labels that refer to the same
 * ZPOOL_CONFIG_PATH yet have different vdev guids. In this case both
 * entries need to be added to the cache. Scenarios where this can occur
 * include overwritten pool labels, devices which are visible from multiple
 * hosts and multipath devices.
 */
int
slice_cache_compare(const void *arg1, const void *arg2)
{
	const char *nm1 = ((rdsk_node_t *)arg1)->rn_name;
	const char *nm2 = ((rdsk_node_t *)arg2)->rn_name;
	uint64_t guid1 = ((rdsk_node_t *)arg1)->rn_vdev_guid;
	uint64_t guid2 = ((rdsk_node_t *)arg2)->rn_vdev_guid;
	int rv;

	rv = TREE_ISIGN(strcmp(nm1, nm2));
	if (rv)
		return (rv);

	return (TREE_CMP(guid1, guid2));
}
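/*
 * For illustration (values hypothetical): given the entries
 * ("/dev/sda1", guid 0x02), ("/dev/sda1", guid 0x01), and
 * ("/dev/sdb1", guid 0x01), slice_cache_compare() above orders them as
 * ("/dev/sda1", 0x01), ("/dev/sda1", 0x02), ("/dev/sdb1", 0x01):
 * name first, then guid as the tie-breaker.
 */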
static int
label_paths_impl(libpc_handle_t *hdl, nvlist_t *nvroot, uint64_t pool_guid,
    uint64_t vdev_guid, const char **path, const char **devid)
{
	nvlist_t **child;
	uint_t c, children;
	uint64_t guid;
	const char *val;
	int error;

	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) == 0) {
		for (c = 0; c < children; c++) {
			error = label_paths_impl(hdl, child[c],
			    pool_guid, vdev_guid, path, devid);
			if (error)
				return (error);
		}
		return (0);
	}

	if (nvroot == NULL)
		return (0);

	error = nvlist_lookup_uint64(nvroot, ZPOOL_CONFIG_GUID, &guid);
	if ((error != 0) || (guid != vdev_guid))
		return (0);

	error = nvlist_lookup_string(nvroot, ZPOOL_CONFIG_PATH, &val);
	if (error == 0)
		*path = val;

	error = nvlist_lookup_string(nvroot, ZPOOL_CONFIG_DEVID, &val);
	if (error == 0)
		*devid = val;

	return (0);
}

/*
 * Given a disk label fetch the ZPOOL_CONFIG_PATH and ZPOOL_CONFIG_DEVID
 * and store these strings as config_path and devid_path respectively.
 * The returned pointers are only valid as long as label remains valid.
 */
int
label_paths(libpc_handle_t *hdl, nvlist_t *label, const char **path,
    const char **devid)
{
	nvlist_t *nvroot;
	uint64_t pool_guid;
	uint64_t vdev_guid;
	uint64_t state;

	*path = NULL;
	*devid = NULL;
	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &vdev_guid) != 0)
		return (ENOENT);

	/*
	 * In case of spare or l2cache, we directly return path/devid from the
	 * label.
	 */
	if (!(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state)) &&
	    (state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE)) {
		(void) nvlist_lookup_string(label, ZPOOL_CONFIG_PATH, path);
		(void) nvlist_lookup_string(label, ZPOOL_CONFIG_DEVID, devid);
		return (0);
	}

	if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
		return (ENOENT);

	return (label_paths_impl(hdl, nvroot, pool_guid, vdev_guid, path,
	    devid));
}

static void
zpool_find_import_scan_add_slice(libpc_handle_t *hdl, pthread_mutex_t *lock,
    avl_tree_t *cache, const char *path, const char *name, int order)
{
	avl_index_t where;
	rdsk_node_t *slice;

	slice = zutil_alloc(hdl, sizeof (rdsk_node_t));
	if (asprintf(&slice->rn_name, "%s/%s", path, name) == -1) {
		free(slice);
		return;
	}
	slice->rn_vdev_guid = 0;
	slice->rn_lock = lock;
	slice->rn_avl = cache;
	slice->rn_hdl = hdl;
	slice->rn_order = order + IMPORT_ORDER_SCAN_OFFSET;
	slice->rn_labelpaths = B_FALSE;

	pthread_mutex_lock(lock);
	if (avl_find(cache, slice, &where)) {
		free(slice->rn_name);
		free(slice);
	} else {
		avl_insert(cache, slice, where);
	}
	pthread_mutex_unlock(lock);
}

static int
zpool_find_import_scan_dir(libpc_handle_t *hdl, pthread_mutex_t *lock,
    avl_tree_t *cache, const char *dir, int order)
{
	int error;
	char path[MAXPATHLEN];
	struct dirent64 *dp;
	DIR *dirp;

	if (realpath(dir, path) == NULL) {
		error = errno;
		if (error == ENOENT)
			return (0);

		zutil_error_aux(hdl, "%s", zfs_strerror(error));
		(void) zutil_error_fmt(hdl, LPC_BADPATH, dgettext(TEXT_DOMAIN,
		    "cannot resolve path '%s'"), dir);
		return (error);
	}

	dirp = opendir(path);
	if (dirp == NULL) {
		error = errno;
		zutil_error_aux(hdl, "%s", zfs_strerror(error));
		(void) zutil_error_fmt(hdl, LPC_BADPATH, dgettext(TEXT_DOMAIN,
		    "cannot open '%s'"), path);
		return (error);
	}

	while ((dp = readdir64(dirp)) != NULL) {
		const char *name = dp->d_name;
		if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
			continue;

		switch (dp->d_type) {
		case DT_UNKNOWN:
		case DT_BLK:
		case DT_LNK:
#ifdef __FreeBSD__
		case DT_CHR:
#endif
		case DT_REG:
			break;
		default:
			continue;
		}

		zpool_find_import_scan_add_slice(hdl, lock, cache, path, name,
		    order);
	}

	(void) closedir(dirp);
	return (0);
}
static int
zpool_find_import_scan_path(libpc_handle_t *hdl, pthread_mutex_t *lock,
    avl_tree_t *cache, const char *dir, int order)
{
	int error = 0;
	char path[MAXPATHLEN];
	char *d = NULL;
	ssize_t dl;
	const char *dpath, *name;

	/*
	 * Separate the directory and the basename.
	 * We do this so that we can get the realpath of
	 * the directory. We don't get the realpath on the
	 * whole path because if it's a symlink, we want the
	 * path of the symlink, not where it points to.
	 */
	name = zfs_basename(dir);
	if ((dl = zfs_dirnamelen(dir)) == -1)
		dpath = ".";
	else
		dpath = d = zutil_strndup(hdl, dir, dl);

	if (realpath(dpath, path) == NULL) {
		error = errno;
		if (error == ENOENT) {
			error = 0;
			goto out;
		}

		zutil_error_aux(hdl, "%s", zfs_strerror(error));
		(void) zutil_error_fmt(hdl, LPC_BADPATH, dgettext(TEXT_DOMAIN,
		    "cannot resolve path '%s'"), dir);
		goto out;
	}

	zpool_find_import_scan_add_slice(hdl, lock, cache, path, name, order);

out:
	free(d);
	return (error);
}

/*
 * Scan a list of directories for zfs devices.
 */
static int
zpool_find_import_scan(libpc_handle_t *hdl, pthread_mutex_t *lock,
    avl_tree_t **slice_cache, const char * const *dir, size_t dirs)
{
	avl_tree_t *cache;
	rdsk_node_t *slice;
	void *cookie;
	int i, error;

	*slice_cache = NULL;
	cache = zutil_alloc(hdl, sizeof (avl_tree_t));
	avl_create(cache, slice_cache_compare, sizeof (rdsk_node_t),
	    offsetof(rdsk_node_t, rn_node));

	for (i = 0; i < dirs; i++) {
		struct stat sbuf;

		if (stat(dir[i], &sbuf) != 0) {
			error = errno;
			if (error == ENOENT)
				continue;

			zutil_error_aux(hdl, "%s", zfs_strerror(error));
			(void) zutil_error_fmt(hdl, LPC_BADPATH, dgettext(
			    TEXT_DOMAIN, "cannot resolve path '%s'"), dir[i]);
			goto error;
		}

		/*
		 * If dir[i] is a directory, we walk through it and add all
		 * the entries to the cache. If it's not a directory, we just
		 * add it to the cache.
		 */
		if (S_ISDIR(sbuf.st_mode)) {
			if ((error = zpool_find_import_scan_dir(hdl, lock,
			    cache, dir[i], i)) != 0)
				goto error;
		} else {
			if ((error = zpool_find_import_scan_path(hdl, lock,
			    cache, dir[i], i)) != 0)
				goto error;
		}
	}

	*slice_cache = cache;
	return (0);

error:
	cookie = NULL;
	while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) {
		free(slice->rn_name);
		free(slice);
	}
	free(cache);

	return (error);
}
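/*
 * A worked example of the thread-pool sizing in zpool_find_import_impl()
 * below (numbers purely illustrative): with 8 online CPUs the pool starts
 * at 2 * 8 = 16 threads; if sysconf(_SC_AIO_MAX) reported 32 and
 * VDEV_LABELS were 4, the limit would become MIN(16, 32 / 4) = 8, keeping
 * the worst-case number of in-flight AIO requests within the system limit.
 */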
/*
 * Given a list of directories to search, find all pools stored on disk. This
 * includes partial pools which are not available to import. If no args are
 * given (argc is 0), then the default directory (/dev/dsk) is searched.
 * poolname or guid (but not both) are provided by the caller when trying
 * to import a specific pool.
 */
static nvlist_t *
zpool_find_import_impl(libpc_handle_t *hdl, importargs_t *iarg,
    pthread_mutex_t *lock, avl_tree_t *cache)
{
	(void) lock;
	nvlist_t *ret = NULL;
	pool_list_t pools = { 0 };
	pool_entry_t *pe, *penext;
	vdev_entry_t *ve, *venext;
	config_entry_t *ce, *cenext;
	name_entry_t *ne, *nenext;
	rdsk_node_t *slice;
	void *cookie;
	tpool_t *t;

	verify(iarg->poolname == NULL || iarg->guid == 0);

	/*
	 * Create a thread pool to parallelize the process of reading and
	 * validating labels; a large number of threads can be used because
	 * contention is minimal.
	 */
	long threads = 2 * sysconf(_SC_NPROCESSORS_ONLN);
#ifdef HAVE_AIO_H
	long am;
#ifdef _SC_AIO_LISTIO_MAX
	am = sysconf(_SC_AIO_LISTIO_MAX);
	if (am >= VDEV_LABELS)
		threads = MIN(threads, am / VDEV_LABELS);
#endif
#ifdef _SC_AIO_MAX
	am = sysconf(_SC_AIO_MAX);
	if (am >= VDEV_LABELS)
		threads = MIN(threads, am / VDEV_LABELS);
#endif
#endif
	t = tpool_create(1, threads, 0, NULL);
	for (slice = avl_first(cache); slice;
	    (slice = avl_walk(cache, slice, AVL_AFTER)))
		(void) tpool_dispatch(t, zpool_open_func, slice);

	tpool_wait(t);
	tpool_destroy(t);

	/*
	 * Process the cache, filtering out any entries which are not
	 * for the specified pool then adding matching label configs.
	 */
	cookie = NULL;
	while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) {
		if (slice->rn_config != NULL) {
			nvlist_t *config = slice->rn_config;
			boolean_t matched = B_TRUE;
			boolean_t aux = B_FALSE;
			int fd;

			/*
			 * Check if it's a spare or l2cache device. If it is,
			 * we need to skip the name and guid check since they
			 * don't exist on the aux device label.
			 */
			if (iarg->poolname != NULL || iarg->guid != 0) {
				uint64_t state;
				aux = nvlist_lookup_uint64(config,
				    ZPOOL_CONFIG_POOL_STATE, &state) == 0 &&
				    (state == POOL_STATE_SPARE ||
				    state == POOL_STATE_L2CACHE);
			}

			if (iarg->poolname != NULL && !aux) {
				const char *pname;

				matched = nvlist_lookup_string(config,
				    ZPOOL_CONFIG_POOL_NAME, &pname) == 0 &&
				    strcmp(iarg->poolname, pname) == 0;
			} else if (iarg->guid != 0 && !aux) {
				uint64_t this_guid;

				matched = nvlist_lookup_uint64(config,
				    ZPOOL_CONFIG_POOL_GUID, &this_guid) == 0 &&
				    iarg->guid == this_guid;
			}
			if (matched) {
				/*
				 * Verify all remaining entries can be opened
				 * exclusively. This will prune all underlying
				 * multipath devices which otherwise could
				 * result in the vdev appearing as UNAVAIL.
				 *
				 * Under zdb, this step isn't required and
				 * would prevent a zdb -e of active pools with
				 * no cachefile.
				 */
				fd = open(slice->rn_name,
				    O_RDONLY | O_EXCL | O_CLOEXEC);
				if (fd >= 0 || iarg->can_be_active) {
					if (fd >= 0)
						close(fd);
					add_config(hdl, &pools,
					    slice->rn_name, slice->rn_order,
					    slice->rn_num_labels, config);
				}
			}
			nvlist_free(config);
		}
		free(slice->rn_name);
		free(slice);
	}
	avl_destroy(cache);
	free(cache);

	ret = get_configs(hdl, &pools, iarg->can_be_active, iarg->policy);

	for (pe = pools.pools; pe != NULL; pe = penext) {
		penext = pe->pe_next;
		for (ve = pe->pe_vdevs; ve != NULL; ve = venext) {
			venext = ve->ve_next;
			for (ce = ve->ve_configs; ce != NULL; ce = cenext) {
				cenext = ce->ce_next;
				nvlist_free(ce->ce_config);
				free(ce);
			}
			free(ve);
		}
		free(pe);
	}

	for (ne = pools.names; ne != NULL; ne = nenext) {
		nenext = ne->ne_next;
		free(ne->ne_name);
		free(ne);
	}

	return (ret);
}
/*
 * Given a config, discover the paths for the devices which
 * exist in the config.
 */
static int
discover_cached_paths(libpc_handle_t *hdl, nvlist_t *nv,
    avl_tree_t *cache, pthread_mutex_t *lock)
{
	const char *path = NULL;
	ssize_t dl;
	uint_t children;
	nvlist_t **child;

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) == 0) {
		for (int c = 0; c < children; c++) {
			discover_cached_paths(hdl, child[c], cache, lock);
		}
	}

	/*
	 * Once we have the path, we need to add the directory to
	 * our directory cache.
	 */
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) {
		int ret;
		char c = '\0';
		if ((dl = zfs_dirnamelen(path)) == -1) {
			path = ".";
		} else {
			c = path[dl];
			((char *)path)[dl] = '\0';
		}
		ret = zpool_find_import_scan_dir(hdl, lock, cache,
		    path, 0);
		if (c != '\0')
			((char *)path)[dl] = c;

		return (ret);
	}
	return (0);
}
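/*
 * For illustration (path hypothetical): if a vdev in the cachefile config
 * carries ZPOOL_CONFIG_PATH "/dev/disk/by-id/dm-name-mpatha",
 * discover_cached_paths() above temporarily truncates the string at the
 * dirname and scans "/dev/disk/by-id", so a device that was renamed
 * within that directory can still be discovered.
 */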
/*
 * Given a cache file, return the contents as a list of importable pools.
 * poolname or guid (but not both) are provided by the caller when trying
 * to import a specific pool.
 */
static nvlist_t *
zpool_find_import_cached(libpc_handle_t *hdl, importargs_t *iarg)
{
	char *buf;
	int fd;
	struct stat64 statbuf;
	nvlist_t *raw, *src, *dst;
	nvlist_t *pools;
	nvpair_t *elem;
	const char *name;
	uint64_t this_guid;
	boolean_t active;

	verify(iarg->poolname == NULL || iarg->guid == 0);

	if ((fd = open(iarg->cachefile, O_RDONLY | O_CLOEXEC)) < 0) {
		zutil_error_aux(hdl, "%s", zfs_strerror(errno));
		(void) zutil_error(hdl, LPC_BADCACHE, dgettext(TEXT_DOMAIN,
		    "failed to open cache file"));
		return (NULL);
	}

	if (fstat64(fd, &statbuf) != 0) {
		zutil_error_aux(hdl, "%s", zfs_strerror(errno));
		(void) close(fd);
		(void) zutil_error(hdl, LPC_BADCACHE, dgettext(TEXT_DOMAIN,
		    "failed to get size of cache file"));
		return (NULL);
	}

	if ((buf = zutil_alloc(hdl, statbuf.st_size)) == NULL) {
		(void) close(fd);
		return (NULL);
	}

	if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
		(void) close(fd);
		free(buf);
		(void) zutil_error(hdl, LPC_BADCACHE, dgettext(TEXT_DOMAIN,
		    "failed to read cache file contents"));
		return (NULL);
	}

	(void) close(fd);

	if (nvlist_unpack(buf, statbuf.st_size, &raw, 0) != 0) {
		free(buf);
		(void) zutil_error(hdl, LPC_BADCACHE, dgettext(TEXT_DOMAIN,
		    "invalid or corrupt cache file contents"));
		return (NULL);
	}

	free(buf);

	/*
	 * Go through and get the current state of the pools and refresh their
	 * state.
	 */
	if (nvlist_alloc(&pools, 0, 0) != 0) {
		(void) zutil_no_memory(hdl);
		nvlist_free(raw);
		return (NULL);
	}

	elem = NULL;
	while ((elem = nvlist_next_nvpair(raw, elem)) != NULL) {
		src = fnvpair_value_nvlist(elem);

		name = fnvlist_lookup_string(src, ZPOOL_CONFIG_POOL_NAME);
		if (iarg->poolname != NULL && strcmp(iarg->poolname, name) != 0)
			continue;

		this_guid = fnvlist_lookup_uint64(src, ZPOOL_CONFIG_POOL_GUID);
		if (iarg->guid != 0 && iarg->guid != this_guid)
			continue;

		if (zutil_pool_active(hdl, name, this_guid, &active) != 0) {
			nvlist_free(raw);
			nvlist_free(pools);
			return (NULL);
		}

		if (active)
			continue;

		if (iarg->scan) {
			uint64_t saved_guid = iarg->guid;
			const char *saved_poolname = iarg->poolname;
			pthread_mutex_t lock;

			/*
			 * Create the device cache that will hold the
			 * devices we will scan based on the cachefile.
			 * This will get destroyed and freed by
			 * zpool_find_import_impl.
			 */
			avl_tree_t *cache = zutil_alloc(hdl,
			    sizeof (avl_tree_t));
			avl_create(cache, slice_cache_compare,
			    sizeof (rdsk_node_t),
			    offsetof(rdsk_node_t, rn_node));
			nvlist_t *nvroot = fnvlist_lookup_nvlist(src,
			    ZPOOL_CONFIG_VDEV_TREE);

			/*
			 * We only want to find the pool with this_guid.
			 * We will reset these values back later.
			 */
			iarg->guid = this_guid;
			iarg->poolname = NULL;

			/*
			 * We need to build up a cache of devices that exist
			 * in the paths pointed to by the cachefile. This
			 * allows us to preserve the device namespace that was
			 * originally specified by the user but also lets us
			 * scan devices in those directories in case they had
			 * been renamed.
			 */
			pthread_mutex_init(&lock, NULL);
			discover_cached_paths(hdl, nvroot, cache, &lock);
			nvlist_t *nv = zpool_find_import_impl(hdl, iarg,
			    &lock, cache);
			pthread_mutex_destroy(&lock);

			/*
			 * zpool_find_import_impl will return a list of
			 * pools that it found based on the device cache.
			 * There should only be one pool since we're looking
			 * for a specific guid. We will use that pool to
			 * build up the final pool nvlist which is returned
			 * to the caller.
			 */
			nvpair_t *pair = nvlist_next_nvpair(nv, NULL);
			if (pair == NULL)
				continue;
			fnvlist_add_nvlist(pools, nvpair_name(pair),
			    fnvpair_value_nvlist(pair));

			VERIFY3P(nvlist_next_nvpair(nv, pair), ==, NULL);

			iarg->guid = saved_guid;
			iarg->poolname = saved_poolname;
			continue;
		}

		if (nvlist_add_string(src, ZPOOL_CONFIG_CACHEFILE,
		    iarg->cachefile) != 0) {
			(void) zutil_no_memory(hdl);
			nvlist_free(raw);
			nvlist_free(pools);
			return (NULL);
		}

		update_vdevs_config_dev_sysfs_path(src);

		if ((dst = zutil_refresh_config(hdl, src)) == NULL) {
			nvlist_free(raw);
			nvlist_free(pools);
			return (NULL);
		}

		if (nvlist_add_nvlist(pools, nvpair_name(elem), dst) != 0) {
			(void) zutil_no_memory(hdl);
			nvlist_free(dst);
			nvlist_free(raw);
			nvlist_free(pools);
			return (NULL);
		}
		nvlist_free(dst);
	}
	nvlist_free(raw);
	return (pools);
}
static nvlist_t *
zpool_find_import(libpc_handle_t *hdl, importargs_t *iarg)
{
	pthread_mutex_t lock;
	avl_tree_t *cache;
	nvlist_t *pools = NULL;

	verify(iarg->poolname == NULL || iarg->guid == 0);
	pthread_mutex_init(&lock, NULL);

	/*
	 * Locate pool member vdevs by blkid or by directory scanning.
	 * On success a newly allocated AVL tree which is populated with an
	 * entry for each discovered vdev will be returned in the cache.
	 * It's the caller's responsibility to consume and destroy this tree.
	 */
	if (iarg->scan || iarg->paths != 0) {
		size_t dirs = iarg->paths;
		const char * const *dir = (const char * const *)iarg->path;

		if (dirs == 0)
			dir = zpool_default_search_paths(&dirs);

		if (zpool_find_import_scan(hdl, &lock, &cache,
		    dir, dirs) != 0) {
			pthread_mutex_destroy(&lock);
			return (NULL);
		}
	} else {
		if (zpool_find_import_blkid(hdl, &lock, &cache) != 0) {
			pthread_mutex_destroy(&lock);
			return (NULL);
		}
	}

	pools = zpool_find_import_impl(hdl, iarg, &lock, cache);
	pthread_mutex_destroy(&lock);
	return (pools);
}

nvlist_t *
zpool_search_import(libpc_handle_t *hdl, importargs_t *import)
{
	nvlist_t *pools = NULL;

	verify(import->poolname == NULL || import->guid == 0);

	if (import->cachefile != NULL)
		pools = zpool_find_import_cached(hdl, import);
	else
		pools = zpool_find_import(hdl, import);

	if ((pools == NULL || nvlist_empty(pools)) &&
	    hdl->lpc_open_access_error && geteuid() != 0) {
		(void) zutil_error(hdl, LPC_EACCESS, dgettext(TEXT_DOMAIN,
		    "no pools found"));
	}

	return (pools);
}

static boolean_t
pool_match(nvlist_t *cfg, const char *tgt)
{
	uint64_t v, guid = strtoull(tgt, NULL, 0);
	const char *s;

	if (guid != 0) {
		if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0)
			return (v == guid);
	} else {
		if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0)
			return (strcmp(s, tgt) == 0);
	}
	return (B_FALSE);
}
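/*
 * For illustration (values hypothetical): pool_match(cfg, "tank")
 * compares the target against ZPOOL_CONFIG_POOL_NAME, while
 * pool_match(cfg, "9413136323331264810") parses to a nonzero guid via
 * strtoull() and is compared against ZPOOL_CONFIG_POOL_GUID instead.
 */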
int
zpool_find_config(libpc_handle_t *hdl, const char *target, nvlist_t **configp,
    importargs_t *args)
{
	nvlist_t *pools;
	nvlist_t *match = NULL;
	nvlist_t *config = NULL;
	char *sepp = NULL;
	int count = 0;
	char *targetdup = strdup(target);

	if (targetdup == NULL)
		return (ENOMEM);

	*configp = NULL;

	if ((sepp = strpbrk(targetdup, "/@")) != NULL)
		*sepp = '\0';

	pools = zpool_search_import(hdl, args);

	if (pools != NULL) {
		nvpair_t *elem = NULL;
		while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) {
			VERIFY0(nvpair_value_nvlist(elem, &config));
			if (pool_match(config, targetdup)) {
				count++;
				if (match != NULL) {
					/* multiple matches found */
					continue;
				} else {
					match = fnvlist_dup(config);
				}
			}
		}
		fnvlist_free(pools);
	}

	if (count == 0) {
		free(targetdup);
		return (ENOENT);
	}

	if (count > 1) {
		free(targetdup);
		fnvlist_free(match);
		return (EINVAL);
	}

	*configp = match;
	free(targetdup);

	return (0);
}

/* Return whether a vdev is a leaf vdev. Note: draid spares are leaf vdevs. */
static boolean_t
vdev_is_leaf(nvlist_t *nv)
{
	uint_t children = 0;
	nvlist_t **child;

	(void) nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	return (children == 0);
}

/* Return whether a vdev is a leaf vdev and a real device (disk or file) */
static boolean_t
vdev_is_real_leaf(nvlist_t *nv)
{
	const char *type = NULL;
	if (!vdev_is_leaf(nv))
		return (B_FALSE);

	(void) nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type);
	if ((strcmp(type, VDEV_TYPE_DISK) == 0) ||
	    (strcmp(type, VDEV_TYPE_FILE) == 0)) {
		return (B_TRUE);
	}

	return (B_FALSE);
}
/*
 * This function is called by our FOR_EACH_VDEV() macros.
 *
 * state:	State machine status (stored inside of a (nvlist_t *))
 * nv:		The current vdev nvlist_t we are iterating over.
 * last_nv:	The previous vdev nvlist_t we returned to the user in
 *		the last iteration of FOR_EACH_VDEV(). We use it
 *		to find the next vdev nvlist_t we should return.
 * real_leaves_only:	Only return leaf vdevs.
 *
 * Returns 1 if we found the next vdev nvlist_t for this iteration. 0 if
 * we're still searching for it.
 */
static int
__for_each_vdev_macro_helper_func(void *state, nvlist_t *nv, void *last_nv,
    boolean_t real_leaves_only)
{
	enum {FIRST_NV = 0, NEXT_IS_MATCH = 1, STOP_LOOKING = 2};

	/* The very first entry in the NV list is a special case */
	if (*((nvlist_t **)state) == (nvlist_t *)FIRST_NV) {
		if (real_leaves_only && !vdev_is_real_leaf(nv))
			return (0);

		*((nvlist_t **)last_nv) = nv;
		*((nvlist_t **)state) = (nvlist_t *)STOP_LOOKING;
		return (1);
	}

	/*
	 * We came across our last_nv, meaning the next one is the one we
	 * want
	 */
	if (nv == *((nvlist_t **)last_nv)) {
		/* Next iteration of this function will return the nvlist_t */
		*((nvlist_t **)state) = (nvlist_t *)NEXT_IS_MATCH;
		return (0);
	}

	/*
	 * We marked NEXT_IS_MATCH on the previous iteration, so this is the
	 * one we want.
	 */
	if (*(nvlist_t **)state == (nvlist_t *)NEXT_IS_MATCH) {
		if (real_leaves_only && !vdev_is_real_leaf(nv))
			return (0);

		*((nvlist_t **)last_nv) = nv;
		*((nvlist_t **)state) = (nvlist_t *)STOP_LOOKING;
		return (1);
	}

	return (0);
}

int
for_each_vdev_macro_helper_func(void *state, nvlist_t *nv, void *last_nv)
{
	return (__for_each_vdev_macro_helper_func(state, nv, last_nv, B_FALSE));
}

int
for_each_real_leaf_vdev_macro_helper_func(void *state, nvlist_t *nv,
    void *last_nv)
{
	return (__for_each_vdev_macro_helper_func(state, nv, last_nv, B_TRUE));
}

/*
 * Internal function for iterating over the vdevs.
 *
 * For each vdev, func() will be called and will be passed 'zhp' (which is
 * typically the zpool_handle_t cast as a void pointer), the vdev's nvlist,
 * and a user-defined data pointer.
 *
 * The return values from all the func() calls will be OR'd together and
 * returned.
 */
int
for_each_vdev_cb(void *zhp, nvlist_t *nv, pool_vdev_iter_f func,
    void *data)
{
	nvlist_t **child;
	uint_t c, children;
	int ret = 0;
	int i;
	const char *type;

	const char *list[] = {
	    ZPOOL_CONFIG_SPARES,
	    ZPOOL_CONFIG_L2CACHE,
	    ZPOOL_CONFIG_CHILDREN
	};

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
		return (ret);

	/* Don't run our function on indirect vdevs */
	if (strcmp(type, VDEV_TYPE_INDIRECT) != 0) {
		ret |= func(zhp, nv, data);
	}

	for (i = 0; i < ARRAY_SIZE(list); i++) {
		if (nvlist_lookup_nvlist_array(nv, list[i], &child,
		    &children) == 0) {
			for (c = 0; c < children; c++) {
				uint64_t ishole = 0;

				(void) nvlist_lookup_uint64(child[c],
				    ZPOOL_CONFIG_IS_HOLE, &ishole);

				if (ishole)
					continue;

				ret |= for_each_vdev_cb(zhp, child[c],
				    func, data);
			}
		}
	}

	return (ret);
}

/*
 * Given a ZPOOL_CONFIG_VDEV_TREE nvpair, iterate over all the vdevs, calling
 * func() for each one. func() is passed the vdev's nvlist and an optional
 * user-defined 'data' pointer.
 */
int
for_each_vdev_in_nvlist(nvlist_t *nvroot, pool_vdev_iter_f func, void *data)
{
	return (for_each_vdev_cb(NULL, nvroot, func, data));
}
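/*
 * For illustration only, a hypothetical callback counting leaf vdevs in a
 * ZPOOL_CONFIG_VDEV_TREE nvlist via for_each_vdev_in_nvlist():
 *
 *	static int
 *	count_leaves_cb(void *zhp, nvlist_t *nv, void *data)
 *	{
 *		uint_t n = 0;
 *		nvlist_t **kids;
 *
 *		if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
 *		    &kids, &n) != 0 || n == 0)
 *			(*(int *)data)++;
 *		return (0);
 *	}
 *
 *	int leaves = 0;
 *	(void) for_each_vdev_in_nvlist(nvroot, count_leaves_cb, &leaves);
 *
 * Hole and indirect vdevs are skipped by the iterator itself, so they are
 * never passed to the callback.
 */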