/*-
 * Copyright (c) 2007 Doug Rabson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * Stand-alone ZFS file reader.
 */

#include <stdbool.h>
#include <sys/endian.h>
#include <sys/stat.h>
#include <sys/stdint.h>
#include <sys/list.h>
#include <sys/zfs_bootenv.h>
#include <machine/_inttypes.h>

#include "zfsimpl.h"
#include "zfssubr.c"

#ifdef HAS_ZSTD_ZFS
extern int zstd_init(void);
#endif

struct zfsmount {
	char			*path;
	const spa_t		*spa;
	objset_phys_t		objset;
	uint64_t		rootobj;
	STAILQ_ENTRY(zfsmount)	next;
};

typedef STAILQ_HEAD(zfs_mnt_list, zfsmount) zfs_mnt_list_t;
static zfs_mnt_list_t zfsmount = STAILQ_HEAD_INITIALIZER(zfsmount);

/*
 * The indirect_child_t represents the vdev that we will read from, when we
 * need to read all copies of the data (e.g. for scrub or reconstruction).
 * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror),
 * ic_vdev is the same as is_vdev.  However, for mirror top-level vdevs,
 * ic_vdev is a child of the mirror.
 */
typedef struct indirect_child {
	void *ic_data;
	vdev_t *ic_vdev;
} indirect_child_t;

/*
 * The indirect_split_t represents one mapped segment of an i/o to the
 * indirect vdev.  For non-split (contiguously-mapped) blocks, there will be
 * only one indirect_split_t, with is_split_offset==0 and is_size==io_size.
 * For split blocks, there will be several of these.
 */
typedef struct indirect_split {
	list_node_t is_node;		/* link on iv_splits */

	/*
	 * is_split_offset is the offset into the i/o.
	 * This is the sum of the previous splits' is_size's.
	 */
	uint64_t is_split_offset;

	vdev_t *is_vdev;		/* top-level vdev */
	uint64_t is_target_offset;	/* offset on is_vdev */
	uint64_t is_size;
	int is_children;		/* number of entries in is_child[] */

	/*
	 * is_good_child is the child that we are currently using to
	 * attempt reconstruction.
	 */
	int is_good_child;

	indirect_child_t is_child[1];	/* variable-length */
} indirect_split_t;
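/*
 * Illustrative sketch (hypothetical numbers, not part of the loader): a
 * 12 KiB i/o that the indirect mapping splits into three segments would be
 * described by three indirect_split_t's.  The invariants are that
 * is_split_offset is the running sum of the previous segments' is_size
 * values, and that the is_size values add up to io_size:
 *
 *	segment 0: is_split_offset = 0,    is_size = 4096
 *	segment 1: is_split_offset = 4096, is_size = 2048
 *	segment 2: is_split_offset = 6144, is_size = 6144
 *	                                   total   = 12288 == io_size
 */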
/*
 * The indirect_vsd_t is associated with each i/o to the indirect vdev.
 * It is the "Vdev-Specific Data" in the zio_t's io_vsd.
 */
typedef struct indirect_vsd {
	boolean_t iv_split_block;
	boolean_t iv_reconstruct;

	list_t iv_splits;		/* list of indirect_split_t's */
} indirect_vsd_t;

/*
 * List of all vdevs, chained through v_alllink.
 */
static vdev_list_t zfs_vdevs;

/*
 * List of ZFS features supported for read
 */
static const char *features_for_read[] = {
	"org.illumos:lz4_compress",
	"com.delphix:hole_birth",
	"com.delphix:extensible_dataset",
	"com.delphix:embedded_data",
	"org.open-zfs:large_blocks",
	"org.illumos:sha512",
	"org.illumos:skein",
	"org.zfsonlinux:large_dnode",
	"com.joyent:multi_vdev_crash_dump",
	"com.delphix:spacemap_histogram",
	"com.delphix:zpool_checkpoint",
	"com.delphix:spacemap_v2",
	"com.datto:encryption",
	"com.datto:bookmark_v2",
	"org.zfsonlinux:allocation_classes",
	"com.datto:resilver_defer",
	"com.delphix:device_removal",
	"com.delphix:obsolete_counts",
	"com.intel:allocation_classes",
	"org.freebsd:zstd_compress",
	"com.delphix:bookmark_written",
	NULL
};

/*
 * List of all pools, chained through spa_link.
 */
static spa_list_t zfs_pools;

static const dnode_phys_t *dnode_cache_obj;
static uint64_t dnode_cache_bn;
static char *dnode_cache_buf;

static int zio_read(const spa_t *spa, const blkptr_t *bp, void *buf);
static int zfs_get_root(const spa_t *spa, uint64_t *objid);
static int zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result);
static int zap_lookup(const spa_t *spa, const dnode_phys_t *dnode,
    const char *name, uint64_t integer_size, uint64_t num_integers,
    void *value);
static int objset_get_dnode(const spa_t *, const objset_phys_t *, uint64_t,
    dnode_phys_t *);
static int dnode_read(const spa_t *, const dnode_phys_t *, off_t, void *,
    size_t);
static int vdev_indirect_read(vdev_t *, const blkptr_t *, void *, off_t,
    size_t);
static int vdev_mirror_read(vdev_t *, const blkptr_t *, void *, off_t, size_t);
vdev_indirect_mapping_t *vdev_indirect_mapping_open(spa_t *, objset_phys_t *,
    uint64_t);
vdev_indirect_mapping_entry_phys_t *
    vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *, uint64_t,
    uint64_t, uint64_t *);

static void
zfs_init(void)
{
	STAILQ_INIT(&zfs_vdevs);
	STAILQ_INIT(&zfs_pools);

	dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);

	zfs_init_crc();
#ifdef HAS_ZSTD_ZFS
	zstd_init();
#endif
}
static int
nvlist_check_features_for_read(nvlist_t *nvl)
{
	nvlist_t *features = NULL;
	nvs_data_t *data;
	nvp_header_t *nvp;
	nv_string_t *nvp_name;
	int rc;

	rc = nvlist_find(nvl, ZPOOL_CONFIG_FEATURES_FOR_READ,
	    DATA_TYPE_NVLIST, NULL, &features, NULL);
	switch (rc) {
	case 0:
		break;		/* Continue with checks */

	case ENOENT:
		return (0);	/* All features are disabled */

	default:
		return (rc);	/* Error while reading nvlist */
	}

	data = (nvs_data_t *)features->nv_data;
	nvp = &data->nvl_pair;	/* first pair in nvlist */

	while (nvp->encoded_size != 0 && nvp->decoded_size != 0) {
		int i, found;

		nvp_name = (nv_string_t *)((uintptr_t)nvp + sizeof(*nvp));
		found = 0;

		for (i = 0; features_for_read[i] != NULL; i++) {
			if (memcmp(nvp_name->nv_data, features_for_read[i],
			    nvp_name->nv_size) == 0) {
				found = 1;
				break;
			}
		}

		if (!found) {
			printf("ZFS: unsupported feature: %.*s\n",
			    nvp_name->nv_size, nvp_name->nv_data);
			rc = EIO;
		}
		nvp = (nvp_header_t *)((uint8_t *)nvp + nvp->encoded_size);
	}
	nvlist_destroy(features);

	return (rc);
}

static int
vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t size)
{
	size_t psize;
	int rc;

	if (vdev->v_phys_read == NULL)
		return (ENOTSUP);

	if (bp) {
		psize = BP_GET_PSIZE(bp);
	} else {
		psize = size;
	}

	rc = vdev->v_phys_read(vdev, vdev->v_priv, offset, buf, psize);
	if (rc == 0) {
		if (bp != NULL)
			rc = zio_checksum_verify(vdev->v_spa, bp, buf);
	}

	return (rc);
}

static int
vdev_write_phys(vdev_t *vdev, void *buf, off_t offset, size_t size)
{
	if (vdev->v_phys_write == NULL)
		return (ENOTSUP);

	return (vdev->v_phys_write(vdev, offset, buf, size));
}

typedef struct remap_segment {
	vdev_t *rs_vd;
	uint64_t rs_offset;
	uint64_t rs_asize;
	uint64_t rs_split_offset;
	list_node_t rs_node;
} remap_segment_t;

static remap_segment_t *
rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
{
	remap_segment_t *rs = malloc(sizeof (remap_segment_t));

	if (rs != NULL) {
		rs->rs_vd = vd;
		rs->rs_offset = offset;
		rs->rs_asize = asize;
		rs->rs_split_offset = split_offset;
	}

	return (rs);
}

vdev_indirect_mapping_t *
vdev_indirect_mapping_open(spa_t *spa, objset_phys_t *os,
    uint64_t mapping_object)
{
	vdev_indirect_mapping_t *vim;
	vdev_indirect_mapping_phys_t *vim_phys;
	int rc;

	vim = calloc(1, sizeof (*vim));
	if (vim == NULL)
		return (NULL);

	vim->vim_dn = calloc(1, sizeof (*vim->vim_dn));
	if (vim->vim_dn == NULL) {
		free(vim);
		return (NULL);
	}

	rc = objset_get_dnode(spa, os, mapping_object, vim->vim_dn);
	if (rc != 0) {
		free(vim->vim_dn);
		free(vim);
		return (NULL);
	}

	vim->vim_spa = spa;
	vim->vim_phys = malloc(sizeof (*vim->vim_phys));
	if (vim->vim_phys == NULL) {
		free(vim->vim_dn);
		free(vim);
		return (NULL);
	}

	vim_phys = (vdev_indirect_mapping_phys_t *)DN_BONUS(vim->vim_dn);
	*vim->vim_phys = *vim_phys;

	vim->vim_objset = os;
	vim->vim_object = mapping_object;
	vim->vim_entries = NULL;

	vim->vim_havecounts =
	    (vim->vim_dn->dn_bonuslen > VDEV_INDIRECT_MAPPING_SIZE_V0);

	return (vim);
}
/*
 * Compare an offset with an indirect mapping entry; there are three
 * possible scenarios:
 *
 *     1. The offset is "less than" the mapping entry; meaning the
 *        offset is less than the source offset of the mapping entry.  In
 *        this case, there is no overlap between the offset and the
 *        mapping entry and -1 will be returned.
 *
 *     2. The offset is "greater than" the mapping entry; meaning the
 *        offset is greater than the mapping entry's source offset plus
 *        the entry's size.  In this case, there is no overlap between
 *        the offset and the mapping entry and 1 will be returned.
 *
 *        NOTE: If the offset is actually equal to the entry's offset
 *        plus size, this is considered to be "greater" than the entry,
 *        and this case applies (i.e. 1 will be returned).  Thus, the
 *        entry's "range" can be considered to be inclusive at its
 *        start, but exclusive at its end: e.g. [src, src + size).
 *
 *     3. The last case to consider is if the offset actually falls
 *        within the mapping entry's range.  If this is the case, the
 *        offset is considered to be "equal to" the mapping entry and
 *        0 will be returned.
 *
 *        NOTE: If the offset is equal to the entry's source offset,
 *        this case applies and 0 will be returned.  If the offset is
 *        equal to the entry's source plus its size, this case does
 *        *not* apply (see "NOTE" above for scenario 2), and 1 will be
 *        returned.
 */
static int
dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem)
{
	const uint64_t *key = v_key;
	const vdev_indirect_mapping_entry_phys_t *array_elem =
	    v_array_elem;
	uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem);

	if (*key < src_offset) {
		return (-1);
	} else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) {
		return (0);
	} else {
		return (1);
	}
}

/*
 * Return array entry.
 */
static vdev_indirect_mapping_entry_phys_t *
vdev_indirect_mapping_entry(vdev_indirect_mapping_t *vim, uint64_t index)
{
	uint64_t size;
	off_t offset = 0;
	int rc;

	if (vim->vim_phys->vimp_num_entries == 0)
		return (NULL);

	if (vim->vim_entries == NULL) {
		uint64_t bsize;

		bsize = vim->vim_dn->dn_datablkszsec << SPA_MINBLOCKSHIFT;
		size = vim->vim_phys->vimp_num_entries *
		    sizeof (*vim->vim_entries);
		if (size > bsize) {
			size = bsize / sizeof (*vim->vim_entries);
			size *= sizeof (*vim->vim_entries);
		}
		vim->vim_entries = malloc(size);
		if (vim->vim_entries == NULL)
			return (NULL);
		vim->vim_num_entries = size / sizeof (*vim->vim_entries);
		offset = index * sizeof (*vim->vim_entries);
	}

	/* We have data in vim_entries */
	if (offset == 0) {
		if (index >= vim->vim_entry_offset &&
		    index <= vim->vim_entry_offset + vim->vim_num_entries) {
			index -= vim->vim_entry_offset;
			return (&vim->vim_entries[index]);
		}
		offset = index * sizeof (*vim->vim_entries);
	}

	vim->vim_entry_offset = index;
	size = vim->vim_num_entries * sizeof (*vim->vim_entries);
	rc = dnode_read(vim->vim_spa, vim->vim_dn, offset, vim->vim_entries,
	    size);
	if (rc != 0) {
		/* Read error, invalidate vim_entries. */
		free(vim->vim_entries);
		vim->vim_entries = NULL;
		return (NULL);
	}
	index -= vim->vim_entry_offset;
	return (&vim->vim_entries[index]);
}
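/*
 * Worked example for dva_mapping_overlap_compare() above (hypothetical
 * numbers): for an entry with source offset 0x10000 and size 0x2000, the
 * entry's range is the half-open interval [0x10000, 0x12000), so:
 *
 *	key = 0x0ffff -> -1 (before the entry)
 *	key = 0x10000 ->  0 (first byte of the entry)
 *	key = 0x11fff ->  0 (last byte of the entry)
 *	key = 0x12000 ->  1 (one past the end is "greater")
 */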
/*
 * Returns the mapping entry for the given offset.
 *
 * It's possible that the given offset will not be in the mapping table
 * (i.e. no mapping entries contain this offset), in which case, the
 * return value depends on the "next_if_missing" parameter.
 *
 * If the offset is not found in the table and "next_if_missing" is
 * B_FALSE, then NULL will always be returned.  The behavior is intended
 * to allow consumers to get the entry corresponding to the offset
 * parameter, iff the offset overlaps with an entry in the table.
 *
 * If the offset is not found in the table and "next_if_missing" is
 * B_TRUE, then the entry nearest to the given offset will be returned,
 * such that the entry's source offset is greater than the offset
 * passed in (i.e. the "next" mapping entry in the table is returned, if
 * the offset is missing from the table).  If there are no entries whose
 * source offset is greater than the passed in offset, NULL is returned.
 */
static vdev_indirect_mapping_entry_phys_t *
vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim,
    uint64_t offset)
{
	ASSERT(vim->vim_phys->vimp_num_entries > 0);

	vdev_indirect_mapping_entry_phys_t *entry;

	uint64_t last = vim->vim_phys->vimp_num_entries - 1;
	uint64_t base = 0;

	/*
	 * We don't define these inside of the while loop because we use
	 * their value in the case that offset isn't in the mapping.
	 */
	uint64_t mid;
	int result;

	while (last >= base) {
		mid = base + ((last - base) >> 1);

		entry = vdev_indirect_mapping_entry(vim, mid);
		if (entry == NULL)
			break;
		result = dva_mapping_overlap_compare(&offset, entry);

		if (result == 0) {
			break;
		} else if (result < 0) {
			last = mid - 1;
		} else {
			base = mid + 1;
		}
	}
	return (entry);
}

/*
 * Given an indirect vdev and an extent on that vdev, it duplicates the
 * physical entries of the indirect mapping that correspond to the extent
 * to a new array and returns a pointer to it.  In addition, copied_entries
 * is populated with the number of mapping entries that were duplicated.
 *
 * Finally, since we are doing an allocation, it is up to the caller to
 * free the array allocated in this function.
 */
vdev_indirect_mapping_entry_phys_t *
vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset,
    uint64_t asize, uint64_t *copied_entries)
{
	vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL;
	vdev_indirect_mapping_t *vim = vd->v_mapping;
	uint64_t entries = 0;

	vdev_indirect_mapping_entry_phys_t *first_mapping =
	    vdev_indirect_mapping_entry_for_offset(vim, offset);
	ASSERT3P(first_mapping, !=, NULL);

	vdev_indirect_mapping_entry_phys_t *m = first_mapping;
	while (asize > 0) {
		uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
		uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m);
		uint64_t inner_size = MIN(asize, size - inner_offset);

		offset += inner_size;
		asize -= inner_size;
		entries++;
		m++;
	}

	size_t copy_length = entries * sizeof (*first_mapping);
	duplicate_mappings = malloc(copy_length);
	if (duplicate_mappings != NULL)
		bcopy(first_mapping, duplicate_mappings, copy_length);
	else
		entries = 0;

	*copied_entries = entries;

	return (duplicate_mappings);
}

static vdev_t *
vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
	vdev_t *rvd;
	vdev_list_t *vlist;

	vlist = &spa->spa_root_vdev->v_children;
	STAILQ_FOREACH(rvd, vlist, v_childlink)
		if (rvd->v_id == vdev)
			break;

	return (rvd);
}
/*
 * This is a callback for vdev_indirect_remap() which allocates an
 * indirect_split_t for each split segment and adds it to iv_splits.
 */
static void
vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	int n = 1;
	zio_t *zio = arg;
	indirect_vsd_t *iv = zio->io_vsd;

	if (vd->v_read == vdev_indirect_read)
		return;

	if (vd->v_read == vdev_mirror_read)
		n = vd->v_nchildren;

	indirect_split_t *is =
	    malloc(offsetof(indirect_split_t, is_child[n]));
	if (is == NULL) {
		zio->io_error = ENOMEM;
		return;
	}
	bzero(is, offsetof(indirect_split_t, is_child[n]));

	is->is_children = n;
	is->is_size = size;
	is->is_split_offset = split_offset;
	is->is_target_offset = offset;
	is->is_vdev = vd;

	/*
	 * Note that we only consider multiple copies of the data for
	 * *mirror* vdevs.  We don't for "replacing" or "spare" vdevs, even
	 * though they use the same ops as mirror, because there's only one
	 * "good" copy under the replacing/spare.
	 */
	if (vd->v_read == vdev_mirror_read) {
		int i = 0;
		vdev_t *kid;

		STAILQ_FOREACH(kid, &vd->v_children, v_childlink) {
			is->is_child[i++].ic_vdev = kid;
		}
	} else {
		is->is_child[0].ic_vdev = vd;
	}

	list_insert_tail(&iv->iv_splits, is);
}

static void
vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, void *arg)
{
	list_t stack;
	spa_t *spa = vd->v_spa;
	zio_t *zio = arg;
	remap_segment_t *rs;

	list_create(&stack, sizeof (remap_segment_t),
	    offsetof(remap_segment_t, rs_node));

	rs = rs_alloc(vd, offset, asize, 0);
	if (rs == NULL) {
		printf("vdev_indirect_remap: out of memory.\n");
		zio->io_error = ENOMEM;
	}
	for (; rs != NULL; rs = list_remove_head(&stack)) {
		vdev_t *v = rs->rs_vd;
		uint64_t num_entries = 0;
		/* vdev_indirect_mapping_t *vim = v->v_mapping; */
		vdev_indirect_mapping_entry_phys_t *mapping =
		    vdev_indirect_mapping_duplicate_adjacent_entries(v,
		    rs->rs_offset, rs->rs_asize, &num_entries);

		if (num_entries == 0)
			zio->io_error = ENOMEM;

		for (uint64_t i = 0; i < num_entries; i++) {
			vdev_indirect_mapping_entry_phys_t *m = &mapping[i];
			uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
			uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst);
			uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst);
			uint64_t inner_offset = rs->rs_offset -
			    DVA_MAPPING_GET_SRC_OFFSET(m);
			uint64_t inner_size =
			    MIN(rs->rs_asize, size - inner_offset);
			vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);

			if (dst_v->v_read == vdev_indirect_read) {
				remap_segment_t *o;

				o = rs_alloc(dst_v, dst_offset + inner_offset,
				    inner_size, rs->rs_split_offset);
				if (o == NULL) {
					printf("vdev_indirect_remap: "
					    "out of memory.\n");
					zio->io_error = ENOMEM;
					break;
				}

				list_insert_head(&stack, o);
			}
			vdev_indirect_gather_splits(rs->rs_split_offset, dst_v,
			    dst_offset + inner_offset,
			    inner_size, arg);

			/*
			 * vdev_indirect_gather_splits can fail with a memory
			 * allocation error; we cannot recover from that.
			 */
			if (zio->io_error != 0)
				break;
			rs->rs_offset += inner_size;
			rs->rs_asize -= inner_size;
			rs->rs_split_offset += inner_size;
		}

		free(mapping);
		free(rs);
		if (zio->io_error != 0)
			break;
	}

	list_destroy(&stack);
}
static void
vdev_indirect_map_free(zio_t *zio)
{
	indirect_vsd_t *iv = zio->io_vsd;
	indirect_split_t *is;

	while ((is = list_head(&iv->iv_splits)) != NULL) {
		for (int c = 0; c < is->is_children; c++) {
			indirect_child_t *ic = &is->is_child[c];
			free(ic->ic_data);
		}
		list_remove(&iv->iv_splits, is);
		free(is);
	}
	free(iv);
}
static int
vdev_indirect_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t bytes)
{
	zio_t zio;
	spa_t *spa = vdev->v_spa;
	indirect_vsd_t *iv;
	indirect_split_t *first;
	int rc = EIO;

	iv = calloc(1, sizeof(*iv));
	if (iv == NULL)
		return (ENOMEM);

	list_create(&iv->iv_splits,
	    sizeof (indirect_split_t), offsetof(indirect_split_t, is_node));

	bzero(&zio, sizeof(zio));
	zio.io_spa = spa;
	zio.io_bp = (blkptr_t *)bp;
	zio.io_data = buf;
	zio.io_size = bytes;
	zio.io_offset = offset;
	zio.io_vd = vdev;
	zio.io_vsd = iv;

	if (vdev->v_mapping == NULL) {
		vdev_indirect_config_t *vic;

		vic = &vdev->vdev_indirect_config;
		vdev->v_mapping = vdev_indirect_mapping_open(spa,
		    spa->spa_mos, vic->vic_mapping_object);
	}

	vdev_indirect_remap(vdev, offset, bytes, &zio);
	if (zio.io_error != 0)
		return (zio.io_error);

	first = list_head(&iv->iv_splits);
	if (first->is_size == zio.io_size) {
		/*
		 * This is not a split block; we are pointing to the entire
		 * data, which will checksum the same as the original data.
		 * Pass the BP down so that the child i/o can verify the
		 * checksum, and try a different location if available
		 * (e.g. on a mirror).
		 *
		 * While this special case could be handled the same as the
		 * general (split block) case, doing it this way ensures
		 * that the vast majority of blocks on indirect vdevs
		 * (which are not split) are handled identically to blocks
		 * on non-indirect vdevs.  This allows us to be less strict
		 * about performance in the general (but rare) case.
		 */
		rc = first->is_vdev->v_read(first->is_vdev, zio.io_bp,
		    zio.io_data, first->is_target_offset, bytes);
	} else {
		iv->iv_split_block = B_TRUE;
		/*
		 * Read one copy of each split segment, from the
		 * top-level vdev.  Since we don't know the
		 * checksum of each split individually, the child
		 * zio can't ensure that we get the right data.
		 * E.g. if it's a mirror, it will just read from a
		 * random (healthy) leaf vdev.  We have to verify
		 * the checksum in vdev_indirect_io_done().
		 */
		for (indirect_split_t *is = list_head(&iv->iv_splits);
		    is != NULL; is = list_next(&iv->iv_splits, is)) {
			char *ptr = zio.io_data;

			rc = is->is_vdev->v_read(is->is_vdev, zio.io_bp,
			    ptr + is->is_split_offset, is->is_target_offset,
			    is->is_size);
		}
		if (zio_checksum_verify(spa, zio.io_bp, zio.io_data))
			rc = ECKSUM;
		else
			rc = 0;
	}

	vdev_indirect_map_free(&zio);
	if (rc == 0)
		rc = zio.io_error;

	return (rc);
}

static int
vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t bytes)
{

	return (vdev_read_phys(vdev, bp, buf,
	    offset + VDEV_LABEL_START_SIZE, bytes));
}

static int
vdev_missing_read(vdev_t *vdev __unused, const blkptr_t *bp __unused,
    void *buf __unused, off_t offset __unused, size_t bytes __unused)
{

	return (ENOTSUP);
}

static int
vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t bytes)
{
	vdev_t *kid;
	int rc;

	rc = EIO;
	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
		if (kid->v_state != VDEV_STATE_HEALTHY)
			continue;
		rc = kid->v_read(kid, bp, buf, offset, bytes);
		if (!rc)
			return (0);
	}

	return (rc);
}

static int
vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t bytes)
{
	vdev_t *kid;

	/*
	 * Here we should have two kids:
	 * The first one is the vdev we are replacing: only it can be
	 * trusted to have valid data, but it might not be present.
	 * The second one is the vdev we are replacing it with.  It is most
	 * likely healthy, but we can't trust that it has the needed data,
	 * so we won't use it.
	 */
	kid = STAILQ_FIRST(&vdev->v_children);
	if (kid == NULL)
		return (EIO);
	if (kid->v_state != VDEV_STATE_HEALTHY)
		return (EIO);
	return (kid->v_read(kid, bp, buf, offset, bytes));
}

static vdev_t *
vdev_find(uint64_t guid)
{
	vdev_t *vdev;

	STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
		if (vdev->v_guid == guid)
			return (vdev);

	return (NULL);
}
static vdev_t *
vdev_create(uint64_t guid, vdev_read_t *_read)
{
	vdev_t *vdev;
	vdev_indirect_config_t *vic;

	vdev = calloc(1, sizeof(vdev_t));
	if (vdev != NULL) {
		STAILQ_INIT(&vdev->v_children);
		vdev->v_guid = guid;
		vdev->v_read = _read;

		/*
		 * The root vdev has no read function; we use this fact to
		 * skip setting up data we do not need for the root vdev.
		 * We only point to the root vdev from the spa.
		 */
		if (_read != NULL) {
			vic = &vdev->vdev_indirect_config;
			vic->vic_prev_indirect_vdev = UINT64_MAX;
			STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
		}
	}

	return (vdev);
}

static void
vdev_set_initial_state(vdev_t *vdev, const nvlist_t *nvlist)
{
	uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
	uint64_t is_log;

	is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
	is_log = 0;
	(void) nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, NULL,
	    &is_offline, NULL);
	(void) nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, NULL,
	    &is_removed, NULL);
	(void) nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, NULL,
	    &is_faulted, NULL);
	(void) nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64,
	    NULL, &is_degraded, NULL);
	(void) nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64,
	    NULL, &isnt_present, NULL);
	(void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, NULL,
	    &is_log, NULL);

	if (is_offline != 0)
		vdev->v_state = VDEV_STATE_OFFLINE;
	else if (is_removed != 0)
		vdev->v_state = VDEV_STATE_REMOVED;
	else if (is_faulted != 0)
		vdev->v_state = VDEV_STATE_FAULTED;
	else if (is_degraded != 0)
		vdev->v_state = VDEV_STATE_DEGRADED;
	else if (isnt_present != 0)
		vdev->v_state = VDEV_STATE_CANT_OPEN;

	vdev->v_islog = is_log != 0;
}

static int
vdev_init(uint64_t guid, const nvlist_t *nvlist, vdev_t **vdevp)
{
	uint64_t id, ashift, asize, nparity;
	const char *path;
	const char *type;
	int len, pathlen;
	char *name;
	vdev_t *vdev;

	if (nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, NULL, &id,
	    NULL) ||
	    nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING, NULL,
	    &type, &len)) {
		return (ENOENT);
	}

	if (memcmp(type, VDEV_TYPE_MIRROR, len) != 0 &&
	    memcmp(type, VDEV_TYPE_DISK, len) != 0 &&
#ifdef ZFS_TEST
	    memcmp(type, VDEV_TYPE_FILE, len) != 0 &&
#endif
	    memcmp(type, VDEV_TYPE_RAIDZ, len) != 0 &&
	    memcmp(type, VDEV_TYPE_INDIRECT, len) != 0 &&
	    memcmp(type, VDEV_TYPE_REPLACING, len) != 0 &&
	    memcmp(type, VDEV_TYPE_HOLE, len) != 0) {
		printf("ZFS: can only boot from disk, mirror, raidz1, "
		    "raidz2 and raidz3 vdevs, got: %.*s\n", len, type);
		return (EIO);
	}

	if (memcmp(type, VDEV_TYPE_MIRROR, len) == 0)
		vdev = vdev_create(guid, vdev_mirror_read);
	else if (memcmp(type, VDEV_TYPE_RAIDZ, len) == 0)
		vdev = vdev_create(guid, vdev_raidz_read);
	else if (memcmp(type, VDEV_TYPE_REPLACING, len) == 0)
		vdev = vdev_create(guid, vdev_replacing_read);
	else if (memcmp(type, VDEV_TYPE_INDIRECT, len) == 0) {
		vdev_indirect_config_t *vic;

		vdev = vdev_create(guid, vdev_indirect_read);
		if (vdev != NULL) {
			vdev->v_state = VDEV_STATE_HEALTHY;
			vic = &vdev->vdev_indirect_config;

			nvlist_find(nvlist,
			    ZPOOL_CONFIG_INDIRECT_OBJECT,
			    DATA_TYPE_UINT64,
			    NULL, &vic->vic_mapping_object, NULL);
			nvlist_find(nvlist,
			    ZPOOL_CONFIG_INDIRECT_BIRTHS,
			    DATA_TYPE_UINT64,
			    NULL, &vic->vic_births_object, NULL);
			nvlist_find(nvlist,
			    ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
			    DATA_TYPE_UINT64,
			    NULL, &vic->vic_prev_indirect_vdev, NULL);
		}
	} else if (memcmp(type, VDEV_TYPE_HOLE, len) == 0) {
		vdev = vdev_create(guid, vdev_missing_read);
	} else {
		vdev = vdev_create(guid, vdev_disk_read);
	}

	if (vdev == NULL)
		return (ENOMEM);

	vdev_set_initial_state(vdev, nvlist);
	vdev->v_id = id;
	if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
	    DATA_TYPE_UINT64, NULL, &ashift, NULL) == 0)
		vdev->v_ashift = ashift;

	if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE,
	    DATA_TYPE_UINT64, NULL, &asize, NULL) == 0) {
		vdev->v_psize = asize +
		    VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
	}

	if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
	    DATA_TYPE_UINT64, NULL, &nparity, NULL) == 0)
		vdev->v_nparity = nparity;

	if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
	    DATA_TYPE_STRING, NULL, &path, &pathlen) == 0) {
		char prefix[] = "/dev/";

		len = strlen(prefix);
		if (len < pathlen && memcmp(path, prefix, len) == 0) {
			path += len;
			pathlen -= len;
		}
		name = malloc(pathlen + 1);
		bcopy(path, name, pathlen);
		name[pathlen] = '\0';
		vdev->v_name = name;
	} else {
		name = NULL;
		if (memcmp(type, VDEV_TYPE_RAIDZ, len) == 0) {
			if (vdev->v_nparity < 1 ||
			    vdev->v_nparity > 3) {
				printf("ZFS: invalid raidz parity: %d\n",
				    vdev->v_nparity);
				return (EIO);
			}
			(void) asprintf(&name, "%.*s%d-%" PRIu64, len, type,
			    vdev->v_nparity, id);
		} else {
			(void) asprintf(&name, "%.*s-%" PRIu64, len, type, id);
		}
		vdev->v_name = name;
	}
	*vdevp = vdev;
	return (0);
}
/*
 * Find the slot for the vdev.  We return either NULL, to signal the caller
 * to use STAILQ_INSERT_HEAD, or the link element to be used with
 * STAILQ_INSERT_AFTER.
 */
static vdev_t *
vdev_find_previous(vdev_t *top_vdev, vdev_t *vdev)
{
	vdev_t *v, *previous;

	if (STAILQ_EMPTY(&top_vdev->v_children))
		return (NULL);

	previous = NULL;
	STAILQ_FOREACH(v, &top_vdev->v_children, v_childlink) {
		if (v->v_id > vdev->v_id)
			return (previous);

		if (v->v_id == vdev->v_id)
			return (v);

		if (v->v_id < vdev->v_id)
			previous = v;
	}
	return (previous);
}

static size_t
vdev_child_count(vdev_t *vdev)
{
	vdev_t *v;
	size_t count;

	count = 0;
	STAILQ_FOREACH(v, &vdev->v_children, v_childlink) {
		count++;
	}
	return (count);
}

/*
 * Insert vdev into top_vdev's children list.  The list is ordered by v_id.
 */
static void
vdev_insert(vdev_t *top_vdev, vdev_t *vdev)
{
	vdev_t *previous;
	size_t count;

	/*
	 * Top-level vdevs can appear in random order, depending on how
	 * the firmware presents the disk devices.  However, we insert
	 * each vdev so that the list stays ordered by v_id; that way we
	 * can use either STAILQ_INSERT_HEAD or STAILQ_INSERT_AFTER, as
	 * STAILQ has no insert-before.
	 */
	previous = vdev_find_previous(top_vdev, vdev);

	if (previous == NULL) {
		STAILQ_INSERT_HEAD(&top_vdev->v_children, vdev, v_childlink);
	} else if (previous->v_id == vdev->v_id) {
		/*
		 * This vdev was already configured from the label config;
		 * do not insert a duplicate.
		 */
		return;
	} else {
		STAILQ_INSERT_AFTER(&top_vdev->v_children, previous, vdev,
		    v_childlink);
	}

	count = vdev_child_count(top_vdev);
	if (top_vdev->v_nchildren < count)
		top_vdev->v_nchildren = count;
}
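/*
 * Illustrative sketch for vdev_insert() (hypothetical ids): if the children
 * list currently holds v_id 0, 2, 5 and a vdev with v_id 3 arrives,
 * vdev_find_previous() returns the element with v_id 2, and
 * STAILQ_INSERT_AFTER yields 0, 2, 3, 5.  If v_id 3 arrives a second time,
 * vdev_find_previous() returns the existing element with the same id and
 * vdev_insert() drops it as a duplicate.
 */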
static int
vdev_from_nvlist(spa_t *spa, uint64_t top_guid, const nvlist_t *nvlist)
{
	vdev_t *top_vdev, *vdev;
	nvlist_t **kids = NULL;
	int rc, nkids;

	/* Get top vdev. */
	top_vdev = vdev_find(top_guid);
	if (top_vdev == NULL) {
		rc = vdev_init(top_guid, nvlist, &top_vdev);
		if (rc != 0)
			return (rc);
		top_vdev->v_spa = spa;
		top_vdev->v_top = top_vdev;
		vdev_insert(spa->spa_root_vdev, top_vdev);
	}

	/* Add children if there are any. */
	rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
	    &nkids, &kids, NULL);
	if (rc == 0) {
		for (int i = 0; i < nkids; i++) {
			uint64_t guid;

			rc = nvlist_find(kids[i], ZPOOL_CONFIG_GUID,
			    DATA_TYPE_UINT64, NULL, &guid, NULL);
			if (rc != 0)
				goto done;

			rc = vdev_init(guid, kids[i], &vdev);
			if (rc != 0)
				goto done;

			vdev->v_spa = spa;
			vdev->v_top = top_vdev;
			vdev_insert(top_vdev, vdev);
		}
	} else {
		/*
		 * When there are no children, nvlist_find() returns an
		 * error; reset it, because leaf devices have no children.
		 */
		rc = 0;
	}
done:
	if (kids != NULL) {
		for (int i = 0; i < nkids; i++)
			nvlist_destroy(kids[i]);
		free(kids);
	}

	return (rc);
}

static int
vdev_init_from_label(spa_t *spa, const nvlist_t *nvlist)
{
	uint64_t pool_guid, top_guid;
	nvlist_t *vdevs;
	int rc;

	if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
	    NULL, &pool_guid, NULL) ||
	    nvlist_find(nvlist, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64,
	    NULL, &top_guid, NULL) ||
	    nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
	    NULL, &vdevs, NULL)) {
		printf("ZFS: can't find vdev details\n");
		return (ENOENT);
	}

	rc = vdev_from_nvlist(spa, top_guid, vdevs);
	nvlist_destroy(vdevs);
	return (rc);
}

static void
vdev_set_state(vdev_t *vdev)
{
	vdev_t *kid;
	int good_kids;
	int bad_kids;

	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
		vdev_set_state(kid);
	}

	/*
	 * A mirror or raidz is healthy if all its kids are healthy.  A
	 * mirror is degraded if some, but not all, of its kids are healthy;
	 * a raidz is degraded if no more than nparity kids are offline.
	 */
	if (STAILQ_FIRST(&vdev->v_children)) {
		good_kids = 0;
		bad_kids = 0;
		STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
			if (kid->v_state == VDEV_STATE_HEALTHY)
				good_kids++;
			else
				bad_kids++;
		}
		if (bad_kids == 0) {
			vdev->v_state = VDEV_STATE_HEALTHY;
		} else {
			if (vdev->v_read == vdev_mirror_read) {
				if (good_kids) {
					vdev->v_state = VDEV_STATE_DEGRADED;
				} else {
					vdev->v_state = VDEV_STATE_OFFLINE;
				}
			} else if (vdev->v_read == vdev_raidz_read) {
				if (bad_kids > vdev->v_nparity) {
					vdev->v_state = VDEV_STATE_OFFLINE;
				} else {
					vdev->v_state = VDEV_STATE_DEGRADED;
				}
			}
		}
	}
}
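/*
 * Summary of the state evaluation above, derived from the code, with a
 * hypothetical two-way mirror and raidz1 as examples:
 *
 *	mirror, 2 healthy kids           -> HEALTHY
 *	mirror, 1 healthy / 1 bad kid    -> DEGRADED
 *	mirror, 0 healthy kids           -> OFFLINE
 *	raidz1, 1 bad kid  (<= nparity)  -> DEGRADED
 *	raidz1, 2 bad kids (>  nparity)  -> OFFLINE
 */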
static int
vdev_update_from_nvlist(uint64_t top_guid, const nvlist_t *nvlist)
{
	vdev_t *vdev;
	nvlist_t **kids = NULL;
	int rc, nkids;

	/* Update top vdev. */
	vdev = vdev_find(top_guid);
	if (vdev != NULL)
		vdev_set_initial_state(vdev, nvlist);

	/* Update children if there are any. */
	rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
	    &nkids, &kids, NULL);
	if (rc == 0) {
		for (int i = 0; i < nkids; i++) {
			uint64_t guid;

			rc = nvlist_find(kids[i], ZPOOL_CONFIG_GUID,
			    DATA_TYPE_UINT64, NULL, &guid, NULL);
			if (rc != 0)
				break;

			vdev = vdev_find(guid);
			if (vdev != NULL)
				vdev_set_initial_state(vdev, kids[i]);
		}
	} else {
		rc = 0;
	}
	if (kids != NULL) {
		for (int i = 0; i < nkids; i++)
			nvlist_destroy(kids[i]);
		free(kids);
	}

	return (rc);
}

static int
vdev_init_from_nvlist(spa_t *spa, const nvlist_t *nvlist)
{
	uint64_t pool_guid, vdev_children;
	nvlist_t *vdevs = NULL, **kids = NULL;
	int rc, nkids;

	if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
	    NULL, &pool_guid, NULL) ||
	    nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_CHILDREN, DATA_TYPE_UINT64,
	    NULL, &vdev_children, NULL) ||
	    nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
	    NULL, &vdevs, NULL)) {
		printf("ZFS: can't find vdev details\n");
		return (ENOENT);
	}

	/* Wrong guid?! */
	if (spa->spa_guid != pool_guid) {
		nvlist_destroy(vdevs);
		return (EINVAL);
	}

	spa->spa_root_vdev->v_nchildren = vdev_children;

	rc = nvlist_find(vdevs, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
	    &nkids, &kids, NULL);
	nvlist_destroy(vdevs);

	/*
	 * The MOS config has at least one child for the root vdev.
	 */
	if (rc != 0)
		return (rc);

	for (int i = 0; i < nkids; i++) {
		uint64_t guid;
		vdev_t *vdev;

		rc = nvlist_find(kids[i], ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
		    NULL, &guid, NULL);
		if (rc != 0)
			break;
		vdev = vdev_find(guid);
		/*
		 * The top-level vdev is missing; create it.
		 */
		if (vdev == NULL)
			rc = vdev_from_nvlist(spa, guid, kids[i]);
		else
			rc = vdev_update_from_nvlist(guid, kids[i]);
		if (rc != 0)
			break;
	}
	if (kids != NULL) {
		for (int i = 0; i < nkids; i++)
			nvlist_destroy(kids[i]);
		free(kids);
	}

	/*
	 * Re-evaluate top-level vdev state.
	 */
	vdev_set_state(spa->spa_root_vdev);

	return (rc);
}
static spa_t *
spa_find_by_guid(uint64_t guid)
{
	spa_t *spa;

	STAILQ_FOREACH(spa, &zfs_pools, spa_link)
		if (spa->spa_guid == guid)
			return (spa);

	return (NULL);
}

static spa_t *
spa_find_by_name(const char *name)
{
	spa_t *spa;

	STAILQ_FOREACH(spa, &zfs_pools, spa_link)
		if (strcmp(spa->spa_name, name) == 0)
			return (spa);

	return (NULL);
}

static spa_t *
spa_find_by_dev(struct zfs_devdesc *dev)
{

	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
		return (NULL);

	if (dev->pool_guid == 0)
		return (STAILQ_FIRST(&zfs_pools));

	return (spa_find_by_guid(dev->pool_guid));
}

static spa_t *
spa_create(uint64_t guid, const char *name)
{
	spa_t *spa;

	if ((spa = calloc(1, sizeof(spa_t))) == NULL)
		return (NULL);
	if ((spa->spa_name = strdup(name)) == NULL) {
		free(spa);
		return (NULL);
	}
	spa->spa_uberblock = &spa->spa_uberblock_master;
	spa->spa_mos = &spa->spa_mos_master;
	spa->spa_guid = guid;
	spa->spa_root_vdev = vdev_create(guid, NULL);
	if (spa->spa_root_vdev == NULL) {
		free(spa->spa_name);
		free(spa);
		return (NULL);
	}
	spa->spa_root_vdev->v_name = strdup("root");
	STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);

	return (spa);
}

static const char *
state_name(vdev_state_t state)
{
	static const char *names[] = {
		"UNKNOWN",
		"CLOSED",
		"OFFLINE",
		"REMOVED",
		"CANT_OPEN",
		"FAULTED",
		"DEGRADED",
		"ONLINE"
	};
	return (names[state]);
}
#ifdef BOOT2

#define pager_printf printf

#else

static int
pager_printf(const char *fmt, ...)
{
	char line[80];
	va_list args;

	va_start(args, fmt);
	vsnprintf(line, sizeof(line), fmt, args);
	va_end(args);
	return (pager_output(line));
}

#endif

#define STATUS_FORMAT " %s %s\n"

static int
print_state(int indent, const char *name, vdev_state_t state)
{
	int i;
	char buf[512];

	buf[0] = 0;
	for (i = 0; i < indent; i++)
		strcat(buf, " ");
	strcat(buf, name);
	return (pager_printf(STATUS_FORMAT, buf, state_name(state)));
}

static int
vdev_status(vdev_t *vdev, int indent)
{
	vdev_t *kid;
	int ret;

	if (vdev->v_islog) {
		(void) pager_output(" logs\n");
		indent++;
	}

	ret = print_state(indent, vdev->v_name, vdev->v_state);
	if (ret != 0)
		return (ret);

	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
		ret = vdev_status(kid, indent + 1);
		if (ret != 0)
			return (ret);
	}
	return (ret);
}

static int
spa_status(spa_t *spa)
{
	static char bootfs[ZFS_MAXNAMELEN];
	uint64_t rootid;
	vdev_list_t *vlist;
	vdev_t *vdev;
	int good_kids, bad_kids, degraded_kids, ret;
	vdev_state_t state;

	ret = pager_printf(" pool: %s\n", spa->spa_name);
	if (ret != 0)
		return (ret);

	if (zfs_get_root(spa, &rootid) == 0 &&
	    zfs_rlookup(spa, rootid, bootfs) == 0) {
		if (bootfs[0] == '\0')
			ret = pager_printf("bootfs: %s\n", spa->spa_name);
		else
			ret = pager_printf("bootfs: %s/%s\n", spa->spa_name,
			    bootfs);
		if (ret != 0)
			return (ret);
	}
	ret = pager_printf("config:\n\n");
	if (ret != 0)
		return (ret);
	ret = pager_printf(STATUS_FORMAT, "NAME", "STATE");
	if (ret != 0)
		return (ret);

	good_kids = 0;
	degraded_kids = 0;
	bad_kids = 0;
	vlist = &spa->spa_root_vdev->v_children;
	STAILQ_FOREACH(vdev, vlist, v_childlink) {
		if (vdev->v_state == VDEV_STATE_HEALTHY)
			good_kids++;
		else if (vdev->v_state == VDEV_STATE_DEGRADED)
			degraded_kids++;
		else
			bad_kids++;
	}

	state = VDEV_STATE_CLOSED;
	if (good_kids > 0 && (degraded_kids + bad_kids) == 0)
		state = VDEV_STATE_HEALTHY;
	else if ((good_kids + degraded_kids) > 0)
		state = VDEV_STATE_DEGRADED;

	ret = print_state(0, spa->spa_name, state);
	if (ret != 0)
		return (ret);

	STAILQ_FOREACH(vdev, vlist, v_childlink) {
		ret = vdev_status(vdev, 1);
		if (ret != 0)
			return (ret);
	}
	return (ret);
}

static int
spa_all_status(void)
{
	spa_t *spa;
	int first = 1, ret = 0;

	STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
		if (!first) {
			ret = pager_printf("\n");
			if (ret != 0)
				return (ret);
		}
		first = 0;
		ret = spa_status(spa);
		if (ret != 0)
			return (ret);
	}
	return (ret);
}

static uint64_t
vdev_label_offset(uint64_t psize, int l, uint64_t offset)
{
	uint64_t label_offset;

	if (l < VDEV_LABELS / 2)
		label_offset = 0;
	else
		label_offset = psize - VDEV_LABELS * sizeof (vdev_label_t);

	return (offset + l * sizeof (vdev_label_t) + label_offset);
}
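/*
 * Worked example for vdev_label_offset() (hypothetical 1 GiB device,
 * assuming sizeof (vdev_label_t) == 256 KiB): labels 0 and 1 sit at the
 * front of the device, labels 2 and 3 at the end.
 *
 *	l = 0: 0 * 256K + 0              = offset
 *	l = 1: 1 * 256K + 0              = offset + 256K
 *	l = 2: 2 * 256K + (1G - 4*256K)  = offset + 1G - 512K
 *	l = 3: 3 * 256K + (1G - 4*256K)  = offset + 1G - 256K
 */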
static int
vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2)
{
	unsigned int seq1 = 0;
	unsigned int seq2 = 0;
	int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg);

	if (cmp != 0)
		return (cmp);

	cmp = AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp);
	if (cmp != 0)
		return (cmp);

	if (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1))
		seq1 = MMP_SEQ(ub1);

	if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2))
		seq2 = MMP_SEQ(ub2);

	return (AVL_CMP(seq1, seq2));
}

static int
uberblock_verify(uberblock_t *ub)
{
	if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC)) {
		byteswap_uint64_array(ub, sizeof (uberblock_t));
	}

	if (ub->ub_magic != UBERBLOCK_MAGIC ||
	    !SPA_VERSION_IS_SUPPORTED(ub->ub_version))
		return (EINVAL);

	return (0);
}

static int
vdev_label_read(vdev_t *vd, int l, void *buf, uint64_t offset,
    size_t size)
{
	blkptr_t bp;
	off_t off;

	off = vdev_label_offset(vd->v_psize, l, offset);

	BP_ZERO(&bp);
	BP_SET_LSIZE(&bp, size);
	BP_SET_PSIZE(&bp, size);
	BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
	BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
	DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
	ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);

	return (vdev_read_phys(vd, &bp, buf, off, size));
}

/*
 * We need to be sure we write to the correct location.
 * The vdev label consists of four fields:
 * pad1 (8k), reserved.
 * bootenv (8k), checksummed, previously reserved, may contain garbage.
 * vdev_phys (112k), checksummed.
 * uberblock ring (128k), checksummed.
 *
 * Since the bootenv area may contain garbage, we cannot reliably read it,
 * as we can get checksum errors.
 * The next best thing is vdev_phys - it is located just after bootenv.  It
 * may still be corrupted, but in that case we will just miss this one write.
 */
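/*
 * Layout of one vdev_label_t as described above (offsets within the label):
 *
 *	[    0,   8K) vl_pad1      - reserved
 *	[   8K,  16K) vl_be        - bootenv, checksummed
 *	[  16K, 128K) vl_vdev_phys - config nvlist, checksummed
 *	[ 128K, 256K) uberblock ring, checksummed
 */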
static int
vdev_label_write_validate(vdev_t *vd, int l, uint64_t offset)
{
	uint64_t off, o_phys;
	void *buf;
	size_t size = VDEV_PHYS_SIZE;
	int rc;

	o_phys = offsetof(vdev_label_t, vl_vdev_phys);
	off = vdev_label_offset(vd->v_psize, l, o_phys);

	/* off should be 8K from bootenv */
	if (vdev_label_offset(vd->v_psize, l, offset) + VDEV_PAD_SIZE != off)
		return (EINVAL);

	buf = malloc(size);
	if (buf == NULL)
		return (ENOMEM);

	/* Read vdev_phys */
	rc = vdev_label_read(vd, l, buf, o_phys, size);
	free(buf);
	return (rc);
}

static int
vdev_label_write(vdev_t *vd, int l, vdev_boot_envblock_t *be, uint64_t offset)
{
	zio_checksum_info_t *ci;
	zio_cksum_t cksum;
	off_t off;
	size_t size = VDEV_PAD_SIZE;
	int rc;

	if (vd->v_phys_write == NULL)
		return (ENOTSUP);

	off = vdev_label_offset(vd->v_psize, l, offset);

	rc = vdev_label_write_validate(vd, l, offset);
	if (rc != 0) {
		return (rc);
	}

	ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL];
	be->vbe_zbt.zec_magic = ZEC_MAGIC;
	zio_checksum_label_verifier(&be->vbe_zbt.zec_cksum, off);
	ci->ci_func[0](be, size, NULL, &cksum);
	be->vbe_zbt.zec_cksum = cksum;

	return (vdev_write_phys(vd, be, off, size));
}

static int
vdev_write_bootenv_impl(vdev_t *vdev, vdev_boot_envblock_t *be)
{
	vdev_t *kid;
	int rv = 0, rc;

	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
		if (kid->v_state != VDEV_STATE_HEALTHY)
			continue;
		rc = vdev_write_bootenv_impl(kid, be);
		if (rv == 0)
			rv = rc;
	}

	/*
	 * Non-leaf vdevs do not have v_phys_write.
	 */
	if (vdev->v_phys_write == NULL)
		return (rv);

	for (int l = 0; l < VDEV_LABELS; l++) {
		rc = vdev_label_write(vdev, l, be,
		    offsetof(vdev_label_t, vl_be));
		if (rc != 0) {
			printf("failed to write bootenv to %s label %d: %d\n",
			    vdev->v_name ? vdev->v_name : "unknown", l, rc);
			rv = rc;
		}
	}
	return (rv);
}
int
vdev_write_bootenv(vdev_t *vdev, nvlist_t *nvl)
{
	vdev_boot_envblock_t *be;
	nvlist_t nv, *nvp;
	uint64_t version;
	int rv;

	if (nvl->nv_size > sizeof(be->vbe_bootenv))
		return (E2BIG);

	version = VB_RAW;
	nvp = vdev_read_bootenv(vdev);
	if (nvp != NULL) {
		nvlist_find(nvp, BOOTENV_VERSION, DATA_TYPE_UINT64, NULL,
		    &version, NULL);
		nvlist_destroy(nvp);
	}

	be = calloc(1, sizeof(*be));
	if (be == NULL)
		return (ENOMEM);

	be->vbe_version = version;
	switch (version) {
	case VB_RAW:
		/*
		 * If there is no envmap, we will just wipe bootenv.
		 */
		nvlist_find(nvl, GRUB_ENVMAP, DATA_TYPE_STRING, NULL,
		    be->vbe_bootenv, NULL);
		rv = 0;
		break;

	case VB_NVLIST:
		nv.nv_header = nvl->nv_header;
		nv.nv_asize = nvl->nv_asize;
		nv.nv_size = nvl->nv_size;

		bcopy(&nv.nv_header, be->vbe_bootenv, sizeof(nv.nv_header));
		nv.nv_data = be->vbe_bootenv + sizeof(nvs_header_t);
		bcopy(nvl->nv_data, nv.nv_data, nv.nv_size);
		rv = nvlist_export(&nv);
		break;

	default:
		rv = EINVAL;
		break;
	}

	if (rv == 0) {
		be->vbe_version = htobe64(be->vbe_version);
		rv = vdev_write_bootenv_impl(vdev, be);
	}
	free(be);
	return (rv);
}

/*
 * Read the bootenv area from the pool label and return the nvlist from it.
 * We return from the first successful read.
 */
nvlist_t *
vdev_read_bootenv(vdev_t *vdev)
{
	vdev_t *kid;
	nvlist_t *benv;
	vdev_boot_envblock_t *be;
	char *command;
	bool ok;
	int rv;

	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
		if (kid->v_state != VDEV_STATE_HEALTHY)
			continue;

		benv = vdev_read_bootenv(kid);
		if (benv != NULL)
			return (benv);
	}

	be = malloc(sizeof (*be));
	if (be == NULL)
		return (NULL);

	rv = 0;
	for (int l = 0; l < VDEV_LABELS; l++) {
		rv = vdev_label_read(vdev, l, be,
		    offsetof(vdev_label_t, vl_be),
		    sizeof (*be));
		if (rv == 0)
			break;
	}
	if (rv != 0) {
		free(be);
		return (NULL);
	}

	be->vbe_version = be64toh(be->vbe_version);
	switch (be->vbe_version) {
	case VB_RAW:
		/*
		 * We have textual data in vbe_bootenv; create an nvlist
		 * with the key "envmap".
		 */
		benv = nvlist_create(NV_UNIQUE_NAME);
		if (benv != NULL) {
			if (*be->vbe_bootenv == '\0') {
				nvlist_add_uint64(benv, BOOTENV_VERSION,
				    VB_NVLIST);
				break;
			}
			nvlist_add_uint64(benv, BOOTENV_VERSION, VB_RAW);
			be->vbe_bootenv[sizeof (be->vbe_bootenv) - 1] = '\0';
			nvlist_add_string(benv, GRUB_ENVMAP, be->vbe_bootenv);
		}
		break;

	case VB_NVLIST:
		benv = nvlist_import(be->vbe_bootenv, sizeof(be->vbe_bootenv));
		break;

	default:
		command = (char *)be;
		ok = false;

		/* Check for legacy zfsbootcfg command string */
		for (int i = 0; command[i] != '\0'; i++) {
			if (iscntrl(command[i])) {
				ok = false;
				break;
			} else {
				ok = true;
			}
		}
		benv = nvlist_create(NV_UNIQUE_NAME);
		if (benv != NULL) {
			if (ok)
				nvlist_add_string(benv, FREEBSD_BOOTONCE,
				    command);
			else
				nvlist_add_uint64(benv, BOOTENV_VERSION,
				    VB_NVLIST);
		}
		break;
	}
	free(be);
	return (benv);
}
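/*
 * Illustrative caller sketch (not part of this file): updating a single
 * bootenv key with the helpers above.  Error handling is omitted, "vd" is
 * a hypothetical vdev with write support, and the key name is made up:
 *
 *	nvlist_t *benv = vdev_read_bootenv(vd);
 *	nvlist_add_string(benv, "nextboot", "zfs:tank/ROOT/new:");
 *	vdev_write_bootenv(vd, benv);
 *	nvlist_destroy(benv);
 */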
static uint64_t
vdev_get_label_asize(nvlist_t *nvl)
{
	nvlist_t *vdevs;
	uint64_t asize;
	const char *type;
	int len;

	asize = 0;
	/* Get vdev tree */
	if (nvlist_find(nvl, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
	    NULL, &vdevs, NULL) != 0)
		return (asize);

	/*
	 * Get the vdev type.  We will calculate the asize for raidz,
	 * mirror and disk.  For raidz, the asize is the raw size of all
	 * children.
	 */
	if (nvlist_find(vdevs, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING,
	    NULL, &type, &len) != 0)
		goto done;

	if (memcmp(type, VDEV_TYPE_MIRROR, len) != 0 &&
	    memcmp(type, VDEV_TYPE_DISK, len) != 0 &&
	    memcmp(type, VDEV_TYPE_RAIDZ, len) != 0)
		goto done;

	if (nvlist_find(vdevs, ZPOOL_CONFIG_ASIZE, DATA_TYPE_UINT64,
	    NULL, &asize, NULL) != 0)
		goto done;

	if (memcmp(type, VDEV_TYPE_RAIDZ, len) == 0) {
		nvlist_t **kids;
		int nkids;

		if (nvlist_find(vdevs, ZPOOL_CONFIG_CHILDREN,
		    DATA_TYPE_NVLIST_ARRAY, &nkids, &kids, NULL) != 0) {
			asize = 0;
			goto done;
		}

		asize /= nkids;
		for (int i = 0; i < nkids; i++)
			nvlist_destroy(kids[i]);
		free(kids);
	}

	asize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
done:
	nvlist_destroy(vdevs);
	return (asize);
}

static nvlist_t *
vdev_label_read_config(vdev_t *vd, uint64_t txg)
{
	vdev_phys_t *label;
	uint64_t best_txg = 0;
	uint64_t label_txg = 0;
	uint64_t asize;
	nvlist_t *nvl = NULL, *tmp;
	int error;

	label = malloc(sizeof (vdev_phys_t));
	if (label == NULL)
		return (NULL);

	for (int l = 0; l < VDEV_LABELS; l++) {
		if (vdev_label_read(vd, l, label,
		    offsetof(vdev_label_t, vl_vdev_phys),
		    sizeof (vdev_phys_t)))
			continue;

		tmp = nvlist_import(label->vp_nvlist,
		    sizeof(label->vp_nvlist));
		if (tmp == NULL)
			continue;

		error = nvlist_find(tmp, ZPOOL_CONFIG_POOL_TXG,
		    DATA_TYPE_UINT64, NULL, &label_txg, NULL);
		if (error != 0 || label_txg == 0) {
			nvlist_destroy(nvl);
			nvl = tmp;
			goto done;
		}

		if (label_txg <= txg && label_txg > best_txg) {
			best_txg = label_txg;
			nvlist_destroy(nvl);
			nvl = tmp;
			tmp = NULL;

			/*
			 * Use the asize from the pool config; we need this
			 * because we can get a bad value from the BIOS.
			 */
			asize = vdev_get_label_asize(nvl);
			if (asize != 0) {
				vd->v_psize = asize;
			}
		}
		nvlist_destroy(tmp);
	}

	if (best_txg == 0) {
		nvlist_destroy(nvl);
		nvl = NULL;
	}
done:
	free(label);
	return (nvl);
}

static void
vdev_uberblock_load(vdev_t *vd, uberblock_t *ub)
{
	uberblock_t *buf;

	buf = malloc(VDEV_UBERBLOCK_SIZE(vd));
	if (buf == NULL)
		return;

	for (int l = 0; l < VDEV_LABELS; l++) {
		for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
			if (vdev_label_read(vd, l, buf,
			    VDEV_UBERBLOCK_OFFSET(vd, n),
			    VDEV_UBERBLOCK_SIZE(vd)))
				continue;
			if (uberblock_verify(buf) != 0)
				continue;

			if (vdev_uberblock_compare(buf, ub) > 0)
				*ub = *buf;
		}
	}
	free(buf);
}
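/*
 * Note on the selection above: vdev_uberblock_compare() orders uberblocks
 * by txg first, then timestamp, then MMP sequence number, so the loops in
 * vdev_uberblock_load() keep the most recent valid uberblock found in any
 * label.  E.g. (hypothetical values) an uberblock with txg 1000 beats one
 * with txg 999 regardless of timestamps; for equal txg the later timestamp
 * wins.
 */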
static int
vdev_probe(vdev_phys_read_t *_read, vdev_phys_write_t *_write, void *priv,
    spa_t **spap)
{
	vdev_t vtmp;
	spa_t *spa;
	vdev_t *vdev;
	nvlist_t *nvl;
	uint64_t val;
	uint64_t guid, vdev_children;
	uint64_t pool_txg, pool_guid;
	const char *pool_name;
	int rc, namelen;

	/*
	 * Load the vdev label and figure out which
	 * uberblock is most current.
	 */
	memset(&vtmp, 0, sizeof(vtmp));
	vtmp.v_phys_read = _read;
	vtmp.v_phys_write = _write;
	vtmp.v_priv = priv;
	vtmp.v_psize = P2ALIGN(ldi_get_size(priv),
	    (uint64_t)sizeof (vdev_label_t));

	/* Test for minimum device size. */
	if (vtmp.v_psize < SPA_MINDEVSIZE)
		return (EIO);

	nvl = vdev_label_read_config(&vtmp, UINT64_MAX);
	if (nvl == NULL)
		return (EIO);

	if (nvlist_find(nvl, ZPOOL_CONFIG_VERSION, DATA_TYPE_UINT64,
	    NULL, &val, NULL) != 0) {
		nvlist_destroy(nvl);
		return (EIO);
	}

	if (!SPA_VERSION_IS_SUPPORTED(val)) {
		printf("ZFS: unsupported ZFS version %u (should be %u)\n",
		    (unsigned)val, (unsigned)SPA_VERSION);
		nvlist_destroy(nvl);
		return (EIO);
	}

	/* Check ZFS features for read */
	rc = nvlist_check_features_for_read(nvl);
	if (rc != 0) {
		nvlist_destroy(nvl);
		return (EIO);
	}

	if (nvlist_find(nvl, ZPOOL_CONFIG_POOL_STATE, DATA_TYPE_UINT64,
	    NULL, &val, NULL) != 0) {
		nvlist_destroy(nvl);
		return (EIO);
	}

	if (val == POOL_STATE_DESTROYED) {
		/* We do not boot from destroyed pools. */
		nvlist_destroy(nvl);
		return (EIO);
	}

	if (nvlist_find(nvl, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64,
	    NULL, &pool_txg, NULL) != 0 ||
	    nvlist_find(nvl, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
	    NULL, &pool_guid, NULL) != 0 ||
	    nvlist_find(nvl, ZPOOL_CONFIG_POOL_NAME, DATA_TYPE_STRING,
	    NULL, &pool_name, &namelen) != 0) {
		/*
		 * Cache and spare devices end up here - just ignore
		 * them.
		 */
		nvlist_destroy(nvl);
		return (EIO);
	}

	/*
	 * Create the pool if this is the first time we've seen it.
	 */
	spa = spa_find_by_guid(pool_guid);
	if (spa == NULL) {
		char *name;

		nvlist_find(nvl, ZPOOL_CONFIG_VDEV_CHILDREN,
		    DATA_TYPE_UINT64, NULL, &vdev_children, NULL);
		name = malloc(namelen + 1);
		if (name == NULL) {
			nvlist_destroy(nvl);
			return (ENOMEM);
		}
		bcopy(pool_name, name, namelen);
		name[namelen] = '\0';
		spa = spa_create(pool_guid, name);
		free(name);
		if (spa == NULL) {
			nvlist_destroy(nvl);
			return (ENOMEM);
		}
		spa->spa_root_vdev->v_nchildren = vdev_children;
	}
	if (pool_txg > spa->spa_txg)
		spa->spa_txg = pool_txg;

	/*
	 * Get the vdev tree and create our in-core copy of it.
	 * If we already have a vdev with this guid, this must
	 * be some kind of alias (overlapping slices, dangerously dedicated
	 * disks etc).
	 */
	if (nvlist_find(nvl, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
	    NULL, &guid, NULL) != 0) {
		nvlist_destroy(nvl);
		return (EIO);
	}
	vdev = vdev_find(guid);
	/* Has this vdev already been inited? */
	if (vdev && vdev->v_phys_read) {
		nvlist_destroy(nvl);
		return (EIO);
	}

	rc = vdev_init_from_label(spa, nvl);
	nvlist_destroy(nvl);
	if (rc != 0)
		return (rc);

	/*
	 * We should already have created an incomplete vdev for this
	 * vdev.  Find it and initialise it with our read proc.
	 */
	vdev = vdev_find(guid);
	if (vdev != NULL) {
		vdev->v_phys_read = _read;
		vdev->v_phys_write = _write;
		vdev->v_priv = priv;
		vdev->v_psize = vtmp.v_psize;
		/*
		 * If no other state is set, mark vdev healthy.
		 */
		if (vdev->v_state == VDEV_STATE_UNKNOWN)
			vdev->v_state = VDEV_STATE_HEALTHY;
	} else {
		printf("ZFS: inconsistent nvlist contents\n");
		return (EIO);
	}

	if (vdev->v_islog)
		spa->spa_with_log = vdev->v_islog;

	/*
	 * Re-evaluate top-level vdev state.
	 */
	vdev_set_state(vdev->v_top);

	/*
	 * OK, we are happy with the pool so far.  Let's find
	 * the best uberblock and then we can actually access
	 * the contents of the pool.
	 */
	vdev_uberblock_load(vdev, spa->spa_uberblock);

	if (spap != NULL)
		*spap = spa;
	return (0);
}
static int
ilog2(int n)
{
	int v;

	for (v = 0; v < 32; v++)
		if (n == (1 << v))
			return (v);
	return (-1);
}

static int
zio_read_gang(const spa_t *spa, const blkptr_t *bp, void *buf)
{
	blkptr_t gbh_bp;
	zio_gbh_phys_t zio_gb;
	char *pbuf;
	int i;

	/* Artificial BP for gang block header. */
	gbh_bp = *bp;
	BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
	BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
	BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER);
	BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF);
	for (i = 0; i < SPA_DVAS_PER_BP; i++)
		DVA_SET_GANG(&gbh_bp.blk_dva[i], 0);

	/* Read gang header block using the artificial BP. */
	if (zio_read(spa, &gbh_bp, &zio_gb))
		return (EIO);

	pbuf = buf;
	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
		blkptr_t *gbp = &zio_gb.zg_blkptr[i];

		if (BP_IS_HOLE(gbp))
			continue;
		if (zio_read(spa, gbp, pbuf))
			return (EIO);
		pbuf += BP_GET_PSIZE(gbp);
	}

	if (zio_checksum_verify(spa, bp, buf))
		return (EIO);
	return (0);
}
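/*
 * zio_read() below implements the reader's redundancy policy: a block
 * pointer carries up to SPA_DVAS_PER_BP data virtual addresses (DVAs),
 * and the copies are tried in order until one reads back (and, where
 * needed, decompresses) successfully.  Embedded block pointers carry
 * their payload inside the bp itself and never touch a vdev.
 */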
static int
zio_read(const spa_t *spa, const blkptr_t *bp, void *buf)
{
	int cpfunc = BP_GET_COMPRESS(bp);
	uint64_t align, size;
	void *pbuf;
	int i, error;

	/*
	 * Process data embedded in block pointer
	 */
	if (BP_IS_EMBEDDED(bp)) {
		ASSERT(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);

		size = BPE_GET_PSIZE(bp);
		ASSERT(size <= BPE_PAYLOAD_SIZE);

		if (cpfunc != ZIO_COMPRESS_OFF)
			pbuf = malloc(size);
		else
			pbuf = buf;

		if (pbuf == NULL)
			return (ENOMEM);

		decode_embedded_bp_compressed(bp, pbuf);
		error = 0;

		if (cpfunc != ZIO_COMPRESS_OFF) {
			error = zio_decompress_data(cpfunc, pbuf,
			    size, buf, BP_GET_LSIZE(bp));
			free(pbuf);
		}
		if (error != 0)
			printf("ZFS: i/o error - unable to decompress "
			    "block pointer data, error %d\n", error);
		return (error);
	}

	error = EIO;

	for (i = 0; i < SPA_DVAS_PER_BP; i++) {
		const dva_t *dva = &bp->blk_dva[i];
		vdev_t *vdev;
		vdev_list_t *vlist;
		uint64_t vdevid;
		off_t offset;

		if (!dva->dva_word[0] && !dva->dva_word[1])
			continue;

		vdevid = DVA_GET_VDEV(dva);
		offset = DVA_GET_OFFSET(dva);
		vlist = &spa->spa_root_vdev->v_children;
		STAILQ_FOREACH(vdev, vlist, v_childlink) {
			if (vdev->v_id == vdevid)
				break;
		}
		if (!vdev || !vdev->v_read)
			continue;

		size = BP_GET_PSIZE(bp);
		if (vdev->v_read == vdev_raidz_read) {
			align = 1ULL << vdev->v_ashift;
			if (P2PHASE(size, align) != 0)
				size = P2ROUNDUP(size, align);
		}
		if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
			pbuf = malloc(size);
		else
			pbuf = buf;

		if (pbuf == NULL) {
			error = ENOMEM;
			break;
		}

		if (DVA_GET_GANG(dva))
			error = zio_read_gang(spa, bp, pbuf);
		else
			error = vdev->v_read(vdev, bp, pbuf, offset, size);
		if (error == 0) {
			if (cpfunc != ZIO_COMPRESS_OFF)
				error = zio_decompress_data(cpfunc, pbuf,
				    BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp));
			else if (size != BP_GET_PSIZE(bp))
				bcopy(pbuf, buf, BP_GET_PSIZE(bp));
		} else {
			printf("zio_read error: %d\n", error);
		}
		if (buf != pbuf)
			free(pbuf);
		if (error == 0)
			break;
	}
	if (error != 0)
		printf("ZFS: i/o error - all block copies unavailable\n");

	return (error);
}

static int
dnode_read(const spa_t *spa, const dnode_phys_t *dnode, off_t offset,
    void *buf, size_t buflen)
{
	int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
	int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	int nlevels = dnode->dn_nlevels;
	int i, rc;

	if (bsize > SPA_MAXBLOCKSIZE) {
		printf("ZFS: I/O error - blocks larger than %llu are not "
		    "supported\n", SPA_MAXBLOCKSIZE);
		return (EIO);
	}

	/*
	 * Handle odd block sizes, mirrors dmu_read_impl().  Data can't exist
	 * past the first block, so we'll clip the read to the portion of the
	 * buffer within bsize and zero out the remainder.
	 */
	if (dnode->dn_maxblkid == 0) {
		size_t newbuflen;

		newbuflen = offset > bsize ? 0 : MIN(buflen, bsize - offset);
		bzero((char *)buf + newbuflen, buflen - newbuflen);
		buflen = newbuflen;
	}

	/*
	 * Note: bsize may not be a power of two here so we need to do an
	 * actual divide rather than a bitshift.
	 */
	while (buflen > 0) {
		uint64_t bn = offset / bsize;
		int boff = offset % bsize;
		int ibn;
		const blkptr_t *indbp;
		blkptr_t bp;

		if (bn > dnode->dn_maxblkid)
			return (EIO);

		if (dnode == dnode_cache_obj && bn == dnode_cache_bn)
			goto cached;

		indbp = dnode->dn_blkptr;
		for (i = 0; i < nlevels; i++) {
			/*
			 * Copy the bp from the indirect array so that
			 * we can re-use the scratch buffer for multi-level
			 * objects.
			 */
			ibn = bn >> ((nlevels - i - 1) * ibshift);
			ibn &= ((1 << ibshift) - 1);
			bp = indbp[ibn];
			if (BP_IS_HOLE(&bp)) {
				memset(dnode_cache_buf, 0, bsize);
				break;
			}
			rc = zio_read(spa, &bp, dnode_cache_buf);
			if (rc)
				return (rc);
			indbp = (const blkptr_t *)dnode_cache_buf;
		}
		dnode_cache_obj = dnode;
		dnode_cache_bn = bn;
	cached:

		/*
		 * The buffer contains our data block. Copy what we
		 * need from it and loop.
		 */
		i = bsize - boff;
		if (i > buflen)
			i = buflen;
		memcpy(buf, &dnode_cache_buf[boff], i);
		buf = ((char *)buf) + i;
		offset += i;
		buflen -= i;
	}

	return (0);
}

/*
 * Lookup a value in a microzap directory.
 */
static int
mzap_lookup(const mzap_phys_t *mz, size_t size, const char *name,
    uint64_t *value)
{
	const mzap_ent_phys_t *mze;
	int chunks, i;

	/*
	 * Microzap objects use exactly one block. Read the whole
	 * thing.
	 */
	chunks = size / MZAP_ENT_LEN - 1;
	for (i = 0; i < chunks; i++) {
		mze = &mz->mz_chunk[i];
		if (strcmp(mze->mze_name, name) == 0) {
			*value = mze->mze_value;
			return (0);
		}
	}

	return (ENOENT);
}
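/*
 * A usage sketch for the microzap path (the buffer and entry name are
 * illustrative): once the single block of a microzap object has been
 * read into 'blk', resolving a directory entry is just
 *
 *	uint64_t objnum;
 *	if (mzap_lookup((const mzap_phys_t *)blk, size, "boot.cfg",
 *	    &objnum) == 0)
 *		objnum = ZFS_DIRENT_OBJ(objnum);
 *
 * zap_lookup() further down performs exactly this dance after reading
 * the block and checking zap_block_type.
 */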
/*
 * Compare a name with a zap leaf entry. Return non-zero if the name
 * matches.
 */
static int
fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc,
    const char *name)
{
	size_t namelen;
	const zap_leaf_chunk_t *nc;
	const char *p;

	namelen = zc->l_entry.le_name_numints;

	nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
	p = name;
	while (namelen > 0) {
		size_t len;

		len = namelen;
		if (len > ZAP_LEAF_ARRAY_BYTES)
			len = ZAP_LEAF_ARRAY_BYTES;
		if (memcmp(p, nc->l_array.la_array, len))
			return (0);
		p += len;
		namelen -= len;
		nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
	}

	return (1);
}

/*
 * Extract a uint64_t value from a zap leaf entry.
 */
static uint64_t
fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc)
{
	const zap_leaf_chunk_t *vc;
	int i;
	uint64_t value;
	const uint8_t *p;

	vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk);
	for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) {
		value = (value << 8) | p[i];
	}

	return (value);
}

static void
stv(int len, void *addr, uint64_t value)
{
	switch (len) {
	case 1:
		*(uint8_t *)addr = value;
		return;
	case 2:
		*(uint16_t *)addr = value;
		return;
	case 4:
		*(uint32_t *)addr = value;
		return;
	case 8:
		*(uint64_t *)addr = value;
		return;
	}
}

/*
 * Extract an array from a zap leaf entry.
 */
static void
fzap_leaf_array(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc,
    uint64_t integer_size, uint64_t num_integers, void *buf)
{
	uint64_t array_int_len = zc->l_entry.le_value_intlen;
	uint64_t value = 0;
	uint64_t *u64 = buf;
	char *p = buf;
	int len = MIN(zc->l_entry.le_value_numints, num_integers);
	int chunk = zc->l_entry.le_value_chunk;
	int byten = 0;

	if (integer_size == 8 && len == 1) {
		*u64 = fzap_leaf_value(zl, zc);
		return;
	}

	while (len > 0) {
		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(zl, chunk).l_array;
		int i;

		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(zl));
		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
			value = (value << 8) | la->la_array[i];
			byten++;
			if (byten == array_int_len) {
				stv(integer_size, p, value);
				byten = 0;
				len--;
				if (len == 0)
					return;
				p += integer_size;
			}
		}
		chunk = la->la_next;
	}
}

static int
fzap_check_size(uint64_t integer_size, uint64_t num_integers)
{

	switch (integer_size) {
	case 1:
	case 2:
	case 4:
	case 8:
		break;
	default:
		return (EINVAL);
	}

	if (integer_size * num_integers > ZAP_MAXVALUELEN)
		return (E2BIG);

	return (0);
}

static void
zap_leaf_free(zap_leaf_t *leaf)
{
	free(leaf->l_phys);
	free(leaf);
}
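/*
 * The helpers below navigate the fat zap on disk.  Leaf blocks are
 * addressed in units of the zap block size, so zap_get_leaf_byblk()
 * converts a leaf block number into a byte offset with blk << l_bs
 * before handing it to dnode_read().
 */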
static int
zap_get_leaf_byblk(fat_zap_t *zap, uint64_t blk, zap_leaf_t **lp)
{
	int bs = FZAP_BLOCK_SHIFT(zap);
	int err;

	*lp = malloc(sizeof(**lp));
	if (*lp == NULL)
		return (ENOMEM);

	(*lp)->l_bs = bs;
	(*lp)->l_phys = malloc(1 << bs);

	if ((*lp)->l_phys == NULL) {
		free(*lp);
		return (ENOMEM);
	}
	err = dnode_read(zap->zap_spa, zap->zap_dnode, blk << bs, (*lp)->l_phys,
	    1 << bs);
	if (err != 0) {
		zap_leaf_free(*lp);
	}
	return (err);
}

static int
zap_table_load(fat_zap_t *zap, zap_table_phys_t *tbl, uint64_t idx,
    uint64_t *valp)
{
	int bs = FZAP_BLOCK_SHIFT(zap);
	uint64_t blk = idx >> (bs - 3);
	uint64_t off = idx & ((1 << (bs - 3)) - 1);
	uint64_t *buf;
	int rc;

	buf = malloc(1 << zap->zap_block_shift);
	if (buf == NULL)
		return (ENOMEM);
	rc = dnode_read(zap->zap_spa, zap->zap_dnode, (tbl->zt_blk + blk) << bs,
	    buf, 1 << zap->zap_block_shift);
	if (rc == 0)
		*valp = buf[off];
	free(buf);
	return (rc);
}

static int
zap_idx_to_blk(fat_zap_t *zap, uint64_t idx, uint64_t *valp)
{
	if (zap->zap_phys->zap_ptrtbl.zt_numblks == 0) {
		*valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
		return (0);
	} else {
		return (zap_table_load(zap, &zap->zap_phys->zap_ptrtbl,
		    idx, valp));
	}
}

#define	ZAP_HASH_IDX(hash, n)	(((n) == 0) ? 0 : ((hash) >> (64 - (n))))
static int
zap_deref_leaf(fat_zap_t *zap, uint64_t h, zap_leaf_t **lp)
{
	uint64_t idx, blk;
	int err;

	idx = ZAP_HASH_IDX(h, zap->zap_phys->zap_ptrtbl.zt_shift);
	err = zap_idx_to_blk(zap, idx, &blk);
	if (err != 0)
		return (err);
	return (zap_get_leaf_byblk(zap, blk, lp));
}

#define	CHAIN_END	0xffff	/* end of the chunk chain */
#define	LEAF_HASH(l, h) \
	((ZAP_LEAF_HASH_NUMENTRIES(l) - 1) & \
	((h) >> \
	(64 - ZAP_LEAF_HASH_SHIFT(l) - (l)->l_phys->l_hdr.lh_prefix_len)))
#define	LEAF_HASH_ENTPTR(l, h)	(&(l)->l_phys->l_hash[LEAF_HASH(l, h)])

static int
zap_leaf_lookup(zap_leaf_t *zl, uint64_t hash, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *value)
{
	int rc;
	uint16_t *chunkp;
	struct zap_leaf_entry *le;

	/*
	 * Make sure this chunk matches our hash.
	 */
	if (zl->l_phys->l_hdr.lh_prefix_len > 0 &&
	    zl->l_phys->l_hdr.lh_prefix !=
	    hash >> (64 - zl->l_phys->l_hdr.lh_prefix_len))
		return (EIO);

	rc = ENOENT;
	for (chunkp = LEAF_HASH_ENTPTR(zl, hash);
	    *chunkp != CHAIN_END; chunkp = &le->le_next) {
		zap_leaf_chunk_t *zc;
		uint16_t chunk = *chunkp;

		le = ZAP_LEAF_ENTRY(zl, chunk);
		if (le->le_hash != hash)
			continue;
		zc = &ZAP_LEAF_CHUNK(zl, chunk);
		if (fzap_name_equal(zl, zc, name)) {
			if (zc->l_entry.le_value_intlen > integer_size) {
				rc = EINVAL;
			} else {
				fzap_leaf_array(zl, zc, integer_size,
				    num_integers, value);
				rc = 0;
			}
			break;
		}
	}
	return (rc);
}
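/*
 * Putting the pieces together, a fat zap lookup runs hash -> pointer
 * table -> leaf -> chunk chain: ZAP_HASH_IDX() takes the top zt_shift
 * bits of the 64-bit hash, zap_idx_to_blk() maps that prefix to a leaf
 * block, and zap_leaf_lookup() walks the chunk chain selected by
 * LEAF_HASH(), comparing full hashes and names.
 */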
2705 */ 2706 static int 2707 fzap_lookup(const spa_t *spa, const dnode_phys_t *dnode, zap_phys_t *zh, 2708 const char *name, uint64_t integer_size, uint64_t num_integers, 2709 void *value) 2710 { 2711 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 2712 fat_zap_t z; 2713 zap_leaf_t *zl; 2714 uint64_t hash; 2715 int rc; 2716 2717 if (zh->zap_magic != ZAP_MAGIC) 2718 return (EIO); 2719 2720 if ((rc = fzap_check_size(integer_size, num_integers)) != 0) { 2721 return (rc); 2722 } 2723 2724 z.zap_block_shift = ilog2(bsize); 2725 z.zap_phys = zh; 2726 z.zap_spa = spa; 2727 z.zap_dnode = dnode; 2728 2729 hash = zap_hash(zh->zap_salt, name); 2730 rc = zap_deref_leaf(&z, hash, &zl); 2731 if (rc != 0) 2732 return (rc); 2733 2734 rc = zap_leaf_lookup(zl, hash, name, integer_size, num_integers, value); 2735 2736 zap_leaf_free(zl); 2737 return (rc); 2738 } 2739 2740 /* 2741 * Lookup a name in a zap object and return its value as a uint64_t. 2742 */ 2743 static int 2744 zap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name, 2745 uint64_t integer_size, uint64_t num_integers, void *value) 2746 { 2747 int rc; 2748 zap_phys_t *zap; 2749 size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 2750 2751 zap = malloc(size); 2752 if (zap == NULL) 2753 return (ENOMEM); 2754 2755 rc = dnode_read(spa, dnode, 0, zap, size); 2756 if (rc) 2757 goto done; 2758 2759 switch (zap->zap_block_type) { 2760 case ZBT_MICRO: 2761 rc = mzap_lookup((const mzap_phys_t *)zap, size, name, value); 2762 break; 2763 case ZBT_HEADER: 2764 rc = fzap_lookup(spa, dnode, zap, name, integer_size, 2765 num_integers, value); 2766 break; 2767 default: 2768 printf("ZFS: invalid zap_type=%" PRIx64 "\n", 2769 zap->zap_block_type); 2770 rc = EIO; 2771 } 2772 done: 2773 free(zap); 2774 return (rc); 2775 } 2776 2777 /* 2778 * List a microzap directory. 2779 */ 2780 static int 2781 mzap_list(const mzap_phys_t *mz, size_t size, 2782 int (*callback)(const char *, uint64_t)) 2783 { 2784 const mzap_ent_phys_t *mze; 2785 int chunks, i, rc; 2786 2787 /* 2788 * Microzap objects use exactly one block. Read the whole 2789 * thing. 2790 */ 2791 rc = 0; 2792 chunks = size / MZAP_ENT_LEN - 1; 2793 for (i = 0; i < chunks; i++) { 2794 mze = &mz->mz_chunk[i]; 2795 if (mze->mze_name[0]) { 2796 rc = callback(mze->mze_name, mze->mze_value); 2797 if (rc != 0) 2798 break; 2799 } 2800 } 2801 2802 return (rc); 2803 } 2804 2805 /* 2806 * List a fatzap directory. 2807 */ 2808 static int 2809 fzap_list(const spa_t *spa, const dnode_phys_t *dnode, zap_phys_t *zh, 2810 int (*callback)(const char *, uint64_t)) 2811 { 2812 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 2813 fat_zap_t z; 2814 uint64_t i; 2815 int j, rc; 2816 2817 if (zh->zap_magic != ZAP_MAGIC) 2818 return (EIO); 2819 2820 z.zap_block_shift = ilog2(bsize); 2821 z.zap_phys = zh; 2822 2823 /* 2824 * This assumes that the leaf blocks start at block 1. The 2825 * documentation isn't exactly clear on this. 
/*
 * List a fatzap directory.
 */
static int
fzap_list(const spa_t *spa, const dnode_phys_t *dnode, zap_phys_t *zh,
    int (*callback)(const char *, uint64_t))
{
	int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	fat_zap_t z;
	uint64_t i;
	int j, rc;

	if (zh->zap_magic != ZAP_MAGIC)
		return (EIO);

	z.zap_block_shift = ilog2(bsize);
	z.zap_phys = zh;

	/*
	 * This assumes that the leaf blocks start at block 1. The
	 * documentation isn't exactly clear on this.
	 */
	zap_leaf_t zl;
	zl.l_bs = z.zap_block_shift;
	zl.l_phys = malloc(bsize);
	if (zl.l_phys == NULL)
		return (ENOMEM);

	for (i = 0; i < zh->zap_num_leafs; i++) {
		off_t off = ((off_t)(i + 1)) << zl.l_bs;
		char name[256], *p;
		uint64_t value;

		if (dnode_read(spa, dnode, off, zl.l_phys, bsize)) {
			free(zl.l_phys);
			return (EIO);
		}

		for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
			zap_leaf_chunk_t *zc, *nc;
			int namelen;

			zc = &ZAP_LEAF_CHUNK(&zl, j);
			if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
				continue;
			namelen = zc->l_entry.le_name_numints;
			if (namelen > sizeof(name))
				namelen = sizeof(name);

			/*
			 * Paste the name back together.
			 */
			nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
			p = name;
			while (namelen > 0) {
				int len;

				len = namelen;
				if (len > ZAP_LEAF_ARRAY_BYTES)
					len = ZAP_LEAF_ARRAY_BYTES;
				memcpy(p, nc->l_array.la_array, len);
				p += len;
				namelen -= len;
				nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
			}

			/*
			 * Assume the first eight bytes of the value are
			 * a uint64_t.
			 */
			value = fzap_leaf_value(&zl, zc);

			/* printf("%s 0x%jx\n", name, (uintmax_t)value); */
			rc = callback((const char *)name, value);
			if (rc != 0) {
				free(zl.l_phys);
				return (rc);
			}
		}
	}

	free(zl.l_phys);
	return (0);
}

static int
zfs_printf(const char *name, uint64_t value __unused)
{

	printf("%s\n", name);

	return (0);
}

/*
 * List a zap directory.
 */
static int
zap_list(const spa_t *spa, const dnode_phys_t *dnode)
{
	zap_phys_t *zap;
	size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	int rc;

	zap = malloc(size);
	if (zap == NULL)
		return (ENOMEM);

	rc = dnode_read(spa, dnode, 0, zap, size);
	if (rc == 0) {
		if (zap->zap_block_type == ZBT_MICRO)
			rc = mzap_list((const mzap_phys_t *)zap, size,
			    zfs_printf);
		else
			rc = fzap_list(spa, dnode, zap, zfs_printf);
	}
	free(zap);
	return (rc);
}

static int
objset_get_dnode(const spa_t *spa, const objset_phys_t *os, uint64_t objnum,
    dnode_phys_t *dnode)
{
	off_t offset;

	offset = objnum * sizeof(dnode_phys_t);
	return (dnode_read(spa, &os->os_meta_dnode, offset,
	    dnode, sizeof(dnode_phys_t)));
}
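/*
 * objset_get_dnode() above works because dnodes are fixed-size records
 * (sizeof(dnode_phys_t), 512 bytes) packed into the object set's
 * meta-dnode, so object number N lives at byte offset N * 512.  For
 * example, object 34 starts 17408 bytes into the meta-dnode.
 */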
2946 */ 2947 chunks = size / MZAP_ENT_LEN - 1; 2948 for (i = 0; i < chunks; i++) { 2949 mze = &mz->mz_chunk[i]; 2950 if (value == mze->mze_value) { 2951 strcpy(name, mze->mze_name); 2952 return (0); 2953 } 2954 } 2955 2956 return (ENOENT); 2957 } 2958 2959 static void 2960 fzap_name_copy(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, char *name) 2961 { 2962 size_t namelen; 2963 const zap_leaf_chunk_t *nc; 2964 char *p; 2965 2966 namelen = zc->l_entry.le_name_numints; 2967 2968 nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk); 2969 p = name; 2970 while (namelen > 0) { 2971 size_t len; 2972 len = namelen; 2973 if (len > ZAP_LEAF_ARRAY_BYTES) 2974 len = ZAP_LEAF_ARRAY_BYTES; 2975 memcpy(p, nc->l_array.la_array, len); 2976 p += len; 2977 namelen -= len; 2978 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next); 2979 } 2980 2981 *p = '\0'; 2982 } 2983 2984 static int 2985 fzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, zap_phys_t *zh, 2986 char *name, uint64_t value) 2987 { 2988 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 2989 fat_zap_t z; 2990 uint64_t i; 2991 int j, rc; 2992 2993 if (zh->zap_magic != ZAP_MAGIC) 2994 return (EIO); 2995 2996 z.zap_block_shift = ilog2(bsize); 2997 z.zap_phys = zh; 2998 2999 /* 3000 * This assumes that the leaf blocks start at block 1. The 3001 * documentation isn't exactly clear on this. 3002 */ 3003 zap_leaf_t zl; 3004 zl.l_bs = z.zap_block_shift; 3005 zl.l_phys = malloc(bsize); 3006 if (zl.l_phys == NULL) 3007 return (ENOMEM); 3008 3009 for (i = 0; i < zh->zap_num_leafs; i++) { 3010 off_t off = ((off_t)(i + 1)) << zl.l_bs; 3011 3012 rc = dnode_read(spa, dnode, off, zl.l_phys, bsize); 3013 if (rc != 0) 3014 goto done; 3015 3016 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) { 3017 zap_leaf_chunk_t *zc; 3018 3019 zc = &ZAP_LEAF_CHUNK(&zl, j); 3020 if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY) 3021 continue; 3022 if (zc->l_entry.le_value_intlen != 8 || 3023 zc->l_entry.le_value_numints != 1) 3024 continue; 3025 3026 if (fzap_leaf_value(&zl, zc) == value) { 3027 fzap_name_copy(&zl, zc, name); 3028 goto done; 3029 } 3030 } 3031 } 3032 3033 rc = ENOENT; 3034 done: 3035 free(zl.l_phys); 3036 return (rc); 3037 } 3038 3039 static int 3040 zap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, 3041 uint64_t value) 3042 { 3043 zap_phys_t *zap; 3044 size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 3045 int rc; 3046 3047 zap = malloc(size); 3048 if (zap == NULL) 3049 return (ENOMEM); 3050 3051 rc = dnode_read(spa, dnode, 0, zap, size); 3052 if (rc == 0) { 3053 if (zap->zap_block_type == ZBT_MICRO) 3054 rc = mzap_rlookup((const mzap_phys_t *)zap, size, 3055 name, value); 3056 else 3057 rc = fzap_rlookup(spa, dnode, zap, name, value); 3058 } 3059 free(zap); 3060 return (rc); 3061 } 3062 3063 static int 3064 zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result) 3065 { 3066 char name[256]; 3067 char component[256]; 3068 uint64_t dir_obj, parent_obj, child_dir_zapobj; 3069 dnode_phys_t child_dir_zap, dataset, dir, parent; 3070 dsl_dir_phys_t *dd; 3071 dsl_dataset_phys_t *ds; 3072 char *p; 3073 int len; 3074 3075 p = &name[sizeof(name) - 1]; 3076 *p = '\0'; 3077 3078 if (objset_get_dnode(spa, spa->spa_mos, objnum, &dataset)) { 3079 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum); 3080 return (EIO); 3081 } 3082 ds = (dsl_dataset_phys_t *)&dataset.dn_bonus; 3083 dir_obj = ds->ds_dir_obj; 3084 3085 for (;;) { 3086 if (objset_get_dnode(spa, spa->spa_mos, dir_obj, &dir) != 0) 3087 return (EIO); 3088 dd = (dsl_dir_phys_t 
static int
zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result)
{
	char name[256];
	char component[256];
	uint64_t dir_obj, parent_obj, child_dir_zapobj;
	dnode_phys_t child_dir_zap, dataset, dir, parent;
	dsl_dir_phys_t *dd;
	dsl_dataset_phys_t *ds;
	char *p;
	int len;

	p = &name[sizeof(name) - 1];
	*p = '\0';

	if (objset_get_dnode(spa, spa->spa_mos, objnum, &dataset)) {
		printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
		return (EIO);
	}
	ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
	dir_obj = ds->ds_dir_obj;

	for (;;) {
		if (objset_get_dnode(spa, spa->spa_mos, dir_obj, &dir) != 0)
			return (EIO);
		dd = (dsl_dir_phys_t *)&dir.dn_bonus;

		/* Actual loop condition. */
		parent_obj = dd->dd_parent_obj;
		if (parent_obj == 0)
			break;

		if (objset_get_dnode(spa, spa->spa_mos, parent_obj,
		    &parent) != 0)
			return (EIO);
		dd = (dsl_dir_phys_t *)&parent.dn_bonus;
		child_dir_zapobj = dd->dd_child_dir_zapobj;
		if (objset_get_dnode(spa, spa->spa_mos, child_dir_zapobj,
		    &child_dir_zap) != 0)
			return (EIO);
		if (zap_rlookup(spa, &child_dir_zap, component, dir_obj) != 0)
			return (EIO);

		len = strlen(component);
		p -= len;
		memcpy(p, component, len);
		--p;
		*p = '/';

		/* Actual loop iteration. */
		dir_obj = parent_obj;
	}

	if (*p != '\0')
		++p;
	strcpy(result, p);

	return (0);
}
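/*
 * zfs_lookup_dataset() below is the forward direction.  A usage sketch
 * (the dataset name is illustrative):
 *
 *	uint64_t objnum;
 *	if (zfs_lookup_dataset(spa, "ROOT/default", &objnum) == 0)
 *		rc = zfs_mount_dataset(spa, objnum, &objset);
 *
 * Each path component is resolved through the current dsl_dir's
 * child-directory zap, ending at that directory's head dataset.
 */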
static int
zfs_lookup_dataset(const spa_t *spa, const char *name, uint64_t *objnum)
{
	char element[256];
	uint64_t dir_obj, child_dir_zapobj;
	dnode_phys_t child_dir_zap, dir;
	dsl_dir_phys_t *dd;
	const char *p, *q;

	if (objset_get_dnode(spa, spa->spa_mos,
	    DMU_POOL_DIRECTORY_OBJECT, &dir))
		return (EIO);
	if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, sizeof (dir_obj),
	    1, &dir_obj))
		return (EIO);

	p = name;
	for (;;) {
		if (objset_get_dnode(spa, spa->spa_mos, dir_obj, &dir))
			return (EIO);
		dd = (dsl_dir_phys_t *)&dir.dn_bonus;

		while (*p == '/')
			p++;
		/* Actual loop condition #1. */
		if (*p == '\0')
			break;

		q = strchr(p, '/');
		if (q) {
			memcpy(element, p, q - p);
			element[q - p] = '\0';
			p = q + 1;
		} else {
			strcpy(element, p);
			p += strlen(p);
		}

		child_dir_zapobj = dd->dd_child_dir_zapobj;
		if (objset_get_dnode(spa, spa->spa_mos, child_dir_zapobj,
		    &child_dir_zap) != 0)
			return (EIO);

		/* Actual loop condition #2. */
		if (zap_lookup(spa, &child_dir_zap, element, sizeof (dir_obj),
		    1, &dir_obj) != 0)
			return (ENOENT);
	}

	*objnum = dd->dd_head_dataset_obj;
	return (0);
}

#ifndef BOOT2
static int
zfs_list_dataset(const spa_t *spa, uint64_t objnum/*, int pos, char *entry*/)
{
	uint64_t dir_obj, child_dir_zapobj;
	dnode_phys_t child_dir_zap, dir, dataset;
	dsl_dataset_phys_t *ds;
	dsl_dir_phys_t *dd;

	if (objset_get_dnode(spa, spa->spa_mos, objnum, &dataset)) {
		printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
		return (EIO);
	}
	ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
	dir_obj = ds->ds_dir_obj;

	if (objset_get_dnode(spa, spa->spa_mos, dir_obj, &dir)) {
		printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj);
		return (EIO);
	}
	dd = (dsl_dir_phys_t *)&dir.dn_bonus;

	child_dir_zapobj = dd->dd_child_dir_zapobj;
	if (objset_get_dnode(spa, spa->spa_mos, child_dir_zapobj,
	    &child_dir_zap) != 0) {
		printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj);
		return (EIO);
	}

	return (zap_list(spa, &child_dir_zap) != 0);
}

int
zfs_callback_dataset(const spa_t *spa, uint64_t objnum,
    int (*callback)(const char *, uint64_t))
{
	uint64_t dir_obj, child_dir_zapobj;
	dnode_phys_t child_dir_zap, dir, dataset;
	dsl_dataset_phys_t *ds;
	dsl_dir_phys_t *dd;
	zap_phys_t *zap;
	size_t size;
	int err;

	err = objset_get_dnode(spa, spa->spa_mos, objnum, &dataset);
	if (err != 0) {
		printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
		return (err);
	}
	ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
	dir_obj = ds->ds_dir_obj;

	err = objset_get_dnode(spa, spa->spa_mos, dir_obj, &dir);
	if (err != 0) {
		printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj);
		return (err);
	}
	dd = (dsl_dir_phys_t *)&dir.dn_bonus;

	child_dir_zapobj = dd->dd_child_dir_zapobj;
	err = objset_get_dnode(spa, spa->spa_mos, child_dir_zapobj,
	    &child_dir_zap);
	if (err != 0) {
		printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj);
		return (err);
	}

	size = child_dir_zap.dn_datablkszsec << SPA_MINBLOCKSHIFT;
	zap = malloc(size);
	if (zap != NULL) {
		err = dnode_read(spa, &child_dir_zap, 0, zap, size);
		if (err != 0)
			goto done;

		if (zap->zap_block_type == ZBT_MICRO)
			err = mzap_list((const mzap_phys_t *)zap, size,
			    callback);
		else
			err = fzap_list(spa, &child_dir_zap, zap, callback);
	} else {
		err = ENOMEM;
	}
done:
	free(zap);
	return (err);
}
#endif

/*
 * Find the object set given the object number of its dataset object
 * and return its details in *objset.
 */
static int
zfs_mount_dataset(const spa_t *spa, uint64_t objnum, objset_phys_t *objset)
{
	dnode_phys_t dataset;
	dsl_dataset_phys_t *ds;

	if (objset_get_dnode(spa, spa->spa_mos, objnum, &dataset)) {
		printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
		return (EIO);
	}

	ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
	if (zio_read(spa, &ds->ds_bp, objset)) {
		printf("ZFS: can't read object set for dataset %ju\n",
		    (uintmax_t)objnum);
		return (EIO);
	}

	return (0);
}
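/*
 * zfs_mount_dataset() above and the two routines below cooperate at
 * mount time: zfs_get_root() picks the object set to use (the pool's
 * bootfs property if set, else the root dataset), zfs_mount_dataset()
 * reads that dataset's objset_phys_t, and zfs_mount_impl() records
 * both in the zfsmount structure.
 */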
/*
 * Find the object number of the object set pointed to by the BOOTFS
 * property, or of the root dataset if the property is not set, and
 * return it in *objid.
 */
static int
zfs_get_root(const spa_t *spa, uint64_t *objid)
{
	dnode_phys_t dir, propdir;
	uint64_t props, bootfs, root;

	*objid = 0;

	/*
	 * Start with the MOS directory object.
	 */
	if (objset_get_dnode(spa, spa->spa_mos,
	    DMU_POOL_DIRECTORY_OBJECT, &dir)) {
		printf("ZFS: can't read MOS object directory\n");
		return (EIO);
	}

	/*
	 * Lookup the pool_props and see if we can find a bootfs.
	 */
	if (zap_lookup(spa, &dir, DMU_POOL_PROPS,
	    sizeof(props), 1, &props) == 0 &&
	    objset_get_dnode(spa, spa->spa_mos, props, &propdir) == 0 &&
	    zap_lookup(spa, &propdir, "bootfs",
	    sizeof(bootfs), 1, &bootfs) == 0 && bootfs != 0) {
		*objid = bootfs;
		return (0);
	}
	/*
	 * Lookup the root dataset directory.
	 */
	if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET,
	    sizeof(root), 1, &root) ||
	    objset_get_dnode(spa, spa->spa_mos, root, &dir)) {
		printf("ZFS: can't find root dsl_dir\n");
		return (EIO);
	}

	/*
	 * Use the information from the dataset directory's bonus buffer
	 * to find the dataset object and from that the object set itself.
	 */
	dsl_dir_phys_t *dd = (dsl_dir_phys_t *)&dir.dn_bonus;
	*objid = dd->dd_head_dataset_obj;
	return (0);
}

static int
zfs_mount_impl(const spa_t *spa, uint64_t rootobj, struct zfsmount *mount)
{

	mount->spa = spa;

	/*
	 * Find the root object set if not explicitly provided.
	 */
	if (rootobj == 0 && zfs_get_root(spa, &rootobj)) {
		printf("ZFS: can't find root filesystem\n");
		return (EIO);
	}

	if (zfs_mount_dataset(spa, rootobj, &mount->objset)) {
		printf("ZFS: can't open root filesystem\n");
		return (EIO);
	}

	mount->rootobj = rootobj;

	return (0);
}

/*
 * Callback function for feature name checks.
 */
static int
check_feature(const char *name, uint64_t value)
{
	int i;

	if (value == 0)
		return (0);
	if (name[0] == '\0')
		return (0);

	for (i = 0; features_for_read[i] != NULL; i++) {
		if (strcmp(name, features_for_read[i]) == 0)
			return (0);
	}
	printf("ZFS: unsupported feature: %s\n", name);
	return (EIO);
}
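/*
 * In the features-for-read zap each entry maps a feature guid to its
 * reference count, so check_feature() above deliberately skips entries
 * whose value is zero: such a feature is enabled on the pool but not
 * actually in use, and thus does not affect read-only compatibility.
 */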
/*
 * Checks whether the MOS features that are active are supported.
 */
static int
check_mos_features(const spa_t *spa)
{
	dnode_phys_t dir;
	zap_phys_t *zap;
	uint64_t objnum;
	size_t size;
	int rc;

	if ((rc = objset_get_dnode(spa, spa->spa_mos, DMU_OT_OBJECT_DIRECTORY,
	    &dir)) != 0)
		return (rc);
	if ((rc = zap_lookup(spa, &dir, DMU_POOL_FEATURES_FOR_READ,
	    sizeof (objnum), 1, &objnum)) != 0) {
		/*
		 * It is an older pool without features. As we have already
		 * tested the label, just return without raising the error.
		 */
		return (0);
	}

	if ((rc = objset_get_dnode(spa, spa->spa_mos, objnum, &dir)) != 0)
		return (rc);

	if (dir.dn_type != DMU_OTN_ZAP_METADATA)
		return (EIO);

	size = dir.dn_datablkszsec << SPA_MINBLOCKSHIFT;
	zap = malloc(size);
	if (zap == NULL)
		return (ENOMEM);

	if (dnode_read(spa, &dir, 0, zap, size)) {
		free(zap);
		return (EIO);
	}

	if (zap->zap_block_type == ZBT_MICRO)
		rc = mzap_list((const mzap_phys_t *)zap, size, check_feature);
	else
		rc = fzap_list(spa, &dir, zap, check_feature);

	free(zap);
	return (rc);
}

static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dnode_phys_t dir;
	size_t size;
	int rc;
	char *nv;

	*value = NULL;
	if ((rc = objset_get_dnode(spa, spa->spa_mos, obj, &dir)) != 0)
		return (rc);
	if (dir.dn_type != DMU_OT_PACKED_NVLIST &&
	    dir.dn_bonustype != DMU_OT_PACKED_NVLIST_SIZE) {
		return (EIO);
	}

	if (dir.dn_bonuslen != sizeof (uint64_t))
		return (EIO);

	size = *(uint64_t *)DN_BONUS(&dir);
	nv = malloc(size);
	if (nv == NULL)
		return (ENOMEM);

	rc = dnode_read(spa, &dir, 0, nv, size);
	if (rc != 0) {
		free(nv);
		nv = NULL;
		return (rc);
	}
	*value = nvlist_import(nv, size);
	free(nv);
	return (rc);
}
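/*
 * load_nvlist() above relies on the on-disk convention for packed-nvlist
 * objects: the bonus buffer holds a single uint64_t giving the packed
 * length, and the object's data is the packed nvlist itself, which
 * nvlist_import() then unpacks into the form the rest of this file
 * consumes.
 */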
3530 */ 3531 rc = vdev_init_from_nvlist(spa, nvlist); 3532 nvlist_destroy(nvlist); 3533 return (rc); 3534 } 3535 3536 static int 3537 zfs_dnode_stat(const spa_t *spa, dnode_phys_t *dn, struct stat *sb) 3538 { 3539 3540 if (dn->dn_bonustype != DMU_OT_SA) { 3541 znode_phys_t *zp = (znode_phys_t *)dn->dn_bonus; 3542 3543 sb->st_mode = zp->zp_mode; 3544 sb->st_uid = zp->zp_uid; 3545 sb->st_gid = zp->zp_gid; 3546 sb->st_size = zp->zp_size; 3547 } else { 3548 sa_hdr_phys_t *sahdrp; 3549 int hdrsize; 3550 size_t size = 0; 3551 void *buf = NULL; 3552 3553 if (dn->dn_bonuslen != 0) 3554 sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn); 3555 else { 3556 if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) { 3557 blkptr_t *bp = DN_SPILL_BLKPTR(dn); 3558 int error; 3559 3560 size = BP_GET_LSIZE(bp); 3561 buf = malloc(size); 3562 if (buf == NULL) 3563 error = ENOMEM; 3564 else 3565 error = zio_read(spa, bp, buf); 3566 3567 if (error != 0) { 3568 free(buf); 3569 return (error); 3570 } 3571 sahdrp = buf; 3572 } else { 3573 return (EIO); 3574 } 3575 } 3576 hdrsize = SA_HDR_SIZE(sahdrp); 3577 sb->st_mode = *(uint64_t *)((char *)sahdrp + hdrsize + 3578 SA_MODE_OFFSET); 3579 sb->st_uid = *(uint64_t *)((char *)sahdrp + hdrsize + 3580 SA_UID_OFFSET); 3581 sb->st_gid = *(uint64_t *)((char *)sahdrp + hdrsize + 3582 SA_GID_OFFSET); 3583 sb->st_size = *(uint64_t *)((char *)sahdrp + hdrsize + 3584 SA_SIZE_OFFSET); 3585 free(buf); 3586 } 3587 3588 return (0); 3589 } 3590 3591 static int 3592 zfs_dnode_readlink(const spa_t *spa, dnode_phys_t *dn, char *path, size_t psize) 3593 { 3594 int rc = 0; 3595 3596 if (dn->dn_bonustype == DMU_OT_SA) { 3597 sa_hdr_phys_t *sahdrp = NULL; 3598 size_t size = 0; 3599 void *buf = NULL; 3600 int hdrsize; 3601 char *p; 3602 3603 if (dn->dn_bonuslen != 0) { 3604 sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn); 3605 } else { 3606 blkptr_t *bp; 3607 3608 if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) == 0) 3609 return (EIO); 3610 bp = DN_SPILL_BLKPTR(dn); 3611 3612 size = BP_GET_LSIZE(bp); 3613 buf = malloc(size); 3614 if (buf == NULL) 3615 rc = ENOMEM; 3616 else 3617 rc = zio_read(spa, bp, buf); 3618 if (rc != 0) { 3619 free(buf); 3620 return (rc); 3621 } 3622 sahdrp = buf; 3623 } 3624 hdrsize = SA_HDR_SIZE(sahdrp); 3625 p = (char *)((uintptr_t)sahdrp + hdrsize + SA_SYMLINK_OFFSET); 3626 memcpy(path, p, psize); 3627 free(buf); 3628 return (0); 3629 } 3630 /* 3631 * Second test is purely to silence bogus compiler 3632 * warning about accessing past the end of dn_bonus. 3633 */ 3634 if (psize + sizeof(znode_phys_t) <= dn->dn_bonuslen && 3635 sizeof(znode_phys_t) <= sizeof(dn->dn_bonus)) { 3636 memcpy(path, &dn->dn_bonus[sizeof(znode_phys_t)], psize); 3637 } else { 3638 rc = dnode_read(spa, dn, 0, path, psize); 3639 } 3640 return (rc); 3641 } 3642 3643 struct obj_list { 3644 uint64_t objnum; 3645 STAILQ_ENTRY(obj_list) entry; 3646 }; 3647 3648 /* 3649 * Lookup a file and return its dnode. 
3650 */ 3651 static int 3652 zfs_lookup(const struct zfsmount *mount, const char *upath, dnode_phys_t *dnode) 3653 { 3654 int rc; 3655 uint64_t objnum; 3656 const spa_t *spa; 3657 dnode_phys_t dn; 3658 const char *p, *q; 3659 char element[256]; 3660 char path[1024]; 3661 int symlinks_followed = 0; 3662 struct stat sb; 3663 struct obj_list *entry, *tentry; 3664 STAILQ_HEAD(, obj_list) on_cache = STAILQ_HEAD_INITIALIZER(on_cache); 3665 3666 spa = mount->spa; 3667 if (mount->objset.os_type != DMU_OST_ZFS) { 3668 printf("ZFS: unexpected object set type %ju\n", 3669 (uintmax_t)mount->objset.os_type); 3670 return (EIO); 3671 } 3672 3673 if ((entry = malloc(sizeof(struct obj_list))) == NULL) 3674 return (ENOMEM); 3675 3676 /* 3677 * Get the root directory dnode. 3678 */ 3679 rc = objset_get_dnode(spa, &mount->objset, MASTER_NODE_OBJ, &dn); 3680 if (rc) { 3681 free(entry); 3682 return (rc); 3683 } 3684 3685 rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, sizeof(objnum), 1, &objnum); 3686 if (rc) { 3687 free(entry); 3688 return (rc); 3689 } 3690 entry->objnum = objnum; 3691 STAILQ_INSERT_HEAD(&on_cache, entry, entry); 3692 3693 rc = objset_get_dnode(spa, &mount->objset, objnum, &dn); 3694 if (rc != 0) 3695 goto done; 3696 3697 p = upath; 3698 while (p && *p) { 3699 rc = objset_get_dnode(spa, &mount->objset, objnum, &dn); 3700 if (rc != 0) 3701 goto done; 3702 3703 while (*p == '/') 3704 p++; 3705 if (*p == '\0') 3706 break; 3707 q = p; 3708 while (*q != '\0' && *q != '/') 3709 q++; 3710 3711 /* skip dot */ 3712 if (p + 1 == q && p[0] == '.') { 3713 p++; 3714 continue; 3715 } 3716 /* double dot */ 3717 if (p + 2 == q && p[0] == '.' && p[1] == '.') { 3718 p += 2; 3719 if (STAILQ_FIRST(&on_cache) == 3720 STAILQ_LAST(&on_cache, obj_list, entry)) { 3721 rc = ENOENT; 3722 goto done; 3723 } 3724 entry = STAILQ_FIRST(&on_cache); 3725 STAILQ_REMOVE_HEAD(&on_cache, entry); 3726 free(entry); 3727 objnum = (STAILQ_FIRST(&on_cache))->objnum; 3728 continue; 3729 } 3730 if (q - p + 1 > sizeof(element)) { 3731 rc = ENAMETOOLONG; 3732 goto done; 3733 } 3734 memcpy(element, p, q - p); 3735 element[q - p] = 0; 3736 p = q; 3737 3738 if ((rc = zfs_dnode_stat(spa, &dn, &sb)) != 0) 3739 goto done; 3740 if (!S_ISDIR(sb.st_mode)) { 3741 rc = ENOTDIR; 3742 goto done; 3743 } 3744 3745 rc = zap_lookup(spa, &dn, element, sizeof (objnum), 1, &objnum); 3746 if (rc) 3747 goto done; 3748 objnum = ZFS_DIRENT_OBJ(objnum); 3749 3750 if ((entry = malloc(sizeof(struct obj_list))) == NULL) { 3751 rc = ENOMEM; 3752 goto done; 3753 } 3754 entry->objnum = objnum; 3755 STAILQ_INSERT_HEAD(&on_cache, entry, entry); 3756 rc = objset_get_dnode(spa, &mount->objset, objnum, &dn); 3757 if (rc) 3758 goto done; 3759 3760 /* 3761 * Check for symlink. 3762 */ 3763 rc = zfs_dnode_stat(spa, &dn, &sb); 3764 if (rc) 3765 goto done; 3766 if (S_ISLNK(sb.st_mode)) { 3767 if (symlinks_followed > 10) { 3768 rc = EMLINK; 3769 goto done; 3770 } 3771 symlinks_followed++; 3772 3773 /* 3774 * Read the link value and copy the tail of our 3775 * current path onto the end. 3776 */ 3777 if (sb.st_size + strlen(p) + 1 > sizeof(path)) { 3778 rc = ENAMETOOLONG; 3779 goto done; 3780 } 3781 strcpy(&path[sb.st_size], p); 3782 3783 rc = zfs_dnode_readlink(spa, &dn, path, sb.st_size); 3784 if (rc != 0) 3785 goto done; 3786 3787 /* 3788 * Restart with the new path, starting either at 3789 * the root or at the parent depending whether or 3790 * not the link is relative. 
3791 */ 3792 p = path; 3793 if (*p == '/') { 3794 while (STAILQ_FIRST(&on_cache) != 3795 STAILQ_LAST(&on_cache, obj_list, entry)) { 3796 entry = STAILQ_FIRST(&on_cache); 3797 STAILQ_REMOVE_HEAD(&on_cache, entry); 3798 free(entry); 3799 } 3800 } else { 3801 entry = STAILQ_FIRST(&on_cache); 3802 STAILQ_REMOVE_HEAD(&on_cache, entry); 3803 free(entry); 3804 } 3805 objnum = (STAILQ_FIRST(&on_cache))->objnum; 3806 } 3807 } 3808 3809 *dnode = dn; 3810 done: 3811 STAILQ_FOREACH_SAFE(entry, &on_cache, entry, tentry) 3812 free(entry); 3813 return (rc); 3814 } 3815