1 /*- 2 * Copyright (c) 2007 Doug Rabson 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 /* 31 * Stand-alone ZFS file reader. 32 */ 33 34 #include <stdbool.h> 35 #include <sys/endian.h> 36 #include <sys/stat.h> 37 #include <sys/stdint.h> 38 #include <sys/list.h> 39 #include <sys/zfs_bootenv.h> 40 #include <machine/_inttypes.h> 41 42 #include "zfsimpl.h" 43 #include "zfssubr.c" 44 45 #ifdef HAS_ZSTD_ZFS 46 extern int zstd_init(void); 47 #endif 48 49 struct zfsmount { 50 char *path; 51 const spa_t *spa; 52 objset_phys_t objset; 53 uint64_t rootobj; 54 STAILQ_ENTRY(zfsmount) next; 55 }; 56 57 typedef STAILQ_HEAD(zfs_mnt_list, zfsmount) zfs_mnt_list_t; 58 static zfs_mnt_list_t zfsmount = STAILQ_HEAD_INITIALIZER(zfsmount); 59 60 /* 61 * The indirect_child_t represents the vdev that we will read from, when we 62 * need to read all copies of the data (e.g. for scrub or reconstruction). 63 * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror), 64 * ic_vdev is the same as is_vdev. However, for mirror top-level vdevs, 65 * ic_vdev is a child of the mirror. 66 */ 67 typedef struct indirect_child { 68 void *ic_data; 69 vdev_t *ic_vdev; 70 } indirect_child_t; 71 72 /* 73 * The indirect_split_t represents one mapped segment of an i/o to the 74 * indirect vdev. For non-split (contiguously-mapped) blocks, there will be 75 * only one indirect_split_t, with is_split_offset==0 and is_size==io_size. 76 * For split blocks, there will be several of these. 77 */ 78 typedef struct indirect_split { 79 list_node_t is_node; /* link on iv_splits */ 80 81 /* 82 * is_split_offset is the offset into the i/o. 83 * This is the sum of the previous splits' is_size's. 84 */ 85 uint64_t is_split_offset; 86 87 vdev_t *is_vdev; /* top-level vdev */ 88 uint64_t is_target_offset; /* offset on is_vdev */ 89 uint64_t is_size; 90 int is_children; /* number of entries in is_child[] */ 91 92 /* 93 * is_good_child is the child that we are currently using to 94 * attempt reconstruction. 95 */ 96 int is_good_child; 97 98 indirect_child_t is_child[1]; /* variable-length */ 99 } indirect_split_t; 100 101 /* 102 * The indirect_vsd_t is associated with each i/o to the indirect vdev. 103 * It is the "Vdev-Specific Data" in the zio_t's io_vsd. 104 */ 105 typedef struct indirect_vsd { 106 boolean_t iv_split_block; 107 boolean_t iv_reconstruct; 108 109 list_t iv_splits; /* list of indirect_split_t's */ 110 } indirect_vsd_t; 111 112 /* 113 * List of all vdevs, chained through v_alllink. 114 */ 115 static vdev_list_t zfs_vdevs; 116 117 /* 118 * List of ZFS features supported for read 119 */ 120 static const char *features_for_read[] = { 121 "org.illumos:lz4_compress", 122 "com.delphix:hole_birth", 123 "com.delphix:extensible_dataset", 124 "com.delphix:embedded_data", 125 "org.open-zfs:large_blocks", 126 "org.illumos:sha512", 127 "org.illumos:skein", 128 "org.zfsonlinux:large_dnode", 129 "com.joyent:multi_vdev_crash_dump", 130 "com.delphix:spacemap_histogram", 131 "com.delphix:zpool_checkpoint", 132 "com.delphix:spacemap_v2", 133 "com.datto:encryption", 134 "com.datto:bookmark_v2", 135 "org.zfsonlinux:allocation_classes", 136 "com.datto:resilver_defer", 137 "com.delphix:device_removal", 138 "com.delphix:obsolete_counts", 139 "com.intel:allocation_classes", 140 "org.freebsd:zstd_compress", 141 "com.delphix:bookmark_written", 142 NULL 143 }; 144 145 /* 146 * List of all pools, chained through spa_link. 147 */ 148 static spa_list_t zfs_pools; 149 150 static const dnode_phys_t *dnode_cache_obj; 151 static uint64_t dnode_cache_bn; 152 static char *dnode_cache_buf; 153 154 static int zio_read(const spa_t *spa, const blkptr_t *bp, void *buf); 155 static int zfs_get_root(const spa_t *spa, uint64_t *objid); 156 static int zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result); 157 static int zap_lookup(const spa_t *spa, const dnode_phys_t *dnode, 158 const char *name, uint64_t integer_size, uint64_t num_integers, 159 void *value); 160 static int objset_get_dnode(const spa_t *, const objset_phys_t *, uint64_t, 161 dnode_phys_t *); 162 static int dnode_read(const spa_t *, const dnode_phys_t *, off_t, void *, 163 size_t); 164 static int vdev_indirect_read(vdev_t *, const blkptr_t *, void *, off_t, 165 size_t); 166 static int vdev_mirror_read(vdev_t *, const blkptr_t *, void *, off_t, size_t); 167 vdev_indirect_mapping_t *vdev_indirect_mapping_open(spa_t *, objset_phys_t *, 168 uint64_t); 169 vdev_indirect_mapping_entry_phys_t * 170 vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *, uint64_t, 171 uint64_t, uint64_t *); 172 173 static void 174 zfs_init(void) 175 { 176 STAILQ_INIT(&zfs_vdevs); 177 STAILQ_INIT(&zfs_pools); 178 179 dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE); 180 181 zfs_init_crc(); 182 #ifdef HAS_ZSTD_ZFS 183 zstd_init(); 184 #endif 185 } 186 187 static int 188 nvlist_check_features_for_read(nvlist_t *nvl) 189 { 190 nvlist_t *features = NULL; 191 nvs_data_t *data; 192 nvp_header_t *nvp; 193 nv_string_t *nvp_name; 194 int rc; 195 196 rc = nvlist_find(nvl, ZPOOL_CONFIG_FEATURES_FOR_READ, 197 DATA_TYPE_NVLIST, NULL, &features, NULL); 198 switch (rc) { 199 case 0: 200 break; /* Continue with checks */ 201 202 case ENOENT: 203 return (0); /* All features are disabled */ 204 205 default: 206 return (rc); /* Error while reading nvlist */ 207 } 208 209 data = (nvs_data_t *)features->nv_data; 210 nvp = &data->nvl_pair; /* first pair in nvlist */ 211 212 while (nvp->encoded_size != 0 && nvp->decoded_size != 0) { 213 int i, found; 214 215 nvp_name = (nv_string_t *)((uintptr_t)nvp + sizeof(*nvp)); 216 found = 0; 217 218 for (i = 0; features_for_read[i] != NULL; i++) { 219 if (memcmp(nvp_name->nv_data, features_for_read[i], 220 nvp_name->nv_size) == 0) { 221 found = 1; 222 break; 223 } 224 } 225 226 if (!found) { 227 printf("ZFS: unsupported feature: %.*s\n", 228 nvp_name->nv_size, nvp_name->nv_data); 229 rc = EIO; 230 } 231 nvp = (nvp_header_t *)((uint8_t *)nvp + nvp->encoded_size); 232 } 233 nvlist_destroy(features); 234 235 return (rc); 236 } 237 238 static int 239 vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf, 240 off_t offset, size_t size) 241 { 242 size_t psize; 243 int rc; 244 245 if (vdev->v_phys_read == NULL) 246 return (ENOTSUP); 247 248 if (bp) { 249 psize = BP_GET_PSIZE(bp); 250 } else { 251 psize = size; 252 } 253 254 rc = vdev->v_phys_read(vdev, vdev->v_priv, offset, buf, psize); 255 if (rc == 0) { 256 if (bp != NULL) 257 rc = zio_checksum_verify(vdev->v_spa, bp, buf); 258 } 259 260 return (rc); 261 } 262 263 static int 264 vdev_write_phys(vdev_t *vdev, void *buf, off_t offset, size_t size) 265 { 266 if (vdev->v_phys_write == NULL) 267 return (ENOTSUP); 268 269 return (vdev->v_phys_write(vdev, offset, buf, size)); 270 } 271 272 typedef struct remap_segment { 273 vdev_t *rs_vd; 274 uint64_t rs_offset; 275 uint64_t rs_asize; 276 uint64_t rs_split_offset; 277 list_node_t rs_node; 278 } remap_segment_t; 279 280 static remap_segment_t * 281 rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset) 282 { 283 remap_segment_t *rs = malloc(sizeof (remap_segment_t)); 284 285 if (rs != NULL) { 286 rs->rs_vd = vd; 287 rs->rs_offset = offset; 288 rs->rs_asize = asize; 289 rs->rs_split_offset = split_offset; 290 } 291 292 return (rs); 293 } 294 295 vdev_indirect_mapping_t * 296 vdev_indirect_mapping_open(spa_t *spa, objset_phys_t *os, 297 uint64_t mapping_object) 298 { 299 vdev_indirect_mapping_t *vim; 300 vdev_indirect_mapping_phys_t *vim_phys; 301 int rc; 302 303 vim = calloc(1, sizeof (*vim)); 304 if (vim == NULL) 305 return (NULL); 306 307 vim->vim_dn = calloc(1, sizeof (*vim->vim_dn)); 308 if (vim->vim_dn == NULL) { 309 free(vim); 310 return (NULL); 311 } 312 313 rc = objset_get_dnode(spa, os, mapping_object, vim->vim_dn); 314 if (rc != 0) { 315 free(vim->vim_dn); 316 free(vim); 317 return (NULL); 318 } 319 320 vim->vim_spa = spa; 321 vim->vim_phys = malloc(sizeof (*vim->vim_phys)); 322 if (vim->vim_phys == NULL) { 323 free(vim->vim_dn); 324 free(vim); 325 return (NULL); 326 } 327 328 vim_phys = (vdev_indirect_mapping_phys_t *)DN_BONUS(vim->vim_dn); 329 *vim->vim_phys = *vim_phys; 330 331 vim->vim_objset = os; 332 vim->vim_object = mapping_object; 333 vim->vim_entries = NULL; 334 335 vim->vim_havecounts = 336 (vim->vim_dn->dn_bonuslen > VDEV_INDIRECT_MAPPING_SIZE_V0); 337 338 return (vim); 339 } 340 341 /* 342 * Compare an offset with an indirect mapping entry; there are three 343 * possible scenarios: 344 * 345 * 1. The offset is "less than" the mapping entry; meaning the 346 * offset is less than the source offset of the mapping entry. In 347 * this case, there is no overlap between the offset and the 348 * mapping entry and -1 will be returned. 349 * 350 * 2. The offset is "greater than" the mapping entry; meaning the 351 * offset is greater than the mapping entry's source offset plus 352 * the entry's size. In this case, there is no overlap between 353 * the offset and the mapping entry and 1 will be returned. 354 * 355 * NOTE: If the offset is actually equal to the entry's offset 356 * plus size, this is considered to be "greater" than the entry, 357 * and this case applies (i.e. 1 will be returned). Thus, the 358 * entry's "range" can be considered to be inclusive at its 359 * start, but exclusive at its end: e.g. [src, src + size). 360 * 361 * 3. The last case to consider is if the offset actually falls 362 * within the mapping entry's range. If this is the case, the 363 * offset is considered to be "equal to" the mapping entry and 364 * 0 will be returned. 365 * 366 * NOTE: If the offset is equal to the entry's source offset, 367 * this case applies and 0 will be returned. If the offset is 368 * equal to the entry's source plus its size, this case does 369 * *not* apply (see "NOTE" above for scenario 2), and 1 will be 370 * returned. 371 */ 372 static int 373 dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem) 374 { 375 const uint64_t *key = v_key; 376 const vdev_indirect_mapping_entry_phys_t *array_elem = 377 v_array_elem; 378 uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem); 379 380 if (*key < src_offset) { 381 return (-1); 382 } else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) { 383 return (0); 384 } else { 385 return (1); 386 } 387 } 388 389 /* 390 * Return array entry. 391 */ 392 static vdev_indirect_mapping_entry_phys_t * 393 vdev_indirect_mapping_entry(vdev_indirect_mapping_t *vim, uint64_t index) 394 { 395 uint64_t size; 396 off_t offset = 0; 397 int rc; 398 399 if (vim->vim_phys->vimp_num_entries == 0) 400 return (NULL); 401 402 if (vim->vim_entries == NULL) { 403 uint64_t bsize; 404 405 bsize = vim->vim_dn->dn_datablkszsec << SPA_MINBLOCKSHIFT; 406 size = vim->vim_phys->vimp_num_entries * 407 sizeof (*vim->vim_entries); 408 if (size > bsize) { 409 size = bsize / sizeof (*vim->vim_entries); 410 size *= sizeof (*vim->vim_entries); 411 } 412 vim->vim_entries = malloc(size); 413 if (vim->vim_entries == NULL) 414 return (NULL); 415 vim->vim_num_entries = size / sizeof (*vim->vim_entries); 416 offset = index * sizeof (*vim->vim_entries); 417 } 418 419 /* We have data in vim_entries */ 420 if (offset == 0) { 421 if (index >= vim->vim_entry_offset && 422 index <= vim->vim_entry_offset + vim->vim_num_entries) { 423 index -= vim->vim_entry_offset; 424 return (&vim->vim_entries[index]); 425 } 426 offset = index * sizeof (*vim->vim_entries); 427 } 428 429 vim->vim_entry_offset = index; 430 size = vim->vim_num_entries * sizeof (*vim->vim_entries); 431 rc = dnode_read(vim->vim_spa, vim->vim_dn, offset, vim->vim_entries, 432 size); 433 if (rc != 0) { 434 /* Read error, invalidate vim_entries. */ 435 free(vim->vim_entries); 436 vim->vim_entries = NULL; 437 return (NULL); 438 } 439 index -= vim->vim_entry_offset; 440 return (&vim->vim_entries[index]); 441 } 442 443 /* 444 * Returns the mapping entry for the given offset. 445 * 446 * It's possible that the given offset will not be in the mapping table 447 * (i.e. no mapping entries contain this offset), in which case, the 448 * return value value depends on the "next_if_missing" parameter. 449 * 450 * If the offset is not found in the table and "next_if_missing" is 451 * B_FALSE, then NULL will always be returned. The behavior is intended 452 * to allow consumers to get the entry corresponding to the offset 453 * parameter, iff the offset overlaps with an entry in the table. 454 * 455 * If the offset is not found in the table and "next_if_missing" is 456 * B_TRUE, then the entry nearest to the given offset will be returned, 457 * such that the entry's source offset is greater than the offset 458 * passed in (i.e. the "next" mapping entry in the table is returned, if 459 * the offset is missing from the table). If there are no entries whose 460 * source offset is greater than the passed in offset, NULL is returned. 461 */ 462 static vdev_indirect_mapping_entry_phys_t * 463 vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim, 464 uint64_t offset) 465 { 466 ASSERT(vim->vim_phys->vimp_num_entries > 0); 467 468 vdev_indirect_mapping_entry_phys_t *entry; 469 470 uint64_t last = vim->vim_phys->vimp_num_entries - 1; 471 uint64_t base = 0; 472 473 /* 474 * We don't define these inside of the while loop because we use 475 * their value in the case that offset isn't in the mapping. 476 */ 477 uint64_t mid; 478 int result; 479 480 while (last >= base) { 481 mid = base + ((last - base) >> 1); 482 483 entry = vdev_indirect_mapping_entry(vim, mid); 484 if (entry == NULL) 485 break; 486 result = dva_mapping_overlap_compare(&offset, entry); 487 488 if (result == 0) { 489 break; 490 } else if (result < 0) { 491 last = mid - 1; 492 } else { 493 base = mid + 1; 494 } 495 } 496 return (entry); 497 } 498 499 /* 500 * Given an indirect vdev and an extent on that vdev, it duplicates the 501 * physical entries of the indirect mapping that correspond to the extent 502 * to a new array and returns a pointer to it. In addition, copied_entries 503 * is populated with the number of mapping entries that were duplicated. 504 * 505 * Finally, since we are doing an allocation, it is up to the caller to 506 * free the array allocated in this function. 507 */ 508 vdev_indirect_mapping_entry_phys_t * 509 vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset, 510 uint64_t asize, uint64_t *copied_entries) 511 { 512 vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL; 513 vdev_indirect_mapping_t *vim = vd->v_mapping; 514 uint64_t entries = 0; 515 516 vdev_indirect_mapping_entry_phys_t *first_mapping = 517 vdev_indirect_mapping_entry_for_offset(vim, offset); 518 ASSERT3P(first_mapping, !=, NULL); 519 520 vdev_indirect_mapping_entry_phys_t *m = first_mapping; 521 while (asize > 0) { 522 uint64_t size = DVA_GET_ASIZE(&m->vimep_dst); 523 uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m); 524 uint64_t inner_size = MIN(asize, size - inner_offset); 525 526 offset += inner_size; 527 asize -= inner_size; 528 entries++; 529 m++; 530 } 531 532 size_t copy_length = entries * sizeof (*first_mapping); 533 duplicate_mappings = malloc(copy_length); 534 if (duplicate_mappings != NULL) 535 bcopy(first_mapping, duplicate_mappings, copy_length); 536 else 537 entries = 0; 538 539 *copied_entries = entries; 540 541 return (duplicate_mappings); 542 } 543 544 static vdev_t * 545 vdev_lookup_top(spa_t *spa, uint64_t vdev) 546 { 547 vdev_t *rvd; 548 vdev_list_t *vlist; 549 550 vlist = &spa->spa_root_vdev->v_children; 551 STAILQ_FOREACH(rvd, vlist, v_childlink) 552 if (rvd->v_id == vdev) 553 break; 554 555 return (rvd); 556 } 557 558 /* 559 * This is a callback for vdev_indirect_remap() which allocates an 560 * indirect_split_t for each split segment and adds it to iv_splits. 561 */ 562 static void 563 vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset, 564 uint64_t size, void *arg) 565 { 566 int n = 1; 567 zio_t *zio = arg; 568 indirect_vsd_t *iv = zio->io_vsd; 569 570 if (vd->v_read == vdev_indirect_read) 571 return; 572 573 if (vd->v_read == vdev_mirror_read) 574 n = vd->v_nchildren; 575 576 indirect_split_t *is = 577 malloc(offsetof(indirect_split_t, is_child[n])); 578 if (is == NULL) { 579 zio->io_error = ENOMEM; 580 return; 581 } 582 bzero(is, offsetof(indirect_split_t, is_child[n])); 583 584 is->is_children = n; 585 is->is_size = size; 586 is->is_split_offset = split_offset; 587 is->is_target_offset = offset; 588 is->is_vdev = vd; 589 590 /* 591 * Note that we only consider multiple copies of the data for 592 * *mirror* vdevs. We don't for "replacing" or "spare" vdevs, even 593 * though they use the same ops as mirror, because there's only one 594 * "good" copy under the replacing/spare. 595 */ 596 if (vd->v_read == vdev_mirror_read) { 597 int i = 0; 598 vdev_t *kid; 599 600 STAILQ_FOREACH(kid, &vd->v_children, v_childlink) { 601 is->is_child[i++].ic_vdev = kid; 602 } 603 } else { 604 is->is_child[0].ic_vdev = vd; 605 } 606 607 list_insert_tail(&iv->iv_splits, is); 608 } 609 610 static void 611 vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, void *arg) 612 { 613 list_t stack; 614 spa_t *spa = vd->v_spa; 615 zio_t *zio = arg; 616 remap_segment_t *rs; 617 618 list_create(&stack, sizeof (remap_segment_t), 619 offsetof(remap_segment_t, rs_node)); 620 621 rs = rs_alloc(vd, offset, asize, 0); 622 if (rs == NULL) { 623 printf("vdev_indirect_remap: out of memory.\n"); 624 zio->io_error = ENOMEM; 625 } 626 for (; rs != NULL; rs = list_remove_head(&stack)) { 627 vdev_t *v = rs->rs_vd; 628 uint64_t num_entries = 0; 629 /* vdev_indirect_mapping_t *vim = v->v_mapping; */ 630 vdev_indirect_mapping_entry_phys_t *mapping = 631 vdev_indirect_mapping_duplicate_adjacent_entries(v, 632 rs->rs_offset, rs->rs_asize, &num_entries); 633 634 if (num_entries == 0) 635 zio->io_error = ENOMEM; 636 637 for (uint64_t i = 0; i < num_entries; i++) { 638 vdev_indirect_mapping_entry_phys_t *m = &mapping[i]; 639 uint64_t size = DVA_GET_ASIZE(&m->vimep_dst); 640 uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst); 641 uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst); 642 uint64_t inner_offset = rs->rs_offset - 643 DVA_MAPPING_GET_SRC_OFFSET(m); 644 uint64_t inner_size = 645 MIN(rs->rs_asize, size - inner_offset); 646 vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev); 647 648 if (dst_v->v_read == vdev_indirect_read) { 649 remap_segment_t *o; 650 651 o = rs_alloc(dst_v, dst_offset + inner_offset, 652 inner_size, rs->rs_split_offset); 653 if (o == NULL) { 654 printf("vdev_indirect_remap: " 655 "out of memory.\n"); 656 zio->io_error = ENOMEM; 657 break; 658 } 659 660 list_insert_head(&stack, o); 661 } 662 vdev_indirect_gather_splits(rs->rs_split_offset, dst_v, 663 dst_offset + inner_offset, 664 inner_size, arg); 665 666 /* 667 * vdev_indirect_gather_splits can have memory 668 * allocation error, we can not recover from it. 669 */ 670 if (zio->io_error != 0) 671 break; 672 rs->rs_offset += inner_size; 673 rs->rs_asize -= inner_size; 674 rs->rs_split_offset += inner_size; 675 } 676 677 free(mapping); 678 free(rs); 679 if (zio->io_error != 0) 680 break; 681 } 682 683 list_destroy(&stack); 684 } 685 686 static void 687 vdev_indirect_map_free(zio_t *zio) 688 { 689 indirect_vsd_t *iv = zio->io_vsd; 690 indirect_split_t *is; 691 692 while ((is = list_head(&iv->iv_splits)) != NULL) { 693 for (int c = 0; c < is->is_children; c++) { 694 indirect_child_t *ic = &is->is_child[c]; 695 free(ic->ic_data); 696 } 697 list_remove(&iv->iv_splits, is); 698 free(is); 699 } 700 free(iv); 701 } 702 703 static int 704 vdev_indirect_read(vdev_t *vdev, const blkptr_t *bp, void *buf, 705 off_t offset, size_t bytes) 706 { 707 zio_t zio; 708 spa_t *spa = vdev->v_spa; 709 indirect_vsd_t *iv; 710 indirect_split_t *first; 711 int rc = EIO; 712 713 iv = calloc(1, sizeof(*iv)); 714 if (iv == NULL) 715 return (ENOMEM); 716 717 list_create(&iv->iv_splits, 718 sizeof (indirect_split_t), offsetof(indirect_split_t, is_node)); 719 720 bzero(&zio, sizeof(zio)); 721 zio.io_spa = spa; 722 zio.io_bp = (blkptr_t *)bp; 723 zio.io_data = buf; 724 zio.io_size = bytes; 725 zio.io_offset = offset; 726 zio.io_vd = vdev; 727 zio.io_vsd = iv; 728 729 if (vdev->v_mapping == NULL) { 730 vdev_indirect_config_t *vic; 731 732 vic = &vdev->vdev_indirect_config; 733 vdev->v_mapping = vdev_indirect_mapping_open(spa, 734 spa->spa_mos, vic->vic_mapping_object); 735 } 736 737 vdev_indirect_remap(vdev, offset, bytes, &zio); 738 if (zio.io_error != 0) 739 return (zio.io_error); 740 741 first = list_head(&iv->iv_splits); 742 if (first->is_size == zio.io_size) { 743 /* 744 * This is not a split block; we are pointing to the entire 745 * data, which will checksum the same as the original data. 746 * Pass the BP down so that the child i/o can verify the 747 * checksum, and try a different location if available 748 * (e.g. on a mirror). 749 * 750 * While this special case could be handled the same as the 751 * general (split block) case, doing it this way ensures 752 * that the vast majority of blocks on indirect vdevs 753 * (which are not split) are handled identically to blocks 754 * on non-indirect vdevs. This allows us to be less strict 755 * about performance in the general (but rare) case. 756 */ 757 rc = first->is_vdev->v_read(first->is_vdev, zio.io_bp, 758 zio.io_data, first->is_target_offset, bytes); 759 } else { 760 iv->iv_split_block = B_TRUE; 761 /* 762 * Read one copy of each split segment, from the 763 * top-level vdev. Since we don't know the 764 * checksum of each split individually, the child 765 * zio can't ensure that we get the right data. 766 * E.g. if it's a mirror, it will just read from a 767 * random (healthy) leaf vdev. We have to verify 768 * the checksum in vdev_indirect_io_done(). 769 */ 770 for (indirect_split_t *is = list_head(&iv->iv_splits); 771 is != NULL; is = list_next(&iv->iv_splits, is)) { 772 char *ptr = zio.io_data; 773 774 rc = is->is_vdev->v_read(is->is_vdev, zio.io_bp, 775 ptr + is->is_split_offset, is->is_target_offset, 776 is->is_size); 777 } 778 if (zio_checksum_verify(spa, zio.io_bp, zio.io_data)) 779 rc = ECKSUM; 780 else 781 rc = 0; 782 } 783 784 vdev_indirect_map_free(&zio); 785 if (rc == 0) 786 rc = zio.io_error; 787 788 return (rc); 789 } 790 791 static int 792 vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf, 793 off_t offset, size_t bytes) 794 { 795 796 return (vdev_read_phys(vdev, bp, buf, 797 offset + VDEV_LABEL_START_SIZE, bytes)); 798 } 799 800 static int 801 vdev_missing_read(vdev_t *vdev __unused, const blkptr_t *bp __unused, 802 void *buf __unused, off_t offset __unused, size_t bytes __unused) 803 { 804 805 return (ENOTSUP); 806 } 807 808 static int 809 vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf, 810 off_t offset, size_t bytes) 811 { 812 vdev_t *kid; 813 int rc; 814 815 rc = EIO; 816 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) { 817 if (kid->v_state != VDEV_STATE_HEALTHY) 818 continue; 819 rc = kid->v_read(kid, bp, buf, offset, bytes); 820 if (!rc) 821 return (0); 822 } 823 824 return (rc); 825 } 826 827 static int 828 vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf, 829 off_t offset, size_t bytes) 830 { 831 vdev_t *kid; 832 833 /* 834 * Here we should have two kids: 835 * First one which is the one we are replacing and we can trust 836 * only this one to have valid data, but it might not be present. 837 * Second one is that one we are replacing with. It is most likely 838 * healthy, but we can't trust it has needed data, so we won't use it. 839 */ 840 kid = STAILQ_FIRST(&vdev->v_children); 841 if (kid == NULL) 842 return (EIO); 843 if (kid->v_state != VDEV_STATE_HEALTHY) 844 return (EIO); 845 return (kid->v_read(kid, bp, buf, offset, bytes)); 846 } 847 848 static vdev_t * 849 vdev_find(uint64_t guid) 850 { 851 vdev_t *vdev; 852 853 STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink) 854 if (vdev->v_guid == guid) 855 return (vdev); 856 857 return (0); 858 } 859 860 static vdev_t * 861 vdev_create(uint64_t guid, vdev_read_t *_read) 862 { 863 vdev_t *vdev; 864 vdev_indirect_config_t *vic; 865 866 vdev = calloc(1, sizeof(vdev_t)); 867 if (vdev != NULL) { 868 STAILQ_INIT(&vdev->v_children); 869 vdev->v_guid = guid; 870 vdev->v_read = _read; 871 872 /* 873 * root vdev has no read function, we use this fact to 874 * skip setting up data we do not need for root vdev. 875 * We only point root vdev from spa. 876 */ 877 if (_read != NULL) { 878 vic = &vdev->vdev_indirect_config; 879 vic->vic_prev_indirect_vdev = UINT64_MAX; 880 STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink); 881 } 882 } 883 884 return (vdev); 885 } 886 887 static void 888 vdev_set_initial_state(vdev_t *vdev, const nvlist_t *nvlist) 889 { 890 uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present; 891 uint64_t is_log; 892 893 is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0; 894 is_log = 0; 895 (void) nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, NULL, 896 &is_offline, NULL); 897 (void) nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, NULL, 898 &is_removed, NULL); 899 (void) nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, NULL, 900 &is_faulted, NULL); 901 (void) nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, 902 NULL, &is_degraded, NULL); 903 (void) nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, 904 NULL, &isnt_present, NULL); 905 (void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, NULL, 906 &is_log, NULL); 907 908 if (is_offline != 0) 909 vdev->v_state = VDEV_STATE_OFFLINE; 910 else if (is_removed != 0) 911 vdev->v_state = VDEV_STATE_REMOVED; 912 else if (is_faulted != 0) 913 vdev->v_state = VDEV_STATE_FAULTED; 914 else if (is_degraded != 0) 915 vdev->v_state = VDEV_STATE_DEGRADED; 916 else if (isnt_present != 0) 917 vdev->v_state = VDEV_STATE_CANT_OPEN; 918 919 vdev->v_islog = is_log != 0; 920 } 921 922 static int 923 vdev_init(uint64_t guid, const nvlist_t *nvlist, vdev_t **vdevp) 924 { 925 uint64_t id, ashift, asize, nparity; 926 const char *path; 927 const char *type; 928 int len, pathlen; 929 char *name; 930 vdev_t *vdev; 931 932 if (nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, NULL, &id, 933 NULL) || 934 nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING, NULL, 935 &type, &len)) { 936 return (ENOENT); 937 } 938 939 if (memcmp(type, VDEV_TYPE_MIRROR, len) != 0 && 940 memcmp(type, VDEV_TYPE_DISK, len) != 0 && 941 #ifdef ZFS_TEST 942 memcmp(type, VDEV_TYPE_FILE, len) != 0 && 943 #endif 944 memcmp(type, VDEV_TYPE_RAIDZ, len) != 0 && 945 memcmp(type, VDEV_TYPE_INDIRECT, len) != 0 && 946 memcmp(type, VDEV_TYPE_REPLACING, len) != 0 && 947 memcmp(type, VDEV_TYPE_HOLE, len) != 0) { 948 printf("ZFS: can only boot from disk, mirror, raidz1, " 949 "raidz2 and raidz3 vdevs, got: %.*s\n", len, type); 950 return (EIO); 951 } 952 953 if (memcmp(type, VDEV_TYPE_MIRROR, len) == 0) 954 vdev = vdev_create(guid, vdev_mirror_read); 955 else if (memcmp(type, VDEV_TYPE_RAIDZ, len) == 0) 956 vdev = vdev_create(guid, vdev_raidz_read); 957 else if (memcmp(type, VDEV_TYPE_REPLACING, len) == 0) 958 vdev = vdev_create(guid, vdev_replacing_read); 959 else if (memcmp(type, VDEV_TYPE_INDIRECT, len) == 0) { 960 vdev_indirect_config_t *vic; 961 962 vdev = vdev_create(guid, vdev_indirect_read); 963 if (vdev != NULL) { 964 vdev->v_state = VDEV_STATE_HEALTHY; 965 vic = &vdev->vdev_indirect_config; 966 967 nvlist_find(nvlist, 968 ZPOOL_CONFIG_INDIRECT_OBJECT, 969 DATA_TYPE_UINT64, 970 NULL, &vic->vic_mapping_object, NULL); 971 nvlist_find(nvlist, 972 ZPOOL_CONFIG_INDIRECT_BIRTHS, 973 DATA_TYPE_UINT64, 974 NULL, &vic->vic_births_object, NULL); 975 nvlist_find(nvlist, 976 ZPOOL_CONFIG_PREV_INDIRECT_VDEV, 977 DATA_TYPE_UINT64, 978 NULL, &vic->vic_prev_indirect_vdev, NULL); 979 } 980 } else if (memcmp(type, VDEV_TYPE_HOLE, len) == 0) { 981 vdev = vdev_create(guid, vdev_missing_read); 982 } else { 983 vdev = vdev_create(guid, vdev_disk_read); 984 } 985 986 if (vdev == NULL) 987 return (ENOMEM); 988 989 vdev_set_initial_state(vdev, nvlist); 990 vdev->v_id = id; 991 if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT, 992 DATA_TYPE_UINT64, NULL, &ashift, NULL) == 0) 993 vdev->v_ashift = ashift; 994 995 if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE, 996 DATA_TYPE_UINT64, NULL, &asize, NULL) == 0) { 997 vdev->v_psize = asize + 998 VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; 999 } 1000 1001 if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY, 1002 DATA_TYPE_UINT64, NULL, &nparity, NULL) == 0) 1003 vdev->v_nparity = nparity; 1004 1005 if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH, 1006 DATA_TYPE_STRING, NULL, &path, &pathlen) == 0) { 1007 char prefix[] = "/dev/"; 1008 1009 len = strlen(prefix); 1010 if (len < pathlen && memcmp(path, prefix, len) == 0) { 1011 path += len; 1012 pathlen -= len; 1013 } 1014 name = malloc(pathlen + 1); 1015 bcopy(path, name, pathlen); 1016 name[pathlen] = '\0'; 1017 vdev->v_name = name; 1018 } else { 1019 name = NULL; 1020 if (memcmp(type, VDEV_TYPE_RAIDZ, len) == 0) { 1021 if (vdev->v_nparity < 1 || 1022 vdev->v_nparity > 3) { 1023 printf("ZFS: invalid raidz parity: %d\n", 1024 vdev->v_nparity); 1025 return (EIO); 1026 } 1027 (void) asprintf(&name, "%.*s%d-%" PRIu64, len, type, 1028 vdev->v_nparity, id); 1029 } else { 1030 (void) asprintf(&name, "%.*s-%" PRIu64, len, type, id); 1031 } 1032 vdev->v_name = name; 1033 } 1034 *vdevp = vdev; 1035 return (0); 1036 } 1037 1038 /* 1039 * Find slot for vdev. We return either NULL to signal to use 1040 * STAILQ_INSERT_HEAD, or we return link element to be used with 1041 * STAILQ_INSERT_AFTER. 1042 */ 1043 static vdev_t * 1044 vdev_find_previous(vdev_t *top_vdev, vdev_t *vdev) 1045 { 1046 vdev_t *v, *previous; 1047 1048 if (STAILQ_EMPTY(&top_vdev->v_children)) 1049 return (NULL); 1050 1051 previous = NULL; 1052 STAILQ_FOREACH(v, &top_vdev->v_children, v_childlink) { 1053 if (v->v_id > vdev->v_id) 1054 return (previous); 1055 1056 if (v->v_id == vdev->v_id) 1057 return (v); 1058 1059 if (v->v_id < vdev->v_id) 1060 previous = v; 1061 } 1062 return (previous); 1063 } 1064 1065 static size_t 1066 vdev_child_count(vdev_t *vdev) 1067 { 1068 vdev_t *v; 1069 size_t count; 1070 1071 count = 0; 1072 STAILQ_FOREACH(v, &vdev->v_children, v_childlink) { 1073 count++; 1074 } 1075 return (count); 1076 } 1077 1078 /* 1079 * Insert vdev into top_vdev children list. List is ordered by v_id. 1080 */ 1081 static void 1082 vdev_insert(vdev_t *top_vdev, vdev_t *vdev) 1083 { 1084 vdev_t *previous; 1085 size_t count; 1086 1087 /* 1088 * The top level vdev can appear in random order, depending how 1089 * the firmware is presenting the disk devices. 1090 * However, we will insert vdev to create list ordered by v_id, 1091 * so we can use either STAILQ_INSERT_HEAD or STAILQ_INSERT_AFTER 1092 * as STAILQ does not have insert before. 1093 */ 1094 previous = vdev_find_previous(top_vdev, vdev); 1095 1096 if (previous == NULL) { 1097 STAILQ_INSERT_HEAD(&top_vdev->v_children, vdev, v_childlink); 1098 } else if (previous->v_id == vdev->v_id) { 1099 /* 1100 * This vdev was configured from label config, 1101 * do not insert duplicate. 1102 */ 1103 return; 1104 } else { 1105 STAILQ_INSERT_AFTER(&top_vdev->v_children, previous, vdev, 1106 v_childlink); 1107 } 1108 1109 count = vdev_child_count(top_vdev); 1110 if (top_vdev->v_nchildren < count) 1111 top_vdev->v_nchildren = count; 1112 } 1113 1114 static int 1115 vdev_from_nvlist(spa_t *spa, uint64_t top_guid, const nvlist_t *nvlist) 1116 { 1117 vdev_t *top_vdev, *vdev; 1118 nvlist_t **kids = NULL; 1119 int rc, nkids; 1120 1121 /* Get top vdev. */ 1122 top_vdev = vdev_find(top_guid); 1123 if (top_vdev == NULL) { 1124 rc = vdev_init(top_guid, nvlist, &top_vdev); 1125 if (rc != 0) 1126 return (rc); 1127 top_vdev->v_spa = spa; 1128 top_vdev->v_top = top_vdev; 1129 vdev_insert(spa->spa_root_vdev, top_vdev); 1130 } 1131 1132 /* Add children if there are any. */ 1133 rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY, 1134 &nkids, &kids, NULL); 1135 if (rc == 0) { 1136 for (int i = 0; i < nkids; i++) { 1137 uint64_t guid; 1138 1139 rc = nvlist_find(kids[i], ZPOOL_CONFIG_GUID, 1140 DATA_TYPE_UINT64, NULL, &guid, NULL); 1141 if (rc != 0) 1142 goto done; 1143 1144 rc = vdev_init(guid, kids[i], &vdev); 1145 if (rc != 0) 1146 goto done; 1147 1148 vdev->v_spa = spa; 1149 vdev->v_top = top_vdev; 1150 vdev_insert(top_vdev, vdev); 1151 } 1152 } else { 1153 /* 1154 * When there are no children, nvlist_find() does return 1155 * error, reset it because leaf devices have no children. 1156 */ 1157 rc = 0; 1158 } 1159 done: 1160 if (kids != NULL) { 1161 for (int i = 0; i < nkids; i++) 1162 nvlist_destroy(kids[i]); 1163 free(kids); 1164 } 1165 1166 return (rc); 1167 } 1168 1169 static int 1170 vdev_init_from_label(spa_t *spa, const nvlist_t *nvlist) 1171 { 1172 uint64_t pool_guid, top_guid; 1173 nvlist_t *vdevs; 1174 int rc; 1175 1176 if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64, 1177 NULL, &pool_guid, NULL) || 1178 nvlist_find(nvlist, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64, 1179 NULL, &top_guid, NULL) || 1180 nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST, 1181 NULL, &vdevs, NULL)) { 1182 printf("ZFS: can't find vdev details\n"); 1183 return (ENOENT); 1184 } 1185 1186 rc = vdev_from_nvlist(spa, top_guid, vdevs); 1187 nvlist_destroy(vdevs); 1188 return (rc); 1189 } 1190 1191 static void 1192 vdev_set_state(vdev_t *vdev) 1193 { 1194 vdev_t *kid; 1195 int good_kids; 1196 int bad_kids; 1197 1198 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) { 1199 vdev_set_state(kid); 1200 } 1201 1202 /* 1203 * A mirror or raidz is healthy if all its kids are healthy. A 1204 * mirror is degraded if any of its kids is healthy; a raidz 1205 * is degraded if at most nparity kids are offline. 1206 */ 1207 if (STAILQ_FIRST(&vdev->v_children)) { 1208 good_kids = 0; 1209 bad_kids = 0; 1210 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) { 1211 if (kid->v_state == VDEV_STATE_HEALTHY) 1212 good_kids++; 1213 else 1214 bad_kids++; 1215 } 1216 if (bad_kids == 0) { 1217 vdev->v_state = VDEV_STATE_HEALTHY; 1218 } else { 1219 if (vdev->v_read == vdev_mirror_read) { 1220 if (good_kids) { 1221 vdev->v_state = VDEV_STATE_DEGRADED; 1222 } else { 1223 vdev->v_state = VDEV_STATE_OFFLINE; 1224 } 1225 } else if (vdev->v_read == vdev_raidz_read) { 1226 if (bad_kids > vdev->v_nparity) { 1227 vdev->v_state = VDEV_STATE_OFFLINE; 1228 } else { 1229 vdev->v_state = VDEV_STATE_DEGRADED; 1230 } 1231 } 1232 } 1233 } 1234 } 1235 1236 static int 1237 vdev_update_from_nvlist(uint64_t top_guid, const nvlist_t *nvlist) 1238 { 1239 vdev_t *vdev; 1240 nvlist_t **kids = NULL; 1241 int rc, nkids; 1242 1243 /* Update top vdev. */ 1244 vdev = vdev_find(top_guid); 1245 if (vdev != NULL) 1246 vdev_set_initial_state(vdev, nvlist); 1247 1248 /* Update children if there are any. */ 1249 rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY, 1250 &nkids, &kids, NULL); 1251 if (rc == 0) { 1252 for (int i = 0; i < nkids; i++) { 1253 uint64_t guid; 1254 1255 rc = nvlist_find(kids[i], ZPOOL_CONFIG_GUID, 1256 DATA_TYPE_UINT64, NULL, &guid, NULL); 1257 if (rc != 0) 1258 break; 1259 1260 vdev = vdev_find(guid); 1261 if (vdev != NULL) 1262 vdev_set_initial_state(vdev, kids[i]); 1263 } 1264 } else { 1265 rc = 0; 1266 } 1267 if (kids != NULL) { 1268 for (int i = 0; i < nkids; i++) 1269 nvlist_destroy(kids[i]); 1270 free(kids); 1271 } 1272 1273 return (rc); 1274 } 1275 1276 static int 1277 vdev_init_from_nvlist(spa_t *spa, const nvlist_t *nvlist) 1278 { 1279 uint64_t pool_guid, vdev_children; 1280 nvlist_t *vdevs = NULL, **kids = NULL; 1281 int rc, nkids; 1282 1283 if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64, 1284 NULL, &pool_guid, NULL) || 1285 nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_CHILDREN, DATA_TYPE_UINT64, 1286 NULL, &vdev_children, NULL) || 1287 nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST, 1288 NULL, &vdevs, NULL)) { 1289 printf("ZFS: can't find vdev details\n"); 1290 return (ENOENT); 1291 } 1292 1293 /* Wrong guid?! */ 1294 if (spa->spa_guid != pool_guid) { 1295 nvlist_destroy(vdevs); 1296 return (EINVAL); 1297 } 1298 1299 spa->spa_root_vdev->v_nchildren = vdev_children; 1300 1301 rc = nvlist_find(vdevs, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY, 1302 &nkids, &kids, NULL); 1303 nvlist_destroy(vdevs); 1304 1305 /* 1306 * MOS config has at least one child for root vdev. 1307 */ 1308 if (rc != 0) 1309 return (rc); 1310 1311 for (int i = 0; i < nkids; i++) { 1312 uint64_t guid; 1313 vdev_t *vdev; 1314 1315 rc = nvlist_find(kids[i], ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64, 1316 NULL, &guid, NULL); 1317 if (rc != 0) 1318 break; 1319 vdev = vdev_find(guid); 1320 /* 1321 * Top level vdev is missing, create it. 1322 */ 1323 if (vdev == NULL) 1324 rc = vdev_from_nvlist(spa, guid, kids[i]); 1325 else 1326 rc = vdev_update_from_nvlist(guid, kids[i]); 1327 if (rc != 0) 1328 break; 1329 } 1330 if (kids != NULL) { 1331 for (int i = 0; i < nkids; i++) 1332 nvlist_destroy(kids[i]); 1333 free(kids); 1334 } 1335 1336 /* 1337 * Re-evaluate top-level vdev state. 1338 */ 1339 vdev_set_state(spa->spa_root_vdev); 1340 1341 return (rc); 1342 } 1343 1344 static spa_t * 1345 spa_find_by_guid(uint64_t guid) 1346 { 1347 spa_t *spa; 1348 1349 STAILQ_FOREACH(spa, &zfs_pools, spa_link) 1350 if (spa->spa_guid == guid) 1351 return (spa); 1352 1353 return (NULL); 1354 } 1355 1356 static spa_t * 1357 spa_find_by_name(const char *name) 1358 { 1359 spa_t *spa; 1360 1361 STAILQ_FOREACH(spa, &zfs_pools, spa_link) 1362 if (strcmp(spa->spa_name, name) == 0) 1363 return (spa); 1364 1365 return (NULL); 1366 } 1367 1368 static spa_t * 1369 spa_find_by_dev(struct zfs_devdesc *dev) 1370 { 1371 1372 if (dev->dd.d_dev->dv_type != DEVT_ZFS) 1373 return (NULL); 1374 1375 if (dev->pool_guid == 0) 1376 return (STAILQ_FIRST(&zfs_pools)); 1377 1378 return (spa_find_by_guid(dev->pool_guid)); 1379 } 1380 1381 static spa_t * 1382 spa_create(uint64_t guid, const char *name) 1383 { 1384 spa_t *spa; 1385 1386 if ((spa = calloc(1, sizeof(spa_t))) == NULL) 1387 return (NULL); 1388 if ((spa->spa_name = strdup(name)) == NULL) { 1389 free(spa); 1390 return (NULL); 1391 } 1392 spa->spa_uberblock = &spa->spa_uberblock_master; 1393 spa->spa_mos = &spa->spa_mos_master; 1394 spa->spa_guid = guid; 1395 spa->spa_root_vdev = vdev_create(guid, NULL); 1396 if (spa->spa_root_vdev == NULL) { 1397 free(spa->spa_name); 1398 free(spa); 1399 return (NULL); 1400 } 1401 spa->spa_root_vdev->v_name = strdup("root"); 1402 STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link); 1403 1404 return (spa); 1405 } 1406 1407 static const char * 1408 state_name(vdev_state_t state) 1409 { 1410 static const char *names[] = { 1411 "UNKNOWN", 1412 "CLOSED", 1413 "OFFLINE", 1414 "REMOVED", 1415 "CANT_OPEN", 1416 "FAULTED", 1417 "DEGRADED", 1418 "ONLINE" 1419 }; 1420 return (names[state]); 1421 } 1422 1423 #ifdef BOOT2 1424 1425 #define pager_printf printf 1426 1427 #else 1428 1429 static int 1430 pager_printf(const char *fmt, ...) 1431 { 1432 char line[80]; 1433 va_list args; 1434 1435 va_start(args, fmt); 1436 vsnprintf(line, sizeof(line), fmt, args); 1437 va_end(args); 1438 return (pager_output(line)); 1439 } 1440 1441 #endif 1442 1443 #define STATUS_FORMAT " %s %s\n" 1444 1445 static int 1446 print_state(int indent, const char *name, vdev_state_t state) 1447 { 1448 int i; 1449 char buf[512]; 1450 1451 buf[0] = 0; 1452 for (i = 0; i < indent; i++) 1453 strcat(buf, " "); 1454 strcat(buf, name); 1455 return (pager_printf(STATUS_FORMAT, buf, state_name(state))); 1456 } 1457 1458 static int 1459 vdev_status(vdev_t *vdev, int indent) 1460 { 1461 vdev_t *kid; 1462 int ret; 1463 1464 if (vdev->v_islog) { 1465 (void) pager_output(" logs\n"); 1466 indent++; 1467 } 1468 1469 ret = print_state(indent, vdev->v_name, vdev->v_state); 1470 if (ret != 0) 1471 return (ret); 1472 1473 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) { 1474 ret = vdev_status(kid, indent + 1); 1475 if (ret != 0) 1476 return (ret); 1477 } 1478 return (ret); 1479 } 1480 1481 static int 1482 spa_status(spa_t *spa) 1483 { 1484 static char bootfs[ZFS_MAXNAMELEN]; 1485 uint64_t rootid; 1486 vdev_list_t *vlist; 1487 vdev_t *vdev; 1488 int good_kids, bad_kids, degraded_kids, ret; 1489 vdev_state_t state; 1490 1491 ret = pager_printf(" pool: %s\n", spa->spa_name); 1492 if (ret != 0) 1493 return (ret); 1494 1495 if (zfs_get_root(spa, &rootid) == 0 && 1496 zfs_rlookup(spa, rootid, bootfs) == 0) { 1497 if (bootfs[0] == '\0') 1498 ret = pager_printf("bootfs: %s\n", spa->spa_name); 1499 else 1500 ret = pager_printf("bootfs: %s/%s\n", spa->spa_name, 1501 bootfs); 1502 if (ret != 0) 1503 return (ret); 1504 } 1505 ret = pager_printf("config:\n\n"); 1506 if (ret != 0) 1507 return (ret); 1508 ret = pager_printf(STATUS_FORMAT, "NAME", "STATE"); 1509 if (ret != 0) 1510 return (ret); 1511 1512 good_kids = 0; 1513 degraded_kids = 0; 1514 bad_kids = 0; 1515 vlist = &spa->spa_root_vdev->v_children; 1516 STAILQ_FOREACH(vdev, vlist, v_childlink) { 1517 if (vdev->v_state == VDEV_STATE_HEALTHY) 1518 good_kids++; 1519 else if (vdev->v_state == VDEV_STATE_DEGRADED) 1520 degraded_kids++; 1521 else 1522 bad_kids++; 1523 } 1524 1525 state = VDEV_STATE_CLOSED; 1526 if (good_kids > 0 && (degraded_kids + bad_kids) == 0) 1527 state = VDEV_STATE_HEALTHY; 1528 else if ((good_kids + degraded_kids) > 0) 1529 state = VDEV_STATE_DEGRADED; 1530 1531 ret = print_state(0, spa->spa_name, state); 1532 if (ret != 0) 1533 return (ret); 1534 1535 STAILQ_FOREACH(vdev, vlist, v_childlink) { 1536 ret = vdev_status(vdev, 1); 1537 if (ret != 0) 1538 return (ret); 1539 } 1540 return (ret); 1541 } 1542 1543 static int 1544 spa_all_status(void) 1545 { 1546 spa_t *spa; 1547 int first = 1, ret = 0; 1548 1549 STAILQ_FOREACH(spa, &zfs_pools, spa_link) { 1550 if (!first) { 1551 ret = pager_printf("\n"); 1552 if (ret != 0) 1553 return (ret); 1554 } 1555 first = 0; 1556 ret = spa_status(spa); 1557 if (ret != 0) 1558 return (ret); 1559 } 1560 return (ret); 1561 } 1562 1563 static uint64_t 1564 vdev_label_offset(uint64_t psize, int l, uint64_t offset) 1565 { 1566 uint64_t label_offset; 1567 1568 if (l < VDEV_LABELS / 2) 1569 label_offset = 0; 1570 else 1571 label_offset = psize - VDEV_LABELS * sizeof (vdev_label_t); 1572 1573 return (offset + l * sizeof (vdev_label_t) + label_offset); 1574 } 1575 1576 static int 1577 vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) 1578 { 1579 unsigned int seq1 = 0; 1580 unsigned int seq2 = 0; 1581 int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg); 1582 1583 if (cmp != 0) 1584 return (cmp); 1585 1586 cmp = AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp); 1587 if (cmp != 0) 1588 return (cmp); 1589 1590 if (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1)) 1591 seq1 = MMP_SEQ(ub1); 1592 1593 if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2)) 1594 seq2 = MMP_SEQ(ub2); 1595 1596 return (AVL_CMP(seq1, seq2)); 1597 } 1598 1599 static int 1600 uberblock_verify(uberblock_t *ub) 1601 { 1602 if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC)) { 1603 byteswap_uint64_array(ub, sizeof (uberblock_t)); 1604 } 1605 1606 if (ub->ub_magic != UBERBLOCK_MAGIC || 1607 !SPA_VERSION_IS_SUPPORTED(ub->ub_version)) 1608 return (EINVAL); 1609 1610 return (0); 1611 } 1612 1613 static int 1614 vdev_label_read(vdev_t *vd, int l, void *buf, uint64_t offset, 1615 size_t size) 1616 { 1617 blkptr_t bp; 1618 off_t off; 1619 1620 off = vdev_label_offset(vd->v_psize, l, offset); 1621 1622 BP_ZERO(&bp); 1623 BP_SET_LSIZE(&bp, size); 1624 BP_SET_PSIZE(&bp, size); 1625 BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL); 1626 BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); 1627 DVA_SET_OFFSET(BP_IDENTITY(&bp), off); 1628 ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0); 1629 1630 return (vdev_read_phys(vd, &bp, buf, off, size)); 1631 } 1632 1633 /* 1634 * We do need to be sure we write to correct location. 1635 * Our vdev label does consist of 4 fields: 1636 * pad1 (8k), reserved. 1637 * bootenv (8k), checksummed, previously reserved, may contian garbage. 1638 * vdev_phys (112k), checksummed 1639 * uberblock ring (128k), checksummed. 1640 * 1641 * Since bootenv area may contain garbage, we can not reliably read it, as 1642 * we can get checksum errors. 1643 * Next best thing is vdev_phys - it is just after bootenv. It still may 1644 * be corrupted, but in such case we will miss this one write. 1645 */ 1646 static int 1647 vdev_label_write_validate(vdev_t *vd, int l, uint64_t offset) 1648 { 1649 uint64_t off, o_phys; 1650 void *buf; 1651 size_t size = VDEV_PHYS_SIZE; 1652 int rc; 1653 1654 o_phys = offsetof(vdev_label_t, vl_vdev_phys); 1655 off = vdev_label_offset(vd->v_psize, l, o_phys); 1656 1657 /* off should be 8K from bootenv */ 1658 if (vdev_label_offset(vd->v_psize, l, offset) + VDEV_PAD_SIZE != off) 1659 return (EINVAL); 1660 1661 buf = malloc(size); 1662 if (buf == NULL) 1663 return (ENOMEM); 1664 1665 /* Read vdev_phys */ 1666 rc = vdev_label_read(vd, l, buf, o_phys, size); 1667 free(buf); 1668 return (rc); 1669 } 1670 1671 static int 1672 vdev_label_write(vdev_t *vd, int l, vdev_boot_envblock_t *be, uint64_t offset) 1673 { 1674 zio_checksum_info_t *ci; 1675 zio_cksum_t cksum; 1676 off_t off; 1677 size_t size = VDEV_PAD_SIZE; 1678 int rc; 1679 1680 if (vd->v_phys_write == NULL) 1681 return (ENOTSUP); 1682 1683 off = vdev_label_offset(vd->v_psize, l, offset); 1684 1685 rc = vdev_label_write_validate(vd, l, offset); 1686 if (rc != 0) { 1687 return (rc); 1688 } 1689 1690 ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL]; 1691 be->vbe_zbt.zec_magic = ZEC_MAGIC; 1692 zio_checksum_label_verifier(&be->vbe_zbt.zec_cksum, off); 1693 ci->ci_func[0](be, size, NULL, &cksum); 1694 be->vbe_zbt.zec_cksum = cksum; 1695 1696 return (vdev_write_phys(vd, be, off, size)); 1697 } 1698 1699 static int 1700 vdev_write_bootenv_impl(vdev_t *vdev, vdev_boot_envblock_t *be) 1701 { 1702 vdev_t *kid; 1703 int rv = 0, rc; 1704 1705 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) { 1706 if (kid->v_state != VDEV_STATE_HEALTHY) 1707 continue; 1708 rc = vdev_write_bootenv_impl(kid, be); 1709 if (rv == 0) 1710 rv = rc; 1711 } 1712 1713 /* 1714 * Non-leaf vdevs do not have v_phys_write. 1715 */ 1716 if (vdev->v_phys_write == NULL) 1717 return (rv); 1718 1719 for (int l = 0; l < VDEV_LABELS; l++) { 1720 rc = vdev_label_write(vdev, l, be, 1721 offsetof(vdev_label_t, vl_be)); 1722 if (rc != 0) { 1723 printf("failed to write bootenv to %s label %d: %d\n", 1724 vdev->v_name ? vdev->v_name : "unknown", l, rc); 1725 rv = rc; 1726 } 1727 } 1728 return (rv); 1729 } 1730 1731 int 1732 vdev_write_bootenv(vdev_t *vdev, nvlist_t *nvl) 1733 { 1734 vdev_boot_envblock_t *be; 1735 nvlist_t nv, *nvp; 1736 uint64_t version; 1737 int rv; 1738 1739 if (nvl->nv_size > sizeof(be->vbe_bootenv)) 1740 return (E2BIG); 1741 1742 version = VB_RAW; 1743 nvp = vdev_read_bootenv(vdev); 1744 if (nvp != NULL) { 1745 nvlist_find(nvp, BOOTENV_VERSION, DATA_TYPE_UINT64, NULL, 1746 &version, NULL); 1747 nvlist_destroy(nvp); 1748 } 1749 1750 be = calloc(1, sizeof(*be)); 1751 if (be == NULL) 1752 return (ENOMEM); 1753 1754 be->vbe_version = version; 1755 switch (version) { 1756 case VB_RAW: 1757 /* 1758 * If there is no envmap, we will just wipe bootenv. 1759 */ 1760 nvlist_find(nvl, GRUB_ENVMAP, DATA_TYPE_STRING, NULL, 1761 be->vbe_bootenv, NULL); 1762 rv = 0; 1763 break; 1764 1765 case VB_NVLIST: 1766 nv.nv_header = nvl->nv_header; 1767 nv.nv_asize = nvl->nv_asize; 1768 nv.nv_size = nvl->nv_size; 1769 1770 bcopy(&nv.nv_header, be->vbe_bootenv, sizeof(nv.nv_header)); 1771 nv.nv_data = be->vbe_bootenv + sizeof(nvs_header_t); 1772 bcopy(nvl->nv_data, nv.nv_data, nv.nv_size); 1773 rv = nvlist_export(&nv); 1774 break; 1775 1776 default: 1777 rv = EINVAL; 1778 break; 1779 } 1780 1781 if (rv == 0) { 1782 be->vbe_version = htobe64(be->vbe_version); 1783 rv = vdev_write_bootenv_impl(vdev, be); 1784 } 1785 free(be); 1786 return (rv); 1787 } 1788 1789 /* 1790 * Read the bootenv area from pool label, return the nvlist from it. 1791 * We return from first successful read. 1792 */ 1793 nvlist_t * 1794 vdev_read_bootenv(vdev_t *vdev) 1795 { 1796 vdev_t *kid; 1797 nvlist_t *benv; 1798 vdev_boot_envblock_t *be; 1799 char *command; 1800 bool ok; 1801 int rv; 1802 1803 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) { 1804 if (kid->v_state != VDEV_STATE_HEALTHY) 1805 continue; 1806 1807 benv = vdev_read_bootenv(kid); 1808 if (benv != NULL) 1809 return (benv); 1810 } 1811 1812 be = malloc(sizeof (*be)); 1813 if (be == NULL) 1814 return (NULL); 1815 1816 rv = 0; 1817 for (int l = 0; l < VDEV_LABELS; l++) { 1818 rv = vdev_label_read(vdev, l, be, 1819 offsetof(vdev_label_t, vl_be), 1820 sizeof (*be)); 1821 if (rv == 0) 1822 break; 1823 } 1824 if (rv != 0) { 1825 free(be); 1826 return (NULL); 1827 } 1828 1829 be->vbe_version = be64toh(be->vbe_version); 1830 switch (be->vbe_version) { 1831 case VB_RAW: 1832 /* 1833 * we have textual data in vbe_bootenv, create nvlist 1834 * with key "envmap". 1835 */ 1836 benv = nvlist_create(NV_UNIQUE_NAME); 1837 if (benv != NULL) { 1838 if (*be->vbe_bootenv == '\0') { 1839 nvlist_add_uint64(benv, BOOTENV_VERSION, 1840 VB_NVLIST); 1841 break; 1842 } 1843 nvlist_add_uint64(benv, BOOTENV_VERSION, VB_RAW); 1844 be->vbe_bootenv[sizeof (be->vbe_bootenv) - 1] = '\0'; 1845 nvlist_add_string(benv, GRUB_ENVMAP, be->vbe_bootenv); 1846 } 1847 break; 1848 1849 case VB_NVLIST: 1850 benv = nvlist_import(be->vbe_bootenv, sizeof(be->vbe_bootenv)); 1851 break; 1852 1853 default: 1854 command = (char *)be; 1855 ok = false; 1856 1857 /* Check for legacy zfsbootcfg command string */ 1858 for (int i = 0; command[i] != '\0'; i++) { 1859 if (iscntrl(command[i])) { 1860 ok = false; 1861 break; 1862 } else { 1863 ok = true; 1864 } 1865 } 1866 benv = nvlist_create(NV_UNIQUE_NAME); 1867 if (benv != NULL) { 1868 if (ok) 1869 nvlist_add_string(benv, FREEBSD_BOOTONCE, 1870 command); 1871 else 1872 nvlist_add_uint64(benv, BOOTENV_VERSION, 1873 VB_NVLIST); 1874 } 1875 break; 1876 } 1877 free(be); 1878 return (benv); 1879 } 1880 1881 static uint64_t 1882 vdev_get_label_asize(nvlist_t *nvl) 1883 { 1884 nvlist_t *vdevs; 1885 uint64_t asize; 1886 const char *type; 1887 int len; 1888 1889 asize = 0; 1890 /* Get vdev tree */ 1891 if (nvlist_find(nvl, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST, 1892 NULL, &vdevs, NULL) != 0) 1893 return (asize); 1894 1895 /* 1896 * Get vdev type. We will calculate asize for raidz, mirror and disk. 1897 * For raidz, the asize is raw size of all children. 1898 */ 1899 if (nvlist_find(vdevs, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING, 1900 NULL, &type, &len) != 0) 1901 goto done; 1902 1903 if (memcmp(type, VDEV_TYPE_MIRROR, len) != 0 && 1904 memcmp(type, VDEV_TYPE_DISK, len) != 0 && 1905 memcmp(type, VDEV_TYPE_RAIDZ, len) != 0) 1906 goto done; 1907 1908 if (nvlist_find(vdevs, ZPOOL_CONFIG_ASIZE, DATA_TYPE_UINT64, 1909 NULL, &asize, NULL) != 0) 1910 goto done; 1911 1912 if (memcmp(type, VDEV_TYPE_RAIDZ, len) == 0) { 1913 nvlist_t **kids; 1914 int nkids; 1915 1916 if (nvlist_find(vdevs, ZPOOL_CONFIG_CHILDREN, 1917 DATA_TYPE_NVLIST_ARRAY, &nkids, &kids, NULL) != 0) { 1918 asize = 0; 1919 goto done; 1920 } 1921 1922 asize /= nkids; 1923 for (int i = 0; i < nkids; i++) 1924 nvlist_destroy(kids[i]); 1925 free(kids); 1926 } 1927 1928 asize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; 1929 done: 1930 nvlist_destroy(vdevs); 1931 return (asize); 1932 } 1933 1934 static nvlist_t * 1935 vdev_label_read_config(vdev_t *vd, uint64_t txg) 1936 { 1937 vdev_phys_t *label; 1938 uint64_t best_txg = 0; 1939 uint64_t label_txg = 0; 1940 uint64_t asize; 1941 nvlist_t *nvl = NULL, *tmp; 1942 int error; 1943 1944 label = malloc(sizeof (vdev_phys_t)); 1945 if (label == NULL) 1946 return (NULL); 1947 1948 for (int l = 0; l < VDEV_LABELS; l++) { 1949 if (vdev_label_read(vd, l, label, 1950 offsetof(vdev_label_t, vl_vdev_phys), 1951 sizeof (vdev_phys_t))) 1952 continue; 1953 1954 tmp = nvlist_import(label->vp_nvlist, 1955 sizeof(label->vp_nvlist)); 1956 if (tmp == NULL) 1957 continue; 1958 1959 error = nvlist_find(tmp, ZPOOL_CONFIG_POOL_TXG, 1960 DATA_TYPE_UINT64, NULL, &label_txg, NULL); 1961 if (error != 0 || label_txg == 0) { 1962 nvlist_destroy(nvl); 1963 nvl = tmp; 1964 goto done; 1965 } 1966 1967 if (label_txg <= txg && label_txg > best_txg) { 1968 best_txg = label_txg; 1969 nvlist_destroy(nvl); 1970 nvl = tmp; 1971 tmp = NULL; 1972 1973 /* 1974 * Use asize from pool config. We need this 1975 * because we can get bad value from BIOS. 1976 */ 1977 asize = vdev_get_label_asize(nvl); 1978 if (asize != 0) { 1979 vd->v_psize = asize; 1980 } 1981 } 1982 nvlist_destroy(tmp); 1983 } 1984 1985 if (best_txg == 0) { 1986 nvlist_destroy(nvl); 1987 nvl = NULL; 1988 } 1989 done: 1990 free(label); 1991 return (nvl); 1992 } 1993 1994 static void 1995 vdev_uberblock_load(vdev_t *vd, uberblock_t *ub) 1996 { 1997 uberblock_t *buf; 1998 1999 buf = malloc(VDEV_UBERBLOCK_SIZE(vd)); 2000 if (buf == NULL) 2001 return; 2002 2003 for (int l = 0; l < VDEV_LABELS; l++) { 2004 for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { 2005 if (vdev_label_read(vd, l, buf, 2006 VDEV_UBERBLOCK_OFFSET(vd, n), 2007 VDEV_UBERBLOCK_SIZE(vd))) 2008 continue; 2009 if (uberblock_verify(buf) != 0) 2010 continue; 2011 2012 if (vdev_uberblock_compare(buf, ub) > 0) 2013 *ub = *buf; 2014 } 2015 } 2016 free(buf); 2017 } 2018 2019 static int 2020 vdev_probe(vdev_phys_read_t *_read, vdev_phys_write_t *_write, void *priv, 2021 spa_t **spap) 2022 { 2023 vdev_t vtmp; 2024 spa_t *spa; 2025 vdev_t *vdev; 2026 nvlist_t *nvl; 2027 uint64_t val; 2028 uint64_t guid, vdev_children; 2029 uint64_t pool_txg, pool_guid; 2030 const char *pool_name; 2031 int rc, namelen; 2032 2033 /* 2034 * Load the vdev label and figure out which 2035 * uberblock is most current. 2036 */ 2037 memset(&vtmp, 0, sizeof(vtmp)); 2038 vtmp.v_phys_read = _read; 2039 vtmp.v_phys_write = _write; 2040 vtmp.v_priv = priv; 2041 vtmp.v_psize = P2ALIGN(ldi_get_size(priv), 2042 (uint64_t)sizeof (vdev_label_t)); 2043 2044 /* Test for minimum device size. */ 2045 if (vtmp.v_psize < SPA_MINDEVSIZE) 2046 return (EIO); 2047 2048 nvl = vdev_label_read_config(&vtmp, UINT64_MAX); 2049 if (nvl == NULL) 2050 return (EIO); 2051 2052 if (nvlist_find(nvl, ZPOOL_CONFIG_VERSION, DATA_TYPE_UINT64, 2053 NULL, &val, NULL) != 0) { 2054 nvlist_destroy(nvl); 2055 return (EIO); 2056 } 2057 2058 if (!SPA_VERSION_IS_SUPPORTED(val)) { 2059 printf("ZFS: unsupported ZFS version %u (should be %u)\n", 2060 (unsigned)val, (unsigned)SPA_VERSION); 2061 nvlist_destroy(nvl); 2062 return (EIO); 2063 } 2064 2065 /* Check ZFS features for read */ 2066 rc = nvlist_check_features_for_read(nvl); 2067 if (rc != 0) { 2068 nvlist_destroy(nvl); 2069 return (EIO); 2070 } 2071 2072 if (nvlist_find(nvl, ZPOOL_CONFIG_POOL_STATE, DATA_TYPE_UINT64, 2073 NULL, &val, NULL) != 0) { 2074 nvlist_destroy(nvl); 2075 return (EIO); 2076 } 2077 2078 if (val == POOL_STATE_DESTROYED) { 2079 /* We don't boot only from destroyed pools. */ 2080 nvlist_destroy(nvl); 2081 return (EIO); 2082 } 2083 2084 if (nvlist_find(nvl, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64, 2085 NULL, &pool_txg, NULL) != 0 || 2086 nvlist_find(nvl, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64, 2087 NULL, &pool_guid, NULL) != 0 || 2088 nvlist_find(nvl, ZPOOL_CONFIG_POOL_NAME, DATA_TYPE_STRING, 2089 NULL, &pool_name, &namelen) != 0) { 2090 /* 2091 * Cache and spare devices end up here - just ignore 2092 * them. 2093 */ 2094 nvlist_destroy(nvl); 2095 return (EIO); 2096 } 2097 2098 /* 2099 * Create the pool if this is the first time we've seen it. 2100 */ 2101 spa = spa_find_by_guid(pool_guid); 2102 if (spa == NULL) { 2103 char *name; 2104 2105 nvlist_find(nvl, ZPOOL_CONFIG_VDEV_CHILDREN, 2106 DATA_TYPE_UINT64, NULL, &vdev_children, NULL); 2107 name = malloc(namelen + 1); 2108 if (name == NULL) { 2109 nvlist_destroy(nvl); 2110 return (ENOMEM); 2111 } 2112 bcopy(pool_name, name, namelen); 2113 name[namelen] = '\0'; 2114 spa = spa_create(pool_guid, name); 2115 free(name); 2116 if (spa == NULL) { 2117 nvlist_destroy(nvl); 2118 return (ENOMEM); 2119 } 2120 spa->spa_root_vdev->v_nchildren = vdev_children; 2121 } 2122 if (pool_txg > spa->spa_txg) 2123 spa->spa_txg = pool_txg; 2124 2125 /* 2126 * Get the vdev tree and create our in-core copy of it. 2127 * If we already have a vdev with this guid, this must 2128 * be some kind of alias (overlapping slices, dangerously dedicated 2129 * disks etc). 2130 */ 2131 if (nvlist_find(nvl, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64, 2132 NULL, &guid, NULL) != 0) { 2133 nvlist_destroy(nvl); 2134 return (EIO); 2135 } 2136 vdev = vdev_find(guid); 2137 /* Has this vdev already been inited? */ 2138 if (vdev && vdev->v_phys_read) { 2139 nvlist_destroy(nvl); 2140 return (EIO); 2141 } 2142 2143 rc = vdev_init_from_label(spa, nvl); 2144 nvlist_destroy(nvl); 2145 if (rc != 0) 2146 return (rc); 2147 2148 /* 2149 * We should already have created an incomplete vdev for this 2150 * vdev. Find it and initialise it with our read proc. 2151 */ 2152 vdev = vdev_find(guid); 2153 if (vdev != NULL) { 2154 vdev->v_phys_read = _read; 2155 vdev->v_phys_write = _write; 2156 vdev->v_priv = priv; 2157 vdev->v_psize = vtmp.v_psize; 2158 /* 2159 * If no other state is set, mark vdev healthy. 2160 */ 2161 if (vdev->v_state == VDEV_STATE_UNKNOWN) 2162 vdev->v_state = VDEV_STATE_HEALTHY; 2163 } else { 2164 printf("ZFS: inconsistent nvlist contents\n"); 2165 return (EIO); 2166 } 2167 2168 if (vdev->v_islog) 2169 spa->spa_with_log = vdev->v_islog; 2170 2171 /* 2172 * Re-evaluate top-level vdev state. 2173 */ 2174 vdev_set_state(vdev->v_top); 2175 2176 /* 2177 * Ok, we are happy with the pool so far. Lets find 2178 * the best uberblock and then we can actually access 2179 * the contents of the pool. 2180 */ 2181 vdev_uberblock_load(vdev, spa->spa_uberblock); 2182 2183 if (spap != NULL) 2184 *spap = spa; 2185 return (0); 2186 } 2187 2188 static int 2189 ilog2(int n) 2190 { 2191 int v; 2192 2193 for (v = 0; v < 32; v++) 2194 if (n == (1 << v)) 2195 return (v); 2196 return (-1); 2197 } 2198 2199 static int 2200 zio_read_gang(const spa_t *spa, const blkptr_t *bp, void *buf) 2201 { 2202 blkptr_t gbh_bp; 2203 zio_gbh_phys_t zio_gb; 2204 char *pbuf; 2205 int i; 2206 2207 /* Artificial BP for gang block header. */ 2208 gbh_bp = *bp; 2209 BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE); 2210 BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE); 2211 BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER); 2212 BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF); 2213 for (i = 0; i < SPA_DVAS_PER_BP; i++) 2214 DVA_SET_GANG(&gbh_bp.blk_dva[i], 0); 2215 2216 /* Read gang header block using the artificial BP. */ 2217 if (zio_read(spa, &gbh_bp, &zio_gb)) 2218 return (EIO); 2219 2220 pbuf = buf; 2221 for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { 2222 blkptr_t *gbp = &zio_gb.zg_blkptr[i]; 2223 2224 if (BP_IS_HOLE(gbp)) 2225 continue; 2226 if (zio_read(spa, gbp, pbuf)) 2227 return (EIO); 2228 pbuf += BP_GET_PSIZE(gbp); 2229 } 2230 2231 if (zio_checksum_verify(spa, bp, buf)) 2232 return (EIO); 2233 return (0); 2234 } 2235 2236 static int 2237 zio_read(const spa_t *spa, const blkptr_t *bp, void *buf) 2238 { 2239 int cpfunc = BP_GET_COMPRESS(bp); 2240 uint64_t align, size; 2241 void *pbuf; 2242 int i, error; 2243 2244 /* 2245 * Process data embedded in block pointer 2246 */ 2247 if (BP_IS_EMBEDDED(bp)) { 2248 ASSERT(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); 2249 2250 size = BPE_GET_PSIZE(bp); 2251 ASSERT(size <= BPE_PAYLOAD_SIZE); 2252 2253 if (cpfunc != ZIO_COMPRESS_OFF) 2254 pbuf = malloc(size); 2255 else 2256 pbuf = buf; 2257 2258 if (pbuf == NULL) 2259 return (ENOMEM); 2260 2261 decode_embedded_bp_compressed(bp, pbuf); 2262 error = 0; 2263 2264 if (cpfunc != ZIO_COMPRESS_OFF) { 2265 error = zio_decompress_data(cpfunc, pbuf, 2266 size, buf, BP_GET_LSIZE(bp)); 2267 free(pbuf); 2268 } 2269 if (error != 0) 2270 printf("ZFS: i/o error - unable to decompress " 2271 "block pointer data, error %d\n", error); 2272 return (error); 2273 } 2274 2275 error = EIO; 2276 2277 for (i = 0; i < SPA_DVAS_PER_BP; i++) { 2278 const dva_t *dva = &bp->blk_dva[i]; 2279 vdev_t *vdev; 2280 vdev_list_t *vlist; 2281 uint64_t vdevid; 2282 off_t offset; 2283 2284 if (!dva->dva_word[0] && !dva->dva_word[1]) 2285 continue; 2286 2287 vdevid = DVA_GET_VDEV(dva); 2288 offset = DVA_GET_OFFSET(dva); 2289 vlist = &spa->spa_root_vdev->v_children; 2290 STAILQ_FOREACH(vdev, vlist, v_childlink) { 2291 if (vdev->v_id == vdevid) 2292 break; 2293 } 2294 if (!vdev || !vdev->v_read) 2295 continue; 2296 2297 size = BP_GET_PSIZE(bp); 2298 if (vdev->v_read == vdev_raidz_read) { 2299 align = 1ULL << vdev->v_ashift; 2300 if (P2PHASE(size, align) != 0) 2301 size = P2ROUNDUP(size, align); 2302 } 2303 if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF) 2304 pbuf = malloc(size); 2305 else 2306 pbuf = buf; 2307 2308 if (pbuf == NULL) { 2309 error = ENOMEM; 2310 break; 2311 } 2312 2313 if (DVA_GET_GANG(dva)) 2314 error = zio_read_gang(spa, bp, pbuf); 2315 else 2316 error = vdev->v_read(vdev, bp, pbuf, offset, size); 2317 if (error == 0) { 2318 if (cpfunc != ZIO_COMPRESS_OFF) 2319 error = zio_decompress_data(cpfunc, pbuf, 2320 BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp)); 2321 else if (size != BP_GET_PSIZE(bp)) 2322 bcopy(pbuf, buf, BP_GET_PSIZE(bp)); 2323 } else { 2324 printf("zio_read error: %d\n", error); 2325 } 2326 if (buf != pbuf) 2327 free(pbuf); 2328 if (error == 0) 2329 break; 2330 } 2331 if (error != 0) 2332 printf("ZFS: i/o error - all block copies unavailable\n"); 2333 2334 return (error); 2335 } 2336 2337 static int 2338 dnode_read(const spa_t *spa, const dnode_phys_t *dnode, off_t offset, 2339 void *buf, size_t buflen) 2340 { 2341 int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT; 2342 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 2343 int nlevels = dnode->dn_nlevels; 2344 int i, rc; 2345 2346 if (bsize > SPA_MAXBLOCKSIZE) { 2347 printf("ZFS: I/O error - blocks larger than %llu are not " 2348 "supported\n", SPA_MAXBLOCKSIZE); 2349 return (EIO); 2350 } 2351 2352 /* 2353 * Note: bsize may not be a power of two here so we need to do an 2354 * actual divide rather than a bitshift. 2355 */ 2356 while (buflen > 0) { 2357 uint64_t bn = offset / bsize; 2358 int boff = offset % bsize; 2359 int ibn; 2360 const blkptr_t *indbp; 2361 blkptr_t bp; 2362 2363 if (bn > dnode->dn_maxblkid) 2364 return (EIO); 2365 2366 if (dnode == dnode_cache_obj && bn == dnode_cache_bn) 2367 goto cached; 2368 2369 indbp = dnode->dn_blkptr; 2370 for (i = 0; i < nlevels; i++) { 2371 /* 2372 * Copy the bp from the indirect array so that 2373 * we can re-use the scratch buffer for multi-level 2374 * objects. 2375 */ 2376 ibn = bn >> ((nlevels - i - 1) * ibshift); 2377 ibn &= ((1 << ibshift) - 1); 2378 bp = indbp[ibn]; 2379 if (BP_IS_HOLE(&bp)) { 2380 memset(dnode_cache_buf, 0, bsize); 2381 break; 2382 } 2383 rc = zio_read(spa, &bp, dnode_cache_buf); 2384 if (rc) 2385 return (rc); 2386 indbp = (const blkptr_t *) dnode_cache_buf; 2387 } 2388 dnode_cache_obj = dnode; 2389 dnode_cache_bn = bn; 2390 cached: 2391 2392 /* 2393 * The buffer contains our data block. Copy what we 2394 * need from it and loop. 2395 */ 2396 i = bsize - boff; 2397 if (i > buflen) i = buflen; 2398 memcpy(buf, &dnode_cache_buf[boff], i); 2399 buf = ((char *)buf) + i; 2400 offset += i; 2401 buflen -= i; 2402 } 2403 2404 return (0); 2405 } 2406 2407 /* 2408 * Lookup a value in a microzap directory. 2409 */ 2410 static int 2411 mzap_lookup(const mzap_phys_t *mz, size_t size, const char *name, 2412 uint64_t *value) 2413 { 2414 const mzap_ent_phys_t *mze; 2415 int chunks, i; 2416 2417 /* 2418 * Microzap objects use exactly one block. Read the whole 2419 * thing. 2420 */ 2421 chunks = size / MZAP_ENT_LEN - 1; 2422 for (i = 0; i < chunks; i++) { 2423 mze = &mz->mz_chunk[i]; 2424 if (strcmp(mze->mze_name, name) == 0) { 2425 *value = mze->mze_value; 2426 return (0); 2427 } 2428 } 2429 2430 return (ENOENT); 2431 } 2432 2433 /* 2434 * Compare a name with a zap leaf entry. Return non-zero if the name 2435 * matches. 2436 */ 2437 static int 2438 fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, 2439 const char *name) 2440 { 2441 size_t namelen; 2442 const zap_leaf_chunk_t *nc; 2443 const char *p; 2444 2445 namelen = zc->l_entry.le_name_numints; 2446 2447 nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk); 2448 p = name; 2449 while (namelen > 0) { 2450 size_t len; 2451 2452 len = namelen; 2453 if (len > ZAP_LEAF_ARRAY_BYTES) 2454 len = ZAP_LEAF_ARRAY_BYTES; 2455 if (memcmp(p, nc->l_array.la_array, len)) 2456 return (0); 2457 p += len; 2458 namelen -= len; 2459 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next); 2460 } 2461 2462 return (1); 2463 } 2464 2465 /* 2466 * Extract a uint64_t value from a zap leaf entry. 2467 */ 2468 static uint64_t 2469 fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc) 2470 { 2471 const zap_leaf_chunk_t *vc; 2472 int i; 2473 uint64_t value; 2474 const uint8_t *p; 2475 2476 vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk); 2477 for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) { 2478 value = (value << 8) | p[i]; 2479 } 2480 2481 return (value); 2482 } 2483 2484 static void 2485 stv(int len, void *addr, uint64_t value) 2486 { 2487 switch (len) { 2488 case 1: 2489 *(uint8_t *)addr = value; 2490 return; 2491 case 2: 2492 *(uint16_t *)addr = value; 2493 return; 2494 case 4: 2495 *(uint32_t *)addr = value; 2496 return; 2497 case 8: 2498 *(uint64_t *)addr = value; 2499 return; 2500 } 2501 } 2502 2503 /* 2504 * Extract a array from a zap leaf entry. 2505 */ 2506 static void 2507 fzap_leaf_array(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, 2508 uint64_t integer_size, uint64_t num_integers, void *buf) 2509 { 2510 uint64_t array_int_len = zc->l_entry.le_value_intlen; 2511 uint64_t value = 0; 2512 uint64_t *u64 = buf; 2513 char *p = buf; 2514 int len = MIN(zc->l_entry.le_value_numints, num_integers); 2515 int chunk = zc->l_entry.le_value_chunk; 2516 int byten = 0; 2517 2518 if (integer_size == 8 && len == 1) { 2519 *u64 = fzap_leaf_value(zl, zc); 2520 return; 2521 } 2522 2523 while (len > 0) { 2524 struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(zl, chunk).l_array; 2525 int i; 2526 2527 ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(zl)); 2528 for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) { 2529 value = (value << 8) | la->la_array[i]; 2530 byten++; 2531 if (byten == array_int_len) { 2532 stv(integer_size, p, value); 2533 byten = 0; 2534 len--; 2535 if (len == 0) 2536 return; 2537 p += integer_size; 2538 } 2539 } 2540 chunk = la->la_next; 2541 } 2542 } 2543 2544 static int 2545 fzap_check_size(uint64_t integer_size, uint64_t num_integers) 2546 { 2547 2548 switch (integer_size) { 2549 case 1: 2550 case 2: 2551 case 4: 2552 case 8: 2553 break; 2554 default: 2555 return (EINVAL); 2556 } 2557 2558 if (integer_size * num_integers > ZAP_MAXVALUELEN) 2559 return (E2BIG); 2560 2561 return (0); 2562 } 2563 2564 static void 2565 zap_leaf_free(zap_leaf_t *leaf) 2566 { 2567 free(leaf->l_phys); 2568 free(leaf); 2569 } 2570 2571 static int 2572 zap_get_leaf_byblk(fat_zap_t *zap, uint64_t blk, zap_leaf_t **lp) 2573 { 2574 int bs = FZAP_BLOCK_SHIFT(zap); 2575 int err; 2576 2577 *lp = malloc(sizeof(**lp)); 2578 if (*lp == NULL) 2579 return (ENOMEM); 2580 2581 (*lp)->l_bs = bs; 2582 (*lp)->l_phys = malloc(1 << bs); 2583 2584 if ((*lp)->l_phys == NULL) { 2585 free(*lp); 2586 return (ENOMEM); 2587 } 2588 err = dnode_read(zap->zap_spa, zap->zap_dnode, blk << bs, (*lp)->l_phys, 2589 1 << bs); 2590 if (err != 0) { 2591 zap_leaf_free(*lp); 2592 } 2593 return (err); 2594 } 2595 2596 static int 2597 zap_table_load(fat_zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, 2598 uint64_t *valp) 2599 { 2600 int bs = FZAP_BLOCK_SHIFT(zap); 2601 uint64_t blk = idx >> (bs - 3); 2602 uint64_t off = idx & ((1 << (bs - 3)) - 1); 2603 uint64_t *buf; 2604 int rc; 2605 2606 buf = malloc(1 << zap->zap_block_shift); 2607 if (buf == NULL) 2608 return (ENOMEM); 2609 rc = dnode_read(zap->zap_spa, zap->zap_dnode, (tbl->zt_blk + blk) << bs, 2610 buf, 1 << zap->zap_block_shift); 2611 if (rc == 0) 2612 *valp = buf[off]; 2613 free(buf); 2614 return (rc); 2615 } 2616 2617 static int 2618 zap_idx_to_blk(fat_zap_t *zap, uint64_t idx, uint64_t *valp) 2619 { 2620 if (zap->zap_phys->zap_ptrtbl.zt_numblks == 0) { 2621 *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx); 2622 return (0); 2623 } else { 2624 return (zap_table_load(zap, &zap->zap_phys->zap_ptrtbl, 2625 idx, valp)); 2626 } 2627 } 2628 2629 #define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n)))) 2630 static int 2631 zap_deref_leaf(fat_zap_t *zap, uint64_t h, zap_leaf_t **lp) 2632 { 2633 uint64_t idx, blk; 2634 int err; 2635 2636 idx = ZAP_HASH_IDX(h, zap->zap_phys->zap_ptrtbl.zt_shift); 2637 err = zap_idx_to_blk(zap, idx, &blk); 2638 if (err != 0) 2639 return (err); 2640 return (zap_get_leaf_byblk(zap, blk, lp)); 2641 } 2642 2643 #define CHAIN_END 0xffff /* end of the chunk chain */ 2644 #define LEAF_HASH(l, h) \ 2645 ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \ 2646 ((h) >> \ 2647 (64 - ZAP_LEAF_HASH_SHIFT(l) - (l)->l_phys->l_hdr.lh_prefix_len))) 2648 #define LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)]) 2649 2650 static int 2651 zap_leaf_lookup(zap_leaf_t *zl, uint64_t hash, const char *name, 2652 uint64_t integer_size, uint64_t num_integers, void *value) 2653 { 2654 int rc; 2655 uint16_t *chunkp; 2656 struct zap_leaf_entry *le; 2657 2658 /* 2659 * Make sure this chunk matches our hash. 2660 */ 2661 if (zl->l_phys->l_hdr.lh_prefix_len > 0 && 2662 zl->l_phys->l_hdr.lh_prefix != 2663 hash >> (64 - zl->l_phys->l_hdr.lh_prefix_len)) 2664 return (EIO); 2665 2666 rc = ENOENT; 2667 for (chunkp = LEAF_HASH_ENTPTR(zl, hash); 2668 *chunkp != CHAIN_END; chunkp = &le->le_next) { 2669 zap_leaf_chunk_t *zc; 2670 uint16_t chunk = *chunkp; 2671 2672 le = ZAP_LEAF_ENTRY(zl, chunk); 2673 if (le->le_hash != hash) 2674 continue; 2675 zc = &ZAP_LEAF_CHUNK(zl, chunk); 2676 if (fzap_name_equal(zl, zc, name)) { 2677 if (zc->l_entry.le_value_intlen > integer_size) { 2678 rc = EINVAL; 2679 } else { 2680 fzap_leaf_array(zl, zc, integer_size, 2681 num_integers, value); 2682 rc = 0; 2683 } 2684 break; 2685 } 2686 } 2687 return (rc); 2688 } 2689 2690 /* 2691 * Lookup a value in a fatzap directory. 2692 */ 2693 static int 2694 fzap_lookup(const spa_t *spa, const dnode_phys_t *dnode, zap_phys_t *zh, 2695 const char *name, uint64_t integer_size, uint64_t num_integers, 2696 void *value) 2697 { 2698 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 2699 fat_zap_t z; 2700 zap_leaf_t *zl; 2701 uint64_t hash; 2702 int rc; 2703 2704 if (zh->zap_magic != ZAP_MAGIC) 2705 return (EIO); 2706 2707 if ((rc = fzap_check_size(integer_size, num_integers)) != 0) { 2708 return (rc); 2709 } 2710 2711 z.zap_block_shift = ilog2(bsize); 2712 z.zap_phys = zh; 2713 z.zap_spa = spa; 2714 z.zap_dnode = dnode; 2715 2716 hash = zap_hash(zh->zap_salt, name); 2717 rc = zap_deref_leaf(&z, hash, &zl); 2718 if (rc != 0) 2719 return (rc); 2720 2721 rc = zap_leaf_lookup(zl, hash, name, integer_size, num_integers, value); 2722 2723 zap_leaf_free(zl); 2724 return (rc); 2725 } 2726 2727 /* 2728 * Lookup a name in a zap object and return its value as a uint64_t. 2729 */ 2730 static int 2731 zap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name, 2732 uint64_t integer_size, uint64_t num_integers, void *value) 2733 { 2734 int rc; 2735 zap_phys_t *zap; 2736 size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 2737 2738 zap = malloc(size); 2739 if (zap == NULL) 2740 return (ENOMEM); 2741 2742 rc = dnode_read(spa, dnode, 0, zap, size); 2743 if (rc) 2744 goto done; 2745 2746 switch (zap->zap_block_type) { 2747 case ZBT_MICRO: 2748 rc = mzap_lookup((const mzap_phys_t *)zap, size, name, value); 2749 break; 2750 case ZBT_HEADER: 2751 rc = fzap_lookup(spa, dnode, zap, name, integer_size, 2752 num_integers, value); 2753 break; 2754 default: 2755 printf("ZFS: invalid zap_type=%" PRIx64 "\n", 2756 zap->zap_block_type); 2757 rc = EIO; 2758 } 2759 done: 2760 free(zap); 2761 return (rc); 2762 } 2763 2764 /* 2765 * List a microzap directory. 2766 */ 2767 static int 2768 mzap_list(const mzap_phys_t *mz, size_t size, 2769 int (*callback)(const char *, uint64_t)) 2770 { 2771 const mzap_ent_phys_t *mze; 2772 int chunks, i, rc; 2773 2774 /* 2775 * Microzap objects use exactly one block. Read the whole 2776 * thing. 2777 */ 2778 rc = 0; 2779 chunks = size / MZAP_ENT_LEN - 1; 2780 for (i = 0; i < chunks; i++) { 2781 mze = &mz->mz_chunk[i]; 2782 if (mze->mze_name[0]) { 2783 rc = callback(mze->mze_name, mze->mze_value); 2784 if (rc != 0) 2785 break; 2786 } 2787 } 2788 2789 return (rc); 2790 } 2791 2792 /* 2793 * List a fatzap directory. 2794 */ 2795 static int 2796 fzap_list(const spa_t *spa, const dnode_phys_t *dnode, zap_phys_t *zh, 2797 int (*callback)(const char *, uint64_t)) 2798 { 2799 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 2800 fat_zap_t z; 2801 uint64_t i; 2802 int j, rc; 2803 2804 if (zh->zap_magic != ZAP_MAGIC) 2805 return (EIO); 2806 2807 z.zap_block_shift = ilog2(bsize); 2808 z.zap_phys = zh; 2809 2810 /* 2811 * This assumes that the leaf blocks start at block 1. The 2812 * documentation isn't exactly clear on this. 2813 */ 2814 zap_leaf_t zl; 2815 zl.l_bs = z.zap_block_shift; 2816 zl.l_phys = malloc(bsize); 2817 if (zl.l_phys == NULL) 2818 return (ENOMEM); 2819 2820 for (i = 0; i < zh->zap_num_leafs; i++) { 2821 off_t off = ((off_t)(i + 1)) << zl.l_bs; 2822 char name[256], *p; 2823 uint64_t value; 2824 2825 if (dnode_read(spa, dnode, off, zl.l_phys, bsize)) { 2826 free(zl.l_phys); 2827 return (EIO); 2828 } 2829 2830 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) { 2831 zap_leaf_chunk_t *zc, *nc; 2832 int namelen; 2833 2834 zc = &ZAP_LEAF_CHUNK(&zl, j); 2835 if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY) 2836 continue; 2837 namelen = zc->l_entry.le_name_numints; 2838 if (namelen > sizeof(name)) 2839 namelen = sizeof(name); 2840 2841 /* 2842 * Paste the name back together. 2843 */ 2844 nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk); 2845 p = name; 2846 while (namelen > 0) { 2847 int len; 2848 len = namelen; 2849 if (len > ZAP_LEAF_ARRAY_BYTES) 2850 len = ZAP_LEAF_ARRAY_BYTES; 2851 memcpy(p, nc->l_array.la_array, len); 2852 p += len; 2853 namelen -= len; 2854 nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next); 2855 } 2856 2857 /* 2858 * Assume the first eight bytes of the value are 2859 * a uint64_t. 2860 */ 2861 value = fzap_leaf_value(&zl, zc); 2862 2863 /* printf("%s 0x%jx\n", name, (uintmax_t)value); */ 2864 rc = callback((const char *)name, value); 2865 if (rc != 0) { 2866 free(zl.l_phys); 2867 return (rc); 2868 } 2869 } 2870 } 2871 2872 free(zl.l_phys); 2873 return (0); 2874 } 2875 2876 static int zfs_printf(const char *name, uint64_t value __unused) 2877 { 2878 2879 printf("%s\n", name); 2880 2881 return (0); 2882 } 2883 2884 /* 2885 * List a zap directory. 2886 */ 2887 static int 2888 zap_list(const spa_t *spa, const dnode_phys_t *dnode) 2889 { 2890 zap_phys_t *zap; 2891 size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 2892 int rc; 2893 2894 zap = malloc(size); 2895 if (zap == NULL) 2896 return (ENOMEM); 2897 2898 rc = dnode_read(spa, dnode, 0, zap, size); 2899 if (rc == 0) { 2900 if (zap->zap_block_type == ZBT_MICRO) 2901 rc = mzap_list((const mzap_phys_t *)zap, size, 2902 zfs_printf); 2903 else 2904 rc = fzap_list(spa, dnode, zap, zfs_printf); 2905 } 2906 free(zap); 2907 return (rc); 2908 } 2909 2910 static int 2911 objset_get_dnode(const spa_t *spa, const objset_phys_t *os, uint64_t objnum, 2912 dnode_phys_t *dnode) 2913 { 2914 off_t offset; 2915 2916 offset = objnum * sizeof(dnode_phys_t); 2917 return dnode_read(spa, &os->os_meta_dnode, offset, 2918 dnode, sizeof(dnode_phys_t)); 2919 } 2920 2921 /* 2922 * Lookup a name in a microzap directory. 2923 */ 2924 static int 2925 mzap_rlookup(const mzap_phys_t *mz, size_t size, char *name, uint64_t value) 2926 { 2927 const mzap_ent_phys_t *mze; 2928 int chunks, i; 2929 2930 /* 2931 * Microzap objects use exactly one block. Read the whole 2932 * thing. 2933 */ 2934 chunks = size / MZAP_ENT_LEN - 1; 2935 for (i = 0; i < chunks; i++) { 2936 mze = &mz->mz_chunk[i]; 2937 if (value == mze->mze_value) { 2938 strcpy(name, mze->mze_name); 2939 return (0); 2940 } 2941 } 2942 2943 return (ENOENT); 2944 } 2945 2946 static void 2947 fzap_name_copy(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, char *name) 2948 { 2949 size_t namelen; 2950 const zap_leaf_chunk_t *nc; 2951 char *p; 2952 2953 namelen = zc->l_entry.le_name_numints; 2954 2955 nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk); 2956 p = name; 2957 while (namelen > 0) { 2958 size_t len; 2959 len = namelen; 2960 if (len > ZAP_LEAF_ARRAY_BYTES) 2961 len = ZAP_LEAF_ARRAY_BYTES; 2962 memcpy(p, nc->l_array.la_array, len); 2963 p += len; 2964 namelen -= len; 2965 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next); 2966 } 2967 2968 *p = '\0'; 2969 } 2970 2971 static int 2972 fzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, zap_phys_t *zh, 2973 char *name, uint64_t value) 2974 { 2975 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 2976 fat_zap_t z; 2977 uint64_t i; 2978 int j, rc; 2979 2980 if (zh->zap_magic != ZAP_MAGIC) 2981 return (EIO); 2982 2983 z.zap_block_shift = ilog2(bsize); 2984 z.zap_phys = zh; 2985 2986 /* 2987 * This assumes that the leaf blocks start at block 1. The 2988 * documentation isn't exactly clear on this. 2989 */ 2990 zap_leaf_t zl; 2991 zl.l_bs = z.zap_block_shift; 2992 zl.l_phys = malloc(bsize); 2993 if (zl.l_phys == NULL) 2994 return (ENOMEM); 2995 2996 for (i = 0; i < zh->zap_num_leafs; i++) { 2997 off_t off = ((off_t)(i + 1)) << zl.l_bs; 2998 2999 rc = dnode_read(spa, dnode, off, zl.l_phys, bsize); 3000 if (rc != 0) 3001 goto done; 3002 3003 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) { 3004 zap_leaf_chunk_t *zc; 3005 3006 zc = &ZAP_LEAF_CHUNK(&zl, j); 3007 if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY) 3008 continue; 3009 if (zc->l_entry.le_value_intlen != 8 || 3010 zc->l_entry.le_value_numints != 1) 3011 continue; 3012 3013 if (fzap_leaf_value(&zl, zc) == value) { 3014 fzap_name_copy(&zl, zc, name); 3015 goto done; 3016 } 3017 } 3018 } 3019 3020 rc = ENOENT; 3021 done: 3022 free(zl.l_phys); 3023 return (rc); 3024 } 3025 3026 static int 3027 zap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, 3028 uint64_t value) 3029 { 3030 zap_phys_t *zap; 3031 size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 3032 int rc; 3033 3034 zap = malloc(size); 3035 if (zap == NULL) 3036 return (ENOMEM); 3037 3038 rc = dnode_read(spa, dnode, 0, zap, size); 3039 if (rc == 0) { 3040 if (zap->zap_block_type == ZBT_MICRO) 3041 rc = mzap_rlookup((const mzap_phys_t *)zap, size, 3042 name, value); 3043 else 3044 rc = fzap_rlookup(spa, dnode, zap, name, value); 3045 } 3046 free(zap); 3047 return (rc); 3048 } 3049 3050 static int 3051 zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result) 3052 { 3053 char name[256]; 3054 char component[256]; 3055 uint64_t dir_obj, parent_obj, child_dir_zapobj; 3056 dnode_phys_t child_dir_zap, dataset, dir, parent; 3057 dsl_dir_phys_t *dd; 3058 dsl_dataset_phys_t *ds; 3059 char *p; 3060 int len; 3061 3062 p = &name[sizeof(name) - 1]; 3063 *p = '\0'; 3064 3065 if (objset_get_dnode(spa, spa->spa_mos, objnum, &dataset)) { 3066 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum); 3067 return (EIO); 3068 } 3069 ds = (dsl_dataset_phys_t *)&dataset.dn_bonus; 3070 dir_obj = ds->ds_dir_obj; 3071 3072 for (;;) { 3073 if (objset_get_dnode(spa, spa->spa_mos, dir_obj, &dir) != 0) 3074 return (EIO); 3075 dd = (dsl_dir_phys_t *)&dir.dn_bonus; 3076 3077 /* Actual loop condition. */ 3078 parent_obj = dd->dd_parent_obj; 3079 if (parent_obj == 0) 3080 break; 3081 3082 if (objset_get_dnode(spa, spa->spa_mos, parent_obj, 3083 &parent) != 0) 3084 return (EIO); 3085 dd = (dsl_dir_phys_t *)&parent.dn_bonus; 3086 child_dir_zapobj = dd->dd_child_dir_zapobj; 3087 if (objset_get_dnode(spa, spa->spa_mos, child_dir_zapobj, 3088 &child_dir_zap) != 0) 3089 return (EIO); 3090 if (zap_rlookup(spa, &child_dir_zap, component, dir_obj) != 0) 3091 return (EIO); 3092 3093 len = strlen(component); 3094 p -= len; 3095 memcpy(p, component, len); 3096 --p; 3097 *p = '/'; 3098 3099 /* Actual loop iteration. */ 3100 dir_obj = parent_obj; 3101 } 3102 3103 if (*p != '\0') 3104 ++p; 3105 strcpy(result, p); 3106 3107 return (0); 3108 } 3109 3110 static int 3111 zfs_lookup_dataset(const spa_t *spa, const char *name, uint64_t *objnum) 3112 { 3113 char element[256]; 3114 uint64_t dir_obj, child_dir_zapobj; 3115 dnode_phys_t child_dir_zap, dir; 3116 dsl_dir_phys_t *dd; 3117 const char *p, *q; 3118 3119 if (objset_get_dnode(spa, spa->spa_mos, 3120 DMU_POOL_DIRECTORY_OBJECT, &dir)) 3121 return (EIO); 3122 if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, sizeof (dir_obj), 3123 1, &dir_obj)) 3124 return (EIO); 3125 3126 p = name; 3127 for (;;) { 3128 if (objset_get_dnode(spa, spa->spa_mos, dir_obj, &dir)) 3129 return (EIO); 3130 dd = (dsl_dir_phys_t *)&dir.dn_bonus; 3131 3132 while (*p == '/') 3133 p++; 3134 /* Actual loop condition #1. */ 3135 if (*p == '\0') 3136 break; 3137 3138 q = strchr(p, '/'); 3139 if (q) { 3140 memcpy(element, p, q - p); 3141 element[q - p] = '\0'; 3142 p = q + 1; 3143 } else { 3144 strcpy(element, p); 3145 p += strlen(p); 3146 } 3147 3148 child_dir_zapobj = dd->dd_child_dir_zapobj; 3149 if (objset_get_dnode(spa, spa->spa_mos, child_dir_zapobj, 3150 &child_dir_zap) != 0) 3151 return (EIO); 3152 3153 /* Actual loop condition #2. */ 3154 if (zap_lookup(spa, &child_dir_zap, element, sizeof (dir_obj), 3155 1, &dir_obj) != 0) 3156 return (ENOENT); 3157 } 3158 3159 *objnum = dd->dd_head_dataset_obj; 3160 return (0); 3161 } 3162 3163 #ifndef BOOT2 3164 static int 3165 zfs_list_dataset(const spa_t *spa, uint64_t objnum/*, int pos, char *entry*/) 3166 { 3167 uint64_t dir_obj, child_dir_zapobj; 3168 dnode_phys_t child_dir_zap, dir, dataset; 3169 dsl_dataset_phys_t *ds; 3170 dsl_dir_phys_t *dd; 3171 3172 if (objset_get_dnode(spa, spa->spa_mos, objnum, &dataset)) { 3173 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum); 3174 return (EIO); 3175 } 3176 ds = (dsl_dataset_phys_t *)&dataset.dn_bonus; 3177 dir_obj = ds->ds_dir_obj; 3178 3179 if (objset_get_dnode(spa, spa->spa_mos, dir_obj, &dir)) { 3180 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj); 3181 return (EIO); 3182 } 3183 dd = (dsl_dir_phys_t *)&dir.dn_bonus; 3184 3185 child_dir_zapobj = dd->dd_child_dir_zapobj; 3186 if (objset_get_dnode(spa, spa->spa_mos, child_dir_zapobj, 3187 &child_dir_zap) != 0) { 3188 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj); 3189 return (EIO); 3190 } 3191 3192 return (zap_list(spa, &child_dir_zap) != 0); 3193 } 3194 3195 int 3196 zfs_callback_dataset(const spa_t *spa, uint64_t objnum, 3197 int (*callback)(const char *, uint64_t)) 3198 { 3199 uint64_t dir_obj, child_dir_zapobj; 3200 dnode_phys_t child_dir_zap, dir, dataset; 3201 dsl_dataset_phys_t *ds; 3202 dsl_dir_phys_t *dd; 3203 zap_phys_t *zap; 3204 size_t size; 3205 int err; 3206 3207 err = objset_get_dnode(spa, spa->spa_mos, objnum, &dataset); 3208 if (err != 0) { 3209 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum); 3210 return (err); 3211 } 3212 ds = (dsl_dataset_phys_t *)&dataset.dn_bonus; 3213 dir_obj = ds->ds_dir_obj; 3214 3215 err = objset_get_dnode(spa, spa->spa_mos, dir_obj, &dir); 3216 if (err != 0) { 3217 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj); 3218 return (err); 3219 } 3220 dd = (dsl_dir_phys_t *)&dir.dn_bonus; 3221 3222 child_dir_zapobj = dd->dd_child_dir_zapobj; 3223 err = objset_get_dnode(spa, spa->spa_mos, child_dir_zapobj, 3224 &child_dir_zap); 3225 if (err != 0) { 3226 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj); 3227 return (err); 3228 } 3229 3230 size = child_dir_zap.dn_datablkszsec << SPA_MINBLOCKSHIFT; 3231 zap = malloc(size); 3232 if (zap != NULL) { 3233 err = dnode_read(spa, &child_dir_zap, 0, zap, size); 3234 if (err != 0) 3235 goto done; 3236 3237 if (zap->zap_block_type == ZBT_MICRO) 3238 err = mzap_list((const mzap_phys_t *)zap, size, 3239 callback); 3240 else 3241 err = fzap_list(spa, &child_dir_zap, zap, callback); 3242 } else { 3243 err = ENOMEM; 3244 } 3245 done: 3246 free(zap); 3247 return (err); 3248 } 3249 #endif 3250 3251 /* 3252 * Find the object set given the object number of its dataset object 3253 * and return its details in *objset 3254 */ 3255 static int 3256 zfs_mount_dataset(const spa_t *spa, uint64_t objnum, objset_phys_t *objset) 3257 { 3258 dnode_phys_t dataset; 3259 dsl_dataset_phys_t *ds; 3260 3261 if (objset_get_dnode(spa, spa->spa_mos, objnum, &dataset)) { 3262 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum); 3263 return (EIO); 3264 } 3265 3266 ds = (dsl_dataset_phys_t *)&dataset.dn_bonus; 3267 if (zio_read(spa, &ds->ds_bp, objset)) { 3268 printf("ZFS: can't read object set for dataset %ju\n", 3269 (uintmax_t)objnum); 3270 return (EIO); 3271 } 3272 3273 return (0); 3274 } 3275 3276 /* 3277 * Find the object set pointed to by the BOOTFS property or the root 3278 * dataset if there is none and return its details in *objset 3279 */ 3280 static int 3281 zfs_get_root(const spa_t *spa, uint64_t *objid) 3282 { 3283 dnode_phys_t dir, propdir; 3284 uint64_t props, bootfs, root; 3285 3286 *objid = 0; 3287 3288 /* 3289 * Start with the MOS directory object. 3290 */ 3291 if (objset_get_dnode(spa, spa->spa_mos, 3292 DMU_POOL_DIRECTORY_OBJECT, &dir)) { 3293 printf("ZFS: can't read MOS object directory\n"); 3294 return (EIO); 3295 } 3296 3297 /* 3298 * Lookup the pool_props and see if we can find a bootfs. 3299 */ 3300 if (zap_lookup(spa, &dir, DMU_POOL_PROPS, 3301 sizeof(props), 1, &props) == 0 && 3302 objset_get_dnode(spa, spa->spa_mos, props, &propdir) == 0 && 3303 zap_lookup(spa, &propdir, "bootfs", 3304 sizeof(bootfs), 1, &bootfs) == 0 && bootfs != 0) { 3305 *objid = bootfs; 3306 return (0); 3307 } 3308 /* 3309 * Lookup the root dataset directory 3310 */ 3311 if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, 3312 sizeof(root), 1, &root) || 3313 objset_get_dnode(spa, spa->spa_mos, root, &dir)) { 3314 printf("ZFS: can't find root dsl_dir\n"); 3315 return (EIO); 3316 } 3317 3318 /* 3319 * Use the information from the dataset directory's bonus buffer 3320 * to find the dataset object and from that the object set itself. 3321 */ 3322 dsl_dir_phys_t *dd = (dsl_dir_phys_t *)&dir.dn_bonus; 3323 *objid = dd->dd_head_dataset_obj; 3324 return (0); 3325 } 3326 3327 static int 3328 zfs_mount_impl(const spa_t *spa, uint64_t rootobj, struct zfsmount *mount) 3329 { 3330 3331 mount->spa = spa; 3332 3333 /* 3334 * Find the root object set if not explicitly provided 3335 */ 3336 if (rootobj == 0 && zfs_get_root(spa, &rootobj)) { 3337 printf("ZFS: can't find root filesystem\n"); 3338 return (EIO); 3339 } 3340 3341 if (zfs_mount_dataset(spa, rootobj, &mount->objset)) { 3342 printf("ZFS: can't open root filesystem\n"); 3343 return (EIO); 3344 } 3345 3346 mount->rootobj = rootobj; 3347 3348 return (0); 3349 } 3350 3351 /* 3352 * callback function for feature name checks. 3353 */ 3354 static int 3355 check_feature(const char *name, uint64_t value) 3356 { 3357 int i; 3358 3359 if (value == 0) 3360 return (0); 3361 if (name[0] == '\0') 3362 return (0); 3363 3364 for (i = 0; features_for_read[i] != NULL; i++) { 3365 if (strcmp(name, features_for_read[i]) == 0) 3366 return (0); 3367 } 3368 printf("ZFS: unsupported feature: %s\n", name); 3369 return (EIO); 3370 } 3371 3372 /* 3373 * Checks whether the MOS features that are active are supported. 3374 */ 3375 static int 3376 check_mos_features(const spa_t *spa) 3377 { 3378 dnode_phys_t dir; 3379 zap_phys_t *zap; 3380 uint64_t objnum; 3381 size_t size; 3382 int rc; 3383 3384 if ((rc = objset_get_dnode(spa, spa->spa_mos, DMU_OT_OBJECT_DIRECTORY, 3385 &dir)) != 0) 3386 return (rc); 3387 if ((rc = zap_lookup(spa, &dir, DMU_POOL_FEATURES_FOR_READ, 3388 sizeof (objnum), 1, &objnum)) != 0) { 3389 /* 3390 * It is older pool without features. As we have already 3391 * tested the label, just return without raising the error. 3392 */ 3393 return (0); 3394 } 3395 3396 if ((rc = objset_get_dnode(spa, spa->spa_mos, objnum, &dir)) != 0) 3397 return (rc); 3398 3399 if (dir.dn_type != DMU_OTN_ZAP_METADATA) 3400 return (EIO); 3401 3402 size = dir.dn_datablkszsec << SPA_MINBLOCKSHIFT; 3403 zap = malloc(size); 3404 if (zap == NULL) 3405 return (ENOMEM); 3406 3407 if (dnode_read(spa, &dir, 0, zap, size)) { 3408 free(zap); 3409 return (EIO); 3410 } 3411 3412 if (zap->zap_block_type == ZBT_MICRO) 3413 rc = mzap_list((const mzap_phys_t *)zap, size, check_feature); 3414 else 3415 rc = fzap_list(spa, &dir, zap, check_feature); 3416 3417 free(zap); 3418 return (rc); 3419 } 3420 3421 static int 3422 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 3423 { 3424 dnode_phys_t dir; 3425 size_t size; 3426 int rc; 3427 char *nv; 3428 3429 *value = NULL; 3430 if ((rc = objset_get_dnode(spa, spa->spa_mos, obj, &dir)) != 0) 3431 return (rc); 3432 if (dir.dn_type != DMU_OT_PACKED_NVLIST && 3433 dir.dn_bonustype != DMU_OT_PACKED_NVLIST_SIZE) { 3434 return (EIO); 3435 } 3436 3437 if (dir.dn_bonuslen != sizeof (uint64_t)) 3438 return (EIO); 3439 3440 size = *(uint64_t *)DN_BONUS(&dir); 3441 nv = malloc(size); 3442 if (nv == NULL) 3443 return (ENOMEM); 3444 3445 rc = dnode_read(spa, &dir, 0, nv, size); 3446 if (rc != 0) { 3447 free(nv); 3448 nv = NULL; 3449 return (rc); 3450 } 3451 *value = nvlist_import(nv, size); 3452 free(nv); 3453 return (rc); 3454 } 3455 3456 static int 3457 zfs_spa_init(spa_t *spa) 3458 { 3459 struct uberblock checkpoint; 3460 dnode_phys_t dir; 3461 uint64_t config_object; 3462 nvlist_t *nvlist; 3463 int rc; 3464 3465 if (zio_read(spa, &spa->spa_uberblock->ub_rootbp, spa->spa_mos)) { 3466 printf("ZFS: can't read MOS of pool %s\n", spa->spa_name); 3467 return (EIO); 3468 } 3469 if (spa->spa_mos->os_type != DMU_OST_META) { 3470 printf("ZFS: corrupted MOS of pool %s\n", spa->spa_name); 3471 return (EIO); 3472 } 3473 3474 if (objset_get_dnode(spa, &spa->spa_mos_master, 3475 DMU_POOL_DIRECTORY_OBJECT, &dir)) { 3476 printf("ZFS: failed to read pool %s directory object\n", 3477 spa->spa_name); 3478 return (EIO); 3479 } 3480 /* this is allowed to fail, older pools do not have salt */ 3481 rc = zap_lookup(spa, &dir, DMU_POOL_CHECKSUM_SALT, 1, 3482 sizeof (spa->spa_cksum_salt.zcs_bytes), 3483 spa->spa_cksum_salt.zcs_bytes); 3484 3485 rc = check_mos_features(spa); 3486 if (rc != 0) { 3487 printf("ZFS: pool %s is not supported\n", spa->spa_name); 3488 return (rc); 3489 } 3490 3491 rc = zap_lookup(spa, &dir, DMU_POOL_CONFIG, 3492 sizeof (config_object), 1, &config_object); 3493 if (rc != 0) { 3494 printf("ZFS: can not read MOS %s\n", DMU_POOL_CONFIG); 3495 return (EIO); 3496 } 3497 rc = load_nvlist(spa, config_object, &nvlist); 3498 if (rc != 0) 3499 return (rc); 3500 3501 rc = zap_lookup(spa, &dir, DMU_POOL_ZPOOL_CHECKPOINT, 3502 sizeof(uint64_t), sizeof(checkpoint) / sizeof(uint64_t), 3503 &checkpoint); 3504 if (rc == 0 && checkpoint.ub_checkpoint_txg != 0) { 3505 memcpy(&spa->spa_uberblock_checkpoint, &checkpoint, 3506 sizeof(checkpoint)); 3507 if (zio_read(spa, &spa->spa_uberblock_checkpoint.ub_rootbp, 3508 &spa->spa_mos_checkpoint)) { 3509 printf("ZFS: can not read checkpoint data.\n"); 3510 return (EIO); 3511 } 3512 } 3513 3514 /* 3515 * Update vdevs from MOS config. Note, we do skip encoding bytes 3516 * here. See also vdev_label_read_config(). 3517 */ 3518 rc = vdev_init_from_nvlist(spa, nvlist); 3519 nvlist_destroy(nvlist); 3520 return (rc); 3521 } 3522 3523 static int 3524 zfs_dnode_stat(const spa_t *spa, dnode_phys_t *dn, struct stat *sb) 3525 { 3526 3527 if (dn->dn_bonustype != DMU_OT_SA) { 3528 znode_phys_t *zp = (znode_phys_t *)dn->dn_bonus; 3529 3530 sb->st_mode = zp->zp_mode; 3531 sb->st_uid = zp->zp_uid; 3532 sb->st_gid = zp->zp_gid; 3533 sb->st_size = zp->zp_size; 3534 } else { 3535 sa_hdr_phys_t *sahdrp; 3536 int hdrsize; 3537 size_t size = 0; 3538 void *buf = NULL; 3539 3540 if (dn->dn_bonuslen != 0) 3541 sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn); 3542 else { 3543 if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) { 3544 blkptr_t *bp = DN_SPILL_BLKPTR(dn); 3545 int error; 3546 3547 size = BP_GET_LSIZE(bp); 3548 buf = malloc(size); 3549 if (buf == NULL) 3550 error = ENOMEM; 3551 else 3552 error = zio_read(spa, bp, buf); 3553 3554 if (error != 0) { 3555 free(buf); 3556 return (error); 3557 } 3558 sahdrp = buf; 3559 } else { 3560 return (EIO); 3561 } 3562 } 3563 hdrsize = SA_HDR_SIZE(sahdrp); 3564 sb->st_mode = *(uint64_t *)((char *)sahdrp + hdrsize + 3565 SA_MODE_OFFSET); 3566 sb->st_uid = *(uint64_t *)((char *)sahdrp + hdrsize + 3567 SA_UID_OFFSET); 3568 sb->st_gid = *(uint64_t *)((char *)sahdrp + hdrsize + 3569 SA_GID_OFFSET); 3570 sb->st_size = *(uint64_t *)((char *)sahdrp + hdrsize + 3571 SA_SIZE_OFFSET); 3572 free(buf); 3573 } 3574 3575 return (0); 3576 } 3577 3578 static int 3579 zfs_dnode_readlink(const spa_t *spa, dnode_phys_t *dn, char *path, size_t psize) 3580 { 3581 int rc = 0; 3582 3583 if (dn->dn_bonustype == DMU_OT_SA) { 3584 sa_hdr_phys_t *sahdrp = NULL; 3585 size_t size = 0; 3586 void *buf = NULL; 3587 int hdrsize; 3588 char *p; 3589 3590 if (dn->dn_bonuslen != 0) { 3591 sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn); 3592 } else { 3593 blkptr_t *bp; 3594 3595 if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) == 0) 3596 return (EIO); 3597 bp = DN_SPILL_BLKPTR(dn); 3598 3599 size = BP_GET_LSIZE(bp); 3600 buf = malloc(size); 3601 if (buf == NULL) 3602 rc = ENOMEM; 3603 else 3604 rc = zio_read(spa, bp, buf); 3605 if (rc != 0) { 3606 free(buf); 3607 return (rc); 3608 } 3609 sahdrp = buf; 3610 } 3611 hdrsize = SA_HDR_SIZE(sahdrp); 3612 p = (char *)((uintptr_t)sahdrp + hdrsize + SA_SYMLINK_OFFSET); 3613 memcpy(path, p, psize); 3614 free(buf); 3615 return (0); 3616 } 3617 /* 3618 * Second test is purely to silence bogus compiler 3619 * warning about accessing past the end of dn_bonus. 3620 */ 3621 if (psize + sizeof(znode_phys_t) <= dn->dn_bonuslen && 3622 sizeof(znode_phys_t) <= sizeof(dn->dn_bonus)) { 3623 memcpy(path, &dn->dn_bonus[sizeof(znode_phys_t)], psize); 3624 } else { 3625 rc = dnode_read(spa, dn, 0, path, psize); 3626 } 3627 return (rc); 3628 } 3629 3630 struct obj_list { 3631 uint64_t objnum; 3632 STAILQ_ENTRY(obj_list) entry; 3633 }; 3634 3635 /* 3636 * Lookup a file and return its dnode. 3637 */ 3638 static int 3639 zfs_lookup(const struct zfsmount *mount, const char *upath, dnode_phys_t *dnode) 3640 { 3641 int rc; 3642 uint64_t objnum; 3643 const spa_t *spa; 3644 dnode_phys_t dn; 3645 const char *p, *q; 3646 char element[256]; 3647 char path[1024]; 3648 int symlinks_followed = 0; 3649 struct stat sb; 3650 struct obj_list *entry, *tentry; 3651 STAILQ_HEAD(, obj_list) on_cache = STAILQ_HEAD_INITIALIZER(on_cache); 3652 3653 spa = mount->spa; 3654 if (mount->objset.os_type != DMU_OST_ZFS) { 3655 printf("ZFS: unexpected object set type %ju\n", 3656 (uintmax_t)mount->objset.os_type); 3657 return (EIO); 3658 } 3659 3660 if ((entry = malloc(sizeof(struct obj_list))) == NULL) 3661 return (ENOMEM); 3662 3663 /* 3664 * Get the root directory dnode. 3665 */ 3666 rc = objset_get_dnode(spa, &mount->objset, MASTER_NODE_OBJ, &dn); 3667 if (rc) { 3668 free(entry); 3669 return (rc); 3670 } 3671 3672 rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, sizeof(objnum), 1, &objnum); 3673 if (rc) { 3674 free(entry); 3675 return (rc); 3676 } 3677 entry->objnum = objnum; 3678 STAILQ_INSERT_HEAD(&on_cache, entry, entry); 3679 3680 rc = objset_get_dnode(spa, &mount->objset, objnum, &dn); 3681 if (rc != 0) 3682 goto done; 3683 3684 p = upath; 3685 while (p && *p) { 3686 rc = objset_get_dnode(spa, &mount->objset, objnum, &dn); 3687 if (rc != 0) 3688 goto done; 3689 3690 while (*p == '/') 3691 p++; 3692 if (*p == '\0') 3693 break; 3694 q = p; 3695 while (*q != '\0' && *q != '/') 3696 q++; 3697 3698 /* skip dot */ 3699 if (p + 1 == q && p[0] == '.') { 3700 p++; 3701 continue; 3702 } 3703 /* double dot */ 3704 if (p + 2 == q && p[0] == '.' && p[1] == '.') { 3705 p += 2; 3706 if (STAILQ_FIRST(&on_cache) == 3707 STAILQ_LAST(&on_cache, obj_list, entry)) { 3708 rc = ENOENT; 3709 goto done; 3710 } 3711 entry = STAILQ_FIRST(&on_cache); 3712 STAILQ_REMOVE_HEAD(&on_cache, entry); 3713 free(entry); 3714 objnum = (STAILQ_FIRST(&on_cache))->objnum; 3715 continue; 3716 } 3717 if (q - p + 1 > sizeof(element)) { 3718 rc = ENAMETOOLONG; 3719 goto done; 3720 } 3721 memcpy(element, p, q - p); 3722 element[q - p] = 0; 3723 p = q; 3724 3725 if ((rc = zfs_dnode_stat(spa, &dn, &sb)) != 0) 3726 goto done; 3727 if (!S_ISDIR(sb.st_mode)) { 3728 rc = ENOTDIR; 3729 goto done; 3730 } 3731 3732 rc = zap_lookup(spa, &dn, element, sizeof (objnum), 1, &objnum); 3733 if (rc) 3734 goto done; 3735 objnum = ZFS_DIRENT_OBJ(objnum); 3736 3737 if ((entry = malloc(sizeof(struct obj_list))) == NULL) { 3738 rc = ENOMEM; 3739 goto done; 3740 } 3741 entry->objnum = objnum; 3742 STAILQ_INSERT_HEAD(&on_cache, entry, entry); 3743 rc = objset_get_dnode(spa, &mount->objset, objnum, &dn); 3744 if (rc) 3745 goto done; 3746 3747 /* 3748 * Check for symlink. 3749 */ 3750 rc = zfs_dnode_stat(spa, &dn, &sb); 3751 if (rc) 3752 goto done; 3753 if (S_ISLNK(sb.st_mode)) { 3754 if (symlinks_followed > 10) { 3755 rc = EMLINK; 3756 goto done; 3757 } 3758 symlinks_followed++; 3759 3760 /* 3761 * Read the link value and copy the tail of our 3762 * current path onto the end. 3763 */ 3764 if (sb.st_size + strlen(p) + 1 > sizeof(path)) { 3765 rc = ENAMETOOLONG; 3766 goto done; 3767 } 3768 strcpy(&path[sb.st_size], p); 3769 3770 rc = zfs_dnode_readlink(spa, &dn, path, sb.st_size); 3771 if (rc != 0) 3772 goto done; 3773 3774 /* 3775 * Restart with the new path, starting either at 3776 * the root or at the parent depending whether or 3777 * not the link is relative. 3778 */ 3779 p = path; 3780 if (*p == '/') { 3781 while (STAILQ_FIRST(&on_cache) != 3782 STAILQ_LAST(&on_cache, obj_list, entry)) { 3783 entry = STAILQ_FIRST(&on_cache); 3784 STAILQ_REMOVE_HEAD(&on_cache, entry); 3785 free(entry); 3786 } 3787 } else { 3788 entry = STAILQ_FIRST(&on_cache); 3789 STAILQ_REMOVE_HEAD(&on_cache, entry); 3790 free(entry); 3791 } 3792 objnum = (STAILQ_FIRST(&on_cache))->objnum; 3793 } 3794 } 3795 3796 *dnode = dn; 3797 done: 3798 STAILQ_FOREACH_SAFE(entry, &on_cache, entry, tentry) 3799 free(entry); 3800 return (rc); 3801 } 3802