/*-
 * Copyright (c) 2007 Doug Rabson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
/*
 * Stand-alone ZFS file reader.
 */

#include <stdbool.h>
#include <sys/endian.h>
#include <sys/stat.h>
#include <sys/stdint.h>
#include <sys/list.h>
#include <sys/zfs_bootenv.h>
#include <machine/_inttypes.h>

#include "zfsimpl.h"
#include "zfssubr.c"

#ifdef HAS_ZSTD_ZFS
extern int zstd_init(void);
#endif

struct zfsmount {
	char			*path;
	const spa_t		*spa;
	objset_phys_t		objset;
	uint64_t		rootobj;
	STAILQ_ENTRY(zfsmount)	next;
};

typedef STAILQ_HEAD(zfs_mnt_list, zfsmount) zfs_mnt_list_t;
static zfs_mnt_list_t zfsmount = STAILQ_HEAD_INITIALIZER(zfsmount);

/*
 * The indirect_child_t represents the vdev that we will read from, when we
 * need to read all copies of the data (e.g. for scrub or reconstruction).
 * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror),
 * ic_vdev is the same as is_vdev. However, for mirror top-level vdevs,
 * ic_vdev is a child of the mirror.
 */
typedef struct indirect_child {
	void *ic_data;
	vdev_t *ic_vdev;
} indirect_child_t;

/*
 * The indirect_split_t represents one mapped segment of an i/o to the
 * indirect vdev. For non-split (contiguously-mapped) blocks, there will be
 * only one indirect_split_t, with is_split_offset==0 and is_size==io_size.
 * For split blocks, there will be several of these.
 */
typedef struct indirect_split {
	list_node_t is_node; /* link on iv_splits */

	/*
	 * is_split_offset is the offset into the i/o.
	 * This is the sum of the previous splits' is_size's.
	 */
	uint64_t is_split_offset;

	vdev_t *is_vdev; /* top-level vdev */
	uint64_t is_target_offset; /* offset on is_vdev */
	uint64_t is_size;
	int is_children; /* number of entries in is_child[] */

	/*
	 * is_good_child is the child that we are currently using to
	 * attempt reconstruction.
	 */
	int is_good_child;

	indirect_child_t is_child[1]; /* variable-length */
} indirect_split_t;

/*
 * The indirect_vsd_t is associated with each i/o to the indirect vdev.
 * It is the "Vdev-Specific Data" in the zio_t's io_vsd.
 */
typedef struct indirect_vsd {
	boolean_t iv_split_block;
	boolean_t iv_reconstruct;

	list_t iv_splits; /* list of indirect_split_t's */
} indirect_vsd_t;

/*
 * List of all vdevs, chained through v_alllink.
 */
static vdev_list_t zfs_vdevs;

/*
 * List of ZFS features supported for read.
 */
static const char *features_for_read[] = {
	"com.datto:bookmark_v2",
	"com.datto:encryption",
	"com.datto:resilver_defer",
	"com.delphix:bookmark_written",
	"com.delphix:device_removal",
	"com.delphix:embedded_data",
	"com.delphix:extensible_dataset",
	"com.delphix:head_errlog",
	"com.delphix:hole_birth",
	"com.delphix:obsolete_counts",
	"com.delphix:spacemap_histogram",
	"com.delphix:spacemap_v2",
	"com.delphix:zpool_checkpoint",
	"com.intel:allocation_classes",
	"com.joyent:multi_vdev_crash_dump",
	"com.klarasystems:vdev_zaps_v2",
	"org.freebsd:zstd_compress",
	"org.illumos:lz4_compress",
	"org.illumos:sha512",
	"org.illumos:skein",
	"org.open-zfs:large_blocks",
	"org.openzfs:blake3",
	"org.zfsonlinux:allocation_classes",
	"org.zfsonlinux:large_dnode",
	NULL
};

/*
 * List of all pools, chained through spa_link.
 */
static spa_list_t zfs_pools;

static const dnode_phys_t *dnode_cache_obj;
static uint64_t dnode_cache_bn;
static char *dnode_cache_buf;

static int zio_read(const spa_t *spa, const blkptr_t *bp, void *buf);
static int zfs_get_root(const spa_t *spa, uint64_t *objid);
static int zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result);
static int zap_lookup(const spa_t *spa, const dnode_phys_t *dnode,
    const char *name, uint64_t integer_size, uint64_t num_integers,
    void *value);
static int objset_get_dnode(const spa_t *, const objset_phys_t *, uint64_t,
    dnode_phys_t *);
static int dnode_read(const spa_t *, const dnode_phys_t *, off_t, void *,
    size_t);
static int vdev_indirect_read(vdev_t *, const blkptr_t *, void *, off_t,
    size_t);
static int vdev_mirror_read(vdev_t *, const blkptr_t *, void *, off_t, size_t);
vdev_indirect_mapping_t *vdev_indirect_mapping_open(spa_t *, objset_phys_t *,
    uint64_t);
vdev_indirect_mapping_entry_phys_t *
    vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *, uint64_t,
    uint64_t, uint64_t *);

static void
zfs_init(void)
{
	STAILQ_INIT(&zfs_vdevs);
	STAILQ_INIT(&zfs_pools);

	dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);

	zfs_init_crc();
#ifdef HAS_ZSTD_ZFS
	zstd_init();
#endif
}

static int
nvlist_check_features_for_read(nvlist_t *nvl)
{
	nvlist_t *features = NULL;
	nvs_data_t *data;
	nvp_header_t *nvp;
	nv_string_t *nvp_name;
	int rc;

	rc = nvlist_find(nvl, ZPOOL_CONFIG_FEATURES_FOR_READ,
	    DATA_TYPE_NVLIST, NULL, &features, NULL);
	switch (rc) {
	case 0:
		break;		/* Continue with checks */

	case ENOENT:
		return (0);	/* All features are disabled */

	default:
		return (rc);	/* Error while reading nvlist */
	}

	data = (nvs_data_t *)features->nv_data;
	nvp = &data->nvl_pair;	/* first pair in nvlist */

	while (nvp->encoded_size != 0 && nvp->decoded_size != 0) {
		int i, found;

		nvp_name = (nv_string_t *)((uintptr_t)nvp + sizeof(*nvp));
		found = 0;

		for (i = 0; features_for_read[i] != NULL; i++) {
			if (memcmp(nvp_name->nv_data, features_for_read[i],
			    nvp_name->nv_size) == 0) {
				found = 1;
				break;
			}
		}

		if (!found) {
			printf("ZFS: unsupported feature: %.*s\n",
			    nvp_name->nv_size, nvp_name->nv_data);
			rc = EIO;
		}
		nvp = (nvp_header_t *)((uint8_t *)nvp + nvp->encoded_size);
	}
	nvlist_destroy(features);

	return (rc);
}

static int
vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t size)
{
	size_t psize;
	int rc;

	if (vdev->v_phys_read == NULL)
		return (ENOTSUP);

	if (bp) {
		psize = BP_GET_PSIZE(bp);
	} else {
		psize = size;
	}

	rc = vdev->v_phys_read(vdev, vdev->v_priv, offset, buf, psize);
	if (rc == 0) {
		if (bp != NULL)
			rc = zio_checksum_verify(vdev->v_spa, bp, buf);
	}

	return (rc);
}

static int
vdev_write_phys(vdev_t *vdev, void *buf, off_t offset, size_t size)
{
	if (vdev->v_phys_write == NULL)
		return (ENOTSUP);

	return (vdev->v_phys_write(vdev, offset, buf, size));
}
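/*
 * Illustrative sketch only (not compiled): the media-access callbacks
 * consumed by vdev_read_phys()/vdev_write_phys() above are supplied by
 * the caller of vdev_probe(). A minimal v_phys_read might look like
 * this; "priv" and the hypothetical backing_read() are placeholders
 * for whatever the host environment provides.
 */
#if 0
static int
example_phys_read(vdev_t *vdev, void *priv, off_t offset, void *buf,
    size_t bytes)
{
	/* Read "bytes" bytes at device byte offset "offset" into "buf". */
	return (backing_read(priv, offset, buf, bytes)); /* 0 or errno */
}
#endif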
typedef struct remap_segment {
	vdev_t *rs_vd;
	uint64_t rs_offset;
	uint64_t rs_asize;
	uint64_t rs_split_offset;
	list_node_t rs_node;
} remap_segment_t;

static remap_segment_t *
rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
{
	remap_segment_t *rs = malloc(sizeof (remap_segment_t));

	if (rs != NULL) {
		rs->rs_vd = vd;
		rs->rs_offset = offset;
		rs->rs_asize = asize;
		rs->rs_split_offset = split_offset;
	}

	return (rs);
}

vdev_indirect_mapping_t *
vdev_indirect_mapping_open(spa_t *spa, objset_phys_t *os,
    uint64_t mapping_object)
{
	vdev_indirect_mapping_t *vim;
	vdev_indirect_mapping_phys_t *vim_phys;
	int rc;

	vim = calloc(1, sizeof (*vim));
	if (vim == NULL)
		return (NULL);

	vim->vim_dn = calloc(1, sizeof (*vim->vim_dn));
	if (vim->vim_dn == NULL) {
		free(vim);
		return (NULL);
	}

	rc = objset_get_dnode(spa, os, mapping_object, vim->vim_dn);
	if (rc != 0) {
		free(vim->vim_dn);
		free(vim);
		return (NULL);
	}

	vim->vim_spa = spa;
	vim->vim_phys = malloc(sizeof (*vim->vim_phys));
	if (vim->vim_phys == NULL) {
		free(vim->vim_dn);
		free(vim);
		return (NULL);
	}

	vim_phys = (vdev_indirect_mapping_phys_t *)DN_BONUS(vim->vim_dn);
	*vim->vim_phys = *vim_phys;

	vim->vim_objset = os;
	vim->vim_object = mapping_object;
	vim->vim_entries = NULL;

	vim->vim_havecounts =
	    (vim->vim_dn->dn_bonuslen > VDEV_INDIRECT_MAPPING_SIZE_V0);

	return (vim);
}

/*
 * Compare an offset with an indirect mapping entry; there are three
 * possible scenarios:
 *
 *     1. The offset is "less than" the mapping entry; meaning the
 *        offset is less than the source offset of the mapping entry. In
 *        this case, there is no overlap between the offset and the
 *        mapping entry and -1 will be returned.
 *
 *     2. The offset is "greater than" the mapping entry; meaning the
 *        offset is greater than the mapping entry's source offset plus
 *        the entry's size. In this case, there is no overlap between
 *        the offset and the mapping entry and 1 will be returned.
 *
 *        NOTE: If the offset is actually equal to the entry's offset
 *        plus size, this is considered to be "greater" than the entry,
 *        and this case applies (i.e. 1 will be returned). Thus, the
 *        entry's "range" can be considered to be inclusive at its
 *        start, but exclusive at its end: e.g. [src, src + size).
 *
 *     3. The last case to consider is if the offset actually falls
 *        within the mapping entry's range. If this is the case, the
 *        offset is considered to be "equal to" the mapping entry and
 *        0 will be returned.
 *
 *        NOTE: If the offset is equal to the entry's source offset,
 *        this case applies and 0 will be returned. If the offset is
 *        equal to the entry's source plus its size, this case does
 *        *not* apply (see "NOTE" above for scenario 2), and 1 will be
 *        returned.
 */
static int
dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem)
{
	const uint64_t *key = v_key;
	const vdev_indirect_mapping_entry_phys_t *array_elem =
	    v_array_elem;
	uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem);

	if (*key < src_offset) {
		return (-1);
	} else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) {
		return (0);
	} else {
		return (1);
	}
}
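/*
 * Worked example (illustrative only): for a mapping entry with source
 * range [0x1000, 0x1800) -- source offset 0x1000, asize 0x800 -- the
 * comparator yields:
 *
 *	key 0x0fff -> -1	(before the entry)
 *	key 0x1000 ->  0	(start is inclusive)
 *	key 0x17ff ->  0	(last mapped offset)
 *	key 0x1800 ->  1	(end is exclusive)
 *
 * This is exactly the ordering the binary search below relies on.
 */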
/*
 * Return array entry.
 */
static vdev_indirect_mapping_entry_phys_t *
vdev_indirect_mapping_entry(vdev_indirect_mapping_t *vim, uint64_t index)
{
	uint64_t size;
	off_t offset = 0;
	int rc;

	if (vim->vim_phys->vimp_num_entries == 0)
		return (NULL);

	if (vim->vim_entries == NULL) {
		uint64_t bsize;

		bsize = vim->vim_dn->dn_datablkszsec << SPA_MINBLOCKSHIFT;
		size = vim->vim_phys->vimp_num_entries *
		    sizeof (*vim->vim_entries);
		if (size > bsize) {
			size = bsize / sizeof (*vim->vim_entries);
			size *= sizeof (*vim->vim_entries);
		}
		vim->vim_entries = malloc(size);
		if (vim->vim_entries == NULL)
			return (NULL);
		vim->vim_num_entries = size / sizeof (*vim->vim_entries);
		offset = index * sizeof (*vim->vim_entries);
	}

	/* We have data in vim_entries */
	if (offset == 0) {
		if (index >= vim->vim_entry_offset &&
		    index <= vim->vim_entry_offset + vim->vim_num_entries) {
			index -= vim->vim_entry_offset;
			return (&vim->vim_entries[index]);
		}
		offset = index * sizeof (*vim->vim_entries);
	}

	vim->vim_entry_offset = index;
	size = vim->vim_num_entries * sizeof (*vim->vim_entries);
	rc = dnode_read(vim->vim_spa, vim->vim_dn, offset, vim->vim_entries,
	    size);
	if (rc != 0) {
		/* Read error, invalidate vim_entries. */
		free(vim->vim_entries);
		vim->vim_entries = NULL;
		return (NULL);
	}
	index -= vim->vim_entry_offset;
	return (&vim->vim_entries[index]);
}

/*
 * Returns the mapping entry for the given offset.
 *
 * It's possible that the given offset will not be in the mapping table
 * (i.e. no mapping entries contain this offset). The full OpenZFS
 * version of this function takes a "next_if_missing" parameter that
 * selects between returning NULL and returning the nearest entry whose
 * source offset is greater than the offset passed in. This stand-alone
 * reader omits that parameter: the binary search below simply returns
 * the last entry it examined, which overlaps the offset only if the
 * offset is actually mapped, so callers must only ask for mapped
 * offsets.
 */
static vdev_indirect_mapping_entry_phys_t *
vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim,
    uint64_t offset)
{
	ASSERT(vim->vim_phys->vimp_num_entries > 0);

	vdev_indirect_mapping_entry_phys_t *entry;

	uint64_t last = vim->vim_phys->vimp_num_entries - 1;
	uint64_t base = 0;

	/*
	 * We don't define these inside of the while loop because we use
	 * their value in the case that offset isn't in the mapping.
	 */
	uint64_t mid;
	int result;

	while (last >= base) {
		mid = base + ((last - base) >> 1);

		entry = vdev_indirect_mapping_entry(vim, mid);
		if (entry == NULL)
			break;
		result = dva_mapping_overlap_compare(&offset, entry);

		if (result == 0) {
			break;
		} else if (result < 0) {
			last = mid - 1;
		} else {
			base = mid + 1;
		}
	}
	return (entry);
}

/*
 * Given an indirect vdev and an extent on that vdev, it duplicates the
 * physical entries of the indirect mapping that correspond to the extent
 * to a new array and returns a pointer to it. In addition, copied_entries
 * is populated with the number of mapping entries that were duplicated.
 *
 * Finally, since we are doing an allocation, it is up to the caller to
 * free the array allocated in this function.
 */
vdev_indirect_mapping_entry_phys_t *
vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset,
    uint64_t asize, uint64_t *copied_entries)
{
	vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL;
	vdev_indirect_mapping_t *vim = vd->v_mapping;
	uint64_t entries = 0;

	vdev_indirect_mapping_entry_phys_t *first_mapping =
	    vdev_indirect_mapping_entry_for_offset(vim, offset);
	ASSERT3P(first_mapping, !=, NULL);

	vdev_indirect_mapping_entry_phys_t *m = first_mapping;
	while (asize > 0) {
		uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
		uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m);
		uint64_t inner_size = MIN(asize, size - inner_offset);

		offset += inner_size;
		asize -= inner_size;
		entries++;
		m++;
	}

	size_t copy_length = entries * sizeof (*first_mapping);
	duplicate_mappings = malloc(copy_length);
	if (duplicate_mappings != NULL)
		bcopy(first_mapping, duplicate_mappings, copy_length);
	else
		entries = 0;

	*copied_entries = entries;

	return (duplicate_mappings);
}
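/*
 * Worked example (illustrative only): with two adjacent mapping entries
 * covering source ranges [0, 64K) and [64K, 128K), duplicating the
 * extent offset = 48K, asize = 32K walks both entries: the first
 * contributes 16K (inner_offset 48K), the second the remaining 16K
 * (inner_offset 0), so *copied_entries is set to 2.
 */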
static vdev_t *
vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
	vdev_t *rvd;
	vdev_list_t *vlist;

	vlist = &spa->spa_root_vdev->v_children;
	STAILQ_FOREACH(rvd, vlist, v_childlink)
		if (rvd->v_id == vdev)
			break;

	return (rvd);
}

/*
 * This is a callback for vdev_indirect_remap() which allocates an
 * indirect_split_t for each split segment and adds it to iv_splits.
 */
static void
vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	int n = 1;
	zio_t *zio = arg;
	indirect_vsd_t *iv = zio->io_vsd;

	if (vd->v_read == vdev_indirect_read)
		return;

	if (vd->v_read == vdev_mirror_read)
		n = vd->v_nchildren;

	indirect_split_t *is =
	    malloc(offsetof(indirect_split_t, is_child[n]));
	if (is == NULL) {
		zio->io_error = ENOMEM;
		return;
	}
	bzero(is, offsetof(indirect_split_t, is_child[n]));

	is->is_children = n;
	is->is_size = size;
	is->is_split_offset = split_offset;
	is->is_target_offset = offset;
	is->is_vdev = vd;

	/*
	 * Note that we only consider multiple copies of the data for
	 * *mirror* vdevs. We don't for "replacing" or "spare" vdevs, even
	 * though they use the same ops as mirror, because there's only one
	 * "good" copy under the replacing/spare.
	 */
	if (vd->v_read == vdev_mirror_read) {
		int i = 0;
		vdev_t *kid;

		STAILQ_FOREACH(kid, &vd->v_children, v_childlink) {
			is->is_child[i++].ic_vdev = kid;
		}
	} else {
		is->is_child[0].ic_vdev = vd;
	}

	list_insert_tail(&iv->iv_splits, is);
}

static void
vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, void *arg)
{
	list_t stack;
	spa_t *spa = vd->v_spa;
	zio_t *zio = arg;
	remap_segment_t *rs;

	list_create(&stack, sizeof (remap_segment_t),
	    offsetof(remap_segment_t, rs_node));

	rs = rs_alloc(vd, offset, asize, 0);
	if (rs == NULL) {
		printf("vdev_indirect_remap: out of memory.\n");
		zio->io_error = ENOMEM;
	}
	for (; rs != NULL; rs = list_remove_head(&stack)) {
		vdev_t *v = rs->rs_vd;
		uint64_t num_entries = 0;
		/* vdev_indirect_mapping_t *vim = v->v_mapping; */
		vdev_indirect_mapping_entry_phys_t *mapping =
		    vdev_indirect_mapping_duplicate_adjacent_entries(v,
		    rs->rs_offset, rs->rs_asize, &num_entries);

		if (num_entries == 0)
			zio->io_error = ENOMEM;

		for (uint64_t i = 0; i < num_entries; i++) {
			vdev_indirect_mapping_entry_phys_t *m = &mapping[i];
			uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
			uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst);
			uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst);
			uint64_t inner_offset = rs->rs_offset -
			    DVA_MAPPING_GET_SRC_OFFSET(m);
			uint64_t inner_size =
			    MIN(rs->rs_asize, size - inner_offset);
			vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);

			if (dst_v->v_read == vdev_indirect_read) {
				remap_segment_t *o;

				o = rs_alloc(dst_v, dst_offset + inner_offset,
				    inner_size, rs->rs_split_offset);
				if (o == NULL) {
					printf("vdev_indirect_remap: "
					    "out of memory.\n");
					zio->io_error = ENOMEM;
					break;
				}

				list_insert_head(&stack, o);
			}
			vdev_indirect_gather_splits(rs->rs_split_offset, dst_v,
			    dst_offset + inner_offset,
			    inner_size, arg);

			/*
			 * vdev_indirect_gather_splits() can fail to
			 * allocate memory; we cannot recover from that.
			 */
			if (zio->io_error != 0)
				break;
			rs->rs_offset += inner_size;
			rs->rs_asize -= inner_size;
			rs->rs_split_offset += inner_size;
		}

		free(mapping);
		free(rs);
		if (zio->io_error != 0)
			break;
	}

	list_destroy(&stack);
}
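/*
 * Example of the remap walk above (illustrative only): if indirect vdev
 * A maps the requested extent onto vdev B, and B is itself an indirect
 * vdev (it was removed later), the B segment is pushed onto the stack
 * and resolved in a subsequent iteration. Only segments that land on
 * concrete (disk/mirror/raidz) vdevs become indirect_split_t entries,
 * because vdev_indirect_gather_splits() skips indirect vdevs.
 */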
static void
vdev_indirect_map_free(zio_t *zio)
{
	indirect_vsd_t *iv = zio->io_vsd;
	indirect_split_t *is;

	while ((is = list_head(&iv->iv_splits)) != NULL) {
		for (int c = 0; c < is->is_children; c++) {
			indirect_child_t *ic = &is->is_child[c];
			free(ic->ic_data);
		}
		list_remove(&iv->iv_splits, is);
		free(is);
	}
	free(iv);
}

static int
vdev_indirect_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t bytes)
{
	zio_t zio;
	spa_t *spa = vdev->v_spa;
	indirect_vsd_t *iv;
	indirect_split_t *first;
	int rc = EIO;

	iv = calloc(1, sizeof(*iv));
	if (iv == NULL)
		return (ENOMEM);

	list_create(&iv->iv_splits,
	    sizeof (indirect_split_t), offsetof(indirect_split_t, is_node));

	bzero(&zio, sizeof(zio));
	zio.io_spa = spa;
	zio.io_bp = (blkptr_t *)bp;
	zio.io_data = buf;
	zio.io_size = bytes;
	zio.io_offset = offset;
	zio.io_vd = vdev;
	zio.io_vsd = iv;

	if (vdev->v_mapping == NULL) {
		vdev_indirect_config_t *vic;

		vic = &vdev->vdev_indirect_config;
		vdev->v_mapping = vdev_indirect_mapping_open(spa,
		    spa->spa_mos, vic->vic_mapping_object);
	}

	vdev_indirect_remap(vdev, offset, bytes, &zio);
	if (zio.io_error != 0)
		return (zio.io_error);

	first = list_head(&iv->iv_splits);
	if (first->is_size == zio.io_size) {
		/*
		 * This is not a split block; we are pointing to the entire
		 * data, which will checksum the same as the original data.
		 * Pass the BP down so that the child i/o can verify the
		 * checksum, and try a different location if available
		 * (e.g. on a mirror).
		 *
		 * While this special case could be handled the same as the
		 * general (split block) case, doing it this way ensures
		 * that the vast majority of blocks on indirect vdevs
		 * (which are not split) are handled identically to blocks
		 * on non-indirect vdevs. This allows us to be less strict
		 * about performance in the general (but rare) case.
		 */
		rc = first->is_vdev->v_read(first->is_vdev, zio.io_bp,
		    zio.io_data, first->is_target_offset, bytes);
	} else {
		iv->iv_split_block = B_TRUE;
		/*
		 * Read one copy of each split segment, from the
		 * top-level vdev. Since we don't know the
		 * checksum of each split individually, the child
		 * zio can't ensure that we get the right data.
		 * E.g. if it's a mirror, it will just read from a
		 * random (healthy) leaf vdev. We have to verify
		 * the checksum in vdev_indirect_io_done().
		 */
		for (indirect_split_t *is = list_head(&iv->iv_splits);
		    is != NULL; is = list_next(&iv->iv_splits, is)) {
			char *ptr = zio.io_data;

			rc = is->is_vdev->v_read(is->is_vdev, zio.io_bp,
			    ptr + is->is_split_offset, is->is_target_offset,
			    is->is_size);
		}
		if (zio_checksum_verify(spa, zio.io_bp, zio.io_data))
			rc = ECKSUM;
		else
			rc = 0;
	}

	vdev_indirect_map_free(&zio);
	if (rc == 0)
		rc = zio.io_error;

	return (rc);
}

static int
vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t bytes)
{

	return (vdev_read_phys(vdev, bp, buf,
	    offset + VDEV_LABEL_START_SIZE, bytes));
}

static int
vdev_missing_read(vdev_t *vdev __unused, const blkptr_t *bp __unused,
    void *buf __unused, off_t offset __unused, size_t bytes __unused)
{

	return (ENOTSUP);
}

static int
vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t bytes)
{
	vdev_t *kid;
	int rc;

	rc = EIO;
	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
		if (kid->v_state != VDEV_STATE_HEALTHY)
			continue;
		rc = kid->v_read(kid, bp, buf, offset, bytes);
		if (!rc)
			return (0);
	}

	return (rc);
}

static int
vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t bytes)
{
	vdev_t *kid;

	/*
	 * Here we should have two kids:
	 * The first is the vdev being replaced; it is the only one we can
	 * trust to have valid data, but it might not be present.
	 * The second is the vdev we are replacing it with. It is most
	 * likely healthy, but we can't trust it has the needed data, so we
	 * won't use it.
	 */
	kid = STAILQ_FIRST(&vdev->v_children);
	if (kid == NULL)
		return (EIO);
	if (kid->v_state != VDEV_STATE_HEALTHY)
		return (EIO);
	return (kid->v_read(kid, bp, buf, offset, bytes));
}

static vdev_t *
vdev_find(uint64_t guid)
{
	vdev_t *vdev;

	STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
		if (vdev->v_guid == guid)
			return (vdev);

	return (NULL);
}

static vdev_t *
vdev_create(uint64_t guid, vdev_read_t *_read)
{
	vdev_t *vdev;
	vdev_indirect_config_t *vic;

	vdev = calloc(1, sizeof(vdev_t));
	if (vdev != NULL) {
		STAILQ_INIT(&vdev->v_children);
		vdev->v_guid = guid;
		vdev->v_read = _read;

		/*
		 * The root vdev has no read function; we use that fact to
		 * skip setting up data we do not need for the root vdev.
		 * We only point to the root vdev from the spa.
		 */
		if (_read != NULL) {
			vic = &vdev->vdev_indirect_config;
			vic->vic_prev_indirect_vdev = UINT64_MAX;
			STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
		}
	}

	return (vdev);
}

static void
vdev_set_initial_state(vdev_t *vdev, const nvlist_t *nvlist)
{
	uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
	uint64_t is_log;

	is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
	is_log = 0;
	(void) nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, NULL,
	    &is_offline, NULL);
	(void) nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, NULL,
	    &is_removed, NULL);
	(void) nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, NULL,
	    &is_faulted, NULL);
	(void) nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64,
	    NULL, &is_degraded, NULL);
	(void) nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64,
	    NULL, &isnt_present, NULL);
	(void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, NULL,
	    &is_log, NULL);

	if (is_offline != 0)
		vdev->v_state = VDEV_STATE_OFFLINE;
	else if (is_removed != 0)
		vdev->v_state = VDEV_STATE_REMOVED;
	else if (is_faulted != 0)
		vdev->v_state = VDEV_STATE_FAULTED;
	else if (is_degraded != 0)
		vdev->v_state = VDEV_STATE_DEGRADED;
	else if (isnt_present != 0)
		vdev->v_state = VDEV_STATE_CANT_OPEN;

	vdev->v_islog = is_log != 0;
}

static int
vdev_init(uint64_t guid, const nvlist_t *nvlist, vdev_t **vdevp)
{
	uint64_t id, ashift, asize, nparity;
	const char *path;
	const char *type;
	int len, pathlen;
	char *name;
	vdev_t *vdev;

	if (nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, NULL, &id,
	    NULL) ||
	    nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING, NULL,
	    &type, &len)) {
		return (ENOENT);
	}

	if (memcmp(type, VDEV_TYPE_MIRROR, len) != 0 &&
	    memcmp(type, VDEV_TYPE_DISK, len) != 0 &&
#ifdef ZFS_TEST
	    memcmp(type, VDEV_TYPE_FILE, len) != 0 &&
#endif
	    memcmp(type, VDEV_TYPE_RAIDZ, len) != 0 &&
	    memcmp(type, VDEV_TYPE_INDIRECT, len) != 0 &&
	    memcmp(type, VDEV_TYPE_REPLACING, len) != 0 &&
	    memcmp(type, VDEV_TYPE_HOLE, len) != 0) {
		printf("ZFS: can only boot from disk, mirror, raidz1, "
		    "raidz2 and raidz3 vdevs, got: %.*s\n", len, type);
		return (EIO);
	}

	if (memcmp(type, VDEV_TYPE_MIRROR, len) == 0)
		vdev = vdev_create(guid, vdev_mirror_read);
	else if (memcmp(type, VDEV_TYPE_RAIDZ, len) == 0)
		vdev = vdev_create(guid, vdev_raidz_read);
	else if (memcmp(type, VDEV_TYPE_REPLACING, len) == 0)
		vdev = vdev_create(guid, vdev_replacing_read);
	else if (memcmp(type, VDEV_TYPE_INDIRECT, len) == 0) {
		vdev_indirect_config_t *vic;

		vdev = vdev_create(guid, vdev_indirect_read);
		if (vdev != NULL) {
			vdev->v_state = VDEV_STATE_HEALTHY;
			vic = &vdev->vdev_indirect_config;

			nvlist_find(nvlist,
			    ZPOOL_CONFIG_INDIRECT_OBJECT,
			    DATA_TYPE_UINT64,
			    NULL, &vic->vic_mapping_object, NULL);
			nvlist_find(nvlist,
			    ZPOOL_CONFIG_INDIRECT_BIRTHS,
			    DATA_TYPE_UINT64,
			    NULL, &vic->vic_births_object, NULL);
			nvlist_find(nvlist,
			    ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
			    DATA_TYPE_UINT64,
			    NULL, &vic->vic_prev_indirect_vdev, NULL);
		}
	} else if (memcmp(type, VDEV_TYPE_HOLE, len) == 0) {
		vdev = vdev_create(guid, vdev_missing_read);
	} else {
		vdev = vdev_create(guid, vdev_disk_read);
	}

	if (vdev == NULL)
		return (ENOMEM);

	vdev_set_initial_state(vdev, nvlist);
	vdev->v_id = id;
	if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
	    DATA_TYPE_UINT64, NULL, &ashift, NULL) == 0)
		vdev->v_ashift = ashift;

	if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE,
	    DATA_TYPE_UINT64, NULL, &asize, NULL) == 0) {
		vdev->v_psize = asize +
		    VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
	}

	if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
	    DATA_TYPE_UINT64, NULL, &nparity, NULL) == 0)
		vdev->v_nparity = nparity;

	if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
	    DATA_TYPE_STRING, NULL, &path, &pathlen) == 0) {
		char prefix[] = "/dev/";

		len = strlen(prefix);
		if (len < pathlen && memcmp(path, prefix, len) == 0) {
			path += len;
			pathlen -= len;
		}
		name = malloc(pathlen + 1);
		bcopy(path, name, pathlen);
		name[pathlen] = '\0';
		vdev->v_name = name;
	} else {
		name = NULL;
		if (memcmp(type, VDEV_TYPE_RAIDZ, len) == 0) {
			if (vdev->v_nparity < 1 ||
			    vdev->v_nparity > 3) {
				printf("ZFS: invalid raidz parity: %d\n",
				    vdev->v_nparity);
				return (EIO);
			}
			(void) asprintf(&name, "%.*s%d-%" PRIu64, len, type,
			    vdev->v_nparity, id);
		} else {
			(void) asprintf(&name, "%.*s-%" PRIu64, len, type, id);
		}
		vdev->v_name = name;
	}
	*vdevp = vdev;
	return (0);
}

/*
 * Find the slot for a vdev. We return either NULL, to signal that
 * STAILQ_INSERT_HEAD should be used, or the list element after which
 * the new vdev should be inserted with STAILQ_INSERT_AFTER.
 */
static vdev_t *
vdev_find_previous(vdev_t *top_vdev, vdev_t *vdev)
{
	vdev_t *v, *previous;

	if (STAILQ_EMPTY(&top_vdev->v_children))
		return (NULL);

	previous = NULL;
	STAILQ_FOREACH(v, &top_vdev->v_children, v_childlink) {
		if (v->v_id > vdev->v_id)
			return (previous);

		if (v->v_id == vdev->v_id)
			return (v);

		if (v->v_id < vdev->v_id)
			previous = v;
	}
	return (previous);
}

static size_t
vdev_child_count(vdev_t *vdev)
{
	vdev_t *v;
	size_t count;

	count = 0;
	STAILQ_FOREACH(v, &vdev->v_children, v_childlink) {
		count++;
	}
	return (count);
}

/*
 * Insert a vdev into the top_vdev children list. The list is ordered
 * by v_id.
 */
static void
vdev_insert(vdev_t *top_vdev, vdev_t *vdev)
{
	vdev_t *previous;
	size_t count;

	/*
	 * Top-level vdevs can appear in any order, depending on how the
	 * firmware presents the disk devices. We insert each vdev so as
	 * to keep the list ordered by v_id; that way we can use either
	 * STAILQ_INSERT_HEAD or STAILQ_INSERT_AFTER, as STAILQ has no
	 * "insert before" operation.
	 */
	previous = vdev_find_previous(top_vdev, vdev);

	if (previous == NULL) {
		STAILQ_INSERT_HEAD(&top_vdev->v_children, vdev, v_childlink);
	} else if (previous->v_id == vdev->v_id) {
		/*
		 * This vdev was configured from label config,
		 * do not insert duplicate.
		 */
		return;
	} else {
		STAILQ_INSERT_AFTER(&top_vdev->v_children, previous, vdev,
		    v_childlink);
	}

	count = vdev_child_count(top_vdev);
	if (top_vdev->v_nchildren < count)
		top_vdev->v_nchildren = count;
}
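/*
 * Example (illustrative only): if the firmware reports the children with
 * v_id = 2, 0, 1 in that order, the list evolves as [2], [0, 2],
 * [0, 1, 2]: id 0 goes to the head, id 1 is inserted after 0. A second
 * sighting of an id already present is dropped as a duplicate.
 */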
static int
vdev_from_nvlist(spa_t *spa, uint64_t top_guid, const nvlist_t *nvlist)
{
	vdev_t *top_vdev, *vdev;
	nvlist_t **kids = NULL;
	int rc, nkids;

	/* Get top vdev. */
	top_vdev = vdev_find(top_guid);
	if (top_vdev == NULL) {
		rc = vdev_init(top_guid, nvlist, &top_vdev);
		if (rc != 0)
			return (rc);
		top_vdev->v_spa = spa;
		top_vdev->v_top = top_vdev;
		vdev_insert(spa->spa_root_vdev, top_vdev);
	}

	/* Add children if there are any. */
	rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
	    &nkids, &kids, NULL);
	if (rc == 0) {
		for (int i = 0; i < nkids; i++) {
			uint64_t guid;

			rc = nvlist_find(kids[i], ZPOOL_CONFIG_GUID,
			    DATA_TYPE_UINT64, NULL, &guid, NULL);
			if (rc != 0)
				goto done;

			rc = vdev_init(guid, kids[i], &vdev);
			if (rc != 0)
				goto done;

			vdev->v_spa = spa;
			vdev->v_top = top_vdev;
			vdev_insert(top_vdev, vdev);
		}
	} else {
		/*
		 * nvlist_find() returns an error when there are no
		 * children; reset it, since leaf devices legitimately
		 * have none.
		 */
		rc = 0;
	}
done:
	if (kids != NULL) {
		for (int i = 0; i < nkids; i++)
			nvlist_destroy(kids[i]);
		free(kids);
	}

	return (rc);
}

static int
vdev_init_from_label(spa_t *spa, const nvlist_t *nvlist)
{
	uint64_t pool_guid, top_guid;
	nvlist_t *vdevs;
	int rc;

	if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
	    NULL, &pool_guid, NULL) ||
	    nvlist_find(nvlist, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64,
	    NULL, &top_guid, NULL) ||
	    nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
	    NULL, &vdevs, NULL)) {
		printf("ZFS: can't find vdev details\n");
		return (ENOENT);
	}

	rc = vdev_from_nvlist(spa, top_guid, vdevs);
	nvlist_destroy(vdevs);
	return (rc);
}

static void
vdev_set_state(vdev_t *vdev)
{
	vdev_t *kid;
	int good_kids;
	int bad_kids;

	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
		vdev_set_state(kid);
	}

	/*
	 * A mirror or raidz is healthy if all its kids are healthy. A
	 * mirror is degraded if any of its kids is healthy; a raidz
	 * is degraded if no more than nparity kids are offline.
	 */
	if (STAILQ_FIRST(&vdev->v_children)) {
		good_kids = 0;
		bad_kids = 0;
		STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
			if (kid->v_state == VDEV_STATE_HEALTHY)
				good_kids++;
			else
				bad_kids++;
		}
		if (bad_kids == 0) {
			vdev->v_state = VDEV_STATE_HEALTHY;
		} else {
			if (vdev->v_read == vdev_mirror_read) {
				if (good_kids) {
					vdev->v_state = VDEV_STATE_DEGRADED;
				} else {
					vdev->v_state = VDEV_STATE_OFFLINE;
				}
			} else if (vdev->v_read == vdev_raidz_read) {
				if (bad_kids > vdev->v_nparity) {
					vdev->v_state = VDEV_STATE_OFFLINE;
				} else {
					vdev->v_state = VDEV_STATE_DEGRADED;
				}
			}
		}
	}
}
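/*
 * Example (illustrative only): a raidz1 top-level vdev with four
 * children, one of them faulted, ends up DEGRADED (bad_kids = 1 is not
 * greater than nparity = 1); with two faulted children it goes OFFLINE.
 * A two-way mirror stays DEGRADED as long as at least one child is
 * HEALTHY.
 */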
static int
vdev_update_from_nvlist(uint64_t top_guid, const nvlist_t *nvlist)
{
	vdev_t *vdev;
	nvlist_t **kids = NULL;
	int rc, nkids;

	/* Update top vdev. */
	vdev = vdev_find(top_guid);
	if (vdev != NULL)
		vdev_set_initial_state(vdev, nvlist);

	/* Update children if there are any. */
	rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
	    &nkids, &kids, NULL);
	if (rc == 0) {
		for (int i = 0; i < nkids; i++) {
			uint64_t guid;

			rc = nvlist_find(kids[i], ZPOOL_CONFIG_GUID,
			    DATA_TYPE_UINT64, NULL, &guid, NULL);
			if (rc != 0)
				break;

			vdev = vdev_find(guid);
			if (vdev != NULL)
				vdev_set_initial_state(vdev, kids[i]);
		}
	} else {
		rc = 0;
	}
	if (kids != NULL) {
		for (int i = 0; i < nkids; i++)
			nvlist_destroy(kids[i]);
		free(kids);
	}

	return (rc);
}

static int
vdev_init_from_nvlist(spa_t *spa, const nvlist_t *nvlist)
{
	uint64_t pool_guid, vdev_children;
	nvlist_t *vdevs = NULL, **kids = NULL;
	int rc, nkids;

	if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
	    NULL, &pool_guid, NULL) ||
	    nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_CHILDREN, DATA_TYPE_UINT64,
	    NULL, &vdev_children, NULL) ||
	    nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
	    NULL, &vdevs, NULL)) {
		printf("ZFS: can't find vdev details\n");
		return (ENOENT);
	}

	/* Wrong guid?! */
	if (spa->spa_guid != pool_guid) {
		nvlist_destroy(vdevs);
		return (EINVAL);
	}

	spa->spa_root_vdev->v_nchildren = vdev_children;

	rc = nvlist_find(vdevs, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
	    &nkids, &kids, NULL);
	nvlist_destroy(vdevs);

	/*
	 * MOS config has at least one child for root vdev.
	 */
	if (rc != 0)
		return (rc);

	for (int i = 0; i < nkids; i++) {
		uint64_t guid;
		vdev_t *vdev;

		rc = nvlist_find(kids[i], ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
		    NULL, &guid, NULL);
		if (rc != 0)
			break;
		vdev = vdev_find(guid);
		/*
		 * Top level vdev is missing, create it.
		 */
		if (vdev == NULL)
			rc = vdev_from_nvlist(spa, guid, kids[i]);
		else
			rc = vdev_update_from_nvlist(guid, kids[i]);
		if (rc != 0)
			break;
	}
	if (kids != NULL) {
		for (int i = 0; i < nkids; i++)
			nvlist_destroy(kids[i]);
		free(kids);
	}

	/*
	 * Re-evaluate top-level vdev state.
	 */
	vdev_set_state(spa->spa_root_vdev);

	return (rc);
}

static spa_t *
spa_find_by_guid(uint64_t guid)
{
	spa_t *spa;

	STAILQ_FOREACH(spa, &zfs_pools, spa_link)
		if (spa->spa_guid == guid)
			return (spa);

	return (NULL);
}

static spa_t *
spa_find_by_name(const char *name)
{
	spa_t *spa;

	STAILQ_FOREACH(spa, &zfs_pools, spa_link)
		if (strcmp(spa->spa_name, name) == 0)
			return (spa);

	return (NULL);
}

static spa_t *
spa_create(uint64_t guid, const char *name)
{
	spa_t *spa;

	if ((spa = calloc(1, sizeof(spa_t))) == NULL)
		return (NULL);
	if ((spa->spa_name = strdup(name)) == NULL) {
		free(spa);
		return (NULL);
	}
	spa->spa_uberblock = &spa->spa_uberblock_master;
	spa->spa_mos = &spa->spa_mos_master;
	spa->spa_guid = guid;
	spa->spa_root_vdev = vdev_create(guid, NULL);
	if (spa->spa_root_vdev == NULL) {
		free(spa->spa_name);
		free(spa);
		return (NULL);
	}
	spa->spa_root_vdev->v_name = strdup("root");
	STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);

	return (spa);
}

static const char *
state_name(vdev_state_t state)
{
	static const char *names[] = {
		"UNKNOWN",
		"CLOSED",
		"OFFLINE",
		"REMOVED",
		"CANT_OPEN",
		"FAULTED",
		"DEGRADED",
		"ONLINE"
	};
	return (names[state]);
}

#ifdef BOOT2

#define pager_printf printf

#else

static int
pager_printf(const char *fmt, ...)
{
	char line[80];
	va_list args;

	va_start(args, fmt);
	vsnprintf(line, sizeof(line), fmt, args);
	va_end(args);
	return (pager_output(line));
}

#endif

#define	STATUS_FORMAT	" %s %s\n"

static int
print_state(int indent, const char *name, vdev_state_t state)
{
	int i;
	char buf[512];

	buf[0] = 0;
	for (i = 0; i < indent; i++)
		strcat(buf, " ");
	strcat(buf, name);
	return (pager_printf(STATUS_FORMAT, buf, state_name(state)));
}

static int
vdev_status(vdev_t *vdev, int indent)
{
	vdev_t *kid;
	int ret;

	if (vdev->v_islog) {
		(void) pager_output(" logs\n");
		indent++;
	}

	ret = print_state(indent, vdev->v_name, vdev->v_state);
	if (ret != 0)
		return (ret);

	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
		ret = vdev_status(kid, indent + 1);
		if (ret != 0)
			return (ret);
	}
	return (ret);
}

static int
spa_status(spa_t *spa)
{
	static char bootfs[ZFS_MAXNAMELEN];
	uint64_t rootid;
	vdev_list_t *vlist;
	vdev_t *vdev;
	int good_kids, bad_kids, degraded_kids, ret;
	vdev_state_t state;

	ret = pager_printf(" pool: %s\n", spa->spa_name);
	if (ret != 0)
		return (ret);

	if (zfs_get_root(spa, &rootid) == 0 &&
	    zfs_rlookup(spa, rootid, bootfs) == 0) {
		if (bootfs[0] == '\0')
			ret = pager_printf("bootfs: %s\n", spa->spa_name);
		else
			ret = pager_printf("bootfs: %s/%s\n", spa->spa_name,
			    bootfs);
		if (ret != 0)
			return (ret);
	}
	ret = pager_printf("config:\n\n");
	if (ret != 0)
		return (ret);
	ret = pager_printf(STATUS_FORMAT, "NAME", "STATE");
	if (ret != 0)
		return (ret);

	good_kids = 0;
	degraded_kids = 0;
	bad_kids = 0;
	vlist = &spa->spa_root_vdev->v_children;
	STAILQ_FOREACH(vdev, vlist, v_childlink) {
		if (vdev->v_state == VDEV_STATE_HEALTHY)
			good_kids++;
		else if (vdev->v_state == VDEV_STATE_DEGRADED)
			degraded_kids++;
		else
			bad_kids++;
	}

	state = VDEV_STATE_CLOSED;
	if (good_kids > 0 && (degraded_kids + bad_kids) == 0)
		state = VDEV_STATE_HEALTHY;
	else if ((good_kids + degraded_kids) > 0)
		state = VDEV_STATE_DEGRADED;

	ret = print_state(0, spa->spa_name, state);
	if (ret != 0)
		return (ret);

	STAILQ_FOREACH(vdev, vlist, v_childlink) {
		ret = vdev_status(vdev, 1);
		if (ret != 0)
			return (ret);
	}
	return (ret);
}

static int
spa_all_status(void)
{
	spa_t *spa;
	int first = 1, ret = 0;

	STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
		if (!first) {
			ret = pager_printf("\n");
			if (ret != 0)
				return (ret);
		}
		first = 0;
		ret = spa_status(spa);
		if (ret != 0)
			return (ret);
	}
	return (ret);
}

static uint64_t
vdev_label_offset(uint64_t psize, int l, uint64_t offset)
{
	uint64_t label_offset;

	if (l < VDEV_LABELS / 2)
		label_offset = 0;
	else
		label_offset = psize - VDEV_LABELS * sizeof (vdev_label_t);

	return (offset + l * sizeof (vdev_label_t) + label_offset);
}
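/*
 * Worked example (illustrative only): with 256KB labels (sizeof
 * (vdev_label_t)) and VDEV_LABELS = 4, a request for offset 0 yields
 *
 *	l = 0: 0		l = 2: psize - 512KB
 *	l = 1: 256KB		l = 3: psize - 256KB
 *
 * i.e. two labels at the front of the device and two at the end.
 */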
static int
vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2)
{
	unsigned int seq1 = 0;
	unsigned int seq2 = 0;
	int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg);

	if (cmp != 0)
		return (cmp);

	cmp = AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp);
	if (cmp != 0)
		return (cmp);

	if (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1))
		seq1 = MMP_SEQ(ub1);

	if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2))
		seq2 = MMP_SEQ(ub2);

	return (AVL_CMP(seq1, seq2));
}
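/*
 * Example (illustrative only): an uberblock with ub_txg 101 beats one
 * with ub_txg 100 regardless of timestamps; at equal txg the newer
 * ub_timestamp wins, and only if both match does the MMP sequence
 * number break the tie.
 */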
static int
uberblock_verify(uberblock_t *ub)
{
	if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC)) {
		byteswap_uint64_array(ub, sizeof (uberblock_t));
	}

	if (ub->ub_magic != UBERBLOCK_MAGIC ||
	    !SPA_VERSION_IS_SUPPORTED(ub->ub_version))
		return (EINVAL);

	return (0);
}

static int
vdev_label_read(vdev_t *vd, int l, void *buf, uint64_t offset,
    size_t size)
{
	blkptr_t bp;
	off_t off;

	off = vdev_label_offset(vd->v_psize, l, offset);

	BP_ZERO(&bp);
	BP_SET_LSIZE(&bp, size);
	BP_SET_PSIZE(&bp, size);
	BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
	BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
	DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
	ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);

	return (vdev_read_phys(vd, &bp, buf, off, size));
}

/*
 * We need to be sure we write to the correct location.
 * Our vdev label consists of 4 fields:
 * pad1 (8k), reserved.
 * bootenv (8k), checksummed, previously reserved, may contain garbage.
 * vdev_phys (112k), checksummed.
 * uberblock ring (128k), checksummed.
 *
 * Since the bootenv area may contain garbage, we cannot reliably read it,
 * as we may get checksum errors.
 * The next best thing is vdev_phys - it is just after bootenv. It may
 * still be corrupted, but in that case we will miss this one write.
 */
static int
vdev_label_write_validate(vdev_t *vd, int l, uint64_t offset)
{
	uint64_t off, o_phys;
	void *buf;
	size_t size = VDEV_PHYS_SIZE;
	int rc;

	o_phys = offsetof(vdev_label_t, vl_vdev_phys);
	off = vdev_label_offset(vd->v_psize, l, o_phys);

	/* off should be 8K from bootenv */
	if (vdev_label_offset(vd->v_psize, l, offset) + VDEV_PAD_SIZE != off)
		return (EINVAL);

	buf = malloc(size);
	if (buf == NULL)
		return (ENOMEM);

	/* Read vdev_phys */
	rc = vdev_label_read(vd, l, buf, o_phys, size);
	free(buf);
	return (rc);
}

static int
vdev_label_write(vdev_t *vd, int l, vdev_boot_envblock_t *be, uint64_t offset)
{
	zio_checksum_info_t *ci;
	zio_cksum_t cksum;
	off_t off;
	size_t size = VDEV_PAD_SIZE;
	int rc;

	if (vd->v_phys_write == NULL)
		return (ENOTSUP);

	off = vdev_label_offset(vd->v_psize, l, offset);

	rc = vdev_label_write_validate(vd, l, offset);
	if (rc != 0) {
		return (rc);
	}

	ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL];
	be->vbe_zbt.zec_magic = ZEC_MAGIC;
	zio_checksum_label_verifier(&be->vbe_zbt.zec_cksum, off);
	ci->ci_func[0](be, size, NULL, &cksum);
	be->vbe_zbt.zec_cksum = cksum;

	return (vdev_write_phys(vd, be, off, size));
}

static int
vdev_write_bootenv_impl(vdev_t *vdev, vdev_boot_envblock_t *be)
{
	vdev_t *kid;
	int rv = 0, rc;

	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
		if (kid->v_state != VDEV_STATE_HEALTHY)
			continue;
		rc = vdev_write_bootenv_impl(kid, be);
		if (rv == 0)
			rv = rc;
	}

	/*
	 * Non-leaf vdevs do not have v_phys_write.
	 */
	if (vdev->v_phys_write == NULL)
		return (rv);

	for (int l = 0; l < VDEV_LABELS; l++) {
		rc = vdev_label_write(vdev, l, be,
		    offsetof(vdev_label_t, vl_be));
		if (rc != 0) {
			printf("failed to write bootenv to %s label %d: %d\n",
			    vdev->v_name ? vdev->v_name : "unknown", l, rc);
			rv = rc;
		}
	}
	return (rv);
}

int
vdev_write_bootenv(vdev_t *vdev, nvlist_t *nvl)
{
	vdev_boot_envblock_t *be;
	nvlist_t nv, *nvp;
	uint64_t version;
	int rv;

	if (nvl->nv_size > sizeof(be->vbe_bootenv))
		return (E2BIG);

	version = VB_RAW;
	nvp = vdev_read_bootenv(vdev);
	if (nvp != NULL) {
		nvlist_find(nvp, BOOTENV_VERSION, DATA_TYPE_UINT64, NULL,
		    &version, NULL);
		nvlist_destroy(nvp);
	}

	be = calloc(1, sizeof(*be));
	if (be == NULL)
		return (ENOMEM);

	be->vbe_version = version;
	switch (version) {
	case VB_RAW:
		/*
		 * If there is no envmap, we will just wipe bootenv.
		 */
		nvlist_find(nvl, GRUB_ENVMAP, DATA_TYPE_STRING, NULL,
		    be->vbe_bootenv, NULL);
		rv = 0;
		break;

	case VB_NVLIST:
		nv.nv_header = nvl->nv_header;
		nv.nv_asize = nvl->nv_asize;
		nv.nv_size = nvl->nv_size;

		bcopy(&nv.nv_header, be->vbe_bootenv, sizeof(nv.nv_header));
		nv.nv_data = be->vbe_bootenv + sizeof(nvs_header_t);
		bcopy(nvl->nv_data, nv.nv_data, nv.nv_size);
		rv = nvlist_export(&nv);
		break;

	default:
		rv = EINVAL;
		break;
	}

	if (rv == 0) {
		be->vbe_version = htobe64(be->vbe_version);
		rv = vdev_write_bootenv_impl(vdev, be);
	}
	free(be);
	return (rv);
}

/*
 * Read the bootenv area from the pool label, and return the nvlist found
 * there. We return from the first successful read.
 */
nvlist_t *
vdev_read_bootenv(vdev_t *vdev)
{
	vdev_t *kid;
	nvlist_t *benv;
	vdev_boot_envblock_t *be;
	char *command;
	bool ok;
	int rv;

	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
		if (kid->v_state != VDEV_STATE_HEALTHY)
			continue;

		benv = vdev_read_bootenv(kid);
		if (benv != NULL)
			return (benv);
	}

	be = malloc(sizeof (*be));
	if (be == NULL)
		return (NULL);

	rv = 0;
	for (int l = 0; l < VDEV_LABELS; l++) {
		rv = vdev_label_read(vdev, l, be,
		    offsetof(vdev_label_t, vl_be),
		    sizeof (*be));
		if (rv == 0)
			break;
	}
	if (rv != 0) {
		free(be);
		return (NULL);
	}

	be->vbe_version = be64toh(be->vbe_version);
	switch (be->vbe_version) {
	case VB_RAW:
		/*
		 * We have textual data in vbe_bootenv; create an nvlist
		 * with the key "envmap".
		 */
		benv = nvlist_create(NV_UNIQUE_NAME);
		if (benv != NULL) {
			if (*be->vbe_bootenv == '\0') {
				nvlist_add_uint64(benv, BOOTENV_VERSION,
				    VB_NVLIST);
				break;
			}
			nvlist_add_uint64(benv, BOOTENV_VERSION, VB_RAW);
			be->vbe_bootenv[sizeof (be->vbe_bootenv) - 1] = '\0';
			nvlist_add_string(benv, GRUB_ENVMAP, be->vbe_bootenv);
		}
		break;

	case VB_NVLIST:
		benv = nvlist_import(be->vbe_bootenv, sizeof(be->vbe_bootenv));
		break;

	default:
		command = (char *)be;
		ok = false;

		/* Check for legacy zfsbootcfg command string */
		for (int i = 0; command[i] != '\0'; i++) {
			if (iscntrl(command[i])) {
				ok = false;
				break;
			} else {
				ok = true;
			}
		}
		benv = nvlist_create(NV_UNIQUE_NAME);
		if (benv != NULL) {
			if (ok)
				nvlist_add_string(benv, FREEBSD_BOOTONCE,
				    command);
			else
				nvlist_add_uint64(benv, BOOTENV_VERSION,
				    VB_NVLIST);
		}
		break;
	}
	free(be);
	return (benv);
}
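/*
 * Usage sketch (illustrative only, not compiled): reading the bootenv
 * back and looking up its version key, mirroring what
 * vdev_write_bootenv() does internally. Error handling is elided.
 */
#if 0
static void
example_bootenv_version(vdev_t *vd)
{
	nvlist_t *benv;
	uint64_t version = VB_RAW;

	benv = vdev_read_bootenv(vd);
	if (benv != NULL) {
		nvlist_find(benv, BOOTENV_VERSION, DATA_TYPE_UINT64, NULL,
		    &version, NULL);
		nvlist_destroy(benv);
	}
	printf("bootenv version %ju\n", (uintmax_t)version);
}
#endif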
static uint64_t
vdev_get_label_asize(nvlist_t *nvl)
{
	nvlist_t *vdevs;
	uint64_t asize;
	const char *type;
	int len;

	asize = 0;
	/* Get vdev tree */
	if (nvlist_find(nvl, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
	    NULL, &vdevs, NULL) != 0)
		return (asize);

	/*
	 * Get the vdev type. We will calculate asize for raidz, mirror
	 * and disk. For raidz, the asize is the raw size of all children.
	 */
	if (nvlist_find(vdevs, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING,
	    NULL, &type, &len) != 0)
		goto done;

	if (memcmp(type, VDEV_TYPE_MIRROR, len) != 0 &&
	    memcmp(type, VDEV_TYPE_DISK, len) != 0 &&
	    memcmp(type, VDEV_TYPE_RAIDZ, len) != 0)
		goto done;

	if (nvlist_find(vdevs, ZPOOL_CONFIG_ASIZE, DATA_TYPE_UINT64,
	    NULL, &asize, NULL) != 0)
		goto done;

	if (memcmp(type, VDEV_TYPE_RAIDZ, len) == 0) {
		nvlist_t **kids;
		int nkids;

		if (nvlist_find(vdevs, ZPOOL_CONFIG_CHILDREN,
		    DATA_TYPE_NVLIST_ARRAY, &nkids, &kids, NULL) != 0) {
			asize = 0;
			goto done;
		}

		asize /= nkids;
		for (int i = 0; i < nkids; i++)
			nvlist_destroy(kids[i]);
		free(kids);
	}

	asize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
done:
	nvlist_destroy(vdevs);
	return (asize);
}

static nvlist_t *
vdev_label_read_config(vdev_t *vd, uint64_t txg)
{
	vdev_phys_t *label;
	uint64_t best_txg = 0;
	uint64_t label_txg = 0;
	uint64_t asize;
	nvlist_t *nvl = NULL, *tmp;
	int error;

	label = malloc(sizeof (vdev_phys_t));
	if (label == NULL)
		return (NULL);

	for (int l = 0; l < VDEV_LABELS; l++) {
		if (vdev_label_read(vd, l, label,
		    offsetof(vdev_label_t, vl_vdev_phys),
		    sizeof (vdev_phys_t)))
			continue;

		tmp = nvlist_import(label->vp_nvlist,
		    sizeof(label->vp_nvlist));
		if (tmp == NULL)
			continue;

		error = nvlist_find(tmp, ZPOOL_CONFIG_POOL_TXG,
		    DATA_TYPE_UINT64, NULL, &label_txg, NULL);
		if (error != 0 || label_txg == 0) {
			nvlist_destroy(nvl);
			nvl = tmp;
			goto done;
		}

		if (label_txg <= txg && label_txg > best_txg) {
			best_txg = label_txg;
			nvlist_destroy(nvl);
			nvl = tmp;
			tmp = NULL;

			/*
			 * Use asize from the pool config. We need this
			 * because the BIOS may report a bad value.
			 */
			asize = vdev_get_label_asize(nvl);
			if (asize != 0) {
				vd->v_psize = asize;
			}
		}
		nvlist_destroy(tmp);
	}

	if (best_txg == 0) {
		nvlist_destroy(nvl);
		nvl = NULL;
	}
done:
	free(label);
	return (nvl);
}
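/*
 * Example (illustrative only): if the four labels carry txgs 10, 12, 12
 * and 8 and the caller asks for txg 11, the label with txg 10 wins: it
 * is the newest label whose txg does not exceed the request. Callers
 * that want the newest label available, such as vdev_probe(), pass
 * UINT64_MAX.
 */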
static void
vdev_uberblock_load(vdev_t *vd, uberblock_t *ub)
{
	uberblock_t *buf;

	buf = malloc(VDEV_UBERBLOCK_SIZE(vd));
	if (buf == NULL)
		return;

	for (int l = 0; l < VDEV_LABELS; l++) {
		for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
			if (vdev_label_read(vd, l, buf,
			    VDEV_UBERBLOCK_OFFSET(vd, n),
			    VDEV_UBERBLOCK_SIZE(vd)))
				continue;
			if (uberblock_verify(buf) != 0)
				continue;

			if (vdev_uberblock_compare(buf, ub) > 0)
				*ub = *buf;
		}
	}
	free(buf);
}

static int
vdev_probe(vdev_phys_read_t *_read, vdev_phys_write_t *_write, void *priv,
    spa_t **spap)
{
	vdev_t vtmp;
	spa_t *spa;
	vdev_t *vdev;
	nvlist_t *nvl;
	uint64_t val;
	uint64_t guid, vdev_children;
	uint64_t pool_txg, pool_guid;
	const char *pool_name;
	int rc, namelen;

	/*
	 * Load the vdev label and figure out which
	 * uberblock is most current.
	 */
	memset(&vtmp, 0, sizeof(vtmp));
	vtmp.v_phys_read = _read;
	vtmp.v_phys_write = _write;
	vtmp.v_priv = priv;
	vtmp.v_psize = P2ALIGN(ldi_get_size(priv),
	    (uint64_t)sizeof (vdev_label_t));

	/* Test for minimum device size. */
	if (vtmp.v_psize < SPA_MINDEVSIZE)
		return (EIO);

	nvl = vdev_label_read_config(&vtmp, UINT64_MAX);
	if (nvl == NULL)
		return (EIO);

	if (nvlist_find(nvl, ZPOOL_CONFIG_VERSION, DATA_TYPE_UINT64,
	    NULL, &val, NULL) != 0) {
		nvlist_destroy(nvl);
		return (EIO);
	}

	if (!SPA_VERSION_IS_SUPPORTED(val)) {
		printf("ZFS: unsupported ZFS version %u (should be %u)\n",
		    (unsigned)val, (unsigned)SPA_VERSION);
		nvlist_destroy(nvl);
		return (EIO);
	}

	/* Check ZFS features for read */
	rc = nvlist_check_features_for_read(nvl);
	if (rc != 0) {
		nvlist_destroy(nvl);
		return (EIO);
	}

	if (nvlist_find(nvl, ZPOOL_CONFIG_POOL_STATE, DATA_TYPE_UINT64,
	    NULL, &val, NULL) != 0) {
		nvlist_destroy(nvl);
		return (EIO);
	}

	if (val == POOL_STATE_DESTROYED) {
		/* We don't boot from destroyed pools. */
		nvlist_destroy(nvl);
		return (EIO);
	}

	if (nvlist_find(nvl, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64,
	    NULL, &pool_txg, NULL) != 0 ||
	    nvlist_find(nvl, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
	    NULL, &pool_guid, NULL) != 0 ||
	    nvlist_find(nvl, ZPOOL_CONFIG_POOL_NAME, DATA_TYPE_STRING,
	    NULL, &pool_name, &namelen) != 0) {
		/*
		 * Cache and spare devices end up here - just ignore
		 * them.
		 */
		nvlist_destroy(nvl);
		return (EIO);
	}

	/*
	 * Create the pool if this is the first time we've seen it.
	 */
	spa = spa_find_by_guid(pool_guid);
	if (spa == NULL) {
		char *name;

		nvlist_find(nvl, ZPOOL_CONFIG_VDEV_CHILDREN,
		    DATA_TYPE_UINT64, NULL, &vdev_children, NULL);
		name = malloc(namelen + 1);
		if (name == NULL) {
			nvlist_destroy(nvl);
			return (ENOMEM);
		}
		bcopy(pool_name, name, namelen);
		name[namelen] = '\0';
		spa = spa_create(pool_guid, name);
		free(name);
		if (spa == NULL) {
			nvlist_destroy(nvl);
			return (ENOMEM);
		}
		spa->spa_root_vdev->v_nchildren = vdev_children;
	}
	if (pool_txg > spa->spa_txg)
		spa->spa_txg = pool_txg;

	/*
	 * Get the vdev tree and create our in-core copy of it.
	 * If we already have a vdev with this guid, this must
	 * be some kind of alias (overlapping slices, dangerously dedicated
	 * disks etc).
	 */
	if (nvlist_find(nvl, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
	    NULL, &guid, NULL) != 0) {
		nvlist_destroy(nvl);
		return (EIO);
	}
	vdev = vdev_find(guid);
	/* Has this vdev already been inited? */
	if (vdev && vdev->v_phys_read) {
		nvlist_destroy(nvl);
		return (EIO);
	}

	rc = vdev_init_from_label(spa, nvl);
	nvlist_destroy(nvl);
	if (rc != 0)
		return (rc);

	/*
	 * We should already have created an incomplete vdev for this
	 * vdev. Find it and initialise it with our read proc.
	 */
	vdev = vdev_find(guid);
	if (vdev != NULL) {
		vdev->v_phys_read = _read;
		vdev->v_phys_write = _write;
		vdev->v_priv = priv;
		vdev->v_psize = vtmp.v_psize;
		/*
		 * If no other state is set, mark vdev healthy.
		 */
		if (vdev->v_state == VDEV_STATE_UNKNOWN)
			vdev->v_state = VDEV_STATE_HEALTHY;
	} else {
		printf("ZFS: inconsistent nvlist contents\n");
		return (EIO);
	}

	if (vdev->v_islog)
		spa->spa_with_log = vdev->v_islog;

	/*
	 * Re-evaluate top-level vdev state.
	 */
	vdev_set_state(vdev->v_top);

	/*
	 * Ok, we are happy with the pool so far. Let's find
	 * the best uberblock and then we can actually access
	 * the contents of the pool.
	 */
	vdev_uberblock_load(vdev, spa->spa_uberblock);

	if (spap != NULL)
		*spap = spa;
	return (0);
}

static int
ilog2(int n)
{
	int v;

	for (v = 0; v < 32; v++)
		if (n == (1 << v))
			return (v);
	return (-1);
}
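/* Example: ilog2(4096) == 12, while ilog2(4095) == -1 (not a power of 2). */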
2161 */ 2162 vdev_set_state(vdev->v_top); 2163 2164 /* 2165 * Ok, we are happy with the pool so far. Let's find 2166 * the best uberblock and then we can actually access 2167 * the contents of the pool. 2168 */ 2169 vdev_uberblock_load(vdev, spa->spa_uberblock); 2170 2171 if (spap != NULL) 2172 *spap = spa; 2173 return (0); 2174 } 2175 2176 static int 2177 ilog2(int n) 2178 { 2179 int v; 2180 2181 for (v = 0; v < 32; v++) 2182 if (n == (1 << v)) 2183 return (v); 2184 return (-1); 2185 } 2186 2187 static int 2188 zio_read_gang(const spa_t *spa, const blkptr_t *bp, void *buf) 2189 { 2190 blkptr_t gbh_bp; 2191 zio_gbh_phys_t zio_gb; 2192 char *pbuf; 2193 int i; 2194 2195 /* Artificial BP for gang block header. */ 2196 gbh_bp = *bp; 2197 BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE); 2198 BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE); 2199 BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER); 2200 BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF); 2201 for (i = 0; i < SPA_DVAS_PER_BP; i++) 2202 DVA_SET_GANG(&gbh_bp.blk_dva[i], 0); 2203 2204 /* Read gang header block using the artificial BP. */ 2205 if (zio_read(spa, &gbh_bp, &zio_gb)) 2206 return (EIO); 2207 2208 pbuf = buf; 2209 for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { 2210 blkptr_t *gbp = &zio_gb.zg_blkptr[i]; 2211 2212 if (BP_IS_HOLE(gbp)) 2213 continue; 2214 if (zio_read(spa, gbp, pbuf)) 2215 return (EIO); 2216 pbuf += BP_GET_PSIZE(gbp); 2217 } 2218 2219 if (zio_checksum_verify(spa, bp, buf)) 2220 return (EIO); 2221 return (0); 2222 } 2223 2224 static int 2225 zio_read(const spa_t *spa, const blkptr_t *bp, void *buf) 2226 { 2227 int cpfunc = BP_GET_COMPRESS(bp); 2228 uint64_t align, size; 2229 void *pbuf; 2230 int i, error; 2231 2232 /* 2233 * Process data embedded in block pointer 2234 */ 2235 if (BP_IS_EMBEDDED(bp)) { 2236 ASSERT(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); 2237 2238 size = BPE_GET_PSIZE(bp); 2239 ASSERT(size <= BPE_PAYLOAD_SIZE); 2240 2241 if (cpfunc != ZIO_COMPRESS_OFF) 2242 pbuf = malloc(size); 2243 else 2244 pbuf = buf; 2245 2246 if (pbuf == NULL) 2247 return (ENOMEM); 2248 2249 decode_embedded_bp_compressed(bp, pbuf); 2250 error = 0; 2251 2252 if (cpfunc != ZIO_COMPRESS_OFF) { 2253 error = zio_decompress_data(cpfunc, pbuf, 2254 size, buf, BP_GET_LSIZE(bp)); 2255 free(pbuf); 2256 } 2257 if (error != 0) 2258 printf("ZFS: i/o error - unable to decompress " 2259 "block pointer data, error %d\n", error); 2260 return (error); 2261 } 2262 2263 error = EIO; 2264 2265 for (i = 0; i < SPA_DVAS_PER_BP; i++) { 2266 const dva_t *dva = &bp->blk_dva[i]; 2267 vdev_t *vdev; 2268 vdev_list_t *vlist; 2269 uint64_t vdevid; 2270 off_t offset; 2271 2272 if (!dva->dva_word[0] && !dva->dva_word[1]) 2273 continue; 2274 2275 vdevid = DVA_GET_VDEV(dva); 2276 offset = DVA_GET_OFFSET(dva); 2277 vlist = &spa->spa_root_vdev->v_children; 2278 STAILQ_FOREACH(vdev, vlist, v_childlink) { 2279 if (vdev->v_id == vdevid) 2280 break; 2281 } 2282 if (!vdev || !vdev->v_read) 2283 continue; 2284 2285 size = BP_GET_PSIZE(bp); 2286 if (vdev->v_read == vdev_raidz_read) { 2287 align = 1ULL << vdev->v_ashift; 2288 if (P2PHASE(size, align) != 0) 2289 size = P2ROUNDUP(size, align); 2290 } 2291 if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF) 2292 pbuf = malloc(size); 2293 else 2294 pbuf = buf; 2295 2296 if (pbuf == NULL) { 2297 error = ENOMEM; 2298 break; 2299 } 2300 2301 if (DVA_GET_GANG(dva)) 2302 error = zio_read_gang(spa, bp, pbuf); 2303 else 2304 error = vdev->v_read(vdev, bp, pbuf, offset, size); 2305 if (error == 0) { 2306 if (cpfunc != ZIO_COMPRESS_OFF) 2307 error
= zio_decompress_data(cpfunc, pbuf, 2308 BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp)); 2309 else if (size != BP_GET_PSIZE(bp)) 2310 bcopy(pbuf, buf, BP_GET_PSIZE(bp)); 2311 } else { 2312 printf("zio_read error: %d\n", error); 2313 } 2314 if (buf != pbuf) 2315 free(pbuf); 2316 if (error == 0) 2317 break; 2318 } 2319 if (error != 0) 2320 printf("ZFS: i/o error - all block copies unavailable\n"); 2321 2322 return (error); 2323 } 2324 2325 static int 2326 dnode_read(const spa_t *spa, const dnode_phys_t *dnode, off_t offset, 2327 void *buf, size_t buflen) 2328 { 2329 int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT; 2330 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 2331 int nlevels = dnode->dn_nlevels; 2332 int i, rc; 2333 2334 if (bsize > SPA_MAXBLOCKSIZE) { 2335 printf("ZFS: I/O error - blocks larger than %llu are not " 2336 "supported\n", SPA_MAXBLOCKSIZE); 2337 return (EIO); 2338 } 2339 2340 /* 2341 * Handle odd block sizes, mirrors dmu_read_impl(). Data can't exist 2342 * past the first block, so we'll clip the read to the portion of the 2343 * buffer within bsize and zero out the remainder. 2344 */ 2345 if (dnode->dn_maxblkid == 0) { 2346 size_t newbuflen; 2347 2348 newbuflen = offset > bsize ? 0 : MIN(buflen, bsize - offset); 2349 bzero((char *)buf + newbuflen, buflen - newbuflen); 2350 buflen = newbuflen; 2351 } 2352 2353 /* 2354 * Note: bsize may not be a power of two here so we need to do an 2355 * actual divide rather than a bitshift. 2356 */ 2357 while (buflen > 0) { 2358 uint64_t bn = offset / bsize; 2359 int boff = offset % bsize; 2360 int ibn; 2361 const blkptr_t *indbp; 2362 blkptr_t bp; 2363 2364 if (bn > dnode->dn_maxblkid) 2365 return (EIO); 2366 2367 if (dnode == dnode_cache_obj && bn == dnode_cache_bn) 2368 goto cached; 2369 2370 indbp = dnode->dn_blkptr; 2371 for (i = 0; i < nlevels; i++) { 2372 /* 2373 * Copy the bp from the indirect array so that 2374 * we can re-use the scratch buffer for multi-level 2375 * objects. 2376 */ 2377 ibn = bn >> ((nlevels - i - 1) * ibshift); 2378 ibn &= ((1 << ibshift) - 1); 2379 bp = indbp[ibn]; 2380 if (BP_IS_HOLE(&bp)) { 2381 memset(dnode_cache_buf, 0, bsize); 2382 break; 2383 } 2384 rc = zio_read(spa, &bp, dnode_cache_buf); 2385 if (rc) 2386 return (rc); 2387 indbp = (const blkptr_t *) dnode_cache_buf; 2388 } 2389 dnode_cache_obj = dnode; 2390 dnode_cache_bn = bn; 2391 cached: 2392 2393 /* 2394 * The buffer contains our data block. Copy what we 2395 * need from it and loop. 2396 */ 2397 i = bsize - boff; 2398 if (i > buflen) i = buflen; 2399 memcpy(buf, &dnode_cache_buf[boff], i); 2400 buf = ((char *)buf) + i; 2401 offset += i; 2402 buflen -= i; 2403 } 2404 2405 return (0); 2406 } 2407 2408 /* 2409 * Lookup a value in a microzap directory. 2410 */ 2411 static int 2412 mzap_lookup(const mzap_phys_t *mz, size_t size, const char *name, 2413 uint64_t *value) 2414 { 2415 const mzap_ent_phys_t *mze; 2416 int chunks, i; 2417 2418 /* 2419 * Microzap objects use exactly one block. Read the whole 2420 * thing. 2421 */ 2422 chunks = size / MZAP_ENT_LEN - 1; 2423 for (i = 0; i < chunks; i++) { 2424 mze = &mz->mz_chunk[i]; 2425 if (strcmp(mze->mze_name, name) == 0) { 2426 *value = mze->mze_value; 2427 return (0); 2428 } 2429 } 2430 2431 return (ENOENT); 2432 } 2433 2434 /* 2435 * Compare a name with a zap leaf entry. Return non-zero if the name 2436 * matches. 
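Names are not stored contiguously: they occupy a chain of leaf array
chunks, each carrying up to ZAP_LEAF_ARRAY_BYTES (21 bytes on disk) of
the name and linked through la_next. A 30-byte name, for example, spans
two chunks (21 bytes in the first, 9 in the second), which is why the
comparison below walks the chain instead of doing one memcmp.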
2437 */ 2438 static int 2439 fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, 2440 const char *name) 2441 { 2442 size_t namelen; 2443 const zap_leaf_chunk_t *nc; 2444 const char *p; 2445 2446 namelen = zc->l_entry.le_name_numints; 2447 2448 nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk); 2449 p = name; 2450 while (namelen > 0) { 2451 size_t len; 2452 2453 len = namelen; 2454 if (len > ZAP_LEAF_ARRAY_BYTES) 2455 len = ZAP_LEAF_ARRAY_BYTES; 2456 if (memcmp(p, nc->l_array.la_array, len)) 2457 return (0); 2458 p += len; 2459 namelen -= len; 2460 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next); 2461 } 2462 2463 return (1); 2464 } 2465 2466 /* 2467 * Extract a uint64_t value from a zap leaf entry. 2468 */ 2469 static uint64_t 2470 fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc) 2471 { 2472 const zap_leaf_chunk_t *vc; 2473 int i; 2474 uint64_t value; 2475 const uint8_t *p; 2476 2477 vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk); 2478 for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) { 2479 value = (value << 8) | p[i]; 2480 } 2481 2482 return (value); 2483 } 2484 2485 static void 2486 stv(int len, void *addr, uint64_t value) 2487 { 2488 switch (len) { 2489 case 1: 2490 *(uint8_t *)addr = value; 2491 return; 2492 case 2: 2493 *(uint16_t *)addr = value; 2494 return; 2495 case 4: 2496 *(uint32_t *)addr = value; 2497 return; 2498 case 8: 2499 *(uint64_t *)addr = value; 2500 return; 2501 } 2502 } 2503 2504 /* 2505 * Extract an array from a zap leaf entry. 2506 */ 2507 static void 2508 fzap_leaf_array(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, 2509 uint64_t integer_size, uint64_t num_integers, void *buf) 2510 { 2511 uint64_t array_int_len = zc->l_entry.le_value_intlen; 2512 uint64_t value = 0; 2513 uint64_t *u64 = buf; 2514 char *p = buf; 2515 int len = MIN(zc->l_entry.le_value_numints, num_integers); 2516 int chunk = zc->l_entry.le_value_chunk; 2517 int byten = 0; 2518 2519 if (integer_size == 8 && len == 1) { 2520 *u64 = fzap_leaf_value(zl, zc); 2521 return; 2522 } 2523 2524 while (len > 0) { 2525 struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(zl, chunk).l_array; 2526 int i; 2527 2528 ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(zl)); 2529 for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) { 2530 value = (value << 8) | la->la_array[i]; 2531 byten++; 2532 if (byten == array_int_len) { 2533 stv(integer_size, p, value); 2534 byten = 0; 2535 len--; 2536 if (len == 0) 2537 return; 2538 p += integer_size; 2539 } 2540 } 2541 chunk = la->la_next; 2542 } 2543 } 2544 2545 static int 2546 fzap_check_size(uint64_t integer_size, uint64_t num_integers) 2547 { 2548 2549 switch (integer_size) { 2550 case 1: 2551 case 2: 2552 case 4: 2553 case 8: 2554 break; 2555 default: 2556 return (EINVAL); 2557 } 2558 2559 if (integer_size * num_integers > ZAP_MAXVALUELEN) 2560 return (E2BIG); 2561 2562 return (0); 2563 } 2564 2565 static void 2566 zap_leaf_free(zap_leaf_t *leaf) 2567 { 2568 free(leaf->l_phys); 2569 free(leaf); 2570 } 2571 2572 static int 2573 zap_get_leaf_byblk(fat_zap_t *zap, uint64_t blk, zap_leaf_t **lp) 2574 { 2575 int bs = FZAP_BLOCK_SHIFT(zap); 2576 int err; 2577 2578 *lp = malloc(sizeof(**lp)); 2579 if (*lp == NULL) 2580 return (ENOMEM); 2581 2582 (*lp)->l_bs = bs; 2583 (*lp)->l_phys = malloc(1 << bs); 2584 2585 if ((*lp)->l_phys == NULL) { 2586 free(*lp); 2587 return (ENOMEM); 2588 } 2589 err = dnode_read(zap->zap_spa, zap->zap_dnode, blk << bs, (*lp)->l_phys, 2590 1 << bs); 2591 if (err != 0) { 2592 zap_leaf_free(*lp); 2593 } 2594 return (err); 2595
} 2596 2597 static int 2598 zap_table_load(fat_zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, 2599 uint64_t *valp) 2600 { 2601 int bs = FZAP_BLOCK_SHIFT(zap); 2602 uint64_t blk = idx >> (bs - 3); 2603 uint64_t off = idx & ((1 << (bs - 3)) - 1); 2604 uint64_t *buf; 2605 int rc; 2606 2607 buf = malloc(1 << zap->zap_block_shift); 2608 if (buf == NULL) 2609 return (ENOMEM); 2610 rc = dnode_read(zap->zap_spa, zap->zap_dnode, (tbl->zt_blk + blk) << bs, 2611 buf, 1 << zap->zap_block_shift); 2612 if (rc == 0) 2613 *valp = buf[off]; 2614 free(buf); 2615 return (rc); 2616 } 2617 2618 static int 2619 zap_idx_to_blk(fat_zap_t *zap, uint64_t idx, uint64_t *valp) 2620 { 2621 if (zap->zap_phys->zap_ptrtbl.zt_numblks == 0) { 2622 *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx); 2623 return (0); 2624 } else { 2625 return (zap_table_load(zap, &zap->zap_phys->zap_ptrtbl, 2626 idx, valp)); 2627 } 2628 } 2629 2630 #define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n)))) 2631 static int 2632 zap_deref_leaf(fat_zap_t *zap, uint64_t h, zap_leaf_t **lp) 2633 { 2634 uint64_t idx, blk; 2635 int err; 2636 2637 idx = ZAP_HASH_IDX(h, zap->zap_phys->zap_ptrtbl.zt_shift); 2638 err = zap_idx_to_blk(zap, idx, &blk); 2639 if (err != 0) 2640 return (err); 2641 return (zap_get_leaf_byblk(zap, blk, lp)); 2642 } 2643 2644 #define CHAIN_END 0xffff /* end of the chunk chain */ 2645 #define LEAF_HASH(l, h) \ 2646 ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \ 2647 ((h) >> \ 2648 (64 - ZAP_LEAF_HASH_SHIFT(l) - (l)->l_phys->l_hdr.lh_prefix_len))) 2649 #define LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)]) 2650 2651 static int 2652 zap_leaf_lookup(zap_leaf_t *zl, uint64_t hash, const char *name, 2653 uint64_t integer_size, uint64_t num_integers, void *value) 2654 { 2655 int rc; 2656 uint16_t *chunkp; 2657 struct zap_leaf_entry *le; 2658 2659 /* 2660 * Make sure this chunk matches our hash. 2661 */ 2662 if (zl->l_phys->l_hdr.lh_prefix_len > 0 && 2663 zl->l_phys->l_hdr.lh_prefix != 2664 hash >> (64 - zl->l_phys->l_hdr.lh_prefix_len)) 2665 return (EIO); 2666 2667 rc = ENOENT; 2668 for (chunkp = LEAF_HASH_ENTPTR(zl, hash); 2669 *chunkp != CHAIN_END; chunkp = &le->le_next) { 2670 zap_leaf_chunk_t *zc; 2671 uint16_t chunk = *chunkp; 2672 2673 le = ZAP_LEAF_ENTRY(zl, chunk); 2674 if (le->le_hash != hash) 2675 continue; 2676 zc = &ZAP_LEAF_CHUNK(zl, chunk); 2677 if (fzap_name_equal(zl, zc, name)) { 2678 if (zc->l_entry.le_value_intlen > integer_size) { 2679 rc = EINVAL; 2680 } else { 2681 fzap_leaf_array(zl, zc, integer_size, 2682 num_integers, value); 2683 rc = 0; 2684 } 2685 break; 2686 } 2687 } 2688 return (rc); 2689 } 2690 2691 /* 2692 * Lookup a value in a fatzap directory. 
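The route from a name to its value, as implemented below, is roughly
(pseudocode of the calls that follow):

	hash = zap_hash(zh->zap_salt, name);
	idx  = ZAP_HASH_IDX(hash, zt_shift);	/* top zt_shift bits */
	zap_idx_to_blk(&z, idx, &blk);		/* pointer table lookup */
	zap_get_leaf_byblk(&z, blk, &zl);	/* one dnode_read() */
	zap_leaf_lookup(zl, hash, name, ...);	/* scan the hash chain */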
2693 */ 2694 static int 2695 fzap_lookup(const spa_t *spa, const dnode_phys_t *dnode, zap_phys_t *zh, 2696 const char *name, uint64_t integer_size, uint64_t num_integers, 2697 void *value) 2698 { 2699 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 2700 fat_zap_t z; 2701 zap_leaf_t *zl; 2702 uint64_t hash; 2703 int rc; 2704 2705 if (zh->zap_magic != ZAP_MAGIC) 2706 return (EIO); 2707 2708 if ((rc = fzap_check_size(integer_size, num_integers)) != 0) { 2709 return (rc); 2710 } 2711 2712 z.zap_block_shift = ilog2(bsize); 2713 z.zap_phys = zh; 2714 z.zap_spa = spa; 2715 z.zap_dnode = dnode; 2716 2717 hash = zap_hash(zh->zap_salt, name); 2718 rc = zap_deref_leaf(&z, hash, &zl); 2719 if (rc != 0) 2720 return (rc); 2721 2722 rc = zap_leaf_lookup(zl, hash, name, integer_size, num_integers, value); 2723 2724 zap_leaf_free(zl); 2725 return (rc); 2726 } 2727 2728 /* 2729 * Lookup a name in a zap object and return its value as a uint64_t. 2730 */ 2731 static int 2732 zap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name, 2733 uint64_t integer_size, uint64_t num_integers, void *value) 2734 { 2735 int rc; 2736 zap_phys_t *zap; 2737 size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 2738 2739 zap = malloc(size); 2740 if (zap == NULL) 2741 return (ENOMEM); 2742 2743 rc = dnode_read(spa, dnode, 0, zap, size); 2744 if (rc) 2745 goto done; 2746 2747 switch (zap->zap_block_type) { 2748 case ZBT_MICRO: 2749 rc = mzap_lookup((const mzap_phys_t *)zap, size, name, value); 2750 break; 2751 case ZBT_HEADER: 2752 rc = fzap_lookup(spa, dnode, zap, name, integer_size, 2753 num_integers, value); 2754 break; 2755 default: 2756 printf("ZFS: invalid zap_type=%" PRIx64 "\n", 2757 zap->zap_block_type); 2758 rc = EIO; 2759 } 2760 done: 2761 free(zap); 2762 return (rc); 2763 } 2764 2765 /* 2766 * List a microzap directory. 2767 */ 2768 static int 2769 mzap_list(const mzap_phys_t *mz, size_t size, 2770 int (*callback)(const char *, uint64_t)) 2771 { 2772 const mzap_ent_phys_t *mze; 2773 int chunks, i, rc; 2774 2775 /* 2776 * Microzap objects use exactly one block. Read the whole 2777 * thing. 2778 */ 2779 rc = 0; 2780 chunks = size / MZAP_ENT_LEN - 1; 2781 for (i = 0; i < chunks; i++) { 2782 mze = &mz->mz_chunk[i]; 2783 if (mze->mze_name[0]) { 2784 rc = callback(mze->mze_name, mze->mze_value); 2785 if (rc != 0) 2786 break; 2787 } 2788 } 2789 2790 return (rc); 2791 } 2792 2793 /* 2794 * List a fatzap directory. 2795 */ 2796 static int 2797 fzap_list(const spa_t *spa, const dnode_phys_t *dnode, zap_phys_t *zh, 2798 int (*callback)(const char *, uint64_t)) 2799 { 2800 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 2801 fat_zap_t z; 2802 uint64_t i; 2803 int j, rc; 2804 2805 if (zh->zap_magic != ZAP_MAGIC) 2806 return (EIO); 2807 2808 z.zap_block_shift = ilog2(bsize); 2809 z.zap_phys = zh; 2810 2811 /* 2812 * This assumes that the leaf blocks start at block 1. The 2813 * documentation isn't exactly clear on this. 
2814 */ 2815 zap_leaf_t zl; 2816 zl.l_bs = z.zap_block_shift; 2817 zl.l_phys = malloc(bsize); 2818 if (zl.l_phys == NULL) 2819 return (ENOMEM); 2820 2821 for (i = 0; i < zh->zap_num_leafs; i++) { 2822 off_t off = ((off_t)(i + 1)) << zl.l_bs; 2823 char name[256], *p; 2824 uint64_t value; 2825 2826 if (dnode_read(spa, dnode, off, zl.l_phys, bsize)) { 2827 free(zl.l_phys); 2828 return (EIO); 2829 } 2830 2831 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) { 2832 zap_leaf_chunk_t *zc, *nc; 2833 int namelen; 2834 2835 zc = &ZAP_LEAF_CHUNK(&zl, j); 2836 if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY) 2837 continue; 2838 namelen = zc->l_entry.le_name_numints; 2839 if (namelen > sizeof(name)) 2840 namelen = sizeof(name); 2841 2842 /* 2843 * Paste the name back together. 2844 */ 2845 nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk); 2846 p = name; 2847 while (namelen > 0) { 2848 int len; 2849 len = namelen; 2850 if (len > ZAP_LEAF_ARRAY_BYTES) 2851 len = ZAP_LEAF_ARRAY_BYTES; 2852 memcpy(p, nc->l_array.la_array, len); 2853 p += len; 2854 namelen -= len; 2855 nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next); 2856 } 2857 2858 /* 2859 * Assume the first eight bytes of the value are 2860 * a uint64_t. 2861 */ 2862 value = fzap_leaf_value(&zl, zc); 2863 2864 /* printf("%s 0x%jx\n", name, (uintmax_t)value); */ 2865 rc = callback((const char *)name, value); 2866 if (rc != 0) { 2867 free(zl.l_phys); 2868 return (rc); 2869 } 2870 } 2871 } 2872 2873 free(zl.l_phys); 2874 return (0); 2875 } 2876 2877 static int zfs_printf(const char *name, uint64_t value __unused) 2878 { 2879 2880 printf("%s\n", name); 2881 2882 return (0); 2883 } 2884 2885 /* 2886 * List a zap directory. 2887 */ 2888 static int 2889 zap_list(const spa_t *spa, const dnode_phys_t *dnode) 2890 { 2891 zap_phys_t *zap; 2892 size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 2893 int rc; 2894 2895 zap = malloc(size); 2896 if (zap == NULL) 2897 return (ENOMEM); 2898 2899 rc = dnode_read(spa, dnode, 0, zap, size); 2900 if (rc == 0) { 2901 if (zap->zap_block_type == ZBT_MICRO) 2902 rc = mzap_list((const mzap_phys_t *)zap, size, 2903 zfs_printf); 2904 else 2905 rc = fzap_list(spa, dnode, zap, zfs_printf); 2906 } 2907 free(zap); 2908 return (rc); 2909 } 2910 2911 static int 2912 objset_get_dnode(const spa_t *spa, const objset_phys_t *os, uint64_t objnum, 2913 dnode_phys_t *dnode) 2914 { 2915 off_t offset; 2916 2917 offset = objnum * sizeof(dnode_phys_t); 2918 return dnode_read(spa, &os->os_meta_dnode, offset, 2919 dnode, sizeof(dnode_phys_t)); 2920 } 2921 2922 /* 2923 * Lookup a name in a microzap directory. 2924 */ 2925 static int 2926 mzap_rlookup(const mzap_phys_t *mz, size_t size, char *name, uint64_t value) 2927 { 2928 const mzap_ent_phys_t *mze; 2929 int chunks, i; 2930 2931 /* 2932 * Microzap objects use exactly one block. Read the whole 2933 * thing. 
2934 */ 2935 chunks = size / MZAP_ENT_LEN - 1; 2936 for (i = 0; i < chunks; i++) { 2937 mze = &mz->mz_chunk[i]; 2938 if (value == mze->mze_value) { 2939 strcpy(name, mze->mze_name); 2940 return (0); 2941 } 2942 } 2943 2944 return (ENOENT); 2945 } 2946 2947 static void 2948 fzap_name_copy(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, char *name) 2949 { 2950 size_t namelen; 2951 const zap_leaf_chunk_t *nc; 2952 char *p; 2953 2954 namelen = zc->l_entry.le_name_numints; 2955 2956 nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk); 2957 p = name; 2958 while (namelen > 0) { 2959 size_t len; 2960 len = namelen; 2961 if (len > ZAP_LEAF_ARRAY_BYTES) 2962 len = ZAP_LEAF_ARRAY_BYTES; 2963 memcpy(p, nc->l_array.la_array, len); 2964 p += len; 2965 namelen -= len; 2966 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next); 2967 } 2968 2969 *p = '\0'; 2970 } 2971 2972 static int 2973 fzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, zap_phys_t *zh, 2974 char *name, uint64_t value) 2975 { 2976 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 2977 fat_zap_t z; 2978 uint64_t i; 2979 int j, rc; 2980 2981 if (zh->zap_magic != ZAP_MAGIC) 2982 return (EIO); 2983 2984 z.zap_block_shift = ilog2(bsize); 2985 z.zap_phys = zh; 2986 2987 /* 2988 * This assumes that the leaf blocks start at block 1. The 2989 * documentation isn't exactly clear on this. 2990 */ 2991 zap_leaf_t zl; 2992 zl.l_bs = z.zap_block_shift; 2993 zl.l_phys = malloc(bsize); 2994 if (zl.l_phys == NULL) 2995 return (ENOMEM); 2996 2997 for (i = 0; i < zh->zap_num_leafs; i++) { 2998 off_t off = ((off_t)(i + 1)) << zl.l_bs; 2999 3000 rc = dnode_read(spa, dnode, off, zl.l_phys, bsize); 3001 if (rc != 0) 3002 goto done; 3003 3004 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) { 3005 zap_leaf_chunk_t *zc; 3006 3007 zc = &ZAP_LEAF_CHUNK(&zl, j); 3008 if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY) 3009 continue; 3010 if (zc->l_entry.le_value_intlen != 8 || 3011 zc->l_entry.le_value_numints != 1) 3012 continue; 3013 3014 if (fzap_leaf_value(&zl, zc) == value) { 3015 fzap_name_copy(&zl, zc, name); 3016 goto done; 3017 } 3018 } 3019 } 3020 3021 rc = ENOENT; 3022 done: 3023 free(zl.l_phys); 3024 return (rc); 3025 } 3026 3027 static int 3028 zap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, 3029 uint64_t value) 3030 { 3031 zap_phys_t *zap; 3032 size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 3033 int rc; 3034 3035 zap = malloc(size); 3036 if (zap == NULL) 3037 return (ENOMEM); 3038 3039 rc = dnode_read(spa, dnode, 0, zap, size); 3040 if (rc == 0) { 3041 if (zap->zap_block_type == ZBT_MICRO) 3042 rc = mzap_rlookup((const mzap_phys_t *)zap, size, 3043 name, value); 3044 else 3045 rc = fzap_rlookup(spa, dnode, zap, name, value); 3046 } 3047 free(zap); 3048 return (rc); 3049 } 3050 3051 static int 3052 zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result) 3053 { 3054 char name[256]; 3055 char component[256]; 3056 uint64_t dir_obj, parent_obj, child_dir_zapobj; 3057 dnode_phys_t child_dir_zap, snapnames_zap, dataset, dir, parent; 3058 dsl_dir_phys_t *dd; 3059 dsl_dataset_phys_t *ds; 3060 char *p; 3061 int len; 3062 boolean_t issnap = B_FALSE; 3063 3064 p = &name[sizeof(name) - 1]; 3065 *p = '\0'; 3066 3067 if (objset_get_dnode(spa, spa->spa_mos, objnum, &dataset)) { 3068 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum); 3069 return (EIO); 3070 } 3071 ds = (dsl_dataset_phys_t *)&dataset.dn_bonus; 3072 dir_obj = ds->ds_dir_obj; 3073 if (ds->ds_snapnames_zapobj == 0) 3074 issnap = B_TRUE; 3075 3076 for (;;) { 3077 
if (objset_get_dnode(spa, spa->spa_mos, dir_obj, &dir) != 0) 3078 return (EIO); 3079 dd = (dsl_dir_phys_t *)&dir.dn_bonus; 3080 3081 /* Actual loop condition. */ 3082 parent_obj = dd->dd_parent_obj; 3083 if (parent_obj == 0) 3084 break; 3085 3086 if (objset_get_dnode(spa, spa->spa_mos, parent_obj, 3087 &parent) != 0) 3088 return (EIO); 3089 dd = (dsl_dir_phys_t *)&parent.dn_bonus; 3090 if (issnap == B_TRUE) { 3091 /* 3092 * The dataset we are looking up is a snapshot; 3093 * dir_obj is already the parent, and we don't want 3094 * the grandparent just yet. Reset to the parent. 3095 */ 3096 dd = (dsl_dir_phys_t *)&dir.dn_bonus; 3097 /* Lookup the dataset to get the snapname ZAP */ 3098 if (objset_get_dnode(spa, spa->spa_mos, 3099 dd->dd_head_dataset_obj, &dataset)) 3100 return (EIO); 3101 ds = (dsl_dataset_phys_t *)&dataset.dn_bonus; 3102 if (objset_get_dnode(spa, spa->spa_mos, 3103 ds->ds_snapnames_zapobj, &snapnames_zap) != 0) 3104 return (EIO); 3105 /* Get the name of the snapshot */ 3106 if (zap_rlookup(spa, &snapnames_zap, component, 3107 objnum) != 0) 3108 return (EIO); 3109 len = strlen(component); 3110 p -= len; 3111 memcpy(p, component, len); 3112 --p; 3113 *p = '@'; 3114 issnap = B_FALSE; 3115 continue; 3116 } 3117 3118 child_dir_zapobj = dd->dd_child_dir_zapobj; 3119 if (objset_get_dnode(spa, spa->spa_mos, child_dir_zapobj, 3120 &child_dir_zap) != 0) 3121 return (EIO); 3122 if (zap_rlookup(spa, &child_dir_zap, component, dir_obj) != 0) 3123 return (EIO); 3124 3125 len = strlen(component); 3126 p -= len; 3127 memcpy(p, component, len); 3128 --p; 3129 *p = '/'; 3130 3131 /* Actual loop iteration. */ 3132 dir_obj = parent_obj; 3133 } 3134 3135 if (*p != '\0') 3136 ++p; 3137 strcpy(result, p); 3138 3139 return (0); 3140 } 3141 3142 static int 3143 zfs_lookup_dataset(const spa_t *spa, const char *name, uint64_t *objnum) 3144 { 3145 char element[256]; 3146 uint64_t dir_obj, child_dir_zapobj; 3147 dnode_phys_t child_dir_zap, snapnames_zap, dir, dataset; 3148 dsl_dir_phys_t *dd; 3149 dsl_dataset_phys_t *ds; 3150 const char *p, *q; 3151 boolean_t issnap = B_FALSE; 3152 3153 if (objset_get_dnode(spa, spa->spa_mos, 3154 DMU_POOL_DIRECTORY_OBJECT, &dir)) 3155 return (EIO); 3156 if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, sizeof (dir_obj), 3157 1, &dir_obj)) 3158 return (EIO); 3159 3160 p = name; 3161 for (;;) { 3162 if (objset_get_dnode(spa, spa->spa_mos, dir_obj, &dir)) 3163 return (EIO); 3164 dd = (dsl_dir_phys_t *)&dir.dn_bonus; 3165 3166 while (*p == '/') 3167 p++; 3168 /* Actual loop condition #1. */ 3169 if (*p == '\0') 3170 break; 3171 3172 q = strchr(p, '/'); 3173 if (q) { 3174 memcpy(element, p, q - p); 3175 element[q - p] = '\0'; 3176 p = q + 1; 3177 } else { 3178 strcpy(element, p); 3179 p += strlen(p); 3180 } 3181 3182 if (issnap == B_TRUE) { 3183 if (objset_get_dnode(spa, spa->spa_mos, 3184 dd->dd_head_dataset_obj, &dataset)) 3185 return (EIO); 3186 ds = (dsl_dataset_phys_t *)&dataset.dn_bonus; 3187 if (objset_get_dnode(spa, spa->spa_mos, 3188 ds->ds_snapnames_zapobj, &snapnames_zap) != 0) 3189 return (EIO); 3190 /* Actual loop condition #2.
*/ 3191 if (zap_lookup(spa, &snapnames_zap, element, 3192 sizeof (dir_obj), 1, &dir_obj) != 0) 3193 return (ENOENT); 3194 *objnum = dir_obj; 3195 return (0); 3196 } else if ((q = strchr(element, '@')) != NULL) { 3197 issnap = B_TRUE; 3198 element[q - element] = '\0'; 3199 p = q + 1; 3200 } 3201 child_dir_zapobj = dd->dd_child_dir_zapobj; 3202 if (objset_get_dnode(spa, spa->spa_mos, child_dir_zapobj, 3203 &child_dir_zap) != 0) 3204 return (EIO); 3205 3206 /* Actual loop condition #2. */ 3207 if (zap_lookup(spa, &child_dir_zap, element, sizeof (dir_obj), 3208 1, &dir_obj) != 0) 3209 return (ENOENT); 3210 } 3211 3212 *objnum = dd->dd_head_dataset_obj; 3213 return (0); 3214 } 3215 3216 #ifndef BOOT2 3217 static int 3218 zfs_list_dataset(const spa_t *spa, uint64_t objnum/*, int pos, char *entry*/) 3219 { 3220 uint64_t dir_obj, child_dir_zapobj; 3221 dnode_phys_t child_dir_zap, dir, dataset; 3222 dsl_dataset_phys_t *ds; 3223 dsl_dir_phys_t *dd; 3224 3225 if (objset_get_dnode(spa, spa->spa_mos, objnum, &dataset)) { 3226 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum); 3227 return (EIO); 3228 } 3229 ds = (dsl_dataset_phys_t *)&dataset.dn_bonus; 3230 dir_obj = ds->ds_dir_obj; 3231 3232 if (objset_get_dnode(spa, spa->spa_mos, dir_obj, &dir)) { 3233 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj); 3234 return (EIO); 3235 } 3236 dd = (dsl_dir_phys_t *)&dir.dn_bonus; 3237 3238 child_dir_zapobj = dd->dd_child_dir_zapobj; 3239 if (objset_get_dnode(spa, spa->spa_mos, child_dir_zapobj, 3240 &child_dir_zap) != 0) { 3241 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj); 3242 return (EIO); 3243 } 3244 3245 return (zap_list(spa, &child_dir_zap) != 0); 3246 } 3247 3248 int 3249 zfs_callback_dataset(const spa_t *spa, uint64_t objnum, 3250 int (*callback)(const char *, uint64_t)) 3251 { 3252 uint64_t dir_obj, child_dir_zapobj; 3253 dnode_phys_t child_dir_zap, dir, dataset; 3254 dsl_dataset_phys_t *ds; 3255 dsl_dir_phys_t *dd; 3256 zap_phys_t *zap; 3257 size_t size; 3258 int err; 3259 3260 err = objset_get_dnode(spa, spa->spa_mos, objnum, &dataset); 3261 if (err != 0) { 3262 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum); 3263 return (err); 3264 } 3265 ds = (dsl_dataset_phys_t *)&dataset.dn_bonus; 3266 dir_obj = ds->ds_dir_obj; 3267 3268 err = objset_get_dnode(spa, spa->spa_mos, dir_obj, &dir); 3269 if (err != 0) { 3270 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj); 3271 return (err); 3272 } 3273 dd = (dsl_dir_phys_t *)&dir.dn_bonus; 3274 3275 child_dir_zapobj = dd->dd_child_dir_zapobj; 3276 err = objset_get_dnode(spa, spa->spa_mos, child_dir_zapobj, 3277 &child_dir_zap); 3278 if (err != 0) { 3279 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj); 3280 return (err); 3281 } 3282 3283 size = child_dir_zap.dn_datablkszsec << SPA_MINBLOCKSHIFT; 3284 zap = malloc(size); 3285 if (zap != NULL) { 3286 err = dnode_read(spa, &child_dir_zap, 0, zap, size); 3287 if (err != 0) 3288 goto done; 3289 3290 if (zap->zap_block_type == ZBT_MICRO) 3291 err = mzap_list((const mzap_phys_t *)zap, size, 3292 callback); 3293 else 3294 err = fzap_list(spa, &child_dir_zap, zap, callback); 3295 } else { 3296 err = ENOMEM; 3297 } 3298 done: 3299 free(zap); 3300 return (err); 3301 } 3302 #endif 3303 3304 /* 3305 * Find the object set given the object number of its dataset object 3306 * and return its details in *objset 3307 */ 3308 static int 3309 zfs_mount_dataset(const spa_t *spa, uint64_t objnum, objset_phys_t *objset) 3310 { 3311 dnode_phys_t dataset; 3312 
dsl_dataset_phys_t *ds; 3313 3314 if (objset_get_dnode(spa, spa->spa_mos, objnum, &dataset)) { 3315 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum); 3316 return (EIO); 3317 } 3318 3319 ds = (dsl_dataset_phys_t *)&dataset.dn_bonus; 3320 if (zio_read(spa, &ds->ds_bp, objset)) { 3321 printf("ZFS: can't read object set for dataset %ju\n", 3322 (uintmax_t)objnum); 3323 return (EIO); 3324 } 3325 3326 return (0); 3327 } 3328 3329 /* 3330 * Find the dataset pointed to by the BOOTFS property, or the root 3331 * dataset if there is none, and return its object number in *objid 3332 */ 3333 static int 3334 zfs_get_root(const spa_t *spa, uint64_t *objid) 3335 { 3336 dnode_phys_t dir, propdir; 3337 uint64_t props, bootfs, root; 3338 3339 *objid = 0; 3340 3341 /* 3342 * Start with the MOS directory object. 3343 */ 3344 if (objset_get_dnode(spa, spa->spa_mos, 3345 DMU_POOL_DIRECTORY_OBJECT, &dir)) { 3346 printf("ZFS: can't read MOS object directory\n"); 3347 return (EIO); 3348 } 3349 3350 /* 3351 * Lookup the pool_props and see if we can find a bootfs. 3352 */ 3353 if (zap_lookup(spa, &dir, DMU_POOL_PROPS, 3354 sizeof(props), 1, &props) == 0 && 3355 objset_get_dnode(spa, spa->spa_mos, props, &propdir) == 0 && 3356 zap_lookup(spa, &propdir, "bootfs", 3357 sizeof(bootfs), 1, &bootfs) == 0 && bootfs != 0) { 3358 *objid = bootfs; 3359 return (0); 3360 } 3361 /* 3362 * Lookup the root dataset directory 3363 */ 3364 if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, 3365 sizeof(root), 1, &root) || 3366 objset_get_dnode(spa, spa->spa_mos, root, &dir)) { 3367 printf("ZFS: can't find root dsl_dir\n"); 3368 return (EIO); 3369 } 3370 3371 /* 3372 * Use the information from the dataset directory's bonus buffer 3373 * to find the dataset object and from that the object set itself. 3374 */ 3375 dsl_dir_phys_t *dd = (dsl_dir_phys_t *)&dir.dn_bonus; 3376 *objid = dd->dd_head_dataset_obj; 3377 return (0); 3378 } 3379 3380 static int 3381 zfs_mount_impl(const spa_t *spa, uint64_t rootobj, struct zfsmount *mount) 3382 { 3383 3384 mount->spa = spa; 3385 3386 /* 3387 * Find the root object set if not explicitly provided 3388 */ 3389 if (rootobj == 0 && zfs_get_root(spa, &rootobj)) { 3390 printf("ZFS: can't find root filesystem\n"); 3391 return (EIO); 3392 } 3393 3394 if (zfs_mount_dataset(spa, rootobj, &mount->objset)) { 3395 printf("ZFS: can't open root filesystem\n"); 3396 return (EIO); 3397 } 3398 3399 mount->rootobj = rootobj; 3400 3401 return (0); 3402 } 3403 3404 /* 3405 * callback function for feature name checks. 3406 */ 3407 static int 3408 check_feature(const char *name, uint64_t value) 3409 { 3410 int i; 3411 3412 if (value == 0) 3413 return (0); 3414 if (name[0] == '\0') 3415 return (0); 3416 3417 for (i = 0; features_for_read[i] != NULL; i++) { 3418 if (strcmp(name, features_for_read[i]) == 0) 3419 return (0); 3420 } 3421 printf("ZFS: unsupported feature: %s\n", name); 3422 return (EIO); 3423 } 3424 3425 /* 3426 * Checks whether the MOS features that are active are supported. 3427 */ 3428 static int 3429 check_mos_features(const spa_t *spa) 3430 { 3431 dnode_phys_t dir; 3432 zap_phys_t *zap; 3433 uint64_t objnum; 3434 size_t size; 3435 int rc; 3436 3437 if ((rc = objset_get_dnode(spa, spa->spa_mos, DMU_OT_OBJECT_DIRECTORY, 3438 &dir)) != 0) 3439 return (rc); 3440 if ((rc = zap_lookup(spa, &dir, DMU_POOL_FEATURES_FOR_READ, 3441 sizeof (objnum), 1, &objnum)) != 0) { 3442 /* 3443 * It is an older pool without features. As we have already 3444 * tested the label, just return without raising an error.
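Pools created before feature flags (SPA_VERSION_FEATURES, pool version
5000) have no DMU_POOL_FEATURES_FOR_READ entry in the object directory
at all, so ENOENT from the zap_lookup() above means there is nothing
to check.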
3445 */ 3446 return (0); 3447 } 3448 3449 if ((rc = objset_get_dnode(spa, spa->spa_mos, objnum, &dir)) != 0) 3450 return (rc); 3451 3452 if (dir.dn_type != DMU_OTN_ZAP_METADATA) 3453 return (EIO); 3454 3455 size = dir.dn_datablkszsec << SPA_MINBLOCKSHIFT; 3456 zap = malloc(size); 3457 if (zap == NULL) 3458 return (ENOMEM); 3459 3460 if (dnode_read(spa, &dir, 0, zap, size)) { 3461 free(zap); 3462 return (EIO); 3463 } 3464 3465 if (zap->zap_block_type == ZBT_MICRO) 3466 rc = mzap_list((const mzap_phys_t *)zap, size, check_feature); 3467 else 3468 rc = fzap_list(spa, &dir, zap, check_feature); 3469 3470 free(zap); 3471 return (rc); 3472 } 3473 3474 static int 3475 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 3476 { 3477 dnode_phys_t dir; 3478 size_t size; 3479 int rc; 3480 char *nv; 3481 3482 *value = NULL; 3483 if ((rc = objset_get_dnode(spa, spa->spa_mos, obj, &dir)) != 0) 3484 return (rc); 3485 if (dir.dn_type != DMU_OT_PACKED_NVLIST && 3486 dir.dn_bonustype != DMU_OT_PACKED_NVLIST_SIZE) { 3487 return (EIO); 3488 } 3489 3490 if (dir.dn_bonuslen != sizeof (uint64_t)) 3491 return (EIO); 3492 3493 size = *(uint64_t *)DN_BONUS(&dir); 3494 nv = malloc(size); 3495 if (nv == NULL) 3496 return (ENOMEM); 3497 3498 rc = dnode_read(spa, &dir, 0, nv, size); 3499 if (rc != 0) { 3500 free(nv); 3501 nv = NULL; 3502 return (rc); 3503 } 3504 *value = nvlist_import(nv, size); 3505 free(nv); 3506 return (rc); 3507 } 3508 3509 static int 3510 zfs_spa_init(spa_t *spa) 3511 { 3512 struct uberblock checkpoint; 3513 dnode_phys_t dir; 3514 uint64_t config_object; 3515 nvlist_t *nvlist; 3516 int rc; 3517 3518 if (zio_read(spa, &spa->spa_uberblock->ub_rootbp, spa->spa_mos)) { 3519 printf("ZFS: can't read MOS of pool %s\n", spa->spa_name); 3520 return (EIO); 3521 } 3522 if (spa->spa_mos->os_type != DMU_OST_META) { 3523 printf("ZFS: corrupted MOS of pool %s\n", spa->spa_name); 3524 return (EIO); 3525 } 3526 3527 if (objset_get_dnode(spa, &spa->spa_mos_master, 3528 DMU_POOL_DIRECTORY_OBJECT, &dir)) { 3529 printf("ZFS: failed to read pool %s directory object\n", 3530 spa->spa_name); 3531 return (EIO); 3532 } 3533 /* this is allowed to fail, older pools do not have salt */ 3534 rc = zap_lookup(spa, &dir, DMU_POOL_CHECKSUM_SALT, 1, 3535 sizeof (spa->spa_cksum_salt.zcs_bytes), 3536 spa->spa_cksum_salt.zcs_bytes); 3537 3538 rc = check_mos_features(spa); 3539 if (rc != 0) { 3540 printf("ZFS: pool %s is not supported\n", spa->spa_name); 3541 return (rc); 3542 } 3543 3544 rc = zap_lookup(spa, &dir, DMU_POOL_CONFIG, 3545 sizeof (config_object), 1, &config_object); 3546 if (rc != 0) { 3547 printf("ZFS: can not read MOS %s\n", DMU_POOL_CONFIG); 3548 return (EIO); 3549 } 3550 rc = load_nvlist(spa, config_object, &nvlist); 3551 if (rc != 0) 3552 return (rc); 3553 3554 rc = zap_lookup(spa, &dir, DMU_POOL_ZPOOL_CHECKPOINT, 3555 sizeof(uint64_t), sizeof(checkpoint) / sizeof(uint64_t), 3556 &checkpoint); 3557 if (rc == 0 && checkpoint.ub_checkpoint_txg != 0) { 3558 memcpy(&spa->spa_uberblock_checkpoint, &checkpoint, 3559 sizeof(checkpoint)); 3560 if (zio_read(spa, &spa->spa_uberblock_checkpoint.ub_rootbp, 3561 &spa->spa_mos_checkpoint)) { 3562 printf("ZFS: can not read checkpoint data.\n"); 3563 return (EIO); 3564 } 3565 } 3566 3567 /* 3568 * Update vdevs from MOS config. Note, we do skip encoding bytes 3569 * here. See also vdev_label_read_config(). 
3570 */ 3571 rc = vdev_init_from_nvlist(spa, nvlist); 3572 nvlist_destroy(nvlist); 3573 return (rc); 3574 } 3575 3576 static int 3577 zfs_dnode_stat(const spa_t *spa, dnode_phys_t *dn, struct stat *sb) 3578 { 3579 3580 if (dn->dn_bonustype != DMU_OT_SA) { 3581 znode_phys_t *zp = (znode_phys_t *)dn->dn_bonus; 3582 3583 sb->st_mode = zp->zp_mode; 3584 sb->st_uid = zp->zp_uid; 3585 sb->st_gid = zp->zp_gid; 3586 sb->st_size = zp->zp_size; 3587 } else { 3588 sa_hdr_phys_t *sahdrp; 3589 int hdrsize; 3590 size_t size = 0; 3591 void *buf = NULL; 3592 3593 if (dn->dn_bonuslen != 0) 3594 sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn); 3595 else { 3596 if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) { 3597 blkptr_t *bp = DN_SPILL_BLKPTR(dn); 3598 int error; 3599 3600 size = BP_GET_LSIZE(bp); 3601 buf = malloc(size); 3602 if (buf == NULL) 3603 error = ENOMEM; 3604 else 3605 error = zio_read(spa, bp, buf); 3606 3607 if (error != 0) { 3608 free(buf); 3609 return (error); 3610 } 3611 sahdrp = buf; 3612 } else { 3613 return (EIO); 3614 } 3615 } 3616 hdrsize = SA_HDR_SIZE(sahdrp); 3617 sb->st_mode = *(uint64_t *)((char *)sahdrp + hdrsize + 3618 SA_MODE_OFFSET); 3619 sb->st_uid = *(uint64_t *)((char *)sahdrp + hdrsize + 3620 SA_UID_OFFSET); 3621 sb->st_gid = *(uint64_t *)((char *)sahdrp + hdrsize + 3622 SA_GID_OFFSET); 3623 sb->st_size = *(uint64_t *)((char *)sahdrp + hdrsize + 3624 SA_SIZE_OFFSET); 3625 free(buf); 3626 } 3627 3628 return (0); 3629 } 3630 3631 static int 3632 zfs_dnode_readlink(const spa_t *spa, dnode_phys_t *dn, char *path, size_t psize) 3633 { 3634 int rc = 0; 3635 3636 if (dn->dn_bonustype == DMU_OT_SA) { 3637 sa_hdr_phys_t *sahdrp = NULL; 3638 size_t size = 0; 3639 void *buf = NULL; 3640 int hdrsize; 3641 char *p; 3642 3643 if (dn->dn_bonuslen != 0) { 3644 sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn); 3645 } else { 3646 blkptr_t *bp; 3647 3648 if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) == 0) 3649 return (EIO); 3650 bp = DN_SPILL_BLKPTR(dn); 3651 3652 size = BP_GET_LSIZE(bp); 3653 buf = malloc(size); 3654 if (buf == NULL) 3655 rc = ENOMEM; 3656 else 3657 rc = zio_read(spa, bp, buf); 3658 if (rc != 0) { 3659 free(buf); 3660 return (rc); 3661 } 3662 sahdrp = buf; 3663 } 3664 hdrsize = SA_HDR_SIZE(sahdrp); 3665 p = (char *)((uintptr_t)sahdrp + hdrsize + SA_SYMLINK_OFFSET); 3666 memcpy(path, p, psize); 3667 free(buf); 3668 return (0); 3669 } 3670 /* 3671 * Second test is purely to silence bogus compiler 3672 * warning about accessing past the end of dn_bonus. 3673 */ 3674 if (psize + sizeof(znode_phys_t) <= dn->dn_bonuslen && 3675 sizeof(znode_phys_t) <= sizeof(dn->dn_bonus)) { 3676 memcpy(path, &dn->dn_bonus[sizeof(znode_phys_t)], psize); 3677 } else { 3678 rc = dnode_read(spa, dn, 0, path, psize); 3679 } 3680 return (rc); 3681 } 3682 3683 struct obj_list { 3684 uint64_t objnum; 3685 STAILQ_ENTRY(obj_list) entry; 3686 }; 3687 3688 /* 3689 * Lookup a file and return its dnode. 
3690 */ 3691 static int 3692 zfs_lookup(const struct zfsmount *mount, const char *upath, dnode_phys_t *dnode) 3693 { 3694 int rc; 3695 uint64_t objnum; 3696 const spa_t *spa; 3697 dnode_phys_t dn; 3698 const char *p, *q; 3699 char element[256]; 3700 char path[1024]; 3701 int symlinks_followed = 0; 3702 struct stat sb; 3703 struct obj_list *entry, *tentry; 3704 STAILQ_HEAD(, obj_list) on_cache = STAILQ_HEAD_INITIALIZER(on_cache); 3705 3706 spa = mount->spa; 3707 if (mount->objset.os_type != DMU_OST_ZFS) { 3708 printf("ZFS: unexpected object set type %ju\n", 3709 (uintmax_t)mount->objset.os_type); 3710 return (EIO); 3711 } 3712 3713 if ((entry = malloc(sizeof(struct obj_list))) == NULL) 3714 return (ENOMEM); 3715 3716 /* 3717 * Get the root directory dnode. 3718 */ 3719 rc = objset_get_dnode(spa, &mount->objset, MASTER_NODE_OBJ, &dn); 3720 if (rc) { 3721 free(entry); 3722 return (rc); 3723 } 3724 3725 rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, sizeof(objnum), 1, &objnum); 3726 if (rc) { 3727 free(entry); 3728 return (rc); 3729 } 3730 entry->objnum = objnum; 3731 STAILQ_INSERT_HEAD(&on_cache, entry, entry); 3732 3733 rc = objset_get_dnode(spa, &mount->objset, objnum, &dn); 3734 if (rc != 0) 3735 goto done; 3736 3737 p = upath; 3738 while (p && *p) { 3739 rc = objset_get_dnode(spa, &mount->objset, objnum, &dn); 3740 if (rc != 0) 3741 goto done; 3742 3743 while (*p == '/') 3744 p++; 3745 if (*p == '\0') 3746 break; 3747 q = p; 3748 while (*q != '\0' && *q != '/') 3749 q++; 3750 3751 /* skip dot */ 3752 if (p + 1 == q && p[0] == '.') { 3753 p++; 3754 continue; 3755 } 3756 /* double dot */ 3757 if (p + 2 == q && p[0] == '.' && p[1] == '.') { 3758 p += 2; 3759 if (STAILQ_FIRST(&on_cache) == 3760 STAILQ_LAST(&on_cache, obj_list, entry)) { 3761 rc = ENOENT; 3762 goto done; 3763 } 3764 entry = STAILQ_FIRST(&on_cache); 3765 STAILQ_REMOVE_HEAD(&on_cache, entry); 3766 free(entry); 3767 objnum = (STAILQ_FIRST(&on_cache))->objnum; 3768 continue; 3769 } 3770 if (q - p + 1 > sizeof(element)) { 3771 rc = ENAMETOOLONG; 3772 goto done; 3773 } 3774 memcpy(element, p, q - p); 3775 element[q - p] = 0; 3776 p = q; 3777 3778 if ((rc = zfs_dnode_stat(spa, &dn, &sb)) != 0) 3779 goto done; 3780 if (!S_ISDIR(sb.st_mode)) { 3781 rc = ENOTDIR; 3782 goto done; 3783 } 3784 3785 rc = zap_lookup(spa, &dn, element, sizeof (objnum), 1, &objnum); 3786 if (rc) 3787 goto done; 3788 objnum = ZFS_DIRENT_OBJ(objnum); 3789 3790 if ((entry = malloc(sizeof(struct obj_list))) == NULL) { 3791 rc = ENOMEM; 3792 goto done; 3793 } 3794 entry->objnum = objnum; 3795 STAILQ_INSERT_HEAD(&on_cache, entry, entry); 3796 rc = objset_get_dnode(spa, &mount->objset, objnum, &dn); 3797 if (rc) 3798 goto done; 3799 3800 /* 3801 * Check for symlink. 3802 */ 3803 rc = zfs_dnode_stat(spa, &dn, &sb); 3804 if (rc) 3805 goto done; 3806 if (S_ISLNK(sb.st_mode)) { 3807 if (symlinks_followed > 10) { 3808 rc = EMLINK; 3809 goto done; 3810 } 3811 symlinks_followed++; 3812 3813 /* 3814 * Read the link value and copy the tail of our 3815 * current path onto the end. 3816 */ 3817 if (sb.st_size + strlen(p) + 1 > sizeof(path)) { 3818 rc = ENAMETOOLONG; 3819 goto done; 3820 } 3821 strcpy(&path[sb.st_size], p); 3822 3823 rc = zfs_dnode_readlink(spa, &dn, path, sb.st_size); 3824 if (rc != 0) 3825 goto done; 3826 3827 /* 3828 * Restart with the new path, starting either at 3829 * the root or at the parent depending whether or 3830 * not the link is relative. 
3831 */ 3832 p = path; 3833 if (*p == '/') { 3834 while (STAILQ_FIRST(&on_cache) != 3835 STAILQ_LAST(&on_cache, obj_list, entry)) { 3836 entry = STAILQ_FIRST(&on_cache); 3837 STAILQ_REMOVE_HEAD(&on_cache, entry); 3838 free(entry); 3839 } 3840 } else { 3841 entry = STAILQ_FIRST(&on_cache); 3842 STAILQ_REMOVE_HEAD(&on_cache, entry); 3843 free(entry); 3844 } 3845 objnum = (STAILQ_FIRST(&on_cache))->objnum; 3846 } 3847 } 3848 3849 *dnode = dn; 3850 done: 3851 STAILQ_FOREACH_SAFE(entry, &on_cache, entry, tentry) 3852 free(entry); 3853 return (rc); 3854 } 3855 3856 /* 3857 * Return either a cached copy of the bootenv, or read each of the vdev children 3858 * looking for the bootenv. Cache what's found and return the results. Returns 0 3859 * when benvp is filled in, and some errno when not. 3860 */ 3861 static int 3862 zfs_get_bootenv_spa(spa_t *spa, nvlist_t **benvp) 3863 { 3864 vdev_t *vd; 3865 nvlist_t *benv = NULL; 3866 3867 if (spa->spa_bootenv == NULL) { 3868 STAILQ_FOREACH(vd, &spa->spa_root_vdev->v_children, 3869 v_childlink) { 3870 benv = vdev_read_bootenv(vd); 3871 3872 if (benv != NULL) 3873 break; 3874 } 3875 spa->spa_bootenv = benv; 3876 } 3877 benv = spa->spa_bootenv; 3878 3879 if (benv == NULL) 3880 return (ENOENT); 3881 3882 *benvp = benv; 3883 return (0); 3884 } 3885 3886 /* 3887 * Store nvlist to pool label bootenv area. Also updates cached pointer in spa. 3888 */ 3889 static int 3890 zfs_set_bootenv_spa(spa_t *spa, nvlist_t *benv) 3891 { 3892 vdev_t *vd; 3893 3894 STAILQ_FOREACH(vd, &spa->spa_root_vdev->v_children, v_childlink) { 3895 vdev_write_bootenv(vd, benv); 3896 } 3897 3898 spa->spa_bootenv = benv; 3899 return (0); 3900 } 3901 3902 /* 3903 * Get bootonce value by key. The bootonce <key, value> pair is removed from the 3904 * bootenv nvlist and the remaining nvlist is committed back to disk. This process 3905 * the bootonce flag since we've reached the point in the boot that we've 'used' 3906 * the BE. For chained boot scenarios, we may reach this point multiple times (but 3907 * only remove it and return 0 the first time). 3908 */ 3909 static int 3910 zfs_get_bootonce_spa(spa_t *spa, const char *key, char *buf, size_t size) 3911 { 3912 nvlist_t *benv; 3913 char *result = NULL; 3914 int result_size, rv; 3915 3916 if ((rv = zfs_get_bootenv_spa(spa, &benv)) != 0) 3917 return (rv); 3918 3919 if ((rv = nvlist_find(benv, key, DATA_TYPE_STRING, NULL, 3920 &result, &result_size)) == 0) { 3921 if (result_size == 0) { 3922 /* ignore empty string */ 3923 rv = ENOENT; 3924 } else if (buf != NULL) { 3925 size = MIN((size_t)result_size + 1, size); 3926 strlcpy(buf, result, size); 3927 } 3928 (void)nvlist_remove(benv, key, DATA_TYPE_STRING); 3929 (void)zfs_set_bootenv_spa(spa, benv); 3930 } 3931 3932 return (rv); 3933 } 3934