/*-
 * Copyright (c) 2007 Doug Rabson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * Stand-alone ZFS file reader.
 */

#include <stdbool.h>
#include <sys/endian.h>
#include <sys/stat.h>
#include <sys/stdint.h>
#include <sys/list.h>
#include <sys/zfs_bootenv.h>
#include <machine/_inttypes.h>

#include "zfsimpl.h"
#include "zfssubr.c"

#ifdef HAS_ZSTD_ZFS
extern int zstd_init(void);
#endif

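/*
 * In-core state for a mounted dataset: the pool it belongs to, a copy
 * of the dataset's object set, and its root object number.
 */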
struct zfsmount {
    const spa_t *spa;
    objset_phys_t objset;
    uint64_t rootobj;
};
static struct zfsmount zfsmount __unused;

/*
 * The indirect_child_t represents the vdev that we will read from, when we
 * need to read all copies of the data (e.g. for scrub or reconstruction).
 * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror),
 * ic_vdev is the same as is_vdev.  However, for mirror top-level vdevs,
 * ic_vdev is a child of the mirror.
 */
typedef struct indirect_child {
    void *ic_data;
    vdev_t *ic_vdev;
} indirect_child_t;

/*
 * The indirect_split_t represents one mapped segment of an i/o to the
 * indirect vdev.  For non-split (contiguously-mapped) blocks, there will be
 * only one indirect_split_t, with is_split_offset==0 and is_size==io_size.
 * For split blocks, there will be several of these.
 */
typedef struct indirect_split {
    list_node_t is_node;		/* link on iv_splits */

    /*
     * is_split_offset is the offset into the i/o.
     * This is the sum of the previous splits' is_size's.
     */
    uint64_t is_split_offset;

    vdev_t *is_vdev;			/* top-level vdev */
    uint64_t is_target_offset;		/* offset on is_vdev */
    uint64_t is_size;
    int is_children;			/* number of entries in is_child[] */

    /*
     * is_good_child is the child that we are currently using to
     * attempt reconstruction.
     */
    int is_good_child;

    indirect_child_t is_child[1];	/* variable-length */
} indirect_split_t;

/*
 * The indirect_vsd_t is associated with each i/o to the indirect vdev.
 * It is the "Vdev-Specific Data" in the zio_t's io_vsd.
 */
typedef struct indirect_vsd {
    boolean_t iv_split_block;
    boolean_t iv_reconstruct;

    list_t iv_splits;			/* list of indirect_split_t's */
} indirect_vsd_t;

/*
 * List of all vdevs, chained through v_alllink.
 */
static vdev_list_t zfs_vdevs;

/*
 * List of ZFS features supported for read.
 */
static const char *features_for_read[] = {
    "org.illumos:lz4_compress",
    "com.delphix:hole_birth",
    "com.delphix:extensible_dataset",
    "com.delphix:embedded_data",
    "org.open-zfs:large_blocks",
    "org.illumos:sha512",
    "org.illumos:skein",
    "org.zfsonlinux:large_dnode",
    "com.joyent:multi_vdev_crash_dump",
    "com.delphix:spacemap_histogram",
    "com.delphix:zpool_checkpoint",
    "com.delphix:spacemap_v2",
    "com.datto:encryption",
    "org.zfsonlinux:allocation_classes",
    "com.datto:resilver_defer",
    "com.delphix:device_removal",
    "com.delphix:obsolete_counts",
    "com.intel:allocation_classes",
    "org.freebsd:zstd_compress",
    NULL
};

/*
 * List of all pools, chained through spa_link.
 */
static spa_list_t zfs_pools;

static const dnode_phys_t *dnode_cache_obj;
static uint64_t dnode_cache_bn;
static char *dnode_cache_buf;

static int zio_read(const spa_t *spa, const blkptr_t *bp, void *buf);
static int zfs_get_root(const spa_t *spa, uint64_t *objid);
static int zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result);
static int zap_lookup(const spa_t *spa, const dnode_phys_t *dnode,
    const char *name, uint64_t integer_size, uint64_t num_integers,
    void *value);
static int objset_get_dnode(const spa_t *, const objset_phys_t *, uint64_t,
    dnode_phys_t *);
static int dnode_read(const spa_t *, const dnode_phys_t *, off_t, void *,
    size_t);
static int vdev_indirect_read(vdev_t *, const blkptr_t *, void *, off_t,
    size_t);
static int vdev_mirror_read(vdev_t *, const blkptr_t *, void *, off_t, size_t);
vdev_indirect_mapping_t *vdev_indirect_mapping_open(spa_t *, objset_phys_t *,
    uint64_t);
vdev_indirect_mapping_entry_phys_t *
    vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *, uint64_t,
    uint64_t, uint64_t *);

static void
zfs_init(void)
{
    STAILQ_INIT(&zfs_vdevs);
    STAILQ_INIT(&zfs_pools);

    dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);

    zfs_init_crc();
#ifdef HAS_ZSTD_ZFS
    zstd_init();
#endif
}

static int
nvlist_check_features_for_read(nvlist_t *nvl)
{
    nvlist_t *features = NULL;
    nvs_data_t *data;
    nvp_header_t *nvp;
    nv_string_t *nvp_name;
    int rc;

    rc = nvlist_find(nvl, ZPOOL_CONFIG_FEATURES_FOR_READ,
        DATA_TYPE_NVLIST, NULL, &features, NULL);
    if (rc != 0)
        return (rc);

    data = (nvs_data_t *)features->nv_data;
    nvp = &data->nvl_pair;	/* first pair in nvlist */

    while (nvp->encoded_size != 0 && nvp->decoded_size != 0) {
        int i, found;

        nvp_name = (nv_string_t *)((uintptr_t)nvp + sizeof(*nvp));
        found = 0;

        for (i = 0; features_for_read[i] != NULL; i++) {
            if (memcmp(nvp_name->nv_data, features_for_read[i],
                nvp_name->nv_size) == 0) {
                found = 1;
                break;
            }
        }

        if (!found) {
            printf("ZFS: unsupported feature: %.*s\n",
                nvp_name->nv_size, nvp_name->nv_data);
            rc = EIO;
        }
        nvp = (nvp_header_t *)((uint8_t *)nvp + nvp->encoded_size);
    }
    nvlist_destroy(features);

    return (rc);
}

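/*
 * Read data from a vdev via its v_phys_read method.  When a block
 * pointer is supplied, the physical size is taken from it and the data
 * is checksum-verified after the read.
 */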
static int
vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t size)
{
    size_t psize;
    int rc;

    if (vdev->v_phys_read == NULL)
        return (ENOTSUP);

    if (bp) {
        psize = BP_GET_PSIZE(bp);
    } else {
        psize = size;
    }

    rc = vdev->v_phys_read(vdev, vdev->v_priv, offset, buf, psize);
    if (rc == 0) {
        if (bp != NULL)
            rc = zio_checksum_verify(vdev->v_spa, bp, buf);
    }

    return (rc);
}

static int
vdev_write_phys(vdev_t *vdev, void *buf, off_t offset, size_t size)
{
    if (vdev->v_phys_write == NULL)
        return (ENOTSUP);

    return (vdev->v_phys_write(vdev, offset, buf, size));
}

typedef struct remap_segment {
    vdev_t *rs_vd;
    uint64_t rs_offset;
    uint64_t rs_asize;
    uint64_t rs_split_offset;
    list_node_t rs_node;
} remap_segment_t;

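/*
 * Allocate and initialize a remap_segment_t describing the extent
 * [offset, offset + asize) on vdev vd; returns NULL when out of memory.
 */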
static remap_segment_t *
rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
{
    remap_segment_t *rs = malloc(sizeof (remap_segment_t));

    if (rs != NULL) {
        rs->rs_vd = vd;
        rs->rs_offset = offset;
        rs->rs_asize = asize;
        rs->rs_split_offset = split_offset;
    }

    return (rs);
}

vdev_indirect_mapping_t *
vdev_indirect_mapping_open(spa_t *spa, objset_phys_t *os,
    uint64_t mapping_object)
{
    vdev_indirect_mapping_t *vim;
    vdev_indirect_mapping_phys_t *vim_phys;
    int rc;

    vim = calloc(1, sizeof (*vim));
    if (vim == NULL)
        return (NULL);

    vim->vim_dn = calloc(1, sizeof (*vim->vim_dn));
    if (vim->vim_dn == NULL) {
        free(vim);
        return (NULL);
    }

    rc = objset_get_dnode(spa, os, mapping_object, vim->vim_dn);
    if (rc != 0) {
        free(vim->vim_dn);
        free(vim);
        return (NULL);
    }

    vim->vim_spa = spa;
    vim->vim_phys = malloc(sizeof (*vim->vim_phys));
    if (vim->vim_phys == NULL) {
        free(vim->vim_dn);
        free(vim);
        return (NULL);
    }

    vim_phys = (vdev_indirect_mapping_phys_t *)DN_BONUS(vim->vim_dn);
    *vim->vim_phys = *vim_phys;

    vim->vim_objset = os;
    vim->vim_object = mapping_object;
    vim->vim_entries = NULL;

    vim->vim_havecounts =
        (vim->vim_dn->dn_bonuslen > VDEV_INDIRECT_MAPPING_SIZE_V0);

    return (vim);
}

/*
 * Compare an offset with an indirect mapping entry; there are three
 * possible scenarios:
 *
 * 1. The offset is "less than" the mapping entry; meaning the
 *    offset is less than the source offset of the mapping entry. In
 *    this case, there is no overlap between the offset and the
 *    mapping entry and -1 will be returned.
 *
 * 2. The offset is "greater than" the mapping entry; meaning the
 *    offset is greater than the mapping entry's source offset plus
 *    the entry's size. In this case, there is no overlap between
 *    the offset and the mapping entry and 1 will be returned.
 *
 *    NOTE: If the offset is actually equal to the entry's offset
 *    plus size, this is considered to be "greater" than the entry,
 *    and this case applies (i.e. 1 will be returned). Thus, the
 *    entry's "range" can be considered to be inclusive at its
 *    start, but exclusive at its end: e.g. [src, src + size).
 *
 * 3. The last case to consider is if the offset actually falls
 *    within the mapping entry's range. If this is the case, the
 *    offset is considered to be "equal to" the mapping entry and
 *    0 will be returned.
 *
 *    NOTE: If the offset is equal to the entry's source offset,
 *    this case applies and 0 will be returned. If the offset is
 *    equal to the entry's source plus its size, this case does
 *    *not* apply (see "NOTE" above for scenario 2), and 1 will be
 *    returned.
 */
static int
dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem)
{
    const uint64_t *key = v_key;
    const vdev_indirect_mapping_entry_phys_t *array_elem =
        v_array_elem;
    uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem);

    if (*key < src_offset) {
        return (-1);
    } else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) {
        return (0);
    } else {
        return (1);
    }
}

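/*
 * Illustration (made-up numbers): for an entry whose source range is
 * [0x1000, 0x1200), a key of 0xfff compares as -1, keys 0x1000 through
 * 0x11ff compare as 0, and a key of 0x1200 compares as 1.
 */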
/*
 * Return array entry.
 */
static vdev_indirect_mapping_entry_phys_t *
vdev_indirect_mapping_entry(vdev_indirect_mapping_t *vim, uint64_t index)
{
    uint64_t size;
    off_t offset = 0;
    int rc;

    if (vim->vim_phys->vimp_num_entries == 0)
        return (NULL);

    if (vim->vim_entries == NULL) {
        uint64_t bsize;

        bsize = vim->vim_dn->dn_datablkszsec << SPA_MINBLOCKSHIFT;
        size = vim->vim_phys->vimp_num_entries *
            sizeof (*vim->vim_entries);
        if (size > bsize) {
            size = bsize / sizeof (*vim->vim_entries);
            size *= sizeof (*vim->vim_entries);
        }
        vim->vim_entries = malloc(size);
        if (vim->vim_entries == NULL)
            return (NULL);
        vim->vim_num_entries = size / sizeof (*vim->vim_entries);
        offset = index * sizeof (*vim->vim_entries);
    }

    /* We have data in vim_entries */
    if (offset == 0) {
        if (index >= vim->vim_entry_offset &&
            index <= vim->vim_entry_offset + vim->vim_num_entries) {
            index -= vim->vim_entry_offset;
            return (&vim->vim_entries[index]);
        }
        offset = index * sizeof (*vim->vim_entries);
    }

    vim->vim_entry_offset = index;
    size = vim->vim_num_entries * sizeof (*vim->vim_entries);
    rc = dnode_read(vim->vim_spa, vim->vim_dn, offset, vim->vim_entries,
        size);
    if (rc != 0) {
        /* Read error, invalidate vim_entries. */
        free(vim->vim_entries);
        vim->vim_entries = NULL;
        return (NULL);
    }
    index -= vim->vim_entry_offset;
    return (&vim->vim_entries[index]);
}

/*
 * Returns the mapping entry for the given offset.
 *
 * It's possible that the given offset will not be in the mapping table
 * (i.e. no mapping entries contain this offset), in which case, the
 * return value depends on the "next_if_missing" parameter.
 *
 * If the offset is not found in the table and "next_if_missing" is
 * B_FALSE, then NULL will always be returned. The behavior is intended
 * to allow consumers to get the entry corresponding to the offset
 * parameter, iff the offset overlaps with an entry in the table.
 *
 * If the offset is not found in the table and "next_if_missing" is
 * B_TRUE, then the entry nearest to the given offset will be returned,
 * such that the entry's source offset is greater than the offset
 * passed in (i.e. the "next" mapping entry in the table is returned, if
 * the offset is missing from the table). If there are no entries whose
 * source offset is greater than the passed in offset, NULL is returned.
 */
static vdev_indirect_mapping_entry_phys_t *
vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim,
    uint64_t offset)
{
    ASSERT(vim->vim_phys->vimp_num_entries > 0);

    vdev_indirect_mapping_entry_phys_t *entry;

    uint64_t last = vim->vim_phys->vimp_num_entries - 1;
    uint64_t base = 0;

    /*
     * We don't define these inside of the while loop because we use
     * their value in the case that offset isn't in the mapping.
     */
    uint64_t mid;
    int result;

    while (last >= base) {
        mid = base + ((last - base) >> 1);

        entry = vdev_indirect_mapping_entry(vim, mid);
        if (entry == NULL)
            break;
        result = dva_mapping_overlap_compare(&offset, entry);

        if (result == 0) {
            break;
        } else if (result < 0) {
            last = mid - 1;
        } else {
            base = mid + 1;
        }
    }
    return (entry);
}

/*
 * Given an indirect vdev and an extent on that vdev, it duplicates the
 * physical entries of the indirect mapping that correspond to the extent
 * to a new array and returns a pointer to it. In addition, copied_entries
 * is populated with the number of mapping entries that were duplicated.
 *
 * Finally, since we are doing an allocation, it is up to the caller to
 * free the array allocated in this function.
 */
vdev_indirect_mapping_entry_phys_t *
vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset,
    uint64_t asize, uint64_t *copied_entries)
{
    vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL;
    vdev_indirect_mapping_t *vim = vd->v_mapping;
    uint64_t entries = 0;

    vdev_indirect_mapping_entry_phys_t *first_mapping =
        vdev_indirect_mapping_entry_for_offset(vim, offset);
    ASSERT3P(first_mapping, !=, NULL);

    vdev_indirect_mapping_entry_phys_t *m = first_mapping;
    while (asize > 0) {
        uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
        uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m);
        uint64_t inner_size = MIN(asize, size - inner_offset);

        offset += inner_size;
        asize -= inner_size;
        entries++;
        m++;
    }

    size_t copy_length = entries * sizeof (*first_mapping);
    duplicate_mappings = malloc(copy_length);
    if (duplicate_mappings != NULL)
        bcopy(first_mapping, duplicate_mappings, copy_length);
    else
        entries = 0;

    *copied_entries = entries;

    return (duplicate_mappings);
}

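/*
 * Walk the children of the root vdev and return the top-level vdev
 * with the given id, or NULL if it is not present.
 */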
static vdev_t *
vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
    vdev_t *rvd;
    vdev_list_t *vlist;

    vlist = &spa->spa_root_vdev->v_children;
    STAILQ_FOREACH(rvd, vlist, v_childlink)
        if (rvd->v_id == vdev)
            break;

    return (rvd);
}

/*
 * This is a callback for vdev_indirect_remap() which allocates an
 * indirect_split_t for each split segment and adds it to iv_splits.
 */
static void
vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
    int n = 1;
    zio_t *zio = arg;
    indirect_vsd_t *iv = zio->io_vsd;

    if (vd->v_read == vdev_indirect_read)
        return;

    if (vd->v_read == vdev_mirror_read)
        n = vd->v_nchildren;

    indirect_split_t *is =
        malloc(offsetof(indirect_split_t, is_child[n]));
    if (is == NULL) {
        zio->io_error = ENOMEM;
        return;
    }
    bzero(is, offsetof(indirect_split_t, is_child[n]));

    is->is_children = n;
    is->is_size = size;
    is->is_split_offset = split_offset;
    is->is_target_offset = offset;
    is->is_vdev = vd;

    /*
     * Note that we only consider multiple copies of the data for
     * *mirror* vdevs.  We don't for "replacing" or "spare" vdevs, even
     * though they use the same ops as mirror, because there's only one
     * "good" copy under the replacing/spare.
     */
    if (vd->v_read == vdev_mirror_read) {
        int i = 0;
        vdev_t *kid;

        STAILQ_FOREACH(kid, &vd->v_children, v_childlink) {
            is->is_child[i++].ic_vdev = kid;
        }
    } else {
        is->is_child[0].ic_vdev = vd;
    }

    list_insert_tail(&iv->iv_splits, is);
}

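/*
 * Walk the indirect mappings for the extent [offset, offset + asize)
 * on vd.  Mappings that point at another indirect vdev are pushed onto
 * an explicit stack and resolved in turn, so chains of removed devices
 * are handled without recursion; every fully resolved segment is
 * handed to vdev_indirect_gather_splits().
 */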
static void
vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, void *arg)
{
    list_t stack;
    spa_t *spa = vd->v_spa;
    zio_t *zio = arg;
    remap_segment_t *rs;

    list_create(&stack, sizeof (remap_segment_t),
        offsetof(remap_segment_t, rs_node));

    rs = rs_alloc(vd, offset, asize, 0);
    if (rs == NULL) {
        printf("vdev_indirect_remap: out of memory.\n");
        zio->io_error = ENOMEM;
    }
    for (; rs != NULL; rs = list_remove_head(&stack)) {
        vdev_t *v = rs->rs_vd;
        uint64_t num_entries = 0;
        /* vdev_indirect_mapping_t *vim = v->v_mapping; */
        vdev_indirect_mapping_entry_phys_t *mapping =
            vdev_indirect_mapping_duplicate_adjacent_entries(v,
            rs->rs_offset, rs->rs_asize, &num_entries);

        if (num_entries == 0)
            zio->io_error = ENOMEM;

        for (uint64_t i = 0; i < num_entries; i++) {
            vdev_indirect_mapping_entry_phys_t *m = &mapping[i];
            uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
            uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst);
            uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst);
            uint64_t inner_offset = rs->rs_offset -
                DVA_MAPPING_GET_SRC_OFFSET(m);
            uint64_t inner_size =
                MIN(rs->rs_asize, size - inner_offset);
            vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);

            if (dst_v->v_read == vdev_indirect_read) {
                remap_segment_t *o;

                o = rs_alloc(dst_v, dst_offset + inner_offset,
                    inner_size, rs->rs_split_offset);
                if (o == NULL) {
                    printf("vdev_indirect_remap: "
                        "out of memory.\n");
                    zio->io_error = ENOMEM;
                    break;
                }

                list_insert_head(&stack, o);
            }
            vdev_indirect_gather_splits(rs->rs_split_offset, dst_v,
                dst_offset + inner_offset,
                inner_size, arg);

            /*
             * vdev_indirect_gather_splits() can fail to
             * allocate memory; we cannot recover from that.
             */
            if (zio->io_error != 0)
                break;
            rs->rs_offset += inner_size;
            rs->rs_asize -= inner_size;
            rs->rs_split_offset += inner_size;
        }

        free(mapping);
        free(rs);
        if (zio->io_error != 0)
            break;
    }

    list_destroy(&stack);
}

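/*
 * Free the indirect_split_t list attached to a zio, including any
 * per-child data buffers, and the indirect_vsd_t itself.
 */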
static void
vdev_indirect_map_free(zio_t *zio)
{
    indirect_vsd_t *iv = zio->io_vsd;
    indirect_split_t *is;

    while ((is = list_head(&iv->iv_splits)) != NULL) {
        for (int c = 0; c < is->is_children; c++) {
            indirect_child_t *ic = &is->is_child[c];
            free(ic->ic_data);
        }
        list_remove(&iv->iv_splits, is);
        free(is);
    }
    free(iv);
}

static int
vdev_indirect_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t bytes)
{
    zio_t zio;
    spa_t *spa = vdev->v_spa;
    indirect_vsd_t *iv;
    indirect_split_t *first;
    int rc = EIO;

    iv = calloc(1, sizeof(*iv));
    if (iv == NULL)
        return (ENOMEM);

    list_create(&iv->iv_splits,
        sizeof (indirect_split_t), offsetof(indirect_split_t, is_node));

    bzero(&zio, sizeof(zio));
    zio.io_spa = spa;
    zio.io_bp = (blkptr_t *)bp;
    zio.io_data = buf;
    zio.io_size = bytes;
    zio.io_offset = offset;
    zio.io_vd = vdev;
    zio.io_vsd = iv;

    if (vdev->v_mapping == NULL) {
        vdev_indirect_config_t *vic;

        vic = &vdev->vdev_indirect_config;
        vdev->v_mapping = vdev_indirect_mapping_open(spa,
            spa->spa_mos, vic->vic_mapping_object);
    }

    vdev_indirect_remap(vdev, offset, bytes, &zio);
    if (zio.io_error != 0)
        return (zio.io_error);

    first = list_head(&iv->iv_splits);
    if (first->is_size == zio.io_size) {
        /*
         * This is not a split block; we are pointing to the entire
         * data, which will checksum the same as the original data.
         * Pass the BP down so that the child i/o can verify the
         * checksum, and try a different location if available
         * (e.g. on a mirror).
         *
         * While this special case could be handled the same as the
         * general (split block) case, doing it this way ensures
         * that the vast majority of blocks on indirect vdevs
         * (which are not split) are handled identically to blocks
         * on non-indirect vdevs.  This allows us to be less strict
         * about performance in the general (but rare) case.
         */
        rc = first->is_vdev->v_read(first->is_vdev, zio.io_bp,
            zio.io_data, first->is_target_offset, bytes);
    } else {
        iv->iv_split_block = B_TRUE;
        /*
         * Read one copy of each split segment, from the
         * top-level vdev.  Since we don't know the
         * checksum of each split individually, the child
         * zio can't ensure that we get the right data.
         * E.g. if it's a mirror, it will just read from a
         * random (healthy) leaf vdev.  We have to verify
         * the checksum in vdev_indirect_io_done().
         */
        for (indirect_split_t *is = list_head(&iv->iv_splits);
            is != NULL; is = list_next(&iv->iv_splits, is)) {
            char *ptr = zio.io_data;

            rc = is->is_vdev->v_read(is->is_vdev, zio.io_bp,
                ptr + is->is_split_offset, is->is_target_offset,
                is->is_size);
        }
        if (zio_checksum_verify(spa, zio.io_bp, zio.io_data))
            rc = ECKSUM;
        else
            rc = 0;
    }

    vdev_indirect_map_free(&zio);
    if (rc == 0)
        rc = zio.io_error;

    return (rc);
}

static int
vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t bytes)
{

    return (vdev_read_phys(vdev, bp, buf,
        offset + VDEV_LABEL_START_SIZE, bytes));
}

static int
vdev_missing_read(vdev_t *vdev __unused, const blkptr_t *bp __unused,
    void *buf __unused, off_t offset __unused, size_t bytes __unused)
{

    return (ENOTSUP);
}

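/*
 * Try each healthy child in turn and return the first successful read;
 * if every attempted child fails, return the last error.
 */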
static int
vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t bytes)
{
    vdev_t *kid;
    int rc;

    rc = EIO;
    STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
        if (kid->v_state != VDEV_STATE_HEALTHY)
            continue;
        rc = kid->v_read(kid, bp, buf, offset, bytes);
        if (!rc)
            return (0);
    }

    return (rc);
}

static int
vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t bytes)
{
    vdev_t *kid;

    /*
     * Here we should have two kids:
     * The first one is the vdev being replaced - it is the only one we
     * can trust to have valid data, but it might not be present.
     * The second one is the vdev we are replacing it with.  It is most
     * likely healthy, but we can't trust it has the needed data, so we
     * won't use it.
     */
    kid = STAILQ_FIRST(&vdev->v_children);
    if (kid == NULL)
        return (EIO);
    if (kid->v_state != VDEV_STATE_HEALTHY)
        return (EIO);
    return (kid->v_read(kid, bp, buf, offset, bytes));
}

static vdev_t *
vdev_find(uint64_t guid)
{
    vdev_t *vdev;

    STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
        if (vdev->v_guid == guid)
            return (vdev);

    return (NULL);
}

static vdev_t *
vdev_create(uint64_t guid, vdev_read_t *_read)
{
    vdev_t *vdev;
    vdev_indirect_config_t *vic;

    vdev = calloc(1, sizeof(vdev_t));
    if (vdev != NULL) {
        STAILQ_INIT(&vdev->v_children);
        vdev->v_guid = guid;
        vdev->v_read = _read;

        /*
         * The root vdev has no read function; we use this fact to
         * skip setting up data we do not need for the root vdev.
         * We only point to the root vdev from the spa.
         */
        if (_read != NULL) {
            vic = &vdev->vdev_indirect_config;
            vic->vic_prev_indirect_vdev = UINT64_MAX;
            STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
        }
    }

    return (vdev);
}

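/*
 * Set the vdev state from the status flags recorded in its config
 * nvlist (offline, removed, faulted, degraded, not present), and note
 * whether the vdev is a log device.
 */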
static void
vdev_set_initial_state(vdev_t *vdev, const nvlist_t *nvlist)
{
    uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
    uint64_t is_log;

    is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
    is_log = 0;
    (void) nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, NULL,
        &is_offline, NULL);
    (void) nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, NULL,
        &is_removed, NULL);
    (void) nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, NULL,
        &is_faulted, NULL);
    (void) nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64,
        NULL, &is_degraded, NULL);
    (void) nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64,
        NULL, &isnt_present, NULL);
    (void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, NULL,
        &is_log, NULL);

    if (is_offline != 0)
        vdev->v_state = VDEV_STATE_OFFLINE;
    else if (is_removed != 0)
        vdev->v_state = VDEV_STATE_REMOVED;
    else if (is_faulted != 0)
        vdev->v_state = VDEV_STATE_FAULTED;
    else if (is_degraded != 0)
        vdev->v_state = VDEV_STATE_DEGRADED;
    else if (isnt_present != 0)
        vdev->v_state = VDEV_STATE_CANT_OPEN;

    vdev->v_islog = is_log != 0;
}

static int
vdev_init(uint64_t guid, const nvlist_t *nvlist, vdev_t **vdevp)
{
    uint64_t id, ashift, asize, nparity;
    const char *path;
    const char *type;
    int len, pathlen;
    char *name;
    vdev_t *vdev;

    if (nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, NULL, &id,
        NULL) ||
        nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING, NULL,
        &type, &len)) {
        return (ENOENT);
    }

    if (memcmp(type, VDEV_TYPE_MIRROR, len) != 0 &&
        memcmp(type, VDEV_TYPE_DISK, len) != 0 &&
#ifdef ZFS_TEST
        memcmp(type, VDEV_TYPE_FILE, len) != 0 &&
#endif
        memcmp(type, VDEV_TYPE_RAIDZ, len) != 0 &&
        memcmp(type, VDEV_TYPE_INDIRECT, len) != 0 &&
        memcmp(type, VDEV_TYPE_REPLACING, len) != 0 &&
        memcmp(type, VDEV_TYPE_HOLE, len) != 0) {
        printf("ZFS: can only boot from disk, mirror, raidz1, "
            "raidz2 and raidz3 vdevs, got: %.*s\n", len, type);
        return (EIO);
    }

    if (memcmp(type, VDEV_TYPE_MIRROR, len) == 0)
        vdev = vdev_create(guid, vdev_mirror_read);
    else if (memcmp(type, VDEV_TYPE_RAIDZ, len) == 0)
        vdev = vdev_create(guid, vdev_raidz_read);
    else if (memcmp(type, VDEV_TYPE_REPLACING, len) == 0)
        vdev = vdev_create(guid, vdev_replacing_read);
    else if (memcmp(type, VDEV_TYPE_INDIRECT, len) == 0) {
        vdev_indirect_config_t *vic;

        vdev = vdev_create(guid, vdev_indirect_read);
        if (vdev != NULL) {
            vdev->v_state = VDEV_STATE_HEALTHY;
            vic = &vdev->vdev_indirect_config;

            nvlist_find(nvlist,
                ZPOOL_CONFIG_INDIRECT_OBJECT,
                DATA_TYPE_UINT64,
                NULL, &vic->vic_mapping_object, NULL);
            nvlist_find(nvlist,
                ZPOOL_CONFIG_INDIRECT_BIRTHS,
                DATA_TYPE_UINT64,
                NULL, &vic->vic_births_object, NULL);
            nvlist_find(nvlist,
                ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
                DATA_TYPE_UINT64,
                NULL, &vic->vic_prev_indirect_vdev, NULL);
        }
    } else if (memcmp(type, VDEV_TYPE_HOLE, len) == 0) {
        vdev = vdev_create(guid, vdev_missing_read);
    } else {
        vdev = vdev_create(guid, vdev_disk_read);
    }

    if (vdev == NULL)
        return (ENOMEM);

    vdev_set_initial_state(vdev, nvlist);
    vdev->v_id = id;
    if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
        DATA_TYPE_UINT64, NULL, &ashift, NULL) == 0)
        vdev->v_ashift = ashift;

    if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE,
        DATA_TYPE_UINT64, NULL, &asize, NULL) == 0) {
        vdev->v_psize = asize +
            VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
    }

    if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
        DATA_TYPE_UINT64, NULL, &nparity, NULL) == 0)
        vdev->v_nparity = nparity;

    if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
        DATA_TYPE_STRING, NULL, &path, &pathlen) == 0) {
        char prefix[] = "/dev/";

        len = strlen(prefix);
        if (len < pathlen && memcmp(path, prefix, len) == 0) {
            path += len;
            pathlen -= len;
        }
        name = malloc(pathlen + 1);
        bcopy(path, name, pathlen);
        name[pathlen] = '\0';
        vdev->v_name = name;
    } else {
        name = NULL;
        if (memcmp(type, VDEV_TYPE_RAIDZ, len) == 0) {
            if (vdev->v_nparity < 1 ||
                vdev->v_nparity > 3) {
                printf("ZFS: invalid raidz parity: %d\n",
                    vdev->v_nparity);
                return (EIO);
            }
            (void) asprintf(&name, "%.*s%d-%" PRIu64, len, type,
                vdev->v_nparity, id);
        } else {
            (void) asprintf(&name, "%.*s-%" PRIu64, len, type, id);
        }
        vdev->v_name = name;
    }
    *vdevp = vdev;
    return (0);
}

/*
 * Find the slot for vdev.  We return either NULL to signal the use of
 * STAILQ_INSERT_HEAD, or the link element to be used with
 * STAILQ_INSERT_AFTER.
 */
static vdev_t *
vdev_find_previous(vdev_t *top_vdev, vdev_t *vdev)
{
    vdev_t *v, *previous;

    if (STAILQ_EMPTY(&top_vdev->v_children))
        return (NULL);

    previous = NULL;
    STAILQ_FOREACH(v, &top_vdev->v_children, v_childlink) {
        if (v->v_id > vdev->v_id)
            return (previous);

        if (v->v_id == vdev->v_id)
            return (v);

        if (v->v_id < vdev->v_id)
            previous = v;
    }
    return (previous);
}

static size_t
vdev_child_count(vdev_t *vdev)
{
    vdev_t *v;
    size_t count;

    count = 0;
    STAILQ_FOREACH(v, &vdev->v_children, v_childlink) {
        count++;
    }
    return (count);
}

/*
 * Insert vdev into top_vdev children list.  List is ordered by v_id.
 */
static void
vdev_insert(vdev_t *top_vdev, vdev_t *vdev)
{
    vdev_t *previous;
    size_t count;

    /*
     * The top-level vdevs can appear in random order, depending on how
     * the firmware presents the disk devices.  We insert each vdev so
     * that the list stays ordered by v_id; that way we only need
     * STAILQ_INSERT_HEAD and STAILQ_INSERT_AFTER, as STAILQ has no
     * insert-before operation.
     */
    previous = vdev_find_previous(top_vdev, vdev);

    if (previous == NULL) {
        STAILQ_INSERT_HEAD(&top_vdev->v_children, vdev, v_childlink);
    } else if (previous->v_id == vdev->v_id) {
        /*
         * This vdev was configured from label config,
         * do not insert a duplicate.
         */
        return;
    } else {
        STAILQ_INSERT_AFTER(&top_vdev->v_children, previous, vdev,
            v_childlink);
    }

    count = vdev_child_count(top_vdev);
    if (top_vdev->v_nchildren < count)
        top_vdev->v_nchildren = count;
}

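/*
 * Create the top-level vdev identified by top_guid, if we have not
 * seen it yet, and populate its children from the given config nvlist.
 */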
static int
vdev_from_nvlist(spa_t *spa, uint64_t top_guid, const nvlist_t *nvlist)
{
    vdev_t *top_vdev, *vdev;
    nvlist_t **kids = NULL;
    int rc, nkids;

    /* Get top vdev. */
    top_vdev = vdev_find(top_guid);
    if (top_vdev == NULL) {
        rc = vdev_init(top_guid, nvlist, &top_vdev);
        if (rc != 0)
            return (rc);
        top_vdev->v_spa = spa;
        top_vdev->v_top = top_vdev;
        vdev_insert(spa->spa_root_vdev, top_vdev);
    }

    /* Add children if there are any. */
    rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
        &nkids, &kids, NULL);
    if (rc == 0) {
        for (int i = 0; i < nkids; i++) {
            uint64_t guid;

            rc = nvlist_find(kids[i], ZPOOL_CONFIG_GUID,
                DATA_TYPE_UINT64, NULL, &guid, NULL);
            if (rc != 0)
                goto done;

            rc = vdev_init(guid, kids[i], &vdev);
            if (rc != 0)
                goto done;

            vdev->v_spa = spa;
            vdev->v_top = top_vdev;
            vdev_insert(top_vdev, vdev);
        }
    } else {
        /*
         * nvlist_find() returns an error when there are no
         * children; reset it, since leaf devices legitimately
         * have none.
         */
        rc = 0;
    }
done:
    if (kids != NULL) {
        for (int i = 0; i < nkids; i++)
            nvlist_destroy(kids[i]);
        free(kids);
    }

    return (rc);
}

static int
vdev_init_from_label(spa_t *spa, const nvlist_t *nvlist)
{
    uint64_t pool_guid, top_guid;
    nvlist_t *vdevs;
    int rc;

    if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
        NULL, &pool_guid, NULL) ||
        nvlist_find(nvlist, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64,
        NULL, &top_guid, NULL) ||
        nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
        NULL, &vdevs, NULL)) {
        printf("ZFS: can't find vdev details\n");
        return (ENOENT);
    }

    rc = vdev_from_nvlist(spa, top_guid, vdevs);
    nvlist_destroy(vdevs);
    return (rc);
}

static void
vdev_set_state(vdev_t *vdev)
{
    vdev_t *kid;
    int good_kids;
    int bad_kids;

    STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
        vdev_set_state(kid);
    }

    /*
     * A mirror or raidz is healthy if all its kids are healthy.  A
     * mirror is degraded if any of its kids is healthy; a raidz
     * is degraded if at most nparity kids are offline.
     */
    if (STAILQ_FIRST(&vdev->v_children)) {
        good_kids = 0;
        bad_kids = 0;
        STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
            if (kid->v_state == VDEV_STATE_HEALTHY)
                good_kids++;
            else
                bad_kids++;
        }
        if (bad_kids == 0) {
            vdev->v_state = VDEV_STATE_HEALTHY;
        } else {
            if (vdev->v_read == vdev_mirror_read) {
                if (good_kids) {
                    vdev->v_state = VDEV_STATE_DEGRADED;
                } else {
                    vdev->v_state = VDEV_STATE_OFFLINE;
                }
            } else if (vdev->v_read == vdev_raidz_read) {
                if (bad_kids > vdev->v_nparity) {
                    vdev->v_state = VDEV_STATE_OFFLINE;
                } else {
                    vdev->v_state = VDEV_STATE_DEGRADED;
                }
            }
        }
    }
}

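/*
 * Refresh the states of an already-known top-level vdev and its
 * children from the given config nvlist.
 */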
static int
vdev_update_from_nvlist(uint64_t top_guid, const nvlist_t *nvlist)
{
    vdev_t *vdev;
    nvlist_t **kids = NULL;
    int rc, nkids;

    /* Update top vdev. */
    vdev = vdev_find(top_guid);
    if (vdev != NULL)
        vdev_set_initial_state(vdev, nvlist);

    /* Update children if there are any. */
    rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
        &nkids, &kids, NULL);
    if (rc == 0) {
        for (int i = 0; i < nkids; i++) {
            uint64_t guid;

            rc = nvlist_find(kids[i], ZPOOL_CONFIG_GUID,
                DATA_TYPE_UINT64, NULL, &guid, NULL);
            if (rc != 0)
                break;

            vdev = vdev_find(guid);
            if (vdev != NULL)
                vdev_set_initial_state(vdev, kids[i]);
        }
    } else {
        rc = 0;
    }
    if (kids != NULL) {
        for (int i = 0; i < nkids; i++)
            nvlist_destroy(kids[i]);
        free(kids);
    }

    return (rc);
}

static int
vdev_init_from_nvlist(spa_t *spa, const nvlist_t *nvlist)
{
    uint64_t pool_guid, vdev_children;
    nvlist_t *vdevs = NULL, **kids = NULL;
    int rc, nkids;

    if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
        NULL, &pool_guid, NULL) ||
        nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_CHILDREN, DATA_TYPE_UINT64,
        NULL, &vdev_children, NULL) ||
        nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
        NULL, &vdevs, NULL)) {
        printf("ZFS: can't find vdev details\n");
        return (ENOENT);
    }

    /* Wrong guid?! */
    if (spa->spa_guid != pool_guid) {
        nvlist_destroy(vdevs);
        return (EINVAL);
    }

    spa->spa_root_vdev->v_nchildren = vdev_children;

    rc = nvlist_find(vdevs, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
        &nkids, &kids, NULL);
    nvlist_destroy(vdevs);

    /*
     * MOS config has at least one child for the root vdev.
     */
    if (rc != 0)
        return (rc);

    for (int i = 0; i < nkids; i++) {
        uint64_t guid;
        vdev_t *vdev;

        rc = nvlist_find(kids[i], ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
            NULL, &guid, NULL);
        if (rc != 0)
            break;
        vdev = vdev_find(guid);
        /*
         * Top-level vdev is missing, create it.
         */
        if (vdev == NULL)
            rc = vdev_from_nvlist(spa, guid, kids[i]);
        else
            rc = vdev_update_from_nvlist(guid, kids[i]);
        if (rc != 0)
            break;
    }
    if (kids != NULL) {
        for (int i = 0; i < nkids; i++)
            nvlist_destroy(kids[i]);
        free(kids);
    }

    /*
     * Re-evaluate top-level vdev state.
     */
    vdev_set_state(spa->spa_root_vdev);

    return (rc);
}

static spa_t *
spa_find_by_guid(uint64_t guid)
{
    spa_t *spa;

    STAILQ_FOREACH(spa, &zfs_pools, spa_link)
        if (spa->spa_guid == guid)
            return (spa);

    return (NULL);
}

static spa_t *
spa_find_by_name(const char *name)
{
    spa_t *spa;

    STAILQ_FOREACH(spa, &zfs_pools, spa_link)
        if (strcmp(spa->spa_name, name) == 0)
            return (spa);

    return (NULL);
}

static spa_t *
spa_find_by_dev(struct zfs_devdesc *dev)
{

    if (dev->dd.d_dev->dv_type != DEVT_ZFS)
        return (NULL);

    if (dev->pool_guid == 0)
        return (STAILQ_FIRST(&zfs_pools));

    return (spa_find_by_guid(dev->pool_guid));
}

static spa_t *
spa_create(uint64_t guid, const char *name)
{
    spa_t *spa;

    if ((spa = calloc(1, sizeof(spa_t))) == NULL)
        return (NULL);
    if ((spa->spa_name = strdup(name)) == NULL) {
        free(spa);
        return (NULL);
    }
    spa->spa_uberblock = &spa->spa_uberblock_master;
    spa->spa_mos = &spa->spa_mos_master;
    spa->spa_guid = guid;
    spa->spa_root_vdev = vdev_create(guid, NULL);
    if (spa->spa_root_vdev == NULL) {
        free(spa->spa_name);
        free(spa);
        return (NULL);
    }
    spa->spa_root_vdev->v_name = strdup("root");
    STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);

    return (spa);
}

static const char *
state_name(vdev_state_t state)
{
    static const char *names[] = {
        "UNKNOWN",
        "CLOSED",
        "OFFLINE",
        "REMOVED",
        "CANT_OPEN",
        "FAULTED",
        "DEGRADED",
        "ONLINE"
    };
    return (names[state]);
}

#ifdef BOOT2

#define pager_printf printf

#else

static int
pager_printf(const char *fmt, ...)
{
    char line[80];
    va_list args;

    va_start(args, fmt);
    vsnprintf(line, sizeof(line), fmt, args);
    va_end(args);
    return (pager_output(line));
}

#endif

#define STATUS_FORMAT " %s %s\n"

static int
print_state(int indent, const char *name, vdev_state_t state)
{
    int i;
    char buf[512];

    buf[0] = 0;
    for (i = 0; i < indent; i++)
        strcat(buf, " ");
    strcat(buf, name);
    return (pager_printf(STATUS_FORMAT, buf, state_name(state)));
}

static int
vdev_status(vdev_t *vdev, int indent)
{
    vdev_t *kid;
    int ret;

    if (vdev->v_islog) {
        (void) pager_output(" logs\n");
        indent++;
    }

    ret = print_state(indent, vdev->v_name, vdev->v_state);
    if (ret != 0)
        return (ret);

    STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
        ret = vdev_status(kid, indent + 1);
        if (ret != 0)
            return (ret);
    }
    return (ret);
}

static int
spa_status(spa_t *spa)
{
    static char bootfs[ZFS_MAXNAMELEN];
    uint64_t rootid;
    vdev_list_t *vlist;
    vdev_t *vdev;
    int good_kids, bad_kids, degraded_kids, ret;
    vdev_state_t state;

    ret = pager_printf(" pool: %s\n", spa->spa_name);
    if (ret != 0)
        return (ret);

    if (zfs_get_root(spa, &rootid) == 0 &&
        zfs_rlookup(spa, rootid, bootfs) == 0) {
        if (bootfs[0] == '\0')
            ret = pager_printf("bootfs: %s\n", spa->spa_name);
        else
            ret = pager_printf("bootfs: %s/%s\n", spa->spa_name,
                bootfs);
        if (ret != 0)
            return (ret);
    }
    ret = pager_printf("config:\n\n");
    if (ret != 0)
        return (ret);
    ret = pager_printf(STATUS_FORMAT, "NAME", "STATE");
    if (ret != 0)
        return (ret);

    good_kids = 0;
    degraded_kids = 0;
    bad_kids = 0;
    vlist = &spa->spa_root_vdev->v_children;
    STAILQ_FOREACH(vdev, vlist, v_childlink) {
        if (vdev->v_state == VDEV_STATE_HEALTHY)
            good_kids++;
        else if (vdev->v_state == VDEV_STATE_DEGRADED)
            degraded_kids++;
        else
            bad_kids++;
    }

    state = VDEV_STATE_CLOSED;
    if (good_kids > 0 && (degraded_kids + bad_kids) == 0)
        state = VDEV_STATE_HEALTHY;
    else if ((good_kids + degraded_kids) > 0)
        state = VDEV_STATE_DEGRADED;

    ret = print_state(0, spa->spa_name, state);
    if (ret != 0)
        return (ret);

    STAILQ_FOREACH(vdev, vlist, v_childlink) {
        ret = vdev_status(vdev, 1);
        if (ret != 0)
            return (ret);
    }
    return (ret);
}

static int
spa_all_status(void)
{
    spa_t *spa;
    int first = 1, ret = 0;

    STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
        if (!first) {
            ret = pager_printf("\n");
            if (ret != 0)
                return (ret);
        }
        first = 0;
        ret = spa_status(spa);
        if (ret != 0)
            return (ret);
    }
    return (ret);
}

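/*
 * ZFS keeps four copies of the vdev label: two at the front of the
 * device and two at the end.  Translate an offset within label l into
 * an absolute offset on the device.
 */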
static uint64_t
vdev_label_offset(uint64_t psize, int l, uint64_t offset)
{
    uint64_t label_offset;

    if (l < VDEV_LABELS / 2)
        label_offset = 0;
    else
        label_offset = psize - VDEV_LABELS * sizeof (vdev_label_t);

    return (offset + l * sizeof (vdev_label_t) + label_offset);
}

static int
vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2)
{
    unsigned int seq1 = 0;
    unsigned int seq2 = 0;
    int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg);

    if (cmp != 0)
        return (cmp);

    cmp = AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp);
    if (cmp != 0)
        return (cmp);

    if (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1))
        seq1 = MMP_SEQ(ub1);

    if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2))
        seq2 = MMP_SEQ(ub2);

    return (AVL_CMP(seq1, seq2));
}

static int
uberblock_verify(uberblock_t *ub)
{
    if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC)) {
        byteswap_uint64_array(ub, sizeof (uberblock_t));
    }

    if (ub->ub_magic != UBERBLOCK_MAGIC ||
        !SPA_VERSION_IS_SUPPORTED(ub->ub_version))
        return (EINVAL);

    return (0);
}

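/*
 * Read part of a label using an artificial block pointer with the
 * label checksum type, so that vdev_read_phys() verifies the embedded
 * checksum for us.
 */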
static int
vdev_label_read(vdev_t *vd, int l, void *buf, uint64_t offset,
    size_t size)
{
    blkptr_t bp;
    off_t off;

    off = vdev_label_offset(vd->v_psize, l, offset);

    BP_ZERO(&bp);
    BP_SET_LSIZE(&bp, size);
    BP_SET_PSIZE(&bp, size);
    BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
    BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
    DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
    ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);

    return (vdev_read_phys(vd, &bp, buf, off, size));
}

/*
 * We need to be sure we write to the correct location.
 * Our vdev label consists of 4 fields:
 * pad1 (8k), reserved.
 * bootenv (8k), checksummed, previously reserved, may contain garbage.
 * vdev_phys (112k), checksummed.
 * uberblock ring (128k), checksummed.
 *
 * Since the bootenv area may contain garbage, we can not reliably read it,
 * as we can get checksum errors.
 * The next best thing is vdev_phys - it is just after bootenv.  It still
 * may be corrupted, but in such a case we will miss this one write.
 */
static int
vdev_label_write_validate(vdev_t *vd, int l, uint64_t offset)
{
    uint64_t off, o_phys;
    void *buf;
    size_t size = VDEV_PHYS_SIZE;
    int rc;

    o_phys = offsetof(vdev_label_t, vl_vdev_phys);
    off = vdev_label_offset(vd->v_psize, l, o_phys);

    /* off should be 8K from bootenv */
    if (vdev_label_offset(vd->v_psize, l, offset) + VDEV_PAD_SIZE != off)
        return (EINVAL);

    buf = malloc(size);
    if (buf == NULL)
        return (ENOMEM);

    /* Read vdev_phys */
    rc = vdev_label_read(vd, l, buf, o_phys, size);
    free(buf);
    return (rc);
}

static int
vdev_label_write(vdev_t *vd, int l, vdev_boot_envblock_t *be, uint64_t offset)
{
    zio_checksum_info_t *ci;
    zio_cksum_t cksum;
    off_t off;
    size_t size = VDEV_PAD_SIZE;
    int rc;

    if (vd->v_phys_write == NULL)
        return (ENOTSUP);

    off = vdev_label_offset(vd->v_psize, l, offset);

    rc = vdev_label_write_validate(vd, l, offset);
    if (rc != 0) {
        return (rc);
    }

    ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL];
    be->vbe_zbt.zec_magic = ZEC_MAGIC;
    zio_checksum_label_verifier(&be->vbe_zbt.zec_cksum, off);
    ci->ci_func[0](be, size, NULL, &cksum);
    be->vbe_zbt.zec_cksum = cksum;

    return (vdev_write_phys(vd, be, off, size));
}

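/*
 * Recursively write the bootenv block to every label of all healthy
 * leaf vdevs (only leaves have v_phys_write).  Errors are reported,
 * but do not stop the remaining writes.
 */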
static int
vdev_write_bootenv_impl(vdev_t *vdev, vdev_boot_envblock_t *be)
{
    vdev_t *kid;
    int rv = 0, rc;

    STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
        if (kid->v_state != VDEV_STATE_HEALTHY)
            continue;
        rc = vdev_write_bootenv_impl(kid, be);
        if (rv == 0)
            rv = rc;
    }

    /*
     * Non-leaf vdevs do not have v_phys_write.
     */
    if (vdev->v_phys_write == NULL)
        return (rv);

    for (int l = 0; l < VDEV_LABELS; l++) {
        rc = vdev_label_write(vdev, l, be,
            offsetof(vdev_label_t, vl_be));
        if (rc != 0) {
            printf("failed to write bootenv to %s label %d: %d\n",
                vdev->v_name ? vdev->v_name : "unknown", l, rc);
            rv = rc;
        }
    }
    return (rv);
}

int
vdev_write_bootenv(vdev_t *vdev, nvlist_t *nvl)
{
    vdev_boot_envblock_t *be;
    nvlist_t nv, *nvp;
    uint64_t version;
    int rv;

    if (nvl->nv_size > sizeof(be->vbe_bootenv))
        return (E2BIG);

    version = VB_RAW;
    nvp = vdev_read_bootenv(vdev);
    if (nvp != NULL) {
        nvlist_find(nvp, BOOTENV_VERSION, DATA_TYPE_UINT64, NULL,
            &version, NULL);
        nvlist_destroy(nvp);
    }

    be = calloc(1, sizeof(*be));
    if (be == NULL)
        return (ENOMEM);

    be->vbe_version = version;
    switch (version) {
    case VB_RAW:
        /*
         * If there is no envmap, we will just wipe bootenv.
         */
        nvlist_find(nvl, GRUB_ENVMAP, DATA_TYPE_STRING, NULL,
            be->vbe_bootenv, NULL);
        rv = 0;
        break;

    case VB_NVLIST:
        nv.nv_header = nvl->nv_header;
        nv.nv_asize = nvl->nv_asize;
        nv.nv_size = nvl->nv_size;

        bcopy(&nv.nv_header, be->vbe_bootenv, sizeof(nv.nv_header));
        nv.nv_data = be->vbe_bootenv + sizeof(nvs_header_t);
        bcopy(nvl->nv_data, nv.nv_data, nv.nv_size);
        rv = nvlist_export(&nv);
        break;

    default:
        rv = EINVAL;
        break;
    }

    if (rv == 0) {
        be->vbe_version = htobe64(be->vbe_version);
        rv = vdev_write_bootenv_impl(vdev, be);
    }
    free(be);
    return (rv);
}

/*
 * Read the bootenv area from the pool label, and return the nvlist
 * from it.  We return from the first successful read.
 */
nvlist_t *
vdev_read_bootenv(vdev_t *vdev)
{
    vdev_t *kid;
    nvlist_t *benv;
    vdev_boot_envblock_t *be;
    char *command;
    bool ok;
    int rv;

    STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
        if (kid->v_state != VDEV_STATE_HEALTHY)
            continue;

        benv = vdev_read_bootenv(kid);
        if (benv != NULL)
            return (benv);
    }

    be = malloc(sizeof (*be));
    if (be == NULL)
        return (NULL);

    rv = 0;
    for (int l = 0; l < VDEV_LABELS; l++) {
        rv = vdev_label_read(vdev, l, be,
            offsetof(vdev_label_t, vl_be),
            sizeof (*be));
        if (rv == 0)
            break;
    }
    if (rv != 0) {
        free(be);
        return (NULL);
    }

    be->vbe_version = be64toh(be->vbe_version);
    switch (be->vbe_version) {
    case VB_RAW:
        /*
         * We have textual data in vbe_bootenv, create nvlist
         * with key "envmap".
         */
        benv = nvlist_create(NV_UNIQUE_NAME);
        if (benv != NULL) {
            if (*be->vbe_bootenv == '\0') {
                nvlist_add_uint64(benv, BOOTENV_VERSION,
                    VB_NVLIST);
                break;
            }
            nvlist_add_uint64(benv, BOOTENV_VERSION, VB_RAW);
            be->vbe_bootenv[sizeof (be->vbe_bootenv) - 1] = '\0';
            nvlist_add_string(benv, GRUB_ENVMAP, be->vbe_bootenv);
        }
        break;

    case VB_NVLIST:
        benv = nvlist_import(be->vbe_bootenv, sizeof(be->vbe_bootenv));
        break;

    default:
        command = (char *)be;
        ok = false;

        /* Check for legacy zfsbootcfg command string */
        for (int i = 0; command[i] != '\0'; i++) {
            if (iscntrl(command[i])) {
                ok = false;
                break;
            } else {
                ok = true;
            }
        }
        benv = nvlist_create(NV_UNIQUE_NAME);
        if (benv != NULL) {
            if (ok)
                nvlist_add_string(benv, FREEBSD_BOOTONCE,
                    command);
            else
                nvlist_add_uint64(benv, BOOTENV_VERSION,
                    VB_NVLIST);
        }
        break;
    }
    free(be);
    return (benv);
}

static uint64_t
vdev_get_label_asize(nvlist_t *nvl)
{
    nvlist_t *vdevs;
    uint64_t asize;
    const char *type;
    int len;

    asize = 0;
    /* Get vdev tree */
    if (nvlist_find(nvl, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
        NULL, &vdevs, NULL) != 0)
        return (asize);

    /*
     * Get vdev type.  We will calculate asize for raidz, mirror and disk.
     * For raidz, the asize is the raw size of all children.
     */
    if (nvlist_find(vdevs, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING,
        NULL, &type, &len) != 0)
        goto done;

    if (memcmp(type, VDEV_TYPE_MIRROR, len) != 0 &&
        memcmp(type, VDEV_TYPE_DISK, len) != 0 &&
        memcmp(type, VDEV_TYPE_RAIDZ, len) != 0)
        goto done;

    if (nvlist_find(vdevs, ZPOOL_CONFIG_ASIZE, DATA_TYPE_UINT64,
        NULL, &asize, NULL) != 0)
        goto done;

    if (memcmp(type, VDEV_TYPE_RAIDZ, len) == 0) {
        nvlist_t **kids;
        int nkids;

        if (nvlist_find(vdevs, ZPOOL_CONFIG_CHILDREN,
            DATA_TYPE_NVLIST_ARRAY, &nkids, &kids, NULL) != 0) {
            asize = 0;
            goto done;
        }

        asize /= nkids;
        for (int i = 0; i < nkids; i++)
            nvlist_destroy(kids[i]);
        free(kids);
    }

    asize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
done:
    nvlist_destroy(vdevs);
    return (asize);
}

static nvlist_t *
vdev_label_read_config(vdev_t *vd, uint64_t txg)
{
    vdev_phys_t *label;
    uint64_t best_txg = 0;
    uint64_t label_txg = 0;
    uint64_t asize;
    nvlist_t *nvl = NULL, *tmp;
    int error;

    label = malloc(sizeof (vdev_phys_t));
    if (label == NULL)
        return (NULL);

    for (int l = 0; l < VDEV_LABELS; l++) {
        if (vdev_label_read(vd, l, label,
            offsetof(vdev_label_t, vl_vdev_phys),
            sizeof (vdev_phys_t)))
            continue;

        tmp = nvlist_import(label->vp_nvlist,
            sizeof(label->vp_nvlist));
        if (tmp == NULL)
            continue;

        error = nvlist_find(tmp, ZPOOL_CONFIG_POOL_TXG,
            DATA_TYPE_UINT64, NULL, &label_txg, NULL);
        if (error != 0 || label_txg == 0) {
            nvlist_destroy(nvl);
            nvl = tmp;
            goto done;
        }

        if (label_txg <= txg && label_txg > best_txg) {
            best_txg = label_txg;
            nvlist_destroy(nvl);
            nvl = tmp;
            tmp = NULL;

            /*
             * Use asize from pool config.  We need this
             * because we can get a bad value from BIOS.
             */
            asize = vdev_get_label_asize(nvl);
            if (asize != 0) {
                vd->v_psize = asize;
            }
        }
        nvlist_destroy(tmp);
    }

    if (best_txg == 0) {
        nvlist_destroy(nvl);
        nvl = NULL;
    }
done:
    free(label);
    return (nvl);
}

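/*
 * Scan every uberblock slot in all labels and keep the best valid
 * uberblock (highest txg, then timestamp, then MMP sequence) in *ub.
 */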
static void
vdev_uberblock_load(vdev_t *vd, uberblock_t *ub)
{
    uberblock_t *buf;

    buf = malloc(VDEV_UBERBLOCK_SIZE(vd));
    if (buf == NULL)
        return;

    for (int l = 0; l < VDEV_LABELS; l++) {
        for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
            if (vdev_label_read(vd, l, buf,
                VDEV_UBERBLOCK_OFFSET(vd, n),
                VDEV_UBERBLOCK_SIZE(vd)))
                continue;
            if (uberblock_verify(buf) != 0)
                continue;

            if (vdev_uberblock_compare(buf, ub) > 0)
                *ub = *buf;
        }
    }
    free(buf);
}

static int
vdev_probe(vdev_phys_read_t *_read, vdev_phys_write_t *_write, void *priv,
    spa_t **spap)
{
    vdev_t vtmp;
    spa_t *spa;
    vdev_t *vdev;
    nvlist_t *nvl;
    uint64_t val;
    uint64_t guid, vdev_children;
    uint64_t pool_txg, pool_guid;
    const char *pool_name;
    int rc, namelen;

    /*
     * Load the vdev label and figure out which
     * uberblock is most current.
     */
    memset(&vtmp, 0, sizeof(vtmp));
    vtmp.v_phys_read = _read;
    vtmp.v_phys_write = _write;
    vtmp.v_priv = priv;
    vtmp.v_psize = P2ALIGN(ldi_get_size(priv),
        (uint64_t)sizeof (vdev_label_t));

    /* Test for minimum device size. */
    if (vtmp.v_psize < SPA_MINDEVSIZE)
        return (EIO);

    nvl = vdev_label_read_config(&vtmp, UINT64_MAX);
    if (nvl == NULL)
        return (EIO);

    if (nvlist_find(nvl, ZPOOL_CONFIG_VERSION, DATA_TYPE_UINT64,
        NULL, &val, NULL) != 0) {
        nvlist_destroy(nvl);
        return (EIO);
    }

    if (!SPA_VERSION_IS_SUPPORTED(val)) {
        printf("ZFS: unsupported ZFS version %u (should be %u)\n",
            (unsigned)val, (unsigned)SPA_VERSION);
        nvlist_destroy(nvl);
        return (EIO);
    }

    /* Check ZFS features for read */
    rc = nvlist_check_features_for_read(nvl);
    if (rc != 0) {
        nvlist_destroy(nvl);
        return (EIO);
    }

    if (nvlist_find(nvl, ZPOOL_CONFIG_POOL_STATE, DATA_TYPE_UINT64,
        NULL, &val, NULL) != 0) {
        nvlist_destroy(nvl);
        return (EIO);
    }

    if (val == POOL_STATE_DESTROYED) {
        /* We can't boot from destroyed pools. */
        nvlist_destroy(nvl);
        return (EIO);
    }

    if (nvlist_find(nvl, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64,
        NULL, &pool_txg, NULL) != 0 ||
        nvlist_find(nvl, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
        NULL, &pool_guid, NULL) != 0 ||
        nvlist_find(nvl, ZPOOL_CONFIG_POOL_NAME, DATA_TYPE_STRING,
        NULL, &pool_name, &namelen) != 0) {
        /*
         * Cache and spare devices end up here - just ignore
         * them.
         */
        nvlist_destroy(nvl);
        return (EIO);
    }

    /*
     * Create the pool if this is the first time we've seen it.
     */
    spa = spa_find_by_guid(pool_guid);
    if (spa == NULL) {
        char *name;

        nvlist_find(nvl, ZPOOL_CONFIG_VDEV_CHILDREN,
            DATA_TYPE_UINT64, NULL, &vdev_children, NULL);
        name = malloc(namelen + 1);
        if (name == NULL) {
            nvlist_destroy(nvl);
            return (ENOMEM);
        }
        bcopy(pool_name, name, namelen);
        name[namelen] = '\0';
        spa = spa_create(pool_guid, name);
        free(name);
        if (spa == NULL) {
            nvlist_destroy(nvl);
            return (ENOMEM);
        }
        spa->spa_root_vdev->v_nchildren = vdev_children;
    }
    if (pool_txg > spa->spa_txg)
        spa->spa_txg = pool_txg;

    /*
     * Get the vdev tree and create our in-core copy of it.
     * If we already have a vdev with this guid, this must
     * be some kind of alias (overlapping slices, dangerously dedicated
     * disks etc).
     */
    if (nvlist_find(nvl, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
        NULL, &guid, NULL) != 0) {
        nvlist_destroy(nvl);
        return (EIO);
    }
    vdev = vdev_find(guid);
    /* Has this vdev already been inited? */
    if (vdev && vdev->v_phys_read) {
        nvlist_destroy(nvl);
        return (EIO);
    }

    rc = vdev_init_from_label(spa, nvl);
    nvlist_destroy(nvl);
    if (rc != 0)
        return (rc);

    /*
     * We should already have created an incomplete vdev for this
     * vdev.  Find it and initialise it with our read proc.
     */
    vdev = vdev_find(guid);
    if (vdev != NULL) {
        vdev->v_phys_read = _read;
        vdev->v_phys_write = _write;
        vdev->v_priv = priv;
        vdev->v_psize = vtmp.v_psize;
        /*
         * If no other state is set, mark vdev healthy.
         */
        if (vdev->v_state == VDEV_STATE_UNKNOWN)
            vdev->v_state = VDEV_STATE_HEALTHY;
    } else {
        printf("ZFS: inconsistent nvlist contents\n");
        return (EIO);
    }

    if (vdev->v_islog)
        spa->spa_with_log = vdev->v_islog;

    /*
     * Re-evaluate top-level vdev state.
     */
    vdev_set_state(vdev->v_top);

    /*
     * Ok, we are happy with the pool so far.  Let's find
     * the best uberblock and then we can actually access
     * the contents of the pool.
     */
    vdev_uberblock_load(vdev, spa->spa_uberblock);

    if (spap != NULL)
        *spap = spa;
    return (0);
}

static int
ilog2(int n)
{
    int v;

    for (v = 0; v < 32; v++)
        if (n == (1 << v))
            return (v);
    return (-1);
}

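/*
 * Read a gang block: fetch the gang header with an artificial block
 * pointer, read each constituent (non-hole) block into the buffer, and
 * finally verify the checksum of the reassembled data against the
 * original block pointer.
 */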
2159 */ 2160 vdev_set_state(vdev->v_top); 2161 2162 /* 2163 * OK, we are happy with the pool so far. Let's find 2164 * the best uberblock and then we can actually access 2165 * the contents of the pool. 2166 */ 2167 vdev_uberblock_load(vdev, spa->spa_uberblock); 2168 2169 if (spap != NULL) 2170 *spap = spa; 2171 return (0); 2172 } 2173 2174 static int 2175 ilog2(int n) 2176 { 2177 int v; 2178 2179 for (v = 0; v < 32; v++) 2180 if (n == (1 << v)) 2181 return (v); 2182 return (-1); 2183 } 2184 2185 static int 2186 zio_read_gang(const spa_t *spa, const blkptr_t *bp, void *buf) 2187 { 2188 blkptr_t gbh_bp; 2189 zio_gbh_phys_t zio_gb; 2190 char *pbuf; 2191 int i; 2192 2193 /* Artificial BP for gang block header. */ 2194 gbh_bp = *bp; 2195 BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE); 2196 BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE); 2197 BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER); 2198 BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF); 2199 for (i = 0; i < SPA_DVAS_PER_BP; i++) 2200 DVA_SET_GANG(&gbh_bp.blk_dva[i], 0); 2201 2202 /* Read gang header block using the artificial BP. */ 2203 if (zio_read(spa, &gbh_bp, &zio_gb)) 2204 return (EIO); 2205 2206 pbuf = buf; 2207 for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { 2208 blkptr_t *gbp = &zio_gb.zg_blkptr[i]; 2209 2210 if (BP_IS_HOLE(gbp)) 2211 continue; 2212 if (zio_read(spa, gbp, pbuf)) 2213 return (EIO); 2214 pbuf += BP_GET_PSIZE(gbp); 2215 } 2216 2217 if (zio_checksum_verify(spa, bp, buf)) 2218 return (EIO); 2219 return (0); 2220 } 2221 2222 static int 2223 zio_read(const spa_t *spa, const blkptr_t *bp, void *buf) 2224 { 2225 int cpfunc = BP_GET_COMPRESS(bp); 2226 uint64_t align, size; 2227 void *pbuf; 2228 int i, error; 2229 2230 /* 2231 * Process data embedded in the block pointer. 2232 */ 2233 if (BP_IS_EMBEDDED(bp)) { 2234 ASSERT(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); 2235 2236 size = BPE_GET_PSIZE(bp); 2237 ASSERT(size <= BPE_PAYLOAD_SIZE); 2238 2239 if (cpfunc != ZIO_COMPRESS_OFF) 2240 pbuf = malloc(size); 2241 else 2242 pbuf = buf; 2243 2244 if (pbuf == NULL) 2245 return (ENOMEM); 2246 2247 decode_embedded_bp_compressed(bp, pbuf); 2248 error = 0; 2249 2250 if (cpfunc != ZIO_COMPRESS_OFF) { 2251 error = zio_decompress_data(cpfunc, pbuf, 2252 size, buf, BP_GET_LSIZE(bp)); 2253 free(pbuf); 2254 } 2255 if (error != 0) 2256 printf("ZFS: i/o error - unable to decompress " 2257 "block pointer data, error %d\n", error); 2258 return (error); 2259 } 2260 2261 error = EIO; 2262 2263 for (i = 0; i < SPA_DVAS_PER_BP; i++) { 2264 const dva_t *dva = &bp->blk_dva[i]; 2265 vdev_t *vdev; 2266 vdev_list_t *vlist; 2267 uint64_t vdevid; 2268 off_t offset; 2269 2270 if (!dva->dva_word[0] && !dva->dva_word[1]) 2271 continue; 2272 2273 vdevid = DVA_GET_VDEV(dva); 2274 offset = DVA_GET_OFFSET(dva); 2275 vlist = &spa->spa_root_vdev->v_children; 2276 STAILQ_FOREACH(vdev, vlist, v_childlink) { 2277 if (vdev->v_id == vdevid) 2278 break; 2279 } 2280 if (!vdev || !vdev->v_read) 2281 continue; 2282 2283 size = BP_GET_PSIZE(bp); 2284 if (vdev->v_read == vdev_raidz_read) { 2285 align = 1ULL << vdev->v_ashift; 2286 if (P2PHASE(size, align) != 0) 2287 size = P2ROUNDUP(size, align); 2288 } 2289 if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF) 2290 pbuf = malloc(size); 2291 else 2292 pbuf = buf; 2293 2294 if (pbuf == NULL) { 2295 error = ENOMEM; 2296 break; 2297 } 2298 2299 if (DVA_GET_GANG(dva)) 2300 error = zio_read_gang(spa, bp, pbuf); 2301 else 2302 error = vdev->v_read(vdev, bp, pbuf, offset, size); 2303 if (error == 0) { 2304 if (cpfunc != ZIO_COMPRESS_OFF) 2305 error
= zio_decompress_data(cpfunc, pbuf, 2306 BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp)); 2307 else if (size != BP_GET_PSIZE(bp)) 2308 bcopy(pbuf, buf, BP_GET_PSIZE(bp)); 2309 } else { 2310 printf("zio_read error: %d\n", error); 2311 } 2312 if (buf != pbuf) 2313 free(pbuf); 2314 if (error == 0) 2315 break; 2316 } 2317 if (error != 0) 2318 printf("ZFS: i/o error - all block copies unavailable\n"); 2319 2320 return (error); 2321 } 2322 2323 static int 2324 dnode_read(const spa_t *spa, const dnode_phys_t *dnode, off_t offset, 2325 void *buf, size_t buflen) 2326 { 2327 int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT; 2328 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 2329 int nlevels = dnode->dn_nlevels; 2330 int i, rc; 2331 2332 if (bsize > SPA_MAXBLOCKSIZE) { 2333 printf("ZFS: I/O error - blocks larger than %llu are not " 2334 "supported\n", SPA_MAXBLOCKSIZE); 2335 return (EIO); 2336 } 2337 2338 /* 2339 * Note: bsize may not be a power of two here so we need to do an 2340 * actual divide rather than a bitshift. 2341 */ 2342 while (buflen > 0) { 2343 uint64_t bn = offset / bsize; 2344 int boff = offset % bsize; 2345 int ibn; 2346 const blkptr_t *indbp; 2347 blkptr_t bp; 2348 2349 if (bn > dnode->dn_maxblkid) 2350 return (EIO); 2351 2352 if (dnode == dnode_cache_obj && bn == dnode_cache_bn) 2353 goto cached; 2354 2355 indbp = dnode->dn_blkptr; 2356 for (i = 0; i < nlevels; i++) { 2357 /* 2358 * Copy the bp from the indirect array so that 2359 * we can re-use the scratch buffer for multi-level 2360 * objects. 2361 */ 2362 ibn = bn >> ((nlevels - i - 1) * ibshift); 2363 ibn &= ((1 << ibshift) - 1); 2364 bp = indbp[ibn]; 2365 if (BP_IS_HOLE(&bp)) { 2366 memset(dnode_cache_buf, 0, bsize); 2367 break; 2368 } 2369 rc = zio_read(spa, &bp, dnode_cache_buf); 2370 if (rc) 2371 return (rc); 2372 indbp = (const blkptr_t *) dnode_cache_buf; 2373 } 2374 dnode_cache_obj = dnode; 2375 dnode_cache_bn = bn; 2376 cached: 2377 2378 /* 2379 * The buffer contains our data block. Copy what we 2380 * need from it and loop. 2381 */ 2382 i = bsize - boff; 2383 if (i > buflen) i = buflen; 2384 memcpy(buf, &dnode_cache_buf[boff], i); 2385 buf = ((char *)buf) + i; 2386 offset += i; 2387 buflen -= i; 2388 } 2389 2390 return (0); 2391 } 2392 2393 /* 2394 * Lookup a value in a microzap directory. 2395 */ 2396 static int 2397 mzap_lookup(const mzap_phys_t *mz, size_t size, const char *name, 2398 uint64_t *value) 2399 { 2400 const mzap_ent_phys_t *mze; 2401 int chunks, i; 2402 2403 /* 2404 * Microzap objects use exactly one block. Read the whole 2405 * thing. 2406 */ 2407 chunks = size / MZAP_ENT_LEN - 1; 2408 for (i = 0; i < chunks; i++) { 2409 mze = &mz->mz_chunk[i]; 2410 if (strcmp(mze->mze_name, name) == 0) { 2411 *value = mze->mze_value; 2412 return (0); 2413 } 2414 } 2415 2416 return (ENOENT); 2417 } 2418 2419 /* 2420 * Compare a name with a zap leaf entry. Return non-zero if the name 2421 * matches. 
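 * Fat ZAP names are not stored inline in the entry; the name lives
 * in a chain of leaf array chunks holding ZAP_LEAF_ARRAY_BYTES of
 * data each, linked through la_next, so the comparison walks that
 * chain one chunk at a time.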
2422 */ 2423 static int 2424 fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, 2425 const char *name) 2426 { 2427 size_t namelen; 2428 const zap_leaf_chunk_t *nc; 2429 const char *p; 2430 2431 namelen = zc->l_entry.le_name_numints; 2432 2433 nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk); 2434 p = name; 2435 while (namelen > 0) { 2436 size_t len; 2437 2438 len = namelen; 2439 if (len > ZAP_LEAF_ARRAY_BYTES) 2440 len = ZAP_LEAF_ARRAY_BYTES; 2441 if (memcmp(p, nc->l_array.la_array, len)) 2442 return (0); 2443 p += len; 2444 namelen -= len; 2445 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next); 2446 } 2447 2448 return (1); 2449 } 2450 2451 /* 2452 * Extract a uint64_t value from a zap leaf entry. 2453 */ 2454 static uint64_t 2455 fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc) 2456 { 2457 const zap_leaf_chunk_t *vc; 2458 int i; 2459 uint64_t value; 2460 const uint8_t *p; 2461 2462 vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk); 2463 for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) { 2464 value = (value << 8) | p[i]; 2465 } 2466 2467 return (value); 2468 } 2469 2470 static void 2471 stv(int len, void *addr, uint64_t value) 2472 { 2473 switch (len) { 2474 case 1: 2475 *(uint8_t *)addr = value; 2476 return; 2477 case 2: 2478 *(uint16_t *)addr = value; 2479 return; 2480 case 4: 2481 *(uint32_t *)addr = value; 2482 return; 2483 case 8: 2484 *(uint64_t *)addr = value; 2485 return; 2486 } 2487 } 2488 2489 /* 2490 * Extract an array from a zap leaf entry. 2491 */ 2492 static void 2493 fzap_leaf_array(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, 2494 uint64_t integer_size, uint64_t num_integers, void *buf) 2495 { 2496 uint64_t array_int_len = zc->l_entry.le_value_intlen; 2497 uint64_t value = 0; 2498 uint64_t *u64 = buf; 2499 char *p = buf; 2500 int len = MIN(zc->l_entry.le_value_numints, num_integers); 2501 int chunk = zc->l_entry.le_value_chunk; 2502 int byten = 0; 2503 2504 if (integer_size == 8 && len == 1) { 2505 *u64 = fzap_leaf_value(zl, zc); 2506 return; 2507 } 2508 2509 while (len > 0) { 2510 struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(zl, chunk).l_array; 2511 int i; 2512 2513 ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(zl)); 2514 for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) { 2515 value = (value << 8) | la->la_array[i]; 2516 byten++; 2517 if (byten == array_int_len) { 2518 stv(integer_size, p, value); 2519 byten = 0; 2520 len--; 2521 if (len == 0) 2522 return; 2523 p += integer_size; 2524 } 2525 } 2526 chunk = la->la_next; 2527 } 2528 } 2529 2530 static int 2531 fzap_check_size(uint64_t integer_size, uint64_t num_integers) 2532 { 2533 2534 switch (integer_size) { 2535 case 1: 2536 case 2: 2537 case 4: 2538 case 8: 2539 break; 2540 default: 2541 return (EINVAL); 2542 } 2543 2544 if (integer_size * num_integers > ZAP_MAXVALUELEN) 2545 return (E2BIG); 2546 2547 return (0); 2548 } 2549 2550 static void 2551 zap_leaf_free(zap_leaf_t *leaf) 2552 { 2553 free(leaf->l_phys); 2554 free(leaf); 2555 } 2556 2557 static int 2558 zap_get_leaf_byblk(fat_zap_t *zap, uint64_t blk, zap_leaf_t **lp) 2559 { 2560 int bs = FZAP_BLOCK_SHIFT(zap); 2561 int err; 2562 2563 *lp = malloc(sizeof(**lp)); 2564 if (*lp == NULL) 2565 return (ENOMEM); 2566 2567 (*lp)->l_bs = bs; 2568 (*lp)->l_phys = malloc(1 << bs); 2569 2570 if ((*lp)->l_phys == NULL) { 2571 free(*lp); 2572 return (ENOMEM); 2573 } 2574 err = dnode_read(zap->zap_spa, zap->zap_dnode, blk << bs, (*lp)->l_phys, 2575 1 << bs); 2576 if (err != 0) { 2577 zap_leaf_free(*lp); 2578 } 2579 return (err); 2580
} 2581 2582 static int 2583 zap_table_load(fat_zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, 2584 uint64_t *valp) 2585 { 2586 int bs = FZAP_BLOCK_SHIFT(zap); 2587 uint64_t blk = idx >> (bs - 3); 2588 uint64_t off = idx & ((1 << (bs - 3)) - 1); 2589 uint64_t *buf; 2590 int rc; 2591 2592 buf = malloc(1 << zap->zap_block_shift); 2593 if (buf == NULL) 2594 return (ENOMEM); 2595 rc = dnode_read(zap->zap_spa, zap->zap_dnode, (tbl->zt_blk + blk) << bs, 2596 buf, 1 << zap->zap_block_shift); 2597 if (rc == 0) 2598 *valp = buf[off]; 2599 free(buf); 2600 return (rc); 2601 } 2602 2603 static int 2604 zap_idx_to_blk(fat_zap_t *zap, uint64_t idx, uint64_t *valp) 2605 { 2606 if (zap->zap_phys->zap_ptrtbl.zt_numblks == 0) { 2607 *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx); 2608 return (0); 2609 } else { 2610 return (zap_table_load(zap, &zap->zap_phys->zap_ptrtbl, 2611 idx, valp)); 2612 } 2613 } 2614 2615 #define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n)))) 2616 static int 2617 zap_deref_leaf(fat_zap_t *zap, uint64_t h, zap_leaf_t **lp) 2618 { 2619 uint64_t idx, blk; 2620 int err; 2621 2622 idx = ZAP_HASH_IDX(h, zap->zap_phys->zap_ptrtbl.zt_shift); 2623 err = zap_idx_to_blk(zap, idx, &blk); 2624 if (err != 0) 2625 return (err); 2626 return (zap_get_leaf_byblk(zap, blk, lp)); 2627 } 2628 2629 #define CHAIN_END 0xffff /* end of the chunk chain */ 2630 #define LEAF_HASH(l, h) \ 2631 ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \ 2632 ((h) >> \ 2633 (64 - ZAP_LEAF_HASH_SHIFT(l) - (l)->l_phys->l_hdr.lh_prefix_len))) 2634 #define LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)]) 2635 2636 static int 2637 zap_leaf_lookup(zap_leaf_t *zl, uint64_t hash, const char *name, 2638 uint64_t integer_size, uint64_t num_integers, void *value) 2639 { 2640 int rc; 2641 uint16_t *chunkp; 2642 struct zap_leaf_entry *le; 2643 2644 /* 2645 * Make sure this chunk matches our hash. 2646 */ 2647 if (zl->l_phys->l_hdr.lh_prefix_len > 0 && 2648 zl->l_phys->l_hdr.lh_prefix != 2649 hash >> (64 - zl->l_phys->l_hdr.lh_prefix_len)) 2650 return (EIO); 2651 2652 rc = ENOENT; 2653 for (chunkp = LEAF_HASH_ENTPTR(zl, hash); 2654 *chunkp != CHAIN_END; chunkp = &le->le_next) { 2655 zap_leaf_chunk_t *zc; 2656 uint16_t chunk = *chunkp; 2657 2658 le = ZAP_LEAF_ENTRY(zl, chunk); 2659 if (le->le_hash != hash) 2660 continue; 2661 zc = &ZAP_LEAF_CHUNK(zl, chunk); 2662 if (fzap_name_equal(zl, zc, name)) { 2663 if (zc->l_entry.le_value_intlen > integer_size) { 2664 rc = EINVAL; 2665 } else { 2666 fzap_leaf_array(zl, zc, integer_size, 2667 num_integers, value); 2668 rc = 0; 2669 } 2670 break; 2671 } 2672 } 2673 return (rc); 2674 } 2675 2676 /* 2677 * Lookup a value in a fatzap directory. 
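 * The name is hashed with the pool-wide salt, the top zt_shift bits
 * of the hash index the pointer table (see ZAP_HASH_IDX above) to
 * pick the leaf block, and the leaf's hash chain is then scanned
 * for a matching entry. For example, with zt_shift == 10 the
 * lookup selects one of 1024 pointer-table slots via hash >> 54.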
2678 */ 2679 static int 2680 fzap_lookup(const spa_t *spa, const dnode_phys_t *dnode, zap_phys_t *zh, 2681 const char *name, uint64_t integer_size, uint64_t num_integers, 2682 void *value) 2683 { 2684 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 2685 fat_zap_t z; 2686 zap_leaf_t *zl; 2687 uint64_t hash; 2688 int rc; 2689 2690 if (zh->zap_magic != ZAP_MAGIC) 2691 return (EIO); 2692 2693 if ((rc = fzap_check_size(integer_size, num_integers)) != 0) { 2694 return (rc); 2695 } 2696 2697 z.zap_block_shift = ilog2(bsize); 2698 z.zap_phys = zh; 2699 z.zap_spa = spa; 2700 z.zap_dnode = dnode; 2701 2702 hash = zap_hash(zh->zap_salt, name); 2703 rc = zap_deref_leaf(&z, hash, &zl); 2704 if (rc != 0) 2705 return (rc); 2706 2707 rc = zap_leaf_lookup(zl, hash, name, integer_size, num_integers, value); 2708 2709 zap_leaf_free(zl); 2710 return (rc); 2711 } 2712 2713 /* 2714 * Lookup a name in a zap object and return its value as a uint64_t. 2715 */ 2716 static int 2717 zap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name, 2718 uint64_t integer_size, uint64_t num_integers, void *value) 2719 { 2720 int rc; 2721 zap_phys_t *zap; 2722 size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 2723 2724 zap = malloc(size); 2725 if (zap == NULL) 2726 return (ENOMEM); 2727 2728 rc = dnode_read(spa, dnode, 0, zap, size); 2729 if (rc) 2730 goto done; 2731 2732 switch (zap->zap_block_type) { 2733 case ZBT_MICRO: 2734 rc = mzap_lookup((const mzap_phys_t *)zap, size, name, value); 2735 break; 2736 case ZBT_HEADER: 2737 rc = fzap_lookup(spa, dnode, zap, name, integer_size, 2738 num_integers, value); 2739 break; 2740 default: 2741 printf("ZFS: invalid zap_type=%" PRIx64 "\n", 2742 zap->zap_block_type); 2743 rc = EIO; 2744 } 2745 done: 2746 free(zap); 2747 return (rc); 2748 } 2749 2750 /* 2751 * List a microzap directory. 2752 */ 2753 static int 2754 mzap_list(const mzap_phys_t *mz, size_t size, 2755 int (*callback)(const char *, uint64_t)) 2756 { 2757 const mzap_ent_phys_t *mze; 2758 int chunks, i, rc; 2759 2760 /* 2761 * Microzap objects use exactly one block. Read the whole 2762 * thing. 2763 */ 2764 rc = 0; 2765 chunks = size / MZAP_ENT_LEN - 1; 2766 for (i = 0; i < chunks; i++) { 2767 mze = &mz->mz_chunk[i]; 2768 if (mze->mze_name[0]) { 2769 rc = callback(mze->mze_name, mze->mze_value); 2770 if (rc != 0) 2771 break; 2772 } 2773 } 2774 2775 return (rc); 2776 } 2777 2778 /* 2779 * List a fatzap directory. 2780 */ 2781 static int 2782 fzap_list(const spa_t *spa, const dnode_phys_t *dnode, zap_phys_t *zh, 2783 int (*callback)(const char *, uint64_t)) 2784 { 2785 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 2786 fat_zap_t z; 2787 uint64_t i; 2788 int j, rc; 2789 2790 if (zh->zap_magic != ZAP_MAGIC) 2791 return (EIO); 2792 2793 z.zap_block_shift = ilog2(bsize); 2794 z.zap_phys = zh; 2795 2796 /* 2797 * This assumes that the leaf blocks start at block 1. The 2798 * documentation isn't exactly clear on this. 
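 * In practice block 0 of a fat ZAP object holds the zap_phys_t
 * header (including the embedded pointer table), so the first leaf
 * lands at block 1; the (i + 1) << l_bs offsets below depend on it.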
2799 */ 2800 zap_leaf_t zl; 2801 zl.l_bs = z.zap_block_shift; 2802 zl.l_phys = malloc(bsize); 2803 if (zl.l_phys == NULL) 2804 return (ENOMEM); 2805 2806 for (i = 0; i < zh->zap_num_leafs; i++) { 2807 off_t off = ((off_t)(i + 1)) << zl.l_bs; 2808 char name[256], *p; 2809 uint64_t value; 2810 2811 if (dnode_read(spa, dnode, off, zl.l_phys, bsize)) { 2812 free(zl.l_phys); 2813 return (EIO); 2814 } 2815 2816 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) { 2817 zap_leaf_chunk_t *zc, *nc; 2818 int namelen; 2819 2820 zc = &ZAP_LEAF_CHUNK(&zl, j); 2821 if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY) 2822 continue; 2823 namelen = zc->l_entry.le_name_numints; 2824 if (namelen > sizeof(name)) 2825 namelen = sizeof(name); 2826 2827 /* 2828 * Paste the name back together. 2829 */ 2830 nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk); 2831 p = name; 2832 while (namelen > 0) { 2833 int len; 2834 len = namelen; 2835 if (len > ZAP_LEAF_ARRAY_BYTES) 2836 len = ZAP_LEAF_ARRAY_BYTES; 2837 memcpy(p, nc->l_array.la_array, len); 2838 p += len; 2839 namelen -= len; 2840 nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next); 2841 } 2842 2843 /* 2844 * Assume the first eight bytes of the value are 2845 * a uint64_t. 2846 */ 2847 value = fzap_leaf_value(&zl, zc); 2848 2849 /* printf("%s 0x%jx\n", name, (uintmax_t)value); */ 2850 rc = callback((const char *)name, value); 2851 if (rc != 0) { 2852 free(zl.l_phys); 2853 return (rc); 2854 } 2855 } 2856 } 2857 2858 free(zl.l_phys); 2859 return (0); 2860 } 2861 2862 static int zfs_printf(const char *name, uint64_t value __unused) 2863 { 2864 2865 printf("%s\n", name); 2866 2867 return (0); 2868 } 2869 2870 /* 2871 * List a zap directory. 2872 */ 2873 static int 2874 zap_list(const spa_t *spa, const dnode_phys_t *dnode) 2875 { 2876 zap_phys_t *zap; 2877 size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 2878 int rc; 2879 2880 zap = malloc(size); 2881 if (zap == NULL) 2882 return (ENOMEM); 2883 2884 rc = dnode_read(spa, dnode, 0, zap, size); 2885 if (rc == 0) { 2886 if (zap->zap_block_type == ZBT_MICRO) 2887 rc = mzap_list((const mzap_phys_t *)zap, size, 2888 zfs_printf); 2889 else 2890 rc = fzap_list(spa, dnode, zap, zfs_printf); 2891 } 2892 free(zap); 2893 return (rc); 2894 } 2895 2896 static int 2897 objset_get_dnode(const spa_t *spa, const objset_phys_t *os, uint64_t objnum, 2898 dnode_phys_t *dnode) 2899 { 2900 off_t offset; 2901 2902 offset = objnum * sizeof(dnode_phys_t); 2903 return (dnode_read(spa, &os->os_meta_dnode, offset, 2904 dnode, sizeof(dnode_phys_t))); 2905 } 2906 2907 /* 2908 * Reverse lookup: given a value, find its name in a microzap directory. 2909 */ 2910 static int 2911 mzap_rlookup(const mzap_phys_t *mz, size_t size, char *name, uint64_t value) 2912 { 2913 const mzap_ent_phys_t *mze; 2914 int chunks, i; 2915 2916 /* 2917 * Microzap objects use exactly one block. Read the whole 2918 * thing.
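 * The first MZAP_ENT_LEN (64) bytes of the block are the
 * mzap_phys_t header; every following 64-byte chunk is one
 * mzap_ent_phys_t, which is why the chunk count below is
 * size / MZAP_ENT_LEN - 1.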
2919 */ 2920 chunks = size / MZAP_ENT_LEN - 1; 2921 for (i = 0; i < chunks; i++) { 2922 mze = &mz->mz_chunk[i]; 2923 if (value == mze->mze_value) { 2924 strcpy(name, mze->mze_name); 2925 return (0); 2926 } 2927 } 2928 2929 return (ENOENT); 2930 } 2931 2932 static void 2933 fzap_name_copy(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, char *name) 2934 { 2935 size_t namelen; 2936 const zap_leaf_chunk_t *nc; 2937 char *p; 2938 2939 namelen = zc->l_entry.le_name_numints; 2940 2941 nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk); 2942 p = name; 2943 while (namelen > 0) { 2944 size_t len; 2945 len = namelen; 2946 if (len > ZAP_LEAF_ARRAY_BYTES) 2947 len = ZAP_LEAF_ARRAY_BYTES; 2948 memcpy(p, nc->l_array.la_array, len); 2949 p += len; 2950 namelen -= len; 2951 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next); 2952 } 2953 2954 *p = '\0'; 2955 } 2956 2957 static int 2958 fzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, zap_phys_t *zh, 2959 char *name, uint64_t value) 2960 { 2961 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 2962 fat_zap_t z; 2963 uint64_t i; 2964 int j, rc; 2965 2966 if (zh->zap_magic != ZAP_MAGIC) 2967 return (EIO); 2968 2969 z.zap_block_shift = ilog2(bsize); 2970 z.zap_phys = zh; 2971 2972 /* 2973 * This assumes that the leaf blocks start at block 1. The 2974 * documentation isn't exactly clear on this. 2975 */ 2976 zap_leaf_t zl; 2977 zl.l_bs = z.zap_block_shift; 2978 zl.l_phys = malloc(bsize); 2979 if (zl.l_phys == NULL) 2980 return (ENOMEM); 2981 2982 for (i = 0; i < zh->zap_num_leafs; i++) { 2983 off_t off = ((off_t)(i + 1)) << zl.l_bs; 2984 2985 rc = dnode_read(spa, dnode, off, zl.l_phys, bsize); 2986 if (rc != 0) 2987 goto done; 2988 2989 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) { 2990 zap_leaf_chunk_t *zc; 2991 2992 zc = &ZAP_LEAF_CHUNK(&zl, j); 2993 if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY) 2994 continue; 2995 if (zc->l_entry.le_value_intlen != 8 || 2996 zc->l_entry.le_value_numints != 1) 2997 continue; 2998 2999 if (fzap_leaf_value(&zl, zc) == value) { 3000 fzap_name_copy(&zl, zc, name); 3001 goto done; 3002 } 3003 } 3004 } 3005 3006 rc = ENOENT; 3007 done: 3008 free(zl.l_phys); 3009 return (rc); 3010 } 3011 3012 static int 3013 zap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, 3014 uint64_t value) 3015 { 3016 zap_phys_t *zap; 3017 size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 3018 int rc; 3019 3020 zap = malloc(size); 3021 if (zap == NULL) 3022 return (ENOMEM); 3023 3024 rc = dnode_read(spa, dnode, 0, zap, size); 3025 if (rc == 0) { 3026 if (zap->zap_block_type == ZBT_MICRO) 3027 rc = mzap_rlookup((const mzap_phys_t *)zap, size, 3028 name, value); 3029 else 3030 rc = fzap_rlookup(spa, dnode, zap, name, value); 3031 } 3032 free(zap); 3033 return (rc); 3034 } 3035 3036 static int 3037 zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result) 3038 { 3039 char name[256]; 3040 char component[256]; 3041 uint64_t dir_obj, parent_obj, child_dir_zapobj; 3042 dnode_phys_t child_dir_zap, dataset, dir, parent; 3043 dsl_dir_phys_t *dd; 3044 dsl_dataset_phys_t *ds; 3045 char *p; 3046 int len; 3047 3048 p = &name[sizeof(name) - 1]; 3049 *p = '\0'; 3050 3051 if (objset_get_dnode(spa, spa->spa_mos, objnum, &dataset)) { 3052 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum); 3053 return (EIO); 3054 } 3055 ds = (dsl_dataset_phys_t *)&dataset.dn_bonus; 3056 dir_obj = ds->ds_dir_obj; 3057 3058 for (;;) { 3059 if (objset_get_dnode(spa, spa->spa_mos, dir_obj, &dir) != 0) 3060 return (EIO); 3061 dd = (dsl_dir_phys_t 
*)&dir.dn_bonus; 3062 3063 /* Actual loop condition. */ 3064 parent_obj = dd->dd_parent_obj; 3065 if (parent_obj == 0) 3066 break; 3067 3068 if (objset_get_dnode(spa, spa->spa_mos, parent_obj, 3069 &parent) != 0) 3070 return (EIO); 3071 dd = (dsl_dir_phys_t *)&parent.dn_bonus; 3072 child_dir_zapobj = dd->dd_child_dir_zapobj; 3073 if (objset_get_dnode(spa, spa->spa_mos, child_dir_zapobj, 3074 &child_dir_zap) != 0) 3075 return (EIO); 3076 if (zap_rlookup(spa, &child_dir_zap, component, dir_obj) != 0) 3077 return (EIO); 3078 3079 len = strlen(component); 3080 p -= len; 3081 memcpy(p, component, len); 3082 --p; 3083 *p = '/'; 3084 3085 /* Actual loop iteration. */ 3086 dir_obj = parent_obj; 3087 } 3088 3089 if (*p != '\0') 3090 ++p; 3091 strcpy(result, p); 3092 3093 return (0); 3094 } 3095 3096 static int 3097 zfs_lookup_dataset(const spa_t *spa, const char *name, uint64_t *objnum) 3098 { 3099 char element[256]; 3100 uint64_t dir_obj, child_dir_zapobj; 3101 dnode_phys_t child_dir_zap, dir; 3102 dsl_dir_phys_t *dd; 3103 const char *p, *q; 3104 3105 if (objset_get_dnode(spa, spa->spa_mos, 3106 DMU_POOL_DIRECTORY_OBJECT, &dir)) 3107 return (EIO); 3108 if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, sizeof (dir_obj), 3109 1, &dir_obj)) 3110 return (EIO); 3111 3112 p = name; 3113 for (;;) { 3114 if (objset_get_dnode(spa, spa->spa_mos, dir_obj, &dir)) 3115 return (EIO); 3116 dd = (dsl_dir_phys_t *)&dir.dn_bonus; 3117 3118 while (*p == '/') 3119 p++; 3120 /* Actual loop condition #1. */ 3121 if (*p == '\0') 3122 break; 3123 3124 q = strchr(p, '/'); 3125 if (q) { 3126 memcpy(element, p, q - p); 3127 element[q - p] = '\0'; 3128 p = q + 1; 3129 } else { 3130 strcpy(element, p); 3131 p += strlen(p); 3132 } 3133 3134 child_dir_zapobj = dd->dd_child_dir_zapobj; 3135 if (objset_get_dnode(spa, spa->spa_mos, child_dir_zapobj, 3136 &child_dir_zap) != 0) 3137 return (EIO); 3138 3139 /* Actual loop condition #2. 
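 * If the child directory ZAP has no entry for this component, the
 * path does not name a dataset, so fail with ENOENT; on success,
 * descend into the child directory and continue with the next
 * component.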
*/ 3140 if (zap_lookup(spa, &child_dir_zap, element, sizeof (dir_obj), 3141 1, &dir_obj) != 0) 3142 return (ENOENT); 3143 } 3144 3145 *objnum = dd->dd_head_dataset_obj; 3146 return (0); 3147 } 3148 3149 #ifndef BOOT2 3150 static int 3151 zfs_list_dataset(const spa_t *spa, uint64_t objnum/*, int pos, char *entry*/) 3152 { 3153 uint64_t dir_obj, child_dir_zapobj; 3154 dnode_phys_t child_dir_zap, dir, dataset; 3155 dsl_dataset_phys_t *ds; 3156 dsl_dir_phys_t *dd; 3157 3158 if (objset_get_dnode(spa, spa->spa_mos, objnum, &dataset)) { 3159 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum); 3160 return (EIO); 3161 } 3162 ds = (dsl_dataset_phys_t *)&dataset.dn_bonus; 3163 dir_obj = ds->ds_dir_obj; 3164 3165 if (objset_get_dnode(spa, spa->spa_mos, dir_obj, &dir)) { 3166 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj); 3167 return (EIO); 3168 } 3169 dd = (dsl_dir_phys_t *)&dir.dn_bonus; 3170 3171 child_dir_zapobj = dd->dd_child_dir_zapobj; 3172 if (objset_get_dnode(spa, spa->spa_mos, child_dir_zapobj, 3173 &child_dir_zap) != 0) { 3174 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj); 3175 return (EIO); 3176 } 3177 3178 return (zap_list(spa, &child_dir_zap) != 0); 3179 } 3180 3181 int 3182 zfs_callback_dataset(const spa_t *spa, uint64_t objnum, 3183 int (*callback)(const char *, uint64_t)) 3184 { 3185 uint64_t dir_obj, child_dir_zapobj; 3186 dnode_phys_t child_dir_zap, dir, dataset; 3187 dsl_dataset_phys_t *ds; 3188 dsl_dir_phys_t *dd; 3189 zap_phys_t *zap; 3190 size_t size; 3191 int err; 3192 3193 err = objset_get_dnode(spa, spa->spa_mos, objnum, &dataset); 3194 if (err != 0) { 3195 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum); 3196 return (err); 3197 } 3198 ds = (dsl_dataset_phys_t *)&dataset.dn_bonus; 3199 dir_obj = ds->ds_dir_obj; 3200 3201 err = objset_get_dnode(spa, spa->spa_mos, dir_obj, &dir); 3202 if (err != 0) { 3203 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj); 3204 return (err); 3205 } 3206 dd = (dsl_dir_phys_t *)&dir.dn_bonus; 3207 3208 child_dir_zapobj = dd->dd_child_dir_zapobj; 3209 err = objset_get_dnode(spa, spa->spa_mos, child_dir_zapobj, 3210 &child_dir_zap); 3211 if (err != 0) { 3212 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj); 3213 return (err); 3214 } 3215 3216 size = child_dir_zap.dn_datablkszsec << SPA_MINBLOCKSHIFT; 3217 zap = malloc(size); 3218 if (zap != NULL) { 3219 err = dnode_read(spa, &child_dir_zap, 0, zap, size); 3220 if (err != 0) 3221 goto done; 3222 3223 if (zap->zap_block_type == ZBT_MICRO) 3224 err = mzap_list((const mzap_phys_t *)zap, size, 3225 callback); 3226 else 3227 err = fzap_list(spa, &child_dir_zap, zap, callback); 3228 } else { 3229 err = ENOMEM; 3230 } 3231 done: 3232 free(zap); 3233 return (err); 3234 } 3235 #endif 3236 3237 /* 3238 * Find the object set given the object number of its dataset object 3239 * and return its details in *objset 3240 */ 3241 static int 3242 zfs_mount_dataset(const spa_t *spa, uint64_t objnum, objset_phys_t *objset) 3243 { 3244 dnode_phys_t dataset; 3245 dsl_dataset_phys_t *ds; 3246 3247 if (objset_get_dnode(spa, spa->spa_mos, objnum, &dataset)) { 3248 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum); 3249 return (EIO); 3250 } 3251 3252 ds = (dsl_dataset_phys_t *)&dataset.dn_bonus; 3253 if (zio_read(spa, &ds->ds_bp, objset)) { 3254 printf("ZFS: can't read object set for dataset %ju\n", 3255 (uintmax_t)objnum); 3256 return (EIO); 3257 } 3258 3259 return (0); 3260 } 3261 3262 /* 3263 * Find the object set pointed to by the BOOTFS property 
or the root 3264 * dataset if there is none, and return its object number in *objid. 3265 */ 3266 static int 3267 zfs_get_root(const spa_t *spa, uint64_t *objid) 3268 { 3269 dnode_phys_t dir, propdir; 3270 uint64_t props, bootfs, root; 3271 3272 *objid = 0; 3273 3274 /* 3275 * Start with the MOS directory object. 3276 */ 3277 if (objset_get_dnode(spa, spa->spa_mos, 3278 DMU_POOL_DIRECTORY_OBJECT, &dir)) { 3279 printf("ZFS: can't read MOS object directory\n"); 3280 return (EIO); 3281 } 3282 3283 /* 3284 * Lookup the pool_props and see if we can find a bootfs. 3285 */ 3286 if (zap_lookup(spa, &dir, DMU_POOL_PROPS, 3287 sizeof(props), 1, &props) == 0 && 3288 objset_get_dnode(spa, spa->spa_mos, props, &propdir) == 0 && 3289 zap_lookup(spa, &propdir, "bootfs", 3290 sizeof(bootfs), 1, &bootfs) == 0 && bootfs != 0) { 3291 *objid = bootfs; 3292 return (0); 3293 } 3294 /* 3295 * Lookup the root dataset directory. 3296 */ 3297 if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, 3298 sizeof(root), 1, &root) || 3299 objset_get_dnode(spa, spa->spa_mos, root, &dir)) { 3300 printf("ZFS: can't find root dsl_dir\n"); 3301 return (EIO); 3302 } 3303 3304 /* 3305 * Use the information from the dataset directory's bonus buffer 3306 * to find the dataset object and from that the object set itself. 3307 */ 3308 dsl_dir_phys_t *dd = (dsl_dir_phys_t *)&dir.dn_bonus; 3309 *objid = dd->dd_head_dataset_obj; 3310 return (0); 3311 } 3312 3313 static int 3314 zfs_mount(const spa_t *spa, uint64_t rootobj, struct zfsmount *mount) 3315 { 3316 3317 mount->spa = spa; 3318 3319 /* 3320 * Find the root object set if not explicitly provided. 3321 */ 3322 if (rootobj == 0 && zfs_get_root(spa, &rootobj)) { 3323 printf("ZFS: can't find root filesystem\n"); 3324 return (EIO); 3325 } 3326 3327 if (zfs_mount_dataset(spa, rootobj, &mount->objset)) { 3328 printf("ZFS: can't open root filesystem\n"); 3329 return (EIO); 3330 } 3331 3332 mount->rootobj = rootobj; 3333 3334 return (0); 3335 } 3336 3337 /* 3338 * Callback function for feature name checks. 3339 */ 3340 static int 3341 check_feature(const char *name, uint64_t value) 3342 { 3343 int i; 3344 3345 if (value == 0) 3346 return (0); 3347 if (name[0] == '\0') 3348 return (0); 3349 3350 for (i = 0; features_for_read[i] != NULL; i++) { 3351 if (strcmp(name, features_for_read[i]) == 0) 3352 return (0); 3353 } 3354 printf("ZFS: unsupported feature: %s\n", name); 3355 return (EIO); 3356 } 3357 3358 /* 3359 * Checks whether the MOS features that are active are supported. 3360 */ 3361 static int 3362 check_mos_features(const spa_t *spa) 3363 { 3364 dnode_phys_t dir; 3365 zap_phys_t *zap; 3366 uint64_t objnum; 3367 size_t size; 3368 int rc; 3369 3370 if ((rc = objset_get_dnode(spa, spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, 3371 &dir)) != 0) 3372 return (rc); 3373 if ((rc = zap_lookup(spa, &dir, DMU_POOL_FEATURES_FOR_READ, 3374 sizeof (objnum), 1, &objnum)) != 0) { 3375 /* 3376 * It is an older pool without features. As we have already 3377 * tested the label, just return without raising the error.
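 * (Pools created before SPA_VERSION_FEATURES, version 5000, have
 * no DMU_POOL_FEATURES_FOR_READ entry in the object directory at
 * all, so a missing entry is expected here rather than an error.)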
3378 */ 3379 return (0); 3380 } 3381 3382 if ((rc = objset_get_dnode(spa, spa->spa_mos, objnum, &dir)) != 0) 3383 return (rc); 3384 3385 if (dir.dn_type != DMU_OTN_ZAP_METADATA) 3386 return (EIO); 3387 3388 size = dir.dn_datablkszsec << SPA_MINBLOCKSHIFT; 3389 zap = malloc(size); 3390 if (zap == NULL) 3391 return (ENOMEM); 3392 3393 if (dnode_read(spa, &dir, 0, zap, size)) { 3394 free(zap); 3395 return (EIO); 3396 } 3397 3398 if (zap->zap_block_type == ZBT_MICRO) 3399 rc = mzap_list((const mzap_phys_t *)zap, size, check_feature); 3400 else 3401 rc = fzap_list(spa, &dir, zap, check_feature); 3402 3403 free(zap); 3404 return (rc); 3405 } 3406 3407 static int 3408 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 3409 { 3410 dnode_phys_t dir; 3411 size_t size; 3412 int rc; 3413 char *nv; 3414 3415 *value = NULL; 3416 if ((rc = objset_get_dnode(spa, spa->spa_mos, obj, &dir)) != 0) 3417 return (rc); 3418 if (dir.dn_type != DMU_OT_PACKED_NVLIST && 3419 dir.dn_bonustype != DMU_OT_PACKED_NVLIST_SIZE) { 3420 return (EIO); 3421 } 3422 3423 if (dir.dn_bonuslen != sizeof (uint64_t)) 3424 return (EIO); 3425 3426 size = *(uint64_t *)DN_BONUS(&dir); 3427 nv = malloc(size); 3428 if (nv == NULL) 3429 return (ENOMEM); 3430 3431 rc = dnode_read(spa, &dir, 0, nv, size); 3432 if (rc != 0) { 3433 free(nv); 3434 nv = NULL; 3435 return (rc); 3436 } 3437 *value = nvlist_import(nv, size); 3438 free(nv); 3439 return (rc); 3440 } 3441 3442 static int 3443 zfs_spa_init(spa_t *spa) 3444 { 3445 struct uberblock checkpoint; 3446 dnode_phys_t dir; 3447 uint64_t config_object; 3448 nvlist_t *nvlist; 3449 int rc; 3450 3451 if (zio_read(spa, &spa->spa_uberblock->ub_rootbp, spa->spa_mos)) { 3452 printf("ZFS: can't read MOS of pool %s\n", spa->spa_name); 3453 return (EIO); 3454 } 3455 if (spa->spa_mos->os_type != DMU_OST_META) { 3456 printf("ZFS: corrupted MOS of pool %s\n", spa->spa_name); 3457 return (EIO); 3458 } 3459 3460 if (objset_get_dnode(spa, &spa->spa_mos_master, 3461 DMU_POOL_DIRECTORY_OBJECT, &dir)) { 3462 printf("ZFS: failed to read pool %s directory object\n", 3463 spa->spa_name); 3464 return (EIO); 3465 } 3466 /* This is allowed to fail; older pools do not have a salt. */ 3467 rc = zap_lookup(spa, &dir, DMU_POOL_CHECKSUM_SALT, 1, 3468 sizeof (spa->spa_cksum_salt.zcs_bytes), 3469 spa->spa_cksum_salt.zcs_bytes); 3470 3471 rc = check_mos_features(spa); 3472 if (rc != 0) { 3473 printf("ZFS: pool %s is not supported\n", spa->spa_name); 3474 return (rc); 3475 } 3476 3477 rc = zap_lookup(spa, &dir, DMU_POOL_CONFIG, 3478 sizeof (config_object), 1, &config_object); 3479 if (rc != 0) { 3480 printf("ZFS: cannot read MOS %s\n", DMU_POOL_CONFIG); 3481 return (EIO); 3482 } 3483 rc = load_nvlist(spa, config_object, &nvlist); 3484 if (rc != 0) 3485 return (rc); 3486 3487 rc = zap_lookup(spa, &dir, DMU_POOL_ZPOOL_CHECKPOINT, 3488 sizeof(uint64_t), sizeof(checkpoint) / sizeof(uint64_t), 3489 &checkpoint); 3490 if (rc == 0 && checkpoint.ub_checkpoint_txg != 0) { 3491 memcpy(&spa->spa_uberblock_checkpoint, &checkpoint, 3492 sizeof(checkpoint)); 3493 if (zio_read(spa, &spa->spa_uberblock_checkpoint.ub_rootbp, 3494 &spa->spa_mos_checkpoint)) { 3495 printf("ZFS: cannot read checkpoint data.\n"); 3496 return (EIO); 3497 } 3498 } 3499 3500 /* 3501 * Update vdevs from the MOS config. Note that we skip the encoding 3502 * bytes here; see also vdev_label_read_config().
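 * The MOS copy of the config is preferred over the label copy
 * because it describes the whole vdev tree (every top-level vdev
 * and its children), not just the device whose label we probed.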
3503 */ 3504 rc = vdev_init_from_nvlist(spa, nvlist); 3505 nvlist_destroy(nvlist); 3506 return (rc); 3507 } 3508 3509 static int 3510 zfs_dnode_stat(const spa_t *spa, dnode_phys_t *dn, struct stat *sb) 3511 { 3512 3513 if (dn->dn_bonustype != DMU_OT_SA) { 3514 znode_phys_t *zp = (znode_phys_t *)dn->dn_bonus; 3515 3516 sb->st_mode = zp->zp_mode; 3517 sb->st_uid = zp->zp_uid; 3518 sb->st_gid = zp->zp_gid; 3519 sb->st_size = zp->zp_size; 3520 } else { 3521 sa_hdr_phys_t *sahdrp; 3522 int hdrsize; 3523 size_t size = 0; 3524 void *buf = NULL; 3525 3526 if (dn->dn_bonuslen != 0) 3527 sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn); 3528 else { 3529 if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) { 3530 blkptr_t *bp = DN_SPILL_BLKPTR(dn); 3531 int error; 3532 3533 size = BP_GET_LSIZE(bp); 3534 buf = malloc(size); 3535 if (buf == NULL) 3536 error = ENOMEM; 3537 else 3538 error = zio_read(spa, bp, buf); 3539 3540 if (error != 0) { 3541 free(buf); 3542 return (error); 3543 } 3544 sahdrp = buf; 3545 } else { 3546 return (EIO); 3547 } 3548 } 3549 hdrsize = SA_HDR_SIZE(sahdrp); 3550 sb->st_mode = *(uint64_t *)((char *)sahdrp + hdrsize + 3551 SA_MODE_OFFSET); 3552 sb->st_uid = *(uint64_t *)((char *)sahdrp + hdrsize + 3553 SA_UID_OFFSET); 3554 sb->st_gid = *(uint64_t *)((char *)sahdrp + hdrsize + 3555 SA_GID_OFFSET); 3556 sb->st_size = *(uint64_t *)((char *)sahdrp + hdrsize + 3557 SA_SIZE_OFFSET); 3558 free(buf); 3559 } 3560 3561 return (0); 3562 } 3563 3564 static int 3565 zfs_dnode_readlink(const spa_t *spa, dnode_phys_t *dn, char *path, size_t psize) 3566 { 3567 int rc = 0; 3568 3569 if (dn->dn_bonustype == DMU_OT_SA) { 3570 sa_hdr_phys_t *sahdrp = NULL; 3571 size_t size = 0; 3572 void *buf = NULL; 3573 int hdrsize; 3574 char *p; 3575 3576 if (dn->dn_bonuslen != 0) { 3577 sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn); 3578 } else { 3579 blkptr_t *bp; 3580 3581 if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) == 0) 3582 return (EIO); 3583 bp = DN_SPILL_BLKPTR(dn); 3584 3585 size = BP_GET_LSIZE(bp); 3586 buf = malloc(size); 3587 if (buf == NULL) 3588 rc = ENOMEM; 3589 else 3590 rc = zio_read(spa, bp, buf); 3591 if (rc != 0) { 3592 free(buf); 3593 return (rc); 3594 } 3595 sahdrp = buf; 3596 } 3597 hdrsize = SA_HDR_SIZE(sahdrp); 3598 p = (char *)((uintptr_t)sahdrp + hdrsize + SA_SYMLINK_OFFSET); 3599 memcpy(path, p, psize); 3600 free(buf); 3601 return (0); 3602 } 3603 /* 3604 * Second test is purely to silence bogus compiler 3605 * warning about accessing past the end of dn_bonus. 3606 */ 3607 if (psize + sizeof(znode_phys_t) <= dn->dn_bonuslen && 3608 sizeof(znode_phys_t) <= sizeof(dn->dn_bonus)) { 3609 memcpy(path, &dn->dn_bonus[sizeof(znode_phys_t)], psize); 3610 } else { 3611 rc = dnode_read(spa, dn, 0, path, psize); 3612 } 3613 return (rc); 3614 } 3615 3616 struct obj_list { 3617 uint64_t objnum; 3618 STAILQ_ENTRY(obj_list) entry; 3619 }; 3620 3621 /* 3622 * Lookup a file and return its dnode. 
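 * The walk starts at the filesystem root directory (the
 * ZFS_ROOT_OBJ entry of the master node), resolves one path
 * component at a time with ZAP lookups, keeps a stack of visited
 * object numbers (on_cache) so that ".." can pop back to the
 * parent, and follows at most 10 levels of symlinks.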
3623 */ 3624 static int 3625 zfs_lookup(const struct zfsmount *mount, const char *upath, dnode_phys_t *dnode) 3626 { 3627 int rc; 3628 uint64_t objnum; 3629 const spa_t *spa; 3630 dnode_phys_t dn; 3631 const char *p, *q; 3632 char element[256]; 3633 char path[1024]; 3634 int symlinks_followed = 0; 3635 struct stat sb; 3636 struct obj_list *entry, *tentry; 3637 STAILQ_HEAD(, obj_list) on_cache = STAILQ_HEAD_INITIALIZER(on_cache); 3638 3639 spa = mount->spa; 3640 if (mount->objset.os_type != DMU_OST_ZFS) { 3641 printf("ZFS: unexpected object set type %ju\n", 3642 (uintmax_t)mount->objset.os_type); 3643 return (EIO); 3644 } 3645 3646 if ((entry = malloc(sizeof(struct obj_list))) == NULL) 3647 return (ENOMEM); 3648 3649 /* 3650 * Get the root directory dnode. 3651 */ 3652 rc = objset_get_dnode(spa, &mount->objset, MASTER_NODE_OBJ, &dn); 3653 if (rc) { 3654 free(entry); 3655 return (rc); 3656 } 3657 3658 rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, sizeof(objnum), 1, &objnum); 3659 if (rc) { 3660 free(entry); 3661 return (rc); 3662 } 3663 entry->objnum = objnum; 3664 STAILQ_INSERT_HEAD(&on_cache, entry, entry); 3665 3666 rc = objset_get_dnode(spa, &mount->objset, objnum, &dn); 3667 if (rc != 0) 3668 goto done; 3669 3670 p = upath; 3671 while (p && *p) { 3672 rc = objset_get_dnode(spa, &mount->objset, objnum, &dn); 3673 if (rc != 0) 3674 goto done; 3675 3676 while (*p == '/') 3677 p++; 3678 if (*p == '\0') 3679 break; 3680 q = p; 3681 while (*q != '\0' && *q != '/') 3682 q++; 3683 3684 /* skip dot */ 3685 if (p + 1 == q && p[0] == '.') { 3686 p++; 3687 continue; 3688 } 3689 /* double dot */ 3690 if (p + 2 == q && p[0] == '.' && p[1] == '.') { 3691 p += 2; 3692 if (STAILQ_FIRST(&on_cache) == 3693 STAILQ_LAST(&on_cache, obj_list, entry)) { 3694 rc = ENOENT; 3695 goto done; 3696 } 3697 entry = STAILQ_FIRST(&on_cache); 3698 STAILQ_REMOVE_HEAD(&on_cache, entry); 3699 free(entry); 3700 objnum = (STAILQ_FIRST(&on_cache))->objnum; 3701 continue; 3702 } 3703 if (q - p + 1 > sizeof(element)) { 3704 rc = ENAMETOOLONG; 3705 goto done; 3706 } 3707 memcpy(element, p, q - p); 3708 element[q - p] = 0; 3709 p = q; 3710 3711 if ((rc = zfs_dnode_stat(spa, &dn, &sb)) != 0) 3712 goto done; 3713 if (!S_ISDIR(sb.st_mode)) { 3714 rc = ENOTDIR; 3715 goto done; 3716 } 3717 3718 rc = zap_lookup(spa, &dn, element, sizeof (objnum), 1, &objnum); 3719 if (rc) 3720 goto done; 3721 objnum = ZFS_DIRENT_OBJ(objnum); 3722 3723 if ((entry = malloc(sizeof(struct obj_list))) == NULL) { 3724 rc = ENOMEM; 3725 goto done; 3726 } 3727 entry->objnum = objnum; 3728 STAILQ_INSERT_HEAD(&on_cache, entry, entry); 3729 rc = objset_get_dnode(spa, &mount->objset, objnum, &dn); 3730 if (rc) 3731 goto done; 3732 3733 /* 3734 * Check for symlink. 3735 */ 3736 rc = zfs_dnode_stat(spa, &dn, &sb); 3737 if (rc) 3738 goto done; 3739 if (S_ISLNK(sb.st_mode)) { 3740 if (symlinks_followed > 10) { 3741 rc = EMLINK; 3742 goto done; 3743 } 3744 symlinks_followed++; 3745 3746 /* 3747 * Read the link value and copy the tail of our 3748 * current path onto the end. 3749 */ 3750 if (sb.st_size + strlen(p) + 1 > sizeof(path)) { 3751 rc = ENAMETOOLONG; 3752 goto done; 3753 } 3754 strcpy(&path[sb.st_size], p); 3755 3756 rc = zfs_dnode_readlink(spa, &dn, path, sb.st_size); 3757 if (rc != 0) 3758 goto done; 3759 3760 /* 3761 * Restart with the new path, starting either at 3762 * the root or at the parent depending on whether or 3763 * not the link is relative.
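 * For an absolute link, everything but the root entry is popped
 * from on_cache; for a relative link, only the link's own entry
 * is popped, so resolution resumes in the directory that
 * contained it.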
3764 */ 3765 p = path; 3766 if (*p == '/') { 3767 while (STAILQ_FIRST(&on_cache) != 3768 STAILQ_LAST(&on_cache, obj_list, entry)) { 3769 entry = STAILQ_FIRST(&on_cache); 3770 STAILQ_REMOVE_HEAD(&on_cache, entry); 3771 free(entry); 3772 } 3773 } else { 3774 entry = STAILQ_FIRST(&on_cache); 3775 STAILQ_REMOVE_HEAD(&on_cache, entry); 3776 free(entry); 3777 } 3778 objnum = (STAILQ_FIRST(&on_cache))->objnum; 3779 } 3780 } 3781 3782 *dnode = dn; 3783 done: 3784 STAILQ_FOREACH_SAFE(entry, &on_cache, entry, tentry) 3785 free(entry); 3786 return (rc); 3787 } 3788