// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2016 Nexenta Systems, Inc.
 * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC.
 * Copyright (c) 2015, 2017, Intel Corporation.
 * Copyright (c) 2020 Datto Inc.
 * Copyright (c) 2020, The FreeBSD Foundation [1]
 *
 * [1] Portions of this software were developed by Allan Jude
 *     under sponsorship from the FreeBSD Foundation.
 * Copyright (c) 2021 Allan Jude
 * Copyright (c) 2021 Toomas Soome <tsoome@me.com>
 * Copyright (c) 2023, 2024, Klara Inc.
 * Copyright (c) 2023, Rob Norris <robn@despairlabs.com>
 */

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <ctype.h>
#include <getopt.h>
#include <openssl/evp.h>
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/zap_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_sa.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_bookmark.h>
#include <sys/dbuf.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/stat.h>
#include <sys/resource.h>
#include <sys/dmu_send.h>
#include <sys/dmu_traverse.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/zfs_fuid.h>
#include <sys/arc.h>
#include <sys/arc_impl.h>
#include <sys/ddt.h>
#include <sys/ddt_impl.h>
#include <sys/zfeature.h>
#include <sys/abd.h>
#include <sys/blkptr.h>
#include <sys/dsl_crypt.h>
#include <sys/dsl_scan.h>
#include <sys/btree.h>
#include <sys/brt.h>
#include <sys/brt_impl.h>
#include <zfs_comutil.h>
#include <sys/zstd/zstd.h>
#include <sys/backtrace.h>

#include <libzpool.h>
#include <libnvpair.h>
#include <libzutil.h>
#include <libzfs_core.h>

#include <libzdb.h>

#include "zdb.h"

extern int reference_tracking_enable;
extern int zfs_recover;
extern uint_t zfs_vdev_async_read_max_active;
extern boolean_t spa_load_verify_dryrun;
extern boolean_t spa_mode_readable_spacemaps;
extern uint_t zfs_reconstruct_indirect_combinations_max;
extern uint_t zfs_btree_verify_intensity;

enum {
	ARG_ALLOCATED = 256,
	ARG_BLOCK_BIN_MODE,
	ARG_BLOCK_CLASSES,
};

static const char cmdname[] = "zdb";
uint8_t dump_opt[512];

typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);

static uint64_t *zopt_metaslab = NULL;
static unsigned zopt_metaslab_args = 0;

static zopt_object_range_t *zopt_object_ranges = NULL;
static unsigned zopt_object_args = 0;

static int flagbits[256];

static uint64_t max_inflight_bytes = 256 * 1024 * 1024; /* 256MB */
static int leaked_objects = 0;
static zfs_range_tree_t *mos_refd_objs;
static spa_t *spa;
static objset_t *os;
static boolean_t kernel_init_done;
static boolean_t corruption_found = B_FALSE;

static enum {
	BIN_AUTO = 0,
	BIN_PSIZE,
	BIN_LSIZE,
	BIN_ASIZE,
} block_bin_mode = BIN_AUTO;

static enum {
	CLASS_NORMAL = 1 << 1,
	CLASS_SPECIAL = 1 << 2,
	CLASS_DEDUP = 1 << 3,
	CLASS_OTHER = 1 << 4,
} block_classes = 0;

static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *,
    boolean_t);
static void mos_obj_refd(uint64_t);
static void mos_obj_refd_multiple(uint64_t);
static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free,
    dmu_tx_t *tx);

static void zdb_print_blkptr(const blkptr_t *bp, int flags);
static void zdb_exit(int reason);

typedef struct sublivelist_verify_block_refcnt {
	/* block pointer entry in livelist being verified */
	blkptr_t svbr_blk;

	/*
	 * Refcount gets incremented to 1 when we encounter the first
	 * FREE entry for the svbr block pointer and a node for it
	 * is created in our ZDB verification/tracking metadata.
	 *
	 * As we encounter more FREE entries we increment this counter
	 * and similarly decrement it whenever we find the respective
	 * ALLOC entries for this block.
	 *
	 * When the refcount gets to 0 it means that all the FREE and
	 * ALLOC entries of this block have paired up and we no longer
	 * need to track it in our verification logic (e.g. the node
	 * containing this struct in our verification data structure
	 * should be freed).
	 *
	 * [refer to sublivelist_verify_blkptr() for the actual code]
	 */
	uint32_t svbr_refcnt;
} sublivelist_verify_block_refcnt_t;
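/*
 * Worked example of the pairing logic described above and implemented
 * in sublivelist_verify_blkptr() below (illustrative only): seeing
 * FREE(bp), FREE(bp), ALLOC(bp), ALLOC(bp) drives svbr_refcnt through
 * 1 -> 2 -> 1 -> 0, at which point the node is removed from sv_pair.
 * An ALLOC with no prior FREE never enters sv_pair at all; its DVAs
 * are recorded in sv_leftover instead, for the later cross-check
 * against the spacemaps.
 */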
static int
sublivelist_block_refcnt_compare(const void *larg, const void *rarg)
{
	const sublivelist_verify_block_refcnt_t *l = larg;
	const sublivelist_verify_block_refcnt_t *r = rarg;
	return (livelist_compare(&l->svbr_blk, &r->svbr_blk));
}

static int
sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,
    dmu_tx_t *tx)
{
	ASSERT0P(tx);
	struct sublivelist_verify *sv = arg;
	sublivelist_verify_block_refcnt_t current = {
		.svbr_blk = *bp,

		/*
		 * Start with 1 in case this is the first free entry.
		 * This field is not used for our B-Tree comparisons
		 * anyway.
		 */
		.svbr_refcnt = 1,
	};

	zfs_btree_index_t where;
	sublivelist_verify_block_refcnt_t *pair =
	    zfs_btree_find(&sv->sv_pair, &current, &where);
	if (free) {
		if (pair == NULL) {
			/* first free entry for this block pointer */
			zfs_btree_add(&sv->sv_pair, &current);
		} else {
			pair->svbr_refcnt++;
		}
	} else {
		if (pair == NULL) {
			/* block that is currently marked as allocated */
			for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
				if (DVA_IS_EMPTY(&bp->blk_dva[i]))
					break;
				sublivelist_verify_block_t svb = {
				    .svb_dva = bp->blk_dva[i],
				    .svb_allocated_txg =
				    BP_GET_BIRTH(bp)
				};

				if (zfs_btree_find(&sv->sv_leftover, &svb,
				    &where) == NULL) {
					zfs_btree_add_idx(&sv->sv_leftover,
					    &svb, &where);
				}
			}
		} else {
			/* alloc matches a free entry */
			pair->svbr_refcnt--;
			if (pair->svbr_refcnt == 0) {
				/* all allocs and frees have been matched */
				zfs_btree_remove_idx(&sv->sv_pair, &where);
			}
		}
	}

	return (0);
}

static int
sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle)
{
	int err;
	struct sublivelist_verify *sv = args;

	zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare, NULL,
	    sizeof (sublivelist_verify_block_refcnt_t));

	err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr,
	    sv, NULL);

	sublivelist_verify_block_refcnt_t *e;
	zfs_btree_index_t *cookie = NULL;
	while ((e = zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) {
		char blkbuf[BP_SPRINTF_LEN];
		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf),
		    &e->svbr_blk, B_TRUE);
		(void) printf("\tERROR: %d unmatched FREE(s): %s\n",
		    e->svbr_refcnt, blkbuf);
		corruption_found = B_TRUE;
	}
	zfs_btree_destroy(&sv->sv_pair);

	return (err);
}

static int
livelist_block_compare(const void *larg, const void *rarg)
{
	const sublivelist_verify_block_t *l = larg;
	const sublivelist_verify_block_t *r = rarg;

	if (DVA_GET_VDEV(&l->svb_dva) < DVA_GET_VDEV(&r->svb_dva))
		return (-1);
	else if (DVA_GET_VDEV(&l->svb_dva) > DVA_GET_VDEV(&r->svb_dva))
		return (+1);

	if (DVA_GET_OFFSET(&l->svb_dva) < DVA_GET_OFFSET(&r->svb_dva))
		return (-1);
	else if (DVA_GET_OFFSET(&l->svb_dva) > DVA_GET_OFFSET(&r->svb_dva))
		return (+1);

	if (DVA_GET_ASIZE(&l->svb_dva) < DVA_GET_ASIZE(&r->svb_dva))
		return (-1);
	else if (DVA_GET_ASIZE(&l->svb_dva) > DVA_GET_ASIZE(&r->svb_dva))
		return (+1);

	return (0);
}

/*
 * Check for errors in a livelist while tracking all unfreed ALLOCs in the
 * sublivelist_verify_t: sv->sv_leftover
 */
static void
livelist_verify(dsl_deadlist_t *dl, void *arg)
{
	sublivelist_verify_t *sv = arg;
	dsl_deadlist_iterate(dl, sublivelist_verify_func, sv);
}

/*
 * Check for errors in the livelist entry and discard the intermediary
 * data structures
 */
static int
sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle)
{
	(void) args;
	sublivelist_verify_t sv;
	zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL,
	    sizeof (sublivelist_verify_block_t));
	int err = sublivelist_verify_func(&sv, dle);
	zfs_btree_clear(&sv.sv_leftover);
	zfs_btree_destroy(&sv.sv_leftover);
	return (err);
}

typedef struct metaslab_verify {
	/*
	 * Tree containing all the leftover ALLOCs from the livelists
	 * that are part of this metaslab.
	 */
	zfs_btree_t mv_livelist_allocs;

	/*
	 * Metaslab information.
	 */
	uint64_t mv_vdid;
	uint64_t mv_msid;
	uint64_t mv_start;
	uint64_t mv_end;

	/*
	 * What's currently allocated for this metaslab.
	 */
	zfs_range_tree_t *mv_allocated;
} metaslab_verify_t;

typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg);

typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme, uint64_t txg,
    void *arg);

typedef struct unflushed_iter_cb_arg {
	spa_t *uic_spa;
	uint64_t uic_txg;
	void *uic_arg;
	zdb_log_sm_cb_t uic_cb;
} unflushed_iter_cb_arg_t;

static int
iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg)
{
	unflushed_iter_cb_arg_t *uic = arg;
	return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg));
}

static void
iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg)
{
	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
		return;

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
	    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
		space_map_t *sm = NULL;
		VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
		    sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));

		unflushed_iter_cb_arg_t uic = {
			.uic_spa = spa,
			.uic_txg = sls->sls_txg,
			.uic_arg = arg,
			.uic_cb = cb
		};
		VERIFY0(space_map_iterate(sm, space_map_length(sm),
		    iterate_through_spacemap_logs_cb, &uic));
		space_map_close(sm);
	}
	spa_config_exit(spa, SCL_CONFIG, FTAG);
}
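/*
 * Usage sketch: spacemap_check_sm_log() below passes
 * spacemap_check_sm_log_cb() into iterate_through_spacemap_logs().
 * The unflushed_iter_cb_arg_t wrapper exists only to thread each
 * log's txg through space_map_iterate(), whose callback signature
 * has no txg parameter.
 */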
static void
verify_livelist_allocs(metaslab_verify_t *mv, uint64_t txg,
    uint64_t offset, uint64_t size)
{
	sublivelist_verify_block_t svb = {{{0}}};
	DVA_SET_VDEV(&svb.svb_dva, mv->mv_vdid);
	DVA_SET_OFFSET(&svb.svb_dva, offset);
	DVA_SET_ASIZE(&svb.svb_dva, 0);
	zfs_btree_index_t where;
	uint64_t end_offset = offset + size;

	/*
	 * Look for an exact match for spacemap entry in the livelist entries.
	 * Then, look for other livelist entries that fall within the range
	 * of the spacemap entry as it may have been condensed.
	 */
	sublivelist_verify_block_t *found =
	    zfs_btree_find(&mv->mv_livelist_allocs, &svb, &where);
	if (found == NULL) {
		found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where);
	}
	for (; found != NULL && DVA_GET_VDEV(&found->svb_dva) == mv->mv_vdid &&
	    DVA_GET_OFFSET(&found->svb_dva) < end_offset;
	    found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) {
		if (found->svb_allocated_txg <= txg) {
			(void) printf("ERROR: Livelist ALLOC [%llx:%llx] "
			    "from TXG %llx FREED at TXG %llx\n",
			    (u_longlong_t)DVA_GET_OFFSET(&found->svb_dva),
			    (u_longlong_t)DVA_GET_ASIZE(&found->svb_dva),
			    (u_longlong_t)found->svb_allocated_txg,
			    (u_longlong_t)txg);
			corruption_found = B_TRUE;
		}
	}
}

static int
metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg)
{
	metaslab_verify_t *mv = arg;
	uint64_t offset = sme->sme_offset;
	uint64_t size = sme->sme_run;
	uint64_t txg = sme->sme_txg;

	if (sme->sme_type == SM_ALLOC) {
		if (zfs_range_tree_contains(mv->mv_allocated,
		    offset, size)) {
			(void) printf("ERROR: DOUBLE ALLOC: "
			    "%llu [%llx:%llx] "
			    "%llu:%llu LOG_SM\n",
			    (u_longlong_t)txg, (u_longlong_t)offset,
			    (u_longlong_t)size, (u_longlong_t)mv->mv_vdid,
			    (u_longlong_t)mv->mv_msid);
			corruption_found = B_TRUE;
		} else {
			zfs_range_tree_add(mv->mv_allocated,
			    offset, size);
		}
	} else {
		if (!zfs_range_tree_contains(mv->mv_allocated,
		    offset, size)) {
			(void) printf("ERROR: DOUBLE FREE: "
			    "%llu [%llx:%llx] "
			    "%llu:%llu LOG_SM\n",
			    (u_longlong_t)txg, (u_longlong_t)offset,
			    (u_longlong_t)size, (u_longlong_t)mv->mv_vdid,
			    (u_longlong_t)mv->mv_msid);
			corruption_found = B_TRUE;
		} else {
			zfs_range_tree_remove(mv->mv_allocated,
			    offset, size);
		}
	}

	if (sme->sme_type != SM_ALLOC) {
		/*
		 * If something is freed in the spacemap, verify that
		 * it is not listed as allocated in the livelist.
		 */
		verify_livelist_allocs(mv, txg, offset, size);
	}
	return (0);
}

static int
spacemap_check_sm_log_cb(spa_t *spa, space_map_entry_t *sme,
    uint64_t txg, void *arg)
{
	metaslab_verify_t *mv = arg;
	uint64_t offset = sme->sme_offset;
	uint64_t vdev_id = sme->sme_vdev;

	vdev_t *vd = vdev_lookup_top(spa, vdev_id);

	/* skip indirect vdevs */
	if (!vdev_is_concrete(vd))
		return (0);

	if (vdev_id != mv->mv_vdid)
		return (0);

	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
	if (ms->ms_id != mv->mv_msid)
		return (0);

	if (txg < metaslab_unflushed_txg(ms))
		return (0);

	ASSERT3U(txg, ==, sme->sme_txg);
	return (metaslab_spacemap_validation_cb(sme, mv));
}

static void
spacemap_check_sm_log(spa_t *spa, metaslab_verify_t *mv)
{
	iterate_through_spacemap_logs(spa, spacemap_check_sm_log_cb, mv);
}

static void
spacemap_check_ms_sm(space_map_t *sm, metaslab_verify_t *mv)
{
	if (sm == NULL)
		return;

	VERIFY0(space_map_iterate(sm, space_map_length(sm),
	    metaslab_spacemap_validation_cb, mv));
}

static void iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg);

/*
 * Transfer blocks from sv_leftover tree to the mv_livelist_allocs if
 * they are part of that metaslab (mv_msid).
 */
static void
mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv)
{
	zfs_btree_index_t where;
	sublivelist_verify_block_t *svb;
	ASSERT3U(zfs_btree_numnodes(&mv->mv_livelist_allocs), ==, 0);
	for (svb = zfs_btree_first(&sv->sv_leftover, &where);
	    svb != NULL;
	    svb = zfs_btree_next(&sv->sv_leftover, &where, &where)) {
		if (DVA_GET_VDEV(&svb->svb_dva) != mv->mv_vdid)
			continue;

		if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start &&
		    (DVA_GET_OFFSET(&svb->svb_dva) +
		    DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_start) {
			(void) printf("ERROR: Found block that crosses "
			    "metaslab boundary: <%llu:%llx:%llx>\n",
			    (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva),
			    (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
			    (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva));
			corruption_found = B_TRUE;
			continue;
		}

		if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start)
			continue;

		if (DVA_GET_OFFSET(&svb->svb_dva) >= mv->mv_end)
			continue;

		if ((DVA_GET_OFFSET(&svb->svb_dva) +
		    DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_end) {
			(void) printf("ERROR: Found block that crosses "
			    "metaslab boundary: <%llu:%llx:%llx>\n",
			    (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva),
			    (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
			    (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva));
			corruption_found = B_TRUE;
			continue;
		}

		zfs_btree_add(&mv->mv_livelist_allocs, svb);
	}

	for (svb = zfs_btree_first(&mv->mv_livelist_allocs, &where);
	    svb != NULL;
	    svb = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) {
		zfs_btree_remove(&sv->sv_leftover, svb);
	}
}

/*
 * [Livelist Check]
 * Iterate through all the sublivelists and:
 * - report leftover frees (**)
 * - record leftover ALLOCs together with their TXG [see Cross Check]
 *
 * (**) Note: Double ALLOCs are valid in datasets that have dedup
 *      enabled.  Similarly double FREEs are allowed as well but
 *      only if they pair up with a corresponding ALLOC entry once
 *      we are done with our sublivelist iteration.
 *
 * [Spacemap Check]
 * for each metaslab:
 * - iterate over spacemap and then the metaslab's entries in the
 *   spacemap log, then report any double FREEs and ALLOCs (do not
 *   blow up).
 *
 * [Cross Check]
 * After finishing the Livelist Check phase and while being in the
 * Spacemap Check phase, we find all the recorded leftover ALLOCs
 * of the livelist check that are part of the metaslab that we are
 * currently looking at in the Spacemap Check.  We report any entries
 * that are marked as ALLOCs in the livelists but have been actually
 * freed (and potentially allocated again) after their TXG stamp in
 * the spacemaps.  Also report any ALLOCs from the livelists that
 * belong to indirect vdevs (e.g. their vdev completed removal).
 *
 * Note that this will miss Log Spacemap entries that cancelled each other
 * out before being flushed to the metaslab, so we are not guaranteed
 * to match all erroneous ALLOCs.
 */
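/*
 * Cross-check example (illustrative numbers): if a livelist still
 * records ALLOC <0:1000:200> with TXG 100, but the spacemap log shows
 * a FREE covering that range at TXG 105, verify_livelist_allocs()
 * reports it, since a block a livelist considers allocated must not
 * have been freed after its recorded birth TXG.
 */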
static void
livelist_metaslab_validate(spa_t *spa)
{
	(void) printf("Verifying deleted livelist entries\n");

	sublivelist_verify_t sv;
	zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL,
	    sizeof (sublivelist_verify_block_t));
	iterate_deleted_livelists(spa, livelist_verify, &sv);

	(void) printf("Verifying metaslab entries\n");
	vdev_t *rvd = spa->spa_root_vdev;
	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
		vdev_t *vd = rvd->vdev_child[c];

		if (!vdev_is_concrete(vd))
			continue;

		for (uint64_t mid = 0; mid < vd->vdev_ms_count; mid++) {
			metaslab_t *m = vd->vdev_ms[mid];

			(void) fprintf(stderr,
			    "\rverifying concrete vdev %llu, "
			    "metaslab %llu of %llu ...",
			    (longlong_t)vd->vdev_id,
			    (longlong_t)mid,
			    (longlong_t)vd->vdev_ms_count);

			uint64_t shift, start;
			zfs_range_seg_type_t type =
			    metaslab_calculate_range_tree_type(vd, m,
			    &start, &shift);
			metaslab_verify_t mv;
			mv.mv_allocated = zfs_range_tree_create_flags(
			    NULL, type, NULL, start, shift,
			    0, "livelist_metaslab_validate:mv_allocated");
			mv.mv_vdid = vd->vdev_id;
			mv.mv_msid = m->ms_id;
			mv.mv_start = m->ms_start;
			mv.mv_end = m->ms_start + m->ms_size;
			zfs_btree_create(&mv.mv_livelist_allocs,
			    livelist_block_compare, NULL,
			    sizeof (sublivelist_verify_block_t));

			mv_populate_livelist_allocs(&mv, &sv);

			spacemap_check_ms_sm(m->ms_sm, &mv);
			spacemap_check_sm_log(spa, &mv);

			zfs_range_tree_vacate(mv.mv_allocated, NULL, NULL);
			zfs_range_tree_destroy(mv.mv_allocated);
			zfs_btree_clear(&mv.mv_livelist_allocs);
			zfs_btree_destroy(&mv.mv_livelist_allocs);
		}
	}
	(void) fprintf(stderr, "\n");

	/*
	 * If there are any segments in the leftover tree after we walked
	 * through all the metaslabs in the concrete vdevs then this means
	 * that we have segments in the livelists that belong to indirect
	 * vdevs and are marked as allocated.
	 */
	if (zfs_btree_numnodes(&sv.sv_leftover) == 0) {
		zfs_btree_destroy(&sv.sv_leftover);
		return;
	}
	(void) printf("ERROR: Found livelist blocks marked as allocated "
	    "for indirect vdevs:\n");
	corruption_found = B_TRUE;

	zfs_btree_index_t *where = NULL;
	sublivelist_verify_block_t *svb;
	while ((svb = zfs_btree_destroy_nodes(&sv.sv_leftover, &where)) !=
	    NULL) {
		int vdev_id = DVA_GET_VDEV(&svb->svb_dva);
		ASSERT3U(vdev_id, <, rvd->vdev_children);
		vdev_t *vd = rvd->vdev_child[vdev_id];
		ASSERT(!vdev_is_concrete(vd));
		(void) printf("<%d:%llx:%llx> TXG %llx\n",
		    vdev_id, (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
		    (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva),
		    (u_longlong_t)svb->svb_allocated_txg);
	}
	(void) printf("\n");
	zfs_btree_destroy(&sv.sv_leftover);
}

/*
 * These libumem hooks provide a reasonable set of defaults for the allocator's
 * debugging facilities.
 */
const char *
_umem_debug_init(void)
{
	return ("default,verbose"); /* $UMEM_DEBUG setting */
}

const char *
_umem_logging_init(void)
{
	return ("fail,contents"); /* $UMEM_LOGGING setting */
}

static void
usage(void)
{
	(void) fprintf(stderr,
	    "Usage:\t%s [-AbcdDFGhikLMPsvXy] [-e [-V] [-p <path> ...]] "
	    "[-I <inflight I/Os>]\n"
	    "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
	    "\t\t[-K <key>]\n"
	    "\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]]\n"
	    "\t%s [-AdiPv] [-e [-V] [-p <path> ...]] [-U <cache>] [-K <key>]\n"
	    "\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]]\n"
	    "\t%s -B [-e [-V] [-p <path> ...]] [-I <inflight I/Os>]\n"
	    "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
	    "\t\t[-K <key>] <poolname>/<objset id> [<backupflags>]\n"
	    "\t%s [-v] <bookmark>\n"
	    "\t%s -C [-A] [-U <cache>] [<poolname>]\n"
	    "\t%s -l [-Aqu] <device>\n"
	    "\t%s -m [-AFLPX] [-e [-V] [-p <path> ...]] [-t <txg>] "
	    "[-U <cache>]\n\t\t<poolname> [<vdev> [<metaslab> ...]]\n"
	    "\t%s -O [-K <key>] <dataset> <path>\n"
	    "\t%s -r [-K <key>] <dataset> <path> <destination>\n"
	    "\t%s -R [-A] [-e [-V] [-p <path> ...]] [-U <cache>]\n"
	    "\t\t<poolname> <vdev>:<offset>:<size>[:<flags>]\n"
	    "\t%s -E [-A] word0:word1:...:word15\n"
	    "\t%s -S [-AP] [-e [-V] [-p <path> ...]] [-U <cache>] "
	    "<poolname>\n\n",
	    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname,
	    cmdname, cmdname, cmdname, cmdname, cmdname);

	(void) fprintf(stderr, "    Dataset name must include at least one "
	    "separator character '/' or '@'\n");
	(void) fprintf(stderr, "    If dataset name is specified, only that "
	    "dataset is dumped\n");
	(void) fprintf(stderr, "    If object numbers or object number "
	    "ranges are specified, only those\n"
	    "    objects or ranges are dumped.\n\n");
	(void) fprintf(stderr,
	    "    Object ranges take the form <start>:<end>[:<flags>]\n"
	    "        start    Starting object number\n"
	    "        end      Ending object number, or -1 for no upper bound\n"
	    "        flags    Optional flags to select object types:\n"
	    "            A     All objects (this is the default)\n"
	    "            d     ZFS directories\n"
	    "            f     ZFS files\n"
	    "            m     SPA space maps\n"
	    "            z     ZAPs\n"
	    "            -     Negate effect of next flag\n\n");
	(void) fprintf(stderr, "    Options to control amount of output:\n");
	(void) fprintf(stderr, "        -b --block-stats             "
	    "block statistics\n");
	(void) fprintf(stderr, "        --bin=(lsize|psize|asize)    "
	    "bin blocks based on this size in all three columns\n");
	(void) fprintf(stderr,
	    "        --class=(normal|special|dedup|other)[,...]\n"
	    "                                     only consider blocks from "
	    "these allocation classes\n");
	(void) fprintf(stderr, "        -B --backup                  "
	    "backup stream\n");
	(void) fprintf(stderr, "        -c --checksum                "
	    "checksum all metadata (twice for all data) blocks\n");
	(void) fprintf(stderr, "        -C --config                  "
	    "config (or cachefile if alone)\n");
	(void) fprintf(stderr, "        -d --datasets                "
	    "dataset(s)\n");
	(void) fprintf(stderr, "        -D --dedup-stats             "
	    "dedup statistics\n");
	(void) fprintf(stderr, "        -E --embedded-block-pointer=INTEGER\n"
	    "                                     decode and display block "
	    "from an embedded block pointer\n");
	(void) fprintf(stderr, "        -h --history                 "
	    "pool history\n");
	(void) fprintf(stderr, "        -i --intent-logs             "
	    "intent logs\n");
	(void) fprintf(stderr, "        -l --label                   "
	    "read label contents\n");
	(void) fprintf(stderr, "        -k --checkpointed-state      "
	    "examine the checkpointed state of the pool\n");
	(void) fprintf(stderr, "        -L --disable-leak-tracking   "
	    "disable leak tracking (do not load spacemaps)\n");
	(void) fprintf(stderr, "        -m --metaslabs               "
	    "metaslabs\n");
	(void) fprintf(stderr, "        -M --metaslab-groups         "
	    "metaslab groups\n");
	(void) fprintf(stderr, "        -O --object-lookups          "
	    "perform object lookups by path\n");
	(void) fprintf(stderr, "        -r --copy-object             "
	    "copy an object by path to file\n");
	(void) fprintf(stderr, "        -R --read-block              "
	    "read and display block from a device\n");
	(void) fprintf(stderr, "        -s --io-stats                "
	    "report stats on zdb's I/O\n");
	(void) fprintf(stderr, "        -S --simulate-dedup          "
	    "simulate dedup to measure effect\n");
	(void) fprintf(stderr, "        -v --verbose                 "
	    "verbose (applies to all others)\n");
	(void) fprintf(stderr, "        -y --livelist                "
	    "perform livelist and metaslab validation on any livelists being "
	    "deleted\n\n");
	(void) fprintf(stderr, "    Below options are intended for use "
	    "with other options:\n");
	(void) fprintf(stderr, "        -A --ignore-assertions       "
	    "ignore assertions (-A), enable panic recovery (-AA) or both "
	    "(-AAA)\n");
	(void) fprintf(stderr, "        -e --exported                "
	    "pool is exported/destroyed/has altroot/not in a cachefile\n");
	(void) fprintf(stderr, "        -F --automatic-rewind        "
	    "attempt automatic rewind within safe range of transaction "
	    "groups\n");
	(void) fprintf(stderr, "        -G --dump-debug-msg          "
	    "dump zfs_dbgmsg buffer before exiting\n");
	(void) fprintf(stderr, "        -I --inflight=INTEGER        "
	    "specify the maximum number of checksumming I/Os "
	    "[default is 200]\n");
	(void) fprintf(stderr, "        -K --key=KEY                 "
	    "decryption key for encrypted dataset\n");
	(void) fprintf(stderr, "        -o --option=\"NAME=VALUE\"     "
	    "set the named tunable to the given value\n");
	(void) fprintf(stderr, "        -p --path=PATH               "
	    "use one or more with -e to specify path to vdev dir\n");
	(void) fprintf(stderr, "        -P --parseable               "
	    "print numbers in parseable form\n");
	(void) fprintf(stderr, "        -q --skip-label              "
	    "don't print label contents\n");
	(void) fprintf(stderr, "        -t --txg=INTEGER             "
	    "highest txg to use when searching for uberblocks\n");
	(void) fprintf(stderr, "        -T --brt-stats               "
	    "BRT statistics\n");
	(void) fprintf(stderr, "        -u --uberblock               "
	    "uberblock\n");
	(void) fprintf(stderr, "        -U --cachefile=PATH          "
	    "use alternate cachefile\n");
	(void) fprintf(stderr, "        -V --verbatim                "
	    "do verbatim import\n");
	(void) fprintf(stderr, "        -x --dump-blocks=PATH        "
	    "dump all read blocks into specified directory\n");
	(void) fprintf(stderr, "        -X --extreme-rewind          "
	    "attempt extreme rewind (does not work with dataset)\n");
	(void) fprintf(stderr, "        -Y --all-reconstruction      "
	    "attempt all reconstruction combinations for split blocks\n");
	(void) fprintf(stderr, "        -Z --zstd-headers            "
	    "show ZSTD headers\n");
	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
	    "to make only that option verbose\n");
	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
	zdb_exit(2);
}
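/*
 * Example invocations (illustrative only; pool, dataset and device
 * names are hypothetical, assembled from the usage text above):
 *
 *	zdb -bb mypool			verbose block statistics
 *	zdb -dddd mypool/fs 0:-1:f	dump only ZFS file objects
 *	zdb -l /dev/sda1		read label contents of one device
 *	zdb -R mypool 0:1000:200	read and display a single block
 */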
static void
dump_debug_buffer(void)
{
	ssize_t ret __attribute__((unused));

	if (!dump_opt['G'])
		return;
	/*
	 * We use write() instead of printf() so that this function
	 * is safe to call from a signal handler.
	 */
	ret = write(STDERR_FILENO, "\n", 1);
	zfs_dbgmsg_print(STDERR_FILENO, "zdb");
}

static void
sig_handler(int signo)
{
	struct sigaction action;

	libspl_backtrace(STDERR_FILENO);
	dump_debug_buffer();

	/*
	 * Restore default action and re-raise signal so SIGSEGV and
	 * SIGABRT can trigger a core dump.
	 */
	action.sa_handler = SIG_DFL;
	sigemptyset(&action.sa_mask);
	action.sa_flags = 0;
	(void) sigaction(signo, &action, NULL);
	raise(signo);
}

/*
 * Called for usage errors that are discovered after a call to spa_open(),
 * dmu_bonus_hold(), or pool_match().  abort() is called for other errors.
 */

static void
fatal(const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	(void) fprintf(stderr, "%s: ", cmdname);
	(void) vfprintf(stderr, fmt, ap);
	va_end(ap);
	(void) fprintf(stderr, "\n");

	dump_debug_buffer();

	zdb_exit(1);
}

static void
dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) size;
	nvlist_t *nv;
	size_t nvsize = *(uint64_t *)data;
	char *packed = umem_alloc(nvsize, UMEM_NOFAIL);

	VERIFY0(dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH));

	VERIFY0(nvlist_unpack(packed, nvsize, &nv, 0));

	umem_free(packed, nvsize);

	dump_nvlist(nv, 8);

	nvlist_free(nv);
}

static void
dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) os, (void) object, (void) size;
	spa_history_phys_t *shp = data;

	if (shp == NULL)
		return;

	(void) printf("\t\tpool_create_len = %llu\n",
	    (u_longlong_t)shp->sh_pool_create_len);
	(void) printf("\t\tphys_max_off = %llu\n",
	    (u_longlong_t)shp->sh_phys_max_off);
	(void) printf("\t\tbof = %llu\n",
	    (u_longlong_t)shp->sh_bof);
	(void) printf("\t\teof = %llu\n",
	    (u_longlong_t)shp->sh_eof);
	(void) printf("\t\trecords_lost = %llu\n",
	    (u_longlong_t)shp->sh_records_lost);
}

static void
zdb_nicenum(uint64_t num, char *buf, size_t buflen)
{
	if (dump_opt['P'])
		(void) snprintf(buf, buflen, "%llu", (longlong_t)num);
	else
		nicenum(num, buf, buflen);
}

static void
zdb_nicebytes(uint64_t bytes, char *buf, size_t buflen)
{
	if (dump_opt['P'])
		(void) snprintf(buf, buflen, "%llu", (longlong_t)bytes);
	else
		zfs_nicebytes(bytes, buf, buflen);
}

static const char histo_stars[] = "****************************************";
static const uint64_t histo_width = sizeof (histo_stars) - 1;

static void
dump_histogram(const uint64_t *histo, int size, int offset)
{
	int i;
	int minidx = size - 1;
	int maxidx = 0;
	uint64_t max = 0;

	for (i = 0; i < size; i++) {
		if (histo[i] == 0)
			continue;
		if (histo[i] > max)
			max = histo[i];
		if (i > maxidx)
			maxidx = i;
		if (i < minidx)
			minidx = i;
	}

	if (max < histo_width)
		max = histo_width;

	for (i = minidx; i <= maxidx; i++) {
		(void) printf("\t\t\t%3u: %6llu %s\n",
		    i + offset, (u_longlong_t)histo[i],
		    &histo_stars[(max - histo[i]) * histo_width / max]);
	}
}
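/*
 * The star scaling above is proportional to the largest bucket: with
 * max = 100 and histo[i] = 25, the indexing skips
 * (100 - 25) * 40 / 100 = 30 of histo_stars' 40 characters and prints
 * the remaining 10 stars.
 */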
static void
dump_zap_stats(objset_t *os, uint64_t object)
{
	int error;
	zap_stats_t zs;

	error = zap_get_stats(os, object, &zs);
	if (error)
		return;

	if (zs.zs_ptrtbl_len == 0) {
		ASSERT(zs.zs_num_blocks == 1);
		(void) printf("\tmicrozap: %llu bytes, %llu entries\n",
		    (u_longlong_t)zs.zs_blocksize,
		    (u_longlong_t)zs.zs_num_entries);
		return;
	}

	(void) printf("\tFat ZAP stats:\n");

	(void) printf("\t\tPointer table:\n");
	(void) printf("\t\t\t%llu elements\n",
	    (u_longlong_t)zs.zs_ptrtbl_len);
	(void) printf("\t\t\tzt_blk: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_zt_blk);
	(void) printf("\t\t\tzt_numblks: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_zt_numblks);
	(void) printf("\t\t\tzt_shift: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_zt_shift);
	(void) printf("\t\t\tzt_blks_copied: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_blks_copied);
	(void) printf("\t\t\tzt_nextblk: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_nextblk);

	(void) printf("\t\tZAP entries: %llu\n",
	    (u_longlong_t)zs.zs_num_entries);
	(void) printf("\t\tLeaf blocks: %llu\n",
	    (u_longlong_t)zs.zs_num_leafs);
	(void) printf("\t\tTotal blocks: %llu\n",
	    (u_longlong_t)zs.zs_num_blocks);
	(void) printf("\t\tzap_block_type: 0x%llx\n",
	    (u_longlong_t)zs.zs_block_type);
	(void) printf("\t\tzap_magic: 0x%llx\n",
	    (u_longlong_t)zs.zs_magic);
	(void) printf("\t\tzap_salt: 0x%llx\n",
	    (u_longlong_t)zs.zs_salt);

	(void) printf("\t\tLeafs with 2^n pointers:\n");
	dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0);

	(void) printf("\t\tBlocks with n*5 entries:\n");
	dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0);

	(void) printf("\t\tBlocks n/10 full:\n");
	dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0);

	(void) printf("\t\tEntries with n chunks:\n");
	dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0);

	(void) printf("\t\tBuckets with n entries:\n");
	dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0);
}

static void
dump_none(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) os, (void) object, (void) data, (void) size;
}

static void
dump_unknown(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) os, (void) object, (void) data, (void) size;
	(void) printf("\tUNKNOWN OBJECT TYPE\n");
}

static void
dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) os, (void) object, (void) data, (void) size;
}
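/*
 * The dump_* functions above and below share the object_viewer_t
 * signature declared near the top of this file; zdb presumably
 * dispatches them through a table indexed by DMU object type (that
 * table is outside this excerpt).  dump_none() and dump_uint8()
 * deliberately print nothing.
 */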
static void
dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)
{
	uint64_t *arr;
	uint64_t oursize;
	if (dump_opt['d'] < 6)
		return;

	if (data == NULL) {
		dmu_object_info_t doi;

		VERIFY0(dmu_object_info(os, object, &doi));
		size = doi.doi_max_offset;
		/*
		 * We cap the size at 1 mebibyte here to prevent
		 * allocation failures and nigh-infinite printing if the
		 * object is extremely large.
		 */
		oursize = MIN(size, 1 << 20);
		arr = kmem_alloc(oursize, KM_SLEEP);

		int err = dmu_read(os, object, 0, oursize, arr, 0);
		if (err != 0) {
			(void) printf("got error %u from dmu_read\n", err);
			kmem_free(arr, oursize);
			return;
		}
	} else {
		/*
		 * Even though the allocation is already done in this code path,
		 * we still cap the size to prevent excessive printing.
		 */
		oursize = MIN(size, 1 << 20);
		arr = data;
	}

	if (size == 0) {
		if (data == NULL)
			kmem_free(arr, oursize);
		(void) printf("\t\t[]\n");
		return;
	}

	(void) printf("\t\t[%0llx", (u_longlong_t)arr[0]);
	for (size_t i = 1; i * sizeof (uint64_t) < oursize; i++) {
		if (i % 4 != 0)
			(void) printf(", %0llx", (u_longlong_t)arr[i]);
		else
			(void) printf(",\n\t\t%0llx", (u_longlong_t)arr[i]);
	}
	if (oursize != size)
		(void) printf(", ... ");
	(void) printf("]\n");

	if (data == NULL)
		kmem_free(arr, oursize);
}

static void
dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	zap_cursor_t zc;
	zap_attribute_t *attrp = zap_attribute_long_alloc();
	void *prop;
	unsigned i;

	dump_zap_stats(os, object);
	(void) printf("\n");

	for (zap_cursor_init(&zc, os, object);
	    zap_cursor_retrieve(&zc, attrp) == 0;
	    zap_cursor_advance(&zc)) {
		boolean_t key64 =
		    !!(zap_getflags(zc.zc_zap) & ZAP_FLAG_UINT64_KEY);

		if (key64)
			(void) printf("\t\t0x%010" PRIu64 "x = ",
			    *(uint64_t *)attrp->za_name);
		else
			(void) printf("\t\t%s = ", attrp->za_name);

		if (attrp->za_num_integers == 0) {
			(void) printf("\n");
			continue;
		}
		prop = umem_zalloc(attrp->za_num_integers *
		    attrp->za_integer_length, UMEM_NOFAIL);

		if (key64)
			(void) zap_lookup_uint64(os, object,
			    (const uint64_t *)attrp->za_name, 1,
			    attrp->za_integer_length, attrp->za_num_integers,
			    prop);
		else
			(void) zap_lookup(os, object, attrp->za_name,
			    attrp->za_integer_length, attrp->za_num_integers,
			    prop);

		if (attrp->za_integer_length == 1 && !key64) {
			if (strcmp(attrp->za_name,
			    DSL_CRYPTO_KEY_MASTER_KEY) == 0 ||
			    strcmp(attrp->za_name,
			    DSL_CRYPTO_KEY_HMAC_KEY) == 0 ||
			    strcmp(attrp->za_name, DSL_CRYPTO_KEY_IV) == 0 ||
			    strcmp(attrp->za_name, DSL_CRYPTO_KEY_MAC) == 0 ||
			    strcmp(attrp->za_name,
			    DMU_POOL_CHECKSUM_SALT) == 0) {
				uint8_t *u8 = prop;

				for (i = 0; i < attrp->za_num_integers; i++) {
					(void) printf("%02x", u8[i]);
				}
			} else {
				(void) printf("%s", (char *)prop);
			}
		} else {
			for (i = 0; i < attrp->za_num_integers; i++) {
				switch (attrp->za_integer_length) {
				case 1:
					(void) printf("%u ",
					    ((uint8_t *)prop)[i]);
					break;
				case 2:
					(void) printf("%u ",
					    ((uint16_t *)prop)[i]);
					break;
				case 4:
					(void) printf("%u ",
					    ((uint32_t *)prop)[i]);
					break;
				case 8:
					(void) printf("%lld ",
					    (u_longlong_t)((int64_t *)prop)[i]);
					break;
				}
			}
		}
		(void) printf("\n");
		umem_free(prop,
		    attrp->za_num_integers * attrp->za_integer_length);
	}
	zap_cursor_fini(&zc);
	zap_attribute_free(attrp);
}
static void
dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
{
	bpobj_phys_t *bpop = data;
	uint64_t i;
	char bytes[32], comp[32], uncomp[32];

	/* make sure the output won't get truncated */
	_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");
	_Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated");
	_Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated");

	if (bpop == NULL)
		return;

	zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes));
	zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp));
	zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp));

	(void) printf("\t\tnum_blkptrs = %llu\n",
	    (u_longlong_t)bpop->bpo_num_blkptrs);
	(void) printf("\t\tbytes = %s\n", bytes);
	if (size >= BPOBJ_SIZE_V1) {
		(void) printf("\t\tcomp = %s\n", comp);
		(void) printf("\t\tuncomp = %s\n", uncomp);
	}
	if (size >= BPOBJ_SIZE_V2) {
		(void) printf("\t\tsubobjs = %llu\n",
		    (u_longlong_t)bpop->bpo_subobjs);
		(void) printf("\t\tnum_subobjs = %llu\n",
		    (u_longlong_t)bpop->bpo_num_subobjs);
	}
	if (size >= sizeof (*bpop)) {
		(void) printf("\t\tnum_freed = %llu\n",
		    (u_longlong_t)bpop->bpo_num_freed);
	}

	if (dump_opt['d'] < 5)
		return;

	for (i = 0; i < bpop->bpo_num_blkptrs; i++) {
		char blkbuf[BP_SPRINTF_LEN];
		blkptr_t bp;

		int err = dmu_read(os, object,
		    i * sizeof (bp), sizeof (bp), &bp, 0);
		if (err != 0) {
			(void) printf("got error %u from dmu_read\n", err);
			break;
		}
		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp,
		    BP_GET_FREE(&bp));
		(void) printf("\t%s\n", blkbuf);
	}
}

static void
dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	dmu_object_info_t doi;
	int64_t i;

	VERIFY0(dmu_object_info(os, object, &doi));
	uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP);

	int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0);
	if (err != 0) {
		(void) printf("got error %u from dmu_read\n", err);
		kmem_free(subobjs, doi.doi_max_offset);
		return;
	}

	int64_t last_nonzero = -1;
	for (i = 0; i < doi.doi_max_offset / 8; i++) {
		if (subobjs[i] != 0)
			last_nonzero = i;
	}

	for (i = 0; i <= last_nonzero; i++) {
		(void) printf("\t%llu\n", (u_longlong_t)subobjs[i]);
	}
	kmem_free(subobjs, doi.doi_max_offset);
}

static void
dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	dump_zap_stats(os, object);
	/* contents are printed elsewhere, properly decoded */
}

static void
dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	zap_cursor_t zc;
	zap_attribute_t *attrp = zap_attribute_alloc();

	dump_zap_stats(os, object);
	(void) printf("\n");

	for (zap_cursor_init(&zc, os, object);
	    zap_cursor_retrieve(&zc, attrp) == 0;
	    zap_cursor_advance(&zc)) {
		(void) printf("\t\t%s = ", attrp->za_name);
		if (attrp->za_num_integers == 0) {
			(void) printf("\n");
			continue;
		}
		(void) printf(" %llx : [%d:%d:%d]\n",
		    (u_longlong_t)attrp->za_first_integer,
		    (int)ATTR_LENGTH(attrp->za_first_integer),
		    (int)ATTR_BSWAP(attrp->za_first_integer),
		    (int)ATTR_NUM(attrp->za_first_integer));
	}
	zap_cursor_fini(&zc);
	zap_attribute_free(attrp);
}

static void
dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	zap_cursor_t zc;
	zap_attribute_t *attrp = zap_attribute_alloc();
	uint16_t *layout_attrs;
	unsigned i;

	dump_zap_stats(os, object);
	(void) printf("\n");

	for (zap_cursor_init(&zc, os, object);
	    zap_cursor_retrieve(&zc, attrp) == 0;
	    zap_cursor_advance(&zc)) {
		(void) printf("\t\t%s = [", attrp->za_name);
		if (attrp->za_num_integers == 0) {
			(void) printf("\n");
			continue;
		}

		VERIFY(attrp->za_integer_length == 2);
		layout_attrs = umem_zalloc(attrp->za_num_integers *
		    attrp->za_integer_length, UMEM_NOFAIL);

		VERIFY(zap_lookup(os, object, attrp->za_name,
		    attrp->za_integer_length,
		    attrp->za_num_integers, layout_attrs) == 0);

		for (i = 0; i != attrp->za_num_integers; i++)
			(void) printf(" %d ", (int)layout_attrs[i]);
		(void) printf("]\n");
		umem_free(layout_attrs,
		    attrp->za_num_integers * attrp->za_integer_length);
	}
	zap_cursor_fini(&zc);
	zap_attribute_free(attrp);
}

static void
dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	zap_cursor_t zc;
	zap_attribute_t *attrp = zap_attribute_long_alloc();
	const char *typenames[] = {
		/* 0 */ "not specified",
		/* 1 */ "FIFO",
		/* 2 */ "Character Device",
		/* 3 */ "3 (invalid)",
		/* 4 */ "Directory",
		/* 5 */ "5 (invalid)",
		/* 6 */ "Block Device",
		/* 7 */ "7 (invalid)",
		/* 8 */ "Regular File",
		/* 9 */ "9 (invalid)",
		/* 10 */ "Symbolic Link",
		/* 11 */ "11 (invalid)",
		/* 12 */ "Socket",
		/* 13 */ "Door",
		/* 14 */ "Event Port",
		/* 15 */ "15 (invalid)",
	};

	dump_zap_stats(os, object);
	(void) printf("\n");

	for (zap_cursor_init(&zc, os, object);
	    zap_cursor_retrieve(&zc, attrp) == 0;
	    zap_cursor_advance(&zc)) {
		(void) printf("\t\t%s = %lld (type: %s)\n",
		    attrp->za_name, ZFS_DIRENT_OBJ(attrp->za_first_integer),
		    typenames[ZFS_DIRENT_TYPE(attrp->za_first_integer)]);
	}
	zap_cursor_fini(&zc);
	zap_attribute_free(attrp);
}

static int
get_dtl_refcount(vdev_t *vd)
{
	int refcount = 0;

	if (vd->vdev_ops->vdev_op_leaf) {
		space_map_t *sm = vd->vdev_dtl_sm;

		if (sm != NULL &&
		    sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
			return (1);
		return (0);
	}

	for (unsigned c = 0; c < vd->vdev_children; c++)
		refcount += get_dtl_refcount(vd->vdev_child[c]);
	return (refcount);
}

static int
get_metaslab_refcount(vdev_t *vd)
{
	int refcount = 0;

	if (vd->vdev_top == vd) {
		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
			space_map_t *sm = vd->vdev_ms[m]->ms_sm;

			if (sm != NULL &&
			    sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
				refcount++;
		}
	}
	for (unsigned c = 0; c < vd->vdev_children; c++)
		refcount += get_metaslab_refcount(vd->vdev_child[c]);

	return (refcount);
}

static int
get_obsolete_refcount(vdev_t *vd)
{
	uint64_t obsolete_sm_object;
	int refcount = 0;

	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
	if (vd->vdev_top == vd && obsolete_sm_object != 0) {
		dmu_object_info_t doi;
		VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset,
		    obsolete_sm_object, &doi));
		if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
			refcount++;
		}
	} else {
		ASSERT0P(vd->vdev_obsolete_sm);
		ASSERT0(obsolete_sm_object);
	}
	for (unsigned c = 0; c < vd->vdev_children; c++) {
		refcount += get_obsolete_refcount(vd->vdev_child[c]);
	}

	return (refcount);
}
static int
get_prev_obsolete_spacemap_refcount(spa_t *spa)
{
	uint64_t prev_obj =
	    spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object;
	if (prev_obj != 0) {
		dmu_object_info_t doi;
		VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_obj, &doi));
		if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
			return (1);
		}
	}
	return (0);
}

static int
get_checkpoint_refcount(vdev_t *vd)
{
	int refcount = 0;

	if (vd->vdev_top == vd && vd->vdev_top_zap != 0 &&
	    zap_contains(spa_meta_objset(vd->vdev_spa),
	    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0)
		refcount++;

	for (uint64_t c = 0; c < vd->vdev_children; c++)
		refcount += get_checkpoint_refcount(vd->vdev_child[c]);

	return (refcount);
}

static int
get_log_spacemap_refcount(spa_t *spa)
{
	return (avl_numnodes(&spa->spa_sm_logs_by_txg));
}

static int
verify_spacemap_refcounts(spa_t *spa)
{
	uint64_t expected_refcount = 0;
	uint64_t actual_refcount;

	(void) feature_get_refcount(spa,
	    &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM],
	    &expected_refcount);
	actual_refcount = get_dtl_refcount(spa->spa_root_vdev);
	actual_refcount += get_metaslab_refcount(spa->spa_root_vdev);
	actual_refcount += get_obsolete_refcount(spa->spa_root_vdev);
	actual_refcount += get_prev_obsolete_spacemap_refcount(spa);
	actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev);
	actual_refcount += get_log_spacemap_refcount(spa);

	if (expected_refcount != actual_refcount) {
		(void) printf("space map refcount mismatch: expected %lld != "
		    "actual %lld\n",
		    (longlong_t)expected_refcount,
		    (longlong_t)actual_refcount);
		return (2);
	}
	return (0);
}
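/*
 * Illustrative accounting for the check above: each space map whose
 * on-disk size matches the current space_map_phys_t (i.e. has room
 * for the histogram) contributes one reference, so we expect
 *
 *	feature_refcount(SPACEMAP_HISTOGRAM) ==
 *	    DTL spacemaps + metaslab spacemaps + obsolete spacemaps +
 *	    prev-condense spacemap + checkpoint spacemaps + log spacemaps
 *
 * On mismatch the function prints both counts and returns 2 rather
 * than asserting.
 */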
static void
dump_spacemap(objset_t *os, space_map_t *sm)
{
	const char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
	    "INVALID", "INVALID", "INVALID", "INVALID" };

	if (sm == NULL)
		return;

	(void) printf("space map object %llu:\n",
	    (longlong_t)sm->sm_object);
	(void) printf(" smp_length = 0x%llx\n",
	    (longlong_t)sm->sm_phys->smp_length);
	(void) printf(" smp_alloc = 0x%llx\n",
	    (longlong_t)sm->sm_phys->smp_alloc);

	if (dump_opt['d'] < 6 && dump_opt['m'] < 4)
		return;

	/*
	 * Print out the freelist entries in both encoded and decoded form.
	 */
	uint8_t mapshift = sm->sm_shift;
	int64_t alloc = 0;
	uint64_t word, entry_id = 0;
	for (uint64_t offset = 0; offset < space_map_length(sm);
	    offset += sizeof (word)) {

		VERIFY0(dmu_read(os, space_map_object(sm), offset,
		    sizeof (word), &word, DMU_READ_PREFETCH));

		if (sm_entry_is_debug(word)) {
			uint64_t de_txg = SM_DEBUG_TXG_DECODE(word);
			uint64_t de_sync_pass = SM_DEBUG_SYNCPASS_DECODE(word);
			if (de_txg == 0) {
				(void) printf(
				    "\t [%6llu] PADDING\n",
				    (u_longlong_t)entry_id);
			} else {
				(void) printf(
				    "\t [%6llu] %s: txg %llu pass %llu\n",
				    (u_longlong_t)entry_id,
				    ddata[SM_DEBUG_ACTION_DECODE(word)],
				    (u_longlong_t)de_txg,
				    (u_longlong_t)de_sync_pass);
			}
			entry_id++;
			continue;
		}

		char entry_type;
		uint64_t entry_off, entry_run, entry_vdev;

		if (sm_entry_is_single_word(word)) {
			entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ?
			    'A' : 'F';
			entry_off = (SM_OFFSET_DECODE(word) << mapshift) +
			    sm->sm_start;
			entry_run = SM_RUN_DECODE(word) << mapshift;

			(void) printf("\t [%6llu] %c "
			    "range: %012llx-%012llx size: %08llx\n",
			    (u_longlong_t)entry_id, entry_type,
			    (u_longlong_t)entry_off,
			    (u_longlong_t)(entry_off + entry_run - 1),
			    (u_longlong_t)entry_run);
		} else {
			/* it is a two-word entry so we read another word */
			ASSERT(sm_entry_is_double_word(word));

			uint64_t extra_word;
			offset += sizeof (extra_word);
			ASSERT3U(offset, <, space_map_length(sm));
			VERIFY0(dmu_read(os, space_map_object(sm), offset,
			    sizeof (extra_word), &extra_word,
			    DMU_READ_PREFETCH));

			entry_run = SM2_RUN_DECODE(word) << mapshift;
			entry_vdev = SM2_VDEV_DECODE(word);
			entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ?
			    'A' : 'F';
			entry_off = (SM2_OFFSET_DECODE(extra_word) <<
			    mapshift) + sm->sm_start;

			if (zopt_metaslab_args == 0 ||
			    zopt_metaslab[0] == entry_vdev) {
				(void) printf("\t [%6llu] %c "
				    "range: %012llx-%012llx size: %08llx "
				    "vdev: %llu\n",
				    (u_longlong_t)entry_id, entry_type,
				    (u_longlong_t)entry_off,
				    (u_longlong_t)(entry_off + entry_run - 1),
				    (u_longlong_t)entry_run,
				    (u_longlong_t)entry_vdev);
			}
		}

		if (entry_type == 'A')
			alloc += entry_run;
		else
			alloc -= entry_run;
		entry_id++;
	}
	if (alloc != space_map_allocated(sm)) {
		(void) printf("space_map_object alloc (%lld) INCONSISTENT "
		    "with space map summary (%lld)\n",
		    (longlong_t)space_map_allocated(sm), (longlong_t)alloc);
	}
}
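/*
 * Worked decode (illustrative numbers): for a single-word entry with
 * mapshift = 9, SM_OFFSET_DECODE(word) = 0x10 and SM_RUN_DECODE(word)
 * = 2, the printed range starts at sm->sm_start + (0x10 << 9) =
 * sm->sm_start + 0x2000 and covers 2 << 9 = 0x400 bytes.  Two-word
 * entries carry wider offset/run fields plus a vdev id, at the cost
 * of consuming two words of the space map.
 */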
static void
dump_metaslab_stats(metaslab_t *msp)
{
	char maxbuf[32];
	zfs_range_tree_t *rt = msp->ms_allocatable;
	zfs_btree_t *t = &msp->ms_allocatable_by_size;
	int free_pct = zfs_range_tree_space(rt) * 100 / msp->ms_size;

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (maxbuf) >= NN_NUMBUF_SZ, "maxbuf truncated");

	zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf));

	(void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n",
	    "segments", zfs_btree_numnodes(t), "maxsize", maxbuf,
	    "freepct", free_pct);
	(void) printf("\tIn-memory histogram:\n");
	dump_histogram(rt->rt_histogram, ZFS_RANGE_TREE_HISTOGRAM_SIZE, 0);
}

static void
dump_allocated(void *arg, uint64_t start, uint64_t size)
{
	uint64_t *off = arg;
	if (*off != start)
		(void) printf("ALLOC: %"PRIu64" %"PRIu64"\n", *off,
		    start - *off);
	*off = start + size;
}

static void
dump_metaslab(metaslab_t *msp)
{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	space_map_t *sm = msp->ms_sm;
	char freebuf[32];

	zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf,
	    sizeof (freebuf));

	(void) printf(
	    "\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n",
	    (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
	    (u_longlong_t)space_map_object(sm), freebuf);

	if (dump_opt[ARG_ALLOCATED] ||
	    (dump_opt['m'] > 2 && !dump_opt['L'])) {
		mutex_enter(&msp->ms_lock);
		VERIFY0(metaslab_load(msp));
	}

	if (dump_opt['m'] > 2 && !dump_opt['L']) {
		zfs_range_tree_stat_verify(msp->ms_allocatable);
		dump_metaslab_stats(msp);
	}

	if (dump_opt[ARG_ALLOCATED]) {
		uint64_t off = msp->ms_start;
		zfs_range_tree_walk(msp->ms_allocatable, dump_allocated,
		    &off);
		if (off != msp->ms_start + msp->ms_size)
			(void) printf("ALLOC: %"PRIu64" %"PRIu64"\n", off,
			    msp->ms_size - off);
	}

	if (dump_opt['m'] > 1 && sm != NULL &&
	    spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
		/*
		 * The space map histogram represents free space in chunks
		 * of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
		 */
		(void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
		    (u_longlong_t)msp->ms_fragmentation);
		dump_histogram(sm->sm_phys->smp_histogram,
		    SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
	}

	if (dump_opt[ARG_ALLOCATED] ||
	    (dump_opt['m'] > 2 && !dump_opt['L'])) {
		metaslab_unload(msp);
		mutex_exit(&msp->ms_lock);
	}

	if (vd->vdev_ops == &vdev_draid_ops)
		ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift);
	else
		ASSERT3U(msp->ms_size, ==, 1ULL << vd->vdev_ms_shift);

	dump_spacemap(spa->spa_meta_objset, msp->ms_sm);

	if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
		(void) printf("\tFlush data:\n\tunflushed txg=%llu\n\n",
		    (u_longlong_t)metaslab_unflushed_txg(msp));
	}
}

static void
print_vdev_metaslab_header(vdev_t *vd)
{
	vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
	const char *bias_str = "";
	if (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) {
		bias_str = VDEV_ALLOC_BIAS_LOG;
	} else if (alloc_bias == VDEV_BIAS_SPECIAL) {
		bias_str = VDEV_ALLOC_BIAS_SPECIAL;
	} else if (alloc_bias == VDEV_BIAS_DEDUP) {
		bias_str = VDEV_ALLOC_BIAS_DEDUP;
	}

	uint64_t ms_flush_data_obj = 0;
	if (vd->vdev_top_zap != 0) {
		int error = zap_lookup(spa_meta_objset(vd->vdev_spa),
		    vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
		    sizeof (uint64_t), 1, &ms_flush_data_obj);
		if (error != ENOENT) {
			ASSERT0(error);
		}
	}

	(void) printf("\tvdev %10llu\t%s metaslab shift %4llu",
	    (u_longlong_t)vd->vdev_id, bias_str,
	    (u_longlong_t)vd->vdev_ms_shift);

	if (ms_flush_data_obj != 0) {
		(void) printf(" ms_unflushed_phys object %llu",
		    (u_longlong_t)ms_flush_data_obj);
	}

	(void) printf("\n\t%-10s%5llu %-19s %-15s %-12s\n",
	    "metaslabs", (u_longlong_t)vd->vdev_ms_count,
	    "offset", "spacemap", "free");
	(void) printf("\t%15s %19s %15s %12s\n",
	    "---------------", "-------------------",
	    "---------------", "------------");
}

static void
dump_metaslab_groups(spa_t *spa, boolean_t show_special)
{
	vdev_t *rvd = spa->spa_root_vdev;
	metaslab_class_t *mc = spa_normal_class(spa);
	metaslab_class_t *smc = spa_special_class(spa);
	uint64_t fragmentation;

	metaslab_class_histogram_verify(mc);

	for (unsigned c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (mg == NULL || (mg->mg_class != mc &&
		    (!show_special || mg->mg_class != smc)))
			continue;

		metaslab_group_histogram_verify(mg);
		mg->mg_fragmentation = metaslab_group_fragmentation(mg);

		(void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t"
		    "fragmentation",
		    (u_longlong_t)tvd->vdev_id,
		    (u_longlong_t)tvd->vdev_ms_count);
		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
			(void) printf("%3s\n", "-");
		} else {
printf("%3llu%%\n", 1842 (u_longlong_t)mg->mg_fragmentation); 1843 } 1844 dump_histogram(mg->mg_histogram, 1845 ZFS_RANGE_TREE_HISTOGRAM_SIZE, 0); 1846 } 1847 1848 (void) printf("\tpool %s\tfragmentation", spa_name(spa)); 1849 fragmentation = metaslab_class_fragmentation(mc); 1850 if (fragmentation == ZFS_FRAG_INVALID) 1851 (void) printf("\t%3s\n", "-"); 1852 else 1853 (void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation); 1854 dump_histogram(mc->mc_histogram, ZFS_RANGE_TREE_HISTOGRAM_SIZE, 0); 1855 } 1856 1857 static void 1858 print_vdev_indirect(vdev_t *vd) 1859 { 1860 vdev_indirect_config_t *vic = &vd->vdev_indirect_config; 1861 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 1862 vdev_indirect_births_t *vib = vd->vdev_indirect_births; 1863 1864 if (vim == NULL) { 1865 ASSERT0P(vib); 1866 return; 1867 } 1868 1869 ASSERT3U(vdev_indirect_mapping_object(vim), ==, 1870 vic->vic_mapping_object); 1871 ASSERT3U(vdev_indirect_births_object(vib), ==, 1872 vic->vic_births_object); 1873 1874 (void) printf("indirect births obj %llu:\n", 1875 (longlong_t)vic->vic_births_object); 1876 (void) printf(" vib_count = %llu\n", 1877 (longlong_t)vdev_indirect_births_count(vib)); 1878 for (uint64_t i = 0; i < vdev_indirect_births_count(vib); i++) { 1879 vdev_indirect_birth_entry_phys_t *cur_vibe = 1880 &vib->vib_entries[i]; 1881 (void) printf("\toffset %llx -> txg %llu\n", 1882 (longlong_t)cur_vibe->vibe_offset, 1883 (longlong_t)cur_vibe->vibe_phys_birth_txg); 1884 } 1885 (void) printf("\n"); 1886 1887 (void) printf("indirect mapping obj %llu:\n", 1888 (longlong_t)vic->vic_mapping_object); 1889 (void) printf(" vim_max_offset = 0x%llx\n", 1890 (longlong_t)vdev_indirect_mapping_max_offset(vim)); 1891 (void) printf(" vim_bytes_mapped = 0x%llx\n", 1892 (longlong_t)vdev_indirect_mapping_bytes_mapped(vim)); 1893 (void) printf(" vim_count = %llu\n", 1894 (longlong_t)vdev_indirect_mapping_num_entries(vim)); 1895 1896 if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3) 1897 return; 1898 1899 uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim); 1900 1901 for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) { 1902 vdev_indirect_mapping_entry_phys_t *vimep = 1903 &vim->vim_entries[i]; 1904 (void) printf("\t<%llx:%llx:%llx> -> " 1905 "<%llx:%llx:%llx> (%x obsolete)\n", 1906 (longlong_t)vd->vdev_id, 1907 (longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep), 1908 (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), 1909 (longlong_t)DVA_GET_VDEV(&vimep->vimep_dst), 1910 (longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst), 1911 (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), 1912 counts[i]); 1913 } 1914 (void) printf("\n"); 1915 1916 uint64_t obsolete_sm_object; 1917 VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); 1918 if (obsolete_sm_object != 0) { 1919 objset_t *mos = vd->vdev_spa->spa_meta_objset; 1920 (void) printf("obsolete space map object %llu:\n", 1921 (u_longlong_t)obsolete_sm_object); 1922 ASSERT(vd->vdev_obsolete_sm != NULL); 1923 ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==, 1924 obsolete_sm_object); 1925 dump_spacemap(mos, vd->vdev_obsolete_sm); 1926 (void) printf("\n"); 1927 } 1928 } 1929 1930 static void 1931 dump_metaslabs(spa_t *spa) 1932 { 1933 vdev_t *vd, *rvd = spa->spa_root_vdev; 1934 uint64_t m, c = 0, children = rvd->vdev_children; 1935 1936 (void) printf("\nMetaslabs:\n"); 1937 1938 if (zopt_metaslab_args > 0) { 1939 c = zopt_metaslab[0]; 1940 1941 if (c >= children) 1942 (void) fatal("bad vdev id: %llu", (u_longlong_t)c); 1943 1944 if (zopt_metaslab_args > 1) { 1945 
vd = rvd->vdev_child[c]; 1946 print_vdev_metaslab_header(vd); 1947 1948 for (m = 1; m < zopt_metaslab_args; m++) { 1949 if (zopt_metaslab[m] < vd->vdev_ms_count) 1950 dump_metaslab( 1951 vd->vdev_ms[zopt_metaslab[m]]); 1952 else 1953 (void) fprintf(stderr, "bad metaslab " 1954 "number %llu\n", 1955 (u_longlong_t)zopt_metaslab[m]); 1956 } 1957 (void) printf("\n"); 1958 return; 1959 } 1960 children = c + 1; 1961 } 1962 for (; c < children; c++) { 1963 vd = rvd->vdev_child[c]; 1964 print_vdev_metaslab_header(vd); 1965 1966 print_vdev_indirect(vd); 1967 1968 for (m = 0; m < vd->vdev_ms_count; m++) 1969 dump_metaslab(vd->vdev_ms[m]); 1970 (void) printf("\n"); 1971 } 1972 } 1973 1974 static void 1975 dump_log_spacemaps(spa_t *spa) 1976 { 1977 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) 1978 return; 1979 1980 (void) printf("\nLog Space Maps in Pool:\n"); 1981 for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); 1982 sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { 1983 space_map_t *sm = NULL; 1984 VERIFY0(space_map_open(&sm, spa_meta_objset(spa), 1985 sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); 1986 1987 (void) printf("Log Spacemap object %llu txg %llu\n", 1988 (u_longlong_t)sls->sls_sm_obj, (u_longlong_t)sls->sls_txg); 1989 dump_spacemap(spa->spa_meta_objset, sm); 1990 space_map_close(sm); 1991 } 1992 (void) printf("\n"); 1993 } 1994 1995 static void 1996 dump_ddt_entry(const ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe, 1997 uint64_t index) 1998 { 1999 const ddt_key_t *ddk = &ddlwe->ddlwe_key; 2000 char blkbuf[BP_SPRINTF_LEN]; 2001 blkptr_t blk; 2002 int p; 2003 2004 for (p = 0; p < DDT_NPHYS(ddt); p++) { 2005 const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys; 2006 ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); 2007 2008 if (ddt_phys_birth(ddp, v) == 0) 2009 continue; 2010 ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk); 2011 snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk); 2012 (void) printf("index %llx refcnt %llu phys %d %s\n", 2013 (u_longlong_t)index, (u_longlong_t)ddt_phys_refcnt(ddp, v), 2014 p, blkbuf); 2015 } 2016 } 2017 2018 static void 2019 dump_dedup_ratio(const ddt_stat_t *dds) 2020 { 2021 double rL, rP, rD, D, dedup, compress, copies; 2022 2023 if (dds->dds_blocks == 0) 2024 return; 2025 2026 rL = (double)dds->dds_ref_lsize; 2027 rP = (double)dds->dds_ref_psize; 2028 rD = (double)dds->dds_ref_dsize; 2029 D = (double)dds->dds_dsize; 2030 2031 dedup = rD / D; 2032 compress = rL / rP; 2033 copies = rD / rP; 2034 2035 (void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, " 2036 "dedup * compress / copies = %.2f\n\n", 2037 dedup, compress, copies, dedup * compress / copies); 2038 } 2039 2040 static void 2041 dump_ddt_log(ddt_t *ddt) 2042 { 2043 if (ddt->ddt_version != DDT_VERSION_FDT || 2044 !(ddt->ddt_flags & DDT_FLAG_LOG)) 2045 return; 2046 2047 for (int n = 0; n < 2; n++) { 2048 ddt_log_t *ddl = &ddt->ddt_log[n]; 2049 2050 char flagstr[64] = {0}; 2051 if (ddl->ddl_flags > 0) { 2052 flagstr[0] = ' '; 2053 int c = 1; 2054 if (ddl->ddl_flags & DDL_FLAG_FLUSHING) 2055 c += strlcpy(&flagstr[c], " FLUSHING", 2056 sizeof (flagstr) - c); 2057 if (ddl->ddl_flags & DDL_FLAG_CHECKPOINT) 2058 c += strlcpy(&flagstr[c], " CHECKPOINT", 2059 sizeof (flagstr) - c); 2060 if (ddl->ddl_flags & 2061 ~(DDL_FLAG_FLUSHING|DDL_FLAG_CHECKPOINT)) 2062 c += strlcpy(&flagstr[c], " UNKNOWN", 2063 sizeof (flagstr) - c); 2064 flagstr[1] = '['; 2065 flagstr[c] = ']'; 2066 } 2067 2068 uint64_t count = avl_numnodes(&ddl->ddl_tree); 2069 2070 printf(DMU_POOL_DDT_LOG 
": flags=0x%02x%s; obj=%llu; " 2071 "len=%llu; txg=%llu; entries=%llu\n", 2072 zio_checksum_table[ddt->ddt_checksum].ci_name, n, 2073 ddl->ddl_flags, flagstr, 2074 (u_longlong_t)ddl->ddl_object, 2075 (u_longlong_t)ddl->ddl_length, 2076 (u_longlong_t)ddl->ddl_first_txg, (u_longlong_t)count); 2077 2078 if (ddl->ddl_flags & DDL_FLAG_CHECKPOINT) { 2079 const ddt_key_t *ddk = &ddl->ddl_checkpoint; 2080 printf(" checkpoint: " 2081 "%016llx:%016llx:%016llx:%016llx:%016llx\n", 2082 (u_longlong_t)ddk->ddk_cksum.zc_word[0], 2083 (u_longlong_t)ddk->ddk_cksum.zc_word[1], 2084 (u_longlong_t)ddk->ddk_cksum.zc_word[2], 2085 (u_longlong_t)ddk->ddk_cksum.zc_word[3], 2086 (u_longlong_t)ddk->ddk_prop); 2087 } 2088 2089 if (count == 0 || dump_opt['D'] < 4) 2090 continue; 2091 2092 ddt_lightweight_entry_t ddlwe; 2093 uint64_t index = 0; 2094 for (ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree); 2095 ddle; ddle = AVL_NEXT(&ddl->ddl_tree, ddle)) { 2096 DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe); 2097 dump_ddt_entry(ddt, &ddlwe, index++); 2098 } 2099 } 2100 } 2101 2102 static void 2103 dump_ddt_object(ddt_t *ddt, ddt_type_t type, ddt_class_t class) 2104 { 2105 char name[DDT_NAMELEN]; 2106 ddt_lightweight_entry_t ddlwe; 2107 uint64_t walk = 0; 2108 dmu_object_info_t doi; 2109 uint64_t count, dspace, mspace; 2110 int error; 2111 2112 error = ddt_object_info(ddt, type, class, &doi); 2113 2114 if (error == ENOENT) 2115 return; 2116 ASSERT0(error); 2117 2118 error = ddt_object_count(ddt, type, class, &count); 2119 ASSERT0(error); 2120 if (count == 0) 2121 return; 2122 2123 dspace = doi.doi_physical_blocks_512 << 9; 2124 mspace = doi.doi_fill_count * doi.doi_data_block_size; 2125 2126 ddt_object_name(ddt, type, class, name); 2127 2128 (void) printf("%s: dspace=%llu; mspace=%llu; entries=%llu\n", name, 2129 (u_longlong_t)dspace, (u_longlong_t)mspace, (u_longlong_t)count); 2130 2131 if (dump_opt['D'] < 3) 2132 return; 2133 2134 (void) printf("%s: object=%llu\n", name, 2135 (u_longlong_t)ddt->ddt_object[type][class]); 2136 zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]); 2137 2138 if (dump_opt['D'] < 4) 2139 return; 2140 2141 if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE) 2142 return; 2143 2144 (void) printf("%s contents:\n\n", name); 2145 2146 while ((error = ddt_object_walk(ddt, type, class, &walk, &ddlwe)) == 0) 2147 dump_ddt_entry(ddt, &ddlwe, walk); 2148 2149 ASSERT3U(error, ==, ENOENT); 2150 2151 (void) printf("\n"); 2152 } 2153 2154 static void 2155 dump_ddt(ddt_t *ddt) 2156 { 2157 if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED) 2158 return; 2159 2160 char flagstr[64] = {0}; 2161 if (ddt->ddt_flags > 0) { 2162 flagstr[0] = ' '; 2163 int c = 1; 2164 if (ddt->ddt_flags & DDT_FLAG_FLAT) 2165 c += strlcpy(&flagstr[c], " FLAT", 2166 sizeof (flagstr) - c); 2167 if (ddt->ddt_flags & DDT_FLAG_LOG) 2168 c += strlcpy(&flagstr[c], " LOG", 2169 sizeof (flagstr) - c); 2170 if (ddt->ddt_flags & ~DDT_FLAG_MASK) 2171 c += strlcpy(&flagstr[c], " UNKNOWN", 2172 sizeof (flagstr) - c); 2173 flagstr[1] = '['; 2174 flagstr[c] = ']'; 2175 } 2176 2177 printf("DDT-%s: version=%llu [%s]; flags=0x%02llx%s; rootobj=%llu\n", 2178 zio_checksum_table[ddt->ddt_checksum].ci_name, 2179 (u_longlong_t)ddt->ddt_version, 2180 (ddt->ddt_version == 0) ? "LEGACY" : 2181 (ddt->ddt_version == 1) ? 
"FDT" : "UNKNOWN", 2182 (u_longlong_t)ddt->ddt_flags, flagstr, 2183 (u_longlong_t)ddt->ddt_dir_object); 2184 2185 for (ddt_type_t type = 0; type < DDT_TYPES; type++) 2186 for (ddt_class_t class = 0; class < DDT_CLASSES; class++) 2187 dump_ddt_object(ddt, type, class); 2188 2189 dump_ddt_log(ddt); 2190 } 2191 2192 static void 2193 dump_all_ddts(spa_t *spa) 2194 { 2195 ddt_histogram_t ddh_total = {{{0}}}; 2196 ddt_stat_t dds_total = {0}; 2197 2198 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) 2199 dump_ddt(spa->spa_ddt[c]); 2200 2201 ddt_get_dedup_stats(spa, &dds_total); 2202 2203 if (dds_total.dds_blocks == 0) { 2204 (void) printf("All DDTs are empty\n"); 2205 return; 2206 } 2207 2208 (void) printf("\n"); 2209 2210 if (dump_opt['D'] > 1) { 2211 (void) printf("DDT histogram (aggregated over all DDTs):\n"); 2212 ddt_get_dedup_histogram(spa, &ddh_total); 2213 zpool_dump_ddt(&dds_total, &ddh_total); 2214 } 2215 2216 dump_dedup_ratio(&dds_total); 2217 2218 /* 2219 * Dump a histogram of unique class entry age 2220 */ 2221 if (dump_opt['D'] == 3 && getenv("ZDB_DDT_UNIQUE_AGE_HIST") != NULL) { 2222 ddt_age_histo_t histogram; 2223 2224 (void) printf("DDT walk unique, building age histogram...\n"); 2225 ddt_prune_walk(spa, 0, &histogram); 2226 2227 /* 2228 * print out histogram for unique entry class birth 2229 */ 2230 if (histogram.dah_entries > 0) { 2231 (void) printf("%5s %9s %4s\n", 2232 "age", "blocks", "amnt"); 2233 (void) printf("%5s %9s %4s\n", 2234 "-----", "---------", "----"); 2235 for (int i = 0; i < HIST_BINS; i++) { 2236 (void) printf("%5d %9d %4d%%\n", 1 << i, 2237 (int)histogram.dah_age_histo[i], 2238 (int)((histogram.dah_age_histo[i] * 100) / 2239 histogram.dah_entries)); 2240 } 2241 } 2242 } 2243 } 2244 2245 static void 2246 dump_brt(spa_t *spa) 2247 { 2248 if (!spa_feature_is_enabled(spa, SPA_FEATURE_BLOCK_CLONING)) { 2249 printf("BRT: unsupported on this pool\n"); 2250 return; 2251 } 2252 2253 if (!spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) { 2254 printf("BRT: empty\n"); 2255 return; 2256 } 2257 2258 char count[32], used[32], saved[32]; 2259 zdb_nicebytes(brt_get_used(spa), used, sizeof (used)); 2260 zdb_nicebytes(brt_get_saved(spa), saved, sizeof (saved)); 2261 uint64_t ratio = brt_get_ratio(spa); 2262 printf("BRT: used %s; saved %s; ratio %llu.%02llux\n", used, saved, 2263 (u_longlong_t)(ratio / 100), (u_longlong_t)(ratio % 100)); 2264 2265 if (dump_opt['T'] < 2) 2266 return; 2267 2268 for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { 2269 brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; 2270 if (!brtvd->bv_initiated) { 2271 printf("BRT: vdev %" PRIu64 ": empty\n", vdevid); 2272 continue; 2273 } 2274 2275 zdb_nicenum(brtvd->bv_totalcount, count, sizeof (count)); 2276 zdb_nicebytes(brtvd->bv_usedspace, used, sizeof (used)); 2277 zdb_nicebytes(brtvd->bv_savedspace, saved, sizeof (saved)); 2278 printf("BRT: vdev %" PRIu64 ": refcnt %s; used %s; saved %s\n", 2279 vdevid, count, used, saved); 2280 } 2281 2282 if (dump_opt['T'] < 3) 2283 return; 2284 2285 /* -TTT shows a per-vdev histograms; -TTTT shows all entries */ 2286 boolean_t do_histo = dump_opt['T'] == 3; 2287 2288 char dva[64]; 2289 2290 if (!do_histo) 2291 printf("\n%-16s %-10s\n", "DVA", "REFCNT"); 2292 2293 for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { 2294 brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; 2295 if (!brtvd->bv_initiated) 2296 continue; 2297 2298 uint64_t counts[64] = {}; 2299 2300 zap_cursor_t zc; 2301 zap_attribute_t *za = 
zap_attribute_alloc(); 2302 for (zap_cursor_init(&zc, spa->spa_meta_objset, 2303 brtvd->bv_mos_entries); 2304 zap_cursor_retrieve(&zc, za) == 0; 2305 zap_cursor_advance(&zc)) { 2306 uint64_t refcnt; 2307 VERIFY0(zap_lookup_uint64(spa->spa_meta_objset, 2308 brtvd->bv_mos_entries, 2309 (const uint64_t *)za->za_name, 1, 2310 za->za_integer_length, za->za_num_integers, 2311 &refcnt)); 2312 2313 if (do_histo) 2314 counts[highbit64(refcnt)]++; 2315 else { 2316 uint64_t offset = 2317 *(const uint64_t *)za->za_name; 2318 2319 snprintf(dva, sizeof (dva), "%" PRIu64 ":%llx", 2320 vdevid, (u_longlong_t)offset); 2321 printf("%-16s %-10llu\n", dva, 2322 (u_longlong_t)refcnt); 2323 } 2324 } 2325 zap_cursor_fini(&zc); 2326 zap_attribute_free(za); 2327 2328 if (do_histo) { 2329 printf("\nBRT: vdev %" PRIu64 2330 ": DVAs with 2^n refcnts:\n", vdevid); 2331 dump_histogram(counts, 64, 0); 2332 } 2333 } 2334 } 2335 2336 static void 2337 dump_dtl_seg(void *arg, uint64_t start, uint64_t size) 2338 { 2339 char *prefix = arg; 2340 2341 (void) printf("%s [%llu,%llu) length %llu\n", 2342 prefix, 2343 (u_longlong_t)start, 2344 (u_longlong_t)(start + size), 2345 (u_longlong_t)(size)); 2346 } 2347 2348 static void 2349 dump_dtl(vdev_t *vd, int indent) 2350 { 2351 spa_t *spa = vd->vdev_spa; 2352 boolean_t required; 2353 const char *name[DTL_TYPES] = { "missing", "partial", "scrub", 2354 "outage" }; 2355 char prefix[256]; 2356 2357 spa_vdev_state_enter(spa, SCL_NONE); 2358 required = vdev_dtl_required(vd); 2359 (void) spa_vdev_state_exit(spa, NULL, 0); 2360 2361 if (indent == 0) 2362 (void) printf("\nDirty time logs:\n\n"); 2363 2364 (void) printf("\t%*s%s [%s]\n", indent, "", 2365 vd->vdev_path ? vd->vdev_path : 2366 vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa), 2367 required ? 
"DTL-required" : "DTL-expendable"); 2368 2369 for (int t = 0; t < DTL_TYPES; t++) { 2370 zfs_range_tree_t *rt = vd->vdev_dtl[t]; 2371 if (zfs_range_tree_space(rt) == 0) 2372 continue; 2373 (void) snprintf(prefix, sizeof (prefix), "\t%*s%s", 2374 indent + 2, "", name[t]); 2375 zfs_range_tree_walk(rt, dump_dtl_seg, prefix); 2376 if (dump_opt['d'] > 5 && vd->vdev_children == 0) 2377 dump_spacemap(spa->spa_meta_objset, 2378 vd->vdev_dtl_sm); 2379 } 2380 2381 for (unsigned c = 0; c < vd->vdev_children; c++) 2382 dump_dtl(vd->vdev_child[c], indent + 4); 2383 } 2384 2385 static void 2386 dump_history(spa_t *spa) 2387 { 2388 nvlist_t **events = NULL; 2389 char *buf; 2390 uint64_t resid, len, off = 0; 2391 uint_t num = 0; 2392 int error; 2393 char tbuf[30]; 2394 2395 if ((buf = malloc(SPA_OLD_MAXBLOCKSIZE)) == NULL) { 2396 (void) fprintf(stderr, "%s: unable to allocate I/O buffer\n", 2397 __func__); 2398 return; 2399 } 2400 2401 do { 2402 len = SPA_OLD_MAXBLOCKSIZE; 2403 2404 if ((error = spa_history_get(spa, &off, &len, buf)) != 0) { 2405 (void) fprintf(stderr, "Unable to read history: " 2406 "error %d\n", error); 2407 free(buf); 2408 return; 2409 } 2410 2411 if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0) 2412 break; 2413 2414 off -= resid; 2415 } while (len != 0); 2416 2417 (void) printf("\nHistory:\n"); 2418 for (unsigned i = 0; i < num; i++) { 2419 boolean_t printed = B_FALSE; 2420 2421 if (nvlist_exists(events[i], ZPOOL_HIST_TIME)) { 2422 time_t tsec; 2423 struct tm t; 2424 2425 tsec = fnvlist_lookup_uint64(events[i], 2426 ZPOOL_HIST_TIME); 2427 (void) localtime_r(&tsec, &t); 2428 (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); 2429 } else { 2430 tbuf[0] = '\0'; 2431 } 2432 2433 if (nvlist_exists(events[i], ZPOOL_HIST_CMD)) { 2434 (void) printf("%s %s\n", tbuf, 2435 fnvlist_lookup_string(events[i], ZPOOL_HIST_CMD)); 2436 } else if (nvlist_exists(events[i], ZPOOL_HIST_INT_EVENT)) { 2437 uint64_t ievent; 2438 2439 ievent = fnvlist_lookup_uint64(events[i], 2440 ZPOOL_HIST_INT_EVENT); 2441 if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) 2442 goto next; 2443 2444 (void) printf(" %s [internal %s txg:%ju] %s\n", 2445 tbuf, 2446 zfs_history_event_names[ievent], 2447 fnvlist_lookup_uint64(events[i], 2448 ZPOOL_HIST_TXG), 2449 fnvlist_lookup_string(events[i], 2450 ZPOOL_HIST_INT_STR)); 2451 } else if (nvlist_exists(events[i], ZPOOL_HIST_INT_NAME)) { 2452 (void) printf("%s [txg:%ju] %s", tbuf, 2453 fnvlist_lookup_uint64(events[i], 2454 ZPOOL_HIST_TXG), 2455 fnvlist_lookup_string(events[i], 2456 ZPOOL_HIST_INT_NAME)); 2457 2458 if (nvlist_exists(events[i], ZPOOL_HIST_DSNAME)) { 2459 (void) printf(" %s (%llu)", 2460 fnvlist_lookup_string(events[i], 2461 ZPOOL_HIST_DSNAME), 2462 (u_longlong_t)fnvlist_lookup_uint64( 2463 events[i], 2464 ZPOOL_HIST_DSID)); 2465 } 2466 2467 (void) printf(" %s\n", fnvlist_lookup_string(events[i], 2468 ZPOOL_HIST_INT_STR)); 2469 } else if (nvlist_exists(events[i], ZPOOL_HIST_IOCTL)) { 2470 (void) printf("%s ioctl %s\n", tbuf, 2471 fnvlist_lookup_string(events[i], 2472 ZPOOL_HIST_IOCTL)); 2473 2474 if (nvlist_exists(events[i], ZPOOL_HIST_INPUT_NVL)) { 2475 (void) printf(" input:\n"); 2476 dump_nvlist(fnvlist_lookup_nvlist(events[i], 2477 ZPOOL_HIST_INPUT_NVL), 8); 2478 } 2479 if (nvlist_exists(events[i], ZPOOL_HIST_OUTPUT_NVL)) { 2480 (void) printf(" output:\n"); 2481 dump_nvlist(fnvlist_lookup_nvlist(events[i], 2482 ZPOOL_HIST_OUTPUT_NVL), 8); 2483 } 2484 if (nvlist_exists(events[i], ZPOOL_HIST_ERRNO)) { 2485 (void) printf(" errno: %lld\n", 2486 
(longlong_t)fnvlist_lookup_int64(events[i], 2487 ZPOOL_HIST_ERRNO)); 2488 } 2489 } else { 2490 goto next; 2491 } 2492 2493 printed = B_TRUE; 2494 next: 2495 if (dump_opt['h'] > 1) { 2496 if (!printed) 2497 (void) printf("unrecognized record:\n"); 2498 dump_nvlist(events[i], 2); 2499 } 2500 } 2501 free(buf); 2502 } 2503 2504 static void 2505 dump_dnode(objset_t *os, uint64_t object, void *data, size_t size) 2506 { 2507 (void) os, (void) object, (void) data, (void) size; 2508 } 2509 2510 static uint64_t 2511 blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, 2512 const zbookmark_phys_t *zb) 2513 { 2514 if (dnp == NULL) { 2515 ASSERT(zb->zb_level < 0); 2516 if (zb->zb_object == 0) 2517 return (zb->zb_blkid); 2518 return (zb->zb_blkid * BP_GET_LSIZE(bp)); 2519 } 2520 2521 ASSERT(zb->zb_level >= 0); 2522 2523 return ((zb->zb_blkid << 2524 (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) * 2525 dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); 2526 } 2527 2528 static void 2529 snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen, 2530 const blkptr_t *bp) 2531 { 2532 static abd_t *pabd = NULL; 2533 void *buf; 2534 zio_t *zio; 2535 zfs_zstdhdr_t zstd_hdr; 2536 int error; 2537 2538 if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_ZSTD) 2539 return; 2540 2541 if (BP_IS_HOLE(bp)) 2542 return; 2543 2544 if (BP_IS_EMBEDDED(bp)) { 2545 buf = malloc(SPA_MAXBLOCKSIZE); 2546 if (buf == NULL) { 2547 (void) fprintf(stderr, "out of memory\n"); 2548 zdb_exit(1); 2549 } 2550 decode_embedded_bp_compressed(bp, buf); 2551 memcpy(&zstd_hdr, buf, sizeof (zstd_hdr)); 2552 free(buf); 2553 zstd_hdr.c_len = BE_32(zstd_hdr.c_len); 2554 zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level); 2555 (void) snprintf(blkbuf + strlen(blkbuf), 2556 buflen - strlen(blkbuf), 2557 " ZSTD:size=%u:version=%u:level=%u:EMBEDDED", 2558 zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr), 2559 zfs_get_hdrlevel(&zstd_hdr)); 2560 return; 2561 } 2562 2563 if (!pabd) 2564 pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE); 2565 zio = zio_root(spa, NULL, NULL, 0); 2566 2567 /* Decrypt but don't decompress so we can read the compression header */ 2568 zio_nowait(zio_read(zio, spa, bp, pabd, BP_GET_PSIZE(bp), NULL, NULL, 2569 ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW_COMPRESS, 2570 NULL)); 2571 error = zio_wait(zio); 2572 if (error) { 2573 (void) fprintf(stderr, "read failed: %d\n", error); 2574 return; 2575 } 2576 buf = abd_borrow_buf_copy(pabd, BP_GET_LSIZE(bp)); 2577 memcpy(&zstd_hdr, buf, sizeof (zstd_hdr)); 2578 zstd_hdr.c_len = BE_32(zstd_hdr.c_len); 2579 zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level); 2580 2581 (void) snprintf(blkbuf + strlen(blkbuf), 2582 buflen - strlen(blkbuf), 2583 " ZSTD:size=%u:version=%u:level=%u:NORMAL", 2584 zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr), 2585 zfs_get_hdrlevel(&zstd_hdr)); 2586 2587 abd_return_buf_copy(pabd, buf, BP_GET_LSIZE(bp)); 2588 } 2589 2590 static void 2591 snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp, 2592 boolean_t bp_freed) 2593 { 2594 const dva_t *dva = bp->blk_dva; 2595 int ndvas = dump_opt['d'] > 5 ? 
BP_GET_NDVAS(bp) : 1; 2596 int i; 2597 2598 if (dump_opt['b'] >= 6) { 2599 snprintf_blkptr(blkbuf, buflen, bp); 2600 if (bp_freed) { 2601 (void) snprintf(blkbuf + strlen(blkbuf), 2602 buflen - strlen(blkbuf), " %s", "FREE"); 2603 } 2604 return; 2605 } 2606 2607 if (BP_IS_EMBEDDED(bp)) { 2608 (void) sprintf(blkbuf, 2609 "EMBEDDED et=%u %llxL/%llxP B=%llu", 2610 (int)BPE_GET_ETYPE(bp), 2611 (u_longlong_t)BPE_GET_LSIZE(bp), 2612 (u_longlong_t)BPE_GET_PSIZE(bp), 2613 (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); 2614 return; 2615 } 2616 2617 blkbuf[0] = '\0'; 2618 2619 for (i = 0; i < ndvas; i++) { 2620 (void) snprintf(blkbuf + strlen(blkbuf), 2621 buflen - strlen(blkbuf), "%llu:%llx:%llx%s ", 2622 (u_longlong_t)DVA_GET_VDEV(&dva[i]), 2623 (u_longlong_t)DVA_GET_OFFSET(&dva[i]), 2624 (u_longlong_t)DVA_GET_ASIZE(&dva[i]), 2625 (DVA_GET_GANG(&dva[i]) ? "G" : "")); 2626 } 2627 2628 if (BP_IS_HOLE(bp)) { 2629 (void) snprintf(blkbuf + strlen(blkbuf), 2630 buflen - strlen(blkbuf), 2631 "%llxL B=%llu", 2632 (u_longlong_t)BP_GET_LSIZE(bp), 2633 (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); 2634 } else { 2635 (void) snprintf(blkbuf + strlen(blkbuf), 2636 buflen - strlen(blkbuf), 2637 "%llxL/%llxP F=%llu B=%llu/%llu", 2638 (u_longlong_t)BP_GET_LSIZE(bp), 2639 (u_longlong_t)BP_GET_PSIZE(bp), 2640 (u_longlong_t)BP_GET_FILL(bp), 2641 (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp), 2642 (u_longlong_t)BP_GET_PHYSICAL_BIRTH(bp)); 2643 if (bp_freed) 2644 (void) snprintf(blkbuf + strlen(blkbuf), 2645 buflen - strlen(blkbuf), " %s", "FREE"); 2646 (void) snprintf(blkbuf + strlen(blkbuf), 2647 buflen - strlen(blkbuf), 2648 " cksum=%016llx:%016llx:%016llx:%016llx", 2649 (u_longlong_t)bp->blk_cksum.zc_word[0], 2650 (u_longlong_t)bp->blk_cksum.zc_word[1], 2651 (u_longlong_t)bp->blk_cksum.zc_word[2], 2652 (u_longlong_t)bp->blk_cksum.zc_word[3]); 2653 } 2654 } 2655 2656 static u_longlong_t 2657 print_indirect(spa_t *spa, blkptr_t *bp, const zbookmark_phys_t *zb, 2658 const dnode_phys_t *dnp) 2659 { 2660 char blkbuf[BP_SPRINTF_LEN]; 2661 u_longlong_t offset; 2662 int l; 2663 2664 offset = (u_longlong_t)blkid2offset(dnp, bp, zb); 2665 2666 (void) printf("%16llx ", offset); 2667 2668 ASSERT(zb->zb_level >= 0); 2669 2670 for (l = dnp->dn_nlevels - 1; l >= -1; l--) { 2671 if (l == zb->zb_level) { 2672 (void) printf("L%llx", (u_longlong_t)zb->zb_level); 2673 } else { 2674 (void) printf(" "); 2675 } 2676 } 2677 2678 snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, B_FALSE); 2679 if (dump_opt['Z'] && BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD) 2680 snprintf_zstd_header(spa, blkbuf, sizeof (blkbuf), bp); 2681 (void) printf("%s", blkbuf); 2682 2683 if (!BP_IS_EMBEDDED(bp)) { 2684 if (BP_GET_TYPE(bp) != dnp->dn_type) { 2685 (void) printf(" (ERROR: Block pointer type " 2686 "(%llu) does not match dnode type (%hhu))", 2687 BP_GET_TYPE(bp), dnp->dn_type); 2688 corruption_found = B_TRUE; 2689 } 2690 if (BP_GET_LEVEL(bp) != zb->zb_level) { 2691 (void) printf(" (ERROR: Block pointer level " 2692 "(%llu) does not match bookmark level (%lld))", 2693 BP_GET_LEVEL(bp), (longlong_t)zb->zb_level); 2694 corruption_found = B_TRUE; 2695 } 2696 } 2697 (void) printf("\n"); 2698 2699 return (offset); 2700 } 2701 2702 static int 2703 visit_indirect(spa_t *spa, const dnode_phys_t *dnp, 2704 blkptr_t *bp, const zbookmark_phys_t *zb) 2705 { 2706 u_longlong_t offset; 2707 int err = 0; 2708 2709 if (BP_GET_BIRTH(bp) == 0) 2710 return (0); 2711 2712 offset = print_indirect(spa, bp, zb, dnp); 2713 2714 if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) { 2715 arc_flags_t flags 
= ARC_FLAG_WAIT; 2716 int i; 2717 blkptr_t *cbp; 2718 int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; 2719 arc_buf_t *buf; 2720 uint64_t fill = 0; 2721 ASSERT(!BP_IS_REDACTED(bp)); 2722 2723 err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, 2724 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); 2725 if (err) 2726 return (err); 2727 ASSERT(buf->b_data); 2728 2729 /* recursively visit blocks below this */ 2730 cbp = buf->b_data; 2731 for (i = 0; i < epb; i++, cbp++) { 2732 zbookmark_phys_t czb; 2733 2734 SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, 2735 zb->zb_level - 1, 2736 zb->zb_blkid * epb + i); 2737 err = visit_indirect(spa, dnp, cbp, &czb); 2738 if (err) 2739 break; 2740 fill += BP_GET_FILL(cbp); 2741 } 2742 if (!err) { 2743 if (fill != BP_GET_FILL(bp)) { 2744 (void) printf("%16llx: Block pointer " 2745 "fill (%llu) does not match calculated " 2746 "value (%llu)\n", offset, BP_GET_FILL(bp), 2747 (u_longlong_t)fill); 2748 corruption_found = B_TRUE; 2749 } 2750 } 2751 arc_buf_destroy(buf, &buf); 2752 } 2753 2754 return (err); 2755 } 2756 2757 static void 2758 dump_indirect(dnode_t *dn) 2759 { 2760 dnode_phys_t *dnp = dn->dn_phys; 2761 zbookmark_phys_t czb; 2762 2763 (void) printf("Indirect blocks:\n"); 2764 2765 SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset), 2766 dn->dn_object, dnp->dn_nlevels - 1, 0); 2767 for (int j = 0; j < dnp->dn_nblkptr; j++) { 2768 czb.zb_blkid = j; 2769 (void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp, 2770 &dnp->dn_blkptr[j], &czb); 2771 } 2772 2773 (void) printf("\n"); 2774 } 2775 2776 static void 2777 dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size) 2778 { 2779 (void) os, (void) object; 2780 dsl_dir_phys_t *dd = data; 2781 time_t crtime; 2782 char nice[32]; 2783 2784 /* make sure nicenum has enough space */ 2785 _Static_assert(sizeof (nice) >= NN_NUMBUF_SZ, "nice truncated"); 2786 2787 if (dd == NULL) 2788 return; 2789 2790 ASSERT3U(size, >=, sizeof (dsl_dir_phys_t)); 2791 2792 crtime = dd->dd_creation_time; 2793 (void) printf("\t\tcreation_time = %s", ctime(&crtime)); 2794 (void) printf("\t\thead_dataset_obj = %llu\n", 2795 (u_longlong_t)dd->dd_head_dataset_obj); 2796 (void) printf("\t\tparent_dir_obj = %llu\n", 2797 (u_longlong_t)dd->dd_parent_obj); 2798 (void) printf("\t\torigin_obj = %llu\n", 2799 (u_longlong_t)dd->dd_origin_obj); 2800 (void) printf("\t\tchild_dir_zapobj = %llu\n", 2801 (u_longlong_t)dd->dd_child_dir_zapobj); 2802 zdb_nicenum(dd->dd_used_bytes, nice, sizeof (nice)); 2803 (void) printf("\t\tused_bytes = %s\n", nice); 2804 zdb_nicenum(dd->dd_compressed_bytes, nice, sizeof (nice)); 2805 (void) printf("\t\tcompressed_bytes = %s\n", nice); 2806 zdb_nicenum(dd->dd_uncompressed_bytes, nice, sizeof (nice)); 2807 (void) printf("\t\tuncompressed_bytes = %s\n", nice); 2808 zdb_nicenum(dd->dd_quota, nice, sizeof (nice)); 2809 (void) printf("\t\tquota = %s\n", nice); 2810 zdb_nicenum(dd->dd_reserved, nice, sizeof (nice)); 2811 (void) printf("\t\treserved = %s\n", nice); 2812 (void) printf("\t\tprops_zapobj = %llu\n", 2813 (u_longlong_t)dd->dd_props_zapobj); 2814 (void) printf("\t\tdeleg_zapobj = %llu\n", 2815 (u_longlong_t)dd->dd_deleg_zapobj); 2816 (void) printf("\t\tflags = %llx\n", 2817 (u_longlong_t)dd->dd_flags); 2818 2819 #define DO(which) \ 2820 zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, \ 2821 sizeof (nice)); \ 2822 (void) printf("\t\tused_breakdown[" #which "] = %s\n", nice) 2823 DO(HEAD); 2824 DO(SNAP); 2825 DO(CHILD); 2826 DO(CHILD_RSRV); 2827 DO(REFRSRV); 2828 #undef DO 2829 (void) 
printf("\t\tclones = %llu\n", 2830 (u_longlong_t)dd->dd_clones); 2831 } 2832 2833 static void 2834 dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size) 2835 { 2836 (void) os, (void) object; 2837 dsl_dataset_phys_t *ds = data; 2838 time_t crtime; 2839 char used[32], compressed[32], uncompressed[32], unique[32]; 2840 char blkbuf[BP_SPRINTF_LEN]; 2841 2842 /* make sure nicenum has enough space */ 2843 _Static_assert(sizeof (used) >= NN_NUMBUF_SZ, "used truncated"); 2844 _Static_assert(sizeof (compressed) >= NN_NUMBUF_SZ, 2845 "compressed truncated"); 2846 _Static_assert(sizeof (uncompressed) >= NN_NUMBUF_SZ, 2847 "uncompressed truncated"); 2848 _Static_assert(sizeof (unique) >= NN_NUMBUF_SZ, "unique truncated"); 2849 2850 if (ds == NULL) 2851 return; 2852 2853 ASSERT(size == sizeof (*ds)); 2854 crtime = ds->ds_creation_time; 2855 zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used)); 2856 zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed)); 2857 zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed, 2858 sizeof (uncompressed)); 2859 zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique)); 2860 snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp); 2861 2862 (void) printf("\t\tdir_obj = %llu\n", 2863 (u_longlong_t)ds->ds_dir_obj); 2864 (void) printf("\t\tprev_snap_obj = %llu\n", 2865 (u_longlong_t)ds->ds_prev_snap_obj); 2866 (void) printf("\t\tprev_snap_txg = %llu\n", 2867 (u_longlong_t)ds->ds_prev_snap_txg); 2868 (void) printf("\t\tnext_snap_obj = %llu\n", 2869 (u_longlong_t)ds->ds_next_snap_obj); 2870 (void) printf("\t\tsnapnames_zapobj = %llu\n", 2871 (u_longlong_t)ds->ds_snapnames_zapobj); 2872 (void) printf("\t\tnum_children = %llu\n", 2873 (u_longlong_t)ds->ds_num_children); 2874 (void) printf("\t\tuserrefs_obj = %llu\n", 2875 (u_longlong_t)ds->ds_userrefs_obj); 2876 (void) printf("\t\tcreation_time = %s", ctime(&crtime)); 2877 (void) printf("\t\tcreation_txg = %llu\n", 2878 (u_longlong_t)ds->ds_creation_txg); 2879 (void) printf("\t\tdeadlist_obj = %llu\n", 2880 (u_longlong_t)ds->ds_deadlist_obj); 2881 (void) printf("\t\tused_bytes = %s\n", used); 2882 (void) printf("\t\tcompressed_bytes = %s\n", compressed); 2883 (void) printf("\t\tuncompressed_bytes = %s\n", uncompressed); 2884 (void) printf("\t\tunique = %s\n", unique); 2885 (void) printf("\t\tfsid_guid = %llu\n", 2886 (u_longlong_t)ds->ds_fsid_guid); 2887 (void) printf("\t\tguid = %llu\n", 2888 (u_longlong_t)ds->ds_guid); 2889 (void) printf("\t\tflags = %llx\n", 2890 (u_longlong_t)ds->ds_flags); 2891 (void) printf("\t\tnext_clones_obj = %llu\n", 2892 (u_longlong_t)ds->ds_next_clones_obj); 2893 (void) printf("\t\tprops_obj = %llu\n", 2894 (u_longlong_t)ds->ds_props_obj); 2895 (void) printf("\t\tbp = %s\n", blkbuf); 2896 } 2897 2898 static int 2899 dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 2900 { 2901 (void) arg, (void) tx; 2902 char blkbuf[BP_SPRINTF_LEN]; 2903 2904 if (BP_GET_BIRTH(bp) != 0) { 2905 snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); 2906 (void) printf("\t%s\n", blkbuf); 2907 } 2908 return (0); 2909 } 2910 2911 static void 2912 dump_bptree(objset_t *os, uint64_t obj, const char *name) 2913 { 2914 char bytes[32]; 2915 bptree_phys_t *bt; 2916 dmu_buf_t *db; 2917 2918 /* make sure nicenum has enough space */ 2919 _Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated"); 2920 2921 if (dump_opt['d'] < 3) 2922 return; 2923 2924 VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); 2925 bt = db->db_data; 2926 zdb_nicenum(bt->bt_bytes, bytes, sizeof (bytes)); 2927 
(void) printf("\n %s: %llu datasets, %s\n", 2928 name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes); 2929 dmu_buf_rele(db, FTAG); 2930 2931 if (dump_opt['d'] < 5) 2932 return; 2933 2934 (void) printf("\n"); 2935 2936 (void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL); 2937 } 2938 2939 static int 2940 dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) 2941 { 2942 (void) arg, (void) tx; 2943 char blkbuf[BP_SPRINTF_LEN]; 2944 2945 ASSERT(BP_GET_BIRTH(bp) != 0); 2946 snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed); 2947 (void) printf("\t%s\n", blkbuf); 2948 return (0); 2949 } 2950 2951 static void 2952 dump_full_bpobj(bpobj_t *bpo, const char *name, int indent) 2953 { 2954 char bytes[32]; 2955 char comp[32]; 2956 char uncomp[32]; 2957 uint64_t i; 2958 2959 /* make sure nicenum has enough space */ 2960 _Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated"); 2961 _Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated"); 2962 _Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated"); 2963 2964 if (dump_opt['d'] < 3) 2965 return; 2966 2967 zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes)); 2968 if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { 2969 zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp)); 2970 zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp)); 2971 if (bpo->bpo_havefreed) { 2972 (void) printf(" %*s: object %llu, %llu local " 2973 "blkptrs, %llu freed, %llu subobjs in object %llu, " 2974 "%s (%s/%s comp)\n", 2975 indent * 8, name, 2976 (u_longlong_t)bpo->bpo_object, 2977 (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, 2978 (u_longlong_t)bpo->bpo_phys->bpo_num_freed, 2979 (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, 2980 (u_longlong_t)bpo->bpo_phys->bpo_subobjs, 2981 bytes, comp, uncomp); 2982 } else { 2983 (void) printf(" %*s: object %llu, %llu local " 2984 "blkptrs, %llu subobjs in object %llu, " 2985 "%s (%s/%s comp)\n", 2986 indent * 8, name, 2987 (u_longlong_t)bpo->bpo_object, 2988 (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, 2989 (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, 2990 (u_longlong_t)bpo->bpo_phys->bpo_subobjs, 2991 bytes, comp, uncomp); 2992 } 2993 2994 for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { 2995 uint64_t subobj; 2996 bpobj_t subbpo; 2997 int error; 2998 VERIFY0(dmu_read(bpo->bpo_os, 2999 bpo->bpo_phys->bpo_subobjs, 3000 i * sizeof (subobj), sizeof (subobj), &subobj, 0)); 3001 error = bpobj_open(&subbpo, bpo->bpo_os, subobj); 3002 if (error != 0) { 3003 (void) printf("ERROR %u while trying to open " 3004 "subobj id %llu\n", 3005 error, (u_longlong_t)subobj); 3006 corruption_found = B_TRUE; 3007 continue; 3008 } 3009 dump_full_bpobj(&subbpo, "subobj", indent + 1); 3010 bpobj_close(&subbpo); 3011 } 3012 } else { 3013 if (bpo->bpo_havefreed) { 3014 (void) printf(" %*s: object %llu, %llu blkptrs, " 3015 "%llu freed, %s\n", 3016 indent * 8, name, 3017 (u_longlong_t)bpo->bpo_object, 3018 (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, 3019 (u_longlong_t)bpo->bpo_phys->bpo_num_freed, 3020 bytes); 3021 } else { 3022 (void) printf(" %*s: object %llu, %llu blkptrs, " 3023 "%s\n", 3024 indent * 8, name, 3025 (u_longlong_t)bpo->bpo_object, 3026 (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, 3027 bytes); 3028 } 3029 } 3030 3031 if (dump_opt['d'] < 5) 3032 return; 3033 3034 3035 if (indent == 0) { 3036 (void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL); 3037 (void) printf("\n"); 3038 } 3039 } 3040 3041 static int 3042 
dump_bookmark(dsl_pool_t *dp, char *name, boolean_t print_redact, 3043 boolean_t print_list) 3044 { 3045 int err = 0; 3046 zfs_bookmark_phys_t prop; 3047 objset_t *mos = dp->dp_spa->spa_meta_objset; 3048 err = dsl_bookmark_lookup(dp, name, NULL, &prop); 3049 3050 if (err != 0) { 3051 return (err); 3052 } 3053 3054 (void) printf("\t#%s: ", strchr(name, '#') + 1); 3055 (void) printf("{guid: %llx creation_txg: %llu creation_time: " 3056 "%llu redaction_obj: %llu}\n", (u_longlong_t)prop.zbm_guid, 3057 (u_longlong_t)prop.zbm_creation_txg, 3058 (u_longlong_t)prop.zbm_creation_time, 3059 (u_longlong_t)prop.zbm_redaction_obj); 3060 3061 IMPLY(print_list, print_redact); 3062 if (!print_redact || prop.zbm_redaction_obj == 0) 3063 return (0); 3064 3065 redaction_list_t *rl; 3066 VERIFY0(dsl_redaction_list_hold_obj(dp, 3067 prop.zbm_redaction_obj, FTAG, &rl)); 3068 3069 redaction_list_phys_t *rlp = rl->rl_phys; 3070 (void) printf("\tRedacted:\n\t\tProgress: "); 3071 if (rlp->rlp_last_object != UINT64_MAX || 3072 rlp->rlp_last_blkid != UINT64_MAX) { 3073 (void) printf("%llu %llu (incomplete)\n", 3074 (u_longlong_t)rlp->rlp_last_object, 3075 (u_longlong_t)rlp->rlp_last_blkid); 3076 } else { 3077 (void) printf("complete\n"); 3078 } 3079 (void) printf("\t\tSnapshots: ["); 3080 for (unsigned int i = 0; i < rlp->rlp_num_snaps; i++) { 3081 if (i > 0) 3082 (void) printf(", "); 3083 (void) printf("%0llu", 3084 (u_longlong_t)rlp->rlp_snaps[i]); 3085 } 3086 (void) printf("]\n\t\tLength: %llu\n", 3087 (u_longlong_t)rlp->rlp_num_entries); 3088 3089 if (!print_list) { 3090 dsl_redaction_list_rele(rl, FTAG); 3091 return (0); 3092 } 3093 3094 if (rlp->rlp_num_entries == 0) { 3095 dsl_redaction_list_rele(rl, FTAG); 3096 (void) printf("\t\tRedaction List: []\n\n"); 3097 return (0); 3098 } 3099 3100 redact_block_phys_t *rbp_buf; 3101 uint64_t size; 3102 dmu_object_info_t doi; 3103 3104 VERIFY0(dmu_object_info(mos, prop.zbm_redaction_obj, &doi)); 3105 size = doi.doi_max_offset; 3106 rbp_buf = kmem_alloc(size, KM_SLEEP); 3107 3108 err = dmu_read(mos, prop.zbm_redaction_obj, 0, size, 3109 rbp_buf, 0); 3110 if (err != 0) { 3111 dsl_redaction_list_rele(rl, FTAG); 3112 kmem_free(rbp_buf, size); 3113 return (err); 3114 } 3115 3116 (void) printf("\t\tRedaction List: [{object: %llx, offset: " 3117 "%llx, blksz: %x, count: %llx}", 3118 (u_longlong_t)rbp_buf[0].rbp_object, 3119 (u_longlong_t)rbp_buf[0].rbp_blkid, 3120 (uint_t)(redact_block_get_size(&rbp_buf[0])), 3121 (u_longlong_t)redact_block_get_count(&rbp_buf[0])); 3122 3123 for (size_t i = 1; i < rlp->rlp_num_entries; i++) { 3124 (void) printf(",\n\t\t{object: %llx, offset: %llx, " 3125 "blksz: %x, count: %llx}", 3126 (u_longlong_t)rbp_buf[i].rbp_object, 3127 (u_longlong_t)rbp_buf[i].rbp_blkid, 3128 (uint_t)(redact_block_get_size(&rbp_buf[i])), 3129 (u_longlong_t)redact_block_get_count(&rbp_buf[i])); 3130 } 3131 dsl_redaction_list_rele(rl, FTAG); 3132 kmem_free(rbp_buf, size); 3133 (void) printf("]\n\n"); 3134 return (0); 3135 } 3136 3137 static void 3138 dump_bookmarks(objset_t *os, int verbosity) 3139 { 3140 zap_cursor_t zc; 3141 zap_attribute_t *attrp; 3142 dsl_dataset_t *ds = dmu_objset_ds(os); 3143 dsl_pool_t *dp = spa_get_dsl(os->os_spa); 3144 objset_t *mos = os->os_spa->spa_meta_objset; 3145 if (verbosity < 4) 3146 return; 3147 attrp = zap_attribute_alloc(); 3148 dsl_pool_config_enter(dp, FTAG); 3149 3150 for (zap_cursor_init(&zc, mos, ds->ds_bookmarks_obj); 3151 zap_cursor_retrieve(&zc, attrp) == 0; 3152 zap_cursor_advance(&zc)) { 3153 char 
osname[ZFS_MAX_DATASET_NAME_LEN]; 3154 char buf[ZFS_MAX_DATASET_NAME_LEN]; 3155 int len; 3156 dmu_objset_name(os, osname); 3157 len = snprintf(buf, sizeof (buf), "%s#%s", osname, 3158 attrp->za_name); 3159 VERIFY3S(len, <, ZFS_MAX_DATASET_NAME_LEN); 3160 (void) dump_bookmark(dp, buf, verbosity >= 5, verbosity >= 6); 3161 } 3162 zap_cursor_fini(&zc); 3163 dsl_pool_config_exit(dp, FTAG); 3164 zap_attribute_free(attrp); 3165 } 3166 3167 static void 3168 bpobj_count_refd(bpobj_t *bpo) 3169 { 3170 mos_obj_refd(bpo->bpo_object); 3171 3172 if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { 3173 mos_obj_refd(bpo->bpo_phys->bpo_subobjs); 3174 for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { 3175 uint64_t subobj; 3176 bpobj_t subbpo; 3177 int error; 3178 VERIFY0(dmu_read(bpo->bpo_os, 3179 bpo->bpo_phys->bpo_subobjs, 3180 i * sizeof (subobj), sizeof (subobj), &subobj, 0)); 3181 error = bpobj_open(&subbpo, bpo->bpo_os, subobj); 3182 if (error != 0) { 3183 (void) printf("ERROR %u while trying to open " 3184 "subobj id %llu\n", 3185 error, (u_longlong_t)subobj); 3186 corruption_found = B_TRUE; 3187 continue; 3188 } 3189 bpobj_count_refd(&subbpo); 3190 bpobj_close(&subbpo); 3191 } 3192 } 3193 } 3194 3195 static int 3196 dsl_deadlist_entry_count_refd(void *arg, dsl_deadlist_entry_t *dle) 3197 { 3198 spa_t *spa = arg; 3199 uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj; 3200 if (dle->dle_bpobj.bpo_object != empty_bpobj) 3201 bpobj_count_refd(&dle->dle_bpobj); 3202 return (0); 3203 } 3204 3205 static int 3206 dsl_deadlist_entry_dump(void *arg, dsl_deadlist_entry_t *dle) 3207 { 3208 ASSERT0P(arg); 3209 if (dump_opt['d'] >= 5) { 3210 char buf[128]; 3211 (void) snprintf(buf, sizeof (buf), 3212 "mintxg %llu -> obj %llu", 3213 (longlong_t)dle->dle_mintxg, 3214 (longlong_t)dle->dle_bpobj.bpo_object); 3215 3216 dump_full_bpobj(&dle->dle_bpobj, buf, 0); 3217 } else { 3218 (void) printf("mintxg %llu -> obj %llu\n", 3219 (longlong_t)dle->dle_mintxg, 3220 (longlong_t)dle->dle_bpobj.bpo_object); 3221 } 3222 return (0); 3223 } 3224 3225 static void 3226 dump_blkptr_list(dsl_deadlist_t *dl, const char *name) 3227 { 3228 char bytes[32]; 3229 char comp[32]; 3230 char uncomp[32]; 3231 char entries[32]; 3232 spa_t *spa = dmu_objset_spa(dl->dl_os); 3233 uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj; 3234 3235 if (dl->dl_oldfmt) { 3236 if (dl->dl_bpobj.bpo_object != empty_bpobj) 3237 bpobj_count_refd(&dl->dl_bpobj); 3238 } else { 3239 mos_obj_refd(dl->dl_object); 3240 dsl_deadlist_iterate(dl, dsl_deadlist_entry_count_refd, spa); 3241 } 3242 3243 /* make sure nicenum has enough space */ 3244 _Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated"); 3245 _Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated"); 3246 _Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated"); 3247 _Static_assert(sizeof (entries) >= NN_NUMBUF_SZ, "entries truncated"); 3248 3249 if (dump_opt['d'] < 3) 3250 return; 3251 3252 if (dl->dl_oldfmt) { 3253 dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0); 3254 return; 3255 } 3256 3257 zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes)); 3258 zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp)); 3259 zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp)); 3260 zdb_nicenum(avl_numnodes(&dl->dl_tree), entries, sizeof (entries)); 3261 (void) printf("\n %s: %s (%s/%s comp), %s entries\n", 3262 name, bytes, comp, uncomp, entries); 3263 3264 if (dump_opt['d'] < 4) 3265 return; 3266 3267 (void) putchar('\n'); 3268 3269 
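/*
 * At -dddd each deadlist entry (one per mintxg bucket) is listed;
 * dsl_deadlist_entry_dump() additionally expands the entry's full
 * bpobj once the verbosity reaches -ddddd.
 */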
dsl_deadlist_iterate(dl, dsl_deadlist_entry_dump, NULL); 3270 } 3271 3272 static int 3273 verify_dd_livelist(objset_t *os) 3274 { 3275 uint64_t ll_used, used, ll_comp, comp, ll_uncomp, uncomp; 3276 dsl_pool_t *dp = spa_get_dsl(os->os_spa); 3277 dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; 3278 3279 ASSERT(!dmu_objset_is_snapshot(os)); 3280 if (!dsl_deadlist_is_open(&dd->dd_livelist)) 3281 return (0); 3282 3283 /* Iterate through the livelist to check for duplicates */ 3284 dsl_deadlist_iterate(&dd->dd_livelist, sublivelist_verify_lightweight, 3285 NULL); 3286 3287 dsl_pool_config_enter(dp, FTAG); 3288 dsl_deadlist_space(&dd->dd_livelist, &ll_used, 3289 &ll_comp, &ll_uncomp); 3290 3291 dsl_dataset_t *origin_ds; 3292 ASSERT(dsl_pool_config_held(dp)); 3293 VERIFY0(dsl_dataset_hold_obj(dp, 3294 dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_ds)); 3295 VERIFY0(dsl_dataset_space_written(origin_ds, os->os_dsl_dataset, 3296 &used, &comp, &uncomp)); 3297 dsl_dataset_rele(origin_ds, FTAG); 3298 dsl_pool_config_exit(dp, FTAG); 3299 /* 3300 * It's possible that the dataset's uncomp space is larger than the 3301 * livelist's because livelists do not track embedded block pointers 3302 */ 3303 if (used != ll_used || comp != ll_comp || uncomp < ll_uncomp) { 3304 char nice_used[32], nice_comp[32], nice_uncomp[32]; 3305 (void) printf("Discrepancy in space accounting:\n"); 3306 zdb_nicenum(used, nice_used, sizeof (nice_used)); 3307 zdb_nicenum(comp, nice_comp, sizeof (nice_comp)); 3308 zdb_nicenum(uncomp, nice_uncomp, sizeof (nice_uncomp)); 3309 (void) printf("dir: used %s, comp %s, uncomp %s\n", 3310 nice_used, nice_comp, nice_uncomp); 3311 zdb_nicenum(ll_used, nice_used, sizeof (nice_used)); 3312 zdb_nicenum(ll_comp, nice_comp, sizeof (nice_comp)); 3313 zdb_nicenum(ll_uncomp, nice_uncomp, sizeof (nice_uncomp)); 3314 (void) printf("livelist: used %s, comp %s, uncomp %s\n", 3315 nice_used, nice_comp, nice_uncomp); 3316 return (1); 3317 } 3318 return (0); 3319 } 3320 3321 static char *key_material = NULL; 3322 3323 static boolean_t 3324 zdb_derive_key(dsl_dir_t *dd, uint8_t *key_out) 3325 { 3326 uint64_t keyformat, salt, iters; 3327 int i; 3328 unsigned char c; 3329 FILE *f; 3330 3331 VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, 3332 zfs_prop_to_name(ZFS_PROP_KEYFORMAT), sizeof (uint64_t), 3333 1, &keyformat)); 3334 3335 switch (keyformat) { 3336 case ZFS_KEYFORMAT_HEX: 3337 for (i = 0; i < WRAPPING_KEY_LEN * 2; i += 2) { 3338 if (!isxdigit(key_material[i]) || 3339 !isxdigit(key_material[i+1])) 3340 return (B_FALSE); 3341 if (sscanf(&key_material[i], "%02hhx", &c) != 1) 3342 return (B_FALSE); 3343 key_out[i / 2] = c; 3344 } 3345 break; 3346 3347 case ZFS_KEYFORMAT_PASSPHRASE: 3348 VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, 3349 dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 3350 sizeof (uint64_t), 1, &salt)); 3351 VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, 3352 dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 3353 sizeof (uint64_t), 1, &iters)); 3354 3355 if (PKCS5_PBKDF2_HMAC_SHA1(key_material, strlen(key_material), 3356 ((uint8_t *)&salt), sizeof (uint64_t), iters, 3357 WRAPPING_KEY_LEN, key_out) != 1) 3358 return (B_FALSE); 3359 3360 break; 3361 3362 case ZFS_KEYFORMAT_RAW: 3363 if ((f = fopen(key_material, "r")) == NULL) 3364 return (B_FALSE); 3365 3366 if (fread(key_out, 1, WRAPPING_KEY_LEN, f) != 3367 WRAPPING_KEY_LEN) { 3368 (void) fclose(f); 3369 return (B_FALSE); 3370 } 3371 3372 /* Check the key length */ 3373 if (fgetc(f) != EOF) { 3374 (void) 
fclose(f); 3375 return (B_FALSE); 3376 } 3377 3378 (void) fclose(f); 3379 break; 3380 3381 default: 3382 fatal("no support for key format %u\n", 3383 (unsigned int) keyformat); 3384 } 3385 3386 return (B_TRUE); 3387 } 3388 3389 static char encroot[ZFS_MAX_DATASET_NAME_LEN]; 3390 static boolean_t key_loaded = B_FALSE; 3391 3392 static void 3393 zdb_load_key(objset_t *os) 3394 { 3395 dsl_pool_t *dp; 3396 dsl_dir_t *dd, *rdd; 3397 uint8_t key[WRAPPING_KEY_LEN]; 3398 uint64_t rddobj; 3399 int err; 3400 3401 dp = spa_get_dsl(os->os_spa); 3402 dd = os->os_dsl_dataset->ds_dir; 3403 3404 dsl_pool_config_enter(dp, FTAG); 3405 VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, 3406 DSL_CRYPTO_KEY_ROOT_DDOBJ, sizeof (uint64_t), 1, &rddobj)); 3407 VERIFY0(dsl_dir_hold_obj(dd->dd_pool, rddobj, NULL, FTAG, &rdd)); 3408 dsl_dir_name(rdd, encroot); 3409 dsl_dir_rele(rdd, FTAG); 3410 3411 if (!zdb_derive_key(dd, key)) 3412 fatal("couldn't derive encryption key"); 3413 3414 dsl_pool_config_exit(dp, FTAG); 3415 3416 ASSERT3U(dsl_dataset_get_keystatus(dd), ==, ZFS_KEYSTATUS_UNAVAILABLE); 3417 3418 dsl_crypto_params_t *dcp; 3419 nvlist_t *crypto_args; 3420 3421 crypto_args = fnvlist_alloc(); 3422 fnvlist_add_uint8_array(crypto_args, "wkeydata", 3423 (uint8_t *)key, WRAPPING_KEY_LEN); 3424 VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, 3425 NULL, crypto_args, &dcp)); 3426 err = spa_keystore_load_wkey(encroot, dcp, B_FALSE); 3427 3428 dsl_crypto_params_free(dcp, (err != 0)); 3429 fnvlist_free(crypto_args); 3430 3431 if (err != 0) 3432 fatal( 3433 "couldn't load encryption key for %s: %s", 3434 encroot, err == ZFS_ERR_CRYPTO_NOTSUP ? 3435 "crypto params not supported" : strerror(err)); 3436 3437 ASSERT3U(dsl_dataset_get_keystatus(dd), ==, ZFS_KEYSTATUS_AVAILABLE); 3438 3439 printf("Unlocked encryption root: %s\n", encroot); 3440 key_loaded = B_TRUE; 3441 } 3442 3443 static void 3444 zdb_unload_key(void) 3445 { 3446 if (!key_loaded) 3447 return; 3448 3449 VERIFY0(spa_keystore_unload_wkey(encroot)); 3450 key_loaded = B_FALSE; 3451 } 3452 3453 static avl_tree_t idx_tree; 3454 static avl_tree_t domain_tree; 3455 static boolean_t fuid_table_loaded; 3456 static objset_t *sa_os = NULL; 3457 static sa_attr_type_t *sa_attr_table = NULL; 3458 3459 static int 3460 open_objset(const char *path, const void *tag, objset_t **osp) 3461 { 3462 int err; 3463 uint64_t sa_attrs = 0; 3464 uint64_t version = 0; 3465 3466 VERIFY0P(sa_os); 3467 3468 /* 3469 * We can't own an objset if it's redacted. Therefore, we do this 3470 * dance: hold the objset, then acquire a long hold on its dataset, then 3471 * release the pool (which is held as part of holding the objset). 3472 */ 3473 3474 if (dump_opt['K']) { 3475 /* decryption requested, try to load keys */ 3476 err = dmu_objset_hold(path, tag, osp); 3477 if (err != 0) { 3478 (void) fprintf(stderr, "failed to hold dataset " 3479 "'%s': %s\n", 3480 path, strerror(err)); 3481 return (err); 3482 } 3483 dsl_dataset_long_hold(dmu_objset_ds(*osp), tag); 3484 dsl_pool_rele(dmu_objset_pool(*osp), tag); 3485 3486 /* succeeds or dies */ 3487 zdb_load_key(*osp); 3488 3489 /* release it all */ 3490 dsl_dataset_long_rele(dmu_objset_ds(*osp), tag); 3491 dsl_dataset_rele(dmu_objset_ds(*osp), tag); 3492 } 3493 3494 int ds_hold_flags = key_loaded ? 
DS_HOLD_FLAG_DECRYPT : 0; 3495 3496 err = dmu_objset_hold_flags(path, ds_hold_flags, tag, osp); 3497 if (err != 0) { 3498 (void) fprintf(stderr, "failed to hold dataset '%s': %s\n", 3499 path, strerror(err)); 3500 return (err); 3501 } 3502 dsl_dataset_long_hold(dmu_objset_ds(*osp), tag); 3503 dsl_pool_rele(dmu_objset_pool(*osp), tag); 3504 3505 if (dmu_objset_type(*osp) == DMU_OST_ZFS && 3506 (key_loaded || !(*osp)->os_encrypted)) { 3507 (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR, 3508 8, 1, &version); 3509 if (version >= ZPL_VERSION_SA) { 3510 (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 3511 8, 1, &sa_attrs); 3512 } 3513 err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END, 3514 &sa_attr_table); 3515 if (err != 0) { 3516 (void) fprintf(stderr, "sa_setup failed: %s\n", 3517 strerror(err)); 3518 dsl_dataset_long_rele(dmu_objset_ds(*osp), tag); 3519 dsl_dataset_rele_flags(dmu_objset_ds(*osp), 3520 ds_hold_flags, tag); 3521 *osp = NULL; 3522 } 3523 } 3524 sa_os = *osp; 3525 3526 return (err); 3527 } 3528 3529 static void 3530 close_objset(objset_t *os, const void *tag) 3531 { 3532 VERIFY3P(os, ==, sa_os); 3533 if (os->os_sa != NULL) 3534 sa_tear_down(os); 3535 dsl_dataset_long_rele(dmu_objset_ds(os), tag); 3536 dsl_dataset_rele_flags(dmu_objset_ds(os), 3537 key_loaded ? DS_HOLD_FLAG_DECRYPT : 0, tag); 3538 sa_attr_table = NULL; 3539 sa_os = NULL; 3540 3541 zdb_unload_key(); 3542 } 3543 3544 static void 3545 fuid_table_destroy(void) 3546 { 3547 if (fuid_table_loaded) { 3548 zfs_fuid_table_destroy(&idx_tree, &domain_tree); 3549 fuid_table_loaded = B_FALSE; 3550 } 3551 } 3552 3553 /* 3554 * Clean up DDT internal state. ddt_lookup() adds entries to ddt_tree, which on 3555 * a live pool are normally cleaned up during ddt_sync(). We can't do that (and 3556 * wouldn't want to anyway), but if we don't clean up, the presence of stuff on 3557 * ddt_tree will trip asserts in ddt_table_free(). So, we clean up ourselves. 3558 * 3559 * Note that this is not a particularly efficient way to do this, but 3560 * ddt_remove() is the only public method that can do the work we need, and it 3561 * requires the right locks etc. to do the job. This is only ever called 3562 * during zdb shutdown so efficiency is not especially important. 3563 */ 3564 static void 3565 zdb_ddt_cleanup(spa_t *spa) 3566 { 3567 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 3568 ddt_t *ddt = spa->spa_ddt[c]; 3569 if (!ddt) 3570 continue; 3571 3572 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3573 ddt_enter(ddt); 3574 ddt_entry_t *dde = avl_first(&ddt->ddt_tree), *next; 3575 while (dde) { 3576 next = AVL_NEXT(&ddt->ddt_tree, dde); 3577 dde->dde_io = NULL; 3578 ddt_remove(ddt, dde); 3579 dde = next; 3580 } 3581 ddt_exit(ddt); 3582 spa_config_exit(spa, SCL_CONFIG, FTAG); 3583 } 3584 } 3585 3586 static void 3587 zdb_exit(int reason) 3588 { 3589 if (spa != NULL) 3590 zdb_ddt_cleanup(spa); 3591 3592 if (os != NULL) { 3593 close_objset(os, FTAG); 3594 } else if (spa != NULL) { 3595 spa_close(spa, FTAG); 3596 } 3597 3598 fuid_table_destroy(); 3599 3600 if (kernel_init_done) 3601 kernel_fini(); 3602 3603 exit(reason); 3604 } 3605 3606 /* 3607 * print uid or gid information. 3608 * For a normal POSIX id, just the id is printed in decimal format. 3609 * For CIFS files with FUID the fuid is printed in hex followed by 3610 * the domain-rid string.
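 *
 * For example, a FUID such as 0x10000001c (index 1, rid 28) would
 * print as "uid 10000001c [<domain>-28]", while a plain POSIX id
 * prints as "uid 1000" (values here are illustrative).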
3611 */ 3612 static void 3613 print_idstr(uint64_t id, const char *id_type) 3614 { 3615 if (FUID_INDEX(id)) { 3616 const char *domain = 3617 zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id)); 3618 (void) printf("\t%s %llx [%s-%d]\n", id_type, 3619 (u_longlong_t)id, domain, (int)FUID_RID(id)); 3620 } else { 3621 (void) printf("\t%s %llu\n", id_type, (u_longlong_t)id); 3622 } 3623 3624 } 3625 3626 static void 3627 dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid) 3628 { 3629 uint32_t uid_idx, gid_idx; 3630 3631 uid_idx = FUID_INDEX(uid); 3632 gid_idx = FUID_INDEX(gid); 3633 3634 /* Load domain table, if not already loaded */ 3635 if (!fuid_table_loaded && (uid_idx || gid_idx)) { 3636 uint64_t fuid_obj; 3637 3638 /* first find the fuid object. It lives in the master node */ 3639 VERIFY0(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 3640 8, 1, &fuid_obj)); 3641 zfs_fuid_avl_tree_create(&idx_tree, &domain_tree); 3642 (void) zfs_fuid_table_load(os, fuid_obj, 3643 &idx_tree, &domain_tree); 3644 fuid_table_loaded = B_TRUE; 3645 } 3646 3647 print_idstr(uid, "uid"); 3648 print_idstr(gid, "gid"); 3649 } 3650 3651 static void 3652 dump_znode_sa_xattr(sa_handle_t *hdl) 3653 { 3654 nvlist_t *sa_xattr; 3655 nvpair_t *elem = NULL; 3656 int sa_xattr_size = 0; 3657 int sa_xattr_entries = 0; 3658 int error; 3659 char *sa_xattr_packed; 3660 3661 error = sa_size(hdl, sa_attr_table[ZPL_DXATTR], &sa_xattr_size); 3662 if (error || sa_xattr_size == 0) 3663 return; 3664 3665 sa_xattr_packed = malloc(sa_xattr_size); 3666 if (sa_xattr_packed == NULL) 3667 return; 3668 3669 error = sa_lookup(hdl, sa_attr_table[ZPL_DXATTR], 3670 sa_xattr_packed, sa_xattr_size); 3671 if (error) { 3672 free(sa_xattr_packed); 3673 return; 3674 } 3675 3676 error = nvlist_unpack(sa_xattr_packed, sa_xattr_size, &sa_xattr, 0); 3677 if (error) { 3678 free(sa_xattr_packed); 3679 return; 3680 } 3681 3682 while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) 3683 sa_xattr_entries++; 3684 3685 (void) printf("\tSA xattrs: %d bytes, %d entries\n\n", 3686 sa_xattr_size, sa_xattr_entries); 3687 while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) { 3688 boolean_t can_print = !dump_opt['P']; 3689 uchar_t *value; 3690 uint_t cnt, idx; 3691 3692 (void) printf("\t\t%s = ", nvpair_name(elem)); 3693 nvpair_value_byte_array(elem, &value, &cnt); 3694 3695 for (idx = 0; idx < cnt; ++idx) { 3696 if (!isprint(value[idx])) { 3697 can_print = B_FALSE; 3698 break; 3699 } 3700 } 3701 3702 for (idx = 0; idx < cnt; ++idx) { 3703 if (can_print) 3704 (void) putchar(value[idx]); 3705 else 3706 (void) printf("\\%3.3o", value[idx]); 3707 } 3708 (void) putchar('\n'); 3709 } 3710 3711 nvlist_free(sa_xattr); 3712 free(sa_xattr_packed); 3713 } 3714 3715 static void 3716 dump_znode_symlink(sa_handle_t *hdl) 3717 { 3718 int sa_symlink_size = 0; 3719 char linktarget[MAXPATHLEN]; 3720 int error; 3721 3722 error = sa_size(hdl, sa_attr_table[ZPL_SYMLINK], &sa_symlink_size); 3723 if (error || sa_symlink_size == 0) { 3724 return; 3725 } 3726 if (sa_symlink_size >= sizeof (linktarget)) { 3727 (void) printf("symlink size %d is too large\n", 3728 sa_symlink_size); 3729 return; 3730 } 3731 linktarget[sa_symlink_size] = '\0'; 3732 if (sa_lookup(hdl, sa_attr_table[ZPL_SYMLINK], 3733 &linktarget, sa_symlink_size) == 0) 3734 (void) printf("\ttarget %s\n", linktarget); 3735 } 3736 3737 static void 3738 dump_znode(objset_t *os, uint64_t object, void *data, size_t size) 3739 { 3740 (void) data, (void) size; 3741 char path[MAXPATHLEN * 2]; /* allow for xattr and failure prefix */ 
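/*
 * The znode's standard SA attributes are gathered below with a
 * single sa_bulk_lookup() call; bulk[] needs one slot for each
 * attribute added via SA_ADD_BULK_ATTR().
 */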
3742 sa_handle_t *hdl; 3743 uint64_t xattr, rdev, gen; 3744 uint64_t uid, gid, mode, fsize, parent, links; 3745 uint64_t pflags; 3746 uint64_t acctm[2], modtm[2], chgtm[2], crtm[2]; 3747 time_t z_crtime, z_atime, z_mtime, z_ctime; 3748 sa_bulk_attr_t bulk[12]; 3749 int idx = 0; 3750 int error; 3751 3752 VERIFY3P(os, ==, sa_os); 3753 if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) { 3754 (void) printf("Failed to get handle for SA znode\n"); 3755 return; 3756 } 3757 3758 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8); 3759 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8); 3760 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL, 3761 &links, 8); 3762 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8); 3763 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL, 3764 &mode, 8); 3765 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT], 3766 NULL, &parent, 8); 3767 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL, 3768 &fsize, 8); 3769 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL, 3770 acctm, 16); 3771 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL, 3772 modtm, 16); 3773 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL, 3774 crtm, 16); 3775 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL, 3776 chgtm, 16); 3777 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL, 3778 &pflags, 8); 3779 3780 if (sa_bulk_lookup(hdl, bulk, idx)) { 3781 (void) sa_handle_destroy(hdl); 3782 return; 3783 } 3784 3785 z_crtime = (time_t)crtm[0]; 3786 z_atime = (time_t)acctm[0]; 3787 z_mtime = (time_t)modtm[0]; 3788 z_ctime = (time_t)chgtm[0]; 3789 3790 if (dump_opt['d'] > 4) { 3791 error = zfs_obj_to_path(os, object, path, sizeof (path)); 3792 if (error == ESTALE) { 3793 (void) snprintf(path, sizeof (path), "on delete queue"); 3794 } else if (error != 0) { 3795 leaked_objects++; 3796 (void) snprintf(path, sizeof (path), 3797 "path not found, possibly leaked"); 3798 } 3799 (void) printf("\tpath %s\n", path); 3800 } 3801 3802 if (S_ISLNK(mode)) 3803 dump_znode_symlink(hdl); 3804 dump_uidgid(os, uid, gid); 3805 (void) printf("\tatime %s", ctime(&z_atime)); 3806 (void) printf("\tmtime %s", ctime(&z_mtime)); 3807 (void) printf("\tctime %s", ctime(&z_ctime)); 3808 (void) printf("\tcrtime %s", ctime(&z_crtime)); 3809 (void) printf("\tgen %llu\n", (u_longlong_t)gen); 3810 (void) printf("\tmode %llo\n", (u_longlong_t)mode); 3811 (void) printf("\tsize %llu\n", (u_longlong_t)fsize); 3812 (void) printf("\tparent %llu\n", (u_longlong_t)parent); 3813 (void) printf("\tlinks %llu\n", (u_longlong_t)links); 3814 (void) printf("\tpflags %llx\n", (u_longlong_t)pflags); 3815 if (dmu_objset_projectquota_enabled(os) && (pflags & ZFS_PROJID)) { 3816 uint64_t projid; 3817 3818 if (sa_lookup(hdl, sa_attr_table[ZPL_PROJID], &projid, 3819 sizeof (uint64_t)) == 0) 3820 (void) printf("\tprojid %llu\n", (u_longlong_t)projid); 3821 } 3822 if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr, 3823 sizeof (uint64_t)) == 0) 3824 (void) printf("\txattr %llu\n", (u_longlong_t)xattr); 3825 if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev, 3826 sizeof (uint64_t)) == 0) 3827 (void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev); 3828 dump_znode_sa_xattr(hdl); 3829 sa_handle_destroy(hdl); 3830 } 3831 3832 static void 3833 dump_acl(objset_t *os, uint64_t object, void *data, size_t size) 3834 { 3835 (void) os, (void) object, (void) data, (void) size; 3836 } 3837 3838 static void 3839 dump_dmu_objset(objset_t *os, uint64_t 
object, void *data, size_t size) 3840 { 3841 (void) os, (void) object, (void) data, (void) size; 3842 } 3843 3844 static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = { 3845 dump_none, /* unallocated */ 3846 dump_zap, /* object directory */ 3847 dump_uint64, /* object array */ 3848 dump_none, /* packed nvlist */ 3849 dump_packed_nvlist, /* packed nvlist size */ 3850 dump_none, /* bpobj */ 3851 dump_bpobj, /* bpobj header */ 3852 dump_none, /* SPA space map header */ 3853 dump_none, /* SPA space map */ 3854 dump_none, /* ZIL intent log */ 3855 dump_dnode, /* DMU dnode */ 3856 dump_dmu_objset, /* DMU objset */ 3857 dump_dsl_dir, /* DSL directory */ 3858 dump_zap, /* DSL directory child map */ 3859 dump_zap, /* DSL dataset snap map */ 3860 dump_zap, /* DSL props */ 3861 dump_dsl_dataset, /* DSL dataset */ 3862 dump_znode, /* ZFS znode */ 3863 dump_acl, /* ZFS V0 ACL */ 3864 dump_uint8, /* ZFS plain file */ 3865 dump_zpldir, /* ZFS directory */ 3866 dump_zap, /* ZFS master node */ 3867 dump_zap, /* ZFS delete queue */ 3868 dump_uint8, /* zvol object */ 3869 dump_zap, /* zvol prop */ 3870 dump_uint8, /* other uint8[] */ 3871 dump_uint64, /* other uint64[] */ 3872 dump_zap, /* other ZAP */ 3873 dump_zap, /* persistent error log */ 3874 dump_uint8, /* SPA history */ 3875 dump_history_offsets, /* SPA history offsets */ 3876 dump_zap, /* Pool properties */ 3877 dump_zap, /* DSL permissions */ 3878 dump_acl, /* ZFS ACL */ 3879 dump_uint8, /* ZFS SYSACL */ 3880 dump_none, /* FUID nvlist */ 3881 dump_packed_nvlist, /* FUID nvlist size */ 3882 dump_zap, /* DSL dataset next clones */ 3883 dump_zap, /* DSL scrub queue */ 3884 dump_zap, /* ZFS user/group/project used */ 3885 dump_zap, /* ZFS user/group/project quota */ 3886 dump_zap, /* snapshot refcount tags */ 3887 dump_ddt_zap, /* DDT ZAP object */ 3888 dump_zap, /* DDT statistics */ 3889 dump_znode, /* SA object */ 3890 dump_zap, /* SA Master Node */ 3891 dump_sa_attrs, /* SA attribute registration */ 3892 dump_sa_layouts, /* SA attribute layouts */ 3893 dump_zap, /* DSL scrub translations */ 3894 dump_none, /* fake dedup BP */ 3895 dump_zap, /* deadlist */ 3896 dump_none, /* deadlist hdr */ 3897 dump_zap, /* dsl clones */ 3898 dump_bpobj_subobjs, /* bpobj subobjs */ 3899 dump_unknown, /* Unknown type, must be last */ 3900 }; 3901 3902 static boolean_t 3903 match_object_type(dmu_object_type_t obj_type, uint64_t flags) 3904 { 3905 boolean_t match = B_TRUE; 3906 3907 switch (obj_type) { 3908 case DMU_OT_DIRECTORY_CONTENTS: 3909 if (!(flags & ZOR_FLAG_DIRECTORY)) 3910 match = B_FALSE; 3911 break; 3912 case DMU_OT_PLAIN_FILE_CONTENTS: 3913 if (!(flags & ZOR_FLAG_PLAIN_FILE)) 3914 match = B_FALSE; 3915 break; 3916 case DMU_OT_SPACE_MAP: 3917 if (!(flags & ZOR_FLAG_SPACE_MAP)) 3918 match = B_FALSE; 3919 break; 3920 default: 3921 if (strcmp(zdb_ot_name(obj_type), "zap") == 0) { 3922 if (!(flags & ZOR_FLAG_ZAP)) 3923 match = B_FALSE; 3924 break; 3925 } 3926 3927 /* 3928 * If all bits except some of the supported flags are 3929 * set, the user combined the all-types flag (A) with 3930 * a negated flag to exclude some types (e.g. A-f to 3931 * show all object types except plain files). 
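		 *
		 * As a concrete illustration (a hypothetical invocation;
		 * the flag letters are whatever main() registered in
		 * flagbits[]): a range spec ending in ":A-f" first sets
		 * ZOR_FLAG_ALL_TYPES and then clears ZOR_FLAG_PLAIN_FILE,
		 * leaving a mask where only supported flag bits are
		 * missing, which the test below detects.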
3932 */ 3933 if ((flags | ZOR_SUPPORTED_FLAGS) != ZOR_FLAG_ALL_TYPES) 3934 match = B_FALSE; 3935 3936 break; 3937 } 3938 3939 return (match); 3940 } 3941 3942 static void 3943 dump_object(objset_t *os, uint64_t object, int verbosity, 3944 boolean_t *print_header, uint64_t *dnode_slots_used, uint64_t flags) 3945 { 3946 dmu_buf_t *db = NULL; 3947 dmu_object_info_t doi; 3948 dnode_t *dn; 3949 boolean_t dnode_held = B_FALSE; 3950 void *bonus = NULL; 3951 size_t bsize = 0; 3952 char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32]; 3953 char bonus_size[32]; 3954 char aux[50]; 3955 int error; 3956 3957 /* make sure nicenum has enough space */ 3958 _Static_assert(sizeof (iblk) >= NN_NUMBUF_SZ, "iblk truncated"); 3959 _Static_assert(sizeof (dblk) >= NN_NUMBUF_SZ, "dblk truncated"); 3960 _Static_assert(sizeof (lsize) >= NN_NUMBUF_SZ, "lsize truncated"); 3961 _Static_assert(sizeof (asize) >= NN_NUMBUF_SZ, "asize truncated"); 3962 _Static_assert(sizeof (bonus_size) >= NN_NUMBUF_SZ, 3963 "bonus_size truncated"); 3964 3965 if (*print_header) { 3966 (void) printf("\n%10s %3s %5s %5s %5s %6s %5s %6s %s\n", 3967 "Object", "lvl", "iblk", "dblk", "dsize", "dnsize", 3968 "lsize", "%full", "type"); 3969 *print_header = 0; 3970 } 3971 3972 if (object == 0) { 3973 dn = DMU_META_DNODE(os); 3974 dmu_object_info_from_dnode(dn, &doi); 3975 } else { 3976 /* 3977 * Encrypted datasets will have sensitive bonus buffers 3978 * encrypted. Therefore we cannot hold the bonus buffer and 3979 * must hold the dnode itself instead. 3980 */ 3981 error = dmu_object_info(os, object, &doi); 3982 if (error) 3983 fatal("dmu_object_info() failed, errno %u", error); 3984 3985 if (!key_loaded && os->os_encrypted && 3986 DMU_OT_IS_ENCRYPTED(doi.doi_bonus_type)) { 3987 error = dnode_hold(os, object, FTAG, &dn); 3988 if (error) 3989 fatal("dnode_hold() failed, errno %u", error); 3990 dnode_held = B_TRUE; 3991 } else { 3992 error = dmu_bonus_hold(os, object, FTAG, &db); 3993 if (error) 3994 fatal("dmu_bonus_hold(%llu) failed, errno %u", 3995 object, error); 3996 bonus = db->db_data; 3997 bsize = db->db_size; 3998 dn = DB_DNODE((dmu_buf_impl_t *)db); 3999 } 4000 } 4001 4002 /* 4003 * Default to showing all object types if no flags were specified. 4004 */ 4005 if (flags != 0 && flags != ZOR_FLAG_ALL_TYPES && 4006 !match_object_type(doi.doi_type, flags)) 4007 goto out; 4008 4009 if (dnode_slots_used) 4010 *dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE; 4011 4012 zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk)); 4013 zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk)); 4014 zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize)); 4015 zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize)); 4016 zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size)); 4017 zdb_nicenum(doi.doi_dnodesize, dnsize, sizeof (dnsize)); 4018 (void) snprintf(fill, sizeof (fill), "%6.2f", 100.0 * 4019 doi.doi_fill_count * doi.doi_data_block_size / (object == 0 ? 
4020 DNODES_PER_BLOCK : 1) / doi.doi_max_offset); 4021 4022 aux[0] = '\0'; 4023 4024 if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) { 4025 (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), 4026 " (K=%s)", ZDB_CHECKSUM_NAME(doi.doi_checksum)); 4027 } 4028 4029 if (doi.doi_compress == ZIO_COMPRESS_INHERIT && 4030 ZIO_COMPRESS_HASLEVEL(os->os_compress) && verbosity >= 6) { 4031 const char *compname = NULL; 4032 if (zfs_prop_index_to_string(ZFS_PROP_COMPRESSION, 4033 ZIO_COMPRESS_RAW(os->os_compress, os->os_complevel), 4034 &compname) == 0) { 4035 (void) snprintf(aux + strlen(aux), 4036 sizeof (aux) - strlen(aux), " (Z=inherit=%s)", 4037 compname); 4038 } else { 4039 (void) snprintf(aux + strlen(aux), 4040 sizeof (aux) - strlen(aux), 4041 " (Z=inherit=%s-unknown)", 4042 ZDB_COMPRESS_NAME(os->os_compress)); 4043 } 4044 } else if (doi.doi_compress == ZIO_COMPRESS_INHERIT && verbosity >= 6) { 4045 (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), 4046 " (Z=inherit=%s)", ZDB_COMPRESS_NAME(os->os_compress)); 4047 } else if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) { 4048 (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), 4049 " (Z=%s)", ZDB_COMPRESS_NAME(doi.doi_compress)); 4050 } 4051 4052 (void) printf("%10lld %3u %5s %5s %5s %6s %5s %6s %s%s\n", 4053 (u_longlong_t)object, doi.doi_indirection, iblk, dblk, 4054 asize, dnsize, lsize, fill, zdb_ot_name(doi.doi_type), aux); 4055 4056 if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) { 4057 (void) printf("%10s %3s %5s %5s %5s %5s %5s %6s %s\n", 4058 "", "", "", "", "", "", bonus_size, "bonus", 4059 zdb_ot_name(doi.doi_bonus_type)); 4060 } 4061 4062 if (verbosity >= 4) { 4063 (void) printf("\tdnode flags: %s%s%s%s\n", 4064 (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ? 4065 "USED_BYTES " : "", 4066 (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ? 4067 "USERUSED_ACCOUNTED " : "", 4068 (dn->dn_phys->dn_flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) ? 4069 "USEROBJUSED_ACCOUNTED " : "", 4070 (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? 4071 "SPILL_BLKPTR" : ""); 4072 (void) printf("\tdnode maxblkid: %llu\n", 4073 (longlong_t)dn->dn_phys->dn_maxblkid); 4074 4075 if (!dnode_held) { 4076 object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, 4077 object, bonus, bsize); 4078 } else { 4079 (void) printf("\t\t(bonus encrypted)\n"); 4080 } 4081 4082 if (key_loaded || 4083 (!os->os_encrypted || !DMU_OT_IS_ENCRYPTED(doi.doi_type))) { 4084 object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, 4085 NULL, 0); 4086 } else { 4087 (void) printf("\t\t(object encrypted)\n"); 4088 } 4089 4090 *print_header = B_TRUE; 4091 } 4092 4093 if (verbosity >= 5) { 4094 if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { 4095 char blkbuf[BP_SPRINTF_LEN]; 4096 snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), 4097 DN_SPILL_BLKPTR(dn->dn_phys), B_FALSE); 4098 (void) printf("\nSpill block: %s\n", blkbuf); 4099 } 4100 dump_indirect(dn); 4101 } 4102 4103 if (verbosity >= 5) { 4104 /* 4105 * Report the list of segments that comprise the object. 
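		 *
		 * A segment is a maximal run of allocated blocks: the first
		 * dnode_next_offset() call below finds where a run starts,
		 * and the DNODE_FIND_HOLE call finds where it ends. Each
		 * segment prints as, e.g.:
		 *
		 *	segment [0000000000000000, 0000000000020000) size  128K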
4106 */ 4107 uint64_t start = 0; 4108 uint64_t end; 4109 uint64_t blkfill = 1; 4110 int minlvl = 1; 4111 4112 if (dn->dn_type == DMU_OT_DNODE) { 4113 minlvl = 0; 4114 blkfill = DNODES_PER_BLOCK; 4115 } 4116 4117 for (;;) { 4118 char segsize[32]; 4119 /* make sure nicenum has enough space */ 4120 _Static_assert(sizeof (segsize) >= NN_NUMBUF_SZ, 4121 "segsize truncated"); 4122 error = dnode_next_offset(dn, 4123 0, &start, minlvl, blkfill, 0); 4124 if (error) 4125 break; 4126 end = start; 4127 error = dnode_next_offset(dn, 4128 DNODE_FIND_HOLE, &end, minlvl, blkfill, 0); 4129 zdb_nicenum(end - start, segsize, sizeof (segsize)); 4130 (void) printf("\t\tsegment [%016llx, %016llx)" 4131 " size %5s\n", (u_longlong_t)start, 4132 (u_longlong_t)end, segsize); 4133 if (error) 4134 break; 4135 start = end; 4136 } 4137 } 4138 4139 out: 4140 if (db != NULL) 4141 dmu_buf_rele(db, FTAG); 4142 if (dnode_held) 4143 dnode_rele(dn, FTAG); 4144 } 4145 4146 static void 4147 count_dir_mos_objects(dsl_dir_t *dd) 4148 { 4149 mos_obj_refd(dd->dd_object); 4150 mos_obj_refd(dsl_dir_phys(dd)->dd_child_dir_zapobj); 4151 mos_obj_refd(dsl_dir_phys(dd)->dd_deleg_zapobj); 4152 mos_obj_refd(dsl_dir_phys(dd)->dd_props_zapobj); 4153 mos_obj_refd(dsl_dir_phys(dd)->dd_clones); 4154 4155 /* 4156 * The dd_crypto_obj can be referenced by multiple dsl_dir's. 4157 * Ignore the references after the first one. 4158 */ 4159 mos_obj_refd_multiple(dd->dd_crypto_obj); 4160 } 4161 4162 static void 4163 count_ds_mos_objects(dsl_dataset_t *ds) 4164 { 4165 mos_obj_refd(ds->ds_object); 4166 mos_obj_refd(dsl_dataset_phys(ds)->ds_next_clones_obj); 4167 mos_obj_refd(dsl_dataset_phys(ds)->ds_props_obj); 4168 mos_obj_refd(dsl_dataset_phys(ds)->ds_userrefs_obj); 4169 mos_obj_refd(dsl_dataset_phys(ds)->ds_snapnames_zapobj); 4170 mos_obj_refd(ds->ds_bookmarks_obj); 4171 4172 if (!dsl_dataset_is_snapshot(ds)) { 4173 count_dir_mos_objects(ds->ds_dir); 4174 } 4175 } 4176 4177 static const char *const objset_types[DMU_OST_NUMTYPES] = { 4178 "NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" }; 4179 4180 /* 4181 * Parse a string denoting a range of object IDs of the form 4182 * <start>[:<end>[:flags]], and store the results in zor. 4183 * Return 0 on success. On error, return 1 and update the msg 4184 * pointer to point to a descriptive error message. 
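 *
 * A sketch of accepted forms (the flag letters are those installed in
 * the flagbits[] table elsewhere, so the letters shown here are only
 * illustrative):
 *
 *	1024		a single object ID
 *	1024:2048	an inclusive range of object IDs
 *	1024:2048:A-f	the same range, all types except plain files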
4185 */ 4186 static int 4187 parse_object_range(char *range, zopt_object_range_t *zor, const char **msg) 4188 { 4189 uint64_t flags = 0; 4190 char *p, *s, *dup, *flagstr, *tmp = NULL; 4191 size_t len; 4192 int i; 4193 int rc = 0; 4194 4195 if (strchr(range, ':') == NULL) { 4196 zor->zor_obj_start = strtoull(range, &p, 0); 4197 if (*p != '\0') { 4198 *msg = "Invalid characters in object ID"; 4199 rc = 1; 4200 } 4201 zor->zor_obj_start = ZDB_MAP_OBJECT_ID(zor->zor_obj_start); 4202 zor->zor_obj_end = zor->zor_obj_start; 4203 return (rc); 4204 } 4205 4206 if (strchr(range, ':') == range) { 4207 *msg = "Invalid leading colon"; 4208 rc = 1; 4209 return (rc); 4210 } 4211 4212 len = strlen(range); 4213 if (range[len - 1] == ':') { 4214 *msg = "Invalid trailing colon"; 4215 rc = 1; 4216 return (rc); 4217 } 4218 4219 dup = strdup(range); 4220 s = strtok_r(dup, ":", &tmp); 4221 zor->zor_obj_start = strtoull(s, &p, 0); 4222 4223 if (*p != '\0') { 4224 *msg = "Invalid characters in start object ID"; 4225 rc = 1; 4226 goto out; 4227 } 4228 4229 s = strtok_r(NULL, ":", &tmp); 4230 zor->zor_obj_end = strtoull(s, &p, 0); 4231 4232 if (*p != '\0') { 4233 *msg = "Invalid characters in end object ID"; 4234 rc = 1; 4235 goto out; 4236 } 4237 4238 if (zor->zor_obj_start > zor->zor_obj_end) { 4239 *msg = "Start object ID may not exceed end object ID"; 4240 rc = 1; 4241 goto out; 4242 } 4243 4244 s = strtok_r(NULL, ":", &tmp); 4245 if (s == NULL) { 4246 zor->zor_flags = ZOR_FLAG_ALL_TYPES; 4247 goto out; 4248 } else if (strtok_r(NULL, ":", &tmp) != NULL) { 4249 *msg = "Invalid colon-delimited field after flags"; 4250 rc = 1; 4251 goto out; 4252 } 4253 4254 flagstr = s; 4255 for (i = 0; flagstr[i]; i++) { 4256 int bit; 4257 boolean_t negation = (flagstr[i] == '-'); 4258 4259 if (negation) { 4260 i++; 4261 if (flagstr[i] == '\0') { 4262 *msg = "Invalid trailing negation operator"; 4263 rc = 1; 4264 goto out; 4265 } 4266 } 4267 bit = flagbits[(uchar_t)flagstr[i]]; 4268 if (bit == 0) { 4269 *msg = "Invalid flag"; 4270 rc = 1; 4271 goto out; 4272 } 4273 if (negation) 4274 flags &= ~bit; 4275 else 4276 flags |= bit; 4277 } 4278 zor->zor_flags = flags; 4279 4280 zor->zor_obj_start = ZDB_MAP_OBJECT_ID(zor->zor_obj_start); 4281 zor->zor_obj_end = ZDB_MAP_OBJECT_ID(zor->zor_obj_end); 4282 4283 out: 4284 free(dup); 4285 return (rc); 4286 } 4287 4288 static void 4289 dump_objset(objset_t *os) 4290 { 4291 dmu_objset_stats_t dds = { 0 }; 4292 uint64_t object, object_count; 4293 uint64_t refdbytes, usedobjs, scratch; 4294 char numbuf[32]; 4295 char blkbuf[BP_SPRINTF_LEN + 20]; 4296 char osname[ZFS_MAX_DATASET_NAME_LEN]; 4297 const char *type = "UNKNOWN"; 4298 int verbosity = dump_opt['d']; 4299 boolean_t print_header; 4300 unsigned i; 4301 int error; 4302 uint64_t total_slots_used = 0; 4303 uint64_t max_slot_used = 0; 4304 uint64_t dnode_slots; 4305 uint64_t obj_start; 4306 uint64_t obj_end; 4307 uint64_t flags; 4308 4309 /* make sure nicenum has enough space */ 4310 _Static_assert(sizeof (numbuf) >= NN_NUMBUF_SZ, "numbuf truncated"); 4311 4312 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 4313 dmu_objset_fast_stat(os, &dds); 4314 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 4315 4316 print_header = B_TRUE; 4317 4318 if (dds.dds_type < DMU_OST_NUMTYPES) 4319 type = objset_types[dds.dds_type]; 4320 4321 if (dds.dds_type == DMU_OST_META) { 4322 dds.dds_creation_txg = TXG_INITIAL; 4323 usedobjs = BP_GET_FILL(os->os_rootbp); 4324 refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)-> 4325 dd_used_bytes; 4326 } 
else { 4327 dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch); 4328 } 4329 4330 ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp)); 4331 4332 zdb_nicenum(refdbytes, numbuf, sizeof (numbuf)); 4333 4334 if (verbosity >= 4) { 4335 (void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp "); 4336 (void) snprintf_blkptr(blkbuf + strlen(blkbuf), 4337 sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp); 4338 } else { 4339 blkbuf[0] = '\0'; 4340 } 4341 4342 dmu_objset_name(os, osname); 4343 4344 (void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, " 4345 "%s, %llu objects%s%s\n", 4346 osname, type, (u_longlong_t)dmu_objset_id(os), 4347 (u_longlong_t)dds.dds_creation_txg, 4348 numbuf, (u_longlong_t)usedobjs, blkbuf, 4349 (dds.dds_inconsistent) ? " (inconsistent)" : ""); 4350 4351 for (i = 0; i < zopt_object_args; i++) { 4352 obj_start = zopt_object_ranges[i].zor_obj_start; 4353 obj_end = zopt_object_ranges[i].zor_obj_end; 4354 flags = zopt_object_ranges[i].zor_flags; 4355 4356 object = obj_start; 4357 if (object == 0 || obj_start == obj_end) 4358 dump_object(os, object, verbosity, &print_header, NULL, 4359 flags); 4360 else 4361 object--; 4362 4363 while ((dmu_object_next(os, &object, B_FALSE, 0) == 0) && 4364 object <= obj_end) { 4365 dump_object(os, object, verbosity, &print_header, NULL, 4366 flags); 4367 } 4368 } 4369 4370 if (zopt_object_args > 0) { 4371 (void) printf("\n"); 4372 return; 4373 } 4374 4375 if (dump_opt['i'] != 0 || verbosity >= 2) 4376 dump_intent_log(dmu_objset_zil(os)); 4377 4378 if (dmu_objset_ds(os) != NULL) { 4379 dsl_dataset_t *ds = dmu_objset_ds(os); 4380 dump_blkptr_list(&ds->ds_deadlist, "Deadlist"); 4381 if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && 4382 !dmu_objset_is_snapshot(os)) { 4383 dump_blkptr_list(&ds->ds_dir->dd_livelist, "Livelist"); 4384 if (verify_dd_livelist(os) != 0) 4385 fatal("livelist is incorrect"); 4386 } 4387 4388 if (dsl_dataset_remap_deadlist_exists(ds)) { 4389 (void) printf("ds_remap_deadlist:\n"); 4390 dump_blkptr_list(&ds->ds_remap_deadlist, "Deadlist"); 4391 } 4392 count_ds_mos_objects(ds); 4393 } 4394 4395 if (dmu_objset_ds(os) != NULL) 4396 dump_bookmarks(os, verbosity); 4397 4398 if (verbosity < 2) 4399 return; 4400 4401 if (BP_IS_HOLE(os->os_rootbp)) 4402 return; 4403 4404 dump_object(os, 0, verbosity, &print_header, NULL, 0); 4405 object_count = 0; 4406 if (DMU_USERUSED_DNODE(os) != NULL && 4407 DMU_USERUSED_DNODE(os)->dn_type != 0) { 4408 dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header, 4409 NULL, 0); 4410 dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header, 4411 NULL, 0); 4412 } 4413 4414 if (DMU_PROJECTUSED_DNODE(os) != NULL && 4415 DMU_PROJECTUSED_DNODE(os)->dn_type != 0) 4416 dump_object(os, DMU_PROJECTUSED_OBJECT, verbosity, 4417 &print_header, NULL, 0); 4418 4419 object = 0; 4420 while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { 4421 dump_object(os, object, verbosity, &print_header, &dnode_slots, 4422 0); 4423 object_count++; 4424 total_slots_used += dnode_slots; 4425 max_slot_used = object + dnode_slots - 1; 4426 } 4427 4428 (void) printf("\n"); 4429 4430 (void) printf(" Dnode slots:\n"); 4431 (void) printf("\tTotal used: %10llu\n", 4432 (u_longlong_t)total_slots_used); 4433 (void) printf("\tMax used: %10llu\n", 4434 (u_longlong_t)max_slot_used); 4435 (void) printf("\tPercent empty: %10lf\n", 4436 (double)(max_slot_used - total_slots_used)*100 / 4437 (double)max_slot_used); 4438 (void) printf("\n"); 4439 4440 if (error != ESRCH) { 4441 (void) fprintf(stderr, 
"dmu_object_next() = %d\n", error); 4442 abort(); 4443 } 4444 4445 ASSERT3U(object_count, ==, usedobjs); 4446 4447 if (leaked_objects != 0) { 4448 (void) printf("%d potentially leaked objects detected\n", 4449 leaked_objects); 4450 leaked_objects = 0; 4451 } 4452 } 4453 4454 static void 4455 dump_uberblock(uberblock_t *ub, const char *header, const char *footer) 4456 { 4457 time_t timestamp = ub->ub_timestamp; 4458 4459 (void) printf("%s", header ? header : ""); 4460 (void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic); 4461 (void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version); 4462 (void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg); 4463 (void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum); 4464 (void) printf("\ttimestamp = %llu UTC = %s", 4465 (u_longlong_t)ub->ub_timestamp, ctime(×tamp)); 4466 4467 char blkbuf[BP_SPRINTF_LEN]; 4468 snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp); 4469 (void) printf("\tbp = %s\n", blkbuf); 4470 4471 (void) printf("\tmmp_magic = %016llx\n", 4472 (u_longlong_t)ub->ub_mmp_magic); 4473 if (MMP_VALID(ub)) { 4474 (void) printf("\tmmp_delay = %0llu\n", 4475 (u_longlong_t)ub->ub_mmp_delay); 4476 if (MMP_SEQ_VALID(ub)) 4477 (void) printf("\tmmp_seq = %u\n", 4478 (unsigned int) MMP_SEQ(ub)); 4479 if (MMP_FAIL_INT_VALID(ub)) 4480 (void) printf("\tmmp_fail = %u\n", 4481 (unsigned int) MMP_FAIL_INT(ub)); 4482 if (MMP_INTERVAL_VALID(ub)) 4483 (void) printf("\tmmp_write = %u\n", 4484 (unsigned int) MMP_INTERVAL(ub)); 4485 /* After MMP_* to make summarize_uberblock_mmp cleaner */ 4486 (void) printf("\tmmp_valid = %x\n", 4487 (unsigned int) ub->ub_mmp_config & 0xFF); 4488 } 4489 4490 if (dump_opt['u'] >= 4) { 4491 char blkbuf[BP_SPRINTF_LEN]; 4492 snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp); 4493 (void) printf("\trootbp = %s\n", blkbuf); 4494 } 4495 (void) printf("\tcheckpoint_txg = %llu\n", 4496 (u_longlong_t)ub->ub_checkpoint_txg); 4497 4498 (void) printf("\traidz_reflow state=%u off=%llu\n", 4499 (int)RRSS_GET_STATE(ub), 4500 (u_longlong_t)RRSS_GET_OFFSET(ub)); 4501 4502 (void) printf("%s", footer ? 
footer : ""); 4503 } 4504 4505 static void 4506 dump_config(spa_t *spa) 4507 { 4508 dmu_buf_t *db; 4509 size_t nvsize = 0; 4510 int error = 0; 4511 4512 4513 error = dmu_bonus_hold(spa->spa_meta_objset, 4514 spa->spa_config_object, FTAG, &db); 4515 4516 if (error == 0) { 4517 nvsize = *(uint64_t *)db->db_data; 4518 dmu_buf_rele(db, FTAG); 4519 4520 (void) printf("\nMOS Configuration:\n"); 4521 dump_packed_nvlist(spa->spa_meta_objset, 4522 spa->spa_config_object, (void *)&nvsize, 1); 4523 } else { 4524 (void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d", 4525 (u_longlong_t)spa->spa_config_object, error); 4526 } 4527 } 4528 4529 static void 4530 dump_cachefile(const char *cachefile) 4531 { 4532 int fd; 4533 struct stat64 statbuf; 4534 char *buf; 4535 nvlist_t *config; 4536 4537 if ((fd = open64(cachefile, O_RDONLY)) < 0) { 4538 (void) printf("cannot open '%s': %s\n", cachefile, 4539 strerror(errno)); 4540 zdb_exit(1); 4541 } 4542 4543 if (fstat64(fd, &statbuf) != 0) { 4544 (void) printf("failed to stat '%s': %s\n", cachefile, 4545 strerror(errno)); 4546 zdb_exit(1); 4547 } 4548 4549 if ((buf = malloc(statbuf.st_size)) == NULL) { 4550 (void) fprintf(stderr, "failed to allocate %llu bytes\n", 4551 (u_longlong_t)statbuf.st_size); 4552 zdb_exit(1); 4553 } 4554 4555 if (read(fd, buf, statbuf.st_size) != statbuf.st_size) { 4556 (void) fprintf(stderr, "failed to read %llu bytes\n", 4557 (u_longlong_t)statbuf.st_size); 4558 zdb_exit(1); 4559 } 4560 4561 (void) close(fd); 4562 4563 if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) { 4564 (void) fprintf(stderr, "failed to unpack nvlist\n"); 4565 zdb_exit(1); 4566 } 4567 4568 free(buf); 4569 4570 dump_nvlist(config, 0); 4571 4572 nvlist_free(config); 4573 } 4574 4575 /* 4576 * ZFS label nvlist stats 4577 */ 4578 typedef struct zdb_nvl_stats { 4579 int zns_list_count; 4580 int zns_leaf_count; 4581 size_t zns_leaf_largest; 4582 size_t zns_leaf_total; 4583 nvlist_t *zns_string; 4584 nvlist_t *zns_uint64; 4585 nvlist_t *zns_boolean; 4586 } zdb_nvl_stats_t; 4587 4588 static void 4589 collect_nvlist_stats(nvlist_t *nvl, zdb_nvl_stats_t *stats) 4590 { 4591 nvlist_t *list, **array; 4592 nvpair_t *nvp = NULL; 4593 const char *name; 4594 uint_t i, items; 4595 4596 stats->zns_list_count++; 4597 4598 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 4599 name = nvpair_name(nvp); 4600 4601 switch (nvpair_type(nvp)) { 4602 case DATA_TYPE_STRING: 4603 fnvlist_add_string(stats->zns_string, name, 4604 fnvpair_value_string(nvp)); 4605 break; 4606 case DATA_TYPE_UINT64: 4607 fnvlist_add_uint64(stats->zns_uint64, name, 4608 fnvpair_value_uint64(nvp)); 4609 break; 4610 case DATA_TYPE_BOOLEAN: 4611 fnvlist_add_boolean(stats->zns_boolean, name); 4612 break; 4613 case DATA_TYPE_NVLIST: 4614 if (nvpair_value_nvlist(nvp, &list) == 0) 4615 collect_nvlist_stats(list, stats); 4616 break; 4617 case DATA_TYPE_NVLIST_ARRAY: 4618 if (nvpair_value_nvlist_array(nvp, &array, &items) != 0) 4619 break; 4620 4621 for (i = 0; i < items; i++) { 4622 collect_nvlist_stats(array[i], stats); 4623 4624 /* collect stats on leaf vdev */ 4625 if (strcmp(name, "children") == 0) { 4626 size_t size; 4627 4628 (void) nvlist_size(array[i], &size, 4629 NV_ENCODE_XDR); 4630 stats->zns_leaf_total += size; 4631 if (size > stats->zns_leaf_largest) 4632 stats->zns_leaf_largest = size; 4633 stats->zns_leaf_count++; 4634 } 4635 } 4636 break; 4637 default: 4638 (void) printf("skip type %d!\n", (int)nvpair_type(nvp)); 4639 } 4640 } 4641 } 4642 4643 static void 4644 dump_nvlist_stats(nvlist_t 
*nvl, size_t cap) 4645 { 4646 zdb_nvl_stats_t stats = { 0 }; 4647 size_t size, sum = 0, total; 4648 size_t noise; 4649 4650 /* requires nvlist with non-unique names for stat collection */ 4651 VERIFY0(nvlist_alloc(&stats.zns_string, 0, 0)); 4652 VERIFY0(nvlist_alloc(&stats.zns_uint64, 0, 0)); 4653 VERIFY0(nvlist_alloc(&stats.zns_boolean, 0, 0)); 4654 VERIFY0(nvlist_size(stats.zns_boolean, &noise, NV_ENCODE_XDR)); 4655 4656 (void) printf("\n\nZFS Label NVList Config Stats:\n"); 4657 4658 VERIFY0(nvlist_size(nvl, &total, NV_ENCODE_XDR)); 4659 (void) printf(" %d bytes used, %d bytes free (using %4.1f%%)\n\n", 4660 (int)total, (int)(cap - total), 100.0 * total / cap); 4661 4662 collect_nvlist_stats(nvl, &stats); 4663 4664 VERIFY0(nvlist_size(stats.zns_uint64, &size, NV_ENCODE_XDR)); 4665 size -= noise; 4666 sum += size; 4667 (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "integers:", 4668 (int)fnvlist_num_pairs(stats.zns_uint64), 4669 (int)size, 100.0 * size / total); 4670 4671 VERIFY0(nvlist_size(stats.zns_string, &size, NV_ENCODE_XDR)); 4672 size -= noise; 4673 sum += size; 4674 (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "strings:", 4675 (int)fnvlist_num_pairs(stats.zns_string), 4676 (int)size, 100.0 * size / total); 4677 4678 VERIFY0(nvlist_size(stats.zns_boolean, &size, NV_ENCODE_XDR)); 4679 size -= noise; 4680 sum += size; 4681 (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "booleans:", 4682 (int)fnvlist_num_pairs(stats.zns_boolean), 4683 (int)size, 100.0 * size / total); 4684 4685 size = total - sum; /* treat remainder as nvlist overhead */ 4686 (void) printf("%12s %4d %6d bytes (%5.2f%%)\n\n", "nvlists:", 4687 stats.zns_list_count, (int)size, 100.0 * size / total); 4688 4689 if (stats.zns_leaf_count > 0) { 4690 size_t average = stats.zns_leaf_total / stats.zns_leaf_count; 4691 4692 (void) printf("%12s %4d %6d bytes average\n", "leaf vdevs:", 4693 stats.zns_leaf_count, (int)average); 4694 (void) printf("%24d bytes largest\n", 4695 (int)stats.zns_leaf_largest); 4696 4697 if (dump_opt['l'] >= 3 && average > 0) 4698 (void) printf(" space for %d additional leaf vdevs\n", 4699 (int)((cap - total) / average)); 4700 } 4701 (void) printf("\n"); 4702 4703 nvlist_free(stats.zns_string); 4704 nvlist_free(stats.zns_uint64); 4705 nvlist_free(stats.zns_boolean); 4706 } 4707 4708 typedef struct cksum_record { 4709 zio_cksum_t cksum; 4710 boolean_t labels[VDEV_LABELS]; 4711 avl_node_t link; 4712 } cksum_record_t; 4713 4714 static int 4715 cksum_record_compare(const void *x1, const void *x2) 4716 { 4717 const cksum_record_t *l = (cksum_record_t *)x1; 4718 const cksum_record_t *r = (cksum_record_t *)x2; 4719 int arraysize = ARRAY_SIZE(l->cksum.zc_word); 4720 int difference = 0; 4721 4722 for (int i = 0; i < arraysize; i++) { 4723 difference = TREE_CMP(l->cksum.zc_word[i], r->cksum.zc_word[i]); 4724 if (difference) 4725 break; 4726 } 4727 4728 return (difference); 4729 } 4730 4731 static cksum_record_t * 4732 cksum_record_alloc(zio_cksum_t *cksum, int l) 4733 { 4734 cksum_record_t *rec; 4735 4736 rec = umem_zalloc(sizeof (*rec), UMEM_NOFAIL); 4737 rec->cksum = *cksum; 4738 rec->labels[l] = B_TRUE; 4739 4740 return (rec); 4741 } 4742 4743 static cksum_record_t * 4744 cksum_record_lookup(avl_tree_t *tree, zio_cksum_t *cksum) 4745 { 4746 cksum_record_t lookup = { .cksum = *cksum }; 4747 avl_index_t where; 4748 4749 return (avl_find(tree, &lookup, &where)); 4750 } 4751 4752 static cksum_record_t * 4753 cksum_record_insert(avl_tree_t *tree, zio_cksum_t *cksum, int l) 4754 { 4755 cksum_record_t *rec; 4756 
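	/*
	 * Look up the checksum first so that identical content seen in
	 * another label reuses the existing record; the per-label bits
	 * in rec->labels[] then accumulate every label carrying it.
	 */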
4757 rec = cksum_record_lookup(tree, cksum); 4758 if (rec) { 4759 rec->labels[l] = B_TRUE; 4760 } else { 4761 rec = cksum_record_alloc(cksum, l); 4762 avl_add(tree, rec); 4763 } 4764 4765 return (rec); 4766 } 4767 4768 static int 4769 first_label(cksum_record_t *rec) 4770 { 4771 for (int i = 0; i < VDEV_LABELS; i++) 4772 if (rec->labels[i]) 4773 return (i); 4774 4775 return (-1); 4776 } 4777 4778 static void 4779 print_label_numbers(const char *prefix, const cksum_record_t *rec) 4780 { 4781 fputs(prefix, stdout); 4782 for (int i = 0; i < VDEV_LABELS; i++) 4783 if (rec->labels[i] == B_TRUE) 4784 printf("%d ", i); 4785 putchar('\n'); 4786 } 4787 4788 #define MAX_UBERBLOCK_COUNT (VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT) 4789 4790 typedef struct zdb_label { 4791 vdev_label_t label; 4792 uint64_t label_offset; 4793 nvlist_t *config_nv; 4794 cksum_record_t *config; 4795 cksum_record_t *uberblocks[MAX_UBERBLOCK_COUNT]; 4796 boolean_t header_printed; 4797 boolean_t read_failed; 4798 boolean_t cksum_valid; 4799 } zdb_label_t; 4800 4801 static void 4802 print_label_header(zdb_label_t *label, int l) 4803 { 4804 4805 if (dump_opt['q']) 4806 return; 4807 4808 if (label->header_printed == B_TRUE) 4809 return; 4810 4811 (void) printf("------------------------------------\n"); 4812 (void) printf("LABEL %d %s\n", l, 4813 label->cksum_valid ? "" : "(Bad label cksum)"); 4814 (void) printf("------------------------------------\n"); 4815 4816 label->header_printed = B_TRUE; 4817 } 4818 4819 static void 4820 print_l2arc_header(void) 4821 { 4822 (void) printf("------------------------------------\n"); 4823 (void) printf("L2ARC device header\n"); 4824 (void) printf("------------------------------------\n"); 4825 } 4826 4827 static void 4828 print_l2arc_log_blocks(void) 4829 { 4830 (void) printf("------------------------------------\n"); 4831 (void) printf("L2ARC device log blocks\n"); 4832 (void) printf("------------------------------------\n"); 4833 } 4834 4835 static void 4836 dump_l2arc_log_entries(uint64_t log_entries, 4837 l2arc_log_ent_phys_t *le, uint64_t i) 4838 { 4839 for (int j = 0; j < log_entries; j++) { 4840 dva_t dva = le[j].le_dva; 4841 (void) printf("lb[%4llu]\tle[%4d]\tDVA asize: %llu, " 4842 "vdev: %llu, offset: %llu\n", 4843 (u_longlong_t)i, j + 1, 4844 (u_longlong_t)DVA_GET_ASIZE(&dva), 4845 (u_longlong_t)DVA_GET_VDEV(&dva), 4846 (u_longlong_t)DVA_GET_OFFSET(&dva)); 4847 (void) printf("|\t\t\t\tbirth: %llu\n", 4848 (u_longlong_t)le[j].le_birth); 4849 (void) printf("|\t\t\t\tlsize: %llu\n", 4850 (u_longlong_t)L2BLK_GET_LSIZE((&le[j])->le_prop)); 4851 (void) printf("|\t\t\t\tpsize: %llu\n", 4852 (u_longlong_t)L2BLK_GET_PSIZE((&le[j])->le_prop)); 4853 (void) printf("|\t\t\t\tcompr: %llu\n", 4854 (u_longlong_t)L2BLK_GET_COMPRESS((&le[j])->le_prop)); 4855 (void) printf("|\t\t\t\tcomplevel: %llu\n", 4856 (u_longlong_t)(&le[j])->le_complevel); 4857 (void) printf("|\t\t\t\ttype: %llu\n", 4858 (u_longlong_t)L2BLK_GET_TYPE((&le[j])->le_prop)); 4859 (void) printf("|\t\t\t\tprotected: %llu\n", 4860 (u_longlong_t)L2BLK_GET_PROTECTED((&le[j])->le_prop)); 4861 (void) printf("|\t\t\t\tprefetch: %llu\n", 4862 (u_longlong_t)L2BLK_GET_PREFETCH((&le[j])->le_prop)); 4863 (void) printf("|\t\t\t\taddress: %llu\n", 4864 (u_longlong_t)le[j].le_daddr); 4865 (void) printf("|\t\t\t\tARC state: %llu\n", 4866 (u_longlong_t)L2BLK_GET_STATE((&le[j])->le_prop)); 4867 (void) printf("|\n"); 4868 } 4869 (void) printf("\n"); 4870 } 4871 4872 static void 4873 dump_l2arc_log_blkptr(const l2arc_log_blkptr_t *lbps) 4874 { 4875 (void) 
printf("|\t\tdaddr: %llu\n", (u_longlong_t)lbps->lbp_daddr); 4876 (void) printf("|\t\tpayload_asize: %llu\n", 4877 (u_longlong_t)lbps->lbp_payload_asize); 4878 (void) printf("|\t\tpayload_start: %llu\n", 4879 (u_longlong_t)lbps->lbp_payload_start); 4880 (void) printf("|\t\tlsize: %llu\n", 4881 (u_longlong_t)L2BLK_GET_LSIZE(lbps->lbp_prop)); 4882 (void) printf("|\t\tasize: %llu\n", 4883 (u_longlong_t)L2BLK_GET_PSIZE(lbps->lbp_prop)); 4884 (void) printf("|\t\tcompralgo: %llu\n", 4885 (u_longlong_t)L2BLK_GET_COMPRESS(lbps->lbp_prop)); 4886 (void) printf("|\t\tcksumalgo: %llu\n", 4887 (u_longlong_t)L2BLK_GET_CHECKSUM(lbps->lbp_prop)); 4888 (void) printf("|\n\n"); 4889 } 4890 4891 static void 4892 dump_l2arc_log_blocks(int fd, const l2arc_dev_hdr_phys_t *l2dhdr, 4893 l2arc_dev_hdr_phys_t *rebuild) 4894 { 4895 l2arc_log_blk_phys_t this_lb; 4896 uint64_t asize; 4897 l2arc_log_blkptr_t lbps[2]; 4898 zio_cksum_t cksum; 4899 int failed = 0; 4900 l2arc_dev_t dev; 4901 4902 if (!dump_opt['q']) 4903 print_l2arc_log_blocks(); 4904 memcpy(lbps, l2dhdr->dh_start_lbps, sizeof (lbps)); 4905 4906 dev.l2ad_evict = l2dhdr->dh_evict; 4907 dev.l2ad_start = l2dhdr->dh_start; 4908 dev.l2ad_end = l2dhdr->dh_end; 4909 4910 if (l2dhdr->dh_start_lbps[0].lbp_daddr == 0) { 4911 /* no log blocks to read */ 4912 if (!dump_opt['q']) { 4913 (void) printf("No log blocks to read\n"); 4914 (void) printf("\n"); 4915 } 4916 return; 4917 } else { 4918 dev.l2ad_hand = lbps[0].lbp_daddr + 4919 L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); 4920 } 4921 4922 dev.l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST); 4923 4924 for (;;) { 4925 if (!l2arc_log_blkptr_valid(&dev, &lbps[0])) 4926 break; 4927 4928 /* L2BLK_GET_PSIZE returns aligned size for log blocks */ 4929 asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); 4930 if (pread64(fd, &this_lb, asize, lbps[0].lbp_daddr) != asize) { 4931 if (!dump_opt['q']) { 4932 (void) printf("Error while reading next log " 4933 "block\n\n"); 4934 } 4935 break; 4936 } 4937 4938 fletcher_4_native_varsize(&this_lb, asize, &cksum); 4939 if (!ZIO_CHECKSUM_EQUAL(cksum, lbps[0].lbp_cksum)) { 4940 failed++; 4941 if (!dump_opt['q']) { 4942 (void) printf("Invalid cksum\n"); 4943 dump_l2arc_log_blkptr(&lbps[0]); 4944 } 4945 break; 4946 } 4947 4948 switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) { 4949 case ZIO_COMPRESS_OFF: 4950 break; 4951 default: { 4952 abd_t *abd = abd_alloc_linear(asize, B_TRUE); 4953 abd_copy_from_buf_off(abd, &this_lb, 0, asize); 4954 abd_t dabd; 4955 abd_get_from_buf_struct(&dabd, &this_lb, 4956 sizeof (this_lb)); 4957 int err = zio_decompress_data(L2BLK_GET_COMPRESS( 4958 (&lbps[0])->lbp_prop), abd, &dabd, 4959 asize, sizeof (this_lb), NULL); 4960 abd_free(&dabd); 4961 abd_free(abd); 4962 if (err != 0) { 4963 (void) printf("L2ARC block decompression " 4964 "failed\n"); 4965 goto out; 4966 } 4967 break; 4968 } 4969 } 4970 4971 if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) 4972 byteswap_uint64_array(&this_lb, sizeof (this_lb)); 4973 if (this_lb.lb_magic != L2ARC_LOG_BLK_MAGIC) { 4974 if (!dump_opt['q']) 4975 (void) printf("Invalid log block magic\n\n"); 4976 break; 4977 } 4978 4979 rebuild->dh_lb_count++; 4980 rebuild->dh_lb_asize += asize; 4981 if (dump_opt['l'] > 1 && !dump_opt['q']) { 4982 (void) printf("lb[%4llu]\tmagic: %llu\n", 4983 (u_longlong_t)rebuild->dh_lb_count, 4984 (u_longlong_t)this_lb.lb_magic); 4985 dump_l2arc_log_blkptr(&lbps[0]); 4986 } 4987 4988 if (dump_opt['l'] > 2 && !dump_opt['q']) 4989 dump_l2arc_log_entries(l2dhdr->dh_log_entries, 4990 
this_lb.lb_entries, 4991 rebuild->dh_lb_count); 4992 4993 if (l2arc_range_check_overlap(lbps[1].lbp_payload_start, 4994 lbps[0].lbp_payload_start, dev.l2ad_evict) && 4995 !dev.l2ad_first) 4996 break; 4997 4998 lbps[0] = lbps[1]; 4999 lbps[1] = this_lb.lb_prev_lbp; 5000 } 5001 out: 5002 if (!dump_opt['q']) { 5003 (void) printf("log_blk_count:\t %llu with valid cksum\n", 5004 (u_longlong_t)rebuild->dh_lb_count); 5005 (void) printf("\t\t %d with invalid cksum\n", failed); 5006 (void) printf("log_blk_asize:\t %llu\n\n", 5007 (u_longlong_t)rebuild->dh_lb_asize); 5008 } 5009 } 5010 5011 static int 5012 dump_l2arc_header(int fd) 5013 { 5014 l2arc_dev_hdr_phys_t l2dhdr = {0}, rebuild = {0}; 5015 int error = B_FALSE; 5016 5017 if (pread64(fd, &l2dhdr, sizeof (l2dhdr), 5018 VDEV_LABEL_START_SIZE) != sizeof (l2dhdr)) { 5019 error = B_TRUE; 5020 } else { 5021 if (l2dhdr.dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC)) 5022 byteswap_uint64_array(&l2dhdr, sizeof (l2dhdr)); 5023 5024 if (l2dhdr.dh_magic != L2ARC_DEV_HDR_MAGIC) 5025 error = B_TRUE; 5026 } 5027 5028 if (error) { 5029 (void) printf("L2ARC device header not found\n\n"); 5030 /* Do not return an error here for backward compatibility */ 5031 return (0); 5032 } else if (!dump_opt['q']) { 5033 print_l2arc_header(); 5034 5035 (void) printf(" magic: %llu\n", 5036 (u_longlong_t)l2dhdr.dh_magic); 5037 (void) printf(" version: %llu\n", 5038 (u_longlong_t)l2dhdr.dh_version); 5039 (void) printf(" pool_guid: %llu\n", 5040 (u_longlong_t)l2dhdr.dh_spa_guid); 5041 (void) printf(" flags: %llu\n", 5042 (u_longlong_t)l2dhdr.dh_flags); 5043 (void) printf(" start_lbps[0]: %llu\n", 5044 (u_longlong_t) 5045 l2dhdr.dh_start_lbps[0].lbp_daddr); 5046 (void) printf(" start_lbps[1]: %llu\n", 5047 (u_longlong_t) 5048 l2dhdr.dh_start_lbps[1].lbp_daddr); 5049 (void) printf(" log_blk_ent: %llu\n", 5050 (u_longlong_t)l2dhdr.dh_log_entries); 5051 (void) printf(" start: %llu\n", 5052 (u_longlong_t)l2dhdr.dh_start); 5053 (void) printf(" end: %llu\n", 5054 (u_longlong_t)l2dhdr.dh_end); 5055 (void) printf(" evict: %llu\n", 5056 (u_longlong_t)l2dhdr.dh_evict); 5057 (void) printf(" lb_asize_refcount: %llu\n", 5058 (u_longlong_t)l2dhdr.dh_lb_asize); 5059 (void) printf(" lb_count_refcount: %llu\n", 5060 (u_longlong_t)l2dhdr.dh_lb_count); 5061 (void) printf(" trim_action_time: %llu\n", 5062 (u_longlong_t)l2dhdr.dh_trim_action_time); 5063 (void) printf(" trim_state: %llu\n\n", 5064 (u_longlong_t)l2dhdr.dh_trim_state); 5065 } 5066 5067 dump_l2arc_log_blocks(fd, &l2dhdr, &rebuild); 5068 /* 5069 * The total aligned size of log blocks and the number of log blocks 5070 * reported in the header of the device may be less than what zdb 5071 * reports by dump_l2arc_log_blocks() which emulates l2arc_rebuild(). 5072 * This happens because dump_l2arc_log_blocks() lacks the memory 5073 * pressure valve that l2arc_rebuild() has. Thus, if we are on a system 5074 * with low memory, l2arc_rebuild will exit prematurely and dh_lb_asize 5075 * and dh_lb_count will be lower to begin with than what exists on the 5076 * device. This is normal and zdb should not exit with an error. The 5077 * opposite case should never happen though, the values reported in the 5078 * header should never be higher than what dump_l2arc_log_blocks() and 5079 * l2arc_rebuild() report. If this happens there is a leak in the 5080 * accounting of log blocks. 
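	 *
	 * For example, a header recording dh_lb_count == 90 when the walk
	 * above verified 100 log blocks is acceptable (the rebuild simply
	 * stopped early); a header recording 110 against 100 verified
	 * blocks trips the check below.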
5081 */ 5082 if (l2dhdr.dh_lb_asize > rebuild.dh_lb_asize || 5083 l2dhdr.dh_lb_count > rebuild.dh_lb_count) 5084 return (1); 5085 5086 return (0); 5087 } 5088 5089 static void 5090 dump_config_from_label(zdb_label_t *label, size_t buflen, int l) 5091 { 5092 if (dump_opt['q']) 5093 return; 5094 5095 if ((dump_opt['l'] < 3) && (first_label(label->config) != l)) 5096 return; 5097 5098 print_label_header(label, l); 5099 dump_nvlist(label->config_nv, 4); 5100 print_label_numbers(" labels = ", label->config); 5101 5102 if (dump_opt['l'] >= 2) 5103 dump_nvlist_stats(label->config_nv, buflen); 5104 } 5105 5106 #define ZDB_MAX_UB_HEADER_SIZE 32 5107 5108 static void 5109 dump_label_uberblocks(zdb_label_t *label, uint64_t ashift, int label_num) 5110 { 5111 5112 vdev_t vd; 5113 char header[ZDB_MAX_UB_HEADER_SIZE]; 5114 5115 vd.vdev_ashift = ashift; 5116 vd.vdev_top = &vd; 5117 5118 for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) { 5119 uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i); 5120 uberblock_t *ub = (void *)((char *)&label->label + uoff); 5121 cksum_record_t *rec = label->uberblocks[i]; 5122 5123 if (rec == NULL) { 5124 if (dump_opt['u'] >= 2) { 5125 print_label_header(label, label_num); 5126 (void) printf(" Uberblock[%d] invalid\n", i); 5127 } 5128 continue; 5129 } 5130 5131 if ((dump_opt['u'] < 3) && (first_label(rec) != label_num)) 5132 continue; 5133 5134 if ((dump_opt['u'] < 4) && 5135 (ub->ub_mmp_magic == MMP_MAGIC) && ub->ub_mmp_delay && 5136 (i >= VDEV_UBERBLOCK_COUNT(&vd) - MMP_BLOCKS_PER_LABEL)) 5137 continue; 5138 5139 print_label_header(label, label_num); 5140 (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE, 5141 " Uberblock[%d]\n", i); 5142 dump_uberblock(ub, header, ""); 5143 print_label_numbers(" labels = ", rec); 5144 } 5145 } 5146 5147 static char curpath[PATH_MAX]; 5148 5149 /* 5150 * Iterate through the path components, recursively passing 5151 * current one's obj and remaining path until we find the obj 5152 * for the last one. 
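 *
 * For instance, resolving "a/b/c" looks up "a" in the directory given
 * by obj, then recurses with the remaining path "b/c" until the final
 * component yields the object to dump.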
5153 */ 5154 static int 5155 dump_path_impl(objset_t *os, uint64_t obj, char *name, uint64_t *retobj) 5156 { 5157 int err; 5158 boolean_t header = B_TRUE; 5159 uint64_t child_obj; 5160 char *s; 5161 dmu_buf_t *db; 5162 dmu_object_info_t doi; 5163 5164 if ((s = strchr(name, '/')) != NULL) 5165 *s = '\0'; 5166 err = zap_lookup(os, obj, name, 8, 1, &child_obj); 5167 5168 (void) strlcat(curpath, name, sizeof (curpath)); 5169 5170 if (err != 0) { 5171 (void) fprintf(stderr, "failed to lookup %s: %s\n", 5172 curpath, strerror(err)); 5173 return (err); 5174 } 5175 5176 child_obj = ZFS_DIRENT_OBJ(child_obj); 5177 err = sa_buf_hold(os, child_obj, FTAG, &db); 5178 if (err != 0) { 5179 (void) fprintf(stderr, 5180 "failed to get SA dbuf for obj %llu: %s\n", 5181 (u_longlong_t)child_obj, strerror(err)); 5182 return (EINVAL); 5183 } 5184 dmu_object_info_from_db(db, &doi); 5185 sa_buf_rele(db, FTAG); 5186 5187 if (doi.doi_bonus_type != DMU_OT_SA && 5188 doi.doi_bonus_type != DMU_OT_ZNODE) { 5189 (void) fprintf(stderr, "invalid bonus type %d for obj %llu\n", 5190 doi.doi_bonus_type, (u_longlong_t)child_obj); 5191 return (EINVAL); 5192 } 5193 5194 if (dump_opt['v'] > 6) { 5195 (void) printf("obj=%llu %s type=%d bonustype=%d\n", 5196 (u_longlong_t)child_obj, curpath, doi.doi_type, 5197 doi.doi_bonus_type); 5198 } 5199 5200 (void) strlcat(curpath, "/", sizeof (curpath)); 5201 5202 switch (doi.doi_type) { 5203 case DMU_OT_DIRECTORY_CONTENTS: 5204 if (s != NULL && *(s + 1) != '\0') 5205 return (dump_path_impl(os, child_obj, s + 1, retobj)); 5206 zfs_fallthrough; 5207 case DMU_OT_PLAIN_FILE_CONTENTS: 5208 if (retobj != NULL) { 5209 *retobj = child_obj; 5210 } else { 5211 dump_object(os, child_obj, dump_opt['v'], &header, 5212 NULL, 0); 5213 } 5214 return (0); 5215 default: 5216 (void) fprintf(stderr, "object %llu has non-file/directory " 5217 "type %d\n", (u_longlong_t)obj, doi.doi_type); 5218 break; 5219 } 5220 5221 return (EINVAL); 5222 } 5223 5224 /* 5225 * Dump the blocks for the object specified by path inside the dataset. 5226 */ 5227 static int 5228 dump_path(char *ds, char *path, uint64_t *retobj) 5229 { 5230 int err; 5231 objset_t *os; 5232 uint64_t root_obj; 5233 5234 err = open_objset(ds, FTAG, &os); 5235 if (err != 0) 5236 return (err); 5237 5238 err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj); 5239 if (err != 0) { 5240 (void) fprintf(stderr, "can't lookup root znode: %s\n", 5241 strerror(err)); 5242 close_objset(os, FTAG); 5243 return (EINVAL); 5244 } 5245 5246 (void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds); 5247 5248 err = dump_path_impl(os, root_obj, path, retobj); 5249 5250 close_objset(os, FTAG); 5251 return (err); 5252 } 5253 5254 static int 5255 dump_backup_bytes(objset_t *os, void *buf, int len, void *arg) 5256 { 5257 const char *p = (const char *)buf; 5258 ssize_t nwritten; 5259 5260 (void) os; 5261 (void) arg; 5262 5263 /* Write the data out, handling short writes and signals. 
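	 * write(2) may transfer fewer bytes than requested or fail with
	 * EINTR; the loop advances the cursor past what was written and
	 * retries until the whole buffer has been flushed.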
*/ 5264 while ((nwritten = write(STDOUT_FILENO, p, len)) < len) { 5265 if (nwritten < 0) { 5266 if (errno == EINTR) 5267 continue; 5268 return (errno); 5269 } 5270 p += nwritten; 5271 len -= nwritten; 5272 } 5273 5274 return (0); 5275 } 5276 5277 static void 5278 dump_backup(const char *pool, uint64_t objset_id, const char *flagstr) 5279 { 5280 boolean_t embed = B_FALSE; 5281 boolean_t large_block = B_FALSE; 5282 boolean_t compress = B_FALSE; 5283 boolean_t raw = B_FALSE; 5284 5285 const char *c; 5286 for (c = flagstr; c != NULL && *c != '\0'; c++) { 5287 switch (*c) { 5288 case 'e': 5289 embed = B_TRUE; 5290 break; 5291 case 'L': 5292 large_block = B_TRUE; 5293 break; 5294 case 'c': 5295 compress = B_TRUE; 5296 break; 5297 case 'w': 5298 raw = B_TRUE; 5299 break; 5300 default: 5301 fprintf(stderr, "dump_backup: invalid flag " 5302 "'%c'\n", *c); 5303 return; 5304 } 5305 } 5306 5307 if (isatty(STDOUT_FILENO)) { 5308 fprintf(stderr, "dump_backup: stream cannot be written " 5309 "to a terminal\n"); 5310 return; 5311 } 5312 5313 offset_t off = 0; 5314 dmu_send_outparams_t out = { 5315 .dso_outfunc = dump_backup_bytes, 5316 .dso_dryrun = B_FALSE, 5317 }; 5318 5319 int err = dmu_send_obj(pool, objset_id, /* fromsnap */0, embed, 5320 large_block, compress, raw, /* saved */ B_FALSE, STDOUT_FILENO, 5321 &off, &out); 5322 if (err != 0) { 5323 fprintf(stderr, "dump_backup: dmu_send_obj: %s\n", 5324 strerror(err)); 5325 return; 5326 } 5327 } 5328 5329 static int 5330 zdb_copy_object(objset_t *os, uint64_t srcobj, char *destfile) 5331 { 5332 int err = 0; 5333 uint64_t size, readsize, oursize, offset; 5334 ssize_t writesize; 5335 sa_handle_t *hdl; 5336 5337 (void) printf("Copying object %" PRIu64 " to file %s\n", srcobj, 5338 destfile); 5339 5340 VERIFY3P(os, ==, sa_os); 5341 if ((err = sa_handle_get(os, srcobj, NULL, SA_HDL_PRIVATE, &hdl))) { 5342 (void) printf("Failed to get handle for SA znode\n"); 5343 return (err); 5344 } 5345 if ((err = sa_lookup(hdl, sa_attr_table[ZPL_SIZE], &size, 8))) { 5346 (void) sa_handle_destroy(hdl); 5347 return (err); 5348 } 5349 (void) sa_handle_destroy(hdl); 5350 5351 (void) printf("Object %" PRIu64 " is %" PRIu64 " bytes\n", srcobj, 5352 size); 5353 if (size == 0) { 5354 return (EINVAL); 5355 } 5356 5357 int fd = open(destfile, O_WRONLY | O_CREAT | O_TRUNC, 0644); 5358 if (fd == -1) 5359 return (errno); 5360 /* 5361 * We cap the size at 1 mebibyte here to prevent 5362 * allocation failures and nigh-infinite printing if the 5363 * object is extremely large. 
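	 * A larger object is instead streamed through this one buffer,
	 * one dmu_read()/write() pair per chunk; e.g. a 4 GiB object
	 * takes 4096 iterations of the copy loop below.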
5364 */ 5365 oursize = MIN(size, 1 << 20); 5366 offset = 0; 5367 char *buf = kmem_alloc(oursize, KM_NOSLEEP); 5368 if (buf == NULL) { 5369 (void) close(fd); 5370 return (ENOMEM); 5371 } 5372 5373 while (offset < size) { 5374 readsize = MIN(size - offset, 1 << 20); 5375 err = dmu_read(os, srcobj, offset, readsize, buf, 0); 5376 if (err != 0) { 5377 (void) printf("got error %u from dmu_read\n", err); 5378 kmem_free(buf, oursize); 5379 (void) close(fd); 5380 return (err); 5381 } 5382 if (dump_opt['v'] > 3) { 5383 (void) printf("Read offset=%" PRIu64 " size=%" PRIu64 5384 " error=%d\n", offset, readsize, err); 5385 } 5386 5387 writesize = write(fd, buf, readsize); 5388 if (writesize < 0) { 5389 err = errno; 5390 break; 5391 } else if (writesize != readsize) { 5392 /* Incomplete write */ 5393 (void) fprintf(stderr, "Short write, only wrote %llu of" 5394 " %" PRIu64 " bytes, exiting...\n", 5395 (u_longlong_t)writesize, readsize); 5396 break; 5397 } 5398 5399 offset += readsize; 5400 } 5401 5402 (void) close(fd); 5403 5404 if (buf != NULL) 5405 kmem_free(buf, oursize); 5406 5407 return (err); 5408 } 5409 5410 static boolean_t 5411 label_cksum_valid(vdev_label_t *label, uint64_t offset) 5412 { 5413 zio_checksum_info_t *ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL]; 5414 zio_cksum_t expected_cksum; 5415 zio_cksum_t actual_cksum; 5416 zio_cksum_t verifier; 5417 zio_eck_t *eck; 5418 int byteswap; 5419 5420 void *data = (char *)label + offsetof(vdev_label_t, vl_vdev_phys); 5421 eck = (zio_eck_t *)((char *)(data) + VDEV_PHYS_SIZE) - 1; 5422 5423 offset += offsetof(vdev_label_t, vl_vdev_phys); 5424 ZIO_SET_CHECKSUM(&verifier, offset, 0, 0, 0); 5425 5426 byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); 5427 if (byteswap) 5428 byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); 5429 5430 expected_cksum = eck->zec_cksum; 5431 eck->zec_cksum = verifier; 5432 5433 abd_t *abd = abd_get_from_buf(data, VDEV_PHYS_SIZE); 5434 ci->ci_func[byteswap](abd, VDEV_PHYS_SIZE, NULL, &actual_cksum); 5435 abd_free(abd); 5436 5437 if (byteswap) 5438 byteswap_uint64_array(&expected_cksum, sizeof (zio_cksum_t)); 5439 5440 if (ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) 5441 return (B_TRUE); 5442 5443 return (B_FALSE); 5444 } 5445 5446 static int 5447 dump_label(const char *dev) 5448 { 5449 char path[MAXPATHLEN]; 5450 zdb_label_t labels[VDEV_LABELS] = {{{{0}}}}; 5451 uint64_t psize, ashift, l2cache; 5452 struct stat64 statbuf; 5453 boolean_t config_found = B_FALSE; 5454 boolean_t error = B_FALSE; 5455 boolean_t read_l2arc_header = B_FALSE; 5456 avl_tree_t config_tree; 5457 avl_tree_t uberblock_tree; 5458 void *node, *cookie; 5459 int fd; 5460 5461 /* 5462 * Check if we were given absolute path and use it as is. 5463 * Otherwise if the provided vdev name doesn't point to a file, 5464 * try prepending expected disk paths and partition numbers. 
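	 *
	 * For example, a short name such as "sda1" may resolve to
	 * "/dev/sda1", and for a whole disk the expected partition
	 * suffix is appended before the stat is retried.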
5465 */ 5466 (void) strlcpy(path, dev, sizeof (path)); 5467 if (dev[0] != '/' && stat64(path, &statbuf) != 0) { 5468 int error; 5469 5470 error = zfs_resolve_shortname(dev, path, MAXPATHLEN); 5471 if (error == 0 && zfs_dev_is_whole_disk(path)) { 5472 if (zfs_append_partition(path, MAXPATHLEN) == -1) 5473 error = ENOENT; 5474 } 5475 5476 if (error || (stat64(path, &statbuf) != 0)) { 5477 (void) printf("failed to find device %s, try " 5478 "specifying absolute path instead\n", dev); 5479 return (1); 5480 } 5481 } 5482 5483 if ((fd = open64(path, O_RDONLY)) < 0) { 5484 (void) printf("cannot open '%s': %s\n", path, strerror(errno)); 5485 zdb_exit(1); 5486 } 5487 5488 if (fstat64_blk(fd, &statbuf) != 0) { 5489 (void) printf("failed to stat '%s': %s\n", path, 5490 strerror(errno)); 5491 (void) close(fd); 5492 zdb_exit(1); 5493 } 5494 5495 if (S_ISBLK(statbuf.st_mode) && zfs_dev_flush(fd) != 0) 5496 (void) printf("failed to invalidate cache '%s' : %s\n", path, 5497 strerror(errno)); 5498 5499 avl_create(&config_tree, cksum_record_compare, 5500 sizeof (cksum_record_t), offsetof(cksum_record_t, link)); 5501 avl_create(&uberblock_tree, cksum_record_compare, 5502 sizeof (cksum_record_t), offsetof(cksum_record_t, link)); 5503 5504 psize = statbuf.st_size; 5505 psize = P2ALIGN_TYPED(psize, sizeof (vdev_label_t), uint64_t); 5506 ashift = SPA_MINBLOCKSHIFT; 5507 5508 /* 5509 * 1. Read the label from disk 5510 * 2. Verify label cksum 5511 * 3. Unpack the configuration and insert in config tree. 5512 * 4. Traverse all uberblocks and insert in uberblock tree. 5513 */ 5514 for (int l = 0; l < VDEV_LABELS; l++) { 5515 zdb_label_t *label = &labels[l]; 5516 char *buf = label->label.vl_vdev_phys.vp_nvlist; 5517 size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist); 5518 nvlist_t *config; 5519 cksum_record_t *rec; 5520 zio_cksum_t cksum; 5521 vdev_t vd; 5522 5523 label->label_offset = vdev_label_offset(psize, l, 0); 5524 5525 if (pread64(fd, &label->label, sizeof (label->label), 5526 label->label_offset) != sizeof (label->label)) { 5527 if (!dump_opt['q']) 5528 (void) printf("failed to read label %d\n", l); 5529 label->read_failed = B_TRUE; 5530 error = B_TRUE; 5531 continue; 5532 } 5533 5534 label->read_failed = B_FALSE; 5535 label->cksum_valid = label_cksum_valid(&label->label, 5536 label->label_offset); 5537 5538 if (nvlist_unpack(buf, buflen, &config, 0) == 0) { 5539 nvlist_t *vdev_tree = NULL; 5540 size_t size; 5541 5542 if ((nvlist_lookup_nvlist(config, 5543 ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) || 5544 (nvlist_lookup_uint64(vdev_tree, 5545 ZPOOL_CONFIG_ASHIFT, &ashift) != 0)) 5546 ashift = SPA_MINBLOCKSHIFT; 5547 5548 if (nvlist_size(config, &size, NV_ENCODE_XDR) != 0) 5549 size = buflen; 5550 5551 /* If the device is a cache device read the header. 
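		 * (L2ARC devices carry POOL_STATE_L2CACHE in their label;
		 * the device header that follows the labels is then dumped
		 * by dump_l2arc_header() once all labels are processed.)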
*/ 5552 if (!read_l2arc_header) { 5553 if (nvlist_lookup_uint64(config, 5554 ZPOOL_CONFIG_POOL_STATE, &l2cache) == 0 && 5555 l2cache == POOL_STATE_L2CACHE) { 5556 read_l2arc_header = B_TRUE; 5557 } 5558 } 5559 5560 fletcher_4_native_varsize(buf, size, &cksum); 5561 rec = cksum_record_insert(&config_tree, &cksum, l); 5562 5563 label->config = rec; 5564 label->config_nv = config; 5565 config_found = B_TRUE; 5566 } else { 5567 error = B_TRUE; 5568 } 5569 5570 vd.vdev_ashift = ashift; 5571 vd.vdev_top = &vd; 5572 5573 for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) { 5574 uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i); 5575 uberblock_t *ub = (void *)((char *)label + uoff); 5576 5577 if (uberblock_verify(ub)) 5578 continue; 5579 5580 fletcher_4_native_varsize(ub, sizeof (*ub), &cksum); 5581 rec = cksum_record_insert(&uberblock_tree, &cksum, l); 5582 5583 label->uberblocks[i] = rec; 5584 } 5585 } 5586 5587 /* 5588 * Dump the label and uberblocks. 5589 */ 5590 for (int l = 0; l < VDEV_LABELS; l++) { 5591 zdb_label_t *label = &labels[l]; 5592 size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist); 5593 5594 if (label->read_failed == B_TRUE) 5595 continue; 5596 5597 if (label->config_nv) { 5598 dump_config_from_label(label, buflen, l); 5599 } else { 5600 if (!dump_opt['q']) 5601 (void) printf("failed to unpack label %d\n", l); 5602 } 5603 5604 if (dump_opt['u']) 5605 dump_label_uberblocks(label, ashift, l); 5606 5607 nvlist_free(label->config_nv); 5608 } 5609 5610 /* 5611 * Dump the L2ARC header, if existent. 5612 */ 5613 if (read_l2arc_header) 5614 error |= dump_l2arc_header(fd); 5615 5616 cookie = NULL; 5617 while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL) 5618 umem_free(node, sizeof (cksum_record_t)); 5619 5620 cookie = NULL; 5621 while ((node = avl_destroy_nodes(&uberblock_tree, &cookie)) != NULL) 5622 umem_free(node, sizeof (cksum_record_t)); 5623 5624 avl_destroy(&config_tree); 5625 avl_destroy(&uberblock_tree); 5626 5627 (void) close(fd); 5628 5629 return (config_found == B_FALSE ? 2 : 5630 (error == B_TRUE ? 
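	    /* exit status: 2 = no config found, 1 = read/cksum/unpack error, 0 = success */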
1 : 0)); 5631 } 5632 5633 static uint64_t dataset_feature_count[SPA_FEATURES]; 5634 static uint64_t global_feature_count[SPA_FEATURES]; 5635 static uint64_t remap_deadlist_count = 0; 5636 5637 static int 5638 dump_one_objset(const char *dsname, void *arg) 5639 { 5640 (void) arg; 5641 int error; 5642 objset_t *os; 5643 spa_feature_t f; 5644 5645 error = open_objset(dsname, FTAG, &os); 5646 if (error != 0) 5647 return (0); 5648 5649 for (f = 0; f < SPA_FEATURES; f++) { 5650 if (!dsl_dataset_feature_is_active(dmu_objset_ds(os), f)) 5651 continue; 5652 ASSERT(spa_feature_table[f].fi_flags & 5653 ZFEATURE_FLAG_PER_DATASET); 5654 dataset_feature_count[f]++; 5655 } 5656 5657 if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) { 5658 remap_deadlist_count++; 5659 } 5660 5661 for (dsl_bookmark_node_t *dbn = 5662 avl_first(&dmu_objset_ds(os)->ds_bookmarks); dbn != NULL; 5663 dbn = AVL_NEXT(&dmu_objset_ds(os)->ds_bookmarks, dbn)) { 5664 mos_obj_refd(dbn->dbn_phys.zbm_redaction_obj); 5665 if (dbn->dbn_phys.zbm_redaction_obj != 0) { 5666 global_feature_count[ 5667 SPA_FEATURE_REDACTION_BOOKMARKS]++; 5668 objset_t *mos = os->os_spa->spa_meta_objset; 5669 dnode_t *rl; 5670 VERIFY0(dnode_hold(mos, 5671 dbn->dbn_phys.zbm_redaction_obj, FTAG, &rl)); 5672 if (rl->dn_have_spill) { 5673 global_feature_count[ 5674 SPA_FEATURE_REDACTION_LIST_SPILL]++; 5675 } 5676 } 5677 if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) 5678 global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++; 5679 } 5680 5681 if (dsl_deadlist_is_open(&dmu_objset_ds(os)->ds_dir->dd_livelist) && 5682 !dmu_objset_is_snapshot(os)) { 5683 global_feature_count[SPA_FEATURE_LIVELIST]++; 5684 } 5685 5686 dump_objset(os); 5687 close_objset(os, FTAG); 5688 fuid_table_destroy(); 5689 return (0); 5690 } 5691 5692 /* 5693 * Block statistics. 5694 */ 5695 #define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2) 5696 typedef struct zdb_blkstats { 5697 uint64_t zb_asize; 5698 uint64_t zb_lsize; 5699 uint64_t zb_psize; 5700 uint64_t zb_count; 5701 uint64_t zb_gangs; 5702 uint64_t zb_ditto_samevdev; 5703 uint64_t zb_ditto_same_ms; 5704 uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE]; 5705 } zdb_blkstats_t; 5706 5707 /* 5708 * Extended object types to report deferred frees and dedup auto-ditto blocks. 
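 *
 * These pseudo-types are appended after the real DMU object types
 * so they can share the per-type stat tables; when printed, their
 * names come from zdb_ot_extname[] below, indexed by
 * (type - DMU_OT_NUMTYPES).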
5709 */
5710 #define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0)
5711 #define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1)
5712 #define ZDB_OT_OTHER (DMU_OT_NUMTYPES + 2)
5713 #define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 3)
5714
5715 static const char *zdb_ot_extname[] = {
5716 "deferred free",
5717 "dedup ditto",
5718 "other",
5719 "Total",
5720 };
5721
5722 #define ZB_TOTAL DN_MAX_LEVELS
5723 #define SPA_MAX_FOR_16M (SPA_MAXBLOCKSHIFT+1)
5724
5725 typedef struct zdb_brt_entry {
5726 dva_t zbre_dva;
5727 uint64_t zbre_refcount;
5728 avl_node_t zbre_node;
5729 } zdb_brt_entry_t;
5730
5731 typedef struct zdb_cb {
5732 zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
5733 uint64_t zcb_removing_size;
5734 uint64_t zcb_checkpoint_size;
5735 uint64_t zcb_dedup_asize;
5736 uint64_t zcb_dedup_blocks;
5737 uint64_t zcb_clone_asize;
5738 uint64_t zcb_clone_blocks;
5739 uint64_t zcb_psize_count[SPA_MAX_FOR_16M];
5740 uint64_t zcb_lsize_count[SPA_MAX_FOR_16M];
5741 uint64_t zcb_asize_count[SPA_MAX_FOR_16M];
5742 uint64_t zcb_psize_len[SPA_MAX_FOR_16M];
5743 uint64_t zcb_lsize_len[SPA_MAX_FOR_16M];
5744 uint64_t zcb_asize_len[SPA_MAX_FOR_16M];
5745 uint64_t zcb_psize_total;
5746 uint64_t zcb_lsize_total;
5747 uint64_t zcb_asize_total;
5748 uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
5749 uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES]
5750 [BPE_PAYLOAD_SIZE + 1];
5751 uint64_t zcb_start;
5752 hrtime_t zcb_lastprint;
5753 uint64_t zcb_totalasize;
5754 uint64_t zcb_errors[256];
5755 int zcb_readfails;
5756 int zcb_haderrors;
5757 spa_t *zcb_spa;
5758 uint32_t **zcb_vd_obsolete_counts;
5759 avl_tree_t zcb_brt;
5760 boolean_t zcb_brt_is_active;
5761 } zdb_cb_t;
5762
5763 /* test if two DVA offsets from same vdev are within the same metaslab */
5764 static boolean_t
5765 same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2)
5766 {
5767 vdev_t *vd = vdev_lookup_top(spa, vdev);
5768 uint64_t ms_shift = vd->vdev_ms_shift;
5769
5770 return ((off1 >> ms_shift) == (off2 >> ms_shift));
5771 }
5772
5773 /*
5774 * Used to simplify reporting of the histogram data.
5775 */
5776 typedef struct one_histo {
5777 const char *name;
5778 uint64_t *count;
5779 uint64_t *len;
5780 uint64_t cumulative;
5781 } one_histo_t;
5782
5783 /*
5784 * The number of separate histograms processed for psize, lsize and asize.
5785 */
5786 #define NUM_HISTO 3
5787
5788 /*
5789 * This routine prints three histograms in fixed-width columns: one row
5790 * per power-of-two block size from 512 up to SPA_MAXBLOCKSIZE, showing
5791 * the count, length and cumulative length of the psize, lsize and
5792 * asize blocks.
5793 *
5794 * All three types of blocks are listed on a single line.
5795 *
5796 * By default the table is printed in nicenum format (e.g. 123K) but
5797 * if the '-P' parameter is specified then the full raw number (parseable)
5798 * is printed out.
5799 */
5800 static void
5801 dump_size_histograms(zdb_cb_t *zcb)
5802 {
5803 /*
5804 * A temporary buffer that allows us to convert a number into
5805 * a string using zdb_nicenum() to allow either raw or human
5806 * readable numbers to be output.
5807 */
5808 char numbuf[32];
5809
5810 /*
5811 * Define titles which are used in the headers of the tables
5812 * printed by this routine.
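 *
 * The table printed below looks roughly like this (default
 * human-readable mode; -P prints the same data tab-separated,
 * using raw parseable numbers):
 *
 *	  block  psize                lsize                asize
 *	   size  Count  Size  Cum.    Count  Size  Cum.    Count  Size  Cum.
 *	    512:   ...   ...   ...      ...   ...   ...      ...   ...   ...
 *	     1K:   ...   ...   ...      ...   ...   ...      ...   ...   ...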
5813 */ 5814 const char blocksize_title1[] = "block"; 5815 const char blocksize_title2[] = "size"; 5816 const char count_title[] = "Count"; 5817 const char length_title[] = "Size"; 5818 const char cumulative_title[] = "Cum."; 5819 5820 /* 5821 * Setup the histogram arrays (psize, lsize, and asize). 5822 */ 5823 one_histo_t parm_histo[NUM_HISTO]; 5824 5825 parm_histo[0].name = "psize"; 5826 parm_histo[0].count = zcb->zcb_psize_count; 5827 parm_histo[0].len = zcb->zcb_psize_len; 5828 parm_histo[0].cumulative = 0; 5829 5830 parm_histo[1].name = "lsize"; 5831 parm_histo[1].count = zcb->zcb_lsize_count; 5832 parm_histo[1].len = zcb->zcb_lsize_len; 5833 parm_histo[1].cumulative = 0; 5834 5835 parm_histo[2].name = "asize"; 5836 parm_histo[2].count = zcb->zcb_asize_count; 5837 parm_histo[2].len = zcb->zcb_asize_len; 5838 parm_histo[2].cumulative = 0; 5839 5840 5841 (void) printf("\nBlock Size Histogram\n"); 5842 switch (block_bin_mode) { 5843 case BIN_PSIZE: 5844 printf("(note: all categories are binned by %s)\n", "psize"); 5845 break; 5846 case BIN_LSIZE: 5847 printf("(note: all categories are binned by %s)\n", "lsize"); 5848 break; 5849 case BIN_ASIZE: 5850 printf("(note: all categories are binned by %s)\n", "asize"); 5851 break; 5852 default: 5853 printf("(note: all categories are binned separately)\n"); 5854 break; 5855 } 5856 if (block_classes != 0) { 5857 char buf[256] = ""; 5858 if (block_classes & CLASS_NORMAL) 5859 strlcat(buf, "\"normal\", ", sizeof (buf)); 5860 if (block_classes & CLASS_SPECIAL) 5861 strlcat(buf, "\"special\", ", sizeof (buf)); 5862 if (block_classes & CLASS_DEDUP) 5863 strlcat(buf, "\"dedup\", ", sizeof (buf)); 5864 if (block_classes & CLASS_OTHER) 5865 strlcat(buf, "\"other\", ", sizeof (buf)); 5866 buf[strlen(buf)-2] = '\0'; 5867 printf("(note: only blocks in these classes are counted: %s)\n", 5868 buf); 5869 } 5870 /* 5871 * Print the first line titles 5872 */ 5873 if (dump_opt['P']) 5874 (void) printf("\n%s\t", blocksize_title1); 5875 else 5876 (void) printf("\n%7s ", blocksize_title1); 5877 5878 for (int j = 0; j < NUM_HISTO; j++) { 5879 if (dump_opt['P']) { 5880 if (j < NUM_HISTO - 1) { 5881 (void) printf("%s\t\t\t", parm_histo[j].name); 5882 } else { 5883 /* Don't print trailing spaces */ 5884 (void) printf(" %s", parm_histo[j].name); 5885 } 5886 } else { 5887 if (j < NUM_HISTO - 1) { 5888 /* Left aligned strings in the output */ 5889 (void) printf("%-7s ", 5890 parm_histo[j].name); 5891 } else { 5892 /* Don't print trailing spaces */ 5893 (void) printf("%s", parm_histo[j].name); 5894 } 5895 } 5896 } 5897 (void) printf("\n"); 5898 5899 /* 5900 * Print the second line titles 5901 */ 5902 if (dump_opt['P']) { 5903 (void) printf("%s\t", blocksize_title2); 5904 } else { 5905 (void) printf("%7s ", blocksize_title2); 5906 } 5907 5908 for (int i = 0; i < NUM_HISTO; i++) { 5909 if (dump_opt['P']) { 5910 (void) printf("%s\t%s\t%s\t", 5911 count_title, length_title, cumulative_title); 5912 } else { 5913 (void) printf("%7s%7s%7s", 5914 count_title, length_title, cumulative_title); 5915 } 5916 } 5917 (void) printf("\n"); 5918 5919 /* 5920 * Print the rows 5921 */ 5922 for (int i = SPA_MINBLOCKSHIFT; i < SPA_MAX_FOR_16M; i++) { 5923 5924 /* 5925 * Print the first column showing the blocksize 5926 */ 5927 zdb_nicenum((1ULL << i), numbuf, sizeof (numbuf)); 5928 5929 if (dump_opt['P']) { 5930 printf("%s", numbuf); 5931 } else { 5932 printf("%7s:", numbuf); 5933 } 5934 5935 /* 5936 * Print the remaining set of 3 columns per size: 5937 * for psize, lsize and asize 5938 */ 5939 
for (int j = 0; j < NUM_HISTO; j++) {
5940 parm_histo[j].cumulative += parm_histo[j].len[i];
5941
5942 zdb_nicenum(parm_histo[j].count[i],
5943 numbuf, sizeof (numbuf));
5944 if (dump_opt['P'])
5945 (void) printf("\t%s", numbuf);
5946 else
5947 (void) printf("%7s", numbuf);
5948
5949 zdb_nicenum(parm_histo[j].len[i],
5950 numbuf, sizeof (numbuf));
5951 if (dump_opt['P'])
5952 (void) printf("\t%s", numbuf);
5953 else
5954 (void) printf("%7s", numbuf);
5955
5956 zdb_nicenum(parm_histo[j].cumulative,
5957 numbuf, sizeof (numbuf));
5958 if (dump_opt['P'])
5959 (void) printf("\t%s", numbuf);
5960 else
5961 (void) printf("%7s", numbuf);
5962 }
5963 (void) printf("\n");
5964 }
5965 }
5966
5967 static void
5968 zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
5969 dmu_object_type_t type)
5970 {
5971 int i;
5972
5973 ASSERT(type < ZDB_OT_TOTAL);
5974
5975 if (zilog && zil_bp_tree_add(zilog, bp) != 0)
5976 return;
5977
5978 /*
5979 * This flag controls whether we will issue a claim for the block while
5980 * counting it, to ensure that all blocks are referenced in space maps.
5981 * We don't issue claims if we're not doing leak tracking, because it's
5982 * expensive if the user isn't interested. We also don't claim the
5983 * second or later occurrences of cloned or dedup'd blocks, because we
5984 * already claimed them the first time.
5985 */
5986 boolean_t do_claim = !dump_opt['L'];
5987
5988 spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER);
5989
5990 blkptr_t tempbp;
5991 if (BP_GET_DEDUP(bp)) {
5992 /*
5993 * Dedup'd blocks are special. We need to count them, so we can
5994 * later uncount them when reporting leaked space, and we must
5995 * only claim them once.
5996 *
5997 * We use the existing dedup system to track what we've seen.
5998 * The first time we see a block, we do a ddt_lookup() to see
5999 * if it exists in the DDT. If we're doing leak tracking, we
6000 * claim the block at this time.
6001 *
6002 * Each time we see a block, we reduce the refcount in the
6003 * entry by one, and add to the size and count of dedup'd
6004 * blocks to report at the end.
6005 */
6006
6007 ddt_t *ddt = ddt_select(zcb->zcb_spa, bp);
6008
6009 ddt_enter(ddt);
6010
6011 /*
6012 * Find the block. This will create the entry in memory, but
6013 * we'll know if that happened by its refcount.
6014 */
6015 ddt_entry_t *dde = ddt_lookup(ddt, bp, B_TRUE);
6016
6017 /*
6018 * ddt_lookup() can return NULL if this block didn't exist
6019 * in the DDT and creating it would take the DDT over its
6020 * quota. Since we got the block from disk, it must exist in
6021 * the DDT, so this can't happen. However, when unique entries
6022 * are pruned, the dedup bit can be set with no corresponding
6023 * entry in the DDT.
6024 */
6025 if (dde == NULL) {
6026 ddt_exit(ddt);
6027 goto skipped;
6028 }
6029
6030 /* Get the phys for this variant */
6031 ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
6032
6033 /*
6034 * This entry may have multiple sets of DVAs. We must claim
6035 * each set the first time we see them in a real block on disk,
6036 * or count them on subsequent occurrences. We don't have a
6037 * convenient way to track the first time we see each variant,
6038 * so we repurpose dde_io as a set of "seen" flag bits. We can
6039 * do this safely in zdb because it never writes, so it will
6040 * never have a writing zio for this block in that pointer.
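 *
 * For example, once variants v == 0 and v == 1 of an entry have
 * both been seen, the low bits of the repurposed dde_io pointer
 * hold 0x3 (i.e. (1 << 0) | (1 << 1)), and any further block
 * selecting either variant is counted as a duplicate below rather
 * than claimed again.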
6041 */
6042 boolean_t seen = !!(((uintptr_t)dde->dde_io) & (1 << v));
6043 if (!seen)
6044 dde->dde_io =
6045 (void *)(((uintptr_t)dde->dde_io) | (1 << v));
6046
6047 /* Consume a reference for this block. */
6048 if (ddt_phys_total_refcnt(ddt, dde->dde_phys) > 0)
6049 ddt_phys_decref(dde->dde_phys, v);
6050
6051 /*
6052 * If this entry has a single flat phys, it may have been
6053 * extended with additional DVAs at some time in its life.
6054 * This block might be from before it was fully extended, and
6055 * so have fewer DVAs.
6056 *
6057 * If this is the first time we've seen this block, and we
6058 * claimed it as-is, then we would miss the claim on some
6059 * number of DVAs, which would then be seen as leaked.
6060 *
6061 * In all cases, if we've had fewer DVAs, then the asize would
6062 * be too small, and would lead to the pool apparently using
6063 * more space than allocated.
6064 *
6065 * To handle this, we copy the canonical set of DVAs from the
6066 * entry back to the block pointer before we claim it.
6067 */
6068 if (v == DDT_PHYS_FLAT) {
6069 ASSERT3U(BP_GET_PHYSICAL_BIRTH(bp), ==,
6070 ddt_phys_birth(dde->dde_phys, v));
6071 tempbp = *bp;
6072 ddt_bp_fill(dde->dde_phys, v, &tempbp,
6073 BP_GET_PHYSICAL_BIRTH(bp));
6074 bp = &tempbp;
6075 }
6076
6077 if (seen) {
6078 /*
6079 * The second or later time we see this block,
6080 * it's a duplicate and we count it.
6081 */
6082 zcb->zcb_dedup_asize += BP_GET_ASIZE(bp);
6083 zcb->zcb_dedup_blocks++;
6084
6085 /* Already claimed, don't do it again. */
6086 do_claim = B_FALSE;
6087 }
6088
6089 ddt_exit(ddt);
6090 } else if (zcb->zcb_brt_is_active &&
6091 brt_maybe_exists(zcb->zcb_spa, bp)) {
6092 /*
6093 * Cloned blocks are special. We need to count them, so we can
6094 * later uncount them when reporting leaked space, and we must
6095 * only claim them once.
6096 *
6097 * To do this, we keep our own in-memory BRT. For each block
6098 * we haven't seen before, we look it up in the real BRT and
6099 * if it's there, we note it and its refcount, then proceed as
6100 * normal. If we see the block again, we count it as a clone
6101 * and then give it no further consideration.
6102 */
6103 zdb_brt_entry_t zbre_search, *zbre;
6104 avl_index_t where;
6105
6106 zbre_search.zbre_dva = bp->blk_dva[0];
6107 zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where);
6108 if (zbre == NULL) {
6109 /* Not seen before; track it */
6110 uint64_t refcnt =
6111 brt_entry_get_refcount(zcb->zcb_spa, bp);
6112 if (refcnt > 0) {
6113 zbre = umem_zalloc(sizeof (zdb_brt_entry_t),
6114 UMEM_NOFAIL);
6115 zbre->zbre_dva = bp->blk_dva[0];
6116 zbre->zbre_refcount = refcnt;
6117 avl_insert(&zcb->zcb_brt, zbre, where);
6118 }
6119 } else {
6120 /*
6121 * Second or later occurrence, count it and take a
6122 * refcount.
6123 */
6124 zcb->zcb_clone_asize += BP_GET_ASIZE(bp);
6125 zcb->zcb_clone_blocks++;
6126
6127 zbre->zbre_refcount--;
6128 if (zbre->zbre_refcount == 0) {
6129 avl_remove(&zcb->zcb_brt, zbre);
6130 umem_free(zbre, sizeof (zdb_brt_entry_t));
6131 }
6132
6133 /* Already claimed, don't do it again. */
6134 do_claim = B_FALSE;
6135 }
6136 }
6137
6138 skipped:
6139 for (i = 0; i < 4; i++) {
6140 int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
6141 int t = (i & 1) ?
type : ZDB_OT_TOTAL; 6142 int equal; 6143 zdb_blkstats_t *zb = &zcb->zcb_type[l][t]; 6144 6145 zb->zb_asize += BP_GET_ASIZE(bp); 6146 zb->zb_lsize += BP_GET_LSIZE(bp); 6147 zb->zb_psize += BP_GET_PSIZE(bp); 6148 zb->zb_count++; 6149 6150 /* 6151 * The histogram is only big enough to record blocks up to 6152 * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last, 6153 * "other", bucket. 6154 */ 6155 unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT; 6156 idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1); 6157 zb->zb_psize_histogram[idx]++; 6158 6159 zb->zb_gangs += BP_COUNT_GANG(bp); 6160 6161 switch (BP_GET_NDVAS(bp)) { 6162 case 2: 6163 if (DVA_GET_VDEV(&bp->blk_dva[0]) == 6164 DVA_GET_VDEV(&bp->blk_dva[1])) { 6165 zb->zb_ditto_samevdev++; 6166 6167 if (same_metaslab(zcb->zcb_spa, 6168 DVA_GET_VDEV(&bp->blk_dva[0]), 6169 DVA_GET_OFFSET(&bp->blk_dva[0]), 6170 DVA_GET_OFFSET(&bp->blk_dva[1]))) 6171 zb->zb_ditto_same_ms++; 6172 } 6173 break; 6174 case 3: 6175 equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == 6176 DVA_GET_VDEV(&bp->blk_dva[1])) + 6177 (DVA_GET_VDEV(&bp->blk_dva[0]) == 6178 DVA_GET_VDEV(&bp->blk_dva[2])) + 6179 (DVA_GET_VDEV(&bp->blk_dva[1]) == 6180 DVA_GET_VDEV(&bp->blk_dva[2])); 6181 if (equal != 0) { 6182 zb->zb_ditto_samevdev++; 6183 6184 if (DVA_GET_VDEV(&bp->blk_dva[0]) == 6185 DVA_GET_VDEV(&bp->blk_dva[1]) && 6186 same_metaslab(zcb->zcb_spa, 6187 DVA_GET_VDEV(&bp->blk_dva[0]), 6188 DVA_GET_OFFSET(&bp->blk_dva[0]), 6189 DVA_GET_OFFSET(&bp->blk_dva[1]))) 6190 zb->zb_ditto_same_ms++; 6191 else if (DVA_GET_VDEV(&bp->blk_dva[0]) == 6192 DVA_GET_VDEV(&bp->blk_dva[2]) && 6193 same_metaslab(zcb->zcb_spa, 6194 DVA_GET_VDEV(&bp->blk_dva[0]), 6195 DVA_GET_OFFSET(&bp->blk_dva[0]), 6196 DVA_GET_OFFSET(&bp->blk_dva[2]))) 6197 zb->zb_ditto_same_ms++; 6198 else if (DVA_GET_VDEV(&bp->blk_dva[1]) == 6199 DVA_GET_VDEV(&bp->blk_dva[2]) && 6200 same_metaslab(zcb->zcb_spa, 6201 DVA_GET_VDEV(&bp->blk_dva[1]), 6202 DVA_GET_OFFSET(&bp->blk_dva[1]), 6203 DVA_GET_OFFSET(&bp->blk_dva[2]))) 6204 zb->zb_ditto_same_ms++; 6205 } 6206 break; 6207 } 6208 } 6209 6210 spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG); 6211 6212 if (BP_IS_EMBEDDED(bp)) { 6213 zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++; 6214 zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)] 6215 [BPE_GET_PSIZE(bp)]++; 6216 return; 6217 } 6218 6219 if (block_classes != 0) { 6220 spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER); 6221 6222 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[0]); 6223 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[0]); 6224 vdev_t *vd = vdev_lookup_top(zcb->zcb_spa, vdev); 6225 ASSERT(vd != NULL); 6226 metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 6227 ASSERT(ms != NULL); 6228 metaslab_group_t *mg = ms->ms_group; 6229 ASSERT(mg != NULL); 6230 metaslab_class_t *mc = mg->mg_class; 6231 ASSERT(mc != NULL); 6232 6233 spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG); 6234 6235 int class; 6236 if (mc == spa_normal_class(zcb->zcb_spa)) { 6237 class = CLASS_NORMAL; 6238 } else if (mc == spa_special_class(zcb->zcb_spa)) { 6239 class = CLASS_SPECIAL; 6240 } else if (mc == spa_dedup_class(zcb->zcb_spa)) { 6241 class = CLASS_DEDUP; 6242 } else { 6243 class = CLASS_OTHER; 6244 } 6245 6246 if (!(block_classes & class)) { 6247 goto hist_skipped; 6248 } 6249 } 6250 6251 /* 6252 * The binning histogram bins by powers of two up to 6253 * SPA_MAXBLOCKSIZE rather than creating bins for 6254 * every possible blocksize found in the pool. 
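 *
 * For example, a 3K block is accounted to the 4K bin, i.e. the
 * (2K; 4K] range computed by the BIN() macro defined below.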
6255 */ 6256 int bin; 6257 6258 /* 6259 * Binning strategy: each bin includes blocks up to and including 6260 * the given size (excluding blocks that fit into the previous bin). 6261 * This way, the "4K" bin includes blocks within the (2K; 4K] range. 6262 */ 6263 #define BIN(size) (highbit64((size) - 1)) 6264 6265 switch (block_bin_mode) { 6266 case BIN_PSIZE: bin = BIN(BP_GET_PSIZE(bp)); break; 6267 case BIN_LSIZE: bin = BIN(BP_GET_LSIZE(bp)); break; 6268 case BIN_ASIZE: bin = BIN(BP_GET_ASIZE(bp)); break; 6269 case BIN_AUTO: break; 6270 default: PANIC("bad block_bin_mode"); abort(); 6271 } 6272 6273 if (block_bin_mode == BIN_AUTO) 6274 bin = BIN(BP_GET_PSIZE(bp)); 6275 6276 zcb->zcb_psize_count[bin]++; 6277 zcb->zcb_psize_len[bin] += BP_GET_PSIZE(bp); 6278 zcb->zcb_psize_total += BP_GET_PSIZE(bp); 6279 6280 if (block_bin_mode == BIN_AUTO) 6281 bin = BIN(BP_GET_LSIZE(bp)); 6282 6283 zcb->zcb_lsize_count[bin]++; 6284 zcb->zcb_lsize_len[bin] += BP_GET_LSIZE(bp); 6285 zcb->zcb_lsize_total += BP_GET_LSIZE(bp); 6286 6287 if (block_bin_mode == BIN_AUTO) 6288 bin = BIN(BP_GET_ASIZE(bp)); 6289 6290 zcb->zcb_asize_count[bin]++; 6291 zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp); 6292 zcb->zcb_asize_total += BP_GET_ASIZE(bp); 6293 6294 #undef BIN 6295 6296 hist_skipped: 6297 if (!do_claim) 6298 return; 6299 6300 VERIFY0(zio_wait(zio_claim(NULL, zcb->zcb_spa, 6301 spa_min_claim_txg(zcb->zcb_spa), bp, NULL, NULL, 6302 ZIO_FLAG_CANFAIL))); 6303 } 6304 6305 static void 6306 zdb_blkptr_done(zio_t *zio) 6307 { 6308 spa_t *spa = zio->io_spa; 6309 blkptr_t *bp = zio->io_bp; 6310 int ioerr = zio->io_error; 6311 zdb_cb_t *zcb = zio->io_private; 6312 zbookmark_phys_t *zb = &zio->io_bookmark; 6313 6314 mutex_enter(&spa->spa_scrub_lock); 6315 spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); 6316 cv_broadcast(&spa->spa_scrub_io_cv); 6317 6318 if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 6319 char blkbuf[BP_SPRINTF_LEN]; 6320 6321 zcb->zcb_haderrors = 1; 6322 zcb->zcb_errors[ioerr]++; 6323 6324 if (dump_opt['b'] >= 2) 6325 snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); 6326 else 6327 blkbuf[0] = '\0'; 6328 6329 (void) printf("zdb_blkptr_cb: " 6330 "Got error %d reading " 6331 "<%llu, %llu, %lld, %llx> %s -- skipping\n", 6332 ioerr, 6333 (u_longlong_t)zb->zb_objset, 6334 (u_longlong_t)zb->zb_object, 6335 (u_longlong_t)zb->zb_level, 6336 (u_longlong_t)zb->zb_blkid, 6337 blkbuf); 6338 } 6339 mutex_exit(&spa->spa_scrub_lock); 6340 6341 abd_free(zio->io_abd); 6342 } 6343 6344 static int 6345 zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 6346 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 6347 { 6348 zdb_cb_t *zcb = arg; 6349 dmu_object_type_t type; 6350 boolean_t is_metadata; 6351 6352 if (zb->zb_level == ZB_DNODE_LEVEL) 6353 return (0); 6354 6355 if (dump_opt['b'] >= 5 && BP_GET_BIRTH(bp) > 0) { 6356 char blkbuf[BP_SPRINTF_LEN]; 6357 snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); 6358 (void) printf("objset %llu object %llu " 6359 "level %lld offset 0x%llx %s\n", 6360 (u_longlong_t)zb->zb_objset, 6361 (u_longlong_t)zb->zb_object, 6362 (longlong_t)zb->zb_level, 6363 (u_longlong_t)blkid2offset(dnp, bp, zb), 6364 blkbuf); 6365 } 6366 6367 if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) 6368 return (0); 6369 6370 type = BP_GET_TYPE(bp); 6371 6372 zdb_count_block(zcb, zilog, bp, 6373 (type & DMU_OT_NEWTYPE) ? 
ZDB_OT_OTHER : type); 6374 6375 is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)); 6376 6377 if (!BP_IS_EMBEDDED(bp) && 6378 (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) { 6379 size_t size = BP_GET_PSIZE(bp); 6380 abd_t *abd = abd_alloc(size, B_FALSE); 6381 int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; 6382 6383 /* If it's an intent log block, failure is expected. */ 6384 if (zb->zb_level == ZB_ZIL_LEVEL) 6385 flags |= ZIO_FLAG_SPECULATIVE; 6386 6387 mutex_enter(&spa->spa_scrub_lock); 6388 while (spa->spa_load_verify_bytes > max_inflight_bytes) 6389 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 6390 spa->spa_load_verify_bytes += size; 6391 mutex_exit(&spa->spa_scrub_lock); 6392 6393 zio_nowait(zio_read(NULL, spa, bp, abd, size, 6394 zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb)); 6395 } 6396 6397 zcb->zcb_readfails = 0; 6398 6399 /* only call gethrtime() every 100 blocks */ 6400 static int iters; 6401 if (++iters > 100) 6402 iters = 0; 6403 else 6404 return (0); 6405 6406 if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) { 6407 uint64_t now = gethrtime(); 6408 char buf[10]; 6409 uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize; 6410 uint64_t kb_per_sec = 6411 1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000)); 6412 uint64_t sec_remaining = 6413 (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec; 6414 6415 /* make sure nicenum has enough space */ 6416 _Static_assert(sizeof (buf) >= NN_NUMBUF_SZ, "buf truncated"); 6417 6418 zfs_nicebytes(bytes, buf, sizeof (buf)); 6419 (void) fprintf(stderr, 6420 "\r%5s completed (%4"PRIu64"MB/s) " 6421 "estimated time remaining: " 6422 "%"PRIu64"hr %02"PRIu64"min %02"PRIu64"sec ", 6423 buf, kb_per_sec / 1024, 6424 sec_remaining / 60 / 60, 6425 sec_remaining / 60 % 60, 6426 sec_remaining % 60); 6427 6428 zcb->zcb_lastprint = now; 6429 } 6430 6431 return (0); 6432 } 6433 6434 static void 6435 zdb_leak(void *arg, uint64_t start, uint64_t size) 6436 { 6437 vdev_t *vd = arg; 6438 6439 (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n", 6440 (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size); 6441 } 6442 6443 static metaslab_ops_t zdb_metaslab_ops = { 6444 NULL /* alloc */ 6445 }; 6446 6447 static int 6448 load_unflushed_svr_segs_cb(spa_t *spa, space_map_entry_t *sme, 6449 uint64_t txg, void *arg) 6450 { 6451 spa_vdev_removal_t *svr = arg; 6452 6453 uint64_t offset = sme->sme_offset; 6454 uint64_t size = sme->sme_run; 6455 6456 /* skip vdevs we don't care about */ 6457 if (sme->sme_vdev != svr->svr_vdev_id) 6458 return (0); 6459 6460 vdev_t *vd = vdev_lookup_top(spa, sme->sme_vdev); 6461 metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 6462 ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); 6463 6464 if (txg < metaslab_unflushed_txg(ms)) 6465 return (0); 6466 6467 if (sme->sme_type == SM_ALLOC) 6468 zfs_range_tree_add(svr->svr_allocd_segs, offset, size); 6469 else 6470 zfs_range_tree_remove(svr->svr_allocd_segs, offset, size); 6471 6472 return (0); 6473 } 6474 6475 static void 6476 claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 6477 uint64_t size, void *arg) 6478 { 6479 (void) inner_offset, (void) arg; 6480 6481 /* 6482 * This callback was called through a remap from 6483 * a device being removed. Therefore, the vdev that 6484 * this callback is applied to is a concrete 6485 * vdev. 
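 *
 * In other words, vdev_op_remap() has already translated the
 * removed vdev's <offset, size> into a segment on this concrete
 * destination vdev, so it can be claimed directly via
 * metaslab_claim_impl() below.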
6486 */ 6487 ASSERT(vdev_is_concrete(vd)); 6488 6489 VERIFY0(metaslab_claim_impl(vd, offset, size, 6490 spa_min_claim_txg(vd->vdev_spa))); 6491 } 6492 6493 static void 6494 claim_segment_cb(void *arg, uint64_t offset, uint64_t size) 6495 { 6496 vdev_t *vd = arg; 6497 6498 vdev_indirect_ops.vdev_op_remap(vd, offset, size, 6499 claim_segment_impl_cb, NULL); 6500 } 6501 6502 /* 6503 * After accounting for all allocated blocks that are directly referenced, 6504 * we might have missed a reference to a block from a partially complete 6505 * (and thus unused) indirect mapping object. We perform a secondary pass 6506 * through the metaslabs we have already mapped and claim the destination 6507 * blocks. 6508 */ 6509 static void 6510 zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) 6511 { 6512 if (dump_opt['L']) 6513 return; 6514 6515 if (spa->spa_vdev_removal == NULL) 6516 return; 6517 6518 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6519 6520 spa_vdev_removal_t *svr = spa->spa_vdev_removal; 6521 vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); 6522 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 6523 6524 ASSERT0(zfs_range_tree_space(svr->svr_allocd_segs)); 6525 6526 zfs_range_tree_t *allocs = zfs_range_tree_create_flags( 6527 NULL, ZFS_RANGE_SEG64, NULL, 0, 0, 6528 0, "zdb_claim_removing:allocs"); 6529 for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { 6530 metaslab_t *msp = vd->vdev_ms[msi]; 6531 6532 ASSERT0(zfs_range_tree_space(allocs)); 6533 if (msp->ms_sm != NULL) 6534 VERIFY0(space_map_load(msp->ms_sm, allocs, SM_ALLOC)); 6535 zfs_range_tree_vacate(allocs, zfs_range_tree_add, 6536 svr->svr_allocd_segs); 6537 } 6538 zfs_range_tree_destroy(allocs); 6539 6540 iterate_through_spacemap_logs(spa, load_unflushed_svr_segs_cb, svr); 6541 6542 /* 6543 * Clear everything past what has been synced, 6544 * because we have not allocated mappings for 6545 * it yet. 
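 * (vdev_indirect_mapping_max_offset() marks the frontier of the
 * synced mapping; source offsets at or beyond it have no
 * destination segments to claim yet.)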
6546 */
6547 zfs_range_tree_clear(svr->svr_allocd_segs,
6548 vdev_indirect_mapping_max_offset(vim),
6549 vd->vdev_asize - vdev_indirect_mapping_max_offset(vim));
6550
6551 zcb->zcb_removing_size += zfs_range_tree_space(svr->svr_allocd_segs);
6552 zfs_range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd);
6553
6554 spa_config_exit(spa, SCL_CONFIG, FTAG);
6555 }
6556
6557 static int
6558 increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
6559 dmu_tx_t *tx)
6560 {
6561 (void) tx;
6562 zdb_cb_t *zcb = arg;
6563 spa_t *spa = zcb->zcb_spa;
6564 vdev_t *vd;
6565 const dva_t *dva = &bp->blk_dva[0];
6566
6567 ASSERT(!bp_freed);
6568 ASSERT(!dump_opt['L']);
6569 ASSERT3U(BP_GET_NDVAS(bp), ==, 1);
6570
6571 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
6572 vd = vdev_lookup_top(zcb->zcb_spa, DVA_GET_VDEV(dva));
6573 ASSERT3P(vd, !=, NULL);
6574 spa_config_exit(spa, SCL_VDEV, FTAG);
6575
6576 ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
6577 ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL);
6578
6579 vdev_indirect_mapping_increment_obsolete_count(
6580 vd->vdev_indirect_mapping,
6581 DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva),
6582 zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
6583
6584 return (0);
6585 }
6586
6587 static uint32_t *
6588 zdb_load_obsolete_counts(vdev_t *vd)
6589 {
6590 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
6591 spa_t *spa = vd->vdev_spa;
6592 spa_condensing_indirect_phys_t *scip =
6593 &spa->spa_condensing_indirect_phys;
6594 uint64_t obsolete_sm_object;
6595 uint32_t *counts;
6596
6597 VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
6598 EQUIV(obsolete_sm_object != 0, vd->vdev_obsolete_sm != NULL);
6599 counts = vdev_indirect_mapping_load_obsolete_counts(vim);
6600 if (vd->vdev_obsolete_sm != NULL) {
6601 vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
6602 vd->vdev_obsolete_sm);
6603 }
6604 if (scip->scip_vdev == vd->vdev_id &&
6605 scip->scip_prev_obsolete_sm_object != 0) {
6606 space_map_t *prev_obsolete_sm = NULL;
6607 VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
6608 scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
6609 vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
6610 prev_obsolete_sm);
6611 space_map_close(prev_obsolete_sm);
6612 }
6613 return (counts);
6614 }
6615
6616 typedef struct checkpoint_sm_exclude_entry_arg {
6617 vdev_t *cseea_vd;
6618 uint64_t cseea_checkpoint_size;
6619 } checkpoint_sm_exclude_entry_arg_t;
6620
6621 static int
6622 checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg)
6623 {
6624 checkpoint_sm_exclude_entry_arg_t *cseea = arg;
6625 vdev_t *vd = cseea->cseea_vd;
6626 metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
6627 uint64_t end = sme->sme_offset + sme->sme_run;
6628
6629 ASSERT(sme->sme_type == SM_FREE);
6630
6631 /*
6632 * Since the vdev_checkpoint_sm exists at the vdev level
6633 * and the ms_sm space maps exist at the metaslab level,
6634 * an entry in the checkpoint space map could theoretically
6635 * cross the boundaries of the metaslab that it belongs to.
6636 *
6637 * In reality, because of the way that we populate and
6638 * manipulate the checkpoint's space maps currently,
6639 * there shouldn't be any entries that cross metaslabs.
6640 * Hence the assertion below.
6641 *
6642 * That said, there is no fundamental requirement that
6643 * the checkpoint's space map entries should not cross
6644 *
So if needed we could add code 6645 * that handles metaslab-crossing segments in the future. 6646 */ 6647 VERIFY3U(sme->sme_offset, >=, ms->ms_start); 6648 VERIFY3U(end, <=, ms->ms_start + ms->ms_size); 6649 6650 /* 6651 * By removing the entry from the allocated segments we 6652 * also verify that the entry is there to begin with. 6653 */ 6654 mutex_enter(&ms->ms_lock); 6655 zfs_range_tree_remove(ms->ms_allocatable, sme->sme_offset, 6656 sme->sme_run); 6657 mutex_exit(&ms->ms_lock); 6658 6659 cseea->cseea_checkpoint_size += sme->sme_run; 6660 return (0); 6661 } 6662 6663 static void 6664 zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb) 6665 { 6666 spa_t *spa = vd->vdev_spa; 6667 space_map_t *checkpoint_sm = NULL; 6668 uint64_t checkpoint_sm_obj; 6669 6670 /* 6671 * If there is no vdev_top_zap, we are in a pool whose 6672 * version predates the pool checkpoint feature. 6673 */ 6674 if (vd->vdev_top_zap == 0) 6675 return; 6676 6677 /* 6678 * If there is no reference of the vdev_checkpoint_sm in 6679 * the vdev_top_zap, then one of the following scenarios 6680 * is true: 6681 * 6682 * 1] There is no checkpoint 6683 * 2] There is a checkpoint, but no checkpointed blocks 6684 * have been freed yet 6685 * 3] The current vdev is indirect 6686 * 6687 * In these cases we return immediately. 6688 */ 6689 if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap, 6690 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) 6691 return; 6692 6693 VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap, 6694 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, 6695 &checkpoint_sm_obj)); 6696 6697 checkpoint_sm_exclude_entry_arg_t cseea; 6698 cseea.cseea_vd = vd; 6699 cseea.cseea_checkpoint_size = 0; 6700 6701 VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), 6702 checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); 6703 6704 VERIFY0(space_map_iterate(checkpoint_sm, 6705 space_map_length(checkpoint_sm), 6706 checkpoint_sm_exclude_entry_cb, &cseea)); 6707 space_map_close(checkpoint_sm); 6708 6709 zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size; 6710 } 6711 6712 static void 6713 zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb) 6714 { 6715 ASSERT(!dump_opt['L']); 6716 6717 vdev_t *rvd = spa->spa_root_vdev; 6718 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 6719 ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id); 6720 zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb); 6721 } 6722 } 6723 6724 static int 6725 count_unflushed_space_cb(spa_t *spa, space_map_entry_t *sme, 6726 uint64_t txg, void *arg) 6727 { 6728 int64_t *ualloc_space = arg; 6729 6730 uint64_t offset = sme->sme_offset; 6731 uint64_t vdev_id = sme->sme_vdev; 6732 6733 vdev_t *vd = vdev_lookup_top(spa, vdev_id); 6734 if (!vdev_is_concrete(vd)) 6735 return (0); 6736 6737 metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 6738 ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); 6739 6740 if (txg < metaslab_unflushed_txg(ms)) 6741 return (0); 6742 6743 if (sme->sme_type == SM_ALLOC) 6744 *ualloc_space += sme->sme_run; 6745 else 6746 *ualloc_space -= sme->sme_run; 6747 6748 return (0); 6749 } 6750 6751 static int64_t 6752 get_unflushed_alloc_space(spa_t *spa) 6753 { 6754 if (dump_opt['L']) 6755 return (0); 6756 6757 int64_t ualloc_space = 0; 6758 iterate_through_spacemap_logs(spa, count_unflushed_space_cb, 6759 &ualloc_space); 6760 return (ualloc_space); 6761 } 6762 6763 static int 6764 load_unflushed_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) 6765 { 6766 
maptype_t *uic_maptype = arg; 6767 6768 uint64_t offset = sme->sme_offset; 6769 uint64_t size = sme->sme_run; 6770 uint64_t vdev_id = sme->sme_vdev; 6771 6772 vdev_t *vd = vdev_lookup_top(spa, vdev_id); 6773 6774 /* skip indirect vdevs */ 6775 if (!vdev_is_concrete(vd)) 6776 return (0); 6777 6778 metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 6779 6780 ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); 6781 ASSERT(*uic_maptype == SM_ALLOC || *uic_maptype == SM_FREE); 6782 6783 if (txg < metaslab_unflushed_txg(ms)) 6784 return (0); 6785 6786 if (*uic_maptype == sme->sme_type) 6787 zfs_range_tree_add(ms->ms_allocatable, offset, size); 6788 else 6789 zfs_range_tree_remove(ms->ms_allocatable, offset, size); 6790 6791 return (0); 6792 } 6793 6794 static void 6795 load_unflushed_to_ms_allocatables(spa_t *spa, maptype_t maptype) 6796 { 6797 iterate_through_spacemap_logs(spa, load_unflushed_cb, &maptype); 6798 } 6799 6800 static void 6801 load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype) 6802 { 6803 vdev_t *rvd = spa->spa_root_vdev; 6804 for (uint64_t i = 0; i < rvd->vdev_children; i++) { 6805 vdev_t *vd = rvd->vdev_child[i]; 6806 6807 ASSERT3U(i, ==, vd->vdev_id); 6808 6809 if (vd->vdev_ops == &vdev_indirect_ops) 6810 continue; 6811 6812 for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { 6813 metaslab_t *msp = vd->vdev_ms[m]; 6814 6815 (void) fprintf(stderr, 6816 "\rloading concrete vdev %llu, " 6817 "metaslab %llu of %llu ...", 6818 (longlong_t)vd->vdev_id, 6819 (longlong_t)msp->ms_id, 6820 (longlong_t)vd->vdev_ms_count); 6821 6822 mutex_enter(&msp->ms_lock); 6823 zfs_range_tree_vacate(msp->ms_allocatable, NULL, NULL); 6824 6825 /* 6826 * We don't want to spend the CPU manipulating the 6827 * size-ordered tree, so clear the range_tree ops. 6828 */ 6829 msp->ms_allocatable->rt_ops = NULL; 6830 6831 if (msp->ms_sm != NULL) { 6832 VERIFY0(space_map_load(msp->ms_sm, 6833 msp->ms_allocatable, maptype)); 6834 } 6835 if (!msp->ms_loaded) 6836 msp->ms_loaded = B_TRUE; 6837 mutex_exit(&msp->ms_lock); 6838 } 6839 } 6840 6841 load_unflushed_to_ms_allocatables(spa, maptype); 6842 } 6843 6844 /* 6845 * vm_idxp is an in-out parameter which (for indirect vdevs) is the 6846 * index in vim_entries that has the first entry in this metaslab. 6847 * On return, it will be set to the first entry after this metaslab. 6848 */ 6849 static void 6850 load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp, 6851 uint64_t *vim_idxp) 6852 { 6853 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 6854 6855 mutex_enter(&msp->ms_lock); 6856 zfs_range_tree_vacate(msp->ms_allocatable, NULL, NULL); 6857 6858 /* 6859 * We don't want to spend the CPU manipulating the 6860 * size-ordered tree, so clear the range_tree ops. 6861 */ 6862 msp->ms_allocatable->rt_ops = NULL; 6863 6864 for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim); 6865 (*vim_idxp)++) { 6866 vdev_indirect_mapping_entry_phys_t *vimep = 6867 &vim->vim_entries[*vim_idxp]; 6868 uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); 6869 uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst); 6870 ASSERT3U(ent_offset, >=, msp->ms_start); 6871 if (ent_offset >= msp->ms_start + msp->ms_size) 6872 break; 6873 6874 /* 6875 * Mappings do not cross metaslab boundaries, 6876 * because we create them by walking the metaslabs. 
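 * Hence the assertion below that the entry's whole
 * [ent_offset, ent_offset + ent_len) range stays inside this
 * metaslab.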
6877 */ 6878 ASSERT3U(ent_offset + ent_len, <=, 6879 msp->ms_start + msp->ms_size); 6880 zfs_range_tree_add(msp->ms_allocatable, ent_offset, ent_len); 6881 } 6882 6883 if (!msp->ms_loaded) 6884 msp->ms_loaded = B_TRUE; 6885 mutex_exit(&msp->ms_lock); 6886 } 6887 6888 static void 6889 zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb) 6890 { 6891 ASSERT(!dump_opt['L']); 6892 6893 vdev_t *rvd = spa->spa_root_vdev; 6894 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 6895 vdev_t *vd = rvd->vdev_child[c]; 6896 6897 ASSERT3U(c, ==, vd->vdev_id); 6898 6899 if (vd->vdev_ops != &vdev_indirect_ops) 6900 continue; 6901 6902 /* 6903 * Note: we don't check for mapping leaks on 6904 * removing vdevs because their ms_allocatable's 6905 * are used to look for leaks in allocated space. 6906 */ 6907 zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd); 6908 6909 /* 6910 * Normally, indirect vdevs don't have any 6911 * metaslabs. We want to set them up for 6912 * zio_claim(). 6913 */ 6914 vdev_metaslab_group_create(vd); 6915 VERIFY0(vdev_metaslab_init(vd, 0)); 6916 6917 vdev_indirect_mapping_t *vim __maybe_unused = 6918 vd->vdev_indirect_mapping; 6919 uint64_t vim_idx = 0; 6920 for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { 6921 6922 (void) fprintf(stderr, 6923 "\rloading indirect vdev %llu, " 6924 "metaslab %llu of %llu ...", 6925 (longlong_t)vd->vdev_id, 6926 (longlong_t)vd->vdev_ms[m]->ms_id, 6927 (longlong_t)vd->vdev_ms_count); 6928 6929 load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m], 6930 &vim_idx); 6931 } 6932 ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim)); 6933 } 6934 } 6935 6936 static void 6937 zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) 6938 { 6939 zcb->zcb_spa = spa; 6940 6941 if (dump_opt['L']) 6942 return; 6943 6944 dsl_pool_t *dp = spa->spa_dsl_pool; 6945 vdev_t *rvd = spa->spa_root_vdev; 6946 6947 /* 6948 * We are going to be changing the meaning of the metaslab's 6949 * ms_allocatable. Ensure that the allocator doesn't try to 6950 * use the tree. 6951 */ 6952 spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; 6953 spa->spa_log_class->mc_ops = &zdb_metaslab_ops; 6954 spa->spa_embedded_log_class->mc_ops = &zdb_metaslab_ops; 6955 spa->spa_special_embedded_log_class->mc_ops = &zdb_metaslab_ops; 6956 6957 zcb->zcb_vd_obsolete_counts = 6958 umem_zalloc(rvd->vdev_children * sizeof (uint32_t *), 6959 UMEM_NOFAIL); 6960 6961 /* 6962 * For leak detection, we overload the ms_allocatable trees 6963 * to contain allocated segments instead of free segments. 6964 * As a result, we can't use the normal metaslab_load/unload 6965 * interfaces. 6966 */ 6967 zdb_leak_init_prepare_indirect_vdevs(spa, zcb); 6968 load_concrete_ms_allocatable_trees(spa, SM_ALLOC); 6969 6970 /* 6971 * On load_concrete_ms_allocatable_trees() we loaded all the 6972 * allocated entries from the ms_sm to the ms_allocatable for 6973 * each metaslab. If the pool has a checkpoint or is in the 6974 * middle of discarding a checkpoint, some of these blocks 6975 * may have been freed but their ms_sm may not have been 6976 * updated because they are referenced by the checkpoint. In 6977 * order to avoid false-positives during leak-detection, we 6978 * go through the vdev's checkpoint space map and exclude all 6979 * its entries from their relevant ms_allocatable. 6980 * 6981 * We also aggregate the space held by the checkpoint and add 6982 * it to zcb_checkpoint_size. 
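 * That total is cross-checked against spa_get_checkpoint_space()
 * below, once every top-level vdev has been visited.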
6983 * 6984 * Note that at this point we are also verifying that all the 6985 * entries on the checkpoint_sm are marked as allocated in 6986 * the ms_sm of their relevant metaslab. 6987 * [see comment in checkpoint_sm_exclude_entry_cb()] 6988 */ 6989 zdb_leak_init_exclude_checkpoint(spa, zcb); 6990 ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa)); 6991 6992 /* for cleaner progress output */ 6993 (void) fprintf(stderr, "\n"); 6994 6995 if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { 6996 ASSERT(spa_feature_is_enabled(spa, 6997 SPA_FEATURE_DEVICE_REMOVAL)); 6998 (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj, 6999 increment_indirect_mapping_cb, zcb, NULL); 7000 } 7001 } 7002 7003 static boolean_t 7004 zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb) 7005 { 7006 boolean_t leaks = B_FALSE; 7007 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 7008 uint64_t total_leaked = 0; 7009 boolean_t are_precise = B_FALSE; 7010 7011 ASSERT(vim != NULL); 7012 7013 for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) { 7014 vdev_indirect_mapping_entry_phys_t *vimep = 7015 &vim->vim_entries[i]; 7016 uint64_t obsolete_bytes = 0; 7017 uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); 7018 metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 7019 7020 /* 7021 * This is not very efficient but it's easy to 7022 * verify correctness. 7023 */ 7024 for (uint64_t inner_offset = 0; 7025 inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst); 7026 inner_offset += 1ULL << vd->vdev_ashift) { 7027 if (zfs_range_tree_contains(msp->ms_allocatable, 7028 offset + inner_offset, 1ULL << vd->vdev_ashift)) { 7029 obsolete_bytes += 1ULL << vd->vdev_ashift; 7030 } 7031 } 7032 7033 int64_t bytes_leaked = obsolete_bytes - 7034 zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]; 7035 ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=, 7036 zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]); 7037 7038 VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); 7039 if (bytes_leaked != 0 && (are_precise || dump_opt['d'] >= 5)) { 7040 (void) printf("obsolete indirect mapping count " 7041 "mismatch on %llu:%llx:%llx : %llx bytes leaked\n", 7042 (u_longlong_t)vd->vdev_id, 7043 (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep), 7044 (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), 7045 (u_longlong_t)bytes_leaked); 7046 } 7047 total_leaked += ABS(bytes_leaked); 7048 } 7049 7050 VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); 7051 if (!are_precise && total_leaked > 0) { 7052 int pct_leaked = total_leaked * 100 / 7053 vdev_indirect_mapping_bytes_mapped(vim); 7054 (void) printf("cannot verify obsolete indirect mapping " 7055 "counts of vdev %llu because precise feature was not " 7056 "enabled when it was removed: %d%% (%llx bytes) of mapping" 7057 "unreferenced\n", 7058 (u_longlong_t)vd->vdev_id, pct_leaked, 7059 (u_longlong_t)total_leaked); 7060 } else if (total_leaked > 0) { 7061 (void) printf("obsolete indirect mapping count mismatch " 7062 "for vdev %llu -- %llx total bytes mismatched\n", 7063 (u_longlong_t)vd->vdev_id, 7064 (u_longlong_t)total_leaked); 7065 leaks |= B_TRUE; 7066 } 7067 7068 vdev_indirect_mapping_free_obsolete_counts(vim, 7069 zcb->zcb_vd_obsolete_counts[vd->vdev_id]); 7070 zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL; 7071 7072 return (leaks); 7073 } 7074 7075 static boolean_t 7076 zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb) 7077 { 7078 if (dump_opt['L']) 7079 return (B_FALSE); 7080 7081 boolean_t leaks = B_FALSE; 7082 vdev_t *rvd = spa->spa_root_vdev; 7083 for (unsigned c 
= 0; c < rvd->vdev_children; c++) { 7084 vdev_t *vd = rvd->vdev_child[c]; 7085 7086 if (zcb->zcb_vd_obsolete_counts[c] != NULL) { 7087 leaks |= zdb_check_for_obsolete_leaks(vd, zcb); 7088 } 7089 7090 for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { 7091 metaslab_t *msp = vd->vdev_ms[m]; 7092 ASSERT3P(msp->ms_group, ==, (msp->ms_group->mg_class == 7093 spa_embedded_log_class(spa) || 7094 msp->ms_group->mg_class == 7095 spa_special_embedded_log_class(spa)) ? 7096 vd->vdev_log_mg : vd->vdev_mg); 7097 7098 /* 7099 * ms_allocatable has been overloaded 7100 * to contain allocated segments. Now that 7101 * we finished traversing all blocks, any 7102 * block that remains in the ms_allocatable 7103 * represents an allocated block that we 7104 * did not claim during the traversal. 7105 * Claimed blocks would have been removed 7106 * from the ms_allocatable. For indirect 7107 * vdevs, space remaining in the tree 7108 * represents parts of the mapping that are 7109 * not referenced, which is not a bug. 7110 */ 7111 if (vd->vdev_ops == &vdev_indirect_ops) { 7112 zfs_range_tree_vacate(msp->ms_allocatable, 7113 NULL, NULL); 7114 } else { 7115 zfs_range_tree_vacate(msp->ms_allocatable, 7116 zdb_leak, vd); 7117 } 7118 if (msp->ms_loaded) { 7119 msp->ms_loaded = B_FALSE; 7120 } 7121 } 7122 } 7123 7124 umem_free(zcb->zcb_vd_obsolete_counts, 7125 rvd->vdev_children * sizeof (uint32_t *)); 7126 zcb->zcb_vd_obsolete_counts = NULL; 7127 7128 return (leaks); 7129 } 7130 7131 static int 7132 count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 7133 { 7134 (void) tx; 7135 zdb_cb_t *zcb = arg; 7136 7137 if (dump_opt['b'] >= 5) { 7138 char blkbuf[BP_SPRINTF_LEN]; 7139 snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); 7140 (void) printf("[%s] %s\n", 7141 "deferred free", blkbuf); 7142 } 7143 zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED); 7144 return (0); 7145 } 7146 7147 /* 7148 * Iterate over livelists which have been destroyed by the user but 7149 * are still present in the MOS, waiting to be freed 7150 */ 7151 static void 7152 iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg) 7153 { 7154 objset_t *mos = spa->spa_meta_objset; 7155 uint64_t zap_obj; 7156 int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, 7157 DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj); 7158 if (err == ENOENT) 7159 return; 7160 ASSERT0(err); 7161 7162 zap_cursor_t zc; 7163 zap_attribute_t *attrp = zap_attribute_alloc(); 7164 dsl_deadlist_t ll; 7165 /* NULL out os prior to dsl_deadlist_open in case it's garbage */ 7166 ll.dl_os = NULL; 7167 for (zap_cursor_init(&zc, mos, zap_obj); 7168 zap_cursor_retrieve(&zc, attrp) == 0; 7169 (void) zap_cursor_advance(&zc)) { 7170 VERIFY0(dsl_deadlist_open(&ll, mos, attrp->za_first_integer)); 7171 func(&ll, arg); 7172 dsl_deadlist_close(&ll); 7173 } 7174 zap_cursor_fini(&zc); 7175 zap_attribute_free(attrp); 7176 } 7177 7178 static int 7179 bpobj_count_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 7180 dmu_tx_t *tx) 7181 { 7182 ASSERT(!bp_freed); 7183 return (count_block_cb(arg, bp, tx)); 7184 } 7185 7186 static int 7187 livelist_entry_count_blocks_cb(void *args, dsl_deadlist_entry_t *dle) 7188 { 7189 zdb_cb_t *zbc = args; 7190 bplist_t blks; 7191 bplist_create(&blks); 7192 /* determine which blocks have been alloc'd but not freed */ 7193 VERIFY0(dsl_process_sub_livelist(&dle->dle_bpobj, &blks, NULL, NULL)); 7194 /* count those blocks */ 7195 (void) bplist_iterate(&blks, count_block_cb, zbc, NULL); 7196 bplist_destroy(&blks); 7197 return (0); 7198 } 7199 7200 static 
void 7201 livelist_count_blocks(dsl_deadlist_t *ll, void *arg) 7202 { 7203 dsl_deadlist_iterate(ll, livelist_entry_count_blocks_cb, arg); 7204 } 7205 7206 /* 7207 * Count the blocks in the livelists that have been destroyed by the user 7208 * but haven't yet been freed. 7209 */ 7210 static void 7211 deleted_livelists_count_blocks(spa_t *spa, zdb_cb_t *zbc) 7212 { 7213 iterate_deleted_livelists(spa, livelist_count_blocks, zbc); 7214 } 7215 7216 static void 7217 dump_livelist_cb(dsl_deadlist_t *ll, void *arg) 7218 { 7219 ASSERT0P(arg); 7220 global_feature_count[SPA_FEATURE_LIVELIST]++; 7221 dump_blkptr_list(ll, "Deleted Livelist"); 7222 dsl_deadlist_iterate(ll, sublivelist_verify_lightweight, NULL); 7223 } 7224 7225 /* 7226 * Print out, register object references to, and increment feature counts for 7227 * livelists that have been destroyed by the user but haven't yet been freed. 7228 */ 7229 static void 7230 deleted_livelists_dump_mos(spa_t *spa) 7231 { 7232 uint64_t zap_obj; 7233 objset_t *mos = spa->spa_meta_objset; 7234 int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, 7235 DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj); 7236 if (err == ENOENT) 7237 return; 7238 mos_obj_refd(zap_obj); 7239 iterate_deleted_livelists(spa, dump_livelist_cb, NULL); 7240 } 7241 7242 static int 7243 zdb_brt_entry_compare(const void *zcn1, const void *zcn2) 7244 { 7245 const dva_t *dva1 = &((const zdb_brt_entry_t *)zcn1)->zbre_dva; 7246 const dva_t *dva2 = &((const zdb_brt_entry_t *)zcn2)->zbre_dva; 7247 int cmp; 7248 7249 cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); 7250 if (cmp == 0) 7251 cmp = TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2)); 7252 7253 return (cmp); 7254 } 7255 7256 static int 7257 dump_block_stats(spa_t *spa) 7258 { 7259 zdb_cb_t *zcb; 7260 zdb_blkstats_t *zb, *tzb; 7261 uint64_t norm_alloc, norm_space, total_alloc, total_found; 7262 int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | 7263 TRAVERSE_NO_DECRYPT | TRAVERSE_HARD; 7264 boolean_t leaks = B_FALSE; 7265 int e, c, err; 7266 bp_embedded_type_t i; 7267 7268 ddt_prefetch_all(spa); 7269 7270 zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL); 7271 7272 if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) { 7273 avl_create(&zcb->zcb_brt, zdb_brt_entry_compare, 7274 sizeof (zdb_brt_entry_t), 7275 offsetof(zdb_brt_entry_t, zbre_node)); 7276 zcb->zcb_brt_is_active = B_TRUE; 7277 } 7278 7279 (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n", 7280 (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", 7281 (dump_opt['c'] == 1) ? "metadata " : "", 7282 dump_opt['c'] ? "checksums " : "", 7283 (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "", 7284 !dump_opt['L'] ? "nothing leaked " : ""); 7285 7286 /* 7287 * When leak detection is enabled we load all space maps as SM_ALLOC 7288 * maps, then traverse the pool claiming each block we discover. If 7289 * the pool is perfectly consistent, the segment trees will be empty 7290 * when we're done. Anything left over is a leak; any block we can't 7291 * claim (because it's not part of any space map) is a double 7292 * allocation, reference to a freed block, or an unclaimed log block. 7293 * 7294 * When leak detection is disabled (-L option) we still traverse the 7295 * pool claiming each block we discover, but we skip opening any space 7296 * maps. 7297 */ 7298 zdb_leak_init(spa, zcb); 7299 7300 /* 7301 * If there's a deferred-free bplist, process that first. 
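 * Blocks on this bplist were freed very recently but are still
 * charged as allocated in the space maps, so they must be counted
 * here (as ZDB_OT_DEFERRED) or they would be reported as leaked.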
7302 */ 7303 (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj, 7304 bpobj_count_block_cb, zcb, NULL); 7305 7306 if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { 7307 (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, 7308 bpobj_count_block_cb, zcb, NULL); 7309 } 7310 7311 zdb_claim_removing(spa, zcb); 7312 7313 if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { 7314 VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset, 7315 spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb, 7316 zcb, NULL)); 7317 } 7318 7319 deleted_livelists_count_blocks(spa, zcb); 7320 7321 if (dump_opt['c'] > 1) 7322 flags |= TRAVERSE_PREFETCH_DATA; 7323 7324 zcb->zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa)); 7325 zcb->zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa)); 7326 zcb->zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa)); 7327 zcb->zcb_totalasize += 7328 metaslab_class_get_alloc(spa_embedded_log_class(spa)); 7329 zcb->zcb_totalasize += 7330 metaslab_class_get_alloc(spa_special_embedded_log_class(spa)); 7331 zcb->zcb_start = zcb->zcb_lastprint = gethrtime(); 7332 err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, zcb); 7333 7334 /* 7335 * If we've traversed the data blocks then we need to wait for those 7336 * I/Os to complete. We leverage "The Godfather" zio to wait on 7337 * all async I/Os to complete. 7338 */ 7339 if (dump_opt['c']) { 7340 for (c = 0; c < max_ncpus; c++) { 7341 (void) zio_wait(spa->spa_async_zio_root[c]); 7342 spa->spa_async_zio_root[c] = zio_root(spa, NULL, NULL, 7343 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 7344 ZIO_FLAG_GODFATHER); 7345 } 7346 } 7347 ASSERT0(spa->spa_load_verify_bytes); 7348 7349 /* 7350 * Done after zio_wait() since zcb_haderrors is modified in 7351 * zdb_blkptr_done() 7352 */ 7353 zcb->zcb_haderrors |= err; 7354 7355 if (zcb->zcb_haderrors) { 7356 (void) printf("\nError counts:\n\n"); 7357 (void) printf("\t%5s %s\n", "errno", "count"); 7358 for (e = 0; e < 256; e++) { 7359 if (zcb->zcb_errors[e] != 0) { 7360 (void) printf("\t%5d %llu\n", 7361 e, (u_longlong_t)zcb->zcb_errors[e]); 7362 } 7363 } 7364 } 7365 7366 /* 7367 * Report any leaked segments. 7368 */ 7369 leaks |= zdb_leak_fini(spa, zcb); 7370 7371 tzb = &zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL]; 7372 7373 norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 7374 norm_space = metaslab_class_get_space(spa_normal_class(spa)); 7375 7376 total_alloc = norm_alloc + 7377 metaslab_class_get_alloc(spa_log_class(spa)) + 7378 metaslab_class_get_alloc(spa_embedded_log_class(spa)) + 7379 metaslab_class_get_alloc(spa_special_embedded_log_class(spa)) + 7380 metaslab_class_get_alloc(spa_special_class(spa)) + 7381 metaslab_class_get_alloc(spa_dedup_class(spa)) + 7382 get_unflushed_alloc_space(spa); 7383 total_found = 7384 tzb->zb_asize - zcb->zcb_dedup_asize - zcb->zcb_clone_asize + 7385 zcb->zcb_removing_size + zcb->zcb_checkpoint_size; 7386 7387 if (total_found == total_alloc && !dump_opt['L']) { 7388 (void) printf("\n\tNo leaks (block sum matches space" 7389 " maps exactly)\n"); 7390 } else if (!dump_opt['L']) { 7391 (void) printf("block traversal size %llu != alloc %llu " 7392 "(%s %lld)\n", 7393 (u_longlong_t)total_found, 7394 (u_longlong_t)total_alloc, 7395 (dump_opt['L']) ? 
"unreachable" : "leaked", 7396 (longlong_t)(total_alloc - total_found)); 7397 } 7398 7399 if (tzb->zb_count == 0) { 7400 umem_free(zcb, sizeof (zdb_cb_t)); 7401 return (2); 7402 } 7403 7404 (void) printf("\n"); 7405 (void) printf("\t%-16s %14llu\n", "bp count:", 7406 (u_longlong_t)tzb->zb_count); 7407 (void) printf("\t%-16s %14llu\n", "ganged count:", 7408 (longlong_t)tzb->zb_gangs); 7409 (void) printf("\t%-16s %14llu avg: %6llu\n", "bp logical:", 7410 (u_longlong_t)tzb->zb_lsize, 7411 (u_longlong_t)(tzb->zb_lsize / tzb->zb_count)); 7412 (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", 7413 "bp physical:", (u_longlong_t)tzb->zb_psize, 7414 (u_longlong_t)(tzb->zb_psize / tzb->zb_count), 7415 (double)tzb->zb_lsize / tzb->zb_psize); 7416 (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", 7417 "bp allocated:", (u_longlong_t)tzb->zb_asize, 7418 (u_longlong_t)(tzb->zb_asize / tzb->zb_count), 7419 (double)tzb->zb_lsize / tzb->zb_asize); 7420 (void) printf("\t%-16s %14llu ref>1: %6llu deduplication: %6.2f\n", 7421 "bp deduped:", (u_longlong_t)zcb->zcb_dedup_asize, 7422 (u_longlong_t)zcb->zcb_dedup_blocks, 7423 (double)zcb->zcb_dedup_asize / tzb->zb_asize + 1.0); 7424 (void) printf("\t%-16s %14llu count: %6llu\n", 7425 "bp cloned:", (u_longlong_t)zcb->zcb_clone_asize, 7426 (u_longlong_t)zcb->zcb_clone_blocks); 7427 (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:", 7428 (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); 7429 7430 if (spa_special_class(spa)->mc_allocator[0].mca_rotor != NULL) { 7431 uint64_t alloc = metaslab_class_get_alloc( 7432 spa_special_class(spa)); 7433 uint64_t space = metaslab_class_get_space( 7434 spa_special_class(spa)); 7435 7436 (void) printf("\t%-16s %14llu used: %5.2f%%\n", 7437 "Special class", (u_longlong_t)alloc, 7438 100.0 * alloc / space); 7439 } 7440 7441 if (spa_dedup_class(spa)->mc_allocator[0].mca_rotor != NULL) { 7442 uint64_t alloc = metaslab_class_get_alloc( 7443 spa_dedup_class(spa)); 7444 uint64_t space = metaslab_class_get_space( 7445 spa_dedup_class(spa)); 7446 7447 (void) printf("\t%-16s %14llu used: %5.2f%%\n", 7448 "Dedup class", (u_longlong_t)alloc, 7449 100.0 * alloc / space); 7450 } 7451 7452 if (spa_embedded_log_class(spa)->mc_allocator[0].mca_rotor != NULL) { 7453 uint64_t alloc = metaslab_class_get_alloc( 7454 spa_embedded_log_class(spa)); 7455 uint64_t space = metaslab_class_get_space( 7456 spa_embedded_log_class(spa)); 7457 7458 (void) printf("\t%-16s %14llu used: %5.2f%%\n", 7459 "Embedded log class", (u_longlong_t)alloc, 7460 100.0 * alloc / space); 7461 } 7462 7463 if (spa_special_embedded_log_class(spa)->mc_allocator[0].mca_rotor 7464 != NULL) { 7465 uint64_t alloc = metaslab_class_get_alloc( 7466 spa_special_embedded_log_class(spa)); 7467 uint64_t space = metaslab_class_get_space( 7468 spa_special_embedded_log_class(spa)); 7469 7470 (void) printf("\t%-16s %14llu used: %5.2f%%\n", 7471 "Special embedded log", (u_longlong_t)alloc, 7472 100.0 * alloc / space); 7473 } 7474 7475 for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) { 7476 if (zcb->zcb_embedded_blocks[i] == 0) 7477 continue; 7478 (void) printf("\n"); 7479 (void) printf("\tadditional, non-pointer bps of type %u: " 7480 "%10llu\n", 7481 i, (u_longlong_t)zcb->zcb_embedded_blocks[i]); 7482 7483 if (dump_opt['b'] >= 3) { 7484 (void) printf("\t number of (compressed) bytes: " 7485 "number of bps\n"); 7486 dump_histogram(zcb->zcb_embedded_histogram[i], 7487 sizeof (zcb->zcb_embedded_histogram[i]) / 7488 sizeof 
(zcb->zcb_embedded_histogram[i][0]), 0); 7489 } 7490 } 7491 7492 if (tzb->zb_ditto_samevdev != 0) { 7493 (void) printf("\tDittoed blocks on same vdev: %llu\n", 7494 (longlong_t)tzb->zb_ditto_samevdev); 7495 } 7496 if (tzb->zb_ditto_same_ms != 0) { 7497 (void) printf("\tDittoed blocks in same metaslab: %llu\n", 7498 (longlong_t)tzb->zb_ditto_same_ms); 7499 } 7500 7501 for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) { 7502 vdev_t *vd = spa->spa_root_vdev->vdev_child[v]; 7503 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 7504 7505 if (vim == NULL) { 7506 continue; 7507 } 7508 7509 char mem[32]; 7510 zdb_nicenum(vdev_indirect_mapping_num_entries(vim), 7511 mem, vdev_indirect_mapping_size(vim)); 7512 7513 (void) printf("\tindirect vdev id %llu has %llu segments " 7514 "(%s in memory)\n", 7515 (longlong_t)vd->vdev_id, 7516 (longlong_t)vdev_indirect_mapping_num_entries(vim), mem); 7517 } 7518 7519 if (dump_opt['b'] >= 2) { 7520 int l, t, level; 7521 char csize[32], lsize[32], psize[32], asize[32]; 7522 char avg[32], gang[32]; 7523 (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE" 7524 "\t avg\t comp\t%%Total\tType\n"); 7525 7526 zfs_blkstat_t *mdstats = umem_zalloc(sizeof (zfs_blkstat_t), 7527 UMEM_NOFAIL); 7528 7529 for (t = 0; t <= ZDB_OT_TOTAL; t++) { 7530 const char *typename; 7531 7532 /* make sure nicenum has enough space */ 7533 _Static_assert(sizeof (csize) >= NN_NUMBUF_SZ, 7534 "csize truncated"); 7535 _Static_assert(sizeof (lsize) >= NN_NUMBUF_SZ, 7536 "lsize truncated"); 7537 _Static_assert(sizeof (psize) >= NN_NUMBUF_SZ, 7538 "psize truncated"); 7539 _Static_assert(sizeof (asize) >= NN_NUMBUF_SZ, 7540 "asize truncated"); 7541 _Static_assert(sizeof (avg) >= NN_NUMBUF_SZ, 7542 "avg truncated"); 7543 _Static_assert(sizeof (gang) >= NN_NUMBUF_SZ, 7544 "gang truncated"); 7545 7546 if (t < DMU_OT_NUMTYPES) 7547 typename = dmu_ot[t].ot_name; 7548 else 7549 typename = zdb_ot_extname[t - DMU_OT_NUMTYPES]; 7550 7551 if (zcb->zcb_type[ZB_TOTAL][t].zb_asize == 0) { 7552 (void) printf("%6s\t%5s\t%5s\t%5s" 7553 "\t%5s\t%5s\t%6s\t%s\n", 7554 "-", 7555 "-", 7556 "-", 7557 "-", 7558 "-", 7559 "-", 7560 "-", 7561 typename); 7562 continue; 7563 } 7564 7565 for (l = ZB_TOTAL - 1; l >= -1; l--) { 7566 level = (l == -1 ? 
ZB_TOTAL : l); 7567 zb = &zcb->zcb_type[level][t]; 7568 7569 if (zb->zb_asize == 0) 7570 continue; 7571 7572 if (level != ZB_TOTAL && t < DMU_OT_NUMTYPES && 7573 (level > 0 || DMU_OT_IS_METADATA(t))) { 7574 mdstats->zb_count += zb->zb_count; 7575 mdstats->zb_lsize += zb->zb_lsize; 7576 mdstats->zb_psize += zb->zb_psize; 7577 mdstats->zb_asize += zb->zb_asize; 7578 mdstats->zb_gangs += zb->zb_gangs; 7579 } 7580 7581 if (dump_opt['b'] < 3 && level != ZB_TOTAL) 7582 continue; 7583 7584 if (level == 0 && zb->zb_asize == 7585 zcb->zcb_type[ZB_TOTAL][t].zb_asize) 7586 continue; 7587 7588 zdb_nicenum(zb->zb_count, csize, 7589 sizeof (csize)); 7590 zdb_nicenum(zb->zb_lsize, lsize, 7591 sizeof (lsize)); 7592 zdb_nicenum(zb->zb_psize, psize, 7593 sizeof (psize)); 7594 zdb_nicenum(zb->zb_asize, asize, 7595 sizeof (asize)); 7596 zdb_nicenum(zb->zb_asize / zb->zb_count, avg, 7597 sizeof (avg)); 7598 zdb_nicenum(zb->zb_gangs, gang, sizeof (gang)); 7599 7600 (void) printf("%6s\t%5s\t%5s\t%5s\t%5s" 7601 "\t%5.2f\t%6.2f\t", 7602 csize, lsize, psize, asize, avg, 7603 (double)zb->zb_lsize / zb->zb_psize, 7604 100.0 * zb->zb_asize / tzb->zb_asize); 7605 7606 if (level == ZB_TOTAL) 7607 (void) printf("%s\n", typename); 7608 else 7609 (void) printf(" L%d %s\n", 7610 level, typename); 7611 7612 if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) { 7613 (void) printf("\t number of ganged " 7614 "blocks: %s\n", gang); 7615 } 7616 7617 if (dump_opt['b'] >= 4) { 7618 (void) printf("psize " 7619 "(in 512-byte sectors): " 7620 "number of blocks\n"); 7621 dump_histogram(zb->zb_psize_histogram, 7622 PSIZE_HISTO_SIZE, 0); 7623 } 7624 } 7625 } 7626 zdb_nicenum(mdstats->zb_count, csize, 7627 sizeof (csize)); 7628 zdb_nicenum(mdstats->zb_lsize, lsize, 7629 sizeof (lsize)); 7630 zdb_nicenum(mdstats->zb_psize, psize, 7631 sizeof (psize)); 7632 zdb_nicenum(mdstats->zb_asize, asize, 7633 sizeof (asize)); 7634 zdb_nicenum(mdstats->zb_asize / mdstats->zb_count, avg, 7635 sizeof (avg)); 7636 zdb_nicenum(mdstats->zb_gangs, gang, sizeof (gang)); 7637 7638 (void) printf("%6s\t%5s\t%5s\t%5s\t%5s" 7639 "\t%5.2f\t%6.2f\t", 7640 csize, lsize, psize, asize, avg, 7641 (double)mdstats->zb_lsize / mdstats->zb_psize, 7642 100.0 * mdstats->zb_asize / tzb->zb_asize); 7643 (void) printf("%s\n", "Metadata Total"); 7644 7645 /* Output a table summarizing block sizes in the pool */ 7646 if (dump_opt['b'] >= 2) { 7647 dump_size_histograms(zcb); 7648 } 7649 7650 umem_free(mdstats, sizeof (zfs_blkstat_t)); 7651 } 7652 7653 (void) printf("\n"); 7654 7655 if (leaks) { 7656 umem_free(zcb, sizeof (zdb_cb_t)); 7657 return (2); 7658 } 7659 7660 if (zcb->zcb_haderrors) { 7661 umem_free(zcb, sizeof (zdb_cb_t)); 7662 return (3); 7663 } 7664 7665 umem_free(zcb, sizeof (zdb_cb_t)); 7666 return (0); 7667 } 7668 7669 typedef struct zdb_ddt_entry { 7670 /* key must be first for ddt_key_compare */ 7671 ddt_key_t zdde_key; 7672 uint64_t zdde_ref_blocks; 7673 uint64_t zdde_ref_lsize; 7674 uint64_t zdde_ref_psize; 7675 uint64_t zdde_ref_dsize; 7676 avl_node_t zdde_node; 7677 } zdb_ddt_entry_t; 7678 7679 static int 7680 zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 7681 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 7682 { 7683 (void) zilog, (void) dnp; 7684 avl_tree_t *t = arg; 7685 avl_index_t where; 7686 zdb_ddt_entry_t *zdde, zdde_search; 7687 7688 if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || 7689 BP_IS_EMBEDDED(bp)) 7690 return (0); 7691 7692 if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { 7693 (void) 
printf("traversing objset %llu, %llu objects, " 7694 "%lu blocks so far\n", 7695 (u_longlong_t)zb->zb_objset, 7696 (u_longlong_t)BP_GET_FILL(bp), 7697 avl_numnodes(t)); 7698 } 7699 7700 if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF || 7701 BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) 7702 return (0); 7703 7704 ddt_key_fill(&zdde_search.zdde_key, bp); 7705 7706 zdde = avl_find(t, &zdde_search, &where); 7707 7708 if (zdde == NULL) { 7709 zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL); 7710 zdde->zdde_key = zdde_search.zdde_key; 7711 avl_insert(t, zdde, where); 7712 } 7713 7714 zdde->zdde_ref_blocks += 1; 7715 zdde->zdde_ref_lsize += BP_GET_LSIZE(bp); 7716 zdde->zdde_ref_psize += BP_GET_PSIZE(bp); 7717 zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp); 7718 7719 return (0); 7720 } 7721 7722 static void 7723 dump_simulated_ddt(spa_t *spa) 7724 { 7725 avl_tree_t t; 7726 void *cookie = NULL; 7727 zdb_ddt_entry_t *zdde; 7728 ddt_histogram_t ddh_total = {{{0}}}; 7729 ddt_stat_t dds_total = {0}; 7730 7731 avl_create(&t, ddt_key_compare, 7732 sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node)); 7733 7734 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 7735 7736 (void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | 7737 TRAVERSE_NO_DECRYPT, zdb_ddt_add_cb, &t); 7738 7739 spa_config_exit(spa, SCL_CONFIG, FTAG); 7740 7741 while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) { 7742 uint64_t refcnt = zdde->zdde_ref_blocks; 7743 ASSERT(refcnt != 0); 7744 7745 ddt_stat_t *dds = &ddh_total.ddh_stat[highbit64(refcnt) - 1]; 7746 7747 dds->dds_blocks += zdde->zdde_ref_blocks / refcnt; 7748 dds->dds_lsize += zdde->zdde_ref_lsize / refcnt; 7749 dds->dds_psize += zdde->zdde_ref_psize / refcnt; 7750 dds->dds_dsize += zdde->zdde_ref_dsize / refcnt; 7751 7752 dds->dds_ref_blocks += zdde->zdde_ref_blocks; 7753 dds->dds_ref_lsize += zdde->zdde_ref_lsize; 7754 dds->dds_ref_psize += zdde->zdde_ref_psize; 7755 dds->dds_ref_dsize += zdde->zdde_ref_dsize; 7756 7757 umem_free(zdde, sizeof (*zdde)); 7758 } 7759 7760 avl_destroy(&t); 7761 7762 ddt_histogram_total(&dds_total, &ddh_total); 7763 7764 (void) printf("Simulated DDT histogram:\n"); 7765 7766 zpool_dump_ddt(&dds_total, &ddh_total); 7767 7768 dump_dedup_ratio(&dds_total); 7769 } 7770 7771 static int 7772 verify_device_removal_feature_counts(spa_t *spa) 7773 { 7774 uint64_t dr_feature_refcount = 0; 7775 uint64_t oc_feature_refcount = 0; 7776 uint64_t indirect_vdev_count = 0; 7777 uint64_t precise_vdev_count = 0; 7778 uint64_t obsolete_counts_object_count = 0; 7779 uint64_t obsolete_sm_count = 0; 7780 uint64_t obsolete_counts_count = 0; 7781 uint64_t scip_count = 0; 7782 uint64_t obsolete_bpobj_count = 0; 7783 int ret = 0; 7784 7785 spa_condensing_indirect_phys_t *scip = 7786 &spa->spa_condensing_indirect_phys; 7787 if (scip->scip_next_mapping_object != 0) { 7788 vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev]; 7789 ASSERT(scip->scip_prev_obsolete_sm_object != 0); 7790 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); 7791 7792 (void) printf("Condensing indirect vdev %llu: new mapping " 7793 "object %llu, prev obsolete sm %llu\n", 7794 (u_longlong_t)scip->scip_vdev, 7795 (u_longlong_t)scip->scip_next_mapping_object, 7796 (u_longlong_t)scip->scip_prev_obsolete_sm_object); 7797 if (scip->scip_prev_obsolete_sm_object != 0) { 7798 space_map_t *prev_obsolete_sm = NULL; 7799 VERIFY0(space_map_open(&prev_obsolete_sm, 7800 spa->spa_meta_objset, 7801 scip->scip_prev_obsolete_sm_object, 7802 0, 
vd->vdev_asize, 0)); 7803 dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm); 7804 (void) printf("\n"); 7805 space_map_close(prev_obsolete_sm); 7806 } 7807 7808 scip_count += 2; 7809 } 7810 7811 for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { 7812 vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; 7813 vdev_indirect_config_t *vic = &vd->vdev_indirect_config; 7814 7815 if (vic->vic_mapping_object != 0) { 7816 ASSERT(vd->vdev_ops == &vdev_indirect_ops || 7817 vd->vdev_removing); 7818 indirect_vdev_count++; 7819 7820 if (vd->vdev_indirect_mapping->vim_havecounts) { 7821 obsolete_counts_count++; 7822 } 7823 } 7824 7825 boolean_t are_precise; 7826 VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); 7827 if (are_precise) { 7828 ASSERT(vic->vic_mapping_object != 0); 7829 precise_vdev_count++; 7830 } 7831 7832 uint64_t obsolete_sm_object; 7833 VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); 7834 if (obsolete_sm_object != 0) { 7835 ASSERT(vic->vic_mapping_object != 0); 7836 obsolete_sm_count++; 7837 } 7838 } 7839 7840 (void) feature_get_refcount(spa, 7841 &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL], 7842 &dr_feature_refcount); 7843 (void) feature_get_refcount(spa, 7844 &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS], 7845 &oc_feature_refcount); 7846 7847 if (dr_feature_refcount != indirect_vdev_count) { 7848 ret = 1; 7849 (void) printf("Number of indirect vdevs (%llu) " \ 7850 "does not match feature count (%llu)\n", 7851 (u_longlong_t)indirect_vdev_count, 7852 (u_longlong_t)dr_feature_refcount); 7853 } else { 7854 (void) printf("Verified device_removal feature refcount " \ 7855 "of %llu is correct\n", 7856 (u_longlong_t)dr_feature_refcount); 7857 } 7858 7859 if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, 7860 DMU_POOL_OBSOLETE_BPOBJ) == 0) { 7861 obsolete_bpobj_count++; 7862 } 7863 7864 7865 obsolete_counts_object_count = precise_vdev_count; 7866 obsolete_counts_object_count += obsolete_sm_count; 7867 obsolete_counts_object_count += obsolete_counts_count; 7868 obsolete_counts_object_count += scip_count; 7869 obsolete_counts_object_count += obsolete_bpobj_count; 7870 obsolete_counts_object_count += remap_deadlist_count; 7871 7872 if (oc_feature_refcount != obsolete_counts_object_count) { 7873 ret = 1; 7874 (void) printf("Number of obsolete counts objects (%llu) " \ 7875 "does not match feature count (%llu)\n", 7876 (u_longlong_t)obsolete_counts_object_count, 7877 (u_longlong_t)oc_feature_refcount); 7878 (void) printf("pv:%llu os:%llu oc:%llu sc:%llu " 7879 "ob:%llu rd:%llu\n", 7880 (u_longlong_t)precise_vdev_count, 7881 (u_longlong_t)obsolete_sm_count, 7882 (u_longlong_t)obsolete_counts_count, 7883 (u_longlong_t)scip_count, 7884 (u_longlong_t)obsolete_bpobj_count, 7885 (u_longlong_t)remap_deadlist_count); 7886 } else { 7887 (void) printf("Verified indirect_refcount feature refcount " \ 7888 "of %llu is correct\n", 7889 (u_longlong_t)oc_feature_refcount); 7890 } 7891 return (ret); 7892 } 7893 7894 static void 7895 zdb_set_skip_mmp(char *target) 7896 { 7897 spa_t *spa; 7898 7899 /* 7900 * Disable the activity check to allow examination of 7901 * active pools. 7902 */ 7903 spa_namespace_enter(FTAG); 7904 if ((spa = spa_lookup(target)) != NULL) { 7905 spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP; 7906 } 7907 spa_namespace_exit(FTAG); 7908 } 7909 7910 #define BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE" 7911 /* 7912 * Import the checkpointed state of the pool specified by the target 7913 * parameter as readonly. 
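 * For example, with a hypothetical target of "tank/fs/ds" the
 * import (illustrative values only) would come out as:
 *
 *	returned pool name:  "tank_CHECKPOINTED_UNIVERSE"
 *	*new_path (below):   "tank_CHECKPOINTED_UNIVERSE/fs/ds"
 *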
The function also accepts a pool config 7914 * as an optional parameter, else it attempts to infer the config by 7915 * the name of the target pool. 7916 * 7917 * Note that the checkpointed state's pool name will be the name of 7918 * the original pool with the above suffix appended to it. In addition, 7919 * if the target is not a pool name (e.g. a path to a dataset) then 7920 * the new_path parameter is populated with the updated path to 7921 * reflect the fact that we are looking into the checkpointed state. 7922 * 7923 * The function returns a newly-allocated copy of the name of the 7924 * pool containing the checkpointed state. When this copy is no 7925 * longer needed it should be freed with free(3C). Same thing 7926 * applies to the new_path parameter if allocated. 7927 */ 7928 static char * 7929 import_checkpointed_state(char *target, nvlist_t *cfg, boolean_t target_is_spa, 7930 char **new_path) 7931 { 7932 int error = 0; 7933 char *poolname, *bogus_name = NULL; 7934 boolean_t freecfg = B_FALSE; 7935 7936 /* If the target is not a pool, then extract the pool name */ 7937 char *path_start = strchr(target, '/'); 7938 if (target_is_spa || path_start == NULL) { 7939 poolname = target; 7940 } else { 7941 size_t poolname_len = path_start - target; 7942 poolname = strndup(target, poolname_len); 7943 } 7944 7945 if (cfg == NULL) { 7946 zdb_set_skip_mmp(poolname); 7947 error = spa_get_stats(poolname, &cfg, NULL, 0); 7948 if (error != 0) { 7949 fatal("Tried to read config of pool \"%s\" but " 7950 "spa_get_stats() failed with error %d\n", 7951 poolname, error); 7952 } 7953 freecfg = B_TRUE; 7954 } 7955 7956 if (asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1) { 7957 if (target != poolname) 7958 free(poolname); 7959 return (NULL); 7960 } 7961 fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name); 7962 7963 error = spa_import(bogus_name, cfg, NULL, 7964 ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT | 7965 ZFS_IMPORT_SKIP_MMP); 7966 if (freecfg) 7967 nvlist_free(cfg); 7968 if (error != 0) { 7969 fatal("Tried to import pool \"%s\" but spa_import() failed " 7970 "with error %d\n", bogus_name, error); 7971 } 7972 7973 if (new_path != NULL && !target_is_spa) { 7974 if (asprintf(new_path, "%s%s", bogus_name, 7975 path_start != NULL ?
path_start : "") == -1) { 7976 free(bogus_name); 7977 if (!target_is_spa && path_start != NULL) 7978 free(poolname); 7979 return (NULL); 7980 } 7981 } 7982 7983 if (target != poolname) 7984 free(poolname); 7985 7986 return (bogus_name); 7987 } 7988 7989 typedef struct verify_checkpoint_sm_entry_cb_arg { 7990 vdev_t *vcsec_vd; 7991 7992 /* the following fields are only used for printing progress */ 7993 uint64_t vcsec_entryid; 7994 uint64_t vcsec_num_entries; 7995 } verify_checkpoint_sm_entry_cb_arg_t; 7996 7997 #define ENTRIES_PER_PROGRESS_UPDATE 10000 7998 7999 static int 8000 verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg) 8001 { 8002 verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg; 8003 vdev_t *vd = vcsec->vcsec_vd; 8004 metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; 8005 uint64_t end = sme->sme_offset + sme->sme_run; 8006 8007 ASSERT(sme->sme_type == SM_FREE); 8008 8009 if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) { 8010 (void) fprintf(stderr, 8011 "\rverifying vdev %llu, space map entry %llu of %llu ...", 8012 (longlong_t)vd->vdev_id, 8013 (longlong_t)vcsec->vcsec_entryid, 8014 (longlong_t)vcsec->vcsec_num_entries); 8015 } 8016 vcsec->vcsec_entryid++; 8017 8018 /* 8019 * See comment in checkpoint_sm_exclude_entry_cb() 8020 */ 8021 VERIFY3U(sme->sme_offset, >=, ms->ms_start); 8022 VERIFY3U(end, <=, ms->ms_start + ms->ms_size); 8023 8024 /* 8025 * The entries in the vdev_checkpoint_sm should be marked as 8026 * allocated in the checkpointed state of the pool, therefore 8027 * their respective ms_allocatable trees should not contain them. 8028 */ 8029 mutex_enter(&ms->ms_lock); 8030 zfs_range_tree_verify_not_present(ms->ms_allocatable, 8031 sme->sme_offset, sme->sme_run); 8032 mutex_exit(&ms->ms_lock); 8033 8034 return (0); 8035 } 8036 8037 /* 8038 * Verify that all segments in the vdev_checkpoint_sm are allocated 8039 * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's 8040 * ms_allocatable). 8041 * 8042 * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of 8043 * each vdev in the current state of the pool to the metaslab space maps 8044 * (ms_sm) of the checkpointed state of the pool. 8045 * 8046 * Note that the function changes the state of the ms_allocatable 8047 * trees of the current spa_t. The entries of these ms_allocatable 8048 * trees are cleared out and then repopulated with the free 8049 * entries of their respective ms_sm space maps. 8050 */ 8051 static void 8052 verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current) 8053 { 8054 vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev; 8055 vdev_t *current_rvd = current->spa_root_vdev; 8056 8057 load_concrete_ms_allocatable_trees(checkpoint, SM_FREE); 8058 8059 for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) { 8060 vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c]; 8061 vdev_t *current_vd = current_rvd->vdev_child[c]; 8062 8063 space_map_t *checkpoint_sm = NULL; 8064 uint64_t checkpoint_sm_obj; 8065 8066 if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) { 8067 /* 8068 * Since we don't allow device removal in a pool 8069 * that has a checkpoint, we expect that all removed 8070 * vdevs were removed from the pool before the 8071 * checkpoint. 8072 */ 8073 ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops); 8074 continue; 8075 } 8076 8077 /* 8078 * If the checkpoint space map doesn't exist, then nothing 8079 * here is checkpointed so there's nothing to verify.
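 * (A vdev only gets a checkpoint space map once something is freed
 * from it after the checkpoint was taken, so its absence from the
 * top-level ZAP is the expected case rather than an error.)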
8080 */ 8081 if (current_vd->vdev_top_zap == 0 || 8082 zap_contains(spa_meta_objset(current), 8083 current_vd->vdev_top_zap, 8084 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) 8085 continue; 8086 8087 VERIFY0(zap_lookup(spa_meta_objset(current), 8088 current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, 8089 sizeof (uint64_t), 1, &checkpoint_sm_obj)); 8090 8091 VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current), 8092 checkpoint_sm_obj, 0, current_vd->vdev_asize, 8093 current_vd->vdev_ashift)); 8094 8095 verify_checkpoint_sm_entry_cb_arg_t vcsec; 8096 vcsec.vcsec_vd = ckpoint_vd; 8097 vcsec.vcsec_entryid = 0; 8098 vcsec.vcsec_num_entries = 8099 space_map_length(checkpoint_sm) / sizeof (uint64_t); 8100 VERIFY0(space_map_iterate(checkpoint_sm, 8101 space_map_length(checkpoint_sm), 8102 verify_checkpoint_sm_entry_cb, &vcsec)); 8103 if (dump_opt['m'] > 3) 8104 dump_spacemap(current->spa_meta_objset, checkpoint_sm); 8105 space_map_close(checkpoint_sm); 8106 } 8107 8108 /* 8109 * If we've added vdevs since we took the checkpoint, ensure 8110 * that their checkpoint space maps are empty. 8111 */ 8112 if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) { 8113 for (uint64_t c = ckpoint_rvd->vdev_children; 8114 c < current_rvd->vdev_children; c++) { 8115 vdev_t *current_vd = current_rvd->vdev_child[c]; 8116 VERIFY0P(current_vd->vdev_checkpoint_sm); 8117 } 8118 } 8119 8120 /* for cleaner progress output */ 8121 (void) fprintf(stderr, "\n"); 8122 } 8123 8124 /* 8125 * Verifies that all space that's allocated in the checkpoint is 8126 * still allocated in the current version, by checking that everything 8127 * in checkpoint's ms_allocatable (which is actually allocated, not 8128 * allocatable/free) is not present in current's ms_allocatable. 8129 * 8130 * Note that the function changes the state of the ms_allocatable 8131 * trees of both spas when called. The entries of all ms_allocatable 8132 * trees are cleared out and then repopulated from their respective 8133 * ms_sm space maps. In the checkpointed state we load the allocated 8134 * entries, and in the current state we load the free entries. 8135 */ 8136 static void 8137 verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current) 8138 { 8139 vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev; 8140 vdev_t *current_rvd = current->spa_root_vdev; 8141 8142 load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC); 8143 load_concrete_ms_allocatable_trees(current, SM_FREE); 8144 8145 for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) { 8146 vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i]; 8147 vdev_t *current_vd = current_rvd->vdev_child[i]; 8148 8149 if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) { 8150 /* 8151 * See comment in verify_checkpoint_vdev_spacemaps() 8152 */ 8153 ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops); 8154 continue; 8155 } 8156 8157 for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) { 8158 metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m]; 8159 metaslab_t *current_msp = current_vd->vdev_ms[m]; 8160 8161 (void) fprintf(stderr, 8162 "\rverifying vdev %llu of %llu, " 8163 "metaslab %llu of %llu ...", 8164 (longlong_t)current_vd->vdev_id, 8165 (longlong_t)current_rvd->vdev_children, 8166 (longlong_t)current_vd->vdev_ms[m]->ms_id, 8167 (longlong_t)current_vd->vdev_ms_count); 8168 8169 /* 8170 * We walk through the ms_allocatable trees that 8171 * are loaded with the allocated blocks from the 8172 * ms_sm spacemaps of the checkpoint. 
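 * Expressed as sets, the walk below checks, per metaslab, roughly
 *
 *	alloc(checkpoint) & free(current) == empty
 *
 * i.e. nothing the checkpoint still references may have been handed
 * back to the allocator in the meantime.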
For each 8173 * one of these ranges we ensure that none of them 8174 * exists in the ms_allocatable trees of the 8175 * current state which are loaded with the ranges 8176 * that are currently free. 8177 * 8178 * This way we ensure that none of the blocks that 8179 * are part of the checkpoint were freed by mistake. 8180 */ 8181 zfs_range_tree_walk(ckpoint_msp->ms_allocatable, 8182 (zfs_range_tree_func_t *) 8183 zfs_range_tree_verify_not_present, 8184 current_msp->ms_allocatable); 8185 } 8186 } 8187 8188 /* for cleaner progress output */ 8189 (void) fprintf(stderr, "\n"); 8190 } 8191 8192 static void 8193 verify_checkpoint_blocks(spa_t *spa) 8194 { 8195 ASSERT(!dump_opt['L']); 8196 8197 spa_t *checkpoint_spa; 8198 char *checkpoint_pool; 8199 int error = 0; 8200 8201 /* 8202 * We import the checkpointed state of the pool (under a different 8203 * name) so we can do verification on it against the current state 8204 * of the pool. 8205 */ 8206 checkpoint_pool = import_checkpointed_state(spa->spa_name, NULL, B_TRUE, 8207 NULL); 8208 ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0); 8209 8210 error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG); 8211 if (error != 0) { 8212 fatal("Tried to open pool \"%s\" but spa_open() failed with " 8213 "error %d\n", checkpoint_pool, error); 8214 } 8215 8216 /* 8217 * Ensure that ranges in the checkpoint space maps of each vdev 8218 * are allocated according to the checkpointed state's metaslab 8219 * space maps. 8220 */ 8221 verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa); 8222 8223 /* 8224 * Ensure that allocated ranges in the checkpoint's metaslab 8225 * space maps remain allocated in the metaslab space maps of 8226 * the current state. 8227 */ 8228 verify_checkpoint_ms_spacemaps(checkpoint_spa, spa); 8229 8230 /* 8231 * Once we are done, we get rid of the checkpointed state. 8232 */ 8233 spa_close(checkpoint_spa, FTAG); 8234 free(checkpoint_pool); 8235 } 8236 8237 static void 8238 dump_leftover_checkpoint_blocks(spa_t *spa) 8239 { 8240 vdev_t *rvd = spa->spa_root_vdev; 8241 8242 for (uint64_t i = 0; i < rvd->vdev_children; i++) { 8243 vdev_t *vd = rvd->vdev_child[i]; 8244 8245 space_map_t *checkpoint_sm = NULL; 8246 uint64_t checkpoint_sm_obj; 8247 8248 if (vd->vdev_top_zap == 0) 8249 continue; 8250 8251 if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap, 8252 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) 8253 continue; 8254 8255 VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap, 8256 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, 8257 sizeof (uint64_t), 1, &checkpoint_sm_obj)); 8258 8259 VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), 8260 checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); 8261 dump_spacemap(spa->spa_meta_objset, checkpoint_sm); 8262 space_map_close(checkpoint_sm); 8263 } 8264 } 8265 8266 static int 8267 verify_checkpoint(spa_t *spa) 8268 { 8269 uberblock_t checkpoint; 8270 int error; 8271 8272 if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) 8273 return (0); 8274 8275 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 8276 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 8277 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 8278 8279 if (error == ENOENT && !dump_opt['L']) { 8280 /* 8281 * If the feature is active but the uberblock is missing 8282 * then we must be in the middle of discarding the 8283 * checkpoint. 
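 * (Discarding releases the per-vdev checkpoint space maps back to
 * the pool incrementally; the feature is only deactivated once the
 * last of them is gone, so hitting this window is harmless.)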
8284 */ 8285 (void) printf("\nPartially discarded checkpoint " 8286 "state found:\n"); 8287 if (dump_opt['m'] > 3) 8288 dump_leftover_checkpoint_blocks(spa); 8289 return (0); 8290 } else if (error != 0) { 8291 (void) printf("lookup error %d when looking for " 8292 "checkpointed uberblock in MOS\n", error); 8293 return (error); 8294 } 8295 dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n"); 8296 8297 if (checkpoint.ub_checkpoint_txg == 0) { 8298 (void) printf("\nub_checkpoint_txg not set in checkpointed " 8299 "uberblock\n"); 8300 error = 3; 8301 } 8302 8303 if (error == 0 && !dump_opt['L']) 8304 verify_checkpoint_blocks(spa); 8305 8306 return (error); 8307 } 8308 8309 static void 8310 mos_leaks_cb(void *arg, uint64_t start, uint64_t size) 8311 { 8312 (void) arg; 8313 for (uint64_t i = start; i < size; i++) { 8314 (void) printf("MOS object %llu referenced but not allocated\n", 8315 (u_longlong_t)i); 8316 } 8317 } 8318 8319 static void 8320 mos_obj_refd(uint64_t obj) 8321 { 8322 if (obj != 0 && mos_refd_objs != NULL) 8323 zfs_range_tree_add(mos_refd_objs, obj, 1); 8324 } 8325 8326 /* 8327 * Call on a MOS object that may already have been referenced. 8328 */ 8329 static void 8330 mos_obj_refd_multiple(uint64_t obj) 8331 { 8332 if (obj != 0 && mos_refd_objs != NULL && 8333 !zfs_range_tree_contains(mos_refd_objs, obj, 1)) 8334 zfs_range_tree_add(mos_refd_objs, obj, 1); 8335 } 8336 8337 static void 8338 mos_leak_vdev_top_zap(vdev_t *vd) 8339 { 8340 uint64_t ms_flush_data_obj; 8341 int error = zap_lookup(spa_meta_objset(vd->vdev_spa), 8342 vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, 8343 sizeof (ms_flush_data_obj), 1, &ms_flush_data_obj); 8344 if (error == ENOENT) 8345 return; 8346 ASSERT0(error); 8347 8348 mos_obj_refd(ms_flush_data_obj); 8349 } 8350 8351 static void 8352 mos_leak_vdev(vdev_t *vd) 8353 { 8354 mos_obj_refd(vd->vdev_dtl_object); 8355 mos_obj_refd(vd->vdev_ms_array); 8356 mos_obj_refd(vd->vdev_indirect_config.vic_births_object); 8357 mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object); 8358 mos_obj_refd(vd->vdev_leaf_zap); 8359 if (vd->vdev_checkpoint_sm != NULL) 8360 mos_obj_refd(vd->vdev_checkpoint_sm->sm_object); 8361 if (vd->vdev_indirect_mapping != NULL) { 8362 mos_obj_refd(vd->vdev_indirect_mapping-> 8363 vim_phys->vimp_counts_object); 8364 } 8365 if (vd->vdev_obsolete_sm != NULL) 8366 mos_obj_refd(vd->vdev_obsolete_sm->sm_object); 8367 8368 for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { 8369 metaslab_t *ms = vd->vdev_ms[m]; 8370 mos_obj_refd(space_map_object(ms->ms_sm)); 8371 } 8372 8373 if (vd->vdev_root_zap != 0) 8374 mos_obj_refd(vd->vdev_root_zap); 8375 8376 if (vd->vdev_top_zap != 0) { 8377 mos_obj_refd(vd->vdev_top_zap); 8378 mos_leak_vdev_top_zap(vd); 8379 } 8380 8381 for (uint64_t c = 0; c < vd->vdev_children; c++) { 8382 mos_leak_vdev(vd->vdev_child[c]); 8383 } 8384 } 8385 8386 static void 8387 mos_leak_log_spacemaps(spa_t *spa) 8388 { 8389 uint64_t spacemap_zap; 8390 int error = zap_lookup(spa_meta_objset(spa), 8391 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP, 8392 sizeof (spacemap_zap), 1, &spacemap_zap); 8393 if (error == ENOENT) 8394 return; 8395 ASSERT0(error); 8396 8397 mos_obj_refd(spacemap_zap); 8398 for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); 8399 sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) 8400 mos_obj_refd(sls->sls_sm_obj); 8401 } 8402 8403 static void 8404 errorlog_count_refd(objset_t *mos, uint64_t errlog) 8405 { 8406 zap_cursor_t zc; 8407 zap_attribute_t *za = 
zap_attribute_alloc(); 8408 for (zap_cursor_init(&zc, mos, errlog); 8409 zap_cursor_retrieve(&zc, za) == 0; 8410 zap_cursor_advance(&zc)) { 8411 mos_obj_refd(za->za_first_integer); 8412 } 8413 zap_cursor_fini(&zc); 8414 zap_attribute_free(za); 8415 } 8416 8417 static int 8418 dump_mos_leaks(spa_t *spa) 8419 { 8420 int rv = 0; 8421 objset_t *mos = spa->spa_meta_objset; 8422 dsl_pool_t *dp = spa->spa_dsl_pool; 8423 8424 /* Visit and mark all referenced objects in the MOS */ 8425 8426 mos_obj_refd(DMU_POOL_DIRECTORY_OBJECT); 8427 mos_obj_refd(spa->spa_pool_props_object); 8428 mos_obj_refd(spa->spa_config_object); 8429 mos_obj_refd(spa->spa_ddt_stat_object); 8430 mos_obj_refd(spa->spa_feat_desc_obj); 8431 mos_obj_refd(spa->spa_feat_enabled_txg_obj); 8432 mos_obj_refd(spa->spa_feat_for_read_obj); 8433 mos_obj_refd(spa->spa_feat_for_write_obj); 8434 mos_obj_refd(spa->spa_history); 8435 mos_obj_refd(spa->spa_errlog_last); 8436 mos_obj_refd(spa->spa_errlog_scrub); 8437 8438 if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { 8439 errorlog_count_refd(mos, spa->spa_errlog_last); 8440 errorlog_count_refd(mos, spa->spa_errlog_scrub); 8441 } 8442 8443 mos_obj_refd(spa->spa_all_vdev_zaps); 8444 mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj); 8445 mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj); 8446 mos_obj_refd(spa->spa_dsl_pool->dp_scan->scn_phys.scn_queue_obj); 8447 bpobj_count_refd(&spa->spa_deferred_bpobj); 8448 mos_obj_refd(dp->dp_empty_bpobj); 8449 bpobj_count_refd(&dp->dp_obsolete_bpobj); 8450 bpobj_count_refd(&dp->dp_free_bpobj); 8451 mos_obj_refd(spa->spa_l2cache.sav_object); 8452 mos_obj_refd(spa->spa_spares.sav_object); 8453 8454 if (spa->spa_syncing_log_sm != NULL) 8455 mos_obj_refd(spa->spa_syncing_log_sm->sm_object); 8456 mos_leak_log_spacemaps(spa); 8457 8458 mos_obj_refd(spa->spa_condensing_indirect_phys. 8459 scip_next_mapping_object); 8460 mos_obj_refd(spa->spa_condensing_indirect_phys. 
8461 scip_prev_obsolete_sm_object); 8462 if (spa->spa_condensing_indirect_phys.scip_next_mapping_object != 0) { 8463 vdev_indirect_mapping_t *vim = 8464 vdev_indirect_mapping_open(mos, 8465 spa->spa_condensing_indirect_phys.scip_next_mapping_object); 8466 mos_obj_refd(vim->vim_phys->vimp_counts_object); 8467 vdev_indirect_mapping_close(vim); 8468 } 8469 deleted_livelists_dump_mos(spa); 8470 8471 if (dp->dp_origin_snap != NULL) { 8472 dsl_dataset_t *ds; 8473 8474 dsl_pool_config_enter(dp, FTAG); 8475 VERIFY0(dsl_dataset_hold_obj(dp, 8476 dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj, 8477 FTAG, &ds)); 8478 count_ds_mos_objects(ds); 8479 dump_blkptr_list(&ds->ds_deadlist, "Deadlist"); 8480 dsl_dataset_rele(ds, FTAG); 8481 dsl_pool_config_exit(dp, FTAG); 8482 8483 count_ds_mos_objects(dp->dp_origin_snap); 8484 dump_blkptr_list(&dp->dp_origin_snap->ds_deadlist, "Deadlist"); 8485 } 8486 count_dir_mos_objects(dp->dp_mos_dir); 8487 if (dp->dp_free_dir != NULL) 8488 count_dir_mos_objects(dp->dp_free_dir); 8489 if (dp->dp_leak_dir != NULL) 8490 count_dir_mos_objects(dp->dp_leak_dir); 8491 8492 mos_leak_vdev(spa->spa_root_vdev); 8493 8494 for (uint64_t c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 8495 ddt_t *ddt = spa->spa_ddt[c]; 8496 if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED) 8497 continue; 8498 8499 /* DDT store objects */ 8500 for (ddt_type_t type = 0; type < DDT_TYPES; type++) { 8501 for (ddt_class_t class = 0; class < DDT_CLASSES; 8502 class++) { 8503 mos_obj_refd(ddt->ddt_object[type][class]); 8504 } 8505 } 8506 8507 /* FDT container */ 8508 if (ddt->ddt_version == DDT_VERSION_FDT) 8509 mos_obj_refd(ddt->ddt_dir_object); 8510 8511 /* FDT log objects */ 8512 if (ddt->ddt_flags & DDT_FLAG_LOG) { 8513 mos_obj_refd(ddt->ddt_log[0].ddl_object); 8514 mos_obj_refd(ddt->ddt_log[1].ddl_object); 8515 } 8516 } 8517 8518 for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { 8519 brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; 8520 if (brtvd->bv_initiated) { 8521 mos_obj_refd(brtvd->bv_mos_brtvdev); 8522 mos_obj_refd(brtvd->bv_mos_entries); 8523 } 8524 } 8525 8526 /* 8527 * Visit all allocated objects and make sure they are referenced. 
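 * Conceptually the sweep computes two set differences with a single
 * range tree, roughly:
 *
 *	leaked   = allocated(MOS) - referenced
 *	dangling = referenced - allocated(MOS)
 *
 * Objects in the first set are reported as leaked below; whatever
 * survives in mos_refd_objs afterwards forms the second set and is
 * reported via mos_leaks_cb().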
8528 */ 8529 uint64_t object = 0; 8530 while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) { 8531 if (zfs_range_tree_contains(mos_refd_objs, object, 1)) { 8532 zfs_range_tree_remove(mos_refd_objs, object, 1); 8533 } else { 8534 dmu_object_info_t doi; 8535 const char *name; 8536 VERIFY0(dmu_object_info(mos, object, &doi)); 8537 if (doi.doi_type & DMU_OT_NEWTYPE) { 8538 dmu_object_byteswap_t bswap = 8539 DMU_OT_BYTESWAP(doi.doi_type); 8540 name = dmu_ot_byteswap[bswap].ob_name; 8541 } else { 8542 name = dmu_ot[doi.doi_type].ot_name; 8543 } 8544 8545 (void) printf("MOS object %llu (%s) leaked\n", 8546 (u_longlong_t)object, name); 8547 rv = 2; 8548 } 8549 } 8550 (void) zfs_range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL); 8551 if (!zfs_range_tree_is_empty(mos_refd_objs)) 8552 rv = 2; 8553 zfs_range_tree_vacate(mos_refd_objs, NULL, NULL); 8554 zfs_range_tree_destroy(mos_refd_objs); 8555 return (rv); 8556 } 8557 8558 typedef struct log_sm_obsolete_stats_arg { 8559 uint64_t lsos_current_txg; 8560 8561 uint64_t lsos_total_entries; 8562 uint64_t lsos_valid_entries; 8563 8564 uint64_t lsos_sm_entries; 8565 uint64_t lsos_valid_sm_entries; 8566 } log_sm_obsolete_stats_arg_t; 8567 8568 static int 8569 log_spacemap_obsolete_stats_cb(spa_t *spa, space_map_entry_t *sme, 8570 uint64_t txg, void *arg) 8571 { 8572 log_sm_obsolete_stats_arg_t *lsos = arg; 8573 8574 uint64_t offset = sme->sme_offset; 8575 uint64_t vdev_id = sme->sme_vdev; 8576 8577 if (lsos->lsos_current_txg == 0) { 8578 /* this is the first log */ 8579 lsos->lsos_current_txg = txg; 8580 } else if (lsos->lsos_current_txg < txg) { 8581 /* we just changed log - print stats and reset */ 8582 (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n", 8583 (u_longlong_t)lsos->lsos_valid_sm_entries, 8584 (u_longlong_t)lsos->lsos_sm_entries, 8585 (u_longlong_t)lsos->lsos_current_txg); 8586 lsos->lsos_valid_sm_entries = 0; 8587 lsos->lsos_sm_entries = 0; 8588 lsos->lsos_current_txg = txg; 8589 } 8590 ASSERT3U(lsos->lsos_current_txg, ==, txg); 8591 8592 lsos->lsos_sm_entries++; 8593 lsos->lsos_total_entries++; 8594 8595 vdev_t *vd = vdev_lookup_top(spa, vdev_id); 8596 if (!vdev_is_concrete(vd)) 8597 return (0); 8598 8599 metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 8600 ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); 8601 8602 if (txg < metaslab_unflushed_txg(ms)) 8603 return (0); 8604 lsos->lsos_valid_sm_entries++; 8605 lsos->lsos_valid_entries++; 8606 return (0); 8607 } 8608 8609 static void 8610 dump_log_spacemap_obsolete_stats(spa_t *spa) 8611 { 8612 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) 8613 return; 8614 8615 log_sm_obsolete_stats_arg_t lsos = {0}; 8616 8617 (void) printf("Log Space Map Obsolete Entry Statistics:\n"); 8618 8619 iterate_through_spacemap_logs(spa, 8620 log_spacemap_obsolete_stats_cb, &lsos); 8621 8622 /* print stats for latest log */ 8623 (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n", 8624 (u_longlong_t)lsos.lsos_valid_sm_entries, 8625 (u_longlong_t)lsos.lsos_sm_entries, 8626 (u_longlong_t)lsos.lsos_current_txg); 8627 8628 (void) printf("%-8llu valid entries out of %-8llu - total\n\n", 8629 (u_longlong_t)lsos.lsos_valid_entries, 8630 (u_longlong_t)lsos.lsos_total_entries); 8631 } 8632 8633 static void 8634 dump_zpool(spa_t *spa) 8635 { 8636 dsl_pool_t *dp = spa_get_dsl(spa); 8637 int rc = 0; 8638 8639 if (dump_opt['y']) { 8640 livelist_metaslab_validate(spa); 8641 } 8642 8643 if (dump_opt['S']) { 8644 dump_simulated_ddt(spa); 8645 return; 8646 } 8647 8648 
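	/*
	 * From here on the dump is additive: each enabled option prints
	 * its own section in a fixed order (config, uberblock, DDTs,
	 * BRT, metaslabs, datasets, ...), while the verification passes
	 * further down only run while rc is still 0, so the exit code
	 * reflects the first failing check.
	 */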
if (!dump_opt['e'] && dump_opt['C'] > 1) { 8649 (void) printf("\nCached configuration:\n"); 8650 dump_nvlist(spa->spa_config, 8); 8651 } 8652 8653 if (dump_opt['C']) 8654 dump_config(spa); 8655 8656 if (dump_opt['u']) 8657 dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n"); 8658 8659 if (dump_opt['D']) 8660 dump_all_ddts(spa); 8661 8662 if (dump_opt['T']) 8663 dump_brt(spa); 8664 8665 if (dump_opt['d'] > 2 || dump_opt['m']) 8666 dump_metaslabs(spa); 8667 if (dump_opt['M']) 8668 dump_metaslab_groups(spa, dump_opt['M'] > 1); 8669 if (dump_opt['d'] > 2 || dump_opt['m']) { 8670 dump_log_spacemaps(spa); 8671 dump_log_spacemap_obsolete_stats(spa); 8672 } 8673 8674 if (dump_opt['d'] || dump_opt['i']) { 8675 spa_feature_t f; 8676 mos_refd_objs = zfs_range_tree_create_flags( 8677 NULL, ZFS_RANGE_SEG64, NULL, 0, 0, 8678 0, "dump_zpool:mos_refd_objs"); 8679 dump_objset(dp->dp_meta_objset); 8680 8681 if (dump_opt['d'] >= 3) { 8682 dsl_pool_t *dp = spa->spa_dsl_pool; 8683 dump_full_bpobj(&spa->spa_deferred_bpobj, 8684 "Deferred frees", 0); 8685 if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { 8686 dump_full_bpobj(&dp->dp_free_bpobj, 8687 "Pool snapshot frees", 0); 8688 } 8689 if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { 8690 ASSERT(spa_feature_is_enabled(spa, 8691 SPA_FEATURE_DEVICE_REMOVAL)); 8692 dump_full_bpobj(&dp->dp_obsolete_bpobj, 8693 "Pool obsolete blocks", 0); 8694 } 8695 8696 if (spa_feature_is_active(spa, 8697 SPA_FEATURE_ASYNC_DESTROY)) { 8698 dump_bptree(spa->spa_meta_objset, 8699 dp->dp_bptree_obj, 8700 "Pool dataset frees"); 8701 } 8702 dump_dtl(spa->spa_root_vdev, 0); 8703 } 8704 8705 for (spa_feature_t f = 0; f < SPA_FEATURES; f++) 8706 global_feature_count[f] = UINT64_MAX; 8707 global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0; 8708 global_feature_count[SPA_FEATURE_REDACTION_LIST_SPILL] = 0; 8709 global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0; 8710 global_feature_count[SPA_FEATURE_LIVELIST] = 0; 8711 8712 (void) dmu_objset_find(spa_name(spa), dump_one_objset, 8713 NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); 8714 8715 if (rc == 0 && !dump_opt['L']) 8716 rc = dump_mos_leaks(spa); 8717 8718 for (f = 0; f < SPA_FEATURES; f++) { 8719 uint64_t refcount; 8720 8721 uint64_t *arr; 8722 if (!(spa_feature_table[f].fi_flags & 8723 ZFEATURE_FLAG_PER_DATASET)) { 8724 if (global_feature_count[f] == UINT64_MAX) 8725 continue; 8726 if (!spa_feature_is_enabled(spa, f)) { 8727 ASSERT0(global_feature_count[f]); 8728 continue; 8729 } 8730 arr = global_feature_count; 8731 } else { 8732 if (!spa_feature_is_enabled(spa, f)) { 8733 ASSERT0(dataset_feature_count[f]); 8734 continue; 8735 } 8736 arr = dataset_feature_count; 8737 } 8738 if (feature_get_refcount(spa, &spa_feature_table[f], 8739 &refcount) == ENOTSUP) 8740 continue; 8741 if (arr[f] != refcount) { 8742 (void) printf("%s feature refcount mismatch: " 8743 "%lld consumers != %lld refcount\n", 8744 spa_feature_table[f].fi_uname, 8745 (longlong_t)arr[f], (longlong_t)refcount); 8746 rc = 2; 8747 } else { 8748 (void) printf("Verified %s feature refcount " 8749 "of %llu is correct\n", 8750 spa_feature_table[f].fi_uname, 8751 (longlong_t)refcount); 8752 } 8753 } 8754 8755 if (rc == 0) 8756 rc = verify_device_removal_feature_counts(spa); 8757 } 8758 8759 if (rc == 0 && (dump_opt['b'] || dump_opt['c'])) 8760 rc = dump_block_stats(spa); 8761 8762 if (rc == 0) 8763 rc = verify_spacemap_refcounts(spa); 8764 8765 if (dump_opt['s']) 8766 show_pool_stats(spa); 8767 8768 if (dump_opt['h']) 8769 dump_history(spa); 8770 8771 if (rc == 0) 8772 
rc = verify_checkpoint(spa); 8773 8774 if (rc != 0) { 8775 dump_debug_buffer(); 8776 zdb_exit(rc); 8777 } 8778 } 8779 8780 #define ZDB_FLAG_CHECKSUM 0x0001 8781 #define ZDB_FLAG_DECOMPRESS 0x0002 8782 #define ZDB_FLAG_BSWAP 0x0004 8783 #define ZDB_FLAG_GBH 0x0008 8784 #define ZDB_FLAG_INDIRECT 0x0010 8785 #define ZDB_FLAG_RAW 0x0020 8786 #define ZDB_FLAG_PRINT_BLKPTR 0x0040 8787 #define ZDB_FLAG_VERBOSE 0x0080 8788 8789 static int flagbits[256]; 8790 static char flagbitstr[16]; 8791 8792 static void 8793 zdb_print_blkptr(const blkptr_t *bp, int flags) 8794 { 8795 char blkbuf[BP_SPRINTF_LEN]; 8796 8797 if (flags & ZDB_FLAG_BSWAP) 8798 byteswap_uint64_array((void *)bp, sizeof (blkptr_t)); 8799 8800 snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); 8801 (void) printf("%s\n", blkbuf); 8802 } 8803 8804 static void 8805 zdb_dump_indirect(blkptr_t *bp, int nbps, int flags) 8806 { 8807 int i; 8808 8809 for (i = 0; i < nbps; i++) 8810 zdb_print_blkptr(&bp[i], flags); 8811 } 8812 8813 static void 8814 zdb_dump_gbh(void *buf, uint64_t size, int flags) 8815 { 8816 zdb_dump_indirect((blkptr_t *)buf, gbh_nblkptrs(size), flags); 8817 } 8818 8819 static void 8820 zdb_dump_block_raw(void *buf, uint64_t size, int flags) 8821 { 8822 if (flags & ZDB_FLAG_BSWAP) 8823 byteswap_uint64_array(buf, size); 8824 VERIFY(write(fileno(stdout), buf, size) == size); 8825 } 8826 8827 static void 8828 zdb_dump_block(char *label, void *buf, uint64_t size, int flags) 8829 { 8830 uint64_t *d = (uint64_t *)buf; 8831 unsigned nwords = size / sizeof (uint64_t); 8832 int do_bswap = !!(flags & ZDB_FLAG_BSWAP); 8833 unsigned i, j; 8834 const char *hdr; 8835 char *c; 8836 8837 8838 if (do_bswap) 8839 hdr = " 7 6 5 4 3 2 1 0 f e d c b a 9 8"; 8840 else 8841 hdr = " 0 1 2 3 4 5 6 7 8 9 a b c d e f"; 8842 8843 (void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr); 8844 8845 #ifdef _ZFS_LITTLE_ENDIAN 8846 /* correct the endianness */ 8847 do_bswap = !do_bswap; 8848 #endif 8849 for (i = 0; i < nwords; i += 2) { 8850 (void) printf("%06llx: %016llx %016llx ", 8851 (u_longlong_t)(i * sizeof (uint64_t)), 8852 (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]), 8853 (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1])); 8854 8855 c = (char *)&d[i]; 8856 for (j = 0; j < 2 * sizeof (uint64_t); j++) 8857 (void) printf("%c", isprint(c[j]) ? c[j] : '.'); 8858 (void) printf("\n"); 8859 } 8860 } 8861 8862 /* 8863 * There are two acceptable formats: 8864 * leaf_name - For example: c1t0d0 or /tmp/ztest.0a 8865 * child[.child]* - For example: 0.1.1 8866 * 8867 * The second form can be used to specify arbitrary vdevs anywhere 8868 * in the hierarchy. For example, in a pool with a mirror of 8869 * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 . 8870 */ 8871 static vdev_t * 8872 zdb_vdev_lookup(vdev_t *vdev, const char *path) 8873 { 8874 char *s, *p, *q; 8875 unsigned i; 8876 8877 if (vdev == NULL) 8878 return (NULL); 8879 8880 /* First, assume the x.x.x.x format */ 8881 i = strtoul(path, &s, 10); 8882 if (s == path || (s && *s != '.' 
&& *s != '\0')) 8883 goto name; 8884 if (i >= vdev->vdev_children) 8885 return (NULL); 8886 8887 vdev = vdev->vdev_child[i]; 8888 if (s && *s == '\0') 8889 return (vdev); 8890 return (zdb_vdev_lookup(vdev, s+1)); 8891 8892 name: 8893 for (i = 0; i < vdev->vdev_children; i++) { 8894 vdev_t *vc = vdev->vdev_child[i]; 8895 8896 if (vc->vdev_path == NULL) { 8897 vc = zdb_vdev_lookup(vc, path); 8898 if (vc == NULL) 8899 continue; 8900 else 8901 return (vc); 8902 } 8903 8904 p = strrchr(vc->vdev_path, '/'); 8905 p = p ? p + 1 : vc->vdev_path; 8906 q = &vc->vdev_path[strlen(vc->vdev_path) - 2]; 8907 8908 if (strcmp(vc->vdev_path, path) == 0) 8909 return (vc); 8910 if (strcmp(p, path) == 0) 8911 return (vc); 8912 if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0) 8913 return (vc); 8914 } 8915 8916 return (NULL); 8917 } 8918 8919 static int 8920 name_from_objset_id(spa_t *spa, uint64_t objset_id, char *outstr) 8921 { 8922 dsl_dataset_t *ds; 8923 8924 dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); 8925 int error = dsl_dataset_hold_obj(spa->spa_dsl_pool, objset_id, 8926 NULL, &ds); 8927 if (error != 0) { 8928 (void) fprintf(stderr, "failed to hold objset %llu: %s\n", 8929 (u_longlong_t)objset_id, strerror(error)); 8930 dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); 8931 return (error); 8932 } 8933 dsl_dataset_name(ds, outstr); 8934 dsl_dataset_rele(ds, NULL); 8935 dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); 8936 return (0); 8937 } 8938 8939 static boolean_t 8940 zdb_parse_block_sizes(char *sizes, uint64_t *lsize, uint64_t *psize) 8941 { 8942 char *s0, *s1, *tmp = NULL; 8943 8944 if (sizes == NULL) 8945 return (B_FALSE); 8946 8947 s0 = strtok_r(sizes, "/", &tmp); 8948 if (s0 == NULL) 8949 return (B_FALSE); 8950 s1 = strtok_r(NULL, "/", &tmp); 8951 *lsize = strtoull(s0, NULL, 16); 8952 *psize = s1 ? strtoull(s1, NULL, 16) : *lsize; 8953 return (*lsize >= *psize && *psize > 0); 8954 } 8955 8956 #define ZIO_COMPRESS_MASK(alg) (1ULL << (ZIO_COMPRESS_##alg)) 8957 8958 static boolean_t 8959 try_decompress_block(abd_t *pabd, uint64_t lsize, uint64_t psize, 8960 int flags, int cfunc, void *lbuf, void *lbuf2) 8961 { 8962 if (flags & ZDB_FLAG_VERBOSE) { 8963 (void) fprintf(stderr, 8964 "Trying %05llx -> %05llx (%s)\n", 8965 (u_longlong_t)psize, 8966 (u_longlong_t)lsize, 8967 zio_compress_table[cfunc].ci_name); 8968 } 8969 8970 /* 8971 * We set lbuf to all zeros and lbuf2 to all 8972 * ones, then decompress to both buffers and 8973 * compare their contents. This way we can 8974 * know if decompression filled exactly to 8975 * lsize or if it left some bytes unwritten. 8976 */ 8977 8978 memset(lbuf, 0x00, lsize); 8979 memset(lbuf2, 0xff, lsize); 8980 8981 abd_t labd, labd2; 8982 abd_get_from_buf_struct(&labd, lbuf, lsize); 8983 abd_get_from_buf_struct(&labd2, lbuf2, lsize); 8984 8985 boolean_t ret = B_FALSE; 8986 if (zio_decompress_data(cfunc, pabd, 8987 &labd, psize, lsize, NULL) == 0 && 8988 zio_decompress_data(cfunc, pabd, 8989 &labd2, psize, lsize, NULL) == 0 && 8990 memcmp(lbuf, lbuf2, lsize) == 0) 8991 ret = B_TRUE; 8992 8993 abd_free(&labd2); 8994 abd_free(&labd); 8995 8996 return (ret); 8997 } 8998 8999 static uint64_t 9000 zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize, 9001 uint64_t psize, int flags) 9002 { 9003 (void) buf; 9004 uint64_t orig_lsize = lsize; 9005 boolean_t tryzle = ((getenv("ZDB_NO_ZLE") == NULL)); 9006 /* 9007 * We don't know how the data was compressed, so just try 9008 * every decompress function at every inflated blocksize. 
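 * The candidate list below is ordered so that the most common
 * algorithms are tried first and aliases are skipped; with the
 * current masks it comes out roughly as
 *
 *	{ LZ4, LZJB, GZIP_1, ZSTD }
 *
 * ZLE is deliberately deferred to a separate last-chance pass (see
 * tryzle): nearly any buffer "decompresses" under ZLE, so trying it
 * early would mask the real algorithm.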
9009 */ 9010 void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); 9011 int cfuncs[ZIO_COMPRESS_FUNCTIONS] = { 0 }; 9012 int *cfuncp = cfuncs; 9013 uint64_t maxlsize = SPA_MAXBLOCKSIZE; 9014 uint64_t mask = ZIO_COMPRESS_MASK(ON) | ZIO_COMPRESS_MASK(OFF) | 9015 ZIO_COMPRESS_MASK(INHERIT) | ZIO_COMPRESS_MASK(EMPTY) | 9016 ZIO_COMPRESS_MASK(ZLE); 9017 *cfuncp++ = ZIO_COMPRESS_LZ4; 9018 *cfuncp++ = ZIO_COMPRESS_LZJB; 9019 mask |= ZIO_COMPRESS_MASK(LZ4) | ZIO_COMPRESS_MASK(LZJB); 9020 /* 9021 * Every gzip level has the same decompressor, no need to 9022 * run it 9 times per bruteforce attempt. 9023 */ 9024 mask |= ZIO_COMPRESS_MASK(GZIP_2) | ZIO_COMPRESS_MASK(GZIP_3); 9025 mask |= ZIO_COMPRESS_MASK(GZIP_4) | ZIO_COMPRESS_MASK(GZIP_5); 9026 mask |= ZIO_COMPRESS_MASK(GZIP_6) | ZIO_COMPRESS_MASK(GZIP_7); 9027 mask |= ZIO_COMPRESS_MASK(GZIP_8) | ZIO_COMPRESS_MASK(GZIP_9); 9028 for (int c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) 9029 if (((1ULL << c) & mask) == 0) 9030 *cfuncp++ = c; 9031 9032 /* 9033 * On the one hand, with SPA_MAXBLOCKSIZE at 16MB, this 9034 * could take a while and we should let the user know 9035 * we are not stuck. On the other hand, printing progress 9036 * info gets old after a while. User can specify 'v' flag 9037 * to see the progression. 9038 */ 9039 if (lsize == psize) 9040 lsize += SPA_MINBLOCKSIZE; 9041 else 9042 maxlsize = lsize; 9043 9044 for (; lsize <= maxlsize; lsize += SPA_MINBLOCKSIZE) { 9045 for (cfuncp = cfuncs; *cfuncp; cfuncp++) { 9046 if (try_decompress_block(pabd, lsize, psize, flags, 9047 *cfuncp, lbuf, lbuf2)) { 9048 tryzle = B_FALSE; 9049 break; 9050 } 9051 } 9052 if (*cfuncp != 0) 9053 break; 9054 } 9055 if (tryzle) { 9056 for (lsize = orig_lsize; lsize <= maxlsize; 9057 lsize += SPA_MINBLOCKSIZE) { 9058 if (try_decompress_block(pabd, lsize, psize, flags, 9059 ZIO_COMPRESS_ZLE, lbuf, lbuf2)) { 9060 *cfuncp = ZIO_COMPRESS_ZLE; 9061 break; 9062 } 9063 } 9064 } 9065 umem_free(lbuf2, SPA_MAXBLOCKSIZE); 9066 9067 if (*cfuncp == ZIO_COMPRESS_ZLE) { 9068 printf("\nZLE decompression was selected. If you " 9069 "suspect the results are wrong,\ntry avoiding ZLE " 9070 "by setting and exporting ZDB_NO_ZLE=\"true\"\n"); 9071 } 9072 9073 return (lsize > maxlsize ? -1 : lsize); 9074 } 9075 9076 /* 9077 * Read a block from a pool and print it out. 
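 * This is the implementation behind -R/--read-block: the block is
 * read through a synthetic blkptr built from the DVA alone, so no
 * checksum, compression, or dedup metadata is needed up front. A
 * typical invocation (hypothetical pool/vdev/offset/size, flags as
 * documented below) looks like:
 *
 *	zdb -R tank:0.0:400000:20000:d
 *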
The syntax of the 9078 * block descriptor is: 9079 * 9080 * pool:vdev_specifier:offset:[lsize/]psize[:flags] 9081 * 9082 * pool - The name of the pool you wish to read from 9083 * vdev_specifier - Which vdev (see comment for zdb_vdev_lookup) 9084 * offset - offset, in hex, in bytes 9085 * size - Amount of data to read, in hex, in bytes 9086 * flags - A string of characters specifying options 9087 * b: Decode a blkptr at given offset within block 9088 * c: Calculate and display checksums 9089 * d: Decompress data before dumping 9090 * e: Byteswap data before dumping 9091 * g: Display data as a gang block header 9092 * i: Display as an indirect block 9093 * r: Dump raw data to stdout 9094 * v: Verbose 9095 * 9096 */ 9097 static void 9098 zdb_read_block(char *thing, spa_t *spa) 9099 { 9100 blkptr_t blk, *bp = &blk; 9101 dva_t *dva = bp->blk_dva; 9102 int flags = 0; 9103 uint64_t offset = 0, psize = 0, lsize = 0, blkptr_offset = 0; 9104 zio_t *zio; 9105 vdev_t *vd; 9106 abd_t *pabd; 9107 void *lbuf, *buf; 9108 char *s, *p, *dup, *flagstr, *sizes, *tmp = NULL; 9109 const char *vdev, *errmsg = NULL; 9110 int i, len, error; 9111 boolean_t borrowed = B_FALSE, found = B_FALSE; 9112 9113 dup = strdup(thing); 9114 s = strtok_r(dup, ":", &tmp); 9115 vdev = s ?: ""; 9116 s = strtok_r(NULL, ":", &tmp); 9117 offset = strtoull(s ? s : "", NULL, 16); 9118 sizes = strtok_r(NULL, ":", &tmp); 9119 s = strtok_r(NULL, ":", &tmp); 9120 flagstr = strdup(s ?: ""); 9121 9122 if (!zdb_parse_block_sizes(sizes, &lsize, &psize)) 9123 errmsg = "invalid size(s)"; 9124 if (!IS_P2ALIGNED(psize, DEV_BSIZE) || !IS_P2ALIGNED(lsize, DEV_BSIZE)) 9125 errmsg = "size must be a multiple of sector size"; 9126 if (!IS_P2ALIGNED(offset, DEV_BSIZE)) 9127 errmsg = "offset must be a multiple of sector size"; 9128 if (errmsg) { 9129 (void) printf("Invalid block specifier: %s - %s\n", 9130 thing, errmsg); 9131 goto done; 9132 } 9133 9134 tmp = NULL; 9135 for (s = strtok_r(flagstr, ":", &tmp); 9136 s != NULL; 9137 s = strtok_r(NULL, ":", &tmp)) { 9138 len = strlen(flagstr); 9139 for (i = 0; i < len; i++) { 9140 int bit = flagbits[(uchar_t)flagstr[i]]; 9141 9142 if (bit == 0) { 9143 (void) printf("***Ignoring flag: %c\n", 9144 (uchar_t)flagstr[i]); 9145 continue; 9146 } 9147 found = B_TRUE; 9148 flags |= bit; 9149 9150 p = &flagstr[i + 1]; 9151 if (*p != ':' && *p != '\0') { 9152 int j = 0, nextbit = flagbits[(uchar_t)*p]; 9153 char *end, offstr[8] = { 0 }; 9154 if ((bit == ZDB_FLAG_PRINT_BLKPTR) && 9155 (nextbit == 0)) { 9156 /* look ahead to isolate the offset */ 9157 while (nextbit == 0 && 9158 strchr(flagbitstr, *p) == NULL) { 9159 offstr[j] = *p; 9160 j++; 9161 if (i + j > strlen(flagstr)) 9162 break; 9163 p++; 9164 nextbit = flagbits[(uchar_t)*p]; 9165 } 9166 blkptr_offset = strtoull(offstr, &end, 9167 16); 9168 i += j; 9169 } else if (nextbit == 0) { 9170 (void) printf("***Ignoring flag arg:" 9171 " '%c'\n", (uchar_t)*p); 9172 } 9173 } 9174 } 9175 } 9176 if (blkptr_offset % sizeof (blkptr_t)) { 9177 printf("Block pointer offset 0x%llx " 9178 "must be divisible by 0x%x\n", 9179 (longlong_t)blkptr_offset, (int)sizeof (blkptr_t)); 9180 goto done; 9181 } 9182 if (found == B_FALSE && strlen(flagstr) > 0) { 9183 printf("Invalid flag arg: '%s'\n", flagstr); 9184 goto done; 9185 } 9186 9187 vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev); 9188 if (vd == NULL) { 9189 (void) printf("***Invalid vdev: %s\n", vdev); 9190 goto done; 9191 } else { 9192 if (vd->vdev_path) 9193 (void) fprintf(stderr, "Found vdev: %s\n", 9194 vd->vdev_path); 9195 else 
9196 (void) fprintf(stderr, "Found vdev type: %s\n", 9197 vd->vdev_ops->vdev_op_type); 9198 } 9199 9200 pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE); 9201 lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); 9202 9203 BP_ZERO(bp); 9204 9205 DVA_SET_VDEV(&dva[0], vd->vdev_id); 9206 DVA_SET_OFFSET(&dva[0], offset); 9207 DVA_SET_GANG(&dva[0], 0); 9208 DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize)); 9209 9210 BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); 9211 9212 BP_SET_LSIZE(bp, lsize); 9213 BP_SET_PSIZE(bp, psize); 9214 BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 9215 BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); 9216 BP_SET_TYPE(bp, DMU_OT_NONE); 9217 BP_SET_LEVEL(bp, 0); 9218 BP_SET_DEDUP(bp, 0); 9219 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 9220 9221 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 9222 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 9223 9224 if (vd == vd->vdev_top) { 9225 /* 9226 * Treat this as a normal block read. 9227 */ 9228 zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL, 9229 ZIO_PRIORITY_SYNC_READ, 9230 ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL)); 9231 } else { 9232 /* 9233 * Treat this as a vdev child I/O. 9234 */ 9235 zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd, 9236 psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, 9237 ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | 9238 ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_OPTIONAL, 9239 NULL, NULL)); 9240 } 9241 9242 error = zio_wait(zio); 9243 spa_config_exit(spa, SCL_STATE, FTAG); 9244 9245 if (error) { 9246 (void) printf("Read of %s failed, error: %d\n", thing, error); 9247 goto out; 9248 } 9249 9250 uint64_t orig_lsize = lsize; 9251 buf = lbuf; 9252 if (flags & ZDB_FLAG_DECOMPRESS) { 9253 lsize = zdb_decompress_block(pabd, buf, lbuf, 9254 lsize, psize, flags); 9255 if (lsize == -1) { 9256 (void) printf("Decompress of %s failed\n", thing); 9257 goto out; 9258 } 9259 } else { 9260 buf = abd_borrow_buf_copy(pabd, lsize); 9261 borrowed = B_TRUE; 9262 } 9263 /* 9264 * Try to detect invalid block pointer. If invalid, try 9265 * decompressing. 9266 */ 9267 if ((flags & ZDB_FLAG_PRINT_BLKPTR || flags & ZDB_FLAG_INDIRECT) && 9268 !(flags & ZDB_FLAG_DECOMPRESS)) { 9269 const blkptr_t *b = (const blkptr_t *)(void *) 9270 ((uintptr_t)buf + (uintptr_t)blkptr_offset); 9271 if (zfs_blkptr_verify(spa, b, 9272 BLK_CONFIG_NEEDED, BLK_VERIFY_ONLY)) { 9273 abd_return_buf_copy(pabd, buf, lsize); 9274 borrowed = B_FALSE; 9275 buf = lbuf; 9276 lsize = zdb_decompress_block(pabd, buf, 9277 lbuf, lsize, psize, flags); 9278 b = (const blkptr_t *)(void *) 9279 ((uintptr_t)buf + (uintptr_t)blkptr_offset); 9280 if (lsize == -1 || zfs_blkptr_verify(spa, b, 9281 BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { 9282 printf("invalid block pointer at this DVA\n"); 9283 goto out; 9284 } 9285 } 9286 } 9287 9288 if (flags & ZDB_FLAG_PRINT_BLKPTR) 9289 zdb_print_blkptr((blkptr_t *)(void *) 9290 ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags); 9291 else if (flags & ZDB_FLAG_RAW) 9292 zdb_dump_block_raw(buf, lsize, flags); 9293 else if (flags & ZDB_FLAG_INDIRECT) 9294 zdb_dump_indirect((blkptr_t *)buf, 9295 orig_lsize / sizeof (blkptr_t), flags); 9296 else if (flags & ZDB_FLAG_GBH) 9297 zdb_dump_gbh(buf, lsize, flags); 9298 else 9299 zdb_dump_block(thing, buf, lsize, flags); 9300 9301 /* 9302 * If :c was specified, iterate through the checksum table to 9303 * calculate and display each checksum for our specified 9304 * DVA and length. 
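 * Checksums flagged ZCHECKSUM_FLAG_EMBEDDED (and NOPARITY) are
 * skipped, since they cannot be recomputed from a bare DVA; every
 * other algorithm is recomputed over the raw physical bytes with
 * zio_checksum_compute(), so the printed values can be compared
 * against a block pointer recovered elsewhere.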
9305 */ 9306 if ((flags & ZDB_FLAG_CHECKSUM) && !(flags & ZDB_FLAG_RAW) && 9307 !(flags & ZDB_FLAG_GBH)) { 9308 zio_t *czio; 9309 (void) printf("\n"); 9310 for (enum zio_checksum ck = ZIO_CHECKSUM_LABEL; 9311 ck < ZIO_CHECKSUM_FUNCTIONS; ck++) { 9312 9313 if ((zio_checksum_table[ck].ci_flags & 9314 ZCHECKSUM_FLAG_EMBEDDED) || 9315 ck == ZIO_CHECKSUM_NOPARITY) { 9316 continue; 9317 } 9318 BP_SET_CHECKSUM(bp, ck); 9319 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 9320 czio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 9321 if (vd == vd->vdev_top) { 9322 zio_nowait(zio_read(czio, spa, bp, pabd, psize, 9323 NULL, NULL, 9324 ZIO_PRIORITY_SYNC_READ, 9325 ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | 9326 ZIO_FLAG_DONT_RETRY, NULL)); 9327 } else { 9328 zio_nowait(zio_vdev_child_io(czio, bp, vd, 9329 offset, pabd, psize, ZIO_TYPE_READ, 9330 ZIO_PRIORITY_SYNC_READ, 9331 ZIO_FLAG_DONT_PROPAGATE | 9332 ZIO_FLAG_DONT_RETRY | 9333 ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | 9334 ZIO_FLAG_SPECULATIVE | 9335 ZIO_FLAG_OPTIONAL, NULL, NULL)); 9336 } 9337 error = zio_wait(czio); 9338 if (error == 0 || error == ECKSUM) { 9339 zio_t *ck_zio = zio_null(NULL, spa, NULL, 9340 NULL, NULL, 0); 9341 ck_zio->io_offset = 9342 DVA_GET_OFFSET(&bp->blk_dva[0]); 9343 ck_zio->io_bp = bp; 9344 zio_checksum_compute(ck_zio, ck, pabd, psize); 9345 printf( 9346 "%12s\t" 9347 "cksum=%016llx:%016llx:%016llx:%016llx\n", 9348 zio_checksum_table[ck].ci_name, 9349 (u_longlong_t)bp->blk_cksum.zc_word[0], 9350 (u_longlong_t)bp->blk_cksum.zc_word[1], 9351 (u_longlong_t)bp->blk_cksum.zc_word[2], 9352 (u_longlong_t)bp->blk_cksum.zc_word[3]); 9353 zio_wait(ck_zio); 9354 } else { 9355 printf("error %d reading block\n", error); 9356 } 9357 spa_config_exit(spa, SCL_STATE, FTAG); 9358 } 9359 } 9360 9361 if (borrowed) 9362 abd_return_buf_copy(pabd, buf, lsize); 9363 9364 out: 9365 abd_free(pabd); 9366 umem_free(lbuf, SPA_MAXBLOCKSIZE); 9367 done: 9368 free(flagstr); 9369 free(dup); 9370 } 9371 9372 static void 9373 zdb_embedded_block(char *thing) 9374 { 9375 blkptr_t bp = {{{{0}}}}; 9376 unsigned long long *words = (void *)&bp; 9377 char *buf; 9378 int err; 9379 9380 err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:" 9381 "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx", 9382 words + 0, words + 1, words + 2, words + 3, 9383 words + 4, words + 5, words + 6, words + 7, 9384 words + 8, words + 9, words + 10, words + 11, 9385 words + 12, words + 13, words + 14, words + 15); 9386 if (err != 16) { 9387 (void) fprintf(stderr, "invalid input format\n"); 9388 zdb_exit(1); 9389 } 9390 ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE); 9391 buf = malloc(SPA_MAXBLOCKSIZE); 9392 if (buf == NULL) { 9393 (void) fprintf(stderr, "out of memory\n"); 9394 zdb_exit(1); 9395 } 9396 err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp)); 9397 if (err != 0) { 9398 (void) fprintf(stderr, "decode failed: %u\n", err); 9399 zdb_exit(1); 9400 } 9401 zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0); 9402 free(buf); 9403 } 9404 9405 /* check for valid hex or decimal numeric string */ 9406 static boolean_t 9407 zdb_numeric(char *str) 9408 { 9409 int i = 0, len; 9410 9411 len = strlen(str); 9412 if (len == 0) 9413 return (B_FALSE); 9414 if (strncmp(str, "0x", 2) == 0 || strncmp(str, "0X", 2) == 0) 9415 i = 2; 9416 for (; i < len; i++) { 9417 if (!isxdigit(str[i])) 9418 return (B_FALSE); 9419 } 9420 return (B_TRUE); 9421 } 9422 9423 static int 9424 dummy_get_file_info(dmu_object_type_t bonustype, const void *data, 9425 zfs_file_info_t *zoi) 9426 { 9427 (void) data, (void) zoi; 9428 9429 
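	/*
	 * zdb never resolves user/group accounting itself, so this
	 * registered callback only needs to reject foreign bonus types
	 * with ENOENT and abort loudly if it is ever handed a real
	 * znode/SA bonus, which would indicate a bug.
	 */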

static void
zdb_embedded_block(char *thing)
{
    blkptr_t bp = {{{{0}}}};
    unsigned long long *words = (void *)&bp;
    char *buf;
    int err;

    err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:"
        "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx",
        words + 0, words + 1, words + 2, words + 3,
        words + 4, words + 5, words + 6, words + 7,
        words + 8, words + 9, words + 10, words + 11,
        words + 12, words + 13, words + 14, words + 15);
    if (err != 16) {
        (void) fprintf(stderr, "invalid input format\n");
        zdb_exit(1);
    }
    ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE);
    buf = malloc(SPA_MAXBLOCKSIZE);
    if (buf == NULL) {
        (void) fprintf(stderr, "out of memory\n");
        zdb_exit(1);
    }
    err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp));
    if (err != 0) {
        (void) fprintf(stderr, "decode failed: %d\n", err);
        zdb_exit(1);
    }
    zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0);
    free(buf);
}

/*
 * Check for a valid hex or decimal numeric string; an optional "0x"/"0X"
 * prefix is accepted, and every remaining character must be a hex digit.
 */
static boolean_t
zdb_numeric(char *str)
{
    int i = 0, len;

    len = strlen(str);
    if (len == 0)
        return (B_FALSE);
    if (strncmp(str, "0x", 2) == 0 || strncmp(str, "0X", 2) == 0)
        i = 2;
    for (; i < len; i++) {
        if (!isxdigit(str[i]))
            return (B_FALSE);
    }
    return (B_TRUE);
}
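
/*
 * For illustration: the -E option (dispatched from main() below) hands
 * zdb_embedded_block() a block pointer spelled out as sixteen
 * colon-separated hex words, i.e. an invocation shaped like
 *
 *	zdb -E word0:word1:...:word15
 *
 * where each word is one of the sixteen 64-bit words of the blkptr_t.
 * zdb_numeric() above is the companion helper that -d/-N use to decide
 * whether an argument such as "tank/100" ends in an objset ID.
 */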

static int
dummy_get_file_info(dmu_object_type_t bonustype, const void *data,
    zfs_file_info_t *zoi)
{
    (void) data, (void) zoi;

    if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
        return (ENOENT);

    (void) fprintf(stderr, "dummy_get_file_info: not implemented\n");
    abort();
}

int
main(int argc, char **argv)
{
    int c;
    int dump_all = 1;
    int verbose = 0;
    int error = 0;
    char **searchdirs = NULL;
    int nsearch = 0;
    char *target, *target_pool, dsname[ZFS_MAX_DATASET_NAME_LEN];
    nvlist_t *policy = NULL;
    uint64_t max_txg = UINT64_MAX;
    int64_t objset_id = -1;
    uint64_t object;
    int flags = ZFS_IMPORT_MISSING_LOG;
    int rewind = ZPOOL_NEVER_REWIND;
    char *spa_config_path_env, *objset_str;
    boolean_t target_is_spa = B_TRUE, dataset_lookup = B_FALSE;
    nvlist_t *cfg = NULL;
    struct sigaction action;
    boolean_t force_import = B_FALSE;
    boolean_t config_path_console = B_FALSE;
    char pbuf[MAXPATHLEN];

    dprintf_setup(&argc, argv);

    /*
     * Set up signal handlers, so if we crash due to bad on-disk data we
     * can get more info. Unlike ztest, we don't bail out if we can't set
     * up signal handlers, because zdb is very useful without them.
     */
    action.sa_handler = sig_handler;
    sigemptyset(&action.sa_mask);
    action.sa_flags = 0;
    if (sigaction(SIGSEGV, &action, NULL) < 0) {
        (void) fprintf(stderr, "zdb: cannot catch SIGSEGV: %s\n",
            strerror(errno));
    }
    if (sigaction(SIGABRT, &action, NULL) < 0) {
        (void) fprintf(stderr, "zdb: cannot catch SIGABRT: %s\n",
            strerror(errno));
    }

    /*
     * If the SPA_CONFIG_PATH environment variable is set, it overrides
     * the default spa_config_path setting. If -U is specified, it in
     * turn overrides the environment variable.
     */
    spa_config_path_env = getenv("SPA_CONFIG_PATH");
    if (spa_config_path_env != NULL)
        spa_config_path = spa_config_path_env;

    /*
     * For performance reasons, we set this tunable down. We do so before
     * the arg parsing section so that the user can override this value
     * if they choose.
     */
    zfs_btree_verify_intensity = 3;

    struct option long_options[] = {
        {"ignore-assertions",       no_argument,        NULL, 'A'},
        {"block-stats",             no_argument,        NULL, 'b'},
        {"backup",                  no_argument,        NULL, 'B'},
        {"checksum",                no_argument,        NULL, 'c'},
        {"config",                  no_argument,        NULL, 'C'},
        {"datasets",                no_argument,        NULL, 'd'},
        {"dedup-stats",             no_argument,        NULL, 'D'},
        {"exported",                no_argument,        NULL, 'e'},
        {"embedded-block-pointer",  no_argument,        NULL, 'E'},
        {"automatic-rewind",        no_argument,        NULL, 'F'},
        {"dump-debug-msg",          no_argument,        NULL, 'G'},
        {"history",                 no_argument,        NULL, 'h'},
        {"intent-logs",             no_argument,        NULL, 'i'},
        {"inflight",                required_argument,  NULL, 'I'},
        {"checkpointed-state",      no_argument,        NULL, 'k'},
        {"key",                     required_argument,  NULL, 'K'},
        {"label",                   no_argument,        NULL, 'l'},
        {"disable-leak-tracking",   no_argument,        NULL, 'L'},
        {"metaslabs",               no_argument,        NULL, 'm'},
        {"metaslab-groups",         no_argument,        NULL, 'M'},
        {"numeric",                 no_argument,        NULL, 'N'},
        {"option",                  required_argument,  NULL, 'o'},
        {"object-lookups",          no_argument,        NULL, 'O'},
        {"path",                    required_argument,  NULL, 'p'},
        {"parseable",               no_argument,        NULL, 'P'},
        {"skip-label",              no_argument,        NULL, 'q'},
        {"copy-object",             no_argument,        NULL, 'r'},
        {"read-block",              no_argument,        NULL, 'R'},
        {"io-stats",                no_argument,        NULL, 's'},
        {"simulate-dedup",          no_argument,        NULL, 'S'},
        {"txg",                     required_argument,  NULL, 't'},
        {"brt-stats",               no_argument,        NULL, 'T'},
        {"uberblock",               no_argument,        NULL, 'u'},
        {"cachefile",               required_argument,  NULL, 'U'},
        {"verbose",                 no_argument,        NULL, 'v'},
        {"verbatim",                no_argument,        NULL, 'V'},
        {"dump-blocks",             required_argument,  NULL, 'x'},
        {"extreme-rewind",          no_argument,        NULL, 'X'},
        {"all-reconstruction",      no_argument,        NULL, 'Y'},
        {"livelist",                no_argument,        NULL, 'y'},
        {"zstd-headers",            no_argument,        NULL, 'Z'},
        {"allocated-map",           no_argument,        NULL, ARG_ALLOCATED},
        {"bin",                     required_argument,  NULL, ARG_BLOCK_BIN_MODE},
        {"class",                   required_argument,  NULL, ARG_BLOCK_CLASSES},
        {0, 0, 0, 0}
    };

    while ((c = getopt_long(argc, argv,
        "AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:TuU:vVx:XYyZ",
        long_options, NULL)) != -1) {
        switch (c) {
        case 'b':
        case 'B':
        case 'c':
        case 'C':
        case 'd':
        case 'D':
        case 'E':
        case 'G':
        case 'h':
        case 'i':
        case 'l':
        case 'm':
        case 'M':
        case 'N':
        case 'O':
        case 'r':
        case 'R':
        case 's':
        case 'S':
        case 'T':
        case 'u':
        case 'y':
        case 'Z':
        case ARG_ALLOCATED:
            dump_opt[c]++;
            dump_all = 0;
            break;
        case 'A':
        case 'e':
        case 'F':
        case 'k':
        case 'L':
        case 'P':
        case 'q':
        case 'X':
            dump_opt[c]++;
            break;
        case 'Y':
            zfs_reconstruct_indirect_combinations_max = INT_MAX;
            zfs_deadman_enabled = 0;
            break;
        /* NB: Sort single match options below. */
        case 'I':
            max_inflight_bytes = strtoull(optarg, NULL, 0);
            if (max_inflight_bytes == 0) {
                (void) fprintf(stderr, "maximum number "
                    "of inflight bytes must be greater "
                    "than 0\n");
                usage();
            }
            break;
        case 'K':
            dump_opt[c]++;
            key_material = strdup(optarg);
            /* redact key material in process table */
            while (*optarg != '\0') { *optarg++ = '*'; }
            break;
        case 'o':
            dump_opt[c]++;
            dump_all = 0;
            error = handle_tunable_option(optarg, B_FALSE);
            if (error != 0)
                zdb_exit(1);
            break;
        case 'p':
            if (searchdirs == NULL) {
                searchdirs = umem_alloc(sizeof (char *),
                    UMEM_NOFAIL);
            } else {
                char **tmp = umem_alloc((nsearch + 1) *
                    sizeof (char *), UMEM_NOFAIL);
                memcpy(tmp, searchdirs, nsearch *
                    sizeof (char *));
                umem_free(searchdirs,
                    nsearch * sizeof (char *));
                searchdirs = tmp;
            }
            searchdirs[nsearch++] = optarg;
            break;
        case 't':
            max_txg = strtoull(optarg, NULL, 0);
            if (max_txg < TXG_INITIAL) {
                (void) fprintf(stderr, "incorrect txg "
                    "specified: %s\n", optarg);
                usage();
            }
            break;
        case 'U':
            config_path_console = B_TRUE;
            spa_config_path = optarg;
            if (spa_config_path[0] != '/') {
                (void) fprintf(stderr,
                    "cachefile must be an absolute path "
                    "(i.e. start with a slash)\n");
                usage();
            }
            break;
        case 'v':
            verbose++;
            break;
        case 'V':
            flags = ZFS_IMPORT_VERBATIM;
            break;
        case 'x':
            vn_dumpdir = optarg;
            break;
        case ARG_BLOCK_BIN_MODE:
            if (strcmp(optarg, "lsize") == 0) {
                block_bin_mode = BIN_LSIZE;
            } else if (strcmp(optarg, "psize") == 0) {
                block_bin_mode = BIN_PSIZE;
            } else if (strcmp(optarg, "asize") == 0) {
                block_bin_mode = BIN_ASIZE;
            } else {
                (void) fprintf(stderr,
                    "--bin=\"%s\" must be one of \"lsize\", "
                    "\"psize\" or \"asize\"\n", optarg);
                usage();
            }
            break;

        case ARG_BLOCK_CLASSES: {
            char *buf = strdup(optarg), *tok = buf, *next,
                *save = NULL;

            while ((next = strtok_r(tok, ",", &save)) != NULL) {
                tok = NULL;

                if (strcmp(next, "normal") == 0) {
                    block_classes |= CLASS_NORMAL;
                } else if (strcmp(next, "special") == 0) {
                    block_classes |= CLASS_SPECIAL;
                } else if (strcmp(next, "dedup") == 0) {
                    block_classes |= CLASS_DEDUP;
                } else if (strcmp(next, "other") == 0) {
                    block_classes |= CLASS_OTHER;
                } else {
                    (void) fprintf(stderr,
                        "--class=\"%s\" must be a "
                        "comma-separated list of either "
                        "\"normal\", \"special\", "
                        "\"dedup\" or \"other\"; "
                        "got \"%s\"\n",
                        optarg, next);
                    usage();
                }
            }

            if (block_classes == 0) {
                (void) fprintf(stderr,
                    "--class= must be a comma-separated "
                    "list of either \"normal\", \"special\", "
                    "\"dedup\" or \"other\"; got empty\n");
                usage();
            }

            free(buf);
            break;
        }
        default:
            usage();
            break;
        }
    }
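
    /*
     * A plausible combination of the long-only options above, going by
     * the option and enum names (pool name hypothetical):
     *
     *	zdb -bb --bin=lsize --class=special,dedup tank
     *
     * i.e. bin the -b block histograms by logical size and restrict them
     * to blocks from the special and dedup allocation classes.
     */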

    if (!dump_opt['e'] && searchdirs != NULL) {
        (void) fprintf(stderr, "-p option requires use of -e\n");
        usage();
    }
#if defined(_LP64)
    /*
     * ZDB does not typically re-read blocks; therefore limit the ARC
     * to 256 MB, which can be used entirely for metadata.
     */
    zfs_arc_min = 2ULL << SPA_MAXBLOCKSHIFT;
    zfs_arc_max = 256 * 1024 * 1024;
#endif
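
    /*
     * Arithmetic check on the floor chosen above: SPA_MAXBLOCKSHIFT is
     * 24 (16 MiB blocks), so zfs_arc_min comes to 2 << 24 = 32 MiB,
     * comfortably under the 256 MiB zfs_arc_max cap.
     */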

    /*
     * "zdb -c" uses checksum-verifying scrub i/os which are async reads.
     * "zdb -b" uses traversal prefetch which uses async reads.
     * For good performance, let several of them be active at once.
     */
    zfs_vdev_async_read_max_active = 10;

    /*
     * Disable reference tracking for better performance.
     */
    reference_tracking_enable = B_FALSE;

    /*
     * Do not fail spa_load when spa_load_verify fails. This is needed
     * to load non-idle pools.
     */
    spa_load_verify_dryrun = B_TRUE;

    /*
     * ZDB should be able to read spacemaps.
     */
    spa_mode_readable_spacemaps = B_TRUE;

    libspl_set_assert_ok((dump_opt['A'] == 1) || (dump_opt['A'] > 2));
    zfs_recover = (dump_opt['A'] > 1);

    if (dump_all)
        verbose = MAX(verbose, 1);

    for (c = 0; c < 256; c++) {
        if (dump_all && strchr("ABeEFkKlLNOPrRSXy", c) == NULL)
            dump_opt[c] = 1;
        if (dump_opt[c])
            dump_opt[c] += verbose;
    }

    argc -= optind;
    argv += optind;
    if (argc < 2 && dump_opt['R'])
        usage();

    target = argv[0];

    /*
     * Automatically derive the cachefile from the target pool's
     * "cachefile" property, unless -U or SPA_CONFIG_PATH already
     * selected one.
     */
    if (!spa_config_path_env && !config_path_console && target &&
        libzfs_core_init() == 0) {
        char *pname = strdup(target);
        const char *value;
        nvlist_t *pnvl = NULL;
        nvlist_t *vnvl = NULL;

        if (strpbrk(pname, "/@") != NULL)
            *strpbrk(pname, "/@") = '\0';

        if (pname && lzc_get_props(pname, &pnvl) == 0) {
            if (nvlist_lookup_nvlist(pnvl, "cachefile",
                &vnvl) == 0) {
                value = fnvlist_lookup_string(vnvl,
                    ZPROP_VALUE);
            } else {
                value = "-";
            }
            strlcpy(pbuf, value, sizeof (pbuf));
            if (pbuf[0] != '\0') {
                if (pbuf[0] == '/') {
                    if (access(pbuf, F_OK) == 0)
                        spa_config_path = pbuf;
                    else
                        force_import = B_TRUE;
                } else if ((strcmp(pbuf, "-") == 0 &&
                    access(ZPOOL_CACHE, F_OK) != 0) ||
                    strcmp(pbuf, "none") == 0) {
                    force_import = B_TRUE;
                }
            }
            nvlist_free(vnvl);
        }

        free(pname);
        nvlist_free(pnvl);
        libzfs_core_fini();
    }
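
    /*
     * Net effect of the block above (pool name hypothetical): for a pool
     * created with "zpool create -o cachefile=none tank ...", there is no
     * cache file to consult, so force_import is set and the import path
     * below is taken just as if -e had been supplied.
     */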

    dmu_objset_register_type(DMU_OST_ZFS, dummy_get_file_info);
    kernel_init(SPA_MODE_READ);
    kernel_init_done = B_TRUE;

    if (dump_opt['E']) {
        if (argc != 1)
            usage();
        zdb_embedded_block(argv[0]);
        error = 0;
        goto fini;
    }

    if (argc < 1) {
        if (!dump_opt['e'] && dump_opt['C']) {
            dump_cachefile(spa_config_path);
            error = 0;
            goto fini;
        }
        if (dump_opt['o'])
            /*
             * Avoid blasting tunable options off the top of the
             * screen.
             */
            zdb_exit(1);
        usage();
    }

    if (dump_opt['l']) {
        error = dump_label(argv[0]);
        goto fini;
    }

    if (dump_opt['X'] || dump_opt['F'])
        rewind = ZPOOL_DO_REWIND |
            (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);

    /* -N implies -d */
    if (dump_opt['N'] && dump_opt['d'] == 0)
        dump_opt['d'] = dump_opt['N'];

    if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 ||
        nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, max_txg) != 0 ||
        nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind) != 0)
        fatal("internal error: %s", strerror(ENOMEM));

    error = 0;

    if (strpbrk(target, "/@") != NULL) {
        size_t targetlen;

        target_pool = strdup(target);
        *strpbrk(target_pool, "/@") = '\0';

        target_is_spa = B_FALSE;
        targetlen = strlen(target);
        if (targetlen && target[targetlen - 1] == '/')
            target[targetlen - 1] = '\0';

        /*
         * See if an objset ID was supplied (-d <pool>/<objset ID>).
         * To disambiguate tank/100, treat 100 as an objset ID if -N
         * was given; otherwise 100 is an objset ID only if lookup
         * of tank/100 as a named dataset fails.
         */
        objset_str = strchr(target, '/');
        if (objset_str && strlen(objset_str) > 1 &&
            zdb_numeric(objset_str + 1)) {
            char *endptr;
            errno = 0;
            objset_str++;
            objset_id = strtoull(objset_str, &endptr, 0);
            /* dataset 0 is the same as opening the pool */
            if (errno == 0 && endptr != objset_str &&
                objset_id != 0) {
                if (dump_opt['N'])
                    dataset_lookup = B_TRUE;
            }
            /* normal dataset name not an objset ID */
            if (endptr == objset_str) {
                objset_id = -1;
            }
        } else if (objset_str && !zdb_numeric(objset_str + 1) &&
            dump_opt['N']) {
            printf("Supply a numeric objset ID with -N\n");
            error = 2;
            goto fini;
        }
    } else {
        target_pool = target;
    }
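
    /*
     * Concretely (names hypothetical): "zdb -d tank/100" first tries
     * tank/100 as a dataset name and falls back to objset ID 100 only
     * if that lookup fails, while "zdb -dN tank/100" skips the name
     * lookup and requires 100 to be an objset ID.
     */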

    if (dump_opt['e'] || force_import) {
        importargs_t args = { 0 };

        /*
         * If path is not provided, search in /dev
         */
        if (searchdirs == NULL) {
            searchdirs = umem_alloc(sizeof (char *), UMEM_NOFAIL);
            searchdirs[nsearch++] = (char *)ZFS_DEVDIR;
        }

        args.paths = nsearch;
        args.path = searchdirs;
        args.can_be_active = B_TRUE;

        libpc_handle_t lpch = {
            .lpc_lib_handle = NULL,
            .lpc_ops = &libzpool_config_ops,
            .lpc_printerr = B_TRUE
        };
        error = zpool_find_config(&lpch, target_pool, &cfg, &args);

        if (error == 0) {
            if (nvlist_add_nvlist(cfg,
                ZPOOL_LOAD_POLICY, policy) != 0) {
                fatal("can't open '%s': %s",
                    target, strerror(ENOMEM));
            }

            if (dump_opt['C'] > 1) {
                (void) printf("\nConfiguration for import:\n");
                dump_nvlist(cfg, 8);
            }

            /*
             * Disable the activity check to allow examination of
             * active pools.
             */
            error = spa_import(target_pool, cfg, NULL,
                flags | ZFS_IMPORT_SKIP_MMP);
        }
    }

    if (searchdirs != NULL) {
        umem_free(searchdirs, nsearch * sizeof (char *));
        searchdirs = NULL;
    }
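
    /*
     * Typical use of the import path above (directory hypothetical):
     * examining an exported pool whose devices are not under /dev, e.g.
     *
     *	zdb -e -p /dev/disk/by-id tank
     *
     * which searches the named directory for the pool's devices instead
     * of relying on a cache file.
     */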

    /*
     * -O (and dump_path in general) must be handled after the -e option
     * has been processed, since -e imports the pool into the namespace
     * when it is not in the cachefile.
     */
    if (dump_opt['O']) {
        if (argc != 2)
            usage();
        dump_opt['v'] = verbose + 3;
        error = dump_path(argv[0], argv[1], NULL);
        goto fini;
    }

    if (dump_opt['r']) {
        target_is_spa = B_FALSE;
        if (argc != 3)
            usage();
        dump_opt['v'] = verbose;
        error = dump_path(argv[0], argv[1], &object);
        if (error != 0)
            fatal("internal error: %s", strerror(error));
    }

    /*
     * import_checkpointed_state assumes that the target pool we pass it
     * is already part of the spa namespace. Because of that we need to
     * make sure to call it always after the -e option has been
     * processed, which imports the pool into the namespace if it's not
     * in the cachefile.
     */
    char *checkpoint_pool = NULL;
    char *checkpoint_target = NULL;
    if (dump_opt['k']) {
        checkpoint_pool = import_checkpointed_state(target, cfg,
            target_is_spa, &checkpoint_target);

        if (checkpoint_target != NULL)
            target = checkpoint_target;
    }

    if (cfg != NULL) {
        nvlist_free(cfg);
        cfg = NULL;
    }

    if (target_pool != target)
        free(target_pool);

    if (error == 0) {
        if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) {
            ASSERT(checkpoint_pool != NULL);
            ASSERT0P(checkpoint_target);

            error = spa_open(checkpoint_pool, &spa, FTAG);
            if (error != 0) {
                fatal("Tried to open pool \"%s\" but "
                    "spa_open() failed with error %d\n",
                    checkpoint_pool, error);
            }
        } else if (target_is_spa || dump_opt['R'] || dump_opt['B'] ||
            objset_id == 0) {
            zdb_set_skip_mmp(target);
            error = spa_open_rewind(target, &spa, FTAG, policy,
                NULL);
            if (error) {
                /*
                 * If we're missing the log device then
                 * try opening the pool after clearing the
                 * log state.
                 */
                spa_namespace_enter(FTAG);
                if ((spa = spa_lookup(target)) != NULL &&
                    spa->spa_log_state == SPA_LOG_MISSING) {
                    spa->spa_log_state = SPA_LOG_CLEAR;
                    error = 0;
                }
                spa_namespace_exit(FTAG);

                if (!error) {
                    error = spa_open_rewind(target, &spa,
                        FTAG, policy, NULL);
                }
            }
        } else if (strpbrk(target, "#") != NULL) {
            dsl_pool_t *dp;
            error = dsl_pool_hold(target, FTAG, &dp);
            if (error != 0) {
                fatal("can't dump '%s': %s", target,
                    strerror(error));
            }
            error = dump_bookmark(dp, target, B_TRUE, verbose > 1);
            dsl_pool_rele(dp, FTAG);
            if (error != 0) {
                fatal("can't dump '%s': %s", target,
                    strerror(error));
            }
            goto fini;
        } else {
            target_pool = strdup(target);
            if (strpbrk(target, "/@") != NULL)
                *strpbrk(target_pool, "/@") = '\0';

            zdb_set_skip_mmp(target);
            /*
             * If -N was supplied, the user has indicated that
             * zdb -d <pool>/<objsetID> is in effect. Otherwise
             * we first assume that the dataset string is the
             * dataset name. If dmu_objset_hold fails with the
             * dataset string, and we have an objset_id, retry the
             * lookup with the objsetID.
             */
            boolean_t retry = B_TRUE;
retry_lookup:
            if (dataset_lookup == B_TRUE) {
                /*
                 * Use the supplied id to get the name
                 * for open_objset.
                 */
                error = spa_open(target_pool, &spa, FTAG);
                if (error == 0) {
                    error = name_from_objset_id(spa,
                        objset_id, dsname);
                    spa_close(spa, FTAG);
                    if (error == 0)
                        target = dsname;
                }
            }
            if (error == 0) {
                if (objset_id > 0 && retry) {
                    int err = dmu_objset_hold(target, FTAG,
                        &os);
                    if (err) {
                        dataset_lookup = B_TRUE;
                        retry = B_FALSE;
                        goto retry_lookup;
                    } else {
                        dmu_objset_rele(os, FTAG);
                    }
                }
                error = open_objset(target, FTAG, &os);
            }
            if (error == 0)
                spa = dmu_objset_spa(os);
            free(target_pool);
        }
    }
    nvlist_free(policy);

    if (error)
        fatal("can't open '%s': %s", target, strerror(error));

    /*
     * Set the pool failure mode to panic in order to prevent the pool
     * from suspending. A suspended I/O will have no way to resume and
     * can prevent the zdb(8) command from terminating as expected.
     */
    if (spa != NULL)
        spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;

    argv++;
    argc--;
    if (dump_opt['r']) {
        error = zdb_copy_object(os, object, argv[1]);
    } else if (!dump_opt['R']) {
        flagbits['d'] = ZOR_FLAG_DIRECTORY;
        flagbits['f'] = ZOR_FLAG_PLAIN_FILE;
        flagbits['m'] = ZOR_FLAG_SPACE_MAP;
        flagbits['z'] = ZOR_FLAG_ZAP;
        flagbits['A'] = ZOR_FLAG_ALL_TYPES;

        if (argc > 0 && dump_opt['d']) {
            zopt_object_args = argc;
            zopt_object_ranges = calloc(zopt_object_args,
                sizeof (zopt_object_range_t));
            for (unsigned i = 0; i < zopt_object_args; i++) {
                int err;
                const char *msg = NULL;

                err = parse_object_range(argv[i],
                    &zopt_object_ranges[i], &msg);
                if (err != 0)
                    fatal("Bad object or range: '%s': %s\n",
                        argv[i], msg ?: "");
            }
        } else if (argc > 0 && dump_opt['m']) {
            zopt_metaslab_args = argc;
            zopt_metaslab = calloc(zopt_metaslab_args,
                sizeof (uint64_t));
            for (unsigned i = 0; i < zopt_metaslab_args; i++) {
                errno = 0;
                zopt_metaslab[i] = strtoull(argv[i], NULL, 0);
                if (zopt_metaslab[i] == 0 && errno != 0)
                    fatal("bad number %s: %s", argv[i],
                        strerror(errno));
            }
        }
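
        /*
         * Examples of arguments parse_object_range() accepts above
         * (object numbers hypothetical): a single object such as
         * "128", or a range with optional type flags such as
         * "128:1024:f", which limits the dump to plain-file objects
         * per the flagbits[] table just set up.
         */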
        if (dump_opt['B']) {
            dump_backup(target, objset_id,
                argc > 0 ? argv[0] : NULL);
        } else if (os != NULL) {
            dump_objset(os);
        } else if (zopt_object_args > 0 && !dump_opt['m']) {
            dump_objset(spa->spa_meta_objset);
        } else {
            dump_zpool(spa);
        }
    } else {
        flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
        flagbits['c'] = ZDB_FLAG_CHECKSUM;
        flagbits['d'] = ZDB_FLAG_DECOMPRESS;
        flagbits['e'] = ZDB_FLAG_BSWAP;
        flagbits['g'] = ZDB_FLAG_GBH;
        flagbits['i'] = ZDB_FLAG_INDIRECT;
        flagbits['r'] = ZDB_FLAG_RAW;
        flagbits['v'] = ZDB_FLAG_VERBOSE;

        for (int i = 0; i < argc; i++)
            zdb_read_block(argv[i], spa);
    }

    if (dump_opt['k']) {
        free(checkpoint_pool);
        if (!target_is_spa)
            free(checkpoint_target);
    }

fini:
    if (spa != NULL)
        zdb_ddt_cleanup(spa);

    if (os != NULL) {
        close_objset(os, FTAG);
    } else if (spa != NULL) {
        spa_close(spa, FTAG);
    }

    fuid_table_destroy();

    dump_debug_buffer();

    if (kernel_init_done)
        kernel_fini();

    if (corruption_found && error == 0)
        error = 3;

    return (error);
}
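
/*
 * A note on exit status as wired up above: zdb_exit(1) for hard failures,
 * error 2 when -N is supplied without a numeric objset ID, and error 3
 * when the run itself succeeded but corruption_found was set along the
 * way.
 */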