/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2016 Nexenta Systems, Inc.
 * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC.
 * Copyright (c) 2015, 2017, Intel Corporation.
 * Copyright (c) 2020 Datto Inc.
 * Copyright (c) 2020, The FreeBSD Foundation [1]
 *
 * [1] Portions of this software were developed by Allan Jude
 *     under sponsorship from the FreeBSD Foundation.
 * Copyright (c) 2021 Allan Jude
 * Copyright (c) 2021 Toomas Soome <tsoome@me.com>
 */

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <ctype.h>
#include <getopt.h>
#include <openssl/evp.h>
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_sa.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_bookmark.h>
#include <sys/dbuf.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/stat.h>
#include <sys/resource.h>
#include <sys/dmu_send.h>
#include <sys/dmu_traverse.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/zfs_fuid.h>
#include <sys/arc.h>
#include <sys/arc_impl.h>
#include <sys/ddt.h>
#include <sys/zfeature.h>
#include <sys/abd.h>
#include <sys/blkptr.h>
#include <sys/dsl_crypt.h>
#include <sys/dsl_scan.h>
#include <sys/btree.h>
#include <zfs_comutil.h>
#include <sys/zstd/zstd.h>

#include <libnvpair.h>
#include <libzutil.h>

#include "zdb.h"

#define	ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ?	\
	zio_compress_table[(idx)].ci_name : "UNKNOWN")
#define	ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ?	\
	zio_checksum_table[(idx)].ci_name : "UNKNOWN")

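/*
 * Map the extended object types (DMU_OTN_*) back to a base DMU_OT_* value
 * so they can index the fixed-size type tables; anything unrecognized maps
 * to DMU_OT_NUMTYPES.
 */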
#define	ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) :		\
	(idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA ?	\
	DMU_OT_ZAP_OTHER :						\
	(idx) == DMU_OTN_UINT64_DATA || (idx) == DMU_OTN_UINT64_METADATA ? \
	DMU_OT_UINT64_OTHER : DMU_OT_NUMTYPES)

/* Some platforms require part of inode IDs to be remapped */
#ifdef __APPLE__
#define	ZDB_MAP_OBJECT_ID(obj) INO_XNUTOZFS(obj, 2)
#else
#define	ZDB_MAP_OBJECT_ID(obj) (obj)
#endif

static const char *
zdb_ot_name(dmu_object_type_t type)
{
	if (type < DMU_OT_NUMTYPES)
		return (dmu_ot[type].ot_name);
	else if ((type & DMU_OT_NEWTYPE) &&
	    ((type & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS))
		return (dmu_ot_byteswap[type & DMU_OT_BYTESWAP_MASK].ob_name);
	else
		return ("UNKNOWN");
}

extern int reference_tracking_enable;
extern int zfs_recover;
extern uint_t zfs_vdev_async_read_max_active;
extern boolean_t spa_load_verify_dryrun;
extern boolean_t spa_mode_readable_spacemaps;
extern uint_t zfs_reconstruct_indirect_combinations_max;
extern uint_t zfs_btree_verify_intensity;

static const char cmdname[] = "zdb";
uint8_t dump_opt[256];

typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);

static uint64_t *zopt_metaslab = NULL;
static unsigned zopt_metaslab_args = 0;

typedef struct zopt_object_range {
	uint64_t zor_obj_start;
	uint64_t zor_obj_end;
	uint64_t zor_flags;
} zopt_object_range_t;

static zopt_object_range_t *zopt_object_ranges = NULL;
static unsigned zopt_object_args = 0;

static int flagbits[256];

#define	ZOR_FLAG_PLAIN_FILE	0x0001
#define	ZOR_FLAG_DIRECTORY	0x0002
#define	ZOR_FLAG_SPACE_MAP	0x0004
#define	ZOR_FLAG_ZAP		0x0008
#define	ZOR_FLAG_ALL_TYPES	-1
#define	ZOR_SUPPORTED_FLAGS	(ZOR_FLAG_PLAIN_FILE	| \
				ZOR_FLAG_DIRECTORY	| \
				ZOR_FLAG_SPACE_MAP	| \
				ZOR_FLAG_ZAP)

#define	ZDB_FLAG_CHECKSUM	0x0001
#define	ZDB_FLAG_DECOMPRESS	0x0002
#define	ZDB_FLAG_BSWAP		0x0004
#define	ZDB_FLAG_GBH		0x0008
#define	ZDB_FLAG_INDIRECT	0x0010
#define	ZDB_FLAG_RAW		0x0020
#define	ZDB_FLAG_PRINT_BLKPTR	0x0040
#define	ZDB_FLAG_VERBOSE	0x0080

static uint64_t max_inflight_bytes = 256 * 1024 * 1024; /* 256MB */
static int leaked_objects = 0;
static range_tree_t *mos_refd_objs;

static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *,
    boolean_t);
static void mos_obj_refd(uint64_t);
static void mos_obj_refd_multiple(uint64_t);
static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free,
    dmu_tx_t *tx);

typedef struct sublivelist_verify {
	/* FREE's that haven't yet matched to an ALLOC, in one sub-livelist */
	zfs_btree_t sv_pair;

	/* ALLOC's without a matching FREE, accumulates across sub-livelists */
	zfs_btree_t sv_leftover;
} sublivelist_verify_t;

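/*
 * Total order used by the livelist verification B-Trees: compare by DVA[0]
 * vdev, then DVA[0] offset, then birth TXG, so the FREE and ALLOC entries
 * for the same block sort next to each other.
 */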
static int
livelist_compare(const void *larg, const void *rarg)
{
	const blkptr_t *l = larg;
	const blkptr_t *r = rarg;

	/* Sort them according to dva[0] */
	uint64_t l_dva0_vdev, r_dva0_vdev;
	l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]);
	r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]);
	if (l_dva0_vdev < r_dva0_vdev)
		return (-1);
	else if (l_dva0_vdev > r_dva0_vdev)
		return (+1);

	/* if vdevs are equal, sort by offsets. */
	uint64_t l_dva0_offset;
	uint64_t r_dva0_offset;
	l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]);
	r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]);
	if (l_dva0_offset < r_dva0_offset) {
		return (-1);
	} else if (l_dva0_offset > r_dva0_offset) {
		return (+1);
	}

	/*
	 * Since we're storing blkptrs without cancelling FREE/ALLOC pairs,
	 * it's possible the offsets are equal. In that case, sort by txg
	 */
	if (l->blk_birth < r->blk_birth) {
		return (-1);
	} else if (l->blk_birth > r->blk_birth) {
		return (+1);
	}
	return (0);
}

typedef struct sublivelist_verify_block {
	dva_t svb_dva;

	/*
	 * We need this to check if the block marked as allocated
	 * in the livelist was freed (and potentially reallocated)
	 * in the metaslab spacemaps at a later TXG.
	 */
	uint64_t svb_allocated_txg;
} sublivelist_verify_block_t;

static void zdb_print_blkptr(const blkptr_t *bp, int flags);

typedef struct sublivelist_verify_block_refcnt {
	/* block pointer entry in livelist being verified */
	blkptr_t svbr_blk;

	/*
	 * Refcount gets incremented to 1 when we encounter the first
	 * FREE entry for the svbr block pointer and a node for it
	 * is created in our ZDB verification/tracking metadata.
	 *
	 * As we encounter more FREE entries we increment this counter
	 * and similarly decrement it whenever we find the respective
	 * ALLOC entries for this block.
	 *
	 * When the refcount gets to 0 it means that all the FREE and
	 * ALLOC entries of this block have paired up and we no longer
	 * need to track it in our verification logic (e.g. the node
	 * containing this struct in our verification data structure
	 * should be freed).
	 *
	 * [refer to sublivelist_verify_blkptr() for the actual code]
	 */
	uint32_t svbr_refcnt;
} sublivelist_verify_block_refcnt_t;

static int
sublivelist_block_refcnt_compare(const void *larg, const void *rarg)
{
	const sublivelist_verify_block_refcnt_t *l = larg;
	const sublivelist_verify_block_refcnt_t *r = rarg;
	return (livelist_compare(&l->svbr_blk, &r->svbr_blk));
}

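/*
 * bpobj_iterate_nofree() callback: pair up the FREE and ALLOC entries of a
 * single sub-livelist, and record any ALLOC that is still unmatched in
 * sv_leftover for the later cross-check against the spacemaps.
 */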
static int
sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,
    dmu_tx_t *tx)
{
	ASSERT3P(tx, ==, NULL);
	struct sublivelist_verify *sv = arg;
	sublivelist_verify_block_refcnt_t current = {
			.svbr_blk = *bp,

			/*
			 * Start with 1 in case this is the first free entry.
			 * This field is not used for our B-Tree comparisons
			 * anyway.
			 */
			.svbr_refcnt = 1,
	};

	zfs_btree_index_t where;
	sublivelist_verify_block_refcnt_t *pair =
	    zfs_btree_find(&sv->sv_pair, &current, &where);
	if (free) {
		if (pair == NULL) {
			/* first free entry for this block pointer */
			zfs_btree_add(&sv->sv_pair, &current);
		} else {
			pair->svbr_refcnt++;
		}
	} else {
		if (pair == NULL) {
			/* block that is currently marked as allocated */
			for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
				if (DVA_IS_EMPTY(&bp->blk_dva[i]))
					break;
				sublivelist_verify_block_t svb = {
				    .svb_dva = bp->blk_dva[i],
				    .svb_allocated_txg = bp->blk_birth
				};

				if (zfs_btree_find(&sv->sv_leftover, &svb,
				    &where) == NULL) {
					zfs_btree_add_idx(&sv->sv_leftover,
					    &svb, &where);
				}
			}
		} else {
			/* alloc matches a free entry */
			pair->svbr_refcnt--;
			if (pair->svbr_refcnt == 0) {
				/* all allocs and frees have been matched */
				zfs_btree_remove_idx(&sv->sv_pair, &where);
			}
		}
	}

	return (0);
}

static int
sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle)
{
	int err;
	struct sublivelist_verify *sv = args;

	zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare,
	    sizeof (sublivelist_verify_block_refcnt_t));

	err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr,
	    sv, NULL);

	sublivelist_verify_block_refcnt_t *e;
	zfs_btree_index_t *cookie = NULL;
	while ((e = zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) {
		char blkbuf[BP_SPRINTF_LEN];
		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf),
		    &e->svbr_blk, B_TRUE);
		(void) printf("\tERROR: %d unmatched FREE(s): %s\n",
		    e->svbr_refcnt, blkbuf);
	}
	zfs_btree_destroy(&sv->sv_pair);

	return (err);
}

static int
livelist_block_compare(const void *larg, const void *rarg)
{
	const sublivelist_verify_block_t *l = larg;
	const sublivelist_verify_block_t *r = rarg;

	if (DVA_GET_VDEV(&l->svb_dva) < DVA_GET_VDEV(&r->svb_dva))
		return (-1);
	else if (DVA_GET_VDEV(&l->svb_dva) > DVA_GET_VDEV(&r->svb_dva))
		return (+1);

	if (DVA_GET_OFFSET(&l->svb_dva) < DVA_GET_OFFSET(&r->svb_dva))
		return (-1);
	else if (DVA_GET_OFFSET(&l->svb_dva) > DVA_GET_OFFSET(&r->svb_dva))
		return (+1);

	if (DVA_GET_ASIZE(&l->svb_dva) < DVA_GET_ASIZE(&r->svb_dva))
		return (-1);
	else if (DVA_GET_ASIZE(&l->svb_dva) > DVA_GET_ASIZE(&r->svb_dva))
		return (+1);

	return (0);
}

/*
 * Check for errors in a livelist while tracking all unfreed ALLOCs in the
 * sublivelist_verify_t: sv->sv_leftover
 */
static void
livelist_verify(dsl_deadlist_t *dl, void *arg)
{
	sublivelist_verify_t *sv = arg;
	dsl_deadlist_iterate(dl, sublivelist_verify_func, sv);
}

/*
 * Check for errors in the livelist entry and discard the intermediary
 * data structures
 */
static int
sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle)
{
	(void) args;
	sublivelist_verify_t sv;
	zfs_btree_create(&sv.sv_leftover, livelist_block_compare,
	    sizeof (sublivelist_verify_block_t));
	int err = sublivelist_verify_func(&sv, dle);
	zfs_btree_clear(&sv.sv_leftover);
	zfs_btree_destroy(&sv.sv_leftover);
	return (err);
}

typedef struct metaslab_verify {
	/*
	 * Tree containing all the leftover ALLOCs from the livelists
	 * that are part of this metaslab.
	 */
	zfs_btree_t mv_livelist_allocs;

	/*
	 * Metaslab information.
	 */
	uint64_t mv_vdid;
	uint64_t mv_msid;
	uint64_t mv_start;
	uint64_t mv_end;

	/*
	 * What's currently allocated for this metaslab.
	 */
	range_tree_t *mv_allocated;
} metaslab_verify_t;

typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg);

typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme, uint64_t txg,
    void *arg);

typedef struct unflushed_iter_cb_arg {
	spa_t *uic_spa;
	uint64_t uic_txg;
	void *uic_arg;
	zdb_log_sm_cb_t uic_cb;
} unflushed_iter_cb_arg_t;

static int
iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg)
{
	unflushed_iter_cb_arg_t *uic = arg;
	return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg));
}

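/*
 * Walk every log spacemap in the pool (spa_sm_logs_by_txg, in TXG order)
 * and hand each decoded entry, tagged with the log's TXG, to the given
 * callback.
 */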
405 */ 406 zfs_btree_t mv_livelist_allocs; 407 408 /* 409 * Metaslab information. 410 */ 411 uint64_t mv_vdid; 412 uint64_t mv_msid; 413 uint64_t mv_start; 414 uint64_t mv_end; 415 416 /* 417 * What's currently allocated for this metaslab. 418 */ 419 range_tree_t *mv_allocated; 420 } metaslab_verify_t; 421 422 typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg); 423 424 typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme, uint64_t txg, 425 void *arg); 426 427 typedef struct unflushed_iter_cb_arg { 428 spa_t *uic_spa; 429 uint64_t uic_txg; 430 void *uic_arg; 431 zdb_log_sm_cb_t uic_cb; 432 } unflushed_iter_cb_arg_t; 433 434 static int 435 iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg) 436 { 437 unflushed_iter_cb_arg_t *uic = arg; 438 return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg)); 439 } 440 441 static void 442 iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg) 443 { 444 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) 445 return; 446 447 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 448 for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); 449 sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { 450 space_map_t *sm = NULL; 451 VERIFY0(space_map_open(&sm, spa_meta_objset(spa), 452 sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); 453 454 unflushed_iter_cb_arg_t uic = { 455 .uic_spa = spa, 456 .uic_txg = sls->sls_txg, 457 .uic_arg = arg, 458 .uic_cb = cb 459 }; 460 VERIFY0(space_map_iterate(sm, space_map_length(sm), 461 iterate_through_spacemap_logs_cb, &uic)); 462 space_map_close(sm); 463 } 464 spa_config_exit(spa, SCL_CONFIG, FTAG); 465 } 466 467 static void 468 verify_livelist_allocs(metaslab_verify_t *mv, uint64_t txg, 469 uint64_t offset, uint64_t size) 470 { 471 sublivelist_verify_block_t svb = {{{0}}}; 472 DVA_SET_VDEV(&svb.svb_dva, mv->mv_vdid); 473 DVA_SET_OFFSET(&svb.svb_dva, offset); 474 DVA_SET_ASIZE(&svb.svb_dva, size); 475 zfs_btree_index_t where; 476 uint64_t end_offset = offset + size; 477 478 /* 479 * Look for an exact match for spacemap entry in the livelist entries. 
static int
metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg)
{
	metaslab_verify_t *mv = arg;
	uint64_t offset = sme->sme_offset;
	uint64_t size = sme->sme_run;
	uint64_t txg = sme->sme_txg;

	if (sme->sme_type == SM_ALLOC) {
		if (range_tree_contains(mv->mv_allocated,
		    offset, size)) {
			(void) printf("ERROR: DOUBLE ALLOC: "
			    "%llu [%llx:%llx] "
			    "%llu:%llu LOG_SM\n",
			    (u_longlong_t)txg, (u_longlong_t)offset,
			    (u_longlong_t)size, (u_longlong_t)mv->mv_vdid,
			    (u_longlong_t)mv->mv_msid);
		} else {
			range_tree_add(mv->mv_allocated,
			    offset, size);
		}
	} else {
		if (!range_tree_contains(mv->mv_allocated,
		    offset, size)) {
			(void) printf("ERROR: DOUBLE FREE: "
			    "%llu [%llx:%llx] "
			    "%llu:%llu LOG_SM\n",
			    (u_longlong_t)txg, (u_longlong_t)offset,
			    (u_longlong_t)size, (u_longlong_t)mv->mv_vdid,
			    (u_longlong_t)mv->mv_msid);
		} else {
			range_tree_remove(mv->mv_allocated,
			    offset, size);
		}
	}

	if (sme->sme_type != SM_ALLOC) {
		/*
		 * If something is freed in the spacemap, verify that
		 * it is not listed as allocated in the livelist.
		 */
		verify_livelist_allocs(mv, txg, offset, size);
	}
	return (0);
}

static int
spacemap_check_sm_log_cb(spa_t *spa, space_map_entry_t *sme,
    uint64_t txg, void *arg)
{
	metaslab_verify_t *mv = arg;
	uint64_t offset = sme->sme_offset;
	uint64_t vdev_id = sme->sme_vdev;

	vdev_t *vd = vdev_lookup_top(spa, vdev_id);

	/* skip indirect vdevs */
	if (!vdev_is_concrete(vd))
		return (0);

	if (vdev_id != mv->mv_vdid)
		return (0);

	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
	if (ms->ms_id != mv->mv_msid)
		return (0);

	if (txg < metaslab_unflushed_txg(ms))
		return (0);

	ASSERT3U(txg, ==, sme->sme_txg);
	return (metaslab_spacemap_validation_cb(sme, mv));
}

static void
spacemap_check_sm_log(spa_t *spa, metaslab_verify_t *mv)
{
	iterate_through_spacemap_logs(spa, spacemap_check_sm_log_cb, mv);
}

static void
spacemap_check_ms_sm(space_map_t *sm, metaslab_verify_t *mv)
{
	if (sm == NULL)
		return;

	VERIFY0(space_map_iterate(sm, space_map_length(sm),
	    metaslab_spacemap_validation_cb, mv));
}

static void iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg);

598 */ 599 static void 600 mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv) 601 { 602 zfs_btree_index_t where; 603 sublivelist_verify_block_t *svb; 604 ASSERT3U(zfs_btree_numnodes(&mv->mv_livelist_allocs), ==, 0); 605 for (svb = zfs_btree_first(&sv->sv_leftover, &where); 606 svb != NULL; 607 svb = zfs_btree_next(&sv->sv_leftover, &where, &where)) { 608 if (DVA_GET_VDEV(&svb->svb_dva) != mv->mv_vdid) 609 continue; 610 611 if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start && 612 (DVA_GET_OFFSET(&svb->svb_dva) + 613 DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_start) { 614 (void) printf("ERROR: Found block that crosses " 615 "metaslab boundary: <%llu:%llx:%llx>\n", 616 (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva), 617 (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva), 618 (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva)); 619 continue; 620 } 621 622 if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start) 623 continue; 624 625 if (DVA_GET_OFFSET(&svb->svb_dva) >= mv->mv_end) 626 continue; 627 628 if ((DVA_GET_OFFSET(&svb->svb_dva) + 629 DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_end) { 630 (void) printf("ERROR: Found block that crosses " 631 "metaslab boundary: <%llu:%llx:%llx>\n", 632 (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva), 633 (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva), 634 (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva)); 635 continue; 636 } 637 638 zfs_btree_add(&mv->mv_livelist_allocs, svb); 639 } 640 641 for (svb = zfs_btree_first(&mv->mv_livelist_allocs, &where); 642 svb != NULL; 643 svb = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) { 644 zfs_btree_remove(&sv->sv_leftover, svb); 645 } 646 } 647 648 /* 649 * [Livelist Check] 650 * Iterate through all the sublivelists and: 651 * - report leftover frees (**) 652 * - record leftover ALLOCs together with their TXG [see Cross Check] 653 * 654 * (**) Note: Double ALLOCs are valid in datasets that have dedup 655 * enabled. Similarly double FREEs are allowed as well but 656 * only if they pair up with a corresponding ALLOC entry once 657 * we our done with our sublivelist iteration. 658 * 659 * [Spacemap Check] 660 * for each metaslab: 661 * - iterate over spacemap and then the metaslab's entries in the 662 * spacemap log, then report any double FREEs and ALLOCs (do not 663 * blow up). 664 * 665 * [Cross Check] 666 * After finishing the Livelist Check phase and while being in the 667 * Spacemap Check phase, we find all the recorded leftover ALLOCs 668 * of the livelist check that are part of the metaslab that we are 669 * currently looking at in the Spacemap Check. We report any entries 670 * that are marked as ALLOCs in the livelists but have been actually 671 * freed (and potentially allocated again) after their TXG stamp in 672 * the spacemaps. Also report any ALLOCs from the livelists that 673 * belong to indirect vdevs (e.g. their vdev completed removal). 674 * 675 * Note that this will miss Log Spacemap entries that cancelled each other 676 * out before being flushed to the metaslab, so we are not guaranteed 677 * to match all erroneous ALLOCs. 
678 */ 679 static void 680 livelist_metaslab_validate(spa_t *spa) 681 { 682 (void) printf("Verifying deleted livelist entries\n"); 683 684 sublivelist_verify_t sv; 685 zfs_btree_create(&sv.sv_leftover, livelist_block_compare, 686 sizeof (sublivelist_verify_block_t)); 687 iterate_deleted_livelists(spa, livelist_verify, &sv); 688 689 (void) printf("Verifying metaslab entries\n"); 690 vdev_t *rvd = spa->spa_root_vdev; 691 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 692 vdev_t *vd = rvd->vdev_child[c]; 693 694 if (!vdev_is_concrete(vd)) 695 continue; 696 697 for (uint64_t mid = 0; mid < vd->vdev_ms_count; mid++) { 698 metaslab_t *m = vd->vdev_ms[mid]; 699 700 (void) fprintf(stderr, 701 "\rverifying concrete vdev %llu, " 702 "metaslab %llu of %llu ...", 703 (longlong_t)vd->vdev_id, 704 (longlong_t)mid, 705 (longlong_t)vd->vdev_ms_count); 706 707 uint64_t shift, start; 708 range_seg_type_t type = 709 metaslab_calculate_range_tree_type(vd, m, 710 &start, &shift); 711 metaslab_verify_t mv; 712 mv.mv_allocated = range_tree_create(NULL, 713 type, NULL, start, shift); 714 mv.mv_vdid = vd->vdev_id; 715 mv.mv_msid = m->ms_id; 716 mv.mv_start = m->ms_start; 717 mv.mv_end = m->ms_start + m->ms_size; 718 zfs_btree_create(&mv.mv_livelist_allocs, 719 livelist_block_compare, 720 sizeof (sublivelist_verify_block_t)); 721 722 mv_populate_livelist_allocs(&mv, &sv); 723 724 spacemap_check_ms_sm(m->ms_sm, &mv); 725 spacemap_check_sm_log(spa, &mv); 726 727 range_tree_vacate(mv.mv_allocated, NULL, NULL); 728 range_tree_destroy(mv.mv_allocated); 729 zfs_btree_clear(&mv.mv_livelist_allocs); 730 zfs_btree_destroy(&mv.mv_livelist_allocs); 731 } 732 } 733 (void) fprintf(stderr, "\n"); 734 735 /* 736 * If there are any segments in the leftover tree after we walked 737 * through all the metaslabs in the concrete vdevs then this means 738 * that we have segments in the livelists that belong to indirect 739 * vdevs and are marked as allocated. 740 */ 741 if (zfs_btree_numnodes(&sv.sv_leftover) == 0) { 742 zfs_btree_destroy(&sv.sv_leftover); 743 return; 744 } 745 (void) printf("ERROR: Found livelist blocks marked as allocated " 746 "for indirect vdevs:\n"); 747 748 zfs_btree_index_t *where = NULL; 749 sublivelist_verify_block_t *svb; 750 while ((svb = zfs_btree_destroy_nodes(&sv.sv_leftover, &where)) != 751 NULL) { 752 int vdev_id = DVA_GET_VDEV(&svb->svb_dva); 753 ASSERT3U(vdev_id, <, rvd->vdev_children); 754 vdev_t *vd = rvd->vdev_child[vdev_id]; 755 ASSERT(!vdev_is_concrete(vd)); 756 (void) printf("<%d:%llx:%llx> TXG %llx\n", 757 vdev_id, (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva), 758 (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva), 759 (u_longlong_t)svb->svb_allocated_txg); 760 } 761 (void) printf("\n"); 762 zfs_btree_destroy(&sv.sv_leftover); 763 } 764 765 /* 766 * These libumem hooks provide a reasonable set of defaults for the allocator's 767 * debugging facilities. 768 */ 769 const char * 770 _umem_debug_init(void) 771 { 772 return ("default,verbose"); /* $UMEM_DEBUG setting */ 773 } 774 775 const char * 776 _umem_logging_init(void) 777 { 778 return ("fail,contents"); /* $UMEM_LOGGING setting */ 779 } 780 781 static void 782 usage(void) 783 { 784 (void) fprintf(stderr, 785 "Usage:\t%s [-AbcdDFGhikLMPsvXy] [-e [-V] [-p <path> ...]] " 786 "[-I <inflight I/Os>]\n" 787 "\t\t[-o <var>=<value>]... 
static void
usage(void)
{
	(void) fprintf(stderr,
	    "Usage:\t%s [-AbcdDFGhikLMPsvXy] [-e [-V] [-p <path> ...]] "
	    "[-I <inflight I/Os>]\n"
	    "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
	    "\t\t[-K <key>]\n"
	    "\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]]\n"
	    "\t%s [-AdiPv] [-e [-V] [-p <path> ...]] [-U <cache>] [-K <key>]\n"
	    "\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]]\n"
	    "\t%s [-v] <bookmark>\n"
	    "\t%s -C [-A] [-U <cache>]\n"
	    "\t%s -l [-Aqu] <device>\n"
	    "\t%s -m [-AFLPX] [-e [-V] [-p <path> ...]] [-t <txg>] "
	    "[-U <cache>]\n\t\t<poolname> [<vdev> [<metaslab> ...]]\n"
	    "\t%s -O [-K <key>] <dataset> <path>\n"
	    "\t%s -r [-K <key>] <dataset> <path> <destination>\n"
	    "\t%s -R [-A] [-e [-V] [-p <path> ...]] [-U <cache>]\n"
	    "\t\t<poolname> <vdev>:<offset>:<size>[:<flags>]\n"
	    "\t%s -E [-A] word0:word1:...:word15\n"
	    "\t%s -S [-AP] [-e [-V] [-p <path> ...]] [-U <cache>] "
	    "<poolname>\n\n",
	    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname,
	    cmdname, cmdname, cmdname, cmdname);

	(void) fprintf(stderr, "    Dataset name must include at least one "
	    "separator character '/' or '@'\n");
	(void) fprintf(stderr, "    If dataset name is specified, only that "
	    "dataset is dumped\n");
	(void) fprintf(stderr, "    If object numbers or object number "
	    "ranges are specified, only those\n"
	    "    objects or ranges are dumped.\n\n");
	(void) fprintf(stderr,
	    "    Object ranges take the form <start>:<end>[:<flags>]\n"
	    "        start    Starting object number\n"
	    "        end      Ending object number, or -1 for no upper bound\n"
	    "        flags    Optional flags to select object types:\n"
	    "            A     All objects (this is the default)\n"
	    "            d     ZFS directories\n"
	    "            f     ZFS files\n"
	    "            m     SPA space maps\n"
	    "            z     ZAPs\n"
	    "            -     Negate effect of next flag\n\n");
	(void) fprintf(stderr, "    Options to control amount of output:\n");
	(void) fprintf(stderr, "        -b --block-stats             "
	    "block statistics\n");
	(void) fprintf(stderr, "        -c --checksum                "
	    "checksum all metadata (twice for all data) blocks\n");
	(void) fprintf(stderr, "        -C --config                  "
	    "config (or cachefile if alone)\n");
	(void) fprintf(stderr, "        -d --datasets                "
	    "dataset(s)\n");
	(void) fprintf(stderr, "        -D --dedup-stats             "
	    "dedup statistics\n");
	(void) fprintf(stderr, "        -E --embedded-block-pointer=INTEGER\n"
	    "                                     decode and display block "
	    "from an embedded block pointer\n");
	(void) fprintf(stderr, "        -h --history                 "
	    "pool history\n");
	(void) fprintf(stderr, "        -i --intent-logs             "
	    "intent logs\n");
	(void) fprintf(stderr, "        -l --label                   "
	    "read label contents\n");
	(void) fprintf(stderr, "        -k --checkpointed-state      "
	    "examine the checkpointed state of the pool\n");
	(void) fprintf(stderr, "        -L --disable-leak-tracking   "
	    "disable leak tracking (do not load spacemaps)\n");
	(void) fprintf(stderr, "        -m --metaslabs               "
	    "metaslabs\n");
	(void) fprintf(stderr, "        -M --metaslab-groups         "
	    "metaslab groups\n");
	(void) fprintf(stderr, "        -O --object-lookups          "
	    "perform object lookups by path\n");
	(void) fprintf(stderr, "        -r --copy-object             "
	    "copy an object by path to file\n");
	(void) fprintf(stderr, "        -R --read-block              "
	    "read and display block from a device\n");
	(void) fprintf(stderr, "        -s --io-stats                "
	    "report stats on zdb's I/O\n");
	(void) fprintf(stderr, "        -S --simulate-dedup          "
	    "simulate dedup to measure effect\n");
	(void) fprintf(stderr, "        -v --verbose                 "
	    "verbose (applies to all others)\n");
	(void) fprintf(stderr, "        -y --livelist                "
	    "perform livelist and metaslab validation on any livelists being "
	    "deleted\n\n");
	(void) fprintf(stderr, "    Below options are intended for use "
	    "with other options:\n");
	(void) fprintf(stderr, "        -A --ignore-assertions       "
	    "ignore assertions (-A), enable panic recovery (-AA) or both "
	    "(-AAA)\n");
	(void) fprintf(stderr, "        -e --exported                "
	    "pool is exported/destroyed/has altroot/not in a cachefile\n");
	(void) fprintf(stderr, "        -F --automatic-rewind        "
	    "attempt automatic rewind within safe range of transaction "
	    "groups\n");
	(void) fprintf(stderr, "        -G --dump-debug-msg          "
	    "dump zfs_dbgmsg buffer before exiting\n");
	(void) fprintf(stderr, "        -I --inflight=INTEGER        "
	    "specify the maximum number of checksumming I/Os "
	    "[default is 200]\n");
	(void) fprintf(stderr, "        -K --key=KEY                 "
	    "decryption key for encrypted dataset\n");
	(void) fprintf(stderr, "        -o --option=\"OPTION=INTEGER\" "
	    "set global variable to an unsigned 32-bit integer\n");
	(void) fprintf(stderr, "        -p --path=PATH               "
	    "use one or more with -e to specify path to vdev dir\n");
	(void) fprintf(stderr, "        -P --parseable               "
	    "print numbers in parseable form\n");
	(void) fprintf(stderr, "        -q --skip-label              "
	    "don't print label contents\n");
	(void) fprintf(stderr, "        -t --txg=INTEGER             "
	    "highest txg to use when searching for uberblocks\n");
	(void) fprintf(stderr, "        -u --uberblock               "
	    "uberblock\n");
	(void) fprintf(stderr, "        -U --cachefile=PATH          "
	    "use alternate cachefile\n");
	(void) fprintf(stderr, "        -V --verbatim                "
	    "do verbatim import\n");
	(void) fprintf(stderr, "        -x --dump-blocks=PATH        "
	    "dump all read blocks into specified directory\n");
	(void) fprintf(stderr, "        -X --extreme-rewind          "
	    "attempt extreme rewind (does not work with dataset)\n");
	(void) fprintf(stderr, "        -Y --all-reconstruction      "
	    "attempt all reconstruction combinations for split blocks\n");
	(void) fprintf(stderr, "        -Z --zstd-headers            "
	    "show ZSTD headers\n");
	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
	    "to make only that option verbose\n");
	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
	exit(1);
}

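/*
 * Illustrative invocations of the forms documented above ("tank" and
 * "tank/fs" are hypothetical names):
 *
 *	zdb tank			dump the pool "tank"
 *	zdb -dd tank/fs			dump the objects of dataset "tank/fs"
 *	zdb -m tank 0			dump the metaslabs of vdev 0
 *	zdb -R tank 0:1000:200		read and display one block
 */
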
static void
dump_debug_buffer(void)
{
	if (dump_opt['G']) {
		(void) printf("\n");
		(void) fflush(stdout);
		zfs_dbgmsg_print("zdb");
	}
}

/*
 * Called for usage errors that are discovered after a call to spa_open(),
 * dmu_bonus_hold(), or pool_match().  abort() is called for other errors.
 */

static void
fatal(const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	(void) fprintf(stderr, "%s: ", cmdname);
	(void) vfprintf(stderr, fmt, ap);
	va_end(ap);
	(void) fprintf(stderr, "\n");

	dump_debug_buffer();

	exit(1);
}

static void
dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) size;
	nvlist_t *nv;
	size_t nvsize = *(uint64_t *)data;
	char *packed = umem_alloc(nvsize, UMEM_NOFAIL);

	VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH));

	VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);

	umem_free(packed, nvsize);

	dump_nvlist(nv, 8);

	nvlist_free(nv);
}

static void
dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) os, (void) object, (void) size;
	spa_history_phys_t *shp = data;

	if (shp == NULL)
		return;

	(void) printf("\t\tpool_create_len = %llu\n",
	    (u_longlong_t)shp->sh_pool_create_len);
	(void) printf("\t\tphys_max_off = %llu\n",
	    (u_longlong_t)shp->sh_phys_max_off);
	(void) printf("\t\tbof = %llu\n",
	    (u_longlong_t)shp->sh_bof);
	(void) printf("\t\teof = %llu\n",
	    (u_longlong_t)shp->sh_eof);
	(void) printf("\t\trecords_lost = %llu\n",
	    (u_longlong_t)shp->sh_records_lost);
}

static void
zdb_nicenum(uint64_t num, char *buf, size_t buflen)
{
	if (dump_opt['P'])
		(void) snprintf(buf, buflen, "%llu", (u_longlong_t)num);
	else
		nicenum(num, buf, buflen);
}

static const char histo_stars[] = "****************************************";
static const uint64_t histo_width = sizeof (histo_stars) - 1;

static void
dump_histogram(const uint64_t *histo, int size, int offset)
{
	int i;
	int minidx = size - 1;
	int maxidx = 0;
	uint64_t max = 0;

	for (i = 0; i < size; i++) {
		if (histo[i] == 0)
			continue;
		if (histo[i] > max)
			max = histo[i];
		if (i > maxidx)
			maxidx = i;
		if (i < minidx)
			minidx = i;
	}

	if (max < histo_width)
		max = histo_width;

	for (i = minidx; i <= maxidx; i++) {
		(void) printf("\t\t\t%3u: %6llu %s\n",
		    i + offset, (u_longlong_t)histo[i],
		    &histo_stars[(max - histo[i]) * histo_width / max]);
	}
}

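/*
 * Print the zap_get_stats() summary for an object: a one-line form for
 * microzaps, or the pointer-table, leaf, and entry counts plus their
 * histograms for fat ZAPs.
 */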
static void
dump_zap_stats(objset_t *os, uint64_t object)
{
	int error;
	zap_stats_t zs;

	error = zap_get_stats(os, object, &zs);
	if (error)
		return;

	if (zs.zs_ptrtbl_len == 0) {
		ASSERT(zs.zs_num_blocks == 1);
		(void) printf("\tmicrozap: %llu bytes, %llu entries\n",
		    (u_longlong_t)zs.zs_blocksize,
		    (u_longlong_t)zs.zs_num_entries);
		return;
	}

	(void) printf("\tFat ZAP stats:\n");

	(void) printf("\t\tPointer table:\n");
	(void) printf("\t\t\t%llu elements\n",
	    (u_longlong_t)zs.zs_ptrtbl_len);
	(void) printf("\t\t\tzt_blk: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_zt_blk);
	(void) printf("\t\t\tzt_numblks: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_zt_numblks);
	(void) printf("\t\t\tzt_shift: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_zt_shift);
	(void) printf("\t\t\tzt_blks_copied: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_blks_copied);
	(void) printf("\t\t\tzt_nextblk: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_nextblk);

	(void) printf("\t\tZAP entries: %llu\n",
	    (u_longlong_t)zs.zs_num_entries);
	(void) printf("\t\tLeaf blocks: %llu\n",
	    (u_longlong_t)zs.zs_num_leafs);
	(void) printf("\t\tTotal blocks: %llu\n",
	    (u_longlong_t)zs.zs_num_blocks);
	(void) printf("\t\tzap_block_type: 0x%llx\n",
	    (u_longlong_t)zs.zs_block_type);
	(void) printf("\t\tzap_magic: 0x%llx\n",
	    (u_longlong_t)zs.zs_magic);
	(void) printf("\t\tzap_salt: 0x%llx\n",
	    (u_longlong_t)zs.zs_salt);

	(void) printf("\t\tLeafs with 2^n pointers:\n");
	dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0);

	(void) printf("\t\tBlocks with n*5 entries:\n");
	dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0);

	(void) printf("\t\tBlocks n/10 full:\n");
	dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0);

	(void) printf("\t\tEntries with n chunks:\n");
	dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0);

	(void) printf("\t\tBuckets with n entries:\n");
	dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0);
}

static void
dump_none(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) os, (void) object, (void) data, (void) size;
}

static void
dump_unknown(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) os, (void) object, (void) data, (void) size;
	(void) printf("\tUNKNOWN OBJECT TYPE\n");
}

static void
dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) os, (void) object, (void) data, (void) size;
}

static void
dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)
{
	uint64_t *arr;
	uint64_t oursize;
	if (dump_opt['d'] < 6)
		return;

	if (data == NULL) {
		dmu_object_info_t doi;

		VERIFY0(dmu_object_info(os, object, &doi));
		size = doi.doi_max_offset;
		/*
		 * We cap the size at 1 mebibyte here to prevent
		 * allocation failures and nigh-infinite printing if the
		 * object is extremely large.
		 */
		oursize = MIN(size, 1 << 20);
		arr = kmem_alloc(oursize, KM_SLEEP);

		int err = dmu_read(os, object, 0, oursize, arr, 0);
		if (err != 0) {
			(void) printf("got error %u from dmu_read\n", err);
			kmem_free(arr, oursize);
			return;
		}
	} else {
		/*
		 * Even though the allocation is already done in this code path,
		 * we still cap the size to prevent excessive printing.
		 */
		oursize = MIN(size, 1 << 20);
		arr = data;
	}

	if (size == 0) {
		if (data == NULL)
			kmem_free(arr, oursize);
		(void) printf("\t\t[]\n");
		return;
	}

	(void) printf("\t\t[%0llx", (u_longlong_t)arr[0]);
	for (size_t i = 1; i * sizeof (uint64_t) < oursize; i++) {
		if (i % 4 != 0)
			(void) printf(", %0llx", (u_longlong_t)arr[i]);
		else
			(void) printf(",\n\t\t%0llx", (u_longlong_t)arr[i]);
	}
	if (oursize != size)
		(void) printf(", ... ");
	(void) printf("]\n");

	if (data == NULL)
		kmem_free(arr, oursize);
}

static void
dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	zap_cursor_t zc;
	zap_attribute_t attr;
	void *prop;
	unsigned i;

	dump_zap_stats(os, object);
	(void) printf("\n");

	for (zap_cursor_init(&zc, os, object);
	    zap_cursor_retrieve(&zc, &attr) == 0;
	    zap_cursor_advance(&zc)) {
		(void) printf("\t\t%s = ", attr.za_name);
		if (attr.za_num_integers == 0) {
			(void) printf("\n");
			continue;
		}
		prop = umem_zalloc(attr.za_num_integers *
		    attr.za_integer_length, UMEM_NOFAIL);
		(void) zap_lookup(os, object, attr.za_name,
		    attr.za_integer_length, attr.za_num_integers, prop);
		if (attr.za_integer_length == 1) {
			if (strcmp(attr.za_name,
			    DSL_CRYPTO_KEY_MASTER_KEY) == 0 ||
			    strcmp(attr.za_name,
			    DSL_CRYPTO_KEY_HMAC_KEY) == 0 ||
			    strcmp(attr.za_name, DSL_CRYPTO_KEY_IV) == 0 ||
			    strcmp(attr.za_name, DSL_CRYPTO_KEY_MAC) == 0 ||
			    strcmp(attr.za_name, DMU_POOL_CHECKSUM_SALT) == 0) {
				uint8_t *u8 = prop;

				for (i = 0; i < attr.za_num_integers; i++) {
					(void) printf("%02x", u8[i]);
				}
			} else {
				(void) printf("%s", (char *)prop);
			}
		} else {
			for (i = 0; i < attr.za_num_integers; i++) {
				switch (attr.za_integer_length) {
				case 2:
					(void) printf("%u ",
					    ((uint16_t *)prop)[i]);
					break;
				case 4:
					(void) printf("%u ",
					    ((uint32_t *)prop)[i]);
					break;
				case 8:
					(void) printf("%lld ",
					    (longlong_t)((int64_t *)prop)[i]);
					break;
				}
			}
		}
		(void) printf("\n");
		umem_free(prop, attr.za_num_integers * attr.za_integer_length);
	}
	zap_cursor_fini(&zc);
}

static void
dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
{
	bpobj_phys_t *bpop = data;
	uint64_t i;
	char bytes[32], comp[32], uncomp[32];

	/* make sure the output won't get truncated */
	_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");
	_Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated");
	_Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated");

	if (bpop == NULL)
		return;

	zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes));
	zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp));
	zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp));

	(void) printf("\t\tnum_blkptrs = %llu\n",
	    (u_longlong_t)bpop->bpo_num_blkptrs);
	(void) printf("\t\tbytes = %s\n", bytes);
	if (size >= BPOBJ_SIZE_V1) {
		(void) printf("\t\tcomp = %s\n", comp);
		(void) printf("\t\tuncomp = %s\n", uncomp);
	}
	if (size >= BPOBJ_SIZE_V2) {
		(void) printf("\t\tsubobjs = %llu\n",
		    (u_longlong_t)bpop->bpo_subobjs);
		(void) printf("\t\tnum_subobjs = %llu\n",
		    (u_longlong_t)bpop->bpo_num_subobjs);
	}
	if (size >= sizeof (*bpop)) {
		(void) printf("\t\tnum_freed = %llu\n",
		    (u_longlong_t)bpop->bpo_num_freed);
	}

	if (dump_opt['d'] < 5)
		return;

	for (i = 0; i < bpop->bpo_num_blkptrs; i++) {
		char blkbuf[BP_SPRINTF_LEN];
		blkptr_t bp;

		int err = dmu_read(os, object,
		    i * sizeof (bp), sizeof (bp), &bp, 0);
		if (err != 0) {
			(void) printf("got error %u from dmu_read\n", err);
			break;
		}
		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp,
		    BP_GET_FREE(&bp));
		(void) printf("\t%s\n", blkbuf);
	}
}

static void
dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	dmu_object_info_t doi;
	int64_t i;

	VERIFY0(dmu_object_info(os, object, &doi));
	uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP);

	int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0);
	if (err != 0) {
		(void) printf("got error %u from dmu_read\n", err);
		kmem_free(subobjs, doi.doi_max_offset);
		return;
	}

	int64_t last_nonzero = -1;
	for (i = 0; i < doi.doi_max_offset / 8; i++) {
		if (subobjs[i] != 0)
			last_nonzero = i;
	}

	for (i = 0; i <= last_nonzero; i++) {
		(void) printf("\t%llu\n", (u_longlong_t)subobjs[i]);
	}
	kmem_free(subobjs, doi.doi_max_offset);
}

static void
dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	dump_zap_stats(os, object);
	/* contents are printed elsewhere, properly decoded */
}

static void
dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	zap_cursor_t zc;
	zap_attribute_t attr;

	dump_zap_stats(os, object);
	(void) printf("\n");

	for (zap_cursor_init(&zc, os, object);
	    zap_cursor_retrieve(&zc, &attr) == 0;
	    zap_cursor_advance(&zc)) {
		(void) printf("\t\t%s = ", attr.za_name);
		if (attr.za_num_integers == 0) {
			(void) printf("\n");
			continue;
		}
		(void) printf(" %llx : [%d:%d:%d]\n",
		    (u_longlong_t)attr.za_first_integer,
		    (int)ATTR_LENGTH(attr.za_first_integer),
		    (int)ATTR_BSWAP(attr.za_first_integer),
		    (int)ATTR_NUM(attr.za_first_integer));
	}
	zap_cursor_fini(&zc);
}

static void
dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	zap_cursor_t zc;
	zap_attribute_t attr;
	uint16_t *layout_attrs;
	unsigned i;

	dump_zap_stats(os, object);
	(void) printf("\n");

	for (zap_cursor_init(&zc, os, object);
	    zap_cursor_retrieve(&zc, &attr) == 0;
	    zap_cursor_advance(&zc)) {
		(void) printf("\t\t%s = [", attr.za_name);
		if (attr.za_num_integers == 0) {
			(void) printf("\n");
			continue;
		}

		VERIFY(attr.za_integer_length == 2);
		layout_attrs = umem_zalloc(attr.za_num_integers *
		    attr.za_integer_length, UMEM_NOFAIL);

		VERIFY(zap_lookup(os, object, attr.za_name,
		    attr.za_integer_length,
		    attr.za_num_integers, layout_attrs) == 0);

		for (i = 0; i != attr.za_num_integers; i++)
			(void) printf(" %d ", (int)layout_attrs[i]);
		(void) printf("]\n");
		umem_free(layout_attrs,
		    attr.za_num_integers * attr.za_integer_length);
	}
	zap_cursor_fini(&zc);
}

static void
dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	zap_cursor_t zc;
	zap_attribute_t attr;
	const char *typenames[] = {
		/* 0 */ "not specified",
		/* 1 */ "FIFO",
		/* 2 */ "Character Device",
		/* 3 */ "3 (invalid)",
		/* 4 */ "Directory",
		/* 5 */ "5 (invalid)",
		/* 6 */ "Block Device",
		/* 7 */ "7 (invalid)",
		/* 8 */ "Regular File",
		/* 9 */ "9 (invalid)",
		/* 10 */ "Symbolic Link",
		/* 11 */ "11 (invalid)",
		/* 12 */ "Socket",
		/* 13 */ "Door",
		/* 14 */ "Event Port",
		/* 15 */ "15 (invalid)",
	};

	dump_zap_stats(os, object);
	(void) printf("\n");

	for (zap_cursor_init(&zc, os, object);
	    zap_cursor_retrieve(&zc, &attr) == 0;
	    zap_cursor_advance(&zc)) {
		(void) printf("\t\t%s = %lld (type: %s)\n",
		    attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer),
		    typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]);
	}
	zap_cursor_fini(&zc);
}

static int
get_dtl_refcount(vdev_t *vd)
{
	int refcount = 0;

	if (vd->vdev_ops->vdev_op_leaf) {
		space_map_t *sm = vd->vdev_dtl_sm;

		if (sm != NULL &&
		    sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
			return (1);
		return (0);
	}

	for (unsigned c = 0; c < vd->vdev_children; c++)
		refcount += get_dtl_refcount(vd->vdev_child[c]);
	return (refcount);
}

static int
get_metaslab_refcount(vdev_t *vd)
{
	int refcount = 0;

	if (vd->vdev_top == vd) {
		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
			space_map_t *sm = vd->vdev_ms[m]->ms_sm;

			if (sm != NULL &&
			    sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
				refcount++;
		}
	}
	for (unsigned c = 0; c < vd->vdev_children; c++)
		refcount += get_metaslab_refcount(vd->vdev_child[c]);

	return (refcount);
}

static int
get_obsolete_refcount(vdev_t *vd)
{
	uint64_t obsolete_sm_object;
	int refcount = 0;

	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
	if (vd->vdev_top == vd && obsolete_sm_object != 0) {
		dmu_object_info_t doi;
		VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset,
		    obsolete_sm_object, &doi));
		if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
			refcount++;
		}
	} else {
		ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
		ASSERT3U(obsolete_sm_object, ==, 0);
	}
	for (unsigned c = 0; c < vd->vdev_children; c++) {
		refcount += get_obsolete_refcount(vd->vdev_child[c]);
	}

	return (refcount);
}

static int
get_prev_obsolete_spacemap_refcount(spa_t *spa)
{
	uint64_t prev_obj =
	    spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object;
	if (prev_obj != 0) {
		dmu_object_info_t doi;
		VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_obj, &doi));
		if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
			return (1);
		}
	}
	return (0);
}

static int
get_checkpoint_refcount(vdev_t *vd)
{
	int refcount = 0;

	if (vd->vdev_top == vd && vd->vdev_top_zap != 0 &&
	    zap_contains(spa_meta_objset(vd->vdev_spa),
	    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0)
		refcount++;

	for (uint64_t c = 0; c < vd->vdev_children; c++)
		refcount += get_checkpoint_refcount(vd->vdev_child[c]);

	return (refcount);
}

static int
get_log_spacemap_refcount(spa_t *spa)
{
	return (avl_numnodes(&spa->spa_sm_logs_by_txg));
}

static int
verify_spacemap_refcounts(spa_t *spa)
{
	uint64_t expected_refcount = 0;
	uint64_t actual_refcount;

	(void) feature_get_refcount(spa,
	    &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM],
	    &expected_refcount);
	actual_refcount = get_dtl_refcount(spa->spa_root_vdev);
	actual_refcount += get_metaslab_refcount(spa->spa_root_vdev);
	actual_refcount += get_obsolete_refcount(spa->spa_root_vdev);
	actual_refcount += get_prev_obsolete_spacemap_refcount(spa);
	actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev);
	actual_refcount += get_log_spacemap_refcount(spa);

	if (expected_refcount != actual_refcount) {
		(void) printf("space map refcount mismatch: expected %lld != "
		    "actual %lld\n",
		    (longlong_t)expected_refcount,
		    (longlong_t)actual_refcount);
		return (2);
	}
	return (0);
}

static void
dump_spacemap(objset_t *os, space_map_t *sm)
{
	const char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
	    "INVALID", "INVALID", "INVALID", "INVALID" };

	if (sm == NULL)
		return;

	(void) printf("space map object %llu:\n",
	    (longlong_t)sm->sm_object);
	(void) printf("  smp_length = 0x%llx\n",
	    (longlong_t)sm->sm_phys->smp_length);
	(void) printf("  smp_alloc = 0x%llx\n",
	    (longlong_t)sm->sm_phys->smp_alloc);

	if (dump_opt['d'] < 6 && dump_opt['m'] < 4)
		return;

	/*
	 * Print out the freelist entries in both encoded and decoded form.
	 */
	uint8_t mapshift = sm->sm_shift;
	int64_t alloc = 0;
	uint64_t word, entry_id = 0;
	for (uint64_t offset = 0; offset < space_map_length(sm);
	    offset += sizeof (word)) {

		VERIFY0(dmu_read(os, space_map_object(sm), offset,
		    sizeof (word), &word, DMU_READ_PREFETCH));

		if (sm_entry_is_debug(word)) {
			uint64_t de_txg = SM_DEBUG_TXG_DECODE(word);
			uint64_t de_sync_pass = SM_DEBUG_SYNCPASS_DECODE(word);
			if (de_txg == 0) {
				(void) printf(
				    "\t    [%6llu] PADDING\n",
				    (u_longlong_t)entry_id);
			} else {
				(void) printf(
				    "\t    [%6llu] %s: txg %llu pass %llu\n",
				    (u_longlong_t)entry_id,
				    ddata[SM_DEBUG_ACTION_DECODE(word)],
				    (u_longlong_t)de_txg,
				    (u_longlong_t)de_sync_pass);
			}
			entry_id++;
			continue;
		}

		uint8_t words;
		char entry_type;
		uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID;

		if (sm_entry_is_single_word(word)) {
			entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ?
			    'A' : 'F';
			entry_off = (SM_OFFSET_DECODE(word) << mapshift) +
			    sm->sm_start;
			entry_run = SM_RUN_DECODE(word) << mapshift;
			words = 1;
		} else {
			/* it is a two-word entry so we read another word */
			ASSERT(sm_entry_is_double_word(word));

			uint64_t extra_word;
			offset += sizeof (extra_word);
			VERIFY0(dmu_read(os, space_map_object(sm), offset,
			    sizeof (extra_word), &extra_word,
			    DMU_READ_PREFETCH));

			ASSERT3U(offset, <=, space_map_length(sm));

			entry_run = SM2_RUN_DECODE(word) << mapshift;
			entry_vdev = SM2_VDEV_DECODE(word);
			entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ?
			    'A' : 'F';
			entry_off = (SM2_OFFSET_DECODE(extra_word) <<
			    mapshift) + sm->sm_start;
			words = 2;
		}

		(void) printf("\t    [%6llu]    %c  range:"
		    " %010llx-%010llx  size: %06llx vdev: %06llu words: %u\n",
		    (u_longlong_t)entry_id,
		    entry_type, (u_longlong_t)entry_off,
		    (u_longlong_t)(entry_off + entry_run),
		    (u_longlong_t)entry_run,
		    (u_longlong_t)entry_vdev, words);

		if (entry_type == 'A')
			alloc += entry_run;
		else
			alloc -= entry_run;
		entry_id++;
	}
	if (alloc != space_map_allocated(sm)) {
		(void) printf("space_map_object alloc (%lld) INCONSISTENT "
		    "with space map summary (%lld)\n",
		    (longlong_t)space_map_allocated(sm), (longlong_t)alloc);
	}
}

static void
dump_metaslab_stats(metaslab_t *msp)
{
	char maxbuf[32];
	range_tree_t *rt = msp->ms_allocatable;
	zfs_btree_t *t = &msp->ms_allocatable_by_size;
	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (maxbuf) >= NN_NUMBUF_SZ, "maxbuf truncated");

	zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf));

	(void) printf("\t %25s %10lu   %7s  %6s   %4s %4d%%\n",
	    "segments", zfs_btree_numnodes(t), "maxsize", maxbuf,
	    "freepct", free_pct);
	(void) printf("\tIn-memory histogram:\n");
	dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
}

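/*
 * Print a single metaslab. With -mmm (and without -L) the metaslab is
 * loaded so its in-memory histogram can be verified and printed; with -mm
 * the on-disk spacemap histogram is shown when SPACEMAP_HISTOGRAM is
 * active.
 */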
static void
dump_metaslab(metaslab_t *msp)
{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	space_map_t *sm = msp->ms_sm;
	char freebuf[32];

	zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf,
	    sizeof (freebuf));

	(void) printf(
	    "\tmetaslab %6llu   offset %12llx   spacemap %6llu   free    %5s\n",
	    (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
	    (u_longlong_t)space_map_object(sm), freebuf);

	if (dump_opt['m'] > 2 && !dump_opt['L']) {
		mutex_enter(&msp->ms_lock);
		VERIFY0(metaslab_load(msp));
		range_tree_stat_verify(msp->ms_allocatable);
		dump_metaslab_stats(msp);
		metaslab_unload(msp);
		mutex_exit(&msp->ms_lock);
	}

	if (dump_opt['m'] > 1 && sm != NULL &&
	    spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
		/*
		 * The space map histogram represents free space in chunks
		 * of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
		 */
		(void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
		    (u_longlong_t)msp->ms_fragmentation);
		dump_histogram(sm->sm_phys->smp_histogram,
		    SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
	}

	if (vd->vdev_ops == &vdev_draid_ops)
		ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift);
	else
		ASSERT3U(msp->ms_size, ==, 1ULL << vd->vdev_ms_shift);

	dump_spacemap(spa->spa_meta_objset, msp->ms_sm);

	if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
		(void) printf("\tFlush data:\n\tunflushed txg=%llu\n\n",
		    (u_longlong_t)metaslab_unflushed_txg(msp));
	}
}

static void
print_vdev_metaslab_header(vdev_t *vd)
{
	vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
	const char *bias_str = "";
	if (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) {
		bias_str = VDEV_ALLOC_BIAS_LOG;
	} else if (alloc_bias == VDEV_BIAS_SPECIAL) {
		bias_str = VDEV_ALLOC_BIAS_SPECIAL;
	} else if (alloc_bias == VDEV_BIAS_DEDUP) {
		bias_str = VDEV_ALLOC_BIAS_DEDUP;
	}

	uint64_t ms_flush_data_obj = 0;
	if (vd->vdev_top_zap != 0) {
		int error = zap_lookup(spa_meta_objset(vd->vdev_spa),
		    vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
		    sizeof (uint64_t), 1, &ms_flush_data_obj);
		if (error != ENOENT) {
			ASSERT0(error);
		}
	}

	(void) printf("\tvdev %10llu   %s",
	    (u_longlong_t)vd->vdev_id, bias_str);

	if (ms_flush_data_obj != 0) {
		(void) printf("   ms_unflushed_phys object %llu",
		    (u_longlong_t)ms_flush_data_obj);
	}

	(void) printf("\n\t%-10s%5llu   %-19s   %-15s   %-12s\n",
	    "metaslabs", (u_longlong_t)vd->vdev_ms_count,
	    "offset", "spacemap", "free");
	(void) printf("\t%15s   %19s   %15s   %12s\n",
	    "---------------", "-------------------",
	    "---------------", "------------");
}

static void
dump_metaslab_groups(spa_t *spa, boolean_t show_special)
{
	vdev_t *rvd = spa->spa_root_vdev;
	metaslab_class_t *mc = spa_normal_class(spa);
	metaslab_class_t *smc = spa_special_class(spa);
	uint64_t fragmentation;

	metaslab_class_histogram_verify(mc);

	for (unsigned c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (mg == NULL || (mg->mg_class != mc &&
		    (!show_special || mg->mg_class != smc)))
			continue;

		metaslab_group_histogram_verify(mg);
		mg->mg_fragmentation = metaslab_group_fragmentation(mg);

		(void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t"
		    "fragmentation",
		    (u_longlong_t)tvd->vdev_id,
		    (u_longlong_t)tvd->vdev_ms_count);
		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
			(void) printf("%3s\n", "-");
		} else {
			(void) printf("%3llu%%\n",
			    (u_longlong_t)mg->mg_fragmentation);
		}
		dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
	}

	(void) printf("\tpool %s\tfragmentation", spa_name(spa));
	fragmentation = metaslab_class_fragmentation(mc);
	if (fragmentation == ZFS_FRAG_INVALID)
		(void) printf("\t%3s\n", "-");
	else
		(void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation);
	dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
}

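/*
 * For a removed (indirect) vdev, dump the indirect-birth and
 * indirect-mapping objects; with -dddddd or -mmmm the full mapping table
 * and per-entry obsolete counts are printed, followed by the obsolete
 * space map, if one exists.
 */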
	vdev_indirect_births_t *vib = vd->vdev_indirect_births;

	if (vim == NULL) {
		ASSERT3P(vib, ==, NULL);
		return;
	}

	ASSERT3U(vdev_indirect_mapping_object(vim), ==,
	    vic->vic_mapping_object);
	ASSERT3U(vdev_indirect_births_object(vib), ==,
	    vic->vic_births_object);

	(void) printf("indirect births obj %llu:\n",
	    (longlong_t)vic->vic_births_object);
	(void) printf("    vib_count = %llu\n",
	    (longlong_t)vdev_indirect_births_count(vib));
	for (uint64_t i = 0; i < vdev_indirect_births_count(vib); i++) {
		vdev_indirect_birth_entry_phys_t *cur_vibe =
		    &vib->vib_entries[i];
		(void) printf("\toffset %llx -> txg %llu\n",
		    (longlong_t)cur_vibe->vibe_offset,
		    (longlong_t)cur_vibe->vibe_phys_birth_txg);
	}
	(void) printf("\n");

	(void) printf("indirect mapping obj %llu:\n",
	    (longlong_t)vic->vic_mapping_object);
	(void) printf("    vim_max_offset = 0x%llx\n",
	    (longlong_t)vdev_indirect_mapping_max_offset(vim));
	(void) printf("    vim_bytes_mapped = 0x%llx\n",
	    (longlong_t)vdev_indirect_mapping_bytes_mapped(vim));
	(void) printf("    vim_count = %llu\n",
	    (longlong_t)vdev_indirect_mapping_num_entries(vim));

	if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3)
		return;

	uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim);

	for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
		vdev_indirect_mapping_entry_phys_t *vimep =
		    &vim->vim_entries[i];
		(void) printf("\t<%llx:%llx:%llx> -> "
		    "<%llx:%llx:%llx> (%x obsolete)\n",
		    (longlong_t)vd->vdev_id,
		    (longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
		    (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
		    (longlong_t)DVA_GET_VDEV(&vimep->vimep_dst),
		    (longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst),
		    (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
		    counts[i]);
	}
	(void) printf("\n");

	uint64_t obsolete_sm_object;
	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
	if (obsolete_sm_object != 0) {
		objset_t *mos = vd->vdev_spa->spa_meta_objset;
		(void) printf("obsolete space map object %llu:\n",
		    (u_longlong_t)obsolete_sm_object);
		ASSERT(vd->vdev_obsolete_sm != NULL);
		ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==,
		    obsolete_sm_object);
		dump_spacemap(mos, vd->vdev_obsolete_sm);
		(void) printf("\n");
	}
}

static void
dump_metaslabs(spa_t *spa)
{
	vdev_t *vd, *rvd = spa->spa_root_vdev;
	uint64_t m, c = 0, children = rvd->vdev_children;

	(void) printf("\nMetaslabs:\n");

	if (!dump_opt['d'] && zopt_metaslab_args > 0) {
		c = zopt_metaslab[0];

		if (c >= children)
			(void) fatal("bad vdev id: %llu", (u_longlong_t)c);

		if (zopt_metaslab_args > 1) {
			vd = rvd->vdev_child[c];
			print_vdev_metaslab_header(vd);

			for (m = 1; m < zopt_metaslab_args; m++) {
				if (zopt_metaslab[m] < vd->vdev_ms_count)
					dump_metaslab(
					    vd->vdev_ms[zopt_metaslab[m]]);
				else
					(void) fprintf(stderr, "bad metaslab "
					    "number %llu\n",
					    (u_longlong_t)zopt_metaslab[m]);
			}
			(void) printf("\n");
			return;
		}
		children = c + 1;
	}
	for (; c < children; c++) {
		vd = rvd->vdev_child[c];
		print_vdev_metaslab_header(vd);

		print_vdev_indirect(vd);

		for (m = 0; m < vd->vdev_ms_count; m++)
			dump_metaslab(vd->vdev_ms[m]);
		(void) printf("\n");
	}
}
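
/*
 * Example (pool name and numbers are illustrative only):
 *
 *	zdb -mm tank 0 3 7
 *
 * dumps just metaslabs 3 and 7 of top-level vdev 0: dump_metaslabs()
 * treats zopt_metaslab[0] as the vdev id and any further arguments as
 * individual metaslab numbers.
 */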
static void
dump_log_spacemaps(spa_t *spa)
{
	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
		return;

	(void) printf("\nLog Space Maps in Pool:\n");
	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
	    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
		space_map_t *sm = NULL;
		VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
		    sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));

		(void) printf("Log Spacemap object %llu txg %llu\n",
		    (u_longlong_t)sls->sls_sm_obj, (u_longlong_t)sls->sls_txg);
		dump_spacemap(spa->spa_meta_objset, sm);
		space_map_close(sm);
	}
	(void) printf("\n");
}

static void
dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
{
	const ddt_phys_t *ddp = dde->dde_phys;
	const ddt_key_t *ddk = &dde->dde_key;
	const char *types[4] = { "ditto", "single", "double", "triple" };
	char blkbuf[BP_SPRINTF_LEN];
	blkptr_t blk;
	int p;

	for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
		if (ddp->ddp_phys_birth == 0)
			continue;
		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
		snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
		(void) printf("index %llx refcnt %llu %s %s\n",
		    (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
		    types[p], blkbuf);
	}
}

static void
dump_dedup_ratio(const ddt_stat_t *dds)
{
	double rL, rP, rD, D, dedup, compress, copies;

	if (dds->dds_blocks == 0)
		return;

	rL = (double)dds->dds_ref_lsize;
	rP = (double)dds->dds_ref_psize;
	rD = (double)dds->dds_ref_dsize;
	D = (double)dds->dds_dsize;

	dedup = rD / D;
	compress = rL / rP;
	copies = rD / rP;

	(void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
	    "dedup * compress / copies = %.2f\n\n",
	    dedup, compress, copies, dedup * compress / copies);
}

static void
dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
{
	char name[DDT_NAMELEN];
	ddt_entry_t dde;
	uint64_t walk = 0;
	dmu_object_info_t doi;
	uint64_t count, dspace, mspace;
	int error;

	error = ddt_object_info(ddt, type, class, &doi);

	if (error == ENOENT)
		return;
	ASSERT(error == 0);

	error = ddt_object_count(ddt, type, class, &count);
	ASSERT(error == 0);
	if (count == 0)
		return;

	dspace = doi.doi_physical_blocks_512 << 9;
	mspace = doi.doi_fill_count * doi.doi_data_block_size;

	ddt_object_name(ddt, type, class, name);

	(void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
	    name,
	    (u_longlong_t)count,
	    (u_longlong_t)(dspace / count),
	    (u_longlong_t)(mspace / count));

	if (dump_opt['D'] < 3)
		return;

	zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);

	if (dump_opt['D'] < 4)
		return;

	if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE)
		return;

	(void) printf("%s contents:\n\n", name);

	while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
		dump_dde(ddt, &dde, walk);

	ASSERT3U(error, ==, ENOENT);

	(void) printf("\n");
}

static void
dump_all_ddts(spa_t *spa)
{
	ddt_histogram_t ddh_total = {{{0}}};
	ddt_stat_t dds_total = {0};

	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		ddt_t *ddt = spa->spa_ddt[c];
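		/*
		 * Each checksum function keeps its own DDT; walk every
		 * (type, class) table it may contain.
		 */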
		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
			for (enum ddt_class class = 0; class < DDT_CLASSES;
			    class++) {
				dump_ddt(ddt, type, class);
			}
		}
	}

	ddt_get_dedup_stats(spa, &dds_total);

	if (dds_total.dds_blocks == 0) {
		(void) printf("All DDTs are empty\n");
		return;
	}

	(void) printf("\n");

	if (dump_opt['D'] > 1) {
		(void) printf("DDT histogram (aggregated over all DDTs):\n");
		ddt_get_dedup_histogram(spa, &ddh_total);
		zpool_dump_ddt(&dds_total, &ddh_total);
	}

	dump_dedup_ratio(&dds_total);
}

static void
dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
{
	char *prefix = arg;

	(void) printf("%s [%llu,%llu) length %llu\n",
	    prefix,
	    (u_longlong_t)start,
	    (u_longlong_t)(start + size),
	    (u_longlong_t)(size));
}

static void
dump_dtl(vdev_t *vd, int indent)
{
	spa_t *spa = vd->vdev_spa;
	boolean_t required;
	const char *name[DTL_TYPES] = { "missing", "partial", "scrub",
	    "outage" };
	char prefix[256];

	spa_vdev_state_enter(spa, SCL_NONE);
	required = vdev_dtl_required(vd);
	(void) spa_vdev_state_exit(spa, NULL, 0);

	if (indent == 0)
		(void) printf("\nDirty time logs:\n\n");

	(void) printf("\t%*s%s [%s]\n", indent, "",
	    vd->vdev_path ? vd->vdev_path :
	    vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa),
	    required ? "DTL-required" : "DTL-expendable");

	for (int t = 0; t < DTL_TYPES; t++) {
		range_tree_t *rt = vd->vdev_dtl[t];
		if (range_tree_space(rt) == 0)
			continue;
		(void) snprintf(prefix, sizeof (prefix), "\t%*s%s",
		    indent + 2, "", name[t]);
		range_tree_walk(rt, dump_dtl_seg, prefix);
		if (dump_opt['d'] > 5 && vd->vdev_children == 0)
			dump_spacemap(spa->spa_meta_objset,
			    vd->vdev_dtl_sm);
	}

	for (unsigned c = 0; c < vd->vdev_children; c++)
		dump_dtl(vd->vdev_child[c], indent + 4);
}

static void
dump_history(spa_t *spa)
{
	nvlist_t **events = NULL;
	char *buf;
	uint64_t resid, len, off = 0;
	uint_t num = 0;
	int error;
	char tbuf[30];

	if ((buf = malloc(SPA_OLD_MAXBLOCKSIZE)) == NULL) {
		(void) fprintf(stderr, "%s: unable to allocate I/O buffer\n",
		    __func__);
		return;
	}

	do {
		len = SPA_OLD_MAXBLOCKSIZE;

		if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
			(void) fprintf(stderr, "Unable to read history: "
			    "error %d\n", error);
			free(buf);
			return;
		}

		if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
			break;

		off -= resid;
	} while (len != 0);

	(void) printf("\nHistory:\n");
	for (unsigned i = 0; i < num; i++) {
		boolean_t printed = B_FALSE;

		if (nvlist_exists(events[i], ZPOOL_HIST_TIME)) {
			time_t tsec;
			struct tm t;

			tsec = fnvlist_lookup_uint64(events[i],
			    ZPOOL_HIST_TIME);
			(void) localtime_r(&tsec, &t);
			(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
		} else {
			tbuf[0] = '\0';
		}

		if (nvlist_exists(events[i], ZPOOL_HIST_CMD)) {
			(void) printf("%s %s\n", tbuf,
			    fnvlist_lookup_string(events[i], ZPOOL_HIST_CMD));
		} else if (nvlist_exists(events[i], ZPOOL_HIST_INT_EVENT)) {
			uint64_t ievent;

			ievent = fnvlist_lookup_uint64(events[i],
			    ZPOOL_HIST_INT_EVENT);
			if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS)
				goto next;

			(void) printf(" %s [internal %s txg:%ju] %s\n",
			    tbuf,
			    zfs_history_event_names[ievent],
			    fnvlist_lookup_uint64(events[i],
			    ZPOOL_HIST_TXG),
			    fnvlist_lookup_string(events[i],
			    ZPOOL_HIST_INT_STR));
		} else if (nvlist_exists(events[i], ZPOOL_HIST_INT_NAME)) {
			(void) printf("%s [txg:%ju] %s", tbuf,
			    fnvlist_lookup_uint64(events[i],
			    ZPOOL_HIST_TXG),
			    fnvlist_lookup_string(events[i],
			    ZPOOL_HIST_INT_NAME));

			if (nvlist_exists(events[i], ZPOOL_HIST_DSNAME)) {
				(void) printf(" %s (%llu)",
				    fnvlist_lookup_string(events[i],
				    ZPOOL_HIST_DSNAME),
				    (u_longlong_t)fnvlist_lookup_uint64(
				    events[i],
				    ZPOOL_HIST_DSID));
			}

			(void) printf(" %s\n", fnvlist_lookup_string(events[i],
			    ZPOOL_HIST_INT_STR));
		} else if (nvlist_exists(events[i], ZPOOL_HIST_IOCTL)) {
			(void) printf("%s ioctl %s\n", tbuf,
			    fnvlist_lookup_string(events[i],
			    ZPOOL_HIST_IOCTL));

			if (nvlist_exists(events[i], ZPOOL_HIST_INPUT_NVL)) {
				(void) printf("    input:\n");
				dump_nvlist(fnvlist_lookup_nvlist(events[i],
				    ZPOOL_HIST_INPUT_NVL), 8);
			}
			if (nvlist_exists(events[i], ZPOOL_HIST_OUTPUT_NVL)) {
				(void) printf("    output:\n");
				dump_nvlist(fnvlist_lookup_nvlist(events[i],
				    ZPOOL_HIST_OUTPUT_NVL), 8);
			}
			if (nvlist_exists(events[i], ZPOOL_HIST_ERRNO)) {
				(void) printf("    errno: %lld\n",
				    (longlong_t)fnvlist_lookup_int64(events[i],
				    ZPOOL_HIST_ERRNO));
			}
		} else {
			goto next;
		}

		printed = B_TRUE;
next:
		if (dump_opt['h'] > 1) {
			if (!printed)
				(void) printf("unrecognized record:\n");
			dump_nvlist(events[i], 2);
		}
	}
	free(buf);
}

static void
dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) os, (void) object, (void) data, (void) size;
}

static uint64_t
blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp,
    const zbookmark_phys_t *zb)
{
	if (dnp == NULL) {
		ASSERT(zb->zb_level < 0);
		if (zb->zb_object == 0)
			return (zb->zb_blkid);
		return (zb->zb_blkid * BP_GET_LSIZE(bp));
	}

	ASSERT(zb->zb_level >= 0);

	return ((zb->zb_blkid <<
	    (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
}

static void
snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen,
    const blkptr_t *bp)
{
	abd_t *pabd;
	void *buf;
	zio_t *zio;
	zfs_zstdhdr_t zstd_hdr;
	int error;

	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_ZSTD)
		return;

	if (BP_IS_HOLE(bp))
		return;

	if (BP_IS_EMBEDDED(bp)) {
		buf = malloc(SPA_MAXBLOCKSIZE);
		if (buf == NULL) {
			(void) fprintf(stderr, "out of memory\n");
			exit(1);
		}
		decode_embedded_bp_compressed(bp, buf);
		memcpy(&zstd_hdr, buf, sizeof (zstd_hdr));
		free(buf);
		zstd_hdr.c_len = BE_32(zstd_hdr.c_len);
		zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level);
		(void) snprintf(blkbuf + strlen(blkbuf),
		    buflen - strlen(blkbuf),
		    " ZSTD:size=%u:version=%u:level=%u:EMBEDDED",
		    zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr),
		    zfs_get_hdrlevel(&zstd_hdr));
		return;
	}

	pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE);
	zio = zio_root(spa, NULL, NULL, 0);

	/*
	 * Decrypt but don't decompress so we can read the compression
	 * header.
	 */
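	/*
	 * ZIO_FLAG_RAW_COMPRESS makes the read return the physical
	 * (still-compressed) bytes; the zfs_zstdhdr_t copied out below
	 * sits at the very start of that buffer.
	 */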
	zio_nowait(zio_read(zio, spa, bp, pabd, BP_GET_PSIZE(bp), NULL, NULL,
	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW_COMPRESS,
	    NULL));
	error = zio_wait(zio);
	if (error) {
		(void) fprintf(stderr, "read failed: %d\n", error);
		abd_free(pabd);	/* don't leak the read buffer on error */
		return;
	}
	buf = abd_borrow_buf_copy(pabd, BP_GET_LSIZE(bp));
	memcpy(&zstd_hdr, buf, sizeof (zstd_hdr));
	zstd_hdr.c_len = BE_32(zstd_hdr.c_len);
	zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level);

	(void) snprintf(blkbuf + strlen(blkbuf),
	    buflen - strlen(blkbuf),
	    " ZSTD:size=%u:version=%u:level=%u:NORMAL",
	    zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr),
	    zfs_get_hdrlevel(&zstd_hdr));

	abd_return_buf_copy(pabd, buf, BP_GET_LSIZE(bp));
	abd_free(pabd);	/* release the buffer allocated above */
}

static void
snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,
    boolean_t bp_freed)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
	int i;

	if (dump_opt['b'] >= 6) {
		snprintf_blkptr(blkbuf, buflen, bp);
		if (bp_freed) {
			(void) snprintf(blkbuf + strlen(blkbuf),
			    buflen - strlen(blkbuf), " %s", "FREE");
		}
		return;
	}

	if (BP_IS_EMBEDDED(bp)) {
		(void) sprintf(blkbuf,
		    "EMBEDDED et=%u %llxL/%llxP B=%llu",
		    (int)BPE_GET_ETYPE(bp),
		    (u_longlong_t)BPE_GET_LSIZE(bp),
		    (u_longlong_t)BPE_GET_PSIZE(bp),
		    (u_longlong_t)bp->blk_birth);
		return;
	}

	blkbuf[0] = '\0';

	for (i = 0; i < ndvas; i++)
		(void) snprintf(blkbuf + strlen(blkbuf),
		    buflen - strlen(blkbuf), "%llu:%llx:%llx ",
		    (u_longlong_t)DVA_GET_VDEV(&dva[i]),
		    (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
		    (u_longlong_t)DVA_GET_ASIZE(&dva[i]));

	if (BP_IS_HOLE(bp)) {
		(void) snprintf(blkbuf + strlen(blkbuf),
		    buflen - strlen(blkbuf),
		    "%llxL B=%llu",
		    (u_longlong_t)BP_GET_LSIZE(bp),
		    (u_longlong_t)bp->blk_birth);
	} else {
		(void) snprintf(blkbuf + strlen(blkbuf),
		    buflen - strlen(blkbuf),
		    "%llxL/%llxP F=%llu B=%llu/%llu",
		    (u_longlong_t)BP_GET_LSIZE(bp),
		    (u_longlong_t)BP_GET_PSIZE(bp),
		    (u_longlong_t)BP_GET_FILL(bp),
		    (u_longlong_t)bp->blk_birth,
		    (u_longlong_t)BP_PHYSICAL_BIRTH(bp));
		if (bp_freed)
			(void) snprintf(blkbuf + strlen(blkbuf),
			    buflen - strlen(blkbuf), " %s", "FREE");
		(void) snprintf(blkbuf + strlen(blkbuf),
		    buflen - strlen(blkbuf),
		    " cksum=%016llx:%016llx:%016llx:%016llx",
		    (u_longlong_t)bp->blk_cksum.zc_word[0],
		    (u_longlong_t)bp->blk_cksum.zc_word[1],
		    (u_longlong_t)bp->blk_cksum.zc_word[2],
		    (u_longlong_t)bp->blk_cksum.zc_word[3]);
	}
}

static void
print_indirect(spa_t *spa, blkptr_t *bp, const zbookmark_phys_t *zb,
    const dnode_phys_t *dnp)
{
	char blkbuf[BP_SPRINTF_LEN];
	int l;

	if (!BP_IS_EMBEDDED(bp)) {
		ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
		ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
	}

	(void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));

	ASSERT(zb->zb_level >= 0);

	for (l = dnp->dn_nlevels - 1; l >= -1; l--) {
		if (l == zb->zb_level) {
			(void) printf("L%llx", (u_longlong_t)zb->zb_level);
		} else {
			(void) printf(" ");
		}
	}

	snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, B_FALSE);
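	/*
	 * With -Z, append the block's zstd size/version/level, decoded
	 * from its compression header, to the formatted line.
	 */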
	if (dump_opt['Z'] && BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD)
		snprintf_zstd_header(spa, blkbuf, sizeof (blkbuf), bp);
	(void) printf("%s\n", blkbuf);
}

static int
visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
    blkptr_t *bp, const zbookmark_phys_t *zb)
{
	int err = 0;

	if (bp->blk_birth == 0)
		return (0);

	print_indirect(spa, bp, zb, dnp);

	if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) {
		arc_flags_t flags = ARC_FLAG_WAIT;
		int i;
		blkptr_t *cbp;
		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
		arc_buf_t *buf;
		uint64_t fill = 0;
		ASSERT(!BP_IS_REDACTED(bp));

		err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err)
			return (err);
		ASSERT(buf->b_data);

		/* recursively visit blocks below this */
		cbp = buf->b_data;
		for (i = 0; i < epb; i++, cbp++) {
			zbookmark_phys_t czb;

			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			err = visit_indirect(spa, dnp, cbp, &czb);
			if (err)
				break;
			fill += BP_GET_FILL(cbp);
		}
		if (!err)
			ASSERT3U(fill, ==, BP_GET_FILL(bp));
		arc_buf_destroy(buf, &buf);
	}

	return (err);
}

static void
dump_indirect(dnode_t *dn)
{
	dnode_phys_t *dnp = dn->dn_phys;
	zbookmark_phys_t czb;

	(void) printf("Indirect blocks:\n");

	SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset),
	    dn->dn_object, dnp->dn_nlevels - 1, 0);
	for (int j = 0; j < dnp->dn_nblkptr; j++) {
		czb.zb_blkid = j;
		(void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp,
		    &dnp->dn_blkptr[j], &czb);
	}

	(void) printf("\n");
}

static void
dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) os, (void) object;
	dsl_dir_phys_t *dd = data;
	time_t crtime;
	char nice[32];

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (nice) >= NN_NUMBUF_SZ, "nice truncated");

	if (dd == NULL)
		return;

	ASSERT3U(size, >=, sizeof (dsl_dir_phys_t));

	crtime = dd->dd_creation_time;
	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
	(void) printf("\t\thead_dataset_obj = %llu\n",
	    (u_longlong_t)dd->dd_head_dataset_obj);
	(void) printf("\t\tparent_dir_obj = %llu\n",
	    (u_longlong_t)dd->dd_parent_obj);
	(void) printf("\t\torigin_obj = %llu\n",
	    (u_longlong_t)dd->dd_origin_obj);
	(void) printf("\t\tchild_dir_zapobj = %llu\n",
	    (u_longlong_t)dd->dd_child_dir_zapobj);
	zdb_nicenum(dd->dd_used_bytes, nice, sizeof (nice));
	(void) printf("\t\tused_bytes = %s\n", nice);
	zdb_nicenum(dd->dd_compressed_bytes, nice, sizeof (nice));
	(void) printf("\t\tcompressed_bytes = %s\n", nice);
	zdb_nicenum(dd->dd_uncompressed_bytes, nice, sizeof (nice));
	(void) printf("\t\tuncompressed_bytes = %s\n", nice);
	zdb_nicenum(dd->dd_quota, nice, sizeof (nice));
	(void) printf("\t\tquota = %s\n", nice);
	zdb_nicenum(dd->dd_reserved, nice, sizeof (nice));
	(void) printf("\t\treserved = %s\n", nice);
	(void) printf("\t\tprops_zapobj = %llu\n",
	    (u_longlong_t)dd->dd_props_zapobj);
	(void) printf("\t\tdeleg_zapobj = %llu\n",
	    (u_longlong_t)dd->dd_deleg_zapobj);
	(void) printf("\t\tflags = %llx\n",
	    (u_longlong_t)dd->dd_flags);
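
	/*
	 * dd_used_breakdown[] partitions dd_used_bytes by consumer; the
	 * DO() helper below prints one DD_USED_* bucket per line.
	 */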
#define	DO(which) \
	zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, \
	    sizeof (nice)); \
	(void) printf("\t\tused_breakdown[" #which "] = %s\n", nice)
	DO(HEAD);
	DO(SNAP);
	DO(CHILD);
	DO(CHILD_RSRV);
	DO(REFRSRV);
#undef DO
	(void) printf("\t\tclones = %llu\n",
	    (u_longlong_t)dd->dd_clones);
}

static void
dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) os, (void) object;
	dsl_dataset_phys_t *ds = data;
	time_t crtime;
	char used[32], compressed[32], uncompressed[32], unique[32];
	char blkbuf[BP_SPRINTF_LEN];

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (used) >= NN_NUMBUF_SZ, "used truncated");
	_Static_assert(sizeof (compressed) >= NN_NUMBUF_SZ,
	    "compressed truncated");
	_Static_assert(sizeof (uncompressed) >= NN_NUMBUF_SZ,
	    "uncompressed truncated");
	_Static_assert(sizeof (unique) >= NN_NUMBUF_SZ, "unique truncated");

	if (ds == NULL)
		return;

	ASSERT(size == sizeof (*ds));
	crtime = ds->ds_creation_time;
	zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used));
	zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed));
	zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed,
	    sizeof (uncompressed));
	zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique));
	snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp);

	(void) printf("\t\tdir_obj = %llu\n",
	    (u_longlong_t)ds->ds_dir_obj);
	(void) printf("\t\tprev_snap_obj = %llu\n",
	    (u_longlong_t)ds->ds_prev_snap_obj);
	(void) printf("\t\tprev_snap_txg = %llu\n",
	    (u_longlong_t)ds->ds_prev_snap_txg);
	(void) printf("\t\tnext_snap_obj = %llu\n",
	    (u_longlong_t)ds->ds_next_snap_obj);
	(void) printf("\t\tsnapnames_zapobj = %llu\n",
	    (u_longlong_t)ds->ds_snapnames_zapobj);
	(void) printf("\t\tnum_children = %llu\n",
	    (u_longlong_t)ds->ds_num_children);
	(void) printf("\t\tuserrefs_obj = %llu\n",
	    (u_longlong_t)ds->ds_userrefs_obj);
	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
	(void) printf("\t\tcreation_txg = %llu\n",
	    (u_longlong_t)ds->ds_creation_txg);
	(void) printf("\t\tdeadlist_obj = %llu\n",
	    (u_longlong_t)ds->ds_deadlist_obj);
	(void) printf("\t\tused_bytes = %s\n", used);
	(void) printf("\t\tcompressed_bytes = %s\n", compressed);
	(void) printf("\t\tuncompressed_bytes = %s\n", uncompressed);
	(void) printf("\t\tunique = %s\n", unique);
	(void) printf("\t\tfsid_guid = %llu\n",
	    (u_longlong_t)ds->ds_fsid_guid);
	(void) printf("\t\tguid = %llu\n",
	    (u_longlong_t)ds->ds_guid);
	(void) printf("\t\tflags = %llx\n",
	    (u_longlong_t)ds->ds_flags);
	(void) printf("\t\tnext_clones_obj = %llu\n",
	    (u_longlong_t)ds->ds_next_clones_obj);
	(void) printf("\t\tprops_obj = %llu\n",
	    (u_longlong_t)ds->ds_props_obj);
	(void) printf("\t\tbp = %s\n", blkbuf);
}

static int
dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	(void) arg, (void) tx;
	char blkbuf[BP_SPRINTF_LEN];

	if (bp->blk_birth != 0) {
		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
		(void) printf("\t%s\n", blkbuf);
	}
	return (0);
}

static void
dump_bptree(objset_t *os, uint64_t obj, const char *name)
{
	char bytes[32];
	bptree_phys_t *bt;
	dmu_buf_t *db;

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");

	if (dump_opt['d'] < 3)
		return;

	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
	bt = db->db_data;
	zdb_nicenum(bt->bt_bytes, bytes, sizeof (bytes));
	(void) printf("\n    %s: %llu datasets, %s\n",
	    name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes);
	dmu_buf_rele(db, FTAG);

	if (dump_opt['d'] < 5)
		return;

	(void) printf("\n");

	(void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL);
}

static int
dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
{
	(void) arg, (void) tx;
	char blkbuf[BP_SPRINTF_LEN];

	ASSERT(bp->blk_birth != 0);
	snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed);
	(void) printf("\t%s\n", blkbuf);
	return (0);
}

static void
dump_full_bpobj(bpobj_t *bpo, const char *name, int indent)
{
	char bytes[32];
	char comp[32];
	char uncomp[32];
	uint64_t i;

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");
	_Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated");
	_Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated");

	if (dump_opt['d'] < 3)
		return;

	zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes));
	if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
		zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp));
		zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp));
		if (bpo->bpo_havefreed) {
			(void) printf("    %*s: object %llu, %llu local "
			    "blkptrs, %llu freed, %llu subobjs in object %llu, "
			    "%s (%s/%s comp)\n",
			    indent * 8, name,
			    (u_longlong_t)bpo->bpo_object,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_freed,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
			    (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
			    bytes, comp, uncomp);
		} else {
			(void) printf("    %*s: object %llu, %llu local "
			    "blkptrs, %llu subobjs in object %llu, "
			    "%s (%s/%s comp)\n",
			    indent * 8, name,
			    (u_longlong_t)bpo->bpo_object,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
			    (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
			    bytes, comp, uncomp);
		}

		for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
			uint64_t subobj;
			bpobj_t subbpo;
			int error;
			VERIFY0(dmu_read(bpo->bpo_os,
			    bpo->bpo_phys->bpo_subobjs,
			    i * sizeof (subobj), sizeof (subobj), &subobj, 0));
			error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
			if (error != 0) {
				(void) printf("ERROR %u while trying to open "
				    "subobj id %llu\n",
				    error, (u_longlong_t)subobj);
				continue;
			}
			dump_full_bpobj(&subbpo, "subobj", indent + 1);
			bpobj_close(&subbpo);
		}
	} else {
		if (bpo->bpo_havefreed) {
			(void) printf("    %*s: object %llu, %llu blkptrs, "
			    "%llu freed, %s\n",
			    indent * 8, name,
			    (u_longlong_t)bpo->bpo_object,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_freed,
			    bytes);
		} else {
			(void) printf("    %*s: object %llu, %llu blkptrs, "
			    "%s\n",
			    indent * 8, name,
			    (u_longlong_t)bpo->bpo_object,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
			    bytes);
		}
	}

	if (dump_opt['d'] < 5)
		return;

	if (indent == 0) {
		(void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
		(void) printf("\n");
	}
}

static int
dump_bookmark(dsl_pool_t *dp, char *name, boolean_t print_redact,
    boolean_t print_list)
{
	int err = 0;
	zfs_bookmark_phys_t prop;
	objset_t *mos = dp->dp_spa->spa_meta_objset;
	err = dsl_bookmark_lookup(dp, name, NULL, &prop);

	if (err != 0) {
		return (err);
	}

	(void) printf("\t#%s: ", strchr(name, '#') + 1);
	(void) printf("{guid: %llx creation_txg: %llu creation_time: "
	    "%llu redaction_obj: %llu}\n", (u_longlong_t)prop.zbm_guid,
	    (u_longlong_t)prop.zbm_creation_txg,
	    (u_longlong_t)prop.zbm_creation_time,
	    (u_longlong_t)prop.zbm_redaction_obj);

	IMPLY(print_list, print_redact);
	if (!print_redact || prop.zbm_redaction_obj == 0)
		return (0);

	redaction_list_t *rl;
	VERIFY0(dsl_redaction_list_hold_obj(dp,
	    prop.zbm_redaction_obj, FTAG, &rl));

	redaction_list_phys_t *rlp = rl->rl_phys;
	(void) printf("\tRedacted:\n\t\tProgress: ");
	if (rlp->rlp_last_object != UINT64_MAX ||
	    rlp->rlp_last_blkid != UINT64_MAX) {
		(void) printf("%llu %llu (incomplete)\n",
		    (u_longlong_t)rlp->rlp_last_object,
		    (u_longlong_t)rlp->rlp_last_blkid);
	} else {
		(void) printf("complete\n");
	}
	(void) printf("\t\tSnapshots: [");
	for (unsigned int i = 0; i < rlp->rlp_num_snaps; i++) {
		if (i > 0)
			(void) printf(", ");
		(void) printf("%0llu",
		    (u_longlong_t)rlp->rlp_snaps[i]);
	}
	(void) printf("]\n\t\tLength: %llu\n",
	    (u_longlong_t)rlp->rlp_num_entries);

	if (!print_list) {
		dsl_redaction_list_rele(rl, FTAG);
		return (0);
	}

	if (rlp->rlp_num_entries == 0) {
		dsl_redaction_list_rele(rl, FTAG);
		(void) printf("\t\tRedaction List: []\n\n");
		return (0);
	}

	redact_block_phys_t *rbp_buf;
	uint64_t size;
	dmu_object_info_t doi;

	VERIFY0(dmu_object_info(mos, prop.zbm_redaction_obj, &doi));
	size = doi.doi_max_offset;
	rbp_buf = kmem_alloc(size, KM_SLEEP);

	err = dmu_read(mos, prop.zbm_redaction_obj, 0, size,
	    rbp_buf, 0);
	if (err != 0) {
		dsl_redaction_list_rele(rl, FTAG);
		kmem_free(rbp_buf, size);
		return (err);
	}

	(void) printf("\t\tRedaction List: [{object: %llx, offset: "
	    "%llx, blksz: %x, count: %llx}",
	    (u_longlong_t)rbp_buf[0].rbp_object,
	    (u_longlong_t)rbp_buf[0].rbp_blkid,
	    (uint_t)(redact_block_get_size(&rbp_buf[0])),
	    (u_longlong_t)redact_block_get_count(&rbp_buf[0]));

	for (size_t i = 1; i < rlp->rlp_num_entries; i++) {
		(void) printf(",\n\t\t{object: %llx, offset: %llx, "
		    "blksz: %x, count: %llx}",
		    (u_longlong_t)rbp_buf[i].rbp_object,
		    (u_longlong_t)rbp_buf[i].rbp_blkid,
		    (uint_t)(redact_block_get_size(&rbp_buf[i])),
		    (u_longlong_t)redact_block_get_count(&rbp_buf[i]));
	}
	dsl_redaction_list_rele(rl, FTAG);
	kmem_free(rbp_buf, size);
	(void) printf("]\n\n");
	return (0);
}

static void
dump_bookmarks(objset_t *os, int verbosity)
{
	zap_cursor_t zc;
	zap_attribute_t attr;
	dsl_dataset_t *ds = dmu_objset_ds(os);
	dsl_pool_t *dp = spa_get_dsl(os->os_spa);
	objset_t *mos = os->os_spa->spa_meta_objset;

	if (verbosity < 4)
		return;

	dsl_pool_config_enter(dp, FTAG);
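
	/*
	 * ds_bookmarks_obj is a ZAP mapping bookmark names to their
	 * zfs_bookmark_phys_t entries; walk it and print each one as
	 * "<dataset>#<bookmark>".
	 */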
	for (zap_cursor_init(&zc, mos, ds->ds_bookmarks_obj);
	    zap_cursor_retrieve(&zc, &attr) == 0;
	    zap_cursor_advance(&zc)) {
		char osname[ZFS_MAX_DATASET_NAME_LEN];
		char buf[ZFS_MAX_DATASET_NAME_LEN];
		int len;
		dmu_objset_name(os, osname);
		len = snprintf(buf, sizeof (buf), "%s#%s", osname,
		    attr.za_name);
		VERIFY3S(len, <, ZFS_MAX_DATASET_NAME_LEN);
		(void) dump_bookmark(dp, buf, verbosity >= 5, verbosity >= 6);
	}
	zap_cursor_fini(&zc);
	dsl_pool_config_exit(dp, FTAG);
}

static void
bpobj_count_refd(bpobj_t *bpo)
{
	mos_obj_refd(bpo->bpo_object);

	if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
		mos_obj_refd(bpo->bpo_phys->bpo_subobjs);
		for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
			uint64_t subobj;
			bpobj_t subbpo;
			int error;
			VERIFY0(dmu_read(bpo->bpo_os,
			    bpo->bpo_phys->bpo_subobjs,
			    i * sizeof (subobj), sizeof (subobj), &subobj, 0));
			error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
			if (error != 0) {
				(void) printf("ERROR %u while trying to open "
				    "subobj id %llu\n",
				    error, (u_longlong_t)subobj);
				continue;
			}
			bpobj_count_refd(&subbpo);
			bpobj_close(&subbpo);
		}
	}
}

static int
dsl_deadlist_entry_count_refd(void *arg, dsl_deadlist_entry_t *dle)
{
	spa_t *spa = arg;
	uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;
	if (dle->dle_bpobj.bpo_object != empty_bpobj)
		bpobj_count_refd(&dle->dle_bpobj);
	return (0);
}

static int
dsl_deadlist_entry_dump(void *arg, dsl_deadlist_entry_t *dle)
{
	ASSERT(arg == NULL);
	if (dump_opt['d'] >= 5) {
		char buf[128];
		(void) snprintf(buf, sizeof (buf),
		    "mintxg %llu -> obj %llu",
		    (longlong_t)dle->dle_mintxg,
		    (longlong_t)dle->dle_bpobj.bpo_object);

		dump_full_bpobj(&dle->dle_bpobj, buf, 0);
	} else {
		(void) printf("mintxg %llu -> obj %llu\n",
		    (longlong_t)dle->dle_mintxg,
		    (longlong_t)dle->dle_bpobj.bpo_object);
	}
	return (0);
}

static void
dump_blkptr_list(dsl_deadlist_t *dl, const char *name)
{
	char bytes[32];
	char comp[32];
	char uncomp[32];
	char entries[32];
	spa_t *spa = dmu_objset_spa(dl->dl_os);
	uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;

	if (dl->dl_oldfmt) {
		if (dl->dl_bpobj.bpo_object != empty_bpobj)
			bpobj_count_refd(&dl->dl_bpobj);
	} else {
		mos_obj_refd(dl->dl_object);
		dsl_deadlist_iterate(dl, dsl_deadlist_entry_count_refd, spa);
	}

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");
	_Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated");
	_Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated");
	_Static_assert(sizeof (entries) >= NN_NUMBUF_SZ, "entries truncated");

	if (dump_opt['d'] < 3)
		return;

	if (dl->dl_oldfmt) {
		dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);
		return;
	}

	zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes));
	zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp));
	zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp));
	zdb_nicenum(avl_numnodes(&dl->dl_tree), entries, sizeof (entries));
	(void) printf("\n    %s: %s (%s/%s comp), %s entries\n",
	    name, bytes, comp, uncomp, entries);

	if (dump_opt['d'] < 4)
		return;

	(void) putchar('\n');

	dsl_deadlist_iterate(dl, dsl_deadlist_entry_dump, NULL);
}

static int
verify_dd_livelist(objset_t *os)
{
	uint64_t ll_used, used, ll_comp, comp, ll_uncomp, uncomp;
	dsl_pool_t *dp = spa_get_dsl(os->os_spa);
	dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;

	ASSERT(!dmu_objset_is_snapshot(os));
	if (!dsl_deadlist_is_open(&dd->dd_livelist))
		return (0);

	/* Iterate through the livelist to check for duplicates */
	dsl_deadlist_iterate(&dd->dd_livelist, sublivelist_verify_lightweight,
	    NULL);

	dsl_pool_config_enter(dp, FTAG);
	dsl_deadlist_space(&dd->dd_livelist, &ll_used,
	    &ll_comp, &ll_uncomp);

	dsl_dataset_t *origin_ds;
	ASSERT(dsl_pool_config_held(dp));
	VERIFY0(dsl_dataset_hold_obj(dp,
	    dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_ds));
	VERIFY0(dsl_dataset_space_written(origin_ds, os->os_dsl_dataset,
	    &used, &comp, &uncomp));
	dsl_dataset_rele(origin_ds, FTAG);
	dsl_pool_config_exit(dp, FTAG);

	/*
	 * It's possible that the dataset's uncomp space is larger than the
	 * livelist's because livelists do not track embedded block pointers.
	 */
	if (used != ll_used || comp != ll_comp || uncomp < ll_uncomp) {
		char nice_used[32], nice_comp[32], nice_uncomp[32];
		(void) printf("Discrepancy in space accounting:\n");
		zdb_nicenum(used, nice_used, sizeof (nice_used));
		zdb_nicenum(comp, nice_comp, sizeof (nice_comp));
		zdb_nicenum(uncomp, nice_uncomp, sizeof (nice_uncomp));
		(void) printf("dir: used %s, comp %s, uncomp %s\n",
		    nice_used, nice_comp, nice_uncomp);
		zdb_nicenum(ll_used, nice_used, sizeof (nice_used));
		zdb_nicenum(ll_comp, nice_comp, sizeof (nice_comp));
		zdb_nicenum(ll_uncomp, nice_uncomp, sizeof (nice_uncomp));
		(void) printf("livelist: used %s, comp %s, uncomp %s\n",
		    nice_used, nice_comp, nice_uncomp);
		return (1);
	}
	return (0);
}

static char *key_material = NULL;

static boolean_t
zdb_derive_key(dsl_dir_t *dd, uint8_t *key_out)
{
	uint64_t keyformat, salt, iters;
	int i;
	unsigned char c;

	VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
	    zfs_prop_to_name(ZFS_PROP_KEYFORMAT), sizeof (uint64_t),
	    1, &keyformat));

	switch (keyformat) {
	case ZFS_KEYFORMAT_HEX:
		for (i = 0; i < WRAPPING_KEY_LEN * 2; i += 2) {
			if (!isxdigit(key_material[i]) ||
			    !isxdigit(key_material[i+1]))
				return (B_FALSE);
			if (sscanf(&key_material[i], "%02hhx", &c) != 1)
				return (B_FALSE);
			key_out[i / 2] = c;
		}
		break;

	case ZFS_KEYFORMAT_PASSPHRASE:
		VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset,
		    dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT),
		    sizeof (uint64_t), 1, &salt));
		VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset,
		    dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS),
		    sizeof (uint64_t), 1, &iters));

		if (PKCS5_PBKDF2_HMAC_SHA1(key_material, strlen(key_material),
		    ((uint8_t *)&salt), sizeof (uint64_t), iters,
		    WRAPPING_KEY_LEN, key_out) != 1)
			return (B_FALSE);

		break;

	default:
		fatal("no support for key format %u\n",
		    (unsigned int) keyformat);
	}

	return (B_TRUE);
}

static char encroot[ZFS_MAX_DATASET_NAME_LEN];
static boolean_t key_loaded = B_FALSE;
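
/*
 * Key-loading flow, in brief: key_material holds the user-supplied key
 * (zdb -K).  zdb_derive_key() above turns it into the raw wrapping key,
 * either by parsing hex digits (keyformat=hex) or by running
 * PBKDF2-HMAC-SHA1 with the dataset's stored salt and iteration count
 * (keyformat=passphrase).  zdb_load_key() below then hands that key to
 * the SPA keystore, mirroring what "zfs load-key" does.
 */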
static void
zdb_load_key(objset_t *os)
{
	dsl_pool_t *dp;
	dsl_dir_t *dd, *rdd;
	uint8_t key[WRAPPING_KEY_LEN];
	uint64_t rddobj;
	int err;

	dp = spa_get_dsl(os->os_spa);
	dd = os->os_dsl_dataset->ds_dir;

	dsl_pool_config_enter(dp, FTAG);
	VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
	    DSL_CRYPTO_KEY_ROOT_DDOBJ, sizeof (uint64_t), 1, &rddobj));
	VERIFY0(dsl_dir_hold_obj(dd->dd_pool, rddobj, NULL, FTAG, &rdd));
	dsl_dir_name(rdd, encroot);
	dsl_dir_rele(rdd, FTAG);

	if (!zdb_derive_key(dd, key))
		fatal("couldn't derive encryption key");

	dsl_pool_config_exit(dp, FTAG);

	ASSERT3U(dsl_dataset_get_keystatus(dd), ==, ZFS_KEYSTATUS_UNAVAILABLE);

	dsl_crypto_params_t *dcp;
	nvlist_t *crypto_args;

	crypto_args = fnvlist_alloc();
	fnvlist_add_uint8_array(crypto_args, "wkeydata",
	    (uint8_t *)key, WRAPPING_KEY_LEN);
	VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE,
	    NULL, crypto_args, &dcp));
	err = spa_keystore_load_wkey(encroot, dcp, B_FALSE);

	dsl_crypto_params_free(dcp, (err != 0));
	fnvlist_free(crypto_args);

	if (err != 0)
		fatal(
		    "couldn't load encryption key for %s: %s",
		    encroot, err == ZFS_ERR_CRYPTO_NOTSUP ?
		    "crypto params not supported" : strerror(err));

	ASSERT3U(dsl_dataset_get_keystatus(dd), ==, ZFS_KEYSTATUS_AVAILABLE);

	printf("Unlocked encryption root: %s\n", encroot);
	key_loaded = B_TRUE;
}

static void
zdb_unload_key(void)
{
	if (!key_loaded)
		return;

	VERIFY0(spa_keystore_unload_wkey(encroot));
	key_loaded = B_FALSE;
}

static avl_tree_t idx_tree;
static avl_tree_t domain_tree;
static boolean_t fuid_table_loaded;
static objset_t *sa_os = NULL;
static sa_attr_type_t *sa_attr_table = NULL;

static int
open_objset(const char *path, const void *tag, objset_t **osp)
{
	int err;
	uint64_t sa_attrs = 0;
	uint64_t version = 0;

	VERIFY3P(sa_os, ==, NULL);

	/*
	 * We can't own an objset if it's redacted.  Therefore, we do this
	 * dance: hold the objset, then acquire a long hold on its dataset,
	 * then release the pool (which is held as part of holding the
	 * objset).
	 */
	if (dump_opt['K']) {
		/* decryption requested, try to load keys */
		err = dmu_objset_hold(path, tag, osp);
		if (err != 0) {
			(void) fprintf(stderr, "failed to hold dataset "
			    "'%s': %s\n",
			    path, strerror(err));
			return (err);
		}
		dsl_dataset_long_hold(dmu_objset_ds(*osp), tag);
		dsl_pool_rele(dmu_objset_pool(*osp), tag);

		/* succeeds or dies */
		zdb_load_key(*osp);

		/* release it all */
		dsl_dataset_long_rele(dmu_objset_ds(*osp), tag);
		dsl_dataset_rele(dmu_objset_ds(*osp), tag);
	}

	int ds_hold_flags = key_loaded ? DS_HOLD_FLAG_DECRYPT : 0;
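
	/*
	 * Hold the objset again, this time requesting decryption if a
	 * wrapping key was loaded above, so that the ZPL version and SA
	 * attribute registry looked up below can be read.
	 */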
	err = dmu_objset_hold_flags(path, ds_hold_flags, tag, osp);
	if (err != 0) {
		(void) fprintf(stderr, "failed to hold dataset '%s': %s\n",
		    path, strerror(err));
		return (err);
	}
	dsl_dataset_long_hold(dmu_objset_ds(*osp), tag);
	dsl_pool_rele(dmu_objset_pool(*osp), tag);

	if (dmu_objset_type(*osp) == DMU_OST_ZFS &&
	    (key_loaded || !(*osp)->os_encrypted)) {
		(void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR,
		    8, 1, &version);
		if (version >= ZPL_VERSION_SA) {
			(void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
			    8, 1, &sa_attrs);
		}
		err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END,
		    &sa_attr_table);
		if (err != 0) {
			(void) fprintf(stderr, "sa_setup failed: %s\n",
			    strerror(err));
			dsl_dataset_long_rele(dmu_objset_ds(*osp), tag);
			dsl_dataset_rele_flags(dmu_objset_ds(*osp),
			    ds_hold_flags, tag);
			*osp = NULL;
		}
	}
	sa_os = *osp;

	return (err);
}

static void
close_objset(objset_t *os, const void *tag)
{
	VERIFY3P(os, ==, sa_os);
	if (os->os_sa != NULL)
		sa_tear_down(os);
	dsl_dataset_long_rele(dmu_objset_ds(os), tag);
	dsl_dataset_rele_flags(dmu_objset_ds(os),
	    key_loaded ? DS_HOLD_FLAG_DECRYPT : 0, tag);
	sa_attr_table = NULL;
	sa_os = NULL;

	zdb_unload_key();
}

static void
fuid_table_destroy(void)
{
	if (fuid_table_loaded) {
		zfs_fuid_table_destroy(&idx_tree, &domain_tree);
		fuid_table_loaded = B_FALSE;
	}
}

/*
 * Print uid or gid information.
 * For a normal POSIX id, just the id is printed in decimal format.
 * For CIFS files with a FUID, the fuid is printed in hex followed by
 * the domain-rid string.
 */
static void
print_idstr(uint64_t id, const char *id_type)
{
	if (FUID_INDEX(id)) {
		const char *domain =
		    zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id));
		(void) printf("\t%s     %llx [%s-%d]\n", id_type,
		    (u_longlong_t)id, domain, (int)FUID_RID(id));
	} else {
		(void) printf("\t%s     %llu\n", id_type, (u_longlong_t)id);
	}
}

static void
dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)
{
	uint32_t uid_idx, gid_idx;

	uid_idx = FUID_INDEX(uid);
	gid_idx = FUID_INDEX(gid);

	/* Load domain table, if not already loaded */
	if (!fuid_table_loaded && (uid_idx || gid_idx)) {
		uint64_t fuid_obj;

		/* first find the fuid object.  It lives in the master node */
		VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES,
		    8, 1, &fuid_obj) == 0);
		zfs_fuid_avl_tree_create(&idx_tree, &domain_tree);
		(void) zfs_fuid_table_load(os, fuid_obj,
		    &idx_tree, &domain_tree);
		fuid_table_loaded = B_TRUE;
	}

	print_idstr(uid, "uid");
	print_idstr(gid, "gid");
}

static void
dump_znode_sa_xattr(sa_handle_t *hdl)
{
	nvlist_t *sa_xattr;
	nvpair_t *elem = NULL;
	int sa_xattr_size = 0;
	int sa_xattr_entries = 0;
	int error;
	char *sa_xattr_packed;

	error = sa_size(hdl, sa_attr_table[ZPL_DXATTR], &sa_xattr_size);
	if (error || sa_xattr_size == 0)
		return;

	sa_xattr_packed = malloc(sa_xattr_size);
	if (sa_xattr_packed == NULL)
		return;

	error = sa_lookup(hdl, sa_attr_table[ZPL_DXATTR],
	    sa_xattr_packed, sa_xattr_size);
	if (error) {
		free(sa_xattr_packed);
		return;
	}

	error = nvlist_unpack(sa_xattr_packed, sa_xattr_size, &sa_xattr, 0);
	if (error) {
		free(sa_xattr_packed);
		return;
	}

	while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL)
		sa_xattr_entries++;

	(void) printf("\tSA xattrs: %d bytes, %d entries\n\n",
	    sa_xattr_size, sa_xattr_entries);
	while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) {
		uchar_t *value;
		uint_t cnt, idx;

		(void) printf("\t\t%s = ", nvpair_name(elem));
		nvpair_value_byte_array(elem, &value, &cnt);
		for (idx = 0; idx < cnt; ++idx) {
			if (isprint(value[idx]))
				(void) putchar(value[idx]);
			else
				(void) printf("\\%3.3o", value[idx]);
		}
		(void) putchar('\n');
	}

	nvlist_free(sa_xattr);
	free(sa_xattr_packed);
}

static void
dump_znode_symlink(sa_handle_t *hdl)
{
	int sa_symlink_size = 0;
	char linktarget[MAXPATHLEN];
	int error;

	error = sa_size(hdl, sa_attr_table[ZPL_SYMLINK], &sa_symlink_size);
	if (error || sa_symlink_size == 0) {
		return;
	}
	if (sa_symlink_size >= sizeof (linktarget)) {
		(void) printf("symlink size %d is too large\n",
		    sa_symlink_size);
		return;
	}
	linktarget[sa_symlink_size] = '\0';
	if (sa_lookup(hdl, sa_attr_table[ZPL_SYMLINK],
	    &linktarget, sa_symlink_size) == 0)
		(void) printf("\ttarget %s\n", linktarget);
}

static void
dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	char path[MAXPATHLEN * 2];	/* allow for xattr and failure prefix */
	sa_handle_t *hdl;
	uint64_t xattr, rdev, gen;
	uint64_t uid, gid, mode, fsize, parent, links;
	uint64_t pflags;
	uint64_t acctm[2], modtm[2], chgtm[2], crtm[2];
	time_t z_crtime, z_atime, z_mtime, z_ctime;
	sa_bulk_attr_t bulk[12];
	int idx = 0;
	int error;

	VERIFY3P(os, ==, sa_os);
	if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) {
		(void) printf("Failed to get handle for SA znode\n");
		return;
	}

	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL,
	    &links, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL,
	    &mode, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT],
	    NULL, &parent, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL,
	    &fsize, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL,
	    acctm, 16);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL,
	    modtm, 16);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL,
	    crtm, 16);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL,
	    chgtm, 16);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL,
	    &pflags, 8);

	if (sa_bulk_lookup(hdl, bulk, idx)) {
		(void) sa_handle_destroy(hdl);
		return;
	}

	z_crtime = (time_t)crtm[0];
	z_atime = (time_t)acctm[0];
	z_mtime = (time_t)modtm[0];
	z_ctime = (time_t)chgtm[0];

	if (dump_opt['d'] > 4) {
		error = zfs_obj_to_path(os, object, path, sizeof (path));
		if (error == ESTALE) {
			(void) snprintf(path, sizeof (path), "on delete queue");
		} else if (error != 0) {
			leaked_objects++;
			(void) snprintf(path, sizeof (path),
			    "path not found, possibly leaked");
		}
		(void) printf("\tpath %s\n", path);
	}

	if (S_ISLNK(mode))
		dump_znode_symlink(hdl);
	dump_uidgid(os, uid, gid);
	(void) printf("\tatime %s", ctime(&z_atime));
	(void) printf("\tmtime %s", ctime(&z_mtime));
	(void) printf("\tctime %s", ctime(&z_ctime));
	(void) printf("\tcrtime %s", ctime(&z_crtime));
	(void) printf("\tgen %llu\n", (u_longlong_t)gen);
	(void) printf("\tmode %llo\n", (u_longlong_t)mode);
	(void) printf("\tsize %llu\n", (u_longlong_t)fsize);
	(void) printf("\tparent %llu\n", (u_longlong_t)parent);
	(void) printf("\tlinks %llu\n", (u_longlong_t)links);
	(void) printf("\tpflags %llx\n", (u_longlong_t)pflags);
	if (dmu_objset_projectquota_enabled(os) && (pflags & ZFS_PROJID)) {
		uint64_t projid;

		if (sa_lookup(hdl, sa_attr_table[ZPL_PROJID], &projid,
		    sizeof (uint64_t)) == 0)
			(void) printf("\tprojid %llu\n", (u_longlong_t)projid);
	}
	if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr,
	    sizeof (uint64_t)) == 0)
		(void) printf("\txattr %llu\n", (u_longlong_t)xattr);
	if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev,
	    sizeof (uint64_t)) == 0)
		(void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev);
	dump_znode_sa_xattr(hdl);
	sa_handle_destroy(hdl);
}

static void
dump_acl(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) os, (void) object, (void) data, (void) size;
}

static void
dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) os, (void) object, (void) data, (void) size;
}

static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
	dump_none,		/* unallocated			*/
	dump_zap,		/* object directory		*/
	dump_uint64,		/* object array			*/
	dump_none,		/* packed nvlist		*/
	dump_packed_nvlist,	/* packed nvlist size		*/
	dump_none,		/* bpobj			*/
	dump_bpobj,		/* bpobj header			*/
	dump_none,		/* SPA space map header		*/
	dump_none,		/* SPA space map		*/
	dump_none,		/* ZIL intent log		*/
	dump_dnode,		/* DMU dnode			*/
	dump_dmu_objset,	/* DMU objset			*/
	dump_dsl_dir,		/* DSL directory		*/
	dump_zap,		/* DSL directory child map	*/
	dump_zap,		/* DSL dataset snap map		*/
	dump_zap,		/* DSL props			*/
	dump_dsl_dataset,	/* DSL dataset			*/
	dump_znode,		/* ZFS znode			*/
	dump_acl,		/* ZFS V0 ACL			*/
	dump_uint8,		/* ZFS plain file		*/
	dump_zpldir,		/* ZFS directory		*/
	dump_zap,		/* ZFS master node		*/
	dump_zap,		/* ZFS delete queue		*/
	dump_uint8,		/* zvol object			*/
	dump_zap,		/* zvol prop			*/
	dump_uint8,		/* other uint8[]		*/
	dump_uint64,		/* other uint64[]		*/
	dump_zap,		/* other ZAP			*/
	dump_zap,		/* persistent error log		*/
	dump_uint8,		/* SPA history			*/
	dump_history_offsets,	/* SPA history offsets		*/
	dump_zap,		/* Pool properties		*/
	dump_zap,		/* DSL permissions		*/
	dump_acl,		/* ZFS ACL			*/
	dump_uint8,		/* ZFS SYSACL			*/
	dump_none,		/* FUID nvlist			*/
	dump_packed_nvlist,	/* FUID nvlist size		*/
	dump_zap,		/* DSL dataset next clones	*/
	dump_zap,		/* DSL scrub queue		*/
	dump_zap,		/* ZFS user/group/project used	*/
	dump_zap,		/* ZFS user/group/project quota	*/
	dump_zap,		/* snapshot refcount tags	*/
	dump_ddt_zap,		/* DDT ZAP object		*/
	dump_zap,		/* DDT statistics		*/
	dump_znode,		/* SA object			*/
	dump_zap,		/* SA Master Node		*/
	dump_sa_attrs,		/* SA attribute registration	*/
	dump_sa_layouts,	/* SA attribute layouts		*/
	dump_zap,		/* DSL scrub translations	*/
	dump_none,		/* fake dedup BP		*/
	dump_zap,		/* deadlist			*/
	dump_none,		/* deadlist hdr			*/
	dump_zap,		/* dsl clones			*/
	dump_bpobj_subobjs,	/* bpobj subobjs		*/
	dump_unknown,		/* Unknown type, must be last	*/
};

static boolean_t
match_object_type(dmu_object_type_t obj_type, uint64_t flags)
{
	boolean_t match = B_TRUE;

	switch (obj_type) {
	case DMU_OT_DIRECTORY_CONTENTS:
		if (!(flags & ZOR_FLAG_DIRECTORY))
			match = B_FALSE;
		break;
	case DMU_OT_PLAIN_FILE_CONTENTS:
		if (!(flags & ZOR_FLAG_PLAIN_FILE))
			match = B_FALSE;
		break;
	case DMU_OT_SPACE_MAP:
		if (!(flags & ZOR_FLAG_SPACE_MAP))
			match = B_FALSE;
		break;
	default:
		if (strcmp(zdb_ot_name(obj_type), "zap") == 0) {
			if (!(flags & ZOR_FLAG_ZAP))
				match = B_FALSE;
			break;
		}

		/*
		 * If all bits except some of the supported flags are
		 * set, the user combined the all-types flag (A) with
		 * a negated flag to exclude some types (e.g. A-f to
		 * show all object types except plain files).
		 */
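		/*
		 * Worked example: with A-f, flags == ~ZOR_FLAG_PLAIN_FILE,
		 * so (flags | ZOR_SUPPORTED_FLAGS) == ZOR_FLAG_ALL_TYPES and
		 * the check below passes for every type except plain files
		 * (which the PLAIN_FILE case above already rejected).
		 */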
		if ((flags | ZOR_SUPPORTED_FLAGS) != ZOR_FLAG_ALL_TYPES)
			match = B_FALSE;

		break;
	}

	return (match);
}

static void
dump_object(objset_t *os, uint64_t object, int verbosity,
    boolean_t *print_header, uint64_t *dnode_slots_used, uint64_t flags)
{
	dmu_buf_t *db = NULL;
	dmu_object_info_t doi;
	dnode_t *dn;
	boolean_t dnode_held = B_FALSE;
	void *bonus = NULL;
	size_t bsize = 0;
	char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32];
	char bonus_size[32];
	char aux[50];
	int error;

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (iblk) >= NN_NUMBUF_SZ, "iblk truncated");
	_Static_assert(sizeof (dblk) >= NN_NUMBUF_SZ, "dblk truncated");
	_Static_assert(sizeof (lsize) >= NN_NUMBUF_SZ, "lsize truncated");
	_Static_assert(sizeof (asize) >= NN_NUMBUF_SZ, "asize truncated");
	_Static_assert(sizeof (bonus_size) >= NN_NUMBUF_SZ,
	    "bonus_size truncated");

	if (*print_header) {
		(void) printf("\n%10s  %3s  %5s  %5s  %5s  %6s  %5s  %6s  %s\n",
		    "Object", "lvl", "iblk", "dblk", "dsize", "dnsize",
		    "lsize", "%full", "type");
		*print_header = B_FALSE;
	}

	if (object == 0) {
		dn = DMU_META_DNODE(os);
		dmu_object_info_from_dnode(dn, &doi);
	} else {
		/*
		 * Encrypted datasets will have sensitive bonus buffers
		 * encrypted. Therefore we cannot hold the bonus buffer and
		 * must hold the dnode itself instead.
		 */
		error = dmu_object_info(os, object, &doi);
		if (error)
			fatal("dmu_object_info() failed, errno %u", error);

		if (!key_loaded && os->os_encrypted &&
		    DMU_OT_IS_ENCRYPTED(doi.doi_bonus_type)) {
			error = dnode_hold(os, object, FTAG, &dn);
			if (error)
				fatal("dnode_hold() failed, errno %u", error);
			dnode_held = B_TRUE;
		} else {
			error = dmu_bonus_hold(os, object, FTAG, &db);
			if (error)
				fatal("dmu_bonus_hold(%llu) failed, errno %u",
				    object, error);
			bonus = db->db_data;
			bsize = db->db_size;
			dn = DB_DNODE((dmu_buf_impl_t *)db);
		}
	}

	/*
	 * Default to showing all object types if no flags were specified.
	 */
	if (flags != 0 && flags != ZOR_FLAG_ALL_TYPES &&
	    !match_object_type(doi.doi_type, flags))
		goto out;

	if (dnode_slots_used)
		*dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE;

	zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk));
	zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk));
	zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize));
	zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize));
	zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size));
	zdb_nicenum(doi.doi_dnodesize, dnsize, sizeof (dnsize));
	(void) snprintf(fill, sizeof (fill), "%6.2f", 100.0 *
	    doi.doi_fill_count * doi.doi_data_block_size / (object == 0 ?
	    DNODES_PER_BLOCK : 1) / doi.doi_max_offset);
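
	/*
	 * %full above is the fraction of the object's logical span that is
	 * populated: fill_count * data block size / max_offset.  For the
	 * meta dnode (object 0), fill_count counts dnodes rather than
	 * blocks, hence the extra DNODES_PER_BLOCK divisor.
	 */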
3648 DNODES_PER_BLOCK : 1) / doi.doi_max_offset); 3649 3650 aux[0] = '\0'; 3651 3652 if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) { 3653 (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), 3654 " (K=%s)", ZDB_CHECKSUM_NAME(doi.doi_checksum)); 3655 } 3656 3657 if (doi.doi_compress == ZIO_COMPRESS_INHERIT && 3658 ZIO_COMPRESS_HASLEVEL(os->os_compress) && verbosity >= 6) { 3659 const char *compname = NULL; 3660 if (zfs_prop_index_to_string(ZFS_PROP_COMPRESSION, 3661 ZIO_COMPRESS_RAW(os->os_compress, os->os_complevel), 3662 &compname) == 0) { 3663 (void) snprintf(aux + strlen(aux), 3664 sizeof (aux) - strlen(aux), " (Z=inherit=%s)", 3665 compname); 3666 } else { 3667 (void) snprintf(aux + strlen(aux), 3668 sizeof (aux) - strlen(aux), 3669 " (Z=inherit=%s-unknown)", 3670 ZDB_COMPRESS_NAME(os->os_compress)); 3671 } 3672 } else if (doi.doi_compress == ZIO_COMPRESS_INHERIT && verbosity >= 6) { 3673 (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), 3674 " (Z=inherit=%s)", ZDB_COMPRESS_NAME(os->os_compress)); 3675 } else if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) { 3676 (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), 3677 " (Z=%s)", ZDB_COMPRESS_NAME(doi.doi_compress)); 3678 } 3679 3680 (void) printf("%10lld %3u %5s %5s %5s %6s %5s %6s %s%s\n", 3681 (u_longlong_t)object, doi.doi_indirection, iblk, dblk, 3682 asize, dnsize, lsize, fill, zdb_ot_name(doi.doi_type), aux); 3683 3684 if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) { 3685 (void) printf("%10s %3s %5s %5s %5s %5s %5s %6s %s\n", 3686 "", "", "", "", "", "", bonus_size, "bonus", 3687 zdb_ot_name(doi.doi_bonus_type)); 3688 } 3689 3690 if (verbosity >= 4) { 3691 (void) printf("\tdnode flags: %s%s%s%s\n", 3692 (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ? 3693 "USED_BYTES " : "", 3694 (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ? 3695 "USERUSED_ACCOUNTED " : "", 3696 (dn->dn_phys->dn_flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) ? 3697 "USEROBJUSED_ACCOUNTED " : "", 3698 (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? 3699 "SPILL_BLKPTR" : ""); 3700 (void) printf("\tdnode maxblkid: %llu\n", 3701 (longlong_t)dn->dn_phys->dn_maxblkid); 3702 3703 if (!dnode_held) { 3704 object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, 3705 object, bonus, bsize); 3706 } else { 3707 (void) printf("\t\t(bonus encrypted)\n"); 3708 } 3709 3710 if (key_loaded || 3711 (!os->os_encrypted || !DMU_OT_IS_ENCRYPTED(doi.doi_type))) { 3712 object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, 3713 NULL, 0); 3714 } else { 3715 (void) printf("\t\t(object encrypted)\n"); 3716 } 3717 3718 *print_header = B_TRUE; 3719 } 3720 3721 if (verbosity >= 5) { 3722 if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { 3723 char blkbuf[BP_SPRINTF_LEN]; 3724 snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), 3725 DN_SPILL_BLKPTR(dn->dn_phys), B_FALSE); 3726 (void) printf("\nSpill block: %s\n", blkbuf); 3727 } 3728 dump_indirect(dn); 3729 } 3730 3731 if (verbosity >= 5) { 3732 /* 3733 * Report the list of segments that comprise the object. 
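 * Each pass of the loop below locates the start of the next
 * allocated region with dnode_next_offset() and then bounds it by
 * searching forward for a hole (DNODE_FIND_HOLE); for the meta-dnode
 * (DMU_OT_DNODE) the walk runs at level 0 with DNODES_PER_BLOCK
 * entries per block.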
3734 */ 3735 uint64_t start = 0; 3736 uint64_t end; 3737 uint64_t blkfill = 1; 3738 int minlvl = 1; 3739 3740 if (dn->dn_type == DMU_OT_DNODE) { 3741 minlvl = 0; 3742 blkfill = DNODES_PER_BLOCK; 3743 } 3744 3745 for (;;) { 3746 char segsize[32]; 3747 /* make sure nicenum has enough space */ 3748 _Static_assert(sizeof (segsize) >= NN_NUMBUF_SZ, 3749 "segsize truncated"); 3750 error = dnode_next_offset(dn, 3751 0, &start, minlvl, blkfill, 0); 3752 if (error) 3753 break; 3754 end = start; 3755 error = dnode_next_offset(dn, 3756 DNODE_FIND_HOLE, &end, minlvl, blkfill, 0); 3757 zdb_nicenum(end - start, segsize, sizeof (segsize)); 3758 (void) printf("\t\tsegment [%016llx, %016llx)" 3759 " size %5s\n", (u_longlong_t)start, 3760 (u_longlong_t)end, segsize); 3761 if (error) 3762 break; 3763 start = end; 3764 } 3765 } 3766 3767 out: 3768 if (db != NULL) 3769 dmu_buf_rele(db, FTAG); 3770 if (dnode_held) 3771 dnode_rele(dn, FTAG); 3772 } 3773 3774 static void 3775 count_dir_mos_objects(dsl_dir_t *dd) 3776 { 3777 mos_obj_refd(dd->dd_object); 3778 mos_obj_refd(dsl_dir_phys(dd)->dd_child_dir_zapobj); 3779 mos_obj_refd(dsl_dir_phys(dd)->dd_deleg_zapobj); 3780 mos_obj_refd(dsl_dir_phys(dd)->dd_props_zapobj); 3781 mos_obj_refd(dsl_dir_phys(dd)->dd_clones); 3782 3783 /* 3784 * The dd_crypto_obj can be referenced by multiple dsl_dir's. 3785 * Ignore the references after the first one. 3786 */ 3787 mos_obj_refd_multiple(dd->dd_crypto_obj); 3788 } 3789 3790 static void 3791 count_ds_mos_objects(dsl_dataset_t *ds) 3792 { 3793 mos_obj_refd(ds->ds_object); 3794 mos_obj_refd(dsl_dataset_phys(ds)->ds_next_clones_obj); 3795 mos_obj_refd(dsl_dataset_phys(ds)->ds_props_obj); 3796 mos_obj_refd(dsl_dataset_phys(ds)->ds_userrefs_obj); 3797 mos_obj_refd(dsl_dataset_phys(ds)->ds_snapnames_zapobj); 3798 mos_obj_refd(ds->ds_bookmarks_obj); 3799 3800 if (!dsl_dataset_is_snapshot(ds)) { 3801 count_dir_mos_objects(ds->ds_dir); 3802 } 3803 } 3804 3805 static const char *const objset_types[DMU_OST_NUMTYPES] = { 3806 "NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" }; 3807 3808 /* 3809 * Parse a string denoting a range of object IDs of the form 3810 * <start>[:<end>[:flags]], and store the results in zor. 3811 * Return 0 on success. On error, return 1 and update the msg 3812 * pointer to point to a descriptive error message. 
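 *
 * A minimal usage sketch (hypothetical values; the flag letters
 * follow the "A-f" example described in the match_object_type()
 * comment above, since the flagbits[] table is initialized
 * elsewhere):
 *
 *    char range[] = "10:20:A-f";
 *    zopt_object_range_t zor;
 *    const char *msg = NULL;
 *
 *    if (parse_object_range(range, &zor, &msg) != 0)
 *        (void) fprintf(stderr, "%s\n", msg);
 *
 * This would select objects 10 through 20 of every supported type
 * except plain files.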
3813 */ 3814 static int 3815 parse_object_range(char *range, zopt_object_range_t *zor, const char **msg) 3816 { 3817 uint64_t flags = 0; 3818 char *p, *s, *dup, *flagstr, *tmp = NULL; 3819 size_t len; 3820 int i; 3821 int rc = 0; 3822 3823 if (strchr(range, ':') == NULL) { 3824 zor->zor_obj_start = strtoull(range, &p, 0); 3825 if (*p != '\0') { 3826 *msg = "Invalid characters in object ID"; 3827 rc = 1; 3828 } 3829 zor->zor_obj_start = ZDB_MAP_OBJECT_ID(zor->zor_obj_start); 3830 zor->zor_obj_end = zor->zor_obj_start; 3831 return (rc); 3832 } 3833 3834 if (strchr(range, ':') == range) { 3835 *msg = "Invalid leading colon"; 3836 rc = 1; 3837 return (rc); 3838 } 3839 3840 len = strlen(range); 3841 if (range[len - 1] == ':') { 3842 *msg = "Invalid trailing colon"; 3843 rc = 1; 3844 return (rc); 3845 } 3846 3847 dup = strdup(range); 3848 s = strtok_r(dup, ":", &tmp); 3849 zor->zor_obj_start = strtoull(s, &p, 0); 3850 3851 if (*p != '\0') { 3852 *msg = "Invalid characters in start object ID"; 3853 rc = 1; 3854 goto out; 3855 } 3856 3857 s = strtok_r(NULL, ":", &tmp); 3858 zor->zor_obj_end = strtoull(s, &p, 0); 3859 3860 if (*p != '\0') { 3861 *msg = "Invalid characters in end object ID"; 3862 rc = 1; 3863 goto out; 3864 } 3865 3866 if (zor->zor_obj_start > zor->zor_obj_end) { 3867 *msg = "Start object ID may not exceed end object ID"; 3868 rc = 1; 3869 goto out; 3870 } 3871 3872 s = strtok_r(NULL, ":", &tmp); 3873 if (s == NULL) { 3874 zor->zor_flags = ZOR_FLAG_ALL_TYPES; 3875 goto out; 3876 } else if (strtok_r(NULL, ":", &tmp) != NULL) { 3877 *msg = "Invalid colon-delimited field after flags"; 3878 rc = 1; 3879 goto out; 3880 } 3881 3882 flagstr = s; 3883 for (i = 0; flagstr[i]; i++) { 3884 int bit; 3885 boolean_t negation = (flagstr[i] == '-'); 3886 3887 if (negation) { 3888 i++; 3889 if (flagstr[i] == '\0') { 3890 *msg = "Invalid trailing negation operator"; 3891 rc = 1; 3892 goto out; 3893 } 3894 } 3895 bit = flagbits[(uchar_t)flagstr[i]]; 3896 if (bit == 0) { 3897 *msg = "Invalid flag"; 3898 rc = 1; 3899 goto out; 3900 } 3901 if (negation) 3902 flags &= ~bit; 3903 else 3904 flags |= bit; 3905 } 3906 zor->zor_flags = flags; 3907 3908 zor->zor_obj_start = ZDB_MAP_OBJECT_ID(zor->zor_obj_start); 3909 zor->zor_obj_end = ZDB_MAP_OBJECT_ID(zor->zor_obj_end); 3910 3911 out: 3912 free(dup); 3913 return (rc); 3914 } 3915 3916 static void 3917 dump_objset(objset_t *os) 3918 { 3919 dmu_objset_stats_t dds = { 0 }; 3920 uint64_t object, object_count; 3921 uint64_t refdbytes, usedobjs, scratch; 3922 char numbuf[32]; 3923 char blkbuf[BP_SPRINTF_LEN + 20]; 3924 char osname[ZFS_MAX_DATASET_NAME_LEN]; 3925 const char *type = "UNKNOWN"; 3926 int verbosity = dump_opt['d']; 3927 boolean_t print_header; 3928 unsigned i; 3929 int error; 3930 uint64_t total_slots_used = 0; 3931 uint64_t max_slot_used = 0; 3932 uint64_t dnode_slots; 3933 uint64_t obj_start; 3934 uint64_t obj_end; 3935 uint64_t flags; 3936 3937 /* make sure nicenum has enough space */ 3938 _Static_assert(sizeof (numbuf) >= NN_NUMBUF_SZ, "numbuf truncated"); 3939 3940 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 3941 dmu_objset_fast_stat(os, &dds); 3942 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 3943 3944 print_header = B_TRUE; 3945 3946 if (dds.dds_type < DMU_OST_NUMTYPES) 3947 type = objset_types[dds.dds_type]; 3948 3949 if (dds.dds_type == DMU_OST_META) { 3950 dds.dds_creation_txg = TXG_INITIAL; 3951 usedobjs = BP_GET_FILL(os->os_rootbp); 3952 refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)-> 3953 dd_used_bytes; 3954 } 
else { 3955 dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch); 3956 } 3957 3958 ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp)); 3959 3960 zdb_nicenum(refdbytes, numbuf, sizeof (numbuf)); 3961 3962 if (verbosity >= 4) { 3963 (void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp "); 3964 (void) snprintf_blkptr(blkbuf + strlen(blkbuf), 3965 sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp); 3966 } else { 3967 blkbuf[0] = '\0'; 3968 } 3969 3970 dmu_objset_name(os, osname); 3971 3972 (void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, " 3973 "%s, %llu objects%s%s\n", 3974 osname, type, (u_longlong_t)dmu_objset_id(os), 3975 (u_longlong_t)dds.dds_creation_txg, 3976 numbuf, (u_longlong_t)usedobjs, blkbuf, 3977 (dds.dds_inconsistent) ? " (inconsistent)" : ""); 3978 3979 for (i = 0; i < zopt_object_args; i++) { 3980 obj_start = zopt_object_ranges[i].zor_obj_start; 3981 obj_end = zopt_object_ranges[i].zor_obj_end; 3982 flags = zopt_object_ranges[i].zor_flags; 3983 3984 object = obj_start; 3985 if (object == 0 || obj_start == obj_end) 3986 dump_object(os, object, verbosity, &print_header, NULL, 3987 flags); 3988 else 3989 object--; 3990 3991 while ((dmu_object_next(os, &object, B_FALSE, 0) == 0) && 3992 object <= obj_end) { 3993 dump_object(os, object, verbosity, &print_header, NULL, 3994 flags); 3995 } 3996 } 3997 3998 if (zopt_object_args > 0) { 3999 (void) printf("\n"); 4000 return; 4001 } 4002 4003 if (dump_opt['i'] != 0 || verbosity >= 2) 4004 dump_intent_log(dmu_objset_zil(os)); 4005 4006 if (dmu_objset_ds(os) != NULL) { 4007 dsl_dataset_t *ds = dmu_objset_ds(os); 4008 dump_blkptr_list(&ds->ds_deadlist, "Deadlist"); 4009 if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && 4010 !dmu_objset_is_snapshot(os)) { 4011 dump_blkptr_list(&ds->ds_dir->dd_livelist, "Livelist"); 4012 if (verify_dd_livelist(os) != 0) 4013 fatal("livelist is incorrect"); 4014 } 4015 4016 if (dsl_dataset_remap_deadlist_exists(ds)) { 4017 (void) printf("ds_remap_deadlist:\n"); 4018 dump_blkptr_list(&ds->ds_remap_deadlist, "Deadlist"); 4019 } 4020 count_ds_mos_objects(ds); 4021 } 4022 4023 if (dmu_objset_ds(os) != NULL) 4024 dump_bookmarks(os, verbosity); 4025 4026 if (verbosity < 2) 4027 return; 4028 4029 if (BP_IS_HOLE(os->os_rootbp)) 4030 return; 4031 4032 dump_object(os, 0, verbosity, &print_header, NULL, 0); 4033 object_count = 0; 4034 if (DMU_USERUSED_DNODE(os) != NULL && 4035 DMU_USERUSED_DNODE(os)->dn_type != 0) { 4036 dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header, 4037 NULL, 0); 4038 dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header, 4039 NULL, 0); 4040 } 4041 4042 if (DMU_PROJECTUSED_DNODE(os) != NULL && 4043 DMU_PROJECTUSED_DNODE(os)->dn_type != 0) 4044 dump_object(os, DMU_PROJECTUSED_OBJECT, verbosity, 4045 &print_header, NULL, 0); 4046 4047 object = 0; 4048 while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { 4049 dump_object(os, object, verbosity, &print_header, &dnode_slots, 4050 0); 4051 object_count++; 4052 total_slots_used += dnode_slots; 4053 max_slot_used = object + dnode_slots - 1; 4054 } 4055 4056 (void) printf("\n"); 4057 4058 (void) printf(" Dnode slots:\n"); 4059 (void) printf("\tTotal used: %10llu\n", 4060 (u_longlong_t)total_slots_used); 4061 (void) printf("\tMax used: %10llu\n", 4062 (u_longlong_t)max_slot_used); 4063 (void) printf("\tPercent empty: %10lf\n", 4064 (double)(max_slot_used - total_slots_used)*100 / 4065 (double)max_slot_used); 4066 (void) printf("\n"); 4067 4068 if (error != ESRCH) { 4069 (void) fprintf(stderr, 
"dmu_object_next() = %d\n", error); 4070 abort(); 4071 } 4072 4073 ASSERT3U(object_count, ==, usedobjs); 4074 4075 if (leaked_objects != 0) { 4076 (void) printf("%d potentially leaked objects detected\n", 4077 leaked_objects); 4078 leaked_objects = 0; 4079 } 4080 } 4081 4082 static void 4083 dump_uberblock(uberblock_t *ub, const char *header, const char *footer) 4084 { 4085 time_t timestamp = ub->ub_timestamp; 4086 4087 (void) printf("%s", header ? header : ""); 4088 (void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic); 4089 (void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version); 4090 (void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg); 4091 (void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum); 4092 (void) printf("\ttimestamp = %llu UTC = %s", 4093 (u_longlong_t)ub->ub_timestamp, ctime(×tamp)); 4094 4095 (void) printf("\tmmp_magic = %016llx\n", 4096 (u_longlong_t)ub->ub_mmp_magic); 4097 if (MMP_VALID(ub)) { 4098 (void) printf("\tmmp_delay = %0llu\n", 4099 (u_longlong_t)ub->ub_mmp_delay); 4100 if (MMP_SEQ_VALID(ub)) 4101 (void) printf("\tmmp_seq = %u\n", 4102 (unsigned int) MMP_SEQ(ub)); 4103 if (MMP_FAIL_INT_VALID(ub)) 4104 (void) printf("\tmmp_fail = %u\n", 4105 (unsigned int) MMP_FAIL_INT(ub)); 4106 if (MMP_INTERVAL_VALID(ub)) 4107 (void) printf("\tmmp_write = %u\n", 4108 (unsigned int) MMP_INTERVAL(ub)); 4109 /* After MMP_* to make summarize_uberblock_mmp cleaner */ 4110 (void) printf("\tmmp_valid = %x\n", 4111 (unsigned int) ub->ub_mmp_config & 0xFF); 4112 } 4113 4114 if (dump_opt['u'] >= 4) { 4115 char blkbuf[BP_SPRINTF_LEN]; 4116 snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp); 4117 (void) printf("\trootbp = %s\n", blkbuf); 4118 } 4119 (void) printf("\tcheckpoint_txg = %llu\n", 4120 (u_longlong_t)ub->ub_checkpoint_txg); 4121 (void) printf("%s", footer ? 
footer : ""); 4122 } 4123 4124 static void 4125 dump_config(spa_t *spa) 4126 { 4127 dmu_buf_t *db; 4128 size_t nvsize = 0; 4129 int error = 0; 4130 4131 4132 error = dmu_bonus_hold(spa->spa_meta_objset, 4133 spa->spa_config_object, FTAG, &db); 4134 4135 if (error == 0) { 4136 nvsize = *(uint64_t *)db->db_data; 4137 dmu_buf_rele(db, FTAG); 4138 4139 (void) printf("\nMOS Configuration:\n"); 4140 dump_packed_nvlist(spa->spa_meta_objset, 4141 spa->spa_config_object, (void *)&nvsize, 1); 4142 } else { 4143 (void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d", 4144 (u_longlong_t)spa->spa_config_object, error); 4145 } 4146 } 4147 4148 static void 4149 dump_cachefile(const char *cachefile) 4150 { 4151 int fd; 4152 struct stat64 statbuf; 4153 char *buf; 4154 nvlist_t *config; 4155 4156 if ((fd = open64(cachefile, O_RDONLY)) < 0) { 4157 (void) printf("cannot open '%s': %s\n", cachefile, 4158 strerror(errno)); 4159 exit(1); 4160 } 4161 4162 if (fstat64(fd, &statbuf) != 0) { 4163 (void) printf("failed to stat '%s': %s\n", cachefile, 4164 strerror(errno)); 4165 exit(1); 4166 } 4167 4168 if ((buf = malloc(statbuf.st_size)) == NULL) { 4169 (void) fprintf(stderr, "failed to allocate %llu bytes\n", 4170 (u_longlong_t)statbuf.st_size); 4171 exit(1); 4172 } 4173 4174 if (read(fd, buf, statbuf.st_size) != statbuf.st_size) { 4175 (void) fprintf(stderr, "failed to read %llu bytes\n", 4176 (u_longlong_t)statbuf.st_size); 4177 exit(1); 4178 } 4179 4180 (void) close(fd); 4181 4182 if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) { 4183 (void) fprintf(stderr, "failed to unpack nvlist\n"); 4184 exit(1); 4185 } 4186 4187 free(buf); 4188 4189 dump_nvlist(config, 0); 4190 4191 nvlist_free(config); 4192 } 4193 4194 /* 4195 * ZFS label nvlist stats 4196 */ 4197 typedef struct zdb_nvl_stats { 4198 int zns_list_count; 4199 int zns_leaf_count; 4200 size_t zns_leaf_largest; 4201 size_t zns_leaf_total; 4202 nvlist_t *zns_string; 4203 nvlist_t *zns_uint64; 4204 nvlist_t *zns_boolean; 4205 } zdb_nvl_stats_t; 4206 4207 static void 4208 collect_nvlist_stats(nvlist_t *nvl, zdb_nvl_stats_t *stats) 4209 { 4210 nvlist_t *list, **array; 4211 nvpair_t *nvp = NULL; 4212 const char *name; 4213 uint_t i, items; 4214 4215 stats->zns_list_count++; 4216 4217 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 4218 name = nvpair_name(nvp); 4219 4220 switch (nvpair_type(nvp)) { 4221 case DATA_TYPE_STRING: 4222 fnvlist_add_string(stats->zns_string, name, 4223 fnvpair_value_string(nvp)); 4224 break; 4225 case DATA_TYPE_UINT64: 4226 fnvlist_add_uint64(stats->zns_uint64, name, 4227 fnvpair_value_uint64(nvp)); 4228 break; 4229 case DATA_TYPE_BOOLEAN: 4230 fnvlist_add_boolean(stats->zns_boolean, name); 4231 break; 4232 case DATA_TYPE_NVLIST: 4233 if (nvpair_value_nvlist(nvp, &list) == 0) 4234 collect_nvlist_stats(list, stats); 4235 break; 4236 case DATA_TYPE_NVLIST_ARRAY: 4237 if (nvpair_value_nvlist_array(nvp, &array, &items) != 0) 4238 break; 4239 4240 for (i = 0; i < items; i++) { 4241 collect_nvlist_stats(array[i], stats); 4242 4243 /* collect stats on leaf vdev */ 4244 if (strcmp(name, "children") == 0) { 4245 size_t size; 4246 4247 (void) nvlist_size(array[i], &size, 4248 NV_ENCODE_XDR); 4249 stats->zns_leaf_total += size; 4250 if (size > stats->zns_leaf_largest) 4251 stats->zns_leaf_largest = size; 4252 stats->zns_leaf_count++; 4253 } 4254 } 4255 break; 4256 default: 4257 (void) printf("skip type %d!\n", (int)nvpair_type(nvp)); 4258 } 4259 } 4260 } 4261 4262 static void 4263 dump_nvlist_stats(nvlist_t *nvl, size_t cap) 
4264 { 4265 zdb_nvl_stats_t stats = { 0 }; 4266 size_t size, sum = 0, total; 4267 size_t noise; 4268 4269 /* requires nvlist with non-unique names for stat collection */ 4270 VERIFY0(nvlist_alloc(&stats.zns_string, 0, 0)); 4271 VERIFY0(nvlist_alloc(&stats.zns_uint64, 0, 0)); 4272 VERIFY0(nvlist_alloc(&stats.zns_boolean, 0, 0)); 4273 VERIFY0(nvlist_size(stats.zns_boolean, &noise, NV_ENCODE_XDR)); 4274 4275 (void) printf("\n\nZFS Label NVList Config Stats:\n"); 4276 4277 VERIFY0(nvlist_size(nvl, &total, NV_ENCODE_XDR)); 4278 (void) printf(" %d bytes used, %d bytes free (using %4.1f%%)\n\n", 4279 (int)total, (int)(cap - total), 100.0 * total / cap); 4280 4281 collect_nvlist_stats(nvl, &stats); 4282 4283 VERIFY0(nvlist_size(stats.zns_uint64, &size, NV_ENCODE_XDR)); 4284 size -= noise; 4285 sum += size; 4286 (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "integers:", 4287 (int)fnvlist_num_pairs(stats.zns_uint64), 4288 (int)size, 100.0 * size / total); 4289 4290 VERIFY0(nvlist_size(stats.zns_string, &size, NV_ENCODE_XDR)); 4291 size -= noise; 4292 sum += size; 4293 (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "strings:", 4294 (int)fnvlist_num_pairs(stats.zns_string), 4295 (int)size, 100.0 * size / total); 4296 4297 VERIFY0(nvlist_size(stats.zns_boolean, &size, NV_ENCODE_XDR)); 4298 size -= noise; 4299 sum += size; 4300 (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "booleans:", 4301 (int)fnvlist_num_pairs(stats.zns_boolean), 4302 (int)size, 100.0 * size / total); 4303 4304 size = total - sum; /* treat remainder as nvlist overhead */ 4305 (void) printf("%12s %4d %6d bytes (%5.2f%%)\n\n", "nvlists:", 4306 stats.zns_list_count, (int)size, 100.0 * size / total); 4307 4308 if (stats.zns_leaf_count > 0) { 4309 size_t average = stats.zns_leaf_total / stats.zns_leaf_count; 4310 4311 (void) printf("%12s %4d %6d bytes average\n", "leaf vdevs:", 4312 stats.zns_leaf_count, (int)average); 4313 (void) printf("%24d bytes largest\n", 4314 (int)stats.zns_leaf_largest); 4315 4316 if (dump_opt['l'] >= 3 && average > 0) 4317 (void) printf(" space for %d additional leaf vdevs\n", 4318 (int)((cap - total) / average)); 4319 } 4320 (void) printf("\n"); 4321 4322 nvlist_free(stats.zns_string); 4323 nvlist_free(stats.zns_uint64); 4324 nvlist_free(stats.zns_boolean); 4325 } 4326 4327 typedef struct cksum_record { 4328 zio_cksum_t cksum; 4329 boolean_t labels[VDEV_LABELS]; 4330 avl_node_t link; 4331 } cksum_record_t; 4332 4333 static int 4334 cksum_record_compare(const void *x1, const void *x2) 4335 { 4336 const cksum_record_t *l = (cksum_record_t *)x1; 4337 const cksum_record_t *r = (cksum_record_t *)x2; 4338 int arraysize = ARRAY_SIZE(l->cksum.zc_word); 4339 int difference = 0; 4340 4341 for (int i = 0; i < arraysize; i++) { 4342 difference = TREE_CMP(l->cksum.zc_word[i], r->cksum.zc_word[i]); 4343 if (difference) 4344 break; 4345 } 4346 4347 return (difference); 4348 } 4349 4350 static cksum_record_t * 4351 cksum_record_alloc(zio_cksum_t *cksum, int l) 4352 { 4353 cksum_record_t *rec; 4354 4355 rec = umem_zalloc(sizeof (*rec), UMEM_NOFAIL); 4356 rec->cksum = *cksum; 4357 rec->labels[l] = B_TRUE; 4358 4359 return (rec); 4360 } 4361 4362 static cksum_record_t * 4363 cksum_record_lookup(avl_tree_t *tree, zio_cksum_t *cksum) 4364 { 4365 cksum_record_t lookup = { .cksum = *cksum }; 4366 avl_index_t where; 4367 4368 return (avl_find(tree, &lookup, &where)); 4369 } 4370 4371 static cksum_record_t * 4372 cksum_record_insert(avl_tree_t *tree, zio_cksum_t *cksum, int l) 4373 { 4374 cksum_record_t *rec; 4375 4376 rec = 
cksum_record_lookup(tree, cksum); 4377 if (rec) { 4378 rec->labels[l] = B_TRUE; 4379 } else { 4380 rec = cksum_record_alloc(cksum, l); 4381 avl_add(tree, rec); 4382 } 4383 4384 return (rec); 4385 } 4386 4387 static int 4388 first_label(cksum_record_t *rec) 4389 { 4390 for (int i = 0; i < VDEV_LABELS; i++) 4391 if (rec->labels[i]) 4392 return (i); 4393 4394 return (-1); 4395 } 4396 4397 static void 4398 print_label_numbers(const char *prefix, const cksum_record_t *rec) 4399 { 4400 fputs(prefix, stdout); 4401 for (int i = 0; i < VDEV_LABELS; i++) 4402 if (rec->labels[i] == B_TRUE) 4403 printf("%d ", i); 4404 putchar('\n'); 4405 } 4406 4407 #define MAX_UBERBLOCK_COUNT (VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT) 4408 4409 typedef struct zdb_label { 4410 vdev_label_t label; 4411 uint64_t label_offset; 4412 nvlist_t *config_nv; 4413 cksum_record_t *config; 4414 cksum_record_t *uberblocks[MAX_UBERBLOCK_COUNT]; 4415 boolean_t header_printed; 4416 boolean_t read_failed; 4417 boolean_t cksum_valid; 4418 } zdb_label_t; 4419 4420 static void 4421 print_label_header(zdb_label_t *label, int l) 4422 { 4423 4424 if (dump_opt['q']) 4425 return; 4426 4427 if (label->header_printed == B_TRUE) 4428 return; 4429 4430 (void) printf("------------------------------------\n"); 4431 (void) printf("LABEL %d %s\n", l, 4432 label->cksum_valid ? "" : "(Bad label cksum)"); 4433 (void) printf("------------------------------------\n"); 4434 4435 label->header_printed = B_TRUE; 4436 } 4437 4438 static void 4439 print_l2arc_header(void) 4440 { 4441 (void) printf("------------------------------------\n"); 4442 (void) printf("L2ARC device header\n"); 4443 (void) printf("------------------------------------\n"); 4444 } 4445 4446 static void 4447 print_l2arc_log_blocks(void) 4448 { 4449 (void) printf("------------------------------------\n"); 4450 (void) printf("L2ARC device log blocks\n"); 4451 (void) printf("------------------------------------\n"); 4452 } 4453 4454 static void 4455 dump_l2arc_log_entries(uint64_t log_entries, 4456 l2arc_log_ent_phys_t *le, uint64_t i) 4457 { 4458 for (int j = 0; j < log_entries; j++) { 4459 dva_t dva = le[j].le_dva; 4460 (void) printf("lb[%4llu]\tle[%4d]\tDVA asize: %llu, " 4461 "vdev: %llu, offset: %llu\n", 4462 (u_longlong_t)i, j + 1, 4463 (u_longlong_t)DVA_GET_ASIZE(&dva), 4464 (u_longlong_t)DVA_GET_VDEV(&dva), 4465 (u_longlong_t)DVA_GET_OFFSET(&dva)); 4466 (void) printf("|\t\t\t\tbirth: %llu\n", 4467 (u_longlong_t)le[j].le_birth); 4468 (void) printf("|\t\t\t\tlsize: %llu\n", 4469 (u_longlong_t)L2BLK_GET_LSIZE((&le[j])->le_prop)); 4470 (void) printf("|\t\t\t\tpsize: %llu\n", 4471 (u_longlong_t)L2BLK_GET_PSIZE((&le[j])->le_prop)); 4472 (void) printf("|\t\t\t\tcompr: %llu\n", 4473 (u_longlong_t)L2BLK_GET_COMPRESS((&le[j])->le_prop)); 4474 (void) printf("|\t\t\t\tcomplevel: %llu\n", 4475 (u_longlong_t)(&le[j])->le_complevel); 4476 (void) printf("|\t\t\t\ttype: %llu\n", 4477 (u_longlong_t)L2BLK_GET_TYPE((&le[j])->le_prop)); 4478 (void) printf("|\t\t\t\tprotected: %llu\n", 4479 (u_longlong_t)L2BLK_GET_PROTECTED((&le[j])->le_prop)); 4480 (void) printf("|\t\t\t\tprefetch: %llu\n", 4481 (u_longlong_t)L2BLK_GET_PREFETCH((&le[j])->le_prop)); 4482 (void) printf("|\t\t\t\taddress: %llu\n", 4483 (u_longlong_t)le[j].le_daddr); 4484 (void) printf("|\t\t\t\tARC state: %llu\n", 4485 (u_longlong_t)L2BLK_GET_STATE((&le[j])->le_prop)); 4486 (void) printf("|\n"); 4487 } 4488 (void) printf("\n"); 4489 } 4490 4491 static void 4492 dump_l2arc_log_blkptr(const l2arc_log_blkptr_t *lbps) 4493 { 4494 (void) 
printf("|\t\tdaddr: %llu\n", (u_longlong_t)lbps->lbp_daddr); 4495 (void) printf("|\t\tpayload_asize: %llu\n", 4496 (u_longlong_t)lbps->lbp_payload_asize); 4497 (void) printf("|\t\tpayload_start: %llu\n", 4498 (u_longlong_t)lbps->lbp_payload_start); 4499 (void) printf("|\t\tlsize: %llu\n", 4500 (u_longlong_t)L2BLK_GET_LSIZE(lbps->lbp_prop)); 4501 (void) printf("|\t\tasize: %llu\n", 4502 (u_longlong_t)L2BLK_GET_PSIZE(lbps->lbp_prop)); 4503 (void) printf("|\t\tcompralgo: %llu\n", 4504 (u_longlong_t)L2BLK_GET_COMPRESS(lbps->lbp_prop)); 4505 (void) printf("|\t\tcksumalgo: %llu\n", 4506 (u_longlong_t)L2BLK_GET_CHECKSUM(lbps->lbp_prop)); 4507 (void) printf("|\n\n"); 4508 } 4509 4510 static void 4511 dump_l2arc_log_blocks(int fd, const l2arc_dev_hdr_phys_t *l2dhdr, 4512 l2arc_dev_hdr_phys_t *rebuild) 4513 { 4514 l2arc_log_blk_phys_t this_lb; 4515 uint64_t asize; 4516 l2arc_log_blkptr_t lbps[2]; 4517 abd_t *abd; 4518 zio_cksum_t cksum; 4519 int failed = 0; 4520 l2arc_dev_t dev; 4521 4522 if (!dump_opt['q']) 4523 print_l2arc_log_blocks(); 4524 memcpy(lbps, l2dhdr->dh_start_lbps, sizeof (lbps)); 4525 4526 dev.l2ad_evict = l2dhdr->dh_evict; 4527 dev.l2ad_start = l2dhdr->dh_start; 4528 dev.l2ad_end = l2dhdr->dh_end; 4529 4530 if (l2dhdr->dh_start_lbps[0].lbp_daddr == 0) { 4531 /* no log blocks to read */ 4532 if (!dump_opt['q']) { 4533 (void) printf("No log blocks to read\n"); 4534 (void) printf("\n"); 4535 } 4536 return; 4537 } else { 4538 dev.l2ad_hand = lbps[0].lbp_daddr + 4539 L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); 4540 } 4541 4542 dev.l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST); 4543 4544 for (;;) { 4545 if (!l2arc_log_blkptr_valid(&dev, &lbps[0])) 4546 break; 4547 4548 /* L2BLK_GET_PSIZE returns aligned size for log blocks */ 4549 asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); 4550 if (pread64(fd, &this_lb, asize, lbps[0].lbp_daddr) != asize) { 4551 if (!dump_opt['q']) { 4552 (void) printf("Error while reading next log " 4553 "block\n\n"); 4554 } 4555 break; 4556 } 4557 4558 fletcher_4_native_varsize(&this_lb, asize, &cksum); 4559 if (!ZIO_CHECKSUM_EQUAL(cksum, lbps[0].lbp_cksum)) { 4560 failed++; 4561 if (!dump_opt['q']) { 4562 (void) printf("Invalid cksum\n"); 4563 dump_l2arc_log_blkptr(&lbps[0]); 4564 } 4565 break; 4566 } 4567 4568 switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) { 4569 case ZIO_COMPRESS_OFF: 4570 break; 4571 default: 4572 abd = abd_alloc_for_io(asize, B_TRUE); 4573 abd_copy_from_buf_off(abd, &this_lb, 0, asize); 4574 if (zio_decompress_data(L2BLK_GET_COMPRESS( 4575 (&lbps[0])->lbp_prop), abd, &this_lb, 4576 asize, sizeof (this_lb), NULL) != 0) { 4577 (void) printf("L2ARC block decompression " 4578 "failed\n"); 4579 abd_free(abd); 4580 goto out; 4581 } 4582 abd_free(abd); 4583 break; 4584 } 4585 4586 if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) 4587 byteswap_uint64_array(&this_lb, sizeof (this_lb)); 4588 if (this_lb.lb_magic != L2ARC_LOG_BLK_MAGIC) { 4589 if (!dump_opt['q']) 4590 (void) printf("Invalid log block magic\n\n"); 4591 break; 4592 } 4593 4594 rebuild->dh_lb_count++; 4595 rebuild->dh_lb_asize += asize; 4596 if (dump_opt['l'] > 1 && !dump_opt['q']) { 4597 (void) printf("lb[%4llu]\tmagic: %llu\n", 4598 (u_longlong_t)rebuild->dh_lb_count, 4599 (u_longlong_t)this_lb.lb_magic); 4600 dump_l2arc_log_blkptr(&lbps[0]); 4601 } 4602 4603 if (dump_opt['l'] > 2 && !dump_opt['q']) 4604 dump_l2arc_log_entries(l2dhdr->dh_log_entries, 4605 this_lb.lb_entries, 4606 rebuild->dh_lb_count); 4607 4608 if (l2arc_range_check_overlap(lbps[1].lbp_payload_start, 
4609 lbps[0].lbp_payload_start, dev.l2ad_evict) && 4610 !dev.l2ad_first) 4611 break; 4612 4613 lbps[0] = lbps[1]; 4614 lbps[1] = this_lb.lb_prev_lbp; 4615 } 4616 out: 4617 if (!dump_opt['q']) { 4618 (void) printf("log_blk_count:\t %llu with valid cksum\n", 4619 (u_longlong_t)rebuild->dh_lb_count); 4620 (void) printf("\t\t %d with invalid cksum\n", failed); 4621 (void) printf("log_blk_asize:\t %llu\n\n", 4622 (u_longlong_t)rebuild->dh_lb_asize); 4623 } 4624 } 4625 4626 static int 4627 dump_l2arc_header(int fd) 4628 { 4629 l2arc_dev_hdr_phys_t l2dhdr = {0}, rebuild = {0}; 4630 int error = B_FALSE; 4631 4632 if (pread64(fd, &l2dhdr, sizeof (l2dhdr), 4633 VDEV_LABEL_START_SIZE) != sizeof (l2dhdr)) { 4634 error = B_TRUE; 4635 } else { 4636 if (l2dhdr.dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC)) 4637 byteswap_uint64_array(&l2dhdr, sizeof (l2dhdr)); 4638 4639 if (l2dhdr.dh_magic != L2ARC_DEV_HDR_MAGIC) 4640 error = B_TRUE; 4641 } 4642 4643 if (error) { 4644 (void) printf("L2ARC device header not found\n\n"); 4645 /* Do not return an error here for backward compatibility */ 4646 return (0); 4647 } else if (!dump_opt['q']) { 4648 print_l2arc_header(); 4649 4650 (void) printf(" magic: %llu\n", 4651 (u_longlong_t)l2dhdr.dh_magic); 4652 (void) printf(" version: %llu\n", 4653 (u_longlong_t)l2dhdr.dh_version); 4654 (void) printf(" pool_guid: %llu\n", 4655 (u_longlong_t)l2dhdr.dh_spa_guid); 4656 (void) printf(" flags: %llu\n", 4657 (u_longlong_t)l2dhdr.dh_flags); 4658 (void) printf(" start_lbps[0]: %llu\n", 4659 (u_longlong_t) 4660 l2dhdr.dh_start_lbps[0].lbp_daddr); 4661 (void) printf(" start_lbps[1]: %llu\n", 4662 (u_longlong_t) 4663 l2dhdr.dh_start_lbps[1].lbp_daddr); 4664 (void) printf(" log_blk_ent: %llu\n", 4665 (u_longlong_t)l2dhdr.dh_log_entries); 4666 (void) printf(" start: %llu\n", 4667 (u_longlong_t)l2dhdr.dh_start); 4668 (void) printf(" end: %llu\n", 4669 (u_longlong_t)l2dhdr.dh_end); 4670 (void) printf(" evict: %llu\n", 4671 (u_longlong_t)l2dhdr.dh_evict); 4672 (void) printf(" lb_asize_refcount: %llu\n", 4673 (u_longlong_t)l2dhdr.dh_lb_asize); 4674 (void) printf(" lb_count_refcount: %llu\n", 4675 (u_longlong_t)l2dhdr.dh_lb_count); 4676 (void) printf(" trim_action_time: %llu\n", 4677 (u_longlong_t)l2dhdr.dh_trim_action_time); 4678 (void) printf(" trim_state: %llu\n\n", 4679 (u_longlong_t)l2dhdr.dh_trim_state); 4680 } 4681 4682 dump_l2arc_log_blocks(fd, &l2dhdr, &rebuild); 4683 /* 4684 * The total aligned size of log blocks and the number of log blocks 4685 * reported in the header of the device may be less than what zdb 4686 * reports by dump_l2arc_log_blocks() which emulates l2arc_rebuild(). 4687 * This happens because dump_l2arc_log_blocks() lacks the memory 4688 * pressure valve that l2arc_rebuild() has. Thus, if we are on a system 4689 * with low memory, l2arc_rebuild will exit prematurely and dh_lb_asize 4690 * and dh_lb_count will be lower to begin with than what exists on the 4691 * device. This is normal and zdb should not exit with an error. The 4692 * opposite case should never happen though, the values reported in the 4693 * header should never be higher than what dump_l2arc_log_blocks() and 4694 * l2arc_rebuild() report. If this happens there is a leak in the 4695 * accounting of log blocks. 
4696 */ 4697 if (l2dhdr.dh_lb_asize > rebuild.dh_lb_asize || 4698 l2dhdr.dh_lb_count > rebuild.dh_lb_count) 4699 return (1); 4700 4701 return (0); 4702 } 4703 4704 static void 4705 dump_config_from_label(zdb_label_t *label, size_t buflen, int l) 4706 { 4707 if (dump_opt['q']) 4708 return; 4709 4710 if ((dump_opt['l'] < 3) && (first_label(label->config) != l)) 4711 return; 4712 4713 print_label_header(label, l); 4714 dump_nvlist(label->config_nv, 4); 4715 print_label_numbers(" labels = ", label->config); 4716 4717 if (dump_opt['l'] >= 2) 4718 dump_nvlist_stats(label->config_nv, buflen); 4719 } 4720 4721 #define ZDB_MAX_UB_HEADER_SIZE 32 4722 4723 static void 4724 dump_label_uberblocks(zdb_label_t *label, uint64_t ashift, int label_num) 4725 { 4726 4727 vdev_t vd; 4728 char header[ZDB_MAX_UB_HEADER_SIZE]; 4729 4730 vd.vdev_ashift = ashift; 4731 vd.vdev_top = &vd; 4732 4733 for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) { 4734 uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i); 4735 uberblock_t *ub = (void *)((char *)&label->label + uoff); 4736 cksum_record_t *rec = label->uberblocks[i]; 4737 4738 if (rec == NULL) { 4739 if (dump_opt['u'] >= 2) { 4740 print_label_header(label, label_num); 4741 (void) printf(" Uberblock[%d] invalid\n", i); 4742 } 4743 continue; 4744 } 4745 4746 if ((dump_opt['u'] < 3) && (first_label(rec) != label_num)) 4747 continue; 4748 4749 if ((dump_opt['u'] < 4) && 4750 (ub->ub_mmp_magic == MMP_MAGIC) && ub->ub_mmp_delay && 4751 (i >= VDEV_UBERBLOCK_COUNT(&vd) - MMP_BLOCKS_PER_LABEL)) 4752 continue; 4753 4754 print_label_header(label, label_num); 4755 (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE, 4756 " Uberblock[%d]\n", i); 4757 dump_uberblock(ub, header, ""); 4758 print_label_numbers(" labels = ", rec); 4759 } 4760 } 4761 4762 static char curpath[PATH_MAX]; 4763 4764 /* 4765 * Iterate through the path components, recursively passing 4766 * current one's obj and remaining path until we find the obj 4767 * for the last one. 
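 *
 * For example, given the name "a/b/c" we split at the first '/',
 * look up "a" in the ZAP of the directory identified by obj, then
 * recurse with the child object and the remaining path "b/c" until
 * the last component resolves to the final object.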
4768 */ 4769 static int 4770 dump_path_impl(objset_t *os, uint64_t obj, char *name, uint64_t *retobj) 4771 { 4772 int err; 4773 boolean_t header = B_TRUE; 4774 uint64_t child_obj; 4775 char *s; 4776 dmu_buf_t *db; 4777 dmu_object_info_t doi; 4778 4779 if ((s = strchr(name, '/')) != NULL) 4780 *s = '\0'; 4781 err = zap_lookup(os, obj, name, 8, 1, &child_obj); 4782 4783 (void) strlcat(curpath, name, sizeof (curpath)); 4784 4785 if (err != 0) { 4786 (void) fprintf(stderr, "failed to lookup %s: %s\n", 4787 curpath, strerror(err)); 4788 return (err); 4789 } 4790 4791 child_obj = ZFS_DIRENT_OBJ(child_obj); 4792 err = sa_buf_hold(os, child_obj, FTAG, &db); 4793 if (err != 0) { 4794 (void) fprintf(stderr, 4795 "failed to get SA dbuf for obj %llu: %s\n", 4796 (u_longlong_t)child_obj, strerror(err)); 4797 return (EINVAL); 4798 } 4799 dmu_object_info_from_db(db, &doi); 4800 sa_buf_rele(db, FTAG); 4801 4802 if (doi.doi_bonus_type != DMU_OT_SA && 4803 doi.doi_bonus_type != DMU_OT_ZNODE) { 4804 (void) fprintf(stderr, "invalid bonus type %d for obj %llu\n", 4805 doi.doi_bonus_type, (u_longlong_t)child_obj); 4806 return (EINVAL); 4807 } 4808 4809 if (dump_opt['v'] > 6) { 4810 (void) printf("obj=%llu %s type=%d bonustype=%d\n", 4811 (u_longlong_t)child_obj, curpath, doi.doi_type, 4812 doi.doi_bonus_type); 4813 } 4814 4815 (void) strlcat(curpath, "/", sizeof (curpath)); 4816 4817 switch (doi.doi_type) { 4818 case DMU_OT_DIRECTORY_CONTENTS: 4819 if (s != NULL && *(s + 1) != '\0') 4820 return (dump_path_impl(os, child_obj, s + 1, retobj)); 4821 zfs_fallthrough; 4822 case DMU_OT_PLAIN_FILE_CONTENTS: 4823 if (retobj != NULL) { 4824 *retobj = child_obj; 4825 } else { 4826 dump_object(os, child_obj, dump_opt['v'], &header, 4827 NULL, 0); 4828 } 4829 return (0); 4830 default: 4831 (void) fprintf(stderr, "object %llu has non-file/directory " 4832 "type %d\n", (u_longlong_t)obj, doi.doi_type); 4833 break; 4834 } 4835 4836 return (EINVAL); 4837 } 4838 4839 /* 4840 * Dump the blocks for the object specified by path inside the dataset. 
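 *
 * A hypothetical invocation (dataset and path names are
 * illustrative only):
 *
 *    char ds[] = "pool/fs";
 *    char path[] = "dir/file";
 *    uint64_t obj;
 *
 *    if (dump_path(ds, path, &obj) == 0)
 *        (void) printf("resolved to object %llu\n",
 *            (u_longlong_t)obj);
 *
 * The path buffer must be writable since dump_path_impl() splits
 * it in place at each '/'.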
4841 */ 4842 static int 4843 dump_path(char *ds, char *path, uint64_t *retobj) 4844 { 4845 int err; 4846 objset_t *os; 4847 uint64_t root_obj; 4848 4849 err = open_objset(ds, FTAG, &os); 4850 if (err != 0) 4851 return (err); 4852 4853 err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj); 4854 if (err != 0) { 4855 (void) fprintf(stderr, "can't lookup root znode: %s\n", 4856 strerror(err)); 4857 close_objset(os, FTAG); 4858 return (EINVAL); 4859 } 4860 4861 (void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds); 4862 4863 err = dump_path_impl(os, root_obj, path, retobj); 4864 4865 close_objset(os, FTAG); 4866 return (err); 4867 } 4868 4869 static int 4870 zdb_copy_object(objset_t *os, uint64_t srcobj, char *destfile) 4871 { 4872 int err = 0; 4873 uint64_t size, readsize, oursize, offset; 4874 ssize_t writesize; 4875 sa_handle_t *hdl; 4876 4877 (void) printf("Copying object %" PRIu64 " to file %s\n", srcobj, 4878 destfile); 4879 4880 VERIFY3P(os, ==, sa_os); 4881 if ((err = sa_handle_get(os, srcobj, NULL, SA_HDL_PRIVATE, &hdl))) { 4882 (void) printf("Failed to get handle for SA znode\n"); 4883 return (err); 4884 } 4885 if ((err = sa_lookup(hdl, sa_attr_table[ZPL_SIZE], &size, 8))) { 4886 (void) sa_handle_destroy(hdl); 4887 return (err); 4888 } 4889 (void) sa_handle_destroy(hdl); 4890 4891 (void) printf("Object %" PRIu64 " is %" PRIu64 " bytes\n", srcobj, 4892 size); 4893 if (size == 0) { 4894 return (EINVAL); 4895 } 4896 4897 int fd = open(destfile, O_WRONLY | O_CREAT | O_TRUNC, 0644); 4898 if (fd == -1) 4899 return (errno); 4900 /* 4901 * We cap the size at 1 mebibyte here to prevent 4902 * allocation failures and nigh-infinite printing if the 4903 * object is extremely large. 4904 */ 4905 oursize = MIN(size, 1 << 20); 4906 offset = 0; 4907 char *buf = kmem_alloc(oursize, KM_NOSLEEP); 4908 if (buf == NULL) { 4909 (void) close(fd); 4910 return (ENOMEM); 4911 } 4912 4913 while (offset < size) { 4914 readsize = MIN(size - offset, 1 << 20); 4915 err = dmu_read(os, srcobj, offset, readsize, buf, 0); 4916 if (err != 0) { 4917 (void) printf("got error %u from dmu_read\n", err); 4918 kmem_free(buf, oursize); 4919 (void) close(fd); 4920 return (err); 4921 } 4922 if (dump_opt['v'] > 3) { 4923 (void) printf("Read offset=%" PRIu64 " size=%" PRIu64 4924 " error=%d\n", offset, readsize, err); 4925 } 4926 4927 writesize = write(fd, buf, readsize); 4928 if (writesize < 0) { 4929 err = errno; 4930 break; 4931 } else if (writesize != readsize) { 4932 /* Incomplete write */ 4933 (void) fprintf(stderr, "Short write, only wrote %llu of" 4934 " %" PRIu64 " bytes, exiting...\n", 4935 (u_longlong_t)writesize, readsize); 4936 break; 4937 } 4938 4939 offset += readsize; 4940 } 4941 4942 (void) close(fd); 4943 4944 if (buf != NULL) 4945 kmem_free(buf, oursize); 4946 4947 return (err); 4948 } 4949 4950 static boolean_t 4951 label_cksum_valid(vdev_label_t *label, uint64_t offset) 4952 { 4953 zio_checksum_info_t *ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL]; 4954 zio_cksum_t expected_cksum; 4955 zio_cksum_t actual_cksum; 4956 zio_cksum_t verifier; 4957 zio_eck_t *eck; 4958 int byteswap; 4959 4960 void *data = (char *)label + offsetof(vdev_label_t, vl_vdev_phys); 4961 eck = (zio_eck_t *)((char *)(data) + VDEV_PHYS_SIZE) - 1; 4962 4963 offset += offsetof(vdev_label_t, vl_vdev_phys); 4964 ZIO_SET_CHECKSUM(&verifier, offset, 0, 0, 0); 4965 4966 byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); 4967 if (byteswap) 4968 byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); 4969 4970 
expected_cksum = eck->zec_cksum; 4971 eck->zec_cksum = verifier; 4972 4973 abd_t *abd = abd_get_from_buf(data, VDEV_PHYS_SIZE); 4974 ci->ci_func[byteswap](abd, VDEV_PHYS_SIZE, NULL, &actual_cksum); 4975 abd_free(abd); 4976 4977 if (byteswap) 4978 byteswap_uint64_array(&expected_cksum, sizeof (zio_cksum_t)); 4979 4980 if (ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) 4981 return (B_TRUE); 4982 4983 return (B_FALSE); 4984 } 4985 4986 static int 4987 dump_label(const char *dev) 4988 { 4989 char path[MAXPATHLEN]; 4990 zdb_label_t labels[VDEV_LABELS] = {{{{0}}}}; 4991 uint64_t psize, ashift, l2cache; 4992 struct stat64 statbuf; 4993 boolean_t config_found = B_FALSE; 4994 boolean_t error = B_FALSE; 4995 boolean_t read_l2arc_header = B_FALSE; 4996 avl_tree_t config_tree; 4997 avl_tree_t uberblock_tree; 4998 void *node, *cookie; 4999 int fd; 5000 5001 /* 5002 * Check if we were given absolute path and use it as is. 5003 * Otherwise if the provided vdev name doesn't point to a file, 5004 * try prepending expected disk paths and partition numbers. 5005 */ 5006 (void) strlcpy(path, dev, sizeof (path)); 5007 if (dev[0] != '/' && stat64(path, &statbuf) != 0) { 5008 int error; 5009 5010 error = zfs_resolve_shortname(dev, path, MAXPATHLEN); 5011 if (error == 0 && zfs_dev_is_whole_disk(path)) { 5012 if (zfs_append_partition(path, MAXPATHLEN) == -1) 5013 error = ENOENT; 5014 } 5015 5016 if (error || (stat64(path, &statbuf) != 0)) { 5017 (void) printf("failed to find device %s, try " 5018 "specifying absolute path instead\n", dev); 5019 return (1); 5020 } 5021 } 5022 5023 if ((fd = open64(path, O_RDONLY)) < 0) { 5024 (void) printf("cannot open '%s': %s\n", path, strerror(errno)); 5025 exit(1); 5026 } 5027 5028 if (fstat64_blk(fd, &statbuf) != 0) { 5029 (void) printf("failed to stat '%s': %s\n", path, 5030 strerror(errno)); 5031 (void) close(fd); 5032 exit(1); 5033 } 5034 5035 if (S_ISBLK(statbuf.st_mode) && zfs_dev_flush(fd) != 0) 5036 (void) printf("failed to invalidate cache '%s' : %s\n", path, 5037 strerror(errno)); 5038 5039 avl_create(&config_tree, cksum_record_compare, 5040 sizeof (cksum_record_t), offsetof(cksum_record_t, link)); 5041 avl_create(&uberblock_tree, cksum_record_compare, 5042 sizeof (cksum_record_t), offsetof(cksum_record_t, link)); 5043 5044 psize = statbuf.st_size; 5045 psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t)); 5046 ashift = SPA_MINBLOCKSHIFT; 5047 5048 /* 5049 * 1. Read the label from disk 5050 * 2. Verify label cksum 5051 * 3. Unpack the configuration and insert in config tree. 5052 * 4. Traverse all uberblocks and insert in uberblock tree. 
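 *
 * Because both trees key on the checksum, identical copies found in
 * different labels collapse into a single cksum_record_t whose
 * labels[] array records which of the four labels carried them.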
5053 */ 5054 for (int l = 0; l < VDEV_LABELS; l++) { 5055 zdb_label_t *label = &labels[l]; 5056 char *buf = label->label.vl_vdev_phys.vp_nvlist; 5057 size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist); 5058 nvlist_t *config; 5059 cksum_record_t *rec; 5060 zio_cksum_t cksum; 5061 vdev_t vd; 5062 5063 label->label_offset = vdev_label_offset(psize, l, 0); 5064 5065 if (pread64(fd, &label->label, sizeof (label->label), 5066 label->label_offset) != sizeof (label->label)) { 5067 if (!dump_opt['q']) 5068 (void) printf("failed to read label %d\n", l); 5069 label->read_failed = B_TRUE; 5070 error = B_TRUE; 5071 continue; 5072 } 5073 5074 label->read_failed = B_FALSE; 5075 label->cksum_valid = label_cksum_valid(&label->label, 5076 label->label_offset); 5077 5078 if (nvlist_unpack(buf, buflen, &config, 0) == 0) { 5079 nvlist_t *vdev_tree = NULL; 5080 size_t size; 5081 5082 if ((nvlist_lookup_nvlist(config, 5083 ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) || 5084 (nvlist_lookup_uint64(vdev_tree, 5085 ZPOOL_CONFIG_ASHIFT, &ashift) != 0)) 5086 ashift = SPA_MINBLOCKSHIFT; 5087 5088 if (nvlist_size(config, &size, NV_ENCODE_XDR) != 0) 5089 size = buflen; 5090 5091 /* If the device is a cache device, read the L2ARC header below. */ 5092 if (!read_l2arc_header) { 5093 if (nvlist_lookup_uint64(config, 5094 ZPOOL_CONFIG_POOL_STATE, &l2cache) == 0 && 5095 l2cache == POOL_STATE_L2CACHE) { 5096 read_l2arc_header = B_TRUE; 5097 } 5098 } 5099 5100 fletcher_4_native_varsize(buf, size, &cksum); 5101 rec = cksum_record_insert(&config_tree, &cksum, l); 5102 5103 label->config = rec; 5104 label->config_nv = config; 5105 config_found = B_TRUE; 5106 } else { 5107 error = B_TRUE; 5108 } 5109 5110 vd.vdev_ashift = ashift; 5111 vd.vdev_top = &vd; 5112 5113 for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) { 5114 uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i); 5115 uberblock_t *ub = (void *)((char *)label + uoff); 5116 5117 if (uberblock_verify(ub)) 5118 continue; 5119 5120 fletcher_4_native_varsize(ub, sizeof (*ub), &cksum); 5121 rec = cksum_record_insert(&uberblock_tree, &cksum, l); 5122 5123 label->uberblocks[i] = rec; 5124 } 5125 } 5126 5127 /* 5128 * Dump the label and uberblocks. 5129 */ 5130 for (int l = 0; l < VDEV_LABELS; l++) { 5131 zdb_label_t *label = &labels[l]; 5132 size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist); 5133 5134 if (label->read_failed == B_TRUE) 5135 continue; 5136 5137 if (label->config_nv) { 5138 dump_config_from_label(label, buflen, l); 5139 } else { 5140 if (!dump_opt['q']) 5141 (void) printf("failed to unpack label %d\n", l); 5142 } 5143 5144 if (dump_opt['u']) 5145 dump_label_uberblocks(label, ashift, l); 5146 5147 nvlist_free(label->config_nv); 5148 } 5149 5150 /* 5151 * Dump the L2ARC header, if present. 5152 */ 5153 if (read_l2arc_header) 5154 error |= dump_l2arc_header(fd); 5155 5156 cookie = NULL; 5157 while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL) 5158 umem_free(node, sizeof (cksum_record_t)); 5159 5160 cookie = NULL; 5161 while ((node = avl_destroy_nodes(&uberblock_tree, &cookie)) != NULL) 5162 umem_free(node, sizeof (cksum_record_t)); 5163 5164 avl_destroy(&config_tree); 5165 avl_destroy(&uberblock_tree); 5166 5167 (void) close(fd); 5168 5169 return (config_found == B_FALSE ? 2 : 5170 (error == B_TRUE ?
1 : 0)); 5171 } 5172 5173 static uint64_t dataset_feature_count[SPA_FEATURES]; 5174 static uint64_t global_feature_count[SPA_FEATURES]; 5175 static uint64_t remap_deadlist_count = 0; 5176 5177 static int 5178 dump_one_objset(const char *dsname, void *arg) 5179 { 5180 (void) arg; 5181 int error; 5182 objset_t *os; 5183 spa_feature_t f; 5184 5185 error = open_objset(dsname, FTAG, &os); 5186 if (error != 0) 5187 return (0); 5188 5189 for (f = 0; f < SPA_FEATURES; f++) { 5190 if (!dsl_dataset_feature_is_active(dmu_objset_ds(os), f)) 5191 continue; 5192 ASSERT(spa_feature_table[f].fi_flags & 5193 ZFEATURE_FLAG_PER_DATASET); 5194 dataset_feature_count[f]++; 5195 } 5196 5197 if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) { 5198 remap_deadlist_count++; 5199 } 5200 5201 for (dsl_bookmark_node_t *dbn = 5202 avl_first(&dmu_objset_ds(os)->ds_bookmarks); dbn != NULL; 5203 dbn = AVL_NEXT(&dmu_objset_ds(os)->ds_bookmarks, dbn)) { 5204 mos_obj_refd(dbn->dbn_phys.zbm_redaction_obj); 5205 if (dbn->dbn_phys.zbm_redaction_obj != 0) 5206 global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS]++; 5207 if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) 5208 global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++; 5209 } 5210 5211 if (dsl_deadlist_is_open(&dmu_objset_ds(os)->ds_dir->dd_livelist) && 5212 !dmu_objset_is_snapshot(os)) { 5213 global_feature_count[SPA_FEATURE_LIVELIST]++; 5214 } 5215 5216 dump_objset(os); 5217 close_objset(os, FTAG); 5218 fuid_table_destroy(); 5219 return (0); 5220 } 5221 5222 /* 5223 * Block statistics. 5224 */ 5225 #define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2) 5226 typedef struct zdb_blkstats { 5227 uint64_t zb_asize; 5228 uint64_t zb_lsize; 5229 uint64_t zb_psize; 5230 uint64_t zb_count; 5231 uint64_t zb_gangs; 5232 uint64_t zb_ditto_samevdev; 5233 uint64_t zb_ditto_same_ms; 5234 uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE]; 5235 } zdb_blkstats_t; 5236 5237 /* 5238 * Extended object types to report deferred frees and dedup auto-ditto blocks. 
5239 */ 5240 #define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0) 5241 #define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1) 5242 #define ZDB_OT_OTHER (DMU_OT_NUMTYPES + 2) 5243 #define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 3) 5244 5245 static const char *zdb_ot_extname[] = { 5246 "deferred free", 5247 "dedup ditto", 5248 "other", 5249 "Total", 5250 }; 5251 5252 #define ZB_TOTAL DN_MAX_LEVELS 5253 #define SPA_MAX_FOR_16M (SPA_MAXBLOCKSHIFT+1) 5254 5255 typedef struct zdb_cb { 5256 zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; 5257 uint64_t zcb_removing_size; 5258 uint64_t zcb_checkpoint_size; 5259 uint64_t zcb_dedup_asize; 5260 uint64_t zcb_dedup_blocks; 5261 uint64_t zcb_psize_count[SPA_MAX_FOR_16M]; 5262 uint64_t zcb_lsize_count[SPA_MAX_FOR_16M]; 5263 uint64_t zcb_asize_count[SPA_MAX_FOR_16M]; 5264 uint64_t zcb_psize_len[SPA_MAX_FOR_16M]; 5265 uint64_t zcb_lsize_len[SPA_MAX_FOR_16M]; 5266 uint64_t zcb_asize_len[SPA_MAX_FOR_16M]; 5267 uint64_t zcb_psize_total; 5268 uint64_t zcb_lsize_total; 5269 uint64_t zcb_asize_total; 5270 uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES]; 5271 uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES] 5272 [BPE_PAYLOAD_SIZE + 1]; 5273 uint64_t zcb_start; 5274 hrtime_t zcb_lastprint; 5275 uint64_t zcb_totalasize; 5276 uint64_t zcb_errors[256]; 5277 int zcb_readfails; 5278 int zcb_haderrors; 5279 spa_t *zcb_spa; 5280 uint32_t **zcb_vd_obsolete_counts; 5281 } zdb_cb_t; 5282 5283 /* test if two DVA offsets from same vdev are within the same metaslab */ 5284 static boolean_t 5285 same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2) 5286 { 5287 vdev_t *vd = vdev_lookup_top(spa, vdev); 5288 uint64_t ms_shift = vd->vdev_ms_shift; 5289 5290 return ((off1 >> ms_shift) == (off2 >> ms_shift)); 5291 } 5292 5293 /* 5294 * Used to simplify reporting of the histogram data. 5295 */ 5296 typedef struct one_histo { 5297 const char *name; 5298 uint64_t *count; 5299 uint64_t *len; 5300 uint64_t cumulative; 5301 } one_histo_t; 5302 5303 /* 5304 * The number of separate histograms processed for psize, lsize and asize. 5305 */ 5306 #define NUM_HISTO 3 5307 5308 /* 5309 * This routine prints three histograms in fixed-width columns: one row 5310 * per power-of-two block size from 512 up to 2^SPA_MAX_FOR_16M, showing 5311 * the count, length and cumulative length of the psize, lsize and 5312 * asize blocks. 5313 * 5314 * All three types of blocks are listed on a single line. 5315 * 5316 * By default the table is printed in nicenum format (e.g. 123K), but 5317 * if the '-P' option is specified the full raw (parseable) numbers 5318 * are printed instead. 5319 */ 5320 static void 5321 dump_size_histograms(zdb_cb_t *zcb) 5322 { 5323 /* 5324 * A temporary buffer that allows us to convert a number into 5325 * a string using zdb_nicenum(), so either raw or human-readable 5326 * numbers can be output. 5327 */ 5328 char numbuf[32]; 5329 5330 /* 5331 * Define titles which are used in the headers of the tables 5332 * printed by this routine. 5333 */ 5334 const char blocksize_title1[] = "block"; 5335 const char blocksize_title2[] = "size"; 5336 const char count_title[] = "Count"; 5337 const char length_title[] = "Size"; 5338 const char cumulative_title[] = "Cum."; 5339 5340 /* 5341 * Set up the histogram arrays (psize, lsize, and asize).
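 * Each one_histo_t below simply points at one of the per-power-of-two
 * count/length arrays accumulated in the zdb_cb_t by zdb_count_block(),
 * plus a running cumulative total computed while printing.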
5342 */ 5343 one_histo_t parm_histo[NUM_HISTO]; 5344 5345 parm_histo[0].name = "psize"; 5346 parm_histo[0].count = zcb->zcb_psize_count; 5347 parm_histo[0].len = zcb->zcb_psize_len; 5348 parm_histo[0].cumulative = 0; 5349 5350 parm_histo[1].name = "lsize"; 5351 parm_histo[1].count = zcb->zcb_lsize_count; 5352 parm_histo[1].len = zcb->zcb_lsize_len; 5353 parm_histo[1].cumulative = 0; 5354 5355 parm_histo[2].name = "asize"; 5356 parm_histo[2].count = zcb->zcb_asize_count; 5357 parm_histo[2].len = zcb->zcb_asize_len; 5358 parm_histo[2].cumulative = 0; 5359 5360 5361 (void) printf("\nBlock Size Histogram\n"); 5362 /* 5363 * Print the first line titles 5364 */ 5365 if (dump_opt['P']) 5366 (void) printf("\n%s\t", blocksize_title1); 5367 else 5368 (void) printf("\n%7s ", blocksize_title1); 5369 5370 for (int j = 0; j < NUM_HISTO; j++) { 5371 if (dump_opt['P']) { 5372 if (j < NUM_HISTO - 1) { 5373 (void) printf("%s\t\t\t", parm_histo[j].name); 5374 } else { 5375 /* Don't print trailing spaces */ 5376 (void) printf(" %s", parm_histo[j].name); 5377 } 5378 } else { 5379 if (j < NUM_HISTO - 1) { 5380 /* Left aligned strings in the output */ 5381 (void) printf("%-7s ", 5382 parm_histo[j].name); 5383 } else { 5384 /* Don't print trailing spaces */ 5385 (void) printf("%s", parm_histo[j].name); 5386 } 5387 } 5388 } 5389 (void) printf("\n"); 5390 5391 /* 5392 * Print the second line titles 5393 */ 5394 if (dump_opt['P']) { 5395 (void) printf("%s\t", blocksize_title2); 5396 } else { 5397 (void) printf("%7s ", blocksize_title2); 5398 } 5399 5400 for (int i = 0; i < NUM_HISTO; i++) { 5401 if (dump_opt['P']) { 5402 (void) printf("%s\t%s\t%s\t", 5403 count_title, length_title, cumulative_title); 5404 } else { 5405 (void) printf("%7s%7s%7s", 5406 count_title, length_title, cumulative_title); 5407 } 5408 } 5409 (void) printf("\n"); 5410 5411 /* 5412 * Print the rows 5413 */ 5414 for (int i = SPA_MINBLOCKSHIFT; i < SPA_MAX_FOR_16M; i++) { 5415 5416 /* 5417 * Print the first column showing the blocksize 5418 */ 5419 zdb_nicenum((1ULL << i), numbuf, sizeof (numbuf)); 5420 5421 if (dump_opt['P']) { 5422 printf("%s", numbuf); 5423 } else { 5424 printf("%7s:", numbuf); 5425 } 5426 5427 /* 5428 * Print the remaining set of 3 columns per size: 5429 * for psize, lsize and asize 5430 */ 5431 for (int j = 0; j < NUM_HISTO; j++) { 5432 parm_histo[j].cumulative += parm_histo[j].len[i]; 5433 5434 zdb_nicenum(parm_histo[j].count[i], 5435 numbuf, sizeof (numbuf)); 5436 if (dump_opt['P']) 5437 (void) printf("\t%s", numbuf); 5438 else 5439 (void) printf("%7s", numbuf); 5440 5441 zdb_nicenum(parm_histo[j].len[i], 5442 numbuf, sizeof (numbuf)); 5443 if (dump_opt['P']) 5444 (void) printf("\t%s", numbuf); 5445 else 5446 (void) printf("%7s", numbuf); 5447 5448 zdb_nicenum(parm_histo[j].cumulative, 5449 numbuf, sizeof (numbuf)); 5450 if (dump_opt['P']) 5451 (void) printf("\t%s", numbuf); 5452 else 5453 (void) printf("%7s", numbuf); 5454 } 5455 (void) printf("\n"); 5456 } 5457 } 5458 5459 static void 5460 zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, 5461 dmu_object_type_t type) 5462 { 5463 uint64_t refcnt = 0; 5464 int i; 5465 5466 ASSERT(type < ZDB_OT_TOTAL); 5467 5468 if (zilog && zil_bp_tree_add(zilog, bp) != 0) 5469 return; 5470 5471 spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER); 5472 5473 for (i = 0; i < 4; i++) { 5474 int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL; 5475 int t = (i & 1) ? 
type : ZDB_OT_TOTAL; 5476 int equal; 5477 zdb_blkstats_t *zb = &zcb->zcb_type[l][t]; 5478 5479 zb->zb_asize += BP_GET_ASIZE(bp); 5480 zb->zb_lsize += BP_GET_LSIZE(bp); 5481 zb->zb_psize += BP_GET_PSIZE(bp); 5482 zb->zb_count++; 5483 5484 /* 5485 * The histogram is only big enough to record blocks up to 5486 * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last, 5487 * "other", bucket. 5488 */ 5489 unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT; 5490 idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1); 5491 zb->zb_psize_histogram[idx]++; 5492 5493 zb->zb_gangs += BP_COUNT_GANG(bp); 5494 5495 switch (BP_GET_NDVAS(bp)) { 5496 case 2: 5497 if (DVA_GET_VDEV(&bp->blk_dva[0]) == 5498 DVA_GET_VDEV(&bp->blk_dva[1])) { 5499 zb->zb_ditto_samevdev++; 5500 5501 if (same_metaslab(zcb->zcb_spa, 5502 DVA_GET_VDEV(&bp->blk_dva[0]), 5503 DVA_GET_OFFSET(&bp->blk_dva[0]), 5504 DVA_GET_OFFSET(&bp->blk_dva[1]))) 5505 zb->zb_ditto_same_ms++; 5506 } 5507 break; 5508 case 3: 5509 equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == 5510 DVA_GET_VDEV(&bp->blk_dva[1])) + 5511 (DVA_GET_VDEV(&bp->blk_dva[0]) == 5512 DVA_GET_VDEV(&bp->blk_dva[2])) + 5513 (DVA_GET_VDEV(&bp->blk_dva[1]) == 5514 DVA_GET_VDEV(&bp->blk_dva[2])); 5515 if (equal != 0) { 5516 zb->zb_ditto_samevdev++; 5517 5518 if (DVA_GET_VDEV(&bp->blk_dva[0]) == 5519 DVA_GET_VDEV(&bp->blk_dva[1]) && 5520 same_metaslab(zcb->zcb_spa, 5521 DVA_GET_VDEV(&bp->blk_dva[0]), 5522 DVA_GET_OFFSET(&bp->blk_dva[0]), 5523 DVA_GET_OFFSET(&bp->blk_dva[1]))) 5524 zb->zb_ditto_same_ms++; 5525 else if (DVA_GET_VDEV(&bp->blk_dva[0]) == 5526 DVA_GET_VDEV(&bp->blk_dva[2]) && 5527 same_metaslab(zcb->zcb_spa, 5528 DVA_GET_VDEV(&bp->blk_dva[0]), 5529 DVA_GET_OFFSET(&bp->blk_dva[0]), 5530 DVA_GET_OFFSET(&bp->blk_dva[2]))) 5531 zb->zb_ditto_same_ms++; 5532 else if (DVA_GET_VDEV(&bp->blk_dva[1]) == 5533 DVA_GET_VDEV(&bp->blk_dva[2]) && 5534 same_metaslab(zcb->zcb_spa, 5535 DVA_GET_VDEV(&bp->blk_dva[1]), 5536 DVA_GET_OFFSET(&bp->blk_dva[1]), 5537 DVA_GET_OFFSET(&bp->blk_dva[2]))) 5538 zb->zb_ditto_same_ms++; 5539 } 5540 break; 5541 } 5542 } 5543 5544 spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG); 5545 5546 if (BP_IS_EMBEDDED(bp)) { 5547 zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++; 5548 zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)] 5549 [BPE_GET_PSIZE(bp)]++; 5550 return; 5551 } 5552 /* 5553 * The binning histogram bins by powers of two up to 5554 * SPA_MAXBLOCKSIZE rather than creating bins for 5555 * every possible blocksize found in the pool. 
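 *
 * For example, a block with a 20 KiB (20480-byte) psize has
 * highbit64(20480) == 15, so it is counted in bin 14, the 16K
 * bucket.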
5556 */ 5557 int bin = highbit64(BP_GET_PSIZE(bp)) - 1; 5558 5559 zcb->zcb_psize_count[bin]++; 5560 zcb->zcb_psize_len[bin] += BP_GET_PSIZE(bp); 5561 zcb->zcb_psize_total += BP_GET_PSIZE(bp); 5562 5563 bin = highbit64(BP_GET_LSIZE(bp)) - 1; 5564 5565 zcb->zcb_lsize_count[bin]++; 5566 zcb->zcb_lsize_len[bin] += BP_GET_LSIZE(bp); 5567 zcb->zcb_lsize_total += BP_GET_LSIZE(bp); 5568 5569 bin = highbit64(BP_GET_ASIZE(bp)) - 1; 5570 5571 zcb->zcb_asize_count[bin]++; 5572 zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp); 5573 zcb->zcb_asize_total += BP_GET_ASIZE(bp); 5574 5575 if (dump_opt['L']) 5576 return; 5577 5578 if (BP_GET_DEDUP(bp)) { 5579 ddt_t *ddt; 5580 ddt_entry_t *dde; 5581 5582 ddt = ddt_select(zcb->zcb_spa, bp); 5583 ddt_enter(ddt); 5584 dde = ddt_lookup(ddt, bp, B_FALSE); 5585 5586 if (dde == NULL) { 5587 refcnt = 0; 5588 } else { 5589 ddt_phys_t *ddp = ddt_phys_select(dde, bp); 5590 ddt_phys_decref(ddp); 5591 refcnt = ddp->ddp_refcnt; 5592 if (ddt_phys_total_refcnt(dde) == 0) 5593 ddt_remove(ddt, dde); 5594 } 5595 ddt_exit(ddt); 5596 } 5597 5598 VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa, 5599 refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa), 5600 bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0); 5601 } 5602 5603 static void 5604 zdb_blkptr_done(zio_t *zio) 5605 { 5606 spa_t *spa = zio->io_spa; 5607 blkptr_t *bp = zio->io_bp; 5608 int ioerr = zio->io_error; 5609 zdb_cb_t *zcb = zio->io_private; 5610 zbookmark_phys_t *zb = &zio->io_bookmark; 5611 5612 mutex_enter(&spa->spa_scrub_lock); 5613 spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); 5614 cv_broadcast(&spa->spa_scrub_io_cv); 5615 5616 if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 5617 char blkbuf[BP_SPRINTF_LEN]; 5618 5619 zcb->zcb_haderrors = 1; 5620 zcb->zcb_errors[ioerr]++; 5621 5622 if (dump_opt['b'] >= 2) 5623 snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); 5624 else 5625 blkbuf[0] = '\0'; 5626 5627 (void) printf("zdb_blkptr_cb: " 5628 "Got error %d reading " 5629 "<%llu, %llu, %lld, %llx> %s -- skipping\n", 5630 ioerr, 5631 (u_longlong_t)zb->zb_objset, 5632 (u_longlong_t)zb->zb_object, 5633 (u_longlong_t)zb->zb_level, 5634 (u_longlong_t)zb->zb_blkid, 5635 blkbuf); 5636 } 5637 mutex_exit(&spa->spa_scrub_lock); 5638 5639 abd_free(zio->io_abd); 5640 } 5641 5642 static int 5643 zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 5644 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 5645 { 5646 zdb_cb_t *zcb = arg; 5647 dmu_object_type_t type; 5648 boolean_t is_metadata; 5649 5650 if (zb->zb_level == ZB_DNODE_LEVEL) 5651 return (0); 5652 5653 if (dump_opt['b'] >= 5 && bp->blk_birth > 0) { 5654 char blkbuf[BP_SPRINTF_LEN]; 5655 snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); 5656 (void) printf("objset %llu object %llu " 5657 "level %lld offset 0x%llx %s\n", 5658 (u_longlong_t)zb->zb_objset, 5659 (u_longlong_t)zb->zb_object, 5660 (longlong_t)zb->zb_level, 5661 (u_longlong_t)blkid2offset(dnp, bp, zb), 5662 blkbuf); 5663 } 5664 5665 if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) 5666 return (0); 5667 5668 type = BP_GET_TYPE(bp); 5669 5670 zdb_count_block(zcb, zilog, bp, 5671 (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type); 5672 5673 is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)); 5674 5675 if (!BP_IS_EMBEDDED(bp) && 5676 (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) { 5677 size_t size = BP_GET_PSIZE(bp); 5678 abd_t *abd = abd_alloc(size, B_FALSE); 5679 int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; 5680 5681 /* If it's an intent log block, failure is expected. 
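 * Such a read is issued ZIO_FLAG_SPECULATIVE below, and
 * zdb_blkptr_done() skips error accounting for speculative I/Os.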
*/ 5682 if (zb->zb_level == ZB_ZIL_LEVEL) 5683 flags |= ZIO_FLAG_SPECULATIVE; 5684 5685 mutex_enter(&spa->spa_scrub_lock); 5686 while (spa->spa_load_verify_bytes > max_inflight_bytes) 5687 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 5688 spa->spa_load_verify_bytes += size; 5689 mutex_exit(&spa->spa_scrub_lock); 5690 5691 zio_nowait(zio_read(NULL, spa, bp, abd, size, 5692 zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb)); 5693 } 5694 5695 zcb->zcb_readfails = 0; 5696 5697 /* only call gethrtime() every 100 blocks */ 5698 static int iters; 5699 if (++iters > 100) 5700 iters = 0; 5701 else 5702 return (0); 5703 5704 if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) { 5705 uint64_t now = gethrtime(); 5706 char buf[10]; 5707 uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize; 5708 uint64_t kb_per_sec = 5709 1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000)); 5710 uint64_t sec_remaining = 5711 (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec; 5712 5713 /* make sure nicenum has enough space */ 5714 _Static_assert(sizeof (buf) >= NN_NUMBUF_SZ, "buf truncated"); 5715 5716 zfs_nicebytes(bytes, buf, sizeof (buf)); 5717 (void) fprintf(stderr, 5718 "\r%5s completed (%4"PRIu64"MB/s) " 5719 "estimated time remaining: " 5720 "%"PRIu64"hr %02"PRIu64"min %02"PRIu64"sec ", 5721 buf, kb_per_sec / 1024, 5722 sec_remaining / 60 / 60, 5723 sec_remaining / 60 % 60, 5724 sec_remaining % 60); 5725 5726 zcb->zcb_lastprint = now; 5727 } 5728 5729 return (0); 5730 } 5731 5732 static void 5733 zdb_leak(void *arg, uint64_t start, uint64_t size) 5734 { 5735 vdev_t *vd = arg; 5736 5737 (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n", 5738 (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size); 5739 } 5740 5741 static metaslab_ops_t zdb_metaslab_ops = { 5742 NULL /* alloc */ 5743 }; 5744 5745 static int 5746 load_unflushed_svr_segs_cb(spa_t *spa, space_map_entry_t *sme, 5747 uint64_t txg, void *arg) 5748 { 5749 spa_vdev_removal_t *svr = arg; 5750 5751 uint64_t offset = sme->sme_offset; 5752 uint64_t size = sme->sme_run; 5753 5754 /* skip vdevs we don't care about */ 5755 if (sme->sme_vdev != svr->svr_vdev_id) 5756 return (0); 5757 5758 vdev_t *vd = vdev_lookup_top(spa, sme->sme_vdev); 5759 metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 5760 ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); 5761 5762 if (txg < metaslab_unflushed_txg(ms)) 5763 return (0); 5764 5765 if (sme->sme_type == SM_ALLOC) 5766 range_tree_add(svr->svr_allocd_segs, offset, size); 5767 else 5768 range_tree_remove(svr->svr_allocd_segs, offset, size); 5769 5770 return (0); 5771 } 5772 5773 static void 5774 claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 5775 uint64_t size, void *arg) 5776 { 5777 (void) inner_offset, (void) arg; 5778 5779 /* 5780 * This callback was called through a remap from 5781 * a device being removed. Therefore, the vdev that 5782 * this callback is applied to is a concrete 5783 * vdev. 
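* That makes it safe to claim the segment directly with
* metaslab_claim_impl() below.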
5784 */ 5785 ASSERT(vdev_is_concrete(vd)); 5786 5787 VERIFY0(metaslab_claim_impl(vd, offset, size, 5788 spa_min_claim_txg(vd->vdev_spa))); 5789 } 5790 5791 static void 5792 claim_segment_cb(void *arg, uint64_t offset, uint64_t size) 5793 { 5794 vdev_t *vd = arg; 5795 5796 vdev_indirect_ops.vdev_op_remap(vd, offset, size, 5797 claim_segment_impl_cb, NULL); 5798 } 5799 5800 /* 5801 * After accounting for all allocated blocks that are directly referenced, 5802 * we might have missed a reference to a block from a partially complete 5803 * (and thus unused) indirect mapping object. We perform a secondary pass 5804 * through the metaslabs we have already mapped and claim the destination 5805 * blocks. 5806 */ 5807 static void 5808 zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) 5809 { 5810 if (dump_opt['L']) 5811 return; 5812 5813 if (spa->spa_vdev_removal == NULL) 5814 return; 5815 5816 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5817 5818 spa_vdev_removal_t *svr = spa->spa_vdev_removal; 5819 vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); 5820 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 5821 5822 ASSERT0(range_tree_space(svr->svr_allocd_segs)); 5823 5824 range_tree_t *allocs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); 5825 for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { 5826 metaslab_t *msp = vd->vdev_ms[msi]; 5827 5828 ASSERT0(range_tree_space(allocs)); 5829 if (msp->ms_sm != NULL) 5830 VERIFY0(space_map_load(msp->ms_sm, allocs, SM_ALLOC)); 5831 range_tree_vacate(allocs, range_tree_add, svr->svr_allocd_segs); 5832 } 5833 range_tree_destroy(allocs); 5834 5835 iterate_through_spacemap_logs(spa, load_unflushed_svr_segs_cb, svr); 5836 5837 /* 5838 * Clear everything past what has been synced, 5839 * because we have not allocated mappings for 5840 * it yet. 
5841 */ 5842 range_tree_clear(svr->svr_allocd_segs, 5843 vdev_indirect_mapping_max_offset(vim), 5844 vd->vdev_asize - vdev_indirect_mapping_max_offset(vim)); 5845 5846 zcb->zcb_removing_size += range_tree_space(svr->svr_allocd_segs); 5847 range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd); 5848 5849 spa_config_exit(spa, SCL_CONFIG, FTAG); 5850 } 5851 5852 static int 5853 increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 5854 dmu_tx_t *tx) 5855 { 5856 (void) tx; 5857 zdb_cb_t *zcb = arg; 5858 spa_t *spa = zcb->zcb_spa; 5859 vdev_t *vd; 5860 const dva_t *dva = &bp->blk_dva[0]; 5861 5862 ASSERT(!bp_freed); 5863 ASSERT(!dump_opt['L']); 5864 ASSERT3U(BP_GET_NDVAS(bp), ==, 1); 5865 5866 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 5867 vd = vdev_lookup_top(zcb->zcb_spa, DVA_GET_VDEV(dva)); 5868 ASSERT3P(vd, !=, NULL); 5869 spa_config_exit(spa, SCL_VDEV, FTAG); 5870 5871 ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); 5872 ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL); 5873 5874 vdev_indirect_mapping_increment_obsolete_count( 5875 vd->vdev_indirect_mapping, 5876 DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva), 5877 zcb->zcb_vd_obsolete_counts[vd->vdev_id]); 5878 5879 return (0); 5880 } 5881 5882 static uint32_t * 5883 zdb_load_obsolete_counts(vdev_t *vd) 5884 { 5885 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 5886 spa_t *spa = vd->vdev_spa; 5887 spa_condensing_indirect_phys_t *scip = 5888 &spa->spa_condensing_indirect_phys; 5889 uint64_t obsolete_sm_object; 5890 uint32_t *counts; 5891 5892 VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); 5893 EQUIV(obsolete_sm_object != 0, vd->vdev_obsolete_sm != NULL); 5894 counts = vdev_indirect_mapping_load_obsolete_counts(vim); 5895 if (vd->vdev_obsolete_sm != NULL) { 5896 vdev_indirect_mapping_load_obsolete_spacemap(vim, counts, 5897 vd->vdev_obsolete_sm); 5898 } 5899 if (scip->scip_vdev == vd->vdev_id && 5900 scip->scip_prev_obsolete_sm_object != 0) { 5901 space_map_t *prev_obsolete_sm = NULL; 5902 VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, 5903 scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); 5904 vdev_indirect_mapping_load_obsolete_spacemap(vim, counts, 5905 prev_obsolete_sm); 5906 space_map_close(prev_obsolete_sm); 5907 } 5908 return (counts); 5909 } 5910 5911 static void 5912 zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) 5913 { 5914 ddt_bookmark_t ddb = {0}; 5915 ddt_entry_t dde; 5916 int error; 5917 int p; 5918 5919 ASSERT(!dump_opt['L']); 5920 5921 while ((error = ddt_walk(spa, &ddb, &dde)) == 0) { 5922 blkptr_t blk; 5923 ddt_phys_t *ddp = dde.dde_phys; 5924 5925 if (ddb.ddb_class == DDT_CLASS_UNIQUE) 5926 return; 5927 5928 ASSERT(ddt_phys_total_refcnt(&dde) > 1); 5929 5930 for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 5931 if (ddp->ddp_phys_birth == 0) 5932 continue; 5933 ddt_bp_create(ddb.ddb_checksum, 5934 &dde.dde_key, ddp, &blk); 5935 if (p == DDT_PHYS_DITTO) { 5936 zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO); 5937 } else { 5938 zcb->zcb_dedup_asize += 5939 BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1); 5940 zcb->zcb_dedup_blocks++; 5941 } 5942 } 5943 ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; 5944 ddt_enter(ddt); 5945 VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL); 5946 ddt_exit(ddt); 5947 } 5948 5949 ASSERT(error == ENOENT); 5950 } 5951 5952 typedef struct checkpoint_sm_exclude_entry_arg { 5953 vdev_t *cseea_vd; 5954 uint64_t cseea_checkpoint_size; 5955 } checkpoint_sm_exclude_entry_arg_t; 5956 5957 static int 5958 
checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg) 5959 { 5960 checkpoint_sm_exclude_entry_arg_t *cseea = arg; 5961 vdev_t *vd = cseea->cseea_vd; 5962 metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; 5963 uint64_t end = sme->sme_offset + sme->sme_run; 5964 5965 ASSERT(sme->sme_type == SM_FREE); 5966 5967 /* 5968 * Since the vdev_checkpoint_sm exists at the vdev level 5969 * and the ms_sm space maps exist at the metaslab level, 5970 * an entry in the checkpoint space map could theoretically 5971 * cross the boundaries of the metaslab to which it belongs. 5972 * 5973 * In reality, because of the way that we populate and 5974 * manipulate the checkpoint's space maps currently, 5975 * there shouldn't be any entries that cross metaslabs. 5976 * Hence the assertion below. 5977 * 5978 * That said, there is no fundamental requirement that 5979 * the checkpoint's space map entries should not cross 5980 * metaslab boundaries. So if needed we could add code 5981 * that handles metaslab-crossing segments in the future. 5982 */ 5983 VERIFY3U(sme->sme_offset, >=, ms->ms_start); 5984 VERIFY3U(end, <=, ms->ms_start + ms->ms_size); 5985 5986 /* 5987 * By removing the entry from the allocated segments we 5988 * also verify that the entry is there to begin with. 5989 */ 5990 mutex_enter(&ms->ms_lock); 5991 range_tree_remove(ms->ms_allocatable, sme->sme_offset, sme->sme_run); 5992 mutex_exit(&ms->ms_lock); 5993 5994 cseea->cseea_checkpoint_size += sme->sme_run; 5995 return (0); 5996 } 5997 5998 static void 5999 zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb) 6000 { 6001 spa_t *spa = vd->vdev_spa; 6002 space_map_t *checkpoint_sm = NULL; 6003 uint64_t checkpoint_sm_obj; 6004 6005 /* 6006 * If there is no vdev_top_zap, we are in a pool whose 6007 * version predates the pool checkpoint feature. 6008 */ 6009 if (vd->vdev_top_zap == 0) 6010 return; 6011 6012 /* 6013 * If there is no reference to the vdev_checkpoint_sm in 6014 * the vdev_top_zap, then one of the following scenarios 6015 * is true: 6016 * 6017 * 1] There is no checkpoint 6018 * 2] There is a checkpoint, but no checkpointed blocks 6019 * have been freed yet 6020 * 3] The current vdev is indirect 6021 * 6022 * In these cases we return immediately.
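* (All three cases manifest as a missing VDEV_TOP_ZAP_POOL_CHECKPOINT_SM
* entry, so the single zap_contains() check below covers them.)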
6023 */ 6024 if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap, 6025 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) 6026 return; 6027 6028 VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap, 6029 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, 6030 &checkpoint_sm_obj)); 6031 6032 checkpoint_sm_exclude_entry_arg_t cseea; 6033 cseea.cseea_vd = vd; 6034 cseea.cseea_checkpoint_size = 0; 6035 6036 VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), 6037 checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); 6038 6039 VERIFY0(space_map_iterate(checkpoint_sm, 6040 space_map_length(checkpoint_sm), 6041 checkpoint_sm_exclude_entry_cb, &cseea)); 6042 space_map_close(checkpoint_sm); 6043 6044 zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size; 6045 } 6046 6047 static void 6048 zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb) 6049 { 6050 ASSERT(!dump_opt['L']); 6051 6052 vdev_t *rvd = spa->spa_root_vdev; 6053 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 6054 ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id); 6055 zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb); 6056 } 6057 } 6058 6059 static int 6060 count_unflushed_space_cb(spa_t *spa, space_map_entry_t *sme, 6061 uint64_t txg, void *arg) 6062 { 6063 int64_t *ualloc_space = arg; 6064 6065 uint64_t offset = sme->sme_offset; 6066 uint64_t vdev_id = sme->sme_vdev; 6067 6068 vdev_t *vd = vdev_lookup_top(spa, vdev_id); 6069 if (!vdev_is_concrete(vd)) 6070 return (0); 6071 6072 metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 6073 ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); 6074 6075 if (txg < metaslab_unflushed_txg(ms)) 6076 return (0); 6077 6078 if (sme->sme_type == SM_ALLOC) 6079 *ualloc_space += sme->sme_run; 6080 else 6081 *ualloc_space -= sme->sme_run; 6082 6083 return (0); 6084 } 6085 6086 static int64_t 6087 get_unflushed_alloc_space(spa_t *spa) 6088 { 6089 if (dump_opt['L']) 6090 return (0); 6091 6092 int64_t ualloc_space = 0; 6093 iterate_through_spacemap_logs(spa, count_unflushed_space_cb, 6094 &ualloc_space); 6095 return (ualloc_space); 6096 } 6097 6098 static int 6099 load_unflushed_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) 6100 { 6101 maptype_t *uic_maptype = arg; 6102 6103 uint64_t offset = sme->sme_offset; 6104 uint64_t size = sme->sme_run; 6105 uint64_t vdev_id = sme->sme_vdev; 6106 6107 vdev_t *vd = vdev_lookup_top(spa, vdev_id); 6108 6109 /* skip indirect vdevs */ 6110 if (!vdev_is_concrete(vd)) 6111 return (0); 6112 6113 metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 6114 6115 ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); 6116 ASSERT(*uic_maptype == SM_ALLOC || *uic_maptype == SM_FREE); 6117 6118 if (txg < metaslab_unflushed_txg(ms)) 6119 return (0); 6120 6121 if (*uic_maptype == sme->sme_type) 6122 range_tree_add(ms->ms_allocatable, offset, size); 6123 else 6124 range_tree_remove(ms->ms_allocatable, offset, size); 6125 6126 return (0); 6127 } 6128 6129 static void 6130 load_unflushed_to_ms_allocatables(spa_t *spa, maptype_t maptype) 6131 { 6132 iterate_through_spacemap_logs(spa, load_unflushed_cb, &maptype); 6133 } 6134 6135 static void 6136 load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype) 6137 { 6138 vdev_t *rvd = spa->spa_root_vdev; 6139 for (uint64_t i = 0; i < rvd->vdev_children; i++) { 6140 vdev_t *vd = rvd->vdev_child[i]; 6141 6142 ASSERT3U(i, ==, vd->vdev_id); 6143 6144 if (vd->vdev_ops == &vdev_indirect_ops) 6145 continue; 6146 6147 for (uint64_t m = 0; m < vd->vdev_ms_count; 
m++) { 6148 metaslab_t *msp = vd->vdev_ms[m]; 6149 6150 (void) fprintf(stderr, 6151 "\rloading concrete vdev %llu, " 6152 "metaslab %llu of %llu ...", 6153 (longlong_t)vd->vdev_id, 6154 (longlong_t)msp->ms_id, 6155 (longlong_t)vd->vdev_ms_count); 6156 6157 mutex_enter(&msp->ms_lock); 6158 range_tree_vacate(msp->ms_allocatable, NULL, NULL); 6159 6160 /* 6161 * We don't want to spend the CPU manipulating the 6162 * size-ordered tree, so clear the range_tree ops. 6163 */ 6164 msp->ms_allocatable->rt_ops = NULL; 6165 6166 if (msp->ms_sm != NULL) { 6167 VERIFY0(space_map_load(msp->ms_sm, 6168 msp->ms_allocatable, maptype)); 6169 } 6170 if (!msp->ms_loaded) 6171 msp->ms_loaded = B_TRUE; 6172 mutex_exit(&msp->ms_lock); 6173 } 6174 } 6175 6176 load_unflushed_to_ms_allocatables(spa, maptype); 6177 } 6178 6179 /* 6180 * vim_idxp is an in-out parameter which (for indirect vdevs) is the 6181 * index in vim_entries that has the first entry in this metaslab. 6182 * On return, it will be set to the first entry after this metaslab. 6183 */ 6184 static void 6185 load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp, 6186 uint64_t *vim_idxp) 6187 { 6188 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 6189 6190 mutex_enter(&msp->ms_lock); 6191 range_tree_vacate(msp->ms_allocatable, NULL, NULL); 6192 6193 /* 6194 * We don't want to spend the CPU manipulating the 6195 * size-ordered tree, so clear the range_tree ops. 6196 */ 6197 msp->ms_allocatable->rt_ops = NULL; 6198 6199 for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim); 6200 (*vim_idxp)++) { 6201 vdev_indirect_mapping_entry_phys_t *vimep = 6202 &vim->vim_entries[*vim_idxp]; 6203 uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); 6204 uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst); 6205 ASSERT3U(ent_offset, >=, msp->ms_start); 6206 if (ent_offset >= msp->ms_start + msp->ms_size) 6207 break; 6208 6209 /* 6210 * Mappings do not cross metaslab boundaries, 6211 * because we create them by walking the metaslabs. 6212 */ 6213 ASSERT3U(ent_offset + ent_len, <=, 6214 msp->ms_start + msp->ms_size); 6215 range_tree_add(msp->ms_allocatable, ent_offset, ent_len); 6216 } 6217 6218 if (!msp->ms_loaded) 6219 msp->ms_loaded = B_TRUE; 6220 mutex_exit(&msp->ms_lock); 6221 } 6222 6223 static void 6224 zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb) 6225 { 6226 ASSERT(!dump_opt['L']); 6227 6228 vdev_t *rvd = spa->spa_root_vdev; 6229 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 6230 vdev_t *vd = rvd->vdev_child[c]; 6231 6232 ASSERT3U(c, ==, vd->vdev_id); 6233 6234 if (vd->vdev_ops != &vdev_indirect_ops) 6235 continue; 6236 6237 /* 6238 * Note: we don't check for mapping leaks on 6239 * removing vdevs because their ms_allocatable trees 6240 * are used to look for leaks in allocated space. 6241 */ 6242 zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd); 6243 6244 /* 6245 * Normally, indirect vdevs don't have any 6246 * metaslabs. We want to set them up for 6247 * zio_claim().
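* Their ms_allocatable trees are then populated from the indirect
* mapping by load_indirect_ms_allocatable_tree() below;
* zdb_check_for_obsolete_leaks() later consults those trees.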
6248 */ 6249 vdev_metaslab_group_create(vd); 6250 VERIFY0(vdev_metaslab_init(vd, 0)); 6251 6252 vdev_indirect_mapping_t *vim __maybe_unused = 6253 vd->vdev_indirect_mapping; 6254 uint64_t vim_idx = 0; 6255 for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { 6256 6257 (void) fprintf(stderr, 6258 "\rloading indirect vdev %llu, " 6259 "metaslab %llu of %llu ...", 6260 (longlong_t)vd->vdev_id, 6261 (longlong_t)vd->vdev_ms[m]->ms_id, 6262 (longlong_t)vd->vdev_ms_count); 6263 6264 load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m], 6265 &vim_idx); 6266 } 6267 ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim)); 6268 } 6269 } 6270 6271 static void 6272 zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) 6273 { 6274 zcb->zcb_spa = spa; 6275 6276 if (dump_opt['L']) 6277 return; 6278 6279 dsl_pool_t *dp = spa->spa_dsl_pool; 6280 vdev_t *rvd = spa->spa_root_vdev; 6281 6282 /* 6283 * We are going to be changing the meaning of the metaslab's 6284 * ms_allocatable. Ensure that the allocator doesn't try to 6285 * use the tree. 6286 */ 6287 spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; 6288 spa->spa_log_class->mc_ops = &zdb_metaslab_ops; 6289 spa->spa_embedded_log_class->mc_ops = &zdb_metaslab_ops; 6290 6291 zcb->zcb_vd_obsolete_counts = 6292 umem_zalloc(rvd->vdev_children * sizeof (uint32_t *), 6293 UMEM_NOFAIL); 6294 6295 /* 6296 * For leak detection, we overload the ms_allocatable trees 6297 * to contain allocated segments instead of free segments. 6298 * As a result, we can't use the normal metaslab_load/unload 6299 * interfaces. 6300 */ 6301 zdb_leak_init_prepare_indirect_vdevs(spa, zcb); 6302 load_concrete_ms_allocatable_trees(spa, SM_ALLOC); 6303 6304 /* 6305 * On load_concrete_ms_allocatable_trees() we loaded all the 6306 * allocated entries from the ms_sm to the ms_allocatable for 6307 * each metaslab. If the pool has a checkpoint or is in the 6308 * middle of discarding a checkpoint, some of these blocks 6309 * may have been freed but their ms_sm may not have been 6310 * updated because they are referenced by the checkpoint. In 6311 * order to avoid false-positives during leak-detection, we 6312 * go through the vdev's checkpoint space map and exclude all 6313 * its entries from their relevant ms_allocatable. 6314 * 6315 * We also aggregate the space held by the checkpoint and add 6316 * it to zcb_checkpoint_size. 6317 * 6318 * Note that at this point we are also verifying that all the 6319 * entries on the checkpoint_sm are marked as allocated in 6320 * the ms_sm of their relevant metaslab. 
* [see comment in checkpoint_sm_exclude_entry_cb()] 6322 */ 6323 zdb_leak_init_exclude_checkpoint(spa, zcb); 6324 ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa)); 6325 6326 /* for cleaner progress output */ 6327 (void) fprintf(stderr, "\n"); 6328 6329 if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { 6330 ASSERT(spa_feature_is_enabled(spa, 6331 SPA_FEATURE_DEVICE_REMOVAL)); 6332 (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj, 6333 increment_indirect_mapping_cb, zcb, NULL); 6334 } 6335 6336 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6337 zdb_ddt_leak_init(spa, zcb); 6338 spa_config_exit(spa, SCL_CONFIG, FTAG); 6339 } 6340 6341 static boolean_t 6342 zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb) 6343 { 6344 boolean_t leaks = B_FALSE; 6345 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 6346 uint64_t total_leaked = 0; 6347 boolean_t are_precise = B_FALSE; 6348 6349 ASSERT(vim != NULL); 6350 6351 for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) { 6352 vdev_indirect_mapping_entry_phys_t *vimep = 6353 &vim->vim_entries[i]; 6354 uint64_t obsolete_bytes = 0; 6355 uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); 6356 metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 6357 6358 /* 6359 * This is not very efficient but it's easy to 6360 * verify correctness. 6361 */ 6362 for (uint64_t inner_offset = 0; 6363 inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst); 6364 inner_offset += 1ULL << vd->vdev_ashift) { 6365 if (range_tree_contains(msp->ms_allocatable, 6366 offset + inner_offset, 1ULL << vd->vdev_ashift)) { 6367 obsolete_bytes += 1ULL << vd->vdev_ashift; 6368 } 6369 } 6370 6371 int64_t bytes_leaked = obsolete_bytes - 6372 zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]; 6373 ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=, 6374 zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]); 6375 6376 VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); 6377 if (bytes_leaked != 0 && (are_precise || dump_opt['d'] >= 5)) { 6378 (void) printf("obsolete indirect mapping count " 6379 "mismatch on %llu:%llx:%llx : %llx bytes leaked\n", 6380 (u_longlong_t)vd->vdev_id, 6381 (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep), 6382 (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), 6383 (u_longlong_t)bytes_leaked); 6384 } 6385 total_leaked += ABS(bytes_leaked); 6386 } 6387 6388 VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); 6389 if (!are_precise && total_leaked > 0) { 6390 int pct_leaked = total_leaked * 100 / 6391 vdev_indirect_mapping_bytes_mapped(vim); 6392 (void) printf("cannot verify obsolete indirect mapping " 6393 "counts of vdev %llu because the precise feature was not " 6394 "enabled when it was removed: %d%% (%llx bytes) of mapping " 6395 "unreferenced\n", 6396 (u_longlong_t)vd->vdev_id, pct_leaked, 6397 (u_longlong_t)total_leaked); 6398 } else if (total_leaked > 0) { 6399 (void) printf("obsolete indirect mapping count mismatch " 6400 "for vdev %llu -- %llx total bytes mismatched\n", 6401 (u_longlong_t)vd->vdev_id, 6402 (u_longlong_t)total_leaked); 6403 leaks |= B_TRUE; 6404 } 6405 6406 vdev_indirect_mapping_free_obsolete_counts(vim, 6407 zcb->zcb_vd_obsolete_counts[vd->vdev_id]); 6408 zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL; 6409 6410 return (leaks); 6411 } 6412 6413 static boolean_t 6414 zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb) 6415 { 6416 if (dump_opt['L']) 6417 return (B_FALSE); 6418 6419 boolean_t leaks = B_FALSE; 6420 vdev_t *rvd = spa->spa_root_vdev; 6421 for (unsigned c = 0; c < rvd->vdev_children; c++) { 6422
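/* For each top-level vdev: verify the obsolete counts (indirect vdevs only), then sweep every metaslab's overloaded ms_allocatable for segments that were never claimed. */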
vdev_t *vd = rvd->vdev_child[c]; 6423 6424 if (zcb->zcb_vd_obsolete_counts[c] != NULL) { 6425 leaks |= zdb_check_for_obsolete_leaks(vd, zcb); 6426 } 6427 6428 for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { 6429 metaslab_t *msp = vd->vdev_ms[m]; 6430 ASSERT3P(msp->ms_group, ==, (msp->ms_group->mg_class == 6431 spa_embedded_log_class(spa)) ? 6432 vd->vdev_log_mg : vd->vdev_mg); 6433 6434 /* 6435 * ms_allocatable has been overloaded 6436 * to contain allocated segments. Now that 6437 * we finished traversing all blocks, any 6438 * block that remains in the ms_allocatable 6439 * represents an allocated block that we 6440 * did not claim during the traversal. 6441 * Claimed blocks would have been removed 6442 * from the ms_allocatable. For indirect 6443 * vdevs, space remaining in the tree 6444 * represents parts of the mapping that are 6445 * not referenced, which is not a bug. 6446 */ 6447 if (vd->vdev_ops == &vdev_indirect_ops) { 6448 range_tree_vacate(msp->ms_allocatable, 6449 NULL, NULL); 6450 } else { 6451 range_tree_vacate(msp->ms_allocatable, 6452 zdb_leak, vd); 6453 } 6454 if (msp->ms_loaded) { 6455 msp->ms_loaded = B_FALSE; 6456 } 6457 } 6458 } 6459 6460 umem_free(zcb->zcb_vd_obsolete_counts, 6461 rvd->vdev_children * sizeof (uint32_t *)); 6462 zcb->zcb_vd_obsolete_counts = NULL; 6463 6464 return (leaks); 6465 } 6466 6467 static int 6468 count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 6469 { 6470 (void) tx; 6471 zdb_cb_t *zcb = arg; 6472 6473 if (dump_opt['b'] >= 5) { 6474 char blkbuf[BP_SPRINTF_LEN]; 6475 snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); 6476 (void) printf("[%s] %s\n", 6477 "deferred free", blkbuf); 6478 } 6479 zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED); 6480 return (0); 6481 } 6482 6483 /* 6484 * Iterate over livelists which have been destroyed by the user but 6485 * are still present in the MOS, waiting to be freed 6486 */ 6487 static void 6488 iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg) 6489 { 6490 objset_t *mos = spa->spa_meta_objset; 6491 uint64_t zap_obj; 6492 int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, 6493 DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj); 6494 if (err == ENOENT) 6495 return; 6496 ASSERT0(err); 6497 6498 zap_cursor_t zc; 6499 zap_attribute_t attr; 6500 dsl_deadlist_t ll; 6501 /* NULL out os prior to dsl_deadlist_open in case it's garbage */ 6502 ll.dl_os = NULL; 6503 for (zap_cursor_init(&zc, mos, zap_obj); 6504 zap_cursor_retrieve(&zc, &attr) == 0; 6505 (void) zap_cursor_advance(&zc)) { 6506 dsl_deadlist_open(&ll, mos, attr.za_first_integer); 6507 func(&ll, arg); 6508 dsl_deadlist_close(&ll); 6509 } 6510 zap_cursor_fini(&zc); 6511 } 6512 6513 static int 6514 bpobj_count_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 6515 dmu_tx_t *tx) 6516 { 6517 ASSERT(!bp_freed); 6518 return (count_block_cb(arg, bp, tx)); 6519 } 6520 6521 static int 6522 livelist_entry_count_blocks_cb(void *args, dsl_deadlist_entry_t *dle) 6523 { 6524 zdb_cb_t *zbc = args; 6525 bplist_t blks; 6526 bplist_create(&blks); 6527 /* determine which blocks have been alloc'd but not freed */ 6528 VERIFY0(dsl_process_sub_livelist(&dle->dle_bpobj, &blks, NULL, NULL)); 6529 /* count those blocks */ 6530 (void) bplist_iterate(&blks, count_block_cb, zbc, NULL); 6531 bplist_destroy(&blks); 6532 return (0); 6533 } 6534 6535 static void 6536 livelist_count_blocks(dsl_deadlist_t *ll, void *arg) 6537 { 6538 dsl_deadlist_iterate(ll, livelist_entry_count_blocks_cb, arg); 6539 } 6540 6541 /* 6542 * Count the blocks in the 
livelists that have been destroyed by the user 6543 * but haven't yet been freed. 6544 */ 6545 static void 6546 deleted_livelists_count_blocks(spa_t *spa, zdb_cb_t *zbc) 6547 { 6548 iterate_deleted_livelists(spa, livelist_count_blocks, zbc); 6549 } 6550 6551 static void 6552 dump_livelist_cb(dsl_deadlist_t *ll, void *arg) 6553 { 6554 ASSERT3P(arg, ==, NULL); 6555 global_feature_count[SPA_FEATURE_LIVELIST]++; 6556 dump_blkptr_list(ll, "Deleted Livelist"); 6557 dsl_deadlist_iterate(ll, sublivelist_verify_lightweight, NULL); 6558 } 6559 6560 /* 6561 * Print out, register object references to, and increment feature counts for 6562 * livelists that have been destroyed by the user but haven't yet been freed. 6563 */ 6564 static void 6565 deleted_livelists_dump_mos(spa_t *spa) 6566 { 6567 uint64_t zap_obj; 6568 objset_t *mos = spa->spa_meta_objset; 6569 int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, 6570 DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj); 6571 if (err == ENOENT) 6572 return; 6573 mos_obj_refd(zap_obj); 6574 iterate_deleted_livelists(spa, dump_livelist_cb, NULL); 6575 } 6576 6577 static int 6578 dump_block_stats(spa_t *spa) 6579 { 6580 zdb_cb_t *zcb; 6581 zdb_blkstats_t *zb, *tzb; 6582 uint64_t norm_alloc, norm_space, total_alloc, total_found; 6583 int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | 6584 TRAVERSE_NO_DECRYPT | TRAVERSE_HARD; 6585 boolean_t leaks = B_FALSE; 6586 int e, c, err; 6587 bp_embedded_type_t i; 6588 6589 zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL); 6590 6591 (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n", 6592 (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", 6593 (dump_opt['c'] == 1) ? "metadata " : "", 6594 dump_opt['c'] ? "checksums " : "", 6595 (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "", 6596 !dump_opt['L'] ? "nothing leaked " : ""); 6597 6598 /* 6599 * When leak detection is enabled we load all space maps as SM_ALLOC 6600 * maps, then traverse the pool claiming each block we discover. If 6601 * the pool is perfectly consistent, the segment trees will be empty 6602 * when we're done. Anything left over is a leak; any block we can't 6603 * claim (because it's not part of any space map) is a double 6604 * allocation, reference to a freed block, or an unclaimed log block. 6605 * 6606 * When leak detection is disabled (-L option) we still traverse the 6607 * pool claiming each block we discover, but we skip opening any space 6608 * maps. 6609 */ 6610 zdb_leak_init(spa, zcb); 6611 6612 /* 6613 * If there's a deferred-free bplist, process that first. 
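* Its blocks are still allocated in the space maps, so they are counted
* here (as ZDB_OT_DEFERRED) rather than reported as leaked.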
6614 */ 6615 (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj, 6616 bpobj_count_block_cb, zcb, NULL); 6617 6618 if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { 6619 (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, 6620 bpobj_count_block_cb, zcb, NULL); 6621 } 6622 6623 zdb_claim_removing(spa, zcb); 6624 6625 if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { 6626 VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset, 6627 spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb, 6628 zcb, NULL)); 6629 } 6630 6631 deleted_livelists_count_blocks(spa, zcb); 6632 6633 if (dump_opt['c'] > 1) 6634 flags |= TRAVERSE_PREFETCH_DATA; 6635 6636 zcb->zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa)); 6637 zcb->zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa)); 6638 zcb->zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa)); 6639 zcb->zcb_totalasize += 6640 metaslab_class_get_alloc(spa_embedded_log_class(spa)); 6641 zcb->zcb_start = zcb->zcb_lastprint = gethrtime(); 6642 err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, zcb); 6643 6644 /* 6645 * If we've traversed the data blocks then we need to wait for those 6646 * I/Os to complete. We leverage "The Godfather" zio to wait on 6647 * all async I/Os to complete. 6648 */ 6649 if (dump_opt['c']) { 6650 for (c = 0; c < max_ncpus; c++) { 6651 (void) zio_wait(spa->spa_async_zio_root[c]); 6652 spa->spa_async_zio_root[c] = zio_root(spa, NULL, NULL, 6653 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 6654 ZIO_FLAG_GODFATHER); 6655 } 6656 } 6657 ASSERT0(spa->spa_load_verify_bytes); 6658 6659 /* 6660 * Done after zio_wait() since zcb_haderrors is modified in 6661 * zdb_blkptr_done() 6662 */ 6663 zcb->zcb_haderrors |= err; 6664 6665 if (zcb->zcb_haderrors) { 6666 (void) printf("\nError counts:\n\n"); 6667 (void) printf("\t%5s %s\n", "errno", "count"); 6668 for (e = 0; e < 256; e++) { 6669 if (zcb->zcb_errors[e] != 0) { 6670 (void) printf("\t%5d %llu\n", 6671 e, (u_longlong_t)zcb->zcb_errors[e]); 6672 } 6673 } 6674 } 6675 6676 /* 6677 * Report any leaked segments. 6678 */ 6679 leaks |= zdb_leak_fini(spa, zcb); 6680 6681 tzb = &zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL]; 6682 6683 norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 6684 norm_space = metaslab_class_get_space(spa_normal_class(spa)); 6685 6686 total_alloc = norm_alloc + 6687 metaslab_class_get_alloc(spa_log_class(spa)) + 6688 metaslab_class_get_alloc(spa_embedded_log_class(spa)) + 6689 metaslab_class_get_alloc(spa_special_class(spa)) + 6690 metaslab_class_get_alloc(spa_dedup_class(spa)) + 6691 get_unflushed_alloc_space(spa); 6692 total_found = tzb->zb_asize - zcb->zcb_dedup_asize + 6693 zcb->zcb_removing_size + zcb->zcb_checkpoint_size; 6694 6695 if (total_found == total_alloc && !dump_opt['L']) { 6696 (void) printf("\n\tNo leaks (block sum matches space" 6697 " maps exactly)\n"); 6698 } else if (!dump_opt['L']) { 6699 (void) printf("block traversal size %llu != alloc %llu " 6700 "(%s %lld)\n", 6701 (u_longlong_t)total_found, 6702 (u_longlong_t)total_alloc, 6703 (dump_opt['L']) ? 
"unreachable" : "leaked", 6704 (longlong_t)(total_alloc - total_found)); 6705 leaks = B_TRUE; 6706 } 6707 6708 if (tzb->zb_count == 0) { 6709 umem_free(zcb, sizeof (zdb_cb_t)); 6710 return (2); 6711 } 6712 6713 (void) printf("\n"); 6714 (void) printf("\t%-16s %14llu\n", "bp count:", 6715 (u_longlong_t)tzb->zb_count); 6716 (void) printf("\t%-16s %14llu\n", "ganged count:", 6717 (longlong_t)tzb->zb_gangs); 6718 (void) printf("\t%-16s %14llu avg: %6llu\n", "bp logical:", 6719 (u_longlong_t)tzb->zb_lsize, 6720 (u_longlong_t)(tzb->zb_lsize / tzb->zb_count)); 6721 (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", 6722 "bp physical:", (u_longlong_t)tzb->zb_psize, 6723 (u_longlong_t)(tzb->zb_psize / tzb->zb_count), 6724 (double)tzb->zb_lsize / tzb->zb_psize); 6725 (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", 6726 "bp allocated:", (u_longlong_t)tzb->zb_asize, 6727 (u_longlong_t)(tzb->zb_asize / tzb->zb_count), 6728 (double)tzb->zb_lsize / tzb->zb_asize); 6729 (void) printf("\t%-16s %14llu ref>1: %6llu deduplication: %6.2f\n", 6730 "bp deduped:", (u_longlong_t)zcb->zcb_dedup_asize, 6731 (u_longlong_t)zcb->zcb_dedup_blocks, 6732 (double)zcb->zcb_dedup_asize / tzb->zb_asize + 1.0); 6733 (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:", 6734 (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); 6735 6736 if (spa_special_class(spa)->mc_allocator[0].mca_rotor != NULL) { 6737 uint64_t alloc = metaslab_class_get_alloc( 6738 spa_special_class(spa)); 6739 uint64_t space = metaslab_class_get_space( 6740 spa_special_class(spa)); 6741 6742 (void) printf("\t%-16s %14llu used: %5.2f%%\n", 6743 "Special class", (u_longlong_t)alloc, 6744 100.0 * alloc / space); 6745 } 6746 6747 if (spa_dedup_class(spa)->mc_allocator[0].mca_rotor != NULL) { 6748 uint64_t alloc = metaslab_class_get_alloc( 6749 spa_dedup_class(spa)); 6750 uint64_t space = metaslab_class_get_space( 6751 spa_dedup_class(spa)); 6752 6753 (void) printf("\t%-16s %14llu used: %5.2f%%\n", 6754 "Dedup class", (u_longlong_t)alloc, 6755 100.0 * alloc / space); 6756 } 6757 6758 if (spa_embedded_log_class(spa)->mc_allocator[0].mca_rotor != NULL) { 6759 uint64_t alloc = metaslab_class_get_alloc( 6760 spa_embedded_log_class(spa)); 6761 uint64_t space = metaslab_class_get_space( 6762 spa_embedded_log_class(spa)); 6763 6764 (void) printf("\t%-16s %14llu used: %5.2f%%\n", 6765 "Embedded log class", (u_longlong_t)alloc, 6766 100.0 * alloc / space); 6767 } 6768 6769 for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) { 6770 if (zcb->zcb_embedded_blocks[i] == 0) 6771 continue; 6772 (void) printf("\n"); 6773 (void) printf("\tadditional, non-pointer bps of type %u: " 6774 "%10llu\n", 6775 i, (u_longlong_t)zcb->zcb_embedded_blocks[i]); 6776 6777 if (dump_opt['b'] >= 3) { 6778 (void) printf("\t number of (compressed) bytes: " 6779 "number of bps\n"); 6780 dump_histogram(zcb->zcb_embedded_histogram[i], 6781 sizeof (zcb->zcb_embedded_histogram[i]) / 6782 sizeof (zcb->zcb_embedded_histogram[i][0]), 0); 6783 } 6784 } 6785 6786 if (tzb->zb_ditto_samevdev != 0) { 6787 (void) printf("\tDittoed blocks on same vdev: %llu\n", 6788 (longlong_t)tzb->zb_ditto_samevdev); 6789 } 6790 if (tzb->zb_ditto_same_ms != 0) { 6791 (void) printf("\tDittoed blocks in same metaslab: %llu\n", 6792 (longlong_t)tzb->zb_ditto_same_ms); 6793 } 6794 6795 for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) { 6796 vdev_t *vd = spa->spa_root_vdev->vdev_child[v]; 6797 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 6798 6799 if (vim == NULL) { 
6800 continue; 6801 } 6802 6803 char mem[32]; 6804 zdb_nicenum(vdev_indirect_mapping_num_entries(vim), 6805 mem, vdev_indirect_mapping_size(vim)); 6806 6807 (void) printf("\tindirect vdev id %llu has %llu segments " 6808 "(%s in memory)\n", 6809 (longlong_t)vd->vdev_id, 6810 (longlong_t)vdev_indirect_mapping_num_entries(vim), mem); 6811 } 6812 6813 if (dump_opt['b'] >= 2) { 6814 int l, t, level; 6815 char csize[32], lsize[32], psize[32], asize[32]; 6816 char avg[32], gang[32]; 6817 (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE" 6818 "\t avg\t comp\t%%Total\tType\n"); 6819 6820 zfs_blkstat_t *mdstats = umem_zalloc(sizeof (zfs_blkstat_t), 6821 UMEM_NOFAIL); 6822 6823 for (t = 0; t <= ZDB_OT_TOTAL; t++) { 6824 const char *typename; 6825 6826 /* make sure nicenum has enough space */ 6827 _Static_assert(sizeof (csize) >= NN_NUMBUF_SZ, 6828 "csize truncated"); 6829 _Static_assert(sizeof (lsize) >= NN_NUMBUF_SZ, 6830 "lsize truncated"); 6831 _Static_assert(sizeof (psize) >= NN_NUMBUF_SZ, 6832 "psize truncated"); 6833 _Static_assert(sizeof (asize) >= NN_NUMBUF_SZ, 6834 "asize truncated"); 6835 _Static_assert(sizeof (avg) >= NN_NUMBUF_SZ, 6836 "avg truncated"); 6837 _Static_assert(sizeof (gang) >= NN_NUMBUF_SZ, 6838 "gang truncated"); 6839 6840 if (t < DMU_OT_NUMTYPES) 6841 typename = dmu_ot[t].ot_name; 6842 else 6843 typename = zdb_ot_extname[t - DMU_OT_NUMTYPES]; 6844 6845 if (zcb->zcb_type[ZB_TOTAL][t].zb_asize == 0) { 6846 (void) printf("%6s\t%5s\t%5s\t%5s" 6847 "\t%5s\t%5s\t%6s\t%s\n", 6848 "-", 6849 "-", 6850 "-", 6851 "-", 6852 "-", 6853 "-", 6854 "-", 6855 typename); 6856 continue; 6857 } 6858 6859 for (l = ZB_TOTAL - 1; l >= -1; l--) { 6860 level = (l == -1 ? ZB_TOTAL : l); 6861 zb = &zcb->zcb_type[level][t]; 6862 6863 if (zb->zb_asize == 0) 6864 continue; 6865 6866 if (level != ZB_TOTAL && t < DMU_OT_NUMTYPES && 6867 (level > 0 || DMU_OT_IS_METADATA(t))) { 6868 mdstats->zb_count += zb->zb_count; 6869 mdstats->zb_lsize += zb->zb_lsize; 6870 mdstats->zb_psize += zb->zb_psize; 6871 mdstats->zb_asize += zb->zb_asize; 6872 mdstats->zb_gangs += zb->zb_gangs; 6873 } 6874 6875 if (dump_opt['b'] < 3 && level != ZB_TOTAL) 6876 continue; 6877 6878 if (level == 0 && zb->zb_asize == 6879 zcb->zcb_type[ZB_TOTAL][t].zb_asize) 6880 continue; 6881 6882 zdb_nicenum(zb->zb_count, csize, 6883 sizeof (csize)); 6884 zdb_nicenum(zb->zb_lsize, lsize, 6885 sizeof (lsize)); 6886 zdb_nicenum(zb->zb_psize, psize, 6887 sizeof (psize)); 6888 zdb_nicenum(zb->zb_asize, asize, 6889 sizeof (asize)); 6890 zdb_nicenum(zb->zb_asize / zb->zb_count, avg, 6891 sizeof (avg)); 6892 zdb_nicenum(zb->zb_gangs, gang, sizeof (gang)); 6893 6894 (void) printf("%6s\t%5s\t%5s\t%5s\t%5s" 6895 "\t%5.2f\t%6.2f\t", 6896 csize, lsize, psize, asize, avg, 6897 (double)zb->zb_lsize / zb->zb_psize, 6898 100.0 * zb->zb_asize / tzb->zb_asize); 6899 6900 if (level == ZB_TOTAL) 6901 (void) printf("%s\n", typename); 6902 else 6903 (void) printf(" L%d %s\n", 6904 level, typename); 6905 6906 if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) { 6907 (void) printf("\t number of ganged " 6908 "blocks: %s\n", gang); 6909 } 6910 6911 if (dump_opt['b'] >= 4) { 6912 (void) printf("psize " 6913 "(in 512-byte sectors): " 6914 "number of blocks\n"); 6915 dump_histogram(zb->zb_psize_histogram, 6916 PSIZE_HISTO_SIZE, 0); 6917 } 6918 } 6919 } 6920 zdb_nicenum(mdstats->zb_count, csize, 6921 sizeof (csize)); 6922 zdb_nicenum(mdstats->zb_lsize, lsize, 6923 sizeof (lsize)); 6924 zdb_nicenum(mdstats->zb_psize, psize, 6925 sizeof (psize)); 6926 zdb_nicenum(mdstats->zb_asize, 
asize, 6927 sizeof (asize)); 6928 zdb_nicenum(mdstats->zb_asize / mdstats->zb_count, avg, 6929 sizeof (avg)); 6930 zdb_nicenum(mdstats->zb_gangs, gang, sizeof (gang)); 6931 6932 (void) printf("%6s\t%5s\t%5s\t%5s\t%5s" 6933 "\t%5.2f\t%6.2f\t", 6934 csize, lsize, psize, asize, avg, 6935 (double)mdstats->zb_lsize / mdstats->zb_psize, 6936 100.0 * mdstats->zb_asize / tzb->zb_asize); 6937 (void) printf("%s\n", "Metadata Total"); 6938 6939 /* Output a table summarizing block sizes in the pool */ 6940 if (dump_opt['b'] >= 2) { 6941 dump_size_histograms(zcb); 6942 } 6943 6944 umem_free(mdstats, sizeof (zfs_blkstat_t)); 6945 } 6946 6947 (void) printf("\n"); 6948 6949 if (leaks) { 6950 umem_free(zcb, sizeof (zdb_cb_t)); 6951 return (2); 6952 } 6953 6954 if (zcb->zcb_haderrors) { 6955 umem_free(zcb, sizeof (zdb_cb_t)); 6956 return (3); 6957 } 6958 6959 umem_free(zcb, sizeof (zdb_cb_t)); 6960 return (0); 6961 } 6962 6963 typedef struct zdb_ddt_entry { 6964 ddt_key_t zdde_key; 6965 uint64_t zdde_ref_blocks; 6966 uint64_t zdde_ref_lsize; 6967 uint64_t zdde_ref_psize; 6968 uint64_t zdde_ref_dsize; 6969 avl_node_t zdde_node; 6970 } zdb_ddt_entry_t; 6971 6972 static int 6973 zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 6974 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 6975 { 6976 (void) zilog, (void) dnp; 6977 avl_tree_t *t = arg; 6978 avl_index_t where; 6979 zdb_ddt_entry_t *zdde, zdde_search; 6980 6981 if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || 6982 BP_IS_EMBEDDED(bp)) 6983 return (0); 6984 6985 if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { 6986 (void) printf("traversing objset %llu, %llu objects, " 6987 "%lu blocks so far\n", 6988 (u_longlong_t)zb->zb_objset, 6989 (u_longlong_t)BP_GET_FILL(bp), 6990 avl_numnodes(t)); 6991 } 6992 6993 if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF || 6994 BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) 6995 return (0); 6996 6997 ddt_key_fill(&zdde_search.zdde_key, bp); 6998 6999 zdde = avl_find(t, &zdde_search, &where); 7000 7001 if (zdde == NULL) { 7002 zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL); 7003 zdde->zdde_key = zdde_search.zdde_key; 7004 avl_insert(t, zdde, where); 7005 } 7006 7007 zdde->zdde_ref_blocks += 1; 7008 zdde->zdde_ref_lsize += BP_GET_LSIZE(bp); 7009 zdde->zdde_ref_psize += BP_GET_PSIZE(bp); 7010 zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp); 7011 7012 return (0); 7013 } 7014 7015 static void 7016 dump_simulated_ddt(spa_t *spa) 7017 { 7018 avl_tree_t t; 7019 void *cookie = NULL; 7020 zdb_ddt_entry_t *zdde; 7021 ddt_histogram_t ddh_total = {{{0}}}; 7022 ddt_stat_t dds_total = {0}; 7023 7024 avl_create(&t, ddt_entry_compare, 7025 sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node)); 7026 7027 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 7028 7029 (void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | 7030 TRAVERSE_NO_DECRYPT, zdb_ddt_add_cb, &t); 7031 7032 spa_config_exit(spa, SCL_CONFIG, FTAG); 7033 7034 while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) { 7035 ddt_stat_t dds; 7036 uint64_t refcnt = zdde->zdde_ref_blocks; 7037 ASSERT(refcnt != 0); 7038 7039 dds.dds_blocks = zdde->zdde_ref_blocks / refcnt; 7040 dds.dds_lsize = zdde->zdde_ref_lsize / refcnt; 7041 dds.dds_psize = zdde->zdde_ref_psize / refcnt; 7042 dds.dds_dsize = zdde->zdde_ref_dsize / refcnt; 7043 7044 dds.dds_ref_blocks = zdde->zdde_ref_blocks; 7045 dds.dds_ref_lsize = zdde->zdde_ref_lsize; 7046 dds.dds_ref_psize = zdde->zdde_ref_psize; 7047 
dds.dds_ref_dsize = zdde->zdde_ref_dsize; 7048 7049 ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1], 7050 &dds, 0); 7051 7052 umem_free(zdde, sizeof (*zdde)); 7053 } 7054 7055 avl_destroy(&t); 7056 7057 ddt_histogram_stat(&dds_total, &ddh_total); 7058 7059 (void) printf("Simulated DDT histogram:\n"); 7060 7061 zpool_dump_ddt(&dds_total, &ddh_total); 7062 7063 dump_dedup_ratio(&dds_total); 7064 } 7065 7066 static int 7067 verify_device_removal_feature_counts(spa_t *spa) 7068 { 7069 uint64_t dr_feature_refcount = 0; 7070 uint64_t oc_feature_refcount = 0; 7071 uint64_t indirect_vdev_count = 0; 7072 uint64_t precise_vdev_count = 0; 7073 uint64_t obsolete_counts_object_count = 0; 7074 uint64_t obsolete_sm_count = 0; 7075 uint64_t obsolete_counts_count = 0; 7076 uint64_t scip_count = 0; 7077 uint64_t obsolete_bpobj_count = 0; 7078 int ret = 0; 7079 7080 spa_condensing_indirect_phys_t *scip = 7081 &spa->spa_condensing_indirect_phys; 7082 if (scip->scip_next_mapping_object != 0) { 7083 vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev]; 7084 ASSERT(scip->scip_prev_obsolete_sm_object != 0); 7085 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); 7086 7087 (void) printf("Condensing indirect vdev %llu: new mapping " 7088 "object %llu, prev obsolete sm %llu\n", 7089 (u_longlong_t)scip->scip_vdev, 7090 (u_longlong_t)scip->scip_next_mapping_object, 7091 (u_longlong_t)scip->scip_prev_obsolete_sm_object); 7092 if (scip->scip_prev_obsolete_sm_object != 0) { 7093 space_map_t *prev_obsolete_sm = NULL; 7094 VERIFY0(space_map_open(&prev_obsolete_sm, 7095 spa->spa_meta_objset, 7096 scip->scip_prev_obsolete_sm_object, 7097 0, vd->vdev_asize, 0)); 7098 dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm); 7099 (void) printf("\n"); 7100 space_map_close(prev_obsolete_sm); 7101 } 7102 7103 scip_count += 2; 7104 } 7105 7106 for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { 7107 vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; 7108 vdev_indirect_config_t *vic = &vd->vdev_indirect_config; 7109 7110 if (vic->vic_mapping_object != 0) { 7111 ASSERT(vd->vdev_ops == &vdev_indirect_ops || 7112 vd->vdev_removing); 7113 indirect_vdev_count++; 7114 7115 if (vd->vdev_indirect_mapping->vim_havecounts) { 7116 obsolete_counts_count++; 7117 } 7118 } 7119 7120 boolean_t are_precise; 7121 VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); 7122 if (are_precise) { 7123 ASSERT(vic->vic_mapping_object != 0); 7124 precise_vdev_count++; 7125 } 7126 7127 uint64_t obsolete_sm_object; 7128 VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); 7129 if (obsolete_sm_object != 0) { 7130 ASSERT(vic->vic_mapping_object != 0); 7131 obsolete_sm_count++; 7132 } 7133 } 7134 7135 (void) feature_get_refcount(spa, 7136 &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL], 7137 &dr_feature_refcount); 7138 (void) feature_get_refcount(spa, 7139 &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS], 7140 &oc_feature_refcount); 7141 7142 if (dr_feature_refcount != indirect_vdev_count) { 7143 ret = 1; 7144 (void) printf("Number of indirect vdevs (%llu) " \ 7145 "does not match feature count (%llu)\n", 7146 (u_longlong_t)indirect_vdev_count, 7147 (u_longlong_t)dr_feature_refcount); 7148 } else { 7149 (void) printf("Verified device_removal feature refcount " \ 7150 "of %llu is correct\n", 7151 (u_longlong_t)dr_feature_refcount); 7152 } 7153 7154 if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, 7155 DMU_POOL_OBSOLETE_BPOBJ) == 0) { 7156 obsolete_bpobj_count++; 7157 } 7158 7159 7160 obsolete_counts_object_count = 
precise_vdev_count; 7161 obsolete_counts_object_count += obsolete_sm_count; 7162 obsolete_counts_object_count += obsolete_counts_count; 7163 obsolete_counts_object_count += scip_count; 7164 obsolete_counts_object_count += obsolete_bpobj_count; 7165 obsolete_counts_object_count += remap_deadlist_count; 7166 7167 if (oc_feature_refcount != obsolete_counts_object_count) { 7168 ret = 1; 7169 (void) printf("Number of obsolete counts objects (%llu) " \ 7170 "does not match feature count (%llu)\n", 7171 (u_longlong_t)obsolete_counts_object_count, 7172 (u_longlong_t)oc_feature_refcount); 7173 (void) printf("pv:%llu os:%llu oc:%llu sc:%llu " 7174 "ob:%llu rd:%llu\n", 7175 (u_longlong_t)precise_vdev_count, 7176 (u_longlong_t)obsolete_sm_count, 7177 (u_longlong_t)obsolete_counts_count, 7178 (u_longlong_t)scip_count, 7179 (u_longlong_t)obsolete_bpobj_count, 7180 (u_longlong_t)remap_deadlist_count); 7181 } else { 7182 (void) printf("Verified indirect_refcount feature refcount " \ 7183 "of %llu is correct\n", 7184 (u_longlong_t)oc_feature_refcount); 7185 } 7186 return (ret); 7187 } 7188 7189 static void 7190 zdb_set_skip_mmp(char *target) 7191 { 7192 spa_t *spa; 7193 7194 /* 7195 * Disable the activity check to allow examination of 7196 * active pools. 7197 */ 7198 mutex_enter(&spa_namespace_lock); 7199 if ((spa = spa_lookup(target)) != NULL) { 7200 spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP; 7201 } 7202 mutex_exit(&spa_namespace_lock); 7203 } 7204 7205 #define BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE" 7206 /* 7207 * Import the checkpointed state of the pool specified by the target 7208 * parameter as readonly. The function also accepts a pool config 7209 * as an optional parameter, else it attempts to infer the config from 7210 * the name of the target pool. 7211 * 7212 * Note that the checkpointed state's pool name will be the name of 7213 * the original pool with the above suffix appended to it. In addition, 7214 * if the target is not a pool name (e.g. a path to a dataset) then 7215 * the new_path parameter is populated with the updated path to 7216 * reflect the fact that we are looking into the checkpointed state. 7217 * 7218 * The function returns a newly-allocated copy of the name of the 7219 * pool containing the checkpointed state. When this copy is no 7220 * longer needed it should be freed with free(3C). Same thing 7221 * applies to the new_path parameter if allocated.
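* For example, with a target of "tank/fs" the checkpointed state would be
* imported as "tank_CHECKPOINTED_UNIVERSE" and *new_path set to
* "tank_CHECKPOINTED_UNIVERSE/fs" ("tank" and "fs" being illustrative names).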
7222 */ 7223 static char * 7224 import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path) 7225 { 7226 int error = 0; 7227 char *poolname, *bogus_name = NULL; 7228 boolean_t freecfg = B_FALSE; 7229 7230 /* If the target is not a pool, then extract the pool name */ 7231 char *path_start = strchr(target, '/'); 7232 if (path_start != NULL) { 7233 size_t poolname_len = path_start - target; 7234 poolname = strndup(target, poolname_len); 7235 } else { 7236 poolname = target; 7237 } 7238 7239 if (cfg == NULL) { 7240 zdb_set_skip_mmp(poolname); 7241 error = spa_get_stats(poolname, &cfg, NULL, 0); 7242 if (error != 0) { 7243 fatal("Tried to read config of pool \"%s\" but " 7244 "spa_get_stats() failed with error %d\n", 7245 poolname, error); 7246 } 7247 freecfg = B_TRUE; 7248 } 7249 7250 if (asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1) { 7251 if (target != poolname) 7252 free(poolname); 7253 return (NULL); 7254 } 7255 fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name); 7256 7257 error = spa_import(bogus_name, cfg, NULL, 7258 ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT | 7259 ZFS_IMPORT_SKIP_MMP); 7260 if (freecfg) 7261 nvlist_free(cfg); 7262 if (error != 0) { 7263 fatal("Tried to import pool \"%s\" but spa_import() failed " 7264 "with error %d\n", bogus_name, error); 7265 } 7266 7267 if (new_path != NULL && path_start != NULL) { 7268 if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) { 7269 free(bogus_name); 7270 if (path_start != NULL) 7271 free(poolname); 7272 return (NULL); 7273 } 7274 } 7275 7276 if (target != poolname) 7277 free(poolname); 7278 7279 return (bogus_name); 7280 } 7281 7282 typedef struct verify_checkpoint_sm_entry_cb_arg { 7283 vdev_t *vcsec_vd; 7284 7285 /* the following fields are only used for printing progress */ 7286 uint64_t vcsec_entryid; 7287 uint64_t vcsec_num_entries; 7288 } verify_checkpoint_sm_entry_cb_arg_t; 7289 7290 #define ENTRIES_PER_PROGRESS_UPDATE 10000 7291 7292 static int 7293 verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg) 7294 { 7295 verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg; 7296 vdev_t *vd = vcsec->vcsec_vd; 7297 metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; 7298 uint64_t end = sme->sme_offset + sme->sme_run; 7299 7300 ASSERT(sme->sme_type == SM_FREE); 7301 7302 if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) { 7303 (void) fprintf(stderr, 7304 "\rverifying vdev %llu, space map entry %llu of %llu ...", 7305 (longlong_t)vd->vdev_id, 7306 (longlong_t)vcsec->vcsec_entryid, 7307 (longlong_t)vcsec->vcsec_num_entries); 7308 } 7309 vcsec->vcsec_entryid++; 7310 7311 /* 7312 * See comment in checkpoint_sm_exclude_entry_cb() 7313 */ 7314 VERIFY3U(sme->sme_offset, >=, ms->ms_start); 7315 VERIFY3U(end, <=, ms->ms_start + ms->ms_size); 7316 7317 /* 7318 * The entries in the vdev_checkpoint_sm should be marked as 7319 * allocated in the checkpointed state of the pool, therefore 7320 * their respective ms_allocatable trees should not contain them. 7321 */ 7322 mutex_enter(&ms->ms_lock); 7323 range_tree_verify_not_present(ms->ms_allocatable, 7324 sme->sme_offset, sme->sme_run); 7325 mutex_exit(&ms->ms_lock); 7326 7327 return (0); 7328 } 7329 7330 /* 7331 * Verify that all segments in the vdev_checkpoint_sm are allocated 7332 * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's 7333 * ms_allocatable).
7334 * 7335 * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of 7336 * each vdev in the current state of the pool to the metaslab space maps 7337 * (ms_sm) of the checkpointed state of the pool. 7338 * 7339 * Note that the function changes the state of the ms_allocatable 7340 * trees of the current spa_t. The entries of these ms_allocatable 7341 * trees are cleared out and then repopulated from with the free 7342 * entries of their respective ms_sm space maps. 7343 */ 7344 static void 7345 verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current) 7346 { 7347 vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev; 7348 vdev_t *current_rvd = current->spa_root_vdev; 7349 7350 load_concrete_ms_allocatable_trees(checkpoint, SM_FREE); 7351 7352 for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) { 7353 vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c]; 7354 vdev_t *current_vd = current_rvd->vdev_child[c]; 7355 7356 space_map_t *checkpoint_sm = NULL; 7357 uint64_t checkpoint_sm_obj; 7358 7359 if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) { 7360 /* 7361 * Since we don't allow device removal in a pool 7362 * that has a checkpoint, we expect that all removed 7363 * vdevs were removed from the pool before the 7364 * checkpoint. 7365 */ 7366 ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops); 7367 continue; 7368 } 7369 7370 /* 7371 * If the checkpoint space map doesn't exist, then nothing 7372 * here is checkpointed so there's nothing to verify. 7373 */ 7374 if (current_vd->vdev_top_zap == 0 || 7375 zap_contains(spa_meta_objset(current), 7376 current_vd->vdev_top_zap, 7377 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) 7378 continue; 7379 7380 VERIFY0(zap_lookup(spa_meta_objset(current), 7381 current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, 7382 sizeof (uint64_t), 1, &checkpoint_sm_obj)); 7383 7384 VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current), 7385 checkpoint_sm_obj, 0, current_vd->vdev_asize, 7386 current_vd->vdev_ashift)); 7387 7388 verify_checkpoint_sm_entry_cb_arg_t vcsec; 7389 vcsec.vcsec_vd = ckpoint_vd; 7390 vcsec.vcsec_entryid = 0; 7391 vcsec.vcsec_num_entries = 7392 space_map_length(checkpoint_sm) / sizeof (uint64_t); 7393 VERIFY0(space_map_iterate(checkpoint_sm, 7394 space_map_length(checkpoint_sm), 7395 verify_checkpoint_sm_entry_cb, &vcsec)); 7396 if (dump_opt['m'] > 3) 7397 dump_spacemap(current->spa_meta_objset, checkpoint_sm); 7398 space_map_close(checkpoint_sm); 7399 } 7400 7401 /* 7402 * If we've added vdevs since we took the checkpoint, ensure 7403 * that their checkpoint space maps are empty. 7404 */ 7405 if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) { 7406 for (uint64_t c = ckpoint_rvd->vdev_children; 7407 c < current_rvd->vdev_children; c++) { 7408 vdev_t *current_vd = current_rvd->vdev_child[c]; 7409 VERIFY3P(current_vd->vdev_checkpoint_sm, ==, NULL); 7410 } 7411 } 7412 7413 /* for cleaner progress output */ 7414 (void) fprintf(stderr, "\n"); 7415 } 7416 7417 /* 7418 * Verifies that all space that's allocated in the checkpoint is 7419 * still allocated in the current version, by checking that everything 7420 * in checkpoint's ms_allocatable (which is actually allocated, not 7421 * allocatable/free) is not present in current's ms_allocatable. 7422 * 7423 * Note that the function changes the state of the ms_allocatable 7424 * trees of both spas when called. The entries of all ms_allocatable 7425 * trees are cleared out and then repopulated from their respective 7426 * ms_sm space maps. 
In the checkpointed state we load the allocated 7427 * entries, and in the current state we load the free entries. 7428 */ 7429 static void 7430 verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current) 7431 { 7432 vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev; 7433 vdev_t *current_rvd = current->spa_root_vdev; 7434 7435 load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC); 7436 load_concrete_ms_allocatable_trees(current, SM_FREE); 7437 7438 for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) { 7439 vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i]; 7440 vdev_t *current_vd = current_rvd->vdev_child[i]; 7441 7442 if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) { 7443 /* 7444 * See comment in verify_checkpoint_vdev_spacemaps() 7445 */ 7446 ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops); 7447 continue; 7448 } 7449 7450 for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) { 7451 metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m]; 7452 metaslab_t *current_msp = current_vd->vdev_ms[m]; 7453 7454 (void) fprintf(stderr, 7455 "\rverifying vdev %llu of %llu, " 7456 "metaslab %llu of %llu ...", 7457 (longlong_t)current_vd->vdev_id, 7458 (longlong_t)current_rvd->vdev_children, 7459 (longlong_t)current_vd->vdev_ms[m]->ms_id, 7460 (longlong_t)current_vd->vdev_ms_count); 7461 7462 /* 7463 * We walk through the ms_allocatable trees that 7464 * are loaded with the allocated blocks from the 7465 * ms_sm spacemaps of the checkpoint. For each 7466 * one of these ranges we ensure that none of them 7467 * exists in the ms_allocatable trees of the 7468 * current state which are loaded with the ranges 7469 * that are currently free. 7470 * 7471 * This way we ensure that none of the blocks that 7472 * are part of the checkpoint were freed by mistake. 7473 */ 7474 range_tree_walk(ckpoint_msp->ms_allocatable, 7475 (range_tree_func_t *)range_tree_verify_not_present, 7476 current_msp->ms_allocatable); 7477 } 7478 } 7479 7480 /* for cleaner progress output */ 7481 (void) fprintf(stderr, "\n"); 7482 } 7483 7484 static void 7485 verify_checkpoint_blocks(spa_t *spa) 7486 { 7487 ASSERT(!dump_opt['L']); 7488 7489 spa_t *checkpoint_spa; 7490 char *checkpoint_pool; 7491 int error = 0; 7492 7493 /* 7494 * We import the checkpointed state of the pool (under a different 7495 * name) so we can do verification on it against the current state 7496 * of the pool. 7497 */ 7498 checkpoint_pool = import_checkpointed_state(spa->spa_name, NULL, 7499 NULL); 7500 ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0); 7501 7502 error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG); 7503 if (error != 0) { 7504 fatal("Tried to open pool \"%s\" but spa_open() failed with " 7505 "error %d\n", checkpoint_pool, error); 7506 } 7507 7508 /* 7509 * Ensure that ranges in the checkpoint space maps of each vdev 7510 * are allocated according to the checkpointed state's metaslab 7511 * space maps. 7512 */ 7513 verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa); 7514 7515 /* 7516 * Ensure that allocated ranges in the checkpoint's metaslab 7517 * space maps remain allocated in the metaslab space maps of 7518 * the current state. 7519 */ 7520 verify_checkpoint_ms_spacemaps(checkpoint_spa, spa); 7521 7522 /* 7523 * Once we are done, we get rid of the checkpointed state. 
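* by dropping our reference to it and freeing the name obtained from
* import_checkpointed_state().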

static void
verify_checkpoint_blocks(spa_t *spa)
{
	ASSERT(!dump_opt['L']);

	spa_t *checkpoint_spa;
	char *checkpoint_pool;
	int error = 0;

	/*
	 * We import the checkpointed state of the pool (under a different
	 * name) so we can do verification on it against the current state
	 * of the pool.
	 */
	checkpoint_pool = import_checkpointed_state(spa->spa_name, NULL,
	    NULL);
	ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0);

	error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG);
	if (error != 0) {
		fatal("Tried to open pool \"%s\" but spa_open() failed with "
		    "error %d\n", checkpoint_pool, error);
	}

	/*
	 * Ensure that ranges in the checkpoint space maps of each vdev
	 * are allocated according to the checkpointed state's metaslab
	 * space maps.
	 */
	verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa);

	/*
	 * Ensure that allocated ranges in the checkpoint's metaslab
	 * space maps remain allocated in the metaslab space maps of
	 * the current state.
	 */
	verify_checkpoint_ms_spacemaps(checkpoint_spa, spa);

	/*
	 * Once we are done, we get rid of the checkpointed state.
	 */
	spa_close(checkpoint_spa, FTAG);
	free(checkpoint_pool);
}

static void
dump_leftover_checkpoint_blocks(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;

	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
		vdev_t *vd = rvd->vdev_child[i];

		space_map_t *checkpoint_sm = NULL;
		uint64_t checkpoint_sm_obj;

		if (vd->vdev_top_zap == 0)
			continue;

		if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
			continue;

		VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
		    sizeof (uint64_t), 1, &checkpoint_sm_obj));

		VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
		    checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
		dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
		space_map_close(checkpoint_sm);
	}
}

static int
verify_checkpoint(spa_t *spa)
{
	uberblock_t checkpoint;
	int error;

	if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
		return (0);

	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
	    sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);

	if (error == ENOENT && !dump_opt['L']) {
		/*
		 * If the feature is active but the uberblock is missing
		 * then we must be in the middle of discarding the
		 * checkpoint.
		 */
		(void) printf("\nPartially discarded checkpoint "
		    "state found:\n");
		if (dump_opt['m'] > 3)
			dump_leftover_checkpoint_blocks(spa);
		return (0);
	} else if (error != 0) {
		(void) printf("lookup error %d when looking for "
		    "checkpointed uberblock in MOS\n", error);
		return (error);
	}
	dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n");

	if (checkpoint.ub_checkpoint_txg == 0) {
		(void) printf("\nub_checkpoint_txg not set in checkpointed "
		    "uberblock\n");
		error = 3;
	}

	if (error == 0 && !dump_opt['L'])
		verify_checkpoint_blocks(spa);

	return (error);
}
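
/*
 * Illustrative behavior, derived from the messages above: on a pool with
 * an intact checkpoint, a default "zdb <pool>" run prints the
 * "Checkpointed uberblock found:" banner and, unless -L was given,
 * cross-checks the checkpointed blocks; a checkpoint that is in the
 * middle of being discarded prints "Partially discarded checkpoint
 * state found:" instead.
 */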

static void
mos_leaks_cb(void *arg, uint64_t start, uint64_t size)
{
	(void) arg;
	/* the callback receives a range [start, start + size) of objects */
	for (uint64_t i = start; i < start + size; i++) {
		(void) printf("MOS object %llu referenced but not allocated\n",
		    (u_longlong_t)i);
	}
}

static void
mos_obj_refd(uint64_t obj)
{
	if (obj != 0 && mos_refd_objs != NULL)
		range_tree_add(mos_refd_objs, obj, 1);
}

/*
 * Call on a MOS object that may already have been referenced.
 */
static void
mos_obj_refd_multiple(uint64_t obj)
{
	if (obj != 0 && mos_refd_objs != NULL &&
	    !range_tree_contains(mos_refd_objs, obj, 1))
		range_tree_add(mos_refd_objs, obj, 1);
}

static void
mos_leak_vdev_top_zap(vdev_t *vd)
{
	uint64_t ms_flush_data_obj;
	int error = zap_lookup(spa_meta_objset(vd->vdev_spa),
	    vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
	    sizeof (ms_flush_data_obj), 1, &ms_flush_data_obj);
	if (error == ENOENT)
		return;
	ASSERT0(error);

	mos_obj_refd(ms_flush_data_obj);
}

static void
mos_leak_vdev(vdev_t *vd)
{
	mos_obj_refd(vd->vdev_dtl_object);
	mos_obj_refd(vd->vdev_ms_array);
	mos_obj_refd(vd->vdev_indirect_config.vic_births_object);
	mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object);
	mos_obj_refd(vd->vdev_leaf_zap);
	if (vd->vdev_checkpoint_sm != NULL)
		mos_obj_refd(vd->vdev_checkpoint_sm->sm_object);
	if (vd->vdev_indirect_mapping != NULL) {
		mos_obj_refd(vd->vdev_indirect_mapping->
		    vim_phys->vimp_counts_object);
	}
	if (vd->vdev_obsolete_sm != NULL)
		mos_obj_refd(vd->vdev_obsolete_sm->sm_object);

	for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *ms = vd->vdev_ms[m];
		mos_obj_refd(space_map_object(ms->ms_sm));
	}

	if (vd->vdev_root_zap != 0)
		mos_obj_refd(vd->vdev_root_zap);

	if (vd->vdev_top_zap != 0) {
		mos_obj_refd(vd->vdev_top_zap);
		mos_leak_vdev_top_zap(vd);
	}

	for (uint64_t c = 0; c < vd->vdev_children; c++) {
		mos_leak_vdev(vd->vdev_child[c]);
	}
}

static void
mos_leak_log_spacemaps(spa_t *spa)
{
	uint64_t spacemap_zap;
	int error = zap_lookup(spa_meta_objset(spa),
	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP,
	    sizeof (spacemap_zap), 1, &spacemap_zap);
	if (error == ENOENT)
		return;
	ASSERT0(error);

	mos_obj_refd(spacemap_zap);
	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
	    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls))
		mos_obj_refd(sls->sls_sm_obj);
}

static void
errorlog_count_refd(objset_t *mos, uint64_t errlog)
{
	zap_cursor_t zc;
	zap_attribute_t za;
	for (zap_cursor_init(&zc, mos, errlog);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc)) {
		mos_obj_refd(za.za_first_integer);
	}
	zap_cursor_fini(&zc);
}

static int
dump_mos_leaks(spa_t *spa)
{
	int rv = 0;
	objset_t *mos = spa->spa_meta_objset;
	dsl_pool_t *dp = spa->spa_dsl_pool;

	/* Visit and mark all referenced objects in the MOS */

	mos_obj_refd(DMU_POOL_DIRECTORY_OBJECT);
	mos_obj_refd(spa->spa_pool_props_object);
	mos_obj_refd(spa->spa_config_object);
	mos_obj_refd(spa->spa_ddt_stat_object);
	mos_obj_refd(spa->spa_feat_desc_obj);
	mos_obj_refd(spa->spa_feat_enabled_txg_obj);
	mos_obj_refd(spa->spa_feat_for_read_obj);
	mos_obj_refd(spa->spa_feat_for_write_obj);
	mos_obj_refd(spa->spa_history);
	mos_obj_refd(spa->spa_errlog_last);
	mos_obj_refd(spa->spa_errlog_scrub);

	if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
		errorlog_count_refd(mos, spa->spa_errlog_last);
		errorlog_count_refd(mos, spa->spa_errlog_scrub);
	}

	mos_obj_refd(spa->spa_all_vdev_zaps);
	mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj);
	mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj);
	mos_obj_refd(spa->spa_dsl_pool->dp_scan->scn_phys.scn_queue_obj);
	bpobj_count_refd(&spa->spa_deferred_bpobj);
	mos_obj_refd(dp->dp_empty_bpobj);
	bpobj_count_refd(&dp->dp_obsolete_bpobj);
	bpobj_count_refd(&dp->dp_free_bpobj);
	mos_obj_refd(spa->spa_l2cache.sav_object);
	mos_obj_refd(spa->spa_spares.sav_object);

	if (spa->spa_syncing_log_sm != NULL)
		mos_obj_refd(spa->spa_syncing_log_sm->sm_object);
	mos_leak_log_spacemaps(spa);

	mos_obj_refd(spa->spa_condensing_indirect_phys.
	    scip_next_mapping_object);
	mos_obj_refd(spa->spa_condensing_indirect_phys.
	    scip_prev_obsolete_sm_object);
	if (spa->spa_condensing_indirect_phys.scip_next_mapping_object != 0) {
		vdev_indirect_mapping_t *vim =
		    vdev_indirect_mapping_open(mos,
		    spa->spa_condensing_indirect_phys.scip_next_mapping_object);
		mos_obj_refd(vim->vim_phys->vimp_counts_object);
		vdev_indirect_mapping_close(vim);
	}
	deleted_livelists_dump_mos(spa);

	if (dp->dp_origin_snap != NULL) {
		dsl_dataset_t *ds;

		dsl_pool_config_enter(dp, FTAG);
		VERIFY0(dsl_dataset_hold_obj(dp,
		    dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj,
		    FTAG, &ds));
		count_ds_mos_objects(ds);
		dump_blkptr_list(&ds->ds_deadlist, "Deadlist");
		dsl_dataset_rele(ds, FTAG);
		dsl_pool_config_exit(dp, FTAG);

		count_ds_mos_objects(dp->dp_origin_snap);
		dump_blkptr_list(&dp->dp_origin_snap->ds_deadlist, "Deadlist");
	}
	count_dir_mos_objects(dp->dp_mos_dir);
	if (dp->dp_free_dir != NULL)
		count_dir_mos_objects(dp->dp_free_dir);
	if (dp->dp_leak_dir != NULL)
		count_dir_mos_objects(dp->dp_leak_dir);

	mos_leak_vdev(spa->spa_root_vdev);

	for (uint64_t class = 0; class < DDT_CLASSES; class++) {
		for (uint64_t type = 0; type < DDT_TYPES; type++) {
			for (uint64_t cksum = 0;
			    cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) {
				ddt_t *ddt = spa->spa_ddt[cksum];
				mos_obj_refd(ddt->ddt_object[type][class]);
			}
		}
	}

	/*
	 * Visit all allocated objects and make sure they are referenced.
	 */
	uint64_t object = 0;
	while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) {
		if (range_tree_contains(mos_refd_objs, object, 1)) {
			range_tree_remove(mos_refd_objs, object, 1);
		} else {
			dmu_object_info_t doi;
			const char *name;
			VERIFY0(dmu_object_info(mos, object, &doi));
			if (doi.doi_type & DMU_OT_NEWTYPE) {
				dmu_object_byteswap_t bswap =
				    DMU_OT_BYTESWAP(doi.doi_type);
				name = dmu_ot_byteswap[bswap].ob_name;
			} else {
				name = dmu_ot[doi.doi_type].ot_name;
			}

			(void) printf("MOS object %llu (%s) leaked\n",
			    (u_longlong_t)object, name);
			rv = 2;
		}
	}
	(void) range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL);
	if (!range_tree_is_empty(mos_refd_objs))
		rv = 2;
	range_tree_vacate(mos_refd_objs, NULL, NULL);
	range_tree_destroy(mos_refd_objs);
	return (rv);
}
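
/*
 * Illustrative sketch (not compiled): dump_mos_leaks() above is a
 * mark-and-sweep pass.  The mark phase records every object number the
 * pool is known to reference in the mos_refd_objs range tree; the sweep
 * phase walks the allocated MOS objects and cancels them out.  Reduced
 * to its core, with a hypothetical helper name:
 */
#if 0
static int
example_mos_sweep(objset_t *mos, range_tree_t *refd)
{
	int rv = 0;
	uint64_t obj = 0;

	while (dmu_object_next(mos, &obj, B_FALSE, 0) == 0) {
		if (range_tree_contains(refd, obj, 1))
			range_tree_remove(refd, obj, 1);
		else
			rv = 2;	/* allocated but never referenced: leaked */
	}
	/* whatever is still marked was referenced but never allocated */
	if (!range_tree_is_empty(refd))
		rv = 2;
	return (rv);
}
#endif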

typedef struct log_sm_obsolete_stats_arg {
	uint64_t lsos_current_txg;

	uint64_t lsos_total_entries;
	uint64_t lsos_valid_entries;

	uint64_t lsos_sm_entries;
	uint64_t lsos_valid_sm_entries;
} log_sm_obsolete_stats_arg_t;

static int
log_spacemap_obsolete_stats_cb(spa_t *spa, space_map_entry_t *sme,
    uint64_t txg, void *arg)
{
	log_sm_obsolete_stats_arg_t *lsos = arg;

	uint64_t offset = sme->sme_offset;
	uint64_t vdev_id = sme->sme_vdev;

	if (lsos->lsos_current_txg == 0) {
		/* this is the first log */
		lsos->lsos_current_txg = txg;
	} else if (lsos->lsos_current_txg < txg) {
		/* we just changed log - print stats and reset */
		(void) printf("%-8llu valid entries out of %-8llu - txg %llu\n",
		    (u_longlong_t)lsos->lsos_valid_sm_entries,
		    (u_longlong_t)lsos->lsos_sm_entries,
		    (u_longlong_t)lsos->lsos_current_txg);
		lsos->lsos_valid_sm_entries = 0;
		lsos->lsos_sm_entries = 0;
		lsos->lsos_current_txg = txg;
	}
	ASSERT3U(lsos->lsos_current_txg, ==, txg);

	lsos->lsos_sm_entries++;
	lsos->lsos_total_entries++;

	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
	if (!vdev_is_concrete(vd))
		return (0);

	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);

	if (txg < metaslab_unflushed_txg(ms))
		return (0);
	lsos->lsos_valid_sm_entries++;
	lsos->lsos_valid_entries++;
	return (0);
}

static void
dump_log_spacemap_obsolete_stats(spa_t *spa)
{
	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
		return;

	log_sm_obsolete_stats_arg_t lsos = {0};

	(void) printf("Log Space Map Obsolete Entry Statistics:\n");

	iterate_through_spacemap_logs(spa,
	    log_spacemap_obsolete_stats_cb, &lsos);

	/* print stats for latest log */
	(void) printf("%-8llu valid entries out of %-8llu - txg %llu\n",
	    (u_longlong_t)lsos.lsos_valid_sm_entries,
	    (u_longlong_t)lsos.lsos_sm_entries,
	    (u_longlong_t)lsos.lsos_current_txg);

	(void) printf("%-8llu valid entries out of %-8llu - total\n\n",
	    (u_longlong_t)lsos.lsos_valid_entries,
	    (u_longlong_t)lsos.lsos_total_entries);
}
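
/*
 * Sample output (hypothetical numbers), following the format strings
 * above:
 *
 *	Log Space Map Obsolete Entry Statistics:
 *	318      valid entries out of 516      - txg 8930583
 *	101      valid entries out of 283      - txg 8930584
 *	419      valid entries out of 799      - total
 */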

static void
dump_zpool(spa_t *spa)
{
	dsl_pool_t *dp = spa_get_dsl(spa);
	int rc = 0;

	if (dump_opt['y']) {
		livelist_metaslab_validate(spa);
	}

	if (dump_opt['S']) {
		dump_simulated_ddt(spa);
		return;
	}

	if (!dump_opt['e'] && dump_opt['C'] > 1) {
		(void) printf("\nCached configuration:\n");
		dump_nvlist(spa->spa_config, 8);
	}

	if (dump_opt['C'])
		dump_config(spa);

	if (dump_opt['u'])
		dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n");

	if (dump_opt['D'])
		dump_all_ddts(spa);

	if (dump_opt['d'] > 2 || dump_opt['m'])
		dump_metaslabs(spa);
	if (dump_opt['M'])
		dump_metaslab_groups(spa, dump_opt['M'] > 1);
	if (dump_opt['d'] > 2 || dump_opt['m']) {
		dump_log_spacemaps(spa);
		dump_log_spacemap_obsolete_stats(spa);
	}

	if (dump_opt['d'] || dump_opt['i']) {
		spa_feature_t f;
		mos_refd_objs = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
		    0);
		dump_objset(dp->dp_meta_objset);

		if (dump_opt['d'] >= 3) {
			dsl_pool_t *dp = spa->spa_dsl_pool;
			dump_full_bpobj(&spa->spa_deferred_bpobj,
			    "Deferred frees", 0);
			if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
				dump_full_bpobj(&dp->dp_free_bpobj,
				    "Pool snapshot frees", 0);
			}
			if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
				ASSERT(spa_feature_is_enabled(spa,
				    SPA_FEATURE_DEVICE_REMOVAL));
				dump_full_bpobj(&dp->dp_obsolete_bpobj,
				    "Pool obsolete blocks", 0);
			}

			if (spa_feature_is_active(spa,
			    SPA_FEATURE_ASYNC_DESTROY)) {
				dump_bptree(spa->spa_meta_objset,
				    dp->dp_bptree_obj,
				    "Pool dataset frees");
			}
			dump_dtl(spa->spa_root_vdev, 0);
		}

		for (spa_feature_t f = 0; f < SPA_FEATURES; f++)
			global_feature_count[f] = UINT64_MAX;
		global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0;
		global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0;
		global_feature_count[SPA_FEATURE_LIVELIST] = 0;

		(void) dmu_objset_find(spa_name(spa), dump_one_objset,
		    NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);

		if (rc == 0 && !dump_opt['L'])
			rc = dump_mos_leaks(spa);

		for (f = 0; f < SPA_FEATURES; f++) {
			uint64_t refcount;

			uint64_t *arr;
			if (!(spa_feature_table[f].fi_flags &
			    ZFEATURE_FLAG_PER_DATASET)) {
				if (global_feature_count[f] == UINT64_MAX)
					continue;
				if (!spa_feature_is_enabled(spa, f)) {
					ASSERT0(global_feature_count[f]);
					continue;
				}
				arr = global_feature_count;
			} else {
				if (!spa_feature_is_enabled(spa, f)) {
					ASSERT0(dataset_feature_count[f]);
					continue;
				}
				arr = dataset_feature_count;
			}
			if (feature_get_refcount(spa, &spa_feature_table[f],
			    &refcount) == ENOTSUP)
				continue;
			if (arr[f] != refcount) {
				(void) printf("%s feature refcount mismatch: "
				    "%lld consumers != %lld refcount\n",
				    spa_feature_table[f].fi_uname,
				    (longlong_t)arr[f], (longlong_t)refcount);
				rc = 2;
			} else {
				(void) printf("Verified %s feature refcount "
				    "of %llu is correct\n",
				    spa_feature_table[f].fi_uname,
				    (longlong_t)refcount);
			}
		}

		if (rc == 0)
			rc = verify_device_removal_feature_counts(spa);
	}

	if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
		rc = dump_block_stats(spa);

	if (rc == 0)
		rc = verify_spacemap_refcounts(spa);

	if (dump_opt['s'])
		show_pool_stats(spa);

	if (dump_opt['h'])
		dump_history(spa);

	if (rc == 0)
		rc = verify_checkpoint(spa);

	if (rc != 0) {
		dump_debug_buffer();
		exit(rc);
	}
}

static char flagbitstr[16];

static void
zdb_print_blkptr(const blkptr_t *bp, int flags)
{
	char blkbuf[BP_SPRINTF_LEN];

	if (flags & ZDB_FLAG_BSWAP)
		byteswap_uint64_array((void *)bp, sizeof (blkptr_t));

	snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
	(void) printf("%s\n", blkbuf);
}

static void
zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)
{
	int i;

	for (i = 0; i < nbps; i++)
		zdb_print_blkptr(&bp[i], flags);
}

static void
zdb_dump_gbh(void *buf, int flags)
{
	zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags);
}

static void
zdb_dump_block_raw(void *buf, uint64_t size, int flags)
{
	if (flags & ZDB_FLAG_BSWAP)
		byteswap_uint64_array(buf, size);
	VERIFY(write(fileno(stdout), buf, size) == size);
}

static void
zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
{
	uint64_t *d = (uint64_t *)buf;
	unsigned nwords = size / sizeof (uint64_t);
	int do_bswap = !!(flags & ZDB_FLAG_BSWAP);
	unsigned i, j;
	const char *hdr;
	char *c;

	if (do_bswap)
		hdr = " 7 6 5 4 3 2 1 0   f e d c b a 9 8";
	else
		hdr = " 0 1 2 3 4 5 6 7   8 9 a b c d e f";

	(void) printf("\n%s\n%6s   %s  0123456789abcdef\n", label, "", hdr);

#ifdef _LITTLE_ENDIAN
	/* correct the endianness */
	do_bswap = !do_bswap;
#endif
	for (i = 0; i < nwords; i += 2) {
		(void) printf("%06llx:  %016llx  %016llx  ",
		    (u_longlong_t)(i * sizeof (uint64_t)),
		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]),
		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1]));

		c = (char *)&d[i];
		for (j = 0; j < 2 * sizeof (uint64_t); j++)
			(void) printf("%c", isprint(c[j]) ? c[j] : '.');
		(void) printf("\n");
	}
}

/*
 * There are two acceptable formats:
 *	leaf_name	- For example: c1t0d0 or /tmp/ztest.0a
 *	child[.child]*	- For example: 0.1.1
 *
 * The second form can be used to specify arbitrary vdevs anywhere
 * in the hierarchy.  For example, in a pool with a mirror of
 * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1.
 */
static vdev_t *
zdb_vdev_lookup(vdev_t *vdev, const char *path)
{
	char *s, *p, *q;
	unsigned i;

	if (vdev == NULL)
		return (NULL);

	/* First, assume the x.x.x.x format */
	i = strtoul(path, &s, 10);
	if (s == path || (s && *s != '.' && *s != '\0'))
		goto name;
	if (i >= vdev->vdev_children)
		return (NULL);

	vdev = vdev->vdev_child[i];
	if (s && *s == '\0')
		return (vdev);
	return (zdb_vdev_lookup(vdev, s+1));

name:
	for (i = 0; i < vdev->vdev_children; i++) {
		vdev_t *vc = vdev->vdev_child[i];

		if (vc->vdev_path == NULL) {
			vc = zdb_vdev_lookup(vc, path);
			if (vc == NULL)
				continue;
			else
				return (vc);
		}

		p = strrchr(vc->vdev_path, '/');
		p = p ? p + 1 : vc->vdev_path;
		q = &vc->vdev_path[strlen(vc->vdev_path) - 2];

		if (strcmp(vc->vdev_path, path) == 0)
			return (vc);
		if (strcmp(p, path) == 0)
			return (vc);
		if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0)
			return (vc);
	}

	return (NULL);
}

static int
name_from_objset_id(spa_t *spa, uint64_t objset_id, char *outstr)
{
	dsl_dataset_t *ds;

	dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
	int error = dsl_dataset_hold_obj(spa->spa_dsl_pool, objset_id,
	    NULL, &ds);
	if (error != 0) {
		(void) fprintf(stderr, "failed to hold objset %llu: %s\n",
		    (u_longlong_t)objset_id, strerror(error));
		dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
		return (error);
	}
	dsl_dataset_name(ds, outstr);
	dsl_dataset_rele(ds, NULL);
	dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
	return (0);
}

static boolean_t
zdb_parse_block_sizes(char *sizes, uint64_t *lsize, uint64_t *psize)
{
	char *s0, *s1, *tmp = NULL;

	if (sizes == NULL)
		return (B_FALSE);

	s0 = strtok_r(sizes, "/", &tmp);
	if (s0 == NULL)
		return (B_FALSE);
	s1 = strtok_r(NULL, "/", &tmp);
	*lsize = strtoull(s0, NULL, 16);
	*psize = s1 ? strtoull(s1, NULL, 16) : *lsize;
	return (*lsize >= *psize && *psize > 0);
}
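
/*
 * Illustrative use (hypothetical values): sizes are parsed as hex, either
 * "lsize/psize" or a single size for both, and the parse only succeeds
 * when lsize >= psize > 0.  Note that strtok_r() modifies its argument,
 * so the input must be writable:
 */
#if 0
	uint64_t lsize, psize;
	char sizes[] = "4000/200";	/* lsize 0x4000, psize 0x200 */
	VERIFY(zdb_parse_block_sizes(sizes, &lsize, &psize));
#endif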

#define	ZIO_COMPRESS_MASK(alg)	(1ULL << (ZIO_COMPRESS_##alg))

static boolean_t
zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize,
    uint64_t psize, int flags)
{
	(void) buf;
	boolean_t exceeded = B_FALSE;
	/*
	 * We don't know how the data was compressed, so just try
	 * every decompress function at every inflated blocksize.
	 */
	void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
	int cfuncs[ZIO_COMPRESS_FUNCTIONS] = { 0 };
	int *cfuncp = cfuncs;
	uint64_t maxlsize = SPA_MAXBLOCKSIZE;
	uint64_t mask = ZIO_COMPRESS_MASK(ON) | ZIO_COMPRESS_MASK(OFF) |
	    ZIO_COMPRESS_MASK(INHERIT) | ZIO_COMPRESS_MASK(EMPTY) |
	    (getenv("ZDB_NO_ZLE") ? ZIO_COMPRESS_MASK(ZLE) : 0);
	*cfuncp++ = ZIO_COMPRESS_LZ4;
	*cfuncp++ = ZIO_COMPRESS_LZJB;
	mask |= ZIO_COMPRESS_MASK(LZ4) | ZIO_COMPRESS_MASK(LZJB);
	for (int c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++)
		if (((1ULL << c) & mask) == 0)
			*cfuncp++ = c;

	/*
	 * On the one hand, with SPA_MAXBLOCKSIZE at 16MB, this
	 * could take a while and we should let the user know
	 * we are not stuck.  On the other hand, printing progress
	 * info gets old after a while.  The user can specify the
	 * 'v' flag to see the progress.
	 */
	if (lsize == psize)
		lsize += SPA_MINBLOCKSIZE;
	else
		maxlsize = lsize;
	for (; lsize <= maxlsize; lsize += SPA_MINBLOCKSIZE) {
		for (cfuncp = cfuncs; *cfuncp; cfuncp++) {
			if (flags & ZDB_FLAG_VERBOSE) {
				(void) fprintf(stderr,
				    "Trying %05llx -> %05llx (%s)\n",
				    (u_longlong_t)psize,
				    (u_longlong_t)lsize,
				    zio_compress_table[*cfuncp].ci_name);
			}

			/*
			 * We randomize lbuf2, and decompress to both
			 * lbuf and lbuf2. This way, we will know if
			 * decompression filled lsize exactly.
			 */
			VERIFY0(random_get_pseudo_bytes(lbuf2, lsize));

			if (zio_decompress_data(*cfuncp, pabd,
			    lbuf, psize, lsize, NULL) == 0 &&
			    zio_decompress_data(*cfuncp, pabd,
			    lbuf2, psize, lsize, NULL) == 0 &&
			    memcmp(lbuf, lbuf2, lsize) == 0)
				break;
		}
		if (*cfuncp != 0)
			break;
	}
	umem_free(lbuf2, SPA_MAXBLOCKSIZE);

	if (lsize > maxlsize) {
		exceeded = B_TRUE;
	}
	if (*cfuncp == ZIO_COMPRESS_ZLE) {
		printf("\nZLE decompression was selected. If you "
		    "suspect the results are wrong,\ntry avoiding ZLE "
		    "by setting and exporting ZDB_NO_ZLE=\"true\"\n");
	}

	return (exceeded);
}

/*
 * Read a block from a pool and print it out.  The syntax of the
 * block descriptor is:
 *
 *	pool:vdev_specifier:offset:[lsize/]psize[:flags]
 *
 *	pool           - The name of the pool you wish to read from
 *	vdev_specifier - Which vdev (see comment for zdb_vdev_lookup)
 *	offset         - offset, in hex, in bytes
 *	size           - Amount of data to read, in hex, in bytes
 *	flags          - A string of characters specifying options
 *		 b: Decode a blkptr at given offset within block
 *		 c: Calculate and display checksums
 *		 d: Decompress data before dumping
 *		 e: Byteswap data before dumping
 *		 g: Display data as a gang block header
 *		 i: Display as an indirect block
 *		 r: Dump raw data to stdout
 *		 v: Verbose
 */
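
/*
 * Example descriptors (hypothetical pool and offsets):
 *
 *	zdb -R tank:0:400000:200	dump 0x200 bytes from vdev 0 at
 *					offset 0x400000
 *	zdb -R tank:0:400000:4000/200:d	psize 0x200, lsize 0x4000, attempt
 *					decompression before dumping
 *	zdb -R tank:0.1:400000:200:r	raw dump from child vdev 0.1
 */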

static void
zdb_read_block(char *thing, spa_t *spa)
{
	blkptr_t blk, *bp = &blk;
	dva_t *dva = bp->blk_dva;
	int flags = 0;
	uint64_t offset = 0, psize = 0, lsize = 0, blkptr_offset = 0;
	zio_t *zio;
	vdev_t *vd;
	abd_t *pabd;
	void *lbuf, *buf;
	char *s, *p, *dup, *flagstr, *sizes, *tmp = NULL;
	const char *vdev, *errmsg = NULL;
	int i, error;
	boolean_t borrowed = B_FALSE, found = B_FALSE;

	dup = strdup(thing);
	s = strtok_r(dup, ":", &tmp);
	vdev = s ?: "";
	s = strtok_r(NULL, ":", &tmp);
	offset = strtoull(s ? s : "", NULL, 16);
	sizes = strtok_r(NULL, ":", &tmp);
	s = strtok_r(NULL, ":", &tmp);
	flagstr = strdup(s ?: "");

	if (!zdb_parse_block_sizes(sizes, &lsize, &psize))
		errmsg = "invalid size(s)";
	if (!IS_P2ALIGNED(psize, DEV_BSIZE) || !IS_P2ALIGNED(lsize, DEV_BSIZE))
		errmsg = "size must be a multiple of sector size";
	if (!IS_P2ALIGNED(offset, DEV_BSIZE))
		errmsg = "offset must be a multiple of sector size";
	if (errmsg) {
		(void) printf("Invalid block specifier: %s - %s\n",
		    thing, errmsg);
		goto done;
	}

	tmp = NULL;
	for (s = strtok_r(flagstr, ":", &tmp);
	    s != NULL;
	    s = strtok_r(NULL, ":", &tmp)) {
		for (i = 0; i < strlen(flagstr); i++) {
			int bit = flagbits[(uchar_t)flagstr[i]];

			if (bit == 0) {
				(void) printf("***Ignoring flag: %c\n",
				    (uchar_t)flagstr[i]);
				continue;
			}
			found = B_TRUE;
			flags |= bit;

			p = &flagstr[i + 1];
			if (*p != ':' && *p != '\0') {
				int j = 0, nextbit = flagbits[(uchar_t)*p];
				char *end, offstr[8] = { 0 };
				if ((bit == ZDB_FLAG_PRINT_BLKPTR) &&
				    (nextbit == 0)) {
					/* look ahead to isolate the offset */
					while (nextbit == 0 &&
					    strchr(flagbitstr, *p) == NULL) {
						offstr[j] = *p;
						j++;
						if (i + j > strlen(flagstr))
							break;
						p++;
						nextbit = flagbits[(uchar_t)*p];
					}
					blkptr_offset = strtoull(offstr, &end,
					    16);
					i += j;
				} else if (nextbit == 0) {
					(void) printf("***Ignoring flag arg:"
					    " '%c'\n", (uchar_t)*p);
				}
			}
		}
	}
	if (blkptr_offset % sizeof (blkptr_t)) {
		printf("Block pointer offset 0x%llx "
		    "must be divisible by 0x%x\n",
		    (longlong_t)blkptr_offset, (int)sizeof (blkptr_t));
		goto done;
	}
	if (found == B_FALSE && strlen(flagstr) > 0) {
		printf("Invalid flag arg: '%s'\n", flagstr);
		goto done;
	}

	vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);
	if (vd == NULL) {
		(void) printf("***Invalid vdev: %s\n", vdev);
		goto done;
	} else {
		if (vd->vdev_path)
			(void) fprintf(stderr, "Found vdev: %s\n",
			    vd->vdev_path);
		else
			(void) fprintf(stderr, "Found vdev type: %s\n",
			    vd->vdev_ops->vdev_op_type);
	}

	pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE);
	lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);

	BP_ZERO(bp);

	DVA_SET_VDEV(&dva[0], vd->vdev_id);
	DVA_SET_OFFSET(&dva[0], offset);
	DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
	DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));

	BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);

	BP_SET_LSIZE(bp, lsize);
	BP_SET_PSIZE(bp, psize);
	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
	BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
	BP_SET_TYPE(bp, DMU_OT_NONE);
	BP_SET_LEVEL(bp, 0);
	BP_SET_DEDUP(bp, 0);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	zio = zio_root(spa, NULL, NULL, 0);

	if (vd == vd->vdev_top) {
		/*
		 * Treat this as a normal block read.
		 */
		zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL,
		    ZIO_PRIORITY_SYNC_READ,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
	} else {
		/*
		 * Treat this as a vdev child I/O.
		 */
		zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd,
		    psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
		    ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW |
		    ZIO_FLAG_OPTIONAL, NULL, NULL));
	}

	error = zio_wait(zio);
	spa_config_exit(spa, SCL_STATE, FTAG);

	if (error) {
		(void) printf("Read of %s failed, error: %d\n", thing, error);
		goto out;
	}

	uint64_t orig_lsize = lsize;
	buf = lbuf;
	if (flags & ZDB_FLAG_DECOMPRESS) {
		boolean_t failed = zdb_decompress_block(pabd, buf, lbuf,
		    lsize, psize, flags);
		if (failed) {
			(void) printf("Decompress of %s failed\n", thing);
			goto out;
		}
	} else {
		buf = abd_borrow_buf_copy(pabd, lsize);
		borrowed = B_TRUE;
	}
	/*
	 * Try to detect an invalid block pointer.  If invalid, try
	 * decompressing.
	 */
	if ((flags & ZDB_FLAG_PRINT_BLKPTR || flags & ZDB_FLAG_INDIRECT) &&
	    !(flags & ZDB_FLAG_DECOMPRESS)) {
		const blkptr_t *b = (const blkptr_t *)(void *)
		    ((uintptr_t)buf + (uintptr_t)blkptr_offset);
		if (zfs_blkptr_verify(spa, b, B_FALSE, BLK_VERIFY_ONLY) ==
		    B_FALSE) {
			abd_return_buf_copy(pabd, buf, lsize);
			borrowed = B_FALSE;
			buf = lbuf;
			boolean_t failed = zdb_decompress_block(pabd, buf,
			    lbuf, lsize, psize, flags);
			b = (const blkptr_t *)(void *)
			    ((uintptr_t)buf + (uintptr_t)blkptr_offset);
			if (failed || zfs_blkptr_verify(spa, b, B_FALSE,
			    BLK_VERIFY_LOG) == B_FALSE) {
				printf("invalid block pointer at this DVA\n");
				goto out;
			}
		}
	}

	if (flags & ZDB_FLAG_PRINT_BLKPTR)
		zdb_print_blkptr((blkptr_t *)(void *)
		    ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
	else if (flags & ZDB_FLAG_RAW)
		zdb_dump_block_raw(buf, lsize, flags);
	else if (flags & ZDB_FLAG_INDIRECT)
		zdb_dump_indirect((blkptr_t *)buf,
		    orig_lsize / sizeof (blkptr_t), flags);
	else if (flags & ZDB_FLAG_GBH)
		zdb_dump_gbh(buf, flags);
	else
		zdb_dump_block(thing, buf, lsize, flags);

	/*
	 * If :c was specified, iterate through the checksum table to
	 * calculate and display each checksum for our specified
	 * DVA and length.
	 */
	if ((flags & ZDB_FLAG_CHECKSUM) && !(flags & ZDB_FLAG_RAW) &&
	    !(flags & ZDB_FLAG_GBH)) {
		zio_t *czio;
		(void) printf("\n");
		for (enum zio_checksum ck = ZIO_CHECKSUM_LABEL;
		    ck < ZIO_CHECKSUM_FUNCTIONS; ck++) {

			if ((zio_checksum_table[ck].ci_flags &
			    ZCHECKSUM_FLAG_EMBEDDED) ||
			    ck == ZIO_CHECKSUM_NOPARITY) {
				continue;
			}
			BP_SET_CHECKSUM(bp, ck);
			spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
			czio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
			czio->io_bp = bp;

			if (vd == vd->vdev_top) {
				zio_nowait(zio_read(czio, spa, bp, pabd, psize,
				    NULL, NULL,
				    ZIO_PRIORITY_SYNC_READ,
				    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW |
				    ZIO_FLAG_DONT_RETRY, NULL));
			} else {
				zio_nowait(zio_vdev_child_io(czio, bp, vd,
				    offset, pabd, psize, ZIO_TYPE_READ,
				    ZIO_PRIORITY_SYNC_READ,
				    ZIO_FLAG_DONT_CACHE |
				    ZIO_FLAG_DONT_PROPAGATE |
				    ZIO_FLAG_DONT_RETRY |
				    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW |
				    ZIO_FLAG_SPECULATIVE |
				    ZIO_FLAG_OPTIONAL, NULL, NULL));
			}
			error = zio_wait(czio);
			if (error == 0 || error == ECKSUM) {
				zio_t *ck_zio = zio_root(spa, NULL, NULL, 0);
				ck_zio->io_offset =
				    DVA_GET_OFFSET(&bp->blk_dva[0]);
				ck_zio->io_bp = bp;
				zio_checksum_compute(ck_zio, ck, pabd, lsize);
				printf(
				    "%12s\t"
				    "cksum=%016llx:%016llx:%016llx:%016llx\n",
				    zio_checksum_table[ck].ci_name,
				    (u_longlong_t)bp->blk_cksum.zc_word[0],
				    (u_longlong_t)bp->blk_cksum.zc_word[1],
				    (u_longlong_t)bp->blk_cksum.zc_word[2],
				    (u_longlong_t)bp->blk_cksum.zc_word[3]);
				zio_wait(ck_zio);
			} else {
				printf("error %d reading block\n", error);
			}
			spa_config_exit(spa, SCL_STATE, FTAG);
		}
	}

	if (borrowed)
		abd_return_buf_copy(pabd, buf, lsize);

out:
	abd_free(pabd);
	umem_free(lbuf, SPA_MAXBLOCKSIZE);
done:
	free(flagstr);
	free(dup);
}

static void
zdb_embedded_block(char *thing)
{
	blkptr_t bp = {{{{0}}}};
	unsigned long long *words = (void *)&bp;
	char *buf;
	int err;

	err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:"
	    "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx",
	    words + 0, words + 1, words + 2, words + 3,
	    words + 4, words + 5, words + 6, words + 7,
	    words + 8, words + 9, words + 10, words + 11,
	    words + 12, words + 13, words + 14, words + 15);
	if (err != 16) {
		(void) fprintf(stderr, "invalid input format\n");
		exit(1);
	}
	ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE);
	buf = malloc(SPA_MAXBLOCKSIZE);
	if (buf == NULL) {
		(void) fprintf(stderr, "out of memory\n");
		exit(1);
	}
	err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp));
	if (err != 0) {
		(void) fprintf(stderr, "decode failed: %u\n", err);
		exit(1);
	}
	zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0);
	free(buf);
}
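
/*
 * Usage sketch: -E expects the embedded block pointer spelled out as 16
 * colon-separated hex words (words 0 through 15 of the blkptr), e.g.
 *
 *	zdb -E 0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0
 *
 * (the all-zero bp above only illustrates the format; a real argument is
 * usually pasted from output that prints a blkptr word-by-word).
 */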

/* check for valid hex or decimal numeric string */
static boolean_t
zdb_numeric(char *str)
{
	int i = 0;

	if (strlen(str) == 0)
		return (B_FALSE);
	if (strncmp(str, "0x", 2) == 0 || strncmp(str, "0X", 2) == 0)
		i = 2;
	for (; i < strlen(str); i++) {
		if (!isxdigit(str[i]))
			return (B_FALSE);
	}
	return (B_TRUE);
}
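
/*
 * Illustrative results (not compiled).  Note that bare hex digits are
 * accepted even without a 0x prefix, which is why an all-hex-digit name
 * is ambiguous and -N exists to force the objset-ID interpretation:
 */
#if 0
	ASSERT(zdb_numeric("123"));	/* decimal */
	ASSERT(zdb_numeric("0x1f"));	/* prefixed hex */
	ASSERT(zdb_numeric("ff"));	/* bare hex digits also accepted */
	ASSERT(!zdb_numeric("tank"));	/* contains non-hex characters */
	ASSERT(!zdb_numeric(""));	/* empty string is rejected */
#endif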

int
main(int argc, char **argv)
{
	int c;
	spa_t *spa = NULL;
	objset_t *os = NULL;
	int dump_all = 1;
	int verbose = 0;
	int error = 0;
	char **searchdirs = NULL;
	int nsearch = 0;
	char *target, *target_pool, dsname[ZFS_MAX_DATASET_NAME_LEN];
	nvlist_t *policy = NULL;
	uint64_t max_txg = UINT64_MAX;
	int64_t objset_id = -1;
	uint64_t object;
	int flags = ZFS_IMPORT_MISSING_LOG;
	int rewind = ZPOOL_NEVER_REWIND;
	char *spa_config_path_env, *objset_str;
	boolean_t target_is_spa = B_TRUE, dataset_lookup = B_FALSE;
	nvlist_t *cfg = NULL;

	dprintf_setup(&argc, argv);

	/*
	 * If there is an environment variable SPA_CONFIG_PATH it overrides
	 * the default spa_config_path setting. If the -U flag is specified,
	 * it overrides this environment variable setting once again.
	 */
	spa_config_path_env = getenv("SPA_CONFIG_PATH");
	if (spa_config_path_env != NULL)
		spa_config_path = spa_config_path_env;

	/*
	 * For performance reasons, we set this tunable down. We do so before
	 * the arg parsing section so that the user can override this value if
	 * they choose.
	 */
	zfs_btree_verify_intensity = 3;

	struct option long_options[] = {
		{"ignore-assertions",	no_argument,		NULL, 'A'},
		{"block-stats",		no_argument,		NULL, 'b'},
		{"checksum",		no_argument,		NULL, 'c'},
		{"config",		no_argument,		NULL, 'C'},
		{"datasets",		no_argument,		NULL, 'd'},
		{"dedup-stats",		no_argument,		NULL, 'D'},
		{"exported",		no_argument,		NULL, 'e'},
		{"embedded-block-pointer",	no_argument,	NULL, 'E'},
		{"automatic-rewind",	no_argument,		NULL, 'F'},
		{"dump-debug-msg",	no_argument,		NULL, 'G'},
		{"history",		no_argument,		NULL, 'h'},
		{"intent-logs",		no_argument,		NULL, 'i'},
		{"inflight",		required_argument,	NULL, 'I'},
		{"checkpointed-state",	no_argument,		NULL, 'k'},
		{"key",			required_argument,	NULL, 'K'},
		{"label",		no_argument,		NULL, 'l'},
		{"disable-leak-tracking",	no_argument,	NULL, 'L'},
		{"metaslabs",		no_argument,		NULL, 'm'},
		{"metaslab-groups",	no_argument,		NULL, 'M'},
		{"numeric",		no_argument,		NULL, 'N'},
		{"option",		required_argument,	NULL, 'o'},
		{"object-lookups",	no_argument,		NULL, 'O'},
		{"path",		required_argument,	NULL, 'p'},
		{"parseable",		no_argument,		NULL, 'P'},
		{"skip-label",		no_argument,		NULL, 'q'},
		{"copy-object",		no_argument,		NULL, 'r'},
		{"read-block",		no_argument,		NULL, 'R'},
		{"io-stats",		no_argument,		NULL, 's'},
		{"simulate-dedup",	no_argument,		NULL, 'S'},
		{"txg",			required_argument,	NULL, 't'},
		{"uberblock",		no_argument,		NULL, 'u'},
		{"cachefile",		required_argument,	NULL, 'U'},
		{"verbose",		no_argument,		NULL, 'v'},
		{"verbatim",		no_argument,		NULL, 'V'},
		{"dump-blocks",		required_argument,	NULL, 'x'},
		{"extreme-rewind",	no_argument,		NULL, 'X'},
		{"all-reconstruction",	no_argument,		NULL, 'Y'},
		{"livelist",		no_argument,		NULL, 'y'},
		{"zstd-headers",	no_argument,		NULL, 'Z'},
		{0, 0, 0, 0}
	};

	while ((c = getopt_long(argc, argv,
	    "AbcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:uU:vVx:XYyZ",
	    long_options, NULL)) != -1) {
		switch (c) {
		case 'b':
		case 'c':
		case 'C':
		case 'd':
		case 'D':
		case 'E':
		case 'G':
		case 'h':
		case 'i':
		case 'l':
		case 'm':
		case 'M':
		case 'N':
		case 'O':
		case 'r':
		case 'R':
		case 's':
		case 'S':
		case 'u':
		case 'y':
		case 'Z':
			dump_opt[c]++;
			dump_all = 0;
			break;
		case 'A':
		case 'e':
		case 'F':
		case 'k':
		case 'L':
		case 'P':
		case 'q':
		case 'X':
			dump_opt[c]++;
			break;
		case 'Y':
			zfs_reconstruct_indirect_combinations_max = INT_MAX;
			zfs_deadman_enabled = 0;
			break;
		/* NB: Sort single match options below. */
		case 'I':
			max_inflight_bytes = strtoull(optarg, NULL, 0);
			if (max_inflight_bytes == 0) {
				(void) fprintf(stderr, "maximum number "
				    "of inflight bytes must be greater "
				    "than 0\n");
				usage();
			}
			break;
		case 'K':
			dump_opt[c]++;
			key_material = strdup(optarg);
			/* redact key material in process table */
			while (*optarg != '\0') { *optarg++ = '*'; }
			break;
		case 'o':
			error = set_global_var(optarg);
			if (error != 0)
				usage();
			break;
		case 'p':
			if (searchdirs == NULL) {
				searchdirs = umem_alloc(sizeof (char *),
				    UMEM_NOFAIL);
			} else {
				char **tmp = umem_alloc((nsearch + 1) *
				    sizeof (char *), UMEM_NOFAIL);
				memcpy(tmp, searchdirs, nsearch *
				    sizeof (char *));
				umem_free(searchdirs,
				    nsearch * sizeof (char *));
				searchdirs = tmp;
			}
			searchdirs[nsearch++] = optarg;
			break;
		case 't':
			max_txg = strtoull(optarg, NULL, 0);
			if (max_txg < TXG_INITIAL) {
				(void) fprintf(stderr, "incorrect txg "
				    "specified: %s\n", optarg);
				usage();
			}
			break;
		case 'U':
			spa_config_path = optarg;
			if (spa_config_path[0] != '/') {
				(void) fprintf(stderr,
				    "cachefile must be an absolute path "
				    "(i.e. start with a slash)\n");
				usage();
			}
			break;
		case 'v':
			verbose++;
			break;
		case 'V':
			flags = ZFS_IMPORT_VERBATIM;
			break;
		case 'x':
			vn_dumpdir = optarg;
			break;
		default:
			usage();
			break;
		}
	}
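
	/*
	 * Usage sketch (hypothetical paths): repeatable options accumulate,
	 * e.g.
	 *
	 *	zdb -e -p /var/tmp -p /dev tank
	 *
	 * imports "tank" by scanning the two -p search directories instead
	 * of relying on the cachefile.
	 */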

	if (!dump_opt['e'] && searchdirs != NULL) {
		(void) fprintf(stderr, "-p option requires use of -e\n");
		usage();
	}
#if defined(_LP64)
	/*
	 * ZDB does not typically re-read blocks; therefore limit the ARC
	 * to 256 MB, which can be used entirely for metadata.
	 */
	zfs_arc_min = 2ULL << SPA_MAXBLOCKSHIFT;
	zfs_arc_max = 256 * 1024 * 1024;
#endif

	/*
	 * "zdb -c" uses checksum-verifying scrub i/os which are async reads.
	 * "zdb -b" uses traversal prefetch which uses async reads.
	 * For good performance, let several of them be active at once.
	 */
	zfs_vdev_async_read_max_active = 10;

	/*
	 * Disable reference tracking for better performance.
	 */
	reference_tracking_enable = B_FALSE;

	/*
	 * Do not fail spa_load when spa_load_verify fails. This is needed
	 * to load non-idle pools.
	 */
	spa_load_verify_dryrun = B_TRUE;

	/*
	 * ZDB should have the ability to read spacemaps.
	 */
	spa_mode_readable_spacemaps = B_TRUE;

	kernel_init(SPA_MODE_READ);

	if (dump_all)
		verbose = MAX(verbose, 1);

	for (c = 0; c < 256; c++) {
		if (dump_all && strchr("AeEFkKlLNOPrRSXy", c) == NULL)
			dump_opt[c] = 1;
		if (dump_opt[c])
			dump_opt[c] += verbose;
	}

	libspl_set_assert_ok((dump_opt['A'] == 1) || (dump_opt['A'] > 2));
	zfs_recover = (dump_opt['A'] > 1);

	argc -= optind;
	argv += optind;
	if (argc < 2 && dump_opt['R'])
		usage();

	if (dump_opt['E']) {
		if (argc != 1)
			usage();
		zdb_embedded_block(argv[0]);
		return (0);
	}

	if (argc < 1) {
		if (!dump_opt['e'] && dump_opt['C']) {
			dump_cachefile(spa_config_path);
			return (0);
		}
		usage();
	}

	if (dump_opt['l'])
		return (dump_label(argv[0]));

	if (dump_opt['O']) {
		if (argc != 2)
			usage();
		dump_opt['v'] = verbose + 3;
		return (dump_path(argv[0], argv[1], NULL));
	}
	if (dump_opt['r']) {
		target_is_spa = B_FALSE;
		if (argc != 3)
			usage();
		dump_opt['v'] = verbose;
		error = dump_path(argv[0], argv[1], &object);
		if (error != 0)
			fatal("internal error: %s", strerror(error));
	}

	if (dump_opt['X'] || dump_opt['F'])
		rewind = ZPOOL_DO_REWIND |
		    (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);

	/* -N implies -d */
	if (dump_opt['N'] && dump_opt['d'] == 0)
		dump_opt['d'] = dump_opt['N'];

	if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 ||
	    nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, max_txg) != 0 ||
	    nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind) != 0)
		fatal("internal error: %s", strerror(ENOMEM));

	error = 0;
	target = argv[0];

	if (strpbrk(target, "/@") != NULL) {
		size_t targetlen;

		target_pool = strdup(target);
		*strpbrk(target_pool, "/@") = '\0';

		target_is_spa = B_FALSE;
		targetlen = strlen(target);
		if (targetlen && target[targetlen - 1] == '/')
			target[targetlen - 1] = '\0';

		/*
		 * See if an objset ID was supplied (-d <pool>/<objset ID>).
		 * To disambiguate tank/100, consider 100 to be an objset ID
		 * if -N was given; otherwise 100 is treated as an objset ID
		 * only if looking up tank/100 as a named dataset fails.
		 */
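		/*
		 * Worked example (hypothetical pool): for "zdb -d tank/100",
		 * tank/100 is first tried as a dataset name, and 100 is only
		 * retried as an objset ID if that lookup fails; with
		 * "zdb -N tank/100", 100 must be an objset ID.
		 */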
		objset_str = strchr(target, '/');
		if (objset_str && strlen(objset_str) > 1 &&
		    zdb_numeric(objset_str + 1)) {
			char *endptr;
			errno = 0;
			objset_str++;
			objset_id = strtoull(objset_str, &endptr, 0);
			/* dataset 0 is the same as opening the pool */
			if (errno == 0 && endptr != objset_str &&
			    objset_id != 0) {
				if (dump_opt['N'])
					dataset_lookup = B_TRUE;
			}
			/* normal dataset name not an objset ID */
			if (endptr == objset_str) {
				objset_id = -1;
			}
		} else if (objset_str && !zdb_numeric(objset_str + 1) &&
		    dump_opt['N']) {
			printf("Supply a numeric objset ID with -N\n");
			exit(1);
		}
	} else {
		target_pool = target;
	}

	if (dump_opt['e']) {
		importargs_t args = { 0 };

		args.paths = nsearch;
		args.path = searchdirs;
		args.can_be_active = B_TRUE;

		libpc_handle_t lpch = {
			.lpc_lib_handle = NULL,
			.lpc_ops = &libzpool_config_ops,
			.lpc_printerr = B_TRUE
		};
		error = zpool_find_config(&lpch, target_pool, &cfg, &args);

		if (error == 0) {
			if (nvlist_add_nvlist(cfg,
			    ZPOOL_LOAD_POLICY, policy) != 0) {
				fatal("can't open '%s': %s",
				    target, strerror(ENOMEM));
			}

			if (dump_opt['C'] > 1) {
				(void) printf("\nConfiguration for import:\n");
				dump_nvlist(cfg, 8);
			}

			/*
			 * Disable the activity check to allow examination of
			 * active pools.
			 */
			error = spa_import(target_pool, cfg, NULL,
			    flags | ZFS_IMPORT_SKIP_MMP);
		}
	}

	if (searchdirs != NULL) {
		umem_free(searchdirs, nsearch * sizeof (char *));
		searchdirs = NULL;
	}

	/*
	 * import_checkpointed_state makes the assumption that the
	 * target pool that we pass it is already part of the spa
	 * namespace. Because of that we need to make sure to always
	 * call it after the -e option has been processed, which
	 * imports the pool to the namespace if it's not in the
	 * cachefile.
	 */
	char *checkpoint_pool = NULL;
	char *checkpoint_target = NULL;
	if (dump_opt['k']) {
		checkpoint_pool = import_checkpointed_state(target, cfg,
		    &checkpoint_target);

		if (checkpoint_target != NULL)
			target = checkpoint_target;
	}

	if (cfg != NULL) {
		nvlist_free(cfg);
		cfg = NULL;
	}

	if (target_pool != target)
		free(target_pool);

	if (error == 0) {
		if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) {
			ASSERT(checkpoint_pool != NULL);
			ASSERT(checkpoint_target == NULL);

			error = spa_open(checkpoint_pool, &spa, FTAG);
			if (error != 0) {
				fatal("Tried to open pool \"%s\" but "
				    "spa_open() failed with error %d\n",
				    checkpoint_pool, error);
			}

		} else if (target_is_spa || dump_opt['R'] || objset_id == 0) {
			zdb_set_skip_mmp(target);
			error = spa_open_rewind(target, &spa, FTAG, policy,
			    NULL);
			if (error) {
				/*
				 * If we're missing the log device then
				 * try opening the pool after clearing the
				 * log state.
				 */
				mutex_enter(&spa_namespace_lock);
				if ((spa = spa_lookup(target)) != NULL &&
				    spa->spa_log_state == SPA_LOG_MISSING) {
					spa->spa_log_state = SPA_LOG_CLEAR;
					error = 0;
				}
				mutex_exit(&spa_namespace_lock);

				if (!error) {
					error = spa_open_rewind(target, &spa,
					    FTAG, policy, NULL);
				}
			}
		} else if (strpbrk(target, "#") != NULL) {
			dsl_pool_t *dp;
			error = dsl_pool_hold(target, FTAG, &dp);
			if (error != 0) {
				fatal("can't dump '%s': %s", target,
				    strerror(error));
			}
			error = dump_bookmark(dp, target, B_TRUE, verbose > 1);
			dsl_pool_rele(dp, FTAG);
			if (error != 0) {
				fatal("can't dump '%s': %s", target,
				    strerror(error));
			}
			return (error);
		} else {
			target_pool = strdup(target);
			if (strpbrk(target, "/@") != NULL)
				*strpbrk(target_pool, "/@") = '\0';

			zdb_set_skip_mmp(target);
			/*
			 * If -N was supplied, the user has indicated that
			 * zdb -d <pool>/<objsetID> is in effect.  Otherwise
			 * we first assume that the dataset string is the
			 * dataset name.  If dmu_objset_hold fails with the
			 * dataset string, and we have an objset_id, retry the
			 * lookup with the objsetID.
			 */
			boolean_t retry = B_TRUE;
retry_lookup:
			if (dataset_lookup == B_TRUE) {
				/*
				 * Use the supplied id to get the name
				 * for open_objset.
				 */
				error = spa_open(target_pool, &spa, FTAG);
				if (error == 0) {
					error = name_from_objset_id(spa,
					    objset_id, dsname);
					spa_close(spa, FTAG);
					if (error == 0)
						target = dsname;
				}
			}
			if (error == 0) {
				if (objset_id > 0 && retry) {
					int err = dmu_objset_hold(target, FTAG,
					    &os);
					if (err) {
						dataset_lookup = B_TRUE;
						retry = B_FALSE;
						goto retry_lookup;
					} else {
						dmu_objset_rele(os, FTAG);
					}
				}
				error = open_objset(target, FTAG, &os);
			}
			if (error == 0)
				spa = dmu_objset_spa(os);
			free(target_pool);
		}
	}
	nvlist_free(policy);

	if (error)
		fatal("can't open '%s': %s", target, strerror(error));

	/*
	 * Set the pool failure mode to panic in order to prevent the pool
	 * from suspending. A suspended I/O will have no way to resume and
	 * can prevent the zdb(8) command from terminating as expected.
	 */
	if (spa != NULL)
		spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;

	argv++;
	argc--;
	if (dump_opt['r']) {
		error = zdb_copy_object(os, object, argv[1]);
	} else if (!dump_opt['R']) {
		flagbits['d'] = ZOR_FLAG_DIRECTORY;
		flagbits['f'] = ZOR_FLAG_PLAIN_FILE;
		flagbits['m'] = ZOR_FLAG_SPACE_MAP;
		flagbits['z'] = ZOR_FLAG_ZAP;
		flagbits['A'] = ZOR_FLAG_ALL_TYPES;

		if (argc > 0 && dump_opt['d']) {
			zopt_object_args = argc;
			zopt_object_ranges = calloc(zopt_object_args,
			    sizeof (zopt_object_range_t));
			for (unsigned i = 0; i < zopt_object_args; i++) {
				int err;
				const char *msg = NULL;

				err = parse_object_range(argv[i],
				    &zopt_object_ranges[i], &msg);
				if (err != 0)
					fatal("Bad object or range: '%s': %s\n",
					    argv[i], msg ?: "");
			}
		} else if (argc > 0 && dump_opt['m']) {
			zopt_metaslab_args = argc;
			zopt_metaslab = calloc(zopt_metaslab_args,
			    sizeof (uint64_t));
			for (unsigned i = 0; i < zopt_metaslab_args; i++) {
				errno = 0;
				zopt_metaslab[i] = strtoull(argv[i], NULL, 0);
				if (zopt_metaslab[i] == 0 && errno != 0)
					fatal("bad number %s: %s", argv[i],
					    strerror(errno));
			}
		}
		if (os != NULL) {
			dump_objset(os);
		} else if (zopt_object_args > 0 && !dump_opt['m']) {
			dump_objset(spa->spa_meta_objset);
		} else {
			dump_zpool(spa);
		}
	} else {
		flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
		flagbits['c'] = ZDB_FLAG_CHECKSUM;
		flagbits['d'] = ZDB_FLAG_DECOMPRESS;
		flagbits['e'] = ZDB_FLAG_BSWAP;
		flagbits['g'] = ZDB_FLAG_GBH;
		flagbits['i'] = ZDB_FLAG_INDIRECT;
		flagbits['r'] = ZDB_FLAG_RAW;
		flagbits['v'] = ZDB_FLAG_VERBOSE;

		for (int i = 0; i < argc; i++)
			zdb_read_block(argv[i], spa);
	}

	if (dump_opt['k']) {
		free(checkpoint_pool);
		if (!target_is_spa)
			free(checkpoint_target);
	}

	if (os != NULL) {
		close_objset(os, FTAG);
	} else {
		spa_close(spa, FTAG);
	}

	fuid_table_destroy();

	dump_debug_buffer();

	kernel_fini();

	return (error);
}