/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2016 Nexenta Systems, Inc.
 * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC.
 * Copyright (c) 2015, 2017, Intel Corporation.
 * Copyright (c) 2020 Datto Inc.
 * Copyright (c) 2020, The FreeBSD Foundation [1]
 *
 * [1] Portions of this software were developed by Allan Jude
 *     under sponsorship from the FreeBSD Foundation.
 * Copyright (c) 2021 Allan Jude
 * Copyright (c) 2021 Toomas Soome <tsoome@me.com>
 */

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <ctype.h>
#include <getopt.h>
#include <openssl/evp.h>
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_sa.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_bookmark.h>
#include <sys/dbuf.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/stat.h>
#include <sys/resource.h>
#include <sys/dmu_send.h>
#include <sys/dmu_traverse.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/zfs_fuid.h>
#include <sys/arc.h>
#include <sys/arc_impl.h>
#include <sys/ddt.h>
#include <sys/zfeature.h>
#include <sys/abd.h>
#include <sys/blkptr.h>
#include <sys/dsl_crypt.h>
#include <sys/dsl_scan.h>
#include <sys/btree.h>
#include <zfs_comutil.h>
#include <sys/zstd/zstd.h>

#include <libnvpair.h>
#include <libzutil.h>

#include "zdb.h"

#define	ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ?	\
	zio_compress_table[(idx)].ci_name : "UNKNOWN")
#define	ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ?	\
	zio_checksum_table[(idx)].ci_name : "UNKNOWN")
#define	ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) :		\
	(idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA ?	\
	DMU_OT_ZAP_OTHER :						\
	(idx) == DMU_OTN_UINT64_DATA || (idx) == DMU_OTN_UINT64_METADATA ? \
	DMU_OT_UINT64_OTHER : DMU_OT_NUMTYPES)

/* Some platforms require part of inode IDs to be remapped */
#ifdef __APPLE__
#define	ZDB_MAP_OBJECT_ID(obj) INO_XNUTOZFS(obj, 2)
#else
#define	ZDB_MAP_OBJECT_ID(obj) (obj)
#endif

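/*
 * Return a human-readable name for a DMU object type, falling back to
 * the byteswap function name for DMU_OT_NEWTYPE-encoded types and to
 * "UNKNOWN" for anything else.
 */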
static const char *
zdb_ot_name(dmu_object_type_t type)
{
	if (type < DMU_OT_NUMTYPES)
		return (dmu_ot[type].ot_name);
	else if ((type & DMU_OT_NEWTYPE) &&
	    ((type & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS))
		return (dmu_ot_byteswap[type & DMU_OT_BYTESWAP_MASK].ob_name);
	else
		return ("UNKNOWN");
}

extern int reference_tracking_enable;
extern int zfs_recover;
extern uint_t zfs_vdev_async_read_max_active;
extern boolean_t spa_load_verify_dryrun;
extern boolean_t spa_mode_readable_spacemaps;
extern uint_t zfs_reconstruct_indirect_combinations_max;
extern uint_t zfs_btree_verify_intensity;

static const char cmdname[] = "zdb";
uint8_t dump_opt[256];

typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);

static uint64_t *zopt_metaslab = NULL;
static unsigned zopt_metaslab_args = 0;

typedef struct zopt_object_range {
	uint64_t zor_obj_start;
	uint64_t zor_obj_end;
	uint64_t zor_flags;
} zopt_object_range_t;

static zopt_object_range_t *zopt_object_ranges = NULL;
static unsigned zopt_object_args = 0;

static int flagbits[256];

#define	ZOR_FLAG_PLAIN_FILE	0x0001
#define	ZOR_FLAG_DIRECTORY	0x0002
#define	ZOR_FLAG_SPACE_MAP	0x0004
#define	ZOR_FLAG_ZAP		0x0008
#define	ZOR_FLAG_ALL_TYPES	-1
#define	ZOR_SUPPORTED_FLAGS	(ZOR_FLAG_PLAIN_FILE	| \
				ZOR_FLAG_DIRECTORY	| \
				ZOR_FLAG_SPACE_MAP	| \
				ZOR_FLAG_ZAP)

#define	ZDB_FLAG_CHECKSUM	0x0001
#define	ZDB_FLAG_DECOMPRESS	0x0002
#define	ZDB_FLAG_BSWAP		0x0004
#define	ZDB_FLAG_GBH		0x0008
#define	ZDB_FLAG_INDIRECT	0x0010
#define	ZDB_FLAG_RAW		0x0020
#define	ZDB_FLAG_PRINT_BLKPTR	0x0040
#define	ZDB_FLAG_VERBOSE	0x0080

static uint64_t max_inflight_bytes = 256 * 1024 * 1024; /* 256MB */
static int leaked_objects = 0;
static range_tree_t *mos_refd_objs;

static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *,
    boolean_t);
static void mos_obj_refd(uint64_t);
static void mos_obj_refd_multiple(uint64_t);
static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free,
    dmu_tx_t *tx);

typedef struct sublivelist_verify {
	/* FREE's that haven't yet matched to an ALLOC, in one sub-livelist */
	zfs_btree_t sv_pair;

	/* ALLOC's without a matching FREE, accumulates across sub-livelists */
	zfs_btree_t sv_leftover;
} sublivelist_verify_t;

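/*
 * Comparator for livelist block pointer entries: sort by DVA[0] vdev,
 * then offset, then birth txg.
 */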
static int
livelist_compare(const void *larg, const void *rarg)
{
	const blkptr_t *l = larg;
	const blkptr_t *r = rarg;

	/* Sort them according to dva[0] */
	uint64_t l_dva0_vdev, r_dva0_vdev;
	l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]);
	r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]);
	if (l_dva0_vdev < r_dva0_vdev)
		return (-1);
	else if (l_dva0_vdev > r_dva0_vdev)
		return (+1);

	/* if vdevs are equal, sort by offsets. */
	uint64_t l_dva0_offset;
	uint64_t r_dva0_offset;
	l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]);
	r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]);
	if (l_dva0_offset < r_dva0_offset) {
		return (-1);
	} else if (l_dva0_offset > r_dva0_offset) {
		return (+1);
	}

	/*
	 * Since we're storing blkptrs without cancelling FREE/ALLOC pairs,
	 * it's possible the offsets are equal. In that case, sort by txg
	 */
	if (l->blk_birth < r->blk_birth) {
		return (-1);
	} else if (l->blk_birth > r->blk_birth) {
		return (+1);
	}
	return (0);
}

typedef struct sublivelist_verify_block {
	dva_t svb_dva;

	/*
	 * We need this to check if the block marked as allocated
	 * in the livelist was freed (and potentially reallocated)
	 * in the metaslab spacemaps at a later TXG.
	 */
	uint64_t svb_allocated_txg;
} sublivelist_verify_block_t;

static void zdb_print_blkptr(const blkptr_t *bp, int flags);

typedef struct sublivelist_verify_block_refcnt {
	/* block pointer entry in livelist being verified */
	blkptr_t svbr_blk;

	/*
	 * Refcount gets incremented to 1 when we encounter the first
	 * FREE entry for the svfbr block pointer and a node for it
	 * is created in our ZDB verification/tracking metadata.
	 *
	 * As we encounter more FREE entries we increment this counter
	 * and similarly decrement it whenever we find the respective
	 * ALLOC entries for this block.
	 *
	 * When the refcount gets to 0 it means that all the FREE and
	 * ALLOC entries of this block have paired up and we no longer
	 * need to track it in our verification logic (e.g. the node
	 * containing this struct in our verification data structure
	 * should be freed).
	 *
	 * [refer to sublivelist_verify_blkptr() for the actual code]
	 */
	uint32_t svbr_refcnt;
} sublivelist_verify_block_refcnt_t;

static int
sublivelist_block_refcnt_compare(const void *larg, const void *rarg)
{
	const sublivelist_verify_block_refcnt_t *l = larg;
	const sublivelist_verify_block_refcnt_t *r = rarg;
	return (livelist_compare(&l->svbr_blk, &r->svbr_blk));
}

static int
sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,
    dmu_tx_t *tx)
{
	ASSERT3P(tx, ==, NULL);
	struct sublivelist_verify *sv = arg;
	sublivelist_verify_block_refcnt_t current = {
		.svbr_blk = *bp,

		/*
		 * Start with 1 in case this is the first free entry.
		 * This field is not used for our B-Tree comparisons
		 * anyway.
		 */
		.svbr_refcnt = 1,
	};

	zfs_btree_index_t where;
	sublivelist_verify_block_refcnt_t *pair =
	    zfs_btree_find(&sv->sv_pair, &current, &where);
	if (free) {
		if (pair == NULL) {
			/* first free entry for this block pointer */
			zfs_btree_add(&sv->sv_pair, &current);
		} else {
			pair->svbr_refcnt++;
		}
	} else {
		if (pair == NULL) {
			/* block that is currently marked as allocated */
			for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
				if (DVA_IS_EMPTY(&bp->blk_dva[i]))
					break;
				sublivelist_verify_block_t svb = {
				    .svb_dva = bp->blk_dva[i],
				    .svb_allocated_txg = bp->blk_birth
				};

				if (zfs_btree_find(&sv->sv_leftover, &svb,
				    &where) == NULL) {
					zfs_btree_add_idx(&sv->sv_leftover,
					    &svb, &where);
				}
			}
		} else {
			/* alloc matches a free entry */
			pair->svbr_refcnt--;
			if (pair->svbr_refcnt == 0) {
				/* all allocs and frees have been matched */
				zfs_btree_remove_idx(&sv->sv_pair, &where);
			}
		}
	}

	return (0);
}

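/*
 * Verify a single sub-livelist: pair up the FREE and ALLOC entries of
 * its bpobj and report any FREEs that never matched an ALLOC.
 */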
static int
sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle)
{
	int err;
	struct sublivelist_verify *sv = args;

	zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare,
	    sizeof (sublivelist_verify_block_refcnt_t));

	err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr,
	    sv, NULL);

	sublivelist_verify_block_refcnt_t *e;
	zfs_btree_index_t *cookie = NULL;
	while ((e = zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) {
		char blkbuf[BP_SPRINTF_LEN];
		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf),
		    &e->svbr_blk, B_TRUE);
		(void) printf("\tERROR: %d unmatched FREE(s): %s\n",
		    e->svbr_refcnt, blkbuf);
	}
	zfs_btree_destroy(&sv->sv_pair);

	return (err);
}

static int
livelist_block_compare(const void *larg, const void *rarg)
{
	const sublivelist_verify_block_t *l = larg;
	const sublivelist_verify_block_t *r = rarg;

	if (DVA_GET_VDEV(&l->svb_dva) < DVA_GET_VDEV(&r->svb_dva))
		return (-1);
	else if (DVA_GET_VDEV(&l->svb_dva) > DVA_GET_VDEV(&r->svb_dva))
		return (+1);

	if (DVA_GET_OFFSET(&l->svb_dva) < DVA_GET_OFFSET(&r->svb_dva))
		return (-1);
	else if (DVA_GET_OFFSET(&l->svb_dva) > DVA_GET_OFFSET(&r->svb_dva))
		return (+1);

	if (DVA_GET_ASIZE(&l->svb_dva) < DVA_GET_ASIZE(&r->svb_dva))
		return (-1);
	else if (DVA_GET_ASIZE(&l->svb_dva) > DVA_GET_ASIZE(&r->svb_dva))
		return (+1);

	return (0);
}

/*
 * Check for errors in a livelist while tracking all unfreed ALLOCs in the
 * sublivelist_verify_t: sv->sv_leftover
 */
static void
livelist_verify(dsl_deadlist_t *dl, void *arg)
{
	sublivelist_verify_t *sv = arg;
	dsl_deadlist_iterate(dl, sublivelist_verify_func, sv);
}

/*
 * Check for errors in the livelist entry and discard the intermediary
 * data structures
 */
static int
sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle)
{
	(void) args;
	sublivelist_verify_t sv;
	zfs_btree_create(&sv.sv_leftover, livelist_block_compare,
	    sizeof (sublivelist_verify_block_t));
	int err = sublivelist_verify_func(&sv, dle);
	zfs_btree_clear(&sv.sv_leftover);
	zfs_btree_destroy(&sv.sv_leftover);
	return (err);
}

typedef struct metaslab_verify {
	/*
	 * Tree containing all the leftover ALLOCs from the livelists
	 * that are part of this metaslab.
	 */
	zfs_btree_t mv_livelist_allocs;

	/*
	 * Metaslab information.
	 */
	uint64_t mv_vdid;
	uint64_t mv_msid;
	uint64_t mv_start;
	uint64_t mv_end;

	/*
	 * What's currently allocated for this metaslab.
	 */
	range_tree_t *mv_allocated;
} metaslab_verify_t;

typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg);

typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme,
    uint64_t txg, void *arg);

typedef struct unflushed_iter_cb_arg {
	spa_t *uic_spa;
	uint64_t uic_txg;
	void *uic_arg;
	zdb_log_sm_cb_t uic_cb;
} unflushed_iter_cb_arg_t;

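/*
 * Helpers to walk every log space map in the pool in txg order,
 * invoking a caller-supplied callback on each entry.
 */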
static int
iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg)
{
	unflushed_iter_cb_arg_t *uic = arg;
	return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg));
}

static void
iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg)
{
	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
		return;

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
	    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
		space_map_t *sm = NULL;
		VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
		    sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));

		unflushed_iter_cb_arg_t uic = {
			.uic_spa = spa,
			.uic_txg = sls->sls_txg,
			.uic_arg = arg,
			.uic_cb = cb
		};
		VERIFY0(space_map_iterate(sm, space_map_length(sm),
		    iterate_through_spacemap_logs_cb, &uic));
		space_map_close(sm);
	}
	spa_config_exit(spa, SCL_CONFIG, FTAG);
}

static void
verify_livelist_allocs(metaslab_verify_t *mv, uint64_t txg,
    uint64_t offset, uint64_t size)
{
	sublivelist_verify_block_t svb = {{{0}}};
	DVA_SET_VDEV(&svb.svb_dva, mv->mv_vdid);
	DVA_SET_OFFSET(&svb.svb_dva, offset);
	DVA_SET_ASIZE(&svb.svb_dva, size);
	zfs_btree_index_t where;
	uint64_t end_offset = offset + size;

	/*
	 * Look for an exact match for spacemap entry in the livelist entries.
	 * Then, look for other livelist entries that fall within the range
	 * of the spacemap entry as it may have been condensed
	 */
	sublivelist_verify_block_t *found =
	    zfs_btree_find(&mv->mv_livelist_allocs, &svb, &where);
	if (found == NULL) {
		found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where);
	}
	for (; found != NULL && DVA_GET_VDEV(&found->svb_dva) == mv->mv_vdid &&
	    DVA_GET_OFFSET(&found->svb_dva) < end_offset;
	    found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) {
		if (found->svb_allocated_txg <= txg) {
			(void) printf("ERROR: Livelist ALLOC [%llx:%llx] "
			    "from TXG %llx FREED at TXG %llx\n",
			    (u_longlong_t)DVA_GET_OFFSET(&found->svb_dva),
			    (u_longlong_t)DVA_GET_ASIZE(&found->svb_dva),
			    (u_longlong_t)found->svb_allocated_txg,
			    (u_longlong_t)txg);
		}
	}
}

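/*
 * Space map callback: track what is currently allocated in this
 * metaslab via mv->mv_allocated, report double ALLOCs and double
 * FREEs, and cross-check every FREE against the leftover livelist
 * ALLOCs.
 */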
static int
metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg)
{
	metaslab_verify_t *mv = arg;
	uint64_t offset = sme->sme_offset;
	uint64_t size = sme->sme_run;
	uint64_t txg = sme->sme_txg;

	if (sme->sme_type == SM_ALLOC) {
		if (range_tree_contains(mv->mv_allocated,
		    offset, size)) {
			(void) printf("ERROR: DOUBLE ALLOC: "
			    "%llu [%llx:%llx] "
			    "%llu:%llu LOG_SM\n",
			    (u_longlong_t)txg, (u_longlong_t)offset,
			    (u_longlong_t)size, (u_longlong_t)mv->mv_vdid,
			    (u_longlong_t)mv->mv_msid);
		} else {
			range_tree_add(mv->mv_allocated,
			    offset, size);
		}
	} else {
		if (!range_tree_contains(mv->mv_allocated,
		    offset, size)) {
			(void) printf("ERROR: DOUBLE FREE: "
			    "%llu [%llx:%llx] "
			    "%llu:%llu LOG_SM\n",
			    (u_longlong_t)txg, (u_longlong_t)offset,
			    (u_longlong_t)size, (u_longlong_t)mv->mv_vdid,
			    (u_longlong_t)mv->mv_msid);
		} else {
			range_tree_remove(mv->mv_allocated,
			    offset, size);
		}
	}

	if (sme->sme_type != SM_ALLOC) {
		/*
		 * If something is freed in the spacemap, verify that
		 * it is not listed as allocated in the livelist.
		 */
		verify_livelist_allocs(mv, txg, offset, size);
	}
	return (0);
}

static int
spacemap_check_sm_log_cb(spa_t *spa, space_map_entry_t *sme,
    uint64_t txg, void *arg)
{
	metaslab_verify_t *mv = arg;
	uint64_t offset = sme->sme_offset;
	uint64_t vdev_id = sme->sme_vdev;

	vdev_t *vd = vdev_lookup_top(spa, vdev_id);

	/* skip indirect vdevs */
	if (!vdev_is_concrete(vd))
		return (0);

	if (vdev_id != mv->mv_vdid)
		return (0);

	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
	if (ms->ms_id != mv->mv_msid)
		return (0);

	if (txg < metaslab_unflushed_txg(ms))
		return (0);

	ASSERT3U(txg, ==, sme->sme_txg);
	return (metaslab_spacemap_validation_cb(sme, mv));
}

static void
spacemap_check_sm_log(spa_t *spa, metaslab_verify_t *mv)
{
	iterate_through_spacemap_logs(spa, spacemap_check_sm_log_cb, mv);
}

static void
spacemap_check_ms_sm(space_map_t *sm, metaslab_verify_t *mv)
{
	if (sm == NULL)
		return;

	VERIFY0(space_map_iterate(sm, space_map_length(sm),
	    metaslab_spacemap_validation_cb, mv));
}

static void iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg);

/*
 * Transfer blocks from sv_leftover tree to the mv_livelist_allocs if
 * they are part of that metaslab (mv_msid).
 */
static void
mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv)
{
	zfs_btree_index_t where;
	sublivelist_verify_block_t *svb;
	ASSERT3U(zfs_btree_numnodes(&mv->mv_livelist_allocs), ==, 0);
	for (svb = zfs_btree_first(&sv->sv_leftover, &where);
	    svb != NULL;
	    svb = zfs_btree_next(&sv->sv_leftover, &where, &where)) {
		if (DVA_GET_VDEV(&svb->svb_dva) != mv->mv_vdid)
			continue;

		if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start &&
		    (DVA_GET_OFFSET(&svb->svb_dva) +
		    DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_start) {
			(void) printf("ERROR: Found block that crosses "
			    "metaslab boundary: <%llu:%llx:%llx>\n",
			    (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva),
			    (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
			    (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva));
			continue;
		}

		if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start)
			continue;

		if (DVA_GET_OFFSET(&svb->svb_dva) >= mv->mv_end)
			continue;

		if ((DVA_GET_OFFSET(&svb->svb_dva) +
		    DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_end) {
			(void) printf("ERROR: Found block that crosses "
			    "metaslab boundary: <%llu:%llx:%llx>\n",
			    (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva),
			    (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
			    (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva));
			continue;
		}

		zfs_btree_add(&mv->mv_livelist_allocs, svb);
	}

	for (svb = zfs_btree_first(&mv->mv_livelist_allocs, &where);
	    svb != NULL;
	    svb = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) {
		zfs_btree_remove(&sv->sv_leftover, svb);
	}
}

/*
 * [Livelist Check]
 * Iterate through all the sublivelists and:
 * - report leftover frees (**)
 * - record leftover ALLOCs together with their TXG [see Cross Check]
 *
 * (**) Note: Double ALLOCs are valid in datasets that have dedup
 *      enabled. Similarly double FREEs are allowed as well but
 *      only if they pair up with a corresponding ALLOC entry once
 *      we are done with our sublivelist iteration.
 *
 * [Spacemap Check]
 * for each metaslab:
 * - iterate over spacemap and then the metaslab's entries in the
 *   spacemap log, then report any double FREEs and ALLOCs (do not
 *   blow up).
 *
 * [Cross Check]
 * After finishing the Livelist Check phase and while being in the
 * Spacemap Check phase, we find all the recorded leftover ALLOCs
 * of the livelist check that are part of the metaslab that we are
 * currently looking at in the Spacemap Check. We report any entries
 * that are marked as ALLOCs in the livelists but have been actually
 * freed (and potentially allocated again) after their TXG stamp in
 * the spacemaps. Also report any ALLOCs from the livelists that
 * belong to indirect vdevs (e.g. their vdev completed removal).
 *
 * Note that this will miss Log Spacemap entries that cancelled each other
 * out before being flushed to the metaslab, so we are not guaranteed
 * to match all erroneous ALLOCs.
 */
static void
livelist_metaslab_validate(spa_t *spa)
{
	(void) printf("Verifying deleted livelist entries\n");

	sublivelist_verify_t sv;
	zfs_btree_create(&sv.sv_leftover, livelist_block_compare,
	    sizeof (sublivelist_verify_block_t));
	iterate_deleted_livelists(spa, livelist_verify, &sv);

	(void) printf("Verifying metaslab entries\n");
	vdev_t *rvd = spa->spa_root_vdev;
	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
		vdev_t *vd = rvd->vdev_child[c];

		if (!vdev_is_concrete(vd))
			continue;

		for (uint64_t mid = 0; mid < vd->vdev_ms_count; mid++) {
			metaslab_t *m = vd->vdev_ms[mid];

			(void) fprintf(stderr,
			    "\rverifying concrete vdev %llu, "
			    "metaslab %llu of %llu ...",
			    (longlong_t)vd->vdev_id,
			    (longlong_t)mid,
			    (longlong_t)vd->vdev_ms_count);

			uint64_t shift, start;
			range_seg_type_t type =
			    metaslab_calculate_range_tree_type(vd, m,
			    &start, &shift);
			metaslab_verify_t mv;
			mv.mv_allocated = range_tree_create(NULL,
			    type, NULL, start, shift);
			mv.mv_vdid = vd->vdev_id;
			mv.mv_msid = m->ms_id;
			mv.mv_start = m->ms_start;
			mv.mv_end = m->ms_start + m->ms_size;
			zfs_btree_create(&mv.mv_livelist_allocs,
			    livelist_block_compare,
			    sizeof (sublivelist_verify_block_t));

			mv_populate_livelist_allocs(&mv, &sv);

			spacemap_check_ms_sm(m->ms_sm, &mv);
			spacemap_check_sm_log(spa, &mv);

			range_tree_vacate(mv.mv_allocated, NULL, NULL);
			range_tree_destroy(mv.mv_allocated);
			zfs_btree_clear(&mv.mv_livelist_allocs);
			zfs_btree_destroy(&mv.mv_livelist_allocs);
		}
	}
	(void) fprintf(stderr, "\n");

	/*
	 * If there are any segments in the leftover tree after we walked
	 * through all the metaslabs in the concrete vdevs then this means
	 * that we have segments in the livelists that belong to indirect
	 * vdevs and are marked as allocated.
	 */
	if (zfs_btree_numnodes(&sv.sv_leftover) == 0) {
		zfs_btree_destroy(&sv.sv_leftover);
		return;
	}
	(void) printf("ERROR: Found livelist blocks marked as allocated "
	    "for indirect vdevs:\n");

	zfs_btree_index_t *where = NULL;
	sublivelist_verify_block_t *svb;
	while ((svb = zfs_btree_destroy_nodes(&sv.sv_leftover, &where)) !=
	    NULL) {
		int vdev_id = DVA_GET_VDEV(&svb->svb_dva);
		ASSERT3U(vdev_id, <, rvd->vdev_children);
		vdev_t *vd = rvd->vdev_child[vdev_id];
		ASSERT(!vdev_is_concrete(vd));
		(void) printf("<%d:%llx:%llx> TXG %llx\n",
		    vdev_id, (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
		    (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva),
		    (u_longlong_t)svb->svb_allocated_txg);
	}
	(void) printf("\n");
	zfs_btree_destroy(&sv.sv_leftover);
}

/*
 * These libumem hooks provide a reasonable set of defaults for the allocator's
 * debugging facilities.
 */
const char *
_umem_debug_init(void)
{
	return ("default,verbose"); /* $UMEM_DEBUG setting */
}

const char *
_umem_logging_init(void)
{
	return ("fail,contents"); /* $UMEM_LOGGING setting */
}

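/*
 * Print the command-line synopsis and option summary, then exit.
 */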
static void
usage(void)
{
	(void) fprintf(stderr,
	    "Usage:\t%s [-AbcdDFGhikLMPsvXy] [-e [-V] [-p <path> ...]] "
	    "[-I <inflight I/Os>]\n"
	    "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
	    "\t\t[-K <key>]\n"
	    "\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]]\n"
	    "\t%s [-AdiPv] [-e [-V] [-p <path> ...]] [-U <cache>] [-K <key>]\n"
	    "\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]]\n"
	    "\t%s [-v] <bookmark>\n"
	    "\t%s -C [-A] [-U <cache>]\n"
	    "\t%s -l [-Aqu] <device>\n"
	    "\t%s -m [-AFLPX] [-e [-V] [-p <path> ...]] [-t <txg>] "
	    "[-U <cache>]\n\t\t<poolname> [<vdev> [<metaslab> ...]]\n"
	    "\t%s -O [-K <key>] <dataset> <path>\n"
	    "\t%s -r [-K <key>] <dataset> <path> <destination>\n"
	    "\t%s -R [-A] [-e [-V] [-p <path> ...]] [-U <cache>]\n"
	    "\t\t<poolname> <vdev>:<offset>:<size>[:<flags>]\n"
	    "\t%s -E [-A] word0:word1:...:word15\n"
	    "\t%s -S [-AP] [-e [-V] [-p <path> ...]] [-U <cache>] "
	    "<poolname>\n\n",
	    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname,
	    cmdname, cmdname, cmdname, cmdname);

	(void) fprintf(stderr, "    Dataset name must include at least one "
	    "separator character '/' or '@'\n");
	(void) fprintf(stderr, "    If dataset name is specified, only that "
	    "dataset is dumped\n");
	(void) fprintf(stderr, "    If object numbers or object number "
	    "ranges are specified, only those\n"
	    "    objects or ranges are dumped.\n\n");
	(void) fprintf(stderr,
	    "    Object ranges take the form <start>:<end>[:<flags>]\n"
	    "        start    Starting object number\n"
	    "        end      Ending object number, or -1 for no upper bound\n"
	    "        flags    Optional flags to select object types:\n"
	    "            A     All objects (this is the default)\n"
	    "            d     ZFS directories\n"
	    "            f     ZFS files\n"
	    "            m     SPA space maps\n"
	    "            z     ZAPs\n"
	    "            -     Negate effect of next flag\n\n");
	(void) fprintf(stderr, "    Options to control amount of output:\n");
	(void) fprintf(stderr, "        -b --block-stats             "
	    "block statistics\n");
	(void) fprintf(stderr, "        -c --checksum                "
	    "checksum all metadata (twice for all data) blocks\n");
	(void) fprintf(stderr, "        -C --config                  "
	    "config (or cachefile if alone)\n");
	(void) fprintf(stderr, "        -d --datasets                "
	    "dataset(s)\n");
	(void) fprintf(stderr, "        -D --dedup-stats             "
	    "dedup statistics\n");
	(void) fprintf(stderr, "        -E --embedded-block-pointer=INTEGER\n"
	    "                                     decode and display block "
	    "from an embedded block pointer\n");
	(void) fprintf(stderr, "        -h --history                 "
	    "pool history\n");
	(void) fprintf(stderr, "        -i --intent-logs             "
	    "intent logs\n");
	(void) fprintf(stderr, "        -l --label                   "
	    "read label contents\n");
	(void) fprintf(stderr, "        -k --checkpointed-state      "
	    "examine the checkpointed state of the pool\n");
	(void) fprintf(stderr, "        -L --disable-leak-tracking   "
	    "disable leak tracking (do not load spacemaps)\n");
	(void) fprintf(stderr, "        -m --metaslabs               "
	    "metaslabs\n");
	(void) fprintf(stderr, "        -M --metaslab-groups         "
	    "metaslab groups\n");
	(void) fprintf(stderr, "        -O --object-lookups          "
	    "perform object lookups by path\n");
	(void) fprintf(stderr, "        -r --copy-object             "
	    "copy an object by path to file\n");
	(void) fprintf(stderr, "        -R --read-block              "
	    "read and display block from a device\n");
	(void) fprintf(stderr, "        -s --io-stats                "
	    "report stats on zdb's I/O\n");
	(void) fprintf(stderr, "        -S --simulate-dedup          "
	    "simulate dedup to measure effect\n");
	(void) fprintf(stderr, "        -v --verbose                 "
	    "verbose (applies to all others)\n");
	(void) fprintf(stderr, "        -y --livelist                "
	    "perform livelist and metaslab validation on any livelists being "
	    "deleted\n\n");
	(void) fprintf(stderr, "    Below options are intended for use "
	    "with other options:\n");
	(void) fprintf(stderr, "        -A --ignore-assertions       "
	    "ignore assertions (-A), enable panic recovery (-AA) or both "
	    "(-AAA)\n");
	(void) fprintf(stderr, "        -e --exported                "
	    "pool is exported/destroyed/has altroot/not in a cachefile\n");
	(void) fprintf(stderr, "        -F --automatic-rewind        "
	    "attempt automatic rewind within safe range of transaction "
	    "groups\n");
	(void) fprintf(stderr, "        -G --dump-debug-msg          "
	    "dump zfs_dbgmsg buffer before exiting\n");
	(void) fprintf(stderr, "        -I --inflight=INTEGER        "
	    "specify the maximum number of checksumming I/Os "
	    "[default is 200]\n");
	(void) fprintf(stderr, "        -K --key=KEY                 "
	    "decryption key for encrypted dataset\n");
	(void) fprintf(stderr, "        -o --option=\"OPTION=INTEGER\" "
	    "set global variable to an unsigned 32-bit integer\n");
	(void) fprintf(stderr, "        -p --path=PATH               "
	    "use one or more with -e to specify path to vdev dir\n");
	(void) fprintf(stderr, "        -P --parseable               "
	    "print numbers in parseable form\n");
	(void) fprintf(stderr, "        -q --skip-label              "
	    "don't print label contents\n");
	(void) fprintf(stderr, "        -t --txg=INTEGER             "
	    "highest txg to use when searching for uberblocks\n");
	(void) fprintf(stderr, "        -u --uberblock               "
	    "uberblock\n");
	(void) fprintf(stderr, "        -U --cachefile=PATH          "
	    "use alternate cachefile\n");
	(void) fprintf(stderr, "        -V --verbatim                "
	    "do verbatim import\n");
	(void) fprintf(stderr, "        -x --dump-blocks=PATH        "
	    "dump all read blocks into specified directory\n");
	(void) fprintf(stderr, "        -X --extreme-rewind          "
	    "attempt extreme rewind (does not work with dataset)\n");
	(void) fprintf(stderr, "        -Y --all-reconstruction      "
	    "attempt all reconstruction combinations for split blocks\n");
	(void) fprintf(stderr, "        -Z --zstd-headers            "
	    "show ZSTD headers\n");
	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
	    "to make only that option verbose\n");
	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
	exit(1);
}

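/*
 * Print the contents of the zfs_dbgmsg buffer, if -G was specified.
 */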
static void
dump_debug_buffer(void)
{
	if (dump_opt['G']) {
		(void) printf("\n");
		(void) fflush(stdout);
		zfs_dbgmsg_print("zdb");
	}
}

/*
 * Called for usage errors that are discovered after a call to spa_open(),
 * dmu_bonus_hold(), or pool_match().  abort() is called for other errors.
 */

static void
fatal(const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	(void) fprintf(stderr, "%s: ", cmdname);
	(void) vfprintf(stderr, fmt, ap);
	va_end(ap);
	(void) fprintf(stderr, "\n");

	dump_debug_buffer();

	exit(1);
}

static void
dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) size;
	nvlist_t *nv;
	size_t nvsize = *(uint64_t *)data;
	char *packed = umem_alloc(nvsize, UMEM_NOFAIL);

	VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH));

	VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);

	umem_free(packed, nvsize);

	dump_nvlist(nv, 8);

	nvlist_free(nv);
}

static void
dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) os, (void) object, (void) size;
	spa_history_phys_t *shp = data;

	if (shp == NULL)
		return;

	(void) printf("\t\tpool_create_len = %llu\n",
	    (u_longlong_t)shp->sh_pool_create_len);
	(void) printf("\t\tphys_max_off = %llu\n",
	    (u_longlong_t)shp->sh_phys_max_off);
	(void) printf("\t\tbof = %llu\n",
	    (u_longlong_t)shp->sh_bof);
	(void) printf("\t\teof = %llu\n",
	    (u_longlong_t)shp->sh_eof);
	(void) printf("\t\trecords_lost = %llu\n",
	    (u_longlong_t)shp->sh_records_lost);
}

static void
zdb_nicenum(uint64_t num, char *buf, size_t buflen)
{
	if (dump_opt['P'])
		(void) snprintf(buf, buflen, "%llu", (longlong_t)num);
	else
		nicenum(num, buf, buflen);
}

static const char histo_stars[] = "****************************************";
static const uint64_t histo_width = sizeof (histo_stars) - 1;

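/*
 * Print a histogram of star rows, one per non-empty bucket, scaled so
 * that the largest bucket spans the full histo_width.
 */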
static void
dump_histogram(const uint64_t *histo, int size, int offset)
{
	int i;
	int minidx = size - 1;
	int maxidx = 0;
	uint64_t max = 0;

	for (i = 0; i < size; i++) {
		if (histo[i] == 0)
			continue;
		if (histo[i] > max)
			max = histo[i];
		if (i > maxidx)
			maxidx = i;
		if (i < minidx)
			minidx = i;
	}

	if (max < histo_width)
		max = histo_width;

	for (i = minidx; i <= maxidx; i++) {
		(void) printf("\t\t\t%3u: %6llu %s\n",
		    i + offset, (u_longlong_t)histo[i],
		    &histo_stars[(max - histo[i]) * histo_width / max]);
	}
}

static void
dump_zap_stats(objset_t *os, uint64_t object)
{
	int error;
	zap_stats_t zs;

	error = zap_get_stats(os, object, &zs);
	if (error)
		return;

	if (zs.zs_ptrtbl_len == 0) {
		ASSERT(zs.zs_num_blocks == 1);
		(void) printf("\tmicrozap: %llu bytes, %llu entries\n",
		    (u_longlong_t)zs.zs_blocksize,
		    (u_longlong_t)zs.zs_num_entries);
		return;
	}

	(void) printf("\tFat ZAP stats:\n");

	(void) printf("\t\tPointer table:\n");
	(void) printf("\t\t\t%llu elements\n",
	    (u_longlong_t)zs.zs_ptrtbl_len);
	(void) printf("\t\t\tzt_blk: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_zt_blk);
	(void) printf("\t\t\tzt_numblks: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_zt_numblks);
	(void) printf("\t\t\tzt_shift: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_zt_shift);
	(void) printf("\t\t\tzt_blks_copied: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_blks_copied);
	(void) printf("\t\t\tzt_nextblk: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_nextblk);

	(void) printf("\t\tZAP entries: %llu\n",
	    (u_longlong_t)zs.zs_num_entries);
	(void) printf("\t\tLeaf blocks: %llu\n",
	    (u_longlong_t)zs.zs_num_leafs);
	(void) printf("\t\tTotal blocks: %llu\n",
	    (u_longlong_t)zs.zs_num_blocks);
	(void) printf("\t\tzap_block_type: 0x%llx\n",
	    (u_longlong_t)zs.zs_block_type);
	(void) printf("\t\tzap_magic: 0x%llx\n",
	    (u_longlong_t)zs.zs_magic);
	(void) printf("\t\tzap_salt: 0x%llx\n",
	    (u_longlong_t)zs.zs_salt);

	(void) printf("\t\tLeafs with 2^n pointers:\n");
	dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0);

	(void) printf("\t\tBlocks with n*5 entries:\n");
	dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0);

	(void) printf("\t\tBlocks n/10 full:\n");
	dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0);

	(void) printf("\t\tEntries with n chunks:\n");
	dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0);

	(void) printf("\t\tBuckets with n entries:\n");
	dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0);
}

static void
dump_none(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) os, (void) object, (void) data, (void) size;
}

static void
dump_unknown(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) os, (void) object, (void) data, (void) size;
	(void) printf("\tUNKNOWN OBJECT TYPE\n");
}

static void
dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) os, (void) object, (void) data, (void) size;
}

static void
dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)
{
	uint64_t *arr;
	uint64_t oursize;
	if (dump_opt['d'] < 6)
		return;

	if (data == NULL) {
		dmu_object_info_t doi;

		VERIFY0(dmu_object_info(os, object, &doi));
		size = doi.doi_max_offset;
		/*
		 * We cap the size at 1 mebibyte here to prevent
		 * allocation failures and nigh-infinite printing if the
		 * object is extremely large.
		 */
		oursize = MIN(size, 1 << 20);
		arr = kmem_alloc(oursize, KM_SLEEP);

		int err = dmu_read(os, object, 0, oursize, arr, 0);
		if (err != 0) {
			(void) printf("got error %u from dmu_read\n", err);
			kmem_free(arr, oursize);
			return;
		}
	} else {
		/*
		 * Even though the allocation is already done in this code path,
		 * we still cap the size to prevent excessive printing.
		 */
		oursize = MIN(size, 1 << 20);
		arr = data;
	}

	if (size == 0) {
		if (data == NULL)
			kmem_free(arr, oursize);
		(void) printf("\t\t[]\n");
		return;
	}

	(void) printf("\t\t[%0llx", (u_longlong_t)arr[0]);
	for (size_t i = 1; i * sizeof (uint64_t) < oursize; i++) {
		if (i % 4 != 0)
			(void) printf(", %0llx", (u_longlong_t)arr[i]);
		else
			(void) printf(",\n\t\t%0llx", (u_longlong_t)arr[i]);
	}
	if (oursize != size)
		(void) printf(", ... ");
	(void) printf("]\n");

	if (data == NULL)
		kmem_free(arr, oursize);
}

static void
dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	zap_cursor_t zc;
	zap_attribute_t attr;
	void *prop;
	unsigned i;

	dump_zap_stats(os, object);
	(void) printf("\n");

	for (zap_cursor_init(&zc, os, object);
	    zap_cursor_retrieve(&zc, &attr) == 0;
	    zap_cursor_advance(&zc)) {
		(void) printf("\t\t%s = ", attr.za_name);
		if (attr.za_num_integers == 0) {
			(void) printf("\n");
			continue;
		}
		prop = umem_zalloc(attr.za_num_integers *
		    attr.za_integer_length, UMEM_NOFAIL);
		(void) zap_lookup(os, object, attr.za_name,
		    attr.za_integer_length, attr.za_num_integers, prop);
		if (attr.za_integer_length == 1) {
			if (strcmp(attr.za_name,
			    DSL_CRYPTO_KEY_MASTER_KEY) == 0 ||
			    strcmp(attr.za_name,
			    DSL_CRYPTO_KEY_HMAC_KEY) == 0 ||
			    strcmp(attr.za_name, DSL_CRYPTO_KEY_IV) == 0 ||
			    strcmp(attr.za_name, DSL_CRYPTO_KEY_MAC) == 0 ||
			    strcmp(attr.za_name, DMU_POOL_CHECKSUM_SALT) == 0) {
				uint8_t *u8 = prop;

				for (i = 0; i < attr.za_num_integers; i++) {
					(void) printf("%02x", u8[i]);
				}
			} else {
				(void) printf("%s", (char *)prop);
			}
		} else {
			for (i = 0; i < attr.za_num_integers; i++) {
				switch (attr.za_integer_length) {
				case 2:
					(void) printf("%u ",
					    ((uint16_t *)prop)[i]);
					break;
				case 4:
					(void) printf("%u ",
					    ((uint32_t *)prop)[i]);
					break;
				case 8:
					(void) printf("%lld ",
					    (u_longlong_t)((int64_t *)prop)[i]);
					break;
				}
			}
		}
		(void) printf("\n");
		umem_free(prop, attr.za_num_integers * attr.za_integer_length);
	}
	zap_cursor_fini(&zc);
}

static void
dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
{
	bpobj_phys_t *bpop = data;
	uint64_t i;
	char bytes[32], comp[32], uncomp[32];

	/* make sure the output won't get truncated */
	_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");
	_Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated");
	_Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated");

	if (bpop == NULL)
		return;

	zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes));
	zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp));
	zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp));

	(void) printf("\t\tnum_blkptrs = %llu\n",
	    (u_longlong_t)bpop->bpo_num_blkptrs);
	(void) printf("\t\tbytes = %s\n", bytes);
	if (size >= BPOBJ_SIZE_V1) {
		(void) printf("\t\tcomp = %s\n", comp);
		(void) printf("\t\tuncomp = %s\n", uncomp);
	}
	if (size >= BPOBJ_SIZE_V2) {
		(void) printf("\t\tsubobjs = %llu\n",
		    (u_longlong_t)bpop->bpo_subobjs);
		(void) printf("\t\tnum_subobjs = %llu\n",
		    (u_longlong_t)bpop->bpo_num_subobjs);
	}
	if (size >= sizeof (*bpop)) {
		(void) printf("\t\tnum_freed = %llu\n",
		    (u_longlong_t)bpop->bpo_num_freed);
	}

	if (dump_opt['d'] < 5)
		return;

	for (i = 0; i < bpop->bpo_num_blkptrs; i++) {
		char blkbuf[BP_SPRINTF_LEN];
		blkptr_t bp;

		int err = dmu_read(os, object,
		    i * sizeof (bp), sizeof (bp), &bp, 0);
		if (err != 0) {
			(void) printf("got error %u from dmu_read\n", err);
			break;
		}
		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp,
		    BP_GET_FREE(&bp));
		(void) printf("\t%s\n", blkbuf);
	}
}

static void
dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	dmu_object_info_t doi;
	int64_t i;

	VERIFY0(dmu_object_info(os, object, &doi));
	uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP);

	int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0);
	if (err != 0) {
		(void) printf("got error %u from dmu_read\n", err);
		kmem_free(subobjs, doi.doi_max_offset);
		return;
	}

	int64_t last_nonzero = -1;
	for (i = 0; i < doi.doi_max_offset / 8; i++) {
		if (subobjs[i] != 0)
			last_nonzero = i;
	}

	for (i = 0; i <= last_nonzero; i++) {
		(void) printf("\t%llu\n", (u_longlong_t)subobjs[i]);
	}
	kmem_free(subobjs, doi.doi_max_offset);
}

static void
dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	dump_zap_stats(os, object);
	/* contents are printed elsewhere, properly decoded */
}

static void
dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	zap_cursor_t zc;
	zap_attribute_t attr;

	dump_zap_stats(os, object);
	(void) printf("\n");

	for (zap_cursor_init(&zc, os, object);
	    zap_cursor_retrieve(&zc, &attr) == 0;
	    zap_cursor_advance(&zc)) {
		(void) printf("\t\t%s = ", attr.za_name);
		if (attr.za_num_integers == 0) {
			(void) printf("\n");
			continue;
		}
		(void) printf(" %llx : [%d:%d:%d]\n",
		    (u_longlong_t)attr.za_first_integer,
		    (int)ATTR_LENGTH(attr.za_first_integer),
		    (int)ATTR_BSWAP(attr.za_first_integer),
		    (int)ATTR_NUM(attr.za_first_integer));
	}
	zap_cursor_fini(&zc);
}

static void
dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	zap_cursor_t zc;
	zap_attribute_t attr;
	uint16_t *layout_attrs;
	unsigned i;

	dump_zap_stats(os, object);
	(void) printf("\n");

	for (zap_cursor_init(&zc, os, object);
	    zap_cursor_retrieve(&zc, &attr) == 0;
	    zap_cursor_advance(&zc)) {
		(void) printf("\t\t%s = [", attr.za_name);
		if (attr.za_num_integers == 0) {
			(void) printf("\n");
			continue;
		}

		VERIFY(attr.za_integer_length == 2);
		layout_attrs = umem_zalloc(attr.za_num_integers *
		    attr.za_integer_length, UMEM_NOFAIL);

		VERIFY(zap_lookup(os, object, attr.za_name,
		    attr.za_integer_length,
		    attr.za_num_integers, layout_attrs) == 0);

		for (i = 0; i != attr.za_num_integers; i++)
			(void) printf(" %d ", (int)layout_attrs[i]);
		(void) printf("]\n");
		umem_free(layout_attrs,
		    attr.za_num_integers * attr.za_integer_length);
	}
	zap_cursor_fini(&zc);
}

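/*
 * Object viewer for a ZPL directory ZAP: print each entry's name,
 * target object number, and dirent type.
 */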
static void
dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	zap_cursor_t zc;
	zap_attribute_t attr;
	const char *typenames[] = {
		/* 0 */ "not specified",
		/* 1 */ "FIFO",
		/* 2 */ "Character Device",
		/* 3 */ "3 (invalid)",
		/* 4 */ "Directory",
		/* 5 */ "5 (invalid)",
		/* 6 */ "Block Device",
		/* 7 */ "7 (invalid)",
		/* 8 */ "Regular File",
		/* 9 */ "9 (invalid)",
		/* 10 */ "Symbolic Link",
		/* 11 */ "11 (invalid)",
		/* 12 */ "Socket",
		/* 13 */ "Door",
		/* 14 */ "Event Port",
		/* 15 */ "15 (invalid)",
	};

	dump_zap_stats(os, object);
	(void) printf("\n");

	for (zap_cursor_init(&zc, os, object);
	    zap_cursor_retrieve(&zc, &attr) == 0;
	    zap_cursor_advance(&zc)) {
		(void) printf("\t\t%s = %lld (type: %s)\n",
		    attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer),
		    typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]);
	}
	zap_cursor_fini(&zc);
}

static int
get_dtl_refcount(vdev_t *vd)
{
	int refcount = 0;

	if (vd->vdev_ops->vdev_op_leaf) {
		space_map_t *sm = vd->vdev_dtl_sm;

		if (sm != NULL &&
		    sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
			return (1);
		return (0);
	}

	for (unsigned c = 0; c < vd->vdev_children; c++)
		refcount += get_dtl_refcount(vd->vdev_child[c]);
	return (refcount);
}

static int
get_metaslab_refcount(vdev_t *vd)
{
	int refcount = 0;

	if (vd->vdev_top == vd) {
		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
			space_map_t *sm = vd->vdev_ms[m]->ms_sm;

			if (sm != NULL &&
			    sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
				refcount++;
		}
	}
	for (unsigned c = 0; c < vd->vdev_children; c++)
		refcount += get_metaslab_refcount(vd->vdev_child[c]);

	return (refcount);
}

static int
get_obsolete_refcount(vdev_t *vd)
{
	uint64_t obsolete_sm_object;
	int refcount = 0;

	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
	if (vd->vdev_top == vd && obsolete_sm_object != 0) {
		dmu_object_info_t doi;
		VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset,
		    obsolete_sm_object, &doi));
		if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
			refcount++;
		}
	} else {
		ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
		ASSERT3U(obsolete_sm_object, ==, 0);
	}
	for (unsigned c = 0; c < vd->vdev_children; c++) {
		refcount += get_obsolete_refcount(vd->vdev_child[c]);
	}

	return (refcount);
}

static int
get_prev_obsolete_spacemap_refcount(spa_t *spa)
{
	uint64_t prev_obj =
	    spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object;
	if (prev_obj != 0) {
		dmu_object_info_t doi;
		VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_obj, &doi));
		if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
			return (1);
		}
	}
	return (0);
}

static int
get_checkpoint_refcount(vdev_t *vd)
{
	int refcount = 0;

	if (vd->vdev_top == vd && vd->vdev_top_zap != 0 &&
	    zap_contains(spa_meta_objset(vd->vdev_spa),
	    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0)
		refcount++;

	for (uint64_t c = 0; c < vd->vdev_children; c++)
		refcount += get_checkpoint_refcount(vd->vdev_child[c]);

	return (refcount);
}

static int
get_log_spacemap_refcount(spa_t *spa)
{
	return (avl_numnodes(&spa->spa_sm_logs_by_txg));
}

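/*
 * Cross-check the SPACEMAP_HISTOGRAM feature refcount against the
 * number of space maps that actually carry a histogram (DTLs,
 * metaslabs, obsolete maps, checkpoint and log space maps); returns 2
 * on a mismatch, 0 otherwise.
 */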
static int
verify_spacemap_refcounts(spa_t *spa)
{
	uint64_t expected_refcount = 0;
	uint64_t actual_refcount;

	(void) feature_get_refcount(spa,
	    &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM],
	    &expected_refcount);
	actual_refcount = get_dtl_refcount(spa->spa_root_vdev);
	actual_refcount += get_metaslab_refcount(spa->spa_root_vdev);
	actual_refcount += get_obsolete_refcount(spa->spa_root_vdev);
	actual_refcount += get_prev_obsolete_spacemap_refcount(spa);
	actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev);
	actual_refcount += get_log_spacemap_refcount(spa);

	if (expected_refcount != actual_refcount) {
		(void) printf("space map refcount mismatch: expected %lld != "
		    "actual %lld\n",
		    (longlong_t)expected_refcount,
		    (longlong_t)actual_refcount);
		return (2);
	}
	return (0);
}

static void
dump_spacemap(objset_t *os, space_map_t *sm)
{
	const char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
	    "INVALID", "INVALID", "INVALID", "INVALID" };

	if (sm == NULL)
		return;

	(void) printf("space map object %llu:\n",
	    (longlong_t)sm->sm_object);
	(void) printf("  smp_length = 0x%llx\n",
	    (longlong_t)sm->sm_phys->smp_length);
	(void) printf("  smp_alloc = 0x%llx\n",
	    (longlong_t)sm->sm_phys->smp_alloc);

	if (dump_opt['d'] < 6 && dump_opt['m'] < 4)
		return;

	/*
	 * Print out the freelist entries in both encoded and decoded form.
	 */
	uint8_t mapshift = sm->sm_shift;
	int64_t alloc = 0;
	uint64_t word, entry_id = 0;
	for (uint64_t offset = 0; offset < space_map_length(sm);
	    offset += sizeof (word)) {

		VERIFY0(dmu_read(os, space_map_object(sm), offset,
		    sizeof (word), &word, DMU_READ_PREFETCH));

		if (sm_entry_is_debug(word)) {
			uint64_t de_txg = SM_DEBUG_TXG_DECODE(word);
			uint64_t de_sync_pass = SM_DEBUG_SYNCPASS_DECODE(word);
			if (de_txg == 0) {
				(void) printf(
				    "\t    [%6llu] PADDING\n",
				    (u_longlong_t)entry_id);
			} else {
				(void) printf(
				    "\t    [%6llu] %s: txg %llu pass %llu\n",
				    (u_longlong_t)entry_id,
				    ddata[SM_DEBUG_ACTION_DECODE(word)],
				    (u_longlong_t)de_txg,
				    (u_longlong_t)de_sync_pass);
			}
			entry_id++;
			continue;
		}

		uint8_t words;
		char entry_type;
		uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID;

		if (sm_entry_is_single_word(word)) {
			entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ?
			    'A' : 'F';
			entry_off = (SM_OFFSET_DECODE(word) << mapshift) +
			    sm->sm_start;
			entry_run = SM_RUN_DECODE(word) << mapshift;
			words = 1;
		} else {
			/* it is a two-word entry so we read another word */
			ASSERT(sm_entry_is_double_word(word));

			uint64_t extra_word;
			offset += sizeof (extra_word);
			VERIFY0(dmu_read(os, space_map_object(sm), offset,
			    sizeof (extra_word), &extra_word,
			    DMU_READ_PREFETCH));

			ASSERT3U(offset, <=, space_map_length(sm));

			entry_run = SM2_RUN_DECODE(word) << mapshift;
			entry_vdev = SM2_VDEV_DECODE(word);
			entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ?
			    'A' : 'F';
			entry_off = (SM2_OFFSET_DECODE(extra_word) <<
			    mapshift) + sm->sm_start;
			words = 2;
		}

		(void) printf("\t    [%6llu]    %c  range:"
		    " %010llx-%010llx  size: %06llx vdev: %06llu words: %u\n",
		    (u_longlong_t)entry_id,
		    entry_type, (u_longlong_t)entry_off,
		    (u_longlong_t)(entry_off + entry_run),
		    (u_longlong_t)entry_run,
		    (u_longlong_t)entry_vdev, words);

		if (entry_type == 'A')
			alloc += entry_run;
		else
			alloc -= entry_run;
		entry_id++;
	}
	if (alloc != space_map_allocated(sm)) {
		(void) printf("space_map_object alloc (%lld) INCONSISTENT "
		    "with space map summary (%lld)\n",
		    (longlong_t)space_map_allocated(sm), (longlong_t)alloc);
	}
}

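/*
 * Print the in-memory stats of a loaded metaslab: segment count,
 * largest allocatable segment, percent free, and the range tree
 * histogram.
 */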
static void
dump_metaslab_stats(metaslab_t *msp)
{
	char maxbuf[32];
	range_tree_t *rt = msp->ms_allocatable;
	zfs_btree_t *t = &msp->ms_allocatable_by_size;
	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (maxbuf) >= NN_NUMBUF_SZ, "maxbuf truncated");

	zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf));

	(void) printf("\t %25s %10lu   %7s  %6s   %4s %4d%%\n",
	    "segments", zfs_btree_numnodes(t), "maxsize", maxbuf,
	    "freepct", free_pct);
	(void) printf("\tIn-memory histogram:\n");
	dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
}

static void
dump_metaslab(metaslab_t *msp)
{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	space_map_t *sm = msp->ms_sm;
	char freebuf[32];

	zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf,
	    sizeof (freebuf));

	(void) printf(
	    "\tmetaslab %6llu   offset %12llx   spacemap %6llu   free %5s\n",
	    (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
	    (u_longlong_t)space_map_object(sm), freebuf);

	if (dump_opt['m'] > 2 && !dump_opt['L']) {
		mutex_enter(&msp->ms_lock);
		VERIFY0(metaslab_load(msp));
		range_tree_stat_verify(msp->ms_allocatable);
		dump_metaslab_stats(msp);
		metaslab_unload(msp);
		mutex_exit(&msp->ms_lock);
	}

	if (dump_opt['m'] > 1 && sm != NULL &&
	    spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
		/*
		 * The space map histogram represents free space in chunks
		 * of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
		 */
		(void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
		    (u_longlong_t)msp->ms_fragmentation);
		dump_histogram(sm->sm_phys->smp_histogram,
		    SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
	}

	if (vd->vdev_ops == &vdev_draid_ops)
		ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift);
	else
		ASSERT3U(msp->ms_size, ==, 1ULL << vd->vdev_ms_shift);

	dump_spacemap(spa->spa_meta_objset, msp->ms_sm);

	if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
		(void) printf("\tFlush data:\n\tunflushed txg=%llu\n\n",
		    (u_longlong_t)metaslab_unflushed_txg(msp));
	}
}

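/*
 * Print the per-vdev header of the metaslab listing: allocation bias,
 * the ms_unflushed_phys object (if present), and column headings.
 */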
1707 */ 1708 (void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n", 1709 (u_longlong_t)msp->ms_fragmentation); 1710 dump_histogram(sm->sm_phys->smp_histogram, 1711 SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift); 1712 } 1713 1714 if (vd->vdev_ops == &vdev_draid_ops) 1715 ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift); 1716 else 1717 ASSERT3U(msp->ms_size, ==, 1ULL << vd->vdev_ms_shift); 1718 1719 dump_spacemap(spa->spa_meta_objset, msp->ms_sm); 1720 1721 if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { 1722 (void) printf("\tFlush data:\n\tunflushed txg=%llu\n\n", 1723 (u_longlong_t)metaslab_unflushed_txg(msp)); 1724 } 1725 } 1726 1727 static void 1728 print_vdev_metaslab_header(vdev_t *vd) 1729 { 1730 vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; 1731 const char *bias_str = ""; 1732 if (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) { 1733 bias_str = VDEV_ALLOC_BIAS_LOG; 1734 } else if (alloc_bias == VDEV_BIAS_SPECIAL) { 1735 bias_str = VDEV_ALLOC_BIAS_SPECIAL; 1736 } else if (alloc_bias == VDEV_BIAS_DEDUP) { 1737 bias_str = VDEV_ALLOC_BIAS_DEDUP; 1738 } 1739 1740 uint64_t ms_flush_data_obj = 0; 1741 if (vd->vdev_top_zap != 0) { 1742 int error = zap_lookup(spa_meta_objset(vd->vdev_spa), 1743 vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, 1744 sizeof (uint64_t), 1, &ms_flush_data_obj); 1745 if (error != ENOENT) { 1746 ASSERT0(error); 1747 } 1748 } 1749 1750 (void) printf("\tvdev %10llu %s", 1751 (u_longlong_t)vd->vdev_id, bias_str); 1752 1753 if (ms_flush_data_obj != 0) { 1754 (void) printf(" ms_unflushed_phys object %llu", 1755 (u_longlong_t)ms_flush_data_obj); 1756 } 1757 1758 (void) printf("\n\t%-10s%5llu %-19s %-15s %-12s\n", 1759 "metaslabs", (u_longlong_t)vd->vdev_ms_count, 1760 "offset", "spacemap", "free"); 1761 (void) printf("\t%15s %19s %15s %12s\n", 1762 "---------------", "-------------------", 1763 "---------------", "------------"); 1764 } 1765 1766 static void 1767 dump_metaslab_groups(spa_t *spa, boolean_t show_special) 1768 { 1769 vdev_t *rvd = spa->spa_root_vdev; 1770 metaslab_class_t *mc = spa_normal_class(spa); 1771 metaslab_class_t *smc = spa_special_class(spa); 1772 uint64_t fragmentation; 1773 1774 metaslab_class_histogram_verify(mc); 1775 1776 for (unsigned c = 0; c < rvd->vdev_children; c++) { 1777 vdev_t *tvd = rvd->vdev_child[c]; 1778 metaslab_group_t *mg = tvd->vdev_mg; 1779 1780 if (mg == NULL || (mg->mg_class != mc && 1781 (!show_special || mg->mg_class != smc))) 1782 continue; 1783 1784 metaslab_group_histogram_verify(mg); 1785 mg->mg_fragmentation = metaslab_group_fragmentation(mg); 1786 1787 (void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t" 1788 "fragmentation", 1789 (u_longlong_t)tvd->vdev_id, 1790 (u_longlong_t)tvd->vdev_ms_count); 1791 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { 1792 (void) printf("%3s\n", "-"); 1793 } else { 1794 (void) printf("%3llu%%\n", 1795 (u_longlong_t)mg->mg_fragmentation); 1796 } 1797 dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); 1798 } 1799 1800 (void) printf("\tpool %s\tfragmentation", spa_name(spa)); 1801 fragmentation = metaslab_class_fragmentation(mc); 1802 if (fragmentation == ZFS_FRAG_INVALID) 1803 (void) printf("\t%3s\n", "-"); 1804 else 1805 (void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation); 1806 dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); 1807 } 1808 1809 static void 1810 print_vdev_indirect(vdev_t *vd) 1811 { 1812 vdev_indirect_config_t *vic = &vd->vdev_indirect_config; 1813 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 1814 

static void
print_vdev_indirect(vdev_t *vd)
{
	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	vdev_indirect_births_t *vib = vd->vdev_indirect_births;

	if (vim == NULL) {
		ASSERT3P(vib, ==, NULL);
		return;
	}

	ASSERT3U(vdev_indirect_mapping_object(vim), ==,
	    vic->vic_mapping_object);
	ASSERT3U(vdev_indirect_births_object(vib), ==,
	    vic->vic_births_object);

	(void) printf("indirect births obj %llu:\n",
	    (longlong_t)vic->vic_births_object);
	(void) printf(" vib_count = %llu\n",
	    (longlong_t)vdev_indirect_births_count(vib));
	for (uint64_t i = 0; i < vdev_indirect_births_count(vib); i++) {
		vdev_indirect_birth_entry_phys_t *cur_vibe =
		    &vib->vib_entries[i];
		(void) printf("\toffset %llx -> txg %llu\n",
		    (longlong_t)cur_vibe->vibe_offset,
		    (longlong_t)cur_vibe->vibe_phys_birth_txg);
	}
	(void) printf("\n");

	(void) printf("indirect mapping obj %llu:\n",
	    (longlong_t)vic->vic_mapping_object);
	(void) printf(" vim_max_offset = 0x%llx\n",
	    (longlong_t)vdev_indirect_mapping_max_offset(vim));
	(void) printf(" vim_bytes_mapped = 0x%llx\n",
	    (longlong_t)vdev_indirect_mapping_bytes_mapped(vim));
	(void) printf(" vim_count = %llu\n",
	    (longlong_t)vdev_indirect_mapping_num_entries(vim));

	if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3)
		return;

	uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim);

	for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
		vdev_indirect_mapping_entry_phys_t *vimep =
		    &vim->vim_entries[i];
		(void) printf("\t<%llx:%llx:%llx> -> "
		    "<%llx:%llx:%llx> (%x obsolete)\n",
		    (longlong_t)vd->vdev_id,
		    (longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
		    (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
		    (longlong_t)DVA_GET_VDEV(&vimep->vimep_dst),
		    (longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst),
		    (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
		    counts[i]);
	}
	(void) printf("\n");

	uint64_t obsolete_sm_object;
	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
	if (obsolete_sm_object != 0) {
		objset_t *mos = vd->vdev_spa->spa_meta_objset;
		(void) printf("obsolete space map object %llu:\n",
		    (u_longlong_t)obsolete_sm_object);
		ASSERT(vd->vdev_obsolete_sm != NULL);
		ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==,
		    obsolete_sm_object);
		dump_spacemap(mos, vd->vdev_obsolete_sm);
		(void) printf("\n");
	}
}

static void
dump_metaslabs(spa_t *spa)
{
	vdev_t *vd, *rvd = spa->spa_root_vdev;
	uint64_t m, c = 0, children = rvd->vdev_children;

	(void) printf("\nMetaslabs:\n");

	if (!dump_opt['d'] && zopt_metaslab_args > 0) {
		c = zopt_metaslab[0];

		if (c >= children)
			(void) fatal("bad vdev id: %llu", (u_longlong_t)c);

		if (zopt_metaslab_args > 1) {
			vd = rvd->vdev_child[c];
			print_vdev_metaslab_header(vd);

			for (m = 1; m < zopt_metaslab_args; m++) {
				if (zopt_metaslab[m] < vd->vdev_ms_count)
					dump_metaslab(
					    vd->vdev_ms[zopt_metaslab[m]]);
				else
					(void) fprintf(stderr, "bad metaslab "
					    "number %llu\n",
					    (u_longlong_t)zopt_metaslab[m]);
			}
			(void) printf("\n");
			return;
		}
		children = c + 1;
	}
	for (; c < children; c++) {
		vd = rvd->vdev_child[c];
		print_vdev_metaslab_header(vd);

		print_vdev_indirect(vd);

		for (m = 0; m < vd->vdev_ms_count; m++)
			dump_metaslab(vd->vdev_ms[m]);
		(void) printf("\n");
	}
}
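
/*
 * Usage sketch (illustrative, pool name invented): the values in
 * zopt_metaslab[] above come straight from the zdb command line, so an
 * invocation like
 *
 *	zdb -m tank 0 3 7
 *
 * takes vdev id 0 from zopt_metaslab[0] and dumps only metaslabs 3 and 7
 * of that vdev, "zdb -m tank 0" dumps every metaslab of vdev 0, and plain
 * "zdb -m tank" walks all top-level vdevs.
 */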

static void
dump_log_spacemaps(spa_t *spa)
{
	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
		return;

	(void) printf("\nLog Space Maps in Pool:\n");
	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
	    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
		space_map_t *sm = NULL;
		VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
		    sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));

		(void) printf("Log Spacemap object %llu txg %llu\n",
		    (u_longlong_t)sls->sls_sm_obj, (u_longlong_t)sls->sls_txg);
		dump_spacemap(spa->spa_meta_objset, sm);
		space_map_close(sm);
	}
	(void) printf("\n");
}

static void
dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
{
	const ddt_phys_t *ddp = dde->dde_phys;
	const ddt_key_t *ddk = &dde->dde_key;
	const char *types[4] = { "ditto", "single", "double", "triple" };
	char blkbuf[BP_SPRINTF_LEN];
	blkptr_t blk;
	int p;

	for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
		if (ddp->ddp_phys_birth == 0)
			continue;
		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
		snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
		(void) printf("index %llx refcnt %llu %s %s\n",
		    (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
		    types[p], blkbuf);
	}
}

static void
dump_dedup_ratio(const ddt_stat_t *dds)
{
	double rL, rP, rD, D, dedup, compress, copies;

	if (dds->dds_blocks == 0)
		return;

	rL = (double)dds->dds_ref_lsize;
	rP = (double)dds->dds_ref_psize;
	rD = (double)dds->dds_ref_dsize;
	D = (double)dds->dds_dsize;

	dedup = rD / D;
	compress = rL / rP;
	copies = rD / rP;

	(void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
	    "dedup * compress / copies = %.2f\n\n",
	    dedup, compress, copies, dedup * compress / copies);
}
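
/*
 * Worked example (illustrative numbers): for dds_ref_lsize = 200G,
 * dds_ref_psize = 100G, dds_ref_dsize = 120G and dds_dsize = 60G, the
 * ratios computed above come out to dedup = 120/60 = 2.00, compress =
 * 200/100 = 2.00 and copies = 120/100 = 1.20, so dump_dedup_ratio()
 * prints a combined "dedup * compress / copies" space-saving factor of
 * 2.00 * 2.00 / 1.20 = 3.33.
 */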

static void
dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
{
	char name[DDT_NAMELEN];
	ddt_entry_t dde;
	uint64_t walk = 0;
	dmu_object_info_t doi;
	uint64_t count, dspace, mspace;
	int error;

	error = ddt_object_info(ddt, type, class, &doi);

	if (error == ENOENT)
		return;
	ASSERT(error == 0);

	error = ddt_object_count(ddt, type, class, &count);
	ASSERT(error == 0);
	if (count == 0)
		return;

	dspace = doi.doi_physical_blocks_512 << 9;
	mspace = doi.doi_fill_count * doi.doi_data_block_size;

	ddt_object_name(ddt, type, class, name);

	(void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
	    name,
	    (u_longlong_t)count,
	    (u_longlong_t)(dspace / count),
	    (u_longlong_t)(mspace / count));

	if (dump_opt['D'] < 3)
		return;

	zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);

	if (dump_opt['D'] < 4)
		return;

	if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE)
		return;

	(void) printf("%s contents:\n\n", name);

	while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
		dump_dde(ddt, &dde, walk);

	ASSERT3U(error, ==, ENOENT);

	(void) printf("\n");
}

static void
dump_all_ddts(spa_t *spa)
{
	ddt_histogram_t ddh_total = {{{0}}};
	ddt_stat_t dds_total = {0};

	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		ddt_t *ddt = spa->spa_ddt[c];
		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
			for (enum ddt_class class = 0; class < DDT_CLASSES;
			    class++) {
				dump_ddt(ddt, type, class);
			}
		}
	}

	ddt_get_dedup_stats(spa, &dds_total);

	if (dds_total.dds_blocks == 0) {
		(void) printf("All DDTs are empty\n");
		return;
	}

	(void) printf("\n");

	if (dump_opt['D'] > 1) {
		(void) printf("DDT histogram (aggregated over all DDTs):\n");
		ddt_get_dedup_histogram(spa, &ddh_total);
		zpool_dump_ddt(&dds_total, &ddh_total);
	}

	dump_dedup_ratio(&dds_total);
}

static void
dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
{
	char *prefix = arg;

	(void) printf("%s [%llu,%llu) length %llu\n",
	    prefix,
	    (u_longlong_t)start,
	    (u_longlong_t)(start + size),
	    (u_longlong_t)(size));
}

static void
dump_dtl(vdev_t *vd, int indent)
{
	spa_t *spa = vd->vdev_spa;
	boolean_t required;
	const char *name[DTL_TYPES] = { "missing", "partial", "scrub",
	    "outage" };
	char prefix[256];

	spa_vdev_state_enter(spa, SCL_NONE);
	required = vdev_dtl_required(vd);
	(void) spa_vdev_state_exit(spa, NULL, 0);

	if (indent == 0)
		(void) printf("\nDirty time logs:\n\n");

	(void) printf("\t%*s%s [%s]\n", indent, "",
	    vd->vdev_path ? vd->vdev_path :
	    vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa),
	    required ? "DTL-required" : "DTL-expendable");

	for (int t = 0; t < DTL_TYPES; t++) {
		range_tree_t *rt = vd->vdev_dtl[t];
		if (range_tree_space(rt) == 0)
			continue;
		(void) snprintf(prefix, sizeof (prefix), "\t%*s%s",
		    indent + 2, "", name[t]);
		range_tree_walk(rt, dump_dtl_seg, prefix);
		if (dump_opt['d'] > 5 && vd->vdev_children == 0)
			dump_spacemap(spa->spa_meta_objset,
			    vd->vdev_dtl_sm);
	}

	for (unsigned c = 0; c < vd->vdev_children; c++)
		dump_dtl(vd->vdev_child[c], indent + 4);
}
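
/*
 * Illustrative output sketch (device path and numbers invented): DTL
 * segments are ranges of transaction groups, so a leaf vdev that missed
 * txgs 100 through 149 would be rendered by the walk above roughly as
 *
 *	  /dev/sda1 [DTL-required]
 *	    missing [100,150) length 50
 */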

static void
dump_history(spa_t *spa)
{
	nvlist_t **events = NULL;
	char *buf;
	uint64_t resid, len, off = 0;
	uint_t num = 0;
	int error;
	char tbuf[30];

	if ((buf = malloc(SPA_OLD_MAXBLOCKSIZE)) == NULL) {
		(void) fprintf(stderr, "%s: unable to allocate I/O buffer\n",
		    __func__);
		return;
	}

	do {
		len = SPA_OLD_MAXBLOCKSIZE;

		if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
			(void) fprintf(stderr, "Unable to read history: "
			    "error %d\n", error);
			free(buf);
			return;
		}

		if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
			break;

		off -= resid;
	} while (len != 0);

	(void) printf("\nHistory:\n");
	for (unsigned i = 0; i < num; i++) {
		boolean_t printed = B_FALSE;

		if (nvlist_exists(events[i], ZPOOL_HIST_TIME)) {
			time_t tsec;
			struct tm t;

			tsec = fnvlist_lookup_uint64(events[i],
			    ZPOOL_HIST_TIME);
			(void) localtime_r(&tsec, &t);
			(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
		} else {
			tbuf[0] = '\0';
		}

		if (nvlist_exists(events[i], ZPOOL_HIST_CMD)) {
			(void) printf("%s %s\n", tbuf,
			    fnvlist_lookup_string(events[i], ZPOOL_HIST_CMD));
		} else if (nvlist_exists(events[i], ZPOOL_HIST_INT_EVENT)) {
			uint64_t ievent;

			ievent = fnvlist_lookup_uint64(events[i],
			    ZPOOL_HIST_INT_EVENT);
			if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS)
				goto next;

			(void) printf(" %s [internal %s txg:%ju] %s\n",
			    tbuf,
			    zfs_history_event_names[ievent],
			    fnvlist_lookup_uint64(events[i],
			    ZPOOL_HIST_TXG),
			    fnvlist_lookup_string(events[i],
			    ZPOOL_HIST_INT_STR));
		} else if (nvlist_exists(events[i], ZPOOL_HIST_INT_NAME)) {
			(void) printf("%s [txg:%ju] %s", tbuf,
			    fnvlist_lookup_uint64(events[i],
			    ZPOOL_HIST_TXG),
			    fnvlist_lookup_string(events[i],
			    ZPOOL_HIST_INT_NAME));

			if (nvlist_exists(events[i], ZPOOL_HIST_DSNAME)) {
				(void) printf(" %s (%llu)",
				    fnvlist_lookup_string(events[i],
				    ZPOOL_HIST_DSNAME),
				    (u_longlong_t)fnvlist_lookup_uint64(
				    events[i],
				    ZPOOL_HIST_DSID));
			}

			(void) printf(" %s\n", fnvlist_lookup_string(events[i],
			    ZPOOL_HIST_INT_STR));
		} else if (nvlist_exists(events[i], ZPOOL_HIST_IOCTL)) {
			(void) printf("%s ioctl %s\n", tbuf,
			    fnvlist_lookup_string(events[i],
			    ZPOOL_HIST_IOCTL));

			if (nvlist_exists(events[i], ZPOOL_HIST_INPUT_NVL)) {
				(void) printf(" input:\n");
				dump_nvlist(fnvlist_lookup_nvlist(events[i],
				    ZPOOL_HIST_INPUT_NVL), 8);
			}
			if (nvlist_exists(events[i], ZPOOL_HIST_OUTPUT_NVL)) {
				(void) printf(" output:\n");
				dump_nvlist(fnvlist_lookup_nvlist(events[i],
				    ZPOOL_HIST_OUTPUT_NVL), 8);
			}
			if (nvlist_exists(events[i], ZPOOL_HIST_ERRNO)) {
				(void) printf(" errno: %lld\n",
				    (longlong_t)fnvlist_lookup_int64(events[i],
				    ZPOOL_HIST_ERRNO));
			}
		} else {
			goto next;
		}

		printed = B_TRUE;
next:
		if (dump_opt['h'] > 1) {
			if (!printed)
				(void) printf("unrecognized record:\n");
			dump_nvlist(events[i], 2);
		}
	}
	free(buf);
}

static void
dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) os, (void) object, (void) data, (void) size;
}

static uint64_t
blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp,
    const zbookmark_phys_t *zb)
{
	if (dnp == NULL) {
		ASSERT(zb->zb_level < 0);
		if (zb->zb_object == 0)
			return (zb->zb_blkid);
		return (zb->zb_blkid * BP_GET_LSIZE(bp));
	}

	ASSERT(zb->zb_level >= 0);

	return ((zb->zb_blkid <<
	    (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
}
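
/*
 * Worked example (illustrative): SPA_BLKPTRSHIFT is 7 (128-byte block
 * pointers) and SPA_MINBLOCKSHIFT is 9 (512-byte sectors). For a dnode
 * with dn_indblkshift = 17 (128K indirect blocks, i.e. 1024 blkptrs per
 * indirect block) and dn_datablkszsec = 256 (128K data blocks), a
 * level-1 bookmark with zb_blkid = 2 maps to
 *	(2 << (1 * (17 - 7))) * 256 << 9 = 2048 * 128K = 256M,
 * the file offset of the first data block that indirect block covers.
 */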

static void
snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen,
    const blkptr_t *bp)
{
	abd_t *pabd;
	void *buf;
	zio_t *zio;
	zfs_zstdhdr_t zstd_hdr;
	int error;

	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_ZSTD)
		return;

	if (BP_IS_HOLE(bp))
		return;

	if (BP_IS_EMBEDDED(bp)) {
		buf = malloc(SPA_MAXBLOCKSIZE);
		if (buf == NULL) {
			(void) fprintf(stderr, "out of memory\n");
			exit(1);
		}
		decode_embedded_bp_compressed(bp, buf);
		memcpy(&zstd_hdr, buf, sizeof (zstd_hdr));
		free(buf);
		zstd_hdr.c_len = BE_32(zstd_hdr.c_len);
		zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level);
		(void) snprintf(blkbuf + strlen(blkbuf),
		    buflen - strlen(blkbuf),
		    " ZSTD:size=%u:version=%u:level=%u:EMBEDDED",
		    zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr),
		    zfs_get_hdrlevel(&zstd_hdr));
		return;
	}

	pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE);
	zio = zio_root(spa, NULL, NULL, 0);

	/* Decrypt but don't decompress so we can read the compression header */
	zio_nowait(zio_read(zio, spa, bp, pabd, BP_GET_PSIZE(bp), NULL, NULL,
	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW_COMPRESS,
	    NULL));
	error = zio_wait(zio);
	if (error) {
		(void) fprintf(stderr, "read failed: %d\n", error);
		return;
	}
	buf = abd_borrow_buf_copy(pabd, BP_GET_LSIZE(bp));
	memcpy(&zstd_hdr, buf, sizeof (zstd_hdr));
	zstd_hdr.c_len = BE_32(zstd_hdr.c_len);
	zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level);

	(void) snprintf(blkbuf + strlen(blkbuf),
	    buflen - strlen(blkbuf),
	    " ZSTD:size=%u:version=%u:level=%u:NORMAL",
	    zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr),
	    zfs_get_hdrlevel(&zstd_hdr));

	abd_return_buf_copy(pabd, buf, BP_GET_LSIZE(bp));
}

static void
snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,
    boolean_t bp_freed)
{
	const dva_t *dva = bp->blk_dva;
	int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
	int i;

	if (dump_opt['b'] >= 6) {
		snprintf_blkptr(blkbuf, buflen, bp);
		if (bp_freed) {
			(void) snprintf(blkbuf + strlen(blkbuf),
			    buflen - strlen(blkbuf), " %s", "FREE");
		}
		return;
	}

	if (BP_IS_EMBEDDED(bp)) {
		(void) sprintf(blkbuf,
		    "EMBEDDED et=%u %llxL/%llxP B=%llu",
		    (int)BPE_GET_ETYPE(bp),
		    (u_longlong_t)BPE_GET_LSIZE(bp),
		    (u_longlong_t)BPE_GET_PSIZE(bp),
		    (u_longlong_t)bp->blk_birth);
		return;
	}

	blkbuf[0] = '\0';

	for (i = 0; i < ndvas; i++)
		(void) snprintf(blkbuf + strlen(blkbuf),
		    buflen - strlen(blkbuf), "%llu:%llx:%llx ",
		    (u_longlong_t)DVA_GET_VDEV(&dva[i]),
		    (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
		    (u_longlong_t)DVA_GET_ASIZE(&dva[i]));

	if (BP_IS_HOLE(bp)) {
		(void) snprintf(blkbuf + strlen(blkbuf),
		    buflen - strlen(blkbuf),
		    "%llxL B=%llu",
		    (u_longlong_t)BP_GET_LSIZE(bp),
		    (u_longlong_t)bp->blk_birth);
	} else {
		(void) snprintf(blkbuf + strlen(blkbuf),
		    buflen - strlen(blkbuf),
		    "%llxL/%llxP F=%llu B=%llu/%llu",
		    (u_longlong_t)BP_GET_LSIZE(bp),
		    (u_longlong_t)BP_GET_PSIZE(bp),
		    (u_longlong_t)BP_GET_FILL(bp),
		    (u_longlong_t)bp->blk_birth,
		    (u_longlong_t)BP_PHYSICAL_BIRTH(bp));
		if (bp_freed)
			(void) snprintf(blkbuf + strlen(blkbuf),
			    buflen - strlen(blkbuf), " %s", "FREE");
		(void) snprintf(blkbuf + strlen(blkbuf),
		    buflen - strlen(blkbuf),
		    " cksum=%016llx:%016llx:%016llx:%016llx",
		    (u_longlong_t)bp->blk_cksum.zc_word[0],
		    (u_longlong_t)bp->blk_cksum.zc_word[1],
		    (u_longlong_t)bp->blk_cksum.zc_word[2],
		    (u_longlong_t)bp->blk_cksum.zc_word[3]);
	}
}
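
/*
 * Format sketch (illustrative values): a normal, non-hole block pointer
 * rendered by snprintf_blkptr_compact() looks roughly like
 *
 *	0:12a4000:3000 20000L/1000P F=1 B=3504/3504 cksum=...
 *
 * i.e. one vdev:offset:asize triple per DVA (every copy only when
 * dump_opt['d'] > 5, otherwise just the first), logical/physical size,
 * fill count, logical/physical birth txg and the checksum words; freed
 * entries get a trailing " FREE".
 */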

static void
print_indirect(spa_t *spa, blkptr_t *bp, const zbookmark_phys_t *zb,
    const dnode_phys_t *dnp)
{
	char blkbuf[BP_SPRINTF_LEN];
	int l;

	if (!BP_IS_EMBEDDED(bp)) {
		ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
		ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
	}

	(void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));

	ASSERT(zb->zb_level >= 0);

	for (l = dnp->dn_nlevels - 1; l >= -1; l--) {
		if (l == zb->zb_level) {
			(void) printf("L%llx", (u_longlong_t)zb->zb_level);
		} else {
			(void) printf(" ");
		}
	}

	snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, B_FALSE);
	if (dump_opt['Z'] && BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD)
		snprintf_zstd_header(spa, blkbuf, sizeof (blkbuf), bp);
	(void) printf("%s\n", blkbuf);
}

static int
visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
    blkptr_t *bp, const zbookmark_phys_t *zb)
{
	int err = 0;

	if (bp->blk_birth == 0)
		return (0);

	print_indirect(spa, bp, zb, dnp);

	if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) {
		arc_flags_t flags = ARC_FLAG_WAIT;
		int i;
		blkptr_t *cbp;
		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
		arc_buf_t *buf;
		uint64_t fill = 0;
		ASSERT(!BP_IS_REDACTED(bp));

		err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err)
			return (err);
		ASSERT(buf->b_data);

		/* recursively visit blocks below this */
		cbp = buf->b_data;
		for (i = 0; i < epb; i++, cbp++) {
			zbookmark_phys_t czb;

			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			err = visit_indirect(spa, dnp, cbp, &czb);
			if (err)
				break;
			fill += BP_GET_FILL(cbp);
		}
		if (!err)
			ASSERT3U(fill, ==, BP_GET_FILL(bp));
		arc_buf_destroy(buf, &buf);
	}

	return (err);
}

static void
dump_indirect(dnode_t *dn)
{
	dnode_phys_t *dnp = dn->dn_phys;
	zbookmark_phys_t czb;

	(void) printf("Indirect blocks:\n");

	SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset),
	    dn->dn_object, dnp->dn_nlevels - 1, 0);
	for (int j = 0; j < dnp->dn_nblkptr; j++) {
		czb.zb_blkid = j;
		(void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp,
		    &dnp->dn_blkptr[j], &czb);
	}

	(void) printf("\n");
}
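
/*
 * Illustrative output sketch for a two-level object (numbers invented):
 *
 *	Indirect blocks:
 *	               0 L1  0:4000:1000 20000L/400P F=32 B=50/50
 *	               0  L0 0:8000:20000 20000L/20000P F=1 B=50/50
 *	           20000  L0 0:28000:20000 20000L/20000P F=1 B=50/50
 *
 * The first column is the byte offset from blkid2offset(), the "Ln"
 * column marks the level within dn_nlevels, and visit_indirect() checks
 * on the way down that the children's fill counts sum to the parent's.
 */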

static void
dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) os, (void) object;
	dsl_dir_phys_t *dd = data;
	time_t crtime;
	char nice[32];

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (nice) >= NN_NUMBUF_SZ, "nice truncated");

	if (dd == NULL)
		return;

	ASSERT3U(size, >=, sizeof (dsl_dir_phys_t));

	crtime = dd->dd_creation_time;
	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
	(void) printf("\t\thead_dataset_obj = %llu\n",
	    (u_longlong_t)dd->dd_head_dataset_obj);
	(void) printf("\t\tparent_dir_obj = %llu\n",
	    (u_longlong_t)dd->dd_parent_obj);
	(void) printf("\t\torigin_obj = %llu\n",
	    (u_longlong_t)dd->dd_origin_obj);
	(void) printf("\t\tchild_dir_zapobj = %llu\n",
	    (u_longlong_t)dd->dd_child_dir_zapobj);
	zdb_nicenum(dd->dd_used_bytes, nice, sizeof (nice));
	(void) printf("\t\tused_bytes = %s\n", nice);
	zdb_nicenum(dd->dd_compressed_bytes, nice, sizeof (nice));
	(void) printf("\t\tcompressed_bytes = %s\n", nice);
	zdb_nicenum(dd->dd_uncompressed_bytes, nice, sizeof (nice));
	(void) printf("\t\tuncompressed_bytes = %s\n", nice);
	zdb_nicenum(dd->dd_quota, nice, sizeof (nice));
	(void) printf("\t\tquota = %s\n", nice);
	zdb_nicenum(dd->dd_reserved, nice, sizeof (nice));
	(void) printf("\t\treserved = %s\n", nice);
	(void) printf("\t\tprops_zapobj = %llu\n",
	    (u_longlong_t)dd->dd_props_zapobj);
	(void) printf("\t\tdeleg_zapobj = %llu\n",
	    (u_longlong_t)dd->dd_deleg_zapobj);
	(void) printf("\t\tflags = %llx\n",
	    (u_longlong_t)dd->dd_flags);

#define	DO(which) \
	zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, \
	    sizeof (nice)); \
	(void) printf("\t\tused_breakdown[" #which "] = %s\n", nice)
	DO(HEAD);
	DO(SNAP);
	DO(CHILD);
	DO(CHILD_RSRV);
	DO(REFRSRV);
#undef DO
	(void) printf("\t\tclones = %llu\n",
	    (u_longlong_t)dd->dd_clones);
}

static void
dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) os, (void) object;
	dsl_dataset_phys_t *ds = data;
	time_t crtime;
	char used[32], compressed[32], uncompressed[32], unique[32];
	char blkbuf[BP_SPRINTF_LEN];

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (used) >= NN_NUMBUF_SZ, "used truncated");
	_Static_assert(sizeof (compressed) >= NN_NUMBUF_SZ,
	    "compressed truncated");
	_Static_assert(sizeof (uncompressed) >= NN_NUMBUF_SZ,
	    "uncompressed truncated");
	_Static_assert(sizeof (unique) >= NN_NUMBUF_SZ, "unique truncated");

	if (ds == NULL)
		return;

	ASSERT(size == sizeof (*ds));
	crtime = ds->ds_creation_time;
	zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used));
	zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed));
	zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed,
	    sizeof (uncompressed));
	zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique));
	snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp);

	(void) printf("\t\tdir_obj = %llu\n",
	    (u_longlong_t)ds->ds_dir_obj);
	(void) printf("\t\tprev_snap_obj = %llu\n",
	    (u_longlong_t)ds->ds_prev_snap_obj);
	(void) printf("\t\tprev_snap_txg = %llu\n",
	    (u_longlong_t)ds->ds_prev_snap_txg);
	(void) printf("\t\tnext_snap_obj = %llu\n",
	    (u_longlong_t)ds->ds_next_snap_obj);
	(void) printf("\t\tsnapnames_zapobj = %llu\n",
	    (u_longlong_t)ds->ds_snapnames_zapobj);
	(void) printf("\t\tnum_children = %llu\n",
	    (u_longlong_t)ds->ds_num_children);
	(void) printf("\t\tuserrefs_obj = %llu\n",
	    (u_longlong_t)ds->ds_userrefs_obj);
	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
	(void) printf("\t\tcreation_txg = %llu\n",
	    (u_longlong_t)ds->ds_creation_txg);
	(void) printf("\t\tdeadlist_obj = %llu\n",
	    (u_longlong_t)ds->ds_deadlist_obj);
	(void) printf("\t\tused_bytes = %s\n", used);
	(void) printf("\t\tcompressed_bytes = %s\n", compressed);
	(void) printf("\t\tuncompressed_bytes = %s\n", uncompressed);
	(void) printf("\t\tunique = %s\n", unique);
	(void) printf("\t\tfsid_guid = %llu\n",
	    (u_longlong_t)ds->ds_fsid_guid);
	(void) printf("\t\tguid = %llu\n",
	    (u_longlong_t)ds->ds_guid);
	(void) printf("\t\tflags = %llx\n",
	    (u_longlong_t)ds->ds_flags);
	(void) printf("\t\tnext_clones_obj = %llu\n",
	    (u_longlong_t)ds->ds_next_clones_obj);
	(void) printf("\t\tprops_obj = %llu\n",
	    (u_longlong_t)ds->ds_props_obj);
	(void) printf("\t\tbp = %s\n", blkbuf);
}

static int
dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	(void) arg, (void) tx;
	char blkbuf[BP_SPRINTF_LEN];

	if (bp->blk_birth != 0) {
		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
		(void) printf("\t%s\n", blkbuf);
	}
	return (0);
}

static void
dump_bptree(objset_t *os, uint64_t obj, const char *name)
{
	char bytes[32];
	bptree_phys_t *bt;
	dmu_buf_t *db;

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");

	if (dump_opt['d'] < 3)
		return;

	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
	bt = db->db_data;
	zdb_nicenum(bt->bt_bytes, bytes, sizeof (bytes));
	(void) printf("\n %s: %llu datasets, %s\n",
	    name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes);
	dmu_buf_rele(db, FTAG);

	if (dump_opt['d'] < 5)
		return;

	(void) printf("\n");

	(void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL);
}

static int
dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
{
	(void) arg, (void) tx;
	char blkbuf[BP_SPRINTF_LEN];

	ASSERT(bp->blk_birth != 0);
	snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed);
	(void) printf("\t%s\n", blkbuf);
	return (0);
}

static void
dump_full_bpobj(bpobj_t *bpo, const char *name, int indent)
{
	char bytes[32];
	char comp[32];
	char uncomp[32];
	uint64_t i;

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");
	_Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated");
	_Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated");

	if (dump_opt['d'] < 3)
		return;

	zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes));
	if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
		zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp));
		zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp));
		if (bpo->bpo_havefreed) {
			(void) printf(" %*s: object %llu, %llu local "
			    "blkptrs, %llu freed, %llu subobjs in object %llu, "
			    "%s (%s/%s comp)\n",
			    indent * 8, name,
			    (u_longlong_t)bpo->bpo_object,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_freed,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
			    (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
			    bytes, comp, uncomp);
		} else {
			(void) printf(" %*s: object %llu, %llu local "
			    "blkptrs, %llu subobjs in object %llu, "
			    "%s (%s/%s comp)\n",
			    indent * 8, name,
			    (u_longlong_t)bpo->bpo_object,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
			    (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
			    bytes, comp, uncomp);
		}

		for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
			uint64_t subobj;
			bpobj_t subbpo;
			int error;
			VERIFY0(dmu_read(bpo->bpo_os,
			    bpo->bpo_phys->bpo_subobjs,
			    i * sizeof (subobj), sizeof (subobj), &subobj, 0));
			error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
			if (error != 0) {
				(void) printf("ERROR %u while trying to open "
				    "subobj id %llu\n",
				    error, (u_longlong_t)subobj);
				continue;
			}
			dump_full_bpobj(&subbpo, "subobj", indent + 1);
			bpobj_close(&subbpo);
		}
	} else {
		if (bpo->bpo_havefreed) {
			(void) printf(" %*s: object %llu, %llu blkptrs, "
			    "%llu freed, %s\n",
			    indent * 8, name,
			    (u_longlong_t)bpo->bpo_object,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_freed,
			    bytes);
		} else {
			(void) printf(" %*s: object %llu, %llu blkptrs, "
			    "%s\n",
			    indent * 8, name,
			    (u_longlong_t)bpo->bpo_object,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
			    bytes);
		}
	}

	if (dump_opt['d'] < 5)
		return;

	if (indent == 0) {
		(void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
		(void) printf("\n");
	}
}
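
/*
 * Structural note (summary of the code above): a bpobj either holds
 * block pointers directly or, once it has sub-bpobjs, also references an
 * object containing sub-bpobj ids; dump_full_bpobj() mirrors that
 * recursion, indenting one level per "subobj" line, e.g. (numbers
 * invented):
 *
 *	 deadlist: object 130, 12 local blkptrs, 2 subobjs in object 131, ...
 *	         subobj: object 140, 7 blkptrs, ...
 */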

static int
dump_bookmark(dsl_pool_t *dp, char *name, boolean_t print_redact,
    boolean_t print_list)
{
	int err = 0;
	zfs_bookmark_phys_t prop;
	objset_t *mos = dp->dp_spa->spa_meta_objset;
	err = dsl_bookmark_lookup(dp, name, NULL, &prop);

	if (err != 0) {
		return (err);
	}

	(void) printf("\t#%s: ", strchr(name, '#') + 1);
	(void) printf("{guid: %llx creation_txg: %llu creation_time: "
	    "%llu redaction_obj: %llu}\n", (u_longlong_t)prop.zbm_guid,
	    (u_longlong_t)prop.zbm_creation_txg,
	    (u_longlong_t)prop.zbm_creation_time,
	    (u_longlong_t)prop.zbm_redaction_obj);

	IMPLY(print_list, print_redact);
	if (!print_redact || prop.zbm_redaction_obj == 0)
		return (0);

	redaction_list_t *rl;
	VERIFY0(dsl_redaction_list_hold_obj(dp,
	    prop.zbm_redaction_obj, FTAG, &rl));

	redaction_list_phys_t *rlp = rl->rl_phys;
	(void) printf("\tRedacted:\n\t\tProgress: ");
	if (rlp->rlp_last_object != UINT64_MAX ||
	    rlp->rlp_last_blkid != UINT64_MAX) {
		(void) printf("%llu %llu (incomplete)\n",
		    (u_longlong_t)rlp->rlp_last_object,
		    (u_longlong_t)rlp->rlp_last_blkid);
	} else {
		(void) printf("complete\n");
	}
	(void) printf("\t\tSnapshots: [");
	for (unsigned int i = 0; i < rlp->rlp_num_snaps; i++) {
		if (i > 0)
			(void) printf(", ");
		(void) printf("%0llu",
		    (u_longlong_t)rlp->rlp_snaps[i]);
	}
	(void) printf("]\n\t\tLength: %llu\n",
	    (u_longlong_t)rlp->rlp_num_entries);

	if (!print_list) {
		dsl_redaction_list_rele(rl, FTAG);
		return (0);
	}

	if (rlp->rlp_num_entries == 0) {
		dsl_redaction_list_rele(rl, FTAG);
		(void) printf("\t\tRedaction List: []\n\n");
		return (0);
	}

	redact_block_phys_t *rbp_buf;
	uint64_t size;
	dmu_object_info_t doi;

	VERIFY0(dmu_object_info(mos, prop.zbm_redaction_obj, &doi));
	size = doi.doi_max_offset;
	rbp_buf = kmem_alloc(size, KM_SLEEP);

	err = dmu_read(mos, prop.zbm_redaction_obj, 0, size,
	    rbp_buf, 0);
	if (err != 0) {
		dsl_redaction_list_rele(rl, FTAG);
		kmem_free(rbp_buf, size);
		return (err);
	}

	(void) printf("\t\tRedaction List: [{object: %llx, offset: "
	    "%llx, blksz: %x, count: %llx}",
	    (u_longlong_t)rbp_buf[0].rbp_object,
	    (u_longlong_t)rbp_buf[0].rbp_blkid,
	    (uint_t)(redact_block_get_size(&rbp_buf[0])),
	    (u_longlong_t)redact_block_get_count(&rbp_buf[0]));

	for (size_t i = 1; i < rlp->rlp_num_entries; i++) {
		(void) printf(",\n\t\t{object: %llx, offset: %llx, "
		    "blksz: %x, count: %llx}",
		    (u_longlong_t)rbp_buf[i].rbp_object,
		    (u_longlong_t)rbp_buf[i].rbp_blkid,
		    (uint_t)(redact_block_get_size(&rbp_buf[i])),
		    (u_longlong_t)redact_block_get_count(&rbp_buf[i]));
	}
	dsl_redaction_list_rele(rl, FTAG);
	kmem_free(rbp_buf, size);
	(void) printf("]\n\n");
	return (0);
}
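
/*
 * Illustrative output (all values invented): a redaction bookmark dumped
 * at high verbosity looks roughly like
 *
 *	#rbm: {guid: 8f3ca9d2 creation_txg: 512 creation_time: ...}
 *	Redacted:
 *		Progress: complete
 *		Snapshots: [1021]
 *		Length: 2
 */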

static void
dump_bookmarks(objset_t *os, int verbosity)
{
	zap_cursor_t zc;
	zap_attribute_t attr;
	dsl_dataset_t *ds = dmu_objset_ds(os);
	dsl_pool_t *dp = spa_get_dsl(os->os_spa);
	objset_t *mos = os->os_spa->spa_meta_objset;
	if (verbosity < 4)
		return;
	dsl_pool_config_enter(dp, FTAG);

	for (zap_cursor_init(&zc, mos, ds->ds_bookmarks_obj);
	    zap_cursor_retrieve(&zc, &attr) == 0;
	    zap_cursor_advance(&zc)) {
		char osname[ZFS_MAX_DATASET_NAME_LEN];
		char buf[ZFS_MAX_DATASET_NAME_LEN];
		int len;
		dmu_objset_name(os, osname);
		len = snprintf(buf, sizeof (buf), "%s#%s", osname,
		    attr.za_name);
		VERIFY3S(len, <, ZFS_MAX_DATASET_NAME_LEN);
		(void) dump_bookmark(dp, buf, verbosity >= 5, verbosity >= 6);
	}
	zap_cursor_fini(&zc);
	dsl_pool_config_exit(dp, FTAG);
}

static void
bpobj_count_refd(bpobj_t *bpo)
{
	mos_obj_refd(bpo->bpo_object);

	if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
		mos_obj_refd(bpo->bpo_phys->bpo_subobjs);
		for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
			uint64_t subobj;
			bpobj_t subbpo;
			int error;
			VERIFY0(dmu_read(bpo->bpo_os,
			    bpo->bpo_phys->bpo_subobjs,
			    i * sizeof (subobj), sizeof (subobj), &subobj, 0));
			error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
			if (error != 0) {
				(void) printf("ERROR %u while trying to open "
				    "subobj id %llu\n",
				    error, (u_longlong_t)subobj);
				continue;
			}
			bpobj_count_refd(&subbpo);
			bpobj_close(&subbpo);
		}
	}
}

static int
dsl_deadlist_entry_count_refd(void *arg, dsl_deadlist_entry_t *dle)
{
	spa_t *spa = arg;
	uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;
	if (dle->dle_bpobj.bpo_object != empty_bpobj)
		bpobj_count_refd(&dle->dle_bpobj);
	return (0);
}

static int
dsl_deadlist_entry_dump(void *arg, dsl_deadlist_entry_t *dle)
{
	ASSERT(arg == NULL);
	if (dump_opt['d'] >= 5) {
		char buf[128];
		(void) snprintf(buf, sizeof (buf),
		    "mintxg %llu -> obj %llu",
		    (longlong_t)dle->dle_mintxg,
		    (longlong_t)dle->dle_bpobj.bpo_object);

		dump_full_bpobj(&dle->dle_bpobj, buf, 0);
	} else {
		(void) printf("mintxg %llu -> obj %llu\n",
		    (longlong_t)dle->dle_mintxg,
		    (longlong_t)dle->dle_bpobj.bpo_object);
	}
	return (0);
}
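
/*
 * Background note (summary, not from this file): a deadlist is a
 * txg-bucketed collection of bpobjs keyed by each entry's minimum txg,
 * which is why every entry prints as "mintxg <txg> -> obj <bpobj id>"
 * and, at -ddddd and above, expands into the full bpobj dump.
 */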

static void
dump_blkptr_list(dsl_deadlist_t *dl, const char *name)
{
	char bytes[32];
	char comp[32];
	char uncomp[32];
	char entries[32];
	spa_t *spa = dmu_objset_spa(dl->dl_os);
	uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;

	if (dl->dl_oldfmt) {
		if (dl->dl_bpobj.bpo_object != empty_bpobj)
			bpobj_count_refd(&dl->dl_bpobj);
	} else {
		mos_obj_refd(dl->dl_object);
		dsl_deadlist_iterate(dl, dsl_deadlist_entry_count_refd, spa);
	}

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");
	_Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated");
	_Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated");
	_Static_assert(sizeof (entries) >= NN_NUMBUF_SZ, "entries truncated");

	if (dump_opt['d'] < 3)
		return;

	if (dl->dl_oldfmt) {
		dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);
		return;
	}

	zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes));
	zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp));
	zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp));
	zdb_nicenum(avl_numnodes(&dl->dl_tree), entries, sizeof (entries));
	(void) printf("\n %s: %s (%s/%s comp), %s entries\n",
	    name, bytes, comp, uncomp, entries);

	if (dump_opt['d'] < 4)
		return;

	(void) putchar('\n');

	dsl_deadlist_iterate(dl, dsl_deadlist_entry_dump, NULL);
}

static int
verify_dd_livelist(objset_t *os)
{
	uint64_t ll_used, used, ll_comp, comp, ll_uncomp, uncomp;
	dsl_pool_t *dp = spa_get_dsl(os->os_spa);
	dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;

	ASSERT(!dmu_objset_is_snapshot(os));
	if (!dsl_deadlist_is_open(&dd->dd_livelist))
		return (0);

	/* Iterate through the livelist to check for duplicates */
	dsl_deadlist_iterate(&dd->dd_livelist, sublivelist_verify_lightweight,
	    NULL);

	dsl_pool_config_enter(dp, FTAG);
	dsl_deadlist_space(&dd->dd_livelist, &ll_used,
	    &ll_comp, &ll_uncomp);

	dsl_dataset_t *origin_ds;
	ASSERT(dsl_pool_config_held(dp));
	VERIFY0(dsl_dataset_hold_obj(dp,
	    dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_ds));
	VERIFY0(dsl_dataset_space_written(origin_ds, os->os_dsl_dataset,
	    &used, &comp, &uncomp));
	dsl_dataset_rele(origin_ds, FTAG);
	dsl_pool_config_exit(dp, FTAG);
	/*
	 * It's possible that the dataset's uncomp space is larger than the
	 * livelist's because livelists do not track embedded block pointers
	 */
	if (used != ll_used || comp != ll_comp || uncomp < ll_uncomp) {
		char nice_used[32], nice_comp[32], nice_uncomp[32];
		(void) printf("Discrepancy in space accounting:\n");
		zdb_nicenum(used, nice_used, sizeof (nice_used));
		zdb_nicenum(comp, nice_comp, sizeof (nice_comp));
		zdb_nicenum(uncomp, nice_uncomp, sizeof (nice_uncomp));
		(void) printf("dir: used %s, comp %s, uncomp %s\n",
		    nice_used, nice_comp, nice_uncomp);
		zdb_nicenum(ll_used, nice_used, sizeof (nice_used));
		zdb_nicenum(ll_comp, nice_comp, sizeof (nice_comp));
		zdb_nicenum(ll_uncomp, nice_uncomp, sizeof (nice_uncomp));
		(void) printf("livelist: used %s, comp %s, uncomp %s\n",
		    nice_used, nice_comp, nice_uncomp);
		return (1);
	}
	return (0);
}

static char *key_material = NULL;

static boolean_t
zdb_derive_key(dsl_dir_t *dd, uint8_t *key_out)
{
	uint64_t keyformat, salt, iters;
	int i;
	unsigned char c;

	VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
	    zfs_prop_to_name(ZFS_PROP_KEYFORMAT), sizeof (uint64_t),
	    1, &keyformat));

	switch (keyformat) {
	case ZFS_KEYFORMAT_HEX:
		for (i = 0; i < WRAPPING_KEY_LEN * 2; i += 2) {
			if (!isxdigit(key_material[i]) ||
			    !isxdigit(key_material[i+1]))
				return (B_FALSE);
			if (sscanf(&key_material[i], "%02hhx", &c) != 1)
				return (B_FALSE);
			key_out[i / 2] = c;
		}
		break;

	case ZFS_KEYFORMAT_PASSPHRASE:
		VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset,
		    dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT),
		    sizeof (uint64_t), 1, &salt));
		VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset,
		    dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS),
		    sizeof (uint64_t), 1, &iters));

		if (PKCS5_PBKDF2_HMAC_SHA1(key_material, strlen(key_material),
		    ((uint8_t *)&salt), sizeof (uint64_t), iters,
		    WRAPPING_KEY_LEN, key_out) != 1)
			return (B_FALSE);

		break;

	default:
		fatal("no support for key format %u\n",
		    (unsigned int) keyformat);
	}

	return (B_TRUE);
}
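
/*
 * Derivation note (summary of the code above): a "hex" key is simply the
 * wrapping key spelled out as 2 * WRAPPING_KEY_LEN hex digits, while a
 * passphrase is stretched with OpenSSL's PBKDF2-HMAC-SHA1 using the salt
 * and iteration count stored in the dataset's crypto object, mirroring
 * what happens when the key is loaded normally.
 */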

static char encroot[ZFS_MAX_DATASET_NAME_LEN];
static boolean_t key_loaded = B_FALSE;

static void
zdb_load_key(objset_t *os)
{
	dsl_pool_t *dp;
	dsl_dir_t *dd, *rdd;
	uint8_t key[WRAPPING_KEY_LEN];
	uint64_t rddobj;
	int err;

	dp = spa_get_dsl(os->os_spa);
	dd = os->os_dsl_dataset->ds_dir;

	dsl_pool_config_enter(dp, FTAG);
	VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
	    DSL_CRYPTO_KEY_ROOT_DDOBJ, sizeof (uint64_t), 1, &rddobj));
	VERIFY0(dsl_dir_hold_obj(dd->dd_pool, rddobj, NULL, FTAG, &rdd));
	dsl_dir_name(rdd, encroot);
	dsl_dir_rele(rdd, FTAG);

	if (!zdb_derive_key(dd, key))
		fatal("couldn't derive encryption key");

	dsl_pool_config_exit(dp, FTAG);

	ASSERT3U(dsl_dataset_get_keystatus(dd), ==, ZFS_KEYSTATUS_UNAVAILABLE);

	dsl_crypto_params_t *dcp;
	nvlist_t *crypto_args;

	crypto_args = fnvlist_alloc();
	fnvlist_add_uint8_array(crypto_args, "wkeydata",
	    (uint8_t *)key, WRAPPING_KEY_LEN);
	VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE,
	    NULL, crypto_args, &dcp));
	err = spa_keystore_load_wkey(encroot, dcp, B_FALSE);

	dsl_crypto_params_free(dcp, (err != 0));
	fnvlist_free(crypto_args);

	if (err != 0)
		fatal(
		    "couldn't load encryption key for %s: %s",
		    encroot, err == ZFS_ERR_CRYPTO_NOTSUP ?
		    "crypto params not supported" : strerror(err));

	ASSERT3U(dsl_dataset_get_keystatus(dd), ==, ZFS_KEYSTATUS_AVAILABLE);

	printf("Unlocked encryption root: %s\n", encroot);
	key_loaded = B_TRUE;
}

static void
zdb_unload_key(void)
{
	if (!key_loaded)
		return;

	VERIFY0(spa_keystore_unload_wkey(encroot));
	key_loaded = B_FALSE;
}

static avl_tree_t idx_tree;
static avl_tree_t domain_tree;
static boolean_t fuid_table_loaded;
static objset_t *sa_os = NULL;
static sa_attr_type_t *sa_attr_table = NULL;

static int
open_objset(const char *path, const void *tag, objset_t **osp)
{
	int err;
	uint64_t sa_attrs = 0;
	uint64_t version = 0;

	VERIFY3P(sa_os, ==, NULL);

	/*
	 * We can't own an objset if it's redacted. Therefore, we do this
	 * dance: hold the objset, then acquire a long hold on its dataset, then
	 * release the pool (which is held as part of holding the objset).
	 */

	if (dump_opt['K']) {
		/* decryption requested, try to load keys */
		err = dmu_objset_hold(path, tag, osp);
		if (err != 0) {
			(void) fprintf(stderr, "failed to hold dataset "
			    "'%s': %s\n",
			    path, strerror(err));
			return (err);
		}
		dsl_dataset_long_hold(dmu_objset_ds(*osp), tag);
		dsl_pool_rele(dmu_objset_pool(*osp), tag);

		/* succeeds or dies */
		zdb_load_key(*osp);

		/* release it all */
		dsl_dataset_long_rele(dmu_objset_ds(*osp), tag);
		dsl_dataset_rele(dmu_objset_ds(*osp), tag);
	}

	int ds_hold_flags = key_loaded ? DS_HOLD_FLAG_DECRYPT : 0;

	err = dmu_objset_hold_flags(path, ds_hold_flags, tag, osp);
	if (err != 0) {
		(void) fprintf(stderr, "failed to hold dataset '%s': %s\n",
		    path, strerror(err));
		return (err);
	}
	dsl_dataset_long_hold(dmu_objset_ds(*osp), tag);
	dsl_pool_rele(dmu_objset_pool(*osp), tag);

	if (dmu_objset_type(*osp) == DMU_OST_ZFS &&
	    (key_loaded || !(*osp)->os_encrypted)) {
		(void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR,
		    8, 1, &version);
		if (version >= ZPL_VERSION_SA) {
			(void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
			    8, 1, &sa_attrs);
		}
		err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END,
		    &sa_attr_table);
		if (err != 0) {
			(void) fprintf(stderr, "sa_setup failed: %s\n",
			    strerror(err));
			dsl_dataset_long_rele(dmu_objset_ds(*osp), tag);
			dsl_dataset_rele_flags(dmu_objset_ds(*osp),
			    ds_hold_flags, tag);
			*osp = NULL;
		}
	}
	sa_os = *osp;

	return (err);
}

static void
close_objset(objset_t *os, const void *tag)
{
	VERIFY3P(os, ==, sa_os);
	if (os->os_sa != NULL)
		sa_tear_down(os);
	dsl_dataset_long_rele(dmu_objset_ds(os), tag);
	dsl_dataset_rele_flags(dmu_objset_ds(os),
	    key_loaded ? DS_HOLD_FLAG_DECRYPT : 0, tag);
	sa_attr_table = NULL;
	sa_os = NULL;

	zdb_unload_key();
}

static void
fuid_table_destroy(void)
{
	if (fuid_table_loaded) {
		zfs_fuid_table_destroy(&idx_tree, &domain_tree);
		fuid_table_loaded = B_FALSE;
	}
}

/*
 * Print uid or gid information.
 * For a normal POSIX id, just the id is printed in decimal format.
 * For CIFS files with a FUID, the fuid is printed in hex followed by
 * the domain-rid string.
 */
static void
print_idstr(uint64_t id, const char *id_type)
{
	if (FUID_INDEX(id)) {
		const char *domain =
		    zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id));
		(void) printf("\t%s %llx [%s-%d]\n", id_type,
		    (u_longlong_t)id, domain, (int)FUID_RID(id));
	} else {
		(void) printf("\t%s %llu\n", id_type, (u_longlong_t)id);
	}

}
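
/*
 * Worked example (illustrative; domain string invented): FUID_INDEX()
 * takes the upper 32 bits of the id and FUID_RID() the lower 32, so an
 * id of 0x100000457 (index 1, RID 1111) would print as
 *
 *	uid 100000457 [BUILTIN-1111]
 *
 * where "BUILTIN" stands in for whatever domain string index 1 resolves
 * to in the FUID table; a plain POSIX id like 1000 prints as "uid 1000".
 */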

static void
dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)
{
	uint32_t uid_idx, gid_idx;

	uid_idx = FUID_INDEX(uid);
	gid_idx = FUID_INDEX(gid);

	/* Load domain table, if not already loaded */
	if (!fuid_table_loaded && (uid_idx || gid_idx)) {
		uint64_t fuid_obj;

		/* first find the fuid object.  It lives in the master node */
		VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES,
		    8, 1, &fuid_obj) == 0);
		zfs_fuid_avl_tree_create(&idx_tree, &domain_tree);
		(void) zfs_fuid_table_load(os, fuid_obj,
		    &idx_tree, &domain_tree);
		fuid_table_loaded = B_TRUE;
	}

	print_idstr(uid, "uid");
	print_idstr(gid, "gid");
}

static void
dump_znode_sa_xattr(sa_handle_t *hdl)
{
	nvlist_t *sa_xattr;
	nvpair_t *elem = NULL;
	int sa_xattr_size = 0;
	int sa_xattr_entries = 0;
	int error;
	char *sa_xattr_packed;

	error = sa_size(hdl, sa_attr_table[ZPL_DXATTR], &sa_xattr_size);
	if (error || sa_xattr_size == 0)
		return;

	sa_xattr_packed = malloc(sa_xattr_size);
	if (sa_xattr_packed == NULL)
		return;

	error = sa_lookup(hdl, sa_attr_table[ZPL_DXATTR],
	    sa_xattr_packed, sa_xattr_size);
	if (error) {
		free(sa_xattr_packed);
		return;
	}

	error = nvlist_unpack(sa_xattr_packed, sa_xattr_size, &sa_xattr, 0);
	if (error) {
		free(sa_xattr_packed);
		return;
	}

	while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL)
		sa_xattr_entries++;

	(void) printf("\tSA xattrs: %d bytes, %d entries\n\n",
	    sa_xattr_size, sa_xattr_entries);
	while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) {
		boolean_t can_print = !dump_opt['P'];
		uchar_t *value;
		uint_t cnt, idx;

		(void) printf("\t\t%s = ", nvpair_name(elem));
		nvpair_value_byte_array(elem, &value, &cnt);

		for (idx = 0; idx < cnt; ++idx) {
			if (!isprint(value[idx])) {
				can_print = B_FALSE;
				break;
			}
		}

		for (idx = 0; idx < cnt; ++idx) {
			if (can_print)
				(void) putchar(value[idx]);
			else
				(void) printf("\\%3.3o", value[idx]);
		}
		(void) putchar('\n');
	}

	nvlist_free(sa_xattr);
	free(sa_xattr_packed);
}

static void
dump_znode_symlink(sa_handle_t *hdl)
{
	int sa_symlink_size = 0;
	char linktarget[MAXPATHLEN];
	int error;

	error = sa_size(hdl, sa_attr_table[ZPL_SYMLINK], &sa_symlink_size);
	if (error || sa_symlink_size == 0) {
		return;
	}
	if (sa_symlink_size >= sizeof (linktarget)) {
		(void) printf("symlink size %d is too large\n",
		    sa_symlink_size);
		return;
	}
	linktarget[sa_symlink_size] = '\0';
	if (sa_lookup(hdl, sa_attr_table[ZPL_SYMLINK],
	    &linktarget, sa_symlink_size) == 0)
		(void) printf("\ttarget %s\n", linktarget);
}
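
/*
 * Display note (summary of the loop above): an xattr value is printed
 * verbatim only when every byte is printable and -P was not given;
 * otherwise each byte is escaped as three-digit octal, so a value of
 * "ab\n" would be shown as \141\142\012.
 */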

static void
dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	char path[MAXPATHLEN * 2];	/* allow for xattr and failure prefix */
	sa_handle_t *hdl;
	uint64_t xattr, rdev, gen;
	uint64_t uid, gid, mode, fsize, parent, links;
	uint64_t pflags;
	uint64_t acctm[2], modtm[2], chgtm[2], crtm[2];
	time_t z_crtime, z_atime, z_mtime, z_ctime;
	sa_bulk_attr_t bulk[12];
	int idx = 0;
	int error;

	VERIFY3P(os, ==, sa_os);
	if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) {
		(void) printf("Failed to get handle for SA znode\n");
		return;
	}

	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL,
	    &links, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL,
	    &mode, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT],
	    NULL, &parent, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL,
	    &fsize, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL,
	    acctm, 16);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL,
	    modtm, 16);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL,
	    crtm, 16);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL,
	    chgtm, 16);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL,
	    &pflags, 8);

	if (sa_bulk_lookup(hdl, bulk, idx)) {
		(void) sa_handle_destroy(hdl);
		return;
	}

	z_crtime = (time_t)crtm[0];
	z_atime = (time_t)acctm[0];
	z_mtime = (time_t)modtm[0];
	z_ctime = (time_t)chgtm[0];

	if (dump_opt['d'] > 4) {
		error = zfs_obj_to_path(os, object, path, sizeof (path));
		if (error == ESTALE) {
			(void) snprintf(path, sizeof (path), "on delete queue");
		} else if (error != 0) {
			leaked_objects++;
			(void) snprintf(path, sizeof (path),
			    "path not found, possibly leaked");
		}
		(void) printf("\tpath %s\n", path);
	}

	if (S_ISLNK(mode))
		dump_znode_symlink(hdl);
	dump_uidgid(os, uid, gid);
	(void) printf("\tatime %s", ctime(&z_atime));
	(void) printf("\tmtime %s", ctime(&z_mtime));
	(void) printf("\tctime %s", ctime(&z_ctime));
	(void) printf("\tcrtime %s", ctime(&z_crtime));
	(void) printf("\tgen %llu\n", (u_longlong_t)gen);
	(void) printf("\tmode %llo\n", (u_longlong_t)mode);
	(void) printf("\tsize %llu\n", (u_longlong_t)fsize);
	(void) printf("\tparent %llu\n", (u_longlong_t)parent);
	(void) printf("\tlinks %llu\n", (u_longlong_t)links);
	(void) printf("\tpflags %llx\n", (u_longlong_t)pflags);
	if (dmu_objset_projectquota_enabled(os) && (pflags & ZFS_PROJID)) {
		uint64_t projid;

		if (sa_lookup(hdl, sa_attr_table[ZPL_PROJID], &projid,
		    sizeof (uint64_t)) == 0)
			(void) printf("\tprojid %llu\n", (u_longlong_t)projid);
	}
	if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr,
	    sizeof (uint64_t)) == 0)
		(void) printf("\txattr %llu\n", (u_longlong_t)xattr);
	if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev,
	    sizeof (uint64_t)) == 0)
		(void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev);
	dump_znode_sa_xattr(hdl);
	sa_handle_destroy(hdl);
}

static void
dump_acl(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) os, (void) object, (void) data, (void) size;
}

static void
dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) os, (void) object, (void) data, (void) size;
}

static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
	dump_none,	/* unallocated */
	dump_zap,	/* object directory */
	dump_uint64,	/* object array */
	dump_none,	/* packed nvlist */
	dump_packed_nvlist,	/* packed nvlist size */
	dump_none,	/* bpobj */
	dump_bpobj,	/* bpobj header */
	dump_none,	/* SPA space map header */
	dump_none,	/* SPA space map */
	dump_none,	/* ZIL intent log */
	dump_dnode,	/* DMU dnode */
	dump_dmu_objset,	/* DMU objset */
	dump_dsl_dir,	/* DSL directory */
	dump_zap,	/* DSL directory child map */
	dump_zap,	/* DSL dataset snap map */
	dump_zap,	/* DSL props */
	dump_dsl_dataset,	/* DSL dataset */
	dump_znode,	/* ZFS znode */
	dump_acl,	/* ZFS V0 ACL */
	dump_uint8,	/* ZFS plain file */
	dump_zpldir,	/* ZFS directory */
	dump_zap,	/* ZFS master node */
	dump_zap,	/* ZFS delete queue */
	dump_uint8,	/* zvol object */
	dump_zap,	/* zvol prop */
	dump_uint8,	/* other uint8[] */
	dump_uint64,	/* other uint64[] */
	dump_zap,	/* other ZAP */
	dump_zap,	/* persistent error log */
	dump_uint8,	/* SPA history */
	dump_history_offsets,	/* SPA history offsets */
	dump_zap,	/* Pool properties */
	dump_zap,	/* DSL permissions */
	dump_acl,	/* ZFS ACL */
	dump_uint8,	/* ZFS SYSACL */
	dump_none,	/* FUID nvlist */
	dump_packed_nvlist,	/* FUID nvlist size */
	dump_zap,	/* DSL dataset next clones */
	dump_zap,	/* DSL scrub queue */
	dump_zap,	/* ZFS user/group/project used */
	dump_zap,	/* ZFS user/group/project quota */
	dump_zap,	/* snapshot refcount tags */
	dump_ddt_zap,	/* DDT ZAP object */
	dump_zap,	/* DDT statistics */
	dump_znode,	/* SA object */
	dump_zap,	/* SA Master Node */
	dump_sa_attrs,	/* SA attribute registration */
	dump_sa_layouts,	/* SA attribute layouts */
	dump_zap,	/* DSL scrub translations */
	dump_none,	/* fake dedup BP */
	dump_zap,	/* deadlist */
	dump_none,	/* deadlist hdr */
	dump_zap,	/* dsl clones */
	dump_bpobj_subobjs,	/* bpobj subobjs */
	dump_unknown,	/* Unknown type, must be last */
};

static boolean_t
match_object_type(dmu_object_type_t obj_type, uint64_t flags)
{
	boolean_t match = B_TRUE;

	switch (obj_type) {
	case DMU_OT_DIRECTORY_CONTENTS:
		if (!(flags & ZOR_FLAG_DIRECTORY))
			match = B_FALSE;
		break;
	case DMU_OT_PLAIN_FILE_CONTENTS:
		if (!(flags & ZOR_FLAG_PLAIN_FILE))
			match = B_FALSE;
		break;
	case DMU_OT_SPACE_MAP:
		if (!(flags & ZOR_FLAG_SPACE_MAP))
			match = B_FALSE;
		break;
	default:
		if (strcmp(zdb_ot_name(obj_type), "zap") == 0) {
			if (!(flags & ZOR_FLAG_ZAP))
				match = B_FALSE;
			break;
		}

		/*
		 * If all bits except some of the supported flags are
		 * set, the user combined the all-types flag (A) with
		 * a negated flag to exclude some types (e.g. A-f to
		 * show all object types except plain files).
		 */
		if ((flags | ZOR_SUPPORTED_FLAGS) != ZOR_FLAG_ALL_TYPES)
			match = B_FALSE;

		break;
	}

	return (match);
}
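
/*
 * Worked example (illustrative): ZOR_FLAG_ALL_TYPES is -1, i.e. all bits
 * set. Excluding plain files from "all types" yields
 * flags = ~ZOR_FLAG_PLAIN_FILE, so (flags | ZOR_SUPPORTED_FLAGS) is again
 * all ones and the default case above still matches, while the
 * DMU_OT_PLAIN_FILE_CONTENTS case sees its bit cleared and rejects.
 */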

static void
dump_object(objset_t *os, uint64_t object, int verbosity,
    boolean_t *print_header, uint64_t *dnode_slots_used, uint64_t flags)
{
	dmu_buf_t *db = NULL;
	dmu_object_info_t doi;
	dnode_t *dn;
	boolean_t dnode_held = B_FALSE;
	void *bonus = NULL;
	size_t bsize = 0;
	char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32];
	char bonus_size[32];
	char aux[50];
	int error;

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (iblk) >= NN_NUMBUF_SZ, "iblk truncated");
	_Static_assert(sizeof (dblk) >= NN_NUMBUF_SZ, "dblk truncated");
	_Static_assert(sizeof (lsize) >= NN_NUMBUF_SZ, "lsize truncated");
	_Static_assert(sizeof (asize) >= NN_NUMBUF_SZ, "asize truncated");
	_Static_assert(sizeof (bonus_size) >= NN_NUMBUF_SZ,
	    "bonus_size truncated");

	if (*print_header) {
		(void) printf("\n%10s %3s %5s %5s %5s %6s %5s %6s %s\n",
		    "Object", "lvl", "iblk", "dblk", "dsize", "dnsize",
		    "lsize", "%full", "type");
		*print_header = 0;
	}

	if (object == 0) {
		dn = DMU_META_DNODE(os);
		dmu_object_info_from_dnode(dn, &doi);
	} else {
		/*
		 * Encrypted datasets will have sensitive bonus buffers
		 * encrypted. Therefore we cannot hold the bonus buffer and
		 * must hold the dnode itself instead.
		 */
		error = dmu_object_info(os, object, &doi);
		if (error)
			fatal("dmu_object_info() failed, errno %u", error);

		if (!key_loaded && os->os_encrypted &&
		    DMU_OT_IS_ENCRYPTED(doi.doi_bonus_type)) {
			error = dnode_hold(os, object, FTAG, &dn);
			if (error)
				fatal("dnode_hold() failed, errno %u", error);
			dnode_held = B_TRUE;
		} else {
			error = dmu_bonus_hold(os, object, FTAG, &db);
			if (error)
				fatal("dmu_bonus_hold(%llu) failed, errno %u",
				    object, error);
			bonus = db->db_data;
			bsize = db->db_size;
			dn = DB_DNODE((dmu_buf_impl_t *)db);
		}
	}

	/*
	 * Default to showing all object types if no flags were specified.
	 */
	if (flags != 0 && flags != ZOR_FLAG_ALL_TYPES &&
	    !match_object_type(doi.doi_type, flags))
		goto out;

	if (dnode_slots_used)
		*dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE;

	zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk));
	zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk));
	zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize));
	zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize));
	zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size));
	zdb_nicenum(doi.doi_dnodesize, dnsize, sizeof (dnsize));
3657 DNODES_PER_BLOCK : 1) / doi.doi_max_offset); 3658 3659 aux[0] = '\0'; 3660 3661 if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) { 3662 (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), 3663 " (K=%s)", ZDB_CHECKSUM_NAME(doi.doi_checksum)); 3664 } 3665 3666 if (doi.doi_compress == ZIO_COMPRESS_INHERIT && 3667 ZIO_COMPRESS_HASLEVEL(os->os_compress) && verbosity >= 6) { 3668 const char *compname = NULL; 3669 if (zfs_prop_index_to_string(ZFS_PROP_COMPRESSION, 3670 ZIO_COMPRESS_RAW(os->os_compress, os->os_complevel), 3671 &compname) == 0) { 3672 (void) snprintf(aux + strlen(aux), 3673 sizeof (aux) - strlen(aux), " (Z=inherit=%s)", 3674 compname); 3675 } else { 3676 (void) snprintf(aux + strlen(aux), 3677 sizeof (aux) - strlen(aux), 3678 " (Z=inherit=%s-unknown)", 3679 ZDB_COMPRESS_NAME(os->os_compress)); 3680 } 3681 } else if (doi.doi_compress == ZIO_COMPRESS_INHERIT && verbosity >= 6) { 3682 (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), 3683 " (Z=inherit=%s)", ZDB_COMPRESS_NAME(os->os_compress)); 3684 } else if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) { 3685 (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), 3686 " (Z=%s)", ZDB_COMPRESS_NAME(doi.doi_compress)); 3687 } 3688 3689 (void) printf("%10lld %3u %5s %5s %5s %6s %5s %6s %s%s\n", 3690 (u_longlong_t)object, doi.doi_indirection, iblk, dblk, 3691 asize, dnsize, lsize, fill, zdb_ot_name(doi.doi_type), aux); 3692 3693 if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) { 3694 (void) printf("%10s %3s %5s %5s %5s %5s %5s %6s %s\n", 3695 "", "", "", "", "", "", bonus_size, "bonus", 3696 zdb_ot_name(doi.doi_bonus_type)); 3697 } 3698 3699 if (verbosity >= 4) { 3700 (void) printf("\tdnode flags: %s%s%s%s\n", 3701 (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ? 3702 "USED_BYTES " : "", 3703 (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ? 3704 "USERUSED_ACCOUNTED " : "", 3705 (dn->dn_phys->dn_flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) ? 3706 "USEROBJUSED_ACCOUNTED " : "", 3707 (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? 3708 "SPILL_BLKPTR" : ""); 3709 (void) printf("\tdnode maxblkid: %llu\n", 3710 (longlong_t)dn->dn_phys->dn_maxblkid); 3711 3712 if (!dnode_held) { 3713 object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, 3714 object, bonus, bsize); 3715 } else { 3716 (void) printf("\t\t(bonus encrypted)\n"); 3717 } 3718 3719 if (key_loaded || 3720 (!os->os_encrypted || !DMU_OT_IS_ENCRYPTED(doi.doi_type))) { 3721 object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, 3722 NULL, 0); 3723 } else { 3724 (void) printf("\t\t(object encrypted)\n"); 3725 } 3726 3727 *print_header = B_TRUE; 3728 } 3729 3730 if (verbosity >= 5) { 3731 if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { 3732 char blkbuf[BP_SPRINTF_LEN]; 3733 snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), 3734 DN_SPILL_BLKPTR(dn->dn_phys), B_FALSE); 3735 (void) printf("\nSpill block: %s\n", blkbuf); 3736 } 3737 dump_indirect(dn); 3738 } 3739 3740 if (verbosity >= 5) { 3741 /* 3742 * Report the list of segments that comprise the object. 
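 *
 * The loop below alternates two dnode_next_offset() calls: the
 * first (flags == 0) advances "start" to the next allocated
 * offset, the second (DNODE_FIND_HOLE) advances "end" to the next
 * hole, so each pass reports one allocated segment [start, end).
 * A sparse file with data only at [0, 128K) and [1M, 1M + 128K)
 * would therefore print two lines (values illustrative):
 *
 *     segment [0000000000000000, 0000000000020000) size  128K
 *     segment [0000000000100000, 0000000000120000) size  128K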
3743 */ 3744 uint64_t start = 0; 3745 uint64_t end; 3746 uint64_t blkfill = 1; 3747 int minlvl = 1; 3748 3749 if (dn->dn_type == DMU_OT_DNODE) { 3750 minlvl = 0; 3751 blkfill = DNODES_PER_BLOCK; 3752 } 3753 3754 for (;;) { 3755 char segsize[32]; 3756 /* make sure nicenum has enough space */ 3757 _Static_assert(sizeof (segsize) >= NN_NUMBUF_SZ, 3758 "segsize truncated"); 3759 error = dnode_next_offset(dn, 3760 0, &start, minlvl, blkfill, 0); 3761 if (error) 3762 break; 3763 end = start; 3764 error = dnode_next_offset(dn, 3765 DNODE_FIND_HOLE, &end, minlvl, blkfill, 0); 3766 zdb_nicenum(end - start, segsize, sizeof (segsize)); 3767 (void) printf("\t\tsegment [%016llx, %016llx)" 3768 " size %5s\n", (u_longlong_t)start, 3769 (u_longlong_t)end, segsize); 3770 if (error) 3771 break; 3772 start = end; 3773 } 3774 } 3775 3776 out: 3777 if (db != NULL) 3778 dmu_buf_rele(db, FTAG); 3779 if (dnode_held) 3780 dnode_rele(dn, FTAG); 3781 } 3782 3783 static void 3784 count_dir_mos_objects(dsl_dir_t *dd) 3785 { 3786 mos_obj_refd(dd->dd_object); 3787 mos_obj_refd(dsl_dir_phys(dd)->dd_child_dir_zapobj); 3788 mos_obj_refd(dsl_dir_phys(dd)->dd_deleg_zapobj); 3789 mos_obj_refd(dsl_dir_phys(dd)->dd_props_zapobj); 3790 mos_obj_refd(dsl_dir_phys(dd)->dd_clones); 3791 3792 /* 3793 * The dd_crypto_obj can be referenced by multiple dsl_dir's. 3794 * Ignore the references after the first one. 3795 */ 3796 mos_obj_refd_multiple(dd->dd_crypto_obj); 3797 } 3798 3799 static void 3800 count_ds_mos_objects(dsl_dataset_t *ds) 3801 { 3802 mos_obj_refd(ds->ds_object); 3803 mos_obj_refd(dsl_dataset_phys(ds)->ds_next_clones_obj); 3804 mos_obj_refd(dsl_dataset_phys(ds)->ds_props_obj); 3805 mos_obj_refd(dsl_dataset_phys(ds)->ds_userrefs_obj); 3806 mos_obj_refd(dsl_dataset_phys(ds)->ds_snapnames_zapobj); 3807 mos_obj_refd(ds->ds_bookmarks_obj); 3808 3809 if (!dsl_dataset_is_snapshot(ds)) { 3810 count_dir_mos_objects(ds->ds_dir); 3811 } 3812 } 3813 3814 static const char *const objset_types[DMU_OST_NUMTYPES] = { 3815 "NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" }; 3816 3817 /* 3818 * Parse a string denoting a range of object IDs of the form 3819 * <start>[:<end>[:flags]], and store the results in zor. 3820 * Return 0 on success. On error, return 1 and update the msg 3821 * pointer to point to a descriptive error message. 
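 *
 * Examples (flag letters are those wired into flagbits[] by zdb's
 * option handling, e.g. 'A' for all types and 'f' for plain files):
 *
 *     "128"          object 128 only, all object types
 *     "128:256"      objects 128 through 256, all object types
 *     "128:256:f"    only plain-file objects in that range
 *     "128:256:A-f"  every object type except plain files
 *
 * A leading colon, a trailing colon, or a fourth colon-delimited
 * field is rejected and *msg is set to describe the problem.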
3822 */ 3823 static int 3824 parse_object_range(char *range, zopt_object_range_t *zor, const char **msg) 3825 { 3826 uint64_t flags = 0; 3827 char *p, *s, *dup, *flagstr, *tmp = NULL; 3828 size_t len; 3829 int i; 3830 int rc = 0; 3831 3832 if (strchr(range, ':') == NULL) { 3833 zor->zor_obj_start = strtoull(range, &p, 0); 3834 if (*p != '\0') { 3835 *msg = "Invalid characters in object ID"; 3836 rc = 1; 3837 } 3838 zor->zor_obj_start = ZDB_MAP_OBJECT_ID(zor->zor_obj_start); 3839 zor->zor_obj_end = zor->zor_obj_start; 3840 return (rc); 3841 } 3842 3843 if (strchr(range, ':') == range) { 3844 *msg = "Invalid leading colon"; 3845 rc = 1; 3846 return (rc); 3847 } 3848 3849 len = strlen(range); 3850 if (range[len - 1] == ':') { 3851 *msg = "Invalid trailing colon"; 3852 rc = 1; 3853 return (rc); 3854 } 3855 3856 dup = strdup(range); 3857 s = strtok_r(dup, ":", &tmp); 3858 zor->zor_obj_start = strtoull(s, &p, 0); 3859 3860 if (*p != '\0') { 3861 *msg = "Invalid characters in start object ID"; 3862 rc = 1; 3863 goto out; 3864 } 3865 3866 s = strtok_r(NULL, ":", &tmp); 3867 zor->zor_obj_end = strtoull(s, &p, 0); 3868 3869 if (*p != '\0') { 3870 *msg = "Invalid characters in end object ID"; 3871 rc = 1; 3872 goto out; 3873 } 3874 3875 if (zor->zor_obj_start > zor->zor_obj_end) { 3876 *msg = "Start object ID may not exceed end object ID"; 3877 rc = 1; 3878 goto out; 3879 } 3880 3881 s = strtok_r(NULL, ":", &tmp); 3882 if (s == NULL) { 3883 zor->zor_flags = ZOR_FLAG_ALL_TYPES; 3884 goto out; 3885 } else if (strtok_r(NULL, ":", &tmp) != NULL) { 3886 *msg = "Invalid colon-delimited field after flags"; 3887 rc = 1; 3888 goto out; 3889 } 3890 3891 flagstr = s; 3892 for (i = 0; flagstr[i]; i++) { 3893 int bit; 3894 boolean_t negation = (flagstr[i] == '-'); 3895 3896 if (negation) { 3897 i++; 3898 if (flagstr[i] == '\0') { 3899 *msg = "Invalid trailing negation operator"; 3900 rc = 1; 3901 goto out; 3902 } 3903 } 3904 bit = flagbits[(uchar_t)flagstr[i]]; 3905 if (bit == 0) { 3906 *msg = "Invalid flag"; 3907 rc = 1; 3908 goto out; 3909 } 3910 if (negation) 3911 flags &= ~bit; 3912 else 3913 flags |= bit; 3914 } 3915 zor->zor_flags = flags; 3916 3917 zor->zor_obj_start = ZDB_MAP_OBJECT_ID(zor->zor_obj_start); 3918 zor->zor_obj_end = ZDB_MAP_OBJECT_ID(zor->zor_obj_end); 3919 3920 out: 3921 free(dup); 3922 return (rc); 3923 } 3924 3925 static void 3926 dump_objset(objset_t *os) 3927 { 3928 dmu_objset_stats_t dds = { 0 }; 3929 uint64_t object, object_count; 3930 uint64_t refdbytes, usedobjs, scratch; 3931 char numbuf[32]; 3932 char blkbuf[BP_SPRINTF_LEN + 20]; 3933 char osname[ZFS_MAX_DATASET_NAME_LEN]; 3934 const char *type = "UNKNOWN"; 3935 int verbosity = dump_opt['d']; 3936 boolean_t print_header; 3937 unsigned i; 3938 int error; 3939 uint64_t total_slots_used = 0; 3940 uint64_t max_slot_used = 0; 3941 uint64_t dnode_slots; 3942 uint64_t obj_start; 3943 uint64_t obj_end; 3944 uint64_t flags; 3945 3946 /* make sure nicenum has enough space */ 3947 _Static_assert(sizeof (numbuf) >= NN_NUMBUF_SZ, "numbuf truncated"); 3948 3949 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 3950 dmu_objset_fast_stat(os, &dds); 3951 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 3952 3953 print_header = B_TRUE; 3954 3955 if (dds.dds_type < DMU_OST_NUMTYPES) 3956 type = objset_types[dds.dds_type]; 3957 3958 if (dds.dds_type == DMU_OST_META) { 3959 dds.dds_creation_txg = TXG_INITIAL; 3960 usedobjs = BP_GET_FILL(os->os_rootbp); 3961 refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)-> 3962 dd_used_bytes; 3963 } 
else { 3964 dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch); 3965 } 3966 3967 ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp)); 3968 3969 zdb_nicenum(refdbytes, numbuf, sizeof (numbuf)); 3970 3971 if (verbosity >= 4) { 3972 (void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp "); 3973 (void) snprintf_blkptr(blkbuf + strlen(blkbuf), 3974 sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp); 3975 } else { 3976 blkbuf[0] = '\0'; 3977 } 3978 3979 dmu_objset_name(os, osname); 3980 3981 (void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, " 3982 "%s, %llu objects%s%s\n", 3983 osname, type, (u_longlong_t)dmu_objset_id(os), 3984 (u_longlong_t)dds.dds_creation_txg, 3985 numbuf, (u_longlong_t)usedobjs, blkbuf, 3986 (dds.dds_inconsistent) ? " (inconsistent)" : ""); 3987 3988 for (i = 0; i < zopt_object_args; i++) { 3989 obj_start = zopt_object_ranges[i].zor_obj_start; 3990 obj_end = zopt_object_ranges[i].zor_obj_end; 3991 flags = zopt_object_ranges[i].zor_flags; 3992 3993 object = obj_start; 3994 if (object == 0 || obj_start == obj_end) 3995 dump_object(os, object, verbosity, &print_header, NULL, 3996 flags); 3997 else 3998 object--; 3999 4000 while ((dmu_object_next(os, &object, B_FALSE, 0) == 0) && 4001 object <= obj_end) { 4002 dump_object(os, object, verbosity, &print_header, NULL, 4003 flags); 4004 } 4005 } 4006 4007 if (zopt_object_args > 0) { 4008 (void) printf("\n"); 4009 return; 4010 } 4011 4012 if (dump_opt['i'] != 0 || verbosity >= 2) 4013 dump_intent_log(dmu_objset_zil(os)); 4014 4015 if (dmu_objset_ds(os) != NULL) { 4016 dsl_dataset_t *ds = dmu_objset_ds(os); 4017 dump_blkptr_list(&ds->ds_deadlist, "Deadlist"); 4018 if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && 4019 !dmu_objset_is_snapshot(os)) { 4020 dump_blkptr_list(&ds->ds_dir->dd_livelist, "Livelist"); 4021 if (verify_dd_livelist(os) != 0) 4022 fatal("livelist is incorrect"); 4023 } 4024 4025 if (dsl_dataset_remap_deadlist_exists(ds)) { 4026 (void) printf("ds_remap_deadlist:\n"); 4027 dump_blkptr_list(&ds->ds_remap_deadlist, "Deadlist"); 4028 } 4029 count_ds_mos_objects(ds); 4030 } 4031 4032 if (dmu_objset_ds(os) != NULL) 4033 dump_bookmarks(os, verbosity); 4034 4035 if (verbosity < 2) 4036 return; 4037 4038 if (BP_IS_HOLE(os->os_rootbp)) 4039 return; 4040 4041 dump_object(os, 0, verbosity, &print_header, NULL, 0); 4042 object_count = 0; 4043 if (DMU_USERUSED_DNODE(os) != NULL && 4044 DMU_USERUSED_DNODE(os)->dn_type != 0) { 4045 dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header, 4046 NULL, 0); 4047 dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header, 4048 NULL, 0); 4049 } 4050 4051 if (DMU_PROJECTUSED_DNODE(os) != NULL && 4052 DMU_PROJECTUSED_DNODE(os)->dn_type != 0) 4053 dump_object(os, DMU_PROJECTUSED_OBJECT, verbosity, 4054 &print_header, NULL, 0); 4055 4056 object = 0; 4057 while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { 4058 dump_object(os, object, verbosity, &print_header, &dnode_slots, 4059 0); 4060 object_count++; 4061 total_slots_used += dnode_slots; 4062 max_slot_used = object + dnode_slots - 1; 4063 } 4064 4065 (void) printf("\n"); 4066 4067 (void) printf(" Dnode slots:\n"); 4068 (void) printf("\tTotal used: %10llu\n", 4069 (u_longlong_t)total_slots_used); 4070 (void) printf("\tMax used: %10llu\n", 4071 (u_longlong_t)max_slot_used); 4072 (void) printf("\tPercent empty: %10lf\n", 4073 (double)(max_slot_used - total_slots_used)*100 / 4074 (double)max_slot_used); 4075 (void) printf("\n"); 4076 4077 if (error != ESRCH) { 4078 (void) fprintf(stderr, 
"dmu_object_next() = %d\n", error); 4079 abort(); 4080 } 4081 4082 ASSERT3U(object_count, ==, usedobjs); 4083 4084 if (leaked_objects != 0) { 4085 (void) printf("%d potentially leaked objects detected\n", 4086 leaked_objects); 4087 leaked_objects = 0; 4088 } 4089 } 4090 4091 static void 4092 dump_uberblock(uberblock_t *ub, const char *header, const char *footer) 4093 { 4094 time_t timestamp = ub->ub_timestamp; 4095 4096 (void) printf("%s", header ? header : ""); 4097 (void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic); 4098 (void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version); 4099 (void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg); 4100 (void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum); 4101 (void) printf("\ttimestamp = %llu UTC = %s", 4102 (u_longlong_t)ub->ub_timestamp, ctime(×tamp)); 4103 4104 (void) printf("\tmmp_magic = %016llx\n", 4105 (u_longlong_t)ub->ub_mmp_magic); 4106 if (MMP_VALID(ub)) { 4107 (void) printf("\tmmp_delay = %0llu\n", 4108 (u_longlong_t)ub->ub_mmp_delay); 4109 if (MMP_SEQ_VALID(ub)) 4110 (void) printf("\tmmp_seq = %u\n", 4111 (unsigned int) MMP_SEQ(ub)); 4112 if (MMP_FAIL_INT_VALID(ub)) 4113 (void) printf("\tmmp_fail = %u\n", 4114 (unsigned int) MMP_FAIL_INT(ub)); 4115 if (MMP_INTERVAL_VALID(ub)) 4116 (void) printf("\tmmp_write = %u\n", 4117 (unsigned int) MMP_INTERVAL(ub)); 4118 /* After MMP_* to make summarize_uberblock_mmp cleaner */ 4119 (void) printf("\tmmp_valid = %x\n", 4120 (unsigned int) ub->ub_mmp_config & 0xFF); 4121 } 4122 4123 if (dump_opt['u'] >= 4) { 4124 char blkbuf[BP_SPRINTF_LEN]; 4125 snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp); 4126 (void) printf("\trootbp = %s\n", blkbuf); 4127 } 4128 (void) printf("\tcheckpoint_txg = %llu\n", 4129 (u_longlong_t)ub->ub_checkpoint_txg); 4130 (void) printf("%s", footer ? 
footer : ""); 4131 } 4132 4133 static void 4134 dump_config(spa_t *spa) 4135 { 4136 dmu_buf_t *db; 4137 size_t nvsize = 0; 4138 int error = 0; 4139 4140 4141 error = dmu_bonus_hold(spa->spa_meta_objset, 4142 spa->spa_config_object, FTAG, &db); 4143 4144 if (error == 0) { 4145 nvsize = *(uint64_t *)db->db_data; 4146 dmu_buf_rele(db, FTAG); 4147 4148 (void) printf("\nMOS Configuration:\n"); 4149 dump_packed_nvlist(spa->spa_meta_objset, 4150 spa->spa_config_object, (void *)&nvsize, 1); 4151 } else { 4152 (void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d", 4153 (u_longlong_t)spa->spa_config_object, error); 4154 } 4155 } 4156 4157 static void 4158 dump_cachefile(const char *cachefile) 4159 { 4160 int fd; 4161 struct stat64 statbuf; 4162 char *buf; 4163 nvlist_t *config; 4164 4165 if ((fd = open64(cachefile, O_RDONLY)) < 0) { 4166 (void) printf("cannot open '%s': %s\n", cachefile, 4167 strerror(errno)); 4168 exit(1); 4169 } 4170 4171 if (fstat64(fd, &statbuf) != 0) { 4172 (void) printf("failed to stat '%s': %s\n", cachefile, 4173 strerror(errno)); 4174 exit(1); 4175 } 4176 4177 if ((buf = malloc(statbuf.st_size)) == NULL) { 4178 (void) fprintf(stderr, "failed to allocate %llu bytes\n", 4179 (u_longlong_t)statbuf.st_size); 4180 exit(1); 4181 } 4182 4183 if (read(fd, buf, statbuf.st_size) != statbuf.st_size) { 4184 (void) fprintf(stderr, "failed to read %llu bytes\n", 4185 (u_longlong_t)statbuf.st_size); 4186 exit(1); 4187 } 4188 4189 (void) close(fd); 4190 4191 if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) { 4192 (void) fprintf(stderr, "failed to unpack nvlist\n"); 4193 exit(1); 4194 } 4195 4196 free(buf); 4197 4198 dump_nvlist(config, 0); 4199 4200 nvlist_free(config); 4201 } 4202 4203 /* 4204 * ZFS label nvlist stats 4205 */ 4206 typedef struct zdb_nvl_stats { 4207 int zns_list_count; 4208 int zns_leaf_count; 4209 size_t zns_leaf_largest; 4210 size_t zns_leaf_total; 4211 nvlist_t *zns_string; 4212 nvlist_t *zns_uint64; 4213 nvlist_t *zns_boolean; 4214 } zdb_nvl_stats_t; 4215 4216 static void 4217 collect_nvlist_stats(nvlist_t *nvl, zdb_nvl_stats_t *stats) 4218 { 4219 nvlist_t *list, **array; 4220 nvpair_t *nvp = NULL; 4221 const char *name; 4222 uint_t i, items; 4223 4224 stats->zns_list_count++; 4225 4226 while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { 4227 name = nvpair_name(nvp); 4228 4229 switch (nvpair_type(nvp)) { 4230 case DATA_TYPE_STRING: 4231 fnvlist_add_string(stats->zns_string, name, 4232 fnvpair_value_string(nvp)); 4233 break; 4234 case DATA_TYPE_UINT64: 4235 fnvlist_add_uint64(stats->zns_uint64, name, 4236 fnvpair_value_uint64(nvp)); 4237 break; 4238 case DATA_TYPE_BOOLEAN: 4239 fnvlist_add_boolean(stats->zns_boolean, name); 4240 break; 4241 case DATA_TYPE_NVLIST: 4242 if (nvpair_value_nvlist(nvp, &list) == 0) 4243 collect_nvlist_stats(list, stats); 4244 break; 4245 case DATA_TYPE_NVLIST_ARRAY: 4246 if (nvpair_value_nvlist_array(nvp, &array, &items) != 0) 4247 break; 4248 4249 for (i = 0; i < items; i++) { 4250 collect_nvlist_stats(array[i], stats); 4251 4252 /* collect stats on leaf vdev */ 4253 if (strcmp(name, "children") == 0) { 4254 size_t size; 4255 4256 (void) nvlist_size(array[i], &size, 4257 NV_ENCODE_XDR); 4258 stats->zns_leaf_total += size; 4259 if (size > stats->zns_leaf_largest) 4260 stats->zns_leaf_largest = size; 4261 stats->zns_leaf_count++; 4262 } 4263 } 4264 break; 4265 default: 4266 (void) printf("skip type %d!\n", (int)nvpair_type(nvp)); 4267 } 4268 } 4269 } 4270 4271 static void 4272 dump_nvlist_stats(nvlist_t *nvl, size_t cap) 
4273 { 4274 zdb_nvl_stats_t stats = { 0 }; 4275 size_t size, sum = 0, total; 4276 size_t noise; 4277 4278 /* requires nvlist with non-unique names for stat collection */ 4279 VERIFY0(nvlist_alloc(&stats.zns_string, 0, 0)); 4280 VERIFY0(nvlist_alloc(&stats.zns_uint64, 0, 0)); 4281 VERIFY0(nvlist_alloc(&stats.zns_boolean, 0, 0)); 4282 VERIFY0(nvlist_size(stats.zns_boolean, &noise, NV_ENCODE_XDR)); 4283 4284 (void) printf("\n\nZFS Label NVList Config Stats:\n"); 4285 4286 VERIFY0(nvlist_size(nvl, &total, NV_ENCODE_XDR)); 4287 (void) printf(" %d bytes used, %d bytes free (using %4.1f%%)\n\n", 4288 (int)total, (int)(cap - total), 100.0 * total / cap); 4289 4290 collect_nvlist_stats(nvl, &stats); 4291 4292 VERIFY0(nvlist_size(stats.zns_uint64, &size, NV_ENCODE_XDR)); 4293 size -= noise; 4294 sum += size; 4295 (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "integers:", 4296 (int)fnvlist_num_pairs(stats.zns_uint64), 4297 (int)size, 100.0 * size / total); 4298 4299 VERIFY0(nvlist_size(stats.zns_string, &size, NV_ENCODE_XDR)); 4300 size -= noise; 4301 sum += size; 4302 (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "strings:", 4303 (int)fnvlist_num_pairs(stats.zns_string), 4304 (int)size, 100.0 * size / total); 4305 4306 VERIFY0(nvlist_size(stats.zns_boolean, &size, NV_ENCODE_XDR)); 4307 size -= noise; 4308 sum += size; 4309 (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "booleans:", 4310 (int)fnvlist_num_pairs(stats.zns_boolean), 4311 (int)size, 100.0 * size / total); 4312 4313 size = total - sum; /* treat remainder as nvlist overhead */ 4314 (void) printf("%12s %4d %6d bytes (%5.2f%%)\n\n", "nvlists:", 4315 stats.zns_list_count, (int)size, 100.0 * size / total); 4316 4317 if (stats.zns_leaf_count > 0) { 4318 size_t average = stats.zns_leaf_total / stats.zns_leaf_count; 4319 4320 (void) printf("%12s %4d %6d bytes average\n", "leaf vdevs:", 4321 stats.zns_leaf_count, (int)average); 4322 (void) printf("%24d bytes largest\n", 4323 (int)stats.zns_leaf_largest); 4324 4325 if (dump_opt['l'] >= 3 && average > 0) 4326 (void) printf(" space for %d additional leaf vdevs\n", 4327 (int)((cap - total) / average)); 4328 } 4329 (void) printf("\n"); 4330 4331 nvlist_free(stats.zns_string); 4332 nvlist_free(stats.zns_uint64); 4333 nvlist_free(stats.zns_boolean); 4334 } 4335 4336 typedef struct cksum_record { 4337 zio_cksum_t cksum; 4338 boolean_t labels[VDEV_LABELS]; 4339 avl_node_t link; 4340 } cksum_record_t; 4341 4342 static int 4343 cksum_record_compare(const void *x1, const void *x2) 4344 { 4345 const cksum_record_t *l = (cksum_record_t *)x1; 4346 const cksum_record_t *r = (cksum_record_t *)x2; 4347 int arraysize = ARRAY_SIZE(l->cksum.zc_word); 4348 int difference = 0; 4349 4350 for (int i = 0; i < arraysize; i++) { 4351 difference = TREE_CMP(l->cksum.zc_word[i], r->cksum.zc_word[i]); 4352 if (difference) 4353 break; 4354 } 4355 4356 return (difference); 4357 } 4358 4359 static cksum_record_t * 4360 cksum_record_alloc(zio_cksum_t *cksum, int l) 4361 { 4362 cksum_record_t *rec; 4363 4364 rec = umem_zalloc(sizeof (*rec), UMEM_NOFAIL); 4365 rec->cksum = *cksum; 4366 rec->labels[l] = B_TRUE; 4367 4368 return (rec); 4369 } 4370 4371 static cksum_record_t * 4372 cksum_record_lookup(avl_tree_t *tree, zio_cksum_t *cksum) 4373 { 4374 cksum_record_t lookup = { .cksum = *cksum }; 4375 avl_index_t where; 4376 4377 return (avl_find(tree, &lookup, &where)); 4378 } 4379 4380 static cksum_record_t * 4381 cksum_record_insert(avl_tree_t *tree, zio_cksum_t *cksum, int l) 4382 { 4383 cksum_record_t *rec; 4384 4385 rec = 
cksum_record_lookup(tree, cksum); 4386 if (rec) { 4387 rec->labels[l] = B_TRUE; 4388 } else { 4389 rec = cksum_record_alloc(cksum, l); 4390 avl_add(tree, rec); 4391 } 4392 4393 return (rec); 4394 } 4395 4396 static int 4397 first_label(cksum_record_t *rec) 4398 { 4399 for (int i = 0; i < VDEV_LABELS; i++) 4400 if (rec->labels[i]) 4401 return (i); 4402 4403 return (-1); 4404 } 4405 4406 static void 4407 print_label_numbers(const char *prefix, const cksum_record_t *rec) 4408 { 4409 fputs(prefix, stdout); 4410 for (int i = 0; i < VDEV_LABELS; i++) 4411 if (rec->labels[i] == B_TRUE) 4412 printf("%d ", i); 4413 putchar('\n'); 4414 } 4415 4416 #define MAX_UBERBLOCK_COUNT (VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT) 4417 4418 typedef struct zdb_label { 4419 vdev_label_t label; 4420 uint64_t label_offset; 4421 nvlist_t *config_nv; 4422 cksum_record_t *config; 4423 cksum_record_t *uberblocks[MAX_UBERBLOCK_COUNT]; 4424 boolean_t header_printed; 4425 boolean_t read_failed; 4426 boolean_t cksum_valid; 4427 } zdb_label_t; 4428 4429 static void 4430 print_label_header(zdb_label_t *label, int l) 4431 { 4432 4433 if (dump_opt['q']) 4434 return; 4435 4436 if (label->header_printed == B_TRUE) 4437 return; 4438 4439 (void) printf("------------------------------------\n"); 4440 (void) printf("LABEL %d %s\n", l, 4441 label->cksum_valid ? "" : "(Bad label cksum)"); 4442 (void) printf("------------------------------------\n"); 4443 4444 label->header_printed = B_TRUE; 4445 } 4446 4447 static void 4448 print_l2arc_header(void) 4449 { 4450 (void) printf("------------------------------------\n"); 4451 (void) printf("L2ARC device header\n"); 4452 (void) printf("------------------------------------\n"); 4453 } 4454 4455 static void 4456 print_l2arc_log_blocks(void) 4457 { 4458 (void) printf("------------------------------------\n"); 4459 (void) printf("L2ARC device log blocks\n"); 4460 (void) printf("------------------------------------\n"); 4461 } 4462 4463 static void 4464 dump_l2arc_log_entries(uint64_t log_entries, 4465 l2arc_log_ent_phys_t *le, uint64_t i) 4466 { 4467 for (int j = 0; j < log_entries; j++) { 4468 dva_t dva = le[j].le_dva; 4469 (void) printf("lb[%4llu]\tle[%4d]\tDVA asize: %llu, " 4470 "vdev: %llu, offset: %llu\n", 4471 (u_longlong_t)i, j + 1, 4472 (u_longlong_t)DVA_GET_ASIZE(&dva), 4473 (u_longlong_t)DVA_GET_VDEV(&dva), 4474 (u_longlong_t)DVA_GET_OFFSET(&dva)); 4475 (void) printf("|\t\t\t\tbirth: %llu\n", 4476 (u_longlong_t)le[j].le_birth); 4477 (void) printf("|\t\t\t\tlsize: %llu\n", 4478 (u_longlong_t)L2BLK_GET_LSIZE((&le[j])->le_prop)); 4479 (void) printf("|\t\t\t\tpsize: %llu\n", 4480 (u_longlong_t)L2BLK_GET_PSIZE((&le[j])->le_prop)); 4481 (void) printf("|\t\t\t\tcompr: %llu\n", 4482 (u_longlong_t)L2BLK_GET_COMPRESS((&le[j])->le_prop)); 4483 (void) printf("|\t\t\t\tcomplevel: %llu\n", 4484 (u_longlong_t)(&le[j])->le_complevel); 4485 (void) printf("|\t\t\t\ttype: %llu\n", 4486 (u_longlong_t)L2BLK_GET_TYPE((&le[j])->le_prop)); 4487 (void) printf("|\t\t\t\tprotected: %llu\n", 4488 (u_longlong_t)L2BLK_GET_PROTECTED((&le[j])->le_prop)); 4489 (void) printf("|\t\t\t\tprefetch: %llu\n", 4490 (u_longlong_t)L2BLK_GET_PREFETCH((&le[j])->le_prop)); 4491 (void) printf("|\t\t\t\taddress: %llu\n", 4492 (u_longlong_t)le[j].le_daddr); 4493 (void) printf("|\t\t\t\tARC state: %llu\n", 4494 (u_longlong_t)L2BLK_GET_STATE((&le[j])->le_prop)); 4495 (void) printf("|\n"); 4496 } 4497 (void) printf("\n"); 4498 } 4499 4500 static void 4501 dump_l2arc_log_blkptr(const l2arc_log_blkptr_t *lbps) 4502 { 4503 (void) 
printf("|\t\tdaddr: %llu\n", (u_longlong_t)lbps->lbp_daddr); 4504 (void) printf("|\t\tpayload_asize: %llu\n", 4505 (u_longlong_t)lbps->lbp_payload_asize); 4506 (void) printf("|\t\tpayload_start: %llu\n", 4507 (u_longlong_t)lbps->lbp_payload_start); 4508 (void) printf("|\t\tlsize: %llu\n", 4509 (u_longlong_t)L2BLK_GET_LSIZE(lbps->lbp_prop)); 4510 (void) printf("|\t\tasize: %llu\n", 4511 (u_longlong_t)L2BLK_GET_PSIZE(lbps->lbp_prop)); 4512 (void) printf("|\t\tcompralgo: %llu\n", 4513 (u_longlong_t)L2BLK_GET_COMPRESS(lbps->lbp_prop)); 4514 (void) printf("|\t\tcksumalgo: %llu\n", 4515 (u_longlong_t)L2BLK_GET_CHECKSUM(lbps->lbp_prop)); 4516 (void) printf("|\n\n"); 4517 } 4518 4519 static void 4520 dump_l2arc_log_blocks(int fd, const l2arc_dev_hdr_phys_t *l2dhdr, 4521 l2arc_dev_hdr_phys_t *rebuild) 4522 { 4523 l2arc_log_blk_phys_t this_lb; 4524 uint64_t asize; 4525 l2arc_log_blkptr_t lbps[2]; 4526 abd_t *abd; 4527 zio_cksum_t cksum; 4528 int failed = 0; 4529 l2arc_dev_t dev; 4530 4531 if (!dump_opt['q']) 4532 print_l2arc_log_blocks(); 4533 memcpy(lbps, l2dhdr->dh_start_lbps, sizeof (lbps)); 4534 4535 dev.l2ad_evict = l2dhdr->dh_evict; 4536 dev.l2ad_start = l2dhdr->dh_start; 4537 dev.l2ad_end = l2dhdr->dh_end; 4538 4539 if (l2dhdr->dh_start_lbps[0].lbp_daddr == 0) { 4540 /* no log blocks to read */ 4541 if (!dump_opt['q']) { 4542 (void) printf("No log blocks to read\n"); 4543 (void) printf("\n"); 4544 } 4545 return; 4546 } else { 4547 dev.l2ad_hand = lbps[0].lbp_daddr + 4548 L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); 4549 } 4550 4551 dev.l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST); 4552 4553 for (;;) { 4554 if (!l2arc_log_blkptr_valid(&dev, &lbps[0])) 4555 break; 4556 4557 /* L2BLK_GET_PSIZE returns aligned size for log blocks */ 4558 asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); 4559 if (pread64(fd, &this_lb, asize, lbps[0].lbp_daddr) != asize) { 4560 if (!dump_opt['q']) { 4561 (void) printf("Error while reading next log " 4562 "block\n\n"); 4563 } 4564 break; 4565 } 4566 4567 fletcher_4_native_varsize(&this_lb, asize, &cksum); 4568 if (!ZIO_CHECKSUM_EQUAL(cksum, lbps[0].lbp_cksum)) { 4569 failed++; 4570 if (!dump_opt['q']) { 4571 (void) printf("Invalid cksum\n"); 4572 dump_l2arc_log_blkptr(&lbps[0]); 4573 } 4574 break; 4575 } 4576 4577 switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) { 4578 case ZIO_COMPRESS_OFF: 4579 break; 4580 default: 4581 abd = abd_alloc_for_io(asize, B_TRUE); 4582 abd_copy_from_buf_off(abd, &this_lb, 0, asize); 4583 if (zio_decompress_data(L2BLK_GET_COMPRESS( 4584 (&lbps[0])->lbp_prop), abd, &this_lb, 4585 asize, sizeof (this_lb), NULL) != 0) { 4586 (void) printf("L2ARC block decompression " 4587 "failed\n"); 4588 abd_free(abd); 4589 goto out; 4590 } 4591 abd_free(abd); 4592 break; 4593 } 4594 4595 if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) 4596 byteswap_uint64_array(&this_lb, sizeof (this_lb)); 4597 if (this_lb.lb_magic != L2ARC_LOG_BLK_MAGIC) { 4598 if (!dump_opt['q']) 4599 (void) printf("Invalid log block magic\n\n"); 4600 break; 4601 } 4602 4603 rebuild->dh_lb_count++; 4604 rebuild->dh_lb_asize += asize; 4605 if (dump_opt['l'] > 1 && !dump_opt['q']) { 4606 (void) printf("lb[%4llu]\tmagic: %llu\n", 4607 (u_longlong_t)rebuild->dh_lb_count, 4608 (u_longlong_t)this_lb.lb_magic); 4609 dump_l2arc_log_blkptr(&lbps[0]); 4610 } 4611 4612 if (dump_opt['l'] > 2 && !dump_opt['q']) 4613 dump_l2arc_log_entries(l2dhdr->dh_log_entries, 4614 this_lb.lb_entries, 4615 rebuild->dh_lb_count); 4616 4617 if (l2arc_range_check_overlap(lbps[1].lbp_payload_start, 
4618 lbps[0].lbp_payload_start, dev.l2ad_evict) && 4619 !dev.l2ad_first) 4620 break; 4621 4622 lbps[0] = lbps[1]; 4623 lbps[1] = this_lb.lb_prev_lbp; 4624 } 4625 out: 4626 if (!dump_opt['q']) { 4627 (void) printf("log_blk_count:\t %llu with valid cksum\n", 4628 (u_longlong_t)rebuild->dh_lb_count); 4629 (void) printf("\t\t %d with invalid cksum\n", failed); 4630 (void) printf("log_blk_asize:\t %llu\n\n", 4631 (u_longlong_t)rebuild->dh_lb_asize); 4632 } 4633 } 4634 4635 static int 4636 dump_l2arc_header(int fd) 4637 { 4638 l2arc_dev_hdr_phys_t l2dhdr = {0}, rebuild = {0}; 4639 int error = B_FALSE; 4640 4641 if (pread64(fd, &l2dhdr, sizeof (l2dhdr), 4642 VDEV_LABEL_START_SIZE) != sizeof (l2dhdr)) { 4643 error = B_TRUE; 4644 } else { 4645 if (l2dhdr.dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC)) 4646 byteswap_uint64_array(&l2dhdr, sizeof (l2dhdr)); 4647 4648 if (l2dhdr.dh_magic != L2ARC_DEV_HDR_MAGIC) 4649 error = B_TRUE; 4650 } 4651 4652 if (error) { 4653 (void) printf("L2ARC device header not found\n\n"); 4654 /* Do not return an error here for backward compatibility */ 4655 return (0); 4656 } else if (!dump_opt['q']) { 4657 print_l2arc_header(); 4658 4659 (void) printf(" magic: %llu\n", 4660 (u_longlong_t)l2dhdr.dh_magic); 4661 (void) printf(" version: %llu\n", 4662 (u_longlong_t)l2dhdr.dh_version); 4663 (void) printf(" pool_guid: %llu\n", 4664 (u_longlong_t)l2dhdr.dh_spa_guid); 4665 (void) printf(" flags: %llu\n", 4666 (u_longlong_t)l2dhdr.dh_flags); 4667 (void) printf(" start_lbps[0]: %llu\n", 4668 (u_longlong_t) 4669 l2dhdr.dh_start_lbps[0].lbp_daddr); 4670 (void) printf(" start_lbps[1]: %llu\n", 4671 (u_longlong_t) 4672 l2dhdr.dh_start_lbps[1].lbp_daddr); 4673 (void) printf(" log_blk_ent: %llu\n", 4674 (u_longlong_t)l2dhdr.dh_log_entries); 4675 (void) printf(" start: %llu\n", 4676 (u_longlong_t)l2dhdr.dh_start); 4677 (void) printf(" end: %llu\n", 4678 (u_longlong_t)l2dhdr.dh_end); 4679 (void) printf(" evict: %llu\n", 4680 (u_longlong_t)l2dhdr.dh_evict); 4681 (void) printf(" lb_asize_refcount: %llu\n", 4682 (u_longlong_t)l2dhdr.dh_lb_asize); 4683 (void) printf(" lb_count_refcount: %llu\n", 4684 (u_longlong_t)l2dhdr.dh_lb_count); 4685 (void) printf(" trim_action_time: %llu\n", 4686 (u_longlong_t)l2dhdr.dh_trim_action_time); 4687 (void) printf(" trim_state: %llu\n\n", 4688 (u_longlong_t)l2dhdr.dh_trim_state); 4689 } 4690 4691 dump_l2arc_log_blocks(fd, &l2dhdr, &rebuild); 4692 /* 4693 * The total aligned size of log blocks and the number of log blocks 4694 * reported in the header of the device may be less than what zdb 4695 * reports by dump_l2arc_log_blocks() which emulates l2arc_rebuild(). 4696 * This happens because dump_l2arc_log_blocks() lacks the memory 4697 * pressure valve that l2arc_rebuild() has. Thus, if we are on a system 4698 * with low memory, l2arc_rebuild will exit prematurely and dh_lb_asize 4699 * and dh_lb_count will be lower to begin with than what exists on the 4700 * device. This is normal and zdb should not exit with an error. The 4701 * opposite case should never happen though, the values reported in the 4702 * header should never be higher than what dump_l2arc_log_blocks() and 4703 * l2arc_rebuild() report. If this happens there is a leak in the 4704 * accounting of log blocks. 
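 *
 * Restated as the invariant the check below enforces:
 *
 *     l2dhdr.dh_lb_asize <= rebuild.dh_lb_asize
 *     l2dhdr.dh_lb_count <= rebuild.dh_lb_count
 *
 * For example, a header claiming 1000 log blocks when the walk
 * above verified only 900 indicates leaked accounting, and zdb
 * returns 1.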
4705 */ 4706 if (l2dhdr.dh_lb_asize > rebuild.dh_lb_asize || 4707 l2dhdr.dh_lb_count > rebuild.dh_lb_count) 4708 return (1); 4709 4710 return (0); 4711 } 4712 4713 static void 4714 dump_config_from_label(zdb_label_t *label, size_t buflen, int l) 4715 { 4716 if (dump_opt['q']) 4717 return; 4718 4719 if ((dump_opt['l'] < 3) && (first_label(label->config) != l)) 4720 return; 4721 4722 print_label_header(label, l); 4723 dump_nvlist(label->config_nv, 4); 4724 print_label_numbers(" labels = ", label->config); 4725 4726 if (dump_opt['l'] >= 2) 4727 dump_nvlist_stats(label->config_nv, buflen); 4728 } 4729 4730 #define ZDB_MAX_UB_HEADER_SIZE 32 4731 4732 static void 4733 dump_label_uberblocks(zdb_label_t *label, uint64_t ashift, int label_num) 4734 { 4735 4736 vdev_t vd; 4737 char header[ZDB_MAX_UB_HEADER_SIZE]; 4738 4739 vd.vdev_ashift = ashift; 4740 vd.vdev_top = &vd; 4741 4742 for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) { 4743 uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i); 4744 uberblock_t *ub = (void *)((char *)&label->label + uoff); 4745 cksum_record_t *rec = label->uberblocks[i]; 4746 4747 if (rec == NULL) { 4748 if (dump_opt['u'] >= 2) { 4749 print_label_header(label, label_num); 4750 (void) printf(" Uberblock[%d] invalid\n", i); 4751 } 4752 continue; 4753 } 4754 4755 if ((dump_opt['u'] < 3) && (first_label(rec) != label_num)) 4756 continue; 4757 4758 if ((dump_opt['u'] < 4) && 4759 (ub->ub_mmp_magic == MMP_MAGIC) && ub->ub_mmp_delay && 4760 (i >= VDEV_UBERBLOCK_COUNT(&vd) - MMP_BLOCKS_PER_LABEL)) 4761 continue; 4762 4763 print_label_header(label, label_num); 4764 (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE, 4765 " Uberblock[%d]\n", i); 4766 dump_uberblock(ub, header, ""); 4767 print_label_numbers(" labels = ", rec); 4768 } 4769 } 4770 4771 static char curpath[PATH_MAX]; 4772 4773 /* 4774 * Iterate through the path components, recursively passing 4775 * current one's obj and remaining path until we find the obj 4776 * for the last one. 
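 *
 * For example, resolving "a/b/c" proceeds as follows (object
 * numbers hypothetical):
 *
 *     dump_path_impl(os, root_obj, "a/b/c")  ZAP lookup "a" -> 7
 *     dump_path_impl(os, 7, "b/c")           ZAP lookup "b" -> 12
 *     dump_path_impl(os, 12, "c")            ZAP lookup "c" -> 43
 *
 * after which object 43 is dumped, or handed back via *retobj.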
4777 */ 4778 static int 4779 dump_path_impl(objset_t *os, uint64_t obj, char *name, uint64_t *retobj) 4780 { 4781 int err; 4782 boolean_t header = B_TRUE; 4783 uint64_t child_obj; 4784 char *s; 4785 dmu_buf_t *db; 4786 dmu_object_info_t doi; 4787 4788 if ((s = strchr(name, '/')) != NULL) 4789 *s = '\0'; 4790 err = zap_lookup(os, obj, name, 8, 1, &child_obj); 4791 4792 (void) strlcat(curpath, name, sizeof (curpath)); 4793 4794 if (err != 0) { 4795 (void) fprintf(stderr, "failed to lookup %s: %s\n", 4796 curpath, strerror(err)); 4797 return (err); 4798 } 4799 4800 child_obj = ZFS_DIRENT_OBJ(child_obj); 4801 err = sa_buf_hold(os, child_obj, FTAG, &db); 4802 if (err != 0) { 4803 (void) fprintf(stderr, 4804 "failed to get SA dbuf for obj %llu: %s\n", 4805 (u_longlong_t)child_obj, strerror(err)); 4806 return (EINVAL); 4807 } 4808 dmu_object_info_from_db(db, &doi); 4809 sa_buf_rele(db, FTAG); 4810 4811 if (doi.doi_bonus_type != DMU_OT_SA && 4812 doi.doi_bonus_type != DMU_OT_ZNODE) { 4813 (void) fprintf(stderr, "invalid bonus type %d for obj %llu\n", 4814 doi.doi_bonus_type, (u_longlong_t)child_obj); 4815 return (EINVAL); 4816 } 4817 4818 if (dump_opt['v'] > 6) { 4819 (void) printf("obj=%llu %s type=%d bonustype=%d\n", 4820 (u_longlong_t)child_obj, curpath, doi.doi_type, 4821 doi.doi_bonus_type); 4822 } 4823 4824 (void) strlcat(curpath, "/", sizeof (curpath)); 4825 4826 switch (doi.doi_type) { 4827 case DMU_OT_DIRECTORY_CONTENTS: 4828 if (s != NULL && *(s + 1) != '\0') 4829 return (dump_path_impl(os, child_obj, s + 1, retobj)); 4830 zfs_fallthrough; 4831 case DMU_OT_PLAIN_FILE_CONTENTS: 4832 if (retobj != NULL) { 4833 *retobj = child_obj; 4834 } else { 4835 dump_object(os, child_obj, dump_opt['v'], &header, 4836 NULL, 0); 4837 } 4838 return (0); 4839 default: 4840 (void) fprintf(stderr, "object %llu has non-file/directory " 4841 "type %d\n", (u_longlong_t)obj, doi.doi_type); 4842 break; 4843 } 4844 4845 return (EINVAL); 4846 } 4847 4848 /* 4849 * Dump the blocks for the object specified by path inside the dataset. 
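 *
 * This backs zdb's path-addressed lookups, e.g. an invocation
 * along the lines of (illustrative):
 *
 *     zdb -O tank/fs home/user/file.txt
 *
 * which opens the objset for "tank/fs", walks the path component
 * by component from the root znode, and dumps the object found
 * there.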
4850 */ 4851 static int 4852 dump_path(char *ds, char *path, uint64_t *retobj) 4853 { 4854 int err; 4855 objset_t *os; 4856 uint64_t root_obj; 4857 4858 err = open_objset(ds, FTAG, &os); 4859 if (err != 0) 4860 return (err); 4861 4862 err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj); 4863 if (err != 0) { 4864 (void) fprintf(stderr, "can't lookup root znode: %s\n", 4865 strerror(err)); 4866 close_objset(os, FTAG); 4867 return (EINVAL); 4868 } 4869 4870 (void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds); 4871 4872 err = dump_path_impl(os, root_obj, path, retobj); 4873 4874 close_objset(os, FTAG); 4875 return (err); 4876 } 4877 4878 static int 4879 zdb_copy_object(objset_t *os, uint64_t srcobj, char *destfile) 4880 { 4881 int err = 0; 4882 uint64_t size, readsize, oursize, offset; 4883 ssize_t writesize; 4884 sa_handle_t *hdl; 4885 4886 (void) printf("Copying object %" PRIu64 " to file %s\n", srcobj, 4887 destfile); 4888 4889 VERIFY3P(os, ==, sa_os); 4890 if ((err = sa_handle_get(os, srcobj, NULL, SA_HDL_PRIVATE, &hdl))) { 4891 (void) printf("Failed to get handle for SA znode\n"); 4892 return (err); 4893 } 4894 if ((err = sa_lookup(hdl, sa_attr_table[ZPL_SIZE], &size, 8))) { 4895 (void) sa_handle_destroy(hdl); 4896 return (err); 4897 } 4898 (void) sa_handle_destroy(hdl); 4899 4900 (void) printf("Object %" PRIu64 " is %" PRIu64 " bytes\n", srcobj, 4901 size); 4902 if (size == 0) { 4903 return (EINVAL); 4904 } 4905 4906 int fd = open(destfile, O_WRONLY | O_CREAT | O_TRUNC, 0644); 4907 if (fd == -1) 4908 return (errno); 4909 /* 4910 * We cap the size at 1 mebibyte here to prevent 4911 * allocation failures and nigh-infinite printing if the 4912 * object is extremely large. 4913 */ 4914 oursize = MIN(size, 1 << 20); 4915 offset = 0; 4916 char *buf = kmem_alloc(oursize, KM_NOSLEEP); 4917 if (buf == NULL) { 4918 (void) close(fd); 4919 return (ENOMEM); 4920 } 4921 4922 while (offset < size) { 4923 readsize = MIN(size - offset, 1 << 20); 4924 err = dmu_read(os, srcobj, offset, readsize, buf, 0); 4925 if (err != 0) { 4926 (void) printf("got error %u from dmu_read\n", err); 4927 kmem_free(buf, oursize); 4928 (void) close(fd); 4929 return (err); 4930 } 4931 if (dump_opt['v'] > 3) { 4932 (void) printf("Read offset=%" PRIu64 " size=%" PRIu64 4933 " error=%d\n", offset, readsize, err); 4934 } 4935 4936 writesize = write(fd, buf, readsize); 4937 if (writesize < 0) { 4938 err = errno; 4939 break; 4940 } else if (writesize != readsize) { 4941 /* Incomplete write */ 4942 (void) fprintf(stderr, "Short write, only wrote %llu of" 4943 " %" PRIu64 " bytes, exiting...\n", 4944 (u_longlong_t)writesize, readsize); 4945 break; 4946 } 4947 4948 offset += readsize; 4949 } 4950 4951 (void) close(fd); 4952 4953 if (buf != NULL) 4954 kmem_free(buf, oursize); 4955 4956 return (err); 4957 } 4958 4959 static boolean_t 4960 label_cksum_valid(vdev_label_t *label, uint64_t offset) 4961 { 4962 zio_checksum_info_t *ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL]; 4963 zio_cksum_t expected_cksum; 4964 zio_cksum_t actual_cksum; 4965 zio_cksum_t verifier; 4966 zio_eck_t *eck; 4967 int byteswap; 4968 4969 void *data = (char *)label + offsetof(vdev_label_t, vl_vdev_phys); 4970 eck = (zio_eck_t *)((char *)(data) + VDEV_PHYS_SIZE) - 1; 4971 4972 offset += offsetof(vdev_label_t, vl_vdev_phys); 4973 ZIO_SET_CHECKSUM(&verifier, offset, 0, 0, 0); 4974 4975 byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); 4976 if (byteswap) 4977 byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); 4978 4979 
expected_cksum = eck->zec_cksum; 4980 eck->zec_cksum = verifier; 4981 4982 abd_t *abd = abd_get_from_buf(data, VDEV_PHYS_SIZE); 4983 ci->ci_func[byteswap](abd, VDEV_PHYS_SIZE, NULL, &actual_cksum); 4984 abd_free(abd); 4985 4986 if (byteswap) 4987 byteswap_uint64_array(&expected_cksum, sizeof (zio_cksum_t)); 4988 4989 if (ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) 4990 return (B_TRUE); 4991 4992 return (B_FALSE); 4993 } 4994 4995 static int 4996 dump_label(const char *dev) 4997 { 4998 char path[MAXPATHLEN]; 4999 zdb_label_t labels[VDEV_LABELS] = {{{{0}}}}; 5000 uint64_t psize, ashift, l2cache; 5001 struct stat64 statbuf; 5002 boolean_t config_found = B_FALSE; 5003 boolean_t error = B_FALSE; 5004 boolean_t read_l2arc_header = B_FALSE; 5005 avl_tree_t config_tree; 5006 avl_tree_t uberblock_tree; 5007 void *node, *cookie; 5008 int fd; 5009 5010 /* 5011 * Check if we were given absolute path and use it as is. 5012 * Otherwise if the provided vdev name doesn't point to a file, 5013 * try prepending expected disk paths and partition numbers. 5014 */ 5015 (void) strlcpy(path, dev, sizeof (path)); 5016 if (dev[0] != '/' && stat64(path, &statbuf) != 0) { 5017 int error; 5018 5019 error = zfs_resolve_shortname(dev, path, MAXPATHLEN); 5020 if (error == 0 && zfs_dev_is_whole_disk(path)) { 5021 if (zfs_append_partition(path, MAXPATHLEN) == -1) 5022 error = ENOENT; 5023 } 5024 5025 if (error || (stat64(path, &statbuf) != 0)) { 5026 (void) printf("failed to find device %s, try " 5027 "specifying absolute path instead\n", dev); 5028 return (1); 5029 } 5030 } 5031 5032 if ((fd = open64(path, O_RDONLY)) < 0) { 5033 (void) printf("cannot open '%s': %s\n", path, strerror(errno)); 5034 exit(1); 5035 } 5036 5037 if (fstat64_blk(fd, &statbuf) != 0) { 5038 (void) printf("failed to stat '%s': %s\n", path, 5039 strerror(errno)); 5040 (void) close(fd); 5041 exit(1); 5042 } 5043 5044 if (S_ISBLK(statbuf.st_mode) && zfs_dev_flush(fd) != 0) 5045 (void) printf("failed to invalidate cache '%s' : %s\n", path, 5046 strerror(errno)); 5047 5048 avl_create(&config_tree, cksum_record_compare, 5049 sizeof (cksum_record_t), offsetof(cksum_record_t, link)); 5050 avl_create(&uberblock_tree, cksum_record_compare, 5051 sizeof (cksum_record_t), offsetof(cksum_record_t, link)); 5052 5053 psize = statbuf.st_size; 5054 psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t)); 5055 ashift = SPA_MINBLOCKSHIFT; 5056 5057 /* 5058 * 1. Read the label from disk 5059 * 2. Verify label cksum 5060 * 3. Unpack the configuration and insert in config tree. 5061 * 4. Traverse all uberblocks and insert in uberblock tree. 
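 *
 * The Fletcher-4 checksum of the packed config (and of each
 * uberblock) is the key into the corresponding AVL tree, so
 * identical copies collapse into one cksum_record_t whose labels[]
 * records where they were seen, as the loop below does:
 *
 *     fletcher_4_native_varsize(buf, size, &cksum);
 *     rec = cksum_record_insert(&config_tree, &cksum, l);
 *
 * A healthy vdev thus prints one config with "labels = 0 1 2 3"
 * rather than four identical copies.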
5062 */ 5063 for (int l = 0; l < VDEV_LABELS; l++) { 5064 zdb_label_t *label = &labels[l]; 5065 char *buf = label->label.vl_vdev_phys.vp_nvlist; 5066 size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist); 5067 nvlist_t *config; 5068 cksum_record_t *rec; 5069 zio_cksum_t cksum; 5070 vdev_t vd; 5071 5072 label->label_offset = vdev_label_offset(psize, l, 0); 5073 5074 if (pread64(fd, &label->label, sizeof (label->label), 5075 label->label_offset) != sizeof (label->label)) { 5076 if (!dump_opt['q']) 5077 (void) printf("failed to read label %d\n", l); 5078 label->read_failed = B_TRUE; 5079 error = B_TRUE; 5080 continue; 5081 } 5082 5083 label->read_failed = B_FALSE; 5084 label->cksum_valid = label_cksum_valid(&label->label, 5085 label->label_offset); 5086 5087 if (nvlist_unpack(buf, buflen, &config, 0) == 0) { 5088 nvlist_t *vdev_tree = NULL; 5089 size_t size; 5090 5091 if ((nvlist_lookup_nvlist(config, 5092 ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) || 5093 (nvlist_lookup_uint64(vdev_tree, 5094 ZPOOL_CONFIG_ASHIFT, &ashift) != 0)) 5095 ashift = SPA_MINBLOCKSHIFT; 5096 5097 if (nvlist_size(config, &size, NV_ENCODE_XDR) != 0) 5098 size = buflen; 5099 5100 /* If the device is a cache device clear the header. */ 5101 if (!read_l2arc_header) { 5102 if (nvlist_lookup_uint64(config, 5103 ZPOOL_CONFIG_POOL_STATE, &l2cache) == 0 && 5104 l2cache == POOL_STATE_L2CACHE) { 5105 read_l2arc_header = B_TRUE; 5106 } 5107 } 5108 5109 fletcher_4_native_varsize(buf, size, &cksum); 5110 rec = cksum_record_insert(&config_tree, &cksum, l); 5111 5112 label->config = rec; 5113 label->config_nv = config; 5114 config_found = B_TRUE; 5115 } else { 5116 error = B_TRUE; 5117 } 5118 5119 vd.vdev_ashift = ashift; 5120 vd.vdev_top = &vd; 5121 5122 for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) { 5123 uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i); 5124 uberblock_t *ub = (void *)((char *)label + uoff); 5125 5126 if (uberblock_verify(ub)) 5127 continue; 5128 5129 fletcher_4_native_varsize(ub, sizeof (*ub), &cksum); 5130 rec = cksum_record_insert(&uberblock_tree, &cksum, l); 5131 5132 label->uberblocks[i] = rec; 5133 } 5134 } 5135 5136 /* 5137 * Dump the label and uberblocks. 5138 */ 5139 for (int l = 0; l < VDEV_LABELS; l++) { 5140 zdb_label_t *label = &labels[l]; 5141 size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist); 5142 5143 if (label->read_failed == B_TRUE) 5144 continue; 5145 5146 if (label->config_nv) { 5147 dump_config_from_label(label, buflen, l); 5148 } else { 5149 if (!dump_opt['q']) 5150 (void) printf("failed to unpack label %d\n", l); 5151 } 5152 5153 if (dump_opt['u']) 5154 dump_label_uberblocks(label, ashift, l); 5155 5156 nvlist_free(label->config_nv); 5157 } 5158 5159 /* 5160 * Dump the L2ARC header, if existent. 5161 */ 5162 if (read_l2arc_header) 5163 error |= dump_l2arc_header(fd); 5164 5165 cookie = NULL; 5166 while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL) 5167 umem_free(node, sizeof (cksum_record_t)); 5168 5169 cookie = NULL; 5170 while ((node = avl_destroy_nodes(&uberblock_tree, &cookie)) != NULL) 5171 umem_free(node, sizeof (cksum_record_t)); 5172 5173 avl_destroy(&config_tree); 5174 avl_destroy(&uberblock_tree); 5175 5176 (void) close(fd); 5177 5178 return (config_found == B_FALSE ? 2 : 5179 (error == B_TRUE ? 
1 : 0)); 5180 } 5181 5182 static uint64_t dataset_feature_count[SPA_FEATURES]; 5183 static uint64_t global_feature_count[SPA_FEATURES]; 5184 static uint64_t remap_deadlist_count = 0; 5185 5186 static int 5187 dump_one_objset(const char *dsname, void *arg) 5188 { 5189 (void) arg; 5190 int error; 5191 objset_t *os; 5192 spa_feature_t f; 5193 5194 error = open_objset(dsname, FTAG, &os); 5195 if (error != 0) 5196 return (0); 5197 5198 for (f = 0; f < SPA_FEATURES; f++) { 5199 if (!dsl_dataset_feature_is_active(dmu_objset_ds(os), f)) 5200 continue; 5201 ASSERT(spa_feature_table[f].fi_flags & 5202 ZFEATURE_FLAG_PER_DATASET); 5203 dataset_feature_count[f]++; 5204 } 5205 5206 if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) { 5207 remap_deadlist_count++; 5208 } 5209 5210 for (dsl_bookmark_node_t *dbn = 5211 avl_first(&dmu_objset_ds(os)->ds_bookmarks); dbn != NULL; 5212 dbn = AVL_NEXT(&dmu_objset_ds(os)->ds_bookmarks, dbn)) { 5213 mos_obj_refd(dbn->dbn_phys.zbm_redaction_obj); 5214 if (dbn->dbn_phys.zbm_redaction_obj != 0) 5215 global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS]++; 5216 if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) 5217 global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++; 5218 } 5219 5220 if (dsl_deadlist_is_open(&dmu_objset_ds(os)->ds_dir->dd_livelist) && 5221 !dmu_objset_is_snapshot(os)) { 5222 global_feature_count[SPA_FEATURE_LIVELIST]++; 5223 } 5224 5225 dump_objset(os); 5226 close_objset(os, FTAG); 5227 fuid_table_destroy(); 5228 return (0); 5229 } 5230 5231 /* 5232 * Block statistics. 5233 */ 5234 #define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2) 5235 typedef struct zdb_blkstats { 5236 uint64_t zb_asize; 5237 uint64_t zb_lsize; 5238 uint64_t zb_psize; 5239 uint64_t zb_count; 5240 uint64_t zb_gangs; 5241 uint64_t zb_ditto_samevdev; 5242 uint64_t zb_ditto_same_ms; 5243 uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE]; 5244 } zdb_blkstats_t; 5245 5246 /* 5247 * Extended object types to report deferred frees and dedup auto-ditto blocks. 
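 *
 * These pseudo-types extend dmu_object_type_t so that zcb_type[][]
 * below can be indexed uniformly by real and synthetic types
 * alike, e.g. (illustrative):
 *
 *     zdb_blkstats_t *zb = &zcb->zcb_type[level][ZDB_OT_DEFERRED];
 *
 * with zdb_ot_extname[] supplying the display names ("deferred
 * free", etc.) for the entries past DMU_OT_NUMTYPES.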
5248 */ 5249 #define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0) 5250 #define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1) 5251 #define ZDB_OT_OTHER (DMU_OT_NUMTYPES + 2) 5252 #define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 3) 5253 5254 static const char *zdb_ot_extname[] = { 5255 "deferred free", 5256 "dedup ditto", 5257 "other", 5258 "Total", 5259 }; 5260 5261 #define ZB_TOTAL DN_MAX_LEVELS 5262 #define SPA_MAX_FOR_16M (SPA_MAXBLOCKSHIFT+1) 5263 5264 typedef struct zdb_cb { 5265 zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; 5266 uint64_t zcb_removing_size; 5267 uint64_t zcb_checkpoint_size; 5268 uint64_t zcb_dedup_asize; 5269 uint64_t zcb_dedup_blocks; 5270 uint64_t zcb_psize_count[SPA_MAX_FOR_16M]; 5271 uint64_t zcb_lsize_count[SPA_MAX_FOR_16M]; 5272 uint64_t zcb_asize_count[SPA_MAX_FOR_16M]; 5273 uint64_t zcb_psize_len[SPA_MAX_FOR_16M]; 5274 uint64_t zcb_lsize_len[SPA_MAX_FOR_16M]; 5275 uint64_t zcb_asize_len[SPA_MAX_FOR_16M]; 5276 uint64_t zcb_psize_total; 5277 uint64_t zcb_lsize_total; 5278 uint64_t zcb_asize_total; 5279 uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES]; 5280 uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES] 5281 [BPE_PAYLOAD_SIZE + 1]; 5282 uint64_t zcb_start; 5283 hrtime_t zcb_lastprint; 5284 uint64_t zcb_totalasize; 5285 uint64_t zcb_errors[256]; 5286 int zcb_readfails; 5287 int zcb_haderrors; 5288 spa_t *zcb_spa; 5289 uint32_t **zcb_vd_obsolete_counts; 5290 } zdb_cb_t; 5291 5292 /* test if two DVA offsets from same vdev are within the same metaslab */ 5293 static boolean_t 5294 same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2) 5295 { 5296 vdev_t *vd = vdev_lookup_top(spa, vdev); 5297 uint64_t ms_shift = vd->vdev_ms_shift; 5298 5299 return ((off1 >> ms_shift) == (off2 >> ms_shift)); 5300 } 5301 5302 /* 5303 * Used to simplify reporting of the histogram data. 5304 */ 5305 typedef struct one_histo { 5306 const char *name; 5307 uint64_t *count; 5308 uint64_t *len; 5309 uint64_t cumulative; 5310 } one_histo_t; 5311 5312 /* 5313 * The number of separate histograms processed for psize, lsize and asize. 5314 */ 5315 #define NUM_HISTO 3 5316 5317 /* 5318 * This routine will create a fixed column size output of three different 5319 * histograms showing by blocksize of 512 - 2^ SPA_MAX_FOR_16M 5320 * the count, length and cumulative length of the psize, lsize and 5321 * asize blocks. 5322 * 5323 * All three types of blocks are listed on a single line 5324 * 5325 * By default the table is printed in nicenumber format (e.g. 123K) but 5326 * if the '-P' parameter is specified then the full raw number (parseable) 5327 * is printed out. 5328 */ 5329 static void 5330 dump_size_histograms(zdb_cb_t *zcb) 5331 { 5332 /* 5333 * A temporary buffer that allows us to convert a number into 5334 * a string using zdb_nicenumber to allow either raw or human 5335 * readable numbers to be output. 5336 */ 5337 char numbuf[32]; 5338 5339 /* 5340 * Define titles which are used in the headers of the tables 5341 * printed by this routine. 5342 */ 5343 const char blocksize_title1[] = "block"; 5344 const char blocksize_title2[] = "size"; 5345 const char count_title[] = "Count"; 5346 const char length_title[] = "Size"; 5347 const char cumulative_title[] = "Cum."; 5348 5349 /* 5350 * Setup the histogram arrays (psize, lsize, and asize). 
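 *
 * Each one_histo_t only aliases counters that zdb_count_block()
 * already accumulated in the zdb_cb_t, along the lines of:
 *
 *     parm_histo[0].name = "psize";
 *     parm_histo[0].count = zcb->zcb_psize_count;
 *
 * so this routine formats existing totals and computes nothing new
 * beyond the running cumulative column.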
5351 */ 5352 one_histo_t parm_histo[NUM_HISTO]; 5353 5354 parm_histo[0].name = "psize"; 5355 parm_histo[0].count = zcb->zcb_psize_count; 5356 parm_histo[0].len = zcb->zcb_psize_len; 5357 parm_histo[0].cumulative = 0; 5358 5359 parm_histo[1].name = "lsize"; 5360 parm_histo[1].count = zcb->zcb_lsize_count; 5361 parm_histo[1].len = zcb->zcb_lsize_len; 5362 parm_histo[1].cumulative = 0; 5363 5364 parm_histo[2].name = "asize"; 5365 parm_histo[2].count = zcb->zcb_asize_count; 5366 parm_histo[2].len = zcb->zcb_asize_len; 5367 parm_histo[2].cumulative = 0; 5368 5369 5370 (void) printf("\nBlock Size Histogram\n"); 5371 /* 5372 * Print the first line titles 5373 */ 5374 if (dump_opt['P']) 5375 (void) printf("\n%s\t", blocksize_title1); 5376 else 5377 (void) printf("\n%7s ", blocksize_title1); 5378 5379 for (int j = 0; j < NUM_HISTO; j++) { 5380 if (dump_opt['P']) { 5381 if (j < NUM_HISTO - 1) { 5382 (void) printf("%s\t\t\t", parm_histo[j].name); 5383 } else { 5384 /* Don't print trailing spaces */ 5385 (void) printf(" %s", parm_histo[j].name); 5386 } 5387 } else { 5388 if (j < NUM_HISTO - 1) { 5389 /* Left aligned strings in the output */ 5390 (void) printf("%-7s ", 5391 parm_histo[j].name); 5392 } else { 5393 /* Don't print trailing spaces */ 5394 (void) printf("%s", parm_histo[j].name); 5395 } 5396 } 5397 } 5398 (void) printf("\n"); 5399 5400 /* 5401 * Print the second line titles 5402 */ 5403 if (dump_opt['P']) { 5404 (void) printf("%s\t", blocksize_title2); 5405 } else { 5406 (void) printf("%7s ", blocksize_title2); 5407 } 5408 5409 for (int i = 0; i < NUM_HISTO; i++) { 5410 if (dump_opt['P']) { 5411 (void) printf("%s\t%s\t%s\t", 5412 count_title, length_title, cumulative_title); 5413 } else { 5414 (void) printf("%7s%7s%7s", 5415 count_title, length_title, cumulative_title); 5416 } 5417 } 5418 (void) printf("\n"); 5419 5420 /* 5421 * Print the rows 5422 */ 5423 for (int i = SPA_MINBLOCKSHIFT; i < SPA_MAX_FOR_16M; i++) { 5424 5425 /* 5426 * Print the first column showing the blocksize 5427 */ 5428 zdb_nicenum((1ULL << i), numbuf, sizeof (numbuf)); 5429 5430 if (dump_opt['P']) { 5431 printf("%s", numbuf); 5432 } else { 5433 printf("%7s:", numbuf); 5434 } 5435 5436 /* 5437 * Print the remaining set of 3 columns per size: 5438 * for psize, lsize and asize 5439 */ 5440 for (int j = 0; j < NUM_HISTO; j++) { 5441 parm_histo[j].cumulative += parm_histo[j].len[i]; 5442 5443 zdb_nicenum(parm_histo[j].count[i], 5444 numbuf, sizeof (numbuf)); 5445 if (dump_opt['P']) 5446 (void) printf("\t%s", numbuf); 5447 else 5448 (void) printf("%7s", numbuf); 5449 5450 zdb_nicenum(parm_histo[j].len[i], 5451 numbuf, sizeof (numbuf)); 5452 if (dump_opt['P']) 5453 (void) printf("\t%s", numbuf); 5454 else 5455 (void) printf("%7s", numbuf); 5456 5457 zdb_nicenum(parm_histo[j].cumulative, 5458 numbuf, sizeof (numbuf)); 5459 if (dump_opt['P']) 5460 (void) printf("\t%s", numbuf); 5461 else 5462 (void) printf("%7s", numbuf); 5463 } 5464 (void) printf("\n"); 5465 } 5466 } 5467 5468 static void 5469 zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, 5470 dmu_object_type_t type) 5471 { 5472 uint64_t refcnt = 0; 5473 int i; 5474 5475 ASSERT(type < ZDB_OT_TOTAL); 5476 5477 if (zilog && zil_bp_tree_add(zilog, bp) != 0) 5478 return; 5479 5480 spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER); 5481 5482 for (i = 0; i < 4; i++) { 5483 int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL; 5484 int t = (i & 1) ? 
type : ZDB_OT_TOTAL; 5485 int equal; 5486 zdb_blkstats_t *zb = &zcb->zcb_type[l][t]; 5487 5488 zb->zb_asize += BP_GET_ASIZE(bp); 5489 zb->zb_lsize += BP_GET_LSIZE(bp); 5490 zb->zb_psize += BP_GET_PSIZE(bp); 5491 zb->zb_count++; 5492 5493 /* 5494 * The histogram is only big enough to record blocks up to 5495 * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last, 5496 * "other", bucket. 5497 */ 5498 unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT; 5499 idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1); 5500 zb->zb_psize_histogram[idx]++; 5501 5502 zb->zb_gangs += BP_COUNT_GANG(bp); 5503 5504 switch (BP_GET_NDVAS(bp)) { 5505 case 2: 5506 if (DVA_GET_VDEV(&bp->blk_dva[0]) == 5507 DVA_GET_VDEV(&bp->blk_dva[1])) { 5508 zb->zb_ditto_samevdev++; 5509 5510 if (same_metaslab(zcb->zcb_spa, 5511 DVA_GET_VDEV(&bp->blk_dva[0]), 5512 DVA_GET_OFFSET(&bp->blk_dva[0]), 5513 DVA_GET_OFFSET(&bp->blk_dva[1]))) 5514 zb->zb_ditto_same_ms++; 5515 } 5516 break; 5517 case 3: 5518 equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == 5519 DVA_GET_VDEV(&bp->blk_dva[1])) + 5520 (DVA_GET_VDEV(&bp->blk_dva[0]) == 5521 DVA_GET_VDEV(&bp->blk_dva[2])) + 5522 (DVA_GET_VDEV(&bp->blk_dva[1]) == 5523 DVA_GET_VDEV(&bp->blk_dva[2])); 5524 if (equal != 0) { 5525 zb->zb_ditto_samevdev++; 5526 5527 if (DVA_GET_VDEV(&bp->blk_dva[0]) == 5528 DVA_GET_VDEV(&bp->blk_dva[1]) && 5529 same_metaslab(zcb->zcb_spa, 5530 DVA_GET_VDEV(&bp->blk_dva[0]), 5531 DVA_GET_OFFSET(&bp->blk_dva[0]), 5532 DVA_GET_OFFSET(&bp->blk_dva[1]))) 5533 zb->zb_ditto_same_ms++; 5534 else if (DVA_GET_VDEV(&bp->blk_dva[0]) == 5535 DVA_GET_VDEV(&bp->blk_dva[2]) && 5536 same_metaslab(zcb->zcb_spa, 5537 DVA_GET_VDEV(&bp->blk_dva[0]), 5538 DVA_GET_OFFSET(&bp->blk_dva[0]), 5539 DVA_GET_OFFSET(&bp->blk_dva[2]))) 5540 zb->zb_ditto_same_ms++; 5541 else if (DVA_GET_VDEV(&bp->blk_dva[1]) == 5542 DVA_GET_VDEV(&bp->blk_dva[2]) && 5543 same_metaslab(zcb->zcb_spa, 5544 DVA_GET_VDEV(&bp->blk_dva[1]), 5545 DVA_GET_OFFSET(&bp->blk_dva[1]), 5546 DVA_GET_OFFSET(&bp->blk_dva[2]))) 5547 zb->zb_ditto_same_ms++; 5548 } 5549 break; 5550 } 5551 } 5552 5553 spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG); 5554 5555 if (BP_IS_EMBEDDED(bp)) { 5556 zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++; 5557 zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)] 5558 [BPE_GET_PSIZE(bp)]++; 5559 return; 5560 } 5561 /* 5562 * The binning histogram bins by powers of two up to 5563 * SPA_MAXBLOCKSIZE rather than creating bins for 5564 * every possible blocksize found in the pool. 
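 *
 * Worked example: a 128K physical block has
 * highbit64(0x20000) == 18, so it lands in bin 17 and is shown on
 * the "128K" row of dump_size_histograms(); a 512-byte block lands
 * in bin 9, the first row printed (SPA_MINBLOCKSHIFT).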
5565 */ 5566 int bin = highbit64(BP_GET_PSIZE(bp)) - 1; 5567 5568 zcb->zcb_psize_count[bin]++; 5569 zcb->zcb_psize_len[bin] += BP_GET_PSIZE(bp); 5570 zcb->zcb_psize_total += BP_GET_PSIZE(bp); 5571 5572 bin = highbit64(BP_GET_LSIZE(bp)) - 1; 5573 5574 zcb->zcb_lsize_count[bin]++; 5575 zcb->zcb_lsize_len[bin] += BP_GET_LSIZE(bp); 5576 zcb->zcb_lsize_total += BP_GET_LSIZE(bp); 5577 5578 bin = highbit64(BP_GET_ASIZE(bp)) - 1; 5579 5580 zcb->zcb_asize_count[bin]++; 5581 zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp); 5582 zcb->zcb_asize_total += BP_GET_ASIZE(bp); 5583 5584 if (dump_opt['L']) 5585 return; 5586 5587 if (BP_GET_DEDUP(bp)) { 5588 ddt_t *ddt; 5589 ddt_entry_t *dde; 5590 5591 ddt = ddt_select(zcb->zcb_spa, bp); 5592 ddt_enter(ddt); 5593 dde = ddt_lookup(ddt, bp, B_FALSE); 5594 5595 if (dde == NULL) { 5596 refcnt = 0; 5597 } else { 5598 ddt_phys_t *ddp = ddt_phys_select(dde, bp); 5599 ddt_phys_decref(ddp); 5600 refcnt = ddp->ddp_refcnt; 5601 if (ddt_phys_total_refcnt(dde) == 0) 5602 ddt_remove(ddt, dde); 5603 } 5604 ddt_exit(ddt); 5605 } 5606 5607 VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa, 5608 refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa), 5609 bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0); 5610 } 5611 5612 static void 5613 zdb_blkptr_done(zio_t *zio) 5614 { 5615 spa_t *spa = zio->io_spa; 5616 blkptr_t *bp = zio->io_bp; 5617 int ioerr = zio->io_error; 5618 zdb_cb_t *zcb = zio->io_private; 5619 zbookmark_phys_t *zb = &zio->io_bookmark; 5620 5621 mutex_enter(&spa->spa_scrub_lock); 5622 spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); 5623 cv_broadcast(&spa->spa_scrub_io_cv); 5624 5625 if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 5626 char blkbuf[BP_SPRINTF_LEN]; 5627 5628 zcb->zcb_haderrors = 1; 5629 zcb->zcb_errors[ioerr]++; 5630 5631 if (dump_opt['b'] >= 2) 5632 snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); 5633 else 5634 blkbuf[0] = '\0'; 5635 5636 (void) printf("zdb_blkptr_cb: " 5637 "Got error %d reading " 5638 "<%llu, %llu, %lld, %llx> %s -- skipping\n", 5639 ioerr, 5640 (u_longlong_t)zb->zb_objset, 5641 (u_longlong_t)zb->zb_object, 5642 (u_longlong_t)zb->zb_level, 5643 (u_longlong_t)zb->zb_blkid, 5644 blkbuf); 5645 } 5646 mutex_exit(&spa->spa_scrub_lock); 5647 5648 abd_free(zio->io_abd); 5649 } 5650 5651 static int 5652 zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 5653 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 5654 { 5655 zdb_cb_t *zcb = arg; 5656 dmu_object_type_t type; 5657 boolean_t is_metadata; 5658 5659 if (zb->zb_level == ZB_DNODE_LEVEL) 5660 return (0); 5661 5662 if (dump_opt['b'] >= 5 && bp->blk_birth > 0) { 5663 char blkbuf[BP_SPRINTF_LEN]; 5664 snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); 5665 (void) printf("objset %llu object %llu " 5666 "level %lld offset 0x%llx %s\n", 5667 (u_longlong_t)zb->zb_objset, 5668 (u_longlong_t)zb->zb_object, 5669 (longlong_t)zb->zb_level, 5670 (u_longlong_t)blkid2offset(dnp, bp, zb), 5671 blkbuf); 5672 } 5673 5674 if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) 5675 return (0); 5676 5677 type = BP_GET_TYPE(bp); 5678 5679 zdb_count_block(zcb, zilog, bp, 5680 (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type); 5681 5682 is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)); 5683 5684 if (!BP_IS_EMBEDDED(bp) && 5685 (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) { 5686 size_t size = BP_GET_PSIZE(bp); 5687 abd_t *abd = abd_alloc(size, B_FALSE); 5688 int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; 5689 5690 /* If it's an intent log block, failure is expected. 
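 * ZIL blocks are preallocated ahead of use, so the tail of a log
 * chain may point at a block that was never written; reading it
 * back is expected to fail. ZIO_FLAG_SPECULATIVE marks the read so
 * that zdb_blkptr_done() skips its error accounting, via its
 * (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) test above.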
static int
zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	zdb_cb_t *zcb = arg;
	dmu_object_type_t type;
	boolean_t is_metadata;

	if (zb->zb_level == ZB_DNODE_LEVEL)
		return (0);

	if (dump_opt['b'] >= 5 && bp->blk_birth > 0) {
		char blkbuf[BP_SPRINTF_LEN];
		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
		(void) printf("objset %llu object %llu "
		    "level %lld offset 0x%llx %s\n",
		    (u_longlong_t)zb->zb_objset,
		    (u_longlong_t)zb->zb_object,
		    (longlong_t)zb->zb_level,
		    (u_longlong_t)blkid2offset(dnp, bp, zb),
		    blkbuf);
	}

	if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp))
		return (0);

	type = BP_GET_TYPE(bp);

	zdb_count_block(zcb, zilog, bp,
	    (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type);

	is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type));

	if (!BP_IS_EMBEDDED(bp) &&
	    (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
		size_t size = BP_GET_PSIZE(bp);
		abd_t *abd = abd_alloc(size, B_FALSE);
		int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;

		/* If it's an intent log block, failure is expected. */
		if (zb->zb_level == ZB_ZIL_LEVEL)
			flags |= ZIO_FLAG_SPECULATIVE;

		mutex_enter(&spa->spa_scrub_lock);
		while (spa->spa_load_verify_bytes > max_inflight_bytes)
			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
		spa->spa_load_verify_bytes += size;
		mutex_exit(&spa->spa_scrub_lock);

		zio_nowait(zio_read(NULL, spa, bp, abd, size,
		    zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
	}

	zcb->zcb_readfails = 0;

	/* only call gethrtime() every 100 blocks */
	static int iters;
	if (++iters > 100)
		iters = 0;
	else
		return (0);

	if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) {
		uint64_t now = gethrtime();
		char buf[10];
		uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize;
		uint64_t kb_per_sec =
		    1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000));
		uint64_t sec_remaining =
		    (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec;

		/* make sure nicenum has enough space */
		_Static_assert(sizeof (buf) >= NN_NUMBUF_SZ, "buf truncated");

		zfs_nicebytes(bytes, buf, sizeof (buf));
		(void) fprintf(stderr,
		    "\r%5s completed (%4"PRIu64"MB/s) "
		    "estimated time remaining: "
		    "%"PRIu64"hr %02"PRIu64"min %02"PRIu64"sec ",
		    buf, kb_per_sec / 1024,
		    sec_remaining / 60 / 60,
		    sec_remaining / 60 % 60,
		    sec_remaining % 60);

		zcb->zcb_lastprint = now;
	}

	return (0);
}

static void
zdb_leak(void *arg, uint64_t start, uint64_t size)
{
	vdev_t *vd = arg;

	(void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
	    (u_longlong_t)vd->vdev_id, (u_longlong_t)start,
	    (u_longlong_t)size);
}

static metaslab_ops_t zdb_metaslab_ops = {
	NULL	/* alloc */
};
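/*
 * Space map log callback used by zdb_claim_removing() to reconstruct the
 * allocated segments of the vdev that is currently being removed. Entries
 * for other vdevs, or for txgs that the metaslab has already flushed, are
 * ignored.
 */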
5793 */ 5794 ASSERT(vdev_is_concrete(vd)); 5795 5796 VERIFY0(metaslab_claim_impl(vd, offset, size, 5797 spa_min_claim_txg(vd->vdev_spa))); 5798 } 5799 5800 static void 5801 claim_segment_cb(void *arg, uint64_t offset, uint64_t size) 5802 { 5803 vdev_t *vd = arg; 5804 5805 vdev_indirect_ops.vdev_op_remap(vd, offset, size, 5806 claim_segment_impl_cb, NULL); 5807 } 5808 5809 /* 5810 * After accounting for all allocated blocks that are directly referenced, 5811 * we might have missed a reference to a block from a partially complete 5812 * (and thus unused) indirect mapping object. We perform a secondary pass 5813 * through the metaslabs we have already mapped and claim the destination 5814 * blocks. 5815 */ 5816 static void 5817 zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) 5818 { 5819 if (dump_opt['L']) 5820 return; 5821 5822 if (spa->spa_vdev_removal == NULL) 5823 return; 5824 5825 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5826 5827 spa_vdev_removal_t *svr = spa->spa_vdev_removal; 5828 vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); 5829 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 5830 5831 ASSERT0(range_tree_space(svr->svr_allocd_segs)); 5832 5833 range_tree_t *allocs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); 5834 for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { 5835 metaslab_t *msp = vd->vdev_ms[msi]; 5836 5837 ASSERT0(range_tree_space(allocs)); 5838 if (msp->ms_sm != NULL) 5839 VERIFY0(space_map_load(msp->ms_sm, allocs, SM_ALLOC)); 5840 range_tree_vacate(allocs, range_tree_add, svr->svr_allocd_segs); 5841 } 5842 range_tree_destroy(allocs); 5843 5844 iterate_through_spacemap_logs(spa, load_unflushed_svr_segs_cb, svr); 5845 5846 /* 5847 * Clear everything past what has been synced, 5848 * because we have not allocated mappings for 5849 * it yet. 
	range_tree_clear(svr->svr_allocd_segs,
	    vdev_indirect_mapping_max_offset(vim),
	    vd->vdev_asize - vdev_indirect_mapping_max_offset(vim));

	zcb->zcb_removing_size += range_tree_space(svr->svr_allocd_segs);
	range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd);

	spa_config_exit(spa, SCL_CONFIG, FTAG);
}

static int
increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
	(void) tx;
	zdb_cb_t *zcb = arg;
	spa_t *spa = zcb->zcb_spa;
	vdev_t *vd;
	const dva_t *dva = &bp->blk_dva[0];

	ASSERT(!bp_freed);
	ASSERT(!dump_opt['L']);
	ASSERT3U(BP_GET_NDVAS(bp), ==, 1);

	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	vd = vdev_lookup_top(zcb->zcb_spa, DVA_GET_VDEV(dva));
	ASSERT3P(vd, !=, NULL);
	spa_config_exit(spa, SCL_VDEV, FTAG);

	ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
	ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL);

	vdev_indirect_mapping_increment_obsolete_count(
	    vd->vdev_indirect_mapping,
	    DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva),
	    zcb->zcb_vd_obsolete_counts[vd->vdev_id]);

	return (0);
}

static uint32_t *
zdb_load_obsolete_counts(vdev_t *vd)
{
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	spa_t *spa = vd->vdev_spa;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	uint64_t obsolete_sm_object;
	uint32_t *counts;

	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
	EQUIV(obsolete_sm_object != 0, vd->vdev_obsolete_sm != NULL);
	counts = vdev_indirect_mapping_load_obsolete_counts(vim);
	if (vd->vdev_obsolete_sm != NULL) {
		vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
		    vd->vdev_obsolete_sm);
	}
	if (scip->scip_vdev == vd->vdev_id &&
	    scip->scip_prev_obsolete_sm_object != 0) {
		space_map_t *prev_obsolete_sm = NULL;
		VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
		    scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
		vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
		    prev_obsolete_sm);
		space_map_close(prev_obsolete_sm);
	}
	return (counts);
}

static void
zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
{
	ddt_bookmark_t ddb = {0};
	ddt_entry_t dde;
	int error;
	int p;

	ASSERT(!dump_opt['L']);

	while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
		blkptr_t blk;
		ddt_phys_t *ddp = dde.dde_phys;

		if (ddb.ddb_class == DDT_CLASS_UNIQUE)
			return;

		ASSERT(ddt_phys_total_refcnt(&dde) > 1);

		for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
			if (ddp->ddp_phys_birth == 0)
				continue;
			ddt_bp_create(ddb.ddb_checksum,
			    &dde.dde_key, ddp, &blk);
			if (p == DDT_PHYS_DITTO) {
				zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
			} else {
				zcb->zcb_dedup_asize +=
				    BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
				zcb->zcb_dedup_blocks++;
			}
		}
		ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
		ddt_enter(ddt);
		VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
		ddt_exit(ddt);
	}

	ASSERT(error == ENOENT);
}

typedef struct checkpoint_sm_exclude_entry_arg {
	vdev_t *cseea_vd;
	uint64_t cseea_checkpoint_size;
} checkpoint_sm_exclude_entry_arg_t;
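/*
 * Callback for space_map_iterate() over a vdev's checkpoint space map:
 * each (necessarily SM_FREE) entry is removed from the metaslab's
 * ms_allocatable tree and its length is added to the running checkpoint
 * size, so checkpointed blocks are not later reported as leaked.
 */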
static int
checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg)
{
	checkpoint_sm_exclude_entry_arg_t *cseea = arg;
	vdev_t *vd = cseea->cseea_vd;
	metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
	uint64_t end = sme->sme_offset + sme->sme_run;

	ASSERT(sme->sme_type == SM_FREE);

	/*
	 * Since the vdev_checkpoint_sm exists in the vdev level
	 * and the ms_sm space maps exist in the metaslab level,
	 * an entry in the checkpoint space map could theoretically
	 * cross the boundaries of the metaslab that it belongs to.
	 *
	 * In reality, because of the way that we populate and
	 * manipulate the checkpoint's space maps currently,
	 * there shouldn't be any entries that cross metaslabs.
	 * Hence the assertion below.
	 *
	 * That said, there is no fundamental requirement that
	 * the checkpoint's space map entries should not cross
	 * metaslab boundaries. So if needed we could add code
	 * that handles metaslab-crossing segments in the future.
	 */
	VERIFY3U(sme->sme_offset, >=, ms->ms_start);
	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);

	/*
	 * By removing the entry from the allocated segments we
	 * also verify that the entry is there to begin with.
	 */
	mutex_enter(&ms->ms_lock);
	range_tree_remove(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
	mutex_exit(&ms->ms_lock);

	cseea->cseea_checkpoint_size += sme->sme_run;
	return (0);
}

static void
zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
{
	spa_t *spa = vd->vdev_spa;
	space_map_t *checkpoint_sm = NULL;
	uint64_t checkpoint_sm_obj;

	/*
	 * If there is no vdev_top_zap, we are in a pool whose
	 * version predates the pool checkpoint feature.
	 */
	if (vd->vdev_top_zap == 0)
		return;

	/*
	 * If there is no reference to the vdev_checkpoint_sm in
	 * the vdev_top_zap, then one of the following scenarios
	 * is true:
	 *
	 * 1] There is no checkpoint
	 * 2] There is a checkpoint, but no checkpointed blocks
	 *    have been freed yet
	 * 3] The current vdev is indirect
	 *
	 * In these cases we return immediately.
	 */
	if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
	    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
		return;

	VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
	    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1,
	    &checkpoint_sm_obj));

	checkpoint_sm_exclude_entry_arg_t cseea;
	cseea.cseea_vd = vd;
	cseea.cseea_checkpoint_size = 0;

	VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
	    checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));

	VERIFY0(space_map_iterate(checkpoint_sm,
	    space_map_length(checkpoint_sm),
	    checkpoint_sm_exclude_entry_cb, &cseea));
	space_map_close(checkpoint_sm);

	zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size;
}

static void
zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
{
	ASSERT(!dump_opt['L']);

	vdev_t *rvd = spa->spa_root_vdev;
	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
		ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
		zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb);
	}
}

static int
count_unflushed_space_cb(spa_t *spa, space_map_entry_t *sme,
    uint64_t txg, void *arg)
{
	int64_t *ualloc_space = arg;

	uint64_t offset = sme->sme_offset;
	uint64_t vdev_id = sme->sme_vdev;

	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
	if (!vdev_is_concrete(vd))
		return (0);

	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);

	if (txg < metaslab_unflushed_txg(ms))
		return (0);

	if (sme->sme_type == SM_ALLOC)
		*ualloc_space += sme->sme_run;
	else
		*ualloc_space -= sme->sme_run;

	return (0);
}

static int64_t
get_unflushed_alloc_space(spa_t *spa)
{
	if (dump_opt['L'])
		return (0);

	int64_t ualloc_space = 0;
	iterate_through_spacemap_logs(spa, count_unflushed_space_cb,
	    &ualloc_space);
	return (ualloc_space);
}

static int
load_unflushed_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg)
{
	maptype_t *uic_maptype = arg;

	uint64_t offset = sme->sme_offset;
	uint64_t size = sme->sme_run;
	uint64_t vdev_id = sme->sme_vdev;

	vdev_t *vd = vdev_lookup_top(spa, vdev_id);

	/* skip indirect vdevs */
	if (!vdev_is_concrete(vd))
		return (0);

	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
	ASSERT(*uic_maptype == SM_ALLOC || *uic_maptype == SM_FREE);

	if (txg < metaslab_unflushed_txg(ms))
		return (0);

	if (*uic_maptype == sme->sme_type)
		range_tree_add(ms->ms_allocatable, offset, size);
	else
		range_tree_remove(ms->ms_allocatable, offset, size);

	return (0);
}

static void
load_unflushed_to_ms_allocatables(spa_t *spa, maptype_t maptype)
{
	iterate_through_spacemap_logs(spa, load_unflushed_cb, &maptype);
}
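/*
 * Load the requested kind of entries (SM_ALLOC or SM_FREE) from each
 * concrete top-level vdev's metaslab space maps into the corresponding
 * ms_allocatable trees, then fold in any entries still sitting in the
 * unflushed space map logs.
 */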
static void
load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype)
{
	vdev_t *rvd = spa->spa_root_vdev;
	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
		vdev_t *vd = rvd->vdev_child[i];

		ASSERT3U(i, ==, vd->vdev_id);

		if (vd->vdev_ops == &vdev_indirect_ops)
			continue;

		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
			metaslab_t *msp = vd->vdev_ms[m];

			(void) fprintf(stderr,
			    "\rloading concrete vdev %llu, "
			    "metaslab %llu of %llu ...",
			    (longlong_t)vd->vdev_id,
			    (longlong_t)msp->ms_id,
			    (longlong_t)vd->vdev_ms_count);

			mutex_enter(&msp->ms_lock);
			range_tree_vacate(msp->ms_allocatable, NULL, NULL);

			/*
			 * We don't want to spend the CPU manipulating the
			 * size-ordered tree, so clear the range_tree ops.
			 */
			msp->ms_allocatable->rt_ops = NULL;

			if (msp->ms_sm != NULL) {
				VERIFY0(space_map_load(msp->ms_sm,
				    msp->ms_allocatable, maptype));
			}
			if (!msp->ms_loaded)
				msp->ms_loaded = B_TRUE;
			mutex_exit(&msp->ms_lock);
		}
	}

	load_unflushed_to_ms_allocatables(spa, maptype);
}

/*
 * vim_idxp is an in-out parameter which (for indirect vdevs) is the
 * index in vim_entries that has the first entry in this metaslab.
 * On return, it will be set to the first entry after this metaslab.
 */
static void
load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp,
    uint64_t *vim_idxp)
{
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;

	mutex_enter(&msp->ms_lock);
	range_tree_vacate(msp->ms_allocatable, NULL, NULL);

	/*
	 * We don't want to spend the CPU manipulating the
	 * size-ordered tree, so clear the range_tree ops.
	 */
	msp->ms_allocatable->rt_ops = NULL;

	for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim);
	    (*vim_idxp)++) {
		vdev_indirect_mapping_entry_phys_t *vimep =
		    &vim->vim_entries[*vim_idxp];
		uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
		uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst);
		ASSERT3U(ent_offset, >=, msp->ms_start);
		if (ent_offset >= msp->ms_start + msp->ms_size)
			break;

		/*
		 * Mappings do not cross metaslab boundaries,
		 * because we create them by walking the metaslabs.
		 */
		ASSERT3U(ent_offset + ent_len, <=,
		    msp->ms_start + msp->ms_size);
		range_tree_add(msp->ms_allocatable, ent_offset, ent_len);
	}

	if (!msp->ms_loaded)
		msp->ms_loaded = B_TRUE;
	mutex_exit(&msp->ms_lock);
}
6257 */ 6258 vdev_metaslab_group_create(vd); 6259 VERIFY0(vdev_metaslab_init(vd, 0)); 6260 6261 vdev_indirect_mapping_t *vim __maybe_unused = 6262 vd->vdev_indirect_mapping; 6263 uint64_t vim_idx = 0; 6264 for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { 6265 6266 (void) fprintf(stderr, 6267 "\rloading indirect vdev %llu, " 6268 "metaslab %llu of %llu ...", 6269 (longlong_t)vd->vdev_id, 6270 (longlong_t)vd->vdev_ms[m]->ms_id, 6271 (longlong_t)vd->vdev_ms_count); 6272 6273 load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m], 6274 &vim_idx); 6275 } 6276 ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim)); 6277 } 6278 } 6279 6280 static void 6281 zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) 6282 { 6283 zcb->zcb_spa = spa; 6284 6285 if (dump_opt['L']) 6286 return; 6287 6288 dsl_pool_t *dp = spa->spa_dsl_pool; 6289 vdev_t *rvd = spa->spa_root_vdev; 6290 6291 /* 6292 * We are going to be changing the meaning of the metaslab's 6293 * ms_allocatable. Ensure that the allocator doesn't try to 6294 * use the tree. 6295 */ 6296 spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; 6297 spa->spa_log_class->mc_ops = &zdb_metaslab_ops; 6298 spa->spa_embedded_log_class->mc_ops = &zdb_metaslab_ops; 6299 6300 zcb->zcb_vd_obsolete_counts = 6301 umem_zalloc(rvd->vdev_children * sizeof (uint32_t *), 6302 UMEM_NOFAIL); 6303 6304 /* 6305 * For leak detection, we overload the ms_allocatable trees 6306 * to contain allocated segments instead of free segments. 6307 * As a result, we can't use the normal metaslab_load/unload 6308 * interfaces. 6309 */ 6310 zdb_leak_init_prepare_indirect_vdevs(spa, zcb); 6311 load_concrete_ms_allocatable_trees(spa, SM_ALLOC); 6312 6313 /* 6314 * On load_concrete_ms_allocatable_trees() we loaded all the 6315 * allocated entries from the ms_sm to the ms_allocatable for 6316 * each metaslab. If the pool has a checkpoint or is in the 6317 * middle of discarding a checkpoint, some of these blocks 6318 * may have been freed but their ms_sm may not have been 6319 * updated because they are referenced by the checkpoint. In 6320 * order to avoid false-positives during leak-detection, we 6321 * go through the vdev's checkpoint space map and exclude all 6322 * its entries from their relevant ms_allocatable. 6323 * 6324 * We also aggregate the space held by the checkpoint and add 6325 * it to zcb_checkpoint_size. 6326 * 6327 * Note that at this point we are also verifying that all the 6328 * entries on the checkpoint_sm are marked as allocated in 6329 * the ms_sm of their relevant metaslab. 
static void
zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
{
	zcb->zcb_spa = spa;

	if (dump_opt['L'])
		return;

	dsl_pool_t *dp = spa->spa_dsl_pool;
	vdev_t *rvd = spa->spa_root_vdev;

	/*
	 * We are going to be changing the meaning of the metaslab's
	 * ms_allocatable. Ensure that the allocator doesn't try to
	 * use the tree.
	 */
	spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
	spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
	spa->spa_embedded_log_class->mc_ops = &zdb_metaslab_ops;

	zcb->zcb_vd_obsolete_counts =
	    umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
	    UMEM_NOFAIL);

	/*
	 * For leak detection, we overload the ms_allocatable trees
	 * to contain allocated segments instead of free segments.
	 * As a result, we can't use the normal metaslab_load/unload
	 * interfaces.
	 */
	zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
	load_concrete_ms_allocatable_trees(spa, SM_ALLOC);

	/*
	 * On load_concrete_ms_allocatable_trees() we loaded all the
	 * allocated entries from the ms_sm to the ms_allocatable for
	 * each metaslab. If the pool has a checkpoint or is in the
	 * middle of discarding a checkpoint, some of these blocks
	 * may have been freed but their ms_sm may not have been
	 * updated because they are referenced by the checkpoint. In
	 * order to avoid false-positives during leak-detection, we
	 * go through the vdev's checkpoint space map and exclude all
	 * its entries from their relevant ms_allocatable.
	 *
	 * We also aggregate the space held by the checkpoint and add
	 * it to zcb_checkpoint_size.
	 *
	 * Note that at this point we are also verifying that all the
	 * entries on the checkpoint_sm are marked as allocated in
	 * the ms_sm of their relevant metaslab.
	 * [see comment in checkpoint_sm_exclude_entry_cb()]
	 */
	zdb_leak_init_exclude_checkpoint(spa, zcb);
	ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa));

	/* for cleaner progress output */
	(void) fprintf(stderr, "\n");

	if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
		ASSERT(spa_feature_is_enabled(spa,
		    SPA_FEATURE_DEVICE_REMOVAL));
		(void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
		    increment_indirect_mapping_cb, zcb, NULL);
	}

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	zdb_ddt_leak_init(spa, zcb);
	spa_config_exit(spa, SCL_CONFIG, FTAG);
}

static boolean_t
zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
{
	boolean_t leaks = B_FALSE;
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	uint64_t total_leaked = 0;
	boolean_t are_precise = B_FALSE;

	ASSERT(vim != NULL);

	for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
		vdev_indirect_mapping_entry_phys_t *vimep =
		    &vim->vim_entries[i];
		uint64_t obsolete_bytes = 0;
		uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
		metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

		/*
		 * This is not very efficient but it's easy to
		 * verify correctness.
		 */
		for (uint64_t inner_offset = 0;
		    inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst);
		    inner_offset += 1ULL << vd->vdev_ashift) {
			if (range_tree_contains(msp->ms_allocatable,
			    offset + inner_offset, 1ULL << vd->vdev_ashift)) {
				obsolete_bytes += 1ULL << vd->vdev_ashift;
			}
		}

		int64_t bytes_leaked = obsolete_bytes -
		    zcb->zcb_vd_obsolete_counts[vd->vdev_id][i];
		ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=,
		    zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]);

		VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
		if (bytes_leaked != 0 && (are_precise || dump_opt['d'] >= 5)) {
			(void) printf("obsolete indirect mapping count "
			    "mismatch on %llu:%llx:%llx : %llx bytes leaked\n",
			    (u_longlong_t)vd->vdev_id,
			    (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
			    (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
			    (u_longlong_t)bytes_leaked);
		}
		total_leaked += ABS(bytes_leaked);
	}

	VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
	if (!are_precise && total_leaked > 0) {
		int pct_leaked = total_leaked * 100 /
		    vdev_indirect_mapping_bytes_mapped(vim);
		(void) printf("cannot verify obsolete indirect mapping "
		    "counts of vdev %llu because precise feature was not "
		    "enabled when it was removed: %d%% (%llx bytes) of mapping "
		    "unreferenced\n",
		    (u_longlong_t)vd->vdev_id, pct_leaked,
		    (u_longlong_t)total_leaked);
	} else if (total_leaked > 0) {
		(void) printf("obsolete indirect mapping count mismatch "
		    "for vdev %llu -- %llx total bytes mismatched\n",
		    (u_longlong_t)vd->vdev_id,
		    (u_longlong_t)total_leaked);
		leaks |= B_TRUE;
	}

	vdev_indirect_mapping_free_obsolete_counts(vim,
	    zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
	zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL;

	return (leaks);
}
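/*
 * Second half of leak detection: after the traversal has claimed (and
 * thus removed) every referenced block, anything still present in a
 * concrete vdev's ms_allocatable is reported as leaked via zdb_leak().
 * Returns B_TRUE if any leaks were found.
 */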
static boolean_t
zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
{
	if (dump_opt['L'])
		return (B_FALSE);

	boolean_t leaks = B_FALSE;
	vdev_t *rvd = spa->spa_root_vdev;
	for (unsigned c = 0; c < rvd->vdev_children; c++) {
		vdev_t *vd = rvd->vdev_child[c];

		if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
			leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
		}

		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
			metaslab_t *msp = vd->vdev_ms[m];
			ASSERT3P(msp->ms_group, ==, (msp->ms_group->mg_class ==
			    spa_embedded_log_class(spa)) ?
			    vd->vdev_log_mg : vd->vdev_mg);

			/*
			 * ms_allocatable has been overloaded
			 * to contain allocated segments. Now that
			 * we finished traversing all blocks, any
			 * block that remains in the ms_allocatable
			 * represents an allocated block that we
			 * did not claim during the traversal.
			 * Claimed blocks would have been removed
			 * from the ms_allocatable. For indirect
			 * vdevs, space remaining in the tree
			 * represents parts of the mapping that are
			 * not referenced, which is not a bug.
			 */
			if (vd->vdev_ops == &vdev_indirect_ops) {
				range_tree_vacate(msp->ms_allocatable,
				    NULL, NULL);
			} else {
				range_tree_vacate(msp->ms_allocatable,
				    zdb_leak, vd);
			}
			if (msp->ms_loaded) {
				msp->ms_loaded = B_FALSE;
			}
		}
	}

	umem_free(zcb->zcb_vd_obsolete_counts,
	    rvd->vdev_children * sizeof (uint32_t *));
	zcb->zcb_vd_obsolete_counts = NULL;

	return (leaks);
}

static int
count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	(void) tx;
	zdb_cb_t *zcb = arg;

	if (dump_opt['b'] >= 5) {
		char blkbuf[BP_SPRINTF_LEN];
		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
		(void) printf("[%s] %s\n",
		    "deferred free", blkbuf);
	}
	zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED);
	return (0);
}

/*
 * Iterate over livelists which have been destroyed by the user but
 * are still present in the MOS, waiting to be freed.
 */
static void
iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg)
{
	objset_t *mos = spa->spa_meta_objset;
	uint64_t zap_obj;
	int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
	if (err == ENOENT)
		return;
	ASSERT0(err);

	zap_cursor_t zc;
	zap_attribute_t attr;
	dsl_deadlist_t ll;
	/* NULL out os prior to dsl_deadlist_open in case it's garbage */
	ll.dl_os = NULL;
	for (zap_cursor_init(&zc, mos, zap_obj);
	    zap_cursor_retrieve(&zc, &attr) == 0;
	    (void) zap_cursor_advance(&zc)) {
		dsl_deadlist_open(&ll, mos, attr.za_first_integer);
		func(&ll, arg);
		dsl_deadlist_close(&ll);
	}
	zap_cursor_fini(&zc);
}

static int
bpobj_count_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
	ASSERT(!bp_freed);
	return (count_block_cb(arg, bp, tx));
}

static int
livelist_entry_count_blocks_cb(void *args, dsl_deadlist_entry_t *dle)
{
	zdb_cb_t *zbc = args;
	bplist_t blks;
	bplist_create(&blks);
	/* determine which blocks have been alloc'd but not freed */
	VERIFY0(dsl_process_sub_livelist(&dle->dle_bpobj, &blks, NULL, NULL));
	/* count those blocks */
	(void) bplist_iterate(&blks, count_block_cb, zbc, NULL);
	bplist_destroy(&blks);
	return (0);
}

static void
livelist_count_blocks(dsl_deadlist_t *ll, void *arg)
{
	dsl_deadlist_iterate(ll, livelist_entry_count_blocks_cb, arg);
}
/*
 * Count the blocks in the livelists that have been destroyed by the user
 * but haven't yet been freed.
 */
static void
deleted_livelists_count_blocks(spa_t *spa, zdb_cb_t *zbc)
{
	iterate_deleted_livelists(spa, livelist_count_blocks, zbc);
}

static void
dump_livelist_cb(dsl_deadlist_t *ll, void *arg)
{
	ASSERT3P(arg, ==, NULL);
	global_feature_count[SPA_FEATURE_LIVELIST]++;
	dump_blkptr_list(ll, "Deleted Livelist");
	dsl_deadlist_iterate(ll, sublivelist_verify_lightweight, NULL);
}

/*
 * Print out, register object references to, and increment feature counts for
 * livelists that have been destroyed by the user but haven't yet been freed.
 */
static void
deleted_livelists_dump_mos(spa_t *spa)
{
	uint64_t zap_obj;
	objset_t *mos = spa->spa_meta_objset;
	int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
	if (err == ENOENT)
		return;
	mos_obj_refd(zap_obj);
	iterate_deleted_livelists(spa, dump_livelist_cb, NULL);
}
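/*
 * Main workhorse of zdb -b: traverse every block in the pool, accumulate
 * per-type statistics, optionally verify checksums, and cross-check the
 * traversal totals against the space maps. Returns 0 on success, 2 if
 * leaks were detected (or no blocks were found), and 3 on I/O errors.
 */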
static int
dump_block_stats(spa_t *spa)
{
	zdb_cb_t *zcb;
	zdb_blkstats_t *zb, *tzb;
	uint64_t norm_alloc, norm_space, total_alloc, total_found;
	int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
	    TRAVERSE_NO_DECRYPT | TRAVERSE_HARD;
	boolean_t leaks = B_FALSE;
	int e, c, err;
	bp_embedded_type_t i;

	zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL);

	(void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
	    (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
	    (dump_opt['c'] == 1) ? "metadata " : "",
	    dump_opt['c'] ? "checksums " : "",
	    (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
	    !dump_opt['L'] ? "nothing leaked " : "");

	/*
	 * When leak detection is enabled we load all space maps as SM_ALLOC
	 * maps, then traverse the pool claiming each block we discover. If
	 * the pool is perfectly consistent, the segment trees will be empty
	 * when we're done. Anything left over is a leak; any block we can't
	 * claim (because it's not part of any space map) is a double
	 * allocation, reference to a freed block, or an unclaimed log block.
	 *
	 * When leak detection is disabled (-L option) we still traverse the
	 * pool claiming each block we discover, but we skip opening any space
	 * maps.
	 */
	zdb_leak_init(spa, zcb);

	/*
	 * If there's a deferred-free bplist, process that first.
	 */
	(void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
	    bpobj_count_block_cb, zcb, NULL);

	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
		(void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
		    bpobj_count_block_cb, zcb, NULL);
	}

	zdb_claim_removing(spa, zcb);

	if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
		VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset,
		    spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb,
		    zcb, NULL));
	}

	deleted_livelists_count_blocks(spa, zcb);

	if (dump_opt['c'] > 1)
		flags |= TRAVERSE_PREFETCH_DATA;

	zcb->zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
	zcb->zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa));
	zcb->zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa));
	zcb->zcb_totalasize +=
	    metaslab_class_get_alloc(spa_embedded_log_class(spa));
	zcb->zcb_start = zcb->zcb_lastprint = gethrtime();
	err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, zcb);

	/*
	 * If we've traversed the data blocks then we need to wait for those
	 * I/Os to complete. We leverage "The Godfather" zio to wait on
	 * all async I/Os to complete.
	 */
	if (dump_opt['c']) {
		for (c = 0; c < max_ncpus; c++) {
			(void) zio_wait(spa->spa_async_zio_root[c]);
			spa->spa_async_zio_root[c] = zio_root(spa, NULL, NULL,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
			    ZIO_FLAG_GODFATHER);
		}
	}
	ASSERT0(spa->spa_load_verify_bytes);

	/*
	 * Done after zio_wait() since zcb_haderrors is modified in
	 * zdb_blkptr_done().
	 */
	zcb->zcb_haderrors |= err;

	if (zcb->zcb_haderrors) {
		(void) printf("\nError counts:\n\n");
		(void) printf("\t%5s %s\n", "errno", "count");
		for (e = 0; e < 256; e++) {
			if (zcb->zcb_errors[e] != 0) {
				(void) printf("\t%5d %llu\n",
				    e, (u_longlong_t)zcb->zcb_errors[e]);
			}
		}
	}

	/*
	 * Report any leaked segments.
	 */
	leaks |= zdb_leak_fini(spa, zcb);

	tzb = &zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL];

	norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
	norm_space = metaslab_class_get_space(spa_normal_class(spa));

	total_alloc = norm_alloc +
	    metaslab_class_get_alloc(spa_log_class(spa)) +
	    metaslab_class_get_alloc(spa_embedded_log_class(spa)) +
	    metaslab_class_get_alloc(spa_special_class(spa)) +
	    metaslab_class_get_alloc(spa_dedup_class(spa)) +
	    get_unflushed_alloc_space(spa);
	total_found = tzb->zb_asize - zcb->zcb_dedup_asize +
	    zcb->zcb_removing_size + zcb->zcb_checkpoint_size;

	if (total_found == total_alloc && !dump_opt['L']) {
		(void) printf("\n\tNo leaks (block sum matches space"
		    " maps exactly)\n");
	} else if (!dump_opt['L']) {
		(void) printf("block traversal size %llu != alloc %llu "
		    "(%s %lld)\n",
		    (u_longlong_t)total_found,
		    (u_longlong_t)total_alloc,
		    (dump_opt['L']) ? "unreachable" : "leaked",
		    (longlong_t)(total_alloc - total_found));
		leaks = B_TRUE;
	}
"unreachable" : "leaked", 6713 (longlong_t)(total_alloc - total_found)); 6714 leaks = B_TRUE; 6715 } 6716 6717 if (tzb->zb_count == 0) { 6718 umem_free(zcb, sizeof (zdb_cb_t)); 6719 return (2); 6720 } 6721 6722 (void) printf("\n"); 6723 (void) printf("\t%-16s %14llu\n", "bp count:", 6724 (u_longlong_t)tzb->zb_count); 6725 (void) printf("\t%-16s %14llu\n", "ganged count:", 6726 (longlong_t)tzb->zb_gangs); 6727 (void) printf("\t%-16s %14llu avg: %6llu\n", "bp logical:", 6728 (u_longlong_t)tzb->zb_lsize, 6729 (u_longlong_t)(tzb->zb_lsize / tzb->zb_count)); 6730 (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", 6731 "bp physical:", (u_longlong_t)tzb->zb_psize, 6732 (u_longlong_t)(tzb->zb_psize / tzb->zb_count), 6733 (double)tzb->zb_lsize / tzb->zb_psize); 6734 (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", 6735 "bp allocated:", (u_longlong_t)tzb->zb_asize, 6736 (u_longlong_t)(tzb->zb_asize / tzb->zb_count), 6737 (double)tzb->zb_lsize / tzb->zb_asize); 6738 (void) printf("\t%-16s %14llu ref>1: %6llu deduplication: %6.2f\n", 6739 "bp deduped:", (u_longlong_t)zcb->zcb_dedup_asize, 6740 (u_longlong_t)zcb->zcb_dedup_blocks, 6741 (double)zcb->zcb_dedup_asize / tzb->zb_asize + 1.0); 6742 (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:", 6743 (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); 6744 6745 if (spa_special_class(spa)->mc_allocator[0].mca_rotor != NULL) { 6746 uint64_t alloc = metaslab_class_get_alloc( 6747 spa_special_class(spa)); 6748 uint64_t space = metaslab_class_get_space( 6749 spa_special_class(spa)); 6750 6751 (void) printf("\t%-16s %14llu used: %5.2f%%\n", 6752 "Special class", (u_longlong_t)alloc, 6753 100.0 * alloc / space); 6754 } 6755 6756 if (spa_dedup_class(spa)->mc_allocator[0].mca_rotor != NULL) { 6757 uint64_t alloc = metaslab_class_get_alloc( 6758 spa_dedup_class(spa)); 6759 uint64_t space = metaslab_class_get_space( 6760 spa_dedup_class(spa)); 6761 6762 (void) printf("\t%-16s %14llu used: %5.2f%%\n", 6763 "Dedup class", (u_longlong_t)alloc, 6764 100.0 * alloc / space); 6765 } 6766 6767 if (spa_embedded_log_class(spa)->mc_allocator[0].mca_rotor != NULL) { 6768 uint64_t alloc = metaslab_class_get_alloc( 6769 spa_embedded_log_class(spa)); 6770 uint64_t space = metaslab_class_get_space( 6771 spa_embedded_log_class(spa)); 6772 6773 (void) printf("\t%-16s %14llu used: %5.2f%%\n", 6774 "Embedded log class", (u_longlong_t)alloc, 6775 100.0 * alloc / space); 6776 } 6777 6778 for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) { 6779 if (zcb->zcb_embedded_blocks[i] == 0) 6780 continue; 6781 (void) printf("\n"); 6782 (void) printf("\tadditional, non-pointer bps of type %u: " 6783 "%10llu\n", 6784 i, (u_longlong_t)zcb->zcb_embedded_blocks[i]); 6785 6786 if (dump_opt['b'] >= 3) { 6787 (void) printf("\t number of (compressed) bytes: " 6788 "number of bps\n"); 6789 dump_histogram(zcb->zcb_embedded_histogram[i], 6790 sizeof (zcb->zcb_embedded_histogram[i]) / 6791 sizeof (zcb->zcb_embedded_histogram[i][0]), 0); 6792 } 6793 } 6794 6795 if (tzb->zb_ditto_samevdev != 0) { 6796 (void) printf("\tDittoed blocks on same vdev: %llu\n", 6797 (longlong_t)tzb->zb_ditto_samevdev); 6798 } 6799 if (tzb->zb_ditto_same_ms != 0) { 6800 (void) printf("\tDittoed blocks in same metaslab: %llu\n", 6801 (longlong_t)tzb->zb_ditto_same_ms); 6802 } 6803 6804 for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) { 6805 vdev_t *vd = spa->spa_root_vdev->vdev_child[v]; 6806 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 6807 6808 if (vim == NULL) { 
	if (dump_opt['b'] >= 2) {
		int l, t, level;
		char csize[32], lsize[32], psize[32], asize[32];
		char avg[32], gang[32];
		(void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
		    "\t avg\t comp\t%%Total\tType\n");

		zfs_blkstat_t *mdstats = umem_zalloc(sizeof (zfs_blkstat_t),
		    UMEM_NOFAIL);

		for (t = 0; t <= ZDB_OT_TOTAL; t++) {
			const char *typename;

			/* make sure nicenum has enough space */
			_Static_assert(sizeof (csize) >= NN_NUMBUF_SZ,
			    "csize truncated");
			_Static_assert(sizeof (lsize) >= NN_NUMBUF_SZ,
			    "lsize truncated");
			_Static_assert(sizeof (psize) >= NN_NUMBUF_SZ,
			    "psize truncated");
			_Static_assert(sizeof (asize) >= NN_NUMBUF_SZ,
			    "asize truncated");
			_Static_assert(sizeof (avg) >= NN_NUMBUF_SZ,
			    "avg truncated");
			_Static_assert(sizeof (gang) >= NN_NUMBUF_SZ,
			    "gang truncated");

			if (t < DMU_OT_NUMTYPES)
				typename = dmu_ot[t].ot_name;
			else
				typename = zdb_ot_extname[t - DMU_OT_NUMTYPES];

			if (zcb->zcb_type[ZB_TOTAL][t].zb_asize == 0) {
				(void) printf("%6s\t%5s\t%5s\t%5s"
				    "\t%5s\t%5s\t%6s\t%s\n",
				    "-",
				    "-",
				    "-",
				    "-",
				    "-",
				    "-",
				    "-",
				    typename);
				continue;
			}

			for (l = ZB_TOTAL - 1; l >= -1; l--) {
				level = (l == -1 ? ZB_TOTAL : l);
				zb = &zcb->zcb_type[level][t];

				if (zb->zb_asize == 0)
					continue;

				if (level != ZB_TOTAL && t < DMU_OT_NUMTYPES &&
				    (level > 0 || DMU_OT_IS_METADATA(t))) {
					mdstats->zb_count += zb->zb_count;
					mdstats->zb_lsize += zb->zb_lsize;
					mdstats->zb_psize += zb->zb_psize;
					mdstats->zb_asize += zb->zb_asize;
					mdstats->zb_gangs += zb->zb_gangs;
				}

				if (dump_opt['b'] < 3 && level != ZB_TOTAL)
					continue;

				if (level == 0 && zb->zb_asize ==
				    zcb->zcb_type[ZB_TOTAL][t].zb_asize)
					continue;

				zdb_nicenum(zb->zb_count, csize,
				    sizeof (csize));
				zdb_nicenum(zb->zb_lsize, lsize,
				    sizeof (lsize));
				zdb_nicenum(zb->zb_psize, psize,
				    sizeof (psize));
				zdb_nicenum(zb->zb_asize, asize,
				    sizeof (asize));
				zdb_nicenum(zb->zb_asize / zb->zb_count, avg,
				    sizeof (avg));
				zdb_nicenum(zb->zb_gangs, gang, sizeof (gang));

				(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
				    "\t%5.2f\t%6.2f\t",
				    csize, lsize, psize, asize, avg,
				    (double)zb->zb_lsize / zb->zb_psize,
				    100.0 * zb->zb_asize / tzb->zb_asize);

				if (level == ZB_TOTAL)
					(void) printf("%s\n", typename);
				else
					(void) printf(" L%d %s\n",
					    level, typename);

				if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) {
					(void) printf("\t number of ganged "
					    "blocks: %s\n", gang);
				}

				if (dump_opt['b'] >= 4) {
					(void) printf("psize "
					    "(in 512-byte sectors): "
					    "number of blocks\n");
					dump_histogram(zb->zb_psize_histogram,
					    PSIZE_HISTO_SIZE, 0);
				}
			}
		}
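		/*
		 * Print the aggregate "Metadata Total" row computed in
		 * mdstats across the per-type loop above.
		 */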
		zdb_nicenum(mdstats->zb_count, csize,
		    sizeof (csize));
		zdb_nicenum(mdstats->zb_lsize, lsize,
		    sizeof (lsize));
		zdb_nicenum(mdstats->zb_psize, psize,
		    sizeof (psize));
		zdb_nicenum(mdstats->zb_asize, asize,
		    sizeof (asize));
		zdb_nicenum(mdstats->zb_asize / mdstats->zb_count, avg,
		    sizeof (avg));
		zdb_nicenum(mdstats->zb_gangs, gang, sizeof (gang));

		(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
		    "\t%5.2f\t%6.2f\t",
		    csize, lsize, psize, asize, avg,
		    (double)mdstats->zb_lsize / mdstats->zb_psize,
		    100.0 * mdstats->zb_asize / tzb->zb_asize);
		(void) printf("%s\n", "Metadata Total");

		/* Output a table summarizing block sizes in the pool */
		if (dump_opt['b'] >= 2) {
			dump_size_histograms(zcb);
		}

		umem_free(mdstats, sizeof (zfs_blkstat_t));
	}

	(void) printf("\n");

	if (leaks) {
		umem_free(zcb, sizeof (zdb_cb_t));
		return (2);
	}

	if (zcb->zcb_haderrors) {
		umem_free(zcb, sizeof (zdb_cb_t));
		return (3);
	}

	umem_free(zcb, sizeof (zdb_cb_t));
	return (0);
}

typedef struct zdb_ddt_entry {
	ddt_key_t	zdde_key;
	uint64_t	zdde_ref_blocks;
	uint64_t	zdde_ref_lsize;
	uint64_t	zdde_ref_psize;
	uint64_t	zdde_ref_dsize;
	avl_node_t	zdde_node;
} zdb_ddt_entry_t;

static int
zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	(void) zilog, (void) dnp;
	avl_tree_t *t = arg;
	avl_index_t where;
	zdb_ddt_entry_t *zdde, zdde_search;

	if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||
	    BP_IS_EMBEDDED(bp))
		return (0);

	if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
		(void) printf("traversing objset %llu, %llu objects, "
		    "%lu blocks so far\n",
		    (u_longlong_t)zb->zb_objset,
		    (u_longlong_t)BP_GET_FILL(bp),
		    avl_numnodes(t));
	}

	if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF ||
	    BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
		return (0);

	ddt_key_fill(&zdde_search.zdde_key, bp);

	zdde = avl_find(t, &zdde_search, &where);

	if (zdde == NULL) {
		zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL);
		zdde->zdde_key = zdde_search.zdde_key;
		avl_insert(t, zdde, where);
	}

	zdde->zdde_ref_blocks += 1;
	zdde->zdde_ref_lsize += BP_GET_LSIZE(bp);
	zdde->zdde_ref_psize += BP_GET_PSIZE(bp);
	zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp);

	return (0);
}
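/*
 * Implements zdb -S: traverse the pool as if every block were eligible
 * for dedup, accumulate would-be DDT entries in an AVL tree keyed by
 * block checksum, and print the simulated dedup histogram and ratio.
 */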
static void
dump_simulated_ddt(spa_t *spa)
{
	avl_tree_t t;
	void *cookie = NULL;
	zdb_ddt_entry_t *zdde;
	ddt_histogram_t ddh_total = {{{0}}};
	ddt_stat_t dds_total = {0};

	avl_create(&t, ddt_entry_compare,
	    sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node));

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	(void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
	    TRAVERSE_NO_DECRYPT, zdb_ddt_add_cb, &t);

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) {
		ddt_stat_t dds;
		uint64_t refcnt = zdde->zdde_ref_blocks;
		ASSERT(refcnt != 0);

		dds.dds_blocks = zdde->zdde_ref_blocks / refcnt;
		dds.dds_lsize = zdde->zdde_ref_lsize / refcnt;
		dds.dds_psize = zdde->zdde_ref_psize / refcnt;
		dds.dds_dsize = zdde->zdde_ref_dsize / refcnt;

		dds.dds_ref_blocks = zdde->zdde_ref_blocks;
		dds.dds_ref_lsize = zdde->zdde_ref_lsize;
		dds.dds_ref_psize = zdde->zdde_ref_psize;
		dds.dds_ref_dsize = zdde->zdde_ref_dsize;

		ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1],
		    &dds, 0);

		umem_free(zdde, sizeof (*zdde));
	}

	avl_destroy(&t);

	ddt_histogram_stat(&dds_total, &ddh_total);

	(void) printf("Simulated DDT histogram:\n");

	zpool_dump_ddt(&dds_total, &ddh_total);

	dump_dedup_ratio(&dds_total);
}
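/*
 * Sanity-check the refcounts of the device_removal and obsolete_counts
 * features against the number of MOS objects that should be contributing
 * to them (indirect vdevs, obsolete space maps and counts, condensing
 * state, the obsolete bpobj, and remap deadlists). Returns non-zero if
 * either refcount does not match.
 */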
static int
verify_device_removal_feature_counts(spa_t *spa)
{
	uint64_t dr_feature_refcount = 0;
	uint64_t oc_feature_refcount = 0;
	uint64_t indirect_vdev_count = 0;
	uint64_t precise_vdev_count = 0;
	uint64_t obsolete_counts_object_count = 0;
	uint64_t obsolete_sm_count = 0;
	uint64_t obsolete_counts_count = 0;
	uint64_t scip_count = 0;
	uint64_t obsolete_bpobj_count = 0;
	int ret = 0;

	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	if (scip->scip_next_mapping_object != 0) {
		vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev];
		ASSERT(scip->scip_prev_obsolete_sm_object != 0);
		ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);

		(void) printf("Condensing indirect vdev %llu: new mapping "
		    "object %llu, prev obsolete sm %llu\n",
		    (u_longlong_t)scip->scip_vdev,
		    (u_longlong_t)scip->scip_next_mapping_object,
		    (u_longlong_t)scip->scip_prev_obsolete_sm_object);
		if (scip->scip_prev_obsolete_sm_object != 0) {
			space_map_t *prev_obsolete_sm = NULL;
			VERIFY0(space_map_open(&prev_obsolete_sm,
			    spa->spa_meta_objset,
			    scip->scip_prev_obsolete_sm_object,
			    0, vd->vdev_asize, 0));
			dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm);
			(void) printf("\n");
			space_map_close(prev_obsolete_sm);
		}

		scip_count += 2;
	}

	for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
		vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
		vdev_indirect_config_t *vic = &vd->vdev_indirect_config;

		if (vic->vic_mapping_object != 0) {
			ASSERT(vd->vdev_ops == &vdev_indirect_ops ||
			    vd->vdev_removing);
			indirect_vdev_count++;

			if (vd->vdev_indirect_mapping->vim_havecounts) {
				obsolete_counts_count++;
			}
		}

		boolean_t are_precise;
		VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
		if (are_precise) {
			ASSERT(vic->vic_mapping_object != 0);
			precise_vdev_count++;
		}

		uint64_t obsolete_sm_object;
		VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
		if (obsolete_sm_object != 0) {
			ASSERT(vic->vic_mapping_object != 0);
			obsolete_sm_count++;
		}
	}

	(void) feature_get_refcount(spa,
	    &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL],
	    &dr_feature_refcount);
	(void) feature_get_refcount(spa,
	    &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS],
	    &oc_feature_refcount);

	if (dr_feature_refcount != indirect_vdev_count) {
		ret = 1;
		(void) printf("Number of indirect vdevs (%llu) "
		    "does not match feature count (%llu)\n",
		    (u_longlong_t)indirect_vdev_count,
		    (u_longlong_t)dr_feature_refcount);
	} else {
		(void) printf("Verified device_removal feature refcount "
		    "of %llu is correct\n",
		    (u_longlong_t)dr_feature_refcount);
	}

	if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_OBSOLETE_BPOBJ) == 0) {
		obsolete_bpobj_count++;
	}

	obsolete_counts_object_count = precise_vdev_count;
	obsolete_counts_object_count += obsolete_sm_count;
	obsolete_counts_object_count += obsolete_counts_count;
	obsolete_counts_object_count += scip_count;
	obsolete_counts_object_count += obsolete_bpobj_count;
	obsolete_counts_object_count += remap_deadlist_count;

	if (oc_feature_refcount != obsolete_counts_object_count) {
		ret = 1;
		(void) printf("Number of obsolete counts objects (%llu) "
		    "does not match feature count (%llu)\n",
		    (u_longlong_t)obsolete_counts_object_count,
		    (u_longlong_t)oc_feature_refcount);
		(void) printf("pv:%llu os:%llu oc:%llu sc:%llu "
		    "ob:%llu rd:%llu\n",
		    (u_longlong_t)precise_vdev_count,
		    (u_longlong_t)obsolete_sm_count,
		    (u_longlong_t)obsolete_counts_count,
		    (u_longlong_t)scip_count,
		    (u_longlong_t)obsolete_bpobj_count,
		    (u_longlong_t)remap_deadlist_count);
	} else {
		(void) printf("Verified indirect_refcount feature refcount "
		    "of %llu is correct\n",
		    (u_longlong_t)oc_feature_refcount);
	}
	return (ret);
}

static void
zdb_set_skip_mmp(char *target)
{
	spa_t *spa;

	/*
	 * Disable the activity check to allow examination of
	 * active pools.
	 */
	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(target)) != NULL) {
		spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP;
	}
	mutex_exit(&spa_namespace_lock);
}

#define	BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE"
/*
 * Import the checkpointed state of the pool specified by the target
 * parameter as readonly. The function also accepts a pool config
 * as an optional parameter; if it is not provided, the config is
 * inferred from the name of the target pool.
 *
 * Note that the checkpointed state's pool name will be the name of
 * the original pool with the above suffix appended to it. In addition,
 * if the target is not a pool name (e.g. a path to a dataset) then
 * the new_path parameter is populated with the updated path to
 * reflect the fact that we are looking into the checkpointed state.
 *
 * The function returns a newly-allocated copy of the name of the
 * pool containing the checkpointed state. When this copy is no
 * longer needed it should be freed with free(3C). Same thing
 * applies to the new_path parameter if allocated.
 */
static char *
import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
{
	int error = 0;
	char *poolname, *bogus_name = NULL;
	boolean_t freecfg = B_FALSE;

	/* If the target is not a pool, then extract the pool name */
	char *path_start = strchr(target, '/');
	if (path_start != NULL) {
		size_t poolname_len = path_start - target;
		poolname = strndup(target, poolname_len);
	} else {
		poolname = target;
	}

	if (cfg == NULL) {
		zdb_set_skip_mmp(poolname);
		error = spa_get_stats(poolname, &cfg, NULL, 0);
		if (error != 0) {
			fatal("Tried to read config of pool \"%s\" but "
			    "spa_get_stats() failed with error %d\n",
			    poolname, error);
		}
		freecfg = B_TRUE;
	}

	if (asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1) {
		if (target != poolname)
			free(poolname);
		return (NULL);
	}
	fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name);

	error = spa_import(bogus_name, cfg, NULL,
	    ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT |
	    ZFS_IMPORT_SKIP_MMP);
	if (freecfg)
		nvlist_free(cfg);
	if (error != 0) {
		fatal("Tried to import pool \"%s\" but spa_import() failed "
		    "with error %d\n", bogus_name, error);
	}

	if (new_path != NULL && path_start != NULL) {
		if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) {
			free(bogus_name);
			if (path_start != NULL)
				free(poolname);
			return (NULL);
		}
	}

	if (target != poolname)
		free(poolname);

	return (bogus_name);
}

typedef struct verify_checkpoint_sm_entry_cb_arg {
	vdev_t *vcsec_vd;

	/* the following fields are only used for printing progress */
	uint64_t vcsec_entryid;
	uint64_t vcsec_num_entries;
} verify_checkpoint_sm_entry_cb_arg_t;

#define	ENTRIES_PER_PROGRESS_UPDATE 10000

static int
verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg)
{
	verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg;
	vdev_t *vd = vcsec->vcsec_vd;
	metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
	uint64_t end = sme->sme_offset + sme->sme_run;

	ASSERT(sme->sme_type == SM_FREE);

	if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) {
		(void) fprintf(stderr,
		    "\rverifying vdev %llu, space map entry %llu of %llu ...",
		    (longlong_t)vd->vdev_id,
		    (longlong_t)vcsec->vcsec_entryid,
		    (longlong_t)vcsec->vcsec_num_entries);
	}
	vcsec->vcsec_entryid++;

	/*
	 * See comment in checkpoint_sm_exclude_entry_cb()
	 */
	VERIFY3U(sme->sme_offset, >=, ms->ms_start);
	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);

	/*
	 * The entries in the vdev_checkpoint_sm should be marked as
	 * allocated in the checkpointed state of the pool, therefore
	 * their respective ms_allocatable trees should not contain them.
	 */
	mutex_enter(&ms->ms_lock);
	range_tree_verify_not_present(ms->ms_allocatable,
	    sme->sme_offset, sme->sme_run);
	mutex_exit(&ms->ms_lock);

	return (0);
}
7343 * 7344 * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of 7345 * each vdev in the current state of the pool to the metaslab space maps 7346 * (ms_sm) of the checkpointed state of the pool. 7347 * 7348 * Note that the function changes the state of the ms_allocatable 7349 * trees of the current spa_t. The entries of these ms_allocatable 7350 * trees are cleared out and then repopulated from with the free 7351 * entries of their respective ms_sm space maps. 7352 */ 7353 static void 7354 verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current) 7355 { 7356 vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev; 7357 vdev_t *current_rvd = current->spa_root_vdev; 7358 7359 load_concrete_ms_allocatable_trees(checkpoint, SM_FREE); 7360 7361 for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) { 7362 vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c]; 7363 vdev_t *current_vd = current_rvd->vdev_child[c]; 7364 7365 space_map_t *checkpoint_sm = NULL; 7366 uint64_t checkpoint_sm_obj; 7367 7368 if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) { 7369 /* 7370 * Since we don't allow device removal in a pool 7371 * that has a checkpoint, we expect that all removed 7372 * vdevs were removed from the pool before the 7373 * checkpoint. 7374 */ 7375 ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops); 7376 continue; 7377 } 7378 7379 /* 7380 * If the checkpoint space map doesn't exist, then nothing 7381 * here is checkpointed so there's nothing to verify. 7382 */ 7383 if (current_vd->vdev_top_zap == 0 || 7384 zap_contains(spa_meta_objset(current), 7385 current_vd->vdev_top_zap, 7386 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) 7387 continue; 7388 7389 VERIFY0(zap_lookup(spa_meta_objset(current), 7390 current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, 7391 sizeof (uint64_t), 1, &checkpoint_sm_obj)); 7392 7393 VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current), 7394 checkpoint_sm_obj, 0, current_vd->vdev_asize, 7395 current_vd->vdev_ashift)); 7396 7397 verify_checkpoint_sm_entry_cb_arg_t vcsec; 7398 vcsec.vcsec_vd = ckpoint_vd; 7399 vcsec.vcsec_entryid = 0; 7400 vcsec.vcsec_num_entries = 7401 space_map_length(checkpoint_sm) / sizeof (uint64_t); 7402 VERIFY0(space_map_iterate(checkpoint_sm, 7403 space_map_length(checkpoint_sm), 7404 verify_checkpoint_sm_entry_cb, &vcsec)); 7405 if (dump_opt['m'] > 3) 7406 dump_spacemap(current->spa_meta_objset, checkpoint_sm); 7407 space_map_close(checkpoint_sm); 7408 } 7409 7410 /* 7411 * If we've added vdevs since we took the checkpoint, ensure 7412 * that their checkpoint space maps are empty. 7413 */ 7414 if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) { 7415 for (uint64_t c = ckpoint_rvd->vdev_children; 7416 c < current_rvd->vdev_children; c++) { 7417 vdev_t *current_vd = current_rvd->vdev_child[c]; 7418 VERIFY3P(current_vd->vdev_checkpoint_sm, ==, NULL); 7419 } 7420 } 7421 7422 /* for cleaner progress output */ 7423 (void) fprintf(stderr, "\n"); 7424 } 7425 7426 /* 7427 * Verifies that all space that's allocated in the checkpoint is 7428 * still allocated in the current version, by checking that everything 7429 * in checkpoint's ms_allocatable (which is actually allocated, not 7430 * allocatable/free) is not present in current's ms_allocatable. 7431 * 7432 * Note that the function changes the state of the ms_allocatable 7433 * trees of both spas when called. The entries of all ms_allocatable 7434 * trees are cleared out and then repopulated from their respective 7435 * ms_sm space maps. 
/*
 * Verifies that all space that's allocated in the checkpoint is
 * still allocated in the current version, by checking that everything
 * in checkpoint's ms_allocatable (which is actually allocated, not
 * allocatable/free) is not present in current's ms_allocatable.
 *
 * Note that the function changes the state of the ms_allocatable
 * trees of both spas when called. The entries of all ms_allocatable
 * trees are cleared out and then repopulated from their respective
 * ms_sm space maps. In the checkpointed state we load the allocated
 * entries, and in the current state we load the free entries.
 */
static void
verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
{
	vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
	vdev_t *current_rvd = current->spa_root_vdev;

	load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC);
	load_concrete_ms_allocatable_trees(current, SM_FREE);

	for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) {
		vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i];
		vdev_t *current_vd = current_rvd->vdev_child[i];

		if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
			/*
			 * See comment in verify_checkpoint_vdev_spacemaps()
			 */
			ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
			continue;
		}

		for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) {
			metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m];
			metaslab_t *current_msp = current_vd->vdev_ms[m];

			(void) fprintf(stderr,
			    "\rverifying vdev %llu of %llu, "
			    "metaslab %llu of %llu ...",
			    (longlong_t)current_vd->vdev_id,
			    (longlong_t)current_rvd->vdev_children,
			    (longlong_t)current_vd->vdev_ms[m]->ms_id,
			    (longlong_t)current_vd->vdev_ms_count);

			/*
			 * We walk through the ms_allocatable trees that
			 * are loaded with the allocated blocks from the
			 * ms_sm spacemaps of the checkpoint. For each
			 * one of these ranges we ensure that none of them
			 * exists in the ms_allocatable trees of the
			 * current state which are loaded with the ranges
			 * that are currently free.
			 *
			 * This way we ensure that none of the blocks that
			 * are part of the checkpoint were freed by mistake.
			 */
			range_tree_walk(ckpoint_msp->ms_allocatable,
			    (range_tree_func_t *)range_tree_verify_not_present,
			    current_msp->ms_allocatable);
		}
	}

	/* for cleaner progress output */
	(void) fprintf(stderr, "\n");
}
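/*
 * Entry point for checkpoint verification: import the checkpointed state
 * of the pool under a bogus name, then verify it against the current
 * state using the two helpers above.
 */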
7533 */
7534 spa_close(checkpoint_spa, FTAG);
7535 free(checkpoint_pool);
7536 }
7537
7538 static void
7539 dump_leftover_checkpoint_blocks(spa_t *spa)
7540 {
7541 vdev_t *rvd = spa->spa_root_vdev;
7542
7543 for (uint64_t i = 0; i < rvd->vdev_children; i++) {
7544 vdev_t *vd = rvd->vdev_child[i];
7545
7546 space_map_t *checkpoint_sm = NULL;
7547 uint64_t checkpoint_sm_obj;
7548
7549 if (vd->vdev_top_zap == 0)
7550 continue;
7551
7552 if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
7553 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
7554 continue;
7555
7556 VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
7557 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
7558 sizeof (uint64_t), 1, &checkpoint_sm_obj));
7559
7560 VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
7561 checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
7562 dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
7563 space_map_close(checkpoint_sm);
7564 }
7565 }
7566
7567 static int
7568 verify_checkpoint(spa_t *spa)
7569 {
7570 uberblock_t checkpoint;
7571 int error;
7572
7573 if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
7574 return (0);
7575
7576 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
7577 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
7578 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
7579
7580 if (error == ENOENT && !dump_opt['L']) {
7581 /*
7582 * If the feature is active but the uberblock is missing
7583 * then we must be in the middle of discarding the
7584 * checkpoint.
7585 */
7586 (void) printf("\nPartially discarded checkpoint "
7587 "state found:\n");
7588 if (dump_opt['m'] > 3)
7589 dump_leftover_checkpoint_blocks(spa);
7590 return (0);
7591 } else if (error != 0) {
7592 (void) printf("lookup error %d when looking for "
7593 "checkpointed uberblock in MOS\n", error);
7594 return (error);
7595 }
7596 dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n");
7597
7598 if (checkpoint.ub_checkpoint_txg == 0) {
7599 (void) printf("\nub_checkpoint_txg not set in checkpointed "
7600 "uberblock\n");
7601 error = 3;
7602 }
7603
7604 if (error == 0 && !dump_opt['L'])
7605 verify_checkpoint_blocks(spa);
7606
7607 return (error);
7608 }
7609
7610 static void
7611 mos_leaks_cb(void *arg, uint64_t start, uint64_t size)
7612 {
7613 (void) arg;
7614 for (uint64_t i = start; i < start + size; i++) {
7615 (void) printf("MOS object %llu referenced but not allocated\n",
7616 (u_longlong_t)i);
7617 }
7618 }
7619
7620 static void
7621 mos_obj_refd(uint64_t obj)
7622 {
7623 if (obj != 0 && mos_refd_objs != NULL)
7624 range_tree_add(mos_refd_objs, obj, 1);
7625 }
7626
7627 /*
7628 * Call on a MOS object that may already have been referenced.
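 * Unlike mos_obj_refd(), this checks for an existing reference first,
 * since adding the same object to the mos_refd_objs range tree twice
 * would create overlapping segments, which range trees do not allow.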
7629 */ 7630 static void 7631 mos_obj_refd_multiple(uint64_t obj) 7632 { 7633 if (obj != 0 && mos_refd_objs != NULL && 7634 !range_tree_contains(mos_refd_objs, obj, 1)) 7635 range_tree_add(mos_refd_objs, obj, 1); 7636 } 7637 7638 static void 7639 mos_leak_vdev_top_zap(vdev_t *vd) 7640 { 7641 uint64_t ms_flush_data_obj; 7642 int error = zap_lookup(spa_meta_objset(vd->vdev_spa), 7643 vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, 7644 sizeof (ms_flush_data_obj), 1, &ms_flush_data_obj); 7645 if (error == ENOENT) 7646 return; 7647 ASSERT0(error); 7648 7649 mos_obj_refd(ms_flush_data_obj); 7650 } 7651 7652 static void 7653 mos_leak_vdev(vdev_t *vd) 7654 { 7655 mos_obj_refd(vd->vdev_dtl_object); 7656 mos_obj_refd(vd->vdev_ms_array); 7657 mos_obj_refd(vd->vdev_indirect_config.vic_births_object); 7658 mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object); 7659 mos_obj_refd(vd->vdev_leaf_zap); 7660 if (vd->vdev_checkpoint_sm != NULL) 7661 mos_obj_refd(vd->vdev_checkpoint_sm->sm_object); 7662 if (vd->vdev_indirect_mapping != NULL) { 7663 mos_obj_refd(vd->vdev_indirect_mapping-> 7664 vim_phys->vimp_counts_object); 7665 } 7666 if (vd->vdev_obsolete_sm != NULL) 7667 mos_obj_refd(vd->vdev_obsolete_sm->sm_object); 7668 7669 for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { 7670 metaslab_t *ms = vd->vdev_ms[m]; 7671 mos_obj_refd(space_map_object(ms->ms_sm)); 7672 } 7673 7674 if (vd->vdev_root_zap != 0) 7675 mos_obj_refd(vd->vdev_root_zap); 7676 7677 if (vd->vdev_top_zap != 0) { 7678 mos_obj_refd(vd->vdev_top_zap); 7679 mos_leak_vdev_top_zap(vd); 7680 } 7681 7682 for (uint64_t c = 0; c < vd->vdev_children; c++) { 7683 mos_leak_vdev(vd->vdev_child[c]); 7684 } 7685 } 7686 7687 static void 7688 mos_leak_log_spacemaps(spa_t *spa) 7689 { 7690 uint64_t spacemap_zap; 7691 int error = zap_lookup(spa_meta_objset(spa), 7692 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP, 7693 sizeof (spacemap_zap), 1, &spacemap_zap); 7694 if (error == ENOENT) 7695 return; 7696 ASSERT0(error); 7697 7698 mos_obj_refd(spacemap_zap); 7699 for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); 7700 sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) 7701 mos_obj_refd(sls->sls_sm_obj); 7702 } 7703 7704 static void 7705 errorlog_count_refd(objset_t *mos, uint64_t errlog) 7706 { 7707 zap_cursor_t zc; 7708 zap_attribute_t za; 7709 for (zap_cursor_init(&zc, mos, errlog); 7710 zap_cursor_retrieve(&zc, &za) == 0; 7711 zap_cursor_advance(&zc)) { 7712 mos_obj_refd(za.za_first_integer); 7713 } 7714 zap_cursor_fini(&zc); 7715 } 7716 7717 static int 7718 dump_mos_leaks(spa_t *spa) 7719 { 7720 int rv = 0; 7721 objset_t *mos = spa->spa_meta_objset; 7722 dsl_pool_t *dp = spa->spa_dsl_pool; 7723 7724 /* Visit and mark all referenced objects in the MOS */ 7725 7726 mos_obj_refd(DMU_POOL_DIRECTORY_OBJECT); 7727 mos_obj_refd(spa->spa_pool_props_object); 7728 mos_obj_refd(spa->spa_config_object); 7729 mos_obj_refd(spa->spa_ddt_stat_object); 7730 mos_obj_refd(spa->spa_feat_desc_obj); 7731 mos_obj_refd(spa->spa_feat_enabled_txg_obj); 7732 mos_obj_refd(spa->spa_feat_for_read_obj); 7733 mos_obj_refd(spa->spa_feat_for_write_obj); 7734 mos_obj_refd(spa->spa_history); 7735 mos_obj_refd(spa->spa_errlog_last); 7736 mos_obj_refd(spa->spa_errlog_scrub); 7737 7738 if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { 7739 errorlog_count_refd(mos, spa->spa_errlog_last); 7740 errorlog_count_refd(mos, spa->spa_errlog_scrub); 7741 } 7742 7743 mos_obj_refd(spa->spa_all_vdev_zaps); 7744 mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj); 7745 
mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj); 7746 mos_obj_refd(spa->spa_dsl_pool->dp_scan->scn_phys.scn_queue_obj); 7747 bpobj_count_refd(&spa->spa_deferred_bpobj); 7748 mos_obj_refd(dp->dp_empty_bpobj); 7749 bpobj_count_refd(&dp->dp_obsolete_bpobj); 7750 bpobj_count_refd(&dp->dp_free_bpobj); 7751 mos_obj_refd(spa->spa_l2cache.sav_object); 7752 mos_obj_refd(spa->spa_spares.sav_object); 7753 7754 if (spa->spa_syncing_log_sm != NULL) 7755 mos_obj_refd(spa->spa_syncing_log_sm->sm_object); 7756 mos_leak_log_spacemaps(spa); 7757 7758 mos_obj_refd(spa->spa_condensing_indirect_phys. 7759 scip_next_mapping_object); 7760 mos_obj_refd(spa->spa_condensing_indirect_phys. 7761 scip_prev_obsolete_sm_object); 7762 if (spa->spa_condensing_indirect_phys.scip_next_mapping_object != 0) { 7763 vdev_indirect_mapping_t *vim = 7764 vdev_indirect_mapping_open(mos, 7765 spa->spa_condensing_indirect_phys.scip_next_mapping_object); 7766 mos_obj_refd(vim->vim_phys->vimp_counts_object); 7767 vdev_indirect_mapping_close(vim); 7768 } 7769 deleted_livelists_dump_mos(spa); 7770 7771 if (dp->dp_origin_snap != NULL) { 7772 dsl_dataset_t *ds; 7773 7774 dsl_pool_config_enter(dp, FTAG); 7775 VERIFY0(dsl_dataset_hold_obj(dp, 7776 dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj, 7777 FTAG, &ds)); 7778 count_ds_mos_objects(ds); 7779 dump_blkptr_list(&ds->ds_deadlist, "Deadlist"); 7780 dsl_dataset_rele(ds, FTAG); 7781 dsl_pool_config_exit(dp, FTAG); 7782 7783 count_ds_mos_objects(dp->dp_origin_snap); 7784 dump_blkptr_list(&dp->dp_origin_snap->ds_deadlist, "Deadlist"); 7785 } 7786 count_dir_mos_objects(dp->dp_mos_dir); 7787 if (dp->dp_free_dir != NULL) 7788 count_dir_mos_objects(dp->dp_free_dir); 7789 if (dp->dp_leak_dir != NULL) 7790 count_dir_mos_objects(dp->dp_leak_dir); 7791 7792 mos_leak_vdev(spa->spa_root_vdev); 7793 7794 for (uint64_t class = 0; class < DDT_CLASSES; class++) { 7795 for (uint64_t type = 0; type < DDT_TYPES; type++) { 7796 for (uint64_t cksum = 0; 7797 cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) { 7798 ddt_t *ddt = spa->spa_ddt[cksum]; 7799 mos_obj_refd(ddt->ddt_object[type][class]); 7800 } 7801 } 7802 } 7803 7804 /* 7805 * Visit all allocated objects and make sure they are referenced. 
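 * Any object that remains in mos_refd_objs after this walk was
 * referenced but never allocated; mos_leaks_cb() reports those below.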
7806 */ 7807 uint64_t object = 0; 7808 while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) { 7809 if (range_tree_contains(mos_refd_objs, object, 1)) { 7810 range_tree_remove(mos_refd_objs, object, 1); 7811 } else { 7812 dmu_object_info_t doi; 7813 const char *name; 7814 VERIFY0(dmu_object_info(mos, object, &doi)); 7815 if (doi.doi_type & DMU_OT_NEWTYPE) { 7816 dmu_object_byteswap_t bswap = 7817 DMU_OT_BYTESWAP(doi.doi_type); 7818 name = dmu_ot_byteswap[bswap].ob_name; 7819 } else { 7820 name = dmu_ot[doi.doi_type].ot_name; 7821 } 7822 7823 (void) printf("MOS object %llu (%s) leaked\n", 7824 (u_longlong_t)object, name); 7825 rv = 2; 7826 } 7827 } 7828 (void) range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL); 7829 if (!range_tree_is_empty(mos_refd_objs)) 7830 rv = 2; 7831 range_tree_vacate(mos_refd_objs, NULL, NULL); 7832 range_tree_destroy(mos_refd_objs); 7833 return (rv); 7834 } 7835 7836 typedef struct log_sm_obsolete_stats_arg { 7837 uint64_t lsos_current_txg; 7838 7839 uint64_t lsos_total_entries; 7840 uint64_t lsos_valid_entries; 7841 7842 uint64_t lsos_sm_entries; 7843 uint64_t lsos_valid_sm_entries; 7844 } log_sm_obsolete_stats_arg_t; 7845 7846 static int 7847 log_spacemap_obsolete_stats_cb(spa_t *spa, space_map_entry_t *sme, 7848 uint64_t txg, void *arg) 7849 { 7850 log_sm_obsolete_stats_arg_t *lsos = arg; 7851 7852 uint64_t offset = sme->sme_offset; 7853 uint64_t vdev_id = sme->sme_vdev; 7854 7855 if (lsos->lsos_current_txg == 0) { 7856 /* this is the first log */ 7857 lsos->lsos_current_txg = txg; 7858 } else if (lsos->lsos_current_txg < txg) { 7859 /* we just changed log - print stats and reset */ 7860 (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n", 7861 (u_longlong_t)lsos->lsos_valid_sm_entries, 7862 (u_longlong_t)lsos->lsos_sm_entries, 7863 (u_longlong_t)lsos->lsos_current_txg); 7864 lsos->lsos_valid_sm_entries = 0; 7865 lsos->lsos_sm_entries = 0; 7866 lsos->lsos_current_txg = txg; 7867 } 7868 ASSERT3U(lsos->lsos_current_txg, ==, txg); 7869 7870 lsos->lsos_sm_entries++; 7871 lsos->lsos_total_entries++; 7872 7873 vdev_t *vd = vdev_lookup_top(spa, vdev_id); 7874 if (!vdev_is_concrete(vd)) 7875 return (0); 7876 7877 metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 7878 ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); 7879 7880 if (txg < metaslab_unflushed_txg(ms)) 7881 return (0); 7882 lsos->lsos_valid_sm_entries++; 7883 lsos->lsos_valid_entries++; 7884 return (0); 7885 } 7886 7887 static void 7888 dump_log_spacemap_obsolete_stats(spa_t *spa) 7889 { 7890 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) 7891 return; 7892 7893 log_sm_obsolete_stats_arg_t lsos = {0}; 7894 7895 (void) printf("Log Space Map Obsolete Entry Statistics:\n"); 7896 7897 iterate_through_spacemap_logs(spa, 7898 log_spacemap_obsolete_stats_cb, &lsos); 7899 7900 /* print stats for latest log */ 7901 (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n", 7902 (u_longlong_t)lsos.lsos_valid_sm_entries, 7903 (u_longlong_t)lsos.lsos_sm_entries, 7904 (u_longlong_t)lsos.lsos_current_txg); 7905 7906 (void) printf("%-8llu valid entries out of %-8llu - total\n\n", 7907 (u_longlong_t)lsos.lsos_valid_entries, 7908 (u_longlong_t)lsos.lsos_total_entries); 7909 } 7910 7911 static void 7912 dump_zpool(spa_t *spa) 7913 { 7914 dsl_pool_t *dp = spa_get_dsl(spa); 7915 int rc = 0; 7916 7917 if (dump_opt['y']) { 7918 livelist_metaslab_validate(spa); 7919 } 7920 7921 if (dump_opt['S']) { 7922 dump_simulated_ddt(spa); 7923 return; 7924 } 7925 7926 if (!dump_opt['e'] && 
dump_opt['C'] > 1) { 7927 (void) printf("\nCached configuration:\n"); 7928 dump_nvlist(spa->spa_config, 8); 7929 } 7930 7931 if (dump_opt['C']) 7932 dump_config(spa); 7933 7934 if (dump_opt['u']) 7935 dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n"); 7936 7937 if (dump_opt['D']) 7938 dump_all_ddts(spa); 7939 7940 if (dump_opt['d'] > 2 || dump_opt['m']) 7941 dump_metaslabs(spa); 7942 if (dump_opt['M']) 7943 dump_metaslab_groups(spa, dump_opt['M'] > 1); 7944 if (dump_opt['d'] > 2 || dump_opt['m']) { 7945 dump_log_spacemaps(spa); 7946 dump_log_spacemap_obsolete_stats(spa); 7947 } 7948 7949 if (dump_opt['d'] || dump_opt['i']) { 7950 spa_feature_t f; 7951 mos_refd_objs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 7952 0); 7953 dump_objset(dp->dp_meta_objset); 7954 7955 if (dump_opt['d'] >= 3) { 7956 dsl_pool_t *dp = spa->spa_dsl_pool; 7957 dump_full_bpobj(&spa->spa_deferred_bpobj, 7958 "Deferred frees", 0); 7959 if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { 7960 dump_full_bpobj(&dp->dp_free_bpobj, 7961 "Pool snapshot frees", 0); 7962 } 7963 if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { 7964 ASSERT(spa_feature_is_enabled(spa, 7965 SPA_FEATURE_DEVICE_REMOVAL)); 7966 dump_full_bpobj(&dp->dp_obsolete_bpobj, 7967 "Pool obsolete blocks", 0); 7968 } 7969 7970 if (spa_feature_is_active(spa, 7971 SPA_FEATURE_ASYNC_DESTROY)) { 7972 dump_bptree(spa->spa_meta_objset, 7973 dp->dp_bptree_obj, 7974 "Pool dataset frees"); 7975 } 7976 dump_dtl(spa->spa_root_vdev, 0); 7977 } 7978 7979 for (spa_feature_t f = 0; f < SPA_FEATURES; f++) 7980 global_feature_count[f] = UINT64_MAX; 7981 global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0; 7982 global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0; 7983 global_feature_count[SPA_FEATURE_LIVELIST] = 0; 7984 7985 (void) dmu_objset_find(spa_name(spa), dump_one_objset, 7986 NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); 7987 7988 if (rc == 0 && !dump_opt['L']) 7989 rc = dump_mos_leaks(spa); 7990 7991 for (f = 0; f < SPA_FEATURES; f++) { 7992 uint64_t refcount; 7993 7994 uint64_t *arr; 7995 if (!(spa_feature_table[f].fi_flags & 7996 ZFEATURE_FLAG_PER_DATASET)) { 7997 if (global_feature_count[f] == UINT64_MAX) 7998 continue; 7999 if (!spa_feature_is_enabled(spa, f)) { 8000 ASSERT0(global_feature_count[f]); 8001 continue; 8002 } 8003 arr = global_feature_count; 8004 } else { 8005 if (!spa_feature_is_enabled(spa, f)) { 8006 ASSERT0(dataset_feature_count[f]); 8007 continue; 8008 } 8009 arr = dataset_feature_count; 8010 } 8011 if (feature_get_refcount(spa, &spa_feature_table[f], 8012 &refcount) == ENOTSUP) 8013 continue; 8014 if (arr[f] != refcount) { 8015 (void) printf("%s feature refcount mismatch: " 8016 "%lld consumers != %lld refcount\n", 8017 spa_feature_table[f].fi_uname, 8018 (longlong_t)arr[f], (longlong_t)refcount); 8019 rc = 2; 8020 } else { 8021 (void) printf("Verified %s feature refcount " 8022 "of %llu is correct\n", 8023 spa_feature_table[f].fi_uname, 8024 (longlong_t)refcount); 8025 } 8026 } 8027 8028 if (rc == 0) 8029 rc = verify_device_removal_feature_counts(spa); 8030 } 8031 8032 if (rc == 0 && (dump_opt['b'] || dump_opt['c'])) 8033 rc = dump_block_stats(spa); 8034 8035 if (rc == 0) 8036 rc = verify_spacemap_refcounts(spa); 8037 8038 if (dump_opt['s']) 8039 show_pool_stats(spa); 8040 8041 if (dump_opt['h']) 8042 dump_history(spa); 8043 8044 if (rc == 0) 8045 rc = verify_checkpoint(spa); 8046 8047 if (rc != 0) { 8048 dump_debug_buffer(); 8049 exit(rc); 8050 } 8051 } 8052 8053 #define ZDB_FLAG_CHECKSUM 0x0001 8054 #define ZDB_FLAG_DECOMPRESS 
0x0002 8055 #define ZDB_FLAG_BSWAP 0x0004 8056 #define ZDB_FLAG_GBH 0x0008 8057 #define ZDB_FLAG_INDIRECT 0x0010 8058 #define ZDB_FLAG_RAW 0x0020 8059 #define ZDB_FLAG_PRINT_BLKPTR 0x0040 8060 #define ZDB_FLAG_VERBOSE 0x0080 8061 8062 static int flagbits[256]; 8063 static char flagbitstr[16]; 8064 8065 static void 8066 zdb_print_blkptr(const blkptr_t *bp, int flags) 8067 { 8068 char blkbuf[BP_SPRINTF_LEN]; 8069 8070 if (flags & ZDB_FLAG_BSWAP) 8071 byteswap_uint64_array((void *)bp, sizeof (blkptr_t)); 8072 8073 snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); 8074 (void) printf("%s\n", blkbuf); 8075 } 8076 8077 static void 8078 zdb_dump_indirect(blkptr_t *bp, int nbps, int flags) 8079 { 8080 int i; 8081 8082 for (i = 0; i < nbps; i++) 8083 zdb_print_blkptr(&bp[i], flags); 8084 } 8085 8086 static void 8087 zdb_dump_gbh(void *buf, int flags) 8088 { 8089 zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags); 8090 } 8091 8092 static void 8093 zdb_dump_block_raw(void *buf, uint64_t size, int flags) 8094 { 8095 if (flags & ZDB_FLAG_BSWAP) 8096 byteswap_uint64_array(buf, size); 8097 VERIFY(write(fileno(stdout), buf, size) == size); 8098 } 8099 8100 static void 8101 zdb_dump_block(char *label, void *buf, uint64_t size, int flags) 8102 { 8103 uint64_t *d = (uint64_t *)buf; 8104 unsigned nwords = size / sizeof (uint64_t); 8105 int do_bswap = !!(flags & ZDB_FLAG_BSWAP); 8106 unsigned i, j; 8107 const char *hdr; 8108 char *c; 8109 8110 8111 if (do_bswap) 8112 hdr = " 7 6 5 4 3 2 1 0 f e d c b a 9 8"; 8113 else 8114 hdr = " 0 1 2 3 4 5 6 7 8 9 a b c d e f"; 8115 8116 (void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr); 8117 8118 #ifdef _LITTLE_ENDIAN 8119 /* correct the endianness */ 8120 do_bswap = !do_bswap; 8121 #endif 8122 for (i = 0; i < nwords; i += 2) { 8123 (void) printf("%06llx: %016llx %016llx ", 8124 (u_longlong_t)(i * sizeof (uint64_t)), 8125 (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]), 8126 (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1])); 8127 8128 c = (char *)&d[i]; 8129 for (j = 0; j < 2 * sizeof (uint64_t); j++) 8130 (void) printf("%c", isprint(c[j]) ? c[j] : '.'); 8131 (void) printf("\n"); 8132 } 8133 } 8134 8135 /* 8136 * There are two acceptable formats: 8137 * leaf_name - For example: c1t0d0 or /tmp/ztest.0a 8138 * child[.child]* - For example: 0.1.1 8139 * 8140 * The second form can be used to specify arbitrary vdevs anywhere 8141 * in the hierarchy. For example, in a pool with a mirror of 8142 * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 . 8143 */ 8144 static vdev_t * 8145 zdb_vdev_lookup(vdev_t *vdev, const char *path) 8146 { 8147 char *s, *p, *q; 8148 unsigned i; 8149 8150 if (vdev == NULL) 8151 return (NULL); 8152 8153 /* First, assume the x.x.x.x format */ 8154 i = strtoul(path, &s, 10); 8155 if (s == path || (s && *s != '.' && *s != '\0')) 8156 goto name; 8157 if (i >= vdev->vdev_children) 8158 return (NULL); 8159 8160 vdev = vdev->vdev_child[i]; 8161 if (s && *s == '\0') 8162 return (vdev); 8163 return (zdb_vdev_lookup(vdev, s+1)); 8164 8165 name: 8166 for (i = 0; i < vdev->vdev_children; i++) { 8167 vdev_t *vc = vdev->vdev_child[i]; 8168 8169 if (vc->vdev_path == NULL) { 8170 vc = zdb_vdev_lookup(vc, path); 8171 if (vc == NULL) 8172 continue; 8173 else 8174 return (vc); 8175 } 8176 8177 p = strrchr(vc->vdev_path, '/'); 8178 p = p ? 
p + 1 : vc->vdev_path;
8179 q = &vc->vdev_path[strlen(vc->vdev_path) - 2];
8180
8181 if (strcmp(vc->vdev_path, path) == 0)
8182 return (vc);
8183 if (strcmp(p, path) == 0)
8184 return (vc);
8185 if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0)
8186 return (vc);
8187 }
8188
8189 return (NULL);
8190 }
8191
8192 static int
8193 name_from_objset_id(spa_t *spa, uint64_t objset_id, char *outstr)
8194 {
8195 dsl_dataset_t *ds;
8196
8197 dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
8198 int error = dsl_dataset_hold_obj(spa->spa_dsl_pool, objset_id,
8199 NULL, &ds);
8200 if (error != 0) {
8201 (void) fprintf(stderr, "failed to hold objset %llu: %s\n",
8202 (u_longlong_t)objset_id, strerror(error));
8203 dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
8204 return (error);
8205 }
8206 dsl_dataset_name(ds, outstr);
8207 dsl_dataset_rele(ds, NULL);
8208 dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
8209 return (0);
8210 }
8211
8212 static boolean_t
8213 zdb_parse_block_sizes(char *sizes, uint64_t *lsize, uint64_t *psize)
8214 {
8215 char *s0, *s1, *tmp = NULL;
8216
8217 if (sizes == NULL)
8218 return (B_FALSE);
8219
8220 s0 = strtok_r(sizes, "/", &tmp);
8221 if (s0 == NULL)
8222 return (B_FALSE);
8223 s1 = strtok_r(NULL, "/", &tmp);
8224 *lsize = strtoull(s0, NULL, 16);
8225 *psize = s1 ? strtoull(s1, NULL, 16) : *lsize;
8226 return (*lsize >= *psize && *psize > 0);
8227 }
8228
8229 #define ZIO_COMPRESS_MASK(alg) (1ULL << (ZIO_COMPRESS_##alg))
8230
8231 static boolean_t
8232 zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize,
8233 uint64_t psize, int flags)
8234 {
8235 (void) buf;
8236 boolean_t exceeded = B_FALSE;
8237 /*
8238 * We don't know how the data was compressed, so just try
8239 * every decompress function at every inflated blocksize.
8240 */
8241 void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
8242 int cfuncs[ZIO_COMPRESS_FUNCTIONS] = { 0 };
8243 int *cfuncp = cfuncs;
8244 uint64_t maxlsize = SPA_MAXBLOCKSIZE;
8245 uint64_t mask = ZIO_COMPRESS_MASK(ON) | ZIO_COMPRESS_MASK(OFF) |
8246 ZIO_COMPRESS_MASK(INHERIT) | ZIO_COMPRESS_MASK(EMPTY) |
8247 (getenv("ZDB_NO_ZLE") ? ZIO_COMPRESS_MASK(ZLE) : 0);
8248 *cfuncp++ = ZIO_COMPRESS_LZ4;
8249 *cfuncp++ = ZIO_COMPRESS_LZJB;
8250 mask |= ZIO_COMPRESS_MASK(LZ4) | ZIO_COMPRESS_MASK(LZJB);
8251 for (int c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++)
8252 if (((1ULL << c) & mask) == 0)
8253 *cfuncp++ = c;
8254
8255 /*
8256 * On the one hand, with SPA_MAXBLOCKSIZE at 16MB, this
8257 * could take a while and we should let the user know
8258 * we are not stuck. On the other hand, printing progress
8259 * info gets old after a while. The user can specify the 'v' flag
8260 * to see the progression.
8261 */
8262 if (lsize == psize)
8263 lsize += SPA_MINBLOCKSIZE;
8264 else
8265 maxlsize = lsize;
8266 for (; lsize <= maxlsize; lsize += SPA_MINBLOCKSIZE) {
8267 for (cfuncp = cfuncs; *cfuncp; cfuncp++) {
8268 if (flags & ZDB_FLAG_VERBOSE) {
8269 (void) fprintf(stderr,
8270 "Trying %05llx -> %05llx (%s)\n",
8271 (u_longlong_t)psize,
8272 (u_longlong_t)lsize,
8273 zio_compress_table[*cfuncp].\
8274 ci_name);
8275 }
8276
8277 /*
8278 * We randomize lbuf2, and decompress to both
8279 * lbuf and lbuf2. This way, we will know if
8280 * decompression filled the buffer exactly to lsize.
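 * A decompressor that succeeds but writes fewer than lsize
 * bytes would leave the two buffers differing in their random
 * tails, causing the memcmp() below to reject the match.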
8281 */ 8282 VERIFY0(random_get_pseudo_bytes(lbuf2, lsize)); 8283 8284 if (zio_decompress_data(*cfuncp, pabd, 8285 lbuf, psize, lsize, NULL) == 0 && 8286 zio_decompress_data(*cfuncp, pabd, 8287 lbuf2, psize, lsize, NULL) == 0 && 8288 memcmp(lbuf, lbuf2, lsize) == 0) 8289 break; 8290 } 8291 if (*cfuncp != 0) 8292 break; 8293 } 8294 umem_free(lbuf2, SPA_MAXBLOCKSIZE); 8295 8296 if (lsize > maxlsize) { 8297 exceeded = B_TRUE; 8298 } 8299 if (*cfuncp == ZIO_COMPRESS_ZLE) { 8300 printf("\nZLE decompression was selected. If you " 8301 "suspect the results are wrong,\ntry avoiding ZLE " 8302 "by setting and exporting ZDB_NO_ZLE=\"true\"\n"); 8303 } 8304 8305 return (exceeded); 8306 } 8307 8308 /* 8309 * Read a block from a pool and print it out. The syntax of the 8310 * block descriptor is: 8311 * 8312 * pool:vdev_specifier:offset:[lsize/]psize[:flags] 8313 * 8314 * pool - The name of the pool you wish to read from 8315 * vdev_specifier - Which vdev (see comment for zdb_vdev_lookup) 8316 * offset - offset, in hex, in bytes 8317 * size - Amount of data to read, in hex, in bytes 8318 * flags - A string of characters specifying options 8319 * b: Decode a blkptr at given offset within block 8320 * c: Calculate and display checksums 8321 * d: Decompress data before dumping 8322 * e: Byteswap data before dumping 8323 * g: Display data as a gang block header 8324 * i: Display as an indirect block 8325 * r: Dump raw data to stdout 8326 * v: Verbose 8327 * 8328 */ 8329 static void 8330 zdb_read_block(char *thing, spa_t *spa) 8331 { 8332 blkptr_t blk, *bp = &blk; 8333 dva_t *dva = bp->blk_dva; 8334 int flags = 0; 8335 uint64_t offset = 0, psize = 0, lsize = 0, blkptr_offset = 0; 8336 zio_t *zio; 8337 vdev_t *vd; 8338 abd_t *pabd; 8339 void *lbuf, *buf; 8340 char *s, *p, *dup, *flagstr, *sizes, *tmp = NULL; 8341 const char *vdev, *errmsg = NULL; 8342 int i, error; 8343 boolean_t borrowed = B_FALSE, found = B_FALSE; 8344 8345 dup = strdup(thing); 8346 s = strtok_r(dup, ":", &tmp); 8347 vdev = s ?: ""; 8348 s = strtok_r(NULL, ":", &tmp); 8349 offset = strtoull(s ? 
s : "", NULL, 16); 8350 sizes = strtok_r(NULL, ":", &tmp); 8351 s = strtok_r(NULL, ":", &tmp); 8352 flagstr = strdup(s ?: ""); 8353 8354 if (!zdb_parse_block_sizes(sizes, &lsize, &psize)) 8355 errmsg = "invalid size(s)"; 8356 if (!IS_P2ALIGNED(psize, DEV_BSIZE) || !IS_P2ALIGNED(lsize, DEV_BSIZE)) 8357 errmsg = "size must be a multiple of sector size"; 8358 if (!IS_P2ALIGNED(offset, DEV_BSIZE)) 8359 errmsg = "offset must be a multiple of sector size"; 8360 if (errmsg) { 8361 (void) printf("Invalid block specifier: %s - %s\n", 8362 thing, errmsg); 8363 goto done; 8364 } 8365 8366 tmp = NULL; 8367 for (s = strtok_r(flagstr, ":", &tmp); 8368 s != NULL; 8369 s = strtok_r(NULL, ":", &tmp)) { 8370 for (i = 0; i < strlen(flagstr); i++) { 8371 int bit = flagbits[(uchar_t)flagstr[i]]; 8372 8373 if (bit == 0) { 8374 (void) printf("***Ignoring flag: %c\n", 8375 (uchar_t)flagstr[i]); 8376 continue; 8377 } 8378 found = B_TRUE; 8379 flags |= bit; 8380 8381 p = &flagstr[i + 1]; 8382 if (*p != ':' && *p != '\0') { 8383 int j = 0, nextbit = flagbits[(uchar_t)*p]; 8384 char *end, offstr[8] = { 0 }; 8385 if ((bit == ZDB_FLAG_PRINT_BLKPTR) && 8386 (nextbit == 0)) { 8387 /* look ahead to isolate the offset */ 8388 while (nextbit == 0 && 8389 strchr(flagbitstr, *p) == NULL) { 8390 offstr[j] = *p; 8391 j++; 8392 if (i + j > strlen(flagstr)) 8393 break; 8394 p++; 8395 nextbit = flagbits[(uchar_t)*p]; 8396 } 8397 blkptr_offset = strtoull(offstr, &end, 8398 16); 8399 i += j; 8400 } else if (nextbit == 0) { 8401 (void) printf("***Ignoring flag arg:" 8402 " '%c'\n", (uchar_t)*p); 8403 } 8404 } 8405 } 8406 } 8407 if (blkptr_offset % sizeof (blkptr_t)) { 8408 printf("Block pointer offset 0x%llx " 8409 "must be divisible by 0x%x\n", 8410 (longlong_t)blkptr_offset, (int)sizeof (blkptr_t)); 8411 goto done; 8412 } 8413 if (found == B_FALSE && strlen(flagstr) > 0) { 8414 printf("Invalid flag arg: '%s'\n", flagstr); 8415 goto done; 8416 } 8417 8418 vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev); 8419 if (vd == NULL) { 8420 (void) printf("***Invalid vdev: %s\n", vdev); 8421 goto done; 8422 } else { 8423 if (vd->vdev_path) 8424 (void) fprintf(stderr, "Found vdev: %s\n", 8425 vd->vdev_path); 8426 else 8427 (void) fprintf(stderr, "Found vdev type: %s\n", 8428 vd->vdev_ops->vdev_op_type); 8429 } 8430 8431 pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE); 8432 lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); 8433 8434 BP_ZERO(bp); 8435 8436 DVA_SET_VDEV(&dva[0], vd->vdev_id); 8437 DVA_SET_OFFSET(&dva[0], offset); 8438 DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH)); 8439 DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize)); 8440 8441 BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); 8442 8443 BP_SET_LSIZE(bp, lsize); 8444 BP_SET_PSIZE(bp, psize); 8445 BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 8446 BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); 8447 BP_SET_TYPE(bp, DMU_OT_NONE); 8448 BP_SET_LEVEL(bp, 0); 8449 BP_SET_DEDUP(bp, 0); 8450 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 8451 8452 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 8453 zio = zio_root(spa, NULL, NULL, 0); 8454 8455 if (vd == vd->vdev_top) { 8456 /* 8457 * Treat this as a normal block read. 8458 */ 8459 zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL, 8460 ZIO_PRIORITY_SYNC_READ, 8461 ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL)); 8462 } else { 8463 /* 8464 * Treat this as a vdev child I/O. 
8465 */ 8466 zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd, 8467 psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, 8468 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | 8469 ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | 8470 ZIO_FLAG_OPTIONAL, NULL, NULL)); 8471 } 8472 8473 error = zio_wait(zio); 8474 spa_config_exit(spa, SCL_STATE, FTAG); 8475 8476 if (error) { 8477 (void) printf("Read of %s failed, error: %d\n", thing, error); 8478 goto out; 8479 } 8480 8481 uint64_t orig_lsize = lsize; 8482 buf = lbuf; 8483 if (flags & ZDB_FLAG_DECOMPRESS) { 8484 boolean_t failed = zdb_decompress_block(pabd, buf, lbuf, 8485 lsize, psize, flags); 8486 if (failed) { 8487 (void) printf("Decompress of %s failed\n", thing); 8488 goto out; 8489 } 8490 } else { 8491 buf = abd_borrow_buf_copy(pabd, lsize); 8492 borrowed = B_TRUE; 8493 } 8494 /* 8495 * Try to detect invalid block pointer. If invalid, try 8496 * decompressing. 8497 */ 8498 if ((flags & ZDB_FLAG_PRINT_BLKPTR || flags & ZDB_FLAG_INDIRECT) && 8499 !(flags & ZDB_FLAG_DECOMPRESS)) { 8500 const blkptr_t *b = (const blkptr_t *)(void *) 8501 ((uintptr_t)buf + (uintptr_t)blkptr_offset); 8502 if (zfs_blkptr_verify(spa, b, 8503 BLK_CONFIG_NEEDED, BLK_VERIFY_ONLY) == B_FALSE) { 8504 abd_return_buf_copy(pabd, buf, lsize); 8505 borrowed = B_FALSE; 8506 buf = lbuf; 8507 boolean_t failed = zdb_decompress_block(pabd, buf, 8508 lbuf, lsize, psize, flags); 8509 b = (const blkptr_t *)(void *) 8510 ((uintptr_t)buf + (uintptr_t)blkptr_offset); 8511 if (failed || zfs_blkptr_verify(spa, b, 8512 BLK_CONFIG_NEEDED, BLK_VERIFY_LOG) == B_FALSE) { 8513 printf("invalid block pointer at this DVA\n"); 8514 goto out; 8515 } 8516 } 8517 } 8518 8519 if (flags & ZDB_FLAG_PRINT_BLKPTR) 8520 zdb_print_blkptr((blkptr_t *)(void *) 8521 ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags); 8522 else if (flags & ZDB_FLAG_RAW) 8523 zdb_dump_block_raw(buf, lsize, flags); 8524 else if (flags & ZDB_FLAG_INDIRECT) 8525 zdb_dump_indirect((blkptr_t *)buf, 8526 orig_lsize / sizeof (blkptr_t), flags); 8527 else if (flags & ZDB_FLAG_GBH) 8528 zdb_dump_gbh(buf, flags); 8529 else 8530 zdb_dump_block(thing, buf, lsize, flags); 8531 8532 /* 8533 * If :c was specified, iterate through the checksum table to 8534 * calculate and display each checksum for our specified 8535 * DVA and length. 
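 * Checksums that are embedded in the block pointer (and the no-parity
 * variants) are skipped, since they cannot be computed over an
 * arbitrary data buffer alone.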
8536 */ 8537 if ((flags & ZDB_FLAG_CHECKSUM) && !(flags & ZDB_FLAG_RAW) && 8538 !(flags & ZDB_FLAG_GBH)) { 8539 zio_t *czio; 8540 (void) printf("\n"); 8541 for (enum zio_checksum ck = ZIO_CHECKSUM_LABEL; 8542 ck < ZIO_CHECKSUM_FUNCTIONS; ck++) { 8543 8544 if ((zio_checksum_table[ck].ci_flags & 8545 ZCHECKSUM_FLAG_EMBEDDED) || 8546 ck == ZIO_CHECKSUM_NOPARITY) { 8547 continue; 8548 } 8549 BP_SET_CHECKSUM(bp, ck); 8550 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 8551 czio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 8552 czio->io_bp = bp; 8553 8554 if (vd == vd->vdev_top) { 8555 zio_nowait(zio_read(czio, spa, bp, pabd, psize, 8556 NULL, NULL, 8557 ZIO_PRIORITY_SYNC_READ, 8558 ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | 8559 ZIO_FLAG_DONT_RETRY, NULL)); 8560 } else { 8561 zio_nowait(zio_vdev_child_io(czio, bp, vd, 8562 offset, pabd, psize, ZIO_TYPE_READ, 8563 ZIO_PRIORITY_SYNC_READ, 8564 ZIO_FLAG_DONT_CACHE | 8565 ZIO_FLAG_DONT_PROPAGATE | 8566 ZIO_FLAG_DONT_RETRY | 8567 ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | 8568 ZIO_FLAG_SPECULATIVE | 8569 ZIO_FLAG_OPTIONAL, NULL, NULL)); 8570 } 8571 error = zio_wait(czio); 8572 if (error == 0 || error == ECKSUM) { 8573 zio_t *ck_zio = zio_root(spa, NULL, NULL, 0); 8574 ck_zio->io_offset = 8575 DVA_GET_OFFSET(&bp->blk_dva[0]); 8576 ck_zio->io_bp = bp; 8577 zio_checksum_compute(ck_zio, ck, pabd, lsize); 8578 printf( 8579 "%12s\t" 8580 "cksum=%016llx:%016llx:%016llx:%016llx\n", 8581 zio_checksum_table[ck].ci_name, 8582 (u_longlong_t)bp->blk_cksum.zc_word[0], 8583 (u_longlong_t)bp->blk_cksum.zc_word[1], 8584 (u_longlong_t)bp->blk_cksum.zc_word[2], 8585 (u_longlong_t)bp->blk_cksum.zc_word[3]); 8586 zio_wait(ck_zio); 8587 } else { 8588 printf("error %d reading block\n", error); 8589 } 8590 spa_config_exit(spa, SCL_STATE, FTAG); 8591 } 8592 } 8593 8594 if (borrowed) 8595 abd_return_buf_copy(pabd, buf, lsize); 8596 8597 out: 8598 abd_free(pabd); 8599 umem_free(lbuf, SPA_MAXBLOCKSIZE); 8600 done: 8601 free(flagstr); 8602 free(dup); 8603 } 8604 8605 static void 8606 zdb_embedded_block(char *thing) 8607 { 8608 blkptr_t bp = {{{{0}}}}; 8609 unsigned long long *words = (void *)&bp; 8610 char *buf; 8611 int err; 8612 8613 err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:" 8614 "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx", 8615 words + 0, words + 1, words + 2, words + 3, 8616 words + 4, words + 5, words + 6, words + 7, 8617 words + 8, words + 9, words + 10, words + 11, 8618 words + 12, words + 13, words + 14, words + 15); 8619 if (err != 16) { 8620 (void) fprintf(stderr, "invalid input format\n"); 8621 exit(1); 8622 } 8623 ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE); 8624 buf = malloc(SPA_MAXBLOCKSIZE); 8625 if (buf == NULL) { 8626 (void) fprintf(stderr, "out of memory\n"); 8627 exit(1); 8628 } 8629 err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp)); 8630 if (err != 0) { 8631 (void) fprintf(stderr, "decode failed: %u\n", err); 8632 exit(1); 8633 } 8634 zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0); 8635 free(buf); 8636 } 8637 8638 /* check for valid hex or decimal numeric string */ 8639 static boolean_t 8640 zdb_numeric(char *str) 8641 { 8642 int i = 0; 8643 8644 if (strlen(str) == 0) 8645 return (B_FALSE); 8646 if (strncmp(str, "0x", 2) == 0 || strncmp(str, "0X", 2) == 0) 8647 i = 2; 8648 for (; i < strlen(str); i++) { 8649 if (!isxdigit(str[i])) 8650 return (B_FALSE); 8651 } 8652 return (B_TRUE); 8653 } 8654 8655 int 8656 main(int argc, char **argv) 8657 { 8658 int c; 8659 spa_t *spa = NULL; 8660 objset_t *os = NULL; 8661 int dump_all = 1; 8662 int 
verbose = 0;
8663 int error = 0;
8664 char **searchdirs = NULL;
8665 int nsearch = 0;
8666 char *target, *target_pool, dsname[ZFS_MAX_DATASET_NAME_LEN];
8667 nvlist_t *policy = NULL;
8668 uint64_t max_txg = UINT64_MAX;
8669 int64_t objset_id = -1;
8670 uint64_t object;
8671 int flags = ZFS_IMPORT_MISSING_LOG;
8672 int rewind = ZPOOL_NEVER_REWIND;
8673 char *spa_config_path_env, *objset_str;
8674 boolean_t target_is_spa = B_TRUE, dataset_lookup = B_FALSE;
8675 nvlist_t *cfg = NULL;
8676
8677 dprintf_setup(&argc, argv);
8678
8679 /*
8680 * If the SPA_CONFIG_PATH environment variable is set, it overrides
8681 * the default spa_config_path setting. If the -U flag is specified,
8682 * it overrides this environment variable setting once again.
8683 */
8684 spa_config_path_env = getenv("SPA_CONFIG_PATH");
8685 if (spa_config_path_env != NULL)
8686 spa_config_path = spa_config_path_env;
8687
8688 /*
8689 * For performance reasons, we set this tunable down. We do so before
8690 * the arg parsing section so that the user can override this value if
8691 * they choose.
8692 */
8693 zfs_btree_verify_intensity = 3;
8694
8695 struct option long_options[] = {
8696 {"ignore-assertions", no_argument, NULL, 'A'},
8697 {"block-stats", no_argument, NULL, 'b'},
8698 {"checksum", no_argument, NULL, 'c'},
8699 {"config", no_argument, NULL, 'C'},
8700 {"datasets", no_argument, NULL, 'd'},
8701 {"dedup-stats", no_argument, NULL, 'D'},
8702 {"exported", no_argument, NULL, 'e'},
8703 {"embedded-block-pointer", no_argument, NULL, 'E'},
8704 {"automatic-rewind", no_argument, NULL, 'F'},
8705 {"dump-debug-msg", no_argument, NULL, 'G'},
8706 {"history", no_argument, NULL, 'h'},
8707 {"intent-logs", no_argument, NULL, 'i'},
8708 {"inflight", required_argument, NULL, 'I'},
8709 {"checkpointed-state", no_argument, NULL, 'k'},
8710 {"key", required_argument, NULL, 'K'},
8711 {"label", no_argument, NULL, 'l'},
8712 {"disable-leak-tracking", no_argument, NULL, 'L'},
8713 {"metaslabs", no_argument, NULL, 'm'},
8714 {"metaslab-groups", no_argument, NULL, 'M'},
8715 {"numeric", no_argument, NULL, 'N'},
8716 {"option", required_argument, NULL, 'o'},
8717 {"object-lookups", no_argument, NULL, 'O'},
8718 {"path", required_argument, NULL, 'p'},
8719 {"parseable", no_argument, NULL, 'P'},
8720 {"skip-label", no_argument, NULL, 'q'},
8721 {"copy-object", no_argument, NULL, 'r'},
8722 {"read-block", no_argument, NULL, 'R'},
8723 {"io-stats", no_argument, NULL, 's'},
8724 {"simulate-dedup", no_argument, NULL, 'S'},
8725 {"txg", required_argument, NULL, 't'},
8726 {"uberblock", no_argument, NULL, 'u'},
8727 {"cachefile", required_argument, NULL, 'U'},
8728 {"verbose", no_argument, NULL, 'v'},
8729 {"verbatim", no_argument, NULL, 'V'},
8730 {"dump-blocks", required_argument, NULL, 'x'},
8731 {"extreme-rewind", no_argument, NULL, 'X'},
8732 {"all-reconstruction", no_argument, NULL, 'Y'},
8733 {"livelist", no_argument, NULL, 'y'},
8734 {"zstd-headers", no_argument, NULL, 'Z'},
8735 {0, 0, 0, 0}
8736 };
8737
8738 while ((c = getopt_long(argc, argv,
8739 "AbcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:uU:vVx:XYyZ",
8740 long_options, NULL)) != -1) {
8741 switch (c) {
8742 case 'b':
8743 case 'c':
8744 case 'C':
8745 case 'd':
8746 case 'D':
8747 case 'E':
8748 case 'G':
8749 case 'h':
8750 case 'i':
8751 case 'l':
8752 case 'm':
8753 case 'M':
8754 case 'N':
8755 case 'O':
8756 case 'r':
8757 case 'R':
8758 case 's':
8759 case 'S':
8760 case 'u':
8761 case 'y':
8762 case 'Z':
8763 dump_opt[c]++;
8764 dump_all = 0;
8765 break;
8766 case 'A':
8767 case 'e':
8768 case 'F':
8769 case 'k':
8770 case 'L':
8771 case 'P':
8772 case 'q':
8773 case 'X':
8774 dump_opt[c]++;
8775 break;
8776 case 'Y':
8777 zfs_reconstruct_indirect_combinations_max = INT_MAX;
8778 zfs_deadman_enabled = 0;
8779 break;
8780 /* NB: Sort single match options below. */
8781 case 'I':
8782 max_inflight_bytes = strtoull(optarg, NULL, 0);
8783 if (max_inflight_bytes == 0) {
8784 (void) fprintf(stderr, "maximum number "
8785 "of inflight bytes must be greater "
8786 "than 0\n");
8787 usage();
8788 }
8789 break;
8790 case 'K':
8791 dump_opt[c]++;
8792 key_material = strdup(optarg);
8793 /* redact key material in process table */
8794 while (*optarg != '\0') { *optarg++ = '*'; }
8795 break;
8796 case 'o':
8797 error = set_global_var(optarg);
8798 if (error != 0)
8799 usage();
8800 break;
8801 case 'p':
8802 if (searchdirs == NULL) {
8803 searchdirs = umem_alloc(sizeof (char *),
8804 UMEM_NOFAIL);
8805 } else {
8806 char **tmp = umem_alloc((nsearch + 1) *
8807 sizeof (char *), UMEM_NOFAIL);
8808 memcpy(tmp, searchdirs, nsearch *
8809 sizeof (char *));
8810 umem_free(searchdirs,
8811 nsearch * sizeof (char *));
8812 searchdirs = tmp;
8813 }
8814 searchdirs[nsearch++] = optarg;
8815 break;
8816 case 't':
8817 max_txg = strtoull(optarg, NULL, 0);
8818 if (max_txg < TXG_INITIAL) {
8819 (void) fprintf(stderr, "incorrect txg "
8820 "specified: %s\n", optarg);
8821 usage();
8822 }
8823 break;
8824 case 'U':
8825 spa_config_path = optarg;
8826 if (spa_config_path[0] != '/') {
8827 (void) fprintf(stderr,
8828 "cachefile must be an absolute path "
8829 "(i.e. start with a slash)\n");
8830 usage();
8831 }
8832 break;
8833 case 'v':
8834 verbose++;
8835 break;
8836 case 'V':
8837 flags = ZFS_IMPORT_VERBATIM;
8838 break;
8839 case 'x':
8840 vn_dumpdir = optarg;
8841 break;
8842 default:
8843 usage();
8844 break;
8845 }
8846 }
8847
8848 if (!dump_opt['e'] && searchdirs != NULL) {
8849 (void) fprintf(stderr, "-p option requires use of -e\n");
8850 usage();
8851 }
8852 #if defined(_LP64)
8853 /*
8854 * ZDB does not typically re-read blocks; therefore limit the ARC
8855 * to 256 MB, which can be used entirely for metadata.
8856 */
8857 zfs_arc_min = 2ULL << SPA_MAXBLOCKSHIFT;
8858 zfs_arc_max = 256 * 1024 * 1024;
8859 #endif
8860
8861 /*
8862 * "zdb -c" uses checksum-verifying scrub i/os which are async reads.
8863 * "zdb -b" uses traversal prefetch which uses async reads.
8864 * For good performance, let several of them be active at once.
8865 */
8866 zfs_vdev_async_read_max_active = 10;
8867
8868 /*
8869 * Disable reference tracking for better performance.
8870 */
8871 reference_tracking_enable = B_FALSE;
8872
8873 /*
8874 * Do not fail spa_load when spa_load_verify fails. This is needed
8875 * to load non-idle pools.
8876 */
8877 spa_load_verify_dryrun = B_TRUE;
8878
8879 /*
8880 * ZDB should have the ability to read spacemaps.
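 * A read-only pool open would normally leave them unreadable; this
 * override lets zdb load space maps for its space accounting checks.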
8881 */ 8882 spa_mode_readable_spacemaps = B_TRUE; 8883 8884 kernel_init(SPA_MODE_READ); 8885 8886 if (dump_all) 8887 verbose = MAX(verbose, 1); 8888 8889 for (c = 0; c < 256; c++) { 8890 if (dump_all && strchr("AeEFkKlLNOPrRSXy", c) == NULL) 8891 dump_opt[c] = 1; 8892 if (dump_opt[c]) 8893 dump_opt[c] += verbose; 8894 } 8895 8896 libspl_set_assert_ok((dump_opt['A'] == 1) || (dump_opt['A'] > 2)); 8897 zfs_recover = (dump_opt['A'] > 1); 8898 8899 argc -= optind; 8900 argv += optind; 8901 if (argc < 2 && dump_opt['R']) 8902 usage(); 8903 8904 if (dump_opt['E']) { 8905 if (argc != 1) 8906 usage(); 8907 zdb_embedded_block(argv[0]); 8908 return (0); 8909 } 8910 8911 if (argc < 1) { 8912 if (!dump_opt['e'] && dump_opt['C']) { 8913 dump_cachefile(spa_config_path); 8914 return (0); 8915 } 8916 usage(); 8917 } 8918 8919 if (dump_opt['l']) 8920 return (dump_label(argv[0])); 8921 8922 if (dump_opt['O']) { 8923 if (argc != 2) 8924 usage(); 8925 dump_opt['v'] = verbose + 3; 8926 return (dump_path(argv[0], argv[1], NULL)); 8927 } 8928 if (dump_opt['r']) { 8929 target_is_spa = B_FALSE; 8930 if (argc != 3) 8931 usage(); 8932 dump_opt['v'] = verbose; 8933 error = dump_path(argv[0], argv[1], &object); 8934 if (error != 0) 8935 fatal("internal error: %s", strerror(error)); 8936 } 8937 8938 if (dump_opt['X'] || dump_opt['F']) 8939 rewind = ZPOOL_DO_REWIND | 8940 (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0); 8941 8942 /* -N implies -d */ 8943 if (dump_opt['N'] && dump_opt['d'] == 0) 8944 dump_opt['d'] = dump_opt['N']; 8945 8946 if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 || 8947 nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, max_txg) != 0 || 8948 nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind) != 0) 8949 fatal("internal error: %s", strerror(ENOMEM)); 8950 8951 error = 0; 8952 target = argv[0]; 8953 8954 if (strpbrk(target, "/@") != NULL) { 8955 size_t targetlen; 8956 8957 target_pool = strdup(target); 8958 *strpbrk(target_pool, "/@") = '\0'; 8959 8960 target_is_spa = B_FALSE; 8961 targetlen = strlen(target); 8962 if (targetlen && target[targetlen - 1] == '/') 8963 target[targetlen - 1] = '\0'; 8964 8965 /* 8966 * See if an objset ID was supplied (-d <pool>/<objset ID>). 8967 * To disambiguate tank/100, consider the 100 as objsetID 8968 * if -N was given, otherwise 100 is an objsetID iff 8969 * tank/100 as a named dataset fails on lookup. 
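 * For example, 'zdb -d tank/100' first tries to look up a dataset
 * named tank/100 and falls back to objset ID 100 only if that lookup
 * fails, while 'zdb -N tank/100' treats 100 as an objset ID directly.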
8970 */ 8971 objset_str = strchr(target, '/'); 8972 if (objset_str && strlen(objset_str) > 1 && 8973 zdb_numeric(objset_str + 1)) { 8974 char *endptr; 8975 errno = 0; 8976 objset_str++; 8977 objset_id = strtoull(objset_str, &endptr, 0); 8978 /* dataset 0 is the same as opening the pool */ 8979 if (errno == 0 && endptr != objset_str && 8980 objset_id != 0) { 8981 if (dump_opt['N']) 8982 dataset_lookup = B_TRUE; 8983 } 8984 /* normal dataset name not an objset ID */ 8985 if (endptr == objset_str) { 8986 objset_id = -1; 8987 } 8988 } else if (objset_str && !zdb_numeric(objset_str + 1) && 8989 dump_opt['N']) { 8990 printf("Supply a numeric objset ID with -N\n"); 8991 exit(1); 8992 } 8993 } else { 8994 target_pool = target; 8995 } 8996 8997 if (dump_opt['e']) { 8998 importargs_t args = { 0 }; 8999 9000 args.paths = nsearch; 9001 args.path = searchdirs; 9002 args.can_be_active = B_TRUE; 9003 9004 libpc_handle_t lpch = { 9005 .lpc_lib_handle = NULL, 9006 .lpc_ops = &libzpool_config_ops, 9007 .lpc_printerr = B_TRUE 9008 }; 9009 error = zpool_find_config(&lpch, target_pool, &cfg, &args); 9010 9011 if (error == 0) { 9012 9013 if (nvlist_add_nvlist(cfg, 9014 ZPOOL_LOAD_POLICY, policy) != 0) { 9015 fatal("can't open '%s': %s", 9016 target, strerror(ENOMEM)); 9017 } 9018 9019 if (dump_opt['C'] > 1) { 9020 (void) printf("\nConfiguration for import:\n"); 9021 dump_nvlist(cfg, 8); 9022 } 9023 9024 /* 9025 * Disable the activity check to allow examination of 9026 * active pools. 9027 */ 9028 error = spa_import(target_pool, cfg, NULL, 9029 flags | ZFS_IMPORT_SKIP_MMP); 9030 } 9031 } 9032 9033 if (searchdirs != NULL) { 9034 umem_free(searchdirs, nsearch * sizeof (char *)); 9035 searchdirs = NULL; 9036 } 9037 9038 /* 9039 * import_checkpointed_state makes the assumption that the 9040 * target pool that we pass it is already part of the spa 9041 * namespace. Because of that we need to make sure to call 9042 * it always after the -e option has been processed, which 9043 * imports the pool to the namespace if it's not in the 9044 * cachefile. 9045 */ 9046 char *checkpoint_pool = NULL; 9047 char *checkpoint_target = NULL; 9048 if (dump_opt['k']) { 9049 checkpoint_pool = import_checkpointed_state(target, cfg, 9050 &checkpoint_target); 9051 9052 if (checkpoint_target != NULL) 9053 target = checkpoint_target; 9054 } 9055 9056 if (cfg != NULL) { 9057 nvlist_free(cfg); 9058 cfg = NULL; 9059 } 9060 9061 if (target_pool != target) 9062 free(target_pool); 9063 9064 if (error == 0) { 9065 if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) { 9066 ASSERT(checkpoint_pool != NULL); 9067 ASSERT(checkpoint_target == NULL); 9068 9069 error = spa_open(checkpoint_pool, &spa, FTAG); 9070 if (error != 0) { 9071 fatal("Tried to open pool \"%s\" but " 9072 "spa_open() failed with error %d\n", 9073 checkpoint_pool, error); 9074 } 9075 9076 } else if (target_is_spa || dump_opt['R'] || objset_id == 0) { 9077 zdb_set_skip_mmp(target); 9078 error = spa_open_rewind(target, &spa, FTAG, policy, 9079 NULL); 9080 if (error) { 9081 /* 9082 * If we're missing the log device then 9083 * try opening the pool after clearing the 9084 * log state. 
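 * Since zdb opens the pool read-only and never replays the ZIL,
 * clearing the missing-log state is safe for inspection purposes.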
9085 */ 9086 mutex_enter(&spa_namespace_lock); 9087 if ((spa = spa_lookup(target)) != NULL && 9088 spa->spa_log_state == SPA_LOG_MISSING) { 9089 spa->spa_log_state = SPA_LOG_CLEAR; 9090 error = 0; 9091 } 9092 mutex_exit(&spa_namespace_lock); 9093 9094 if (!error) { 9095 error = spa_open_rewind(target, &spa, 9096 FTAG, policy, NULL); 9097 } 9098 } 9099 } else if (strpbrk(target, "#") != NULL) { 9100 dsl_pool_t *dp; 9101 error = dsl_pool_hold(target, FTAG, &dp); 9102 if (error != 0) { 9103 fatal("can't dump '%s': %s", target, 9104 strerror(error)); 9105 } 9106 error = dump_bookmark(dp, target, B_TRUE, verbose > 1); 9107 dsl_pool_rele(dp, FTAG); 9108 if (error != 0) { 9109 fatal("can't dump '%s': %s", target, 9110 strerror(error)); 9111 } 9112 return (error); 9113 } else { 9114 target_pool = strdup(target); 9115 if (strpbrk(target, "/@") != NULL) 9116 *strpbrk(target_pool, "/@") = '\0'; 9117 9118 zdb_set_skip_mmp(target); 9119 /* 9120 * If -N was supplied, the user has indicated that 9121 * zdb -d <pool>/<objsetID> is in effect. Otherwise 9122 * we first assume that the dataset string is the 9123 * dataset name. If dmu_objset_hold fails with the 9124 * dataset string, and we have an objset_id, retry the 9125 * lookup with the objsetID. 9126 */ 9127 boolean_t retry = B_TRUE; 9128 retry_lookup: 9129 if (dataset_lookup == B_TRUE) { 9130 /* 9131 * Use the supplied id to get the name 9132 * for open_objset. 9133 */ 9134 error = spa_open(target_pool, &spa, FTAG); 9135 if (error == 0) { 9136 error = name_from_objset_id(spa, 9137 objset_id, dsname); 9138 spa_close(spa, FTAG); 9139 if (error == 0) 9140 target = dsname; 9141 } 9142 } 9143 if (error == 0) { 9144 if (objset_id > 0 && retry) { 9145 int err = dmu_objset_hold(target, FTAG, 9146 &os); 9147 if (err) { 9148 dataset_lookup = B_TRUE; 9149 retry = B_FALSE; 9150 goto retry_lookup; 9151 } else { 9152 dmu_objset_rele(os, FTAG); 9153 } 9154 } 9155 error = open_objset(target, FTAG, &os); 9156 } 9157 if (error == 0) 9158 spa = dmu_objset_spa(os); 9159 free(target_pool); 9160 } 9161 } 9162 nvlist_free(policy); 9163 9164 if (error) 9165 fatal("can't open '%s': %s", target, strerror(error)); 9166 9167 /* 9168 * Set the pool failure mode to panic in order to prevent the pool 9169 * from suspending. A suspended I/O will have no way to resume and 9170 * can prevent the zdb(8) command from terminating as expected. 
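 * In userland, the panic failure mode turns such a failure into an
 * immediate abort rather than a hang on suspended I/O.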
9171 */ 9172 if (spa != NULL) 9173 spa->spa_failmode = ZIO_FAILURE_MODE_PANIC; 9174 9175 argv++; 9176 argc--; 9177 if (dump_opt['r']) { 9178 error = zdb_copy_object(os, object, argv[1]); 9179 } else if (!dump_opt['R']) { 9180 flagbits['d'] = ZOR_FLAG_DIRECTORY; 9181 flagbits['f'] = ZOR_FLAG_PLAIN_FILE; 9182 flagbits['m'] = ZOR_FLAG_SPACE_MAP; 9183 flagbits['z'] = ZOR_FLAG_ZAP; 9184 flagbits['A'] = ZOR_FLAG_ALL_TYPES; 9185 9186 if (argc > 0 && dump_opt['d']) { 9187 zopt_object_args = argc; 9188 zopt_object_ranges = calloc(zopt_object_args, 9189 sizeof (zopt_object_range_t)); 9190 for (unsigned i = 0; i < zopt_object_args; i++) { 9191 int err; 9192 const char *msg = NULL; 9193 9194 err = parse_object_range(argv[i], 9195 &zopt_object_ranges[i], &msg); 9196 if (err != 0) 9197 fatal("Bad object or range: '%s': %s\n", 9198 argv[i], msg ?: ""); 9199 } 9200 } else if (argc > 0 && dump_opt['m']) { 9201 zopt_metaslab_args = argc; 9202 zopt_metaslab = calloc(zopt_metaslab_args, 9203 sizeof (uint64_t)); 9204 for (unsigned i = 0; i < zopt_metaslab_args; i++) { 9205 errno = 0; 9206 zopt_metaslab[i] = strtoull(argv[i], NULL, 0); 9207 if (zopt_metaslab[i] == 0 && errno != 0) 9208 fatal("bad number %s: %s", argv[i], 9209 strerror(errno)); 9210 } 9211 } 9212 if (os != NULL) { 9213 dump_objset(os); 9214 } else if (zopt_object_args > 0 && !dump_opt['m']) { 9215 dump_objset(spa->spa_meta_objset); 9216 } else { 9217 dump_zpool(spa); 9218 } 9219 } else { 9220 flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR; 9221 flagbits['c'] = ZDB_FLAG_CHECKSUM; 9222 flagbits['d'] = ZDB_FLAG_DECOMPRESS; 9223 flagbits['e'] = ZDB_FLAG_BSWAP; 9224 flagbits['g'] = ZDB_FLAG_GBH; 9225 flagbits['i'] = ZDB_FLAG_INDIRECT; 9226 flagbits['r'] = ZDB_FLAG_RAW; 9227 flagbits['v'] = ZDB_FLAG_VERBOSE; 9228 9229 for (int i = 0; i < argc; i++) 9230 zdb_read_block(argv[i], spa); 9231 } 9232 9233 if (dump_opt['k']) { 9234 free(checkpoint_pool); 9235 if (!target_is_spa) 9236 free(checkpoint_target); 9237 } 9238 9239 if (os != NULL) { 9240 close_objset(os, FTAG); 9241 } else { 9242 spa_close(spa, FTAG); 9243 } 9244 9245 fuid_table_destroy(); 9246 9247 dump_debug_buffer(); 9248 9249 kernel_fini(); 9250 9251 return (error); 9252 } 9253