xref: /freebsd/sys/contrib/openzfs/cmd/zdb/zdb.c (revision 5ca8e32633c4ffbbcd6762e5888b6a4ba0708c6c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
25  * Copyright (c) 2014 Integros [integros.com]
26  * Copyright 2016 Nexenta Systems, Inc.
27  * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC.
28  * Copyright (c) 2015, 2017, Intel Corporation.
29  * Copyright (c) 2020 Datto Inc.
30  * Copyright (c) 2020, The FreeBSD Foundation [1]
31  *
32  * [1] Portions of this software were developed by Allan Jude
33  *     under sponsorship from the FreeBSD Foundation.
34  * Copyright (c) 2021 Allan Jude
35  * Copyright (c) 2021 Toomas Soome <tsoome@me.com>
36  * Copyright (c) 2023, Klara Inc.
37  * Copyright (c) 2023, Rob Norris <robn@despairlabs.com>
38  */
39 
40 #include <stdio.h>
41 #include <unistd.h>
42 #include <stdlib.h>
43 #include <ctype.h>
44 #include <getopt.h>
45 #include <openssl/evp.h>
46 #include <sys/zfs_context.h>
47 #include <sys/spa.h>
48 #include <sys/spa_impl.h>
49 #include <sys/dmu.h>
50 #include <sys/zap.h>
51 #include <sys/fs/zfs.h>
52 #include <sys/zfs_znode.h>
53 #include <sys/zfs_sa.h>
54 #include <sys/sa.h>
55 #include <sys/sa_impl.h>
56 #include <sys/vdev.h>
57 #include <sys/vdev_impl.h>
58 #include <sys/metaslab_impl.h>
59 #include <sys/dmu_objset.h>
60 #include <sys/dsl_dir.h>
61 #include <sys/dsl_dataset.h>
62 #include <sys/dsl_pool.h>
63 #include <sys/dsl_bookmark.h>
64 #include <sys/dbuf.h>
65 #include <sys/zil.h>
66 #include <sys/zil_impl.h>
67 #include <sys/stat.h>
68 #include <sys/resource.h>
69 #include <sys/dmu_send.h>
70 #include <sys/dmu_traverse.h>
71 #include <sys/zio_checksum.h>
72 #include <sys/zio_compress.h>
73 #include <sys/zfs_fuid.h>
74 #include <sys/arc.h>
75 #include <sys/arc_impl.h>
76 #include <sys/ddt.h>
77 #include <sys/ddt_impl.h>
78 #include <sys/zfeature.h>
79 #include <sys/abd.h>
80 #include <sys/blkptr.h>
81 #include <sys/dsl_crypt.h>
82 #include <sys/dsl_scan.h>
83 #include <sys/btree.h>
84 #include <sys/brt.h>
85 #include <sys/brt_impl.h>
86 #include <zfs_comutil.h>
87 #include <sys/zstd/zstd.h>
88 
89 #include <libnvpair.h>
90 #include <libzutil.h>
91 
92 #include <libzdb.h>
93 
94 #include "zdb.h"
95 
96 
/*
 * Tunables and state exported by the ZFS code that zdb pokes directly
 * (zdb runs the kernel ZFS code in userland via libzpool).
 */
extern int reference_tracking_enable;
extern int zfs_recover;
extern uint_t zfs_vdev_async_read_max_active;
extern boolean_t spa_load_verify_dryrun;
extern boolean_t spa_mode_readable_spacemaps;
extern uint_t zfs_reconstruct_indirect_combinations_max;
extern uint_t zfs_btree_verify_intensity;

static const char cmdname[] = "zdb";
/* Per-option repeat count, indexed by option character (e.g. -dd -> 2). */
uint8_t dump_opt[256];

/* Callback type used to pretty-print the contents of a single object. */
typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);

/* Metaslab arguments given with -m <poolname> [<vdev> [<metaslab> ...]]. */
static uint64_t *zopt_metaslab = NULL;
static unsigned zopt_metaslab_args = 0;

/* Object numbers/ranges given on the command line. */
static zopt_object_range_t *zopt_object_ranges = NULL;
static unsigned zopt_object_args = 0;

/* Per-character flag bits used when parsing object-range flags. */
static int flagbits[256];

static uint64_t max_inflight_bytes = 256 * 1024 * 1024; /* 256MB */
static int leaked_objects = 0;
/* MOS objects seen during traversal; used to detect leaked MOS objects. */
static range_tree_t *mos_refd_objs;

/* Forward declarations. */
static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *,
    boolean_t);
static void mos_obj_refd(uint64_t);
static void mos_obj_refd_multiple(uint64_t);
static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free,
    dmu_tx_t *tx);

static void zdb_print_blkptr(const blkptr_t *bp, int flags);
134 
/*
 * Tracks one block pointer from a livelist along with how many FREE
 * entries for it are still waiting to be matched by ALLOC entries.
 */
typedef struct sublivelist_verify_block_refcnt {
	/* block pointer entry in livelist being verified */
	blkptr_t svbr_blk;

	/*
	 * Refcount gets incremented to 1 when we encounter the first
	 * FREE entry for the svbr block pointer and a node for it
	 * is created in our ZDB verification/tracking metadata.
	 *
	 * As we encounter more FREE entries we increment this counter
	 * and similarly decrement it whenever we find the respective
	 * ALLOC entries for this block.
	 *
	 * When the refcount gets to 0 it means that all the FREE and
	 * ALLOC entries of this block have paired up and we no longer
	 * need to track it in our verification logic (e.g. the node
	 * containing this struct in our verification data structure
	 * should be freed).
	 *
	 * [refer to sublivelist_verify_blkptr() for the actual code]
	 */
	uint32_t svbr_refcnt;
} sublivelist_verify_block_refcnt_t;
158 
159 static int
160 sublivelist_block_refcnt_compare(const void *larg, const void *rarg)
161 {
162 	const sublivelist_verify_block_refcnt_t *l = larg;
163 	const sublivelist_verify_block_refcnt_t *r = rarg;
164 	return (livelist_compare(&l->svbr_blk, &r->svbr_blk));
165 }
166 
/*
 * bpobj_iterate_nofree() callback used while walking one sublivelist.
 * FREE and ALLOC entries for the same block pointer are paired up via
 * the refcounted nodes in sv->sv_pair; an ALLOC with no pending FREE is
 * recorded per-DVA in sv->sv_leftover for the later cross-check against
 * the spacemaps.  Always returns 0 (problems are reported, not fatal).
 */
static int
sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,
    dmu_tx_t *tx)
{
	/* We iterate without freeing, so no open transaction is expected. */
	ASSERT3P(tx, ==, NULL);
	struct sublivelist_verify *sv = arg;
	sublivelist_verify_block_refcnt_t current = {
			.svbr_blk = *bp,

			/*
			 * Start with 1 in case this is the first free entry.
			 * This field is not used for our B-Tree comparisons
			 * anyway.
			 */
			.svbr_refcnt = 1,
	};

	zfs_btree_index_t where;
	sublivelist_verify_block_refcnt_t *pair =
	    zfs_btree_find(&sv->sv_pair, &current, &where);
	if (free) {
		if (pair == NULL) {
			/* first free entry for this block pointer */
			zfs_btree_add(&sv->sv_pair, &current);
		} else {
			pair->svbr_refcnt++;
		}
	} else {
		if (pair == NULL) {
			/* block that is currently marked as allocated */
			for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
				if (DVA_IS_EMPTY(&bp->blk_dva[i]))
					break;
				sublivelist_verify_block_t svb = {
				    .svb_dva = bp->blk_dva[i],
				    .svb_allocated_txg = bp->blk_birth
				};

				/* record each DVA at most once */
				if (zfs_btree_find(&sv->sv_leftover, &svb,
				    &where) == NULL) {
					zfs_btree_add_idx(&sv->sv_leftover,
					    &svb, &where);
				}
			}
		} else {
			/* alloc matches a free entry */
			pair->svbr_refcnt--;
			if (pair->svbr_refcnt == 0) {
				/* all allocs and frees have been matched */
				zfs_btree_remove_idx(&sv->sv_pair, &where);
			}
		}
	}

	return (0);
}
223 
224 static int
225 sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle)
226 {
227 	int err;
228 	struct sublivelist_verify *sv = args;
229 
230 	zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare, NULL,
231 	    sizeof (sublivelist_verify_block_refcnt_t));
232 
233 	err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr,
234 	    sv, NULL);
235 
236 	sublivelist_verify_block_refcnt_t *e;
237 	zfs_btree_index_t *cookie = NULL;
238 	while ((e = zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) {
239 		char blkbuf[BP_SPRINTF_LEN];
240 		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf),
241 		    &e->svbr_blk, B_TRUE);
242 		(void) printf("\tERROR: %d unmatched FREE(s): %s\n",
243 		    e->svbr_refcnt, blkbuf);
244 	}
245 	zfs_btree_destroy(&sv->sv_pair);
246 
247 	return (err);
248 }
249 
250 static int
251 livelist_block_compare(const void *larg, const void *rarg)
252 {
253 	const sublivelist_verify_block_t *l = larg;
254 	const sublivelist_verify_block_t *r = rarg;
255 
256 	if (DVA_GET_VDEV(&l->svb_dva) < DVA_GET_VDEV(&r->svb_dva))
257 		return (-1);
258 	else if (DVA_GET_VDEV(&l->svb_dva) > DVA_GET_VDEV(&r->svb_dva))
259 		return (+1);
260 
261 	if (DVA_GET_OFFSET(&l->svb_dva) < DVA_GET_OFFSET(&r->svb_dva))
262 		return (-1);
263 	else if (DVA_GET_OFFSET(&l->svb_dva) > DVA_GET_OFFSET(&r->svb_dva))
264 		return (+1);
265 
266 	if (DVA_GET_ASIZE(&l->svb_dva) < DVA_GET_ASIZE(&r->svb_dva))
267 		return (-1);
268 	else if (DVA_GET_ASIZE(&l->svb_dva) > DVA_GET_ASIZE(&r->svb_dva))
269 		return (+1);
270 
271 	return (0);
272 }
273 
/*
 * Check for errors in a livelist while tracking all unfreed ALLOCs in the
 * sublivelist_verify_t: sv->sv_leftover
 *
 * ll_iter_t shim: arg is the sublivelist_verify_t supplied by
 * livelist_metaslab_validate(); each sublivelist of dl is verified.
 */
static void
livelist_verify(dsl_deadlist_t *dl, void *arg)
{
	sublivelist_verify_t *sv = arg;
	dsl_deadlist_iterate(dl, sublivelist_verify_func, sv);
}
284 
285 /*
286  * Check for errors in the livelist entry and discard the intermediary
287  * data structures
288  */
289 static int
290 sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle)
291 {
292 	(void) args;
293 	sublivelist_verify_t sv;
294 	zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL,
295 	    sizeof (sublivelist_verify_block_t));
296 	int err = sublivelist_verify_func(&sv, dle);
297 	zfs_btree_clear(&sv.sv_leftover);
298 	zfs_btree_destroy(&sv.sv_leftover);
299 	return (err);
300 }
301 
/*
 * Per-metaslab state used when cross-checking livelist ALLOCs against
 * the metaslab's spacemap and the log spacemaps; see
 * livelist_metaslab_validate().
 */
typedef struct metaslab_verify {
	/*
	 * Tree containing all the leftover ALLOCs from the livelists
	 * that are part of this metaslab.
	 */
	zfs_btree_t mv_livelist_allocs;

	/*
	 * Metaslab information.
	 */
	uint64_t mv_vdid;	/* top-level vdev id */
	uint64_t mv_msid;	/* metaslab id within that vdev */
	uint64_t mv_start;	/* metaslab start offset */
	uint64_t mv_end;	/* metaslab end offset (exclusive) */

	/*
	 * What's currently allocated for this metaslab.
	 */
	range_tree_t *mv_allocated;
} metaslab_verify_t;
322 
/* Iterator applied to each deleted livelist by iterate_deleted_livelists(). */
typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg);

/* Callback invoked for every entry of every log spacemap. */
typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme, uint64_t txg,
    void *arg);

/*
 * Adapter state that funnels space_map_iterate() entries to a
 * zdb_log_sm_cb_t, carrying along the txg of the log being walked.
 */
typedef struct unflushed_iter_cb_arg {
	spa_t *uic_spa;
	uint64_t uic_txg;
	void *uic_arg;
	zdb_log_sm_cb_t uic_cb;
} unflushed_iter_cb_arg_t;
334 
335 static int
336 iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg)
337 {
338 	unflushed_iter_cb_arg_t *uic = arg;
339 	return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg));
340 }
341 
342 static void
343 iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg)
344 {
345 	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
346 		return;
347 
348 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
349 	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
350 	    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
351 		space_map_t *sm = NULL;
352 		VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
353 		    sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));
354 
355 		unflushed_iter_cb_arg_t uic = {
356 			.uic_spa = spa,
357 			.uic_txg = sls->sls_txg,
358 			.uic_arg = arg,
359 			.uic_cb = cb
360 		};
361 		VERIFY0(space_map_iterate(sm, space_map_length(sm),
362 		    iterate_through_spacemap_logs_cb, &uic));
363 		space_map_close(sm);
364 	}
365 	spa_config_exit(spa, SCL_CONFIG, FTAG);
366 }
367 
/*
 * Given a FREE spacemap entry [offset, offset + size) seen at txg on this
 * metaslab's vdev, report every livelist ALLOC (in mv_livelist_allocs)
 * that overlaps the range and was allocated at or before that txg --
 * such a block was freed out from under the livelist.
 */
static void
verify_livelist_allocs(metaslab_verify_t *mv, uint64_t txg,
    uint64_t offset, uint64_t size)
{
	sublivelist_verify_block_t svb = {{{0}}};
	DVA_SET_VDEV(&svb.svb_dva, mv->mv_vdid);
	DVA_SET_OFFSET(&svb.svb_dva, offset);
	DVA_SET_ASIZE(&svb.svb_dva, size);
	zfs_btree_index_t where;
	uint64_t end_offset = offset + size;

	/*
	 *  Look for an exact match for spacemap entry in the livelist entries.
	 *  Then, look for other livelist entries that fall within the range
	 *  of the spacemap entry as it may have been condensed
	 */
	sublivelist_verify_block_t *found =
	    zfs_btree_find(&mv->mv_livelist_allocs, &svb, &where);
	if (found == NULL) {
		/* no exact match; start from the next entry in tree order */
		found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where);
	}
	for (; found != NULL && DVA_GET_VDEV(&found->svb_dva) == mv->mv_vdid &&
	    DVA_GET_OFFSET(&found->svb_dva) < end_offset;
	    found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) {
		if (found->svb_allocated_txg <= txg) {
			(void) printf("ERROR: Livelist ALLOC [%llx:%llx] "
			    "from TXG %llx FREED at TXG %llx\n",
			    (u_longlong_t)DVA_GET_OFFSET(&found->svb_dva),
			    (u_longlong_t)DVA_GET_ASIZE(&found->svb_dva),
			    (u_longlong_t)found->svb_allocated_txg,
			    (u_longlong_t)txg);
		}
	}
}
402 
403 static int
404 metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg)
405 {
406 	metaslab_verify_t *mv = arg;
407 	uint64_t offset = sme->sme_offset;
408 	uint64_t size = sme->sme_run;
409 	uint64_t txg = sme->sme_txg;
410 
411 	if (sme->sme_type == SM_ALLOC) {
412 		if (range_tree_contains(mv->mv_allocated,
413 		    offset, size)) {
414 			(void) printf("ERROR: DOUBLE ALLOC: "
415 			    "%llu [%llx:%llx] "
416 			    "%llu:%llu LOG_SM\n",
417 			    (u_longlong_t)txg, (u_longlong_t)offset,
418 			    (u_longlong_t)size, (u_longlong_t)mv->mv_vdid,
419 			    (u_longlong_t)mv->mv_msid);
420 		} else {
421 			range_tree_add(mv->mv_allocated,
422 			    offset, size);
423 		}
424 	} else {
425 		if (!range_tree_contains(mv->mv_allocated,
426 		    offset, size)) {
427 			(void) printf("ERROR: DOUBLE FREE: "
428 			    "%llu [%llx:%llx] "
429 			    "%llu:%llu LOG_SM\n",
430 			    (u_longlong_t)txg, (u_longlong_t)offset,
431 			    (u_longlong_t)size, (u_longlong_t)mv->mv_vdid,
432 			    (u_longlong_t)mv->mv_msid);
433 		} else {
434 			range_tree_remove(mv->mv_allocated,
435 			    offset, size);
436 		}
437 	}
438 
439 	if (sme->sme_type != SM_ALLOC) {
440 		/*
441 		 * If something is freed in the spacemap, verify that
442 		 * it is not listed as allocated in the livelist.
443 		 */
444 		verify_livelist_allocs(mv, txg, offset, size);
445 	}
446 	return (0);
447 }
448 
449 static int
450 spacemap_check_sm_log_cb(spa_t *spa, space_map_entry_t *sme,
451     uint64_t txg, void *arg)
452 {
453 	metaslab_verify_t *mv = arg;
454 	uint64_t offset = sme->sme_offset;
455 	uint64_t vdev_id = sme->sme_vdev;
456 
457 	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
458 
459 	/* skip indirect vdevs */
460 	if (!vdev_is_concrete(vd))
461 		return (0);
462 
463 	if (vdev_id != mv->mv_vdid)
464 		return (0);
465 
466 	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
467 	if (ms->ms_id != mv->mv_msid)
468 		return (0);
469 
470 	if (txg < metaslab_unflushed_txg(ms))
471 		return (0);
472 
473 
474 	ASSERT3U(txg, ==, sme->sme_txg);
475 	return (metaslab_spacemap_validation_cb(sme, mv));
476 }
477 
/*
 * Validate this metaslab's entries in the pool-wide log spacemaps;
 * entries for other vdevs/metaslabs are filtered out by the callback.
 */
static void
spacemap_check_sm_log(spa_t *spa, metaslab_verify_t *mv)
{
	iterate_through_spacemap_logs(spa, spacemap_check_sm_log_cb, mv);
}
483 
484 static void
485 spacemap_check_ms_sm(space_map_t  *sm, metaslab_verify_t *mv)
486 {
487 	if (sm == NULL)
488 		return;
489 
490 	VERIFY0(space_map_iterate(sm, space_map_length(sm),
491 	    metaslab_spacemap_validation_cb, mv));
492 }
493 
494 static void iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg);
495 
/*
 * Transfer blocks from sv_leftover tree to the mv_livelist_allocs if
 * they are part of that metaslab (mv_msid).  Blocks that straddle a
 * metaslab boundary are reported and left out of both trees' checks.
 */
static void
mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv)
{
	zfs_btree_index_t where;
	sublivelist_verify_block_t *svb;
	ASSERT3U(zfs_btree_numnodes(&mv->mv_livelist_allocs), ==, 0);
	for (svb = zfs_btree_first(&sv->sv_leftover, &where);
	    svb != NULL;
	    svb = zfs_btree_next(&sv->sv_leftover, &where, &where)) {
		if (DVA_GET_VDEV(&svb->svb_dva) != mv->mv_vdid)
			continue;

		/* block starts before this metaslab but extends into it */
		if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start &&
		    (DVA_GET_OFFSET(&svb->svb_dva) +
		    DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_start) {
			(void) printf("ERROR: Found block that crosses "
			    "metaslab boundary: <%llu:%llx:%llx>\n",
			    (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva),
			    (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
			    (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva));
			continue;
		}

		/* block lies entirely before this metaslab */
		if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start)
			continue;

		/* block starts at or beyond this metaslab's end */
		if (DVA_GET_OFFSET(&svb->svb_dva) >= mv->mv_end)
			continue;

		/* block starts inside but extends past the end */
		if ((DVA_GET_OFFSET(&svb->svb_dva) +
		    DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_end) {
			(void) printf("ERROR: Found block that crosses "
			    "metaslab boundary: <%llu:%llx:%llx>\n",
			    (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva),
			    (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
			    (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva));
			continue;
		}

		zfs_btree_add(&mv->mv_livelist_allocs, svb);
	}

	/*
	 * Remove the transferred blocks from sv_leftover so each block is
	 * only checked against a single metaslab.
	 */
	for (svb = zfs_btree_first(&mv->mv_livelist_allocs, &where);
	    svb != NULL;
	    svb = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) {
		zfs_btree_remove(&sv->sv_leftover, svb);
	}
}
548 
549 /*
550  * [Livelist Check]
551  * Iterate through all the sublivelists and:
552  * - report leftover frees (**)
553  * - record leftover ALLOCs together with their TXG [see Cross Check]
554  *
555  * (**) Note: Double ALLOCs are valid in datasets that have dedup
556  *      enabled. Similarly double FREEs are allowed as well but
557  *      only if they pair up with a corresponding ALLOC entry once
558  *      we are done with our sublivelist iteration.
559  *
560  * [Spacemap Check]
561  * for each metaslab:
562  * - iterate over spacemap and then the metaslab's entries in the
563  *   spacemap log, then report any double FREEs and ALLOCs (do not
564  *   blow up).
565  *
566  * [Cross Check]
567  * After finishing the Livelist Check phase and while being in the
568  * Spacemap Check phase, we find all the recorded leftover ALLOCs
569  * of the livelist check that are part of the metaslab that we are
570  * currently looking at in the Spacemap Check. We report any entries
571  * that are marked as ALLOCs in the livelists but have been actually
572  * freed (and potentially allocated again) after their TXG stamp in
573  * the spacemaps. Also report any ALLOCs from the livelists that
574  * belong to indirect vdevs (e.g. their vdev completed removal).
575  *
576  * Note that this will miss Log Spacemap entries that cancelled each other
577  * out before being flushed to the metaslab, so we are not guaranteed
578  * to match all erroneous ALLOCs.
579  */
/*
 * Driver for the livelist/spacemap cross-check described in the comment
 * above: verify every deleted livelist, then walk each concrete vdev's
 * metaslabs validating spacemap + log-spacemap entries against the
 * leftover livelist ALLOCs.  Anything still left over afterwards must
 * belong to an indirect vdev and is reported.
 */
static void
livelist_metaslab_validate(spa_t *spa)
{
	(void) printf("Verifying deleted livelist entries\n");

	sublivelist_verify_t sv;
	zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL,
	    sizeof (sublivelist_verify_block_t));
	iterate_deleted_livelists(spa, livelist_verify, &sv);

	(void) printf("Verifying metaslab entries\n");
	vdev_t *rvd = spa->spa_root_vdev;
	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
		vdev_t *vd = rvd->vdev_child[c];

		/* indirect vdevs are handled by the leftover report below */
		if (!vdev_is_concrete(vd))
			continue;

		for (uint64_t mid = 0; mid < vd->vdev_ms_count; mid++) {
			metaslab_t *m = vd->vdev_ms[mid];

			/* progress indicator on stderr */
			(void) fprintf(stderr,
			    "\rverifying concrete vdev %llu, "
			    "metaslab %llu of %llu ...",
			    (longlong_t)vd->vdev_id,
			    (longlong_t)mid,
			    (longlong_t)vd->vdev_ms_count);

			uint64_t shift, start;
			range_seg_type_t type =
			    metaslab_calculate_range_tree_type(vd, m,
			    &start, &shift);
			metaslab_verify_t mv;
			mv.mv_allocated = range_tree_create(NULL,
			    type, NULL, start, shift);
			mv.mv_vdid = vd->vdev_id;
			mv.mv_msid = m->ms_id;
			mv.mv_start = m->ms_start;
			mv.mv_end = m->ms_start + m->ms_size;
			zfs_btree_create(&mv.mv_livelist_allocs,
			    livelist_block_compare, NULL,
			    sizeof (sublivelist_verify_block_t));

			/* pull this metaslab's leftover ALLOCs out of sv */
			mv_populate_livelist_allocs(&mv, &sv);

			spacemap_check_ms_sm(m->ms_sm, &mv);
			spacemap_check_sm_log(spa, &mv);

			range_tree_vacate(mv.mv_allocated, NULL, NULL);
			range_tree_destroy(mv.mv_allocated);
			zfs_btree_clear(&mv.mv_livelist_allocs);
			zfs_btree_destroy(&mv.mv_livelist_allocs);
		}
	}
	(void) fprintf(stderr, "\n");

	/*
	 * If there are any segments in the leftover tree after we walked
	 * through all the metaslabs in the concrete vdevs then this means
	 * that we have segments in the livelists that belong to indirect
	 * vdevs and are marked as allocated.
	 */
	if (zfs_btree_numnodes(&sv.sv_leftover) == 0) {
		zfs_btree_destroy(&sv.sv_leftover);
		return;
	}
	(void) printf("ERROR: Found livelist blocks marked as allocated "
	    "for indirect vdevs:\n");

	zfs_btree_index_t *where = NULL;
	sublivelist_verify_block_t *svb;
	while ((svb = zfs_btree_destroy_nodes(&sv.sv_leftover, &where)) !=
	    NULL) {
		int vdev_id = DVA_GET_VDEV(&svb->svb_dva);
		ASSERT3U(vdev_id, <, rvd->vdev_children);
		vdev_t *vd = rvd->vdev_child[vdev_id];
		ASSERT(!vdev_is_concrete(vd));
		(void) printf("<%d:%llx:%llx> TXG %llx\n",
		    vdev_id, (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva),
		    (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva),
		    (u_longlong_t)svb->svb_allocated_txg);
	}
	(void) printf("\n");
	zfs_btree_destroy(&sv.sv_leftover);
}
665 
/*
 * These libumem hooks provide a reasonable set of defaults for the allocator's
 * debugging facilities.
 */

/* Default $UMEM_DEBUG setting when none is supplied by the environment. */
const char *
_umem_debug_init(void)
{
	return ("default,verbose"); /* $UMEM_DEBUG setting */
}

/* Default $UMEM_LOGGING setting when none is supplied by the environment. */
const char *
_umem_logging_init(void)
{
	return ("fail,contents"); /* $UMEM_LOGGING setting */
}
681 
/*
 * Print the full usage/help text to stderr and exit(1).  Invoked both
 * for -h style help and for command-line errors.
 */
static void
usage(void)
{
	/* Synopses: one line per major mode, all prefixed with cmdname. */
	(void) fprintf(stderr,
	    "Usage:\t%s [-AbcdDFGhikLMPsvXy] [-e [-V] [-p <path> ...]] "
	    "[-I <inflight I/Os>]\n"
	    "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
	    "\t\t[-K <key>]\n"
	    "\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]]\n"
	    "\t%s [-AdiPv] [-e [-V] [-p <path> ...]] [-U <cache>] [-K <key>]\n"
	    "\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]\n"
	    "\t%s -B [-e [-V] [-p <path> ...]] [-I <inflight I/Os>]\n"
	    "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
	    "\t\t[-K <key>] <poolname>/<objset id> [<backupflags>]\n"
	    "\t%s [-v] <bookmark>\n"
	    "\t%s -C [-A] [-U <cache>] [<poolname>]\n"
	    "\t%s -l [-Aqu] <device>\n"
	    "\t%s -m [-AFLPX] [-e [-V] [-p <path> ...]] [-t <txg>] "
	    "[-U <cache>]\n\t\t<poolname> [<vdev> [<metaslab> ...]]\n"
	    "\t%s -O [-K <key>] <dataset> <path>\n"
	    "\t%s -r [-K <key>] <dataset> <path> <destination>\n"
	    "\t%s -R [-A] [-e [-V] [-p <path> ...]] [-U <cache>]\n"
	    "\t\t<poolname> <vdev>:<offset>:<size>[:<flags>]\n"
	    "\t%s -E [-A] word0:word1:...:word15\n"
	    "\t%s -S [-AP] [-e [-V] [-p <path> ...]] [-U <cache>] "
	    "<poolname>\n\n",
	    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname,
	    cmdname, cmdname, cmdname, cmdname, cmdname);

	(void) fprintf(stderr, "    Dataset name must include at least one "
	    "separator character '/' or '@'\n");
	(void) fprintf(stderr, "    If dataset name is specified, only that "
	    "dataset is dumped\n");
	(void) fprintf(stderr,  "    If object numbers or object number "
	    "ranges are specified, only those\n"
	    "    objects or ranges are dumped.\n\n");
	(void) fprintf(stderr,
	    "    Object ranges take the form <start>:<end>[:<flags>]\n"
	    "        start    Starting object number\n"
	    "        end      Ending object number, or -1 for no upper bound\n"
	    "        flags    Optional flags to select object types:\n"
	    "            A     All objects (this is the default)\n"
	    "            d     ZFS directories\n"
	    "            f     ZFS files \n"
	    "            m     SPA space maps\n"
	    "            z     ZAPs\n"
	    "            -     Negate effect of next flag\n\n");
	/* Options that select what gets dumped. */
	(void) fprintf(stderr, "    Options to control amount of output:\n");
	(void) fprintf(stderr, "        -b --block-stats             "
	    "block statistics\n");
	(void) fprintf(stderr, "        -B --backup                  "
	    "backup stream\n");
	(void) fprintf(stderr, "        -c --checksum                "
	    "checksum all metadata (twice for all data) blocks\n");
	(void) fprintf(stderr, "        -C --config                  "
	    "config (or cachefile if alone)\n");
	(void) fprintf(stderr, "        -d --datasets                "
	    "dataset(s)\n");
	(void) fprintf(stderr, "        -D --dedup-stats             "
	    "dedup statistics\n");
	(void) fprintf(stderr, "        -E --embedded-block-pointer=INTEGER\n"
	    "                                     decode and display block "
	    "from an embedded block pointer\n");
	(void) fprintf(stderr, "        -h --history                 "
	    "pool history\n");
	(void) fprintf(stderr, "        -i --intent-logs             "
	    "intent logs\n");
	(void) fprintf(stderr, "        -l --label                   "
	    "read label contents\n");
	(void) fprintf(stderr, "        -k --checkpointed-state      "
	    "examine the checkpointed state of the pool\n");
	(void) fprintf(stderr, "        -L --disable-leak-tracking   "
	    "disable leak tracking (do not load spacemaps)\n");
	(void) fprintf(stderr, "        -m --metaslabs               "
	    "metaslabs\n");
	(void) fprintf(stderr, "        -M --metaslab-groups         "
	    "metaslab groups\n");
	(void) fprintf(stderr, "        -O --object-lookups          "
	    "perform object lookups by path\n");
	(void) fprintf(stderr, "        -r --copy-object             "
	    "copy an object by path to file\n");
	(void) fprintf(stderr, "        -R --read-block              "
	    "read and display block from a device\n");
	(void) fprintf(stderr, "        -s --io-stats                "
	    "report stats on zdb's I/O\n");
	(void) fprintf(stderr, "        -S --simulate-dedup          "
	    "simulate dedup to measure effect\n");
	(void) fprintf(stderr, "        -v --verbose                 "
	    "verbose (applies to all others)\n");
	(void) fprintf(stderr, "        -y --livelist                "
	    "perform livelist and metaslab validation on any livelists being "
	    "deleted\n\n");
	/* Options that modify the behavior of the selections above. */
	(void) fprintf(stderr, "    Below options are intended for use "
	    "with other options:\n");
	(void) fprintf(stderr, "        -A --ignore-assertions       "
	    "ignore assertions (-A), enable panic recovery (-AA) or both "
	    "(-AAA)\n");
	(void) fprintf(stderr, "        -e --exported                "
	    "pool is exported/destroyed/has altroot/not in a cachefile\n");
	(void) fprintf(stderr, "        -F --automatic-rewind        "
	    "attempt automatic rewind within safe range of transaction "
	    "groups\n");
	(void) fprintf(stderr, "        -G --dump-debug-msg          "
	    "dump zfs_dbgmsg buffer before exiting\n");
	(void) fprintf(stderr, "        -I --inflight=INTEGER        "
	    "specify the maximum number of checksumming I/Os "
	    "[default is 200]\n");
	(void) fprintf(stderr, "        -K --key=KEY                 "
	    "decryption key for encrypted dataset\n");
	(void) fprintf(stderr, "        -o --option=\"OPTION=INTEGER\" "
	    "set global variable to an unsigned 32-bit integer\n");
	(void) fprintf(stderr, "        -p --path==PATH              "
	    "use one or more with -e to specify path to vdev dir\n");
	(void) fprintf(stderr, "        -P --parseable               "
	    "print numbers in parseable form\n");
	(void) fprintf(stderr, "        -q --skip-label              "
	    "don't print label contents\n");
	(void) fprintf(stderr, "        -t --txg=INTEGER             "
	    "highest txg to use when searching for uberblocks\n");
	(void) fprintf(stderr, "        -T --brt-stats               "
	    "BRT statistics\n");
	(void) fprintf(stderr, "        -u --uberblock               "
	    "uberblock\n");
	(void) fprintf(stderr, "        -U --cachefile=PATH          "
	    "use alternate cachefile\n");
	(void) fprintf(stderr, "        -V --verbatim                "
	    "do verbatim import\n");
	(void) fprintf(stderr, "        -x --dump-blocks=PATH        "
	    "dump all read blocks into specified directory\n");
	(void) fprintf(stderr, "        -X --extreme-rewind          "
	    "attempt extreme rewind (does not work with dataset)\n");
	(void) fprintf(stderr, "        -Y --all-reconstruction      "
	    "attempt all reconstruction combinations for split blocks\n");
	(void) fprintf(stderr, "        -Z --zstd-headers            "
	    "show ZSTD headers \n");
	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
	    "to make only that option verbose\n");
	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
	exit(1);
}
822 
823 static void
824 dump_debug_buffer(void)
825 {
826 	if (dump_opt['G']) {
827 		(void) printf("\n");
828 		(void) fflush(stdout);
829 		zfs_dbgmsg_print("zdb");
830 	}
831 }
832 
/*
 * Called for usage errors that are discovered after a call to spa_open(),
 * dmu_bonus_hold(), or pool_match().  abort() is called for other errors.
 *
 * Prints "<cmdname>: <formatted message>\n" to stderr, dumps the debug
 * buffer (if -G was given) and exits with status 1; never returns.
 */

static void
fatal(const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	(void) fprintf(stderr, "%s: ", cmdname);
	(void) vfprintf(stderr, fmt, ap);
	va_end(ap);
	(void) fprintf(stderr, "\n");

	dump_debug_buffer();

	exit(1);
}
853 
854 static void
855 dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
856 {
857 	(void) size;
858 	nvlist_t *nv;
859 	size_t nvsize = *(uint64_t *)data;
860 	char *packed = umem_alloc(nvsize, UMEM_NOFAIL);
861 
862 	VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH));
863 
864 	VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);
865 
866 	umem_free(packed, nvsize);
867 
868 	dump_nvlist(nv, 8);
869 
870 	nvlist_free(nv);
871 }
872 
873 static void
874 dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size)
875 {
876 	(void) os, (void) object, (void) size;
877 	spa_history_phys_t *shp = data;
878 
879 	if (shp == NULL)
880 		return;
881 
882 	(void) printf("\t\tpool_create_len = %llu\n",
883 	    (u_longlong_t)shp->sh_pool_create_len);
884 	(void) printf("\t\tphys_max_off = %llu\n",
885 	    (u_longlong_t)shp->sh_phys_max_off);
886 	(void) printf("\t\tbof = %llu\n",
887 	    (u_longlong_t)shp->sh_bof);
888 	(void) printf("\t\teof = %llu\n",
889 	    (u_longlong_t)shp->sh_eof);
890 	(void) printf("\t\trecords_lost = %llu\n",
891 	    (u_longlong_t)shp->sh_records_lost);
892 }
893 
894 static void
895 zdb_nicenum(uint64_t num, char *buf, size_t buflen)
896 {
897 	if (dump_opt['P'])
898 		(void) snprintf(buf, buflen, "%llu", (longlong_t)num);
899 	else
900 		nicenum(num, buf, buflen);
901 }
902 
903 static void
904 zdb_nicebytes(uint64_t bytes, char *buf, size_t buflen)
905 {
906 	if (dump_opt['P'])
907 		(void) snprintf(buf, buflen, "%llu", (longlong_t)bytes);
908 	else
909 		zfs_nicebytes(bytes, buf, buflen);
910 }
911 
912 static const char histo_stars[] = "****************************************";
913 static const uint64_t histo_width = sizeof (histo_stars) - 1;
914 
915 static void
916 dump_histogram(const uint64_t *histo, int size, int offset)
917 {
918 	int i;
919 	int minidx = size - 1;
920 	int maxidx = 0;
921 	uint64_t max = 0;
922 
923 	for (i = 0; i < size; i++) {
924 		if (histo[i] == 0)
925 			continue;
926 		if (histo[i] > max)
927 			max = histo[i];
928 		if (i > maxidx)
929 			maxidx = i;
930 		if (i < minidx)
931 			minidx = i;
932 	}
933 
934 	if (max < histo_width)
935 		max = histo_width;
936 
937 	for (i = minidx; i <= maxidx; i++) {
938 		(void) printf("\t\t\t%3u: %6llu %s\n",
939 		    i + offset, (u_longlong_t)histo[i],
940 		    &histo_stars[(max - histo[i]) * histo_width / max]);
941 	}
942 }
943 
/*
 * Print the zap_get_stats() summary for a ZAP object: a one-line summary
 * for a microzap, or the full fat-ZAP statistics (pointer table, counts,
 * header fields) plus several histograms.  If zap_get_stats() fails,
 * nothing is printed.
 */
static void
dump_zap_stats(objset_t *os, uint64_t object)
{
	int error;
	zap_stats_t zs;

	error = zap_get_stats(os, object, &zs);
	if (error)
		return;

	/* An empty pointer table means this is a microzap. */
	if (zs.zs_ptrtbl_len == 0) {
		ASSERT(zs.zs_num_blocks == 1);
		(void) printf("\tmicrozap: %llu bytes, %llu entries\n",
		    (u_longlong_t)zs.zs_blocksize,
		    (u_longlong_t)zs.zs_num_entries);
		return;
	}

	(void) printf("\tFat ZAP stats:\n");

	(void) printf("\t\tPointer table:\n");
	(void) printf("\t\t\t%llu elements\n",
	    (u_longlong_t)zs.zs_ptrtbl_len);
	(void) printf("\t\t\tzt_blk: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_zt_blk);
	(void) printf("\t\t\tzt_numblks: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_zt_numblks);
	(void) printf("\t\t\tzt_shift: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_zt_shift);
	(void) printf("\t\t\tzt_blks_copied: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_blks_copied);
	(void) printf("\t\t\tzt_nextblk: %llu\n",
	    (u_longlong_t)zs.zs_ptrtbl_nextblk);

	(void) printf("\t\tZAP entries: %llu\n",
	    (u_longlong_t)zs.zs_num_entries);
	(void) printf("\t\tLeaf blocks: %llu\n",
	    (u_longlong_t)zs.zs_num_leafs);
	(void) printf("\t\tTotal blocks: %llu\n",
	    (u_longlong_t)zs.zs_num_blocks);
	(void) printf("\t\tzap_block_type: 0x%llx\n",
	    (u_longlong_t)zs.zs_block_type);
	(void) printf("\t\tzap_magic: 0x%llx\n",
	    (u_longlong_t)zs.zs_magic);
	(void) printf("\t\tzap_salt: 0x%llx\n",
	    (u_longlong_t)zs.zs_salt);

	/* Distribution histograms collected by zap_get_stats(). */
	(void) printf("\t\tLeafs with 2^n pointers:\n");
	dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0);

	(void) printf("\t\tBlocks with n*5 entries:\n");
	dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0);

	(void) printf("\t\tBlocks n/10 full:\n");
	dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0);

	(void) printf("\t\tEntries with n chunks:\n");
	dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0);

	(void) printf("\t\tBuckets with n entries:\n");
	dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0);
}
1006 
1007 static void
1008 dump_none(objset_t *os, uint64_t object, void *data, size_t size)
1009 {
1010 	(void) os, (void) object, (void) data, (void) size;
1011 }
1012 
1013 static void
1014 dump_unknown(objset_t *os, uint64_t object, void *data, size_t size)
1015 {
1016 	(void) os, (void) object, (void) data, (void) size;
1017 	(void) printf("\tUNKNOWN OBJECT TYPE\n");
1018 }
1019 
1020 static void
1021 dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
1022 {
1023 	(void) os, (void) object, (void) data, (void) size;
1024 }
1025 
/*
 * Dump an object of uint64s as a bracketed, comma-separated hex list,
 * four values per line.  Only active at dump verbosity >= 6 (-dddddd).
 * If no data buffer was supplied, the object contents are read directly,
 * capped at 1 MiB; truncation is marked with an ellipsis.
 */
static void
dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)
{
	uint64_t *arr;
	uint64_t oursize;
	if (dump_opt['d'] < 6)
		return;

	if (data == NULL) {
		dmu_object_info_t doi;

		VERIFY0(dmu_object_info(os, object, &doi));
		size = doi.doi_max_offset;
		/*
		 * We cap the size at 1 mebibyte here to prevent
		 * allocation failures and nigh-infinite printing if the
		 * object is extremely large.
		 */
		oursize = MIN(size, 1 << 20);
		arr = kmem_alloc(oursize, KM_SLEEP);

		int err = dmu_read(os, object, 0, oursize, arr, 0);
		if (err != 0) {
			(void) printf("got error %u from dmu_read\n", err);
			kmem_free(arr, oursize);
			return;
		}
	} else {
		/*
		 * Even though the allocation is already done in this code path,
		 * we still cap the size to prevent excessive printing.
		 */
		oursize = MIN(size, 1 << 20);
		arr = data;
	}

	/* Empty object: free our buffer (if we allocated one) and bail. */
	if (size == 0) {
		if (data == NULL)
			kmem_free(arr, oursize);
		(void) printf("\t\t[]\n");
		return;
	}

	(void) printf("\t\t[%0llx", (u_longlong_t)arr[0]);
	for (size_t i = 1; i * sizeof (uint64_t) < oursize; i++) {
		/* Wrap to a new line after every fourth value. */
		if (i % 4 != 0)
			(void) printf(", %0llx", (u_longlong_t)arr[i]);
		else
			(void) printf(",\n\t\t%0llx", (u_longlong_t)arr[i]);
	}
	/* Mark output truncated by the 1 MiB cap. */
	if (oursize != size)
		(void) printf(", ... ");
	(void) printf("]\n");

	/* We own arr only when we read the object ourselves. */
	if (data == NULL)
		kmem_free(arr, oursize);
}
1083 
/*
 * Generic ZAP dumper: print ZAP statistics, then every attribute with a
 * decoded value.  One-byte-wide attributes are printed as strings,
 * except for the DSL crypto key / checksum salt attributes, which are
 * raw key material and are hex-dumped instead.  Wider attributes are
 * printed as integer lists.
 */
static void
dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	zap_cursor_t zc;
	zap_attribute_t attr;
	void *prop;
	unsigned i;

	dump_zap_stats(os, object);
	(void) printf("\n");

	for (zap_cursor_init(&zc, os, object);
	    zap_cursor_retrieve(&zc, &attr) == 0;
	    zap_cursor_advance(&zc)) {
		(void) printf("\t\t%s = ", attr.za_name);
		if (attr.za_num_integers == 0) {
			(void) printf("\n");
			continue;
		}
		prop = umem_zalloc(attr.za_num_integers *
		    attr.za_integer_length, UMEM_NOFAIL);
		(void) zap_lookup(os, object, attr.za_name,
		    attr.za_integer_length, attr.za_num_integers, prop);
		if (attr.za_integer_length == 1) {
			/*
			 * Crypto keys and the checksum salt are binary
			 * blobs, not strings: hex-dump them.
			 */
			if (strcmp(attr.za_name,
			    DSL_CRYPTO_KEY_MASTER_KEY) == 0 ||
			    strcmp(attr.za_name,
			    DSL_CRYPTO_KEY_HMAC_KEY) == 0 ||
			    strcmp(attr.za_name, DSL_CRYPTO_KEY_IV) == 0 ||
			    strcmp(attr.za_name, DSL_CRYPTO_KEY_MAC) == 0 ||
			    strcmp(attr.za_name, DMU_POOL_CHECKSUM_SALT) == 0) {
				uint8_t *u8 = prop;

				for (i = 0; i < attr.za_num_integers; i++) {
					(void) printf("%02x", u8[i]);
				}
			} else {
				(void) printf("%s", (char *)prop);
			}
		} else {
			for (i = 0; i < attr.za_num_integers; i++) {
				switch (attr.za_integer_length) {
				case 2:
					(void) printf("%u ",
					    ((uint16_t *)prop)[i]);
					break;
				case 4:
					(void) printf("%u ",
					    ((uint32_t *)prop)[i]);
					break;
				case 8:
					(void) printf("%lld ",
					    (u_longlong_t)((int64_t *)prop)[i]);
					break;
				}
			}
		}
		(void) printf("\n");
		umem_free(prop, attr.za_num_integers * attr.za_integer_length);
	}
	zap_cursor_fini(&zc);
}
1147 
/*
 * Dump a bpobj header (bpobj_phys_t in the bonus buffer).  Fields are
 * printed conditionally on the on-disk size, since older pool versions
 * have shorter bpobj_phys_t layouts (V1 lacks comp/uncomp, V2 lacks the
 * subobj fields).  At verbosity >= 5 the block pointers themselves are
 * read from the object and printed.
 */
static void
dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size)
{
	bpobj_phys_t *bpop = data;
	uint64_t i;
	char bytes[32], comp[32], uncomp[32];

	/* make sure the output won't get truncated */
	_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");
	_Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated");
	_Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated");

	if (bpop == NULL)
		return;

	zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes));
	zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp));
	zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp));

	(void) printf("\t\tnum_blkptrs = %llu\n",
	    (u_longlong_t)bpop->bpo_num_blkptrs);
	(void) printf("\t\tbytes = %s\n", bytes);
	if (size >= BPOBJ_SIZE_V1) {
		(void) printf("\t\tcomp = %s\n", comp);
		(void) printf("\t\tuncomp = %s\n", uncomp);
	}
	if (size >= BPOBJ_SIZE_V2) {
		(void) printf("\t\tsubobjs = %llu\n",
		    (u_longlong_t)bpop->bpo_subobjs);
		(void) printf("\t\tnum_subobjs = %llu\n",
		    (u_longlong_t)bpop->bpo_num_subobjs);
	}
	if (size >= sizeof (*bpop)) {
		(void) printf("\t\tnum_freed = %llu\n",
		    (u_longlong_t)bpop->bpo_num_freed);
	}

	if (dump_opt['d'] < 5)
		return;

	/* Verbose mode: print each block pointer stored in the object. */
	for (i = 0; i < bpop->bpo_num_blkptrs; i++) {
		char blkbuf[BP_SPRINTF_LEN];
		blkptr_t bp;

		int err = dmu_read(os, object,
		    i * sizeof (bp), sizeof (bp), &bp, 0);
		if (err != 0) {
			(void) printf("got error %u from dmu_read\n", err);
			break;
		}
		snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp,
		    BP_GET_FREE(&bp));
		(void) printf("\t%s\n", blkbuf);
	}
}
1203 
1204 static void
1205 dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size)
1206 {
1207 	(void) data, (void) size;
1208 	dmu_object_info_t doi;
1209 	int64_t i;
1210 
1211 	VERIFY0(dmu_object_info(os, object, &doi));
1212 	uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP);
1213 
1214 	int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0);
1215 	if (err != 0) {
1216 		(void) printf("got error %u from dmu_read\n", err);
1217 		kmem_free(subobjs, doi.doi_max_offset);
1218 		return;
1219 	}
1220 
1221 	int64_t last_nonzero = -1;
1222 	for (i = 0; i < doi.doi_max_offset / 8; i++) {
1223 		if (subobjs[i] != 0)
1224 			last_nonzero = i;
1225 	}
1226 
1227 	for (i = 0; i <= last_nonzero; i++) {
1228 		(void) printf("\t%llu\n", (u_longlong_t)subobjs[i]);
1229 	}
1230 	kmem_free(subobjs, doi.doi_max_offset);
1231 }
1232 
1233 static void
1234 dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
1235 {
1236 	(void) data, (void) size;
1237 	dump_zap_stats(os, object);
1238 	/* contents are printed elsewhere, properly decoded */
1239 }
1240 
1241 static void
1242 dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)
1243 {
1244 	(void) data, (void) size;
1245 	zap_cursor_t zc;
1246 	zap_attribute_t attr;
1247 
1248 	dump_zap_stats(os, object);
1249 	(void) printf("\n");
1250 
1251 	for (zap_cursor_init(&zc, os, object);
1252 	    zap_cursor_retrieve(&zc, &attr) == 0;
1253 	    zap_cursor_advance(&zc)) {
1254 		(void) printf("\t\t%s = ", attr.za_name);
1255 		if (attr.za_num_integers == 0) {
1256 			(void) printf("\n");
1257 			continue;
1258 		}
1259 		(void) printf(" %llx : [%d:%d:%d]\n",
1260 		    (u_longlong_t)attr.za_first_integer,
1261 		    (int)ATTR_LENGTH(attr.za_first_integer),
1262 		    (int)ATTR_BSWAP(attr.za_first_integer),
1263 		    (int)ATTR_NUM(attr.za_first_integer));
1264 	}
1265 	zap_cursor_fini(&zc);
1266 }
1267 
1268 static void
1269 dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)
1270 {
1271 	(void) data, (void) size;
1272 	zap_cursor_t zc;
1273 	zap_attribute_t attr;
1274 	uint16_t *layout_attrs;
1275 	unsigned i;
1276 
1277 	dump_zap_stats(os, object);
1278 	(void) printf("\n");
1279 
1280 	for (zap_cursor_init(&zc, os, object);
1281 	    zap_cursor_retrieve(&zc, &attr) == 0;
1282 	    zap_cursor_advance(&zc)) {
1283 		(void) printf("\t\t%s = [", attr.za_name);
1284 		if (attr.za_num_integers == 0) {
1285 			(void) printf("\n");
1286 			continue;
1287 		}
1288 
1289 		VERIFY(attr.za_integer_length == 2);
1290 		layout_attrs = umem_zalloc(attr.za_num_integers *
1291 		    attr.za_integer_length, UMEM_NOFAIL);
1292 
1293 		VERIFY(zap_lookup(os, object, attr.za_name,
1294 		    attr.za_integer_length,
1295 		    attr.za_num_integers, layout_attrs) == 0);
1296 
1297 		for (i = 0; i != attr.za_num_integers; i++)
1298 			(void) printf(" %d ", (int)layout_attrs[i]);
1299 		(void) printf("]\n");
1300 		umem_free(layout_attrs,
1301 		    attr.za_num_integers * attr.za_integer_length);
1302 	}
1303 	zap_cursor_fini(&zc);
1304 }
1305 
1306 static void
1307 dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
1308 {
1309 	(void) data, (void) size;
1310 	zap_cursor_t zc;
1311 	zap_attribute_t attr;
1312 	const char *typenames[] = {
1313 		/* 0 */ "not specified",
1314 		/* 1 */ "FIFO",
1315 		/* 2 */ "Character Device",
1316 		/* 3 */ "3 (invalid)",
1317 		/* 4 */ "Directory",
1318 		/* 5 */ "5 (invalid)",
1319 		/* 6 */ "Block Device",
1320 		/* 7 */ "7 (invalid)",
1321 		/* 8 */ "Regular File",
1322 		/* 9 */ "9 (invalid)",
1323 		/* 10 */ "Symbolic Link",
1324 		/* 11 */ "11 (invalid)",
1325 		/* 12 */ "Socket",
1326 		/* 13 */ "Door",
1327 		/* 14 */ "Event Port",
1328 		/* 15 */ "15 (invalid)",
1329 	};
1330 
1331 	dump_zap_stats(os, object);
1332 	(void) printf("\n");
1333 
1334 	for (zap_cursor_init(&zc, os, object);
1335 	    zap_cursor_retrieve(&zc, &attr) == 0;
1336 	    zap_cursor_advance(&zc)) {
1337 		(void) printf("\t\t%s = %lld (type: %s)\n",
1338 		    attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer),
1339 		    typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]);
1340 	}
1341 	zap_cursor_fini(&zc);
1342 }
1343 
1344 static int
1345 get_dtl_refcount(vdev_t *vd)
1346 {
1347 	int refcount = 0;
1348 
1349 	if (vd->vdev_ops->vdev_op_leaf) {
1350 		space_map_t *sm = vd->vdev_dtl_sm;
1351 
1352 		if (sm != NULL &&
1353 		    sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
1354 			return (1);
1355 		return (0);
1356 	}
1357 
1358 	for (unsigned c = 0; c < vd->vdev_children; c++)
1359 		refcount += get_dtl_refcount(vd->vdev_child[c]);
1360 	return (refcount);
1361 }
1362 
1363 static int
1364 get_metaslab_refcount(vdev_t *vd)
1365 {
1366 	int refcount = 0;
1367 
1368 	if (vd->vdev_top == vd) {
1369 		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
1370 			space_map_t *sm = vd->vdev_ms[m]->ms_sm;
1371 
1372 			if (sm != NULL &&
1373 			    sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
1374 				refcount++;
1375 		}
1376 	}
1377 	for (unsigned c = 0; c < vd->vdev_children; c++)
1378 		refcount += get_metaslab_refcount(vd->vdev_child[c]);
1379 
1380 	return (refcount);
1381 }
1382 
1383 static int
1384 get_obsolete_refcount(vdev_t *vd)
1385 {
1386 	uint64_t obsolete_sm_object;
1387 	int refcount = 0;
1388 
1389 	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
1390 	if (vd->vdev_top == vd && obsolete_sm_object != 0) {
1391 		dmu_object_info_t doi;
1392 		VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset,
1393 		    obsolete_sm_object, &doi));
1394 		if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
1395 			refcount++;
1396 		}
1397 	} else {
1398 		ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
1399 		ASSERT3U(obsolete_sm_object, ==, 0);
1400 	}
1401 	for (unsigned c = 0; c < vd->vdev_children; c++) {
1402 		refcount += get_obsolete_refcount(vd->vdev_child[c]);
1403 	}
1404 
1405 	return (refcount);
1406 }
1407 
1408 static int
1409 get_prev_obsolete_spacemap_refcount(spa_t *spa)
1410 {
1411 	uint64_t prev_obj =
1412 	    spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object;
1413 	if (prev_obj != 0) {
1414 		dmu_object_info_t doi;
1415 		VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_obj, &doi));
1416 		if (doi.doi_bonus_size == sizeof (space_map_phys_t)) {
1417 			return (1);
1418 		}
1419 	}
1420 	return (0);
1421 }
1422 
1423 static int
1424 get_checkpoint_refcount(vdev_t *vd)
1425 {
1426 	int refcount = 0;
1427 
1428 	if (vd->vdev_top == vd && vd->vdev_top_zap != 0 &&
1429 	    zap_contains(spa_meta_objset(vd->vdev_spa),
1430 	    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0)
1431 		refcount++;
1432 
1433 	for (uint64_t c = 0; c < vd->vdev_children; c++)
1434 		refcount += get_checkpoint_refcount(vd->vdev_child[c]);
1435 
1436 	return (refcount);
1437 }
1438 
1439 static int
1440 get_log_spacemap_refcount(spa_t *spa)
1441 {
1442 	return (avl_numnodes(&spa->spa_sm_logs_by_txg));
1443 }
1444 
1445 static int
1446 verify_spacemap_refcounts(spa_t *spa)
1447 {
1448 	uint64_t expected_refcount = 0;
1449 	uint64_t actual_refcount;
1450 
1451 	(void) feature_get_refcount(spa,
1452 	    &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM],
1453 	    &expected_refcount);
1454 	actual_refcount = get_dtl_refcount(spa->spa_root_vdev);
1455 	actual_refcount += get_metaslab_refcount(spa->spa_root_vdev);
1456 	actual_refcount += get_obsolete_refcount(spa->spa_root_vdev);
1457 	actual_refcount += get_prev_obsolete_spacemap_refcount(spa);
1458 	actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev);
1459 	actual_refcount += get_log_spacemap_refcount(spa);
1460 
1461 	if (expected_refcount != actual_refcount) {
1462 		(void) printf("space map refcount mismatch: expected %lld != "
1463 		    "actual %lld\n",
1464 		    (longlong_t)expected_refcount,
1465 		    (longlong_t)actual_refcount);
1466 		return (2);
1467 	}
1468 	return (0);
1469 }
1470 
/*
 * Dump a space map: header fields always; at -dddddd or -mmmm, every
 * entry as well, in both decoded form and with a running allocation
 * total.  Entries are read a word at a time; debug/padding entries,
 * single-word entries, and two-word entries are handled separately.
 * Finally the accumulated alloc total is checked against the space
 * map's own summary.
 */
static void
dump_spacemap(objset_t *os, space_map_t *sm)
{
	const char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
	    "INVALID", "INVALID", "INVALID", "INVALID" };

	if (sm == NULL)
		return;

	(void) printf("space map object %llu:\n",
	    (longlong_t)sm->sm_object);
	(void) printf("  smp_length = 0x%llx\n",
	    (longlong_t)sm->sm_phys->smp_length);
	(void) printf("  smp_alloc = 0x%llx\n",
	    (longlong_t)sm->sm_phys->smp_alloc);

	if (dump_opt['d'] < 6 && dump_opt['m'] < 4)
		return;

	/*
	 * Print out the freelist entries in both encoded and decoded form.
	 */
	uint8_t mapshift = sm->sm_shift;
	int64_t alloc = 0;
	uint64_t word, entry_id = 0;
	for (uint64_t offset = 0; offset < space_map_length(sm);
	    offset += sizeof (word)) {

		VERIFY0(dmu_read(os, space_map_object(sm), offset,
		    sizeof (word), &word, DMU_READ_PREFETCH));

		if (sm_entry_is_debug(word)) {
			/* Debug entries with txg 0 are padding. */
			uint64_t de_txg = SM_DEBUG_TXG_DECODE(word);
			uint64_t de_sync_pass = SM_DEBUG_SYNCPASS_DECODE(word);
			if (de_txg == 0) {
				(void) printf(
				    "\t    [%6llu] PADDING\n",
				    (u_longlong_t)entry_id);
			} else {
				(void) printf(
				    "\t    [%6llu] %s: txg %llu pass %llu\n",
				    (u_longlong_t)entry_id,
				    ddata[SM_DEBUG_ACTION_DECODE(word)],
				    (u_longlong_t)de_txg,
				    (u_longlong_t)de_sync_pass);
			}
			entry_id++;
			continue;
		}

		uint8_t words;
		char entry_type;
		uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID;

		if (sm_entry_is_single_word(word)) {
			entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ?
			    'A' : 'F';
			entry_off = (SM_OFFSET_DECODE(word) << mapshift) +
			    sm->sm_start;
			entry_run = SM_RUN_DECODE(word) << mapshift;
			words = 1;
		} else {
			/* it is a two-word entry so we read another word */
			ASSERT(sm_entry_is_double_word(word));

			uint64_t extra_word;
			offset += sizeof (extra_word);
			VERIFY0(dmu_read(os, space_map_object(sm), offset,
			    sizeof (extra_word), &extra_word,
			    DMU_READ_PREFETCH));

			ASSERT3U(offset, <=, space_map_length(sm));

			entry_run = SM2_RUN_DECODE(word) << mapshift;
			entry_vdev = SM2_VDEV_DECODE(word);
			entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ?
			    'A' : 'F';
			entry_off = (SM2_OFFSET_DECODE(extra_word) <<
			    mapshift) + sm->sm_start;
			words = 2;
		}

		(void) printf("\t    [%6llu]    %c  range:"
		    " %010llx-%010llx  size: %06llx vdev: %06llu words: %u\n",
		    (u_longlong_t)entry_id,
		    entry_type, (u_longlong_t)entry_off,
		    (u_longlong_t)(entry_off + entry_run),
		    (u_longlong_t)entry_run,
		    (u_longlong_t)entry_vdev, words);

		/* Track net allocation so it can be compared to the summary. */
		if (entry_type == 'A')
			alloc += entry_run;
		else
			alloc -= entry_run;
		entry_id++;
	}
	if (alloc != space_map_allocated(sm)) {
		(void) printf("space_map_object alloc (%lld) INCONSISTENT "
		    "with space map summary (%lld)\n",
		    (longlong_t)space_map_allocated(sm), (longlong_t)alloc);
	}
}
1573 
/*
 * Print in-memory statistics for a loaded metaslab: segment count,
 * largest allocatable segment, percentage free, and the in-memory
 * range-tree histogram.  Caller must have loaded the metaslab.
 */
static void
dump_metaslab_stats(metaslab_t *msp)
{
	char maxbuf[32];
	range_tree_t *rt = msp->ms_allocatable;
	zfs_btree_t *t = &msp->ms_allocatable_by_size;
	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (maxbuf) >= NN_NUMBUF_SZ, "maxbuf truncated");

	zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf));

	(void) printf("\t %25s %10lu   %7s  %6s   %4s %4d%%\n",
	    "segments", zfs_btree_numnodes(t), "maxsize", maxbuf,
	    "freepct", free_pct);
	(void) printf("\tIn-memory histogram:\n");
	dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
}
1593 
/*
 * Print one metaslab: id, offset, space map object and free space.
 * At -mmm (without -L) the metaslab is loaded under ms_lock to dump
 * in-memory stats; at -mm the on-disk space map histogram is printed
 * when the SPACEMAP_HISTOGRAM feature is active; the space map itself
 * is dumped via dump_spacemap().
 */
static void
dump_metaslab(metaslab_t *msp)
{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	space_map_t *sm = msp->ms_sm;
	char freebuf[32];

	zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf,
	    sizeof (freebuf));

	(void) printf(
	    "\tmetaslab %6llu   offset %12llx   spacemap %6llu   free    %5s\n",
	    (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
	    (u_longlong_t)space_map_object(sm), freebuf);

	if (dump_opt['m'] > 2 && !dump_opt['L']) {
		/* Load the metaslab to inspect its in-memory range trees. */
		mutex_enter(&msp->ms_lock);
		VERIFY0(metaslab_load(msp));
		range_tree_stat_verify(msp->ms_allocatable);
		dump_metaslab_stats(msp);
		metaslab_unload(msp);
		mutex_exit(&msp->ms_lock);
	}

	if (dump_opt['m'] > 1 && sm != NULL &&
	    spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
		/*
		 * The space map histogram represents free space in chunks
		 * of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
		 */
		(void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
		    (u_longlong_t)msp->ms_fragmentation);
		dump_histogram(sm->sm_phys->smp_histogram,
		    SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
	}

	/* dRAID metaslabs may be shorter than the nominal metaslab size. */
	if (vd->vdev_ops == &vdev_draid_ops)
		ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift);
	else
		ASSERT3U(msp->ms_size, ==, 1ULL << vd->vdev_ms_shift);

	dump_spacemap(spa->spa_meta_objset, msp->ms_sm);

	if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
		(void) printf("\tFlush data:\n\tunflushed txg=%llu\n\n",
		    (u_longlong_t)metaslab_unflushed_txg(msp));
	}
}
1643 
/*
 * Print the per-vdev header that precedes a metaslab listing: vdev id,
 * allocation bias (log/special/dedup, if any), the ms_unflushed_phys
 * object (if recorded in the top-level ZAP), and the column headings.
 */
static void
print_vdev_metaslab_header(vdev_t *vd)
{
	vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
	const char *bias_str = "";
	if (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) {
		bias_str = VDEV_ALLOC_BIAS_LOG;
	} else if (alloc_bias == VDEV_BIAS_SPECIAL) {
		bias_str = VDEV_ALLOC_BIAS_SPECIAL;
	} else if (alloc_bias == VDEV_BIAS_DEDUP) {
		bias_str = VDEV_ALLOC_BIAS_DEDUP;
	}

	/* The unflushed-phys entry is optional; only ENOENT is tolerated. */
	uint64_t ms_flush_data_obj = 0;
	if (vd->vdev_top_zap != 0) {
		int error = zap_lookup(spa_meta_objset(vd->vdev_spa),
		    vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
		    sizeof (uint64_t), 1, &ms_flush_data_obj);
		if (error != ENOENT) {
			ASSERT0(error);
		}
	}

	(void) printf("\tvdev %10llu   %s",
	    (u_longlong_t)vd->vdev_id, bias_str);

	if (ms_flush_data_obj != 0) {
		(void) printf("   ms_unflushed_phys object %llu",
		    (u_longlong_t)ms_flush_data_obj);
	}

	(void) printf("\n\t%-10s%5llu   %-19s   %-15s   %-12s\n",
	    "metaslabs", (u_longlong_t)vd->vdev_ms_count,
	    "offset", "spacemap", "free");
	(void) printf("\t%15s   %19s   %15s   %12s\n",
	    "---------------", "-------------------",
	    "---------------", "------------");
}
1682 
/*
 * Print per-vdev metaslab group fragmentation and histograms for the
 * normal class (and the special class too when show_special is set),
 * followed by the pool-wide class fragmentation summary.
 */
static void
dump_metaslab_groups(spa_t *spa, boolean_t show_special)
{
	vdev_t *rvd = spa->spa_root_vdev;
	metaslab_class_t *mc = spa_normal_class(spa);
	metaslab_class_t *smc = spa_special_class(spa);
	uint64_t fragmentation;

	metaslab_class_histogram_verify(mc);

	for (unsigned c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/* Skip vdevs outside the class(es) being reported. */
		if (mg == NULL || (mg->mg_class != mc &&
		    (!show_special || mg->mg_class != smc)))
			continue;

		metaslab_group_histogram_verify(mg);
		mg->mg_fragmentation = metaslab_group_fragmentation(mg);

		(void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t"
		    "fragmentation",
		    (u_longlong_t)tvd->vdev_id,
		    (u_longlong_t)tvd->vdev_ms_count);
		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
			(void) printf("%3s\n", "-");
		} else {
			(void) printf("%3llu%%\n",
			    (u_longlong_t)mg->mg_fragmentation);
		}
		dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
	}

	(void) printf("\tpool %s\tfragmentation", spa_name(spa));
	fragmentation = metaslab_class_fragmentation(mc);
	if (fragmentation == ZFS_FRAG_INVALID)
		(void) printf("\t%3s\n", "-");
	else
		(void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation);
	dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
}
1725 
/*
 * Print the indirect (removed-device) state of a vdev: the indirect
 * births object, the indirect mapping object, and — at -dddddd or
 * -mmmm — every mapping entry with its obsolete count, followed by the
 * obsolete space map (if one exists).  No-op for vdevs without an
 * indirect mapping.
 */
static void
print_vdev_indirect(vdev_t *vd)
{
	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	vdev_indirect_births_t *vib = vd->vdev_indirect_births;

	if (vim == NULL) {
		ASSERT3P(vib, ==, NULL);
		return;
	}

	/* In-memory state must agree with the on-disk config. */
	ASSERT3U(vdev_indirect_mapping_object(vim), ==,
	    vic->vic_mapping_object);
	ASSERT3U(vdev_indirect_births_object(vib), ==,
	    vic->vic_births_object);

	(void) printf("indirect births obj %llu:\n",
	    (longlong_t)vic->vic_births_object);
	(void) printf("    vib_count = %llu\n",
	    (longlong_t)vdev_indirect_births_count(vib));
	for (uint64_t i = 0; i < vdev_indirect_births_count(vib); i++) {
		vdev_indirect_birth_entry_phys_t *cur_vibe =
		    &vib->vib_entries[i];
		(void) printf("\toffset %llx -> txg %llu\n",
		    (longlong_t)cur_vibe->vibe_offset,
		    (longlong_t)cur_vibe->vibe_phys_birth_txg);
	}
	(void) printf("\n");

	(void) printf("indirect mapping obj %llu:\n",
	    (longlong_t)vic->vic_mapping_object);
	(void) printf("    vim_max_offset = 0x%llx\n",
	    (longlong_t)vdev_indirect_mapping_max_offset(vim));
	(void) printf("    vim_bytes_mapped = 0x%llx\n",
	    (longlong_t)vdev_indirect_mapping_bytes_mapped(vim));
	(void) printf("    vim_count = %llu\n",
	    (longlong_t)vdev_indirect_mapping_num_entries(vim));

	if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3)
		return;

	/* Per-entry obsolete byte counts, loaded from the MOS. */
	uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim);

	for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
		vdev_indirect_mapping_entry_phys_t *vimep =
		    &vim->vim_entries[i];
		(void) printf("\t<%llx:%llx:%llx> -> "
		    "<%llx:%llx:%llx> (%x obsolete)\n",
		    (longlong_t)vd->vdev_id,
		    (longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
		    (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
		    (longlong_t)DVA_GET_VDEV(&vimep->vimep_dst),
		    (longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst),
		    (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
		    counts[i]);
	}
	(void) printf("\n");

	uint64_t obsolete_sm_object;
	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
	if (obsolete_sm_object != 0) {
		objset_t *mos = vd->vdev_spa->spa_meta_objset;
		(void) printf("obsolete space map object %llu:\n",
		    (u_longlong_t)obsolete_sm_object);
		ASSERT(vd->vdev_obsolete_sm != NULL);
		ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==,
		    obsolete_sm_object);
		dump_spacemap(mos, vd->vdev_obsolete_sm);
		(void) printf("\n");
	}
}
1798 
/*
 * Dump metaslabs for the pool.  With -m arguments (and no -d), the
 * first argument selects a vdev and any further arguments select
 * specific metaslabs on it; otherwise every metaslab of every
 * top-level vdev is dumped.
 */
static void
dump_metaslabs(spa_t *spa)
{
	vdev_t *vd, *rvd = spa->spa_root_vdev;
	uint64_t m, c = 0, children = rvd->vdev_children;

	(void) printf("\nMetaslabs:\n");

	if (!dump_opt['d'] && zopt_metaslab_args > 0) {
		/* First argument is the vdev id. */
		c = zopt_metaslab[0];

		if (c >= children)
			(void) fatal("bad vdev id: %llu", (u_longlong_t)c);

		if (zopt_metaslab_args > 1) {
			/* Remaining arguments are metaslab numbers. */
			vd = rvd->vdev_child[c];
			print_vdev_metaslab_header(vd);

			for (m = 1; m < zopt_metaslab_args; m++) {
				if (zopt_metaslab[m] < vd->vdev_ms_count)
					dump_metaslab(
					    vd->vdev_ms[zopt_metaslab[m]]);
				else
					(void) fprintf(stderr, "bad metaslab "
					    "number %llu\n",
					    (u_longlong_t)zopt_metaslab[m]);
			}
			(void) printf("\n");
			return;
		}
		/* Only a vdev was given: restrict the loop below to it. */
		children = c + 1;
	}
	for (; c < children; c++) {
		vd = rvd->vdev_child[c];
		print_vdev_metaslab_header(vd);

		print_vdev_indirect(vd);

		for (m = 0; m < vd->vdev_ms_count; m++)
			dump_metaslab(vd->vdev_ms[m]);
		(void) printf("\n");
	}
}
1842 
1843 static void
1844 dump_log_spacemaps(spa_t *spa)
1845 {
1846 	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
1847 		return;
1848 
1849 	(void) printf("\nLog Space Maps in Pool:\n");
1850 	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
1851 	    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
1852 		space_map_t *sm = NULL;
1853 		VERIFY0(space_map_open(&sm, spa_meta_objset(spa),
1854 		    sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT));
1855 
1856 		(void) printf("Log Spacemap object %llu txg %llu\n",
1857 		    (u_longlong_t)sls->sls_sm_obj, (u_longlong_t)sls->sls_txg);
1858 		dump_spacemap(spa->spa_meta_objset, sm);
1859 		space_map_close(sm);
1860 	}
1861 	(void) printf("\n");
1862 }
1863 
1864 static void
1865 dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
1866 {
1867 	const ddt_phys_t *ddp = dde->dde_phys;
1868 	const ddt_key_t *ddk = &dde->dde_key;
1869 	const char *types[4] = { "ditto", "single", "double", "triple" };
1870 	char blkbuf[BP_SPRINTF_LEN];
1871 	blkptr_t blk;
1872 	int p;
1873 
1874 	for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
1875 		if (ddp->ddp_phys_birth == 0)
1876 			continue;
1877 		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
1878 		snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
1879 		(void) printf("index %llx refcnt %llu %s %s\n",
1880 		    (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
1881 		    types[p], blkbuf);
1882 	}
1883 }
1884 
1885 static void
1886 dump_dedup_ratio(const ddt_stat_t *dds)
1887 {
1888 	double rL, rP, rD, D, dedup, compress, copies;
1889 
1890 	if (dds->dds_blocks == 0)
1891 		return;
1892 
1893 	rL = (double)dds->dds_ref_lsize;
1894 	rP = (double)dds->dds_ref_psize;
1895 	rD = (double)dds->dds_ref_dsize;
1896 	D = (double)dds->dds_dsize;
1897 
1898 	dedup = rD / D;
1899 	compress = rL / rP;
1900 	copies = rD / rP;
1901 
1902 	(void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
1903 	    "dedup * compress / copies = %.2f\n\n",
1904 	    dedup, compress, copies, dedup * compress / copies);
1905 }
1906 
/*
 * Print a summary of one DDT object (a type/class pair): entry count
 * and per-entry on-disk/in-core footprint; then, at increasing -D
 * verbosity, its histogram and finally every entry.
 */
static void
dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
{
	char name[DDT_NAMELEN];
	ddt_entry_t dde;
	uint64_t walk = 0;
	dmu_object_info_t doi;
	uint64_t count, dspace, mspace;
	int error;

	error = ddt_object_info(ddt, type, class, &doi);

	/* ENOENT just means this type/class object was never created. */
	if (error == ENOENT)
		return;
	ASSERT(error == 0);

	error = ddt_object_count(ddt, type, class, &count);
	ASSERT(error == 0);
	if (count == 0)
		return;

	/* doi_physical_blocks_512 is in 512-byte units; convert to bytes. */
	dspace = doi.doi_physical_blocks_512 << 9;
	mspace = doi.doi_fill_count * doi.doi_data_block_size;

	ddt_object_name(ddt, type, class, name);

	/* count != 0 here, so the per-entry divisions are safe. */
	(void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
	    name,
	    (u_longlong_t)count,
	    (u_longlong_t)(dspace / count),
	    (u_longlong_t)(mspace / count));

	if (dump_opt['D'] < 3)
		return;

	zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);

	if (dump_opt['D'] < 4)
		return;

	/* Unique-class entries are only listed at the highest verbosity. */
	if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE)
		return;

	(void) printf("%s contents:\n\n", name);

	while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
		dump_dde(ddt, &dde, walk);

	/* The walk must terminate exactly with ENOENT (no more entries). */
	ASSERT3U(error, ==, ENOENT);

	(void) printf("\n");
}
1959 
1960 static void
1961 dump_all_ddts(spa_t *spa)
1962 {
1963 	ddt_histogram_t ddh_total = {{{0}}};
1964 	ddt_stat_t dds_total = {0};
1965 
1966 	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1967 		ddt_t *ddt = spa->spa_ddt[c];
1968 		if (!ddt)
1969 			continue;
1970 		for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
1971 			for (ddt_class_t class = 0; class < DDT_CLASSES;
1972 			    class++) {
1973 				dump_ddt(ddt, type, class);
1974 			}
1975 		}
1976 	}
1977 
1978 	ddt_get_dedup_stats(spa, &dds_total);
1979 
1980 	if (dds_total.dds_blocks == 0) {
1981 		(void) printf("All DDTs are empty\n");
1982 		return;
1983 	}
1984 
1985 	(void) printf("\n");
1986 
1987 	if (dump_opt['D'] > 1) {
1988 		(void) printf("DDT histogram (aggregated over all DDTs):\n");
1989 		ddt_get_dedup_histogram(spa, &ddh_total);
1990 		zpool_dump_ddt(&dds_total, &ddh_total);
1991 	}
1992 
1993 	dump_dedup_ratio(&dds_total);
1994 }
1995 
1996 static void
1997 dump_brt(spa_t *spa)
1998 {
1999 	if (!spa_feature_is_enabled(spa, SPA_FEATURE_BLOCK_CLONING)) {
2000 		printf("BRT: unsupported on this pool\n");
2001 		return;
2002 	}
2003 
2004 	if (!spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {
2005 		printf("BRT: empty\n");
2006 		return;
2007 	}
2008 
2009 	brt_t *brt = spa->spa_brt;
2010 	VERIFY(brt);
2011 
2012 	char count[32], used[32], saved[32];
2013 	zdb_nicebytes(brt_get_used(spa), used, sizeof (used));
2014 	zdb_nicebytes(brt_get_saved(spa), saved, sizeof (saved));
2015 	uint64_t ratio = brt_get_ratio(spa);
2016 	printf("BRT: used %s; saved %s; ratio %llu.%02llux\n", used, saved,
2017 	    (u_longlong_t)(ratio / 100), (u_longlong_t)(ratio % 100));
2018 
2019 	if (dump_opt['T'] < 2)
2020 		return;
2021 
2022 	for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
2023 		brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
2024 		if (brtvd == NULL)
2025 			continue;
2026 
2027 		if (!brtvd->bv_initiated) {
2028 			printf("BRT: vdev %" PRIu64 ": empty\n", vdevid);
2029 			continue;
2030 		}
2031 
2032 		zdb_nicenum(brtvd->bv_totalcount, count, sizeof (count));
2033 		zdb_nicebytes(brtvd->bv_usedspace, used, sizeof (used));
2034 		zdb_nicebytes(brtvd->bv_savedspace, saved, sizeof (saved));
2035 		printf("BRT: vdev %" PRIu64 ": refcnt %s; used %s; saved %s\n",
2036 		    vdevid, count, used, saved);
2037 	}
2038 
2039 	if (dump_opt['T'] < 3)
2040 		return;
2041 
2042 	char dva[64];
2043 	printf("\n%-16s %-10s\n", "DVA", "REFCNT");
2044 
2045 	for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
2046 		brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
2047 		if (brtvd == NULL || !brtvd->bv_initiated)
2048 			continue;
2049 
2050 		zap_cursor_t zc;
2051 		zap_attribute_t za;
2052 		for (zap_cursor_init(&zc, brt->brt_mos, brtvd->bv_mos_entries);
2053 		    zap_cursor_retrieve(&zc, &za) == 0;
2054 		    zap_cursor_advance(&zc)) {
2055 			uint64_t offset = *(uint64_t *)za.za_name;
2056 			uint64_t refcnt = za.za_first_integer;
2057 
2058 			snprintf(dva, sizeof (dva), "%" PRIu64 ":%llx", vdevid,
2059 			    (u_longlong_t)offset);
2060 			printf("%-16s %-10llu\n", dva, (u_longlong_t)refcnt);
2061 		}
2062 		zap_cursor_fini(&zc);
2063 	}
2064 }
2065 
2066 static void
2067 dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
2068 {
2069 	char *prefix = arg;
2070 
2071 	(void) printf("%s [%llu,%llu) length %llu\n",
2072 	    prefix,
2073 	    (u_longlong_t)start,
2074 	    (u_longlong_t)(start + size),
2075 	    (u_longlong_t)(size));
2076 }
2077 
/*
 * Recursively print the dirty time logs (DTLs) of a vdev and all of
 * its children, one indented section per vdev.  Each non-empty DTL
 * type (missing/partial/scrub/outage) is walked segment by segment;
 * with -dddddd a leaf vdev's backing space map is dumped too.
 */
static void
dump_dtl(vdev_t *vd, int indent)
{
	spa_t *spa = vd->vdev_spa;
	boolean_t required;
	const char *name[DTL_TYPES] = { "missing", "partial", "scrub",
		"outage" };
	char prefix[256];

	/* Bracket vdev_dtl_required() with the vdev state lock. */
	spa_vdev_state_enter(spa, SCL_NONE);
	required = vdev_dtl_required(vd);
	(void) spa_vdev_state_exit(spa, NULL, 0);

	/* Only the root of the recursion prints the section header. */
	if (indent == 0)
		(void) printf("\nDirty time logs:\n\n");

	(void) printf("\t%*s%s [%s]\n", indent, "",
	    vd->vdev_path ? vd->vdev_path :
	    vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa),
	    required ? "DTL-required" : "DTL-expendable");

	for (int t = 0; t < DTL_TYPES; t++) {
		range_tree_t *rt = vd->vdev_dtl[t];
		if (range_tree_space(rt) == 0)
			continue;
		(void) snprintf(prefix, sizeof (prefix), "\t%*s%s",
		    indent + 2, "", name[t]);
		range_tree_walk(rt, dump_dtl_seg, prefix);
		/* Only leaf vdevs (no children) have a DTL space map. */
		if (dump_opt['d'] > 5 && vd->vdev_children == 0)
			dump_spacemap(spa->spa_meta_objset,
			    vd->vdev_dtl_sm);
	}

	for (unsigned c = 0; c < vd->vdev_children; c++)
		dump_dtl(vd->vdev_child[c], indent + 4);
}
2114 
/*
 * Read the pool's on-disk history log and print each event: user
 * commands, legacy internal events, named internal events (with
 * dataset info), and ioctls (with their input/output nvlists).
 * With -hh the raw nvlist of every record is also dumped.
 */
static void
dump_history(spa_t *spa)
{
	nvlist_t **events = NULL;
	char *buf;
	uint64_t resid, len, off = 0;
	uint_t num = 0;
	int error;
	char tbuf[30];

	if ((buf = malloc(SPA_OLD_MAXBLOCKSIZE)) == NULL) {
		(void) fprintf(stderr, "%s: unable to allocate I/O buffer\n",
		    __func__);
		return;
	}

	/* Read the history in chunks, unpacking records as we go. */
	do {
		len = SPA_OLD_MAXBLOCKSIZE;

		if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
			(void) fprintf(stderr, "Unable to read history: "
			    "error %d\n", error);
			free(buf);
			return;
		}

		if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
			break;

		/* resid is the unconsumed tail; re-read it next pass. */
		off -= resid;
	} while (len != 0);

	(void) printf("\nHistory:\n");
	for (unsigned i = 0; i < num; i++) {
		boolean_t printed = B_FALSE;

		/* Format the event timestamp, if the record has one. */
		if (nvlist_exists(events[i], ZPOOL_HIST_TIME)) {
			time_t tsec;
			struct tm t;

			tsec = fnvlist_lookup_uint64(events[i],
			    ZPOOL_HIST_TIME);
			(void) localtime_r(&tsec, &t);
			(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
		} else {
			tbuf[0] = '\0';
		}

		if (nvlist_exists(events[i], ZPOOL_HIST_CMD)) {
			/* A user-issued command line. */
			(void) printf("%s %s\n", tbuf,
			    fnvlist_lookup_string(events[i], ZPOOL_HIST_CMD));
		} else if (nvlist_exists(events[i], ZPOOL_HIST_INT_EVENT)) {
			/* Legacy internal event, identified by number. */
			uint64_t ievent;

			ievent = fnvlist_lookup_uint64(events[i],
			    ZPOOL_HIST_INT_EVENT);
			if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS)
				goto next;

			(void) printf(" %s [internal %s txg:%ju] %s\n",
			    tbuf,
			    zfs_history_event_names[ievent],
			    fnvlist_lookup_uint64(events[i],
			    ZPOOL_HIST_TXG),
			    fnvlist_lookup_string(events[i],
			    ZPOOL_HIST_INT_STR));
		} else if (nvlist_exists(events[i], ZPOOL_HIST_INT_NAME)) {
			/* Named internal event, with optional dataset. */
			(void) printf("%s [txg:%ju] %s", tbuf,
			    fnvlist_lookup_uint64(events[i],
			    ZPOOL_HIST_TXG),
			    fnvlist_lookup_string(events[i],
			    ZPOOL_HIST_INT_NAME));

			if (nvlist_exists(events[i], ZPOOL_HIST_DSNAME)) {
				(void) printf(" %s (%llu)",
				    fnvlist_lookup_string(events[i],
				    ZPOOL_HIST_DSNAME),
				    (u_longlong_t)fnvlist_lookup_uint64(
				    events[i],
				    ZPOOL_HIST_DSID));
			}

			(void) printf(" %s\n", fnvlist_lookup_string(events[i],
			    ZPOOL_HIST_INT_STR));
		} else if (nvlist_exists(events[i], ZPOOL_HIST_IOCTL)) {
			/* An ioctl, plus its input/output nvlists if any. */
			(void) printf("%s ioctl %s\n", tbuf,
			    fnvlist_lookup_string(events[i],
			    ZPOOL_HIST_IOCTL));

			if (nvlist_exists(events[i], ZPOOL_HIST_INPUT_NVL)) {
				(void) printf("    input:\n");
				dump_nvlist(fnvlist_lookup_nvlist(events[i],
				    ZPOOL_HIST_INPUT_NVL), 8);
			}
			if (nvlist_exists(events[i], ZPOOL_HIST_OUTPUT_NVL)) {
				(void) printf("    output:\n");
				dump_nvlist(fnvlist_lookup_nvlist(events[i],
				    ZPOOL_HIST_OUTPUT_NVL), 8);
			}
			if (nvlist_exists(events[i], ZPOOL_HIST_ERRNO)) {
				(void) printf("    errno: %lld\n",
				    (longlong_t)fnvlist_lookup_int64(events[i],
				    ZPOOL_HIST_ERRNO));
			}
		} else {
			goto next;
		}

		printed = B_TRUE;
next:
		/* With -hh also dump the raw record, even if unrecognized. */
		if (dump_opt['h'] > 1) {
			if (!printed)
				(void) printf("unrecognized record:\n");
			dump_nvlist(events[i], 2);
		}
	}
	free(buf);
}
2233 
2234 static void
2235 dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
2236 {
2237 	(void) os, (void) object, (void) data, (void) size;
2238 }
2239 
2240 static uint64_t
2241 blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp,
2242     const zbookmark_phys_t *zb)
2243 {
2244 	if (dnp == NULL) {
2245 		ASSERT(zb->zb_level < 0);
2246 		if (zb->zb_object == 0)
2247 			return (zb->zb_blkid);
2248 		return (zb->zb_blkid * BP_GET_LSIZE(bp));
2249 	}
2250 
2251 	ASSERT(zb->zb_level >= 0);
2252 
2253 	return ((zb->zb_blkid <<
2254 	    (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
2255 	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
2256 }
2257 
/*
 * Append a ZSTD compression header summary (compressed size, format
 * version, level) to blkbuf for a ZSTD-compressed block pointer.
 * Embedded bps are decoded in place; otherwise the raw compressed
 * block is read from disk (decrypted but NOT decompressed) so the
 * header can be examined.
 */
static void
snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen,
    const blkptr_t *bp)
{
	/* Read buffer, allocated once and deliberately reused/never freed. */
	static abd_t *pabd = NULL;
	void *buf;
	zio_t *zio;
	zfs_zstdhdr_t zstd_hdr;
	int error;

	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_ZSTD)
		return;

	if (BP_IS_HOLE(bp))
		return;

	if (BP_IS_EMBEDDED(bp)) {
		/* Payload lives in the bp itself; decode it to get the hdr. */
		buf = malloc(SPA_MAXBLOCKSIZE);
		if (buf == NULL) {
			(void) fprintf(stderr, "out of memory\n");
			exit(1);
		}
		decode_embedded_bp_compressed(bp, buf);
		memcpy(&zstd_hdr, buf, sizeof (zstd_hdr));
		free(buf);
		/* Header fields are stored big-endian on disk. */
		zstd_hdr.c_len = BE_32(zstd_hdr.c_len);
		zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level);
		(void) snprintf(blkbuf + strlen(blkbuf),
		    buflen - strlen(blkbuf),
		    " ZSTD:size=%u:version=%u:level=%u:EMBEDDED",
		    zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr),
		    zfs_get_hdrlevel(&zstd_hdr));
		return;
	}

	if (!pabd)
		pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE);
	zio = zio_root(spa, NULL, NULL, 0);

	/* Decrypt but don't decompress so we can read the compression header */
	zio_nowait(zio_read(zio, spa, bp, pabd, BP_GET_PSIZE(bp), NULL, NULL,
	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW_COMPRESS,
	    NULL));
	error = zio_wait(zio);
	if (error) {
		(void) fprintf(stderr, "read failed: %d\n", error);
		return;
	}
	buf = abd_borrow_buf_copy(pabd, BP_GET_LSIZE(bp));
	memcpy(&zstd_hdr, buf, sizeof (zstd_hdr));
	zstd_hdr.c_len = BE_32(zstd_hdr.c_len);
	zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level);

	(void) snprintf(blkbuf + strlen(blkbuf),
	    buflen - strlen(blkbuf),
	    " ZSTD:size=%u:version=%u:level=%u:NORMAL",
	    zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr),
	    zfs_get_hdrlevel(&zstd_hdr));

	abd_return_buf_copy(pabd, buf, BP_GET_LSIZE(bp));
}
2319 
2320 static void
2321 snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,
2322     boolean_t bp_freed)
2323 {
2324 	const dva_t *dva = bp->blk_dva;
2325 	int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
2326 	int i;
2327 
2328 	if (dump_opt['b'] >= 6) {
2329 		snprintf_blkptr(blkbuf, buflen, bp);
2330 		if (bp_freed) {
2331 			(void) snprintf(blkbuf + strlen(blkbuf),
2332 			    buflen - strlen(blkbuf), " %s", "FREE");
2333 		}
2334 		return;
2335 	}
2336 
2337 	if (BP_IS_EMBEDDED(bp)) {
2338 		(void) sprintf(blkbuf,
2339 		    "EMBEDDED et=%u %llxL/%llxP B=%llu",
2340 		    (int)BPE_GET_ETYPE(bp),
2341 		    (u_longlong_t)BPE_GET_LSIZE(bp),
2342 		    (u_longlong_t)BPE_GET_PSIZE(bp),
2343 		    (u_longlong_t)bp->blk_birth);
2344 		return;
2345 	}
2346 
2347 	blkbuf[0] = '\0';
2348 
2349 	for (i = 0; i < ndvas; i++)
2350 		(void) snprintf(blkbuf + strlen(blkbuf),
2351 		    buflen - strlen(blkbuf), "%llu:%llx:%llx ",
2352 		    (u_longlong_t)DVA_GET_VDEV(&dva[i]),
2353 		    (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
2354 		    (u_longlong_t)DVA_GET_ASIZE(&dva[i]));
2355 
2356 	if (BP_IS_HOLE(bp)) {
2357 		(void) snprintf(blkbuf + strlen(blkbuf),
2358 		    buflen - strlen(blkbuf),
2359 		    "%llxL B=%llu",
2360 		    (u_longlong_t)BP_GET_LSIZE(bp),
2361 		    (u_longlong_t)bp->blk_birth);
2362 	} else {
2363 		(void) snprintf(blkbuf + strlen(blkbuf),
2364 		    buflen - strlen(blkbuf),
2365 		    "%llxL/%llxP F=%llu B=%llu/%llu",
2366 		    (u_longlong_t)BP_GET_LSIZE(bp),
2367 		    (u_longlong_t)BP_GET_PSIZE(bp),
2368 		    (u_longlong_t)BP_GET_FILL(bp),
2369 		    (u_longlong_t)bp->blk_birth,
2370 		    (u_longlong_t)BP_PHYSICAL_BIRTH(bp));
2371 		if (bp_freed)
2372 			(void) snprintf(blkbuf + strlen(blkbuf),
2373 			    buflen - strlen(blkbuf), " %s", "FREE");
2374 		(void) snprintf(blkbuf + strlen(blkbuf),
2375 		    buflen - strlen(blkbuf),
2376 		    " cksum=%016llx:%016llx:%016llx:%016llx",
2377 		    (u_longlong_t)bp->blk_cksum.zc_word[0],
2378 		    (u_longlong_t)bp->blk_cksum.zc_word[1],
2379 		    (u_longlong_t)bp->blk_cksum.zc_word[2],
2380 		    (u_longlong_t)bp->blk_cksum.zc_word[3]);
2381 	}
2382 }
2383 
2384 static void
2385 print_indirect(spa_t *spa, blkptr_t *bp, const zbookmark_phys_t *zb,
2386     const dnode_phys_t *dnp)
2387 {
2388 	char blkbuf[BP_SPRINTF_LEN];
2389 	int l;
2390 
2391 	if (!BP_IS_EMBEDDED(bp)) {
2392 		ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
2393 		ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
2394 	}
2395 
2396 	(void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));
2397 
2398 	ASSERT(zb->zb_level >= 0);
2399 
2400 	for (l = dnp->dn_nlevels - 1; l >= -1; l--) {
2401 		if (l == zb->zb_level) {
2402 			(void) printf("L%llx", (u_longlong_t)zb->zb_level);
2403 		} else {
2404 			(void) printf(" ");
2405 		}
2406 	}
2407 
2408 	snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, B_FALSE);
2409 	if (dump_opt['Z'] && BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD)
2410 		snprintf_zstd_header(spa, blkbuf, sizeof (blkbuf), bp);
2411 	(void) printf("%s\n", blkbuf);
2412 }
2413 
/*
 * Recursively visit (and print) a block pointer and, for indirect
 * blocks, every block pointer beneath it.  Returns 0 or the first
 * error hit while reading an indirect block.
 */
static int
visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
    blkptr_t *bp, const zbookmark_phys_t *zb)
{
	int err = 0;

	/* A zero birth txg denotes an unallocated bp: nothing to do. */
	if (bp->blk_birth == 0)
		return (0);

	print_indirect(spa, bp, zb, dnp);

	if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) {
		arc_flags_t flags = ARC_FLAG_WAIT;
		int i;
		blkptr_t *cbp;
		/* Number of block pointers held by this indirect block. */
		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
		arc_buf_t *buf;
		uint64_t fill = 0;
		ASSERT(!BP_IS_REDACTED(bp));

		err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err)
			return (err);
		ASSERT(buf->b_data);

		/* recursively visit blocks below this */
		cbp = buf->b_data;
		for (i = 0; i < epb; i++, cbp++) {
			zbookmark_phys_t czb;

			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			err = visit_indirect(spa, dnp, cbp, &czb);
			if (err)
				break;
			fill += BP_GET_FILL(cbp);
		}
		/* Children's fill counts must sum to the parent's. */
		if (!err)
			ASSERT3U(fill, ==, BP_GET_FILL(bp));
		arc_buf_destroy(buf, &buf);
	}

	return (err);
}
2460 
2461 static void
2462 dump_indirect(dnode_t *dn)
2463 {
2464 	dnode_phys_t *dnp = dn->dn_phys;
2465 	zbookmark_phys_t czb;
2466 
2467 	(void) printf("Indirect blocks:\n");
2468 
2469 	SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset),
2470 	    dn->dn_object, dnp->dn_nlevels - 1, 0);
2471 	for (int j = 0; j < dnp->dn_nblkptr; j++) {
2472 		czb.zb_blkid = j;
2473 		(void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp,
2474 		    &dnp->dn_blkptr[j], &czb);
2475 	}
2476 
2477 	(void) printf("\n");
2478 }
2479 
/*
 * Object-dump callback for DSL directory objects: print every field of
 * the dsl_dir_phys_t bonus buffer in human-readable form.
 */
static void
dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) os, (void) object;
	dsl_dir_phys_t *dd = data;
	time_t crtime;
	char nice[32];

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (nice) >= NN_NUMBUF_SZ, "nice truncated");

	if (dd == NULL)
		return;

	ASSERT3U(size, >=, sizeof (dsl_dir_phys_t));

	crtime = dd->dd_creation_time;
	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
	(void) printf("\t\thead_dataset_obj = %llu\n",
	    (u_longlong_t)dd->dd_head_dataset_obj);
	(void) printf("\t\tparent_dir_obj = %llu\n",
	    (u_longlong_t)dd->dd_parent_obj);
	(void) printf("\t\torigin_obj = %llu\n",
	    (u_longlong_t)dd->dd_origin_obj);
	(void) printf("\t\tchild_dir_zapobj = %llu\n",
	    (u_longlong_t)dd->dd_child_dir_zapobj);
	zdb_nicenum(dd->dd_used_bytes, nice, sizeof (nice));
	(void) printf("\t\tused_bytes = %s\n", nice);
	zdb_nicenum(dd->dd_compressed_bytes, nice, sizeof (nice));
	(void) printf("\t\tcompressed_bytes = %s\n", nice);
	zdb_nicenum(dd->dd_uncompressed_bytes, nice, sizeof (nice));
	(void) printf("\t\tuncompressed_bytes = %s\n", nice);
	zdb_nicenum(dd->dd_quota, nice, sizeof (nice));
	(void) printf("\t\tquota = %s\n", nice);
	zdb_nicenum(dd->dd_reserved, nice, sizeof (nice));
	(void) printf("\t\treserved = %s\n", nice);
	(void) printf("\t\tprops_zapobj = %llu\n",
	    (u_longlong_t)dd->dd_props_zapobj);
	(void) printf("\t\tdeleg_zapobj = %llu\n",
	    (u_longlong_t)dd->dd_deleg_zapobj);
	(void) printf("\t\tflags = %llx\n",
	    (u_longlong_t)dd->dd_flags);

	/* Print one used_breakdown line per DD_USED_* bucket. */
#define	DO(which) \
	zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, \
	    sizeof (nice)); \
	(void) printf("\t\tused_breakdown[" #which "] = %s\n", nice)
	DO(HEAD);
	DO(SNAP);
	DO(CHILD);
	DO(CHILD_RSRV);
	DO(REFRSRV);
#undef DO
	(void) printf("\t\tclones = %llu\n",
	    (u_longlong_t)dd->dd_clones);
}
2536 
/*
 * Object-dump callback for DSL dataset objects: print every field of
 * the dsl_dataset_phys_t bonus buffer, including its root block
 * pointer, in human-readable form.
 */
static void
dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) os, (void) object;
	dsl_dataset_phys_t *ds = data;
	time_t crtime;
	char used[32], compressed[32], uncompressed[32], unique[32];
	char blkbuf[BP_SPRINTF_LEN];

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (used) >= NN_NUMBUF_SZ, "used truncated");
	_Static_assert(sizeof (compressed) >= NN_NUMBUF_SZ,
	    "compressed truncated");
	_Static_assert(sizeof (uncompressed) >= NN_NUMBUF_SZ,
	    "uncompressed truncated");
	_Static_assert(sizeof (unique) >= NN_NUMBUF_SZ, "unique truncated");

	if (ds == NULL)
		return;

	ASSERT(size == sizeof (*ds));
	crtime = ds->ds_creation_time;
	/* Pre-format the byte counters and the root bp. */
	zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used));
	zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed));
	zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed,
	    sizeof (uncompressed));
	zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique));
	snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp);

	(void) printf("\t\tdir_obj = %llu\n",
	    (u_longlong_t)ds->ds_dir_obj);
	(void) printf("\t\tprev_snap_obj = %llu\n",
	    (u_longlong_t)ds->ds_prev_snap_obj);
	(void) printf("\t\tprev_snap_txg = %llu\n",
	    (u_longlong_t)ds->ds_prev_snap_txg);
	(void) printf("\t\tnext_snap_obj = %llu\n",
	    (u_longlong_t)ds->ds_next_snap_obj);
	(void) printf("\t\tsnapnames_zapobj = %llu\n",
	    (u_longlong_t)ds->ds_snapnames_zapobj);
	(void) printf("\t\tnum_children = %llu\n",
	    (u_longlong_t)ds->ds_num_children);
	(void) printf("\t\tuserrefs_obj = %llu\n",
	    (u_longlong_t)ds->ds_userrefs_obj);
	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
	(void) printf("\t\tcreation_txg = %llu\n",
	    (u_longlong_t)ds->ds_creation_txg);
	(void) printf("\t\tdeadlist_obj = %llu\n",
	    (u_longlong_t)ds->ds_deadlist_obj);
	(void) printf("\t\tused_bytes = %s\n", used);
	(void) printf("\t\tcompressed_bytes = %s\n", compressed);
	(void) printf("\t\tuncompressed_bytes = %s\n", uncompressed);
	(void) printf("\t\tunique = %s\n", unique);
	(void) printf("\t\tfsid_guid = %llu\n",
	    (u_longlong_t)ds->ds_fsid_guid);
	(void) printf("\t\tguid = %llu\n",
	    (u_longlong_t)ds->ds_guid);
	(void) printf("\t\tflags = %llx\n",
	    (u_longlong_t)ds->ds_flags);
	(void) printf("\t\tnext_clones_obj = %llu\n",
	    (u_longlong_t)ds->ds_next_clones_obj);
	(void) printf("\t\tprops_obj = %llu\n",
	    (u_longlong_t)ds->ds_props_obj);
	(void) printf("\t\tbp = %s\n", blkbuf);
}
2601 
2602 static int
2603 dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
2604 {
2605 	(void) arg, (void) tx;
2606 	char blkbuf[BP_SPRINTF_LEN];
2607 
2608 	if (bp->blk_birth != 0) {
2609 		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
2610 		(void) printf("\t%s\n", blkbuf);
2611 	}
2612 	return (0);
2613 }
2614 
2615 static void
2616 dump_bptree(objset_t *os, uint64_t obj, const char *name)
2617 {
2618 	char bytes[32];
2619 	bptree_phys_t *bt;
2620 	dmu_buf_t *db;
2621 
2622 	/* make sure nicenum has enough space */
2623 	_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");
2624 
2625 	if (dump_opt['d'] < 3)
2626 		return;
2627 
2628 	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
2629 	bt = db->db_data;
2630 	zdb_nicenum(bt->bt_bytes, bytes, sizeof (bytes));
2631 	(void) printf("\n    %s: %llu datasets, %s\n",
2632 	    name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes);
2633 	dmu_buf_rele(db, FTAG);
2634 
2635 	if (dump_opt['d'] < 5)
2636 		return;
2637 
2638 	(void) printf("\n");
2639 
2640 	(void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL);
2641 }
2642 
2643 static int
2644 dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
2645 {
2646 	(void) arg, (void) tx;
2647 	char blkbuf[BP_SPRINTF_LEN];
2648 
2649 	ASSERT(bp->blk_birth != 0);
2650 	snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed);
2651 	(void) printf("\t%s\n", blkbuf);
2652 	return (0);
2653 }
2654 
/*
 * Print a summary line for a bpobj -- object number, blkptr count,
 * freed count (if the bpobj tracks frees), subobject info, and byte/
 * compression totals -- then recurse into any sub-bpobjs.  At -ddddd
 * the top-level bpobj's block pointers are listed as well.
 */
static void
dump_full_bpobj(bpobj_t *bpo, const char *name, int indent)
{
	char bytes[32];
	char comp[32];
	char uncomp[32];
	uint64_t i;

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");
	_Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated");
	_Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated");

	if (dump_opt['d'] < 3)
		return;

	zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes));
	if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
		/* This bpobj has sub-bpobjs: print totals, then recurse. */
		zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp));
		zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp));
		if (bpo->bpo_havefreed) {
			(void) printf("    %*s: object %llu, %llu local "
			    "blkptrs, %llu freed, %llu subobjs in object %llu, "
			    "%s (%s/%s comp)\n",
			    indent * 8, name,
			    (u_longlong_t)bpo->bpo_object,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_freed,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
			    (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
			    bytes, comp, uncomp);
		} else {
			(void) printf("    %*s: object %llu, %llu local "
			    "blkptrs, %llu subobjs in object %llu, "
			    "%s (%s/%s comp)\n",
			    indent * 8, name,
			    (u_longlong_t)bpo->bpo_object,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
			    (u_longlong_t)bpo->bpo_phys->bpo_subobjs,
			    bytes, comp, uncomp);
		}

		/* Sub-bpobj object numbers are stored as a flat uint64 array. */
		for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
			uint64_t subobj;
			bpobj_t subbpo;
			int error;
			VERIFY0(dmu_read(bpo->bpo_os,
			    bpo->bpo_phys->bpo_subobjs,
			    i * sizeof (subobj), sizeof (subobj), &subobj, 0));
			error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
			if (error != 0) {
				/* Report and keep going with the next subobj. */
				(void) printf("ERROR %u while trying to open "
				    "subobj id %llu\n",
				    error, (u_longlong_t)subobj);
				continue;
			}
			dump_full_bpobj(&subbpo, "subobj", indent + 1);
			bpobj_close(&subbpo);
		}
	} else {
		if (bpo->bpo_havefreed) {
			(void) printf("    %*s: object %llu, %llu blkptrs, "
			    "%llu freed, %s\n",
			    indent * 8, name,
			    (u_longlong_t)bpo->bpo_object,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_freed,
			    bytes);
		} else {
			(void) printf("    %*s: object %llu, %llu blkptrs, "
			    "%s\n",
			    indent * 8, name,
			    (u_longlong_t)bpo->bpo_object,
			    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
			    bytes);
		}
	}

	if (dump_opt['d'] < 5)
		return;


	/* Only the top-level call lists the individual block pointers. */
	if (indent == 0) {
		(void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
		(void) printf("\n");
	}
}
2743 
/*
 * Look up and print one bookmark ("pool/ds#name").  With print_redact,
 * also print its redaction state and snapshot list; with print_list,
 * additionally dump every entry of the redaction list.  Returns 0 on
 * success or the lookup/read error.
 */
static int
dump_bookmark(dsl_pool_t *dp, char *name, boolean_t print_redact,
    boolean_t print_list)
{
	int err = 0;
	zfs_bookmark_phys_t prop;
	objset_t *mos = dp->dp_spa->spa_meta_objset;
	err = dsl_bookmark_lookup(dp, name, NULL, &prop);

	if (err != 0) {
		return (err);
	}

	/* Print only the "#bookmark" part of the full name. */
	(void) printf("\t#%s: ", strchr(name, '#') + 1);
	(void) printf("{guid: %llx creation_txg: %llu creation_time: "
	    "%llu redaction_obj: %llu}\n", (u_longlong_t)prop.zbm_guid,
	    (u_longlong_t)prop.zbm_creation_txg,
	    (u_longlong_t)prop.zbm_creation_time,
	    (u_longlong_t)prop.zbm_redaction_obj);

	IMPLY(print_list, print_redact);
	if (!print_redact || prop.zbm_redaction_obj == 0)
		return (0);

	redaction_list_t *rl;
	VERIFY0(dsl_redaction_list_hold_obj(dp,
	    prop.zbm_redaction_obj, FTAG, &rl));

	redaction_list_phys_t *rlp = rl->rl_phys;
	(void) printf("\tRedacted:\n\t\tProgress: ");
	/* UINT64_MAX in both fields marks a completed redaction. */
	if (rlp->rlp_last_object != UINT64_MAX ||
	    rlp->rlp_last_blkid != UINT64_MAX) {
		(void) printf("%llu %llu (incomplete)\n",
		    (u_longlong_t)rlp->rlp_last_object,
		    (u_longlong_t)rlp->rlp_last_blkid);
	} else {
		(void) printf("complete\n");
	}
	(void) printf("\t\tSnapshots: [");
	for (unsigned int i = 0; i < rlp->rlp_num_snaps; i++) {
		if (i > 0)
			(void) printf(", ");
		(void) printf("%0llu",
		    (u_longlong_t)rlp->rlp_snaps[i]);
	}
	(void) printf("]\n\t\tLength: %llu\n",
	    (u_longlong_t)rlp->rlp_num_entries);

	if (!print_list) {
		dsl_redaction_list_rele(rl, FTAG);
		return (0);
	}

	if (rlp->rlp_num_entries == 0) {
		dsl_redaction_list_rele(rl, FTAG);
		(void) printf("\t\tRedaction List: []\n\n");
		return (0);
	}

	/* Read the whole redaction-list object into memory and print it. */
	redact_block_phys_t *rbp_buf;
	uint64_t size;
	dmu_object_info_t doi;

	VERIFY0(dmu_object_info(mos, prop.zbm_redaction_obj, &doi));
	size = doi.doi_max_offset;
	rbp_buf = kmem_alloc(size, KM_SLEEP);

	err = dmu_read(mos, prop.zbm_redaction_obj, 0, size,
	    rbp_buf, 0);
	if (err != 0) {
		dsl_redaction_list_rele(rl, FTAG);
		kmem_free(rbp_buf, size);
		return (err);
	}

	(void) printf("\t\tRedaction List: [{object: %llx, offset: "
	    "%llx, blksz: %x, count: %llx}",
	    (u_longlong_t)rbp_buf[0].rbp_object,
	    (u_longlong_t)rbp_buf[0].rbp_blkid,
	    (uint_t)(redact_block_get_size(&rbp_buf[0])),
	    (u_longlong_t)redact_block_get_count(&rbp_buf[0]));

	for (size_t i = 1; i < rlp->rlp_num_entries; i++) {
		(void) printf(",\n\t\t{object: %llx, offset: %llx, "
		    "blksz: %x, count: %llx}",
		    (u_longlong_t)rbp_buf[i].rbp_object,
		    (u_longlong_t)rbp_buf[i].rbp_blkid,
		    (uint_t)(redact_block_get_size(&rbp_buf[i])),
		    (u_longlong_t)redact_block_get_count(&rbp_buf[i]));
	}
	dsl_redaction_list_rele(rl, FTAG);
	kmem_free(rbp_buf, size);
	(void) printf("]\n\n");
	return (0);
}
2839 
2840 static void
2841 dump_bookmarks(objset_t *os, int verbosity)
2842 {
2843 	zap_cursor_t zc;
2844 	zap_attribute_t attr;
2845 	dsl_dataset_t *ds = dmu_objset_ds(os);
2846 	dsl_pool_t *dp = spa_get_dsl(os->os_spa);
2847 	objset_t *mos = os->os_spa->spa_meta_objset;
2848 	if (verbosity < 4)
2849 		return;
2850 	dsl_pool_config_enter(dp, FTAG);
2851 
2852 	for (zap_cursor_init(&zc, mos, ds->ds_bookmarks_obj);
2853 	    zap_cursor_retrieve(&zc, &attr) == 0;
2854 	    zap_cursor_advance(&zc)) {
2855 		char osname[ZFS_MAX_DATASET_NAME_LEN];
2856 		char buf[ZFS_MAX_DATASET_NAME_LEN];
2857 		int len;
2858 		dmu_objset_name(os, osname);
2859 		len = snprintf(buf, sizeof (buf), "%s#%s", osname,
2860 		    attr.za_name);
2861 		VERIFY3S(len, <, ZFS_MAX_DATASET_NAME_LEN);
2862 		(void) dump_bookmark(dp, buf, verbosity >= 5, verbosity >= 6);
2863 	}
2864 	zap_cursor_fini(&zc);
2865 	dsl_pool_config_exit(dp, FTAG);
2866 }
2867 
2868 static void
2869 bpobj_count_refd(bpobj_t *bpo)
2870 {
2871 	mos_obj_refd(bpo->bpo_object);
2872 
2873 	if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
2874 		mos_obj_refd(bpo->bpo_phys->bpo_subobjs);
2875 		for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
2876 			uint64_t subobj;
2877 			bpobj_t subbpo;
2878 			int error;
2879 			VERIFY0(dmu_read(bpo->bpo_os,
2880 			    bpo->bpo_phys->bpo_subobjs,
2881 			    i * sizeof (subobj), sizeof (subobj), &subobj, 0));
2882 			error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
2883 			if (error != 0) {
2884 				(void) printf("ERROR %u while trying to open "
2885 				    "subobj id %llu\n",
2886 				    error, (u_longlong_t)subobj);
2887 				continue;
2888 			}
2889 			bpobj_count_refd(&subbpo);
2890 			bpobj_close(&subbpo);
2891 		}
2892 	}
2893 }
2894 
2895 static int
2896 dsl_deadlist_entry_count_refd(void *arg, dsl_deadlist_entry_t *dle)
2897 {
2898 	spa_t *spa = arg;
2899 	uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;
2900 	if (dle->dle_bpobj.bpo_object != empty_bpobj)
2901 		bpobj_count_refd(&dle->dle_bpobj);
2902 	return (0);
2903 }
2904 
2905 static int
2906 dsl_deadlist_entry_dump(void *arg, dsl_deadlist_entry_t *dle)
2907 {
2908 	ASSERT(arg == NULL);
2909 	if (dump_opt['d'] >= 5) {
2910 		char buf[128];
2911 		(void) snprintf(buf, sizeof (buf),
2912 		    "mintxg %llu -> obj %llu",
2913 		    (longlong_t)dle->dle_mintxg,
2914 		    (longlong_t)dle->dle_bpobj.bpo_object);
2915 
2916 		dump_full_bpobj(&dle->dle_bpobj, buf, 0);
2917 	} else {
2918 		(void) printf("mintxg %llu -> obj %llu\n",
2919 		    (longlong_t)dle->dle_mintxg,
2920 		    (longlong_t)dle->dle_bpobj.bpo_object);
2921 	}
2922 	return (0);
2923 }
2924 
/*
 * Print a deadlist/livelist ("name") and record the MOS objects it
 * references.  The reference counting runs unconditionally, before any
 * verbosity checks, so MOS leak accounting stays correct even when
 * nothing is printed.
 */
static void
dump_blkptr_list(dsl_deadlist_t *dl, const char *name)
{
	char bytes[32];
	char comp[32];
	char uncomp[32];
	char entries[32];
	spa_t *spa = dmu_objset_spa(dl->dl_os);
	uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj;

	if (dl->dl_oldfmt) {
		/* Old format: a single bpobj, no per-mintxg entries. */
		if (dl->dl_bpobj.bpo_object != empty_bpobj)
			bpobj_count_refd(&dl->dl_bpobj);
	} else {
		mos_obj_refd(dl->dl_object);
		dsl_deadlist_iterate(dl, dsl_deadlist_entry_count_refd, spa);
	}

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated");
	_Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated");
	_Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated");
	_Static_assert(sizeof (entries) >= NN_NUMBUF_SZ, "entries truncated");

	/* Only print at dataset verbosity >= 3. */
	if (dump_opt['d'] < 3)
		return;

	if (dl->dl_oldfmt) {
		dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0);
		return;
	}

	zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes));
	zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp));
	zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp));
	zdb_nicenum(avl_numnodes(&dl->dl_tree), entries, sizeof (entries));
	(void) printf("\n    %s: %s (%s/%s comp), %s entries\n",
	    name, bytes, comp, uncomp, entries);

	/* Per-entry detail only at verbosity >= 4. */
	if (dump_opt['d'] < 4)
		return;

	(void) putchar('\n');

	dsl_deadlist_iterate(dl, dsl_deadlist_entry_dump, NULL);
}
2971 
/*
 * Cross-check a dsl_dir's livelist space accounting against the space
 * actually written since the origin snapshot, and scan the livelist for
 * duplicate block pointers.  Returns 0 if the accounting matches,
 * 1 on a discrepancy (details are printed).  Assumes the dir is a
 * clone: a livelist implies an origin dataset exists.
 */
static int
verify_dd_livelist(objset_t *os)
{
	uint64_t ll_used, used, ll_comp, comp, ll_uncomp, uncomp;
	dsl_pool_t *dp = spa_get_dsl(os->os_spa);
	dsl_dir_t  *dd = os->os_dsl_dataset->ds_dir;

	ASSERT(!dmu_objset_is_snapshot(os));
	if (!dsl_deadlist_is_open(&dd->dd_livelist))
		return (0);

	/* Iterate through the livelist to check for duplicates */
	dsl_deadlist_iterate(&dd->dd_livelist, sublivelist_verify_lightweight,
	    NULL);

	dsl_pool_config_enter(dp, FTAG);
	dsl_deadlist_space(&dd->dd_livelist, &ll_used,
	    &ll_comp, &ll_uncomp);

	/* Space written since the origin, per regular accounting. */
	dsl_dataset_t *origin_ds;
	ASSERT(dsl_pool_config_held(dp));
	VERIFY0(dsl_dataset_hold_obj(dp,
	    dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_ds));
	VERIFY0(dsl_dataset_space_written(origin_ds, os->os_dsl_dataset,
	    &used, &comp, &uncomp));
	dsl_dataset_rele(origin_ds, FTAG);
	dsl_pool_config_exit(dp, FTAG);
	/*
	 *  It's possible that the dataset's uncomp space is larger than the
	 *  livelist's because livelists do not track embedded block pointers
	 */
	if (used != ll_used || comp != ll_comp || uncomp < ll_uncomp) {
		char nice_used[32], nice_comp[32], nice_uncomp[32];
		(void) printf("Discrepancy in space accounting:\n");
		zdb_nicenum(used, nice_used, sizeof (nice_used));
		zdb_nicenum(comp, nice_comp, sizeof (nice_comp));
		zdb_nicenum(uncomp, nice_uncomp, sizeof (nice_uncomp));
		(void) printf("dir: used %s, comp %s, uncomp %s\n",
		    nice_used, nice_comp, nice_uncomp);
		zdb_nicenum(ll_used, nice_used, sizeof (nice_used));
		zdb_nicenum(ll_comp, nice_comp, sizeof (nice_comp));
		zdb_nicenum(ll_uncomp, nice_uncomp, sizeof (nice_uncomp));
		(void) printf("livelist: used %s, comp %s, uncomp %s\n",
		    nice_used, nice_comp, nice_uncomp);
		return (1);
	}
	return (0);
}
3020 
/* User-supplied raw key material consumed by zdb_derive_key() (set elsewhere) */
static char *key_material = NULL;
3022 
3023 static boolean_t
3024 zdb_derive_key(dsl_dir_t *dd, uint8_t *key_out)
3025 {
3026 	uint64_t keyformat, salt, iters;
3027 	int i;
3028 	unsigned char c;
3029 
3030 	VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
3031 	    zfs_prop_to_name(ZFS_PROP_KEYFORMAT), sizeof (uint64_t),
3032 	    1, &keyformat));
3033 
3034 	switch (keyformat) {
3035 	case ZFS_KEYFORMAT_HEX:
3036 		for (i = 0; i < WRAPPING_KEY_LEN * 2; i += 2) {
3037 			if (!isxdigit(key_material[i]) ||
3038 			    !isxdigit(key_material[i+1]))
3039 				return (B_FALSE);
3040 			if (sscanf(&key_material[i], "%02hhx", &c) != 1)
3041 				return (B_FALSE);
3042 			key_out[i / 2] = c;
3043 		}
3044 		break;
3045 
3046 	case ZFS_KEYFORMAT_PASSPHRASE:
3047 		VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset,
3048 		    dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT),
3049 		    sizeof (uint64_t), 1, &salt));
3050 		VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset,
3051 		    dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS),
3052 		    sizeof (uint64_t), 1, &iters));
3053 
3054 		if (PKCS5_PBKDF2_HMAC_SHA1(key_material, strlen(key_material),
3055 		    ((uint8_t *)&salt), sizeof (uint64_t), iters,
3056 		    WRAPPING_KEY_LEN, key_out) != 1)
3057 			return (B_FALSE);
3058 
3059 		break;
3060 
3061 	default:
3062 		fatal("no support for key format %u\n",
3063 		    (unsigned int) keyformat);
3064 	}
3065 
3066 	return (B_TRUE);
3067 }
3068 
/* Name of the encryption root whose wrapping key zdb_load_key() loaded */
static char encroot[ZFS_MAX_DATASET_NAME_LEN];
/* B_TRUE while a wrapping key is loaded in the keystore */
static boolean_t key_loaded = B_FALSE;
3071 
/*
 * Derive the wrapping key for the given objset and load it into the
 * keystore under its encryption root.  Fatal on any failure.  On
 * success, records the root name in encroot and sets key_loaded.
 */
static void
zdb_load_key(objset_t *os)
{
	dsl_pool_t *dp;
	dsl_dir_t *dd, *rdd;
	uint8_t key[WRAPPING_KEY_LEN];
	uint64_t rddobj;
	int err;

	dp = spa_get_dsl(os->os_spa);
	dd = os->os_dsl_dataset->ds_dir;

	/* Look up the encryption root's name via the crypto key object. */
	dsl_pool_config_enter(dp, FTAG);
	VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
	    DSL_CRYPTO_KEY_ROOT_DDOBJ, sizeof (uint64_t), 1, &rddobj));
	VERIFY0(dsl_dir_hold_obj(dd->dd_pool, rddobj, NULL, FTAG, &rdd));
	dsl_dir_name(rdd, encroot);
	dsl_dir_rele(rdd, FTAG);

	if (!zdb_derive_key(dd, key))
		fatal("couldn't derive encryption key");

	dsl_pool_config_exit(dp, FTAG);

	ASSERT3U(dsl_dataset_get_keystatus(dd), ==, ZFS_KEYSTATUS_UNAVAILABLE);

	/* Hand the derived wrapping key to the keystore. */
	dsl_crypto_params_t *dcp;
	nvlist_t *crypto_args;

	crypto_args = fnvlist_alloc();
	fnvlist_add_uint8_array(crypto_args, "wkeydata",
	    (uint8_t *)key, WRAPPING_KEY_LEN);
	VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE,
	    NULL, crypto_args, &dcp));
	err = spa_keystore_load_wkey(encroot, dcp, B_FALSE);

	dsl_crypto_params_free(dcp, (err != 0));
	fnvlist_free(crypto_args);

	if (err != 0)
		fatal(
		    "couldn't load encryption key for %s: %s",
		    encroot, err == ZFS_ERR_CRYPTO_NOTSUP ?
		    "crypto params not supported" : strerror(err));

	ASSERT3U(dsl_dataset_get_keystatus(dd), ==, ZFS_KEYSTATUS_AVAILABLE);

	printf("Unlocked encryption root: %s\n", encroot);
	key_loaded = B_TRUE;
}
3122 
3123 static void
3124 zdb_unload_key(void)
3125 {
3126 	if (!key_loaded)
3127 		return;
3128 
3129 	VERIFY0(spa_keystore_unload_wkey(encroot));
3130 	key_loaded = B_FALSE;
3131 }
3132 
/* FUID index/domain tables, lazily loaded by dump_uidgid() */
static avl_tree_t idx_tree;
static avl_tree_t domain_tree;
static boolean_t fuid_table_loaded;
/* The single objset currently set up for SA dumping (see open_objset()) */
static objset_t *sa_os = NULL;
/* SA attribute registration table for sa_os */
static sa_attr_type_t *sa_attr_table = NULL;
3138 
/*
 * Hold the objset at "path" for dumping and, for ZPL datasets, set up
 * the SA attribute table.  With -K, first load the dataset's wrapping
 * key so encrypted contents can be read.  On success *osp is recorded
 * in sa_os; release with close_objset().  Returns 0 or an errno.
 */
static int
open_objset(const char *path, const void *tag, objset_t **osp)
{
	int err;
	uint64_t sa_attrs = 0;
	uint64_t version = 0;

	/* Only one objset may be open this way at a time. */
	VERIFY3P(sa_os, ==, NULL);

	/*
	 * We can't own an objset if it's redacted.  Therefore, we do this
	 * dance: hold the objset, then acquire a long hold on its dataset, then
	 * release the pool (which is held as part of holding the objset).
	 */

	if (dump_opt['K']) {
		/* decryption requested, try to load keys */
		err = dmu_objset_hold(path, tag, osp);
		if (err != 0) {
			(void) fprintf(stderr, "failed to hold dataset "
			    "'%s': %s\n",
			    path, strerror(err));
			return (err);
		}
		dsl_dataset_long_hold(dmu_objset_ds(*osp), tag);
		dsl_pool_rele(dmu_objset_pool(*osp), tag);

		/* succeeds or dies */
		zdb_load_key(*osp);

		/* release it all */
		dsl_dataset_long_rele(dmu_objset_ds(*osp), tag);
		dsl_dataset_rele(dmu_objset_ds(*osp), tag);
	}

	/* Re-hold, decrypting if a key is now loaded. */
	int ds_hold_flags = key_loaded ? DS_HOLD_FLAG_DECRYPT : 0;

	err = dmu_objset_hold_flags(path, ds_hold_flags, tag, osp);
	if (err != 0) {
		(void) fprintf(stderr, "failed to hold dataset '%s': %s\n",
		    path, strerror(err));
		return (err);
	}
	dsl_dataset_long_hold(dmu_objset_ds(*osp), tag);
	dsl_pool_rele(dmu_objset_pool(*osp), tag);

	/* SA setup is only possible when the data is readable. */
	if (dmu_objset_type(*osp) == DMU_OST_ZFS &&
	    (key_loaded || !(*osp)->os_encrypted)) {
		(void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR,
		    8, 1, &version);
		if (version >= ZPL_VERSION_SA) {
			(void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
			    8, 1, &sa_attrs);
		}
		err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END,
		    &sa_attr_table);
		if (err != 0) {
			/* Undo the holds taken above on failure. */
			(void) fprintf(stderr, "sa_setup failed: %s\n",
			    strerror(err));
			dsl_dataset_long_rele(dmu_objset_ds(*osp), tag);
			dsl_dataset_rele_flags(dmu_objset_ds(*osp),
			    ds_hold_flags, tag);
			*osp = NULL;
		}
	}
	sa_os = *osp;

	return (err);
}
3208 
/*
 * Undo open_objset(): tear down SA state, drop the long hold and the
 * dataset hold (with decryption flag if a key was loaded), and unload
 * any wrapping key that open_objset() loaded.
 */
static void
close_objset(objset_t *os, const void *tag)
{
	/* Must be the objset open_objset() recorded. */
	VERIFY3P(os, ==, sa_os);
	if (os->os_sa != NULL)
		sa_tear_down(os);
	dsl_dataset_long_rele(dmu_objset_ds(os), tag);
	dsl_dataset_rele_flags(dmu_objset_ds(os),
	    key_loaded ? DS_HOLD_FLAG_DECRYPT : 0, tag);
	sa_attr_table = NULL;
	sa_os = NULL;

	zdb_unload_key();
}
3223 
3224 static void
3225 fuid_table_destroy(void)
3226 {
3227 	if (fuid_table_loaded) {
3228 		zfs_fuid_table_destroy(&idx_tree, &domain_tree);
3229 		fuid_table_loaded = B_FALSE;
3230 	}
3231 }
3232 
3233 /*
3234  * print uid or gid information.
3235  * For normal POSIX id just the id is printed in decimal format.
3236  * For CIFS files with FUID the fuid is printed in hex followed by
3237  * the domain-rid string.
3238  */
3239 static void
3240 print_idstr(uint64_t id, const char *id_type)
3241 {
3242 	if (FUID_INDEX(id)) {
3243 		const char *domain =
3244 		    zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id));
3245 		(void) printf("\t%s     %llx [%s-%d]\n", id_type,
3246 		    (u_longlong_t)id, domain, (int)FUID_RID(id));
3247 	} else {
3248 		(void) printf("\t%s     %llu\n", id_type, (u_longlong_t)id);
3249 	}
3250 
3251 }
3252 
3253 static void
3254 dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)
3255 {
3256 	uint32_t uid_idx, gid_idx;
3257 
3258 	uid_idx = FUID_INDEX(uid);
3259 	gid_idx = FUID_INDEX(gid);
3260 
3261 	/* Load domain table, if not already loaded */
3262 	if (!fuid_table_loaded && (uid_idx || gid_idx)) {
3263 		uint64_t fuid_obj;
3264 
3265 		/* first find the fuid object.  It lives in the master node */
3266 		VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES,
3267 		    8, 1, &fuid_obj) == 0);
3268 		zfs_fuid_avl_tree_create(&idx_tree, &domain_tree);
3269 		(void) zfs_fuid_table_load(os, fuid_obj,
3270 		    &idx_tree, &domain_tree);
3271 		fuid_table_loaded = B_TRUE;
3272 	}
3273 
3274 	print_idstr(uid, "uid");
3275 	print_idstr(gid, "gid");
3276 }
3277 
3278 static void
3279 dump_znode_sa_xattr(sa_handle_t *hdl)
3280 {
3281 	nvlist_t *sa_xattr;
3282 	nvpair_t *elem = NULL;
3283 	int sa_xattr_size = 0;
3284 	int sa_xattr_entries = 0;
3285 	int error;
3286 	char *sa_xattr_packed;
3287 
3288 	error = sa_size(hdl, sa_attr_table[ZPL_DXATTR], &sa_xattr_size);
3289 	if (error || sa_xattr_size == 0)
3290 		return;
3291 
3292 	sa_xattr_packed = malloc(sa_xattr_size);
3293 	if (sa_xattr_packed == NULL)
3294 		return;
3295 
3296 	error = sa_lookup(hdl, sa_attr_table[ZPL_DXATTR],
3297 	    sa_xattr_packed, sa_xattr_size);
3298 	if (error) {
3299 		free(sa_xattr_packed);
3300 		return;
3301 	}
3302 
3303 	error = nvlist_unpack(sa_xattr_packed, sa_xattr_size, &sa_xattr, 0);
3304 	if (error) {
3305 		free(sa_xattr_packed);
3306 		return;
3307 	}
3308 
3309 	while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL)
3310 		sa_xattr_entries++;
3311 
3312 	(void) printf("\tSA xattrs: %d bytes, %d entries\n\n",
3313 	    sa_xattr_size, sa_xattr_entries);
3314 	while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) {
3315 		boolean_t can_print = !dump_opt['P'];
3316 		uchar_t *value;
3317 		uint_t cnt, idx;
3318 
3319 		(void) printf("\t\t%s = ", nvpair_name(elem));
3320 		nvpair_value_byte_array(elem, &value, &cnt);
3321 
3322 		for (idx = 0; idx < cnt; ++idx) {
3323 			if (!isprint(value[idx])) {
3324 				can_print = B_FALSE;
3325 				break;
3326 			}
3327 		}
3328 
3329 		for (idx = 0; idx < cnt; ++idx) {
3330 			if (can_print)
3331 				(void) putchar(value[idx]);
3332 			else
3333 				(void) printf("\\%3.3o", value[idx]);
3334 		}
3335 		(void) putchar('\n');
3336 	}
3337 
3338 	nvlist_free(sa_xattr);
3339 	free(sa_xattr_packed);
3340 }
3341 
3342 static void
3343 dump_znode_symlink(sa_handle_t *hdl)
3344 {
3345 	int sa_symlink_size = 0;
3346 	char linktarget[MAXPATHLEN];
3347 	int error;
3348 
3349 	error = sa_size(hdl, sa_attr_table[ZPL_SYMLINK], &sa_symlink_size);
3350 	if (error || sa_symlink_size == 0) {
3351 		return;
3352 	}
3353 	if (sa_symlink_size >= sizeof (linktarget)) {
3354 		(void) printf("symlink size %d is too large\n",
3355 		    sa_symlink_size);
3356 		return;
3357 	}
3358 	linktarget[sa_symlink_size] = '\0';
3359 	if (sa_lookup(hdl, sa_attr_table[ZPL_SYMLINK],
3360 	    &linktarget, sa_symlink_size) == 0)
3361 		(void) printf("\ttarget	%s\n", linktarget);
3362 }
3363 
/*
 * Object viewer for ZPL znodes (and SA objects): print the path (at
 * -dddd and above), ownership, timestamps, mode, size, link count,
 * flags, and optional attributes (projid, xattr dir, rdev, SA xattrs).
 */
static void
dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
{
	(void) data, (void) size;
	char path[MAXPATHLEN * 2];	/* allow for xattr and failure prefix */
	sa_handle_t *hdl;
	uint64_t xattr, rdev, gen;
	uint64_t uid, gid, mode, fsize, parent, links;
	uint64_t pflags;
	uint64_t acctm[2], modtm[2], chgtm[2], crtm[2];
	time_t z_crtime, z_atime, z_mtime, z_ctime;
	sa_bulk_attr_t bulk[12];
	int idx = 0;
	int error;

	/* Requires the SA setup done by open_objset(). */
	VERIFY3P(os, ==, sa_os);
	if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) {
		(void) printf("Failed to get handle for SA znode\n");
		return;
	}

	/* Fetch all the common attributes in a single bulk lookup. */
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL,
	    &links, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL,
	    &mode, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT],
	    NULL, &parent, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL,
	    &fsize, 8);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL,
	    acctm, 16);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL,
	    modtm, 16);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL,
	    crtm, 16);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL,
	    chgtm, 16);
	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL,
	    &pflags, 8);

	if (sa_bulk_lookup(hdl, bulk, idx)) {
		(void) sa_handle_destroy(hdl);
		return;
	}

	/* Word [0] of each timestamp pair holds seconds; [1] is unused here. */
	z_crtime = (time_t)crtm[0];
	z_atime = (time_t)acctm[0];
	z_mtime = (time_t)modtm[0];
	z_ctime = (time_t)chgtm[0];

	if (dump_opt['d'] > 4) {
		error = zfs_obj_to_path(os, object, path, sizeof (path));
		if (error == ESTALE) {
			(void) snprintf(path, sizeof (path), "on delete queue");
		} else if (error != 0) {
			/* No path and not on the delete queue: leaked. */
			leaked_objects++;
			(void) snprintf(path, sizeof (path),
			    "path not found, possibly leaked");
		}
		(void) printf("\tpath	%s\n", path);
	}

	if (S_ISLNK(mode))
		dump_znode_symlink(hdl);
	dump_uidgid(os, uid, gid);
	(void) printf("\tatime	%s", ctime(&z_atime));
	(void) printf("\tmtime	%s", ctime(&z_mtime));
	(void) printf("\tctime	%s", ctime(&z_ctime));
	(void) printf("\tcrtime	%s", ctime(&z_crtime));
	(void) printf("\tgen	%llu\n", (u_longlong_t)gen);
	(void) printf("\tmode	%llo\n", (u_longlong_t)mode);
	(void) printf("\tsize	%llu\n", (u_longlong_t)fsize);
	(void) printf("\tparent	%llu\n", (u_longlong_t)parent);
	(void) printf("\tlinks	%llu\n", (u_longlong_t)links);
	(void) printf("\tpflags	%llx\n", (u_longlong_t)pflags);
	if (dmu_objset_projectquota_enabled(os) && (pflags & ZFS_PROJID)) {
		uint64_t projid;

		if (sa_lookup(hdl, sa_attr_table[ZPL_PROJID], &projid,
		    sizeof (uint64_t)) == 0)
			(void) printf("\tprojid	%llu\n", (u_longlong_t)projid);
	}
	if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr,
	    sizeof (uint64_t)) == 0)
		(void) printf("\txattr	%llu\n", (u_longlong_t)xattr);
	if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev,
	    sizeof (uint64_t)) == 0)
		(void) printf("\trdev	0x%016llx\n", (u_longlong_t)rdev);
	dump_znode_sa_xattr(hdl);
	sa_handle_destroy(hdl);
}
3458 
3459 static void
3460 dump_acl(objset_t *os, uint64_t object, void *data, size_t size)
3461 {
3462 	(void) os, (void) object, (void) data, (void) size;
3463 }
3464 
3465 static void
3466 dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
3467 {
3468 	(void) os, (void) object, (void) data, (void) size;
3469 }
3470 
/*
 * Per-DMU-object-type dump routines, indexed via ZDB_OT_TYPE() in
 * dump_object().  The trailing dump_unknown entry catches any type
 * outside the known range.
 */
static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
	dump_none,		/* unallocated			*/
	dump_zap,		/* object directory		*/
	dump_uint64,		/* object array			*/
	dump_none,		/* packed nvlist		*/
	dump_packed_nvlist,	/* packed nvlist size		*/
	dump_none,		/* bpobj			*/
	dump_bpobj,		/* bpobj header			*/
	dump_none,		/* SPA space map header		*/
	dump_none,		/* SPA space map		*/
	dump_none,		/* ZIL intent log		*/
	dump_dnode,		/* DMU dnode			*/
	dump_dmu_objset,	/* DMU objset			*/
	dump_dsl_dir,		/* DSL directory		*/
	dump_zap,		/* DSL directory child map	*/
	dump_zap,		/* DSL dataset snap map		*/
	dump_zap,		/* DSL props			*/
	dump_dsl_dataset,	/* DSL dataset			*/
	dump_znode,		/* ZFS znode			*/
	dump_acl,		/* ZFS V0 ACL			*/
	dump_uint8,		/* ZFS plain file		*/
	dump_zpldir,		/* ZFS directory		*/
	dump_zap,		/* ZFS master node		*/
	dump_zap,		/* ZFS delete queue		*/
	dump_uint8,		/* zvol object			*/
	dump_zap,		/* zvol prop			*/
	dump_uint8,		/* other uint8[]		*/
	dump_uint64,		/* other uint64[]		*/
	dump_zap,		/* other ZAP			*/
	dump_zap,		/* persistent error log		*/
	dump_uint8,		/* SPA history			*/
	dump_history_offsets,	/* SPA history offsets		*/
	dump_zap,		/* Pool properties		*/
	dump_zap,		/* DSL permissions		*/
	dump_acl,		/* ZFS ACL			*/
	dump_uint8,		/* ZFS SYSACL			*/
	dump_none,		/* FUID nvlist			*/
	dump_packed_nvlist,	/* FUID nvlist size		*/
	dump_zap,		/* DSL dataset next clones	*/
	dump_zap,		/* DSL scrub queue		*/
	dump_zap,		/* ZFS user/group/project used	*/
	dump_zap,		/* ZFS user/group/project quota	*/
	dump_zap,		/* snapshot refcount tags	*/
	dump_ddt_zap,		/* DDT ZAP object		*/
	dump_zap,		/* DDT statistics		*/
	dump_znode,		/* SA object			*/
	dump_zap,		/* SA Master Node		*/
	dump_sa_attrs,		/* SA attribute registration	*/
	dump_sa_layouts,	/* SA attribute layouts		*/
	dump_zap,		/* DSL scrub translations	*/
	dump_none,		/* fake dedup BP		*/
	dump_zap,		/* deadlist			*/
	dump_none,		/* deadlist hdr			*/
	dump_zap,		/* dsl clones			*/
	dump_bpobj_subobjs,	/* bpobj subobjs		*/
	dump_unknown,		/* Unknown type, must be last	*/
};
3528 
3529 static boolean_t
3530 match_object_type(dmu_object_type_t obj_type, uint64_t flags)
3531 {
3532 	boolean_t match = B_TRUE;
3533 
3534 	switch (obj_type) {
3535 	case DMU_OT_DIRECTORY_CONTENTS:
3536 		if (!(flags & ZOR_FLAG_DIRECTORY))
3537 			match = B_FALSE;
3538 		break;
3539 	case DMU_OT_PLAIN_FILE_CONTENTS:
3540 		if (!(flags & ZOR_FLAG_PLAIN_FILE))
3541 			match = B_FALSE;
3542 		break;
3543 	case DMU_OT_SPACE_MAP:
3544 		if (!(flags & ZOR_FLAG_SPACE_MAP))
3545 			match = B_FALSE;
3546 		break;
3547 	default:
3548 		if (strcmp(zdb_ot_name(obj_type), "zap") == 0) {
3549 			if (!(flags & ZOR_FLAG_ZAP))
3550 				match = B_FALSE;
3551 			break;
3552 		}
3553 
3554 		/*
3555 		 * If all bits except some of the supported flags are
3556 		 * set, the user combined the all-types flag (A) with
3557 		 * a negated flag to exclude some types (e.g. A-f to
3558 		 * show all object types except plain files).
3559 		 */
3560 		if ((flags | ZOR_SUPPORTED_FLAGS) != ZOR_FLAG_ALL_TYPES)
3561 			match = B_FALSE;
3562 
3563 		break;
3564 	}
3565 
3566 	return (match);
3567 }
3568 
/*
 * Print one summary line for the given object (block sizes, dsize,
 * dnsize, lsize, fill percentage, type, plus non-inherited checksum/
 * compression), and at higher verbosity the bonus info, dnode flags,
 * object contents (via object_viewer[]), spill block, indirect blocks,
 * and the object's allocated segments.  Object 0 means the meta-dnode.
 * If flags is non-zero, objects not matching the type filter are
 * skipped.  *dnode_slots_used, if non-NULL, receives the slot count.
 */
static void
dump_object(objset_t *os, uint64_t object, int verbosity,
    boolean_t *print_header, uint64_t *dnode_slots_used, uint64_t flags)
{
	dmu_buf_t *db = NULL;
	dmu_object_info_t doi;
	dnode_t *dn;
	boolean_t dnode_held = B_FALSE;
	void *bonus = NULL;
	size_t bsize = 0;
	char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32];
	char bonus_size[32];
	char aux[50];
	int error;

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (iblk) >= NN_NUMBUF_SZ, "iblk truncated");
	_Static_assert(sizeof (dblk) >= NN_NUMBUF_SZ, "dblk truncated");
	_Static_assert(sizeof (lsize) >= NN_NUMBUF_SZ, "lsize truncated");
	_Static_assert(sizeof (asize) >= NN_NUMBUF_SZ, "asize truncated");
	_Static_assert(sizeof (bonus_size) >= NN_NUMBUF_SZ,
	    "bonus_size truncated");

	if (*print_header) {
		(void) printf("\n%10s  %3s  %5s  %5s  %5s  %6s  %5s  %6s  %s\n",
		    "Object", "lvl", "iblk", "dblk", "dsize", "dnsize",
		    "lsize", "%full", "type");
		*print_header = 0;
	}

	if (object == 0) {
		dn = DMU_META_DNODE(os);
		dmu_object_info_from_dnode(dn, &doi);
	} else {
		/*
		 * Encrypted datasets will have sensitive bonus buffers
		 * encrypted. Therefore we cannot hold the bonus buffer and
		 * must hold the dnode itself instead.
		 */
		error = dmu_object_info(os, object, &doi);
		if (error)
			fatal("dmu_object_info() failed, errno %u", error);

		if (!key_loaded && os->os_encrypted &&
		    DMU_OT_IS_ENCRYPTED(doi.doi_bonus_type)) {
			error = dnode_hold(os, object, FTAG, &dn);
			if (error)
				fatal("dnode_hold() failed, errno %u", error);
			dnode_held = B_TRUE;
		} else {
			error = dmu_bonus_hold(os, object, FTAG, &db);
			if (error)
				fatal("dmu_bonus_hold(%llu) failed, errno %u",
				    object, error);
			bonus = db->db_data;
			bsize = db->db_size;
			dn = DB_DNODE((dmu_buf_impl_t *)db);
		}
	}

	/*
	 * Default to showing all object types if no flags were specified.
	 */
	if (flags != 0 && flags != ZOR_FLAG_ALL_TYPES &&
	    !match_object_type(doi.doi_type, flags))
		goto out;

	if (dnode_slots_used)
		*dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE;

	zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk));
	zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk));
	zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize));
	zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize));
	zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size));
	zdb_nicenum(doi.doi_dnodesize, dnsize, sizeof (dnsize));
	/* For the meta-dnode (object 0), fill counts dnodes per block. */
	(void) snprintf(fill, sizeof (fill), "%6.2f", 100.0 *
	    doi.doi_fill_count * doi.doi_data_block_size / (object == 0 ?
	    DNODES_PER_BLOCK : 1) / doi.doi_max_offset);

	aux[0] = '\0';

	if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) {
		(void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
		    " (K=%s)", ZDB_CHECKSUM_NAME(doi.doi_checksum));
	}

	/* Report compression, resolving inherited settings at -dddddd. */
	if (doi.doi_compress == ZIO_COMPRESS_INHERIT &&
	    ZIO_COMPRESS_HASLEVEL(os->os_compress) && verbosity >= 6) {
		const char *compname = NULL;
		if (zfs_prop_index_to_string(ZFS_PROP_COMPRESSION,
		    ZIO_COMPRESS_RAW(os->os_compress, os->os_complevel),
		    &compname) == 0) {
			(void) snprintf(aux + strlen(aux),
			    sizeof (aux) - strlen(aux), " (Z=inherit=%s)",
			    compname);
		} else {
			(void) snprintf(aux + strlen(aux),
			    sizeof (aux) - strlen(aux),
			    " (Z=inherit=%s-unknown)",
			    ZDB_COMPRESS_NAME(os->os_compress));
		}
	} else if (doi.doi_compress == ZIO_COMPRESS_INHERIT && verbosity >= 6) {
		(void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
		    " (Z=inherit=%s)", ZDB_COMPRESS_NAME(os->os_compress));
	} else if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) {
		(void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux),
		    " (Z=%s)", ZDB_COMPRESS_NAME(doi.doi_compress));
	}

	(void) printf("%10lld  %3u  %5s  %5s  %5s  %6s  %5s  %6s  %s%s\n",
	    (u_longlong_t)object, doi.doi_indirection, iblk, dblk,
	    asize, dnsize, lsize, fill, zdb_ot_name(doi.doi_type), aux);

	if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
		(void) printf("%10s  %3s  %5s  %5s  %5s  %5s  %5s  %6s  %s\n",
		    "", "", "", "", "", "", bonus_size, "bonus",
		    zdb_ot_name(doi.doi_bonus_type));
	}

	if (verbosity >= 4) {
		(void) printf("\tdnode flags: %s%s%s%s\n",
		    (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ?
		    "USED_BYTES " : "",
		    (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ?
		    "USERUSED_ACCOUNTED " : "",
		    (dn->dn_phys->dn_flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) ?
		    "USEROBJUSED_ACCOUNTED " : "",
		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ?
		    "SPILL_BLKPTR" : "");
		(void) printf("\tdnode maxblkid: %llu\n",
		    (longlong_t)dn->dn_phys->dn_maxblkid);

		/* Bonus and contents are only dumped when readable. */
		if (!dnode_held) {
			object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os,
			    object, bonus, bsize);
		} else {
			(void) printf("\t\t(bonus encrypted)\n");
		}

		if (key_loaded ||
		    (!os->os_encrypted || !DMU_OT_IS_ENCRYPTED(doi.doi_type))) {
			object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object,
			    NULL, 0);
		} else {
			(void) printf("\t\t(object encrypted)\n");
		}

		*print_header = B_TRUE;
	}

	if (verbosity >= 5) {
		if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
			char blkbuf[BP_SPRINTF_LEN];
			snprintf_blkptr_compact(blkbuf, sizeof (blkbuf),
			    DN_SPILL_BLKPTR(dn->dn_phys), B_FALSE);
			(void) printf("\nSpill block: %s\n", blkbuf);
		}
		dump_indirect(dn);
	}

	if (verbosity >= 5) {
		/*
		 * Report the list of segments that comprise the object.
		 */
		uint64_t start = 0;
		uint64_t end;
		uint64_t blkfill = 1;
		int minlvl = 1;

		if (dn->dn_type == DMU_OT_DNODE) {
			minlvl = 0;
			blkfill = DNODES_PER_BLOCK;
		}

		/* Walk allocated data / hole pairs until we run out. */
		for (;;) {
			char segsize[32];
			/* make sure nicenum has enough space */
			_Static_assert(sizeof (segsize) >= NN_NUMBUF_SZ,
			    "segsize truncated");
			error = dnode_next_offset(dn,
			    0, &start, minlvl, blkfill, 0);
			if (error)
				break;
			end = start;
			error = dnode_next_offset(dn,
			    DNODE_FIND_HOLE, &end, minlvl, blkfill, 0);
			zdb_nicenum(end - start, segsize, sizeof (segsize));
			(void) printf("\t\tsegment [%016llx, %016llx)"
			    " size %5s\n", (u_longlong_t)start,
			    (u_longlong_t)end, segsize);
			if (error)
				break;
			start = end;
		}
	}

out:
	if (db != NULL)
		dmu_buf_rele(db, FTAG);
	if (dnode_held)
		dnode_rele(dn, FTAG);
}
3772 
/*
 * Record the MOS objects referenced by a dsl_dir for leak detection.
 */
static void
count_dir_mos_objects(dsl_dir_t *dd)
{
	mos_obj_refd(dd->dd_object);
	mos_obj_refd(dsl_dir_phys(dd)->dd_child_dir_zapobj);
	mos_obj_refd(dsl_dir_phys(dd)->dd_deleg_zapobj);
	mos_obj_refd(dsl_dir_phys(dd)->dd_props_zapobj);
	mos_obj_refd(dsl_dir_phys(dd)->dd_clones);

	/*
	 * The dd_crypto_obj can be referenced by multiple dsl_dir's.
	 * Ignore the references after the first one.
	 */
	mos_obj_refd_multiple(dd->dd_crypto_obj);
}
3788 
/*
 * Record the MOS objects referenced by a dataset (and, for non-
 * snapshots, by its dsl_dir) for leak detection.
 */
static void
count_ds_mos_objects(dsl_dataset_t *ds)
{
	mos_obj_refd(ds->ds_object);
	mos_obj_refd(dsl_dataset_phys(ds)->ds_next_clones_obj);
	mos_obj_refd(dsl_dataset_phys(ds)->ds_props_obj);
	mos_obj_refd(dsl_dataset_phys(ds)->ds_userrefs_obj);
	mos_obj_refd(dsl_dataset_phys(ds)->ds_snapnames_zapobj);
	mos_obj_refd(ds->ds_bookmarks_obj);

	if (!dsl_dataset_is_snapshot(ds)) {
		count_dir_mos_objects(ds->ds_dir);
	}
}
3803 
/* Human-readable names for dmu_objset_type_t values */
static const char *const objset_types[DMU_OST_NUMTYPES] = {
	"NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" };
3806 
3807 /*
3808  * Parse a string denoting a range of object IDs of the form
3809  * <start>[:<end>[:flags]], and store the results in zor.
3810  * Return 0 on success. On error, return 1 and update the msg
3811  * pointer to point to a descriptive error message.
3812  */
3813 static int
3814 parse_object_range(char *range, zopt_object_range_t *zor, const char **msg)
3815 {
3816 	uint64_t flags = 0;
3817 	char *p, *s, *dup, *flagstr, *tmp = NULL;
3818 	size_t len;
3819 	int i;
3820 	int rc = 0;
3821 
3822 	if (strchr(range, ':') == NULL) {
3823 		zor->zor_obj_start = strtoull(range, &p, 0);
3824 		if (*p != '\0') {
3825 			*msg = "Invalid characters in object ID";
3826 			rc = 1;
3827 		}
3828 		zor->zor_obj_start = ZDB_MAP_OBJECT_ID(zor->zor_obj_start);
3829 		zor->zor_obj_end = zor->zor_obj_start;
3830 		return (rc);
3831 	}
3832 
3833 	if (strchr(range, ':') == range) {
3834 		*msg = "Invalid leading colon";
3835 		rc = 1;
3836 		return (rc);
3837 	}
3838 
3839 	len = strlen(range);
3840 	if (range[len - 1] == ':') {
3841 		*msg = "Invalid trailing colon";
3842 		rc = 1;
3843 		return (rc);
3844 	}
3845 
3846 	dup = strdup(range);
3847 	s = strtok_r(dup, ":", &tmp);
3848 	zor->zor_obj_start = strtoull(s, &p, 0);
3849 
3850 	if (*p != '\0') {
3851 		*msg = "Invalid characters in start object ID";
3852 		rc = 1;
3853 		goto out;
3854 	}
3855 
3856 	s = strtok_r(NULL, ":", &tmp);
3857 	zor->zor_obj_end = strtoull(s, &p, 0);
3858 
3859 	if (*p != '\0') {
3860 		*msg = "Invalid characters in end object ID";
3861 		rc = 1;
3862 		goto out;
3863 	}
3864 
3865 	if (zor->zor_obj_start > zor->zor_obj_end) {
3866 		*msg = "Start object ID may not exceed end object ID";
3867 		rc = 1;
3868 		goto out;
3869 	}
3870 
3871 	s = strtok_r(NULL, ":", &tmp);
3872 	if (s == NULL) {
3873 		zor->zor_flags = ZOR_FLAG_ALL_TYPES;
3874 		goto out;
3875 	} else if (strtok_r(NULL, ":", &tmp) != NULL) {
3876 		*msg = "Invalid colon-delimited field after flags";
3877 		rc = 1;
3878 		goto out;
3879 	}
3880 
3881 	flagstr = s;
3882 	for (i = 0; flagstr[i]; i++) {
3883 		int bit;
3884 		boolean_t negation = (flagstr[i] == '-');
3885 
3886 		if (negation) {
3887 			i++;
3888 			if (flagstr[i] == '\0') {
3889 				*msg = "Invalid trailing negation operator";
3890 				rc = 1;
3891 				goto out;
3892 			}
3893 		}
3894 		bit = flagbits[(uchar_t)flagstr[i]];
3895 		if (bit == 0) {
3896 			*msg = "Invalid flag";
3897 			rc = 1;
3898 			goto out;
3899 		}
3900 		if (negation)
3901 			flags &= ~bit;
3902 		else
3903 			flags |= bit;
3904 	}
3905 	zor->zor_flags = flags;
3906 
3907 	zor->zor_obj_start = ZDB_MAP_OBJECT_ID(zor->zor_obj_start);
3908 	zor->zor_obj_end = ZDB_MAP_OBJECT_ID(zor->zor_obj_end);
3909 
3910 out:
3911 	free(dup);
3912 	return (rc);
3913 }
3914 
/*
 * Print a summary line for the objset, then dump its objects.  If the
 * user supplied explicit object ranges (-d obj[:end[:flags]]), only
 * those objects are dumped; otherwise at verbosity >= 2 every object
 * in the objset is dumped and dnode-slot usage statistics are printed.
 */
static void
dump_objset(objset_t *os)
{
	dmu_objset_stats_t dds = { 0 };
	uint64_t object, object_count;
	uint64_t refdbytes, usedobjs, scratch;
	char numbuf[32];
	char blkbuf[BP_SPRINTF_LEN + 20];
	char osname[ZFS_MAX_DATASET_NAME_LEN];
	const char *type = "UNKNOWN";
	int verbosity = dump_opt['d'];
	boolean_t print_header;
	unsigned i;
	int error;
	uint64_t total_slots_used = 0;
	uint64_t max_slot_used = 0;
	uint64_t dnode_slots;
	uint64_t obj_start;
	uint64_t obj_end;
	uint64_t flags;

	/* make sure nicenum has enough space */
	_Static_assert(sizeof (numbuf) >= NN_NUMBUF_SZ, "numbuf truncated");

	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
	dmu_objset_fast_stat(os, &dds);
	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);

	print_header = B_TRUE;

	if (dds.dds_type < DMU_OST_NUMTYPES)
		type = objset_types[dds.dds_type];

	if (dds.dds_type == DMU_OST_META) {
		/* The MOS has no dataset; derive its stats directly. */
		dds.dds_creation_txg = TXG_INITIAL;
		usedobjs = BP_GET_FILL(os->os_rootbp);
		refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)->
		    dd_used_bytes;
	} else {
		dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
	}

	ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp));

	zdb_nicenum(refdbytes, numbuf, sizeof (numbuf));

	if (verbosity >= 4) {
		(void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp ");
		(void) snprintf_blkptr(blkbuf + strlen(blkbuf),
		    sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp);
	} else {
		blkbuf[0] = '\0';
	}

	dmu_objset_name(os, osname);

	(void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, "
	    "%s, %llu objects%s%s\n",
	    osname, type, (u_longlong_t)dmu_objset_id(os),
	    (u_longlong_t)dds.dds_creation_txg,
	    numbuf, (u_longlong_t)usedobjs, blkbuf,
	    (dds.dds_inconsistent) ? " (inconsistent)" : "");

	/* Dump only the explicitly requested object ranges, if any. */
	for (i = 0; i < zopt_object_args; i++) {
		obj_start = zopt_object_ranges[i].zor_obj_start;
		obj_end = zopt_object_ranges[i].zor_obj_end;
		flags = zopt_object_ranges[i].zor_flags;

		object = obj_start;
		if (object == 0 || obj_start == obj_end)
			dump_object(os, object, verbosity, &print_header, NULL,
			    flags);
		else
			/* back up so dmu_object_next() includes obj_start */
			object--;

		while ((dmu_object_next(os, &object, B_FALSE, 0) == 0) &&
		    object <= obj_end) {
			dump_object(os, object, verbosity, &print_header, NULL,
			    flags);
		}
	}

	if (zopt_object_args > 0) {
		(void) printf("\n");
		return;
	}

	if (dump_opt['i'] != 0 || verbosity >= 2)
		dump_intent_log(dmu_objset_zil(os));

	if (dmu_objset_ds(os) != NULL) {
		dsl_dataset_t *ds = dmu_objset_ds(os);
		dump_blkptr_list(&ds->ds_deadlist, "Deadlist");
		if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
		    !dmu_objset_is_snapshot(os)) {
			dump_blkptr_list(&ds->ds_dir->dd_livelist, "Livelist");
			if (verify_dd_livelist(os) != 0)
				fatal("livelist is incorrect");
		}

		if (dsl_dataset_remap_deadlist_exists(ds)) {
			(void) printf("ds_remap_deadlist:\n");
			dump_blkptr_list(&ds->ds_remap_deadlist, "Deadlist");
		}
		count_ds_mos_objects(ds);
	}

	if (dmu_objset_ds(os) != NULL)
		dump_bookmarks(os, verbosity);

	if (verbosity < 2)
		return;

	if (BP_IS_HOLE(os->os_rootbp))
		return;

	/* Full walk: the master node plus any quota-accounting objects. */
	dump_object(os, 0, verbosity, &print_header, NULL, 0);
	object_count = 0;
	if (DMU_USERUSED_DNODE(os) != NULL &&
	    DMU_USERUSED_DNODE(os)->dn_type != 0) {
		dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header,
		    NULL, 0);
		dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header,
		    NULL, 0);
	}

	if (DMU_PROJECTUSED_DNODE(os) != NULL &&
	    DMU_PROJECTUSED_DNODE(os)->dn_type != 0)
		dump_object(os, DMU_PROJECTUSED_OBJECT, verbosity,
		    &print_header, NULL, 0);

	object = 0;
	while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
		dump_object(os, object, verbosity, &print_header, &dnode_slots,
		    0);
		object_count++;
		total_slots_used += dnode_slots;
		max_slot_used = object + dnode_slots - 1;
	}

	(void) printf("\n");

	(void) printf("    Dnode slots:\n");
	(void) printf("\tTotal used:    %10llu\n",
	    (u_longlong_t)total_slots_used);
	(void) printf("\tMax used:      %10llu\n",
	    (u_longlong_t)max_slot_used);
	(void) printf("\tPercent empty: %10lf\n",
	    (double)(max_slot_used - total_slots_used)*100 /
	    (double)max_slot_used);
	(void) printf("\n");

	/* ESRCH is the expected end-of-iteration return value. */
	if (error != ESRCH) {
		(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
		abort();
	}

	ASSERT3U(object_count, ==, usedobjs);

	if (leaked_objects != 0) {
		(void) printf("%d potentially leaked objects detected\n",
		    leaked_objects);
		leaked_objects = 0;
	}
}
4080 
/*
 * Print one uberblock, bracketed by the caller-supplied header and
 * footer strings (either may be NULL to print nothing).  MMP fields
 * are shown only when the uberblock carries valid MMP state, and the
 * rootbp only at -uuuu and above.
 */
static void
dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
{
	time_t timestamp = ub->ub_timestamp;

	(void) printf("%s", header ? header : "");
	(void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic);
	(void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version);
	(void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg);
	(void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum);
	/* ctime() supplies the trailing newline here */
	(void) printf("\ttimestamp = %llu UTC = %s",
	    (u_longlong_t)ub->ub_timestamp, ctime(&timestamp));

	(void) printf("\tmmp_magic = %016llx\n",
	    (u_longlong_t)ub->ub_mmp_magic);
	if (MMP_VALID(ub)) {
		(void) printf("\tmmp_delay = %0llu\n",
		    (u_longlong_t)ub->ub_mmp_delay);
		if (MMP_SEQ_VALID(ub))
			(void) printf("\tmmp_seq = %u\n",
			    (unsigned int) MMP_SEQ(ub));
		if (MMP_FAIL_INT_VALID(ub))
			(void) printf("\tmmp_fail = %u\n",
			    (unsigned int) MMP_FAIL_INT(ub));
		if (MMP_INTERVAL_VALID(ub))
			(void) printf("\tmmp_write = %u\n",
			    (unsigned int) MMP_INTERVAL(ub));
		/* After MMP_* to make summarize_uberblock_mmp cleaner */
		(void) printf("\tmmp_valid = %x\n",
		    (unsigned int) ub->ub_mmp_config & 0xFF);
	}

	if (dump_opt['u'] >= 4) {
		char blkbuf[BP_SPRINTF_LEN];
		snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);
		(void) printf("\trootbp = %s\n", blkbuf);
	}
	(void) printf("\tcheckpoint_txg = %llu\n",
	    (u_longlong_t)ub->ub_checkpoint_txg);

	(void) printf("\traidz_reflow state=%u off=%llu\n",
	    (int)RRSS_GET_STATE(ub),
	    (u_longlong_t)RRSS_GET_OFFSET(ub));

	(void) printf("%s", footer ? footer : "");
}
4127 
4128 static void
4129 dump_config(spa_t *spa)
4130 {
4131 	dmu_buf_t *db;
4132 	size_t nvsize = 0;
4133 	int error = 0;
4134 
4135 
4136 	error = dmu_bonus_hold(spa->spa_meta_objset,
4137 	    spa->spa_config_object, FTAG, &db);
4138 
4139 	if (error == 0) {
4140 		nvsize = *(uint64_t *)db->db_data;
4141 		dmu_buf_rele(db, FTAG);
4142 
4143 		(void) printf("\nMOS Configuration:\n");
4144 		dump_packed_nvlist(spa->spa_meta_objset,
4145 		    spa->spa_config_object, (void *)&nvsize, 1);
4146 	} else {
4147 		(void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d",
4148 		    (u_longlong_t)spa->spa_config_object, error);
4149 	}
4150 }
4151 
4152 static void
4153 dump_cachefile(const char *cachefile)
4154 {
4155 	int fd;
4156 	struct stat64 statbuf;
4157 	char *buf;
4158 	nvlist_t *config;
4159 
4160 	if ((fd = open64(cachefile, O_RDONLY)) < 0) {
4161 		(void) printf("cannot open '%s': %s\n", cachefile,
4162 		    strerror(errno));
4163 		exit(1);
4164 	}
4165 
4166 	if (fstat64(fd, &statbuf) != 0) {
4167 		(void) printf("failed to stat '%s': %s\n", cachefile,
4168 		    strerror(errno));
4169 		exit(1);
4170 	}
4171 
4172 	if ((buf = malloc(statbuf.st_size)) == NULL) {
4173 		(void) fprintf(stderr, "failed to allocate %llu bytes\n",
4174 		    (u_longlong_t)statbuf.st_size);
4175 		exit(1);
4176 	}
4177 
4178 	if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
4179 		(void) fprintf(stderr, "failed to read %llu bytes\n",
4180 		    (u_longlong_t)statbuf.st_size);
4181 		exit(1);
4182 	}
4183 
4184 	(void) close(fd);
4185 
4186 	if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) {
4187 		(void) fprintf(stderr, "failed to unpack nvlist\n");
4188 		exit(1);
4189 	}
4190 
4191 	free(buf);
4192 
4193 	dump_nvlist(config, 0);
4194 
4195 	nvlist_free(config);
4196 }
4197 
4198 /*
4199  * ZFS label nvlist stats
4200  */
4201 typedef struct zdb_nvl_stats {
4202 	int		zns_list_count;
4203 	int		zns_leaf_count;
4204 	size_t		zns_leaf_largest;
4205 	size_t		zns_leaf_total;
4206 	nvlist_t	*zns_string;
4207 	nvlist_t	*zns_uint64;
4208 	nvlist_t	*zns_boolean;
4209 } zdb_nvl_stats_t;
4210 
4211 static void
4212 collect_nvlist_stats(nvlist_t *nvl, zdb_nvl_stats_t *stats)
4213 {
4214 	nvlist_t *list, **array;
4215 	nvpair_t *nvp = NULL;
4216 	const char *name;
4217 	uint_t i, items;
4218 
4219 	stats->zns_list_count++;
4220 
4221 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
4222 		name = nvpair_name(nvp);
4223 
4224 		switch (nvpair_type(nvp)) {
4225 		case DATA_TYPE_STRING:
4226 			fnvlist_add_string(stats->zns_string, name,
4227 			    fnvpair_value_string(nvp));
4228 			break;
4229 		case DATA_TYPE_UINT64:
4230 			fnvlist_add_uint64(stats->zns_uint64, name,
4231 			    fnvpair_value_uint64(nvp));
4232 			break;
4233 		case DATA_TYPE_BOOLEAN:
4234 			fnvlist_add_boolean(stats->zns_boolean, name);
4235 			break;
4236 		case DATA_TYPE_NVLIST:
4237 			if (nvpair_value_nvlist(nvp, &list) == 0)
4238 				collect_nvlist_stats(list, stats);
4239 			break;
4240 		case DATA_TYPE_NVLIST_ARRAY:
4241 			if (nvpair_value_nvlist_array(nvp, &array, &items) != 0)
4242 				break;
4243 
4244 			for (i = 0; i < items; i++) {
4245 				collect_nvlist_stats(array[i], stats);
4246 
4247 				/* collect stats on leaf vdev */
4248 				if (strcmp(name, "children") == 0) {
4249 					size_t size;
4250 
4251 					(void) nvlist_size(array[i], &size,
4252 					    NV_ENCODE_XDR);
4253 					stats->zns_leaf_total += size;
4254 					if (size > stats->zns_leaf_largest)
4255 						stats->zns_leaf_largest = size;
4256 					stats->zns_leaf_count++;
4257 				}
4258 			}
4259 			break;
4260 		default:
4261 			(void) printf("skip type %d!\n", (int)nvpair_type(nvp));
4262 		}
4263 	}
4264 }
4265 
/*
 * Print a size breakdown of an nvlist (integers, strings, booleans,
 * and structural overhead) relative to cap, the space available for
 * it on disk.  Used with -l to show how full the label config is.
 */
static void
dump_nvlist_stats(nvlist_t *nvl, size_t cap)
{
	zdb_nvl_stats_t stats = { 0 };
	size_t size, sum = 0, total;
	size_t noise;

	/* requires nvlist with non-unique names for stat collection */
	VERIFY0(nvlist_alloc(&stats.zns_string, 0, 0));
	VERIFY0(nvlist_alloc(&stats.zns_uint64, 0, 0));
	VERIFY0(nvlist_alloc(&stats.zns_boolean, 0, 0));
	/* size of an empty nvlist, subtracted from each category below */
	VERIFY0(nvlist_size(stats.zns_boolean, &noise, NV_ENCODE_XDR));

	(void) printf("\n\nZFS Label NVList Config Stats:\n");

	VERIFY0(nvlist_size(nvl, &total, NV_ENCODE_XDR));
	(void) printf("  %d bytes used, %d bytes free (using %4.1f%%)\n\n",
	    (int)total, (int)(cap - total), 100.0 * total / cap);

	collect_nvlist_stats(nvl, &stats);

	VERIFY0(nvlist_size(stats.zns_uint64, &size, NV_ENCODE_XDR));
	size -= noise;
	sum += size;
	(void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "integers:",
	    (int)fnvlist_num_pairs(stats.zns_uint64),
	    (int)size, 100.0 * size / total);

	VERIFY0(nvlist_size(stats.zns_string, &size, NV_ENCODE_XDR));
	size -= noise;
	sum += size;
	(void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "strings:",
	    (int)fnvlist_num_pairs(stats.zns_string),
	    (int)size, 100.0 * size / total);

	VERIFY0(nvlist_size(stats.zns_boolean, &size, NV_ENCODE_XDR));
	size -= noise;
	sum += size;
	(void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "booleans:",
	    (int)fnvlist_num_pairs(stats.zns_boolean),
	    (int)size, 100.0 * size / total);

	size = total - sum;	/* treat remainder as nvlist overhead */
	(void) printf("%12s %4d %6d bytes (%5.2f%%)\n\n", "nvlists:",
	    stats.zns_list_count, (int)size, 100.0 * size / total);

	if (stats.zns_leaf_count > 0) {
		size_t average = stats.zns_leaf_total / stats.zns_leaf_count;

		(void) printf("%12s %4d %6d bytes average\n", "leaf vdevs:",
		    stats.zns_leaf_count, (int)average);
		(void) printf("%24d bytes largest\n",
		    (int)stats.zns_leaf_largest);

		if (dump_opt['l'] >= 3 && average > 0)
			(void) printf("  space for %d additional leaf vdevs\n",
			    (int)((cap - total) / average));
	}
	(void) printf("\n");

	nvlist_free(stats.zns_string);
	nvlist_free(stats.zns_uint64);
	nvlist_free(stats.zns_boolean);
}
4330 
/*
 * One unique checksum seen while scanning a device's labels, with a
 * per-label flag recording which of the VDEV_LABELS copies carried it.
 */
typedef struct cksum_record {
	zio_cksum_t cksum;
	boolean_t labels[VDEV_LABELS];
	avl_node_t link;
} cksum_record_t;
4336 
4337 static int
4338 cksum_record_compare(const void *x1, const void *x2)
4339 {
4340 	const cksum_record_t *l = (cksum_record_t *)x1;
4341 	const cksum_record_t *r = (cksum_record_t *)x2;
4342 	int arraysize = ARRAY_SIZE(l->cksum.zc_word);
4343 	int difference = 0;
4344 
4345 	for (int i = 0; i < arraysize; i++) {
4346 		difference = TREE_CMP(l->cksum.zc_word[i], r->cksum.zc_word[i]);
4347 		if (difference)
4348 			break;
4349 	}
4350 
4351 	return (difference);
4352 }
4353 
4354 static cksum_record_t *
4355 cksum_record_alloc(zio_cksum_t *cksum, int l)
4356 {
4357 	cksum_record_t *rec;
4358 
4359 	rec = umem_zalloc(sizeof (*rec), UMEM_NOFAIL);
4360 	rec->cksum = *cksum;
4361 	rec->labels[l] = B_TRUE;
4362 
4363 	return (rec);
4364 }
4365 
4366 static cksum_record_t *
4367 cksum_record_lookup(avl_tree_t *tree, zio_cksum_t *cksum)
4368 {
4369 	cksum_record_t lookup = { .cksum = *cksum };
4370 	avl_index_t where;
4371 
4372 	return (avl_find(tree, &lookup, &where));
4373 }
4374 
4375 static cksum_record_t *
4376 cksum_record_insert(avl_tree_t *tree, zio_cksum_t *cksum, int l)
4377 {
4378 	cksum_record_t *rec;
4379 
4380 	rec = cksum_record_lookup(tree, cksum);
4381 	if (rec) {
4382 		rec->labels[l] = B_TRUE;
4383 	} else {
4384 		rec = cksum_record_alloc(cksum, l);
4385 		avl_add(tree, rec);
4386 	}
4387 
4388 	return (rec);
4389 }
4390 
4391 static int
4392 first_label(cksum_record_t *rec)
4393 {
4394 	for (int i = 0; i < VDEV_LABELS; i++)
4395 		if (rec->labels[i])
4396 			return (i);
4397 
4398 	return (-1);
4399 }
4400 
4401 static void
4402 print_label_numbers(const char *prefix, const cksum_record_t *rec)
4403 {
4404 	fputs(prefix, stdout);
4405 	for (int i = 0; i < VDEV_LABELS; i++)
4406 		if (rec->labels[i] == B_TRUE)
4407 			printf("%d ", i);
4408 	putchar('\n');
4409 }
4410 
4411 #define	MAX_UBERBLOCK_COUNT (VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT)
4412 
/* Per-label state collected while scanning a device for dump_label(). */
typedef struct zdb_label {
	vdev_label_t label;		/* raw label contents */
	uint64_t label_offset;		/* byte offset of label on device */
	nvlist_t *config_nv;		/* unpacked config nvlist */
	cksum_record_t *config;		/* dedup record for the config */
	cksum_record_t *uberblocks[MAX_UBERBLOCK_COUNT];
	boolean_t header_printed;	/* banner emitted for this label */
	boolean_t read_failed;		/* could not read label from disk */
	boolean_t cksum_valid;		/* label checksum verified */
} zdb_label_t;
4423 
4424 static void
4425 print_label_header(zdb_label_t *label, int l)
4426 {
4427 
4428 	if (dump_opt['q'])
4429 		return;
4430 
4431 	if (label->header_printed == B_TRUE)
4432 		return;
4433 
4434 	(void) printf("------------------------------------\n");
4435 	(void) printf("LABEL %d %s\n", l,
4436 	    label->cksum_valid ? "" : "(Bad label cksum)");
4437 	(void) printf("------------------------------------\n");
4438 
4439 	label->header_printed = B_TRUE;
4440 }
4441 
/* Print the banner that precedes the L2ARC device header dump. */
static void
print_l2arc_header(void)
{
	const char *const rule = "------------------------------------\n";

	(void) fputs(rule, stdout);
	(void) fputs("L2ARC device header\n", stdout);
	(void) fputs(rule, stdout);
}
4449 
/* Print the banner that precedes the L2ARC log block dump. */
static void
print_l2arc_log_blocks(void)
{
	const char *const rule = "------------------------------------\n";

	(void) fputs(rule, stdout);
	(void) fputs("L2ARC device log blocks\n", stdout);
	(void) fputs(rule, stdout);
}
4457 
/*
 * Print every entry of L2ARC log block i.  le points to the block's
 * entry array and log_entries is the entry count from the device
 * header (dh_log_entries).
 */
static void
dump_l2arc_log_entries(uint64_t log_entries,
    l2arc_log_ent_phys_t *le, uint64_t i)
{
	for (int j = 0; j < log_entries; j++) {
		dva_t dva = le[j].le_dva;
		(void) printf("lb[%4llu]\tle[%4d]\tDVA asize: %llu, "
		    "vdev: %llu, offset: %llu\n",
		    (u_longlong_t)i, j + 1,
		    (u_longlong_t)DVA_GET_ASIZE(&dva),
		    (u_longlong_t)DVA_GET_VDEV(&dva),
		    (u_longlong_t)DVA_GET_OFFSET(&dva));
		(void) printf("|\t\t\t\tbirth: %llu\n",
		    (u_longlong_t)le[j].le_birth);
		(void) printf("|\t\t\t\tlsize: %llu\n",
		    (u_longlong_t)L2BLK_GET_LSIZE((&le[j])->le_prop));
		(void) printf("|\t\t\t\tpsize: %llu\n",
		    (u_longlong_t)L2BLK_GET_PSIZE((&le[j])->le_prop));
		(void) printf("|\t\t\t\tcompr: %llu\n",
		    (u_longlong_t)L2BLK_GET_COMPRESS((&le[j])->le_prop));
		(void) printf("|\t\t\t\tcomplevel: %llu\n",
		    (u_longlong_t)(&le[j])->le_complevel);
		(void) printf("|\t\t\t\ttype: %llu\n",
		    (u_longlong_t)L2BLK_GET_TYPE((&le[j])->le_prop));
		(void) printf("|\t\t\t\tprotected: %llu\n",
		    (u_longlong_t)L2BLK_GET_PROTECTED((&le[j])->le_prop));
		(void) printf("|\t\t\t\tprefetch: %llu\n",
		    (u_longlong_t)L2BLK_GET_PREFETCH((&le[j])->le_prop));
		(void) printf("|\t\t\t\taddress: %llu\n",
		    (u_longlong_t)le[j].le_daddr);
		(void) printf("|\t\t\t\tARC state: %llu\n",
		    (u_longlong_t)L2BLK_GET_STATE((&le[j])->le_prop));
		(void) printf("|\n");
	}
	(void) printf("\n");
}
4494 
/* Print the fields of one L2ARC log block pointer. */
static void
dump_l2arc_log_blkptr(const l2arc_log_blkptr_t *lbps)
{
	(void) printf("|\t\tdaddr: %llu\n", (u_longlong_t)lbps->lbp_daddr);
	(void) printf("|\t\tpayload_asize: %llu\n",
	    (u_longlong_t)lbps->lbp_payload_asize);
	(void) printf("|\t\tpayload_start: %llu\n",
	    (u_longlong_t)lbps->lbp_payload_start);
	(void) printf("|\t\tlsize: %llu\n",
	    (u_longlong_t)L2BLK_GET_LSIZE(lbps->lbp_prop));
	(void) printf("|\t\tasize: %llu\n",
	    (u_longlong_t)L2BLK_GET_PSIZE(lbps->lbp_prop));
	(void) printf("|\t\tcompralgo: %llu\n",
	    (u_longlong_t)L2BLK_GET_COMPRESS(lbps->lbp_prop));
	(void) printf("|\t\tcksumalgo: %llu\n",
	    (u_longlong_t)L2BLK_GET_CHECKSUM(lbps->lbp_prop));
	(void) printf("|\n\n");
}
4513 
/*
 * Walk the chain of L2ARC log blocks starting from the device header,
 * emulating l2arc_rebuild(): read each block from fd, verify its
 * checksum, decompress it if needed, and accumulate the count and
 * aligned size of valid blocks into rebuild for the caller to compare
 * against the header's own accounting.
 */
static void
dump_l2arc_log_blocks(int fd, const l2arc_dev_hdr_phys_t *l2dhdr,
    l2arc_dev_hdr_phys_t *rebuild)
{
	l2arc_log_blk_phys_t this_lb;
	uint64_t asize;
	l2arc_log_blkptr_t lbps[2];
	abd_t *abd;
	zio_cksum_t cksum;
	int failed = 0;
	l2arc_dev_t dev;

	if (!dump_opt['q'])
		print_l2arc_log_blocks();
	memcpy(lbps, l2dhdr->dh_start_lbps, sizeof (lbps));

	/* Minimal l2arc_dev_t, enough for l2arc_log_blkptr_valid(). */
	dev.l2ad_evict = l2dhdr->dh_evict;
	dev.l2ad_start = l2dhdr->dh_start;
	dev.l2ad_end = l2dhdr->dh_end;

	if (l2dhdr->dh_start_lbps[0].lbp_daddr == 0) {
		/* no log blocks to read */
		if (!dump_opt['q']) {
			(void) printf("No log blocks to read\n");
			(void) printf("\n");
		}
		return;
	} else {
		dev.l2ad_hand = lbps[0].lbp_daddr +
		    L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
	}

	dev.l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST);

	for (;;) {
		if (!l2arc_log_blkptr_valid(&dev, &lbps[0]))
			break;

		/* L2BLK_GET_PSIZE returns aligned size for log blocks */
		asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
		if (pread64(fd, &this_lb, asize, lbps[0].lbp_daddr) != asize) {
			if (!dump_opt['q']) {
				(void) printf("Error while reading next log "
				    "block\n\n");
			}
			break;
		}

		fletcher_4_native_varsize(&this_lb, asize, &cksum);
		if (!ZIO_CHECKSUM_EQUAL(cksum, lbps[0].lbp_cksum)) {
			failed++;
			if (!dump_opt['q']) {
				(void) printf("Invalid cksum\n");
				dump_l2arc_log_blkptr(&lbps[0]);
			}
			break;
		}

		/* Decompress unless the block was stored uncompressed. */
		switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) {
		case ZIO_COMPRESS_OFF:
			break;
		default:
			abd = abd_alloc_for_io(asize, B_TRUE);
			abd_copy_from_buf_off(abd, &this_lb, 0, asize);
			if (zio_decompress_data(L2BLK_GET_COMPRESS(
			    (&lbps[0])->lbp_prop), abd, &this_lb,
			    asize, sizeof (this_lb), NULL) != 0) {
				(void) printf("L2ARC block decompression "
				    "failed\n");
				abd_free(abd);
				goto out;
			}
			abd_free(abd);
			break;
		}

		if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
			byteswap_uint64_array(&this_lb, sizeof (this_lb));
		if (this_lb.lb_magic != L2ARC_LOG_BLK_MAGIC) {
			if (!dump_opt['q'])
				(void) printf("Invalid log block magic\n\n");
			break;
		}

		rebuild->dh_lb_count++;
		rebuild->dh_lb_asize += asize;
		if (dump_opt['l'] > 1 && !dump_opt['q']) {
			(void) printf("lb[%4llu]\tmagic: %llu\n",
			    (u_longlong_t)rebuild->dh_lb_count,
			    (u_longlong_t)this_lb.lb_magic);
			dump_l2arc_log_blkptr(&lbps[0]);
		}

		if (dump_opt['l'] > 2 && !dump_opt['q'])
			dump_l2arc_log_entries(l2dhdr->dh_log_entries,
			    this_lb.lb_entries,
			    rebuild->dh_lb_count);

		/* Stop once the chain crosses into evicted territory. */
		if (l2arc_range_check_overlap(lbps[1].lbp_payload_start,
		    lbps[0].lbp_payload_start, dev.l2ad_evict) &&
		    !dev.l2ad_first)
			break;

		/* Advance to the previous (older) log block in the chain. */
		lbps[0] = lbps[1];
		lbps[1] = this_lb.lb_prev_lbp;
	}
out:
	if (!dump_opt['q']) {
		(void) printf("log_blk_count:\t %llu with valid cksum\n",
		    (u_longlong_t)rebuild->dh_lb_count);
		(void) printf("\t\t %d with invalid cksum\n", failed);
		(void) printf("log_blk_asize:\t %llu\n\n",
		    (u_longlong_t)rebuild->dh_lb_asize);
	}
}
4629 
/*
 * Read and print the L2ARC device header from fd, then walk the log
 * block chain.  Returns 1 only if the header's log block accounting
 * exceeds what the walk found (an accounting leak); a missing or
 * unrecognizable header returns 0 for backward compatibility.
 */
static int
dump_l2arc_header(int fd)
{
	l2arc_dev_hdr_phys_t l2dhdr = {0}, rebuild = {0};
	int error = B_FALSE;

	if (pread64(fd, &l2dhdr, sizeof (l2dhdr),
	    VDEV_LABEL_START_SIZE) != sizeof (l2dhdr)) {
		error = B_TRUE;
	} else {
		/* Accept a header written with the opposite endianness. */
		if (l2dhdr.dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
			byteswap_uint64_array(&l2dhdr, sizeof (l2dhdr));

		if (l2dhdr.dh_magic != L2ARC_DEV_HDR_MAGIC)
			error = B_TRUE;
	}

	if (error) {
		(void) printf("L2ARC device header not found\n\n");
		/* Do not return an error here for backward compatibility */
		return (0);
	} else if (!dump_opt['q']) {
		print_l2arc_header();

		(void) printf("    magic: %llu\n",
		    (u_longlong_t)l2dhdr.dh_magic);
		(void) printf("    version: %llu\n",
		    (u_longlong_t)l2dhdr.dh_version);
		(void) printf("    pool_guid: %llu\n",
		    (u_longlong_t)l2dhdr.dh_spa_guid);
		(void) printf("    flags: %llu\n",
		    (u_longlong_t)l2dhdr.dh_flags);
		(void) printf("    start_lbps[0]: %llu\n",
		    (u_longlong_t)
		    l2dhdr.dh_start_lbps[0].lbp_daddr);
		(void) printf("    start_lbps[1]: %llu\n",
		    (u_longlong_t)
		    l2dhdr.dh_start_lbps[1].lbp_daddr);
		(void) printf("    log_blk_ent: %llu\n",
		    (u_longlong_t)l2dhdr.dh_log_entries);
		(void) printf("    start: %llu\n",
		    (u_longlong_t)l2dhdr.dh_start);
		(void) printf("    end: %llu\n",
		    (u_longlong_t)l2dhdr.dh_end);
		(void) printf("    evict: %llu\n",
		    (u_longlong_t)l2dhdr.dh_evict);
		(void) printf("    lb_asize_refcount: %llu\n",
		    (u_longlong_t)l2dhdr.dh_lb_asize);
		(void) printf("    lb_count_refcount: %llu\n",
		    (u_longlong_t)l2dhdr.dh_lb_count);
		(void) printf("    trim_action_time: %llu\n",
		    (u_longlong_t)l2dhdr.dh_trim_action_time);
		(void) printf("    trim_state: %llu\n\n",
		    (u_longlong_t)l2dhdr.dh_trim_state);
	}

	dump_l2arc_log_blocks(fd, &l2dhdr, &rebuild);
	/*
	 * The total aligned size of log blocks and the number of log blocks
	 * reported in the header of the device may be less than what zdb
	 * reports by dump_l2arc_log_blocks() which emulates l2arc_rebuild().
	 * This happens because dump_l2arc_log_blocks() lacks the memory
	 * pressure valve that l2arc_rebuild() has. Thus, if we are on a system
	 * with low memory, l2arc_rebuild will exit prematurely and dh_lb_asize
	 * and dh_lb_count will be lower to begin with than what exists on the
	 * device. This is normal and zdb should not exit with an error. The
	 * opposite case should never happen though, the values reported in the
	 * header should never be higher than what dump_l2arc_log_blocks() and
	 * l2arc_rebuild() report. If this happens there is a leak in the
	 * accounting of log blocks.
	 */
	if (l2dhdr.dh_lb_asize > rebuild.dh_lb_asize ||
	    l2dhdr.dh_lb_count > rebuild.dh_lb_count)
		return (1);

	return (0);
}
4707 
4708 static void
4709 dump_config_from_label(zdb_label_t *label, size_t buflen, int l)
4710 {
4711 	if (dump_opt['q'])
4712 		return;
4713 
4714 	if ((dump_opt['l'] < 3) && (first_label(label->config) != l))
4715 		return;
4716 
4717 	print_label_header(label, l);
4718 	dump_nvlist(label->config_nv, 4);
4719 	print_label_numbers("    labels = ", label->config);
4720 
4721 	if (dump_opt['l'] >= 2)
4722 		dump_nvlist_stats(label->config_nv, buflen);
4723 }
4724 
4725 #define	ZDB_MAX_UB_HEADER_SIZE 32
4726 
/*
 * Print the uberblocks stored in one label.  A fake top-level vdev_t
 * with the given ashift is used to compute the uberblock ring layout.
 * Invalid slots are reported at -uu; duplicates across labels and the
 * MMP scratch slots are skipped below -uuu/-uuuu respectively.
 */
static void
dump_label_uberblocks(zdb_label_t *label, uint64_t ashift, int label_num)
{

	vdev_t vd;
	char header[ZDB_MAX_UB_HEADER_SIZE];

	vd.vdev_ashift = ashift;
	vd.vdev_top = &vd;

	for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) {
		uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i);
		uberblock_t *ub = (void *)((char *)&label->label + uoff);
		cksum_record_t *rec = label->uberblocks[i];

		if (rec == NULL) {
			if (dump_opt['u'] >= 2) {
				print_label_header(label, label_num);
				(void) printf("    Uberblock[%d] invalid\n", i);
			}
			continue;
		}

		if ((dump_opt['u'] < 3) && (first_label(rec) != label_num))
			continue;

		/* Skip the slots reserved for MMP writes unless -uuuu. */
		if ((dump_opt['u'] < 4) &&
		    (ub->ub_mmp_magic == MMP_MAGIC) && ub->ub_mmp_delay &&
		    (i >= VDEV_UBERBLOCK_COUNT(&vd) - MMP_BLOCKS_PER_LABEL))
			continue;

		print_label_header(label, label_num);
		(void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE,
		    "    Uberblock[%d]\n", i);
		dump_uberblock(ub, header, "");
		print_label_numbers("        labels = ", rec);
	}
}
4765 
4766 static char curpath[PATH_MAX];
4767 
4768 /*
4769  * Iterate through the path components, recursively passing
4770  * current one's obj and remaining path until we find the obj
4771  * for the last one.
4772  */
static int
dump_path_impl(objset_t *os, uint64_t obj, char *name, uint64_t *retobj)
{
	int err;
	boolean_t header = B_TRUE;
	uint64_t child_obj;
	char *s;
	dmu_buf_t *db;
	dmu_object_info_t doi;

	/*
	 * Split off the first path component in place so the ZAP lookup
	 * below sees just that component.
	 */
	if ((s = strchr(name, '/')) != NULL)
		*s = '\0';
	/* Directory entries are single 8-byte ZAP values. */
	err = zap_lookup(os, obj, name, 8, 1, &child_obj);

	/* Record the component in the global path used for messages. */
	(void) strlcat(curpath, name, sizeof (curpath));

	if (err != 0) {
		(void) fprintf(stderr, "failed to lookup %s: %s\n",
		    curpath, strerror(err));
		return (err);
	}

	/* Strip the type bits ZFS packs into directory entry values. */
	child_obj = ZFS_DIRENT_OBJ(child_obj);
	err = sa_buf_hold(os, child_obj, FTAG, &db);
	if (err != 0) {
		(void) fprintf(stderr,
		    "failed to get SA dbuf for obj %llu: %s\n",
		    (u_longlong_t)child_obj, strerror(err));
		return (EINVAL);
	}
	dmu_object_info_from_db(db, &doi);
	sa_buf_rele(db, FTAG);

	if (doi.doi_bonus_type != DMU_OT_SA &&
	    doi.doi_bonus_type != DMU_OT_ZNODE) {
		(void) fprintf(stderr, "invalid bonus type %d for obj %llu\n",
		    doi.doi_bonus_type, (u_longlong_t)child_obj);
		return (EINVAL);
	}

	if (dump_opt['v'] > 6) {
		(void) printf("obj=%llu %s type=%d bonustype=%d\n",
		    (u_longlong_t)child_obj, curpath, doi.doi_type,
		    doi.doi_bonus_type);
	}

	(void) strlcat(curpath, "/", sizeof (curpath));

	switch (doi.doi_type) {
	case DMU_OT_DIRECTORY_CONTENTS:
		/* More path components remain: recurse into the directory. */
		if (s != NULL && *(s + 1) != '\0')
			return (dump_path_impl(os, child_obj, s + 1, retobj));
		/* Path ends at this directory; handle it like a file. */
		zfs_fallthrough;
	case DMU_OT_PLAIN_FILE_CONTENTS:
		if (retobj != NULL) {
			*retobj = child_obj;
		} else {
			dump_object(os, child_obj, dump_opt['v'], &header,
			    NULL, 0);
		}
		return (0);
	default:
		(void) fprintf(stderr, "object %llu has non-file/directory "
		    "type %d\n", (u_longlong_t)obj, doi.doi_type);
		break;
	}

	return (EINVAL);
}
4842 
4843 /*
4844  * Dump the blocks for the object specified by path inside the dataset.
4845  */
4846 static int
4847 dump_path(char *ds, char *path, uint64_t *retobj)
4848 {
4849 	int err;
4850 	objset_t *os;
4851 	uint64_t root_obj;
4852 
4853 	err = open_objset(ds, FTAG, &os);
4854 	if (err != 0)
4855 		return (err);
4856 
4857 	err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj);
4858 	if (err != 0) {
4859 		(void) fprintf(stderr, "can't lookup root znode: %s\n",
4860 		    strerror(err));
4861 		close_objset(os, FTAG);
4862 		return (EINVAL);
4863 	}
4864 
4865 	(void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds);
4866 
4867 	err = dump_path_impl(os, root_obj, path, retobj);
4868 
4869 	close_objset(os, FTAG);
4870 	return (err);
4871 }
4872 
4873 static int
4874 dump_backup_bytes(objset_t *os, void *buf, int len, void *arg)
4875 {
4876 	const char *p = (const char *)buf;
4877 	ssize_t nwritten;
4878 
4879 	(void) os;
4880 	(void) arg;
4881 
4882 	/* Write the data out, handling short writes and signals. */
4883 	while ((nwritten = write(STDOUT_FILENO, p, len)) < len) {
4884 		if (nwritten < 0) {
4885 			if (errno == EINTR)
4886 				continue;
4887 			return (errno);
4888 		}
4889 		p += nwritten;
4890 		len -= nwritten;
4891 	}
4892 
4893 	return (0);
4894 }
4895 
4896 static void
4897 dump_backup(const char *pool, uint64_t objset_id, const char *flagstr)
4898 {
4899 	boolean_t embed = B_FALSE;
4900 	boolean_t large_block = B_FALSE;
4901 	boolean_t compress = B_FALSE;
4902 	boolean_t raw = B_FALSE;
4903 
4904 	const char *c;
4905 	for (c = flagstr; c != NULL && *c != '\0'; c++) {
4906 		switch (*c) {
4907 			case 'e':
4908 				embed = B_TRUE;
4909 				break;
4910 			case 'L':
4911 				large_block = B_TRUE;
4912 				break;
4913 			case 'c':
4914 				compress = B_TRUE;
4915 				break;
4916 			case 'w':
4917 				raw = B_TRUE;
4918 				break;
4919 			default:
4920 				fprintf(stderr, "dump_backup: invalid flag "
4921 				    "'%c'\n", *c);
4922 				return;
4923 		}
4924 	}
4925 
4926 	if (isatty(STDOUT_FILENO)) {
4927 		fprintf(stderr, "dump_backup: stream cannot be written "
4928 		    "to a terminal\n");
4929 		return;
4930 	}
4931 
4932 	offset_t off = 0;
4933 	dmu_send_outparams_t out = {
4934 	    .dso_outfunc = dump_backup_bytes,
4935 	    .dso_dryrun  = B_FALSE,
4936 	};
4937 
4938 	int err = dmu_send_obj(pool, objset_id, /* fromsnap */0, embed,
4939 	    large_block, compress, raw, /* saved */ B_FALSE, STDOUT_FILENO,
4940 	    &off, &out);
4941 	if (err != 0) {
4942 		fprintf(stderr, "dump_backup: dmu_send_obj: %s\n",
4943 		    strerror(err));
4944 		return;
4945 	}
4946 }
4947 
/*
 * Copy the contents of ZPL object "srcobj" into local file "destfile".
 * The object's logical size is obtained from its SA (ZPL_SIZE), then
 * the data is copied via dmu_read() in chunks of at most 1 MiB.
 * Returns 0 on success or an errno/EINVAL/ENOMEM-style code on failure.
 */
static int
zdb_copy_object(objset_t *os, uint64_t srcobj, char *destfile)
{
	int err = 0;
	uint64_t size, readsize, oursize, offset;
	ssize_t writesize;
	sa_handle_t *hdl;

	(void) printf("Copying object %" PRIu64 " to file %s\n", srcobj,
	    destfile);

	/* sa_attr_table below is only valid for the sa_os objset. */
	VERIFY3P(os, ==, sa_os);
	if ((err = sa_handle_get(os, srcobj, NULL, SA_HDL_PRIVATE, &hdl))) {
		(void) printf("Failed to get handle for SA znode\n");
		return (err);
	}
	if ((err = sa_lookup(hdl, sa_attr_table[ZPL_SIZE], &size, 8))) {
		(void) sa_handle_destroy(hdl);
		return (err);
	}
	(void) sa_handle_destroy(hdl);

	(void) printf("Object %" PRIu64 " is %" PRIu64 " bytes\n", srcobj,
	    size);
	if (size == 0) {
		return (EINVAL);
	}

	int fd = open(destfile, O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (fd == -1)
		return (errno);
	/*
	 * We cap the size at 1 mebibyte here to prevent
	 * allocation failures and nigh-infinite printing if the
	 * object is extremely large.
	 */
	oursize = MIN(size, 1 << 20);
	offset = 0;
	char *buf = kmem_alloc(oursize, KM_NOSLEEP);
	if (buf == NULL) {
		(void) close(fd);
		return (ENOMEM);
	}

	while (offset < size) {
		/* readsize never exceeds oursize, so buf is large enough. */
		readsize = MIN(size - offset, 1 << 20);
		err = dmu_read(os, srcobj, offset, readsize, buf, 0);
		if (err != 0) {
			(void) printf("got error %u from dmu_read\n", err);
			kmem_free(buf, oursize);
			(void) close(fd);
			return (err);
		}
		if (dump_opt['v'] > 3) {
			(void) printf("Read offset=%" PRIu64 " size=%" PRIu64
			    " error=%d\n", offset, readsize, err);
		}

		/* NOTE(review): a short write aborts the copy; no retry. */
		writesize = write(fd, buf, readsize);
		if (writesize < 0) {
			err = errno;
			break;
		} else if (writesize != readsize) {
			/* Incomplete write */
			(void) fprintf(stderr, "Short write, only wrote %llu of"
			    " %" PRIu64 " bytes, exiting...\n",
			    (u_longlong_t)writesize, readsize);
			break;
		}

		offset += readsize;
	}

	(void) close(fd);

	if (buf != NULL)
		kmem_free(buf, oursize);

	return (err);
}
5028 
5029 static boolean_t
5030 label_cksum_valid(vdev_label_t *label, uint64_t offset)
5031 {
5032 	zio_checksum_info_t *ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL];
5033 	zio_cksum_t expected_cksum;
5034 	zio_cksum_t actual_cksum;
5035 	zio_cksum_t verifier;
5036 	zio_eck_t *eck;
5037 	int byteswap;
5038 
5039 	void *data = (char *)label + offsetof(vdev_label_t, vl_vdev_phys);
5040 	eck = (zio_eck_t *)((char *)(data) + VDEV_PHYS_SIZE) - 1;
5041 
5042 	offset += offsetof(vdev_label_t, vl_vdev_phys);
5043 	ZIO_SET_CHECKSUM(&verifier, offset, 0, 0, 0);
5044 
5045 	byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
5046 	if (byteswap)
5047 		byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
5048 
5049 	expected_cksum = eck->zec_cksum;
5050 	eck->zec_cksum = verifier;
5051 
5052 	abd_t *abd = abd_get_from_buf(data, VDEV_PHYS_SIZE);
5053 	ci->ci_func[byteswap](abd, VDEV_PHYS_SIZE, NULL, &actual_cksum);
5054 	abd_free(abd);
5055 
5056 	if (byteswap)
5057 		byteswap_uint64_array(&expected_cksum, sizeof (zio_cksum_t));
5058 
5059 	if (ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum))
5060 		return (B_TRUE);
5061 
5062 	return (B_FALSE);
5063 }
5064 
/*
 * Read all VDEV_LABELS labels from device "dev" and display their
 * configurations and (with -u) uberblocks; for L2ARC devices the L2ARC
 * header is dumped as well.  Configs and uberblocks are inserted into
 * checksum-keyed AVL trees so identical copies across labels can be
 * correlated.  Returns 0 on success, 1 if any label failed to read or
 * unpack, and 2 if no configuration was found at all.
 */
static int
dump_label(const char *dev)
{
	char path[MAXPATHLEN];
	zdb_label_t labels[VDEV_LABELS] = {{{{0}}}};
	uint64_t psize, ashift, l2cache;
	struct stat64 statbuf;
	boolean_t config_found = B_FALSE;
	boolean_t error = B_FALSE;
	boolean_t read_l2arc_header = B_FALSE;
	avl_tree_t config_tree;
	avl_tree_t uberblock_tree;
	void *node, *cookie;
	int fd;

	/*
	 * Check if we were given absolute path and use it as is.
	 * Otherwise if the provided vdev name doesn't point to a file,
	 * try prepending expected disk paths and partition numbers.
	 */
	(void) strlcpy(path, dev, sizeof (path));
	if (dev[0] != '/' && stat64(path, &statbuf) != 0) {
		/* NOTE: this local shadows the outer boolean_t "error". */
		int error;

		error = zfs_resolve_shortname(dev, path, MAXPATHLEN);
		if (error == 0 && zfs_dev_is_whole_disk(path)) {
			if (zfs_append_partition(path, MAXPATHLEN) == -1)
				error = ENOENT;
		}

		if (error || (stat64(path, &statbuf) != 0)) {
			(void) printf("failed to find device %s, try "
			    "specifying absolute path instead\n", dev);
			return (1);
		}
	}

	if ((fd = open64(path, O_RDONLY)) < 0) {
		(void) printf("cannot open '%s': %s\n", path, strerror(errno));
		exit(1);
	}

	if (fstat64_blk(fd, &statbuf) != 0) {
		(void) printf("failed to stat '%s': %s\n", path,
		    strerror(errno));
		(void) close(fd);
		exit(1);
	}

	/* Flush caches so the labels are read from the device itself. */
	if (S_ISBLK(statbuf.st_mode) && zfs_dev_flush(fd) != 0)
		(void) printf("failed to invalidate cache '%s' : %s\n", path,
		    strerror(errno));

	avl_create(&config_tree, cksum_record_compare,
	    sizeof (cksum_record_t), offsetof(cksum_record_t, link));
	avl_create(&uberblock_tree, cksum_record_compare,
	    sizeof (cksum_record_t), offsetof(cksum_record_t, link));

	/* Label offsets are computed from the label-aligned device size. */
	psize = statbuf.st_size;
	psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
	ashift = SPA_MINBLOCKSHIFT;

	/*
	 * 1. Read the label from disk
	 * 2. Verify label cksum
	 * 3. Unpack the configuration and insert in config tree.
	 * 4. Traverse all uberblocks and insert in uberblock tree.
	 */
	for (int l = 0; l < VDEV_LABELS; l++) {
		zdb_label_t *label = &labels[l];
		char *buf = label->label.vl_vdev_phys.vp_nvlist;
		size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist);
		nvlist_t *config;
		cksum_record_t *rec;
		zio_cksum_t cksum;
		vdev_t vd;

		label->label_offset = vdev_label_offset(psize, l, 0);

		if (pread64(fd, &label->label, sizeof (label->label),
		    label->label_offset) != sizeof (label->label)) {
			if (!dump_opt['q'])
				(void) printf("failed to read label %d\n", l);
			label->read_failed = B_TRUE;
			error = B_TRUE;
			continue;
		}

		label->read_failed = B_FALSE;
		label->cksum_valid = label_cksum_valid(&label->label,
		    label->label_offset);

		if (nvlist_unpack(buf, buflen, &config, 0) == 0) {
			nvlist_t *vdev_tree = NULL;
			size_t size;

			if ((nvlist_lookup_nvlist(config,
			    ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) ||
			    (nvlist_lookup_uint64(vdev_tree,
			    ZPOOL_CONFIG_ASHIFT, &ashift) != 0))
				ashift = SPA_MINBLOCKSHIFT;

			if (nvlist_size(config, &size, NV_ENCODE_XDR) != 0)
				size = buflen;

			/* If the device is a cache device read the header. */
			if (!read_l2arc_header) {
				if (nvlist_lookup_uint64(config,
				    ZPOOL_CONFIG_POOL_STATE, &l2cache) == 0 &&
				    l2cache == POOL_STATE_L2CACHE) {
					read_l2arc_header = B_TRUE;
				}
			}

			fletcher_4_native_varsize(buf, size, &cksum);
			rec = cksum_record_insert(&config_tree, &cksum, l);

			label->config = rec;
			label->config_nv = config;
			config_found = B_TRUE;
		} else {
			error = B_TRUE;
		}

		/* Stack vdev so the VDEV_UBERBLOCK_* macros can be used. */
		vd.vdev_ashift = ashift;
		vd.vdev_top = &vd;

		/* Collect valid uberblocks, keyed by checksum. */
		for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) {
			uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i);
			uberblock_t *ub = (void *)((char *)label + uoff);

			if (uberblock_verify(ub))
				continue;

			fletcher_4_native_varsize(ub, sizeof (*ub), &cksum);
			rec = cksum_record_insert(&uberblock_tree, &cksum, l);

			label->uberblocks[i] = rec;
		}
	}

	/*
	 * Dump the label and uberblocks.
	 */
	for (int l = 0; l < VDEV_LABELS; l++) {
		zdb_label_t *label = &labels[l];
		size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist);

		if (label->read_failed == B_TRUE)
			continue;

		if (label->config_nv) {
			dump_config_from_label(label, buflen, l);
		} else {
			if (!dump_opt['q'])
				(void) printf("failed to unpack label %d\n", l);
		}

		if (dump_opt['u'])
			dump_label_uberblocks(label, ashift, l);

		nvlist_free(label->config_nv);
	}

	/*
	 * Dump the L2ARC header, if existent.
	 */
	if (read_l2arc_header)
		error |= dump_l2arc_header(fd);

	cookie = NULL;
	while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL)
		umem_free(node, sizeof (cksum_record_t));

	cookie = NULL;
	while ((node = avl_destroy_nodes(&uberblock_tree, &cookie)) != NULL)
		umem_free(node, sizeof (cksum_record_t));

	avl_destroy(&config_tree);
	avl_destroy(&uberblock_tree);

	(void) close(fd);

	return (config_found == B_FALSE ? 2 :
	    (error == B_TRUE ? 1 : 0));
}
5251 
/* Datasets found using each per-dataset feature (see dump_one_objset). */
static uint64_t dataset_feature_count[SPA_FEATURES];
/* Occurrences of pool-wide features observed during dataset traversal. */
static uint64_t global_feature_count[SPA_FEATURES];
/* Number of datasets with a remap deadlist. */
static uint64_t remap_deadlist_count = 0;
5255 
5256 static int
5257 dump_one_objset(const char *dsname, void *arg)
5258 {
5259 	(void) arg;
5260 	int error;
5261 	objset_t *os;
5262 	spa_feature_t f;
5263 
5264 	error = open_objset(dsname, FTAG, &os);
5265 	if (error != 0)
5266 		return (0);
5267 
5268 	for (f = 0; f < SPA_FEATURES; f++) {
5269 		if (!dsl_dataset_feature_is_active(dmu_objset_ds(os), f))
5270 			continue;
5271 		ASSERT(spa_feature_table[f].fi_flags &
5272 		    ZFEATURE_FLAG_PER_DATASET);
5273 		dataset_feature_count[f]++;
5274 	}
5275 
5276 	if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) {
5277 		remap_deadlist_count++;
5278 	}
5279 
5280 	for (dsl_bookmark_node_t *dbn =
5281 	    avl_first(&dmu_objset_ds(os)->ds_bookmarks); dbn != NULL;
5282 	    dbn = AVL_NEXT(&dmu_objset_ds(os)->ds_bookmarks, dbn)) {
5283 		mos_obj_refd(dbn->dbn_phys.zbm_redaction_obj);
5284 		if (dbn->dbn_phys.zbm_redaction_obj != 0) {
5285 			global_feature_count[
5286 			    SPA_FEATURE_REDACTION_BOOKMARKS]++;
5287 			objset_t *mos = os->os_spa->spa_meta_objset;
5288 			dnode_t *rl;
5289 			VERIFY0(dnode_hold(mos,
5290 			    dbn->dbn_phys.zbm_redaction_obj, FTAG, &rl));
5291 			if (rl->dn_have_spill) {
5292 				global_feature_count[
5293 				    SPA_FEATURE_REDACTION_LIST_SPILL]++;
5294 			}
5295 		}
5296 		if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)
5297 			global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++;
5298 	}
5299 
5300 	if (dsl_deadlist_is_open(&dmu_objset_ds(os)->ds_dir->dd_livelist) &&
5301 	    !dmu_objset_is_snapshot(os)) {
5302 		global_feature_count[SPA_FEATURE_LIVELIST]++;
5303 	}
5304 
5305 	dump_objset(os);
5306 	close_objset(os, FTAG);
5307 	fuid_table_destroy();
5308 	return (0);
5309 }
5310 
5311 /*
5312  * Block statistics.
5313  */
5314 #define	PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2)
5315 typedef struct zdb_blkstats {
5316 	uint64_t zb_asize;
5317 	uint64_t zb_lsize;
5318 	uint64_t zb_psize;
5319 	uint64_t zb_count;
5320 	uint64_t zb_gangs;
5321 	uint64_t zb_ditto_samevdev;
5322 	uint64_t zb_ditto_same_ms;
5323 	uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE];
5324 } zdb_blkstats_t;
5325 
5326 /*
5327  * Extended object types to report deferred frees and dedup auto-ditto blocks.
5328  */
5329 #define	ZDB_OT_DEFERRED	(DMU_OT_NUMTYPES + 0)
5330 #define	ZDB_OT_DITTO	(DMU_OT_NUMTYPES + 1)
5331 #define	ZDB_OT_OTHER	(DMU_OT_NUMTYPES + 2)
5332 #define	ZDB_OT_TOTAL	(DMU_OT_NUMTYPES + 3)
5333 
5334 static const char *zdb_ot_extname[] = {
5335 	"deferred free",
5336 	"dedup ditto",
5337 	"other",
5338 	"Total",
5339 };
5340 
5341 #define	ZB_TOTAL	DN_MAX_LEVELS
5342 #define	SPA_MAX_FOR_16M	(SPA_MAXBLOCKSHIFT+1)
5343 
5344 typedef struct zdb_brt_entry {
5345 	dva_t		zbre_dva;
5346 	uint64_t	zbre_refcount;
5347 	avl_node_t	zbre_node;
5348 } zdb_brt_entry_t;
5349 
/*
 * Traversal context: accumulates every statistic gathered by
 * zdb_blkptr_cb()/zdb_count_block() while walking the pool.
 */
typedef struct zdb_cb {
	/* Per (indirection level, object type) statistics. */
	zdb_blkstats_t	zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
	uint64_t	zcb_removing_size;
	uint64_t	zcb_checkpoint_size;
	uint64_t	zcb_dedup_asize;
	uint64_t	zcb_dedup_blocks;
	uint64_t	zcb_clone_asize;
	uint64_t	zcb_clone_blocks;
	/* Power-of-two histograms of physical/logical/allocated sizes. */
	uint64_t	zcb_psize_count[SPA_MAX_FOR_16M];
	uint64_t	zcb_lsize_count[SPA_MAX_FOR_16M];
	uint64_t	zcb_asize_count[SPA_MAX_FOR_16M];
	uint64_t	zcb_psize_len[SPA_MAX_FOR_16M];
	uint64_t	zcb_lsize_len[SPA_MAX_FOR_16M];
	uint64_t	zcb_asize_len[SPA_MAX_FOR_16M];
	uint64_t	zcb_psize_total;
	uint64_t	zcb_lsize_total;
	uint64_t	zcb_asize_total;
	uint64_t	zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
	uint64_t	zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES]
	    [BPE_PAYLOAD_SIZE + 1];
	uint64_t	zcb_start;	/* traversal start time */
	hrtime_t	zcb_lastprint;	/* last progress message time */
	uint64_t	zcb_totalasize;
	uint64_t	zcb_errors[256];	/* read error counts by errno */
	int		zcb_readfails;
	int		zcb_haderrors;
	spa_t		*zcb_spa;
	uint32_t	**zcb_vd_obsolete_counts;
	avl_tree_t	zcb_brt;	/* in-memory BRT of seen clones */
	boolean_t	zcb_brt_is_active;
} zdb_cb_t;
5381 
5382 /* test if two DVA offsets from same vdev are within the same metaslab */
5383 static boolean_t
5384 same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2)
5385 {
5386 	vdev_t *vd = vdev_lookup_top(spa, vdev);
5387 	uint64_t ms_shift = vd->vdev_ms_shift;
5388 
5389 	return ((off1 >> ms_shift) == (off2 >> ms_shift));
5390 }
5391 
5392 /*
5393  * Used to simplify reporting of the histogram data.
5394  */
5395 typedef struct one_histo {
5396 	const char *name;
5397 	uint64_t *count;
5398 	uint64_t *len;
5399 	uint64_t cumulative;
5400 } one_histo_t;
5401 
5402 /*
5403  * The number of separate histograms processed for psize, lsize and asize.
5404  */
5405 #define	NUM_HISTO 3
5406 
5407 /*
5408  * This routine will create a fixed column size output of three different
5409  * histograms showing by blocksize of 512 - 2^ SPA_MAX_FOR_16M
5410  * the count, length and cumulative length of the psize, lsize and
5411  * asize blocks.
5412  *
5413  * All three types of blocks are listed on a single line
5414  *
5415  * By default the table is printed in nicenumber format (e.g. 123K) but
5416  * if the '-P' parameter is specified then the full raw number (parseable)
5417  * is printed out.
5418  */
5419 static void
5420 dump_size_histograms(zdb_cb_t *zcb)
5421 {
5422 	/*
5423 	 * A temporary buffer that allows us to convert a number into
5424 	 * a string using zdb_nicenumber to allow either raw or human
5425 	 * readable numbers to be output.
5426 	 */
5427 	char numbuf[32];
5428 
5429 	/*
5430 	 * Define titles which are used in the headers of the tables
5431 	 * printed by this routine.
5432 	 */
5433 	const char blocksize_title1[] = "block";
5434 	const char blocksize_title2[] = "size";
5435 	const char count_title[] = "Count";
5436 	const char length_title[] = "Size";
5437 	const char cumulative_title[] = "Cum.";
5438 
5439 	/*
5440 	 * Setup the histogram arrays (psize, lsize, and asize).
5441 	 */
5442 	one_histo_t parm_histo[NUM_HISTO];
5443 
5444 	parm_histo[0].name = "psize";
5445 	parm_histo[0].count = zcb->zcb_psize_count;
5446 	parm_histo[0].len = zcb->zcb_psize_len;
5447 	parm_histo[0].cumulative = 0;
5448 
5449 	parm_histo[1].name = "lsize";
5450 	parm_histo[1].count = zcb->zcb_lsize_count;
5451 	parm_histo[1].len = zcb->zcb_lsize_len;
5452 	parm_histo[1].cumulative = 0;
5453 
5454 	parm_histo[2].name = "asize";
5455 	parm_histo[2].count = zcb->zcb_asize_count;
5456 	parm_histo[2].len = zcb->zcb_asize_len;
5457 	parm_histo[2].cumulative = 0;
5458 
5459 
5460 	(void) printf("\nBlock Size Histogram\n");
5461 	/*
5462 	 * Print the first line titles
5463 	 */
5464 	if (dump_opt['P'])
5465 		(void) printf("\n%s\t", blocksize_title1);
5466 	else
5467 		(void) printf("\n%7s   ", blocksize_title1);
5468 
5469 	for (int j = 0; j < NUM_HISTO; j++) {
5470 		if (dump_opt['P']) {
5471 			if (j < NUM_HISTO - 1) {
5472 				(void) printf("%s\t\t\t", parm_histo[j].name);
5473 			} else {
5474 				/* Don't print trailing spaces */
5475 				(void) printf("  %s", parm_histo[j].name);
5476 			}
5477 		} else {
5478 			if (j < NUM_HISTO - 1) {
5479 				/* Left aligned strings in the output */
5480 				(void) printf("%-7s              ",
5481 				    parm_histo[j].name);
5482 			} else {
5483 				/* Don't print trailing spaces */
5484 				(void) printf("%s", parm_histo[j].name);
5485 			}
5486 		}
5487 	}
5488 	(void) printf("\n");
5489 
5490 	/*
5491 	 * Print the second line titles
5492 	 */
5493 	if (dump_opt['P']) {
5494 		(void) printf("%s\t", blocksize_title2);
5495 	} else {
5496 		(void) printf("%7s ", blocksize_title2);
5497 	}
5498 
5499 	for (int i = 0; i < NUM_HISTO; i++) {
5500 		if (dump_opt['P']) {
5501 			(void) printf("%s\t%s\t%s\t",
5502 			    count_title, length_title, cumulative_title);
5503 		} else {
5504 			(void) printf("%7s%7s%7s",
5505 			    count_title, length_title, cumulative_title);
5506 		}
5507 	}
5508 	(void) printf("\n");
5509 
5510 	/*
5511 	 * Print the rows
5512 	 */
5513 	for (int i = SPA_MINBLOCKSHIFT; i < SPA_MAX_FOR_16M; i++) {
5514 
5515 		/*
5516 		 * Print the first column showing the blocksize
5517 		 */
5518 		zdb_nicenum((1ULL << i), numbuf, sizeof (numbuf));
5519 
5520 		if (dump_opt['P']) {
5521 			printf("%s", numbuf);
5522 		} else {
5523 			printf("%7s:", numbuf);
5524 		}
5525 
5526 		/*
5527 		 * Print the remaining set of 3 columns per size:
5528 		 * for psize, lsize and asize
5529 		 */
5530 		for (int j = 0; j < NUM_HISTO; j++) {
5531 			parm_histo[j].cumulative += parm_histo[j].len[i];
5532 
5533 			zdb_nicenum(parm_histo[j].count[i],
5534 			    numbuf, sizeof (numbuf));
5535 			if (dump_opt['P'])
5536 				(void) printf("\t%s", numbuf);
5537 			else
5538 				(void) printf("%7s", numbuf);
5539 
5540 			zdb_nicenum(parm_histo[j].len[i],
5541 			    numbuf, sizeof (numbuf));
5542 			if (dump_opt['P'])
5543 				(void) printf("\t%s", numbuf);
5544 			else
5545 				(void) printf("%7s", numbuf);
5546 
5547 			zdb_nicenum(parm_histo[j].cumulative,
5548 			    numbuf, sizeof (numbuf));
5549 			if (dump_opt['P'])
5550 				(void) printf("\t%s", numbuf);
5551 			else
5552 				(void) printf("%7s", numbuf);
5553 		}
5554 		(void) printf("\n");
5555 	}
5556 }
5557 
/*
 * Record one block pointer in the traversal statistics (per-level and
 * per-type totals, size histograms, dedup and clone accounting) and,
 * unless -L was given, claim the block so leaked space can be detected
 * afterwards.
 */
static void
zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
    dmu_object_type_t type)
{
	uint64_t refcnt = 0;
	int i;

	ASSERT(type < ZDB_OT_TOTAL);

	/* Skip ZIL blocks already recorded for this zilog. */
	if (zilog && zil_bp_tree_add(zilog, bp) != 0)
		return;

	spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER);

	/*
	 * Account the block four times: (level, total-type),
	 * (level, type), (total-level, total-type), (total-level, type).
	 */
	for (i = 0; i < 4; i++) {
		int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
		int t = (i & 1) ? type : ZDB_OT_TOTAL;
		int equal;
		zdb_blkstats_t *zb = &zcb->zcb_type[l][t];

		zb->zb_asize += BP_GET_ASIZE(bp);
		zb->zb_lsize += BP_GET_LSIZE(bp);
		zb->zb_psize += BP_GET_PSIZE(bp);
		zb->zb_count++;

		/*
		 * The histogram is only big enough to record blocks up to
		 * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last,
		 * "other", bucket.
		 */
		unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT;
		idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1);
		zb->zb_psize_histogram[idx]++;

		zb->zb_gangs += BP_COUNT_GANG(bp);

		/* Note ditto copies that share a vdev and/or a metaslab. */
		switch (BP_GET_NDVAS(bp)) {
		case 2:
			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
			    DVA_GET_VDEV(&bp->blk_dva[1])) {
				zb->zb_ditto_samevdev++;

				if (same_metaslab(zcb->zcb_spa,
				    DVA_GET_VDEV(&bp->blk_dva[0]),
				    DVA_GET_OFFSET(&bp->blk_dva[0]),
				    DVA_GET_OFFSET(&bp->blk_dva[1])))
					zb->zb_ditto_same_ms++;
			}
			break;
		case 3:
			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
			    DVA_GET_VDEV(&bp->blk_dva[1])) +
			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
			    DVA_GET_VDEV(&bp->blk_dva[2])) +
			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
			    DVA_GET_VDEV(&bp->blk_dva[2]));
			if (equal != 0) {
				zb->zb_ditto_samevdev++;

				if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
				    DVA_GET_VDEV(&bp->blk_dva[1]) &&
				    same_metaslab(zcb->zcb_spa,
				    DVA_GET_VDEV(&bp->blk_dva[0]),
				    DVA_GET_OFFSET(&bp->blk_dva[0]),
				    DVA_GET_OFFSET(&bp->blk_dva[1])))
					zb->zb_ditto_same_ms++;
				else if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
				    DVA_GET_VDEV(&bp->blk_dva[2]) &&
				    same_metaslab(zcb->zcb_spa,
				    DVA_GET_VDEV(&bp->blk_dva[0]),
				    DVA_GET_OFFSET(&bp->blk_dva[0]),
				    DVA_GET_OFFSET(&bp->blk_dva[2])))
					zb->zb_ditto_same_ms++;
				else if (DVA_GET_VDEV(&bp->blk_dva[1]) ==
				    DVA_GET_VDEV(&bp->blk_dva[2]) &&
				    same_metaslab(zcb->zcb_spa,
				    DVA_GET_VDEV(&bp->blk_dva[1]),
				    DVA_GET_OFFSET(&bp->blk_dva[1]),
				    DVA_GET_OFFSET(&bp->blk_dva[2])))
					zb->zb_ditto_same_ms++;
			}
			break;
		}
	}

	spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG);

	/* Embedded blocks have no DVAs; histogram their payloads only. */
	if (BP_IS_EMBEDDED(bp)) {
		zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;
		zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]
		    [BPE_GET_PSIZE(bp)]++;
		return;
	}
	/*
	 * The binning histogram bins by powers of two up to
	 * SPA_MAXBLOCKSIZE rather than creating bins for
	 * every possible blocksize found in the pool.
	 */
	int bin = highbit64(BP_GET_PSIZE(bp)) - 1;

	zcb->zcb_psize_count[bin]++;
	zcb->zcb_psize_len[bin] += BP_GET_PSIZE(bp);
	zcb->zcb_psize_total += BP_GET_PSIZE(bp);

	bin = highbit64(BP_GET_LSIZE(bp)) - 1;

	zcb->zcb_lsize_count[bin]++;
	zcb->zcb_lsize_len[bin] += BP_GET_LSIZE(bp);
	zcb->zcb_lsize_total += BP_GET_LSIZE(bp);

	bin = highbit64(BP_GET_ASIZE(bp)) - 1;

	zcb->zcb_asize_count[bin]++;
	zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp);
	zcb->zcb_asize_total += BP_GET_ASIZE(bp);

	if (zcb->zcb_brt_is_active && brt_maybe_exists(zcb->zcb_spa, bp)) {
		/*
		 * Cloned blocks are special. We need to count them, so we can
		 * later uncount them when reporting leaked space, and we must
		 * only claim them once.
		 *
		 * To do this, we keep our own in-memory BRT. For each block
		 * we haven't seen before, we look it up in the real BRT and
		 * if it's there, we note it and its refcount then proceed as
		 * normal. If we see the block again, we count it as a clone
		 * and then give it no further consideration.
		 */
		zdb_brt_entry_t zbre_search, *zbre;
		avl_index_t where;

		zbre_search.zbre_dva = bp->blk_dva[0];
		zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where);
		if (zbre != NULL) {
			zcb->zcb_clone_asize += BP_GET_ASIZE(bp);
			zcb->zcb_clone_blocks++;

			zbre->zbre_refcount--;
			if (zbre->zbre_refcount == 0) {
				avl_remove(&zcb->zcb_brt, zbre);
				umem_free(zbre, sizeof (zdb_brt_entry_t));
			}
			return;
		}

		uint64_t crefcnt = brt_entry_get_refcount(zcb->zcb_spa, bp);
		if (crefcnt > 0) {
			zbre = umem_zalloc(sizeof (zdb_brt_entry_t),
			    UMEM_NOFAIL);
			zbre->zbre_dva = bp->blk_dva[0];
			zbre->zbre_refcount = crefcnt;
			avl_insert(&zcb->zcb_brt, zbre, where);
		}
	}

	/* With -L (no leak check) blocks are not claimed. */
	if (dump_opt['L'])
		return;

	/* For dedup blocks, drop the reference we hold in the DDT. */
	if (BP_GET_DEDUP(bp)) {
		ddt_t *ddt;
		ddt_entry_t *dde;

		ddt = ddt_select(zcb->zcb_spa, bp);
		ddt_enter(ddt);
		dde = ddt_lookup(ddt, bp, B_FALSE);

		if (dde == NULL) {
			refcnt = 0;
		} else {
			ddt_phys_t *ddp = ddt_phys_select(dde, bp);
			ddt_phys_decref(ddp);
			refcnt = ddp->ddp_refcnt;
			if (ddt_phys_total_refcnt(dde) == 0)
				ddt_remove(ddt, dde);
		}
		ddt_exit(ddt);
	}

	/* Claim the block so leaked space can be detected later. */
	VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
	    refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa),
	    bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
}
5740 
/*
 * Completion callback for the verification reads issued by
 * zdb_blkptr_cb(): release this I/O's share of the inflight-bytes
 * budget, record any read error (ignored for speculative reads, e.g.
 * intent log blocks), and free the data buffer.
 */
static void
zdb_blkptr_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	int ioerr = zio->io_error;
	zdb_cb_t *zcb = zio->io_private;
	zbookmark_phys_t *zb = &zio->io_bookmark;

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp);
	/* Wake zdb_blkptr_cb() waiters throttled on inflight bytes. */
	cv_broadcast(&spa->spa_scrub_io_cv);

	if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		char blkbuf[BP_SPRINTF_LEN];

		zcb->zcb_haderrors = 1;
		zcb->zcb_errors[ioerr]++;

		/* Include the full blkptr only at higher -b verbosity. */
		if (dump_opt['b'] >= 2)
			snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
		else
			blkbuf[0] = '\0';

		(void) printf("zdb_blkptr_cb: "
		    "Got error %d reading "
		    "<%llu, %llu, %lld, %llx> %s -- skipping\n",
		    ioerr,
		    (u_longlong_t)zb->zb_objset,
		    (u_longlong_t)zb->zb_object,
		    (u_longlong_t)zb->zb_level,
		    (u_longlong_t)zb->zb_blkid,
		    blkbuf);
	}
	mutex_exit(&spa->spa_scrub_lock);

	abd_free(zio->io_abd);
}
5779 
/*
 * Pool-traversal callback: count every block pointer into the zdb_cb_t
 * statistics and, when checksum verification is enabled (-c for
 * metadata, -cc for everything), issue an asynchronous verification
 * read throttled to max_inflight_bytes outstanding.  Periodically
 * prints a progress/ETA line to stderr.
 */
static int
zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	zdb_cb_t *zcb = arg;
	dmu_object_type_t type;
	boolean_t is_metadata;

	/* Nothing to count for dnode-level bookmarks. */
	if (zb->zb_level == ZB_DNODE_LEVEL)
		return (0);

	if (dump_opt['b'] >= 5 && bp->blk_birth > 0) {
		char blkbuf[BP_SPRINTF_LEN];
		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
		(void) printf("objset %llu object %llu "
		    "level %lld offset 0x%llx %s\n",
		    (u_longlong_t)zb->zb_objset,
		    (u_longlong_t)zb->zb_object,
		    (longlong_t)zb->zb_level,
		    (u_longlong_t)blkid2offset(dnp, bp, zb),
		    blkbuf);
	}

	if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp))
		return (0);

	type = BP_GET_TYPE(bp);

	zdb_count_block(zcb, zilog, bp,
	    (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type);

	is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type));

	/* -cc verifies all blocks; a single -c verifies only metadata. */
	if (!BP_IS_EMBEDDED(bp) &&
	    (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
		size_t size = BP_GET_PSIZE(bp);
		abd_t *abd = abd_alloc(size, B_FALSE);
		int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;

		/* If it's an intent log block, failure is expected. */
		if (zb->zb_level == ZB_ZIL_LEVEL)
			flags |= ZIO_FLAG_SPECULATIVE;

		/* Throttle: wait until inflight bytes drop below the cap. */
		mutex_enter(&spa->spa_scrub_lock);
		while (spa->spa_load_verify_bytes > max_inflight_bytes)
			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
		spa->spa_load_verify_bytes += size;
		mutex_exit(&spa->spa_scrub_lock);

		zio_nowait(zio_read(NULL, spa, bp, abd, size,
		    zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
	}

	zcb->zcb_readfails = 0;

	/* only call gethrtime() every 100 blocks */
	static int iters;
	if (++iters > 100)
		iters = 0;
	else
		return (0);

	/* At most one progress line per second, suppressed at -bbbbb. */
	if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) {
		uint64_t now = gethrtime();
		char buf[10];
		uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize;
		uint64_t kb_per_sec =
		    1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000));
		uint64_t sec_remaining =
		    (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec;

		/* make sure nicenum has enough space */
		_Static_assert(sizeof (buf) >= NN_NUMBUF_SZ, "buf truncated");

		zfs_nicebytes(bytes, buf, sizeof (buf));
		(void) fprintf(stderr,
		    "\r%5s completed (%4"PRIu64"MB/s) "
		    "estimated time remaining: "
		    "%"PRIu64"hr %02"PRIu64"min %02"PRIu64"sec        ",
		    buf, kb_per_sec / 1024,
		    sec_remaining / 60 / 60,
		    sec_remaining / 60 % 60,
		    sec_remaining % 60);

		zcb->zcb_lastprint = now;
	}

	return (0);
}
5869 
5870 static void
5871 zdb_leak(void *arg, uint64_t start, uint64_t size)
5872 {
5873 	vdev_t *vd = arg;
5874 
5875 	(void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
5876 	    (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
5877 }
5878 
/* Metaslab ops table with no allocator callback. */
static metaslab_ops_t zdb_metaslab_ops = {
	NULL	/* alloc */
};
5882 
5883 static int
5884 load_unflushed_svr_segs_cb(spa_t *spa, space_map_entry_t *sme,
5885     uint64_t txg, void *arg)
5886 {
5887 	spa_vdev_removal_t *svr = arg;
5888 
5889 	uint64_t offset = sme->sme_offset;
5890 	uint64_t size = sme->sme_run;
5891 
5892 	/* skip vdevs we don't care about */
5893 	if (sme->sme_vdev != svr->svr_vdev_id)
5894 		return (0);
5895 
5896 	vdev_t *vd = vdev_lookup_top(spa, sme->sme_vdev);
5897 	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
5898 	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
5899 
5900 	if (txg < metaslab_unflushed_txg(ms))
5901 		return (0);
5902 
5903 	if (sme->sme_type == SM_ALLOC)
5904 		range_tree_add(svr->svr_allocd_segs, offset, size);
5905 	else
5906 		range_tree_remove(svr->svr_allocd_segs, offset, size);
5907 
5908 	return (0);
5909 }
5910 
/*
 * vdev_op_remap callback used by claim_segment_cb(): claim one remapped
 * destination segment on its concrete vdev.
 */
static void
claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
    uint64_t size, void *arg)
{
	(void) inner_offset, (void) arg;

	/*
	 * This callback was called through a remap from
	 * a device being removed. Therefore, the vdev that
	 * this callback is applied to is a concrete
	 * vdev.
	 */
	ASSERT(vdev_is_concrete(vd));

	VERIFY0(metaslab_claim_impl(vd, offset, size,
	    spa_min_claim_txg(vd->vdev_spa)));
}
5928 
5929 static void
5930 claim_segment_cb(void *arg, uint64_t offset, uint64_t size)
5931 {
5932 	vdev_t *vd = arg;
5933 
5934 	vdev_indirect_ops.vdev_op_remap(vd, offset, size,
5935 	    claim_segment_impl_cb, NULL);
5936 }
5937 
5938 /*
5939  * After accounting for all allocated blocks that are directly referenced,
5940  * we might have missed a reference to a block from a partially complete
5941  * (and thus unused) indirect mapping object. We perform a secondary pass
5942  * through the metaslabs we have already mapped and claim the destination
5943  * blocks.
5944  */
static void
zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb)
{
	/* -L disables leak detection, so there is nothing to claim */
	if (dump_opt['L'])
		return;

	if (spa->spa_vdev_removal == NULL)
		return;

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
	vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;

	ASSERT0(range_tree_space(svr->svr_allocd_segs));

	/*
	 * Gather all allocated segments of the removing vdev: first the
	 * synced per-metaslab space maps ...
	 */
	range_tree_t *allocs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
	for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
		metaslab_t *msp = vd->vdev_ms[msi];

		ASSERT0(range_tree_space(allocs));
		if (msp->ms_sm != NULL)
			VERIFY0(space_map_load(msp->ms_sm, allocs, SM_ALLOC));
		range_tree_vacate(allocs, range_tree_add, svr->svr_allocd_segs);
	}
	range_tree_destroy(allocs);

	/* ... then any not-yet-flushed spacemap-log entries on top. */
	iterate_through_spacemap_logs(spa, load_unflushed_svr_segs_cb, svr);

	/*
	 * Clear everything past what has been synced,
	 * because we have not allocated mappings for
	 * it yet.
	 */
	range_tree_clear(svr->svr_allocd_segs,
	    vdev_indirect_mapping_max_offset(vim),
	    vd->vdev_asize - vdev_indirect_mapping_max_offset(vim));

	/* account the mapped-but-removing space, then claim each segment */
	zcb->zcb_removing_size += range_tree_space(svr->svr_allocd_segs);
	range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd);

	spa_config_exit(spa, SCL_CONFIG, FTAG);
}
5989 
/*
 * bpobj iteration callback for the obsolete bpobj (see zdb_leak_init()):
 * bump the in-memory obsolete count of the indirect-mapping entries
 * covering bp's single DVA.
 */
static int
increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
	(void) tx;
	zdb_cb_t *zcb = arg;
	spa_t *spa = zcb->zcb_spa;
	vdev_t *vd;
	const dva_t *dva = &bp->blk_dva[0];

	ASSERT(!bp_freed);
	ASSERT(!dump_opt['L']);
	ASSERT3U(BP_GET_NDVAS(bp), ==, 1);

	/* hold SCL_VDEV only for the lookup itself */
	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	vd = vdev_lookup_top(zcb->zcb_spa, DVA_GET_VDEV(dva));
	ASSERT3P(vd, !=, NULL);
	spa_config_exit(spa, SCL_VDEV, FTAG);

	ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
	ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL);

	vdev_indirect_mapping_increment_obsolete_count(
	    vd->vdev_indirect_mapping,
	    DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva),
	    zcb->zcb_vd_obsolete_counts[vd->vdev_id]);

	return (0);
}
6019 
/*
 * Build the per-mapping-entry obsolete-count array for an indirect vdev,
 * merging in the vdev's on-disk obsolete space map (if any) and, when
 * this vdev's mapping is being condensed, the previous obsolete space
 * map recorded in spa_condensing_indirect_phys.  The caller is expected
 * to free the result (see vdev_indirect_mapping_free_obsolete_counts()
 * in zdb_check_for_obsolete_leaks()).
 */
static uint32_t *
zdb_load_obsolete_counts(vdev_t *vd)
{
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	spa_t *spa = vd->vdev_spa;
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	uint64_t obsolete_sm_object;
	uint32_t *counts;

	VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
	EQUIV(obsolete_sm_object != 0, vd->vdev_obsolete_sm != NULL);
	counts = vdev_indirect_mapping_load_obsolete_counts(vim);
	if (vd->vdev_obsolete_sm != NULL) {
		vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
		    vd->vdev_obsolete_sm);
	}
	if (scip->scip_vdev == vd->vdev_id &&
	    scip->scip_prev_obsolete_sm_object != 0) {
		/* condensing in progress: fold in the previous space map */
		space_map_t *prev_obsolete_sm = NULL;
		VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
		    scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
		vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
		    prev_obsolete_sm);
		space_map_close(prev_obsolete_sm);
	}
	return (counts);
}
6048 
/*
 * Pre-account dedup'd blocks for leak detection.  For every walked DDT
 * entry (each asserted to have total refcnt > 1), add the asize of the
 * (refcnt - 1) extra references to zcb_dedup_asize; dump_block_stats()
 * later subtracts this from the block-traversal total.  Each entry is
 * also inserted into the in-core DDT via ddt_lookup().
 */
static void
zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
{
	ddt_bookmark_t ddb = {0};
	ddt_entry_t dde;
	int error;
	int p;

	ASSERT(!dump_opt['L']);

	while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
		blkptr_t blk;
		ddt_phys_t *ddp = dde.dde_phys;

		/*
		 * NOTE(review): stopping at the first UNIQUE-class entry
		 * assumes ddt_walk() visits non-unique classes first --
		 * confirm against ddt_walk()'s ordering.
		 */
		if (ddb.ddb_class == DDT_CLASS_UNIQUE)
			return;

		ASSERT(ddt_phys_total_refcnt(&dde) > 1);
		ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
		VERIFY(ddt);

		for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
			if (ddp->ddp_phys_birth == 0)
				continue;
			ddt_bp_create(ddb.ddb_checksum,
			    &dde.dde_key, ddp, &blk);
			if (p == DDT_PHYS_DITTO) {
				zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
			} else {
				zcb->zcb_dedup_asize +=
				    BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
				zcb->zcb_dedup_blocks++;
			}
		}

		/*
		 * Materialize the entry in the in-core DDT.  Note that
		 * blk holds whatever the last loop iteration created.
		 */
		ddt_enter(ddt);
		VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
		ddt_exit(ddt);
	}

	ASSERT(error == ENOENT);
}
6091 
/*
 * State for checkpoint_sm_exclude_entry_cb(): the top-level vdev whose
 * checkpoint space map is being iterated, plus a running total of the
 * checkpointed space excluded so far.
 */
typedef struct checkpoint_sm_exclude_entry_arg {
	vdev_t *cseea_vd;
	uint64_t cseea_checkpoint_size;
} checkpoint_sm_exclude_entry_arg_t;
6096 
/*
 * space_map_iterate() callback: remove one checkpoint space map entry
 * (always SM_FREE) from its metaslab's ms_allocatable, and add its run
 * length to cseea_checkpoint_size.
 */
static int
checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg)
{
	checkpoint_sm_exclude_entry_arg_t *cseea = arg;
	vdev_t *vd = cseea->cseea_vd;
	metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
	uint64_t end = sme->sme_offset + sme->sme_run;

	ASSERT(sme->sme_type == SM_FREE);

	/*
	 * Since the vdev_checkpoint_sm exists in the vdev level
	 * and the ms_sm space maps exist in the metaslab level,
	 * an entry in the checkpoint space map could theoretically
	 * cross the boundaries of the metaslab that it belongs.
	 *
	 * In reality, because of the way that we populate and
	 * manipulate the checkpoint's space maps currently,
	 * there shouldn't be any entries that cross metaslabs.
	 * Hence the assertion below.
	 *
	 * That said, there is no fundamental requirement that
	 * the checkpoint's space map entries should not cross
	 * metaslab boundaries. So if needed we could add code
	 * that handles metaslab-crossing segments in the future.
	 */
	VERIFY3U(sme->sme_offset, >=, ms->ms_start);
	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);

	/*
	 * By removing the entry from the allocated segments we
	 * also verify that the entry is there to begin with.
	 */
	mutex_enter(&ms->ms_lock);
	range_tree_remove(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
	mutex_exit(&ms->ms_lock);

	cseea->cseea_checkpoint_size += sme->sme_run;
	return (0);
}
6137 
/*
 * Exclude from leak detection all blocks referenced only by this
 * top-level vdev's checkpoint space map, accumulating their total size
 * into zcb_checkpoint_size.  Silently returns when the vdev has no
 * checkpoint space map.
 */
static void
zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb)
{
	spa_t *spa = vd->vdev_spa;
	space_map_t *checkpoint_sm = NULL;
	uint64_t checkpoint_sm_obj;

	/*
	 * If there is no vdev_top_zap, we are in a pool whose
	 * version predates the pool checkpoint feature.
	 */
	if (vd->vdev_top_zap == 0)
		return;

	/*
	 * If there is no reference of the vdev_checkpoint_sm in
	 * the vdev_top_zap, then one of the following scenarios
	 * is true:
	 *
	 * 1] There is no checkpoint
	 * 2] There is a checkpoint, but no checkpointed blocks
	 *    have been freed yet
	 * 3] The current vdev is indirect
	 *
	 * In these cases we return immediately.
	 */
	if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
	    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
		return;

	VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
	    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1,
	    &checkpoint_sm_obj));

	checkpoint_sm_exclude_entry_arg_t cseea;
	cseea.cseea_vd = vd;
	cseea.cseea_checkpoint_size = 0;

	VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
	    checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));

	/* remove every checkpointed entry from the ms_allocatable trees */
	VERIFY0(space_map_iterate(checkpoint_sm,
	    space_map_length(checkpoint_sm),
	    checkpoint_sm_exclude_entry_cb, &cseea));
	space_map_close(checkpoint_sm);

	zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size;
}
6186 
6187 static void
6188 zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb)
6189 {
6190 	ASSERT(!dump_opt['L']);
6191 
6192 	vdev_t *rvd = spa->spa_root_vdev;
6193 	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
6194 		ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id);
6195 		zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb);
6196 	}
6197 }
6198 
6199 static int
6200 count_unflushed_space_cb(spa_t *spa, space_map_entry_t *sme,
6201     uint64_t txg, void *arg)
6202 {
6203 	int64_t *ualloc_space = arg;
6204 
6205 	uint64_t offset = sme->sme_offset;
6206 	uint64_t vdev_id = sme->sme_vdev;
6207 
6208 	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
6209 	if (!vdev_is_concrete(vd))
6210 		return (0);
6211 
6212 	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
6213 	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
6214 
6215 	if (txg < metaslab_unflushed_txg(ms))
6216 		return (0);
6217 
6218 	if (sme->sme_type == SM_ALLOC)
6219 		*ualloc_space += sme->sme_run;
6220 	else
6221 		*ualloc_space -= sme->sme_run;
6222 
6223 	return (0);
6224 }
6225 
6226 static int64_t
6227 get_unflushed_alloc_space(spa_t *spa)
6228 {
6229 	if (dump_opt['L'])
6230 		return (0);
6231 
6232 	int64_t ualloc_space = 0;
6233 	iterate_through_spacemap_logs(spa, count_unflushed_space_cb,
6234 	    &ualloc_space);
6235 	return (ualloc_space);
6236 }
6237 
/*
 * Spacemap-log callback for load_unflushed_to_ms_allocatables(): replay
 * one unflushed entry into its metaslab's ms_allocatable -- entries
 * matching *uic_maptype are added, the opposite type removed.
 */
static int
load_unflushed_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg)
{
	maptype_t *uic_maptype = arg;

	uint64_t offset = sme->sme_offset;
	uint64_t size = sme->sme_run;
	uint64_t vdev_id = sme->sme_vdev;

	vdev_t *vd = vdev_lookup_top(spa, vdev_id);

	/* skip indirect vdevs */
	if (!vdev_is_concrete(vd))
		return (0);

	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];

	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);
	ASSERT(*uic_maptype == SM_ALLOC || *uic_maptype == SM_FREE);

	/* entries older than the metaslab's unflushed txg are stale */
	if (txg < metaslab_unflushed_txg(ms))
		return (0);

	if (*uic_maptype == sme->sme_type)
		range_tree_add(ms->ms_allocatable, offset, size);
	else
		range_tree_remove(ms->ms_allocatable, offset, size);

	return (0);
}
6268 
6269 static void
6270 load_unflushed_to_ms_allocatables(spa_t *spa, maptype_t maptype)
6271 {
6272 	iterate_through_spacemap_logs(spa, load_unflushed_cb, &maptype);
6273 }
6274 
/*
 * Load every concrete metaslab's space map of the given maptype into
 * ms_allocatable, then replay unflushed spacemap-log entries on top.
 * For leak detection this is called with SM_ALLOC, inverting the tree's
 * usual free-segment meaning (see zdb_leak_init()).
 */
static void
load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype)
{
	vdev_t *rvd = spa->spa_root_vdev;
	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
		vdev_t *vd = rvd->vdev_child[i];

		ASSERT3U(i, ==, vd->vdev_id);

		/* indirect vdevs are handled separately */
		if (vd->vdev_ops == &vdev_indirect_ops)
			continue;

		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
			metaslab_t *msp = vd->vdev_ms[m];

			(void) fprintf(stderr,
			    "\rloading concrete vdev %llu, "
			    "metaslab %llu of %llu ...",
			    (longlong_t)vd->vdev_id,
			    (longlong_t)msp->ms_id,
			    (longlong_t)vd->vdev_ms_count);

			mutex_enter(&msp->ms_lock);
			range_tree_vacate(msp->ms_allocatable, NULL, NULL);

			/*
			 * We don't want to spend the CPU manipulating the
			 * size-ordered tree, so clear the range_tree ops.
			 */
			msp->ms_allocatable->rt_ops = NULL;

			if (msp->ms_sm != NULL) {
				VERIFY0(space_map_load(msp->ms_sm,
				    msp->ms_allocatable, maptype));
			}
			if (!msp->ms_loaded)
				msp->ms_loaded = B_TRUE;
			mutex_exit(&msp->ms_lock);
		}
	}

	load_unflushed_to_ms_allocatables(spa, maptype);
}
6318 
6319 /*
6320  * vm_idxp is an in-out parameter which (for indirect vdevs) is the
6321  * index in vim_entries that has the first entry in this metaslab.
6322  * On return, it will be set to the first entry after this metaslab.
6323  */
6324 static void
6325 load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp,
6326     uint64_t *vim_idxp)
6327 {
6328 	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
6329 
6330 	mutex_enter(&msp->ms_lock);
6331 	range_tree_vacate(msp->ms_allocatable, NULL, NULL);
6332 
6333 	/*
6334 	 * We don't want to spend the CPU manipulating the
6335 	 * size-ordered tree, so clear the range_tree ops.
6336 	 */
6337 	msp->ms_allocatable->rt_ops = NULL;
6338 
6339 	for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim);
6340 	    (*vim_idxp)++) {
6341 		vdev_indirect_mapping_entry_phys_t *vimep =
6342 		    &vim->vim_entries[*vim_idxp];
6343 		uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
6344 		uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst);
6345 		ASSERT3U(ent_offset, >=, msp->ms_start);
6346 		if (ent_offset >= msp->ms_start + msp->ms_size)
6347 			break;
6348 
6349 		/*
6350 		 * Mappings do not cross metaslab boundaries,
6351 		 * because we create them by walking the metaslabs.
6352 		 */
6353 		ASSERT3U(ent_offset + ent_len, <=,
6354 		    msp->ms_start + msp->ms_size);
6355 		range_tree_add(msp->ms_allocatable, ent_offset, ent_len);
6356 	}
6357 
6358 	if (!msp->ms_loaded)
6359 		msp->ms_loaded = B_TRUE;
6360 	mutex_exit(&msp->ms_lock);
6361 }
6362 
/*
 * For each indirect top-level vdev: load its obsolete counts into
 * zcb_vd_obsolete_counts, create metaslabs for it (indirect vdevs
 * normally have none), and fill their ms_allocatable trees with the
 * mapping's source segments.
 */
static void
zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb)
{
	ASSERT(!dump_opt['L']);

	vdev_t *rvd = spa->spa_root_vdev;
	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
		vdev_t *vd = rvd->vdev_child[c];

		ASSERT3U(c, ==, vd->vdev_id);

		if (vd->vdev_ops != &vdev_indirect_ops)
			continue;

		/*
		 * Note: we don't check for mapping leaks on
		 * removing vdevs because their ms_allocatable's
		 * are used to look for leaks in allocated space.
		 */
		zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd);

		/*
		 * Normally, indirect vdevs don't have any
		 * metaslabs.  We want to set them up for
		 * zio_claim().
		 */
		vdev_metaslab_group_create(vd);
		VERIFY0(vdev_metaslab_init(vd, 0));

		vdev_indirect_mapping_t *vim __maybe_unused =
		    vd->vdev_indirect_mapping;
		uint64_t vim_idx = 0;
		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {

			(void) fprintf(stderr,
			    "\rloading indirect vdev %llu, "
			    "metaslab %llu of %llu ...",
			    (longlong_t)vd->vdev_id,
			    (longlong_t)vd->vdev_ms[m]->ms_id,
			    (longlong_t)vd->vdev_ms_count);

			load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m],
			    &vim_idx);
		}
		/* every mapping entry must have been consumed */
		ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim));
	}
}
6410 
/*
 * Prepare the pool for leak detection (no-op under -L): overload every
 * metaslab's ms_allocatable tree to hold ALLOCATED segments, which the
 * subsequent traversal removes as it claims each block.  Also excludes
 * checkpointed space and pre-accounts obsolete-mapping and dedup'd
 * blocks so the final space comparison balances.
 */
static void
zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
{
	zcb->zcb_spa = spa;

	if (dump_opt['L'])
		return;

	dsl_pool_t *dp = spa->spa_dsl_pool;
	vdev_t *rvd = spa->spa_root_vdev;

	/*
	 * We are going to be changing the meaning of the metaslab's
	 * ms_allocatable.  Ensure that the allocator doesn't try to
	 * use the tree.
	 */
	spa->spa_normal_class->mc_ops = &zdb_metaslab_ops;
	spa->spa_log_class->mc_ops = &zdb_metaslab_ops;
	spa->spa_embedded_log_class->mc_ops = &zdb_metaslab_ops;

	/* one obsolete-counts array slot per top-level vdev */
	zcb->zcb_vd_obsolete_counts =
	    umem_zalloc(rvd->vdev_children * sizeof (uint32_t *),
	    UMEM_NOFAIL);

	/*
	 * For leak detection, we overload the ms_allocatable trees
	 * to contain allocated segments instead of free segments.
	 * As a result, we can't use the normal metaslab_load/unload
	 * interfaces.
	 */
	zdb_leak_init_prepare_indirect_vdevs(spa, zcb);
	load_concrete_ms_allocatable_trees(spa, SM_ALLOC);

	/*
	 * On load_concrete_ms_allocatable_trees() we loaded all the
	 * allocated entries from the ms_sm to the ms_allocatable for
	 * each metaslab. If the pool has a checkpoint or is in the
	 * middle of discarding a checkpoint, some of these blocks
	 * may have been freed but their ms_sm may not have been
	 * updated because they are referenced by the checkpoint. In
	 * order to avoid false-positives during leak-detection, we
	 * go through the vdev's checkpoint space map and exclude all
	 * its entries from their relevant ms_allocatable.
	 *
	 * We also aggregate the space held by the checkpoint and add
	 * it to zcb_checkpoint_size.
	 *
	 * Note that at this point we are also verifying that all the
	 * entries on the checkpoint_sm are marked as allocated in
	 * the ms_sm of their relevant metaslab.
	 * [see comment in checkpoint_sm_exclude_entry_cb()]
	 */
	zdb_leak_init_exclude_checkpoint(spa, zcb);
	ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa));

	/* for cleaner progress output */
	(void) fprintf(stderr, "\n");

	if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
		ASSERT(spa_feature_is_enabled(spa,
		    SPA_FEATURE_DEVICE_REMOVAL));
		(void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj,
		    increment_indirect_mapping_cb, zcb, NULL);
	}

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	zdb_ddt_leak_init(spa, zcb);
	spa_config_exit(spa, SCL_CONFIG, FTAG);
}
6480 
6481 static boolean_t
6482 zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb)
6483 {
6484 	boolean_t leaks = B_FALSE;
6485 	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
6486 	uint64_t total_leaked = 0;
6487 	boolean_t are_precise = B_FALSE;
6488 
6489 	ASSERT(vim != NULL);
6490 
6491 	for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) {
6492 		vdev_indirect_mapping_entry_phys_t *vimep =
6493 		    &vim->vim_entries[i];
6494 		uint64_t obsolete_bytes = 0;
6495 		uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep);
6496 		metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
6497 
6498 		/*
6499 		 * This is not very efficient but it's easy to
6500 		 * verify correctness.
6501 		 */
6502 		for (uint64_t inner_offset = 0;
6503 		    inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst);
6504 		    inner_offset += 1ULL << vd->vdev_ashift) {
6505 			if (range_tree_contains(msp->ms_allocatable,
6506 			    offset + inner_offset, 1ULL << vd->vdev_ashift)) {
6507 				obsolete_bytes += 1ULL << vd->vdev_ashift;
6508 			}
6509 		}
6510 
6511 		int64_t bytes_leaked = obsolete_bytes -
6512 		    zcb->zcb_vd_obsolete_counts[vd->vdev_id][i];
6513 		ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=,
6514 		    zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]);
6515 
6516 		VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
6517 		if (bytes_leaked != 0 && (are_precise || dump_opt['d'] >= 5)) {
6518 			(void) printf("obsolete indirect mapping count "
6519 			    "mismatch on %llu:%llx:%llx : %llx bytes leaked\n",
6520 			    (u_longlong_t)vd->vdev_id,
6521 			    (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
6522 			    (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
6523 			    (u_longlong_t)bytes_leaked);
6524 		}
6525 		total_leaked += ABS(bytes_leaked);
6526 	}
6527 
6528 	VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
6529 	if (!are_precise && total_leaked > 0) {
6530 		int pct_leaked = total_leaked * 100 /
6531 		    vdev_indirect_mapping_bytes_mapped(vim);
6532 		(void) printf("cannot verify obsolete indirect mapping "
6533 		    "counts of vdev %llu because precise feature was not "
6534 		    "enabled when it was removed: %d%% (%llx bytes) of mapping"
6535 		    "unreferenced\n",
6536 		    (u_longlong_t)vd->vdev_id, pct_leaked,
6537 		    (u_longlong_t)total_leaked);
6538 	} else if (total_leaked > 0) {
6539 		(void) printf("obsolete indirect mapping count mismatch "
6540 		    "for vdev %llu -- %llx total bytes mismatched\n",
6541 		    (u_longlong_t)vd->vdev_id,
6542 		    (u_longlong_t)total_leaked);
6543 		leaks |= B_TRUE;
6544 	}
6545 
6546 	vdev_indirect_mapping_free_obsolete_counts(vim,
6547 	    zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
6548 	zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL;
6549 
6550 	return (leaks);
6551 }
6552 
/*
 * Finish leak detection (no-op returning B_FALSE under -L): any segment
 * still present in a concrete metaslab's overloaded ms_allocatable is
 * printed as leaked by zdb_leak().  Note the B_TRUE/B_FALSE result
 * reflects only obsolete-mapping mismatches found by
 * zdb_check_for_obsolete_leaks(); concrete-vdev leaks are reported via
 * printf but do not affect the return value here.
 */
static boolean_t
zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
{
	if (dump_opt['L'])
		return (B_FALSE);

	boolean_t leaks = B_FALSE;
	vdev_t *rvd = spa->spa_root_vdev;
	for (unsigned c = 0; c < rvd->vdev_children; c++) {
		vdev_t *vd = rvd->vdev_child[c];

		if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
			leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
		}

		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
			metaslab_t *msp = vd->vdev_ms[m];
			ASSERT3P(msp->ms_group, ==, (msp->ms_group->mg_class ==
			    spa_embedded_log_class(spa)) ?
			    vd->vdev_log_mg : vd->vdev_mg);

			/*
			 * ms_allocatable has been overloaded
			 * to contain allocated segments. Now that
			 * we finished traversing all blocks, any
			 * block that remains in the ms_allocatable
			 * represents an allocated block that we
			 * did not claim during the traversal.
			 * Claimed blocks would have been removed
			 * from the ms_allocatable.  For indirect
			 * vdevs, space remaining in the tree
			 * represents parts of the mapping that are
			 * not referenced, which is not a bug.
			 */
			if (vd->vdev_ops == &vdev_indirect_ops) {
				range_tree_vacate(msp->ms_allocatable,
				    NULL, NULL);
			} else {
				range_tree_vacate(msp->ms_allocatable,
				    zdb_leak, vd);
			}
			if (msp->ms_loaded) {
				msp->ms_loaded = B_FALSE;
			}
		}
	}

	umem_free(zcb->zcb_vd_obsolete_counts,
	    rvd->vdev_children * sizeof (uint32_t *));
	zcb->zcb_vd_obsolete_counts = NULL;

	return (leaks);
}
6606 
/*
 * bptree/bplist iteration callback: account bp as a deferred free,
 * printing the block pointer first at -bbbbb verbosity.
 */
static int
count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	(void) tx;
	zdb_cb_t *zcb = arg;

	if (dump_opt['b'] >= 5) {
		char blkbuf[BP_SPRINTF_LEN];
		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
		(void) printf("[%s] %s\n",
		    "deferred free", blkbuf);
	}
	zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED);
	return (0);
}
6622 
6623 /*
6624  * Iterate over livelists which have been destroyed by the user but
6625  * are still present in the MOS, waiting to be freed
6626  */
static void
iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg)
{
	objset_t *mos = spa->spa_meta_objset;
	uint64_t zap_obj;
	int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
	/* no deleted-clones ZAP means there is nothing to iterate */
	if (err == ENOENT)
		return;
	ASSERT0(err);

	zap_cursor_t zc;
	zap_attribute_t attr;
	dsl_deadlist_t ll;
	/* NULL out os prior to dsl_deadlist_open in case it's garbage */
	ll.dl_os = NULL;
	/* invoke func on each deadlist referenced by the ZAP */
	for (zap_cursor_init(&zc, mos, zap_obj);
	    zap_cursor_retrieve(&zc, &attr) == 0;
	    (void) zap_cursor_advance(&zc)) {
		dsl_deadlist_open(&ll, mos, attr.za_first_integer);
		func(&ll, arg);
		dsl_deadlist_close(&ll);
	}
	zap_cursor_fini(&zc);
}
6652 
/*
 * bpobj flavor of count_block_cb(); freed entries are not expected in
 * the bpobjs this is used on.
 */
static int
bpobj_count_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
	ASSERT(!bp_freed);
	return (count_block_cb(arg, bp, tx));
}
6660 
6661 static int
6662 livelist_entry_count_blocks_cb(void *args, dsl_deadlist_entry_t *dle)
6663 {
6664 	zdb_cb_t *zbc = args;
6665 	bplist_t blks;
6666 	bplist_create(&blks);
6667 	/* determine which blocks have been alloc'd but not freed */
6668 	VERIFY0(dsl_process_sub_livelist(&dle->dle_bpobj, &blks, NULL, NULL));
6669 	/* count those blocks */
6670 	(void) bplist_iterate(&blks, count_block_cb, zbc, NULL);
6671 	bplist_destroy(&blks);
6672 	return (0);
6673 }
6674 
/*
 * ll_iter_t callback: count the alloc'd-but-unfreed blocks of every
 * entry in one deleted livelist.
 */
static void
livelist_count_blocks(dsl_deadlist_t *ll, void *arg)
{
	dsl_deadlist_iterate(ll, livelist_entry_count_blocks_cb, arg);
}
6680 
6681 /*
6682  * Count the blocks in the livelists that have been destroyed by the user
6683  * but haven't yet been freed.
6684  */
6685 static void
6686 deleted_livelists_count_blocks(spa_t *spa, zdb_cb_t *zbc)
6687 {
6688 	iterate_deleted_livelists(spa, livelist_count_blocks, zbc);
6689 }
6690 
/*
 * iterate_deleted_livelists() callback: bump the livelist feature count,
 * dump the deleted livelist's block pointers, and run the lightweight
 * sub-livelist verification over its entries.
 */
static void
dump_livelist_cb(dsl_deadlist_t *ll, void *arg)
{
	ASSERT3P(arg, ==, NULL);
	global_feature_count[SPA_FEATURE_LIVELIST]++;
	dump_blkptr_list(ll, "Deleted Livelist");
	dsl_deadlist_iterate(ll, sublivelist_verify_lightweight, NULL);
}
6699 
6700 /*
6701  * Print out, register object references to, and increment feature counts for
6702  * livelists that have been destroyed by the user but haven't yet been freed.
6703  */
static void
deleted_livelists_dump_mos(spa_t *spa)
{
	uint64_t zap_obj;
	objset_t *mos = spa->spa_meta_objset;
	int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
	/* no deleted-clones ZAP: nothing to dump */
	if (err == ENOENT)
		return;
	/* register the ZAP itself as a referenced MOS object */
	mos_obj_refd(zap_obj);
	iterate_deleted_livelists(spa, dump_livelist_cb, NULL);
}
6716 
6717 static int
6718 zdb_brt_entry_compare(const void *zcn1, const void *zcn2)
6719 {
6720 	const dva_t *dva1 = &((const zdb_brt_entry_t *)zcn1)->zbre_dva;
6721 	const dva_t *dva2 = &((const zdb_brt_entry_t *)zcn2)->zbre_dva;
6722 	int cmp;
6723 
6724 	cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2));
6725 	if (cmp == 0)
6726 		cmp = TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2));
6727 
6728 	return (cmp);
6729 }
6730 
6731 static int
6732 dump_block_stats(spa_t *spa)
6733 {
6734 	zdb_cb_t *zcb;
6735 	zdb_blkstats_t *zb, *tzb;
6736 	uint64_t norm_alloc, norm_space, total_alloc, total_found;
6737 	int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
6738 	    TRAVERSE_NO_DECRYPT | TRAVERSE_HARD;
6739 	boolean_t leaks = B_FALSE;
6740 	int e, c, err;
6741 	bp_embedded_type_t i;
6742 
6743 	zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL);
6744 
6745 	if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {
6746 		avl_create(&zcb->zcb_brt, zdb_brt_entry_compare,
6747 		    sizeof (zdb_brt_entry_t),
6748 		    offsetof(zdb_brt_entry_t, zbre_node));
6749 		zcb->zcb_brt_is_active = B_TRUE;
6750 	}
6751 
6752 	(void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
6753 	    (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
6754 	    (dump_opt['c'] == 1) ? "metadata " : "",
6755 	    dump_opt['c'] ? "checksums " : "",
6756 	    (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
6757 	    !dump_opt['L'] ? "nothing leaked " : "");
6758 
6759 	/*
6760 	 * When leak detection is enabled we load all space maps as SM_ALLOC
6761 	 * maps, then traverse the pool claiming each block we discover. If
6762 	 * the pool is perfectly consistent, the segment trees will be empty
6763 	 * when we're done. Anything left over is a leak; any block we can't
6764 	 * claim (because it's not part of any space map) is a double
6765 	 * allocation, reference to a freed block, or an unclaimed log block.
6766 	 *
6767 	 * When leak detection is disabled (-L option) we still traverse the
6768 	 * pool claiming each block we discover, but we skip opening any space
6769 	 * maps.
6770 	 */
6771 	zdb_leak_init(spa, zcb);
6772 
6773 	/*
6774 	 * If there's a deferred-free bplist, process that first.
6775 	 */
6776 	(void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
6777 	    bpobj_count_block_cb, zcb, NULL);
6778 
6779 	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
6780 		(void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
6781 		    bpobj_count_block_cb, zcb, NULL);
6782 	}
6783 
6784 	zdb_claim_removing(spa, zcb);
6785 
6786 	if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
6787 		VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset,
6788 		    spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb,
6789 		    zcb, NULL));
6790 	}
6791 
6792 	deleted_livelists_count_blocks(spa, zcb);
6793 
6794 	if (dump_opt['c'] > 1)
6795 		flags |= TRAVERSE_PREFETCH_DATA;
6796 
6797 	zcb->zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
6798 	zcb->zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa));
6799 	zcb->zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa));
6800 	zcb->zcb_totalasize +=
6801 	    metaslab_class_get_alloc(spa_embedded_log_class(spa));
6802 	zcb->zcb_start = zcb->zcb_lastprint = gethrtime();
6803 	err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, zcb);
6804 
6805 	/*
6806 	 * If we've traversed the data blocks then we need to wait for those
6807 	 * I/Os to complete. We leverage "The Godfather" zio to wait on
6808 	 * all async I/Os to complete.
6809 	 */
6810 	if (dump_opt['c']) {
6811 		for (c = 0; c < max_ncpus; c++) {
6812 			(void) zio_wait(spa->spa_async_zio_root[c]);
6813 			spa->spa_async_zio_root[c] = zio_root(spa, NULL, NULL,
6814 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
6815 			    ZIO_FLAG_GODFATHER);
6816 		}
6817 	}
6818 	ASSERT0(spa->spa_load_verify_bytes);
6819 
6820 	/*
6821 	 * Done after zio_wait() since zcb_haderrors is modified in
6822 	 * zdb_blkptr_done()
6823 	 */
6824 	zcb->zcb_haderrors |= err;
6825 
6826 	if (zcb->zcb_haderrors) {
6827 		(void) printf("\nError counts:\n\n");
6828 		(void) printf("\t%5s  %s\n", "errno", "count");
6829 		for (e = 0; e < 256; e++) {
6830 			if (zcb->zcb_errors[e] != 0) {
6831 				(void) printf("\t%5d  %llu\n",
6832 				    e, (u_longlong_t)zcb->zcb_errors[e]);
6833 			}
6834 		}
6835 	}
6836 
6837 	/*
6838 	 * Report any leaked segments.
6839 	 */
6840 	leaks |= zdb_leak_fini(spa, zcb);
6841 
6842 	tzb = &zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL];
6843 
6844 	norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
6845 	norm_space = metaslab_class_get_space(spa_normal_class(spa));
6846 
6847 	total_alloc = norm_alloc +
6848 	    metaslab_class_get_alloc(spa_log_class(spa)) +
6849 	    metaslab_class_get_alloc(spa_embedded_log_class(spa)) +
6850 	    metaslab_class_get_alloc(spa_special_class(spa)) +
6851 	    metaslab_class_get_alloc(spa_dedup_class(spa)) +
6852 	    get_unflushed_alloc_space(spa);
6853 	total_found =
6854 	    tzb->zb_asize - zcb->zcb_dedup_asize - zcb->zcb_clone_asize +
6855 	    zcb->zcb_removing_size + zcb->zcb_checkpoint_size;
6856 
6857 	if (total_found == total_alloc && !dump_opt['L']) {
6858 		(void) printf("\n\tNo leaks (block sum matches space"
6859 		    " maps exactly)\n");
6860 	} else if (!dump_opt['L']) {
6861 		(void) printf("block traversal size %llu != alloc %llu "
6862 		    "(%s %lld)\n",
6863 		    (u_longlong_t)total_found,
6864 		    (u_longlong_t)total_alloc,
6865 		    (dump_opt['L']) ? "unreachable" : "leaked",
6866 		    (longlong_t)(total_alloc - total_found));
6867 		leaks = B_TRUE;
6868 	}
6869 
6870 	if (tzb->zb_count == 0) {
6871 		umem_free(zcb, sizeof (zdb_cb_t));
6872 		return (2);
6873 	}
6874 
6875 	(void) printf("\n");
6876 	(void) printf("\t%-16s %14llu\n", "bp count:",
6877 	    (u_longlong_t)tzb->zb_count);
6878 	(void) printf("\t%-16s %14llu\n", "ganged count:",
6879 	    (longlong_t)tzb->zb_gangs);
6880 	(void) printf("\t%-16s %14llu      avg: %6llu\n", "bp logical:",
6881 	    (u_longlong_t)tzb->zb_lsize,
6882 	    (u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
6883 	(void) printf("\t%-16s %14llu      avg: %6llu     compression: %6.2f\n",
6884 	    "bp physical:", (u_longlong_t)tzb->zb_psize,
6885 	    (u_longlong_t)(tzb->zb_psize / tzb->zb_count),
6886 	    (double)tzb->zb_lsize / tzb->zb_psize);
6887 	(void) printf("\t%-16s %14llu      avg: %6llu     compression: %6.2f\n",
6888 	    "bp allocated:", (u_longlong_t)tzb->zb_asize,
6889 	    (u_longlong_t)(tzb->zb_asize / tzb->zb_count),
6890 	    (double)tzb->zb_lsize / tzb->zb_asize);
6891 	(void) printf("\t%-16s %14llu    ref>1: %6llu   deduplication: %6.2f\n",
6892 	    "bp deduped:", (u_longlong_t)zcb->zcb_dedup_asize,
6893 	    (u_longlong_t)zcb->zcb_dedup_blocks,
6894 	    (double)zcb->zcb_dedup_asize / tzb->zb_asize + 1.0);
6895 	(void) printf("\t%-16s %14llu    count: %6llu\n",
6896 	    "bp cloned:", (u_longlong_t)zcb->zcb_clone_asize,
6897 	    (u_longlong_t)zcb->zcb_clone_blocks);
6898 	(void) printf("\t%-16s %14llu     used: %5.2f%%\n", "Normal class:",
6899 	    (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
6900 
6901 	if (spa_special_class(spa)->mc_allocator[0].mca_rotor != NULL) {
6902 		uint64_t alloc = metaslab_class_get_alloc(
6903 		    spa_special_class(spa));
6904 		uint64_t space = metaslab_class_get_space(
6905 		    spa_special_class(spa));
6906 
6907 		(void) printf("\t%-16s %14llu     used: %5.2f%%\n",
6908 		    "Special class", (u_longlong_t)alloc,
6909 		    100.0 * alloc / space);
6910 	}
6911 
6912 	if (spa_dedup_class(spa)->mc_allocator[0].mca_rotor != NULL) {
6913 		uint64_t alloc = metaslab_class_get_alloc(
6914 		    spa_dedup_class(spa));
6915 		uint64_t space = metaslab_class_get_space(
6916 		    spa_dedup_class(spa));
6917 
6918 		(void) printf("\t%-16s %14llu     used: %5.2f%%\n",
6919 		    "Dedup class", (u_longlong_t)alloc,
6920 		    100.0 * alloc / space);
6921 	}
6922 
6923 	if (spa_embedded_log_class(spa)->mc_allocator[0].mca_rotor != NULL) {
6924 		uint64_t alloc = metaslab_class_get_alloc(
6925 		    spa_embedded_log_class(spa));
6926 		uint64_t space = metaslab_class_get_space(
6927 		    spa_embedded_log_class(spa));
6928 
6929 		(void) printf("\t%-16s %14llu     used: %5.2f%%\n",
6930 		    "Embedded log class", (u_longlong_t)alloc,
6931 		    100.0 * alloc / space);
6932 	}
6933 
6934 	for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
6935 		if (zcb->zcb_embedded_blocks[i] == 0)
6936 			continue;
6937 		(void) printf("\n");
6938 		(void) printf("\tadditional, non-pointer bps of type %u: "
6939 		    "%10llu\n",
6940 		    i, (u_longlong_t)zcb->zcb_embedded_blocks[i]);
6941 
6942 		if (dump_opt['b'] >= 3) {
6943 			(void) printf("\t number of (compressed) bytes:  "
6944 			    "number of bps\n");
6945 			dump_histogram(zcb->zcb_embedded_histogram[i],
6946 			    sizeof (zcb->zcb_embedded_histogram[i]) /
6947 			    sizeof (zcb->zcb_embedded_histogram[i][0]), 0);
6948 		}
6949 	}
6950 
6951 	if (tzb->zb_ditto_samevdev != 0) {
6952 		(void) printf("\tDittoed blocks on same vdev: %llu\n",
6953 		    (longlong_t)tzb->zb_ditto_samevdev);
6954 	}
6955 	if (tzb->zb_ditto_same_ms != 0) {
6956 		(void) printf("\tDittoed blocks in same metaslab: %llu\n",
6957 		    (longlong_t)tzb->zb_ditto_same_ms);
6958 	}
6959 
6960 	for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) {
6961 		vdev_t *vd = spa->spa_root_vdev->vdev_child[v];
6962 		vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
6963 
6964 		if (vim == NULL) {
6965 			continue;
6966 		}
6967 
6968 		char mem[32];
6969 		zdb_nicenum(vdev_indirect_mapping_num_entries(vim),
6970 		    mem, vdev_indirect_mapping_size(vim));
6971 
6972 		(void) printf("\tindirect vdev id %llu has %llu segments "
6973 		    "(%s in memory)\n",
6974 		    (longlong_t)vd->vdev_id,
6975 		    (longlong_t)vdev_indirect_mapping_num_entries(vim), mem);
6976 	}
6977 
6978 	if (dump_opt['b'] >= 2) {
6979 		int l, t, level;
6980 		char csize[32], lsize[32], psize[32], asize[32];
6981 		char avg[32], gang[32];
6982 		(void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
6983 		    "\t  avg\t comp\t%%Total\tType\n");
6984 
6985 		zfs_blkstat_t *mdstats = umem_zalloc(sizeof (zfs_blkstat_t),
6986 		    UMEM_NOFAIL);
6987 
6988 		for (t = 0; t <= ZDB_OT_TOTAL; t++) {
6989 			const char *typename;
6990 
6991 			/* make sure nicenum has enough space */
6992 			_Static_assert(sizeof (csize) >= NN_NUMBUF_SZ,
6993 			    "csize truncated");
6994 			_Static_assert(sizeof (lsize) >= NN_NUMBUF_SZ,
6995 			    "lsize truncated");
6996 			_Static_assert(sizeof (psize) >= NN_NUMBUF_SZ,
6997 			    "psize truncated");
6998 			_Static_assert(sizeof (asize) >= NN_NUMBUF_SZ,
6999 			    "asize truncated");
7000 			_Static_assert(sizeof (avg) >= NN_NUMBUF_SZ,
7001 			    "avg truncated");
7002 			_Static_assert(sizeof (gang) >= NN_NUMBUF_SZ,
7003 			    "gang truncated");
7004 
7005 			if (t < DMU_OT_NUMTYPES)
7006 				typename = dmu_ot[t].ot_name;
7007 			else
7008 				typename = zdb_ot_extname[t - DMU_OT_NUMTYPES];
7009 
7010 			if (zcb->zcb_type[ZB_TOTAL][t].zb_asize == 0) {
7011 				(void) printf("%6s\t%5s\t%5s\t%5s"
7012 				    "\t%5s\t%5s\t%6s\t%s\n",
7013 				    "-",
7014 				    "-",
7015 				    "-",
7016 				    "-",
7017 				    "-",
7018 				    "-",
7019 				    "-",
7020 				    typename);
7021 				continue;
7022 			}
7023 
7024 			for (l = ZB_TOTAL - 1; l >= -1; l--) {
7025 				level = (l == -1 ? ZB_TOTAL : l);
7026 				zb = &zcb->zcb_type[level][t];
7027 
7028 				if (zb->zb_asize == 0)
7029 					continue;
7030 
7031 				if (level != ZB_TOTAL && t < DMU_OT_NUMTYPES &&
7032 				    (level > 0 || DMU_OT_IS_METADATA(t))) {
7033 					mdstats->zb_count += zb->zb_count;
7034 					mdstats->zb_lsize += zb->zb_lsize;
7035 					mdstats->zb_psize += zb->zb_psize;
7036 					mdstats->zb_asize += zb->zb_asize;
7037 					mdstats->zb_gangs += zb->zb_gangs;
7038 				}
7039 
7040 				if (dump_opt['b'] < 3 && level != ZB_TOTAL)
7041 					continue;
7042 
7043 				if (level == 0 && zb->zb_asize ==
7044 				    zcb->zcb_type[ZB_TOTAL][t].zb_asize)
7045 					continue;
7046 
7047 				zdb_nicenum(zb->zb_count, csize,
7048 				    sizeof (csize));
7049 				zdb_nicenum(zb->zb_lsize, lsize,
7050 				    sizeof (lsize));
7051 				zdb_nicenum(zb->zb_psize, psize,
7052 				    sizeof (psize));
7053 				zdb_nicenum(zb->zb_asize, asize,
7054 				    sizeof (asize));
7055 				zdb_nicenum(zb->zb_asize / zb->zb_count, avg,
7056 				    sizeof (avg));
7057 				zdb_nicenum(zb->zb_gangs, gang, sizeof (gang));
7058 
7059 				(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
7060 				    "\t%5.2f\t%6.2f\t",
7061 				    csize, lsize, psize, asize, avg,
7062 				    (double)zb->zb_lsize / zb->zb_psize,
7063 				    100.0 * zb->zb_asize / tzb->zb_asize);
7064 
7065 				if (level == ZB_TOTAL)
7066 					(void) printf("%s\n", typename);
7067 				else
7068 					(void) printf("    L%d %s\n",
7069 					    level, typename);
7070 
7071 				if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) {
7072 					(void) printf("\t number of ganged "
7073 					    "blocks: %s\n", gang);
7074 				}
7075 
7076 				if (dump_opt['b'] >= 4) {
7077 					(void) printf("psize "
7078 					    "(in 512-byte sectors): "
7079 					    "number of blocks\n");
7080 					dump_histogram(zb->zb_psize_histogram,
7081 					    PSIZE_HISTO_SIZE, 0);
7082 				}
7083 			}
7084 		}
7085 		zdb_nicenum(mdstats->zb_count, csize,
7086 		    sizeof (csize));
7087 		zdb_nicenum(mdstats->zb_lsize, lsize,
7088 		    sizeof (lsize));
7089 		zdb_nicenum(mdstats->zb_psize, psize,
7090 		    sizeof (psize));
7091 		zdb_nicenum(mdstats->zb_asize, asize,
7092 		    sizeof (asize));
7093 		zdb_nicenum(mdstats->zb_asize / mdstats->zb_count, avg,
7094 		    sizeof (avg));
7095 		zdb_nicenum(mdstats->zb_gangs, gang, sizeof (gang));
7096 
7097 		(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
7098 		    "\t%5.2f\t%6.2f\t",
7099 		    csize, lsize, psize, asize, avg,
7100 		    (double)mdstats->zb_lsize / mdstats->zb_psize,
7101 		    100.0 * mdstats->zb_asize / tzb->zb_asize);
7102 		(void) printf("%s\n", "Metadata Total");
7103 
7104 		/* Output a table summarizing block sizes in the pool */
7105 		if (dump_opt['b'] >= 2) {
7106 			dump_size_histograms(zcb);
7107 		}
7108 
7109 		umem_free(mdstats, sizeof (zfs_blkstat_t));
7110 	}
7111 
7112 	(void) printf("\n");
7113 
7114 	if (leaks) {
7115 		umem_free(zcb, sizeof (zdb_cb_t));
7116 		return (2);
7117 	}
7118 
7119 	if (zcb->zcb_haderrors) {
7120 		umem_free(zcb, sizeof (zdb_cb_t));
7121 		return (3);
7122 	}
7123 
7124 	umem_free(zcb, sizeof (zdb_cb_t));
7125 	return (0);
7126 }
7127 
/*
 * One entry of the simulated dedup table built by dump_simulated_ddt():
 * all blocks whose ddt_key (derived from the bp by ddt_key_fill()) match
 * are accumulated into a single entry.
 */
typedef struct zdb_ddt_entry {
	/* key must be first for ddt_key_compare */
	ddt_key_t	zdde_key;
	uint64_t	zdde_ref_blocks;	/* # of bps sharing this key */
	uint64_t	zdde_ref_lsize;		/* sum of logical sizes */
	uint64_t	zdde_ref_psize;		/* sum of physical sizes */
	uint64_t	zdde_ref_dsize;		/* sum of deflated sizes */
	avl_node_t	zdde_node;		/* linkage in the AVL tree */
} zdb_ddt_entry_t;
7137 
7138 static int
7139 zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
7140     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
7141 {
7142 	(void) zilog, (void) dnp;
7143 	avl_tree_t *t = arg;
7144 	avl_index_t where;
7145 	zdb_ddt_entry_t *zdde, zdde_search;
7146 
7147 	if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||
7148 	    BP_IS_EMBEDDED(bp))
7149 		return (0);
7150 
7151 	if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
7152 		(void) printf("traversing objset %llu, %llu objects, "
7153 		    "%lu blocks so far\n",
7154 		    (u_longlong_t)zb->zb_objset,
7155 		    (u_longlong_t)BP_GET_FILL(bp),
7156 		    avl_numnodes(t));
7157 	}
7158 
7159 	if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF ||
7160 	    BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
7161 		return (0);
7162 
7163 	ddt_key_fill(&zdde_search.zdde_key, bp);
7164 
7165 	zdde = avl_find(t, &zdde_search, &where);
7166 
7167 	if (zdde == NULL) {
7168 		zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL);
7169 		zdde->zdde_key = zdde_search.zdde_key;
7170 		avl_insert(t, zdde, where);
7171 	}
7172 
7173 	zdde->zdde_ref_blocks += 1;
7174 	zdde->zdde_ref_lsize += BP_GET_LSIZE(bp);
7175 	zdde->zdde_ref_psize += BP_GET_PSIZE(bp);
7176 	zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp);
7177 
7178 	return (0);
7179 }
7180 
7181 static void
7182 dump_simulated_ddt(spa_t *spa)
7183 {
7184 	avl_tree_t t;
7185 	void *cookie = NULL;
7186 	zdb_ddt_entry_t *zdde;
7187 	ddt_histogram_t ddh_total = {{{0}}};
7188 	ddt_stat_t dds_total = {0};
7189 
7190 	avl_create(&t, ddt_key_compare,
7191 	    sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node));
7192 
7193 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
7194 
7195 	(void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
7196 	    TRAVERSE_NO_DECRYPT, zdb_ddt_add_cb, &t);
7197 
7198 	spa_config_exit(spa, SCL_CONFIG, FTAG);
7199 
7200 	while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) {
7201 		ddt_stat_t dds;
7202 		uint64_t refcnt = zdde->zdde_ref_blocks;
7203 		ASSERT(refcnt != 0);
7204 
7205 		dds.dds_blocks = zdde->zdde_ref_blocks / refcnt;
7206 		dds.dds_lsize = zdde->zdde_ref_lsize / refcnt;
7207 		dds.dds_psize = zdde->zdde_ref_psize / refcnt;
7208 		dds.dds_dsize = zdde->zdde_ref_dsize / refcnt;
7209 
7210 		dds.dds_ref_blocks = zdde->zdde_ref_blocks;
7211 		dds.dds_ref_lsize = zdde->zdde_ref_lsize;
7212 		dds.dds_ref_psize = zdde->zdde_ref_psize;
7213 		dds.dds_ref_dsize = zdde->zdde_ref_dsize;
7214 
7215 		ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1],
7216 		    &dds, 0);
7217 
7218 		umem_free(zdde, sizeof (*zdde));
7219 	}
7220 
7221 	avl_destroy(&t);
7222 
7223 	ddt_histogram_stat(&dds_total, &ddh_total);
7224 
7225 	(void) printf("Simulated DDT histogram:\n");
7226 
7227 	zpool_dump_ddt(&dds_total, &ddh_total);
7228 
7229 	dump_dedup_ratio(&dds_total);
7230 }
7231 
/*
 * Cross-check the on-disk refcounts of the device_removal and
 * obsolete_counts features against the number of objects zdb can
 * actually find that should contribute to each count.
 *
 * Returns 0 when both refcounts match, 1 otherwise (details are
 * printed either way).
 */
static int
verify_device_removal_feature_counts(spa_t *spa)
{
	uint64_t dr_feature_refcount = 0;
	uint64_t oc_feature_refcount = 0;
	uint64_t indirect_vdev_count = 0;
	uint64_t precise_vdev_count = 0;
	uint64_t obsolete_counts_object_count = 0;
	uint64_t obsolete_sm_count = 0;
	uint64_t obsolete_counts_count = 0;
	uint64_t scip_count = 0;
	uint64_t obsolete_bpobj_count = 0;
	int ret = 0;

	/*
	 * If an indirect-vdev condense is in progress, its new mapping
	 * object and previous obsolete space map each hold a refcount.
	 */
	spa_condensing_indirect_phys_t *scip =
	    &spa->spa_condensing_indirect_phys;
	if (scip->scip_next_mapping_object != 0) {
		vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev];
		ASSERT(scip->scip_prev_obsolete_sm_object != 0);
		ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);

		(void) printf("Condensing indirect vdev %llu: new mapping "
		    "object %llu, prev obsolete sm %llu\n",
		    (u_longlong_t)scip->scip_vdev,
		    (u_longlong_t)scip->scip_next_mapping_object,
		    (u_longlong_t)scip->scip_prev_obsolete_sm_object);
		if (scip->scip_prev_obsolete_sm_object != 0) {
			space_map_t *prev_obsolete_sm = NULL;
			VERIFY0(space_map_open(&prev_obsolete_sm,
			    spa->spa_meta_objset,
			    scip->scip_prev_obsolete_sm_object,
			    0, vd->vdev_asize, 0));
			dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm);
			(void) printf("\n");
			space_map_close(prev_obsolete_sm);
		}

		/* one for the new mapping, one for the prev obsolete sm */
		scip_count += 2;
	}

	/*
	 * Walk the top-level vdevs counting indirect mappings, precise
	 * obsolete counts, and obsolete space maps.
	 */
	for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
		vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
		vdev_indirect_config_t *vic = &vd->vdev_indirect_config;

		if (vic->vic_mapping_object != 0) {
			ASSERT(vd->vdev_ops == &vdev_indirect_ops ||
			    vd->vdev_removing);
			indirect_vdev_count++;

			if (vd->vdev_indirect_mapping->vim_havecounts) {
				obsolete_counts_count++;
			}
		}

		boolean_t are_precise;
		VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
		if (are_precise) {
			ASSERT(vic->vic_mapping_object != 0);
			precise_vdev_count++;
		}

		uint64_t obsolete_sm_object;
		VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
		if (obsolete_sm_object != 0) {
			ASSERT(vic->vic_mapping_object != 0);
			obsolete_sm_count++;
		}
	}

	(void) feature_get_refcount(spa,
	    &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL],
	    &dr_feature_refcount);
	(void) feature_get_refcount(spa,
	    &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS],
	    &oc_feature_refcount);

	/* device_removal refcount == number of indirect vdevs */
	if (dr_feature_refcount != indirect_vdev_count) {
		ret = 1;
		(void) printf("Number of indirect vdevs (%llu) " \
		    "does not match feature count (%llu)\n",
		    (u_longlong_t)indirect_vdev_count,
		    (u_longlong_t)dr_feature_refcount);
	} else {
		(void) printf("Verified device_removal feature refcount " \
		    "of %llu is correct\n",
		    (u_longlong_t)dr_feature_refcount);
	}

	/* the pool-wide obsolete bpobj, if present, also holds a refcount */
	if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_OBSOLETE_BPOBJ) == 0) {
		obsolete_bpobj_count++;
	}


	/*
	 * obsolete_counts refcount == sum of every object class counted
	 * above plus the remap deadlists counted during dataset traversal.
	 */
	obsolete_counts_object_count = precise_vdev_count;
	obsolete_counts_object_count += obsolete_sm_count;
	obsolete_counts_object_count += obsolete_counts_count;
	obsolete_counts_object_count += scip_count;
	obsolete_counts_object_count += obsolete_bpobj_count;
	obsolete_counts_object_count += remap_deadlist_count;

	if (oc_feature_refcount != obsolete_counts_object_count) {
		ret = 1;
		(void) printf("Number of obsolete counts objects (%llu) " \
		    "does not match feature count (%llu)\n",
		    (u_longlong_t)obsolete_counts_object_count,
		    (u_longlong_t)oc_feature_refcount);
		(void) printf("pv:%llu os:%llu oc:%llu sc:%llu "
		    "ob:%llu rd:%llu\n",
		    (u_longlong_t)precise_vdev_count,
		    (u_longlong_t)obsolete_sm_count,
		    (u_longlong_t)obsolete_counts_count,
		    (u_longlong_t)scip_count,
		    (u_longlong_t)obsolete_bpobj_count,
		    (u_longlong_t)remap_deadlist_count);
	} else {
		(void) printf("Verified indirect_refcount feature refcount " \
		    "of %llu is correct\n",
		    (u_longlong_t)oc_feature_refcount);
	}
	return (ret);
}
7354 
7355 static void
7356 zdb_set_skip_mmp(char *target)
7357 {
7358 	spa_t *spa;
7359 
7360 	/*
7361 	 * Disable the activity check to allow examination of
7362 	 * active pools.
7363 	 */
7364 	mutex_enter(&spa_namespace_lock);
7365 	if ((spa = spa_lookup(target)) != NULL) {
7366 		spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP;
7367 	}
7368 	mutex_exit(&spa_namespace_lock);
7369 }
7370 
7371 #define	BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE"
7372 /*
7373  * Import the checkpointed state of the pool specified by the target
7374  * parameter as readonly. The function also accepts a pool config
7375  * as an optional parameter, else it attempts to infer the config by
7376  * the name of the target pool.
7377  *
7378  * Note that the checkpointed state's pool name will be the name of
7379  * the original pool with the above suffix appended to it. In addition,
7380  * if the target is not a pool name (e.g. a path to a dataset) then
7381  * the new_path parameter is populated with the updated path to
7382  * reflect the fact that we are looking into the checkpointed state.
7383  *
7384  * The function returns a newly-allocated copy of the name of the
7385  * pool containing the checkpointed state. When this copy is no
7386  * longer needed it should be freed with free(3C). Same thing
7387  * applies to the new_path parameter if allocated.
7388  */
7389 static char *
7390 import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
7391 {
7392 	int error = 0;
7393 	char *poolname, *bogus_name = NULL;
7394 	boolean_t freecfg = B_FALSE;
7395 
7396 	/* If the target is not a pool, the extract the pool name */
7397 	char *path_start = strchr(target, '/');
7398 	if (path_start != NULL) {
7399 		size_t poolname_len = path_start - target;
7400 		poolname = strndup(target, poolname_len);
7401 	} else {
7402 		poolname = target;
7403 	}
7404 
7405 	if (cfg == NULL) {
7406 		zdb_set_skip_mmp(poolname);
7407 		error = spa_get_stats(poolname, &cfg, NULL, 0);
7408 		if (error != 0) {
7409 			fatal("Tried to read config of pool \"%s\" but "
7410 			    "spa_get_stats() failed with error %d\n",
7411 			    poolname, error);
7412 		}
7413 		freecfg = B_TRUE;
7414 	}
7415 
7416 	if (asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1) {
7417 		if (target != poolname)
7418 			free(poolname);
7419 		return (NULL);
7420 	}
7421 	fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name);
7422 
7423 	error = spa_import(bogus_name, cfg, NULL,
7424 	    ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT |
7425 	    ZFS_IMPORT_SKIP_MMP);
7426 	if (freecfg)
7427 		nvlist_free(cfg);
7428 	if (error != 0) {
7429 		fatal("Tried to import pool \"%s\" but spa_import() failed "
7430 		    "with error %d\n", bogus_name, error);
7431 	}
7432 
7433 	if (new_path != NULL && path_start != NULL) {
7434 		if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) {
7435 			free(bogus_name);
7436 			if (path_start != NULL)
7437 				free(poolname);
7438 			return (NULL);
7439 		}
7440 	}
7441 
7442 	if (target != poolname)
7443 		free(poolname);
7444 
7445 	return (bogus_name);
7446 }
7447 
/* Argument bundle for verify_checkpoint_sm_entry_cb(). */
typedef struct verify_checkpoint_sm_entry_cb_arg {
	vdev_t *vcsec_vd;	/* vdev whose checkpoint sm we're verifying */

	/* the following fields are only used for printing progress */
	uint64_t vcsec_entryid;
	uint64_t vcsec_num_entries;
} verify_checkpoint_sm_entry_cb_arg_t;
7455 
7456 #define	ENTRIES_PER_PROGRESS_UPDATE 10000
7457 
/*
 * space_map_iterate() callback: verify that one entry of a vdev's
 * checkpoint space map lies within its metaslab and is NOT present in
 * that metaslab's ms_allocatable tree (i.e. it is still allocated in
 * the checkpointed state).  Always returns 0 to continue iteration.
 */
static int
verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg)
{
	verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg;
	vdev_t *vd = vcsec->vcsec_vd;
	metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
	uint64_t end = sme->sme_offset + sme->sme_run;

	ASSERT(sme->sme_type == SM_FREE);

	/* Periodic progress output so long verifications show life. */
	if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) {
		(void) fprintf(stderr,
		    "\rverifying vdev %llu, space map entry %llu of %llu ...",
		    (longlong_t)vd->vdev_id,
		    (longlong_t)vcsec->vcsec_entryid,
		    (longlong_t)vcsec->vcsec_num_entries);
	}
	vcsec->vcsec_entryid++;

	/*
	 * See comment in checkpoint_sm_exclude_entry_cb()
	 */
	VERIFY3U(sme->sme_offset, >=, ms->ms_start);
	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);

	/*
	 * The entries in the vdev_checkpoint_sm should be marked as
	 * allocated in the checkpointed state of the pool, therefore
	 * their respective ms_allocatable trees should not contain them.
	 */
	mutex_enter(&ms->ms_lock);
	range_tree_verify_not_present(ms->ms_allocatable,
	    sme->sme_offset, sme->sme_run);
	mutex_exit(&ms->ms_lock);

	return (0);
}
7495 
7496 /*
7497  * Verify that all segments in the vdev_checkpoint_sm are allocated
7498  * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's
7499  * ms_allocatable).
7500  *
7501  * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of
7502  * each vdev in the current state of the pool to the metaslab space maps
7503  * (ms_sm) of the checkpointed state of the pool.
7504  *
7505  * Note that the function changes the state of the ms_allocatable
7506  * trees of the current spa_t. The entries of these ms_allocatable
7507  * trees are cleared out and then repopulated from with the free
7508  * entries of their respective ms_sm space maps.
7509  */
static void
verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current)
{
	vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
	vdev_t *current_rvd = current->spa_root_vdev;

	/* Load the checkpointed state's free segments into ms_allocatable. */
	load_concrete_ms_allocatable_trees(checkpoint, SM_FREE);

	for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) {
		vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c];
		vdev_t *current_vd = current_rvd->vdev_child[c];

		space_map_t *checkpoint_sm = NULL;
		uint64_t checkpoint_sm_obj;

		if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
			/*
			 * Since we don't allow device removal in a pool
			 * that has a checkpoint, we expect that all removed
			 * vdevs were removed from the pool before the
			 * checkpoint.
			 */
			ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
			continue;
		}

		/*
		 * If the checkpoint space map doesn't exist, then nothing
		 * here is checkpointed so there's nothing to verify.
		 */
		if (current_vd->vdev_top_zap == 0 ||
		    zap_contains(spa_meta_objset(current),
		    current_vd->vdev_top_zap,
		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
			continue;

		/* Open the current pool's checkpoint sm for this vdev ... */
		VERIFY0(zap_lookup(spa_meta_objset(current),
		    current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
		    sizeof (uint64_t), 1, &checkpoint_sm_obj));

		VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current),
		    checkpoint_sm_obj, 0, current_vd->vdev_asize,
		    current_vd->vdev_ashift));

		/*
		 * ... and check each of its entries against the
		 * checkpointed state's metaslabs (note: vcsec_vd is the
		 * checkpoint's vdev, while the sm comes from the current
		 * state).
		 */
		verify_checkpoint_sm_entry_cb_arg_t vcsec;
		vcsec.vcsec_vd = ckpoint_vd;
		vcsec.vcsec_entryid = 0;
		vcsec.vcsec_num_entries =
		    space_map_length(checkpoint_sm) / sizeof (uint64_t);
		VERIFY0(space_map_iterate(checkpoint_sm,
		    space_map_length(checkpoint_sm),
		    verify_checkpoint_sm_entry_cb, &vcsec));
		if (dump_opt['m'] > 3)
			dump_spacemap(current->spa_meta_objset, checkpoint_sm);
		space_map_close(checkpoint_sm);
	}

	/*
	 * If we've added vdevs since we took the checkpoint, ensure
	 * that their checkpoint space maps are empty.
	 */
	if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) {
		for (uint64_t c = ckpoint_rvd->vdev_children;
		    c < current_rvd->vdev_children; c++) {
			vdev_t *current_vd = current_rvd->vdev_child[c];
			VERIFY3P(current_vd->vdev_checkpoint_sm, ==, NULL);
		}
	}

	/* for cleaner progress output */
	(void) fprintf(stderr, "\n");
}
7582 
7583 /*
7584  * Verifies that all space that's allocated in the checkpoint is
7585  * still allocated in the current version, by checking that everything
7586  * in checkpoint's ms_allocatable (which is actually allocated, not
7587  * allocatable/free) is not present in current's ms_allocatable.
7588  *
7589  * Note that the function changes the state of the ms_allocatable
7590  * trees of both spas when called. The entries of all ms_allocatable
7591  * trees are cleared out and then repopulated from their respective
7592  * ms_sm space maps. In the checkpointed state we load the allocated
7593  * entries, and in the current state we load the free entries.
7594  */
static void
verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current)
{
	vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev;
	vdev_t *current_rvd = current->spa_root_vdev;

	/* checkpoint trees hold allocated space, current trees hold free */
	load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC);
	load_concrete_ms_allocatable_trees(current, SM_FREE);

	for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) {
		vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i];
		vdev_t *current_vd = current_rvd->vdev_child[i];

		if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) {
			/*
			 * See comment in verify_checkpoint_vdev_spacemaps()
			 */
			ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops);
			continue;
		}

		for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) {
			metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m];
			metaslab_t *current_msp = current_vd->vdev_ms[m];

			(void) fprintf(stderr,
			    "\rverifying vdev %llu of %llu, "
			    "metaslab %llu of %llu ...",
			    (longlong_t)current_vd->vdev_id,
			    (longlong_t)current_rvd->vdev_children,
			    (longlong_t)current_vd->vdev_ms[m]->ms_id,
			    (longlong_t)current_vd->vdev_ms_count);

			/*
			 * We walk through the ms_allocatable trees that
			 * are loaded with the allocated blocks from the
			 * ms_sm spacemaps of the checkpoint. For each
			 * one of these ranges we ensure that none of them
			 * exists in the ms_allocatable trees of the
			 * current state which are loaded with the ranges
			 * that are currently free.
			 *
			 * This way we ensure that none of the blocks that
			 * are part of the checkpoint were freed by mistake.
			 */
			range_tree_walk(ckpoint_msp->ms_allocatable,
			    (range_tree_func_t *)range_tree_verify_not_present,
			    current_msp->ms_allocatable);
		}
	}

	/* for cleaner progress output */
	(void) fprintf(stderr, "\n");
}
7649 
7650 static void
7651 verify_checkpoint_blocks(spa_t *spa)
7652 {
7653 	ASSERT(!dump_opt['L']);
7654 
7655 	spa_t *checkpoint_spa;
7656 	char *checkpoint_pool;
7657 	int error = 0;
7658 
7659 	/*
7660 	 * We import the checkpointed state of the pool (under a different
7661 	 * name) so we can do verification on it against the current state
7662 	 * of the pool.
7663 	 */
7664 	checkpoint_pool = import_checkpointed_state(spa->spa_name, NULL,
7665 	    NULL);
7666 	ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0);
7667 
7668 	error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG);
7669 	if (error != 0) {
7670 		fatal("Tried to open pool \"%s\" but spa_open() failed with "
7671 		    "error %d\n", checkpoint_pool, error);
7672 	}
7673 
7674 	/*
7675 	 * Ensure that ranges in the checkpoint space maps of each vdev
7676 	 * are allocated according to the checkpointed state's metaslab
7677 	 * space maps.
7678 	 */
7679 	verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa);
7680 
7681 	/*
7682 	 * Ensure that allocated ranges in the checkpoint's metaslab
7683 	 * space maps remain allocated in the metaslab space maps of
7684 	 * the current state.
7685 	 */
7686 	verify_checkpoint_ms_spacemaps(checkpoint_spa, spa);
7687 
7688 	/*
7689 	 * Once we are done, we get rid of the checkpointed state.
7690 	 */
7691 	spa_close(checkpoint_spa, FTAG);
7692 	free(checkpoint_pool);
7693 }
7694 
7695 static void
7696 dump_leftover_checkpoint_blocks(spa_t *spa)
7697 {
7698 	vdev_t *rvd = spa->spa_root_vdev;
7699 
7700 	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
7701 		vdev_t *vd = rvd->vdev_child[i];
7702 
7703 		space_map_t *checkpoint_sm = NULL;
7704 		uint64_t checkpoint_sm_obj;
7705 
7706 		if (vd->vdev_top_zap == 0)
7707 			continue;
7708 
7709 		if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
7710 		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
7711 			continue;
7712 
7713 		VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
7714 		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
7715 		    sizeof (uint64_t), 1, &checkpoint_sm_obj));
7716 
7717 		VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
7718 		    checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
7719 		dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
7720 		space_map_close(checkpoint_sm);
7721 	}
7722 }
7723 
/*
 * Top-level checkpoint verification.  If the pool_checkpoint feature is
 * active, look up the checkpointed uberblock in the MOS and sanity-check
 * it.  Unless -L (no leak verification) was given, also verify that all
 * blocks referenced by the checkpointed state remain allocated in the
 * current state.  Returns 0 on success or a non-zero error code.
 */
static int
verify_checkpoint(spa_t *spa)
{
	uberblock_t checkpoint;
	int error;

	if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
		return (0);

	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
	    sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);

	if (error == ENOENT && !dump_opt['L']) {
		/*
		 * If the feature is active but the uberblock is missing
		 * then we must be in the middle of discarding the
		 * checkpoint.
		 */
		(void) printf("\nPartially discarded checkpoint "
		    "state found:\n");
		if (dump_opt['m'] > 3)
			dump_leftover_checkpoint_blocks(spa);
		return (0);
	} else if (error != 0) {
		(void) printf("lookup error %d when looking for "
		    "checkpointed uberblock in MOS\n", error);
		return (error);
	}
	dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n");

	if (checkpoint.ub_checkpoint_txg == 0) {
		/* A valid checkpoint must record the txg it was taken in */
		(void) printf("\nub_checkpoint_txg not set in checkpointed "
		    "uberblock\n");
		error = 3;
	}

	if (error == 0 && !dump_opt['L'])
		verify_checkpoint_blocks(spa);

	return (error);
}
7766 
7767 static void
7768 mos_leaks_cb(void *arg, uint64_t start, uint64_t size)
7769 {
7770 	(void) arg;
7771 	for (uint64_t i = start; i < size; i++) {
7772 		(void) printf("MOS object %llu referenced but not allocated\n",
7773 		    (u_longlong_t)i);
7774 	}
7775 }
7776 
7777 static void
7778 mos_obj_refd(uint64_t obj)
7779 {
7780 	if (obj != 0 && mos_refd_objs != NULL)
7781 		range_tree_add(mos_refd_objs, obj, 1);
7782 }
7783 
7784 /*
7785  * Call on a MOS object that may already have been referenced.
7786  */
7787 static void
7788 mos_obj_refd_multiple(uint64_t obj)
7789 {
7790 	if (obj != 0 && mos_refd_objs != NULL &&
7791 	    !range_tree_contains(mos_refd_objs, obj, 1))
7792 		range_tree_add(mos_refd_objs, obj, 1);
7793 }
7794 
7795 static void
7796 mos_leak_vdev_top_zap(vdev_t *vd)
7797 {
7798 	uint64_t ms_flush_data_obj;
7799 	int error = zap_lookup(spa_meta_objset(vd->vdev_spa),
7800 	    vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
7801 	    sizeof (ms_flush_data_obj), 1, &ms_flush_data_obj);
7802 	if (error == ENOENT)
7803 		return;
7804 	ASSERT0(error);
7805 
7806 	mos_obj_refd(ms_flush_data_obj);
7807 }
7808 
/*
 * Recursively mark all MOS objects reachable from vdev `vd` (DTL,
 * metaslab array, indirect-mapping state, per-vdev ZAPs, space maps)
 * as referenced, so dump_mos_leaks() does not report them as leaked.
 */
static void
mos_leak_vdev(vdev_t *vd)
{
	mos_obj_refd(vd->vdev_dtl_object);
	mos_obj_refd(vd->vdev_ms_array);
	mos_obj_refd(vd->vdev_indirect_config.vic_births_object);
	mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object);
	mos_obj_refd(vd->vdev_leaf_zap);
	if (vd->vdev_checkpoint_sm != NULL)
		mos_obj_refd(vd->vdev_checkpoint_sm->sm_object);
	if (vd->vdev_indirect_mapping != NULL) {
		mos_obj_refd(vd->vdev_indirect_mapping->
		    vim_phys->vimp_counts_object);
	}
	if (vd->vdev_obsolete_sm != NULL)
		mos_obj_refd(vd->vdev_obsolete_sm->sm_object);

	/* Each metaslab's space map is itself a MOS object */
	for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *ms = vd->vdev_ms[m];
		mos_obj_refd(space_map_object(ms->ms_sm));
	}

	if (vd->vdev_root_zap != 0)
		mos_obj_refd(vd->vdev_root_zap);

	if (vd->vdev_top_zap != 0) {
		mos_obj_refd(vd->vdev_top_zap);
		mos_leak_vdev_top_zap(vd);
	}

	/* Recurse so the entire vdev tree is covered */
	for (uint64_t c = 0; c < vd->vdev_children; c++) {
		mos_leak_vdev(vd->vdev_child[c]);
	}
}
7843 
7844 static void
7845 mos_leak_log_spacemaps(spa_t *spa)
7846 {
7847 	uint64_t spacemap_zap;
7848 	int error = zap_lookup(spa_meta_objset(spa),
7849 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP,
7850 	    sizeof (spacemap_zap), 1, &spacemap_zap);
7851 	if (error == ENOENT)
7852 		return;
7853 	ASSERT0(error);
7854 
7855 	mos_obj_refd(spacemap_zap);
7856 	for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
7857 	    sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls))
7858 		mos_obj_refd(sls->sls_sm_obj);
7859 }
7860 
7861 static void
7862 errorlog_count_refd(objset_t *mos, uint64_t errlog)
7863 {
7864 	zap_cursor_t zc;
7865 	zap_attribute_t za;
7866 	for (zap_cursor_init(&zc, mos, errlog);
7867 	    zap_cursor_retrieve(&zc, &za) == 0;
7868 	    zap_cursor_advance(&zc)) {
7869 		mos_obj_refd(za.za_first_integer);
7870 	}
7871 	zap_cursor_fini(&zc);
7872 }
7873 
/*
 * MOS leak detection.  First mark every MOS object reachable from pool
 * metadata as "referenced" (in mos_refd_objs), then walk all allocated
 * MOS objects: any allocated-but-unreferenced object is reported as
 * leaked, and any referenced-but-unallocated object is reported as a
 * dangling reference.  Returns 0 if clean, 2 otherwise.  Consumes and
 * destroys mos_refd_objs.
 */
static int
dump_mos_leaks(spa_t *spa)
{
	int rv = 0;
	objset_t *mos = spa->spa_meta_objset;
	dsl_pool_t *dp = spa->spa_dsl_pool;

	/* Visit and mark all referenced objects in the MOS */

	mos_obj_refd(DMU_POOL_DIRECTORY_OBJECT);
	mos_obj_refd(spa->spa_pool_props_object);
	mos_obj_refd(spa->spa_config_object);
	mos_obj_refd(spa->spa_ddt_stat_object);
	mos_obj_refd(spa->spa_feat_desc_obj);
	mos_obj_refd(spa->spa_feat_enabled_txg_obj);
	mos_obj_refd(spa->spa_feat_for_read_obj);
	mos_obj_refd(spa->spa_feat_for_write_obj);
	mos_obj_refd(spa->spa_history);
	mos_obj_refd(spa->spa_errlog_last);
	mos_obj_refd(spa->spa_errlog_scrub);

	/* With head_errlog, the error logs reference per-head objects too */
	if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
		errorlog_count_refd(mos, spa->spa_errlog_last);
		errorlog_count_refd(mos, spa->spa_errlog_scrub);
	}

	mos_obj_refd(spa->spa_all_vdev_zaps);
	mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj);
	mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj);
	mos_obj_refd(spa->spa_dsl_pool->dp_scan->scn_phys.scn_queue_obj);
	bpobj_count_refd(&spa->spa_deferred_bpobj);
	mos_obj_refd(dp->dp_empty_bpobj);
	bpobj_count_refd(&dp->dp_obsolete_bpobj);
	bpobj_count_refd(&dp->dp_free_bpobj);
	mos_obj_refd(spa->spa_l2cache.sav_object);
	mos_obj_refd(spa->spa_spares.sav_object);

	/* Log space map objects (and the in-flight syncing one, if any) */
	if (spa->spa_syncing_log_sm != NULL)
		mos_obj_refd(spa->spa_syncing_log_sm->sm_object);
	mos_leak_log_spacemaps(spa);

	/* Indirect-vdev condensing state */
	mos_obj_refd(spa->spa_condensing_indirect_phys.
	    scip_next_mapping_object);
	mos_obj_refd(spa->spa_condensing_indirect_phys.
	    scip_prev_obsolete_sm_object);
	if (spa->spa_condensing_indirect_phys.scip_next_mapping_object != 0) {
		vdev_indirect_mapping_t *vim =
		    vdev_indirect_mapping_open(mos,
		    spa->spa_condensing_indirect_phys.scip_next_mapping_object);
		mos_obj_refd(vim->vim_phys->vimp_counts_object);
		vdev_indirect_mapping_close(vim);
	}
	deleted_livelists_dump_mos(spa);

	/* The origin snapshot and its next-snapshot are counted explicitly */
	if (dp->dp_origin_snap != NULL) {
		dsl_dataset_t *ds;

		dsl_pool_config_enter(dp, FTAG);
		VERIFY0(dsl_dataset_hold_obj(dp,
		    dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj,
		    FTAG, &ds));
		count_ds_mos_objects(ds);
		dump_blkptr_list(&ds->ds_deadlist, "Deadlist");
		dsl_dataset_rele(ds, FTAG);
		dsl_pool_config_exit(dp, FTAG);

		count_ds_mos_objects(dp->dp_origin_snap);
		dump_blkptr_list(&dp->dp_origin_snap->ds_deadlist, "Deadlist");
	}
	count_dir_mos_objects(dp->dp_mos_dir);
	if (dp->dp_free_dir != NULL)
		count_dir_mos_objects(dp->dp_free_dir);
	if (dp->dp_leak_dir != NULL)
		count_dir_mos_objects(dp->dp_leak_dir);

	mos_leak_vdev(spa->spa_root_vdev);

	/* DDT objects, one per (checksum, type, class) combination */
	for (uint64_t class = 0; class < DDT_CLASSES; class++) {
		for (uint64_t type = 0; type < DDT_TYPES; type++) {
			for (uint64_t cksum = 0;
			    cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) {
				ddt_t *ddt = spa->spa_ddt[cksum];
				if (!ddt)
					continue;
				mos_obj_refd(ddt->ddt_object[type][class]);
			}
		}
	}

	/* Block reference table (block cloning) objects */
	if (spa->spa_brt != NULL) {
		brt_t *brt = spa->spa_brt;
		for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
			brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
			if (brtvd != NULL && brtvd->bv_initiated) {
				mos_obj_refd(brtvd->bv_mos_brtvdev);
				mos_obj_refd(brtvd->bv_mos_entries);
			}
		}
	}

	/*
	 * Visit all allocated objects and make sure they are referenced.
	 */
	uint64_t object = 0;
	while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) {
		if (range_tree_contains(mos_refd_objs, object, 1)) {
			range_tree_remove(mos_refd_objs, object, 1);
		} else {
			/* Allocated but never marked: a leak */
			dmu_object_info_t doi;
			const char *name;
			VERIFY0(dmu_object_info(mos, object, &doi));
			if (doi.doi_type & DMU_OT_NEWTYPE) {
				dmu_object_byteswap_t bswap =
				    DMU_OT_BYTESWAP(doi.doi_type);
				name = dmu_ot_byteswap[bswap].ob_name;
			} else {
				name = dmu_ot[doi.doi_type].ot_name;
			}

			(void) printf("MOS object %llu (%s) leaked\n",
			    (u_longlong_t)object, name);
			rv = 2;
		}
	}
	/* Anything left in the tree was referenced but never allocated */
	(void) range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL);
	if (!range_tree_is_empty(mos_refd_objs))
		rv = 2;
	range_tree_vacate(mos_refd_objs, NULL, NULL);
	range_tree_destroy(mos_refd_objs);
	return (rv);
}
8005 
/*
 * Accumulator for dump_log_spacemap_obsolete_stats(): tracks, both for
 * the log currently being walked and across all logs, how many space
 * map entries are still relevant versus obsolete.
 */
typedef struct log_sm_obsolete_stats_arg {
	uint64_t lsos_current_txg;	/* txg of the log being counted */

	uint64_t lsos_total_entries;	/* entries seen across all logs */
	uint64_t lsos_valid_entries;	/* still-relevant entries, all logs */

	uint64_t lsos_sm_entries;	/* entries in the current log */
	uint64_t lsos_valid_sm_entries;	/* valid entries in current log */
} log_sm_obsolete_stats_arg_t;
8015 
/*
 * Callback for iterate_through_spacemap_logs().  Counts, per log (txg)
 * and in total, how many entries are still relevant.  An entry is
 * obsolete when its txg predates the metaslab's unflushed txg, i.e. it
 * has already been flushed into the metaslab's own space map.  Prints
 * the per-log line each time the walk moves on to a newer log.
 */
static int
log_spacemap_obsolete_stats_cb(spa_t *spa, space_map_entry_t *sme,
    uint64_t txg, void *arg)
{
	log_sm_obsolete_stats_arg_t *lsos = arg;

	uint64_t offset = sme->sme_offset;
	uint64_t vdev_id = sme->sme_vdev;

	if (lsos->lsos_current_txg == 0) {
		/* this is the first log */
		lsos->lsos_current_txg = txg;
	} else if (lsos->lsos_current_txg < txg) {
		/* we just changed log - print stats and reset */
		(void) printf("%-8llu valid entries out of %-8llu - txg %llu\n",
		    (u_longlong_t)lsos->lsos_valid_sm_entries,
		    (u_longlong_t)lsos->lsos_sm_entries,
		    (u_longlong_t)lsos->lsos_current_txg);
		lsos->lsos_valid_sm_entries = 0;
		lsos->lsos_sm_entries = 0;
		lsos->lsos_current_txg = txg;
	}
	ASSERT3U(lsos->lsos_current_txg, ==, txg);

	lsos->lsos_sm_entries++;
	lsos->lsos_total_entries++;

	/* Only concrete vdevs have metaslabs to check against */
	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
	if (!vdev_is_concrete(vd))
		return (0);

	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
	ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE);

	/* Entries older than the metaslab's unflushed txg are obsolete */
	if (txg < metaslab_unflushed_txg(ms))
		return (0);
	lsos->lsos_valid_sm_entries++;
	lsos->lsos_valid_entries++;
	return (0);
}
8056 
8057 static void
8058 dump_log_spacemap_obsolete_stats(spa_t *spa)
8059 {
8060 	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
8061 		return;
8062 
8063 	log_sm_obsolete_stats_arg_t lsos = {0};
8064 
8065 	(void) printf("Log Space Map Obsolete Entry Statistics:\n");
8066 
8067 	iterate_through_spacemap_logs(spa,
8068 	    log_spacemap_obsolete_stats_cb, &lsos);
8069 
8070 	/* print stats for latest log */
8071 	(void) printf("%-8llu valid entries out of %-8llu - txg %llu\n",
8072 	    (u_longlong_t)lsos.lsos_valid_sm_entries,
8073 	    (u_longlong_t)lsos.lsos_sm_entries,
8074 	    (u_longlong_t)lsos.lsos_current_txg);
8075 
8076 	(void) printf("%-8llu valid entries out of %-8llu - total\n\n",
8077 	    (u_longlong_t)lsos.lsos_valid_entries,
8078 	    (u_longlong_t)lsos.lsos_total_entries);
8079 }
8080 
/*
 * Top-level pool dump dispatcher.  Runs each requested dump/verify pass
 * in a fixed order, driven by the dump_opt[] option counters set from
 * the command line.  On any verification failure, dumps the debug
 * buffer and exits with the failing pass's return code.
 */
static void
dump_zpool(spa_t *spa)
{
	dsl_pool_t *dp = spa_get_dsl(spa);
	int rc = 0;

	if (dump_opt['y']) {
		livelist_metaslab_validate(spa);
	}

	/* -S: simulated DDT only; skip everything else */
	if (dump_opt['S']) {
		dump_simulated_ddt(spa);
		return;
	}

	if (!dump_opt['e'] && dump_opt['C'] > 1) {
		(void) printf("\nCached configuration:\n");
		dump_nvlist(spa->spa_config, 8);
	}

	if (dump_opt['C'])
		dump_config(spa);

	if (dump_opt['u'])
		dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n");

	if (dump_opt['D'])
		dump_all_ddts(spa);

	if (dump_opt['T'])
		dump_brt(spa);

	if (dump_opt['d'] > 2 || dump_opt['m'])
		dump_metaslabs(spa);
	if (dump_opt['M'])
		dump_metaslab_groups(spa, dump_opt['M'] > 1);
	if (dump_opt['d'] > 2 || dump_opt['m']) {
		dump_log_spacemaps(spa);
		dump_log_spacemap_obsolete_stats(spa);
	}

	if (dump_opt['d'] || dump_opt['i']) {
		/* `f` is used by the feature-refcount loop further below */
		spa_feature_t f;
		/* Leak tracker consumed (and destroyed) by dump_mos_leaks() */
		mos_refd_objs = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
		    0);
		dump_objset(dp->dp_meta_objset);

		if (dump_opt['d'] >= 3) {
			/* NB: shadows the outer `dp` (same pool pointer) */
			dsl_pool_t *dp = spa->spa_dsl_pool;
			dump_full_bpobj(&spa->spa_deferred_bpobj,
			    "Deferred frees", 0);
			if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
				dump_full_bpobj(&dp->dp_free_bpobj,
				    "Pool snapshot frees", 0);
			}
			if (bpobj_is_open(&dp->dp_obsolete_bpobj)) {
				ASSERT(spa_feature_is_enabled(spa,
				    SPA_FEATURE_DEVICE_REMOVAL));
				dump_full_bpobj(&dp->dp_obsolete_bpobj,
				    "Pool obsolete blocks", 0);
			}

			if (spa_feature_is_active(spa,
			    SPA_FEATURE_ASYNC_DESTROY)) {
				dump_bptree(spa->spa_meta_objset,
				    dp->dp_bptree_obj,
				    "Pool dataset frees");
			}
			dump_dtl(spa->spa_root_vdev, 0);
		}

		/*
		 * UINT64_MAX marks features we do not count; features whose
		 * consumers we do tally start at 0.
		 */
		for (spa_feature_t f = 0; f < SPA_FEATURES; f++)
			global_feature_count[f] = UINT64_MAX;
		global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0;
		global_feature_count[SPA_FEATURE_REDACTION_LIST_SPILL] = 0;
		global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0;
		global_feature_count[SPA_FEATURE_LIVELIST] = 0;

		(void) dmu_objset_find(spa_name(spa), dump_one_objset,
		    NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);

		if (rc == 0 && !dump_opt['L'])
			rc = dump_mos_leaks(spa);

		/* Cross-check counted consumers against on-disk refcounts */
		for (f = 0; f < SPA_FEATURES; f++) {
			uint64_t refcount;

			uint64_t *arr;
			if (!(spa_feature_table[f].fi_flags &
			    ZFEATURE_FLAG_PER_DATASET)) {
				if (global_feature_count[f] == UINT64_MAX)
					continue;
				if (!spa_feature_is_enabled(spa, f)) {
					ASSERT0(global_feature_count[f]);
					continue;
				}
				arr = global_feature_count;
			} else {
				if (!spa_feature_is_enabled(spa, f)) {
					ASSERT0(dataset_feature_count[f]);
					continue;
				}
				arr = dataset_feature_count;
			}
			if (feature_get_refcount(spa, &spa_feature_table[f],
			    &refcount) == ENOTSUP)
				continue;
			if (arr[f] != refcount) {
				(void) printf("%s feature refcount mismatch: "
				    "%lld consumers != %lld refcount\n",
				    spa_feature_table[f].fi_uname,
				    (longlong_t)arr[f], (longlong_t)refcount);
				rc = 2;
			} else {
				(void) printf("Verified %s feature refcount "
				    "of %llu is correct\n",
				    spa_feature_table[f].fi_uname,
				    (longlong_t)refcount);
			}
		}

		if (rc == 0)
			rc = verify_device_removal_feature_counts(spa);
	}

	if (rc == 0 && (dump_opt['b'] || dump_opt['c']))
		rc = dump_block_stats(spa);

	if (rc == 0)
		rc = verify_spacemap_refcounts(spa);

	if (dump_opt['s'])
		show_pool_stats(spa);

	if (dump_opt['h'])
		dump_history(spa);

	if (rc == 0)
		rc = verify_checkpoint(spa);

	if (rc != 0) {
		dump_debug_buffer();
		exit(rc);
	}
}
8226 
/*
 * Flag bits for zdb_read_block(); each corresponds to a single
 * character flag in the block specifier (see the comment above
 * zdb_read_block for the character -> meaning mapping).
 */
#define	ZDB_FLAG_CHECKSUM	0x0001	/* c: calculate/display checksums */
#define	ZDB_FLAG_DECOMPRESS	0x0002	/* d: decompress before dumping */
#define	ZDB_FLAG_BSWAP		0x0004	/* e: byteswap before dumping */
#define	ZDB_FLAG_GBH		0x0008	/* g: gang block header */
#define	ZDB_FLAG_INDIRECT	0x0010	/* i: indirect block */
#define	ZDB_FLAG_RAW		0x0020	/* r: raw dump to stdout */
#define	ZDB_FLAG_PRINT_BLKPTR	0x0040	/* b: decode blkptr at offset */
#define	ZDB_FLAG_VERBOSE	0x0080	/* v: verbose */

/* Maps each flag character to its ZDB_FLAG_* bit (0 = unrecognized) */
static int flagbits[256];
/* String holding the set of recognized flag characters */
static char flagbitstr[16];
8238 
/*
 * Pretty-print a single block pointer.  If ZDB_FLAG_BSWAP is set the
 * blkptr is byteswapped in place first -- note the cast drops const,
 * so this mutates the caller's buffer despite the const parameter.
 */
static void
zdb_print_blkptr(const blkptr_t *bp, int flags)
{
	char blkbuf[BP_SPRINTF_LEN];

	if (flags & ZDB_FLAG_BSWAP)
		byteswap_uint64_array((void *)bp, sizeof (blkptr_t));

	snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
	(void) printf("%s\n", blkbuf);
}
8250 
8251 static void
8252 zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)
8253 {
8254 	int i;
8255 
8256 	for (i = 0; i < nbps; i++)
8257 		zdb_print_blkptr(&bp[i], flags);
8258 }
8259 
/*
 * Dump a gang block header: it is an array of SPA_GBH_NBLKPTRS block
 * pointers, so reuse the indirect-block dumper.
 */
static void
zdb_dump_gbh(void *buf, int flags)
{
	zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags);
}
8265 
/*
 * Write the raw (optionally byteswapped) block contents to stdout.
 * NOTE(review): a short write (e.g. EINTR or a full pipe) trips the
 * VERIFY instead of retrying -- presumably acceptable for a debugging
 * tool, but confirm if zdb output is ever piped over slow consumers.
 */
static void
zdb_dump_block_raw(void *buf, uint64_t size, int flags)
{
	if (flags & ZDB_FLAG_BSWAP)
		byteswap_uint64_array(buf, size);
	VERIFY(write(fileno(stdout), buf, size) == size);
}
8273 
8274 static void
8275 zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
8276 {
8277 	uint64_t *d = (uint64_t *)buf;
8278 	unsigned nwords = size / sizeof (uint64_t);
8279 	int do_bswap = !!(flags & ZDB_FLAG_BSWAP);
8280 	unsigned i, j;
8281 	const char *hdr;
8282 	char *c;
8283 
8284 
8285 	if (do_bswap)
8286 		hdr = " 7 6 5 4 3 2 1 0   f e d c b a 9 8";
8287 	else
8288 		hdr = " 0 1 2 3 4 5 6 7   8 9 a b c d e f";
8289 
8290 	(void) printf("\n%s\n%6s   %s  0123456789abcdef\n", label, "", hdr);
8291 
8292 #ifdef _LITTLE_ENDIAN
8293 	/* correct the endianness */
8294 	do_bswap = !do_bswap;
8295 #endif
8296 	for (i = 0; i < nwords; i += 2) {
8297 		(void) printf("%06llx:  %016llx  %016llx  ",
8298 		    (u_longlong_t)(i * sizeof (uint64_t)),
8299 		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]),
8300 		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1]));
8301 
8302 		c = (char *)&d[i];
8303 		for (j = 0; j < 2 * sizeof (uint64_t); j++)
8304 			(void) printf("%c", isprint(c[j]) ? c[j] : '.');
8305 		(void) printf("\n");
8306 	}
8307 }
8308 
8309 /*
8310  * There are two acceptable formats:
8311  *	leaf_name	  - For example: c1t0d0 or /tmp/ztest.0a
8312  *	child[.child]*    - For example: 0.1.1
8313  *
8314  * The second form can be used to specify arbitrary vdevs anywhere
8315  * in the hierarchy.  For example, in a pool with a mirror of
8316  * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 .
8317  */
8318 static vdev_t *
8319 zdb_vdev_lookup(vdev_t *vdev, const char *path)
8320 {
8321 	char *s, *p, *q;
8322 	unsigned i;
8323 
8324 	if (vdev == NULL)
8325 		return (NULL);
8326 
8327 	/* First, assume the x.x.x.x format */
8328 	i = strtoul(path, &s, 10);
8329 	if (s == path || (s && *s != '.' && *s != '\0'))
8330 		goto name;
8331 	if (i >= vdev->vdev_children)
8332 		return (NULL);
8333 
8334 	vdev = vdev->vdev_child[i];
8335 	if (s && *s == '\0')
8336 		return (vdev);
8337 	return (zdb_vdev_lookup(vdev, s+1));
8338 
8339 name:
8340 	for (i = 0; i < vdev->vdev_children; i++) {
8341 		vdev_t *vc = vdev->vdev_child[i];
8342 
8343 		if (vc->vdev_path == NULL) {
8344 			vc = zdb_vdev_lookup(vc, path);
8345 			if (vc == NULL)
8346 				continue;
8347 			else
8348 				return (vc);
8349 		}
8350 
8351 		p = strrchr(vc->vdev_path, '/');
8352 		p = p ? p + 1 : vc->vdev_path;
8353 		q = &vc->vdev_path[strlen(vc->vdev_path) - 2];
8354 
8355 		if (strcmp(vc->vdev_path, path) == 0)
8356 			return (vc);
8357 		if (strcmp(p, path) == 0)
8358 			return (vc);
8359 		if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0)
8360 			return (vc);
8361 	}
8362 
8363 	return (NULL);
8364 }
8365 
/*
 * Resolve an objset (dataset) id to its full dataset name, written to
 * outstr (presumably at least ZFS_MAX_DATASET_NAME_LEN bytes -- confirm
 * against callers).  Returns 0 on success, or the error from
 * dsl_dataset_hold_obj() after printing a diagnostic to stderr.
 */
static int
name_from_objset_id(spa_t *spa, uint64_t objset_id, char *outstr)
{
	dsl_dataset_t *ds;

	dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
	int error = dsl_dataset_hold_obj(spa->spa_dsl_pool, objset_id,
	    NULL, &ds);
	if (error != 0) {
		(void) fprintf(stderr, "failed to hold objset %llu: %s\n",
		    (u_longlong_t)objset_id, strerror(error));
		dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
		return (error);
	}
	dsl_dataset_name(ds, outstr);
	/* NULL tag matches the NULL tag used for the hold above */
	dsl_dataset_rele(ds, NULL);
	dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
	return (0);
}
8385 
8386 static boolean_t
8387 zdb_parse_block_sizes(char *sizes, uint64_t *lsize, uint64_t *psize)
8388 {
8389 	char *s0, *s1, *tmp = NULL;
8390 
8391 	if (sizes == NULL)
8392 		return (B_FALSE);
8393 
8394 	s0 = strtok_r(sizes, "/", &tmp);
8395 	if (s0 == NULL)
8396 		return (B_FALSE);
8397 	s1 = strtok_r(NULL, "/", &tmp);
8398 	*lsize = strtoull(s0, NULL, 16);
8399 	*psize = s1 ? strtoull(s1, NULL, 16) : *lsize;
8400 	return (*lsize >= *psize && *psize > 0);
8401 }
8402 
/* Bitmask with the bit for compression algorithm `alg` set */
#define	ZIO_COMPRESS_MASK(alg)	(1ULL << (ZIO_COMPRESS_##alg))
8404 
/*
 * Attempt to decompress `psize` bytes of `pabd` into `lsize` bytes
 * using compression function `cfunc`.  Returns B_TRUE only if
 * decompression succeeds and produces a deterministic, fully-written
 * result (verified by decompressing twice into differently-initialized
 * buffers and comparing).  `lbuf`/`lbuf2` are caller-provided scratch
 * buffers of at least `lsize` bytes.
 */
static boolean_t
try_decompress_block(abd_t *pabd, uint64_t lsize, uint64_t psize,
    int flags, int cfunc, void *lbuf, void *lbuf2)
{
	if (flags & ZDB_FLAG_VERBOSE) {
		(void) fprintf(stderr,
		    "Trying %05llx -> %05llx (%s)\n",
		    (u_longlong_t)psize,
		    (u_longlong_t)lsize,
		    zio_compress_table[cfunc].ci_name);
	}

	/*
	 * We set lbuf to all zeros and lbuf2 to all
	 * ones, then decompress to both buffers and
	 * compare their contents. This way we can
	 * know if decompression filled exactly to
	 * lsize or if it left some bytes unwritten.
	 */

	memset(lbuf, 0x00, lsize);
	memset(lbuf2, 0xff, lsize);

	if (zio_decompress_data(cfunc, pabd,
	    lbuf, psize, lsize, NULL) == 0 &&
	    zio_decompress_data(cfunc, pabd,
	    lbuf2, psize, lsize, NULL) == 0 &&
	    memcmp(lbuf, lbuf2, lsize) == 0)
		return (B_TRUE);
	return (B_FALSE);
}
8436 
/*
 * Brute-force decompression of a block whose compression algorithm is
 * unknown: try every candidate decompressor at every candidate lsize
 * (stepping by SPA_MINBLOCKSIZE up to maxlsize).  ZLE is deliberately
 * tried last (and can be disabled via ZDB_NO_ZLE) because it readily
 * "succeeds" on data it did not compress.  Returns the discovered
 * lsize, or (uint64_t)-1 on failure -- callers compare against -1.
 */
static uint64_t
zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize,
    uint64_t psize, int flags)
{
	(void) buf;
	uint64_t orig_lsize = lsize;
	boolean_t tryzle = ((getenv("ZDB_NO_ZLE") == NULL));
	boolean_t found = B_FALSE;
	/*
	 * We don't know how the data was compressed, so just try
	 * every decompress function at every inflated blocksize.
	 */
	void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
	int cfuncs[ZIO_COMPRESS_FUNCTIONS] = { 0 };
	int *cfuncp = cfuncs;
	uint64_t maxlsize = SPA_MAXBLOCKSIZE;
	/* `mask` collects algorithms to EXCLUDE from the candidate list */
	uint64_t mask = ZIO_COMPRESS_MASK(ON) | ZIO_COMPRESS_MASK(OFF) |
	    ZIO_COMPRESS_MASK(INHERIT) | ZIO_COMPRESS_MASK(EMPTY) |
	    ZIO_COMPRESS_MASK(ZLE);
	/* Try the most common algorithms first */
	*cfuncp++ = ZIO_COMPRESS_LZ4;
	*cfuncp++ = ZIO_COMPRESS_LZJB;
	mask |= ZIO_COMPRESS_MASK(LZ4) | ZIO_COMPRESS_MASK(LZJB);
	/*
	 * Every gzip level has the same decompressor, no need to
	 * run it 9 times per bruteforce attempt.
	 */
	mask |= ZIO_COMPRESS_MASK(GZIP_2) | ZIO_COMPRESS_MASK(GZIP_3);
	mask |= ZIO_COMPRESS_MASK(GZIP_4) | ZIO_COMPRESS_MASK(GZIP_5);
	mask |= ZIO_COMPRESS_MASK(GZIP_6) | ZIO_COMPRESS_MASK(GZIP_7);
	mask |= ZIO_COMPRESS_MASK(GZIP_8) | ZIO_COMPRESS_MASK(GZIP_9);
	for (int c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++)
		if (((1ULL << c) & mask) == 0)
			*cfuncp++ = c;

	/*
	 * On the one hand, with SPA_MAXBLOCKSIZE at 16MB, this
	 * could take a while and we should let the user know
	 * we are not stuck.  On the other hand, printing progress
	 * info gets old after a while.  User can specify 'v' flag
	 * to see the progression.
	 */
	if (lsize == psize)
		lsize += SPA_MINBLOCKSIZE;
	else
		maxlsize = lsize;

	for (; lsize <= maxlsize; lsize += SPA_MINBLOCKSIZE) {
		for (cfuncp = cfuncs; *cfuncp; cfuncp++) {
			if (try_decompress_block(pabd, lsize, psize, flags,
			    *cfuncp, lbuf, lbuf2)) {
				found = B_TRUE;
				break;
			}
		}
		/* Non-zero *cfuncp means the inner loop broke on a match */
		if (*cfuncp != 0)
			break;
	}
	/* Last resort: ZLE, restarting from the original lsize */
	if (!found && tryzle) {
		for (lsize = orig_lsize; lsize <= maxlsize;
		    lsize += SPA_MINBLOCKSIZE) {
			if (try_decompress_block(pabd, lsize, psize, flags,
			    ZIO_COMPRESS_ZLE, lbuf, lbuf2)) {
				*cfuncp = ZIO_COMPRESS_ZLE;
				found = B_TRUE;
				break;
			}
		}
	}
	umem_free(lbuf2, SPA_MAXBLOCKSIZE);

	if (*cfuncp == ZIO_COMPRESS_ZLE) {
		printf("\nZLE decompression was selected. If you "
		    "suspect the results are wrong,\ntry avoiding ZLE "
		    "by setting and exporting ZDB_NO_ZLE=\"true\"\n");
	}

	return (lsize > maxlsize ? -1 : lsize);
}
8515 
8516 /*
8517  * Read a block from a pool and print it out.  The syntax of the
8518  * block descriptor is:
8519  *
8520  *	pool:vdev_specifier:offset:[lsize/]psize[:flags]
8521  *
8522  *	pool           - The name of the pool you wish to read from
8523  *	vdev_specifier - Which vdev (see comment for zdb_vdev_lookup)
8524  *	offset         - offset, in hex, in bytes
8525  *	size           - Amount of data to read, in hex, in bytes
8526  *	flags          - A string of characters specifying options
8527  *		 b: Decode a blkptr at given offset within block
8528  *		 c: Calculate and display checksums
8529  *		 d: Decompress data before dumping
8530  *		 e: Byteswap data before dumping
8531  *		 g: Display data as a gang block header
8532  *		 i: Display as an indirect block
8533  *		 r: Dump raw data to stdout
8534  *		 v: Verbose
8535  *
8536  */
8537 static void
8538 zdb_read_block(char *thing, spa_t *spa)
8539 {
8540 	blkptr_t blk, *bp = &blk;
8541 	dva_t *dva = bp->blk_dva;
8542 	int flags = 0;
8543 	uint64_t offset = 0, psize = 0, lsize = 0, blkptr_offset = 0;
8544 	zio_t *zio;
8545 	vdev_t *vd;
8546 	abd_t *pabd;
8547 	void *lbuf, *buf;
8548 	char *s, *p, *dup, *flagstr, *sizes, *tmp = NULL;
8549 	const char *vdev, *errmsg = NULL;
8550 	int i, error;
8551 	boolean_t borrowed = B_FALSE, found = B_FALSE;
8552 
8553 	dup = strdup(thing);
8554 	s = strtok_r(dup, ":", &tmp);
8555 	vdev = s ?: "";
8556 	s = strtok_r(NULL, ":", &tmp);
8557 	offset = strtoull(s ? s : "", NULL, 16);
8558 	sizes = strtok_r(NULL, ":", &tmp);
8559 	s = strtok_r(NULL, ":", &tmp);
8560 	flagstr = strdup(s ?: "");
8561 
8562 	if (!zdb_parse_block_sizes(sizes, &lsize, &psize))
8563 		errmsg = "invalid size(s)";
8564 	if (!IS_P2ALIGNED(psize, DEV_BSIZE) || !IS_P2ALIGNED(lsize, DEV_BSIZE))
8565 		errmsg = "size must be a multiple of sector size";
8566 	if (!IS_P2ALIGNED(offset, DEV_BSIZE))
8567 		errmsg = "offset must be a multiple of sector size";
8568 	if (errmsg) {
8569 		(void) printf("Invalid block specifier: %s  - %s\n",
8570 		    thing, errmsg);
8571 		goto done;
8572 	}
8573 
8574 	tmp = NULL;
8575 	for (s = strtok_r(flagstr, ":", &tmp);
8576 	    s != NULL;
8577 	    s = strtok_r(NULL, ":", &tmp)) {
8578 		for (i = 0; i < strlen(flagstr); i++) {
8579 			int bit = flagbits[(uchar_t)flagstr[i]];
8580 
8581 			if (bit == 0) {
8582 				(void) printf("***Ignoring flag: %c\n",
8583 				    (uchar_t)flagstr[i]);
8584 				continue;
8585 			}
8586 			found = B_TRUE;
8587 			flags |= bit;
8588 
8589 			p = &flagstr[i + 1];
8590 			if (*p != ':' && *p != '\0') {
8591 				int j = 0, nextbit = flagbits[(uchar_t)*p];
8592 				char *end, offstr[8] = { 0 };
8593 				if ((bit == ZDB_FLAG_PRINT_BLKPTR) &&
8594 				    (nextbit == 0)) {
8595 					/* look ahead to isolate the offset */
8596 					while (nextbit == 0 &&
8597 					    strchr(flagbitstr, *p) == NULL) {
8598 						offstr[j] = *p;
8599 						j++;
8600 						if (i + j > strlen(flagstr))
8601 							break;
8602 						p++;
8603 						nextbit = flagbits[(uchar_t)*p];
8604 					}
8605 					blkptr_offset = strtoull(offstr, &end,
8606 					    16);
8607 					i += j;
8608 				} else if (nextbit == 0) {
8609 					(void) printf("***Ignoring flag arg:"
8610 					    " '%c'\n", (uchar_t)*p);
8611 				}
8612 			}
8613 		}
8614 	}
8615 	if (blkptr_offset % sizeof (blkptr_t)) {
8616 		printf("Block pointer offset 0x%llx "
8617 		    "must be divisible by 0x%x\n",
8618 		    (longlong_t)blkptr_offset, (int)sizeof (blkptr_t));
8619 		goto done;
8620 	}
8621 	if (found == B_FALSE && strlen(flagstr) > 0) {
8622 		printf("Invalid flag arg: '%s'\n", flagstr);
8623 		goto done;
8624 	}
8625 
8626 	vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);
8627 	if (vd == NULL) {
8628 		(void) printf("***Invalid vdev: %s\n", vdev);
8629 		goto done;
8630 	} else {
8631 		if (vd->vdev_path)
8632 			(void) fprintf(stderr, "Found vdev: %s\n",
8633 			    vd->vdev_path);
8634 		else
8635 			(void) fprintf(stderr, "Found vdev type: %s\n",
8636 			    vd->vdev_ops->vdev_op_type);
8637 	}
8638 
8639 	pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE);
8640 	lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
8641 
8642 	BP_ZERO(bp);
8643 
8644 	DVA_SET_VDEV(&dva[0], vd->vdev_id);
8645 	DVA_SET_OFFSET(&dva[0], offset);
8646 	DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
8647 	DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));
8648 
8649 	BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
8650 
8651 	BP_SET_LSIZE(bp, lsize);
8652 	BP_SET_PSIZE(bp, psize);
8653 	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
8654 	BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
8655 	BP_SET_TYPE(bp, DMU_OT_NONE);
8656 	BP_SET_LEVEL(bp, 0);
8657 	BP_SET_DEDUP(bp, 0);
8658 	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
8659 
8660 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
8661 	zio = zio_root(spa, NULL, NULL, 0);
8662 
8663 	if (vd == vd->vdev_top) {
8664 		/*
8665 		 * Treat this as a normal block read.
8666 		 */
8667 		zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL,
8668 		    ZIO_PRIORITY_SYNC_READ,
8669 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
8670 	} else {
8671 		/*
8672 		 * Treat this as a vdev child I/O.
8673 		 */
8674 		zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd,
8675 		    psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
8676 		    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
8677 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_OPTIONAL,
8678 		    NULL, NULL));
8679 	}
8680 
8681 	error = zio_wait(zio);
8682 	spa_config_exit(spa, SCL_STATE, FTAG);
8683 
8684 	if (error) {
8685 		(void) printf("Read of %s failed, error: %d\n", thing, error);
8686 		goto out;
8687 	}
8688 
8689 	uint64_t orig_lsize = lsize;
8690 	buf = lbuf;
8691 	if (flags & ZDB_FLAG_DECOMPRESS) {
8692 		lsize = zdb_decompress_block(pabd, buf, lbuf,
8693 		    lsize, psize, flags);
8694 		if (lsize == -1) {
8695 			(void) printf("Decompress of %s failed\n", thing);
8696 			goto out;
8697 		}
8698 	} else {
8699 		buf = abd_borrow_buf_copy(pabd, lsize);
8700 		borrowed = B_TRUE;
8701 	}
8702 	/*
8703 	 * Try to detect invalid block pointer.  If invalid, try
8704 	 * decompressing.
8705 	 */
8706 	if ((flags & ZDB_FLAG_PRINT_BLKPTR || flags & ZDB_FLAG_INDIRECT) &&
8707 	    !(flags & ZDB_FLAG_DECOMPRESS)) {
8708 		const blkptr_t *b = (const blkptr_t *)(void *)
8709 		    ((uintptr_t)buf + (uintptr_t)blkptr_offset);
8710 		if (zfs_blkptr_verify(spa, b,
8711 		    BLK_CONFIG_NEEDED, BLK_VERIFY_ONLY) == B_FALSE) {
8712 			abd_return_buf_copy(pabd, buf, lsize);
8713 			borrowed = B_FALSE;
8714 			buf = lbuf;
8715 			lsize = zdb_decompress_block(pabd, buf,
8716 			    lbuf, lsize, psize, flags);
8717 			b = (const blkptr_t *)(void *)
8718 			    ((uintptr_t)buf + (uintptr_t)blkptr_offset);
8719 			if (lsize == -1 || zfs_blkptr_verify(spa, b,
8720 			    BLK_CONFIG_NEEDED, BLK_VERIFY_LOG) == B_FALSE) {
8721 				printf("invalid block pointer at this DVA\n");
8722 				goto out;
8723 			}
8724 		}
8725 	}
8726 
8727 	if (flags & ZDB_FLAG_PRINT_BLKPTR)
8728 		zdb_print_blkptr((blkptr_t *)(void *)
8729 		    ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
8730 	else if (flags & ZDB_FLAG_RAW)
8731 		zdb_dump_block_raw(buf, lsize, flags);
8732 	else if (flags & ZDB_FLAG_INDIRECT)
8733 		zdb_dump_indirect((blkptr_t *)buf,
8734 		    orig_lsize / sizeof (blkptr_t), flags);
8735 	else if (flags & ZDB_FLAG_GBH)
8736 		zdb_dump_gbh(buf, flags);
8737 	else
8738 		zdb_dump_block(thing, buf, lsize, flags);
8739 
8740 	/*
8741 	 * If :c was specified, iterate through the checksum table to
8742 	 * calculate and display each checksum for our specified
8743 	 * DVA and length.
8744 	 */
8745 	if ((flags & ZDB_FLAG_CHECKSUM) && !(flags & ZDB_FLAG_RAW) &&
8746 	    !(flags & ZDB_FLAG_GBH)) {
8747 		zio_t *czio;
8748 		(void) printf("\n");
8749 		for (enum zio_checksum ck = ZIO_CHECKSUM_LABEL;
8750 		    ck < ZIO_CHECKSUM_FUNCTIONS; ck++) {
8751 
8752 			if ((zio_checksum_table[ck].ci_flags &
8753 			    ZCHECKSUM_FLAG_EMBEDDED) ||
8754 			    ck == ZIO_CHECKSUM_NOPARITY) {
8755 				continue;
8756 			}
8757 			BP_SET_CHECKSUM(bp, ck);
8758 			spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
8759 			czio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
8760 			if (vd == vd->vdev_top) {
8761 				zio_nowait(zio_read(czio, spa, bp, pabd, psize,
8762 				    NULL, NULL,
8763 				    ZIO_PRIORITY_SYNC_READ,
8764 				    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW |
8765 				    ZIO_FLAG_DONT_RETRY, NULL));
8766 			} else {
8767 				zio_nowait(zio_vdev_child_io(czio, bp, vd,
8768 				    offset, pabd, psize, ZIO_TYPE_READ,
8769 				    ZIO_PRIORITY_SYNC_READ,
8770 				    ZIO_FLAG_DONT_PROPAGATE |
8771 				    ZIO_FLAG_DONT_RETRY |
8772 				    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW |
8773 				    ZIO_FLAG_SPECULATIVE |
8774 				    ZIO_FLAG_OPTIONAL, NULL, NULL));
8775 			}
8776 			error = zio_wait(czio);
8777 			if (error == 0 || error == ECKSUM) {
8778 				zio_t *ck_zio = zio_null(NULL, spa, NULL,
8779 				    NULL, NULL, 0);
8780 				ck_zio->io_offset =
8781 				    DVA_GET_OFFSET(&bp->blk_dva[0]);
8782 				ck_zio->io_bp = bp;
8783 				zio_checksum_compute(ck_zio, ck, pabd, lsize);
8784 				printf(
8785 				    "%12s\t"
8786 				    "cksum=%016llx:%016llx:%016llx:%016llx\n",
8787 				    zio_checksum_table[ck].ci_name,
8788 				    (u_longlong_t)bp->blk_cksum.zc_word[0],
8789 				    (u_longlong_t)bp->blk_cksum.zc_word[1],
8790 				    (u_longlong_t)bp->blk_cksum.zc_word[2],
8791 				    (u_longlong_t)bp->blk_cksum.zc_word[3]);
8792 				zio_wait(ck_zio);
8793 			} else {
8794 				printf("error %d reading block\n", error);
8795 			}
8796 			spa_config_exit(spa, SCL_STATE, FTAG);
8797 		}
8798 	}
8799 
8800 	if (borrowed)
8801 		abd_return_buf_copy(pabd, buf, lsize);
8802 
8803 out:
8804 	abd_free(pabd);
8805 	umem_free(lbuf, SPA_MAXBLOCKSIZE);
8806 done:
8807 	free(flagstr);
8808 	free(dup);
8809 }
8810 
8811 static void
8812 zdb_embedded_block(char *thing)
8813 {
8814 	blkptr_t bp = {{{{0}}}};
8815 	unsigned long long *words = (void *)&bp;
8816 	char *buf;
8817 	int err;
8818 
8819 	err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:"
8820 	    "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx",
8821 	    words + 0, words + 1, words + 2, words + 3,
8822 	    words + 4, words + 5, words + 6, words + 7,
8823 	    words + 8, words + 9, words + 10, words + 11,
8824 	    words + 12, words + 13, words + 14, words + 15);
8825 	if (err != 16) {
8826 		(void) fprintf(stderr, "invalid input format\n");
8827 		exit(1);
8828 	}
8829 	ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE);
8830 	buf = malloc(SPA_MAXBLOCKSIZE);
8831 	if (buf == NULL) {
8832 		(void) fprintf(stderr, "out of memory\n");
8833 		exit(1);
8834 	}
8835 	err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp));
8836 	if (err != 0) {
8837 		(void) fprintf(stderr, "decode failed: %u\n", err);
8838 		exit(1);
8839 	}
8840 	zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0);
8841 	free(buf);
8842 }
8843 
8844 /* check for valid hex or decimal numeric string */
8845 static boolean_t
8846 zdb_numeric(char *str)
8847 {
8848 	int i = 0;
8849 
8850 	if (strlen(str) == 0)
8851 		return (B_FALSE);
8852 	if (strncmp(str, "0x", 2) == 0 || strncmp(str, "0X", 2) == 0)
8853 		i = 2;
8854 	for (; i < strlen(str); i++) {
8855 		if (!isxdigit(str[i]))
8856 			return (B_FALSE);
8857 	}
8858 	return (B_TRUE);
8859 }
8860 
/*
 * zdb entry point: parse command-line options, tune libzpool for
 * offline examination, import/open the requested pool or dataset, and
 * dispatch to the selected dump routine(s).
 */
int
main(int argc, char **argv)
{
	int c;
	spa_t *spa = NULL;
	objset_t *os = NULL;
	int dump_all = 1;	/* cleared once any specific dump is requested */
	int verbose = 0;
	int error = 0;
	char **searchdirs = NULL;	/* -p device search paths (with -e) */
	int nsearch = 0;
	char *target, *target_pool, dsname[ZFS_MAX_DATASET_NAME_LEN];
	nvlist_t *policy = NULL;
	uint64_t max_txg = UINT64_MAX;	/* -t: highest txg to consider */
	int64_t objset_id = -1;		/* objset ID parsed from target, if any */
	uint64_t object;
	int flags = ZFS_IMPORT_MISSING_LOG;
	int rewind = ZPOOL_NEVER_REWIND;
	char *spa_config_path_env, *objset_str;
	boolean_t target_is_spa = B_TRUE, dataset_lookup = B_FALSE;
	nvlist_t *cfg = NULL;

	dprintf_setup(&argc, argv);

	/*
	 * If there is an environment variable SPA_CONFIG_PATH it overrides
	 * default spa_config_path setting. If -U flag is specified it will
	 * override this environment variable settings once again.
	 */
	spa_config_path_env = getenv("SPA_CONFIG_PATH");
	if (spa_config_path_env != NULL)
		spa_config_path = spa_config_path_env;

	/*
	 * For performance reasons, we set this tunable down. We do so before
	 * the arg parsing section so that the user can override this value if
	 * they choose.
	 */
	zfs_btree_verify_intensity = 3;

	/* Long option names; each maps to the matching short option letter. */
	struct option long_options[] = {
		{"ignore-assertions",	no_argument,		NULL, 'A'},
		{"block-stats",		no_argument,		NULL, 'b'},
		{"backup",		no_argument,		NULL, 'B'},
		{"checksum",		no_argument,		NULL, 'c'},
		{"config",		no_argument,		NULL, 'C'},
		{"datasets",		no_argument,		NULL, 'd'},
		{"dedup-stats",		no_argument,		NULL, 'D'},
		{"exported",		no_argument,		NULL, 'e'},
		{"embedded-block-pointer",	no_argument,	NULL, 'E'},
		{"automatic-rewind",	no_argument,		NULL, 'F'},
		{"dump-debug-msg",	no_argument,		NULL, 'G'},
		{"history",		no_argument,		NULL, 'h'},
		{"intent-logs",		no_argument,		NULL, 'i'},
		{"inflight",		required_argument,	NULL, 'I'},
		{"checkpointed-state",	no_argument,		NULL, 'k'},
		{"key",			required_argument,	NULL, 'K'},
		{"label",		no_argument,		NULL, 'l'},
		{"disable-leak-tracking",	no_argument,	NULL, 'L'},
		{"metaslabs",		no_argument,		NULL, 'm'},
		{"metaslab-groups",	no_argument,		NULL, 'M'},
		{"numeric",		no_argument,		NULL, 'N'},
		{"option",		required_argument,	NULL, 'o'},
		{"object-lookups",	no_argument,		NULL, 'O'},
		{"path",		required_argument,	NULL, 'p'},
		{"parseable",		no_argument,		NULL, 'P'},
		{"skip-label",		no_argument,		NULL, 'q'},
		{"copy-object",		no_argument,		NULL, 'r'},
		{"read-block",		no_argument,		NULL, 'R'},
		{"io-stats",		no_argument,		NULL, 's'},
		{"simulate-dedup",	no_argument,		NULL, 'S'},
		{"txg",			required_argument,	NULL, 't'},
		{"brt-stats",		no_argument,		NULL, 'T'},
		{"uberblock",		no_argument,		NULL, 'u'},
		{"cachefile",		required_argument,	NULL, 'U'},
		{"verbose",		no_argument,		NULL, 'v'},
		{"verbatim",		no_argument,		NULL, 'V'},
		{"dump-blocks",		required_argument,	NULL, 'x'},
		{"extreme-rewind",	no_argument,		NULL, 'X'},
		{"all-reconstruction",	no_argument,		NULL, 'Y'},
		{"livelist",		no_argument,		NULL, 'y'},
		{"zstd-headers",	no_argument,		NULL, 'Z'},
		{0, 0, 0, 0}
	};

	while ((c = getopt_long(argc, argv,
	    "AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:TuU:vVx:XYyZ",
	    long_options, NULL)) != -1) {
		switch (c) {
		/*
		 * Options that select a specific dump: count repetitions
		 * (repetition increases detail) and disable the implicit
		 * dump-everything default.
		 */
		case 'b':
		case 'B':
		case 'c':
		case 'C':
		case 'd':
		case 'D':
		case 'E':
		case 'G':
		case 'h':
		case 'i':
		case 'l':
		case 'm':
		case 'M':
		case 'N':
		case 'O':
		case 'r':
		case 'R':
		case 's':
		case 'S':
		case 'T':
		case 'u':
		case 'y':
		case 'Z':
			dump_opt[c]++;
			dump_all = 0;
			break;
		/* Modifier options: counted, but leave dump_all alone. */
		case 'A':
		case 'e':
		case 'F':
		case 'k':
		case 'L':
		case 'P':
		case 'q':
		case 'X':
			dump_opt[c]++;
			break;
		case 'Y':
			/* Try every indirect-reconstruction combination. */
			zfs_reconstruct_indirect_combinations_max = INT_MAX;
			zfs_deadman_enabled = 0;
			break;
		/* NB: Sort single match options below. */
		case 'I':
			max_inflight_bytes = strtoull(optarg, NULL, 0);
			if (max_inflight_bytes == 0) {
				(void) fprintf(stderr, "maximum number "
				    "of inflight bytes must be greater "
				    "than 0\n");
				usage();
			}
			break;
		case 'K':
			dump_opt[c]++;
			key_material = strdup(optarg);
			/* redact key material in process table */
			while (*optarg != '\0') { *optarg++ = '*'; }
			break;
		case 'o':
			error = set_global_var(optarg);
			if (error != 0)
				usage();
			break;
		case 'p':
			/* Grow searchdirs by one (umem has no realloc). */
			if (searchdirs == NULL) {
				searchdirs = umem_alloc(sizeof (char *),
				    UMEM_NOFAIL);
			} else {
				char **tmp = umem_alloc((nsearch + 1) *
				    sizeof (char *), UMEM_NOFAIL);
				memcpy(tmp, searchdirs, nsearch *
				    sizeof (char *));
				umem_free(searchdirs,
				    nsearch * sizeof (char *));
				searchdirs = tmp;
			}
			searchdirs[nsearch++] = optarg;
			break;
		case 't':
			max_txg = strtoull(optarg, NULL, 0);
			if (max_txg < TXG_INITIAL) {
				(void) fprintf(stderr, "incorrect txg "
				    "specified: %s\n", optarg);
				usage();
			}
			break;
		case 'U':
			spa_config_path = optarg;
			if (spa_config_path[0] != '/') {
				(void) fprintf(stderr,
				    "cachefile must be an absolute path "
				    "(i.e. start with a slash)\n");
				usage();
			}
			break;
		case 'v':
			verbose++;
			break;
		case 'V':
			flags = ZFS_IMPORT_VERBATIM;
			break;
		case 'x':
			vn_dumpdir = optarg;
			break;
		default:
			usage();
			break;
		}
	}

	/* -p only makes sense when importing an exported pool (-e). */
	if (!dump_opt['e'] && searchdirs != NULL) {
		(void) fprintf(stderr, "-p option requires use of -e\n");
		usage();
	}
#if defined(_LP64)
	/*
	 * ZDB does not typically re-read blocks; therefore limit the ARC
	 * to 256 MB, which can be used entirely for metadata.
	 */
	zfs_arc_min = 2ULL << SPA_MAXBLOCKSHIFT;
	zfs_arc_max = 256 * 1024 * 1024;
#endif

	/*
	 * "zdb -c" uses checksum-verifying scrub i/os which are async reads.
	 * "zdb -b" uses traversal prefetch which uses async reads.
	 * For good performance, let several of them be active at once.
	 */
	zfs_vdev_async_read_max_active = 10;

	/*
	 * Disable reference tracking for better performance.
	 */
	reference_tracking_enable = B_FALSE;

	/*
	 * Do not fail spa_load when spa_load_verify fails. This is needed
	 * to load non-idle pools.
	 */
	spa_load_verify_dryrun = B_TRUE;

	/*
	 * ZDB should have ability to read spacemaps.
	 */
	spa_mode_readable_spacemaps = B_TRUE;

	/* Read-only: zdb never writes to the pool. */
	kernel_init(SPA_MODE_READ);

	if (dump_all)
		verbose = MAX(verbose, 1);

	/*
	 * With no specific dump selected, enable every dump option except
	 * those in the exclusion string (modifiers and special modes);
	 * verbosity then stacks on top of each enabled option.
	 */
	for (c = 0; c < 256; c++) {
		if (dump_all && strchr("ABeEFkKlLNOPrRSXy", c) == NULL)
			dump_opt[c] = 1;
		if (dump_opt[c])
			dump_opt[c] += verbose;
	}

	/* -A once: tolerate assertion failures; -A twice: zfs_recover too. */
	libspl_set_assert_ok((dump_opt['A'] == 1) || (dump_opt['A'] > 2));
	zfs_recover = (dump_opt['A'] > 1);

	argc -= optind;
	argv += optind;
	if (argc < 2 && dump_opt['R'])
		usage();

	/* -E takes exactly one argument (the encoded blkptr) and exits. */
	if (dump_opt['E']) {
		if (argc != 1)
			usage();
		zdb_embedded_block(argv[0]);
		return (0);
	}

	if (argc < 1) {
		/* "zdb -C" with no target dumps the cachefile itself. */
		if (!dump_opt['e'] && dump_opt['C']) {
			dump_cachefile(spa_config_path);
			return (0);
		}
		usage();
	}

	if (dump_opt['l'])
		return (dump_label(argv[0]));

	if (dump_opt['X'] || dump_opt['F'])
		rewind = ZPOOL_DO_REWIND |
		    (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);

	/* -N implies -d */
	if (dump_opt['N'] && dump_opt['d'] == 0)
		dump_opt['d'] = dump_opt['N'];

	/* Build the load policy (txg limit and rewind mode) for import. */
	if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 ||
	    nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, max_txg) != 0 ||
	    nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind) != 0)
		fatal("internal error: %s", strerror(ENOMEM));

	error = 0;
	target = argv[0];

	/* A '/' or '@' means the target is a dataset, not a whole pool. */
	if (strpbrk(target, "/@") != NULL) {
		size_t targetlen;

		/* target_pool is the pool-name prefix of the target. */
		target_pool = strdup(target);
		*strpbrk(target_pool, "/@") = '\0';

		target_is_spa = B_FALSE;
		targetlen = strlen(target);
		if (targetlen && target[targetlen - 1] == '/')
			target[targetlen - 1] = '\0';

		/*
		 * See if an objset ID was supplied (-d <pool>/<objset ID>).
		 * To disambiguate tank/100, consider the 100 as objsetID
		 * if -N was given, otherwise 100 is an objsetID iff
		 * tank/100 as a named dataset fails on lookup.
		 */
		objset_str = strchr(target, '/');
		if (objset_str && strlen(objset_str) > 1 &&
		    zdb_numeric(objset_str + 1)) {
			char *endptr;
			errno = 0;
			objset_str++;
			objset_id = strtoull(objset_str, &endptr, 0);
			/* dataset 0 is the same as opening the pool */
			if (errno == 0 && endptr != objset_str &&
			    objset_id != 0) {
				if (dump_opt['N'])
					dataset_lookup = B_TRUE;
			}
			/* normal dataset name not an objset ID */
			if (endptr == objset_str) {
				objset_id = -1;
			}
		} else if (objset_str && !zdb_numeric(objset_str + 1) &&
		    dump_opt['N']) {
			printf("Supply a numeric objset ID with -N\n");
			exit(1);
		}
	} else {
		target_pool = target;
	}

	/* -e: locate and import an exported/destroyed pool by scanning. */
	if (dump_opt['e']) {
		importargs_t args = { 0 };

		args.paths = nsearch;
		args.path = searchdirs;
		args.can_be_active = B_TRUE;

		libpc_handle_t lpch = {
			.lpc_lib_handle = NULL,
			.lpc_ops = &libzpool_config_ops,
			.lpc_printerr = B_TRUE
		};
		error = zpool_find_config(&lpch, target_pool, &cfg, &args);

		if (error == 0) {

			if (nvlist_add_nvlist(cfg,
			    ZPOOL_LOAD_POLICY, policy) != 0) {
				fatal("can't open '%s': %s",
				    target, strerror(ENOMEM));
			}

			if (dump_opt['C'] > 1) {
				(void) printf("\nConfiguration for import:\n");
				dump_nvlist(cfg, 8);
			}

			/*
			 * Disable the activity check to allow examination of
			 * active pools.
			 */
			error = spa_import(target_pool, cfg, NULL,
			    flags | ZFS_IMPORT_SKIP_MMP);
		}
	}

	if (searchdirs != NULL) {
		umem_free(searchdirs, nsearch * sizeof (char *));
		searchdirs = NULL;
	}

	/*
	 * We need to make sure to process -O option or call
	 * dump_path after the -e option has been processed,
	 * which imports the pool to the namespace if it's
	 * not in the cachefile.
	 */
	if (dump_opt['O']) {
		if (argc != 2)
			usage();
		dump_opt['v'] = verbose + 3;
		return (dump_path(argv[0], argv[1], NULL));
	}

	/* -r: resolve the path to an object for the later copy step. */
	if (dump_opt['r']) {
		target_is_spa = B_FALSE;
		if (argc != 3)
			usage();
		dump_opt['v'] = verbose;
		error = dump_path(argv[0], argv[1], &object);
		if (error != 0)
			fatal("internal error: %s", strerror(error));
	}

	/*
	 * import_checkpointed_state makes the assumption that the
	 * target pool that we pass it is already part of the spa
	 * namespace. Because of that we need to make sure to call
	 * it always after the -e option has been processed, which
	 * imports the pool to the namespace if it's not in the
	 * cachefile.
	 */
	char *checkpoint_pool = NULL;
	char *checkpoint_target = NULL;
	if (dump_opt['k']) {
		checkpoint_pool = import_checkpointed_state(target, cfg,
		    &checkpoint_target);

		if (checkpoint_target != NULL)
			target = checkpoint_target;
	}

	if (cfg != NULL) {
		nvlist_free(cfg);
		cfg = NULL;
	}

	/* target_pool was strdup'd only when the target is a dataset. */
	if (target_pool != target)
		free(target_pool);

	if (error == 0) {
		if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) {
			/* -k on a pool: open the checkpointed state. */
			ASSERT(checkpoint_pool != NULL);
			ASSERT(checkpoint_target == NULL);

			error = spa_open(checkpoint_pool, &spa, FTAG);
			if (error != 0) {
				fatal("Tried to open pool \"%s\" but "
				    "spa_open() failed with error %d\n",
				    checkpoint_pool, error);
			}

		} else if (target_is_spa || dump_opt['R'] || dump_opt['B'] ||
		    objset_id == 0) {
			zdb_set_skip_mmp(target);
			error = spa_open_rewind(target, &spa, FTAG, policy,
			    NULL);
			if (error) {
				/*
				 * If we're missing the log device then
				 * try opening the pool after clearing the
				 * log state.
				 */
				mutex_enter(&spa_namespace_lock);
				if ((spa = spa_lookup(target)) != NULL &&
				    spa->spa_log_state == SPA_LOG_MISSING) {
					spa->spa_log_state = SPA_LOG_CLEAR;
					error = 0;
				}
				mutex_exit(&spa_namespace_lock);

				if (!error) {
					error = spa_open_rewind(target, &spa,
					    FTAG, policy, NULL);
				}
			}
		} else if (strpbrk(target, "#") != NULL) {
			/* A '#' means the target is a bookmark. */
			dsl_pool_t *dp;
			error = dsl_pool_hold(target, FTAG, &dp);
			if (error != 0) {
				fatal("can't dump '%s': %s", target,
				    strerror(error));
			}
			error = dump_bookmark(dp, target, B_TRUE, verbose > 1);
			dsl_pool_rele(dp, FTAG);
			if (error != 0) {
				fatal("can't dump '%s': %s", target,
				    strerror(error));
			}
			return (error);
		} else {
			target_pool = strdup(target);
			if (strpbrk(target, "/@") != NULL)
				*strpbrk(target_pool, "/@") = '\0';

			zdb_set_skip_mmp(target);
			/*
			 * If -N was supplied, the user has indicated that
			 * zdb -d <pool>/<objsetID> is in effect.  Otherwise
			 * we first assume that the dataset string is the
			 * dataset name.  If dmu_objset_hold fails with the
			 * dataset string, and we have an objset_id, retry the
			 * lookup with the objsetID.
			 */
			boolean_t retry = B_TRUE;
retry_lookup:
			if (dataset_lookup == B_TRUE) {
				/*
				 * Use the supplied id to get the name
				 * for open_objset.
				 */
				error = spa_open(target_pool, &spa, FTAG);
				if (error == 0) {
					error = name_from_objset_id(spa,
					    objset_id, dsname);
					spa_close(spa, FTAG);
					if (error == 0)
						target = dsname;
				}
			}
			if (error == 0) {
				if (objset_id > 0 && retry) {
					int err = dmu_objset_hold(target, FTAG,
					    &os);
					if (err) {
						/*
						 * Name lookup failed; retry
						 * once treating the suffix as
						 * an objset ID.
						 */
						dataset_lookup = B_TRUE;
						retry = B_FALSE;
						goto retry_lookup;
					} else {
						dmu_objset_rele(os, FTAG);
					}
				}
				error = open_objset(target, FTAG, &os);
			}
			if (error == 0)
				spa = dmu_objset_spa(os);
			free(target_pool);
		}
	}
	nvlist_free(policy);

	if (error)
		fatal("can't open '%s': %s", target, strerror(error));

	/*
	 * Set the pool failure mode to panic in order to prevent the pool
	 * from suspending.  A suspended I/O will have no way to resume and
	 * can prevent the zdb(8) command from terminating as expected.
	 */
	if (spa != NULL)
		spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;

	/* Remaining arguments are objects/ranges/blocks for the dumps. */
	argv++;
	argc--;
	if (dump_opt['r']) {
		/* -r: copy the previously-resolved object out to a file. */
		error = zdb_copy_object(os, object, argv[1]);
	} else if (!dump_opt['R']) {
		/* Object-type filter flags for object range arguments. */
		flagbits['d'] = ZOR_FLAG_DIRECTORY;
		flagbits['f'] = ZOR_FLAG_PLAIN_FILE;
		flagbits['m'] = ZOR_FLAG_SPACE_MAP;
		flagbits['z'] = ZOR_FLAG_ZAP;
		flagbits['A'] = ZOR_FLAG_ALL_TYPES;

		if (argc > 0 && dump_opt['d']) {
			zopt_object_args = argc;
			zopt_object_ranges = calloc(zopt_object_args,
			    sizeof (zopt_object_range_t));
			for (unsigned i = 0; i < zopt_object_args; i++) {
				int err;
				const char *msg = NULL;

				err = parse_object_range(argv[i],
				    &zopt_object_ranges[i], &msg);
				if (err != 0)
					fatal("Bad object or range: '%s': %s\n",
					    argv[i], msg ?: "");
			}
		} else if (argc > 0 && dump_opt['m']) {
			zopt_metaslab_args = argc;
			zopt_metaslab = calloc(zopt_metaslab_args,
			    sizeof (uint64_t));
			for (unsigned i = 0; i < zopt_metaslab_args; i++) {
				errno = 0;
				zopt_metaslab[i] = strtoull(argv[i], NULL, 0);
				if (zopt_metaslab[i] == 0 && errno != 0)
					fatal("bad number %s: %s", argv[i],
					    strerror(errno));
			}
		}
		/* Dispatch to the main dump routine for the open target. */
		if (dump_opt['B']) {
			dump_backup(target, objset_id,
			    argc > 0 ? argv[0] : NULL);
		} else if (os != NULL) {
			dump_objset(os);
		} else if (zopt_object_args > 0 && !dump_opt['m']) {
			dump_objset(spa->spa_meta_objset);
		} else {
			dump_zpool(spa);
		}
	} else {
		/* -R: interpret each argument as a raw block specifier. */
		flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
		flagbits['c'] = ZDB_FLAG_CHECKSUM;
		flagbits['d'] = ZDB_FLAG_DECOMPRESS;
		flagbits['e'] = ZDB_FLAG_BSWAP;
		flagbits['g'] = ZDB_FLAG_GBH;
		flagbits['i'] = ZDB_FLAG_INDIRECT;
		flagbits['r'] = ZDB_FLAG_RAW;
		flagbits['v'] = ZDB_FLAG_VERBOSE;

		for (int i = 0; i < argc; i++)
			zdb_read_block(argv[i], spa);
	}

	if (dump_opt['k']) {
		free(checkpoint_pool);
		if (!target_is_spa)
			free(checkpoint_target);
	}

	if (os != NULL) {
		close_objset(os, FTAG);
	} else {
		spa_close(spa, FTAG);
	}

	fuid_table_destroy();

	dump_debug_buffer();

	kernel_fini();

	return (error);
}
9474