1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2011, 2019 by Delphix. All rights reserved. 25 * Copyright (c) 2014 Integros [integros.com] 26 * Copyright 2017 Nexenta Systems, Inc. 27 * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC. 28 * Copyright 2017 RackTop Systems. 
29 */ 30 31 #include <stdio.h> 32 #include <unistd.h> 33 #include <stdio_ext.h> 34 #include <stdlib.h> 35 #include <ctype.h> 36 #include <sys/zfs_context.h> 37 #include <sys/spa.h> 38 #include <sys/spa_impl.h> 39 #include <sys/dmu.h> 40 #include <sys/zap.h> 41 #include <sys/fs/zfs.h> 42 #include <sys/zfs_znode.h> 43 #include <sys/zfs_sa.h> 44 #include <sys/sa.h> 45 #include <sys/sa_impl.h> 46 #include <sys/vdev.h> 47 #include <sys/vdev_impl.h> 48 #include <sys/metaslab_impl.h> 49 #include <sys/dmu_objset.h> 50 #include <sys/dsl_dir.h> 51 #include <sys/dsl_dataset.h> 52 #include <sys/dsl_pool.h> 53 #include <sys/dbuf.h> 54 #include <sys/zil.h> 55 #include <sys/zil_impl.h> 56 #include <sys/stat.h> 57 #include <sys/resource.h> 58 #include <sys/dmu_traverse.h> 59 #include <sys/zio_checksum.h> 60 #include <sys/zio_compress.h> 61 #include <sys/zfs_fuid.h> 62 #include <sys/arc.h> 63 #include <sys/ddt.h> 64 #include <sys/zfeature.h> 65 #include <sys/abd.h> 66 #include <sys/blkptr.h> 67 #include <sys/dsl_scan.h> 68 #include <sys/dsl_crypt.h> 69 #include <zfs_comutil.h> 70 #include <libcmdutils.h> 71 #undef verify 72 #include <libzfs.h> 73 74 #include <libnvpair.h> 75 #include <libzutil.h> 76 #include <zfs_fletcher.h> 77 78 #include "zdb.h" 79 80 #define ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ? \ 81 zio_compress_table[(idx)].ci_name : "UNKNOWN") 82 #define ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \ 83 zio_checksum_table[(idx)].ci_name : "UNKNOWN") 84 #define ZDB_OT_NAME(idx) ((idx) < DMU_OT_NUMTYPES ? \ 85 dmu_ot[(idx)].ot_name : DMU_OT_IS_VALID(idx) ? \ 86 dmu_ot_byteswap[DMU_OT_BYTESWAP(idx)].ob_name : "UNKNOWN") 87 #define ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : \ 88 (idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA ? \ 89 DMU_OT_ZAP_OTHER : \ 90 (idx) == DMU_OTN_UINT64_DATA || (idx) == DMU_OTN_UINT64_METADATA ? 
\ 91 DMU_OT_UINT64_OTHER : DMU_OT_NUMTYPES) 92 93 extern int reference_tracking_enable; 94 extern boolean_t zfs_recover; 95 extern uint64_t zfs_arc_max, zfs_arc_meta_limit; 96 extern int zfs_vdev_async_read_max_active; 97 extern int aok; 98 extern boolean_t spa_load_verify_dryrun; 99 extern int zfs_btree_verify_intensity; 100 101 static const char cmdname[] = "zdb"; 102 uint8_t dump_opt[256]; 103 104 typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size); 105 106 uint64_t *zopt_object = NULL; 107 static unsigned zopt_objects = 0; 108 uint64_t max_inflight = 1000; 109 static int leaked_objects = 0; 110 111 static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *); 112 static void mos_obj_refd(uint64_t); 113 114 /* 115 * These libumem hooks provide a reasonable set of defaults for the allocator's 116 * debugging facilities. 117 */ 118 const char * 119 _umem_debug_init() 120 { 121 return ("default,verbose"); /* $UMEM_DEBUG setting */ 122 } 123 124 const char * 125 _umem_logging_init(void) 126 { 127 return ("fail,contents"); /* $UMEM_LOGGING setting */ 128 } 129 130 static void 131 usage(void) 132 { 133 (void) fprintf(stderr, 134 "Usage:\t%s [-AbcdDFGhikLMPsvX] [-e [-V] [-p <path> ...]] " 135 "[-I <inflight I/Os>]\n" 136 "\t\t[-o <var>=<value>]... 
[-t <txg>] [-U <cache>] [-x <dumpdir>]\n" 137 "\t\t[<poolname> [<object> ...]]\n" 138 "\t%s [-AdiPv] [-e [-V] [-p <path> ...]] [-U <cache>] <dataset> " 139 "[<object> ...]\n" 140 "\t%s -C [-A] [-U <cache>]\n" 141 "\t%s -l [-Aqu] <device>\n" 142 "\t%s -m [-AFLPX] [-e [-V] [-p <path> ...]] [-t <txg>] " 143 "[-U <cache>]\n\t\t<poolname> [<vdev> [<metaslab> ...]]\n" 144 "\t%s -O <dataset> <path>\n" 145 "\t%s -R [-A] [-e [-V] [-p <path> ...]] [-U <cache>]\n" 146 "\t\t<poolname> <vdev>:<offset>:<size>[:<flags>]\n" 147 "\t%s -E [-A] word0:word1:...:word15\n" 148 "\t%s -S [-AP] [-e [-V] [-p <path> ...]] [-U <cache>] " 149 "<poolname>\n\n", 150 cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, 151 cmdname, cmdname); 152 153 (void) fprintf(stderr, " Dataset name must include at least one " 154 "separator character '/' or '@'\n"); 155 (void) fprintf(stderr, " If dataset name is specified, only that " 156 "dataset is dumped\n"); 157 (void) fprintf(stderr, " If object numbers are specified, only " 158 "those objects are dumped\n\n"); 159 (void) fprintf(stderr, " Options to control amount of output:\n"); 160 (void) fprintf(stderr, " -b block statistics\n"); 161 (void) fprintf(stderr, " -c checksum all metadata (twice for " 162 "all data) blocks\n"); 163 (void) fprintf(stderr, " -C config (or cachefile if alone)\n"); 164 (void) fprintf(stderr, " -d dataset(s)\n"); 165 (void) fprintf(stderr, " -D dedup statistics\n"); 166 (void) fprintf(stderr, " -E decode and display block from an " 167 "embedded block pointer\n"); 168 (void) fprintf(stderr, " -h pool history\n"); 169 (void) fprintf(stderr, " -i intent logs\n"); 170 (void) fprintf(stderr, " -l read label contents\n"); 171 (void) fprintf(stderr, " -k examine the checkpointed state " 172 "of the pool\n"); 173 (void) fprintf(stderr, " -L disable leak tracking (do not " 174 "load spacemaps)\n"); 175 (void) fprintf(stderr, " -m metaslabs\n"); 176 (void) fprintf(stderr, " -M metaslab groups\n"); 177 (void) fprintf(stderr, 
" -O perform object lookups by path\n"); 178 (void) fprintf(stderr, " -R read and display block from a " 179 "device\n"); 180 (void) fprintf(stderr, " -s report stats on zdb's I/O\n"); 181 (void) fprintf(stderr, " -S simulate dedup to measure effect\n"); 182 (void) fprintf(stderr, " -v verbose (applies to all " 183 "others)\n\n"); 184 (void) fprintf(stderr, " Below options are intended for use " 185 "with other options:\n"); 186 (void) fprintf(stderr, " -A ignore assertions (-A), enable " 187 "panic recovery (-AA) or both (-AAA)\n"); 188 (void) fprintf(stderr, " -e pool is exported/destroyed/" 189 "has altroot/not in a cachefile\n"); 190 (void) fprintf(stderr, " -F attempt automatic rewind within " 191 "safe range of transaction groups\n"); 192 (void) fprintf(stderr, " -G dump zfs_dbgmsg buffer before " 193 "exiting\n"); 194 (void) fprintf(stderr, " -I <number of inflight I/Os> -- " 195 "specify the maximum number of " 196 "checksumming I/Os [default is 200]\n"); 197 (void) fprintf(stderr, " -o <variable>=<value> set global " 198 "variable to an unsigned 32-bit integer value\n"); 199 (void) fprintf(stderr, " -p <path> -- use one or more with " 200 "-e to specify path to vdev dir\n"); 201 (void) fprintf(stderr, " -P print numbers in parseable form\n"); 202 (void) fprintf(stderr, " -q don't print label contents\n"); 203 (void) fprintf(stderr, " -t <txg> -- highest txg to use when " 204 "searching for uberblocks\n"); 205 (void) fprintf(stderr, " -u uberblock\n"); 206 (void) fprintf(stderr, " -U <cachefile_path> -- use alternate " 207 "cachefile\n"); 208 (void) fprintf(stderr, " -V do verbatim import\n"); 209 (void) fprintf(stderr, " -x <dumpdir> -- " 210 "dump all read blocks into specified directory\n"); 211 (void) fprintf(stderr, " -X attempt extreme rewind (does not " 212 "work with dataset)\n\n"); 213 (void) fprintf(stderr, "Specify an option more than once (e.g. 
-bb) " 214 "to make only that option verbose\n"); 215 (void) fprintf(stderr, "Default is to dump everything non-verbosely\n"); 216 exit(1); 217 } 218 219 static void 220 dump_debug_buffer() 221 { 222 if (dump_opt['G']) { 223 (void) printf("\n"); 224 zfs_dbgmsg_print("zdb"); 225 } 226 } 227 228 /* 229 * Called for usage errors that are discovered after a call to spa_open(), 230 * dmu_bonus_hold(), or pool_match(). abort() is called for other errors. 231 */ 232 233 static void 234 fatal(const char *fmt, ...) 235 { 236 va_list ap; 237 238 va_start(ap, fmt); 239 (void) fprintf(stderr, "%s: ", cmdname); 240 (void) vfprintf(stderr, fmt, ap); 241 va_end(ap); 242 (void) fprintf(stderr, "\n"); 243 244 dump_debug_buffer(); 245 246 exit(1); 247 } 248 249 /* ARGSUSED */ 250 static void 251 dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size) 252 { 253 nvlist_t *nv; 254 size_t nvsize = *(uint64_t *)data; 255 char *packed = umem_alloc(nvsize, UMEM_NOFAIL); 256 257 VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH)); 258 259 VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0); 260 261 umem_free(packed, nvsize); 262 263 dump_nvlist(nv, 8); 264 265 nvlist_free(nv); 266 } 267 268 /* ARGSUSED */ 269 static void 270 dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size) 271 { 272 spa_history_phys_t *shp = data; 273 274 if (shp == NULL) 275 return; 276 277 (void) printf("\t\tpool_create_len = %llu\n", 278 (u_longlong_t)shp->sh_pool_create_len); 279 (void) printf("\t\tphys_max_off = %llu\n", 280 (u_longlong_t)shp->sh_phys_max_off); 281 (void) printf("\t\tbof = %llu\n", 282 (u_longlong_t)shp->sh_bof); 283 (void) printf("\t\teof = %llu\n", 284 (u_longlong_t)shp->sh_eof); 285 (void) printf("\t\trecords_lost = %llu\n", 286 (u_longlong_t)shp->sh_records_lost); 287 } 288 289 static void 290 zdb_nicenum(uint64_t num, char *buf, size_t buflen) 291 { 292 if (dump_opt['P']) 293 (void) snprintf(buf, buflen, "%llu", (longlong_t)num); 
294 else 295 nicenum(num, buf, sizeof (buf)); 296 } 297 298 static const char histo_stars[] = "****************************************"; 299 static const uint64_t histo_width = sizeof (histo_stars) - 1; 300 301 static void 302 dump_histogram(const uint64_t *histo, int size, int offset) 303 { 304 int i; 305 int minidx = size - 1; 306 int maxidx = 0; 307 uint64_t max = 0; 308 309 for (i = 0; i < size; i++) { 310 if (histo[i] > max) 311 max = histo[i]; 312 if (histo[i] > 0 && i > maxidx) 313 maxidx = i; 314 if (histo[i] > 0 && i < minidx) 315 minidx = i; 316 } 317 318 if (max < histo_width) 319 max = histo_width; 320 321 for (i = minidx; i <= maxidx; i++) { 322 (void) printf("\t\t\t%3u: %6llu %s\n", 323 i + offset, (u_longlong_t)histo[i], 324 &histo_stars[(max - histo[i]) * histo_width / max]); 325 } 326 } 327 328 static void 329 dump_zap_stats(objset_t *os, uint64_t object) 330 { 331 int error; 332 zap_stats_t zs; 333 334 error = zap_get_stats(os, object, &zs); 335 if (error) 336 return; 337 338 if (zs.zs_ptrtbl_len == 0) { 339 ASSERT(zs.zs_num_blocks == 1); 340 (void) printf("\tmicrozap: %llu bytes, %llu entries\n", 341 (u_longlong_t)zs.zs_blocksize, 342 (u_longlong_t)zs.zs_num_entries); 343 return; 344 } 345 346 (void) printf("\tFat ZAP stats:\n"); 347 348 (void) printf("\t\tPointer table:\n"); 349 (void) printf("\t\t\t%llu elements\n", 350 (u_longlong_t)zs.zs_ptrtbl_len); 351 (void) printf("\t\t\tzt_blk: %llu\n", 352 (u_longlong_t)zs.zs_ptrtbl_zt_blk); 353 (void) printf("\t\t\tzt_numblks: %llu\n", 354 (u_longlong_t)zs.zs_ptrtbl_zt_numblks); 355 (void) printf("\t\t\tzt_shift: %llu\n", 356 (u_longlong_t)zs.zs_ptrtbl_zt_shift); 357 (void) printf("\t\t\tzt_blks_copied: %llu\n", 358 (u_longlong_t)zs.zs_ptrtbl_blks_copied); 359 (void) printf("\t\t\tzt_nextblk: %llu\n", 360 (u_longlong_t)zs.zs_ptrtbl_nextblk); 361 362 (void) printf("\t\tZAP entries: %llu\n", 363 (u_longlong_t)zs.zs_num_entries); 364 (void) printf("\t\tLeaf blocks: %llu\n", 365 
(u_longlong_t)zs.zs_num_leafs); 366 (void) printf("\t\tTotal blocks: %llu\n", 367 (u_longlong_t)zs.zs_num_blocks); 368 (void) printf("\t\tzap_block_type: 0x%llx\n", 369 (u_longlong_t)zs.zs_block_type); 370 (void) printf("\t\tzap_magic: 0x%llx\n", 371 (u_longlong_t)zs.zs_magic); 372 (void) printf("\t\tzap_salt: 0x%llx\n", 373 (u_longlong_t)zs.zs_salt); 374 375 (void) printf("\t\tLeafs with 2^n pointers:\n"); 376 dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0); 377 378 (void) printf("\t\tBlocks with n*5 entries:\n"); 379 dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0); 380 381 (void) printf("\t\tBlocks n/10 full:\n"); 382 dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0); 383 384 (void) printf("\t\tEntries with n chunks:\n"); 385 dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0); 386 387 (void) printf("\t\tBuckets with n entries:\n"); 388 dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0); 389 } 390 391 /*ARGSUSED*/ 392 static void 393 dump_none(objset_t *os, uint64_t object, void *data, size_t size) 394 { 395 } 396 397 /*ARGSUSED*/ 398 static void 399 dump_unknown(objset_t *os, uint64_t object, void *data, size_t size) 400 { 401 (void) printf("\tUNKNOWN OBJECT TYPE\n"); 402 } 403 404 /*ARGSUSED*/ 405 static void 406 dump_uint8(objset_t *os, uint64_t object, void *data, size_t size) 407 { 408 } 409 410 /*ARGSUSED*/ 411 static void 412 dump_uint64(objset_t *os, uint64_t object, void *data, size_t size) 413 { 414 } 415 416 /*ARGSUSED*/ 417 static void 418 dump_zap(objset_t *os, uint64_t object, void *data, size_t size) 419 { 420 zap_cursor_t zc; 421 zap_attribute_t attr; 422 void *prop; 423 unsigned i; 424 425 dump_zap_stats(os, object); 426 (void) printf("\n"); 427 428 for (zap_cursor_init(&zc, os, object); 429 zap_cursor_retrieve(&zc, &attr) == 0; 430 zap_cursor_advance(&zc)) { 431 (void) printf("\t\t%s = ", attr.za_name); 432 if (attr.za_num_integers == 0) { 433 (void) 
printf("\n"); 434 continue; 435 } 436 prop = umem_zalloc(attr.za_num_integers * 437 attr.za_integer_length, UMEM_NOFAIL); 438 (void) zap_lookup(os, object, attr.za_name, 439 attr.za_integer_length, attr.za_num_integers, prop); 440 if (attr.za_integer_length == 1) { 441 if (strcmp(attr.za_name, 442 DSL_CRYPTO_KEY_MASTER_KEY) == 0 || 443 strcmp(attr.za_name, 444 DSL_CRYPTO_KEY_HMAC_KEY) == 0 || 445 strcmp(attr.za_name, DSL_CRYPTO_KEY_IV) == 0 || 446 strcmp(attr.za_name, DSL_CRYPTO_KEY_MAC) == 0 || 447 strcmp(attr.za_name, DMU_POOL_CHECKSUM_SALT) == 0) { 448 uint8_t *u8 = prop; 449 450 for (i = 0; i < attr.za_num_integers; i++) { 451 (void) printf("%02x", u8[i]); 452 } 453 } else { 454 (void) printf("%s", (char *)prop); 455 } 456 } else { 457 for (i = 0; i < attr.za_num_integers; i++) { 458 switch (attr.za_integer_length) { 459 case 2: 460 (void) printf("%u ", 461 ((uint16_t *)prop)[i]); 462 break; 463 case 4: 464 (void) printf("%u ", 465 ((uint32_t *)prop)[i]); 466 break; 467 case 8: 468 (void) printf("%lld ", 469 (u_longlong_t)((int64_t *)prop)[i]); 470 break; 471 } 472 } 473 } 474 (void) printf("\n"); 475 umem_free(prop, attr.za_num_integers * attr.za_integer_length); 476 } 477 zap_cursor_fini(&zc); 478 } 479 480 static void 481 dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size) 482 { 483 bpobj_phys_t *bpop = data; 484 char bytes[32], comp[32], uncomp[32]; 485 486 /* make sure the output won't get truncated */ 487 CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); 488 CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); 489 CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); 490 491 if (bpop == NULL) 492 return; 493 494 zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes)); 495 zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp)); 496 zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp)); 497 498 (void) printf("\t\tnum_blkptrs = %llu\n", 499 (u_longlong_t)bpop->bpo_num_blkptrs); 500 (void) printf("\t\tbytes = %s\n", bytes); 501 if (size >= BPOBJ_SIZE_V1) { 502 (void) printf("\t\tcomp 
= %s\n", comp); 503 (void) printf("\t\tuncomp = %s\n", uncomp); 504 } 505 if (size >= sizeof (*bpop)) { 506 (void) printf("\t\tsubobjs = %llu\n", 507 (u_longlong_t)bpop->bpo_subobjs); 508 (void) printf("\t\tnum_subobjs = %llu\n", 509 (u_longlong_t)bpop->bpo_num_subobjs); 510 } 511 512 if (dump_opt['d'] < 5) 513 return; 514 515 for (uint64_t i = 0; i < bpop->bpo_num_blkptrs; i++) { 516 char blkbuf[BP_SPRINTF_LEN]; 517 blkptr_t bp; 518 519 int err = dmu_read(os, object, 520 i * sizeof (bp), sizeof (bp), &bp, 0); 521 if (err != 0) { 522 (void) printf("got error %u from dmu_read\n", err); 523 break; 524 } 525 snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp); 526 (void) printf("\t%s\n", blkbuf); 527 } 528 } 529 530 /* ARGSUSED */ 531 static void 532 dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size) 533 { 534 dmu_object_info_t doi; 535 536 VERIFY0(dmu_object_info(os, object, &doi)); 537 uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP); 538 539 int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0); 540 if (err != 0) { 541 (void) printf("got error %u from dmu_read\n", err); 542 kmem_free(subobjs, doi.doi_max_offset); 543 return; 544 } 545 546 int64_t last_nonzero = -1; 547 for (uint64_t i = 0; i < doi.doi_max_offset / 8; i++) { 548 if (subobjs[i] != 0) 549 last_nonzero = i; 550 } 551 552 for (int64_t i = 0; i <= last_nonzero; i++) { 553 (void) printf("\t%llu\n", (longlong_t)subobjs[i]); 554 } 555 kmem_free(subobjs, doi.doi_max_offset); 556 } 557 558 /*ARGSUSED*/ 559 static void 560 dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size) 561 { 562 dump_zap_stats(os, object); 563 /* contents are printed elsewhere, properly decoded */ 564 } 565 566 /*ARGSUSED*/ 567 static void 568 dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size) 569 { 570 zap_cursor_t zc; 571 zap_attribute_t attr; 572 573 dump_zap_stats(os, object); 574 (void) printf("\n"); 575 576 for (zap_cursor_init(&zc, os, object); 
577 zap_cursor_retrieve(&zc, &attr) == 0; 578 zap_cursor_advance(&zc)) { 579 (void) printf("\t\t%s = ", attr.za_name); 580 if (attr.za_num_integers == 0) { 581 (void) printf("\n"); 582 continue; 583 } 584 (void) printf(" %llx : [%d:%d:%d]\n", 585 (u_longlong_t)attr.za_first_integer, 586 (int)ATTR_LENGTH(attr.za_first_integer), 587 (int)ATTR_BSWAP(attr.za_first_integer), 588 (int)ATTR_NUM(attr.za_first_integer)); 589 } 590 zap_cursor_fini(&zc); 591 } 592 593 /*ARGSUSED*/ 594 static void 595 dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size) 596 { 597 zap_cursor_t zc; 598 zap_attribute_t attr; 599 uint16_t *layout_attrs; 600 unsigned i; 601 602 dump_zap_stats(os, object); 603 (void) printf("\n"); 604 605 for (zap_cursor_init(&zc, os, object); 606 zap_cursor_retrieve(&zc, &attr) == 0; 607 zap_cursor_advance(&zc)) { 608 (void) printf("\t\t%s = [", attr.za_name); 609 if (attr.za_num_integers == 0) { 610 (void) printf("\n"); 611 continue; 612 } 613 614 VERIFY(attr.za_integer_length == 2); 615 layout_attrs = umem_zalloc(attr.za_num_integers * 616 attr.za_integer_length, UMEM_NOFAIL); 617 618 VERIFY(zap_lookup(os, object, attr.za_name, 619 attr.za_integer_length, 620 attr.za_num_integers, layout_attrs) == 0); 621 622 for (i = 0; i != attr.za_num_integers; i++) 623 (void) printf(" %d ", (int)layout_attrs[i]); 624 (void) printf("]\n"); 625 umem_free(layout_attrs, 626 attr.za_num_integers * attr.za_integer_length); 627 } 628 zap_cursor_fini(&zc); 629 } 630 631 /*ARGSUSED*/ 632 static void 633 dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size) 634 { 635 zap_cursor_t zc; 636 zap_attribute_t attr; 637 const char *typenames[] = { 638 /* 0 */ "not specified", 639 /* 1 */ "FIFO", 640 /* 2 */ "Character Device", 641 /* 3 */ "3 (invalid)", 642 /* 4 */ "Directory", 643 /* 5 */ "5 (invalid)", 644 /* 6 */ "Block Device", 645 /* 7 */ "7 (invalid)", 646 /* 8 */ "Regular File", 647 /* 9 */ "9 (invalid)", 648 /* 10 */ "Symbolic Link", 649 /* 11 */ "11 
(invalid)", 650 /* 12 */ "Socket", 651 /* 13 */ "Door", 652 /* 14 */ "Event Port", 653 /* 15 */ "15 (invalid)", 654 }; 655 656 dump_zap_stats(os, object); 657 (void) printf("\n"); 658 659 for (zap_cursor_init(&zc, os, object); 660 zap_cursor_retrieve(&zc, &attr) == 0; 661 zap_cursor_advance(&zc)) { 662 (void) printf("\t\t%s = %lld (type: %s)\n", 663 attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer), 664 typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]); 665 } 666 zap_cursor_fini(&zc); 667 } 668 669 static int 670 get_dtl_refcount(vdev_t *vd) 671 { 672 int refcount = 0; 673 674 if (vd->vdev_ops->vdev_op_leaf) { 675 space_map_t *sm = vd->vdev_dtl_sm; 676 677 if (sm != NULL && 678 sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) 679 return (1); 680 return (0); 681 } 682 683 for (unsigned c = 0; c < vd->vdev_children; c++) 684 refcount += get_dtl_refcount(vd->vdev_child[c]); 685 return (refcount); 686 } 687 688 static int 689 get_metaslab_refcount(vdev_t *vd) 690 { 691 int refcount = 0; 692 693 if (vd->vdev_top == vd) { 694 for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { 695 space_map_t *sm = vd->vdev_ms[m]->ms_sm; 696 697 if (sm != NULL && 698 sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) 699 refcount++; 700 } 701 } 702 for (unsigned c = 0; c < vd->vdev_children; c++) 703 refcount += get_metaslab_refcount(vd->vdev_child[c]); 704 705 return (refcount); 706 } 707 708 static int 709 get_obsolete_refcount(vdev_t *vd) 710 { 711 int refcount = 0; 712 713 uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd); 714 if (vd->vdev_top == vd && obsolete_sm_obj != 0) { 715 dmu_object_info_t doi; 716 VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset, 717 obsolete_sm_obj, &doi)); 718 if (doi.doi_bonus_size == sizeof (space_map_phys_t)) { 719 refcount++; 720 } 721 } else { 722 ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); 723 ASSERT3U(obsolete_sm_obj, ==, 0); 724 } 725 for (unsigned c = 0; c < vd->vdev_children; c++) { 726 refcount += 
get_obsolete_refcount(vd->vdev_child[c]); 727 } 728 729 return (refcount); 730 } 731 732 static int 733 get_prev_obsolete_spacemap_refcount(spa_t *spa) 734 { 735 uint64_t prev_obj = 736 spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object; 737 if (prev_obj != 0) { 738 dmu_object_info_t doi; 739 VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_obj, &doi)); 740 if (doi.doi_bonus_size == sizeof (space_map_phys_t)) { 741 return (1); 742 } 743 } 744 return (0); 745 } 746 747 static int 748 get_checkpoint_refcount(vdev_t *vd) 749 { 750 int refcount = 0; 751 752 if (vd->vdev_top == vd && vd->vdev_top_zap != 0 && 753 zap_contains(spa_meta_objset(vd->vdev_spa), 754 vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0) 755 refcount++; 756 757 for (uint64_t c = 0; c < vd->vdev_children; c++) 758 refcount += get_checkpoint_refcount(vd->vdev_child[c]); 759 760 return (refcount); 761 } 762 763 static int 764 get_log_spacemap_refcount(spa_t *spa) 765 { 766 return (avl_numnodes(&spa->spa_sm_logs_by_txg)); 767 } 768 769 static int 770 verify_spacemap_refcounts(spa_t *spa) 771 { 772 uint64_t expected_refcount = 0; 773 uint64_t actual_refcount; 774 775 (void) feature_get_refcount(spa, 776 &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM], 777 &expected_refcount); 778 actual_refcount = get_dtl_refcount(spa->spa_root_vdev); 779 actual_refcount += get_metaslab_refcount(spa->spa_root_vdev); 780 actual_refcount += get_obsolete_refcount(spa->spa_root_vdev); 781 actual_refcount += get_prev_obsolete_spacemap_refcount(spa); 782 actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev); 783 actual_refcount += get_log_spacemap_refcount(spa); 784 785 if (expected_refcount != actual_refcount) { 786 (void) printf("space map refcount mismatch: expected %lld != " 787 "actual %lld\n", 788 (longlong_t)expected_refcount, 789 (longlong_t)actual_refcount); 790 return (2); 791 } 792 return (0); 793 } 794 795 static void 796 dump_spacemap(objset_t *os, space_map_t *sm) 797 { 798 char 
*ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID", 799 "INVALID", "INVALID", "INVALID", "INVALID" }; 800 801 if (sm == NULL) 802 return; 803 804 (void) printf("space map object %llu:\n", 805 (longlong_t)sm->sm_object); 806 (void) printf(" smp_length = 0x%llx\n", 807 (longlong_t)sm->sm_phys->smp_length); 808 (void) printf(" smp_alloc = 0x%llx\n", 809 (longlong_t)sm->sm_phys->smp_alloc); 810 811 if (dump_opt['d'] < 6 && dump_opt['m'] < 4) 812 return; 813 814 /* 815 * Print out the freelist entries in both encoded and decoded form. 816 */ 817 uint8_t mapshift = sm->sm_shift; 818 int64_t alloc = 0; 819 uint64_t word, entry_id = 0; 820 for (uint64_t offset = 0; offset < space_map_length(sm); 821 offset += sizeof (word)) { 822 823 VERIFY0(dmu_read(os, space_map_object(sm), offset, 824 sizeof (word), &word, DMU_READ_PREFETCH)); 825 826 if (sm_entry_is_debug(word)) { 827 (void) printf("\t [%6llu] %s: txg %llu pass %llu\n", 828 (u_longlong_t)entry_id, 829 ddata[SM_DEBUG_ACTION_DECODE(word)], 830 (u_longlong_t)SM_DEBUG_TXG_DECODE(word), 831 (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word)); 832 entry_id++; 833 continue; 834 } 835 836 uint8_t words; 837 char entry_type; 838 uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID; 839 840 if (sm_entry_is_single_word(word)) { 841 entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ? 842 'A' : 'F'; 843 entry_off = (SM_OFFSET_DECODE(word) << mapshift) + 844 sm->sm_start; 845 entry_run = SM_RUN_DECODE(word) << mapshift; 846 words = 1; 847 } else { 848 /* it is a two-word entry so we read another word */ 849 ASSERT(sm_entry_is_double_word(word)); 850 851 uint64_t extra_word; 852 offset += sizeof (extra_word); 853 VERIFY0(dmu_read(os, space_map_object(sm), offset, 854 sizeof (extra_word), &extra_word, 855 DMU_READ_PREFETCH)); 856 857 ASSERT3U(offset, <=, space_map_length(sm)); 858 859 entry_run = SM2_RUN_DECODE(word) << mapshift; 860 entry_vdev = SM2_VDEV_DECODE(word); 861 entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ? 
862 'A' : 'F'; 863 entry_off = (SM2_OFFSET_DECODE(extra_word) << 864 mapshift) + sm->sm_start; 865 words = 2; 866 } 867 868 (void) printf("\t [%6llu] %c range:" 869 " %010llx-%010llx size: %06llx vdev: %06llu words: %u\n", 870 (u_longlong_t)entry_id, 871 entry_type, (u_longlong_t)entry_off, 872 (u_longlong_t)(entry_off + entry_run), 873 (u_longlong_t)entry_run, 874 (u_longlong_t)entry_vdev, words); 875 876 if (entry_type == 'A') 877 alloc += entry_run; 878 else 879 alloc -= entry_run; 880 entry_id++; 881 } 882 if (alloc != space_map_allocated(sm)) { 883 (void) printf("space_map_object alloc (%lld) INCONSISTENT " 884 "with space map summary (%lld)\n", 885 (longlong_t)space_map_allocated(sm), (longlong_t)alloc); 886 } 887 } 888 889 static void 890 dump_metaslab_stats(metaslab_t *msp) 891 { 892 char maxbuf[32]; 893 range_tree_t *rt = msp->ms_allocatable; 894 zfs_btree_t *t = &msp->ms_allocatable_by_size; 895 int free_pct = range_tree_space(rt) * 100 / msp->ms_size; 896 897 /* max sure nicenum has enough space */ 898 CTASSERT(sizeof (maxbuf) >= NN_NUMBUF_SZ); 899 900 zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf)); 901 902 (void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n", 903 "segments", zfs_btree_numnodes(t), "maxsize", maxbuf, 904 "freepct", free_pct); 905 (void) printf("\tIn-memory histogram:\n"); 906 dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); 907 } 908 909 static void 910 dump_metaslab(metaslab_t *msp) 911 { 912 vdev_t *vd = msp->ms_group->mg_vd; 913 spa_t *spa = vd->vdev_spa; 914 space_map_t *sm = msp->ms_sm; 915 char freebuf[32]; 916 917 zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf, 918 sizeof (freebuf)); 919 920 (void) printf( 921 "\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n", 922 (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start, 923 (u_longlong_t)space_map_object(sm), freebuf); 924 925 if (dump_opt['m'] > 2 && !dump_opt['L']) { 926 mutex_enter(&msp->ms_lock); 927 
VERIFY0(metaslab_load(msp)); 928 range_tree_stat_verify(msp->ms_allocatable); 929 dump_metaslab_stats(msp); 930 metaslab_unload(msp); 931 mutex_exit(&msp->ms_lock); 932 } 933 934 if (dump_opt['m'] > 1 && sm != NULL && 935 spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { 936 /* 937 * The space map histogram represents free space in chunks 938 * of sm_shift (i.e. bucket 0 refers to 2^sm_shift). 939 */ 940 (void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n", 941 (u_longlong_t)msp->ms_fragmentation); 942 dump_histogram(sm->sm_phys->smp_histogram, 943 SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift); 944 } 945 946 ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift)); 947 dump_spacemap(spa->spa_meta_objset, msp->ms_sm); 948 949 if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { 950 (void) printf("\tFlush data:\n\tunflushed txg=%llu\n\n", 951 (u_longlong_t)metaslab_unflushed_txg(msp)); 952 } 953 } 954 955 static void 956 print_vdev_metaslab_header(vdev_t *vd) 957 { 958 vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; 959 const char *bias_str = ""; 960 961 if (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) { 962 bias_str = VDEV_ALLOC_BIAS_LOG; 963 } else if (alloc_bias == VDEV_BIAS_SPECIAL) { 964 bias_str = VDEV_ALLOC_BIAS_SPECIAL; 965 } else if (alloc_bias == VDEV_BIAS_DEDUP) { 966 bias_str = VDEV_ALLOC_BIAS_DEDUP; 967 } 968 969 uint64_t ms_flush_data_obj = 0; 970 if (vd->vdev_top_zap != 0) { 971 int error = zap_lookup(spa_meta_objset(vd->vdev_spa), 972 vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, 973 sizeof (uint64_t), 1, &ms_flush_data_obj); 974 if (error != ENOENT) { 975 ASSERT0(error); 976 } 977 } 978 979 (void) printf("\tvdev %10llu %s", 980 (u_longlong_t)vd->vdev_id, bias_str); 981 982 if (ms_flush_data_obj != 0) { 983 (void) printf(" ms_unflushed_phys object %llu", 984 (u_longlong_t)ms_flush_data_obj); 985 } 986 987 (void) printf("\n\t%-10s%5llu %-19s %-15s %-12s\n", 988 "metaslabs", (u_longlong_t)vd->vdev_ms_count, 989 "offset", 
"spacemap", "free"); 990 (void) printf("\t%15s %19s %15s %12s\n", 991 "---------------", "-------------------", 992 "---------------", "------------"); 993 } 994 995 static void 996 dump_metaslab_groups(spa_t *spa) 997 { 998 vdev_t *rvd = spa->spa_root_vdev; 999 metaslab_class_t *mc = spa_normal_class(spa); 1000 uint64_t fragmentation; 1001 1002 metaslab_class_histogram_verify(mc); 1003 1004 for (unsigned c = 0; c < rvd->vdev_children; c++) { 1005 vdev_t *tvd = rvd->vdev_child[c]; 1006 metaslab_group_t *mg = tvd->vdev_mg; 1007 1008 if (mg == NULL || mg->mg_class != mc) 1009 continue; 1010 1011 metaslab_group_histogram_verify(mg); 1012 mg->mg_fragmentation = metaslab_group_fragmentation(mg); 1013 1014 (void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t" 1015 "fragmentation", 1016 (u_longlong_t)tvd->vdev_id, 1017 (u_longlong_t)tvd->vdev_ms_count); 1018 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { 1019 (void) printf("%3s\n", "-"); 1020 } else { 1021 (void) printf("%3llu%%\n", 1022 (u_longlong_t)mg->mg_fragmentation); 1023 } 1024 dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); 1025 } 1026 1027 (void) printf("\tpool %s\tfragmentation", spa_name(spa)); 1028 fragmentation = metaslab_class_fragmentation(mc); 1029 if (fragmentation == ZFS_FRAG_INVALID) 1030 (void) printf("\t%3s\n", "-"); 1031 else 1032 (void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation); 1033 dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); 1034 } 1035 1036 static void 1037 print_vdev_indirect(vdev_t *vd) 1038 { 1039 vdev_indirect_config_t *vic = &vd->vdev_indirect_config; 1040 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 1041 vdev_indirect_births_t *vib = vd->vdev_indirect_births; 1042 1043 if (vim == NULL) { 1044 ASSERT3P(vib, ==, NULL); 1045 return; 1046 } 1047 1048 ASSERT3U(vdev_indirect_mapping_object(vim), ==, 1049 vic->vic_mapping_object); 1050 ASSERT3U(vdev_indirect_births_object(vib), ==, 1051 vic->vic_births_object); 1052 1053 (void) 
printf("indirect births obj %llu:\n", 1054 (longlong_t)vic->vic_births_object); 1055 (void) printf(" vib_count = %llu\n", 1056 (longlong_t)vdev_indirect_births_count(vib)); 1057 for (uint64_t i = 0; i < vdev_indirect_births_count(vib); i++) { 1058 vdev_indirect_birth_entry_phys_t *cur_vibe = 1059 &vib->vib_entries[i]; 1060 (void) printf("\toffset %llx -> txg %llu\n", 1061 (longlong_t)cur_vibe->vibe_offset, 1062 (longlong_t)cur_vibe->vibe_phys_birth_txg); 1063 } 1064 (void) printf("\n"); 1065 1066 (void) printf("indirect mapping obj %llu:\n", 1067 (longlong_t)vic->vic_mapping_object); 1068 (void) printf(" vim_max_offset = 0x%llx\n", 1069 (longlong_t)vdev_indirect_mapping_max_offset(vim)); 1070 (void) printf(" vim_bytes_mapped = 0x%llx\n", 1071 (longlong_t)vdev_indirect_mapping_bytes_mapped(vim)); 1072 (void) printf(" vim_count = %llu\n", 1073 (longlong_t)vdev_indirect_mapping_num_entries(vim)); 1074 1075 if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3) 1076 return; 1077 1078 uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim); 1079 1080 for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) { 1081 vdev_indirect_mapping_entry_phys_t *vimep = 1082 &vim->vim_entries[i]; 1083 (void) printf("\t<%llx:%llx:%llx> -> " 1084 "<%llx:%llx:%llx> (%x obsolete)\n", 1085 (longlong_t)vd->vdev_id, 1086 (longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep), 1087 (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), 1088 (longlong_t)DVA_GET_VDEV(&vimep->vimep_dst), 1089 (longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst), 1090 (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), 1091 counts[i]); 1092 } 1093 (void) printf("\n"); 1094 1095 uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd); 1096 if (obsolete_sm_object != 0) { 1097 objset_t *mos = vd->vdev_spa->spa_meta_objset; 1098 (void) printf("obsolete space map object %llu:\n", 1099 (u_longlong_t)obsolete_sm_object); 1100 ASSERT(vd->vdev_obsolete_sm != NULL); 1101 ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==, 1102 
obsolete_sm_object); 1103 dump_spacemap(mos, vd->vdev_obsolete_sm); 1104 (void) printf("\n"); 1105 } 1106 } 1107 1108 static void 1109 dump_metaslabs(spa_t *spa) 1110 { 1111 vdev_t *vd, *rvd = spa->spa_root_vdev; 1112 uint64_t m, c = 0, children = rvd->vdev_children; 1113 1114 (void) printf("\nMetaslabs:\n"); 1115 1116 if (!dump_opt['d'] && zopt_objects > 0) { 1117 c = zopt_object[0]; 1118 1119 if (c >= children) 1120 (void) fatal("bad vdev id: %llu", (u_longlong_t)c); 1121 1122 if (zopt_objects > 1) { 1123 vd = rvd->vdev_child[c]; 1124 print_vdev_metaslab_header(vd); 1125 1126 for (m = 1; m < zopt_objects; m++) { 1127 if (zopt_object[m] < vd->vdev_ms_count) 1128 dump_metaslab( 1129 vd->vdev_ms[zopt_object[m]]); 1130 else 1131 (void) fprintf(stderr, "bad metaslab " 1132 "number %llu\n", 1133 (u_longlong_t)zopt_object[m]); 1134 } 1135 (void) printf("\n"); 1136 return; 1137 } 1138 children = c + 1; 1139 } 1140 for (; c < children; c++) { 1141 vd = rvd->vdev_child[c]; 1142 print_vdev_metaslab_header(vd); 1143 1144 print_vdev_indirect(vd); 1145 1146 for (m = 0; m < vd->vdev_ms_count; m++) 1147 dump_metaslab(vd->vdev_ms[m]); 1148 (void) printf("\n"); 1149 } 1150 } 1151 1152 static void 1153 dump_log_spacemaps(spa_t *spa) 1154 { 1155 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) 1156 return; 1157 1158 (void) printf("\nLog Space Maps in Pool:\n"); 1159 for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); 1160 sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { 1161 space_map_t *sm = NULL; 1162 VERIFY0(space_map_open(&sm, spa_meta_objset(spa), 1163 sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); 1164 1165 (void) printf("Log Spacemap object %llu txg %llu\n", 1166 (u_longlong_t)sls->sls_sm_obj, (u_longlong_t)sls->sls_txg); 1167 dump_spacemap(spa->spa_meta_objset, sm); 1168 space_map_close(sm); 1169 } 1170 (void) printf("\n"); 1171 } 1172 1173 static void 1174 dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index) 1175 { 1176 const 
ddt_phys_t *ddp = dde->dde_phys; 1177 const ddt_key_t *ddk = &dde->dde_key; 1178 const char *types[4] = { "ditto", "single", "double", "triple" }; 1179 char blkbuf[BP_SPRINTF_LEN]; 1180 blkptr_t blk; 1181 1182 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 1183 if (ddp->ddp_phys_birth == 0) 1184 continue; 1185 ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); 1186 snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk); 1187 (void) printf("index %llx refcnt %llu %s %s\n", 1188 (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt, 1189 types[p], blkbuf); 1190 } 1191 } 1192 1193 static void 1194 dump_dedup_ratio(const ddt_stat_t *dds) 1195 { 1196 double rL, rP, rD, D, dedup, compress, copies; 1197 1198 if (dds->dds_blocks == 0) 1199 return; 1200 1201 rL = (double)dds->dds_ref_lsize; 1202 rP = (double)dds->dds_ref_psize; 1203 rD = (double)dds->dds_ref_dsize; 1204 D = (double)dds->dds_dsize; 1205 1206 dedup = rD / D; 1207 compress = rL / rP; 1208 copies = rD / rP; 1209 1210 (void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, " 1211 "dedup * compress / copies = %.2f\n\n", 1212 dedup, compress, copies, dedup * compress / copies); 1213 } 1214 1215 static void 1216 dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class) 1217 { 1218 char name[DDT_NAMELEN]; 1219 ddt_entry_t dde; 1220 uint64_t walk = 0; 1221 dmu_object_info_t doi; 1222 uint64_t count, dspace, mspace; 1223 int error; 1224 1225 error = ddt_object_info(ddt, type, class, &doi); 1226 1227 if (error == ENOENT) 1228 return; 1229 ASSERT(error == 0); 1230 1231 if ((count = ddt_object_count(ddt, type, class)) == 0) 1232 return; 1233 1234 dspace = doi.doi_physical_blocks_512 << 9; 1235 mspace = doi.doi_fill_count * doi.doi_data_block_size; 1236 1237 ddt_object_name(ddt, type, class, name); 1238 1239 (void) printf("%s: %llu entries, size %llu on disk, %llu in core\n", 1240 name, 1241 (u_longlong_t)count, 1242 (u_longlong_t)(dspace / count), 1243 (u_longlong_t)(mspace / count)); 1244 1245 if (dump_opt['D'] < 3) 
1246 return; 1247 1248 zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]); 1249 1250 if (dump_opt['D'] < 4) 1251 return; 1252 1253 if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE) 1254 return; 1255 1256 (void) printf("%s contents:\n\n", name); 1257 1258 while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0) 1259 dump_dde(ddt, &dde, walk); 1260 1261 ASSERT3U(error, ==, ENOENT); 1262 1263 (void) printf("\n"); 1264 } 1265 1266 static void 1267 dump_all_ddts(spa_t *spa) 1268 { 1269 ddt_histogram_t ddh_total; 1270 ddt_stat_t dds_total; 1271 1272 bzero(&ddh_total, sizeof (ddh_total)); 1273 bzero(&dds_total, sizeof (dds_total)); 1274 1275 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 1276 ddt_t *ddt = spa->spa_ddt[c]; 1277 for (enum ddt_type type = 0; type < DDT_TYPES; type++) { 1278 for (enum ddt_class class = 0; class < DDT_CLASSES; 1279 class++) { 1280 dump_ddt(ddt, type, class); 1281 } 1282 } 1283 } 1284 1285 ddt_get_dedup_stats(spa, &dds_total); 1286 1287 if (dds_total.dds_blocks == 0) { 1288 (void) printf("All DDTs are empty\n"); 1289 return; 1290 } 1291 1292 (void) printf("\n"); 1293 1294 if (dump_opt['D'] > 1) { 1295 (void) printf("DDT histogram (aggregated over all DDTs):\n"); 1296 ddt_get_dedup_histogram(spa, &ddh_total); 1297 zpool_dump_ddt(&dds_total, &ddh_total); 1298 } 1299 1300 dump_dedup_ratio(&dds_total); 1301 } 1302 1303 static void 1304 dump_dtl_seg(void *arg, uint64_t start, uint64_t size) 1305 { 1306 char *prefix = arg; 1307 1308 (void) printf("%s [%llu,%llu) length %llu\n", 1309 prefix, 1310 (u_longlong_t)start, 1311 (u_longlong_t)(start + size), 1312 (u_longlong_t)(size)); 1313 } 1314 1315 static void 1316 dump_dtl(vdev_t *vd, int indent) 1317 { 1318 spa_t *spa = vd->vdev_spa; 1319 boolean_t required; 1320 const char *name[DTL_TYPES] = { "missing", "partial", "scrub", 1321 "outage" }; 1322 char prefix[256]; 1323 1324 spa_vdev_state_enter(spa, SCL_NONE); 1325 required = vdev_dtl_required(vd); 1326 (void) 
spa_vdev_state_exit(spa, NULL, 0); 1327 1328 if (indent == 0) 1329 (void) printf("\nDirty time logs:\n\n"); 1330 1331 (void) printf("\t%*s%s [%s]\n", indent, "", 1332 vd->vdev_path ? vd->vdev_path : 1333 vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa), 1334 required ? "DTL-required" : "DTL-expendable"); 1335 1336 for (int t = 0; t < DTL_TYPES; t++) { 1337 range_tree_t *rt = vd->vdev_dtl[t]; 1338 if (range_tree_space(rt) == 0) 1339 continue; 1340 (void) snprintf(prefix, sizeof (prefix), "\t%*s%s", 1341 indent + 2, "", name[t]); 1342 range_tree_walk(rt, dump_dtl_seg, prefix); 1343 if (dump_opt['d'] > 5 && vd->vdev_children == 0) 1344 dump_spacemap(spa->spa_meta_objset, vd->vdev_dtl_sm); 1345 } 1346 1347 for (unsigned c = 0; c < vd->vdev_children; c++) 1348 dump_dtl(vd->vdev_child[c], indent + 4); 1349 } 1350 1351 static void 1352 dump_history(spa_t *spa) 1353 { 1354 nvlist_t **events = NULL; 1355 uint64_t resid, len, off = 0; 1356 uint_t num = 0; 1357 int error; 1358 time_t tsec; 1359 struct tm t; 1360 char tbuf[30]; 1361 char internalstr[MAXPATHLEN]; 1362 1363 char *buf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); 1364 do { 1365 len = SPA_MAXBLOCKSIZE; 1366 1367 if ((error = spa_history_get(spa, &off, &len, buf)) != 0) { 1368 (void) fprintf(stderr, "Unable to read history: " 1369 "error %d\n", error); 1370 umem_free(buf, SPA_MAXBLOCKSIZE); 1371 return; 1372 } 1373 1374 if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0) 1375 break; 1376 1377 off -= resid; 1378 } while (len != 0); 1379 umem_free(buf, SPA_MAXBLOCKSIZE); 1380 1381 (void) printf("\nHistory:\n"); 1382 for (unsigned i = 0; i < num; i++) { 1383 uint64_t time, txg, ievent; 1384 char *cmd, *intstr; 1385 boolean_t printed = B_FALSE; 1386 1387 if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME, 1388 &time) != 0) 1389 goto next; 1390 if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD, 1391 &cmd) != 0) { 1392 if (nvlist_lookup_uint64(events[i], 1393 ZPOOL_HIST_INT_EVENT, &ievent) != 0) 
1394 goto next; 1395 verify(nvlist_lookup_uint64(events[i], 1396 ZPOOL_HIST_TXG, &txg) == 0); 1397 verify(nvlist_lookup_string(events[i], 1398 ZPOOL_HIST_INT_STR, &intstr) == 0); 1399 if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) 1400 goto next; 1401 1402 (void) snprintf(internalstr, 1403 sizeof (internalstr), 1404 "[internal %s txg:%ju] %s", 1405 zfs_history_event_names[ievent], (uintmax_t)txg, 1406 intstr); 1407 cmd = internalstr; 1408 } 1409 tsec = time; 1410 (void) localtime_r(&tsec, &t); 1411 (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); 1412 (void) printf("%s %s\n", tbuf, cmd); 1413 printed = B_TRUE; 1414 1415 next: 1416 if (dump_opt['h'] > 1) { 1417 if (!printed) 1418 (void) printf("unrecognized record:\n"); 1419 dump_nvlist(events[i], 2); 1420 } 1421 } 1422 } 1423 1424 /*ARGSUSED*/ 1425 static void 1426 dump_dnode(objset_t *os, uint64_t object, void *data, size_t size) 1427 { 1428 } 1429 1430 static uint64_t 1431 blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, 1432 const zbookmark_phys_t *zb) 1433 { 1434 if (dnp == NULL) { 1435 ASSERT(zb->zb_level < 0); 1436 if (zb->zb_object == 0) 1437 return (zb->zb_blkid); 1438 return (zb->zb_blkid * BP_GET_LSIZE(bp)); 1439 } 1440 1441 ASSERT(zb->zb_level >= 0); 1442 1443 return ((zb->zb_blkid << 1444 (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) * 1445 dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); 1446 } 1447 1448 static void 1449 snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp) 1450 { 1451 const dva_t *dva = bp->blk_dva; 1452 unsigned int ndvas = dump_opt['d'] > 5 ? 
BP_GET_NDVAS(bp) : 1; 1453 1454 if (dump_opt['b'] >= 6) { 1455 snprintf_blkptr(blkbuf, buflen, bp); 1456 return; 1457 } 1458 1459 if (BP_IS_EMBEDDED(bp)) { 1460 (void) sprintf(blkbuf, 1461 "EMBEDDED et=%u %llxL/%llxP B=%llu", 1462 (int)BPE_GET_ETYPE(bp), 1463 (u_longlong_t)BPE_GET_LSIZE(bp), 1464 (u_longlong_t)BPE_GET_PSIZE(bp), 1465 (u_longlong_t)bp->blk_birth); 1466 return; 1467 } 1468 1469 blkbuf[0] = '\0'; 1470 for (unsigned int i = 0; i < ndvas; i++) 1471 (void) snprintf(blkbuf + strlen(blkbuf), 1472 buflen - strlen(blkbuf), "%llu:%llx:%llx ", 1473 (u_longlong_t)DVA_GET_VDEV(&dva[i]), 1474 (u_longlong_t)DVA_GET_OFFSET(&dva[i]), 1475 (u_longlong_t)DVA_GET_ASIZE(&dva[i])); 1476 1477 if (BP_IS_HOLE(bp)) { 1478 (void) snprintf(blkbuf + strlen(blkbuf), 1479 buflen - strlen(blkbuf), 1480 "%llxL B=%llu", 1481 (u_longlong_t)BP_GET_LSIZE(bp), 1482 (u_longlong_t)bp->blk_birth); 1483 } else { 1484 (void) snprintf(blkbuf + strlen(blkbuf), 1485 buflen - strlen(blkbuf), 1486 "%llxL/%llxP F=%llu B=%llu/%llu", 1487 (u_longlong_t)BP_GET_LSIZE(bp), 1488 (u_longlong_t)BP_GET_PSIZE(bp), 1489 (u_longlong_t)BP_GET_FILL(bp), 1490 (u_longlong_t)bp->blk_birth, 1491 (u_longlong_t)BP_PHYSICAL_BIRTH(bp)); 1492 } 1493 } 1494 1495 static void 1496 print_indirect(blkptr_t *bp, const zbookmark_phys_t *zb, 1497 const dnode_phys_t *dnp) 1498 { 1499 char blkbuf[BP_SPRINTF_LEN]; 1500 int l; 1501 1502 if (!BP_IS_EMBEDDED(bp)) { 1503 ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type); 1504 ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level); 1505 } 1506 1507 (void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb)); 1508 1509 ASSERT(zb->zb_level >= 0); 1510 1511 for (l = dnp->dn_nlevels - 1; l >= -1; l--) { 1512 if (l == zb->zb_level) { 1513 (void) printf("L%llx", (u_longlong_t)zb->zb_level); 1514 } else { 1515 (void) printf(" "); 1516 } 1517 } 1518 1519 snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp); 1520 (void) printf("%s\n", blkbuf); 1521 } 1522 1523 static int 1524 visit_indirect(spa_t *spa, 
const dnode_phys_t *dnp, 1525 blkptr_t *bp, const zbookmark_phys_t *zb) 1526 { 1527 int err = 0; 1528 1529 if (bp->blk_birth == 0) 1530 return (0); 1531 1532 print_indirect(bp, zb, dnp); 1533 1534 if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) { 1535 arc_flags_t flags = ARC_FLAG_WAIT; 1536 int i; 1537 blkptr_t *cbp; 1538 int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; 1539 arc_buf_t *buf; 1540 uint64_t fill = 0; 1541 1542 err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, 1543 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); 1544 if (err) 1545 return (err); 1546 ASSERT(buf->b_data); 1547 1548 /* recursively visit blocks below this */ 1549 cbp = buf->b_data; 1550 for (i = 0; i < epb; i++, cbp++) { 1551 zbookmark_phys_t czb; 1552 1553 SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, 1554 zb->zb_level - 1, 1555 zb->zb_blkid * epb + i); 1556 err = visit_indirect(spa, dnp, cbp, &czb); 1557 if (err) 1558 break; 1559 fill += BP_GET_FILL(cbp); 1560 } 1561 if (!err) 1562 ASSERT3U(fill, ==, BP_GET_FILL(bp)); 1563 arc_buf_destroy(buf, &buf); 1564 } 1565 1566 return (err); 1567 } 1568 1569 /*ARGSUSED*/ 1570 static void 1571 dump_indirect(dnode_t *dn) 1572 { 1573 dnode_phys_t *dnp = dn->dn_phys; 1574 int j; 1575 zbookmark_phys_t czb; 1576 1577 (void) printf("Indirect blocks:\n"); 1578 1579 SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset), 1580 dn->dn_object, dnp->dn_nlevels - 1, 0); 1581 for (j = 0; j < dnp->dn_nblkptr; j++) { 1582 czb.zb_blkid = j; 1583 (void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp, 1584 &dnp->dn_blkptr[j], &czb); 1585 } 1586 1587 (void) printf("\n"); 1588 } 1589 1590 /*ARGSUSED*/ 1591 static void 1592 dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size) 1593 { 1594 dsl_dir_phys_t *dd = data; 1595 time_t crtime; 1596 char nice[32]; 1597 1598 /* make sure nicenum has enough space */ 1599 CTASSERT(sizeof (nice) >= NN_NUMBUF_SZ); 1600 1601 if (dd == NULL) 1602 return; 1603 1604 ASSERT3U(size, >=, sizeof (dsl_dir_phys_t)); 1605 1606 
crtime = dd->dd_creation_time; 1607 (void) printf("\t\tcreation_time = %s", ctime(&crtime)); 1608 (void) printf("\t\thead_dataset_obj = %llu\n", 1609 (u_longlong_t)dd->dd_head_dataset_obj); 1610 (void) printf("\t\tparent_dir_obj = %llu\n", 1611 (u_longlong_t)dd->dd_parent_obj); 1612 (void) printf("\t\torigin_obj = %llu\n", 1613 (u_longlong_t)dd->dd_origin_obj); 1614 (void) printf("\t\tchild_dir_zapobj = %llu\n", 1615 (u_longlong_t)dd->dd_child_dir_zapobj); 1616 zdb_nicenum(dd->dd_used_bytes, nice, sizeof (nice)); 1617 (void) printf("\t\tused_bytes = %s\n", nice); 1618 zdb_nicenum(dd->dd_compressed_bytes, nice, sizeof (nice)); 1619 (void) printf("\t\tcompressed_bytes = %s\n", nice); 1620 zdb_nicenum(dd->dd_uncompressed_bytes, nice, sizeof (nice)); 1621 (void) printf("\t\tuncompressed_bytes = %s\n", nice); 1622 zdb_nicenum(dd->dd_quota, nice, sizeof (nice)); 1623 (void) printf("\t\tquota = %s\n", nice); 1624 zdb_nicenum(dd->dd_reserved, nice, sizeof (nice)); 1625 (void) printf("\t\treserved = %s\n", nice); 1626 (void) printf("\t\tprops_zapobj = %llu\n", 1627 (u_longlong_t)dd->dd_props_zapobj); 1628 (void) printf("\t\tdeleg_zapobj = %llu\n", 1629 (u_longlong_t)dd->dd_deleg_zapobj); 1630 (void) printf("\t\tflags = %llx\n", 1631 (u_longlong_t)dd->dd_flags); 1632 1633 #define DO(which) \ 1634 zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, \ 1635 sizeof (nice)); \ 1636 (void) printf("\t\tused_breakdown[" #which "] = %s\n", nice) 1637 DO(HEAD); 1638 DO(SNAP); 1639 DO(CHILD); 1640 DO(CHILD_RSRV); 1641 DO(REFRSRV); 1642 #undef DO 1643 (void) printf("\t\tclones = %llu\n", 1644 (u_longlong_t)dd->dd_clones); 1645 } 1646 1647 /*ARGSUSED*/ 1648 static void 1649 dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size) 1650 { 1651 dsl_dataset_phys_t *ds = data; 1652 time_t crtime; 1653 char used[32], compressed[32], uncompressed[32], unique[32]; 1654 char blkbuf[BP_SPRINTF_LEN]; 1655 1656 /* make sure nicenum has enough space */ 1657 CTASSERT(sizeof 
(used) >= NN_NUMBUF_SZ); 1658 CTASSERT(sizeof (compressed) >= NN_NUMBUF_SZ); 1659 CTASSERT(sizeof (uncompressed) >= NN_NUMBUF_SZ); 1660 CTASSERT(sizeof (unique) >= NN_NUMBUF_SZ); 1661 1662 if (ds == NULL) 1663 return; 1664 1665 ASSERT(size == sizeof (*ds)); 1666 crtime = ds->ds_creation_time; 1667 zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used)); 1668 zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed)); 1669 zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed, 1670 sizeof (uncompressed)); 1671 zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique)); 1672 snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp); 1673 1674 (void) printf("\t\tdir_obj = %llu\n", 1675 (u_longlong_t)ds->ds_dir_obj); 1676 (void) printf("\t\tprev_snap_obj = %llu\n", 1677 (u_longlong_t)ds->ds_prev_snap_obj); 1678 (void) printf("\t\tprev_snap_txg = %llu\n", 1679 (u_longlong_t)ds->ds_prev_snap_txg); 1680 (void) printf("\t\tnext_snap_obj = %llu\n", 1681 (u_longlong_t)ds->ds_next_snap_obj); 1682 (void) printf("\t\tsnapnames_zapobj = %llu\n", 1683 (u_longlong_t)ds->ds_snapnames_zapobj); 1684 (void) printf("\t\tnum_children = %llu\n", 1685 (u_longlong_t)ds->ds_num_children); 1686 (void) printf("\t\tuserrefs_obj = %llu\n", 1687 (u_longlong_t)ds->ds_userrefs_obj); 1688 (void) printf("\t\tcreation_time = %s", ctime(&crtime)); 1689 (void) printf("\t\tcreation_txg = %llu\n", 1690 (u_longlong_t)ds->ds_creation_txg); 1691 (void) printf("\t\tdeadlist_obj = %llu\n", 1692 (u_longlong_t)ds->ds_deadlist_obj); 1693 (void) printf("\t\tused_bytes = %s\n", used); 1694 (void) printf("\t\tcompressed_bytes = %s\n", compressed); 1695 (void) printf("\t\tuncompressed_bytes = %s\n", uncompressed); 1696 (void) printf("\t\tunique = %s\n", unique); 1697 (void) printf("\t\tfsid_guid = %llu\n", 1698 (u_longlong_t)ds->ds_fsid_guid); 1699 (void) printf("\t\tguid = %llu\n", 1700 (u_longlong_t)ds->ds_guid); 1701 (void) printf("\t\tflags = %llx\n", 1702 (u_longlong_t)ds->ds_flags); 1703 (void) 
printf("\t\tnext_clones_obj = %llu\n", 1704 (u_longlong_t)ds->ds_next_clones_obj); 1705 (void) printf("\t\tprops_obj = %llu\n", 1706 (u_longlong_t)ds->ds_props_obj); 1707 (void) printf("\t\tbp = %s\n", blkbuf); 1708 } 1709 1710 /* ARGSUSED */ 1711 static int 1712 dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 1713 { 1714 char blkbuf[BP_SPRINTF_LEN]; 1715 1716 if (bp->blk_birth != 0) { 1717 snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); 1718 (void) printf("\t%s\n", blkbuf); 1719 } 1720 return (0); 1721 } 1722 1723 static void 1724 dump_bptree(objset_t *os, uint64_t obj, const char *name) 1725 { 1726 char bytes[32]; 1727 bptree_phys_t *bt; 1728 dmu_buf_t *db; 1729 1730 /* make sure nicenum has enough space */ 1731 CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); 1732 1733 if (dump_opt['d'] < 3) 1734 return; 1735 1736 VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); 1737 bt = db->db_data; 1738 zdb_nicenum(bt->bt_bytes, bytes, sizeof (bytes)); 1739 (void) printf("\n %s: %llu datasets, %s\n", 1740 name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes); 1741 dmu_buf_rele(db, FTAG); 1742 1743 if (dump_opt['d'] < 5) 1744 return; 1745 1746 (void) printf("\n"); 1747 1748 (void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL); 1749 } 1750 1751 /* ARGSUSED */ 1752 static int 1753 dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 1754 { 1755 char blkbuf[BP_SPRINTF_LEN]; 1756 1757 ASSERT(bp->blk_birth != 0); 1758 snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp); 1759 (void) printf("\t%s\n", blkbuf); 1760 return (0); 1761 } 1762 1763 static void 1764 dump_full_bpobj(bpobj_t *bpo, const char *name, int indent) 1765 { 1766 char bytes[32]; 1767 char comp[32]; 1768 char uncomp[32]; 1769 1770 /* make sure nicenum has enough space */ 1771 CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); 1772 CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); 1773 CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); 1774 1775 if (dump_opt['d'] < 3) 1776 return; 1777 1778 
zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes)); 1779 if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { 1780 zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp)); 1781 zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp)); 1782 (void) printf(" %*s: object %llu, %llu local blkptrs, " 1783 "%llu subobjs in object %llu, %s (%s/%s comp)\n", 1784 indent * 8, name, 1785 (u_longlong_t)bpo->bpo_object, 1786 (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, 1787 (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, 1788 (u_longlong_t)bpo->bpo_phys->bpo_subobjs, 1789 bytes, comp, uncomp); 1790 1791 for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { 1792 uint64_t subobj; 1793 bpobj_t subbpo; 1794 int error; 1795 VERIFY0(dmu_read(bpo->bpo_os, 1796 bpo->bpo_phys->bpo_subobjs, 1797 i * sizeof (subobj), sizeof (subobj), &subobj, 0)); 1798 error = bpobj_open(&subbpo, bpo->bpo_os, subobj); 1799 if (error != 0) { 1800 (void) printf("ERROR %u while trying to open " 1801 "subobj id %llu\n", 1802 error, (u_longlong_t)subobj); 1803 continue; 1804 } 1805 dump_full_bpobj(&subbpo, "subobj", indent + 1); 1806 bpobj_close(&subbpo); 1807 } 1808 } else { 1809 (void) printf(" %*s: object %llu, %llu blkptrs, %s\n", 1810 indent * 8, name, 1811 (u_longlong_t)bpo->bpo_object, 1812 (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, 1813 bytes); 1814 } 1815 1816 if (dump_opt['d'] < 5) 1817 return; 1818 1819 1820 if (indent == 0) { 1821 (void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL); 1822 (void) printf("\n"); 1823 } 1824 } 1825 1826 static void 1827 bpobj_count_refd(bpobj_t *bpo) 1828 { 1829 mos_obj_refd(bpo->bpo_object); 1830 1831 if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { 1832 mos_obj_refd(bpo->bpo_phys->bpo_subobjs); 1833 for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { 1834 uint64_t subobj; 1835 bpobj_t subbpo; 1836 int error; 1837 VERIFY0(dmu_read(bpo->bpo_os, 1838 bpo->bpo_phys->bpo_subobjs, 1839 i * sizeof 
(subobj), sizeof (subobj), &subobj, 0)); 1840 error = bpobj_open(&subbpo, bpo->bpo_os, subobj); 1841 if (error != 0) { 1842 (void) printf("ERROR %u while trying to open " 1843 "subobj id %llu\n", 1844 error, (u_longlong_t)subobj); 1845 continue; 1846 } 1847 bpobj_count_refd(&subbpo); 1848 bpobj_close(&subbpo); 1849 } 1850 } 1851 } 1852 1853 static void 1854 dump_deadlist(dsl_deadlist_t *dl) 1855 { 1856 dsl_deadlist_entry_t *dle; 1857 uint64_t unused; 1858 char bytes[32]; 1859 char comp[32]; 1860 char uncomp[32]; 1861 uint64_t empty_bpobj = 1862 dmu_objset_spa(dl->dl_os)->spa_dsl_pool->dp_empty_bpobj; 1863 1864 /* force the tree to be loaded */ 1865 dsl_deadlist_space_range(dl, 0, UINT64_MAX, &unused, &unused, &unused); 1866 1867 if (dl->dl_oldfmt) { 1868 if (dl->dl_bpobj.bpo_object != empty_bpobj) 1869 bpobj_count_refd(&dl->dl_bpobj); 1870 } else { 1871 mos_obj_refd(dl->dl_object); 1872 for (dle = avl_first(&dl->dl_tree); dle; 1873 dle = AVL_NEXT(&dl->dl_tree, dle)) { 1874 if (dle->dle_bpobj.bpo_object != empty_bpobj) 1875 bpobj_count_refd(&dle->dle_bpobj); 1876 } 1877 } 1878 1879 /* make sure nicenum has enough space */ 1880 CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); 1881 CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); 1882 CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); 1883 1884 if (dump_opt['d'] < 3) 1885 return; 1886 1887 if (dl->dl_oldfmt) { 1888 dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0); 1889 return; 1890 } 1891 1892 zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes)); 1893 zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp)); 1894 zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp)); 1895 (void) printf("\n Deadlist: %s (%s/%s comp)\n", 1896 bytes, comp, uncomp); 1897 1898 if (dump_opt['d'] < 4) 1899 return; 1900 1901 (void) printf("\n"); 1902 1903 for (dle = avl_first(&dl->dl_tree); dle; 1904 dle = AVL_NEXT(&dl->dl_tree, dle)) { 1905 if (dump_opt['d'] >= 5) { 1906 char buf[128]; 1907 (void) snprintf(buf, sizeof (buf), 1908 "mintxg %llu -> 
obj %llu", 1909 (longlong_t)dle->dle_mintxg, 1910 (longlong_t)dle->dle_bpobj.bpo_object); 1911 1912 dump_full_bpobj(&dle->dle_bpobj, buf, 0); 1913 } else { 1914 (void) printf("mintxg %llu -> obj %llu\n", 1915 (longlong_t)dle->dle_mintxg, 1916 (longlong_t)dle->dle_bpobj.bpo_object); 1917 } 1918 } 1919 } 1920 1921 static avl_tree_t idx_tree; 1922 static avl_tree_t domain_tree; 1923 static boolean_t fuid_table_loaded; 1924 static objset_t *sa_os = NULL; 1925 static sa_attr_type_t *sa_attr_table = NULL; 1926 1927 static int 1928 open_objset(const char *path, dmu_objset_type_t type, void *tag, objset_t **osp) 1929 { 1930 int err; 1931 uint64_t sa_attrs = 0; 1932 uint64_t version = 0; 1933 1934 VERIFY3P(sa_os, ==, NULL); 1935 err = dmu_objset_own(path, type, B_TRUE, B_FALSE, tag, osp); 1936 if (err != 0) { 1937 (void) fprintf(stderr, "failed to own dataset '%s': %s\n", path, 1938 strerror(err)); 1939 return (err); 1940 } 1941 1942 if (dmu_objset_type(*osp) == DMU_OST_ZFS && !(*osp)->os_encrypted) { 1943 (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR, 1944 8, 1, &version); 1945 if (version >= ZPL_VERSION_SA) { 1946 (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 1947 8, 1, &sa_attrs); 1948 } 1949 err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END, 1950 &sa_attr_table); 1951 if (err != 0) { 1952 (void) fprintf(stderr, "sa_setup failed: %s\n", 1953 strerror(err)); 1954 dmu_objset_disown(*osp, B_FALSE, tag); 1955 *osp = NULL; 1956 } 1957 } 1958 sa_os = *osp; 1959 1960 return (0); 1961 } 1962 1963 static void 1964 close_objset(objset_t *os, void *tag) 1965 { 1966 VERIFY3P(os, ==, sa_os); 1967 if (os->os_sa != NULL) 1968 sa_tear_down(os); 1969 dmu_objset_disown(os, B_FALSE, tag); 1970 sa_attr_table = NULL; 1971 sa_os = NULL; 1972 } 1973 1974 static void 1975 fuid_table_destroy() 1976 { 1977 if (fuid_table_loaded) { 1978 zfs_fuid_table_destroy(&idx_tree, &domain_tree); 1979 fuid_table_loaded = B_FALSE; 1980 } 1981 } 1982 1983 /* 1984 * print uid or gid 
information. 1985 * For normal POSIX id just the id is printed in decimal format. 1986 * For CIFS files with FUID the fuid is printed in hex followed by 1987 * the domain-rid string. 1988 */ 1989 static void 1990 print_idstr(uint64_t id, const char *id_type) 1991 { 1992 if (FUID_INDEX(id)) { 1993 char *domain; 1994 1995 domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id)); 1996 (void) printf("\t%s %llx [%s-%d]\n", id_type, 1997 (u_longlong_t)id, domain, (int)FUID_RID(id)); 1998 } else { 1999 (void) printf("\t%s %llu\n", id_type, (u_longlong_t)id); 2000 } 2001 2002 } 2003 2004 static void 2005 dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid) 2006 { 2007 uint32_t uid_idx, gid_idx; 2008 2009 uid_idx = FUID_INDEX(uid); 2010 gid_idx = FUID_INDEX(gid); 2011 2012 /* Load domain table, if not already loaded */ 2013 if (!fuid_table_loaded && (uid_idx || gid_idx)) { 2014 uint64_t fuid_obj; 2015 2016 /* first find the fuid object. It lives in the master node */ 2017 VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 2018 8, 1, &fuid_obj) == 0); 2019 zfs_fuid_avl_tree_create(&idx_tree, &domain_tree); 2020 (void) zfs_fuid_table_load(os, fuid_obj, 2021 &idx_tree, &domain_tree); 2022 fuid_table_loaded = B_TRUE; 2023 } 2024 2025 print_idstr(uid, "uid"); 2026 print_idstr(gid, "gid"); 2027 } 2028 2029 /*ARGSUSED*/ 2030 static void 2031 dump_znode(objset_t *os, uint64_t object, void *data, size_t size) 2032 { 2033 char path[MAXPATHLEN * 2]; /* allow for xattr and failure prefix */ 2034 sa_handle_t *hdl; 2035 uint64_t xattr, rdev, gen; 2036 uint64_t uid, gid, mode, fsize, parent, links; 2037 uint64_t pflags; 2038 uint64_t acctm[2], modtm[2], chgtm[2], crtm[2]; 2039 time_t z_crtime, z_atime, z_mtime, z_ctime; 2040 sa_bulk_attr_t bulk[12]; 2041 int idx = 0; 2042 int error; 2043 2044 VERIFY3P(os, ==, sa_os); 2045 if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) { 2046 (void) printf("Failed to get handle for SA znode\n"); 2047 return; 2048 } 2049 2050 
SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8); 2051 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8); 2052 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL, 2053 &links, 8); 2054 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8); 2055 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL, 2056 &mode, 8); 2057 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT], 2058 NULL, &parent, 8); 2059 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL, 2060 &fsize, 8); 2061 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL, 2062 acctm, 16); 2063 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL, 2064 modtm, 16); 2065 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL, 2066 crtm, 16); 2067 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL, 2068 chgtm, 16); 2069 SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL, 2070 &pflags, 8); 2071 2072 if (sa_bulk_lookup(hdl, bulk, idx)) { 2073 (void) sa_handle_destroy(hdl); 2074 return; 2075 } 2076 2077 z_crtime = (time_t)crtm[0]; 2078 z_atime = (time_t)acctm[0]; 2079 z_mtime = (time_t)modtm[0]; 2080 z_ctime = (time_t)chgtm[0]; 2081 2082 if (dump_opt['d'] > 4) { 2083 error = zfs_obj_to_path(os, object, path, sizeof (path)); 2084 if (error == ESTALE) { 2085 (void) snprintf(path, sizeof (path), "on delete queue"); 2086 } else if (error != 0) { 2087 leaked_objects++; 2088 (void) snprintf(path, sizeof (path), 2089 "path not found, possibly leaked"); 2090 } 2091 (void) printf("\tpath %s\n", path); 2092 } 2093 dump_uidgid(os, uid, gid); 2094 (void) printf("\tatime %s", ctime(&z_atime)); 2095 (void) printf("\tmtime %s", ctime(&z_mtime)); 2096 (void) printf("\tctime %s", ctime(&z_ctime)); 2097 (void) printf("\tcrtime %s", ctime(&z_crtime)); 2098 (void) printf("\tgen %llu\n", (u_longlong_t)gen); 2099 (void) printf("\tmode %llo\n", (u_longlong_t)mode); 2100 (void) printf("\tsize %llu\n", (u_longlong_t)fsize); 2101 (void) 
printf("\tparent %llu\n", (u_longlong_t)parent); 2102 (void) printf("\tlinks %llu\n", (u_longlong_t)links); 2103 (void) printf("\tpflags %llx\n", (u_longlong_t)pflags); 2104 if (dmu_objset_projectquota_enabled(os) && (pflags & ZFS_PROJID)) { 2105 uint64_t projid; 2106 2107 if (sa_lookup(hdl, sa_attr_table[ZPL_PROJID], &projid, 2108 sizeof (uint64_t)) == 0) 2109 (void) printf("\tprojid %llu\n", (u_longlong_t)projid); 2110 } 2111 if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr, 2112 sizeof (uint64_t)) == 0) 2113 (void) printf("\txattr %llu\n", (u_longlong_t)xattr); 2114 if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev, 2115 sizeof (uint64_t)) == 0) 2116 (void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev); 2117 sa_handle_destroy(hdl); 2118 } 2119 2120 /*ARGSUSED*/ 2121 static void 2122 dump_acl(objset_t *os, uint64_t object, void *data, size_t size) 2123 { 2124 } 2125 2126 /*ARGSUSED*/ 2127 static void 2128 dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size) 2129 { 2130 } 2131 2132 2133 static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = { 2134 dump_none, /* unallocated */ 2135 dump_zap, /* object directory */ 2136 dump_uint64, /* object array */ 2137 dump_none, /* packed nvlist */ 2138 dump_packed_nvlist, /* packed nvlist size */ 2139 dump_none, /* bpobj */ 2140 dump_bpobj, /* bpobj header */ 2141 dump_none, /* SPA space map header */ 2142 dump_none, /* SPA space map */ 2143 dump_none, /* ZIL intent log */ 2144 dump_dnode, /* DMU dnode */ 2145 dump_dmu_objset, /* DMU objset */ 2146 dump_dsl_dir, /* DSL directory */ 2147 dump_zap, /* DSL directory child map */ 2148 dump_zap, /* DSL dataset snap map */ 2149 dump_zap, /* DSL props */ 2150 dump_dsl_dataset, /* DSL dataset */ 2151 dump_znode, /* ZFS znode */ 2152 dump_acl, /* ZFS V0 ACL */ 2153 dump_uint8, /* ZFS plain file */ 2154 dump_zpldir, /* ZFS directory */ 2155 dump_zap, /* ZFS master node */ 2156 dump_zap, /* ZFS delete queue */ 2157 dump_uint8, /* zvol object */ 2158 
dump_zap, /* zvol prop */ 2159 dump_uint8, /* other uint8[] */ 2160 dump_uint64, /* other uint64[] */ 2161 dump_zap, /* other ZAP */ 2162 dump_zap, /* persistent error log */ 2163 dump_uint8, /* SPA history */ 2164 dump_history_offsets, /* SPA history offsets */ 2165 dump_zap, /* Pool properties */ 2166 dump_zap, /* DSL permissions */ 2167 dump_acl, /* ZFS ACL */ 2168 dump_uint8, /* ZFS SYSACL */ 2169 dump_none, /* FUID nvlist */ 2170 dump_packed_nvlist, /* FUID nvlist size */ 2171 dump_zap, /* DSL dataset next clones */ 2172 dump_zap, /* DSL scrub queue */ 2173 dump_zap, /* ZFS user/group/project used */ 2174 dump_zap, /* ZFS user/group/project quota */ 2175 dump_zap, /* snapshot refcount tags */ 2176 dump_ddt_zap, /* DDT ZAP object */ 2177 dump_zap, /* DDT statistics */ 2178 dump_znode, /* SA object */ 2179 dump_zap, /* SA Master Node */ 2180 dump_sa_attrs, /* SA attribute registration */ 2181 dump_sa_layouts, /* SA attribute layouts */ 2182 dump_zap, /* DSL scrub translations */ 2183 dump_none, /* fake dedup BP */ 2184 dump_zap, /* deadlist */ 2185 dump_none, /* deadlist hdr */ 2186 dump_zap, /* dsl clones */ 2187 dump_bpobj_subobjs, /* bpobj subobjs */ 2188 dump_unknown, /* Unknown type, must be last */ 2189 }; 2190 2191 static void 2192 dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header, 2193 uint64_t *dnode_slots_used) 2194 { 2195 dmu_buf_t *db = NULL; 2196 dmu_object_info_t doi; 2197 dnode_t *dn; 2198 boolean_t dnode_held = B_FALSE; 2199 void *bonus = NULL; 2200 size_t bsize = 0; 2201 char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32]; 2202 char bonus_size[32]; 2203 char aux[50]; 2204 int error; 2205 2206 /* make sure nicenum has enough space */ 2207 CTASSERT(sizeof (iblk) >= NN_NUMBUF_SZ); 2208 CTASSERT(sizeof (dblk) >= NN_NUMBUF_SZ); 2209 CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ); 2210 CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ); 2211 CTASSERT(sizeof (bonus_size) >= NN_NUMBUF_SZ); 2212 2213 if (*print_header) { 2214 
(void) printf("\n%10s %3s %5s %5s %5s %6s %5s %6s %s\n", 2215 "Object", "lvl", "iblk", "dblk", "dsize", "dnsize", 2216 "lsize", "%full", "type"); 2217 *print_header = 0; 2218 } 2219 2220 if (object == 0) { 2221 dn = DMU_META_DNODE(os); 2222 dmu_object_info_from_dnode(dn, &doi); 2223 } else { 2224 /* 2225 * Encrypted datasets will have sensitive bonus buffers 2226 * encrypted. Therefore we cannot hold the bonus buffer and 2227 * must hold the dnode itself instead. 2228 */ 2229 error = dmu_object_info(os, object, &doi); 2230 if (error) 2231 fatal("dmu_object_info() failed, errno %u", error); 2232 2233 if (os->os_encrypted && 2234 DMU_OT_IS_ENCRYPTED(doi.doi_bonus_type)) { 2235 error = dnode_hold(os, object, FTAG, &dn); 2236 if (error) 2237 fatal("dnode_hold() failed, errno %u", error); 2238 dnode_held = B_TRUE; 2239 } else { 2240 error = dmu_bonus_hold(os, object, FTAG, &db); 2241 if (error) 2242 fatal("dmu_bonus_hold(%llu) failed, errno %u", 2243 object, error); 2244 bonus = db->db_data; 2245 bsize = db->db_size; 2246 dn = DB_DNODE((dmu_buf_impl_t *)db); 2247 } 2248 } 2249 2250 if (dnode_slots_used != NULL) 2251 *dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE; 2252 2253 zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk)); 2254 zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk)); 2255 zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize)); 2256 zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize)); 2257 zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size)); 2258 zdb_nicenum(doi.doi_dnodesize, dnsize, sizeof (dnsize)); 2259 (void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count * 2260 doi.doi_data_block_size / (object == 0 ? 
DNODES_PER_BLOCK : 1) / 2261 doi.doi_max_offset); 2262 2263 aux[0] = '\0'; 2264 2265 if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) { 2266 (void) snprintf(aux + strlen(aux), sizeof (aux), " (K=%s)", 2267 ZDB_CHECKSUM_NAME(doi.doi_checksum)); 2268 } 2269 2270 if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) { 2271 (void) snprintf(aux + strlen(aux), sizeof (aux), " (Z=%s)", 2272 ZDB_COMPRESS_NAME(doi.doi_compress)); 2273 } 2274 2275 (void) printf("%10" PRIu64 2276 " %3u %5s %5s %5s %5s %5s %6s %s%s\n", 2277 object, doi.doi_indirection, iblk, dblk, 2278 asize, dnsize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux); 2279 2280 if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) { 2281 (void) printf("%10s %3s %5s %5s %5s %5s %5s %6s %s\n", 2282 "", "", "", "", "", "", bonus_size, "bonus", 2283 ZDB_OT_NAME(doi.doi_bonus_type)); 2284 } 2285 2286 if (verbosity >= 4) { 2287 (void) printf("\tdnode flags: %s%s%s%s\n", 2288 (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ? 2289 "USED_BYTES " : "", 2290 (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ? 2291 "USERUSED_ACCOUNTED " : "", 2292 (dn->dn_phys->dn_flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) ? 2293 "USEROBJUSED_ACCOUNTED " : "", 2294 (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? 2295 "SPILL_BLKPTR" : ""); 2296 (void) printf("\tdnode maxblkid: %llu\n", 2297 (longlong_t)dn->dn_phys->dn_maxblkid); 2298 2299 if (!dnode_held) { 2300 object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, 2301 object, bonus, bsize); 2302 } else { 2303 (void) printf("\t\t(bonus encrypted)\n"); 2304 } 2305 2306 if (!os->os_encrypted || !DMU_OT_IS_ENCRYPTED(doi.doi_type)) { 2307 object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, 2308 NULL, 0); 2309 } else { 2310 (void) printf("\t\t(object encrypted)\n"); 2311 } 2312 2313 *print_header = 1; 2314 } 2315 2316 if (verbosity >= 5) 2317 dump_indirect(dn); 2318 2319 if (verbosity >= 5) { 2320 /* 2321 * Report the list of segments that comprise the object. 
2322 */ 2323 uint64_t start = 0; 2324 uint64_t end; 2325 uint64_t blkfill = 1; 2326 int minlvl = 1; 2327 2328 if (dn->dn_type == DMU_OT_DNODE) { 2329 minlvl = 0; 2330 blkfill = DNODES_PER_BLOCK; 2331 } 2332 2333 for (;;) { 2334 char segsize[32]; 2335 /* make sure nicenum has enough space */ 2336 CTASSERT(sizeof (segsize) >= NN_NUMBUF_SZ); 2337 error = dnode_next_offset(dn, 2338 0, &start, minlvl, blkfill, 0); 2339 if (error) 2340 break; 2341 end = start; 2342 error = dnode_next_offset(dn, 2343 DNODE_FIND_HOLE, &end, minlvl, blkfill, 0); 2344 zdb_nicenum(end - start, segsize, sizeof (segsize)); 2345 (void) printf("\t\tsegment [%016llx, %016llx)" 2346 " size %5s\n", (u_longlong_t)start, 2347 (u_longlong_t)end, segsize); 2348 if (error) 2349 break; 2350 start = end; 2351 } 2352 } 2353 2354 if (db != NULL) 2355 dmu_buf_rele(db, FTAG); 2356 if (dnode_held) 2357 dnode_rele(dn, FTAG); 2358 } 2359 2360 static void 2361 count_dir_mos_objects(dsl_dir_t *dd) 2362 { 2363 mos_obj_refd(dd->dd_object); 2364 mos_obj_refd(dsl_dir_phys(dd)->dd_child_dir_zapobj); 2365 mos_obj_refd(dsl_dir_phys(dd)->dd_deleg_zapobj); 2366 mos_obj_refd(dsl_dir_phys(dd)->dd_props_zapobj); 2367 mos_obj_refd(dsl_dir_phys(dd)->dd_clones); 2368 } 2369 2370 static void 2371 count_ds_mos_objects(dsl_dataset_t *ds) 2372 { 2373 mos_obj_refd(ds->ds_object); 2374 mos_obj_refd(dsl_dataset_phys(ds)->ds_next_clones_obj); 2375 mos_obj_refd(dsl_dataset_phys(ds)->ds_props_obj); 2376 mos_obj_refd(dsl_dataset_phys(ds)->ds_userrefs_obj); 2377 mos_obj_refd(dsl_dataset_phys(ds)->ds_snapnames_zapobj); 2378 2379 if (!dsl_dataset_is_snapshot(ds)) { 2380 count_dir_mos_objects(ds->ds_dir); 2381 } 2382 } 2383 2384 static const char *objset_types[DMU_OST_NUMTYPES] = { 2385 "NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" }; 2386 2387 static void 2388 dump_dir(objset_t *os) 2389 { 2390 dmu_objset_stats_t dds; 2391 uint64_t object, object_count; 2392 uint64_t refdbytes, usedobjs, scratch; 2393 char numbuf[32]; 2394 char 
blkbuf[BP_SPRINTF_LEN + 20]; 2395 char osname[ZFS_MAX_DATASET_NAME_LEN]; 2396 const char *type = "UNKNOWN"; 2397 int verbosity = dump_opt['d']; 2398 int print_header = 1; 2399 unsigned i; 2400 int error; 2401 uint64_t total_slots_used = 0; 2402 uint64_t max_slot_used = 0; 2403 uint64_t dnode_slots; 2404 2405 /* make sure nicenum has enough space */ 2406 CTASSERT(sizeof (numbuf) >= NN_NUMBUF_SZ); 2407 2408 dsl_pool_config_enter(dmu_objset_pool(os), FTAG); 2409 dmu_objset_fast_stat(os, &dds); 2410 dsl_pool_config_exit(dmu_objset_pool(os), FTAG); 2411 2412 if (dds.dds_type < DMU_OST_NUMTYPES) 2413 type = objset_types[dds.dds_type]; 2414 2415 if (dds.dds_type == DMU_OST_META) { 2416 dds.dds_creation_txg = TXG_INITIAL; 2417 usedobjs = BP_GET_FILL(os->os_rootbp); 2418 refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)-> 2419 dd_used_bytes; 2420 } else { 2421 dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch); 2422 } 2423 2424 ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp)); 2425 2426 zdb_nicenum(refdbytes, numbuf, sizeof (numbuf)); 2427 2428 if (verbosity >= 4) { 2429 (void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp "); 2430 (void) snprintf_blkptr(blkbuf + strlen(blkbuf), 2431 sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp); 2432 } else { 2433 blkbuf[0] = '\0'; 2434 } 2435 2436 dmu_objset_name(os, osname); 2437 2438 (void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, " 2439 "%s, %llu objects%s%s\n", 2440 osname, type, (u_longlong_t)dmu_objset_id(os), 2441 (u_longlong_t)dds.dds_creation_txg, 2442 numbuf, (u_longlong_t)usedobjs, blkbuf, 2443 (dds.dds_inconsistent) ? 
" (inconsistent)" : ""); 2444 2445 if (zopt_objects != 0) { 2446 for (i = 0; i < zopt_objects; i++) 2447 dump_object(os, zopt_object[i], verbosity, 2448 &print_header, NULL); 2449 (void) printf("\n"); 2450 return; 2451 } 2452 2453 if (dump_opt['i'] != 0 || verbosity >= 2) 2454 dump_intent_log(dmu_objset_zil(os)); 2455 2456 if (dmu_objset_ds(os) != NULL) { 2457 dsl_dataset_t *ds = dmu_objset_ds(os); 2458 dump_deadlist(&ds->ds_deadlist); 2459 2460 if (dsl_dataset_remap_deadlist_exists(ds)) { 2461 (void) printf("ds_remap_deadlist:\n"); 2462 dump_deadlist(&ds->ds_remap_deadlist); 2463 } 2464 count_ds_mos_objects(ds); 2465 } 2466 2467 if (verbosity < 2) 2468 return; 2469 2470 if (BP_IS_HOLE(os->os_rootbp)) 2471 return; 2472 2473 dump_object(os, 0, verbosity, &print_header, NULL); 2474 object_count = 0; 2475 if (DMU_USERUSED_DNODE(os) != NULL && 2476 DMU_USERUSED_DNODE(os)->dn_type != 0) { 2477 dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header, 2478 NULL); 2479 dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header, 2480 NULL); 2481 } 2482 2483 if (DMU_PROJECTUSED_DNODE(os) != NULL && 2484 DMU_PROJECTUSED_DNODE(os)->dn_type != 0) 2485 dump_object(os, DMU_PROJECTUSED_OBJECT, verbosity, 2486 &print_header, NULL); 2487 2488 object = 0; 2489 while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { 2490 dump_object(os, object, verbosity, &print_header, &dnode_slots); 2491 object_count++; 2492 total_slots_used += dnode_slots; 2493 max_slot_used = object + dnode_slots - 1; 2494 } 2495 2496 (void) printf("\n"); 2497 2498 (void) printf(" Dnode slots:\n"); 2499 (void) printf("\tTotal used: %10llu\n", 2500 (u_longlong_t)total_slots_used); 2501 (void) printf("\tMax used: %10llu\n", 2502 (u_longlong_t)max_slot_used); 2503 (void) printf("\tPercent empty: %10lf\n", 2504 (double)(max_slot_used - total_slots_used)*100 / 2505 (double)max_slot_used); 2506 2507 (void) printf("\n"); 2508 2509 if (error != ESRCH) { 2510 (void) fprintf(stderr, "dmu_object_next() = 
%d\n", error); 2511 abort(); 2512 } 2513 if (leaked_objects != 0) { 2514 (void) printf("%d potentially leaked objects detected\n", 2515 leaked_objects); 2516 leaked_objects = 0; 2517 } 2518 2519 ASSERT3U(object_count, ==, usedobjs); 2520 } 2521 2522 static void 2523 dump_uberblock(uberblock_t *ub, const char *header, const char *footer) 2524 { 2525 time_t timestamp = ub->ub_timestamp; 2526 2527 (void) printf("%s", header ? header : ""); 2528 (void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic); 2529 (void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version); 2530 (void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg); 2531 (void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum); 2532 (void) printf("\ttimestamp = %llu UTC = %s", 2533 (u_longlong_t)ub->ub_timestamp, asctime(localtime(×tamp))); 2534 2535 (void) printf("\tmmp_magic = %016llx\n", 2536 (u_longlong_t)ub->ub_mmp_magic); 2537 if (MMP_VALID(ub)) { 2538 (void) printf("\tmmp_delay = %0llu\n", 2539 (u_longlong_t)ub->ub_mmp_delay); 2540 if (MMP_SEQ_VALID(ub)) 2541 (void) printf("\tmmp_seq = %u\n", 2542 (unsigned int) MMP_SEQ(ub)); 2543 if (MMP_FAIL_INT_VALID(ub)) 2544 (void) printf("\tmmp_fail = %u\n", 2545 (unsigned int) MMP_FAIL_INT(ub)); 2546 if (MMP_INTERVAL_VALID(ub)) 2547 (void) printf("\tmmp_write = %u\n", 2548 (unsigned int) MMP_INTERVAL(ub)); 2549 /* After MMP_* to make summarize_uberblock_mmp cleaner */ 2550 (void) printf("\tmmp_valid = %x\n", 2551 (unsigned int) ub->ub_mmp_config & 0xFF); 2552 } 2553 2554 if (dump_opt['u'] >= 4) { 2555 char blkbuf[BP_SPRINTF_LEN]; 2556 snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp); 2557 (void) printf("\trootbp = %s\n", blkbuf); 2558 } 2559 (void) printf("\tcheckpoint_txg = %llu\n", 2560 (u_longlong_t)ub->ub_checkpoint_txg); 2561 (void) printf("%s", footer ? 
footer : ""); 2562 } 2563 2564 static void 2565 dump_config(spa_t *spa) 2566 { 2567 dmu_buf_t *db; 2568 size_t nvsize = 0; 2569 int error = 0; 2570 2571 2572 error = dmu_bonus_hold(spa->spa_meta_objset, 2573 spa->spa_config_object, FTAG, &db); 2574 2575 if (error == 0) { 2576 nvsize = *(uint64_t *)db->db_data; 2577 dmu_buf_rele(db, FTAG); 2578 2579 (void) printf("\nMOS Configuration:\n"); 2580 dump_packed_nvlist(spa->spa_meta_objset, 2581 spa->spa_config_object, (void *)&nvsize, 1); 2582 } else { 2583 (void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d", 2584 (u_longlong_t)spa->spa_config_object, error); 2585 } 2586 } 2587 2588 static void 2589 dump_cachefile(const char *cachefile) 2590 { 2591 int fd; 2592 struct stat64 statbuf; 2593 char *buf; 2594 nvlist_t *config; 2595 2596 if ((fd = open64(cachefile, O_RDONLY)) < 0) { 2597 (void) printf("cannot open '%s': %s\n", cachefile, 2598 strerror(errno)); 2599 exit(1); 2600 } 2601 2602 if (fstat64(fd, &statbuf) != 0) { 2603 (void) printf("failed to stat '%s': %s\n", cachefile, 2604 strerror(errno)); 2605 exit(1); 2606 } 2607 2608 if ((buf = malloc(statbuf.st_size)) == NULL) { 2609 (void) fprintf(stderr, "failed to allocate %llu bytes\n", 2610 (u_longlong_t)statbuf.st_size); 2611 exit(1); 2612 } 2613 2614 if (read(fd, buf, statbuf.st_size) != statbuf.st_size) { 2615 (void) fprintf(stderr, "failed to read %llu bytes\n", 2616 (u_longlong_t)statbuf.st_size); 2617 exit(1); 2618 } 2619 2620 (void) close(fd); 2621 2622 if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) { 2623 (void) fprintf(stderr, "failed to unpack nvlist\n"); 2624 exit(1); 2625 } 2626 2627 free(buf); 2628 2629 dump_nvlist(config, 0); 2630 2631 nvlist_free(config); 2632 } 2633 2634 static char curpath[PATH_MAX]; 2635 2636 /* 2637 * Iterate through the path components, recursively passing 2638 * current one's obj and remaining path until we find the obj 2639 * for the last one. 
2640 */ 2641 static int 2642 dump_path_impl(objset_t *os, uint64_t obj, char *name) 2643 { 2644 int err; 2645 int header = 1; 2646 uint64_t child_obj; 2647 char *s; 2648 dmu_buf_t *db; 2649 dmu_object_info_t doi; 2650 2651 if ((s = strchr(name, '/')) != NULL) 2652 *s = '\0'; 2653 err = zap_lookup(os, obj, name, 8, 1, &child_obj); 2654 2655 (void) strlcat(curpath, name, sizeof (curpath)); 2656 2657 if (err != 0) { 2658 (void) fprintf(stderr, "failed to lookup %s: %s\n", 2659 curpath, strerror(err)); 2660 return (err); 2661 } 2662 2663 child_obj = ZFS_DIRENT_OBJ(child_obj); 2664 err = sa_buf_hold(os, child_obj, FTAG, &db); 2665 if (err != 0) { 2666 (void) fprintf(stderr, 2667 "failed to get SA dbuf for obj %llu: %s\n", 2668 (u_longlong_t)child_obj, strerror(err)); 2669 return (EINVAL); 2670 } 2671 dmu_object_info_from_db(db, &doi); 2672 sa_buf_rele(db, FTAG); 2673 2674 if (doi.doi_bonus_type != DMU_OT_SA && 2675 doi.doi_bonus_type != DMU_OT_ZNODE) { 2676 (void) fprintf(stderr, "invalid bonus type %d for obj %llu\n", 2677 doi.doi_bonus_type, (u_longlong_t)child_obj); 2678 return (EINVAL); 2679 } 2680 2681 if (dump_opt['v'] > 6) { 2682 (void) printf("obj=%llu %s type=%d bonustype=%d\n", 2683 (u_longlong_t)child_obj, curpath, doi.doi_type, 2684 doi.doi_bonus_type); 2685 } 2686 2687 (void) strlcat(curpath, "/", sizeof (curpath)); 2688 2689 switch (doi.doi_type) { 2690 case DMU_OT_DIRECTORY_CONTENTS: 2691 if (s != NULL && *(s + 1) != '\0') 2692 return (dump_path_impl(os, child_obj, s + 1)); 2693 /*FALLTHROUGH*/ 2694 case DMU_OT_PLAIN_FILE_CONTENTS: 2695 dump_object(os, child_obj, dump_opt['v'], &header, NULL); 2696 return (0); 2697 default: 2698 (void) fprintf(stderr, "object %llu has non-file/directory " 2699 "type %d\n", (u_longlong_t)obj, doi.doi_type); 2700 break; 2701 } 2702 2703 return (EINVAL); 2704 } 2705 2706 /* 2707 * Dump the blocks for the object specified by path inside the dataset. 
2708 */ 2709 static int 2710 dump_path(char *ds, char *path) 2711 { 2712 int err; 2713 objset_t *os; 2714 uint64_t root_obj; 2715 2716 err = open_objset(ds, DMU_OST_ZFS, FTAG, &os); 2717 if (err != 0) 2718 return (err); 2719 2720 err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj); 2721 if (err != 0) { 2722 (void) fprintf(stderr, "can't lookup root znode: %s\n", 2723 strerror(err)); 2724 dmu_objset_disown(os, B_FALSE, FTAG); 2725 return (EINVAL); 2726 } 2727 2728 (void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds); 2729 2730 err = dump_path_impl(os, root_obj, path); 2731 2732 close_objset(os, FTAG); 2733 return (err); 2734 } 2735 2736 typedef struct cksum_record { 2737 zio_cksum_t cksum; 2738 boolean_t labels[VDEV_LABELS]; 2739 avl_node_t link; 2740 } cksum_record_t; 2741 2742 static int 2743 cksum_record_compare(const void *x1, const void *x2) 2744 { 2745 const cksum_record_t *l = (cksum_record_t *)x1; 2746 const cksum_record_t *r = (cksum_record_t *)x2; 2747 int arraysize = ARRAY_SIZE(l->cksum.zc_word); 2748 int difference; 2749 2750 for (int i = 0; i < arraysize; i++) { 2751 difference = AVL_CMP(l->cksum.zc_word[i], r->cksum.zc_word[i]); 2752 if (difference) 2753 break; 2754 } 2755 2756 return (difference); 2757 } 2758 2759 static cksum_record_t * 2760 cksum_record_alloc(zio_cksum_t *cksum, int l) 2761 { 2762 cksum_record_t *rec; 2763 2764 rec = umem_zalloc(sizeof (*rec), UMEM_NOFAIL); 2765 rec->cksum = *cksum; 2766 rec->labels[l] = B_TRUE; 2767 2768 return (rec); 2769 } 2770 2771 static cksum_record_t * 2772 cksum_record_lookup(avl_tree_t *tree, zio_cksum_t *cksum) 2773 { 2774 cksum_record_t lookup = { .cksum = *cksum }; 2775 avl_index_t where; 2776 2777 return (avl_find(tree, &lookup, &where)); 2778 } 2779 2780 static cksum_record_t * 2781 cksum_record_insert(avl_tree_t *tree, zio_cksum_t *cksum, int l) 2782 { 2783 cksum_record_t *rec; 2784 2785 rec = cksum_record_lookup(tree, cksum); 2786 if (rec) { 2787 rec->labels[l] = 
B_TRUE; 2788 } else { 2789 rec = cksum_record_alloc(cksum, l); 2790 avl_add(tree, rec); 2791 } 2792 2793 return (rec); 2794 } 2795 2796 static int 2797 first_label(cksum_record_t *rec) 2798 { 2799 for (int i = 0; i < VDEV_LABELS; i++) 2800 if (rec->labels[i]) 2801 return (i); 2802 2803 return (-1); 2804 } 2805 2806 static void 2807 print_label_numbers(char *prefix, cksum_record_t *rec) 2808 { 2809 printf("%s", prefix); 2810 for (int i = 0; i < VDEV_LABELS; i++) 2811 if (rec->labels[i] == B_TRUE) 2812 printf("%d ", i); 2813 printf("\n"); 2814 } 2815 2816 #define MAX_UBERBLOCK_COUNT (VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT) 2817 2818 typedef struct zdb_label { 2819 vdev_label_t label; 2820 nvlist_t *config_nv; 2821 cksum_record_t *config; 2822 cksum_record_t *uberblocks[MAX_UBERBLOCK_COUNT]; 2823 boolean_t header_printed; 2824 boolean_t read_failed; 2825 } zdb_label_t; 2826 2827 static void 2828 print_label_header(zdb_label_t *label, int l) 2829 { 2830 2831 if (dump_opt['q']) 2832 return; 2833 2834 if (label->header_printed == B_TRUE) 2835 return; 2836 2837 (void) printf("------------------------------------\n"); 2838 (void) printf("LABEL %d\n", l); 2839 (void) printf("------------------------------------\n"); 2840 2841 label->header_printed = B_TRUE; 2842 } 2843 2844 static void 2845 dump_config_from_label(zdb_label_t *label, size_t buflen, int l) 2846 { 2847 if (dump_opt['q']) 2848 return; 2849 2850 if ((dump_opt['l'] < 3) && (first_label(label->config) != l)) 2851 return; 2852 2853 print_label_header(label, l); 2854 dump_nvlist(label->config_nv, 4); 2855 print_label_numbers(" labels = ", label->config); 2856 } 2857 2858 #define ZDB_MAX_UB_HEADER_SIZE 32 2859 2860 static void 2861 dump_label_uberblocks(zdb_label_t *label, uint64_t ashift, int label_num) 2862 { 2863 2864 vdev_t vd; 2865 char header[ZDB_MAX_UB_HEADER_SIZE]; 2866 2867 vd.vdev_ashift = ashift; 2868 vd.vdev_top = &vd; 2869 2870 for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) { 2871 uint64_t uoff = 
VDEV_UBERBLOCK_OFFSET(&vd, i); 2872 uberblock_t *ub = (void *)((char *)&label->label + uoff); 2873 cksum_record_t *rec = label->uberblocks[i]; 2874 2875 if (rec == NULL) { 2876 if (dump_opt['u'] >= 2) { 2877 print_label_header(label, label_num); 2878 (void) printf(" Uberblock[%d] invalid\n", i); 2879 } 2880 continue; 2881 } 2882 2883 if ((dump_opt['u'] < 3) && (first_label(rec) != label_num)) 2884 continue; 2885 2886 print_label_header(label, label_num); 2887 (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE, 2888 " Uberblock[%d]\n", i); 2889 dump_uberblock(ub, header, ""); 2890 print_label_numbers(" labels = ", rec); 2891 } 2892 } 2893 2894 static int 2895 dump_label(const char *dev) 2896 { 2897 char path[MAXPATHLEN]; 2898 zdb_label_t labels[VDEV_LABELS]; 2899 uint64_t psize, ashift; 2900 struct stat64 statbuf; 2901 boolean_t config_found = B_FALSE; 2902 boolean_t error = B_FALSE; 2903 avl_tree_t config_tree; 2904 avl_tree_t uberblock_tree; 2905 void *node, *cookie; 2906 int fd; 2907 2908 bzero(labels, sizeof (labels)); 2909 2910 (void) strlcpy(path, dev, sizeof (path)); 2911 if (dev[0] == '/') { 2912 if (strncmp(dev, ZFS_DISK_ROOTD, 2913 strlen(ZFS_DISK_ROOTD)) == 0) { 2914 (void) snprintf(path, sizeof (path), "%s%s", 2915 ZFS_RDISK_ROOTD, dev + strlen(ZFS_DISK_ROOTD)); 2916 } 2917 } else if (stat64(path, &statbuf) != 0) { 2918 char *s; 2919 2920 (void) snprintf(path, sizeof (path), "%s%s", ZFS_RDISK_ROOTD, 2921 dev); 2922 if (((s = strrchr(dev, 's')) == NULL && 2923 (s = strchr(dev, 'p')) == NULL) || 2924 !isdigit(*(s + 1))) 2925 (void) strlcat(path, "s0", sizeof (path)); 2926 } 2927 2928 if ((fd = open64(path, O_RDONLY)) < 0) { 2929 (void) fprintf(stderr, "cannot open '%s': %s\n", path, 2930 strerror(errno)); 2931 exit(1); 2932 } 2933 2934 if (fstat64(fd, &statbuf) != 0) { 2935 (void) fprintf(stderr, "failed to stat '%s': %s\n", path, 2936 strerror(errno)); 2937 (void) close(fd); 2938 exit(1); 2939 } 2940 2941 if (S_ISBLK(statbuf.st_mode)) { 2942 (void) 
fprintf(stderr, 2943 "cannot use '%s': character device required\n", path); 2944 (void) close(fd); 2945 exit(1); 2946 } 2947 2948 avl_create(&config_tree, cksum_record_compare, 2949 sizeof (cksum_record_t), offsetof(cksum_record_t, link)); 2950 avl_create(&uberblock_tree, cksum_record_compare, 2951 sizeof (cksum_record_t), offsetof(cksum_record_t, link)); 2952 2953 psize = statbuf.st_size; 2954 psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t)); 2955 ashift = SPA_MINBLOCKSHIFT; 2956 2957 /* 2958 * 1. Read the label from disk 2959 * 2. Unpack the configuration and insert in config tree. 2960 * 3. Traverse all uberblocks and insert in uberblock tree. 2961 */ 2962 for (int l = 0; l < VDEV_LABELS; l++) { 2963 zdb_label_t *label = &labels[l]; 2964 char *buf = label->label.vl_vdev_phys.vp_nvlist; 2965 size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist); 2966 nvlist_t *config; 2967 cksum_record_t *rec; 2968 zio_cksum_t cksum; 2969 vdev_t vd; 2970 2971 if (pread64(fd, &label->label, sizeof (label->label), 2972 vdev_label_offset(psize, l, 0)) != sizeof (label->label)) { 2973 if (!dump_opt['q']) 2974 (void) printf("failed to read label %d\n", l); 2975 label->read_failed = B_TRUE; 2976 error = B_TRUE; 2977 continue; 2978 } 2979 2980 label->read_failed = B_FALSE; 2981 2982 if (nvlist_unpack(buf, buflen, &config, 0) == 0) { 2983 nvlist_t *vdev_tree = NULL; 2984 size_t size; 2985 2986 if ((nvlist_lookup_nvlist(config, 2987 ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) || 2988 (nvlist_lookup_uint64(vdev_tree, 2989 ZPOOL_CONFIG_ASHIFT, &ashift) != 0)) 2990 ashift = SPA_MINBLOCKSHIFT; 2991 2992 if (nvlist_size(config, &size, NV_ENCODE_XDR) != 0) 2993 size = buflen; 2994 2995 fletcher_4_native(buf, size, NULL, &cksum); 2996 rec = cksum_record_insert(&config_tree, &cksum, l); 2997 2998 label->config = rec; 2999 label->config_nv = config; 3000 config_found = B_TRUE; 3001 } else { 3002 error = B_TRUE; 3003 } 3004 3005 vd.vdev_ashift = ashift; 3006 vd.vdev_top = &vd; 3007 3008 
for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) { 3009 uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i); 3010 uberblock_t *ub = (void *)((char *)label + uoff); 3011 3012 if (uberblock_verify(ub)) 3013 continue; 3014 3015 fletcher_4_native(ub, sizeof (*ub), NULL, &cksum); 3016 rec = cksum_record_insert(&uberblock_tree, &cksum, l); 3017 3018 label->uberblocks[i] = rec; 3019 } 3020 } 3021 3022 /* 3023 * Dump the label and uberblocks. 3024 */ 3025 for (int l = 0; l < VDEV_LABELS; l++) { 3026 zdb_label_t *label = &labels[l]; 3027 size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist); 3028 3029 if (label->read_failed == B_TRUE) 3030 continue; 3031 3032 if (label->config_nv) { 3033 dump_config_from_label(label, buflen, l); 3034 } else { 3035 if (!dump_opt['q']) 3036 (void) printf("failed to unpack label %d\n", l); 3037 } 3038 if (dump_opt['u']) 3039 dump_label_uberblocks(label, ashift, l); 3040 3041 nvlist_free(label->config_nv); 3042 } 3043 3044 cookie = NULL; 3045 while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL) 3046 umem_free(node, sizeof (cksum_record_t)); 3047 3048 cookie = NULL; 3049 while ((node = avl_destroy_nodes(&uberblock_tree, &cookie)) != NULL) 3050 umem_free(node, sizeof (cksum_record_t)); 3051 3052 avl_destroy(&config_tree); 3053 avl_destroy(&uberblock_tree); 3054 3055 (void) close(fd); 3056 3057 return (config_found == B_FALSE ? 2 : 3058 (error == B_TRUE ? 
1 : 0)); 3059 } 3060 3061 static uint64_t dataset_feature_count[SPA_FEATURES]; 3062 static uint64_t remap_deadlist_count = 0; 3063 3064 /*ARGSUSED*/ 3065 static int 3066 dump_one_dir(const char *dsname, void *arg) 3067 { 3068 int error; 3069 objset_t *os; 3070 3071 error = open_objset(dsname, DMU_OST_ANY, FTAG, &os); 3072 if (error != 0) 3073 return (0); 3074 3075 for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { 3076 if (!dmu_objset_ds(os)->ds_feature_inuse[f]) 3077 continue; 3078 ASSERT(spa_feature_table[f].fi_flags & 3079 ZFEATURE_FLAG_PER_DATASET); 3080 dataset_feature_count[f]++; 3081 } 3082 3083 if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) { 3084 remap_deadlist_count++; 3085 } 3086 3087 dump_dir(os); 3088 close_objset(os, FTAG); 3089 fuid_table_destroy(); 3090 return (0); 3091 } 3092 3093 /* 3094 * Block statistics. 3095 */ 3096 #define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2) 3097 typedef struct zdb_blkstats { 3098 uint64_t zb_asize; 3099 uint64_t zb_lsize; 3100 uint64_t zb_psize; 3101 uint64_t zb_count; 3102 uint64_t zb_gangs; 3103 uint64_t zb_ditto_samevdev; 3104 uint64_t zb_ditto_same_ms; 3105 uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE]; 3106 } zdb_blkstats_t; 3107 3108 /* 3109 * Extended object types to report deferred frees and dedup auto-ditto blocks. 
3110 */ 3111 #define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0) 3112 #define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1) 3113 #define ZDB_OT_OTHER (DMU_OT_NUMTYPES + 2) 3114 #define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 3) 3115 3116 static const char *zdb_ot_extname[] = { 3117 "deferred free", 3118 "dedup ditto", 3119 "other", 3120 "Total", 3121 }; 3122 3123 #define ZB_TOTAL DN_MAX_LEVELS 3124 3125 typedef struct zdb_cb { 3126 zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; 3127 uint64_t zcb_removing_size; 3128 uint64_t zcb_checkpoint_size; 3129 uint64_t zcb_dedup_asize; 3130 uint64_t zcb_dedup_blocks; 3131 uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES]; 3132 uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES] 3133 [BPE_PAYLOAD_SIZE]; 3134 uint64_t zcb_start; 3135 hrtime_t zcb_lastprint; 3136 uint64_t zcb_totalasize; 3137 uint64_t zcb_errors[256]; 3138 int zcb_readfails; 3139 int zcb_haderrors; 3140 spa_t *zcb_spa; 3141 uint32_t **zcb_vd_obsolete_counts; 3142 } zdb_cb_t; 3143 3144 /* test if two DVA offsets from same vdev are within the same metaslab */ 3145 static boolean_t 3146 same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2) 3147 { 3148 vdev_t *vd = vdev_lookup_top(spa, vdev); 3149 uint64_t ms_shift = vd->vdev_ms_shift; 3150 3151 return ((off1 >> ms_shift) == (off2 >> ms_shift)); 3152 } 3153 3154 static void 3155 zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, 3156 dmu_object_type_t type) 3157 { 3158 uint64_t refcnt = 0; 3159 3160 ASSERT(type < ZDB_OT_TOTAL); 3161 3162 if (zilog && zil_bp_tree_add(zilog, bp) != 0) 3163 return; 3164 3165 spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER); 3166 3167 for (int i = 0; i < 4; i++) { 3168 int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL; 3169 int t = (i & 1) ? 
type : ZDB_OT_TOTAL; 3170 int equal; 3171 zdb_blkstats_t *zb = &zcb->zcb_type[l][t]; 3172 3173 zb->zb_asize += BP_GET_ASIZE(bp); 3174 zb->zb_lsize += BP_GET_LSIZE(bp); 3175 zb->zb_psize += BP_GET_PSIZE(bp); 3176 zb->zb_count++; 3177 3178 /* 3179 * The histogram is only big enough to record blocks up to 3180 * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last, 3181 * "other", bucket. 3182 */ 3183 unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT; 3184 idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1); 3185 zb->zb_psize_histogram[idx]++; 3186 3187 zb->zb_gangs += BP_COUNT_GANG(bp); 3188 3189 switch (BP_GET_NDVAS(bp)) { 3190 case 2: 3191 if (DVA_GET_VDEV(&bp->blk_dva[0]) == 3192 DVA_GET_VDEV(&bp->blk_dva[1])) { 3193 zb->zb_ditto_samevdev++; 3194 3195 if (same_metaslab(zcb->zcb_spa, 3196 DVA_GET_VDEV(&bp->blk_dva[0]), 3197 DVA_GET_OFFSET(&bp->blk_dva[0]), 3198 DVA_GET_OFFSET(&bp->blk_dva[1]))) 3199 zb->zb_ditto_same_ms++; 3200 } 3201 break; 3202 case 3: 3203 equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == 3204 DVA_GET_VDEV(&bp->blk_dva[1])) + 3205 (DVA_GET_VDEV(&bp->blk_dva[0]) == 3206 DVA_GET_VDEV(&bp->blk_dva[2])) + 3207 (DVA_GET_VDEV(&bp->blk_dva[1]) == 3208 DVA_GET_VDEV(&bp->blk_dva[2])); 3209 if (equal != 0) { 3210 zb->zb_ditto_samevdev++; 3211 3212 if (DVA_GET_VDEV(&bp->blk_dva[0]) == 3213 DVA_GET_VDEV(&bp->blk_dva[1]) && 3214 same_metaslab(zcb->zcb_spa, 3215 DVA_GET_VDEV(&bp->blk_dva[0]), 3216 DVA_GET_OFFSET(&bp->blk_dva[0]), 3217 DVA_GET_OFFSET(&bp->blk_dva[1]))) 3218 zb->zb_ditto_same_ms++; 3219 else if (DVA_GET_VDEV(&bp->blk_dva[0]) == 3220 DVA_GET_VDEV(&bp->blk_dva[2]) && 3221 same_metaslab(zcb->zcb_spa, 3222 DVA_GET_VDEV(&bp->blk_dva[0]), 3223 DVA_GET_OFFSET(&bp->blk_dva[0]), 3224 DVA_GET_OFFSET(&bp->blk_dva[2]))) 3225 zb->zb_ditto_same_ms++; 3226 else if (DVA_GET_VDEV(&bp->blk_dva[1]) == 3227 DVA_GET_VDEV(&bp->blk_dva[2]) && 3228 same_metaslab(zcb->zcb_spa, 3229 DVA_GET_VDEV(&bp->blk_dva[1]), 3230 DVA_GET_OFFSET(&bp->blk_dva[1]), 3231 
DVA_GET_OFFSET(&bp->blk_dva[2]))) 3232 zb->zb_ditto_same_ms++; 3233 } 3234 break; 3235 } 3236 } 3237 3238 spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG); 3239 3240 if (BP_IS_EMBEDDED(bp)) { 3241 zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++; 3242 zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)] 3243 [BPE_GET_PSIZE(bp)]++; 3244 return; 3245 } 3246 3247 if (dump_opt['L']) 3248 return; 3249 3250 if (BP_GET_DEDUP(bp)) { 3251 ddt_t *ddt; 3252 ddt_entry_t *dde; 3253 3254 ddt = ddt_select(zcb->zcb_spa, bp); 3255 ddt_enter(ddt); 3256 dde = ddt_lookup(ddt, bp, B_FALSE); 3257 3258 if (dde == NULL) { 3259 refcnt = 0; 3260 } else { 3261 ddt_phys_t *ddp = ddt_phys_select(dde, bp); 3262 ddt_phys_decref(ddp); 3263 refcnt = ddp->ddp_refcnt; 3264 if (ddt_phys_total_refcnt(dde) == 0) 3265 ddt_remove(ddt, dde); 3266 } 3267 ddt_exit(ddt); 3268 } 3269 3270 VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa, 3271 refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa), 3272 bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0); 3273 } 3274 3275 static void 3276 zdb_blkptr_done(zio_t *zio) 3277 { 3278 spa_t *spa = zio->io_spa; 3279 blkptr_t *bp = zio->io_bp; 3280 int ioerr = zio->io_error; 3281 zdb_cb_t *zcb = zio->io_private; 3282 zbookmark_phys_t *zb = &zio->io_bookmark; 3283 3284 abd_free(zio->io_abd); 3285 3286 mutex_enter(&spa->spa_scrub_lock); 3287 spa->spa_load_verify_ios--; 3288 cv_broadcast(&spa->spa_scrub_io_cv); 3289 3290 if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 3291 char blkbuf[BP_SPRINTF_LEN]; 3292 3293 zcb->zcb_haderrors = 1; 3294 zcb->zcb_errors[ioerr]++; 3295 3296 if (dump_opt['b'] >= 2) 3297 snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); 3298 else 3299 blkbuf[0] = '\0'; 3300 3301 (void) printf("zdb_blkptr_cb: " 3302 "Got error %d reading " 3303 "<%llu, %llu, %lld, %llx> %s -- skipping\n", 3304 ioerr, 3305 (u_longlong_t)zb->zb_objset, 3306 (u_longlong_t)zb->zb_object, 3307 (u_longlong_t)zb->zb_level, 3308 (u_longlong_t)zb->zb_blkid, 3309 blkbuf); 3310 } 3311 
mutex_exit(&spa->spa_scrub_lock); 3312 } 3313 3314 static int 3315 zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 3316 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 3317 { 3318 zdb_cb_t *zcb = arg; 3319 dmu_object_type_t type; 3320 boolean_t is_metadata; 3321 3322 if (bp == NULL) 3323 return (0); 3324 3325 if (dump_opt['b'] >= 5 && bp->blk_birth > 0) { 3326 char blkbuf[BP_SPRINTF_LEN]; 3327 snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); 3328 (void) printf("objset %llu object %llu " 3329 "level %lld offset 0x%llx %s\n", 3330 (u_longlong_t)zb->zb_objset, 3331 (u_longlong_t)zb->zb_object, 3332 (longlong_t)zb->zb_level, 3333 (u_longlong_t)blkid2offset(dnp, bp, zb), 3334 blkbuf); 3335 } 3336 3337 if (BP_IS_HOLE(bp)) 3338 return (0); 3339 3340 type = BP_GET_TYPE(bp); 3341 3342 zdb_count_block(zcb, zilog, bp, 3343 (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type); 3344 3345 is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)); 3346 3347 if (!BP_IS_EMBEDDED(bp) && 3348 (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) { 3349 size_t size = BP_GET_PSIZE(bp); 3350 abd_t *abd = abd_alloc(size, B_FALSE); 3351 int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; 3352 3353 /* If it's an intent log block, failure is expected. 
*/ 3354 if (zb->zb_level == ZB_ZIL_LEVEL) 3355 flags |= ZIO_FLAG_SPECULATIVE; 3356 3357 mutex_enter(&spa->spa_scrub_lock); 3358 while (spa->spa_load_verify_ios > max_inflight) 3359 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 3360 spa->spa_load_verify_ios++; 3361 mutex_exit(&spa->spa_scrub_lock); 3362 3363 zio_nowait(zio_read(NULL, spa, bp, abd, size, 3364 zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb)); 3365 } 3366 3367 zcb->zcb_readfails = 0; 3368 3369 /* only call gethrtime() every 100 blocks */ 3370 static int iters; 3371 if (++iters > 100) 3372 iters = 0; 3373 else 3374 return (0); 3375 3376 if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) { 3377 uint64_t now = gethrtime(); 3378 char buf[10]; 3379 uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize; 3380 int kb_per_sec = 3381 1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000)); 3382 int sec_remaining = 3383 (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec; 3384 3385 /* make sure nicenum has enough space */ 3386 CTASSERT(sizeof (buf) >= NN_NUMBUF_SZ); 3387 3388 zfs_nicenum(bytes, buf, sizeof (buf)); 3389 (void) fprintf(stderr, 3390 "\r%5s completed (%4dMB/s) " 3391 "estimated time remaining: %uhr %02umin %02usec ", 3392 buf, kb_per_sec / 1024, 3393 sec_remaining / 60 / 60, 3394 sec_remaining / 60 % 60, 3395 sec_remaining % 60); 3396 3397 zcb->zcb_lastprint = now; 3398 } 3399 3400 return (0); 3401 } 3402 3403 static void 3404 zdb_leak(void *arg, uint64_t start, uint64_t size) 3405 { 3406 vdev_t *vd = arg; 3407 3408 (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n", 3409 (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size); 3410 } 3411 3412 static metaslab_ops_t zdb_metaslab_ops = { 3413 NULL /* alloc */ 3414 }; 3415 3416 typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme, 3417 uint64_t txg, void *arg); 3418 3419 typedef struct unflushed_iter_cb_arg { 3420 spa_t *uic_spa; 3421 uint64_t uic_txg; 3422 void 
*uic_arg; 3423 zdb_log_sm_cb_t uic_cb; 3424 } unflushed_iter_cb_arg_t; 3425 3426 static int 3427 iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg) 3428 { 3429 unflushed_iter_cb_arg_t *uic = arg; 3430 3431 return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg)); 3432 } 3433 3434 static void 3435 iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg) 3436 { 3437 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) 3438 return; 3439 3440 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3441 for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); 3442 sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { 3443 space_map_t *sm = NULL; 3444 VERIFY0(space_map_open(&sm, spa_meta_objset(spa), 3445 sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); 3446 3447 unflushed_iter_cb_arg_t uic = { 3448 .uic_spa = spa, 3449 .uic_txg = sls->sls_txg, 3450 .uic_arg = arg, 3451 .uic_cb = cb 3452 }; 3453 3454 VERIFY0(space_map_iterate(sm, space_map_length(sm), 3455 iterate_through_spacemap_logs_cb, &uic)); 3456 space_map_close(sm); 3457 } 3458 spa_config_exit(spa, SCL_CONFIG, FTAG); 3459 } 3460 3461 /* ARGSUSED */ 3462 static int 3463 load_unflushed_svr_segs_cb(spa_t *spa, space_map_entry_t *sme, 3464 uint64_t txg, void *arg) 3465 { 3466 spa_vdev_removal_t *svr = arg; 3467 3468 uint64_t offset = sme->sme_offset; 3469 uint64_t size = sme->sme_run; 3470 3471 /* skip vdevs we don't care about */ 3472 if (sme->sme_vdev != svr->svr_vdev_id) 3473 return (0); 3474 3475 vdev_t *vd = vdev_lookup_top(spa, sme->sme_vdev); 3476 metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 3477 ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); 3478 3479 if (txg < metaslab_unflushed_txg(ms)) 3480 return (0); 3481 3482 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 3483 ASSERT(vim != NULL); 3484 if (offset >= vdev_indirect_mapping_max_offset(vim)) 3485 return (0); 3486 3487 if (sme->sme_type == SM_ALLOC) 3488 
range_tree_add(svr->svr_allocd_segs, offset, size); 3489 else 3490 range_tree_remove(svr->svr_allocd_segs, offset, size); 3491 3492 return (0); 3493 } 3494 3495 static void 3496 zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) 3497 { 3498 ddt_bookmark_t ddb; 3499 ddt_entry_t dde; 3500 int error; 3501 3502 ASSERT(!dump_opt['L']); 3503 3504 bzero(&ddb, sizeof (ddb)); 3505 while ((error = ddt_walk(spa, &ddb, &dde)) == 0) { 3506 blkptr_t blk; 3507 ddt_phys_t *ddp = dde.dde_phys; 3508 3509 if (ddb.ddb_class == DDT_CLASS_UNIQUE) 3510 return; 3511 3512 ASSERT(ddt_phys_total_refcnt(&dde) > 1); 3513 3514 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 3515 if (ddp->ddp_phys_birth == 0) 3516 continue; 3517 ddt_bp_create(ddb.ddb_checksum, 3518 &dde.dde_key, ddp, &blk); 3519 if (p == DDT_PHYS_DITTO) { 3520 zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO); 3521 } else { 3522 zcb->zcb_dedup_asize += 3523 BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1); 3524 zcb->zcb_dedup_blocks++; 3525 } 3526 } 3527 ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; 3528 ddt_enter(ddt); 3529 VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL); 3530 ddt_exit(ddt); 3531 } 3532 3533 ASSERT(error == ENOENT); 3534 } 3535 3536 /* ARGSUSED */ 3537 static void 3538 claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, 3539 uint64_t size, void *arg) 3540 { 3541 /* 3542 * This callback was called through a remap from 3543 * a device being removed. Therefore, the vdev that 3544 * this callback is applied to is a concrete 3545 * vdev. 
3546 */ 3547 ASSERT(vdev_is_concrete(vd)); 3548 3549 VERIFY0(metaslab_claim_impl(vd, offset, size, 3550 spa_min_claim_txg(vd->vdev_spa))); 3551 } 3552 3553 static void 3554 claim_segment_cb(void *arg, uint64_t offset, uint64_t size) 3555 { 3556 vdev_t *vd = arg; 3557 3558 vdev_indirect_ops.vdev_op_remap(vd, offset, size, 3559 claim_segment_impl_cb, NULL); 3560 } 3561 3562 /* 3563 * After accounting for all allocated blocks that are directly referenced, 3564 * we might have missed a reference to a block from a partially complete 3565 * (and thus unused) indirect mapping object. We perform a secondary pass 3566 * through the metaslabs we have already mapped and claim the destination 3567 * blocks. 3568 */ 3569 static void 3570 zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) 3571 { 3572 if (dump_opt['L']) 3573 return; 3574 3575 if (spa->spa_vdev_removal == NULL) 3576 return; 3577 3578 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3579 3580 spa_vdev_removal_t *svr = spa->spa_vdev_removal; 3581 vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); 3582 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 3583 3584 ASSERT0(range_tree_space(svr->svr_allocd_segs)); 3585 3586 range_tree_t *allocs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); 3587 for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { 3588 metaslab_t *msp = vd->vdev_ms[msi]; 3589 3590 if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim)) 3591 break; 3592 3593 ASSERT0(range_tree_space(allocs)); 3594 if (msp->ms_sm != NULL) 3595 VERIFY0(space_map_load(msp->ms_sm, allocs, SM_ALLOC)); 3596 range_tree_vacate(allocs, range_tree_add, svr->svr_allocd_segs); 3597 } 3598 range_tree_destroy(allocs); 3599 3600 iterate_through_spacemap_logs(spa, load_unflushed_svr_segs_cb, svr); 3601 3602 /* 3603 * Clear everything past what has been synced, 3604 * because we have not allocated mappings for 3605 * it yet. 
3606 */ 3607 range_tree_clear(svr->svr_allocd_segs, 3608 vdev_indirect_mapping_max_offset(vim), 3609 vd->vdev_asize - vdev_indirect_mapping_max_offset(vim)); 3610 3611 zcb->zcb_removing_size += range_tree_space(svr->svr_allocd_segs); 3612 range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd); 3613 3614 spa_config_exit(spa, SCL_CONFIG, FTAG); 3615 } 3616 3617 /* ARGSUSED */ 3618 static int 3619 increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 3620 { 3621 zdb_cb_t *zcb = arg; 3622 spa_t *spa = zcb->zcb_spa; 3623 vdev_t *vd; 3624 const dva_t *dva = &bp->blk_dva[0]; 3625 3626 ASSERT(!dump_opt['L']); 3627 ASSERT3U(BP_GET_NDVAS(bp), ==, 1); 3628 3629 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 3630 vd = vdev_lookup_top(zcb->zcb_spa, DVA_GET_VDEV(dva)); 3631 ASSERT3P(vd, !=, NULL); 3632 spa_config_exit(spa, SCL_VDEV, FTAG); 3633 3634 ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); 3635 ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL); 3636 3637 vdev_indirect_mapping_increment_obsolete_count( 3638 vd->vdev_indirect_mapping, 3639 DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva), 3640 zcb->zcb_vd_obsolete_counts[vd->vdev_id]); 3641 3642 return (0); 3643 } 3644 3645 static uint32_t * 3646 zdb_load_obsolete_counts(vdev_t *vd) 3647 { 3648 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 3649 spa_t *spa = vd->vdev_spa; 3650 spa_condensing_indirect_phys_t *scip = 3651 &spa->spa_condensing_indirect_phys; 3652 uint32_t *counts; 3653 3654 EQUIV(vdev_obsolete_sm_object(vd) != 0, vd->vdev_obsolete_sm != NULL); 3655 counts = vdev_indirect_mapping_load_obsolete_counts(vim); 3656 if (vd->vdev_obsolete_sm != NULL) { 3657 vdev_indirect_mapping_load_obsolete_spacemap(vim, counts, 3658 vd->vdev_obsolete_sm); 3659 } 3660 if (scip->scip_vdev == vd->vdev_id && 3661 scip->scip_prev_obsolete_sm_object != 0) { 3662 space_map_t *prev_obsolete_sm = NULL; 3663 VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, 3664 
scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); 3665 vdev_indirect_mapping_load_obsolete_spacemap(vim, counts, 3666 prev_obsolete_sm); 3667 space_map_close(prev_obsolete_sm); 3668 } 3669 return (counts); 3670 } 3671 3672 typedef struct checkpoint_sm_exclude_entry_arg { 3673 vdev_t *cseea_vd; 3674 uint64_t cseea_checkpoint_size; 3675 } checkpoint_sm_exclude_entry_arg_t; 3676 3677 static int 3678 checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg) 3679 { 3680 checkpoint_sm_exclude_entry_arg_t *cseea = arg; 3681 vdev_t *vd = cseea->cseea_vd; 3682 metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; 3683 uint64_t end = sme->sme_offset + sme->sme_run; 3684 3685 ASSERT(sme->sme_type == SM_FREE); 3686 3687 /* 3688 * Since the vdev_checkpoint_sm exists in the vdev level 3689 * and the ms_sm space maps exist in the metaslab level, 3690 * an entry in the checkpoint space map could theoretically 3691 * cross the boundaries of the metaslab that it belongs. 3692 * 3693 * In reality, because of the way that we populate and 3694 * manipulate the checkpoint's space maps currently, 3695 * there shouldn't be any entries that cross metaslabs. 3696 * Hence the assertion below. 3697 * 3698 * That said, there is no fundamental requirement that 3699 * the checkpoint's space map entries should not cross 3700 * metaslab boundaries. So if needed we could add code 3701 * that handles metaslab-crossing segments in the future. 3702 */ 3703 VERIFY3U(sme->sme_offset, >=, ms->ms_start); 3704 VERIFY3U(end, <=, ms->ms_start + ms->ms_size); 3705 3706 /* 3707 * By removing the entry from the allocated segments we 3708 * also verify that the entry is there to begin with. 
3709 */ 3710 mutex_enter(&ms->ms_lock); 3711 range_tree_remove(ms->ms_allocatable, sme->sme_offset, sme->sme_run); 3712 mutex_exit(&ms->ms_lock); 3713 3714 cseea->cseea_checkpoint_size += sme->sme_run; 3715 return (0); 3716 } 3717 3718 static void 3719 zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb) 3720 { 3721 spa_t *spa = vd->vdev_spa; 3722 space_map_t *checkpoint_sm = NULL; 3723 uint64_t checkpoint_sm_obj; 3724 3725 /* 3726 * If there is no vdev_top_zap, we are in a pool whose 3727 * version predates the pool checkpoint feature. 3728 */ 3729 if (vd->vdev_top_zap == 0) 3730 return; 3731 3732 /* 3733 * If there is no reference of the vdev_checkpoint_sm in 3734 * the vdev_top_zap, then one of the following scenarios 3735 * is true: 3736 * 3737 * 1] There is no checkpoint 3738 * 2] There is a checkpoint, but no checkpointed blocks 3739 * have been freed yet 3740 * 3] The current vdev is indirect 3741 * 3742 * In these cases we return immediately. 3743 */ 3744 if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap, 3745 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) 3746 return; 3747 3748 VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap, 3749 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, 3750 &checkpoint_sm_obj)); 3751 3752 checkpoint_sm_exclude_entry_arg_t cseea; 3753 cseea.cseea_vd = vd; 3754 cseea.cseea_checkpoint_size = 0; 3755 3756 VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), 3757 checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); 3758 3759 VERIFY0(space_map_iterate(checkpoint_sm, 3760 space_map_length(checkpoint_sm), 3761 checkpoint_sm_exclude_entry_cb, &cseea)); 3762 space_map_close(checkpoint_sm); 3763 3764 zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size; 3765 } 3766 3767 static void 3768 zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb) 3769 { 3770 ASSERT(!dump_opt['L']); 3771 3772 vdev_t *rvd = spa->spa_root_vdev; 3773 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 3774 ASSERT3U(c, ==, 
rvd->vdev_child[c]->vdev_id); 3775 zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb); 3776 } 3777 } 3778 3779 static int 3780 count_unflushed_space_cb(spa_t *spa, space_map_entry_t *sme, 3781 uint64_t txg, void *arg) 3782 { 3783 int64_t *ualloc_space = arg; 3784 uint64_t offset = sme->sme_offset; 3785 uint64_t vdev_id = sme->sme_vdev; 3786 3787 vdev_t *vd = vdev_lookup_top(spa, vdev_id); 3788 if (!vdev_is_concrete(vd)) 3789 return (0); 3790 3791 metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 3792 ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); 3793 3794 if (txg < metaslab_unflushed_txg(ms)) 3795 return (0); 3796 3797 if (sme->sme_type == SM_ALLOC) 3798 *ualloc_space += sme->sme_run; 3799 else 3800 *ualloc_space -= sme->sme_run; 3801 3802 return (0); 3803 } 3804 3805 static int64_t 3806 get_unflushed_alloc_space(spa_t *spa) 3807 { 3808 if (dump_opt['L']) 3809 return (0); 3810 3811 int64_t ualloc_space = 0; 3812 iterate_through_spacemap_logs(spa, count_unflushed_space_cb, 3813 &ualloc_space); 3814 return (ualloc_space); 3815 } 3816 3817 static int 3818 load_unflushed_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) 3819 { 3820 maptype_t *uic_maptype = arg; 3821 uint64_t offset = sme->sme_offset; 3822 uint64_t size = sme->sme_run; 3823 uint64_t vdev_id = sme->sme_vdev; 3824 vdev_t *vd = vdev_lookup_top(spa, vdev_id); 3825 3826 /* skip indirect vdevs */ 3827 if (!vdev_is_concrete(vd)) 3828 return (0); 3829 3830 metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 3831 3832 ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); 3833 ASSERT(*uic_maptype == SM_ALLOC || *uic_maptype == SM_FREE); 3834 3835 if (txg < metaslab_unflushed_txg(ms)) 3836 return (0); 3837 3838 if (*uic_maptype == sme->sme_type) 3839 range_tree_add(ms->ms_allocatable, offset, size); 3840 else 3841 range_tree_remove(ms->ms_allocatable, offset, size); 3842 3843 return (0); 3844 } 3845 3846 static void 3847 
load_unflushed_to_ms_allocatables(spa_t *spa, maptype_t maptype) 3848 { 3849 iterate_through_spacemap_logs(spa, load_unflushed_cb, &maptype); 3850 } 3851 3852 static void 3853 load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype) 3854 { 3855 vdev_t *rvd = spa->spa_root_vdev; 3856 for (uint64_t i = 0; i < rvd->vdev_children; i++) { 3857 vdev_t *vd = rvd->vdev_child[i]; 3858 3859 ASSERT3U(i, ==, vd->vdev_id); 3860 3861 if (vd->vdev_ops == &vdev_indirect_ops) 3862 continue; 3863 3864 for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { 3865 metaslab_t *msp = vd->vdev_ms[m]; 3866 3867 (void) fprintf(stderr, 3868 "\rloading concrete vdev %llu, " 3869 "metaslab %llu of %llu ...", 3870 (longlong_t)vd->vdev_id, 3871 (longlong_t)msp->ms_id, 3872 (longlong_t)vd->vdev_ms_count); 3873 3874 mutex_enter(&msp->ms_lock); 3875 range_tree_vacate(msp->ms_allocatable, NULL, NULL); 3876 3877 /* 3878 * We don't want to spend the CPU manipulating the 3879 * size-ordered tree, so clear the range_tree ops. 3880 */ 3881 msp->ms_allocatable->rt_ops = NULL; 3882 3883 if (msp->ms_sm != NULL) { 3884 VERIFY0(space_map_load(msp->ms_sm, 3885 msp->ms_allocatable, maptype)); 3886 } 3887 if (!msp->ms_loaded) 3888 msp->ms_loaded = B_TRUE; 3889 mutex_exit(&msp->ms_lock); 3890 } 3891 } 3892 3893 load_unflushed_to_ms_allocatables(spa, maptype); 3894 } 3895 3896 /* 3897 * vm_idxp is an in-out parameter which (for indirect vdevs) is the 3898 * index in vim_entries that has the first entry in this metaslab. 3899 * On return, it will be set to the first entry after this metaslab. 3900 */ 3901 static void 3902 load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp, 3903 uint64_t *vim_idxp) 3904 { 3905 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 3906 3907 mutex_enter(&msp->ms_lock); 3908 range_tree_vacate(msp->ms_allocatable, NULL, NULL); 3909 3910 /* 3911 * We don't want to spend the CPU manipulating the 3912 * size-ordered tree, so clear the range_tree ops. 
3913 */ 3914 msp->ms_allocatable->rt_ops = NULL; 3915 3916 for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim); 3917 (*vim_idxp)++) { 3918 vdev_indirect_mapping_entry_phys_t *vimep = 3919 &vim->vim_entries[*vim_idxp]; 3920 uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); 3921 uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst); 3922 ASSERT3U(ent_offset, >=, msp->ms_start); 3923 if (ent_offset >= msp->ms_start + msp->ms_size) 3924 break; 3925 3926 /* 3927 * Mappings do not cross metaslab boundaries, 3928 * because we create them by walking the metaslabs. 3929 */ 3930 ASSERT3U(ent_offset + ent_len, <=, 3931 msp->ms_start + msp->ms_size); 3932 range_tree_add(msp->ms_allocatable, ent_offset, ent_len); 3933 } 3934 3935 if (!msp->ms_loaded) 3936 msp->ms_loaded = B_TRUE; 3937 mutex_exit(&msp->ms_lock); 3938 } 3939 3940 static void 3941 zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb) 3942 { 3943 ASSERT(!dump_opt['L']); 3944 3945 vdev_t *rvd = spa->spa_root_vdev; 3946 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 3947 vdev_t *vd = rvd->vdev_child[c]; 3948 3949 ASSERT3U(c, ==, vd->vdev_id); 3950 3951 if (vd->vdev_ops != &vdev_indirect_ops) 3952 continue; 3953 3954 /* 3955 * Note: we don't check for mapping leaks on 3956 * removing vdevs because their ms_allocatable's 3957 * are used to look for leaks in allocated space. 3958 */ 3959 zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd); 3960 3961 /* 3962 * Normally, indirect vdevs don't have any 3963 * metaslabs. We want to set them up for 3964 * zio_claim(). 
3965 */ 3966 VERIFY0(vdev_metaslab_init(vd, 0)); 3967 3968 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 3969 uint64_t vim_idx = 0; 3970 for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { 3971 3972 (void) fprintf(stderr, 3973 "\rloading indirect vdev %llu, " 3974 "metaslab %llu of %llu ...", 3975 (longlong_t)vd->vdev_id, 3976 (longlong_t)vd->vdev_ms[m]->ms_id, 3977 (longlong_t)vd->vdev_ms_count); 3978 3979 load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m], 3980 &vim_idx); 3981 } 3982 ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim)); 3983 } 3984 } 3985 3986 static void 3987 zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) 3988 { 3989 zcb->zcb_spa = spa; 3990 3991 if (dump_opt['L']) 3992 return; 3993 3994 dsl_pool_t *dp = spa->spa_dsl_pool; 3995 vdev_t *rvd = spa->spa_root_vdev; 3996 3997 /* 3998 * We are going to be changing the meaning of the metaslab's 3999 * ms_allocatable. Ensure that the allocator doesn't try to 4000 * use the tree. 4001 */ 4002 spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; 4003 spa->spa_log_class->mc_ops = &zdb_metaslab_ops; 4004 4005 zcb->zcb_vd_obsolete_counts = 4006 umem_zalloc(rvd->vdev_children * sizeof (uint32_t *), 4007 UMEM_NOFAIL); 4008 4009 /* 4010 * For leak detection, we overload the ms_allocatable trees 4011 * to contain allocated segments instead of free segments. 4012 * As a result, we can't use the normal metaslab_load/unload 4013 * interfaces. 4014 */ 4015 zdb_leak_init_prepare_indirect_vdevs(spa, zcb); 4016 load_concrete_ms_allocatable_trees(spa, SM_ALLOC); 4017 4018 /* 4019 * On load_concrete_ms_allocatable_trees() we loaded all the 4020 * allocated entries from the ms_sm to the ms_allocatable for 4021 * each metaslab. If the pool has a checkpoint or is in the 4022 * middle of discarding a checkpoint, some of these blocks 4023 * may have been freed but their ms_sm may not have been 4024 * updated because they are referenced by the checkpoint. 
In 4025 * order to avoid false-positives during leak-detection, we 4026 * go through the vdev's checkpoint space map and exclude all 4027 * its entries from their relevant ms_allocatable. 4028 * 4029 * We also aggregate the space held by the checkpoint and add 4030 * it to zcb_checkpoint_size. 4031 * 4032 * Note that at this point we are also verifying that all the 4033 * entries on the checkpoint_sm are marked as allocated in 4034 * the ms_sm of their relevant metaslab. 4035 * [see comment in checkpoint_sm_exclude_entry_cb()] 4036 */ 4037 zdb_leak_init_exclude_checkpoint(spa, zcb); 4038 ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa)); 4039 4040 /* for cleaner progress output */ 4041 (void) fprintf(stderr, "\n"); 4042 4043 if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { 4044 ASSERT(spa_feature_is_enabled(spa, 4045 SPA_FEATURE_DEVICE_REMOVAL)); 4046 (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj, 4047 increment_indirect_mapping_cb, zcb, NULL); 4048 } 4049 4050 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4051 zdb_ddt_leak_init(spa, zcb); 4052 spa_config_exit(spa, SCL_CONFIG, FTAG); 4053 } 4054 4055 static boolean_t 4056 zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb) 4057 { 4058 boolean_t leaks = B_FALSE; 4059 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 4060 uint64_t total_leaked = 0; 4061 4062 ASSERT(vim != NULL); 4063 4064 for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) { 4065 vdev_indirect_mapping_entry_phys_t *vimep = 4066 &vim->vim_entries[i]; 4067 uint64_t obsolete_bytes = 0; 4068 uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); 4069 metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 4070 4071 /* 4072 * This is not very efficient but it's easy to 4073 * verify correctness. 
4074 */ 4075 for (uint64_t inner_offset = 0; 4076 inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst); 4077 inner_offset += 1 << vd->vdev_ashift) { 4078 if (range_tree_contains(msp->ms_allocatable, 4079 offset + inner_offset, 1 << vd->vdev_ashift)) { 4080 obsolete_bytes += 1 << vd->vdev_ashift; 4081 } 4082 } 4083 4084 int64_t bytes_leaked = obsolete_bytes - 4085 zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]; 4086 ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=, 4087 zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]); 4088 if (bytes_leaked != 0 && 4089 (vdev_obsolete_counts_are_precise(vd) || 4090 dump_opt['d'] >= 5)) { 4091 (void) printf("obsolete indirect mapping count " 4092 "mismatch on %llu:%llx:%llx : %llx bytes leaked\n", 4093 (u_longlong_t)vd->vdev_id, 4094 (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep), 4095 (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), 4096 (u_longlong_t)bytes_leaked); 4097 } 4098 total_leaked += ABS(bytes_leaked); 4099 } 4100 4101 if (!vdev_obsolete_counts_are_precise(vd) && total_leaked > 0) { 4102 int pct_leaked = total_leaked * 100 / 4103 vdev_indirect_mapping_bytes_mapped(vim); 4104 (void) printf("cannot verify obsolete indirect mapping " 4105 "counts of vdev %llu because precise feature was not " 4106 "enabled when it was removed: %d%% (%llx bytes) of mapping" 4107 "unreferenced\n", 4108 (u_longlong_t)vd->vdev_id, pct_leaked, 4109 (u_longlong_t)total_leaked); 4110 } else if (total_leaked > 0) { 4111 (void) printf("obsolete indirect mapping count mismatch " 4112 "for vdev %llu -- %llx total bytes mismatched\n", 4113 (u_longlong_t)vd->vdev_id, 4114 (u_longlong_t)total_leaked); 4115 leaks |= B_TRUE; 4116 } 4117 4118 vdev_indirect_mapping_free_obsolete_counts(vim, 4119 zcb->zcb_vd_obsolete_counts[vd->vdev_id]); 4120 zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL; 4121 4122 return (leaks); 4123 } 4124 4125 static boolean_t 4126 zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb) 4127 { 4128 if (dump_opt['L']) 4129 return (B_FALSE); 4130 4131 boolean_t leaks = 
B_FALSE; 4132 4133 vdev_t *rvd = spa->spa_root_vdev; 4134 for (unsigned c = 0; c < rvd->vdev_children; c++) { 4135 vdev_t *vd = rvd->vdev_child[c]; 4136 #if DEBUG 4137 metaslab_group_t *mg = vd->vdev_mg; 4138 #endif 4139 4140 if (zcb->zcb_vd_obsolete_counts[c] != NULL) { 4141 leaks |= zdb_check_for_obsolete_leaks(vd, zcb); 4142 } 4143 4144 for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { 4145 metaslab_t *msp = vd->vdev_ms[m]; 4146 ASSERT3P(mg, ==, msp->ms_group); 4147 4148 /* 4149 * ms_allocatable has been overloaded 4150 * to contain allocated segments. Now that 4151 * we finished traversing all blocks, any 4152 * block that remains in the ms_allocatable 4153 * represents an allocated block that we 4154 * did not claim during the traversal. 4155 * Claimed blocks would have been removed 4156 * from the ms_allocatable. For indirect 4157 * vdevs, space remaining in the tree 4158 * represents parts of the mapping that are 4159 * not referenced, which is not a bug. 4160 */ 4161 if (vd->vdev_ops == &vdev_indirect_ops) { 4162 range_tree_vacate(msp->ms_allocatable, 4163 NULL, NULL); 4164 } else { 4165 range_tree_vacate(msp->ms_allocatable, 4166 zdb_leak, vd); 4167 } 4168 if (msp->ms_loaded) { 4169 msp->ms_loaded = B_FALSE; 4170 } 4171 } 4172 4173 } 4174 4175 umem_free(zcb->zcb_vd_obsolete_counts, 4176 rvd->vdev_children * sizeof (uint32_t *)); 4177 zcb->zcb_vd_obsolete_counts = NULL; 4178 4179 return (leaks); 4180 } 4181 4182 /* ARGSUSED */ 4183 static int 4184 count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 4185 { 4186 zdb_cb_t *zcb = arg; 4187 4188 if (dump_opt['b'] >= 5) { 4189 char blkbuf[BP_SPRINTF_LEN]; 4190 snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); 4191 (void) printf("[%s] %s\n", 4192 "deferred free", blkbuf); 4193 } 4194 zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED); 4195 return (0); 4196 } 4197 4198 static int 4199 dump_block_stats(spa_t *spa) 4200 { 4201 zdb_cb_t zcb; 4202 zdb_blkstats_t *zb, *tzb; 4203 uint64_t norm_alloc, norm_space, 
total_alloc, total_found; 4204 int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | 4205 TRAVERSE_NO_DECRYPT | TRAVERSE_HARD; 4206 boolean_t leaks = B_FALSE; 4207 int err; 4208 4209 bzero(&zcb, sizeof (zcb)); 4210 (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n", 4211 (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", 4212 (dump_opt['c'] == 1) ? "metadata " : "", 4213 dump_opt['c'] ? "checksums " : "", 4214 (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "", 4215 !dump_opt['L'] ? "nothing leaked " : ""); 4216 4217 /* 4218 * When leak detection is enabled we load all space maps as SM_ALLOC 4219 * maps, then traverse the pool claiming each block we discover. If 4220 * the pool is perfectly consistent, the segment trees will be empty 4221 * when we're done. Anything left over is a leak; any block we can't 4222 * claim (because it's not part of any space map) is a double 4223 * allocation, reference to a freed block, or an unclaimed log block. 4224 * 4225 * When leak detection is disabled (-L option) we still traverse the 4226 * pool claiming each block we discover, but we skip opening any space 4227 * maps. 4228 */ 4229 bzero(&zcb, sizeof (zdb_cb_t)); 4230 zdb_leak_init(spa, &zcb); 4231 4232 /* 4233 * If there's a deferred-free bplist, process that first. 
4234 */ 4235 (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj, 4236 count_block_cb, &zcb, NULL); 4237 4238 if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { 4239 (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, 4240 count_block_cb, &zcb, NULL); 4241 } 4242 4243 zdb_claim_removing(spa, &zcb); 4244 4245 if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { 4246 VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset, 4247 spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb, 4248 &zcb, NULL)); 4249 } 4250 4251 if (dump_opt['c'] > 1) 4252 flags |= TRAVERSE_PREFETCH_DATA; 4253 4254 zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa)); 4255 zcb.zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa)); 4256 zcb.zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa)); 4257 zcb.zcb_start = zcb.zcb_lastprint = gethrtime(); 4258 err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb); 4259 4260 /* 4261 * If we've traversed the data blocks then we need to wait for those 4262 * I/Os to complete. We leverage "The Godfather" zio to wait on 4263 * all async I/Os to complete. 4264 */ 4265 if (dump_opt['c']) { 4266 for (int i = 0; i < max_ncpus; i++) { 4267 (void) zio_wait(spa->spa_async_zio_root[i]); 4268 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 4269 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 4270 ZIO_FLAG_GODFATHER); 4271 } 4272 } 4273 4274 /* 4275 * Done after zio_wait() since zcb_haderrors is modified in 4276 * zdb_blkptr_done() 4277 */ 4278 zcb.zcb_haderrors |= err; 4279 4280 if (zcb.zcb_haderrors) { 4281 (void) printf("\nError counts:\n\n"); 4282 (void) printf("\t%5s %s\n", "errno", "count"); 4283 for (int e = 0; e < 256; e++) { 4284 if (zcb.zcb_errors[e] != 0) { 4285 (void) printf("\t%5d %llu\n", 4286 e, (u_longlong_t)zcb.zcb_errors[e]); 4287 } 4288 } 4289 } 4290 4291 /* 4292 * Report any leaked segments. 
4293 */ 4294 leaks |= zdb_leak_fini(spa, &zcb); 4295 4296 tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL]; 4297 4298 norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 4299 norm_space = metaslab_class_get_space(spa_normal_class(spa)); 4300 4301 total_alloc = norm_alloc + 4302 metaslab_class_get_alloc(spa_log_class(spa)) + 4303 metaslab_class_get_alloc(spa_special_class(spa)) + 4304 metaslab_class_get_alloc(spa_dedup_class(spa)) + 4305 get_unflushed_alloc_space(spa); 4306 total_found = tzb->zb_asize - zcb.zcb_dedup_asize + 4307 zcb.zcb_removing_size + zcb.zcb_checkpoint_size; 4308 4309 if (total_found == total_alloc && !dump_opt['L']) { 4310 (void) printf("\n\tNo leaks (block sum matches space" 4311 " maps exactly)\n"); 4312 } else if (!dump_opt['L']) { 4313 (void) printf("block traversal size %llu != alloc %llu " 4314 "(%s %lld)\n", 4315 (u_longlong_t)total_found, 4316 (u_longlong_t)total_alloc, 4317 (dump_opt['L']) ? "unreachable" : "leaked", 4318 (longlong_t)(total_alloc - total_found)); 4319 leaks = B_TRUE; 4320 } 4321 4322 if (tzb->zb_count == 0) 4323 return (2); 4324 4325 (void) printf("\n"); 4326 (void) printf("\t%-16s %14llu\n", "bp count:", 4327 (u_longlong_t)tzb->zb_count); 4328 (void) printf("\t%-16s %14llu\n", "ganged count:", 4329 (longlong_t)tzb->zb_gangs); 4330 (void) printf("\t%-16s %14llu avg: %6llu\n", "bp logical:", 4331 (u_longlong_t)tzb->zb_lsize, 4332 (u_longlong_t)(tzb->zb_lsize / tzb->zb_count)); 4333 (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", 4334 "bp physical:", (u_longlong_t)tzb->zb_psize, 4335 (u_longlong_t)(tzb->zb_psize / tzb->zb_count), 4336 (double)tzb->zb_lsize / tzb->zb_psize); 4337 (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", 4338 "bp allocated:", (u_longlong_t)tzb->zb_asize, 4339 (u_longlong_t)(tzb->zb_asize / tzb->zb_count), 4340 (double)tzb->zb_lsize / tzb->zb_asize); 4341 (void) printf("\t%-16s %14llu ref>1: %6llu deduplication: %6.2f\n", 4342 "bp deduped:", 
(u_longlong_t)zcb.zcb_dedup_asize, 4343 (u_longlong_t)zcb.zcb_dedup_blocks, 4344 (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0); 4345 (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:", 4346 (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); 4347 4348 if (spa_special_class(spa)->mc_rotor != NULL) { 4349 uint64_t alloc = metaslab_class_get_alloc( 4350 spa_special_class(spa)); 4351 uint64_t space = metaslab_class_get_space( 4352 spa_special_class(spa)); 4353 4354 (void) printf("\t%-16s %14llu used: %5.2f%%\n", 4355 "Special class", (u_longlong_t)alloc, 4356 100.0 * alloc / space); 4357 } 4358 4359 if (spa_dedup_class(spa)->mc_rotor != NULL) { 4360 uint64_t alloc = metaslab_class_get_alloc( 4361 spa_dedup_class(spa)); 4362 uint64_t space = metaslab_class_get_space( 4363 spa_dedup_class(spa)); 4364 4365 (void) printf("\t%-16s %14llu used: %5.2f%%\n", 4366 "Dedup class", (u_longlong_t)alloc, 4367 100.0 * alloc / space); 4368 } 4369 4370 for (bp_embedded_type_t i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) { 4371 if (zcb.zcb_embedded_blocks[i] == 0) 4372 continue; 4373 (void) printf("\n"); 4374 (void) printf("\tadditional, non-pointer bps of type %u: " 4375 "%10llu\n", 4376 i, (u_longlong_t)zcb.zcb_embedded_blocks[i]); 4377 4378 if (dump_opt['b'] >= 3) { 4379 (void) printf("\t number of (compressed) bytes: " 4380 "number of bps\n"); 4381 dump_histogram(zcb.zcb_embedded_histogram[i], 4382 sizeof (zcb.zcb_embedded_histogram[i]) / 4383 sizeof (zcb.zcb_embedded_histogram[i][0]), 0); 4384 } 4385 } 4386 4387 if (tzb->zb_ditto_samevdev != 0) { 4388 (void) printf("\tDittoed blocks on same vdev: %llu\n", 4389 (longlong_t)tzb->zb_ditto_samevdev); 4390 } 4391 if (tzb->zb_ditto_same_ms != 0) { 4392 (void) printf("\tDittoed blocks in same metaslab: %llu\n", 4393 (longlong_t)tzb->zb_ditto_same_ms); 4394 } 4395 4396 for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) { 4397 vdev_t *vd = spa->spa_root_vdev->vdev_child[v]; 4398 vdev_indirect_mapping_t *vim 
= vd->vdev_indirect_mapping; 4399 4400 if (vim == NULL) { 4401 continue; 4402 } 4403 4404 char mem[32]; 4405 zdb_nicenum(vdev_indirect_mapping_num_entries(vim), 4406 mem, vdev_indirect_mapping_size(vim)); 4407 4408 (void) printf("\tindirect vdev id %llu has %llu segments " 4409 "(%s in memory)\n", 4410 (longlong_t)vd->vdev_id, 4411 (longlong_t)vdev_indirect_mapping_num_entries(vim), mem); 4412 } 4413 4414 if (dump_opt['b'] >= 2) { 4415 int l, t, level; 4416 (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE" 4417 "\t avg\t comp\t%%Total\tType\n"); 4418 4419 for (t = 0; t <= ZDB_OT_TOTAL; t++) { 4420 char csize[32], lsize[32], psize[32], asize[32]; 4421 char avg[32], gang[32]; 4422 const char *typename; 4423 4424 /* make sure nicenum has enough space */ 4425 CTASSERT(sizeof (csize) >= NN_NUMBUF_SZ); 4426 CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ); 4427 CTASSERT(sizeof (psize) >= NN_NUMBUF_SZ); 4428 CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ); 4429 CTASSERT(sizeof (avg) >= NN_NUMBUF_SZ); 4430 CTASSERT(sizeof (gang) >= NN_NUMBUF_SZ); 4431 4432 if (t < DMU_OT_NUMTYPES) 4433 typename = dmu_ot[t].ot_name; 4434 else 4435 typename = zdb_ot_extname[t - DMU_OT_NUMTYPES]; 4436 4437 if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) { 4438 (void) printf("%6s\t%5s\t%5s\t%5s" 4439 "\t%5s\t%5s\t%6s\t%s\n", 4440 "-", 4441 "-", 4442 "-", 4443 "-", 4444 "-", 4445 "-", 4446 "-", 4447 typename); 4448 continue; 4449 } 4450 4451 for (l = ZB_TOTAL - 1; l >= -1; l--) { 4452 level = (l == -1 ? 
ZB_TOTAL : l); 4453 zb = &zcb.zcb_type[level][t]; 4454 4455 if (zb->zb_asize == 0) 4456 continue; 4457 4458 if (dump_opt['b'] < 3 && level != ZB_TOTAL) 4459 continue; 4460 4461 if (level == 0 && zb->zb_asize == 4462 zcb.zcb_type[ZB_TOTAL][t].zb_asize) 4463 continue; 4464 4465 zdb_nicenum(zb->zb_count, csize, 4466 sizeof (csize)); 4467 zdb_nicenum(zb->zb_lsize, lsize, 4468 sizeof (lsize)); 4469 zdb_nicenum(zb->zb_psize, psize, 4470 sizeof (psize)); 4471 zdb_nicenum(zb->zb_asize, asize, 4472 sizeof (asize)); 4473 zdb_nicenum(zb->zb_asize / zb->zb_count, avg, 4474 sizeof (avg)); 4475 zdb_nicenum(zb->zb_gangs, gang, sizeof (gang)); 4476 4477 (void) printf("%6s\t%5s\t%5s\t%5s\t%5s" 4478 "\t%5.2f\t%6.2f\t", 4479 csize, lsize, psize, asize, avg, 4480 (double)zb->zb_lsize / zb->zb_psize, 4481 100.0 * zb->zb_asize / tzb->zb_asize); 4482 4483 if (level == ZB_TOTAL) 4484 (void) printf("%s\n", typename); 4485 else 4486 (void) printf(" L%d %s\n", 4487 level, typename); 4488 4489 if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) { 4490 (void) printf("\t number of ganged " 4491 "blocks: %s\n", gang); 4492 } 4493 4494 if (dump_opt['b'] >= 4) { 4495 (void) printf("psize " 4496 "(in 512-byte sectors): " 4497 "number of blocks\n"); 4498 dump_histogram(zb->zb_psize_histogram, 4499 PSIZE_HISTO_SIZE, 0); 4500 } 4501 } 4502 } 4503 } 4504 4505 (void) printf("\n"); 4506 4507 if (leaks) 4508 return (2); 4509 4510 if (zcb.zcb_haderrors) 4511 return (3); 4512 4513 return (0); 4514 } 4515 4516 typedef struct zdb_ddt_entry { 4517 ddt_key_t zdde_key; 4518 uint64_t zdde_ref_blocks; 4519 uint64_t zdde_ref_lsize; 4520 uint64_t zdde_ref_psize; 4521 uint64_t zdde_ref_dsize; 4522 avl_node_t zdde_node; 4523 } zdb_ddt_entry_t; 4524 4525 /* ARGSUSED */ 4526 static int 4527 zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 4528 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 4529 { 4530 avl_tree_t *t = arg; 4531 avl_index_t where; 4532 zdb_ddt_entry_t *zdde, zdde_search; 4533 4534 
if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) 4535 return (0); 4536 4537 if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { 4538 (void) printf("traversing objset %llu, %llu objects, " 4539 "%lu blocks so far\n", 4540 (u_longlong_t)zb->zb_objset, 4541 (u_longlong_t)BP_GET_FILL(bp), 4542 avl_numnodes(t)); 4543 } 4544 4545 if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF || 4546 BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) 4547 return (0); 4548 4549 ddt_key_fill(&zdde_search.zdde_key, bp); 4550 4551 zdde = avl_find(t, &zdde_search, &where); 4552 4553 if (zdde == NULL) { 4554 zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL); 4555 zdde->zdde_key = zdde_search.zdde_key; 4556 avl_insert(t, zdde, where); 4557 } 4558 4559 zdde->zdde_ref_blocks += 1; 4560 zdde->zdde_ref_lsize += BP_GET_LSIZE(bp); 4561 zdde->zdde_ref_psize += BP_GET_PSIZE(bp); 4562 zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp); 4563 4564 return (0); 4565 } 4566 4567 static void 4568 dump_simulated_ddt(spa_t *spa) 4569 { 4570 avl_tree_t t; 4571 void *cookie = NULL; 4572 zdb_ddt_entry_t *zdde; 4573 ddt_histogram_t ddh_total; 4574 ddt_stat_t dds_total; 4575 4576 bzero(&ddh_total, sizeof (ddh_total)); 4577 bzero(&dds_total, sizeof (dds_total)); 4578 avl_create(&t, ddt_entry_compare, 4579 sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node)); 4580 4581 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4582 4583 (void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | 4584 TRAVERSE_NO_DECRYPT, zdb_ddt_add_cb, &t); 4585 4586 spa_config_exit(spa, SCL_CONFIG, FTAG); 4587 4588 while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) { 4589 ddt_stat_t dds; 4590 uint64_t refcnt = zdde->zdde_ref_blocks; 4591 ASSERT(refcnt != 0); 4592 4593 dds.dds_blocks = zdde->zdde_ref_blocks / refcnt; 4594 dds.dds_lsize = zdde->zdde_ref_lsize / refcnt; 4595 dds.dds_psize = zdde->zdde_ref_psize / refcnt; 4596 dds.dds_dsize = zdde->zdde_ref_dsize / refcnt; 4597 
4598 dds.dds_ref_blocks = zdde->zdde_ref_blocks; 4599 dds.dds_ref_lsize = zdde->zdde_ref_lsize; 4600 dds.dds_ref_psize = zdde->zdde_ref_psize; 4601 dds.dds_ref_dsize = zdde->zdde_ref_dsize; 4602 4603 ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1], 4604 &dds, 0); 4605 4606 umem_free(zdde, sizeof (*zdde)); 4607 } 4608 4609 avl_destroy(&t); 4610 4611 ddt_histogram_stat(&dds_total, &ddh_total); 4612 4613 (void) printf("Simulated DDT histogram:\n"); 4614 4615 zpool_dump_ddt(&dds_total, &ddh_total); 4616 4617 dump_dedup_ratio(&dds_total); 4618 } 4619 4620 static int 4621 verify_device_removal_feature_counts(spa_t *spa) 4622 { 4623 uint64_t dr_feature_refcount = 0; 4624 uint64_t oc_feature_refcount = 0; 4625 uint64_t indirect_vdev_count = 0; 4626 uint64_t precise_vdev_count = 0; 4627 uint64_t obsolete_counts_object_count = 0; 4628 uint64_t obsolete_sm_count = 0; 4629 uint64_t obsolete_counts_count = 0; 4630 uint64_t scip_count = 0; 4631 uint64_t obsolete_bpobj_count = 0; 4632 int ret = 0; 4633 4634 spa_condensing_indirect_phys_t *scip = 4635 &spa->spa_condensing_indirect_phys; 4636 if (scip->scip_next_mapping_object != 0) { 4637 vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev]; 4638 ASSERT(scip->scip_prev_obsolete_sm_object != 0); 4639 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); 4640 4641 (void) printf("Condensing indirect vdev %llu: new mapping " 4642 "object %llu, prev obsolete sm %llu\n", 4643 (u_longlong_t)scip->scip_vdev, 4644 (u_longlong_t)scip->scip_next_mapping_object, 4645 (u_longlong_t)scip->scip_prev_obsolete_sm_object); 4646 if (scip->scip_prev_obsolete_sm_object != 0) { 4647 space_map_t *prev_obsolete_sm = NULL; 4648 VERIFY0(space_map_open(&prev_obsolete_sm, 4649 spa->spa_meta_objset, 4650 scip->scip_prev_obsolete_sm_object, 4651 0, vd->vdev_asize, 0)); 4652 dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm); 4653 (void) printf("\n"); 4654 space_map_close(prev_obsolete_sm); 4655 } 4656 4657 scip_count += 2; 4658 } 4659 4660 for 
(uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { 4661 vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; 4662 vdev_indirect_config_t *vic = &vd->vdev_indirect_config; 4663 4664 if (vic->vic_mapping_object != 0) { 4665 ASSERT(vd->vdev_ops == &vdev_indirect_ops || 4666 vd->vdev_removing); 4667 indirect_vdev_count++; 4668 4669 if (vd->vdev_indirect_mapping->vim_havecounts) { 4670 obsolete_counts_count++; 4671 } 4672 } 4673 if (vdev_obsolete_counts_are_precise(vd)) { 4674 ASSERT(vic->vic_mapping_object != 0); 4675 precise_vdev_count++; 4676 } 4677 if (vdev_obsolete_sm_object(vd) != 0) { 4678 ASSERT(vic->vic_mapping_object != 0); 4679 obsolete_sm_count++; 4680 } 4681 } 4682 4683 (void) feature_get_refcount(spa, 4684 &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL], 4685 &dr_feature_refcount); 4686 (void) feature_get_refcount(spa, 4687 &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS], 4688 &oc_feature_refcount); 4689 4690 if (dr_feature_refcount != indirect_vdev_count) { 4691 ret = 1; 4692 (void) printf("Number of indirect vdevs (%llu) " \ 4693 "does not match feature count (%llu)\n", 4694 (u_longlong_t)indirect_vdev_count, 4695 (u_longlong_t)dr_feature_refcount); 4696 } else { 4697 (void) printf("Verified device_removal feature refcount " \ 4698 "of %llu is correct\n", 4699 (u_longlong_t)dr_feature_refcount); 4700 } 4701 4702 if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, 4703 DMU_POOL_OBSOLETE_BPOBJ) == 0) { 4704 obsolete_bpobj_count++; 4705 } 4706 4707 4708 obsolete_counts_object_count = precise_vdev_count; 4709 obsolete_counts_object_count += obsolete_sm_count; 4710 obsolete_counts_object_count += obsolete_counts_count; 4711 obsolete_counts_object_count += scip_count; 4712 obsolete_counts_object_count += obsolete_bpobj_count; 4713 obsolete_counts_object_count += remap_deadlist_count; 4714 4715 if (oc_feature_refcount != obsolete_counts_object_count) { 4716 ret = 1; 4717 (void) printf("Number of obsolete counts objects (%llu) " \ 4718 "does 
not match feature count (%llu)\n", 4719 (u_longlong_t)obsolete_counts_object_count, 4720 (u_longlong_t)oc_feature_refcount); 4721 (void) printf("pv:%llu os:%llu oc:%llu sc:%llu " 4722 "ob:%llu rd:%llu\n", 4723 (u_longlong_t)precise_vdev_count, 4724 (u_longlong_t)obsolete_sm_count, 4725 (u_longlong_t)obsolete_counts_count, 4726 (u_longlong_t)scip_count, 4727 (u_longlong_t)obsolete_bpobj_count, 4728 (u_longlong_t)remap_deadlist_count); 4729 } else { 4730 (void) printf("Verified indirect_refcount feature refcount " \ 4731 "of %llu is correct\n", 4732 (u_longlong_t)oc_feature_refcount); 4733 } 4734 return (ret); 4735 } 4736 4737 static void 4738 zdb_set_skip_mmp(char *target) 4739 { 4740 spa_t *spa; 4741 4742 /* 4743 * Disable the activity check to allow examination of 4744 * active pools. 4745 */ 4746 mutex_enter(&spa_namespace_lock); 4747 if ((spa = spa_lookup(target)) != NULL) { 4748 spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP; 4749 } 4750 mutex_exit(&spa_namespace_lock); 4751 } 4752 4753 #define BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE" 4754 /* 4755 * Import the checkpointed state of the pool specified by the target 4756 * parameter as readonly. The function also accepts a pool config 4757 * as an optional parameter, else it attempts to infer the config by 4758 * the name of the target pool. 4759 * 4760 * Note that the checkpointed state's pool name will be the name of 4761 * the original pool with the above suffix appened to it. In addition, 4762 * if the target is not a pool name (e.g. a path to a dataset) then 4763 * the new_path parameter is populated with the updated path to 4764 * reflect the fact that we are looking into the checkpointed state. 4765 * 4766 * The function returns a newly-allocated copy of the name of the 4767 * pool containing the checkpointed state. When this copy is no 4768 * longer needed it should be freed with free(3C). Same thing 4769 * applies to the new_path parameter if allocated. 
4770 */ 4771 static char * 4772 import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path) 4773 { 4774 int error = 0; 4775 char *poolname, *bogus_name; 4776 4777 /* If the target is not a pool, the extract the pool name */ 4778 char *path_start = strchr(target, '/'); 4779 if (path_start != NULL) { 4780 size_t poolname_len = path_start - target; 4781 poolname = strndup(target, poolname_len); 4782 } else { 4783 poolname = target; 4784 } 4785 4786 if (cfg == NULL) { 4787 zdb_set_skip_mmp(poolname); 4788 error = spa_get_stats(poolname, &cfg, NULL, 0); 4789 if (error != 0) { 4790 fatal("Tried to read config of pool \"%s\" but " 4791 "spa_get_stats() failed with error %d\n", 4792 poolname, error); 4793 } 4794 } 4795 4796 (void) asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX); 4797 fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name); 4798 4799 error = spa_import(bogus_name, cfg, NULL, 4800 ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT | 4801 ZFS_IMPORT_SKIP_MMP); 4802 if (error != 0) { 4803 fatal("Tried to import pool \"%s\" but spa_import() failed " 4804 "with error %d\n", bogus_name, error); 4805 } 4806 4807 if (new_path != NULL && path_start != NULL) 4808 (void) asprintf(new_path, "%s%s", bogus_name, path_start); 4809 4810 if (target != poolname) 4811 free(poolname); 4812 4813 return (bogus_name); 4814 } 4815 4816 typedef struct verify_checkpoint_sm_entry_cb_arg { 4817 vdev_t *vcsec_vd; 4818 4819 /* the following fields are only used for printing progress */ 4820 uint64_t vcsec_entryid; 4821 uint64_t vcsec_num_entries; 4822 } verify_checkpoint_sm_entry_cb_arg_t; 4823 4824 #define ENTRIES_PER_PROGRESS_UPDATE 10000 4825 4826 static int 4827 verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg) 4828 { 4829 verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg; 4830 vdev_t *vd = vcsec->vcsec_vd; 4831 metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; 4832 uint64_t end = sme->sme_offset + sme->sme_run; 4833 4834 
ASSERT(sme->sme_type == SM_FREE); 4835 4836 if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) { 4837 (void) fprintf(stderr, 4838 "\rverifying vdev %llu, space map entry %llu of %llu ...", 4839 (longlong_t)vd->vdev_id, 4840 (longlong_t)vcsec->vcsec_entryid, 4841 (longlong_t)vcsec->vcsec_num_entries); 4842 } 4843 vcsec->vcsec_entryid++; 4844 4845 /* 4846 * See comment in checkpoint_sm_exclude_entry_cb() 4847 */ 4848 VERIFY3U(sme->sme_offset, >=, ms->ms_start); 4849 VERIFY3U(end, <=, ms->ms_start + ms->ms_size); 4850 4851 /* 4852 * The entries in the vdev_checkpoint_sm should be marked as 4853 * allocated in the checkpointed state of the pool, therefore 4854 * their respective ms_allocateable trees should not contain them. 4855 */ 4856 mutex_enter(&ms->ms_lock); 4857 range_tree_verify_not_present(ms->ms_allocatable, 4858 sme->sme_offset, sme->sme_run); 4859 mutex_exit(&ms->ms_lock); 4860 4861 return (0); 4862 } 4863 4864 /* 4865 * Verify that all segments in the vdev_checkpoint_sm are allocated 4866 * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's 4867 * ms_allocatable). 4868 * 4869 * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of 4870 * each vdev in the current state of the pool to the metaslab space maps 4871 * (ms_sm) of the checkpointed state of the pool. 4872 * 4873 * Note that the function changes the state of the ms_allocatable 4874 * trees of the current spa_t. The entries of these ms_allocatable 4875 * trees are cleared out and then repopulated from with the free 4876 * entries of their respective ms_sm space maps. 
4877 */ 4878 static void 4879 verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current) 4880 { 4881 vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev; 4882 vdev_t *current_rvd = current->spa_root_vdev; 4883 4884 load_concrete_ms_allocatable_trees(checkpoint, SM_FREE); 4885 4886 for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) { 4887 vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c]; 4888 vdev_t *current_vd = current_rvd->vdev_child[c]; 4889 4890 space_map_t *checkpoint_sm = NULL; 4891 uint64_t checkpoint_sm_obj; 4892 4893 if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) { 4894 /* 4895 * Since we don't allow device removal in a pool 4896 * that has a checkpoint, we expect that all removed 4897 * vdevs were removed from the pool before the 4898 * checkpoint. 4899 */ 4900 ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops); 4901 continue; 4902 } 4903 4904 /* 4905 * If the checkpoint space map doesn't exist, then nothing 4906 * here is checkpointed so there's nothing to verify. 
4907 */ 4908 if (current_vd->vdev_top_zap == 0 || 4909 zap_contains(spa_meta_objset(current), 4910 current_vd->vdev_top_zap, 4911 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) 4912 continue; 4913 4914 VERIFY0(zap_lookup(spa_meta_objset(current), 4915 current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, 4916 sizeof (uint64_t), 1, &checkpoint_sm_obj)); 4917 4918 VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current), 4919 checkpoint_sm_obj, 0, current_vd->vdev_asize, 4920 current_vd->vdev_ashift)); 4921 4922 verify_checkpoint_sm_entry_cb_arg_t vcsec; 4923 vcsec.vcsec_vd = ckpoint_vd; 4924 vcsec.vcsec_entryid = 0; 4925 vcsec.vcsec_num_entries = 4926 space_map_length(checkpoint_sm) / sizeof (uint64_t); 4927 VERIFY0(space_map_iterate(checkpoint_sm, 4928 space_map_length(checkpoint_sm), 4929 verify_checkpoint_sm_entry_cb, &vcsec)); 4930 dump_spacemap(current->spa_meta_objset, checkpoint_sm); 4931 space_map_close(checkpoint_sm); 4932 } 4933 4934 /* 4935 * If we've added vdevs since we took the checkpoint, ensure 4936 * that their checkpoint space maps are empty. 4937 */ 4938 if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) { 4939 for (uint64_t c = ckpoint_rvd->vdev_children; 4940 c < current_rvd->vdev_children; c++) { 4941 vdev_t *current_vd = current_rvd->vdev_child[c]; 4942 ASSERT3P(current_vd->vdev_checkpoint_sm, ==, NULL); 4943 } 4944 } 4945 4946 /* for cleaner progress output */ 4947 (void) fprintf(stderr, "\n"); 4948 } 4949 4950 /* 4951 * Verifies that all space that's allocated in the checkpoint is 4952 * still allocated in the current version, by checking that everything 4953 * in checkpoint's ms_allocatable (which is actually allocated, not 4954 * allocatable/free) is not present in current's ms_allocatable. 4955 * 4956 * Note that the function changes the state of the ms_allocatable 4957 * trees of both spas when called. 
The entries of all ms_allocatable 4958 * trees are cleared out and then repopulated from their respective 4959 * ms_sm space maps. In the checkpointed state we load the allocated 4960 * entries, and in the current state we load the free entries. 4961 */ 4962 static void 4963 verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current) 4964 { 4965 vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev; 4966 vdev_t *current_rvd = current->spa_root_vdev; 4967 4968 load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC); 4969 load_concrete_ms_allocatable_trees(current, SM_FREE); 4970 4971 for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) { 4972 vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i]; 4973 vdev_t *current_vd = current_rvd->vdev_child[i]; 4974 4975 if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) { 4976 /* 4977 * See comment in verify_checkpoint_vdev_spacemaps() 4978 */ 4979 ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops); 4980 continue; 4981 } 4982 4983 for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) { 4984 metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m]; 4985 metaslab_t *current_msp = current_vd->vdev_ms[m]; 4986 4987 (void) fprintf(stderr, 4988 "\rverifying vdev %llu of %llu, " 4989 "metaslab %llu of %llu ...", 4990 (longlong_t)current_vd->vdev_id, 4991 (longlong_t)current_rvd->vdev_children, 4992 (longlong_t)current_vd->vdev_ms[m]->ms_id, 4993 (longlong_t)current_vd->vdev_ms_count); 4994 4995 /* 4996 * We walk through the ms_allocatable trees that 4997 * are loaded with the allocated blocks from the 4998 * ms_sm spacemaps of the checkpoint. For each 4999 * one of these ranges we ensure that none of them 5000 * exists in the ms_allocatable trees of the 5001 * current state which are loaded with the ranges 5002 * that are currently free. 5003 * 5004 * This way we ensure that none of the blocks that 5005 * are part of the checkpoint were freed by mistake. 
5006 */ 5007 range_tree_walk(ckpoint_msp->ms_allocatable, 5008 (range_tree_func_t *)range_tree_verify_not_present, 5009 current_msp->ms_allocatable); 5010 } 5011 } 5012 5013 /* for cleaner progress output */ 5014 (void) fprintf(stderr, "\n"); 5015 } 5016 5017 static void 5018 verify_checkpoint_blocks(spa_t *spa) 5019 { 5020 ASSERT(!dump_opt['L']); 5021 5022 spa_t *checkpoint_spa; 5023 char *checkpoint_pool; 5024 nvlist_t *config = NULL; 5025 int error = 0; 5026 5027 /* 5028 * We import the checkpointed state of the pool (under a different 5029 * name) so we can do verification on it against the current state 5030 * of the pool. 5031 */ 5032 checkpoint_pool = import_checkpointed_state(spa->spa_name, config, 5033 NULL); 5034 ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0); 5035 5036 error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG); 5037 if (error != 0) { 5038 fatal("Tried to open pool \"%s\" but spa_open() failed with " 5039 "error %d\n", checkpoint_pool, error); 5040 } 5041 5042 /* 5043 * Ensure that ranges in the checkpoint space maps of each vdev 5044 * are allocated according to the checkpointed state's metaslab 5045 * space maps. 5046 */ 5047 verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa); 5048 5049 /* 5050 * Ensure that allocated ranges in the checkpoint's metaslab 5051 * space maps remain allocated in the metaslab space maps of 5052 * the current state. 5053 */ 5054 verify_checkpoint_ms_spacemaps(checkpoint_spa, spa); 5055 5056 /* 5057 * Once we are done, we get rid of the checkpointed state. 
5058 */ 5059 spa_close(checkpoint_spa, FTAG); 5060 free(checkpoint_pool); 5061 } 5062 5063 static void 5064 dump_leftover_checkpoint_blocks(spa_t *spa) 5065 { 5066 vdev_t *rvd = spa->spa_root_vdev; 5067 5068 for (uint64_t i = 0; i < rvd->vdev_children; i++) { 5069 vdev_t *vd = rvd->vdev_child[i]; 5070 5071 space_map_t *checkpoint_sm = NULL; 5072 uint64_t checkpoint_sm_obj; 5073 5074 if (vd->vdev_top_zap == 0) 5075 continue; 5076 5077 if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap, 5078 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) 5079 continue; 5080 5081 VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap, 5082 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, 5083 sizeof (uint64_t), 1, &checkpoint_sm_obj)); 5084 5085 VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), 5086 checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); 5087 dump_spacemap(spa->spa_meta_objset, checkpoint_sm); 5088 space_map_close(checkpoint_sm); 5089 } 5090 } 5091 5092 static int 5093 verify_checkpoint(spa_t *spa) 5094 { 5095 uberblock_t checkpoint; 5096 int error; 5097 5098 if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) 5099 return (0); 5100 5101 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 5102 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 5103 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 5104 5105 if (error == ENOENT && !dump_opt['L']) { 5106 /* 5107 * If the feature is active but the uberblock is missing 5108 * then we must be in the middle of discarding the 5109 * checkpoint. 
5110 */ 5111 (void) printf("\nPartially discarded checkpoint " 5112 "state found:\n"); 5113 dump_leftover_checkpoint_blocks(spa); 5114 return (0); 5115 } else if (error != 0) { 5116 (void) printf("lookup error %d when looking for " 5117 "checkpointed uberblock in MOS\n", error); 5118 return (error); 5119 } 5120 dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n"); 5121 5122 if (checkpoint.ub_checkpoint_txg == 0) { 5123 (void) printf("\nub_checkpoint_txg not set in checkpointed " 5124 "uberblock\n"); 5125 error = 3; 5126 } 5127 5128 if (error == 0 && !dump_opt['L']) 5129 verify_checkpoint_blocks(spa); 5130 5131 return (error); 5132 } 5133 5134 /* ARGSUSED */ 5135 static void 5136 mos_leaks_cb(void *arg, uint64_t start, uint64_t size) 5137 { 5138 for (uint64_t i = start; i < size; i++) { 5139 (void) printf("MOS object %llu referenced but not allocated\n", 5140 (u_longlong_t)i); 5141 } 5142 } 5143 5144 static range_tree_t *mos_refd_objs; 5145 5146 static void 5147 mos_obj_refd(uint64_t obj) 5148 { 5149 if (obj != 0 && mos_refd_objs != NULL) 5150 range_tree_add(mos_refd_objs, obj, 1); 5151 } 5152 5153 static void 5154 mos_leak_vdev_top_zap(vdev_t *vd) 5155 { 5156 uint64_t ms_flush_data_obj; 5157 5158 int error = zap_lookup(spa_meta_objset(vd->vdev_spa), 5159 vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, 5160 sizeof (ms_flush_data_obj), 1, &ms_flush_data_obj); 5161 if (error == ENOENT) 5162 return; 5163 ASSERT0(error); 5164 5165 mos_obj_refd(ms_flush_data_obj); 5166 } 5167 5168 static void 5169 mos_leak_vdev(vdev_t *vd) 5170 { 5171 mos_obj_refd(vd->vdev_dtl_object); 5172 mos_obj_refd(vd->vdev_ms_array); 5173 mos_obj_refd(vd->vdev_indirect_config.vic_births_object); 5174 mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object); 5175 mos_obj_refd(vd->vdev_leaf_zap); 5176 if (vd->vdev_checkpoint_sm != NULL) 5177 mos_obj_refd(vd->vdev_checkpoint_sm->sm_object); 5178 if (vd->vdev_indirect_mapping != NULL) { 5179 
mos_obj_refd(vd->vdev_indirect_mapping-> 5180 vim_phys->vimp_counts_object); 5181 } 5182 if (vd->vdev_obsolete_sm != NULL) 5183 mos_obj_refd(vd->vdev_obsolete_sm->sm_object); 5184 5185 for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { 5186 metaslab_t *ms = vd->vdev_ms[m]; 5187 mos_obj_refd(space_map_object(ms->ms_sm)); 5188 } 5189 5190 if (vd->vdev_top_zap != 0) { 5191 mos_obj_refd(vd->vdev_top_zap); 5192 mos_leak_vdev_top_zap(vd); 5193 } 5194 5195 for (uint64_t c = 0; c < vd->vdev_children; c++) { 5196 mos_leak_vdev(vd->vdev_child[c]); 5197 } 5198 } 5199 5200 static void 5201 mos_leak_log_spacemaps(spa_t *spa) 5202 { 5203 uint64_t spacemap_zap; 5204 5205 int error = zap_lookup(spa_meta_objset(spa), 5206 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP, 5207 sizeof (spacemap_zap), 1, &spacemap_zap); 5208 if (error == ENOENT) 5209 return; 5210 ASSERT0(error); 5211 5212 mos_obj_refd(spacemap_zap); 5213 for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); 5214 sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) 5215 mos_obj_refd(sls->sls_sm_obj); 5216 } 5217 5218 static int 5219 dump_mos_leaks(spa_t *spa) 5220 { 5221 int rv = 0; 5222 objset_t *mos = spa->spa_meta_objset; 5223 dsl_pool_t *dp = spa->spa_dsl_pool; 5224 5225 /* Visit and mark all referenced objects in the MOS */ 5226 5227 mos_obj_refd(DMU_POOL_DIRECTORY_OBJECT); 5228 mos_obj_refd(spa->spa_pool_props_object); 5229 mos_obj_refd(spa->spa_config_object); 5230 mos_obj_refd(spa->spa_ddt_stat_object); 5231 mos_obj_refd(spa->spa_feat_desc_obj); 5232 mos_obj_refd(spa->spa_feat_enabled_txg_obj); 5233 mos_obj_refd(spa->spa_feat_for_read_obj); 5234 mos_obj_refd(spa->spa_feat_for_write_obj); 5235 mos_obj_refd(spa->spa_history); 5236 mos_obj_refd(spa->spa_errlog_last); 5237 mos_obj_refd(spa->spa_errlog_scrub); 5238 mos_obj_refd(spa->spa_all_vdev_zaps); 5239 mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj); 5240 mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj); 5241 
mos_obj_refd(spa->spa_dsl_pool->dp_scan->scn_phys.scn_queue_obj); 5242 bpobj_count_refd(&spa->spa_deferred_bpobj); 5243 mos_obj_refd(dp->dp_empty_bpobj); 5244 bpobj_count_refd(&dp->dp_obsolete_bpobj); 5245 bpobj_count_refd(&dp->dp_free_bpobj); 5246 mos_obj_refd(spa->spa_l2cache.sav_object); 5247 mos_obj_refd(spa->spa_spares.sav_object); 5248 5249 if (spa->spa_syncing_log_sm != NULL) 5250 mos_obj_refd(spa->spa_syncing_log_sm->sm_object); 5251 mos_leak_log_spacemaps(spa); 5252 5253 mos_obj_refd(spa->spa_condensing_indirect_phys. 5254 scip_next_mapping_object); 5255 mos_obj_refd(spa->spa_condensing_indirect_phys. 5256 scip_prev_obsolete_sm_object); 5257 if (spa->spa_condensing_indirect_phys.scip_next_mapping_object != 0) { 5258 vdev_indirect_mapping_t *vim = 5259 vdev_indirect_mapping_open(mos, 5260 spa->spa_condensing_indirect_phys.scip_next_mapping_object); 5261 mos_obj_refd(vim->vim_phys->vimp_counts_object); 5262 vdev_indirect_mapping_close(vim); 5263 } 5264 5265 if (dp->dp_origin_snap != NULL) { 5266 dsl_dataset_t *ds; 5267 5268 dsl_pool_config_enter(dp, FTAG); 5269 VERIFY0(dsl_dataset_hold_obj(dp, 5270 dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj, 5271 FTAG, &ds)); 5272 count_ds_mos_objects(ds); 5273 dump_deadlist(&ds->ds_deadlist); 5274 dsl_dataset_rele(ds, FTAG); 5275 dsl_pool_config_exit(dp, FTAG); 5276 5277 count_ds_mos_objects(dp->dp_origin_snap); 5278 dump_deadlist(&dp->dp_origin_snap->ds_deadlist); 5279 } 5280 count_dir_mos_objects(dp->dp_mos_dir); 5281 if (dp->dp_free_dir != NULL) 5282 count_dir_mos_objects(dp->dp_free_dir); 5283 if (dp->dp_leak_dir != NULL) 5284 count_dir_mos_objects(dp->dp_leak_dir); 5285 5286 mos_leak_vdev(spa->spa_root_vdev); 5287 5288 for (uint64_t class = 0; class < DDT_CLASSES; class++) { 5289 for (uint64_t type = 0; type < DDT_TYPES; type++) { 5290 for (uint64_t cksum = 0; 5291 cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) { 5292 ddt_t *ddt = spa->spa_ddt[cksum]; 5293 mos_obj_refd(ddt->ddt_object[type][class]); 5294 } 5295 } 
5296 } 5297 5298 /* 5299 * Visit all allocated objects and make sure they are referenced. 5300 */ 5301 uint64_t object = 0; 5302 while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) { 5303 if (range_tree_contains(mos_refd_objs, object, 1)) { 5304 range_tree_remove(mos_refd_objs, object, 1); 5305 } else { 5306 dmu_object_info_t doi; 5307 const char *name; 5308 dmu_object_info(mos, object, &doi); 5309 if (doi.doi_type & DMU_OT_NEWTYPE) { 5310 dmu_object_byteswap_t bswap = 5311 DMU_OT_BYTESWAP(doi.doi_type); 5312 name = dmu_ot_byteswap[bswap].ob_name; 5313 } else { 5314 name = dmu_ot[doi.doi_type].ot_name; 5315 } 5316 5317 (void) printf("MOS object %llu (%s) leaked\n", 5318 (u_longlong_t)object, name); 5319 rv = 2; 5320 } 5321 } 5322 (void) range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL); 5323 if (!range_tree_is_empty(mos_refd_objs)) 5324 rv = 2; 5325 range_tree_vacate(mos_refd_objs, NULL, NULL); 5326 range_tree_destroy(mos_refd_objs); 5327 return (rv); 5328 } 5329 5330 typedef struct log_sm_obsolete_stats_arg { 5331 uint64_t lsos_current_txg; 5332 5333 uint64_t lsos_total_entries; 5334 uint64_t lsos_valid_entries; 5335 5336 uint64_t lsos_sm_entries; 5337 uint64_t lsos_valid_sm_entries; 5338 } log_sm_obsolete_stats_arg_t; 5339 5340 static int 5341 log_spacemap_obsolete_stats_cb(spa_t *spa, space_map_entry_t *sme, 5342 uint64_t txg, void *arg) 5343 { 5344 log_sm_obsolete_stats_arg_t *lsos = arg; 5345 uint64_t offset = sme->sme_offset; 5346 uint64_t vdev_id = sme->sme_vdev; 5347 5348 if (lsos->lsos_current_txg == 0) { 5349 /* this is the first log */ 5350 lsos->lsos_current_txg = txg; 5351 } else if (lsos->lsos_current_txg < txg) { 5352 /* we just changed log - print stats and reset */ 5353 (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n", 5354 (u_longlong_t)lsos->lsos_valid_sm_entries, 5355 (u_longlong_t)lsos->lsos_sm_entries, 5356 (u_longlong_t)lsos->lsos_current_txg); 5357 lsos->lsos_valid_sm_entries = 0; 5358 lsos->lsos_sm_entries = 0; 5359 
lsos->lsos_current_txg = txg; 5360 } 5361 ASSERT3U(lsos->lsos_current_txg, ==, txg); 5362 5363 lsos->lsos_sm_entries++; 5364 lsos->lsos_total_entries++; 5365 5366 vdev_t *vd = vdev_lookup_top(spa, vdev_id); 5367 if (!vdev_is_concrete(vd)) 5368 return (0); 5369 5370 metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 5371 ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); 5372 5373 if (txg < metaslab_unflushed_txg(ms)) 5374 return (0); 5375 lsos->lsos_valid_sm_entries++; 5376 lsos->lsos_valid_entries++; 5377 return (0); 5378 } 5379 5380 static void 5381 dump_log_spacemap_obsolete_stats(spa_t *spa) 5382 { 5383 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) 5384 return; 5385 5386 log_sm_obsolete_stats_arg_t lsos; 5387 bzero(&lsos, sizeof (lsos)); 5388 5389 (void) printf("Log Space Map Obsolete Entry Statistics:\n"); 5390 5391 iterate_through_spacemap_logs(spa, 5392 log_spacemap_obsolete_stats_cb, &lsos); 5393 5394 /* print stats for latest log */ 5395 (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n", 5396 (u_longlong_t)lsos.lsos_valid_sm_entries, 5397 (u_longlong_t)lsos.lsos_sm_entries, 5398 (u_longlong_t)lsos.lsos_current_txg); 5399 5400 (void) printf("%-8llu valid entries out of %-8llu - total\n\n", 5401 (u_longlong_t)lsos.lsos_valid_entries, 5402 (u_longlong_t)lsos.lsos_total_entries); 5403 } 5404 5405 static void 5406 dump_zpool(spa_t *spa) 5407 { 5408 dsl_pool_t *dp = spa_get_dsl(spa); 5409 int rc = 0; 5410 5411 if (dump_opt['S']) { 5412 dump_simulated_ddt(spa); 5413 return; 5414 } 5415 5416 if (!dump_opt['e'] && dump_opt['C'] > 1) { 5417 (void) printf("\nCached configuration:\n"); 5418 dump_nvlist(spa->spa_config, 8); 5419 } 5420 5421 if (dump_opt['C']) 5422 dump_config(spa); 5423 5424 if (dump_opt['u']) 5425 dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n"); 5426 5427 if (dump_opt['D']) 5428 dump_all_ddts(spa); 5429 5430 if (dump_opt['d'] > 2 || dump_opt['m']) 5431 dump_metaslabs(spa); 5432 if 
(dump_opt['M']) 5433 dump_metaslab_groups(spa); 5434 if (dump_opt['d'] > 2 || dump_opt['m']) { 5435 dump_log_spacemaps(spa); 5436 dump_log_spacemap_obsolete_stats(spa); 5437 } 5438 5439 if (dump_opt['d'] || dump_opt['i']) { 5440 mos_refd_objs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 5441 0); 5442 dump_dir(dp->dp_meta_objset); 5443 5444 if (dump_opt['d'] >= 3) { 5445 dsl_pool_t *dp = spa->spa_dsl_pool; 5446 dump_full_bpobj(&spa->spa_deferred_bpobj, 5447 "Deferred frees", 0); 5448 if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { 5449 dump_full_bpobj(&dp->dp_free_bpobj, 5450 "Pool snapshot frees", 0); 5451 } 5452 if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { 5453 ASSERT(spa_feature_is_enabled(spa, 5454 SPA_FEATURE_DEVICE_REMOVAL)); 5455 dump_full_bpobj(&dp->dp_obsolete_bpobj, 5456 "Pool obsolete blocks", 0); 5457 } 5458 5459 if (spa_feature_is_active(spa, 5460 SPA_FEATURE_ASYNC_DESTROY)) { 5461 dump_bptree(spa->spa_meta_objset, 5462 dp->dp_bptree_obj, 5463 "Pool dataset frees"); 5464 } 5465 dump_dtl(spa->spa_root_vdev, 0); 5466 } 5467 (void) dmu_objset_find(spa_name(spa), dump_one_dir, 5468 NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); 5469 5470 if (rc == 0 && !dump_opt['L']) 5471 rc = dump_mos_leaks(spa); 5472 5473 for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { 5474 uint64_t refcount; 5475 5476 if (!(spa_feature_table[f].fi_flags & 5477 ZFEATURE_FLAG_PER_DATASET) || 5478 !spa_feature_is_enabled(spa, f)) { 5479 ASSERT0(dataset_feature_count[f]); 5480 continue; 5481 } 5482 (void) feature_get_refcount(spa, 5483 &spa_feature_table[f], &refcount); 5484 if (dataset_feature_count[f] != refcount) { 5485 (void) printf("%s feature refcount mismatch: " 5486 "%lld datasets != %lld refcount\n", 5487 spa_feature_table[f].fi_uname, 5488 (longlong_t)dataset_feature_count[f], 5489 (longlong_t)refcount); 5490 rc = 2; 5491 } else { 5492 (void) printf("Verified %s feature refcount " 5493 "of %llu is correct\n", 5494 spa_feature_table[f].fi_uname, 5495 (longlong_t)refcount); 
5496 } 5497 } 5498 5499 if (rc == 0) 5500 rc = verify_device_removal_feature_counts(spa); 5501 } 5502 5503 if (rc == 0 && (dump_opt['b'] || dump_opt['c'])) 5504 rc = dump_block_stats(spa); 5505 5506 if (rc == 0) 5507 rc = verify_spacemap_refcounts(spa); 5508 5509 if (dump_opt['s']) 5510 show_pool_stats(spa); 5511 5512 if (dump_opt['h']) 5513 dump_history(spa); 5514 5515 if (rc == 0) 5516 rc = verify_checkpoint(spa); 5517 5518 if (rc != 0) { 5519 dump_debug_buffer(); 5520 exit(rc); 5521 } 5522 } 5523 5524 #define ZDB_FLAG_CHECKSUM 0x0001 5525 #define ZDB_FLAG_DECOMPRESS 0x0002 5526 #define ZDB_FLAG_BSWAP 0x0004 5527 #define ZDB_FLAG_GBH 0x0008 5528 #define ZDB_FLAG_INDIRECT 0x0010 5529 #define ZDB_FLAG_PHYS 0x0020 5530 #define ZDB_FLAG_RAW 0x0040 5531 #define ZDB_FLAG_PRINT_BLKPTR 0x0080 5532 5533 static int flagbits[256]; 5534 5535 static void 5536 zdb_print_blkptr(blkptr_t *bp, int flags) 5537 { 5538 char blkbuf[BP_SPRINTF_LEN]; 5539 5540 if (flags & ZDB_FLAG_BSWAP) 5541 byteswap_uint64_array((void *)bp, sizeof (blkptr_t)); 5542 5543 snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); 5544 (void) printf("%s\n", blkbuf); 5545 } 5546 5547 static void 5548 zdb_dump_indirect(blkptr_t *bp, int nbps, int flags) 5549 { 5550 int i; 5551 5552 for (i = 0; i < nbps; i++) 5553 zdb_print_blkptr(&bp[i], flags); 5554 } 5555 5556 static void 5557 zdb_dump_gbh(void *buf, int flags) 5558 { 5559 zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags); 5560 } 5561 5562 static void 5563 zdb_dump_block_raw(void *buf, uint64_t size, int flags) 5564 { 5565 if (flags & ZDB_FLAG_BSWAP) 5566 byteswap_uint64_array(buf, size); 5567 (void) write(1, buf, size); 5568 } 5569 5570 static void 5571 zdb_dump_block(char *label, void *buf, uint64_t size, int flags) 5572 { 5573 uint64_t *d = (uint64_t *)buf; 5574 unsigned nwords = size / sizeof (uint64_t); 5575 int do_bswap = !!(flags & ZDB_FLAG_BSWAP); 5576 unsigned i, j; 5577 const char *hdr; 5578 char *c; 5579 5580 5581 if (do_bswap) 5582 hdr = " 7 6 
5 4 3 2 1 0 f e d c b a 9 8"; 5583 else 5584 hdr = " 0 1 2 3 4 5 6 7 8 9 a b c d e f"; 5585 5586 (void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr); 5587 5588 for (i = 0; i < nwords; i += 2) { 5589 (void) printf("%06llx: %016llx %016llx ", 5590 (u_longlong_t)(i * sizeof (uint64_t)), 5591 (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]), 5592 (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1])); 5593 5594 c = (char *)&d[i]; 5595 for (j = 0; j < 2 * sizeof (uint64_t); j++) 5596 (void) printf("%c", isprint(c[j]) ? c[j] : '.'); 5597 (void) printf("\n"); 5598 } 5599 } 5600 5601 /* 5602 * There are two acceptable formats: 5603 * leaf_name - For example: c1t0d0 or /tmp/ztest.0a 5604 * child[.child]* - For example: 0.1.1 5605 * 5606 * The second form can be used to specify arbitrary vdevs anywhere 5607 * in the heirarchy. For example, in a pool with a mirror of 5608 * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 . 5609 */ 5610 static vdev_t * 5611 zdb_vdev_lookup(vdev_t *vdev, const char *path) 5612 { 5613 char *s, *p, *q; 5614 unsigned i; 5615 5616 if (vdev == NULL) 5617 return (NULL); 5618 5619 /* First, assume the x.x.x.x format */ 5620 i = strtoul(path, &s, 10); 5621 if (s == path || (s && *s != '.' && *s != '\0')) 5622 goto name; 5623 if (i >= vdev->vdev_children) 5624 return (NULL); 5625 5626 vdev = vdev->vdev_child[i]; 5627 if (*s == '\0') 5628 return (vdev); 5629 return (zdb_vdev_lookup(vdev, s+1)); 5630 5631 name: 5632 for (i = 0; i < vdev->vdev_children; i++) { 5633 vdev_t *vc = vdev->vdev_child[i]; 5634 5635 if (vc->vdev_path == NULL) { 5636 vc = zdb_vdev_lookup(vc, path); 5637 if (vc == NULL) 5638 continue; 5639 else 5640 return (vc); 5641 } 5642 5643 p = strrchr(vc->vdev_path, '/'); 5644 p = p ? 
/*
 * Adapter that lets random_get_pseudo_bytes() be used as a
 * (buf, len, private) callback; the private argument is ignored.
 */
/* ARGSUSED */
static int
random_get_pseudo_bytes_cb(void *buf, size_t len, void *unused)
{
	int rc;

	rc = random_get_pseudo_bytes(buf, len);
	return (rc);
}
s : "", NULL, 16); 5709 s = strtok(NULL, ":"); 5710 if (s) 5711 flagstr = strdup(s); 5712 else 5713 flagstr = strdup(""); 5714 5715 s = NULL; 5716 if (size == 0) 5717 s = "size must not be zero"; 5718 if (!IS_P2ALIGNED(size, DEV_BSIZE)) 5719 s = "size must be a multiple of sector size"; 5720 if (!IS_P2ALIGNED(offset, DEV_BSIZE)) 5721 s = "offset must be a multiple of sector size"; 5722 if (s) { 5723 (void) printf("Invalid block specifier: %s - %s\n", thing, s); 5724 free(dup); 5725 return; 5726 } 5727 5728 for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) { 5729 for (i = 0; flagstr[i]; i++) { 5730 int bit = flagbits[(uchar_t)flagstr[i]]; 5731 5732 if (bit == 0) { 5733 (void) printf("***Invalid flag: %c\n", 5734 flagstr[i]); 5735 continue; 5736 } 5737 flags |= bit; 5738 5739 /* If it's not something with an argument, keep going */ 5740 if ((bit & (ZDB_FLAG_CHECKSUM | 5741 ZDB_FLAG_PRINT_BLKPTR)) == 0) 5742 continue; 5743 5744 p = &flagstr[i + 1]; 5745 if (bit == ZDB_FLAG_PRINT_BLKPTR) 5746 blkptr_offset = strtoull(p, &p, 16); 5747 if (*p != ':' && *p != '\0') { 5748 (void) printf("***Invalid flag arg: '%s'\n", s); 5749 free(dup); 5750 return; 5751 } 5752 } 5753 } 5754 free(flagstr); 5755 5756 vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev); 5757 if (vd == NULL) { 5758 (void) printf("***Invalid vdev: %s\n", vdev); 5759 free(dup); 5760 return; 5761 } else { 5762 if (vd->vdev_path) 5763 (void) fprintf(stderr, "Found vdev: %s\n", 5764 vd->vdev_path); 5765 else 5766 (void) fprintf(stderr, "Found vdev type: %s\n", 5767 vd->vdev_ops->vdev_op_type); 5768 } 5769 5770 psize = size; 5771 lsize = size; 5772 5773 pabd = abd_alloc_linear(SPA_MAXBLOCKSIZE, B_FALSE); 5774 lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); 5775 5776 BP_ZERO(bp); 5777 5778 DVA_SET_VDEV(&dva[0], vd->vdev_id); 5779 DVA_SET_OFFSET(&dva[0], offset); 5780 DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH)); 5781 DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize)); 5782 5783 BP_SET_BIRTH(bp, 
TXG_INITIAL, TXG_INITIAL); 5784 5785 BP_SET_LSIZE(bp, lsize); 5786 BP_SET_PSIZE(bp, psize); 5787 BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); 5788 BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); 5789 BP_SET_TYPE(bp, DMU_OT_NONE); 5790 BP_SET_LEVEL(bp, 0); 5791 BP_SET_DEDUP(bp, 0); 5792 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 5793 5794 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5795 zio = zio_root(spa, NULL, NULL, 0); 5796 5797 if (vd == vd->vdev_top) { 5798 /* 5799 * Treat this as a normal block read. 5800 */ 5801 zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL, 5802 ZIO_PRIORITY_SYNC_READ, 5803 ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL)); 5804 } else { 5805 /* 5806 * Treat this as a vdev child I/O. 5807 */ 5808 zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd, 5809 psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, 5810 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE | 5811 ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | 5812 ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_OPTIONAL, 5813 NULL, NULL)); 5814 } 5815 5816 error = zio_wait(zio); 5817 spa_config_exit(spa, SCL_STATE, FTAG); 5818 5819 if (error) { 5820 (void) printf("Read of %s failed, error: %d\n", thing, error); 5821 goto out; 5822 } 5823 5824 if (flags & ZDB_FLAG_DECOMPRESS) { 5825 /* 5826 * We don't know how the data was compressed, so just try 5827 * every decompress function at every inflated blocksize. 
5828 */ 5829 enum zio_compress c; 5830 void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); 5831 void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); 5832 5833 abd_copy_to_buf(pbuf2, pabd, psize); 5834 5835 VERIFY0(abd_iterate_func(pabd, psize, SPA_MAXBLOCKSIZE - psize, 5836 random_get_pseudo_bytes_cb, NULL)); 5837 5838 VERIFY0(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize, 5839 SPA_MAXBLOCKSIZE - psize)); 5840 5841 for (lsize = SPA_MAXBLOCKSIZE; lsize > psize; 5842 lsize -= SPA_MINBLOCKSIZE) { 5843 for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) { 5844 if (zio_decompress_data(c, pabd, 5845 lbuf, psize, lsize) == 0 && 5846 zio_decompress_data_buf(c, pbuf2, 5847 lbuf2, psize, lsize) == 0 && 5848 bcmp(lbuf, lbuf2, lsize) == 0) 5849 break; 5850 } 5851 if (c != ZIO_COMPRESS_FUNCTIONS) 5852 break; 5853 lsize -= SPA_MINBLOCKSIZE; 5854 } 5855 5856 umem_free(pbuf2, SPA_MAXBLOCKSIZE); 5857 umem_free(lbuf2, SPA_MAXBLOCKSIZE); 5858 5859 if (lsize <= psize) { 5860 (void) printf("Decompress of %s failed\n", thing); 5861 goto out; 5862 } 5863 buf = lbuf; 5864 size = lsize; 5865 } else { 5866 buf = abd_to_buf(pabd); 5867 size = psize; 5868 } 5869 5870 if (flags & ZDB_FLAG_PRINT_BLKPTR) 5871 zdb_print_blkptr((blkptr_t *)(void *) 5872 ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags); 5873 else if (flags & ZDB_FLAG_RAW) 5874 zdb_dump_block_raw(buf, size, flags); 5875 else if (flags & ZDB_FLAG_INDIRECT) 5876 zdb_dump_indirect((blkptr_t *)buf, size / sizeof (blkptr_t), 5877 flags); 5878 else if (flags & ZDB_FLAG_GBH) 5879 zdb_dump_gbh(buf, flags); 5880 else 5881 zdb_dump_block(thing, buf, size, flags); 5882 5883 out: 5884 abd_free(pabd); 5885 umem_free(lbuf, SPA_MAXBLOCKSIZE); 5886 free(dup); 5887 } 5888 5889 static void 5890 zdb_embedded_block(char *thing) 5891 { 5892 blkptr_t bp; 5893 unsigned long long *words = (void *)&bp; 5894 char *buf; 5895 int err; 5896 5897 bzero(&bp, sizeof (bp)); 5898 err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:" 5899 
"%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx", 5900 words + 0, words + 1, words + 2, words + 3, 5901 words + 4, words + 5, words + 6, words + 7, 5902 words + 8, words + 9, words + 10, words + 11, 5903 words + 12, words + 13, words + 14, words + 15); 5904 if (err != 16) { 5905 (void) fprintf(stderr, "invalid input format\n"); 5906 exit(1); 5907 } 5908 ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE); 5909 buf = malloc(SPA_MAXBLOCKSIZE); 5910 if (buf == NULL) { 5911 (void) fprintf(stderr, "out of memory\n"); 5912 exit(1); 5913 } 5914 err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp)); 5915 if (err != 0) { 5916 (void) fprintf(stderr, "decode failed: %u\n", err); 5917 exit(1); 5918 } 5919 zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0); 5920 free(buf); 5921 } 5922 5923 int 5924 main(int argc, char **argv) 5925 { 5926 int c; 5927 struct rlimit rl = { 1024, 1024 }; 5928 spa_t *spa = NULL; 5929 objset_t *os = NULL; 5930 int dump_all = 1; 5931 int verbose = 0; 5932 int error = 0; 5933 char **searchdirs = NULL; 5934 int nsearch = 0; 5935 char *target, *target_pool; 5936 nvlist_t *policy = NULL; 5937 uint64_t max_txg = UINT64_MAX; 5938 int flags = ZFS_IMPORT_MISSING_LOG; 5939 int rewind = ZPOOL_NEVER_REWIND; 5940 char *spa_config_path_env; 5941 boolean_t target_is_spa = B_TRUE; 5942 nvlist_t *cfg = NULL; 5943 5944 (void) setrlimit(RLIMIT_NOFILE, &rl); 5945 (void) enable_extended_FILE_stdio(-1, -1); 5946 5947 dprintf_setup(&argc, argv); 5948 5949 /* 5950 * If there is an environment variable SPA_CONFIG_PATH it overrides 5951 * default spa_config_path setting. If -U flag is specified it will 5952 * override this environment variable settings once again. 5953 */ 5954 spa_config_path_env = getenv("SPA_CONFIG_PATH"); 5955 if (spa_config_path_env != NULL) 5956 spa_config_path = spa_config_path_env; 5957 5958 /* 5959 * For performance reasons, we set this tunable down. We do so before 5960 * the arg parsing section so that the user can override this value if 5961 * they choose. 
5962 */ 5963 zfs_btree_verify_intensity = 3; 5964 5965 while ((c = getopt(argc, argv, 5966 "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:X")) != -1) { 5967 switch (c) { 5968 case 'b': 5969 case 'c': 5970 case 'C': 5971 case 'd': 5972 case 'D': 5973 case 'E': 5974 case 'G': 5975 case 'h': 5976 case 'i': 5977 case 'l': 5978 case 'm': 5979 case 'M': 5980 case 'O': 5981 case 'R': 5982 case 's': 5983 case 'S': 5984 case 'u': 5985 dump_opt[c]++; 5986 dump_all = 0; 5987 break; 5988 case 'A': 5989 case 'e': 5990 case 'F': 5991 case 'k': 5992 case 'L': 5993 case 'P': 5994 case 'q': 5995 case 'X': 5996 dump_opt[c]++; 5997 break; 5998 /* NB: Sort single match options below. */ 5999 case 'I': 6000 max_inflight = strtoull(optarg, NULL, 0); 6001 if (max_inflight == 0) { 6002 (void) fprintf(stderr, "maximum number " 6003 "of inflight I/Os must be greater " 6004 "than 0\n"); 6005 usage(); 6006 } 6007 break; 6008 case 'o': 6009 error = set_global_var(optarg); 6010 if (error != 0) 6011 usage(); 6012 break; 6013 case 'p': 6014 if (searchdirs == NULL) { 6015 searchdirs = umem_alloc(sizeof (char *), 6016 UMEM_NOFAIL); 6017 } else { 6018 char **tmp = umem_alloc((nsearch + 1) * 6019 sizeof (char *), UMEM_NOFAIL); 6020 bcopy(searchdirs, tmp, nsearch * 6021 sizeof (char *)); 6022 umem_free(searchdirs, 6023 nsearch * sizeof (char *)); 6024 searchdirs = tmp; 6025 } 6026 searchdirs[nsearch++] = optarg; 6027 break; 6028 case 't': 6029 max_txg = strtoull(optarg, NULL, 0); 6030 if (max_txg < TXG_INITIAL) { 6031 (void) fprintf(stderr, "incorrect txg " 6032 "specified: %s\n", optarg); 6033 usage(); 6034 } 6035 break; 6036 case 'U': 6037 spa_config_path = optarg; 6038 if (spa_config_path[0] != '/') { 6039 (void) fprintf(stderr, 6040 "cachefile must be an absolute path " 6041 "(i.e. 
start with a slash)\n"); 6042 usage(); 6043 } 6044 break; 6045 case 'v': 6046 verbose++; 6047 break; 6048 case 'V': 6049 flags = ZFS_IMPORT_VERBATIM; 6050 break; 6051 case 'x': 6052 vn_dumpdir = optarg; 6053 break; 6054 default: 6055 usage(); 6056 break; 6057 } 6058 } 6059 6060 if (!dump_opt['e'] && searchdirs != NULL) { 6061 (void) fprintf(stderr, "-p option requires use of -e\n"); 6062 usage(); 6063 } 6064 6065 /* 6066 * ZDB does not typically re-read blocks; therefore limit the ARC 6067 * to 256 MB, which can be used entirely for metadata. 6068 */ 6069 zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024; 6070 6071 /* 6072 * "zdb -c" uses checksum-verifying scrub i/os which are async reads. 6073 * "zdb -b" uses traversal prefetch which uses async reads. 6074 * For good performance, let several of them be active at once. 6075 */ 6076 zfs_vdev_async_read_max_active = 10; 6077 6078 /* 6079 * Disable reference tracking for better performance. 6080 */ 6081 reference_tracking_enable = B_FALSE; 6082 6083 /* 6084 * Do not fail spa_load when spa_load_verify fails. This is needed 6085 * to load non-idle pools. 
6086 */ 6087 spa_load_verify_dryrun = B_TRUE; 6088 6089 kernel_init(FREAD); 6090 6091 if (dump_all) 6092 verbose = MAX(verbose, 1); 6093 6094 for (c = 0; c < 256; c++) { 6095 if (dump_all && strchr("AeEFklLOPRSX", c) == NULL) 6096 dump_opt[c] = 1; 6097 if (dump_opt[c]) 6098 dump_opt[c] += verbose; 6099 } 6100 6101 aok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2); 6102 zfs_recover = (dump_opt['A'] > 1); 6103 6104 argc -= optind; 6105 argv += optind; 6106 6107 if (argc < 2 && dump_opt['R']) 6108 usage(); 6109 6110 if (dump_opt['E']) { 6111 if (argc != 1) 6112 usage(); 6113 zdb_embedded_block(argv[0]); 6114 return (0); 6115 } 6116 6117 if (argc < 1) { 6118 if (!dump_opt['e'] && dump_opt['C']) { 6119 dump_cachefile(spa_config_path); 6120 return (0); 6121 } 6122 usage(); 6123 } 6124 6125 if (dump_opt['l']) 6126 return (dump_label(argv[0])); 6127 6128 if (dump_opt['O']) { 6129 if (argc != 2) 6130 usage(); 6131 dump_opt['v'] = verbose + 3; 6132 return (dump_path(argv[0], argv[1])); 6133 } 6134 6135 if (dump_opt['X'] || dump_opt['F']) 6136 rewind = ZPOOL_DO_REWIND | 6137 (dump_opt['X'] ? 
ZPOOL_EXTREME_REWIND : 0); 6138 6139 if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 || 6140 nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, max_txg) != 0 || 6141 nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind) != 0) 6142 fatal("internal error: %s", strerror(ENOMEM)); 6143 6144 error = 0; 6145 target = argv[0]; 6146 6147 if (strpbrk(target, "/@") != NULL) { 6148 size_t targetlen; 6149 6150 target_pool = strdup(target); 6151 *strpbrk(target_pool, "/@") = '\0'; 6152 6153 target_is_spa = B_FALSE; 6154 targetlen = strlen(target); 6155 if (targetlen && target[targetlen - 1] == '/') 6156 target[targetlen - 1] = '\0'; 6157 } else { 6158 target_pool = target; 6159 } 6160 6161 if (dump_opt['e']) { 6162 importargs_t args = { 0 }; 6163 6164 args.paths = nsearch; 6165 args.path = searchdirs; 6166 args.can_be_active = B_TRUE; 6167 6168 error = zpool_find_config(NULL, target_pool, &cfg, &args, 6169 &libzpool_config_ops); 6170 6171 if (error == 0) { 6172 6173 if (nvlist_add_nvlist(cfg, 6174 ZPOOL_LOAD_POLICY, policy) != 0) { 6175 fatal("can't open '%s': %s", 6176 target, strerror(ENOMEM)); 6177 } 6178 6179 if (dump_opt['C'] > 1) { 6180 (void) printf("\nConfiguration for import:\n"); 6181 dump_nvlist(cfg, 8); 6182 } 6183 6184 /* 6185 * Disable the activity check to allow examination of 6186 * active pools. 
6187 */ 6188 error = spa_import(target_pool, cfg, NULL, 6189 flags | ZFS_IMPORT_SKIP_MMP); 6190 } 6191 } 6192 6193 char *checkpoint_pool = NULL; 6194 char *checkpoint_target = NULL; 6195 if (dump_opt['k']) { 6196 checkpoint_pool = import_checkpointed_state(target, cfg, 6197 &checkpoint_target); 6198 6199 if (checkpoint_target != NULL) 6200 target = checkpoint_target; 6201 6202 } 6203 6204 if (error == 0) { 6205 if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) { 6206 ASSERT(checkpoint_pool != NULL); 6207 ASSERT(checkpoint_target == NULL); 6208 6209 error = spa_open(checkpoint_pool, &spa, FTAG); 6210 if (error != 0) { 6211 fatal("Tried to open pool \"%s\" but " 6212 "spa_open() failed with error %d\n", 6213 checkpoint_pool, error); 6214 } 6215 6216 } else if (target_is_spa || dump_opt['R']) { 6217 zdb_set_skip_mmp(target); 6218 error = spa_open_rewind(target, &spa, FTAG, policy, 6219 NULL); 6220 if (error) { 6221 /* 6222 * If we're missing the log device then 6223 * try opening the pool after clearing the 6224 * log state. 
6225 */ 6226 mutex_enter(&spa_namespace_lock); 6227 if ((spa = spa_lookup(target)) != NULL && 6228 spa->spa_log_state == SPA_LOG_MISSING) { 6229 spa->spa_log_state = SPA_LOG_CLEAR; 6230 error = 0; 6231 } 6232 mutex_exit(&spa_namespace_lock); 6233 6234 if (!error) { 6235 error = spa_open_rewind(target, &spa, 6236 FTAG, policy, NULL); 6237 } 6238 } 6239 } else { 6240 zdb_set_skip_mmp(target); 6241 error = open_objset(target, DMU_OST_ANY, FTAG, &os); 6242 } 6243 } 6244 nvlist_free(policy); 6245 6246 if (error) 6247 fatal("can't open '%s': %s", target, strerror(error)); 6248 6249 argv++; 6250 argc--; 6251 if (!dump_opt['R']) { 6252 if (argc > 0) { 6253 zopt_objects = argc; 6254 zopt_object = calloc(zopt_objects, sizeof (uint64_t)); 6255 for (unsigned i = 0; i < zopt_objects; i++) { 6256 errno = 0; 6257 zopt_object[i] = strtoull(argv[i], NULL, 0); 6258 if (zopt_object[i] == 0 && errno != 0) 6259 fatal("bad number %s: %s", 6260 argv[i], strerror(errno)); 6261 } 6262 } 6263 if (os != NULL) { 6264 dump_dir(os); 6265 } else if (zopt_objects > 0 && !dump_opt['m']) { 6266 dump_dir(spa->spa_meta_objset); 6267 } else { 6268 dump_zpool(spa); 6269 } 6270 } else { 6271 flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR; 6272 flagbits['c'] = ZDB_FLAG_CHECKSUM; 6273 flagbits['d'] = ZDB_FLAG_DECOMPRESS; 6274 flagbits['e'] = ZDB_FLAG_BSWAP; 6275 flagbits['g'] = ZDB_FLAG_GBH; 6276 flagbits['i'] = ZDB_FLAG_INDIRECT; 6277 flagbits['p'] = ZDB_FLAG_PHYS; 6278 flagbits['r'] = ZDB_FLAG_RAW; 6279 6280 for (int i = 0; i < argc; i++) 6281 zdb_read_block(argv[i], spa); 6282 } 6283 6284 if (dump_opt['k']) { 6285 free(checkpoint_pool); 6286 if (!target_is_spa) 6287 free(checkpoint_target); 6288 } 6289 6290 if (os != NULL) 6291 close_objset(os, FTAG); 6292 else 6293 spa_close(spa, FTAG); 6294 6295 fuid_table_destroy(); 6296 6297 dump_debug_buffer(); 6298 6299 kernel_fini(); 6300 6301 return (error); 6302 } 6303