/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
 * Copyright (c) 2022 by Pawel Jakub Dawidek
 * Copyright (c) 2019, 2023, Klara Inc.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/ddt.h>
#include <sys/ddt_impl.h>
#include <sys/zap.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
#include <sys/dsl_pool.h>
#include <sys/zio_checksum.h>
#include <sys/dsl_scan.h>
#include <sys/abd.h>

/*
 * # DDT: Deduplication tables
 *
 * The dedup subsystem provides block-level deduplication. When enabled, blocks
 * to be written will have the dedup (D) bit set, which causes them to be
 * tracked in a "dedup table", or DDT. If a block has been seen before (exists
 * in the DDT), then instead of being written, it is made to reference the
 * existing on-disk data, and its refcount in the DDT is bumped.
 *
 * ## Dedup tables and entries
 *
 * Conceptually, a DDT is a dictionary or map. Each entry has a "key"
 * (ddt_key_t) made up of a block's checksum and certain properties, and a
 * "value" (one or more ddt_phys_t) containing valid DVAs for the block's
 * data, birth time and refcount. Together these are enough to track
 * references to a specific block, to build a valid block pointer to reference
 * that block (for freeing, scrubbing, etc.), and to fill a new block pointer
 * with the missing pieces to make it seem like it was written.
 *
 * There's a single DDT (ddt_t) for each checksum type, held in spa_ddt[].
 * Within each DDT, there can be multiple storage "types" (ddt_type_t, on-disk
 * object data formats, each with their own implementations) and "classes"
 * (ddt_class_t, an instance of a storage type object, for entries with a
 * specific characteristic). An entry (key) will only ever exist on one of
 * these objects at any given time, but may be moved from one to another if
 * its type or class changes.
 *
 * The DDT is driven by the write IO pipeline (zio_ddt_write()). When a block
 * is to be written, before DVAs have been allocated, ddt_lookup() is called to
 * see if the block has been seen before. If it's not found, the write proceeds
 * as normal, and after it succeeds, a new entry is created. If it is found, we
 * fill the BP with the DVAs from the entry, increment the refcount and cause
 * the write IO to return immediately.
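 *
 * In outline (a condensed sketch using the helpers defined below; the real
 * logic in zio_ddt_write() also handles lead ZIOs, dedup verification and
 * error paths, and the slot index p is explained in the next paragraph):
 *
 *	ddt_t *ddt = ddt_select(spa, bp);
 *	ddt_enter(ddt);
 *	ddt_entry_t *dde = ddt_lookup(ddt, bp, B_TRUE);
 *	if (dde != NULL && dde->dde_phys[p].ddp_phys_birth != 0) {
 *		ddt_bp_fill(&dde->dde_phys[p], bp, txg);  // reuse existing DVAs
 *		ddt_phys_addref(&dde->dde_phys[p]);       // one more reference
 *	} else {
 *		// issue the write; on success ddt_phys_fill() records the
 *		// new DVAs in the entry
 *	}
 *	ddt_exit(ddt);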
 *
 * Each ddt_phys_t slot in the entry represents a separate dedup block for the
 * same content/checksum. The slot is selected based on the zp_copies parameter
 * the block is written with, that is, the number of DVAs in the block. The
 * "ditto" slot (DDT_PHYS_DITTO) was used by the now-removed "dedupditto"
 * feature. These are no longer written, and will be freed if encountered on
 * old pools.
 *
 * ## Lifetime of an entry
 *
 * A DDT can be enormous, and typically is not held in memory all at once.
 * Instead, the changes to an entry are tracked in memory, and written down to
 * disk at the end of each txg.
 *
 * A "live" in-memory entry (ddt_entry_t) is a node on the live tree
 * (ddt_tree). At the start of a txg, ddt_tree is empty. When an entry is
 * required for IO, ddt_lookup() is called. If an entry already exists on
 * ddt_tree, it is returned. Otherwise, a new one is created, and the
 * type/class objects for the DDT are searched for that key. If it is found,
 * its value is copied into the live entry. If not, an empty entry is created.
 *
 * The live entry will be modified during the txg, usually by modifying the
 * refcount, but sometimes by adding or updating DVAs. At the end of the txg
 * (during spa_sync()), the type and class are recalculated for the entry (see
 * ddt_sync_entry()), the entry is written to the appropriate storage object
 * and, if necessary, removed from the old one. ddt_tree is cleared and the
 * next txg can start.
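 *
 * A minimal sketch of the sync-side half of that cycle (the complete
 * version, including class transitions and stats updates, is in
 * ddt_sync_entry() and ddt_sync_table() below):
 *
 *	while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
 *		ddt_sync_entry(ddt, dde, tx, txg);  // write to storage object
 *		ddt_free(dde);                      // drop the live entry
 *	}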
 *
 * ## Dedup quota
 *
 * A maximum size for all DDTs on the pool can be set with the
 * dedup_table_quota property. This is determined in ddt_over_quota() and
 * enforced during ddt_lookup(). If the pool is at or over its quota limit,
 * ddt_lookup() will only return entries for existing blocks, as updates are
 * still possible. New entries will not be created; instead, ddt_lookup() will
 * return NULL. In response, the DDT write stage (zio_ddt_write()) will remove
 * the D bit on the block and reissue the IO as a regular write. The block will
 * not be deduplicated.
 *
 * Note that this is based on the on-disk size of the dedup store. Reclaiming
 * this space after deleting entries relies on the ZAP "shrinking" behaviour,
 * without which no space would be recovered and the DDT would continue to be
 * considered "over quota". See zap_shrink_enabled.
 *
 * ## Repair IO
 *
 * If a read on a dedup block fails, but there are other copies of the block in
 * the other ddt_phys_t slots, reads will be issued for those instead
 * (zio_ddt_read_start()). If one of those succeeds, the read is returned to
 * the caller, and a copy is stashed on the entry's dde_repair_abd.
 *
 * During the end-of-txg sync, any entries with a dde_repair_abd get a
 * "rewrite" write issued for the original block pointer, with the data read
 * from the alternate block. If the block is actually damaged, this will invoke
 * the pool's "self-healing" mechanism, and repair the block.
 *
 * ## Scanning (scrub/resilver)
 *
 * If dedup is active, the scrub machinery will walk the dedup table first,
 * scrubbing all blocks with refcnt > 1. After that it will move on to the
 * regular top-down scrub, and exclude the refcnt > 1 blocks when it sees them.
 * In this way, heavily deduplicated blocks are only scrubbed once. See the
 * commentary on dsl_scan_ddt() for more details.
 *
 * Walking the DDT is done via ddt_walk(). The current position is stored in a
 * ddt_bookmark_t, which represents a stable position in the storage object.
 * This bookmark is stored by the scan machinery, and must reference the same
 * position on the object even if the object changes, the pool is exported, or
 * OpenZFS is upgraded.
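 *
 * A minimal sketch of a full walk (unlike this sketch, the scan machinery
 * persists the bookmark across txgs and pool export rather than starting
 * from zero each time):
 *
 *	ddt_bookmark_t ddb = { 0 };
 *	ddt_entry_t dde;
 *	while (ddt_walk(spa, &ddb, &dde) == 0) {
 *		// process dde; ddb now records the position after it
 *	}
 *	// ddt_walk() returns ENOENT once every table has been visited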
 *
 * ## Interaction with block cloning
 *
 * If block cloning and dedup are both enabled on a pool, BRT will look for the
 * dedup bit on an incoming block pointer. If set, it will call into the DDT
 * (ddt_addref()) to add a reference to the block, instead of adding a
 * reference to the BRT. See brt_pending_apply().
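 *
 * From the cloning side this is roughly (a sketch of the caller; the
 * authoritative flow lives in brt_pending_apply() in brt.c):
 *
 *	if (BP_GET_DEDUP(bp) && ddt_addref(spa, bp)) {
 *		// reference taken on the existing DDT entry
 *	} else {
 *		// not in the DDT; take a BRT reference instead
 *	}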
 */

/*
 * These are the only checksums valid for dedup. They must match the list
 * from dedup_table in zfs_prop.c
 */
#define	DDT_CHECKSUM_VALID(c)	\
	(c == ZIO_CHECKSUM_SHA256 || c == ZIO_CHECKSUM_SHA512 || \
	c == ZIO_CHECKSUM_SKEIN || c == ZIO_CHECKSUM_EDONR || \
	c == ZIO_CHECKSUM_BLAKE3)

static kmem_cache_t *ddt_cache;
static kmem_cache_t *ddt_entry_cache;

/*
 * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
 */
int zfs_dedup_prefetch = 0;

/*
 * If the dedup class cannot satisfy a DDT allocation, treat as over quota
 * for this many TXGs.
 */
uint_t dedup_class_wait_txgs = 5;

static const ddt_ops_t *const ddt_ops[DDT_TYPES] = {
	&ddt_zap_ops,
};

static const char *const ddt_class_name[DDT_CLASSES] = {
	"ditto",
	"duplicate",
	"unique",
};

static void
ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
    dmu_tx_t *tx)
{
	spa_t *spa = ddt->ddt_spa;
	objset_t *os = ddt->ddt_os;
	uint64_t *objectp = &ddt->ddt_object[type][class];
	boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags &
	    ZCHECKSUM_FLAG_DEDUP;
	char name[DDT_NAMELEN];

	ddt_object_name(ddt, type, class, name);

	ASSERT3U(*objectp, ==, 0);
	VERIFY0(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash));
	ASSERT3U(*objectp, !=, 0);

	VERIFY0(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
	    sizeof (uint64_t), 1, objectp, tx));

	VERIFY0(zap_add(os, spa->spa_ddt_stat_object, name,
	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
	    &ddt->ddt_histogram[type][class], tx));
}

static void
ddt_object_destroy(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
    dmu_tx_t *tx)
{
	spa_t *spa = ddt->ddt_spa;
	objset_t *os = ddt->ddt_os;
	uint64_t *objectp = &ddt->ddt_object[type][class];
	uint64_t count;
	char name[DDT_NAMELEN];

	ddt_object_name(ddt, type, class, name);

	ASSERT3U(*objectp, !=, 0);
	ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
	VERIFY0(ddt_object_count(ddt, type, class, &count));
	VERIFY0(count);
	VERIFY0(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx));
	VERIFY0(zap_remove(os, spa->spa_ddt_stat_object, name, tx));
	VERIFY0(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx));
	memset(&ddt->ddt_object_stats[type][class], 0, sizeof (ddt_object_t));

	*objectp = 0;
}

static int
ddt_object_load(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
{
	ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
	dmu_object_info_t doi;
	uint64_t count;
	char name[DDT_NAMELEN];
	int error;

	ddt_object_name(ddt, type, class, name);

	error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
	    sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
	if (error != 0)
		return (error);

	error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
	    &ddt->ddt_histogram[type][class]);
	if (error != 0)
		return (error);

	/*
	 * Seed the cached statistics.
	 */
	error = ddt_object_info(ddt, type, class, &doi);
	if (error)
		return (error);

	error = ddt_object_count(ddt, type, class, &count);
	if (error)
		return (error);

	ddo->ddo_count = count;
	ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
	ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;

	return (0);
}

static void
ddt_object_sync(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
    dmu_tx_t *tx)
{
	ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
	dmu_object_info_t doi;
	uint64_t count;
	char name[DDT_NAMELEN];

	ddt_object_name(ddt, type, class, name);

	VERIFY0(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
	    &ddt->ddt_histogram[type][class], tx));

	/*
	 * Cache DDT statistics; this is the only time they'll change.
	 */
	VERIFY0(ddt_object_info(ddt, type, class, &doi));
	VERIFY0(ddt_object_count(ddt, type, class, &count));

	ddo->ddo_count = count;
	ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
	ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
}

static boolean_t
ddt_object_exists(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
{
	return (!!ddt->ddt_object[type][class]);
}

static int
ddt_object_lookup(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
    ddt_entry_t *dde)
{
	if (!ddt_object_exists(ddt, type, class))
		return (SET_ERROR(ENOENT));

	return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
	    ddt->ddt_object[type][class], &dde->dde_key,
	    dde->dde_phys, sizeof (dde->dde_phys)));
}

static int
ddt_object_contains(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
    const ddt_key_t *ddk)
{
	if (!ddt_object_exists(ddt, type, class))
		return (SET_ERROR(ENOENT));

	return (ddt_ops[type]->ddt_op_contains(ddt->ddt_os,
	    ddt->ddt_object[type][class], ddk));
}

static void
ddt_object_prefetch(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
    const ddt_key_t *ddk)
{
	if (!ddt_object_exists(ddt, type, class))
		return;

	ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os,
	    ddt->ddt_object[type][class], ddk);
}

static void
ddt_object_prefetch_all(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
{
	if (!ddt_object_exists(ddt, type, class))
		return;

	ddt_ops[type]->ddt_op_prefetch_all(ddt->ddt_os,
	    ddt->ddt_object[type][class]);
}

static int
ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
    ddt_entry_t *dde, dmu_tx_t *tx)
{
	ASSERT(ddt_object_exists(ddt, type, class));

	return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
	    ddt->ddt_object[type][class], &dde->dde_key, dde->dde_phys,
	    sizeof (dde->dde_phys), tx));
}

static int
ddt_object_remove(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
    const ddt_key_t *ddk, dmu_tx_t *tx)
{
	ASSERT(ddt_object_exists(ddt, type, class));

	return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
	    ddt->ddt_object[type][class], ddk, tx));
}

int
ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
    uint64_t *walk, ddt_entry_t *dde)
{
	ASSERT(ddt_object_exists(ddt, type, class));

	return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
	    ddt->ddt_object[type][class], walk, &dde->dde_key,
	    dde->dde_phys, sizeof (dde->dde_phys)));
}

int
ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
    uint64_t *count)
{
	ASSERT(ddt_object_exists(ddt, type, class));

	return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
	    ddt->ddt_object[type][class], count));
}

int
ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
    dmu_object_info_t *doi)
{
	if (!ddt_object_exists(ddt, type, class))
		return (SET_ERROR(ENOENT));

	return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
	    doi));
}

void
ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
    char *name)
{
	(void) snprintf(name, DDT_NAMELEN, DMU_POOL_DDT,
	    zio_checksum_table[ddt->ddt_checksum].ci_name,
	    ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
}
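
/*
 * For reference, DMU_POOL_DDT is the format string "DDT-%s-%s-%s" (see
 * dmu.h), so the ZAP object holding sha256-checksummed entries with
 * refcnt > 1 ends up named "DDT-sha256-zap-duplicate" in the MOS object
 * directory.
 */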
void
ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
{
	ASSERT3U(txg, !=, 0);

	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
		bp->blk_dva[d] = ddp->ddp_dva[d];
	BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
}

/*
 * The bp created via this function may be used for repairs and scrub, but it
 * will be missing the salt / IV required to do a full decrypting read.
 */
void
ddt_bp_create(enum zio_checksum checksum,
    const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
{
	BP_ZERO(bp);

	if (ddp != NULL)
		ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);

	bp->blk_cksum = ddk->ddk_cksum;

	BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
	BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
	BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
	BP_SET_CRYPT(bp, DDK_GET_CRYPT(ddk));
	BP_SET_FILL(bp, 1);
	BP_SET_CHECKSUM(bp, checksum);
	BP_SET_TYPE(bp, DMU_OT_DEDUP);
	BP_SET_LEVEL(bp, 0);
	BP_SET_DEDUP(bp, 1);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
}

void
ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
{
	ddk->ddk_cksum = bp->blk_cksum;
	ddk->ddk_prop = 0;

	ASSERT(BP_IS_ENCRYPTED(bp) || !BP_USES_CRYPT(bp));

	DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
	DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
	DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
	DDK_SET_CRYPT(ddk, BP_USES_CRYPT(bp));
}

void
ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
{
	ASSERT0(ddp->ddp_phys_birth);

	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
		ddp->ddp_dva[d] = bp->blk_dva[d];
	ddp->ddp_phys_birth = BP_GET_BIRTH(bp);
}

void
ddt_phys_clear(ddt_phys_t *ddp)
{
	memset(ddp, 0, sizeof (*ddp));
}

void
ddt_phys_addref(ddt_phys_t *ddp)
{
	ddp->ddp_refcnt++;
}

void
ddt_phys_decref(ddt_phys_t *ddp)
{
	if (ddp) {
		ASSERT3U(ddp->ddp_refcnt, >, 0);
		ddp->ddp_refcnt--;
	}
}

static void
ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
{
	blkptr_t blk;

	ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);

	/*
	 * We clear the dedup bit so that zio_free() will actually free the
	 * space, rather than just decrementing the refcount in the DDT.
	 */
	BP_SET_DEDUP(&blk, 0);

	ddt_phys_clear(ddp);
	zio_free(ddt->ddt_spa, txg, &blk);
}

ddt_phys_t *
ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp)
{
	ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys;

	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
		if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
		    BP_GET_BIRTH(bp) == ddp->ddp_phys_birth)
			return (ddp);
	}
	return (NULL);
}

uint64_t
ddt_phys_total_refcnt(const ddt_entry_t *dde)
{
	uint64_t refcnt = 0;

	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
		refcnt += dde->dde_phys[p].ddp_refcnt;

	return (refcnt);
}

ddt_t *
ddt_select(spa_t *spa, const blkptr_t *bp)
{
	ASSERT(DDT_CHECKSUM_VALID(BP_GET_CHECKSUM(bp)));
	return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
}

void
ddt_enter(ddt_t *ddt)
{
	mutex_enter(&ddt->ddt_lock);
}

void
ddt_exit(ddt_t *ddt)
{
	mutex_exit(&ddt->ddt_lock);
}

void
ddt_init(void)
{
	ddt_cache = kmem_cache_create("ddt_cache",
	    sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	ddt_entry_cache = kmem_cache_create("ddt_entry_cache",
	    sizeof (ddt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
}

void
ddt_fini(void)
{
	kmem_cache_destroy(ddt_entry_cache);
	kmem_cache_destroy(ddt_cache);
}

static ddt_entry_t *
ddt_alloc(const ddt_key_t *ddk)
{
	ddt_entry_t *dde;

	dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP);
	memset(dde, 0, sizeof (ddt_entry_t));
	cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);

	dde->dde_key = *ddk;

	return (dde);
}

static void
ddt_free(ddt_entry_t *dde)
{
	for (int p = 0; p < DDT_PHYS_TYPES; p++)
		ASSERT3P(dde->dde_lead_zio[p], ==, NULL);

	if (dde->dde_repair_abd != NULL)
		abd_free(dde->dde_repair_abd);

	cv_destroy(&dde->dde_cv);
	kmem_cache_free(ddt_entry_cache, dde);
}

void
ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
{
	ASSERT(MUTEX_HELD(&ddt->ddt_lock));

	avl_remove(&ddt->ddt_tree, dde);
	ddt_free(dde);
}

static boolean_t
ddt_special_over_quota(spa_t *spa, metaslab_class_t *mc)
{
	if (mc != NULL && metaslab_class_get_space(mc) > 0) {
		/* Over quota if allocating outside of this special class */
		if (spa_syncing_txg(spa) <= spa->spa_dedup_class_full_txg +
		    dedup_class_wait_txgs) {
			/* Waiting for some deferred frees to be processed */
			return (B_TRUE);
		}

		/*
		 * We're considered over quota when we hit 85% full, or for
		 * larger drives, when there is less than 8GB free.
		 */
		uint64_t allocated = metaslab_class_get_alloc(mc);
		uint64_t capacity = metaslab_class_get_space(mc);
		uint64_t limit = MAX(capacity * 85 / 100,
		    (capacity > (1LL<<33)) ? capacity - (1LL<<33) : 0);

		return (allocated >= limit);
	}
	return (B_FALSE);
}
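
/*
 * Worked example of the limit above: for a 40GiB class the limit is
 * MAX(34GiB, 40GiB - 8GiB) = 34GiB, so the 85% term governs; for a 1TiB
 * class it is MAX(~870GiB, 1016GiB) = 1016GiB, so the 8GiB-free term
 * governs. The crossover is at ~53GiB, where 15% of the capacity equals
 * 8GiB (1LL << 33 bytes).
 */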
/*
 * Check if the DDT is over its quota. This can be due to a few conditions:
 *   1. 'dedup_table_quota' property is not 0 (none) and the dedup dsize
 *      exceeds this limit
 *
 *   2. 'dedup_table_quota' property is set to automatic and
 *      a. the dedup or special allocation class could not satisfy a DDT
 *         allocation in a recent transaction
 *      b. the dedup or special allocation class has exceeded its 85% limit
 */
static boolean_t
ddt_over_quota(spa_t *spa)
{
	if (spa->spa_dedup_table_quota == 0)
		return (B_FALSE);

	if (spa->spa_dedup_table_quota != UINT64_MAX)
		return (ddt_get_ddt_dsize(spa) > spa->spa_dedup_table_quota);

	/*
	 * For automatic quota, table size is limited by dedup or special class
	 */
	if (ddt_special_over_quota(spa, spa_dedup_class(spa)))
		return (B_TRUE);
	else if (spa_special_has_ddt(spa) &&
	    ddt_special_over_quota(spa, spa_special_class(spa)))
		return (B_TRUE);

	return (B_FALSE);
}

void
ddt_prefetch_all(spa_t *spa)
{
	/*
	 * Load all DDT entries for each type/class combination. This is
	 * intended to perform a prefetch on all such blocks. For the same
	 * reason that ddt_prefetch isn't locked, this is also not locked.
	 */
	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		ddt_t *ddt = spa->spa_ddt[c];
		if (!ddt)
			continue;

		for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
			for (ddt_class_t class = 0; class < DDT_CLASSES;
			    class++) {
				ddt_object_prefetch_all(ddt, type, class);
			}
		}
	}
}

ddt_entry_t *
ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
{
	spa_t *spa = ddt->ddt_spa;
	ddt_key_t search;
	ddt_entry_t *dde;
	ddt_type_t type;
	ddt_class_t class;
	avl_index_t where;
	int error;

	ASSERT(MUTEX_HELD(&ddt->ddt_lock));

	ddt_key_fill(&search, bp);

	/* Find an existing live entry */
	dde = avl_find(&ddt->ddt_tree, &search, &where);
	if (dde != NULL) {
		/* If we went over quota, act like we didn't find it */
		if (dde->dde_flags & DDE_FLAG_OVERQUOTA)
			return (NULL);

		/* If it's already loaded, we can just return it. */
		if (dde->dde_flags & DDE_FLAG_LOADED)
			return (dde);

		/* Someone else is loading it, wait for it. */
		dde->dde_waiters++;
		while (!(dde->dde_flags & DDE_FLAG_LOADED))
			cv_wait(&dde->dde_cv, &ddt->ddt_lock);
		dde->dde_waiters--;

		/* Loaded but over quota, forget we were ever here */
		if (dde->dde_flags & DDE_FLAG_OVERQUOTA) {
			if (dde->dde_waiters == 0) {
				avl_remove(&ddt->ddt_tree, dde);
				ddt_free(dde);
			}
			return (NULL);
		}

		return (dde);
	}

	/* Not found. */
	if (!add)
		return (NULL);

	/* Time to make a new entry. */
	dde = ddt_alloc(&search);
	avl_insert(&ddt->ddt_tree, dde, where);

	/*
	 * ddt_tree is now stable, so unlock and let everyone else keep moving.
	 * Anyone landing on this entry will find it without DDE_FLAG_LOADED,
	 * and go to sleep waiting for it above.
	 */
	ddt_exit(ddt);

	/* Search all store objects for the entry. */
	error = ENOENT;
	for (type = 0; type < DDT_TYPES; type++) {
		for (class = 0; class < DDT_CLASSES; class++) {
			error = ddt_object_lookup(ddt, type, class, dde);
			if (error != ENOENT) {
				ASSERT0(error);
				break;
			}
		}
		if (error != ENOENT)
			break;
	}

	ddt_enter(ddt);

	ASSERT(!(dde->dde_flags & DDE_FLAG_LOADED));

	dde->dde_type = type;	/* will be DDT_TYPES if no entry found */
	dde->dde_class = class;	/* will be DDT_CLASSES if no entry found */

	if (dde->dde_type == DDT_TYPES &&
	    dde->dde_class == DDT_CLASSES &&
	    ddt_over_quota(spa)) {
		/* Over quota. If no one is waiting, clean up right now. */
		if (dde->dde_waiters == 0) {
			avl_remove(&ddt->ddt_tree, dde);
			ddt_free(dde);
			return (NULL);
		}

		/* Flag cleanup required */
		dde->dde_flags |= DDE_FLAG_OVERQUOTA;
	} else if (error == 0) {
		ddt_stat_update(ddt, dde, -1ULL);
	}

	/* Entry loaded, everyone can proceed now */
	dde->dde_flags |= DDE_FLAG_LOADED;
	cv_broadcast(&dde->dde_cv);

	return (dde->dde_flags & DDE_FLAG_OVERQUOTA ? NULL : dde);
}
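
/*
 * A sketch of how the write stage reacts when ddt_lookup() returns NULL
 * above (over quota); the authoritative version is in zio_ddt_write():
 *
 *	ddt_enter(ddt);
 *	dde = ddt_lookup(ddt, bp, B_TRUE);
 *	if (dde == NULL) {
 *		// DDT full: fall back to a regular, non-dedup write
 *		zp->zp_dedup = B_FALSE;
 *		BP_SET_DEDUP(bp, 0);
 *		zio->io_pipeline = ZIO_WRITE_PIPELINE;
 *		ddt_exit(ddt);
 *		return (zio);
 *	}
 */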

void
ddt_prefetch(spa_t *spa, const blkptr_t *bp)
{
	ddt_t *ddt;
	ddt_key_t ddk;

	if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
		return;

	/*
	 * We only remove the DDT once all tables are empty and only
	 * prefetch dedup blocks when there are entries in the DDT.
	 * Thus no locking is required as the DDT can't disappear on us.
	 */
	ddt = ddt_select(spa, bp);
	ddt_key_fill(&ddk, bp);

	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
		for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
			ddt_object_prefetch(ddt, type, class, &ddk);
		}
	}
}

/*
 * Key comparison. Any struct wanting to make use of this function must have
 * the key as the first element.
 */
#define	DDT_KEY_CMP_LEN	(sizeof (ddt_key_t) / sizeof (uint16_t))

typedef struct ddt_key_cmp {
	uint16_t	u16[DDT_KEY_CMP_LEN];
} ddt_key_cmp_t;

int
ddt_key_compare(const void *x1, const void *x2)
{
	const ddt_key_cmp_t *k1 = (const ddt_key_cmp_t *)x1;
	const ddt_key_cmp_t *k2 = (const ddt_key_cmp_t *)x2;
	int32_t cmp = 0;

	for (int i = 0; i < DDT_KEY_CMP_LEN; i++) {
		cmp = (int32_t)k1->u16[i] - (int32_t)k2->u16[i];
		if (likely(cmp))
			break;
	}

	return (TREE_ISIGN(cmp));
}

static ddt_t *
ddt_table_alloc(spa_t *spa, enum zio_checksum c)
{
	ddt_t *ddt;

	ddt = kmem_cache_alloc(ddt_cache, KM_SLEEP);
	memset(ddt, 0, sizeof (ddt_t));

	mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&ddt->ddt_tree, ddt_key_compare,
	    sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
	avl_create(&ddt->ddt_repair_tree, ddt_key_compare,
	    sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
	ddt->ddt_checksum = c;
	ddt->ddt_spa = spa;
	ddt->ddt_os = spa->spa_meta_objset;

	return (ddt);
}

static void
ddt_table_free(ddt_t *ddt)
{
	ASSERT0(avl_numnodes(&ddt->ddt_tree));
	ASSERT0(avl_numnodes(&ddt->ddt_repair_tree));
	avl_destroy(&ddt->ddt_tree);
	avl_destroy(&ddt->ddt_repair_tree);
	mutex_destroy(&ddt->ddt_lock);
	kmem_cache_free(ddt_cache, ddt);
}

void
ddt_create(spa_t *spa)
{
	spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;

	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		if (DDT_CHECKSUM_VALID(c))
			spa->spa_ddt[c] = ddt_table_alloc(spa, c);
	}
}

int
ddt_load(spa_t *spa)
{
	int error;

	ddt_create(spa);

	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
	    &spa->spa_ddt_stat_object);

	if (error)
		return (error == ENOENT ? 0 : error);

	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		if (!DDT_CHECKSUM_VALID(c))
			continue;

		ddt_t *ddt = spa->spa_ddt[c];
		for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
			for (ddt_class_t class = 0; class < DDT_CLASSES;
			    class++) {
				error = ddt_object_load(ddt, type, class);
				if (error != 0 && error != ENOENT)
					return (error);
			}
		}

		/*
		 * Seed the cached histograms.
		 */
		memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
		    sizeof (ddt->ddt_histogram));
		spa->spa_dedup_dspace = ~0ULL;
		spa->spa_dedup_dsize = ~0ULL;
	}

	return (0);
}

void
ddt_unload(spa_t *spa)
{
	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		if (spa->spa_ddt[c]) {
			ddt_table_free(spa->spa_ddt[c]);
			spa->spa_ddt[c] = NULL;
		}
	}
}

boolean_t
ddt_class_contains(spa_t *spa, ddt_class_t max_class, const blkptr_t *bp)
{
	ddt_t *ddt;
	ddt_key_t ddk;

	if (!BP_GET_DEDUP(bp))
		return (B_FALSE);

	if (max_class == DDT_CLASS_UNIQUE)
		return (B_TRUE);

	ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];

	ddt_key_fill(&ddk, bp);

	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
		for (ddt_class_t class = 0; class <= max_class; class++) {
			if (ddt_object_contains(ddt, type, class, &ddk) == 0)
				return (B_TRUE);
		}
	}

	return (B_FALSE);
}
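
/*
 * For example (a sketch assuming the dsl_scan caller): during the top-down
 * phase of a scrub, blocks whose entries still sit at or below the
 * already-walked maximum class are skipped, since the DDT phase visited
 * them already:
 *
 *	if (ddt_class_contains(spa, scn->scn_phys.scn_ddt_class_max, bp))
 *		return;	// already scrubbed via the DDT walk
 */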

ddt_entry_t *
ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
{
	ddt_key_t ddk;
	ddt_entry_t *dde;

	ddt_key_fill(&ddk, bp);

	dde = ddt_alloc(&ddk);

	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
		for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
			/*
			 * We can only do repair if there are multiple copies
			 * of the block. For anything in the UNIQUE class,
			 * there's definitely only one copy, so don't even try.
			 */
			if (class != DDT_CLASS_UNIQUE &&
			    ddt_object_lookup(ddt, type, class, dde) == 0)
				return (dde);
		}
	}

	memset(dde->dde_phys, 0, sizeof (dde->dde_phys));

	return (dde);
}

void
ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
{
	avl_index_t where;

	ddt_enter(ddt);

	if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) &&
	    avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
		avl_insert(&ddt->ddt_repair_tree, dde, where);
	else
		ddt_free(dde);

	ddt_exit(ddt);
}

static void
ddt_repair_entry_done(zio_t *zio)
{
	ddt_entry_t *rdde = zio->io_private;

	ddt_free(rdde);
}

static void
ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
{
	ddt_phys_t *ddp = dde->dde_phys;
	ddt_phys_t *rddp = rdde->dde_phys;
	ddt_key_t *ddk = &dde->dde_key;
	ddt_key_t *rddk = &rdde->dde_key;
	zio_t *zio;
	blkptr_t blk;

	zio = zio_null(rio, rio->io_spa, NULL,
	    ddt_repair_entry_done, rdde, rio->io_flags);

	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) {
		if (ddp->ddp_phys_birth == 0 ||
		    ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
		    memcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
			continue;
		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
		zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
		    rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL,
		    ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
	}

	zio_nowait(zio);
}

static void
ddt_repair_table(ddt_t *ddt, zio_t *rio)
{
	spa_t *spa = ddt->ddt_spa;
	ddt_entry_t *dde, *rdde_next, *rdde;
	avl_tree_t *t = &ddt->ddt_repair_tree;
	blkptr_t blk;

	if (spa_sync_pass(spa) > 1)
		return;

	ddt_enter(ddt);
	for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
		rdde_next = AVL_NEXT(t, rdde);
		avl_remove(&ddt->ddt_repair_tree, rdde);
		ddt_exit(ddt);
		ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
		dde = ddt_repair_start(ddt, &blk);
		ddt_repair_entry(ddt, dde, rdde, rio);
		ddt_repair_done(ddt, dde);
		ddt_enter(ddt);
	}
	ddt_exit(ddt);
}

static void
ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
{
	dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
	ddt_phys_t *ddp = dde->dde_phys;
	ddt_key_t *ddk = &dde->dde_key;
	ddt_type_t otype = dde->dde_type;
	ddt_type_t ntype = DDT_TYPE_DEFAULT;
	ddt_class_t oclass = dde->dde_class;
	ddt_class_t nclass;
	uint64_t total_refcnt = 0;

	ASSERT(dde->dde_flags & DDE_FLAG_LOADED);

	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
		ASSERT3P(dde->dde_lead_zio[p], ==, NULL);
		if (ddp->ddp_phys_birth == 0) {
			ASSERT0(ddp->ddp_refcnt);
			continue;
		}
		if (p == DDT_PHYS_DITTO) {
			/*
			 * Note, we no longer create DDT-DITTO blocks, but we
			 * don't want to leak any written by older software.
			 */
			ddt_phys_free(ddt, ddk, ddp, txg);
			continue;
		}
		if (ddp->ddp_refcnt == 0)
			ddt_phys_free(ddt, ddk, ddp, txg);
		total_refcnt += ddp->ddp_refcnt;
	}

	/* We do not create new DDT-DITTO blocks. */
	ASSERT0(dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth);
	if (total_refcnt > 1)
		nclass = DDT_CLASS_DUPLICATE;
	else
		nclass = DDT_CLASS_UNIQUE;

	if (otype != DDT_TYPES &&
	    (otype != ntype || oclass != nclass || total_refcnt == 0)) {
		VERIFY0(ddt_object_remove(ddt, otype, oclass, ddk, tx));
		ASSERT3U(
		    ddt_object_contains(ddt, otype, oclass, ddk), ==, ENOENT);
	}

	if (total_refcnt != 0) {
		dde->dde_type = ntype;
		dde->dde_class = nclass;
		ddt_stat_update(ddt, dde, 0);
		if (!ddt_object_exists(ddt, ntype, nclass))
			ddt_object_create(ddt, ntype, nclass, tx);
		VERIFY0(ddt_object_update(ddt, ntype, nclass, dde, tx));

		/*
		 * If the class changes, the order that we scan this bp
		 * changes. If it decreases, we could miss it, so
		 * scan it right now. (This covers both class changing
		 * while we are doing ddt_walk(), and when we are
		 * traversing.)
		 */
		if (nclass < oclass) {
			dsl_scan_ddt_entry(dp->dp_scan,
			    ddt->ddt_checksum, dde, tx);
		}
	}
}

static void
ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
{
	spa_t *spa = ddt->ddt_spa;
	ddt_entry_t *dde;
	void *cookie = NULL;

	if (avl_numnodes(&ddt->ddt_tree) == 0)
		return;

	ASSERT3U(spa->spa_uberblock.ub_version, >=, SPA_VERSION_DEDUP);

	if (spa->spa_ddt_stat_object == 0) {
		spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
		    DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_DDT_STATS, tx);
	}

	while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
		ddt_sync_entry(ddt, dde, tx, txg);
		ddt_free(dde);
	}

	for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
		uint64_t add, count = 0;
		for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
			if (ddt_object_exists(ddt, type, class)) {
				ddt_object_sync(ddt, type, class, tx);
				VERIFY0(ddt_object_count(ddt, type, class,
				    &add));
				count += add;
			}
		}
		for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
			if (count == 0 && ddt_object_exists(ddt, type, class))
				ddt_object_destroy(ddt, type, class, tx);
		}
	}

	memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram,
	    sizeof (ddt->ddt_histogram));
	spa->spa_dedup_dspace = ~0ULL;
	spa->spa_dedup_dsize = ~0ULL;
}
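
/*
 * Write any accumulated DDT changes for this txg out to disk. Called once
 * per txg from spa_sync(); the assertion below confirms we are in that
 * txg's sync context.
 */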
void
ddt_sync(spa_t *spa, uint64_t txg)
{
	dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
	dmu_tx_t *tx;
	zio_t *rio;

	ASSERT3U(spa_syncing_txg(spa), ==, txg);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	rio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);

	/*
	 * This function may cause an immediate scan of ddt blocks (see
	 * the comment above dsl_scan_ddt() for details). We set the
	 * scan's root zio here so that we can wait for any scan IOs in
	 * addition to the regular ddt IOs.
	 */
	ASSERT3P(scn->scn_zio_root, ==, NULL);
	scn->scn_zio_root = rio;

	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		ddt_t *ddt = spa->spa_ddt[c];
		if (ddt == NULL)
			continue;
		ddt_sync_table(ddt, tx, txg);
		ddt_repair_table(ddt, rio);
	}

	(void) zio_wait(rio);
	scn->scn_zio_root = NULL;

	dmu_tx_commit(tx);
}

int
ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
{
	do {
		do {
			do {
				ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
				if (ddt == NULL)
					continue;
				int error = ENOENT;
				if (ddt_object_exists(ddt, ddb->ddb_type,
				    ddb->ddb_class)) {
					error = ddt_object_walk(ddt,
					    ddb->ddb_type, ddb->ddb_class,
					    &ddb->ddb_cursor, dde);
				}
				dde->dde_type = ddb->ddb_type;
				dde->dde_class = ddb->ddb_class;
				if (error == 0)
					return (0);
				if (error != ENOENT)
					return (error);
				ddb->ddb_cursor = 0;
			} while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
			ddb->ddb_checksum = 0;
		} while (++ddb->ddb_type < DDT_TYPES);
		ddb->ddb_type = 0;
	} while (++ddb->ddb_class < DDT_CLASSES);

	return (SET_ERROR(ENOENT));
}

/*
 * This function is used by Block Cloning (brt.c) to increase the reference
 * count for the DDT entry if the block is already in the DDT.
 *
 * Returns false if the block, despite having the D bit set, is not present
 * in the DDT. Currently this is not possible, but might be in the future.
 * See the comment below.
 */
boolean_t
ddt_addref(spa_t *spa, const blkptr_t *bp)
{
	ddt_t *ddt;
	ddt_entry_t *dde;
	boolean_t result;

	spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
	ddt = ddt_select(spa, bp);
	ddt_enter(ddt);

	dde = ddt_lookup(ddt, bp, B_TRUE);

	/* Can be NULL if the entry for this block was pruned. */
	if (dde == NULL) {
		ddt_exit(ddt);
		spa_config_exit(spa, SCL_ZIO, FTAG);
		return (B_FALSE);
	}

	if (dde->dde_type < DDT_TYPES) {
		ddt_phys_t *ddp;

		ASSERT3S(dde->dde_class, <, DDT_CLASSES);

		ddp = &dde->dde_phys[BP_GET_NDVAS(bp)];

		/*
		 * This entry already existed (dde_type is real), so it must
		 * have refcnt >0 at the start of this txg. We are called from
		 * brt_pending_apply(), before frees are issued, so the refcnt
		 * can't be lowered yet. Therefore, it must be >0. We assert
		 * this because if the order of BRT and DDT interactions were
		 * ever to change and the refcnt was ever zero here, then
		 * likely further action is required to fill out the DDT entry,
		 * and this is a place that is likely to be missed in testing.
		 */
		ASSERT3U(ddp->ddp_refcnt, >, 0);

		ddt_phys_addref(ddp);
		result = B_TRUE;
	} else {
		/*
		 * At the time of implementing this, if the block has the
		 * DEDUP flag set it must exist in the DEDUP table, but there
		 * are many advocates that want the ability to remove entries
		 * from the DDT with refcnt=1. If that happens, we may have a
		 * block with the DEDUP bit set, but which doesn't have a
		 * corresponding entry in the DDT. Be ready.
		 */
		ASSERT3S(dde->dde_class, ==, DDT_CLASSES);
		ddt_remove(ddt, dde);
		result = B_FALSE;
	}

	ddt_exit(ddt);
	spa_config_exit(spa, SCL_ZIO, FTAG);

	return (result);
}

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW,
	"Enable prefetching dedup-ed blks");