1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2012, 2016 by Delphix. All rights reserved. 25 * Copyright (c) 2022 by Pawel Jakub Dawidek 26 * Copyright (c) 2023, Klara Inc. 27 */ 28 29 #include <sys/zfs_context.h> 30 #include <sys/spa.h> 31 #include <sys/spa_impl.h> 32 #include <sys/zio.h> 33 #include <sys/ddt.h> 34 #include <sys/ddt_impl.h> 35 #include <sys/zap.h> 36 #include <sys/dmu_tx.h> 37 #include <sys/arc.h> 38 #include <sys/dsl_pool.h> 39 #include <sys/zio_checksum.h> 40 #include <sys/dsl_scan.h> 41 #include <sys/abd.h> 42 43 /* 44 * # DDT: Deduplication tables 45 * 46 * The dedup subsystem provides block-level deduplication. When enabled, blocks 47 * to be written will have the dedup (D) bit set, which causes them to be 48 * tracked in a "dedup table", or DDT. If a block has been seen before (exists 49 * in the DDT), instead of being written, it will instead be made to reference 50 * the existing on-disk data, and a refcount bumped in the DDT instead. 51 * 52 * ## Dedup tables and entries 53 * 54 * Conceptually, a DDT is a dictionary or map. Each entry has a "key" 55 * (ddt_key_t) made up a block's checksum and certian properties, and a "value" 56 * (one or more ddt_phys_t) containing valid DVAs for the block's data, birth 57 * time and refcount. Together these are enough to track references to a 58 * specific block, to build a valid block pointer to reference that block (for 59 * freeing, scrubbing, etc), and to fill a new block pointer with the missing 60 * pieces to make it seem like it was written. 61 * 62 * There's a single DDT (ddt_t) for each checksum type, held in spa_ddt[]. 63 * Within each DDT, there can be multiple storage "types" (ddt_type_t, on-disk 64 * object data formats, each with their own implementations) and "classes" 65 * (ddt_class_t, instance of a storage type object, for entries with a specific 66 * characteristic). An entry (key) will only ever exist on one of these objects 67 * at any given time, but may be moved from one to another if their type or 68 * class changes. 69 * 70 * The DDT is driven by the write IO pipeline (zio_ddt_write()). When a block 71 * is to be written, before DVAs have been allocated, ddt_lookup() is called to 72 * see if the block has been seen before. If its not found, the write proceeds 73 * as normal, and after it succeeds, a new entry is created. If it is found, we 74 * fill the BP with the DVAs from the entry, increment the refcount and cause 75 * the write IO to return immediately. 76 * 77 * Each ddt_phys_t slot in the entry represents a separate dedup block for the 78 * same content/checksum. The slot is selected based on the zp_copies parameter 79 * the block is written with, that is, the number of DVAs in the block. The 80 * "ditto" slot (DDT_PHYS_DITTO) used to be used for now-removed "dedupditto" 81 * feature. These are no longer written, and will be freed if encountered on 82 * old pools. 83 * 84 * ## Lifetime of an entry 85 * 86 * A DDT can be enormous, and typically is not held in memory all at once. 87 * Instead, the changes to an entry are tracked in memory, and written down to 88 * disk at the end of each txg. 89 * 90 * A "live" in-memory entry (ddt_entry_t) is a node on the live tree 91 * (ddt_tree). At the start of a txg, ddt_tree is empty. When an entry is 92 * required for IO, ddt_lookup() is called. If an entry already exists on 93 * ddt_tree, it is returned. Otherwise, a new one is created, and the 94 * type/class objects for the DDT are searched for that key. If its found, its 95 * value is copied into the live entry. If not, an empty entry is created. 96 * 97 * The live entry will be modified during the txg, usually by modifying the 98 * refcount, but sometimes by adding or updating DVAs. At the end of the txg 99 * (during spa_sync()), type and class are recalculated for entry (see 100 * ddt_sync_entry()), and the entry is written to the appropriate storage 101 * object and (if necessary), removed from an old one. ddt_tree is cleared and 102 * the next txg can start. 103 * 104 * ## Repair IO 105 * 106 * If a read on a dedup block fails, but there are other copies of the block in 107 * the other ddt_phys_t slots, reads will be issued for those instead 108 * (zio_ddt_read_start()). If one of those succeeds, the read is returned to 109 * the caller, and a copy is stashed on the entry's dde_repair_abd. 110 * 111 * During the end-of-txg sync, any entries with a dde_repair_abd get a 112 * "rewrite" write issued for the original block pointer, with the data read 113 * from the alternate block. If the block is actually damaged, this will invoke 114 * the pool's "self-healing" mechanism, and repair the block. 115 * 116 * ## Scanning (scrub/resilver) 117 * 118 * If dedup is active, the scrub machinery will walk the dedup table first, and 119 * scrub all blocks with refcnt > 1 first. After that it will move on to the 120 * regular top-down scrub, and exclude the refcnt > 1 blocks when it sees them. 121 * In this way, heavily deduplicated blocks are only scrubbed once. See the 122 * commentary on dsl_scan_ddt() for more details. 123 * 124 * Walking the DDT is done via ddt_walk(). The current position is stored in a 125 * ddt_bookmark_t, which represents a stable position in the storage object. 126 * This bookmark is stored by the scan machinery, and must reference the same 127 * position on the object even if the object changes, the pool is exported, or 128 * OpenZFS is upgraded. 129 * 130 * ## Interaction with block cloning 131 * 132 * If block cloning and dedup are both enabled on a pool, BRT will look for the 133 * dedup bit on an incoming block pointer. If set, it will call into the DDT 134 * (ddt_addref()) to add a reference to the block, instead of adding a 135 * reference to the BRT. See brt_pending_apply(). 136 */ 137 138 /* 139 * These are the only checksums valid for dedup. They must match the list 140 * from dedup_table in zfs_prop.c 141 */ 142 #define DDT_CHECKSUM_VALID(c) \ 143 (c == ZIO_CHECKSUM_SHA256 || c == ZIO_CHECKSUM_SHA512 || \ 144 c == ZIO_CHECKSUM_SKEIN || c == ZIO_CHECKSUM_EDONR || \ 145 c == ZIO_CHECKSUM_BLAKE3) 146 147 static kmem_cache_t *ddt_cache; 148 static kmem_cache_t *ddt_entry_cache; 149 150 /* 151 * Enable/disable prefetching of dedup-ed blocks which are going to be freed. 152 */ 153 int zfs_dedup_prefetch = 0; 154 155 static const ddt_ops_t *const ddt_ops[DDT_TYPES] = { 156 &ddt_zap_ops, 157 }; 158 159 static const char *const ddt_class_name[DDT_CLASSES] = { 160 "ditto", 161 "duplicate", 162 "unique", 163 }; 164 165 static void 166 ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class, 167 dmu_tx_t *tx) 168 { 169 spa_t *spa = ddt->ddt_spa; 170 objset_t *os = ddt->ddt_os; 171 uint64_t *objectp = &ddt->ddt_object[type][class]; 172 boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags & 173 ZCHECKSUM_FLAG_DEDUP; 174 char name[DDT_NAMELEN]; 175 176 ddt_object_name(ddt, type, class, name); 177 178 ASSERT3U(*objectp, ==, 0); 179 VERIFY0(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash)); 180 ASSERT3U(*objectp, !=, 0); 181 182 VERIFY0(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name, 183 sizeof (uint64_t), 1, objectp, tx)); 184 185 VERIFY0(zap_add(os, spa->spa_ddt_stat_object, name, 186 sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), 187 &ddt->ddt_histogram[type][class], tx)); 188 } 189 190 static void 191 ddt_object_destroy(ddt_t *ddt, ddt_type_t type, ddt_class_t class, 192 dmu_tx_t *tx) 193 { 194 spa_t *spa = ddt->ddt_spa; 195 objset_t *os = ddt->ddt_os; 196 uint64_t *objectp = &ddt->ddt_object[type][class]; 197 uint64_t count; 198 char name[DDT_NAMELEN]; 199 200 ddt_object_name(ddt, type, class, name); 201 202 ASSERT3U(*objectp, !=, 0); 203 ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class])); 204 VERIFY0(ddt_object_count(ddt, type, class, &count)); 205 VERIFY0(count); 206 VERIFY0(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx)); 207 VERIFY0(zap_remove(os, spa->spa_ddt_stat_object, name, tx)); 208 VERIFY0(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx)); 209 memset(&ddt->ddt_object_stats[type][class], 0, sizeof (ddt_object_t)); 210 211 *objectp = 0; 212 } 213 214 static int 215 ddt_object_load(ddt_t *ddt, ddt_type_t type, ddt_class_t class) 216 { 217 ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; 218 dmu_object_info_t doi; 219 uint64_t count; 220 char name[DDT_NAMELEN]; 221 int error; 222 223 ddt_object_name(ddt, type, class, name); 224 225 error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, 226 sizeof (uint64_t), 1, &ddt->ddt_object[type][class]); 227 if (error != 0) 228 return (error); 229 230 error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, 231 sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), 232 &ddt->ddt_histogram[type][class]); 233 if (error != 0) 234 return (error); 235 236 /* 237 * Seed the cached statistics. 238 */ 239 error = ddt_object_info(ddt, type, class, &doi); 240 if (error) 241 return (error); 242 243 error = ddt_object_count(ddt, type, class, &count); 244 if (error) 245 return (error); 246 247 ddo->ddo_count = count; 248 ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; 249 ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; 250 251 return (0); 252 } 253 254 static void 255 ddt_object_sync(ddt_t *ddt, ddt_type_t type, ddt_class_t class, 256 dmu_tx_t *tx) 257 { 258 ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; 259 dmu_object_info_t doi; 260 uint64_t count; 261 char name[DDT_NAMELEN]; 262 263 ddt_object_name(ddt, type, class, name); 264 265 VERIFY0(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, 266 sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), 267 &ddt->ddt_histogram[type][class], tx)); 268 269 /* 270 * Cache DDT statistics; this is the only time they'll change. 271 */ 272 VERIFY0(ddt_object_info(ddt, type, class, &doi)); 273 VERIFY0(ddt_object_count(ddt, type, class, &count)); 274 275 ddo->ddo_count = count; 276 ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; 277 ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; 278 } 279 280 static boolean_t 281 ddt_object_exists(ddt_t *ddt, ddt_type_t type, ddt_class_t class) 282 { 283 return (!!ddt->ddt_object[type][class]); 284 } 285 286 static int 287 ddt_object_lookup(ddt_t *ddt, ddt_type_t type, ddt_class_t class, 288 ddt_entry_t *dde) 289 { 290 if (!ddt_object_exists(ddt, type, class)) 291 return (SET_ERROR(ENOENT)); 292 293 return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, 294 ddt->ddt_object[type][class], &dde->dde_key, 295 dde->dde_phys, sizeof (dde->dde_phys))); 296 } 297 298 static int 299 ddt_object_contains(ddt_t *ddt, ddt_type_t type, ddt_class_t class, 300 const ddt_key_t *ddk) 301 { 302 if (!ddt_object_exists(ddt, type, class)) 303 return (SET_ERROR(ENOENT)); 304 305 return (ddt_ops[type]->ddt_op_contains(ddt->ddt_os, 306 ddt->ddt_object[type][class], ddk)); 307 } 308 309 static void 310 ddt_object_prefetch(ddt_t *ddt, ddt_type_t type, ddt_class_t class, 311 const ddt_key_t *ddk) 312 { 313 if (!ddt_object_exists(ddt, type, class)) 314 return; 315 316 ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os, 317 ddt->ddt_object[type][class], ddk); 318 } 319 320 static int 321 ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class, 322 ddt_entry_t *dde, dmu_tx_t *tx) 323 { 324 ASSERT(ddt_object_exists(ddt, type, class)); 325 326 return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, 327 ddt->ddt_object[type][class], &dde->dde_key, dde->dde_phys, 328 sizeof (dde->dde_phys), tx)); 329 } 330 331 static int 332 ddt_object_remove(ddt_t *ddt, ddt_type_t type, ddt_class_t class, 333 const ddt_key_t *ddk, dmu_tx_t *tx) 334 { 335 ASSERT(ddt_object_exists(ddt, type, class)); 336 337 return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os, 338 ddt->ddt_object[type][class], ddk, tx)); 339 } 340 341 int 342 ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t class, 343 uint64_t *walk, ddt_entry_t *dde) 344 { 345 ASSERT(ddt_object_exists(ddt, type, class)); 346 347 return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os, 348 ddt->ddt_object[type][class], walk, &dde->dde_key, 349 dde->dde_phys, sizeof (dde->dde_phys))); 350 } 351 352 int 353 ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t class, 354 uint64_t *count) 355 { 356 ASSERT(ddt_object_exists(ddt, type, class)); 357 358 return (ddt_ops[type]->ddt_op_count(ddt->ddt_os, 359 ddt->ddt_object[type][class], count)); 360 } 361 362 int 363 ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t class, 364 dmu_object_info_t *doi) 365 { 366 if (!ddt_object_exists(ddt, type, class)) 367 return (SET_ERROR(ENOENT)); 368 369 return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class], 370 doi)); 371 } 372 373 void 374 ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t class, 375 char *name) 376 { 377 (void) snprintf(name, DDT_NAMELEN, DMU_POOL_DDT, 378 zio_checksum_table[ddt->ddt_checksum].ci_name, 379 ddt_ops[type]->ddt_op_name, ddt_class_name[class]); 380 } 381 382 void 383 ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) 384 { 385 ASSERT3U(txg, !=, 0); 386 387 for (int d = 0; d < SPA_DVAS_PER_BP; d++) 388 bp->blk_dva[d] = ddp->ddp_dva[d]; 389 BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth); 390 } 391 392 /* 393 * The bp created via this function may be used for repairs and scrub, but it 394 * will be missing the salt / IV required to do a full decrypting read. 395 */ 396 void 397 ddt_bp_create(enum zio_checksum checksum, 398 const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp) 399 { 400 BP_ZERO(bp); 401 402 if (ddp != NULL) 403 ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth); 404 405 bp->blk_cksum = ddk->ddk_cksum; 406 407 BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk)); 408 BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk)); 409 BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk)); 410 BP_SET_CRYPT(bp, DDK_GET_CRYPT(ddk)); 411 BP_SET_FILL(bp, 1); 412 BP_SET_CHECKSUM(bp, checksum); 413 BP_SET_TYPE(bp, DMU_OT_DEDUP); 414 BP_SET_LEVEL(bp, 0); 415 BP_SET_DEDUP(bp, 1); 416 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 417 } 418 419 void 420 ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) 421 { 422 ddk->ddk_cksum = bp->blk_cksum; 423 ddk->ddk_prop = 0; 424 425 ASSERT(BP_IS_ENCRYPTED(bp) || !BP_USES_CRYPT(bp)); 426 427 DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp)); 428 DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp)); 429 DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp)); 430 DDK_SET_CRYPT(ddk, BP_USES_CRYPT(bp)); 431 } 432 433 void 434 ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp) 435 { 436 ASSERT0(ddp->ddp_phys_birth); 437 438 for (int d = 0; d < SPA_DVAS_PER_BP; d++) 439 ddp->ddp_dva[d] = bp->blk_dva[d]; 440 ddp->ddp_phys_birth = BP_GET_BIRTH(bp); 441 } 442 443 void 444 ddt_phys_clear(ddt_phys_t *ddp) 445 { 446 memset(ddp, 0, sizeof (*ddp)); 447 } 448 449 void 450 ddt_phys_addref(ddt_phys_t *ddp) 451 { 452 ddp->ddp_refcnt++; 453 } 454 455 void 456 ddt_phys_decref(ddt_phys_t *ddp) 457 { 458 if (ddp) { 459 ASSERT3U(ddp->ddp_refcnt, >, 0); 460 ddp->ddp_refcnt--; 461 } 462 } 463 464 static void 465 ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) 466 { 467 blkptr_t blk; 468 469 ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); 470 471 /* 472 * We clear the dedup bit so that zio_free() will actually free the 473 * space, rather than just decrementing the refcount in the DDT. 474 */ 475 BP_SET_DEDUP(&blk, 0); 476 477 ddt_phys_clear(ddp); 478 zio_free(ddt->ddt_spa, txg, &blk); 479 } 480 481 ddt_phys_t * 482 ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp) 483 { 484 ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys; 485 486 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 487 if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) && 488 BP_GET_BIRTH(bp) == ddp->ddp_phys_birth) 489 return (ddp); 490 } 491 return (NULL); 492 } 493 494 uint64_t 495 ddt_phys_total_refcnt(const ddt_entry_t *dde) 496 { 497 uint64_t refcnt = 0; 498 499 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) 500 refcnt += dde->dde_phys[p].ddp_refcnt; 501 502 return (refcnt); 503 } 504 505 ddt_t * 506 ddt_select(spa_t *spa, const blkptr_t *bp) 507 { 508 ASSERT(DDT_CHECKSUM_VALID(BP_GET_CHECKSUM(bp))); 509 return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]); 510 } 511 512 void 513 ddt_enter(ddt_t *ddt) 514 { 515 mutex_enter(&ddt->ddt_lock); 516 } 517 518 void 519 ddt_exit(ddt_t *ddt) 520 { 521 mutex_exit(&ddt->ddt_lock); 522 } 523 524 void 525 ddt_init(void) 526 { 527 ddt_cache = kmem_cache_create("ddt_cache", 528 sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 529 ddt_entry_cache = kmem_cache_create("ddt_entry_cache", 530 sizeof (ddt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 531 } 532 533 void 534 ddt_fini(void) 535 { 536 kmem_cache_destroy(ddt_entry_cache); 537 kmem_cache_destroy(ddt_cache); 538 } 539 540 static ddt_entry_t * 541 ddt_alloc(const ddt_key_t *ddk) 542 { 543 ddt_entry_t *dde; 544 545 dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP); 546 memset(dde, 0, sizeof (ddt_entry_t)); 547 cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); 548 549 dde->dde_key = *ddk; 550 551 return (dde); 552 } 553 554 static void 555 ddt_free(ddt_entry_t *dde) 556 { 557 ASSERT(dde->dde_flags & DDE_FLAG_LOADED); 558 559 for (int p = 0; p < DDT_PHYS_TYPES; p++) 560 ASSERT3P(dde->dde_lead_zio[p], ==, NULL); 561 562 if (dde->dde_repair_abd != NULL) 563 abd_free(dde->dde_repair_abd); 564 565 cv_destroy(&dde->dde_cv); 566 kmem_cache_free(ddt_entry_cache, dde); 567 } 568 569 void 570 ddt_remove(ddt_t *ddt, ddt_entry_t *dde) 571 { 572 ASSERT(MUTEX_HELD(&ddt->ddt_lock)); 573 574 avl_remove(&ddt->ddt_tree, dde); 575 ddt_free(dde); 576 } 577 578 ddt_entry_t * 579 ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) 580 { 581 ddt_key_t search; 582 ddt_entry_t *dde; 583 ddt_type_t type; 584 ddt_class_t class; 585 avl_index_t where; 586 int error; 587 588 ASSERT(MUTEX_HELD(&ddt->ddt_lock)); 589 590 ddt_key_fill(&search, bp); 591 592 /* Find an existing live entry */ 593 dde = avl_find(&ddt->ddt_tree, &search, &where); 594 if (dde != NULL) { 595 /* Found it. If it's already loaded, we can just return it. */ 596 if (dde->dde_flags & DDE_FLAG_LOADED) 597 return (dde); 598 599 /* Someone else is loading it, wait for it. */ 600 while (!(dde->dde_flags & DDE_FLAG_LOADED)) 601 cv_wait(&dde->dde_cv, &ddt->ddt_lock); 602 603 return (dde); 604 } 605 606 /* Not found. */ 607 if (!add) 608 return (NULL); 609 610 /* Time to make a new entry. */ 611 dde = ddt_alloc(&search); 612 avl_insert(&ddt->ddt_tree, dde, where); 613 614 /* 615 * ddt_tree is now stable, so unlock and let everyone else keep moving. 616 * Anyone landing on this entry will find it without DDE_FLAG_LOADED, 617 * and go to sleep waiting for it above. 618 */ 619 ddt_exit(ddt); 620 621 /* Search all store objects for the entry. */ 622 error = ENOENT; 623 for (type = 0; type < DDT_TYPES; type++) { 624 for (class = 0; class < DDT_CLASSES; class++) { 625 error = ddt_object_lookup(ddt, type, class, dde); 626 if (error != ENOENT) { 627 ASSERT0(error); 628 break; 629 } 630 } 631 if (error != ENOENT) 632 break; 633 } 634 635 ddt_enter(ddt); 636 637 ASSERT(!(dde->dde_flags & DDE_FLAG_LOADED)); 638 639 dde->dde_type = type; /* will be DDT_TYPES if no entry found */ 640 dde->dde_class = class; /* will be DDT_CLASSES if no entry found */ 641 642 if (error == 0) 643 ddt_stat_update(ddt, dde, -1ULL); 644 645 /* Entry loaded, everyone can proceed now */ 646 dde->dde_flags |= DDE_FLAG_LOADED; 647 cv_broadcast(&dde->dde_cv); 648 649 return (dde); 650 } 651 652 void 653 ddt_prefetch(spa_t *spa, const blkptr_t *bp) 654 { 655 ddt_t *ddt; 656 ddt_key_t ddk; 657 658 if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp)) 659 return; 660 661 /* 662 * We only remove the DDT once all tables are empty and only 663 * prefetch dedup blocks when there are entries in the DDT. 664 * Thus no locking is required as the DDT can't disappear on us. 665 */ 666 ddt = ddt_select(spa, bp); 667 ddt_key_fill(&ddk, bp); 668 669 for (ddt_type_t type = 0; type < DDT_TYPES; type++) { 670 for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { 671 ddt_object_prefetch(ddt, type, class, &ddk); 672 } 673 } 674 } 675 676 /* 677 * Key comparison. Any struct wanting to make use of this function must have 678 * the key as the first element. 679 */ 680 #define DDT_KEY_CMP_LEN (sizeof (ddt_key_t) / sizeof (uint16_t)) 681 682 typedef struct ddt_key_cmp { 683 uint16_t u16[DDT_KEY_CMP_LEN]; 684 } ddt_key_cmp_t; 685 686 int 687 ddt_key_compare(const void *x1, const void *x2) 688 { 689 const ddt_key_cmp_t *k1 = (const ddt_key_cmp_t *)x1; 690 const ddt_key_cmp_t *k2 = (const ddt_key_cmp_t *)x2; 691 int32_t cmp = 0; 692 693 for (int i = 0; i < DDT_KEY_CMP_LEN; i++) { 694 cmp = (int32_t)k1->u16[i] - (int32_t)k2->u16[i]; 695 if (likely(cmp)) 696 break; 697 } 698 699 return (TREE_ISIGN(cmp)); 700 } 701 702 static ddt_t * 703 ddt_table_alloc(spa_t *spa, enum zio_checksum c) 704 { 705 ddt_t *ddt; 706 707 ddt = kmem_cache_alloc(ddt_cache, KM_SLEEP); 708 memset(ddt, 0, sizeof (ddt_t)); 709 710 mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL); 711 avl_create(&ddt->ddt_tree, ddt_key_compare, 712 sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); 713 avl_create(&ddt->ddt_repair_tree, ddt_key_compare, 714 sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); 715 ddt->ddt_checksum = c; 716 ddt->ddt_spa = spa; 717 ddt->ddt_os = spa->spa_meta_objset; 718 719 return (ddt); 720 } 721 722 static void 723 ddt_table_free(ddt_t *ddt) 724 { 725 ASSERT0(avl_numnodes(&ddt->ddt_tree)); 726 ASSERT0(avl_numnodes(&ddt->ddt_repair_tree)); 727 avl_destroy(&ddt->ddt_tree); 728 avl_destroy(&ddt->ddt_repair_tree); 729 mutex_destroy(&ddt->ddt_lock); 730 kmem_cache_free(ddt_cache, ddt); 731 } 732 733 void 734 ddt_create(spa_t *spa) 735 { 736 spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM; 737 738 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 739 if (DDT_CHECKSUM_VALID(c)) 740 spa->spa_ddt[c] = ddt_table_alloc(spa, c); 741 } 742 } 743 744 int 745 ddt_load(spa_t *spa) 746 { 747 int error; 748 749 ddt_create(spa); 750 751 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 752 DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, 753 &spa->spa_ddt_stat_object); 754 755 if (error) 756 return (error == ENOENT ? 0 : error); 757 758 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 759 if (!DDT_CHECKSUM_VALID(c)) 760 continue; 761 762 ddt_t *ddt = spa->spa_ddt[c]; 763 for (ddt_type_t type = 0; type < DDT_TYPES; type++) { 764 for (ddt_class_t class = 0; class < DDT_CLASSES; 765 class++) { 766 error = ddt_object_load(ddt, type, class); 767 if (error != 0 && error != ENOENT) 768 return (error); 769 } 770 } 771 772 /* 773 * Seed the cached histograms. 774 */ 775 memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, 776 sizeof (ddt->ddt_histogram)); 777 spa->spa_dedup_dspace = ~0ULL; 778 } 779 780 return (0); 781 } 782 783 void 784 ddt_unload(spa_t *spa) 785 { 786 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 787 if (spa->spa_ddt[c]) { 788 ddt_table_free(spa->spa_ddt[c]); 789 spa->spa_ddt[c] = NULL; 790 } 791 } 792 } 793 794 boolean_t 795 ddt_class_contains(spa_t *spa, ddt_class_t max_class, const blkptr_t *bp) 796 { 797 ddt_t *ddt; 798 ddt_key_t ddk; 799 800 if (!BP_GET_DEDUP(bp)) 801 return (B_FALSE); 802 803 if (max_class == DDT_CLASS_UNIQUE) 804 return (B_TRUE); 805 806 ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)]; 807 808 ddt_key_fill(&ddk, bp); 809 810 for (ddt_type_t type = 0; type < DDT_TYPES; type++) { 811 for (ddt_class_t class = 0; class <= max_class; class++) { 812 if (ddt_object_contains(ddt, type, class, &ddk) == 0) 813 return (B_TRUE); 814 } 815 } 816 817 return (B_FALSE); 818 } 819 820 ddt_entry_t * 821 ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) 822 { 823 ddt_key_t ddk; 824 ddt_entry_t *dde; 825 826 ddt_key_fill(&ddk, bp); 827 828 dde = ddt_alloc(&ddk); 829 830 for (ddt_type_t type = 0; type < DDT_TYPES; type++) { 831 for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { 832 /* 833 * We can only do repair if there are multiple copies 834 * of the block. For anything in the UNIQUE class, 835 * there's definitely only one copy, so don't even try. 836 */ 837 if (class != DDT_CLASS_UNIQUE && 838 ddt_object_lookup(ddt, type, class, dde) == 0) 839 return (dde); 840 } 841 } 842 843 memset(dde->dde_phys, 0, sizeof (dde->dde_phys)); 844 845 return (dde); 846 } 847 848 void 849 ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) 850 { 851 avl_index_t where; 852 853 ddt_enter(ddt); 854 855 if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) && 856 avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL) 857 avl_insert(&ddt->ddt_repair_tree, dde, where); 858 else 859 ddt_free(dde); 860 861 ddt_exit(ddt); 862 } 863 864 static void 865 ddt_repair_entry_done(zio_t *zio) 866 { 867 ddt_entry_t *rdde = zio->io_private; 868 869 ddt_free(rdde); 870 } 871 872 static void 873 ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) 874 { 875 ddt_phys_t *ddp = dde->dde_phys; 876 ddt_phys_t *rddp = rdde->dde_phys; 877 ddt_key_t *ddk = &dde->dde_key; 878 ddt_key_t *rddk = &rdde->dde_key; 879 zio_t *zio; 880 blkptr_t blk; 881 882 zio = zio_null(rio, rio->io_spa, NULL, 883 ddt_repair_entry_done, rdde, rio->io_flags); 884 885 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) { 886 if (ddp->ddp_phys_birth == 0 || 887 ddp->ddp_phys_birth != rddp->ddp_phys_birth || 888 memcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva))) 889 continue; 890 ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); 891 zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, 892 rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL, 893 ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL)); 894 } 895 896 zio_nowait(zio); 897 } 898 899 static void 900 ddt_repair_table(ddt_t *ddt, zio_t *rio) 901 { 902 spa_t *spa = ddt->ddt_spa; 903 ddt_entry_t *dde, *rdde_next, *rdde; 904 avl_tree_t *t = &ddt->ddt_repair_tree; 905 blkptr_t blk; 906 907 if (spa_sync_pass(spa) > 1) 908 return; 909 910 ddt_enter(ddt); 911 for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) { 912 rdde_next = AVL_NEXT(t, rdde); 913 avl_remove(&ddt->ddt_repair_tree, rdde); 914 ddt_exit(ddt); 915 ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk); 916 dde = ddt_repair_start(ddt, &blk); 917 ddt_repair_entry(ddt, dde, rdde, rio); 918 ddt_repair_done(ddt, dde); 919 ddt_enter(ddt); 920 } 921 ddt_exit(ddt); 922 } 923 924 static void 925 ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) 926 { 927 dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool; 928 ddt_phys_t *ddp = dde->dde_phys; 929 ddt_key_t *ddk = &dde->dde_key; 930 ddt_type_t otype = dde->dde_type; 931 ddt_type_t ntype = DDT_TYPE_DEFAULT; 932 ddt_class_t oclass = dde->dde_class; 933 ddt_class_t nclass; 934 uint64_t total_refcnt = 0; 935 936 ASSERT(dde->dde_flags & DDE_FLAG_LOADED); 937 938 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 939 ASSERT3P(dde->dde_lead_zio[p], ==, NULL); 940 if (ddp->ddp_phys_birth == 0) { 941 ASSERT0(ddp->ddp_refcnt); 942 continue; 943 } 944 if (p == DDT_PHYS_DITTO) { 945 /* 946 * Note, we no longer create DDT-DITTO blocks, but we 947 * don't want to leak any written by older software. 948 */ 949 ddt_phys_free(ddt, ddk, ddp, txg); 950 continue; 951 } 952 if (ddp->ddp_refcnt == 0) 953 ddt_phys_free(ddt, ddk, ddp, txg); 954 total_refcnt += ddp->ddp_refcnt; 955 } 956 957 /* We do not create new DDT-DITTO blocks. */ 958 ASSERT0(dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth); 959 if (total_refcnt > 1) 960 nclass = DDT_CLASS_DUPLICATE; 961 else 962 nclass = DDT_CLASS_UNIQUE; 963 964 if (otype != DDT_TYPES && 965 (otype != ntype || oclass != nclass || total_refcnt == 0)) { 966 VERIFY0(ddt_object_remove(ddt, otype, oclass, ddk, tx)); 967 ASSERT3U( 968 ddt_object_contains(ddt, otype, oclass, ddk), ==, ENOENT); 969 } 970 971 if (total_refcnt != 0) { 972 dde->dde_type = ntype; 973 dde->dde_class = nclass; 974 ddt_stat_update(ddt, dde, 0); 975 if (!ddt_object_exists(ddt, ntype, nclass)) 976 ddt_object_create(ddt, ntype, nclass, tx); 977 VERIFY0(ddt_object_update(ddt, ntype, nclass, dde, tx)); 978 979 /* 980 * If the class changes, the order that we scan this bp 981 * changes. If it decreases, we could miss it, so 982 * scan it right now. (This covers both class changing 983 * while we are doing ddt_walk(), and when we are 984 * traversing.) 985 */ 986 if (nclass < oclass) { 987 dsl_scan_ddt_entry(dp->dp_scan, 988 ddt->ddt_checksum, dde, tx); 989 } 990 } 991 } 992 993 static void 994 ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) 995 { 996 spa_t *spa = ddt->ddt_spa; 997 ddt_entry_t *dde; 998 void *cookie = NULL; 999 1000 if (avl_numnodes(&ddt->ddt_tree) == 0) 1001 return; 1002 1003 ASSERT3U(spa->spa_uberblock.ub_version, >=, SPA_VERSION_DEDUP); 1004 1005 if (spa->spa_ddt_stat_object == 0) { 1006 spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os, 1007 DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT, 1008 DMU_POOL_DDT_STATS, tx); 1009 } 1010 1011 while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { 1012 ddt_sync_entry(ddt, dde, tx, txg); 1013 ddt_free(dde); 1014 } 1015 1016 for (ddt_type_t type = 0; type < DDT_TYPES; type++) { 1017 uint64_t add, count = 0; 1018 for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { 1019 if (ddt_object_exists(ddt, type, class)) { 1020 ddt_object_sync(ddt, type, class, tx); 1021 VERIFY0(ddt_object_count(ddt, type, class, 1022 &add)); 1023 count += add; 1024 } 1025 } 1026 for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { 1027 if (count == 0 && ddt_object_exists(ddt, type, class)) 1028 ddt_object_destroy(ddt, type, class, tx); 1029 } 1030 } 1031 1032 memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, 1033 sizeof (ddt->ddt_histogram)); 1034 spa->spa_dedup_dspace = ~0ULL; 1035 } 1036 1037 void 1038 ddt_sync(spa_t *spa, uint64_t txg) 1039 { 1040 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; 1041 dmu_tx_t *tx; 1042 zio_t *rio; 1043 1044 ASSERT3U(spa_syncing_txg(spa), ==, txg); 1045 1046 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 1047 1048 rio = zio_root(spa, NULL, NULL, 1049 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL); 1050 1051 /* 1052 * This function may cause an immediate scan of ddt blocks (see 1053 * the comment above dsl_scan_ddt() for details). We set the 1054 * scan's root zio here so that we can wait for any scan IOs in 1055 * addition to the regular ddt IOs. 1056 */ 1057 ASSERT3P(scn->scn_zio_root, ==, NULL); 1058 scn->scn_zio_root = rio; 1059 1060 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 1061 ddt_t *ddt = spa->spa_ddt[c]; 1062 if (ddt == NULL) 1063 continue; 1064 ddt_sync_table(ddt, tx, txg); 1065 ddt_repair_table(ddt, rio); 1066 } 1067 1068 (void) zio_wait(rio); 1069 scn->scn_zio_root = NULL; 1070 1071 dmu_tx_commit(tx); 1072 } 1073 1074 int 1075 ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) 1076 { 1077 do { 1078 do { 1079 do { 1080 ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum]; 1081 if (ddt == NULL) 1082 continue; 1083 int error = ENOENT; 1084 if (ddt_object_exists(ddt, ddb->ddb_type, 1085 ddb->ddb_class)) { 1086 error = ddt_object_walk(ddt, 1087 ddb->ddb_type, ddb->ddb_class, 1088 &ddb->ddb_cursor, dde); 1089 } 1090 dde->dde_type = ddb->ddb_type; 1091 dde->dde_class = ddb->ddb_class; 1092 if (error == 0) 1093 return (0); 1094 if (error != ENOENT) 1095 return (error); 1096 ddb->ddb_cursor = 0; 1097 } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS); 1098 ddb->ddb_checksum = 0; 1099 } while (++ddb->ddb_type < DDT_TYPES); 1100 ddb->ddb_type = 0; 1101 } while (++ddb->ddb_class < DDT_CLASSES); 1102 1103 return (SET_ERROR(ENOENT)); 1104 } 1105 1106 /* 1107 * This function is used by Block Cloning (brt.c) to increase reference 1108 * counter for the DDT entry if the block is already in DDT. 1109 * 1110 * Return false if the block, despite having the D bit set, is not present 1111 * in the DDT. Currently this is not possible but might be in the future. 1112 * See the comment below. 1113 */ 1114 boolean_t 1115 ddt_addref(spa_t *spa, const blkptr_t *bp) 1116 { 1117 ddt_t *ddt; 1118 ddt_entry_t *dde; 1119 boolean_t result; 1120 1121 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); 1122 ddt = ddt_select(spa, bp); 1123 ddt_enter(ddt); 1124 1125 dde = ddt_lookup(ddt, bp, B_TRUE); 1126 ASSERT3P(dde, !=, NULL); 1127 1128 if (dde->dde_type < DDT_TYPES) { 1129 ddt_phys_t *ddp; 1130 1131 ASSERT3S(dde->dde_class, <, DDT_CLASSES); 1132 1133 ddp = &dde->dde_phys[BP_GET_NDVAS(bp)]; 1134 1135 /* 1136 * This entry already existed (dde_type is real), so it must 1137 * have refcnt >0 at the start of this txg. We are called from 1138 * brt_pending_apply(), before frees are issued, so the refcnt 1139 * can't be lowered yet. Therefore, it must be >0. We assert 1140 * this because if the order of BRT and DDT interactions were 1141 * ever to change and the refcnt was ever zero here, then 1142 * likely further action is required to fill out the DDT entry, 1143 * and this is a place that is likely to be missed in testing. 1144 */ 1145 ASSERT3U(ddp->ddp_refcnt, >, 0); 1146 1147 ddt_phys_addref(ddp); 1148 result = B_TRUE; 1149 } else { 1150 /* 1151 * At the time of implementating this if the block has the 1152 * DEDUP flag set it must exist in the DEDUP table, but 1153 * there are many advocates that want ability to remove 1154 * entries from DDT with refcnt=1. If this will happen, 1155 * we may have a block with the DEDUP set, but which doesn't 1156 * have a corresponding entry in the DDT. Be ready. 1157 */ 1158 ASSERT3S(dde->dde_class, ==, DDT_CLASSES); 1159 ddt_remove(ddt, dde); 1160 result = B_FALSE; 1161 } 1162 1163 ddt_exit(ddt); 1164 spa_config_exit(spa, SCL_ZIO, FTAG); 1165 1166 return (result); 1167 } 1168 1169 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW, 1170 "Enable prefetching dedup-ed blks"); 1171