// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2023, Klara Inc.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/ddt.h>
#include <sys/dmu_tx.h>
#include <sys/dmu.h>
#include <sys/ddt_impl.h>
#include <sys/dnode.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>

/*
 * No more than this many txgs before swapping logs.
 */
uint_t zfs_dedup_log_txg_max = 8;

/*
 * Max memory for the log AVL trees. If zfs_dedup_log_mem_max is zero at module
 * load, it will be set to zfs_dedup_log_mem_max_percent% of total memory.
 */
uint64_t zfs_dedup_log_mem_max = 0;
uint_t zfs_dedup_log_mem_max_percent = 1;

static kmem_cache_t *ddt_log_entry_flat_cache;
static kmem_cache_t *ddt_log_entry_trad_cache;

#define DDT_LOG_ENTRY_FLAT_SIZE \
        (sizeof (ddt_log_entry_t) + DDT_FLAT_PHYS_SIZE)
#define DDT_LOG_ENTRY_TRAD_SIZE \
        (sizeof (ddt_log_entry_t) + DDT_TRAD_PHYS_SIZE)

#define DDT_LOG_ENTRY_SIZE(ddt) \
        _DDT_PHYS_SWITCH(ddt, DDT_LOG_ENTRY_FLAT_SIZE, DDT_LOG_ENTRY_TRAD_SIZE)

void
ddt_log_init(void)
{
        ddt_log_entry_flat_cache = kmem_cache_create("ddt_log_entry_flat_cache",
            DDT_LOG_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
        ddt_log_entry_trad_cache = kmem_cache_create("ddt_log_entry_trad_cache",
            DDT_LOG_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);

        /*
         * Max memory for log AVL entries. At least 1M, because we need
         * something (that's ~3800 entries per tree). They can say 100% if they
         * want; it just means they're at the mercy of the txg flush limit.
         */
        if (zfs_dedup_log_mem_max == 0) {
                zfs_dedup_log_mem_max_percent =
                    MIN(zfs_dedup_log_mem_max_percent, 100);
                zfs_dedup_log_mem_max = (physmem * PAGESIZE) *
                    zfs_dedup_log_mem_max_percent / 100;
        }
        zfs_dedup_log_mem_max = MAX(zfs_dedup_log_mem_max, 1*1024*1024);
}
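
/*
 * Illustrative arithmetic only (not enforced anywhere in the code): with
 * the default zfs_dedup_log_mem_max_percent of 1 on a machine with 16 GiB
 * of RAM, the limit above resolves to about 164 MiB; ddt_log_swap() below
 * then budgets half of that (~82 MiB) to each of the two log trees.
 */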

void
ddt_log_fini(void)
{
        kmem_cache_destroy(ddt_log_entry_trad_cache);
        kmem_cache_destroy(ddt_log_entry_flat_cache);
}

static void
ddt_log_name(ddt_t *ddt, char *name, uint_t n)
{
        snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_LOG,
            zio_checksum_table[ddt->ddt_checksum].ci_name, n);
}

static void
ddt_log_update_header(ddt_t *ddt, ddt_log_t *ddl, dmu_tx_t *tx)
{
        dmu_buf_t *db;
        VERIFY0(dmu_bonus_hold(ddt->ddt_os, ddl->ddl_object, FTAG, &db));
        dmu_buf_will_dirty(db, tx);

        ddt_log_header_t *hdr = (ddt_log_header_t *)db->db_data;
        DLH_SET_VERSION(hdr, 1);
        DLH_SET_FLAGS(hdr, ddl->ddl_flags);
        hdr->dlh_length = ddl->ddl_length;
        hdr->dlh_first_txg = ddl->ddl_first_txg;
        hdr->dlh_checkpoint = ddl->ddl_checkpoint;

        dmu_buf_rele(db, FTAG);
}

static void
ddt_log_create_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
{
        ASSERT3U(ddt->ddt_dir_object, >, 0);
        ASSERT0(ddl->ddl_object);

        char name[DDT_NAMELEN];
        ddt_log_name(ddt, name, n);

        ddl->ddl_object = dmu_object_alloc(ddt->ddt_os,
            DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
            DMU_OTN_UINT64_METADATA, sizeof (ddt_log_header_t), tx);
        VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, name,
            sizeof (uint64_t), 1, &ddl->ddl_object, tx));
        ddl->ddl_length = 0;
        ddl->ddl_first_txg = tx->tx_txg;
        ddt_log_update_header(ddt, ddl, tx);
}

static void
ddt_log_create(ddt_t *ddt, dmu_tx_t *tx)
{
        ddt_log_create_one(ddt, ddt->ddt_log_active, 0, tx);
        ddt_log_create_one(ddt, ddt->ddt_log_flushing, 1, tx);
}

static void
ddt_log_destroy_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
{
        ASSERT3U(ddt->ddt_dir_object, >, 0);

        if (ddl->ddl_object == 0)
                return;

        ASSERT0(ddl->ddl_length);

        char name[DDT_NAMELEN];
        ddt_log_name(ddt, name, n);

        VERIFY0(zap_remove(ddt->ddt_os, ddt->ddt_dir_object, name, tx));
        VERIFY0(dmu_object_free(ddt->ddt_os, ddl->ddl_object, tx));

        ddl->ddl_object = 0;
}

void
ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx)
{
        ddt_log_destroy_one(ddt, ddt->ddt_log_active, 0, tx);
        ddt_log_destroy_one(ddt, ddt->ddt_log_flushing, 1, tx);
}
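
/*
 * For reference: assuming DMU_POOL_DDT_LOG expands to a printf format along
 * the lines of "DDT-log-%s-%u", the two objects created above for a sha256
 * table are registered in the DDT directory ZAP under names like
 * "DDT-log-sha256-0" and "DDT-log-sha256-1" (initially the active and
 * flushing logs respectively, though the roles trade places on every swap).
 */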

static void
ddt_log_update_stats(ddt_t *ddt)
{
        /*
         * Log object stats. We count the number of live entries in the log
         * tree, even if that is more than the number on disk, and even if
         * the same entry is on both the append and flush trees, because
         * that's more what the user expects to see. This does mean the
         * on-disk size is not really correlated with the number of entries,
         * but I don't think that's reasonable to expect anyway.
         */
        dmu_object_info_t doi;
        uint64_t nblocks = 0;
        if (dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object,
            &doi) == 0)
                nblocks += doi.doi_physical_blocks_512;
        if (dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object,
            &doi) == 0)
                nblocks += doi.doi_physical_blocks_512;

        ddt_object_t *ddo = &ddt->ddt_log_stats;
        ddo->ddo_count =
            avl_numnodes(&ddt->ddt_log_active->ddl_tree) +
            avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
        ddo->ddo_mspace = ddo->ddo_count * DDT_LOG_ENTRY_SIZE(ddt);
        ddo->ddo_dspace = nblocks << 9;
}

void
ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu)
{
        ASSERT3U(nentries, >, 0);
        ASSERT0P(dlu->dlu_dbp);

        if (ddt->ddt_log_active->ddl_object == 0)
                ddt_log_create(ddt, tx);

        /*
         * We want to store as many entries as we can in a block, but never
         * split an entry across block boundaries.
         */
        size_t reclen = P2ALIGN_TYPED(
            sizeof (ddt_log_record_t) + sizeof (ddt_log_record_entry_t) +
            DDT_PHYS_SIZE(ddt), sizeof (uint64_t), size_t);
        ASSERT3U(reclen, <=, UINT16_MAX);
        dlu->dlu_reclen = reclen;

        VERIFY0(dnode_hold(ddt->ddt_os, ddt->ddt_log_active->ddl_object, FTAG,
            &dlu->dlu_dn));
        dnode_set_storage_type(dlu->dlu_dn, DMU_OT_DDT_ZAP);

        uint64_t nblocks = howmany(nentries,
            dlu->dlu_dn->dn_datablksz / dlu->dlu_reclen);
        uint64_t offset = ddt->ddt_log_active->ddl_length;
        uint64_t length = nblocks * dlu->dlu_dn->dn_datablksz;

        VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length,
            B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp,
            DMU_READ_NO_PREFETCH | DMU_UNCACHEDIO));

        dlu->dlu_tx = tx;
        dlu->dlu_block = dlu->dlu_offset = 0;
}
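
/*
 * Worked example of the sizing above, with illustrative numbers only: if
 * dn_datablksz is 128K (SPA_OLD_MAXBLOCKSIZE, as set at create time) and
 * the aligned record length comes out to, say, 112 bytes, then each block
 * holds 131072 / 112 = 1170 whole records, the trailing 32 bytes are left
 * as padding, and nblocks = howmany(nentries, 1170).
 */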

static ddt_log_entry_t *
ddt_log_alloc_entry(ddt_t *ddt)
{
        ddt_log_entry_t *ddle;

        if (ddt->ddt_flags & DDT_FLAG_FLAT) {
                ddle = kmem_cache_alloc(ddt_log_entry_flat_cache, KM_SLEEP);
                memset(ddle, 0, DDT_LOG_ENTRY_FLAT_SIZE);
        } else {
                ddle = kmem_cache_alloc(ddt_log_entry_trad_cache, KM_SLEEP);
                memset(ddle, 0, DDT_LOG_ENTRY_TRAD_SIZE);
        }

        return (ddle);
}

static void
ddt_log_free_entry(ddt_t *ddt, ddt_log_entry_t *ddle)
{
        kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
            ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
}

static void
ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
{
        /* Create the log tree entry from a live or stored entry */
        avl_index_t where;
        ddt_log_entry_t *ddle =
            avl_find(&ddl->ddl_tree, &ddlwe->ddlwe_key, &where);
        if (ddle == NULL) {
                ddle = ddt_log_alloc_entry(ddt);
                ddle->ddle_key = ddlwe->ddlwe_key;
                avl_insert(&ddl->ddl_tree, ddle, where);
        }
        ddle->ddle_type = ddlwe->ddlwe_type;
        ddle->ddle_class = ddlwe->ddlwe_class;
        memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
}

void
ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu)
{
        ASSERT3U(dlu->dlu_dbp, !=, NULL);

        ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe);
        ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe);

        /* Get our block */
        ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
        dmu_buf_t *db = dlu->dlu_dbp[dlu->dlu_block];

        /*
         * If this would take us past the end of the block, finish it and
         * move to the next one.
         */
        if (db->db_size < (dlu->dlu_offset + dlu->dlu_reclen)) {
                ASSERT3U(dlu->dlu_offset, >, 0);
                dmu_buf_fill_done(db, dlu->dlu_tx, B_FALSE);
                dlu->dlu_block++;
                dlu->dlu_offset = 0;
                ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
                db = dlu->dlu_dbp[dlu->dlu_block];
        }

        /*
         * If this is the first time touching the block, inform the DMU that
         * we will fill it, and zero it out.
         */
        if (dlu->dlu_offset == 0) {
                dmu_buf_will_fill_flags(db, dlu->dlu_tx, B_FALSE,
                    DMU_UNCACHEDIO);
                memset(db->db_data, 0, db->db_size);
        }

        /* Create the log record directly in the buffer */
        ddt_log_record_t *dlr = (db->db_data + dlu->dlu_offset);
        DLR_SET_TYPE(dlr, DLR_ENTRY);
        DLR_SET_RECLEN(dlr, dlu->dlu_reclen);
        DLR_SET_ENTRY_TYPE(dlr, ddlwe->ddlwe_type);
        DLR_SET_ENTRY_CLASS(dlr, ddlwe->ddlwe_class);

        ddt_log_record_entry_t *dlre =
            (ddt_log_record_entry_t *)&dlr->dlr_payload;
        dlre->dlre_key = ddlwe->ddlwe_key;
        memcpy(dlre->dlre_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));

        /* Advance offset for next record. */
        dlu->dlu_offset += dlu->dlu_reclen;
}

void
ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu)
{
        ASSERT3U(dlu->dlu_dbp, !=, NULL);
        ASSERT3U(dlu->dlu_block+1, ==, dlu->dlu_ndbp);
        ASSERT3U(dlu->dlu_offset, >, 0);

        /*
         * Close out the last block. Whatever we haven't used will be zeroed,
         * which matches DLR_INVALID, so we can detect this during load.
         */
        dmu_buf_fill_done(dlu->dlu_dbp[dlu->dlu_block], dlu->dlu_tx, B_FALSE);

        dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, FTAG);

        ddt->ddt_log_active->ddl_length +=
            dlu->dlu_ndbp * (uint64_t)dlu->dlu_dn->dn_datablksz;
        dnode_rele(dlu->dlu_dn, FTAG);

        ddt_log_update_header(ddt, ddt->ddt_log_active, dlu->dlu_tx);

        memset(dlu, 0, sizeof (ddt_log_update_t));

        ddt_log_update_stats(ddt);
}
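
/*
 * A minimal sketch of the intended calling sequence (the real caller is
 * the dedup sync path in ddt.c; the loop shape and locals here are
 * invented for illustration):
 *
 *      ddt_log_update_t dlu = {0};
 *      ddt_log_begin(ddt, nentries, tx, &dlu);
 *      while (have_more_entries())     // hypothetical
 *              ddt_log_entry(ddt, &ddlwe, &dlu);
 *      ddt_log_commit(ddt, &dlu);
 *
 * Note that at least one ddt_log_entry() call must land between begin and
 * commit, per the dlu_offset > 0 assertion in ddt_log_commit().
 */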

boolean_t
ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
{
        ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
        if (ddle == NULL)
                return (B_FALSE);

        DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);

        ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe);

        avl_remove(&ddl->ddl_tree, ddle);
        ddt_log_free_entry(ddt, ddle);

        return (B_TRUE);
}

boolean_t
ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk)
{
        ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL);
        if (ddle == NULL)
                return (B_FALSE);

        ddt_lightweight_entry_t ddlwe;
        DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
        ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);

        avl_remove(&ddl->ddl_tree, ddle);
        ddt_log_free_entry(ddt, ddle);

        return (B_TRUE);
}

boolean_t
ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk,
    ddt_lightweight_entry_t *ddlwe)
{
        ddt_log_entry_t *ddle =
            avl_find(&ddt->ddt_log_active->ddl_tree, ddk, NULL);
        if (!ddle)
                ddle = avl_find(&ddt->ddt_log_flushing->ddl_tree, ddk, NULL);
        if (!ddle)
                return (B_FALSE);
        if (ddlwe)
                DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
        return (B_TRUE);
}

void
ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
{
        ddt_log_t *ddl = ddt->ddt_log_flushing;

        ASSERT3U(ddl->ddl_object, !=, 0);

#ifdef ZFS_DEBUG
        /*
         * There should not be any entries on the log tree before the given
         * checkpoint. Assert that this is the case.
         */
        ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
        if (ddle != NULL)
                VERIFY3U(ddt_key_compare(&ddle->ddle_key, &ddlwe->ddlwe_key),
                    >, 0);
#endif

        ddl->ddl_flags |= DDL_FLAG_CHECKPOINT;
        ddl->ddl_checkpoint = ddlwe->ddlwe_key;
        ddt_log_update_header(ddt, ddl, tx);

        ddt_log_update_stats(ddt);
}

void
ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx)
{
        ddt_log_t *ddl = ddt->ddt_log_flushing;

        if (ddl->ddl_object == 0)
                return;

        ASSERT(avl_is_empty(&ddl->ddl_tree));

        /* Eject the entire object */
        dmu_free_range(ddt->ddt_os, ddl->ddl_object, 0, DMU_OBJECT_END, tx);

        ddl->ddl_length = 0;
        ddl->ddl_flags &= ~DDL_FLAG_CHECKPOINT;
        memset(&ddl->ddl_checkpoint, 0, sizeof (ddt_key_t));
        ddt_log_update_header(ddt, ddl, tx);

        ddt_log_update_stats(ddt);
}

boolean_t
ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx)
{
        /* Swap the logs. The old flushing one must be empty */
        VERIFY(avl_is_empty(&ddt->ddt_log_flushing->ddl_tree));

        /*
         * If there are still blocks on the flushing log, truncate it first.
         * This can happen if there were entries on the flushing log that were
         * removed in memory via ddt_lookup(); their vestigial remains are
         * on disk.
         */
        if (ddt->ddt_log_flushing->ddl_length > 0)
                ddt_log_truncate(ddt, tx);

        /*
         * Swap policy. We swap the logs (and so begin flushing) when the
         * active tree grows too large, or when we haven't swapped it in
         * some amount of time, or if something has requested the logs be
         * flushed ASAP (see ddt_walk_init()).
         */

        /*
         * The log tree is too large if the memory usage of its entries is over
         * half of the memory limit. This effectively gives each log tree half
         * the available memory.
         */
        const boolean_t too_large =
            (avl_numnodes(&ddt->ddt_log_active->ddl_tree) *
            DDT_LOG_ENTRY_SIZE(ddt)) >= (zfs_dedup_log_mem_max >> 1);

        const boolean_t too_old =
            tx->tx_txg >=
            (ddt->ddt_log_active->ddl_first_txg +
            MAX(1, zfs_dedup_log_txg_max));

        const boolean_t force =
            ddt->ddt_log_active->ddl_first_txg <= ddt->ddt_flush_force_txg;

        if (!(too_large || too_old || force))
                return (B_FALSE);

        ddt_log_t *swap = ddt->ddt_log_active;
        ddt->ddt_log_active = ddt->ddt_log_flushing;
        ddt->ddt_log_flushing = swap;

        ASSERT(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING);
        ddt->ddt_log_active->ddl_flags &=
            ~(DDL_FLAG_FLUSHING | DDL_FLAG_CHECKPOINT);

        ASSERT(!(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING));
        ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;

        ddt->ddt_log_active->ddl_first_txg = tx->tx_txg;

        ddt_log_update_header(ddt, ddt->ddt_log_active, tx);
        ddt_log_update_header(ddt, ddt->ddt_log_flushing, tx);

        ddt_log_update_stats(ddt);

        return (B_TRUE);
}
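
/*
 * Putting the two limits together (illustrative, with defaults): the active
 * log swaps after at most MAX(1, zfs_dedup_log_txg_max) = 8 txgs, or as
 * soon as its tree crosses zfs_dedup_log_mem_max / 2. With the 1M floor set
 * in ddt_log_init(), that per-tree half-share corresponds to the "~3800
 * entries per tree" mentioned there.
 */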

static inline void
ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr,
    const ddt_key_t *checkpoint)
{
        ASSERT3U(DLR_GET_TYPE(dlr), ==, DLR_ENTRY);

        ddt_log_record_entry_t *dlre =
            (ddt_log_record_entry_t *)dlr->dlr_payload;
        if (checkpoint != NULL &&
            ddt_key_compare(&dlre->dlre_key, checkpoint) <= 0) {
                /* Skip pre-checkpoint entries; they're already flushed. */
                return;
        }

        ddt_lightweight_entry_t ddlwe;
        ddlwe.ddlwe_type = DLR_GET_ENTRY_TYPE(dlr);
        ddlwe.ddlwe_class = DLR_GET_ENTRY_CLASS(dlr);

        ddlwe.ddlwe_key = dlre->dlre_key;
        memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt));

        ddt_log_update_entry(ddt, ddl, &ddlwe);
}

static void
ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl)
{
        void *cookie = NULL;
        ddt_log_entry_t *ddle;
        IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree));
        while ((ddle =
            avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) {
                ddt_log_free_entry(ddt, ddle);
        }
        ASSERT(avl_is_empty(&ddl->ddl_tree));
}
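
/*
 * For orientation, a single log block as read back by ddt_log_load_one()
 * below looks roughly like this (sizes not to scale):
 *
 *      +--------------+--------------+-- ... --+--------------+-0...0-+
 *      | dlr | entry  | dlr | entry  |         | dlr | entry  |  pad  |
 *      +--------------+--------------+-- ... --+--------------+-------+
 *
 * Records are contiguous and never cross a block boundary; the unused tail
 * was zeroed by ddt_log_entry(), so it reads back as DLR_INVALID and
 * terminates the in-block scan.
 */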

static int
ddt_log_load_one(ddt_t *ddt, uint_t n)
{
        ASSERT3U(n, <, 2);

        ddt_log_t *ddl = &ddt->ddt_log[n];

        char name[DDT_NAMELEN];
        ddt_log_name(ddt, name, n);

        uint64_t obj;
        int err = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name,
            sizeof (uint64_t), 1, &obj);
        if (err == ENOENT)
                return (0);
        if (err != 0)
                return (err);

        dnode_t *dn;
        err = dnode_hold(ddt->ddt_os, obj, FTAG, &dn);
        if (err != 0)
                return (err);

        ddt_log_header_t hdr;
        dmu_buf_t *db;
        err = dmu_bonus_hold_by_dnode(dn, FTAG, &db, DMU_READ_NO_PREFETCH);
        if (err != 0) {
                dnode_rele(dn, FTAG);
                return (err);
        }
        memcpy(&hdr, db->db_data, sizeof (ddt_log_header_t));
        dmu_buf_rele(db, FTAG);

        if (DLH_GET_VERSION(&hdr) != 1) {
                dnode_rele(dn, FTAG);
                zfs_dbgmsg("ddt_log_load: spa=%s ddt_log=%s "
                    "unknown version=%llu", spa_name(ddt->ddt_spa), name,
                    (u_longlong_t)DLH_GET_VERSION(&hdr));
                return (SET_ERROR(EINVAL));
        }

        ddt_key_t *checkpoint = NULL;
        if (DLH_GET_FLAGS(&hdr) & DDL_FLAG_CHECKPOINT) {
                /*
                 * If the log has a checkpoint, then we can ignore any entries
                 * that have already been flushed.
                 */
                ASSERT(DLH_GET_FLAGS(&hdr) & DDL_FLAG_FLUSHING);
                checkpoint = &hdr.dlh_checkpoint;
        }

        if (hdr.dlh_length > 0) {
                dmu_prefetch_by_dnode(dn, 0, 0, hdr.dlh_length,
                    ZIO_PRIORITY_SYNC_READ);

                for (uint64_t offset = 0; offset < hdr.dlh_length;
                    offset += dn->dn_datablksz) {
                        err = dmu_buf_hold_by_dnode(dn, offset, FTAG, &db,
                            DMU_READ_PREFETCH | DMU_UNCACHEDIO);
                        if (err != 0) {
                                dnode_rele(dn, FTAG);
                                ddt_log_empty(ddt, ddl);
                                return (err);
                        }

                        uint64_t boffset = 0;
                        while (boffset < db->db_size) {
                                ddt_log_record_t *dlr =
                                    (ddt_log_record_t *)(db->db_data + boffset);

                                /* Partially-filled block, skip the rest */
                                if (DLR_GET_TYPE(dlr) == DLR_INVALID)
                                        break;

                                switch (DLR_GET_TYPE(dlr)) {
                                case DLR_ENTRY:
                                        ddt_log_load_entry(ddt, ddl, dlr,
                                            checkpoint);
                                        break;

                                default:
                                        dmu_buf_rele(db, FTAG);
                                        dnode_rele(dn, FTAG);
                                        ddt_log_empty(ddt, ddl);
                                        return (SET_ERROR(EINVAL));
                                }

                                boffset += DLR_GET_RECLEN(dlr);
                        }

                        dmu_buf_rele(db, FTAG);
                }
        }

        dnode_rele(dn, FTAG);

        ddl->ddl_object = obj;
        ddl->ddl_flags = DLH_GET_FLAGS(&hdr);
        ddl->ddl_length = hdr.dlh_length;
        ddl->ddl_first_txg = hdr.dlh_first_txg;

        if (ddl->ddl_flags & DDL_FLAG_FLUSHING)
                ddt->ddt_log_flushing = ddl;
        else
                ddt->ddt_log_active = ddl;

        return (0);
}

int
ddt_log_load(ddt_t *ddt)
{
        int err;

        if (spa_load_state(ddt->ddt_spa) == SPA_LOAD_TRYIMPORT) {
                /*
                 * The DDT is going to be freed again in a moment, so there's
                 * no point loading the log; it'll just slow down import.
                 */
                return (0);
        }

        ASSERT0(ddt->ddt_log[0].ddl_object);
        ASSERT0(ddt->ddt_log[1].ddl_object);
        if (ddt->ddt_dir_object == 0) {
                /*
                 * If we're configured but the containing dir doesn't exist
                 * yet, then the log object can't possibly exist either.
                 */
                ASSERT3U(ddt->ddt_version, !=, UINT64_MAX);
                return (SET_ERROR(ENOENT));
        }

        if ((err = ddt_log_load_one(ddt, 0)) != 0)
                return (err);
        if ((err = ddt_log_load_one(ddt, 1)) != 0)
                return (err);

        VERIFY3P(ddt->ddt_log_active, !=, ddt->ddt_log_flushing);
        VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING));
        VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_CHECKPOINT));
        VERIFY(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING);

        /*
         * We have two finalisation tasks:
         *
         * - rebuild the histogram. We do this at the end rather than while
         *   we're loading so we don't need to uncount and recount entries
         *   that appear multiple times in the log.
         *
         * - remove entries from the flushing tree that are on both trees.
         *   This happens when ddt_lookup() rehydrates an entry from the
         *   flushing tree, as ddt_log_remove_key() removes the entry from
         *   the in-memory tree but doesn't remove it from disk.
         */

        /*
         * We don't technically need a config lock here, since there shouldn't
         * be pool config changes during DDT load. dva_get_dsize_sync() via
         * ddt_stat_generate() is expecting it though, and it won't hurt
         * anything, so we take it.
         */
        spa_config_enter(ddt->ddt_spa, SCL_STATE, FTAG, RW_READER);

        avl_tree_t *al = &ddt->ddt_log_active->ddl_tree;
        avl_tree_t *fl = &ddt->ddt_log_flushing->ddl_tree;
        ddt_log_entry_t *ae = avl_first(al);
        ddt_log_entry_t *fe = avl_first(fl);
        while (ae != NULL || fe != NULL) {
                ddt_log_entry_t *ddle;
                if (ae == NULL) {
                        /* active exhausted, take flushing */
                        ddle = fe;
                        fe = AVL_NEXT(fl, fe);
                } else if (fe == NULL) {
                        /* flushing exhausted, take active */
                        ddle = ae;
                        ae = AVL_NEXT(al, ae);
                } else {
                        /* compare active and flushing */
                        int c = ddt_key_compare(&ae->ddle_key, &fe->ddle_key);
                        if (c < 0) {
                                /* active behind, take and advance */
                                ddle = ae;
                                ae = AVL_NEXT(al, ae);
                        } else if (c > 0) {
                                /* flushing behind, take and advance */
                                ddle = fe;
                                fe = AVL_NEXT(fl, fe);
                        } else {
                                /* match. remove from flushing, take active */
                                ddle = fe;
                                fe = AVL_NEXT(fl, fe);
                                avl_remove(fl, ddle);
                                ddt_log_free_entry(ddt, ddle);
                                ddle = ae;
                                ae = AVL_NEXT(al, ae);
                        }
                }

                ddt_lightweight_entry_t ddlwe;
                DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
                ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
        }

        spa_config_exit(ddt->ddt_spa, SCL_STATE, FTAG);

        ddt_log_update_stats(ddt);

        return (0);
}

void
ddt_log_alloc(ddt_t *ddt)
{
        ASSERT0P(ddt->ddt_log_active);
        ASSERT0P(ddt->ddt_log_flushing);

        avl_create(&ddt->ddt_log[0].ddl_tree, ddt_key_compare,
            sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
        avl_create(&ddt->ddt_log[1].ddl_tree, ddt_key_compare,
            sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
        ddt->ddt_log_active = &ddt->ddt_log[0];
        ddt->ddt_log_flushing = &ddt->ddt_log[1];
        ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;
}

void
ddt_log_free(ddt_t *ddt)
{
        ddt_log_empty(ddt, &ddt->ddt_log[0]);
        ddt_log_empty(ddt, &ddt->ddt_log[1]);
        avl_destroy(&ddt->ddt_log[0].ddl_tree);
        avl_destroy(&ddt->ddt_log[1].ddl_tree);
}

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_txg_max, UINT, ZMOD_RW,
        "Max transactions before starting to flush dedup logs");

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max, U64, ZMOD_RD,
        "Max memory for dedup logs");

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max_percent, UINT, ZMOD_RD,
        "Max memory for dedup logs, as % of total memory");