/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2023, Klara Inc.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/ddt.h>
#include <sys/dmu_tx.h>
#include <sys/dmu.h>
#include <sys/ddt_impl.h>
#include <sys/dnode.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>

/*
 * No more than this many txgs before swapping logs.
 */
uint_t zfs_dedup_log_txg_max = 8;

/*
 * Max memory for the log AVL trees. If zfs_dedup_log_mem_max is zero at module
 * load, it will be set to zfs_dedup_log_mem_max_percent% of total memory.
 */
uint64_t zfs_dedup_log_mem_max = 0;
uint_t zfs_dedup_log_mem_max_percent = 1;


static kmem_cache_t *ddt_log_entry_flat_cache;
static kmem_cache_t *ddt_log_entry_trad_cache;

#define	DDT_LOG_ENTRY_FLAT_SIZE	\
	(sizeof (ddt_log_entry_t) + DDT_FLAT_PHYS_SIZE)
#define	DDT_LOG_ENTRY_TRAD_SIZE	\
	(sizeof (ddt_log_entry_t) + DDT_TRAD_PHYS_SIZE)

#define	DDT_LOG_ENTRY_SIZE(ddt)	\
	_DDT_PHYS_SWITCH(ddt, DDT_LOG_ENTRY_FLAT_SIZE, DDT_LOG_ENTRY_TRAD_SIZE)

void
ddt_log_init(void)
{
	ddt_log_entry_flat_cache = kmem_cache_create("ddt_log_entry_flat_cache",
	    DDT_LOG_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
	ddt_log_entry_trad_cache = kmem_cache_create("ddt_log_entry_trad_cache",
	    DDT_LOG_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);

	/*
	 * Max memory for log AVL entries. At least 1M, because we need
	 * something (that's ~3800 entries per tree). They can say 100% if they
	 * want; it just means they're at the mercy of the txg flush limit.
	 */
	if (zfs_dedup_log_mem_max == 0) {
		zfs_dedup_log_mem_max_percent =
		    MIN(zfs_dedup_log_mem_max_percent, 100);
		zfs_dedup_log_mem_max = (physmem * PAGESIZE) *
		    zfs_dedup_log_mem_max_percent / 100;
	}
	zfs_dedup_log_mem_max = MAX(zfs_dedup_log_mem_max, 1*1024*1024);
}
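
/*
 * To make the sizing above concrete, a worked example (all figures are
 * illustrative, not measured from any particular system): on a 16 GiB
 * machine with the default zfs_dedup_log_mem_max_percent of 1,
 *
 *	zfs_dedup_log_mem_max = 16 GiB / 100 ~= 164 MiB
 *
 * ddt_log_swap() treats half of this as the per-tree budget, so if
 * DDT_LOG_ENTRY_SIZE(ddt) were, say, ~140 bytes, each tree could grow
 * to roughly 600k entries before a swap is forced.
 */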
void
ddt_log_fini(void)
{
	kmem_cache_destroy(ddt_log_entry_trad_cache);
	kmem_cache_destroy(ddt_log_entry_flat_cache);
}

static void
ddt_log_name(ddt_t *ddt, char *name, uint_t n)
{
	snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_LOG,
	    zio_checksum_table[ddt->ddt_checksum].ci_name, n);
}

static void
ddt_log_update_header(ddt_t *ddt, ddt_log_t *ddl, dmu_tx_t *tx)
{
	dmu_buf_t *db;
	VERIFY0(dmu_bonus_hold(ddt->ddt_os, ddl->ddl_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);

	ddt_log_header_t *hdr = (ddt_log_header_t *)db->db_data;
	DLH_SET_VERSION(hdr, 1);
	DLH_SET_FLAGS(hdr, ddl->ddl_flags);
	hdr->dlh_length = ddl->ddl_length;
	hdr->dlh_first_txg = ddl->ddl_first_txg;
	hdr->dlh_checkpoint = ddl->ddl_checkpoint;

	dmu_buf_rele(db, FTAG);
}

static void
ddt_log_create_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
{
	ASSERT3U(ddt->ddt_dir_object, >, 0);
	ASSERT3U(ddl->ddl_object, ==, 0);

	char name[DDT_NAMELEN];
	ddt_log_name(ddt, name, n);

	ddl->ddl_object = dmu_object_alloc(ddt->ddt_os,
	    DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
	    DMU_OTN_UINT64_METADATA, sizeof (ddt_log_header_t), tx);
	VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, name,
	    sizeof (uint64_t), 1, &ddl->ddl_object, tx));
	ddl->ddl_length = 0;
	ddl->ddl_first_txg = tx->tx_txg;
	ddt_log_update_header(ddt, ddl, tx);
}

static void
ddt_log_create(ddt_t *ddt, dmu_tx_t *tx)
{
	ddt_log_create_one(ddt, ddt->ddt_log_active, 0, tx);
	ddt_log_create_one(ddt, ddt->ddt_log_flushing, 1, tx);
}

static void
ddt_log_destroy_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx)
{
	ASSERT3U(ddt->ddt_dir_object, >, 0);

	if (ddl->ddl_object == 0)
		return;

	ASSERT0(ddl->ddl_length);

	char name[DDT_NAMELEN];
	ddt_log_name(ddt, name, n);

	VERIFY0(zap_remove(ddt->ddt_os, ddt->ddt_dir_object, name, tx));
	VERIFY0(dmu_object_free(ddt->ddt_os, ddl->ddl_object, tx));

	ddl->ddl_object = 0;
}

void
ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx)
{
	ddt_log_destroy_one(ddt, ddt->ddt_log_active, 0, tx);
	ddt_log_destroy_one(ddt, ddt->ddt_log_flushing, 1, tx);
}
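
/*
 * For orientation, the on-disk shape of a log object, as implied by
 * ddt_log_create_one() above and the append path below (a sketch, not
 * a formal format description):
 *
 *	bonus buffer:	ddt_log_header_t (version, flags, length,
 *			first txg, checkpoint key)
 *	data blocks:	[dlr][dlr][dlr]...[zeroes]
 *			[dlr][dlr][dlr][dlr].......
 *			...
 *
 * Each dlr is a ddt_log_record_t. Records never cross a block boundary,
 * and the unused tail of a block is left zeroed, which reads back as
 * DLR_INVALID so the loader knows where valid records end.
 */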
static void
ddt_log_update_stats(ddt_t *ddt)
{
	/*
	 * Log object stats. We count the number of live entries in the log
	 * tree, even if there are more than are on disk, and even if the same
	 * entry is on both the append and flush trees, because that's more
	 * what the user expects to see. This does mean the on-disk size is
	 * not really correlated with the number of entries, but I don't think
	 * that's reasonable to expect anyway.
	 */
	dmu_object_info_t doi;
	uint64_t nblocks;
	dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, &doi);
	nblocks = doi.doi_physical_blocks_512;
	dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, &doi);
	nblocks += doi.doi_physical_blocks_512;

	ddt_object_t *ddo = &ddt->ddt_log_stats;
	ddo->ddo_count =
	    avl_numnodes(&ddt->ddt_log_active->ddl_tree) +
	    avl_numnodes(&ddt->ddt_log_flushing->ddl_tree);
	ddo->ddo_mspace = ddo->ddo_count * DDT_LOG_ENTRY_SIZE(ddt);
	ddo->ddo_dspace = nblocks << 9;
}

void
ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu)
{
	ASSERT3U(nentries, >, 0);
	ASSERT3P(dlu->dlu_dbp, ==, NULL);

	if (ddt->ddt_log_active->ddl_object == 0)
		ddt_log_create(ddt, tx);

	/*
	 * We want to store as many entries as we can in a block, but never
	 * split an entry across block boundaries.
	 */
	size_t reclen = P2ALIGN_TYPED(
	    sizeof (ddt_log_record_t) + sizeof (ddt_log_record_entry_t) +
	    DDT_PHYS_SIZE(ddt), sizeof (uint64_t), size_t);
	ASSERT3U(reclen, <=, UINT16_MAX);
	dlu->dlu_reclen = reclen;

	VERIFY0(dnode_hold(ddt->ddt_os, ddt->ddt_log_active->ddl_object, FTAG,
	    &dlu->dlu_dn));
	dnode_set_storage_type(dlu->dlu_dn, DMU_OT_DDT_ZAP);

	uint64_t nblocks = howmany(nentries,
	    dlu->dlu_dn->dn_datablksz / dlu->dlu_reclen);
	uint64_t offset = ddt->ddt_log_active->ddl_length;
	uint64_t length = nblocks * dlu->dlu_dn->dn_datablksz;

	VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length,
	    B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp,
	    DMU_READ_NO_PREFETCH));

	dlu->dlu_tx = tx;
	dlu->dlu_block = dlu->dlu_offset = 0;
}
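
/*
 * A concrete instance of the arithmetic above (record and block sizes
 * assumed purely for illustration): with a 128 KiB data block and a
 * 120-byte record, each block holds 131072 / 120 = 1092 records, so
 * logging 5000 entries needs howmany(5000, 1092) = 5 blocks, and the
 * last block will be partially filled and zero-padded at commit.
 */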
static ddt_log_entry_t *
ddt_log_alloc_entry(ddt_t *ddt)
{
	ddt_log_entry_t *ddle;

	if (ddt->ddt_flags & DDT_FLAG_FLAT) {
		ddle = kmem_cache_alloc(ddt_log_entry_flat_cache, KM_SLEEP);
		memset(ddle, 0, DDT_LOG_ENTRY_FLAT_SIZE);
	} else {
		ddle = kmem_cache_alloc(ddt_log_entry_trad_cache, KM_SLEEP);
		memset(ddle, 0, DDT_LOG_ENTRY_TRAD_SIZE);
	}

	return (ddle);
}

static void
ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
{
	/* Create the log tree entry from a live or stored entry */
	avl_index_t where;
	ddt_log_entry_t *ddle =
	    avl_find(&ddl->ddl_tree, &ddlwe->ddlwe_key, &where);
	if (ddle == NULL) {
		ddle = ddt_log_alloc_entry(ddt);
		ddle->ddle_key = ddlwe->ddlwe_key;
		avl_insert(&ddl->ddl_tree, ddle, where);
	}
	ddle->ddle_type = ddlwe->ddlwe_type;
	ddle->ddle_class = ddlwe->ddlwe_class;
	memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
}

void
ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu)
{
	ASSERT3U(dlu->dlu_dbp, !=, NULL);

	ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe);
	ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe);

	/* Get our block */
	ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
	dmu_buf_t *db = dlu->dlu_dbp[dlu->dlu_block];

	/*
	 * If this would take us past the end of the block, finish it and
	 * move to the next one.
	 */
	if (db->db_size < (dlu->dlu_offset + dlu->dlu_reclen)) {
		ASSERT3U(dlu->dlu_offset, >, 0);
		dmu_buf_fill_done(db, dlu->dlu_tx, B_FALSE);
		dlu->dlu_block++;
		dlu->dlu_offset = 0;
		ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp);
		db = dlu->dlu_dbp[dlu->dlu_block];
	}

	/*
	 * If this is the first time touching the block, inform the DMU that
	 * we will fill it, and zero it out.
	 */
	if (dlu->dlu_offset == 0) {
		dmu_buf_will_fill(db, dlu->dlu_tx, B_FALSE);
		memset(db->db_data, 0, db->db_size);
	}

	/* Create the log record directly in the buffer */
	ddt_log_record_t *dlr = (db->db_data + dlu->dlu_offset);
	DLR_SET_TYPE(dlr, DLR_ENTRY);
	DLR_SET_RECLEN(dlr, dlu->dlu_reclen);
	DLR_SET_ENTRY_TYPE(dlr, ddlwe->ddlwe_type);
	DLR_SET_ENTRY_CLASS(dlr, ddlwe->ddlwe_class);

	ddt_log_record_entry_t *dlre =
	    (ddt_log_record_entry_t *)&dlr->dlr_payload;
	dlre->dlre_key = ddlwe->ddlwe_key;
	memcpy(dlre->dlre_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));

	/* Advance offset for next record. */
	dlu->dlu_offset += dlu->dlu_reclen;
}

void
ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu)
{
	ASSERT3U(dlu->dlu_dbp, !=, NULL);
	ASSERT3U(dlu->dlu_block+1, ==, dlu->dlu_ndbp);
	ASSERT3U(dlu->dlu_offset, >, 0);

	/*
	 * Close out the last block. Whatever we haven't used will be zeroed,
	 * which matches DLR_INVALID, so we can detect this during load.
	 */
	dmu_buf_fill_done(dlu->dlu_dbp[dlu->dlu_block], dlu->dlu_tx, B_FALSE);

	dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, FTAG);

	ddt->ddt_log_active->ddl_length +=
	    dlu->dlu_ndbp * (uint64_t)dlu->dlu_dn->dn_datablksz;
	dnode_rele(dlu->dlu_dn, FTAG);

	ddt_log_update_header(ddt, ddt->ddt_log_active, dlu->dlu_tx);

	memset(dlu, 0, sizeof (ddt_log_update_t));

	ddt_log_update_stats(ddt);
}
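
/*
 * Taken together, the append path is driven like this (an illustrative
 * sketch of the calling convention; the entry iteration is schematic,
 * and the real caller lives in the DDT sync path):
 *
 *	ddt_log_update_t dlu = {0};
 *	ddt_log_begin(ddt, nentries, tx, &dlu);
 *	for each dirty entry ddlwe:
 *		ddt_log_entry(ddt, ddlwe, &dlu);
 *	ddt_log_commit(ddt, &dlu);
 *
 * nentries sizes the buffer holds up front, so it must cover every
 * ddt_log_entry() call made before the commit.
 */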
boolean_t
ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe)
{
	ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
	if (ddle == NULL)
		return (B_FALSE);

	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);

	ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe);

	avl_remove(&ddl->ddl_tree, ddle);
	kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
	    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);

	return (B_TRUE);
}

boolean_t
ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk)
{
	ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL);
	if (ddle == NULL)
		return (B_FALSE);

	ddt_lightweight_entry_t ddlwe;
	DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
	ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);

	avl_remove(&ddl->ddl_tree, ddle);
	kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
	    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);

	return (B_TRUE);
}

boolean_t
ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk,
    ddt_lightweight_entry_t *ddlwe)
{
	ddt_log_entry_t *ddle =
	    avl_find(&ddt->ddt_log_active->ddl_tree, ddk, NULL);
	if (!ddle)
		ddle = avl_find(&ddt->ddt_log_flushing->ddl_tree, ddk, NULL);
	if (!ddle)
		return (B_FALSE);
	if (ddlwe)
		DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe);
	return (B_TRUE);
}

void
ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx)
{
	ddt_log_t *ddl = ddt->ddt_log_flushing;

	ASSERT3U(ddl->ddl_object, !=, 0);

#ifdef ZFS_DEBUG
	/*
	 * There should not be any entries on the log tree before the given
	 * checkpoint. Assert that this is the case.
	 */
	ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree);
	if (ddle != NULL)
		VERIFY3U(ddt_key_compare(&ddle->ddle_key, &ddlwe->ddlwe_key),
		    >, 0);
#endif

	ddl->ddl_flags |= DDL_FLAG_CHECKPOINT;
	ddl->ddl_checkpoint = ddlwe->ddlwe_key;
	ddt_log_update_header(ddt, ddl, tx);

	ddt_log_update_stats(ddt);
}

void
ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx)
{
	ddt_log_t *ddl = ddt->ddt_log_flushing;

	if (ddl->ddl_object == 0)
		return;

	ASSERT(avl_is_empty(&ddl->ddl_tree));

	/* Eject the entire object */
	dmu_free_range(ddt->ddt_os, ddl->ddl_object, 0, DMU_OBJECT_END, tx);

	ddl->ddl_length = 0;
	ddl->ddl_flags &= ~DDL_FLAG_CHECKPOINT;
	memset(&ddl->ddl_checkpoint, 0, sizeof (ddt_key_t));
	ddt_log_update_header(ddt, ddl, tx);

	ddt_log_update_stats(ddt);
}

boolean_t
ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx)
{
	/* Swap the logs. The old flushing one must be empty */
	VERIFY(avl_is_empty(&ddt->ddt_log_flushing->ddl_tree));

	/*
	 * If there are still blocks on the flushing log, truncate it first.
	 * This can happen if there were entries on the flushing log that were
	 * removed in memory via ddt_lookup(); their vestigial remains are
	 * on disk.
	 */
	if (ddt->ddt_log_flushing->ddl_length > 0)
		ddt_log_truncate(ddt, tx);

	/*
	 * Swap policy. We swap the logs (and so begin flushing) when the
	 * active tree grows too large, or when we haven't swapped it in
	 * some amount of time, or if something has requested the logs be
	 * flushed ASAP (see ddt_walk_init()).
	 */

	/*
	 * The log tree is too large if the memory usage of its entries is
	 * over half of the memory limit. This effectively gives each log
	 * tree half the available memory.
	 */
	const boolean_t too_large =
	    (avl_numnodes(&ddt->ddt_log_active->ddl_tree) *
	    DDT_LOG_ENTRY_SIZE(ddt)) >= (zfs_dedup_log_mem_max >> 1);

	const boolean_t too_old =
	    tx->tx_txg >=
	    (ddt->ddt_log_active->ddl_first_txg +
	    MAX(1, zfs_dedup_log_txg_max));

	const boolean_t force =
	    ddt->ddt_log_active->ddl_first_txg <= ddt->ddt_flush_force_txg;

	if (!(too_large || too_old || force))
		return (B_FALSE);

	ddt_log_t *swap = ddt->ddt_log_active;
	ddt->ddt_log_active = ddt->ddt_log_flushing;
	ddt->ddt_log_flushing = swap;

	ASSERT(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING);
	ddt->ddt_log_active->ddl_flags &=
	    ~(DDL_FLAG_FLUSHING | DDL_FLAG_CHECKPOINT);

	ASSERT(!(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING));
	ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;

	ddt->ddt_log_active->ddl_first_txg = tx->tx_txg;

	ddt_log_update_header(ddt, ddt->ddt_log_active, tx);
	ddt_log_update_header(ddt, ddt->ddt_log_flushing, tx);

	ddt_log_update_stats(ddt);

	return (B_TRUE);
}
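
/*
 * As an example of the "too old" rule above: with the default
 * zfs_dedup_log_txg_max of 8, a log that became active in txg 100 is
 * swapped into flushing no later than txg 108, even if it never gets
 * near the memory threshold.
 */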
static inline void
ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr,
    const ddt_key_t *checkpoint)
{
	ASSERT3U(DLR_GET_TYPE(dlr), ==, DLR_ENTRY);

	ddt_log_record_entry_t *dlre =
	    (ddt_log_record_entry_t *)dlr->dlr_payload;
	if (checkpoint != NULL &&
	    ddt_key_compare(&dlre->dlre_key, checkpoint) <= 0) {
		/* Skip pre-checkpoint entries; they're already flushed. */
		return;
	}

	ddt_lightweight_entry_t ddlwe;
	ddlwe.ddlwe_type = DLR_GET_ENTRY_TYPE(dlr);
	ddlwe.ddlwe_class = DLR_GET_ENTRY_CLASS(dlr);

	ddlwe.ddlwe_key = dlre->dlre_key;
	memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt));

	ddt_log_update_entry(ddt, ddl, &ddlwe);
}

static void
ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl)
{
	void *cookie = NULL;
	ddt_log_entry_t *ddle;
	IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree));
	while ((ddle =
	    avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) {
		kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
		    ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle);
	}
	ASSERT(avl_is_empty(&ddl->ddl_tree));
}

static int
ddt_log_load_one(ddt_t *ddt, uint_t n)
{
	ASSERT3U(n, <, 2);

	ddt_log_t *ddl = &ddt->ddt_log[n];

	char name[DDT_NAMELEN];
	ddt_log_name(ddt, name, n);

	uint64_t obj;
	int err = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name,
	    sizeof (uint64_t), 1, &obj);
	if (err == ENOENT)
		return (0);
	if (err != 0)
		return (err);

	dnode_t *dn;
	err = dnode_hold(ddt->ddt_os, obj, FTAG, &dn);
	if (err != 0)
		return (err);

	ddt_log_header_t hdr;
	dmu_buf_t *db;
	err = dmu_bonus_hold_by_dnode(dn, FTAG, &db, DMU_READ_NO_PREFETCH);
	if (err != 0) {
		dnode_rele(dn, FTAG);
		return (err);
	}
	memcpy(&hdr, db->db_data, sizeof (ddt_log_header_t));
	dmu_buf_rele(db, FTAG);

	if (DLH_GET_VERSION(&hdr) != 1) {
		dnode_rele(dn, FTAG);
		zfs_dbgmsg("ddt_log_load: spa=%s ddt_log=%s "
		    "unknown version=%llu", spa_name(ddt->ddt_spa), name,
		    (u_longlong_t)DLH_GET_VERSION(&hdr));
		return (SET_ERROR(EINVAL));
	}

	ddt_key_t *checkpoint = NULL;
	if (DLH_GET_FLAGS(&hdr) & DDL_FLAG_CHECKPOINT) {
		/*
		 * If the log has a checkpoint, then we can ignore any entries
		 * that have already been flushed.
		 */
		ASSERT(DLH_GET_FLAGS(&hdr) & DDL_FLAG_FLUSHING);
		checkpoint = &hdr.dlh_checkpoint;
	}

	if (hdr.dlh_length > 0) {
		dmu_prefetch_by_dnode(dn, 0, 0, hdr.dlh_length,
		    ZIO_PRIORITY_SYNC_READ);

		for (uint64_t offset = 0; offset < hdr.dlh_length;
		    offset += dn->dn_datablksz) {
			err = dmu_buf_hold_by_dnode(dn, offset, FTAG, &db,
			    DMU_READ_PREFETCH);
			if (err != 0) {
				dnode_rele(dn, FTAG);
				ddt_log_empty(ddt, ddl);
				return (err);
			}

			uint64_t boffset = 0;
			while (boffset < db->db_size) {
				ddt_log_record_t *dlr =
				    (ddt_log_record_t *)(db->db_data + boffset);

				/* Partially-filled block, skip the rest */
				if (DLR_GET_TYPE(dlr) == DLR_INVALID)
					break;

				switch (DLR_GET_TYPE(dlr)) {
				case DLR_ENTRY:
					ddt_log_load_entry(ddt, ddl, dlr,
					    checkpoint);
					break;

				default:
					dmu_buf_rele(db, FTAG);
					dnode_rele(dn, FTAG);
					ddt_log_empty(ddt, ddl);
					return (SET_ERROR(EINVAL));
				}

				boffset += DLR_GET_RECLEN(dlr);
			}

			dmu_buf_rele(db, FTAG);
		}
	}

	dnode_rele(dn, FTAG);

	ddl->ddl_object = obj;
	ddl->ddl_flags = DLH_GET_FLAGS(&hdr);
	ddl->ddl_length = hdr.dlh_length;
	ddl->ddl_first_txg = hdr.dlh_first_txg;

	if (ddl->ddl_flags & DDL_FLAG_FLUSHING)
		ddt->ddt_log_flushing = ddl;
	else
		ddt->ddt_log_active = ddl;

	return (0);
}
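
/*
 * An example of the checkpoint handling above: suppose a flush had
 * worked through keys A < B < C and recorded C as the checkpoint before
 * the pool went down. On reload, the records for A, B and C compare <=
 * the checkpoint and are skipped by ddt_log_load_entry() (their payload
 * already made it into the ZAP trees); only records with keys after C
 * are rebuilt into the flushing tree.
 */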
int
ddt_log_load(ddt_t *ddt)
{
	int err;

	if (spa_load_state(ddt->ddt_spa) == SPA_LOAD_TRYIMPORT) {
		/*
		 * The DDT is going to be freed again in a moment, so there's
		 * no point loading the log; it'll just slow down import.
		 */
		return (0);
	}

	ASSERT0(ddt->ddt_log[0].ddl_object);
	ASSERT0(ddt->ddt_log[1].ddl_object);
	if (ddt->ddt_dir_object == 0) {
		/*
		 * If we're configured but the containing dir doesn't exist
		 * yet, then the log object can't possibly exist either.
		 */
		ASSERT3U(ddt->ddt_version, !=, UINT64_MAX);
		return (SET_ERROR(ENOENT));
	}

	if ((err = ddt_log_load_one(ddt, 0)) != 0)
		return (err);
	if ((err = ddt_log_load_one(ddt, 1)) != 0)
		return (err);

	VERIFY3P(ddt->ddt_log_active, !=, ddt->ddt_log_flushing);
	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING));
	VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_CHECKPOINT));
	VERIFY(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING);

	/*
	 * We have two finalisation tasks:
	 *
	 * - rebuild the histogram. We do this at the end rather than while
	 *   we're loading so we don't need to uncount and recount entries
	 *   that appear multiple times in the log.
	 *
	 * - remove entries from the flushing tree that are on both trees.
	 *   This happens when ddt_lookup() rehydrates an entry from the
	 *   flushing tree, as ddt_log_remove_key() removes the entry from the
	 *   in-memory tree but doesn't remove it from disk.
	 */

	/*
	 * We don't technically need a config lock here, since there shouldn't
	 * be pool config changes during DDT load. dva_get_dsize_sync() via
	 * ddt_stat_generate() is expecting it though, and it won't hurt
	 * anything, so we take it.
	 */
	spa_config_enter(ddt->ddt_spa, SCL_STATE, FTAG, RW_READER);

	avl_tree_t *al = &ddt->ddt_log_active->ddl_tree;
	avl_tree_t *fl = &ddt->ddt_log_flushing->ddl_tree;
	ddt_log_entry_t *ae = avl_first(al);
	ddt_log_entry_t *fe = avl_first(fl);
	while (ae != NULL || fe != NULL) {
		ddt_log_entry_t *ddle;
		if (ae == NULL) {
			/* active exhausted, take flushing */
			ddle = fe;
			fe = AVL_NEXT(fl, fe);
		} else if (fe == NULL) {
			/* flushing exhausted, take active */
			ddle = ae;
			ae = AVL_NEXT(al, ae);
		} else {
			/* compare active and flushing */
			int c = ddt_key_compare(&ae->ddle_key, &fe->ddle_key);
			if (c < 0) {
				/* active behind, take and advance */
				ddle = ae;
				ae = AVL_NEXT(al, ae);
			} else if (c > 0) {
				/* flushing behind, take and advance */
				ddle = fe;
				fe = AVL_NEXT(fl, fe);
			} else {
				/* match. remove from flushing, take active */
				ddle = fe;
				fe = AVL_NEXT(fl, fe);
				avl_remove(fl, ddle);

				ddle = ae;
				ae = AVL_NEXT(al, ae);
			}
		}

		ddt_lightweight_entry_t ddlwe;
		DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe);
		ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe);
	}

	spa_config_exit(ddt->ddt_spa, SCL_STATE, FTAG);

	ddt_log_update_stats(ddt);

	return (0);
}
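
/*
 * A small worked example of the merge in ddt_log_load() above: if the
 * active tree holds keys {A, C} and the flushing tree holds {B, C},
 * the walk counts A (active), B (flushing), then hits the C/C match,
 * removes C from the flushing tree and counts the active copy. The
 * histogram ends up with one entry each for A, B and C, and
 * ddt_log_find_key() will find the surviving active C first.
 */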
void
ddt_log_alloc(ddt_t *ddt)
{
	ASSERT3P(ddt->ddt_log_active, ==, NULL);
	ASSERT3P(ddt->ddt_log_flushing, ==, NULL);

	avl_create(&ddt->ddt_log[0].ddl_tree, ddt_key_compare,
	    sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
	avl_create(&ddt->ddt_log[1].ddl_tree, ddt_key_compare,
	    sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node));
	ddt->ddt_log_active = &ddt->ddt_log[0];
	ddt->ddt_log_flushing = &ddt->ddt_log[1];
	ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING;
}

void
ddt_log_free(ddt_t *ddt)
{
	ddt_log_empty(ddt, &ddt->ddt_log[0]);
	ddt_log_empty(ddt, &ddt->ddt_log[1]);
	avl_destroy(&ddt->ddt_log[0].ddl_tree);
	avl_destroy(&ddt->ddt_log[1].ddl_tree);
}

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_txg_max, UINT, ZMOD_RW,
	"Max transactions before starting to flush dedup logs");

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max, U64, ZMOD_RD,
	"Max memory for dedup logs");

ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max_percent, UINT, ZMOD_RD,
	"Max memory for dedup logs, as % of total memory");
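
/*
 * On Linux these parameters surface under /sys/module/zfs/parameters/.
 * For example (values chosen arbitrarily): zfs_dedup_log_txg_max is
 * ZMOD_RW, so it can be changed at runtime,
 *
 *	echo 32 > /sys/module/zfs/parameters/zfs_dedup_log_txg_max
 *
 * while the two memory limits are ZMOD_RD and must be set at module
 * load instead, e.g.:
 *
 *	modprobe zfs zfs_dedup_log_mem_max_percent=2
 */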