1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/zfs_context.h> 28 #include <sys/spa.h> 29 #include <sys/spa_impl.h> 30 #include <sys/zio.h> 31 #include <sys/ddt.h> 32 #include <sys/zap.h> 33 #include <sys/dmu_tx.h> 34 #include <sys/arc.h> 35 #include <sys/zio_checksum.h> 36 #include <sys/zio_compress.h> 37 38 static const ddt_ops_t *ddt_ops[DDT_TYPES] = { 39 &ddt_zap_ops, 40 }; 41 42 static const char *ddt_class_name[DDT_CLASSES] = { 43 "ditto", 44 "duplicate", 45 "unique", 46 }; 47 48 static void 49 ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 50 dmu_tx_t *tx) 51 { 52 spa_t *spa = ddt->ddt_spa; 53 objset_t *os = ddt->ddt_os; 54 uint64_t *objectp = &ddt->ddt_object[type][class]; 55 boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup; 56 char name[DDT_NAMELEN]; 57 58 ddt_object_name(ddt, type, class, name); 59 60 ASSERT(*objectp == 0); 61 VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0); 62 ASSERT(*objectp != 0); 63 64 VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name, 65 sizeof (uint64_t), 1, objectp, tx) == 0); 66 67 VERIFY(zap_add(os, spa->spa_ddt_stat_object, name, 68 sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), 69 &ddt->ddt_histogram[type][class], tx) == 0); 70 } 71 72 static void 73 ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 74 dmu_tx_t *tx) 75 { 76 spa_t *spa = ddt->ddt_spa; 77 objset_t *os = ddt->ddt_os; 78 uint64_t *objectp = &ddt->ddt_object[type][class]; 79 char name[DDT_NAMELEN]; 80 81 ddt_object_name(ddt, type, class, name); 82 83 ASSERT(*objectp != 0); 84 ASSERT(ddt_object_count(ddt, type, class) == 0); 85 ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class])); 86 VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0); 87 VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0); 88 VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0); 89 90 *objectp = 0; 91 } 92 93 static int 94 ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class) 95 { 96 char name[DDT_NAMELEN]; 97 int error; 98 99 ddt_object_name(ddt, type, class, name); 100 101 error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, 102 sizeof (uint64_t), 1, &ddt->ddt_object[type][class]); 103 104 if (error) 105 return (error); 106 107 error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, 108 sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), 109 &ddt->ddt_histogram[type][class]); 110 111 ASSERT(error == 0); 112 return (error); 113 } 114 115 static void 116 ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 117 dmu_tx_t *tx) 118 { 119 char name[DDT_NAMELEN]; 120 121 ddt_object_name(ddt, type, class, name); 122 123 VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, 124 sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), 125 &ddt->ddt_histogram[type][class], tx) == 0); 126 } 127 128 static int 129 ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 130 ddt_entry_t *dde) 131 { 132 if (!ddt_object_exists(ddt, type, class)) 133 return (ENOENT); 134 135 return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, 136 ddt->ddt_object[type][class], dde)); 137 } 138 139 static int 140 ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 141 ddt_entry_t *dde, dmu_tx_t *tx) 142 { 143 ASSERT(ddt_object_exists(ddt, type, class)); 144 145 return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, 146 ddt->ddt_object[type][class], dde, tx)); 147 } 148 149 static int 150 ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 151 ddt_entry_t *dde, dmu_tx_t *tx) 152 { 153 ASSERT(ddt_object_exists(ddt, type, class)); 154 155 return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os, 156 ddt->ddt_object[type][class], dde, tx)); 157 } 158 159 int 160 ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 161 ddt_entry_t *dde, uint64_t *walk) 162 { 163 ASSERT(ddt_object_exists(ddt, type, class)); 164 165 return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os, 166 ddt->ddt_object[type][class], dde, walk)); 167 } 168 169 uint64_t 170 ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class) 171 { 172 ASSERT(ddt_object_exists(ddt, type, class)); 173 174 return (ddt_ops[type]->ddt_op_count(ddt->ddt_os, 175 ddt->ddt_object[type][class])); 176 } 177 178 int 179 ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 180 dmu_object_info_t *doi) 181 { 182 if (!ddt_object_exists(ddt, type, class)) 183 return (ENOENT); 184 185 return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class], 186 doi)); 187 } 188 189 boolean_t 190 ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class) 191 { 192 return (!!ddt->ddt_object[type][class]); 193 } 194 195 void 196 ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 197 char *name) 198 { 199 (void) sprintf(name, DMU_POOL_DDT, 200 zio_checksum_table[ddt->ddt_checksum].ci_name, 201 ddt_ops[type]->ddt_op_name, ddt_class_name[class]); 202 } 203 204 void 205 ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) 206 { 207 ASSERT(txg != 0); 208 209 for (int d = 0; d < SPA_DVAS_PER_BP; d++) 210 bp->blk_dva[d] = ddp->ddp_dva[d]; 211 BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth); 212 } 213 214 void 215 ddt_bp_create(const ddt_t *ddt, const ddt_key_t *ddk, const ddt_phys_t *ddp, 216 blkptr_t *bp) 217 { 218 BP_ZERO(bp); 219 220 if (ddp != NULL) 221 ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth); 222 223 bp->blk_cksum = ddk->ddk_cksum; 224 225 BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk)); 226 BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk)); 227 BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk)); 228 BP_SET_CHECKSUM(bp, ddt->ddt_checksum); 229 BP_SET_TYPE(bp, DMU_OT_NONE); 230 BP_SET_LEVEL(bp, 0); 231 BP_SET_DEDUP(bp, 0); 232 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 233 } 234 235 void 236 ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) 237 { 238 ddk->ddk_cksum = bp->blk_cksum; 239 ddk->ddk_prop = 0; 240 241 DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp)); 242 DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp)); 243 DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp)); 244 } 245 246 void 247 ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp) 248 { 249 ASSERT(ddp->ddp_phys_birth == 0); 250 251 for (int d = 0; d < SPA_DVAS_PER_BP; d++) 252 ddp->ddp_dva[d] = bp->blk_dva[d]; 253 ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp); 254 } 255 256 void 257 ddt_phys_clear(ddt_phys_t *ddp) 258 { 259 bzero(ddp, sizeof (*ddp)); 260 } 261 262 void 263 ddt_phys_addref(ddt_phys_t *ddp) 264 { 265 ddp->ddp_refcnt++; 266 } 267 268 void 269 ddt_phys_decref(ddt_phys_t *ddp) 270 { 271 ASSERT((int64_t)ddp->ddp_refcnt > 0); 272 ddp->ddp_refcnt--; 273 } 274 275 void 276 ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) 277 { 278 blkptr_t blk; 279 280 ddt_bp_create(ddt, ddk, ddp, &blk); 281 ddt_phys_clear(ddp); 282 zio_free(ddt->ddt_spa, txg, &blk); 283 } 284 285 ddt_phys_t * 286 ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp) 287 { 288 ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys; 289 290 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 291 if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) && 292 BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth) 293 return (ddp); 294 } 295 return (NULL); 296 } 297 298 uint64_t 299 ddt_phys_total_refcnt(const ddt_entry_t *dde) 300 { 301 uint64_t refcnt = 0; 302 303 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) 304 refcnt += dde->dde_phys[p].ddp_refcnt; 305 306 return (refcnt); 307 } 308 309 static void 310 ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) 311 { 312 spa_t *spa = ddt->ddt_spa; 313 ddt_phys_t *ddp = dde->dde_phys; 314 ddt_key_t *ddk = &dde->dde_key; 315 uint64_t lsize = DDK_GET_LSIZE(ddk); 316 uint64_t psize = DDK_GET_PSIZE(ddk); 317 318 bzero(dds, sizeof (*dds)); 319 320 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 321 uint64_t dsize = 0; 322 uint64_t refcnt = ddp->ddp_refcnt; 323 324 if (ddp->ddp_phys_birth == 0) 325 continue; 326 327 for (int d = 0; d < SPA_DVAS_PER_BP; d++) 328 dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); 329 330 dds->dds_blocks += 1; 331 dds->dds_lsize += lsize; 332 dds->dds_psize += psize; 333 dds->dds_dsize += dsize; 334 335 dds->dds_ref_blocks += refcnt; 336 dds->dds_ref_lsize += lsize * refcnt; 337 dds->dds_ref_psize += psize * refcnt; 338 dds->dds_ref_dsize += dsize * refcnt; 339 } 340 } 341 342 void 343 ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg) 344 { 345 const uint64_t *s = (const uint64_t *)src; 346 uint64_t *d = (uint64_t *)dst; 347 uint64_t *d_end = (uint64_t *)(dst + 1); 348 349 ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */ 350 351 while (d < d_end) 352 *d++ += (*s++ ^ neg) - neg; 353 } 354 355 static void 356 ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg) 357 { 358 ddt_stat_t dds; 359 ddt_histogram_t *ddh; 360 int bucket; 361 362 ddt_stat_generate(ddt, dde, &dds); 363 364 bucket = highbit(dds.dds_ref_blocks) - 1; 365 ASSERT(bucket >= 0); 366 367 ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; 368 369 ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg); 370 } 371 372 void 373 ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src) 374 { 375 for (int h = 0; h < 64; h++) 376 ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0); 377 } 378 379 void 380 ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh) 381 { 382 bzero(dds, sizeof (*dds)); 383 384 for (int h = 0; h < 64; h++) 385 ddt_stat_add(dds, &ddh->ddh_stat[h], 0); 386 } 387 388 boolean_t 389 ddt_histogram_empty(const ddt_histogram_t *ddh) 390 { 391 const uint64_t *s = (const uint64_t *)ddh; 392 const uint64_t *s_end = (const uint64_t *)(ddh + 1); 393 394 while (s < s_end) 395 if (*s++ != 0) 396 return (B_FALSE); 397 398 return (B_TRUE); 399 } 400 401 uint64_t 402 ddt_get_pool_dedup_ratio(spa_t *spa) 403 { 404 ddt_histogram_t ddh_total = { 0 }; 405 ddt_stat_t dds_total = { 0 }; 406 407 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 408 ddt_t *ddt = spa->spa_ddt[c]; 409 for (enum ddt_type type = 0; type < DDT_TYPES; type++) { 410 for (enum ddt_class class = 0; class < DDT_CLASSES; 411 class++) { 412 ddt_histogram_add(&ddh_total, 413 &ddt->ddt_histogram[type][class]); 414 } 415 } 416 } 417 418 ddt_histogram_stat(&dds_total, &ddh_total); 419 420 if (dds_total.dds_dsize == 0) 421 return (100); 422 423 return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize); 424 } 425 426 int 427 ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref) 428 { 429 spa_t *spa = ddt->ddt_spa; 430 uint64_t total_refcnt = 0; 431 uint64_t ditto = spa->spa_dedup_ditto; 432 int total_copies = 0; 433 int desired_copies = 0; 434 435 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 436 ddt_phys_t *ddp = &dde->dde_phys[p]; 437 zio_t *zio = dde->dde_lead_zio[p]; 438 uint64_t refcnt = ddp->ddp_refcnt; /* committed refs */ 439 if (zio != NULL) 440 refcnt += zio->io_parent_count; /* pending refs */ 441 if (ddp == ddp_willref) 442 refcnt++; /* caller's ref */ 443 if (refcnt != 0) { 444 total_refcnt += refcnt; 445 total_copies += p; 446 } 447 } 448 449 if (ditto == 0 || ditto > UINT32_MAX) 450 ditto = UINT32_MAX; 451 452 if (total_refcnt >= 1) 453 desired_copies++; 454 if (total_refcnt >= ditto) 455 desired_copies++; 456 if (total_refcnt >= ditto * ditto) 457 desired_copies++; 458 459 return (MAX(desired_copies, total_copies) - total_copies); 460 } 461 462 int 463 ddt_ditto_copies_present(ddt_entry_t *dde) 464 { 465 ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO]; 466 dva_t *dva = ddp->ddp_dva; 467 int copies = 0 - DVA_GET_GANG(dva); 468 469 for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++) 470 if (DVA_IS_VALID(dva)) 471 copies++; 472 473 ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP); 474 475 return (copies); 476 } 477 478 size_t 479 ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len) 480 { 481 uchar_t *version = dst++; 482 int cpfunc = ZIO_COMPRESS_ZLE; 483 zio_compress_info_t *ci = &zio_compress_table[cpfunc]; 484 size_t c_len; 485 486 ASSERT(d_len >= s_len + 1); /* no compression plus version byte */ 487 488 c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level); 489 490 if (c_len == s_len) { 491 cpfunc = ZIO_COMPRESS_OFF; 492 bcopy(src, dst, s_len); 493 } 494 495 *version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc; 496 497 return (c_len + 1); 498 } 499 500 void 501 ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len) 502 { 503 uchar_t version = *src++; 504 int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK; 505 zio_compress_info_t *ci = &zio_compress_table[cpfunc]; 506 507 if (ci->ci_decompress != NULL) 508 (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level); 509 else 510 bcopy(src, dst, d_len); 511 512 if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK) 513 byteswap_uint64_array(dst, d_len); 514 } 515 516 ddt_t * 517 ddt_select_by_checksum(spa_t *spa, enum zio_checksum c) 518 { 519 return (spa->spa_ddt[c]); 520 } 521 522 ddt_t * 523 ddt_select(spa_t *spa, const blkptr_t *bp) 524 { 525 return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]); 526 } 527 528 void 529 ddt_enter(ddt_t *ddt) 530 { 531 mutex_enter(&ddt->ddt_lock); 532 } 533 534 void 535 ddt_exit(ddt_t *ddt) 536 { 537 mutex_exit(&ddt->ddt_lock); 538 } 539 540 static ddt_entry_t * 541 ddt_alloc(const ddt_key_t *ddk) 542 { 543 ddt_entry_t *dde; 544 545 dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP); 546 cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); 547 548 dde->dde_key = *ddk; 549 550 return (dde); 551 } 552 553 static void 554 ddt_free(ddt_entry_t *dde) 555 { 556 ASSERT(!dde->dde_loading); 557 558 for (int p = 0; p < DDT_PHYS_TYPES; p++) 559 ASSERT(dde->dde_lead_zio[p] == NULL); 560 561 if (dde->dde_repair_data != NULL) 562 zio_buf_free(dde->dde_repair_data, 563 DDK_GET_PSIZE(&dde->dde_key)); 564 565 cv_destroy(&dde->dde_cv); 566 kmem_free(dde, sizeof (*dde)); 567 } 568 569 void 570 ddt_remove(ddt_t *ddt, ddt_entry_t *dde) 571 { 572 ASSERT(MUTEX_HELD(&ddt->ddt_lock)); 573 574 avl_remove(&ddt->ddt_tree, dde); 575 ddt_free(dde); 576 } 577 578 ddt_entry_t * 579 ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) 580 { 581 ddt_entry_t *dde, dde_search; 582 enum ddt_type type; 583 enum ddt_class class; 584 avl_index_t where; 585 int error; 586 587 ASSERT(MUTEX_HELD(&ddt->ddt_lock)); 588 589 ddt_key_fill(&dde_search.dde_key, bp); 590 591 dde = avl_find(&ddt->ddt_tree, &dde_search, &where); 592 if (dde == NULL) { 593 if (!add) 594 return (NULL); 595 dde = ddt_alloc(&dde_search.dde_key); 596 avl_insert(&ddt->ddt_tree, dde, where); 597 } 598 599 while (dde->dde_loading) 600 cv_wait(&dde->dde_cv, &ddt->ddt_lock); 601 602 if (dde->dde_loaded) 603 return (dde); 604 605 dde->dde_loading = B_TRUE; 606 607 ddt_exit(ddt); 608 609 error = ENOENT; 610 611 for (type = 0; type < DDT_TYPES; type++) { 612 for (class = 0; class < DDT_CLASSES; class++) { 613 error = ddt_object_lookup(ddt, type, class, dde); 614 if (error != ENOENT) 615 break; 616 } 617 if (error != ENOENT) 618 break; 619 } 620 621 ASSERT(error == 0 || error == ENOENT); 622 623 ddt_enter(ddt); 624 625 ASSERT(dde->dde_loaded == B_FALSE); 626 ASSERT(dde->dde_loading == B_TRUE); 627 628 dde->dde_type = type; /* will be DDT_TYPES if no entry found */ 629 dde->dde_class = class; /* will be DDT_CLASSES if no entry found */ 630 dde->dde_loaded = B_TRUE; 631 dde->dde_loading = B_FALSE; 632 633 if (error == 0) 634 ddt_stat_update(ddt, dde, -1ULL); 635 636 cv_broadcast(&dde->dde_cv); 637 638 return (dde); 639 } 640 641 int 642 ddt_entry_compare(const void *x1, const void *x2) 643 { 644 const ddt_entry_t *dde1 = x1; 645 const ddt_entry_t *dde2 = x2; 646 const uint64_t *u1 = (const uint64_t *)&dde1->dde_key; 647 const uint64_t *u2 = (const uint64_t *)&dde2->dde_key; 648 649 for (int i = 0; i < DDT_KEY_WORDS; i++) { 650 if (u1[i] < u2[i]) 651 return (-1); 652 if (u1[i] > u2[i]) 653 return (1); 654 } 655 656 return (0); 657 } 658 659 static ddt_t * 660 ddt_table_alloc(spa_t *spa, enum zio_checksum c) 661 { 662 ddt_t *ddt; 663 664 ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP); 665 666 mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL); 667 avl_create(&ddt->ddt_tree, ddt_entry_compare, 668 sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); 669 avl_create(&ddt->ddt_repair_tree, ddt_entry_compare, 670 sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); 671 ddt->ddt_checksum = c; 672 ddt->ddt_spa = spa; 673 ddt->ddt_os = spa->spa_meta_objset; 674 675 return (ddt); 676 } 677 678 static void 679 ddt_table_free(ddt_t *ddt) 680 { 681 ASSERT(avl_numnodes(&ddt->ddt_tree) == 0); 682 ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0); 683 avl_destroy(&ddt->ddt_tree); 684 avl_destroy(&ddt->ddt_repair_tree); 685 mutex_destroy(&ddt->ddt_lock); 686 kmem_free(ddt, sizeof (*ddt)); 687 } 688 689 void 690 ddt_create(spa_t *spa) 691 { 692 spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM; 693 694 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) 695 spa->spa_ddt[c] = ddt_table_alloc(spa, c); 696 } 697 698 int 699 ddt_load(spa_t *spa) 700 { 701 int error; 702 703 ddt_create(spa); 704 705 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 706 DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, 707 &spa->spa_ddt_stat_object); 708 709 if (error) 710 return (error == ENOENT ? 0 : error); 711 712 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 713 for (enum ddt_type type = 0; type < DDT_TYPES; type++) { 714 for (enum ddt_class class = 0; class < DDT_CLASSES; 715 class++) { 716 ddt_t *ddt = spa->spa_ddt[c]; 717 error = ddt_object_load(ddt, type, class); 718 if (error != 0 && error != ENOENT) 719 return (error); 720 } 721 } 722 } 723 724 return (0); 725 } 726 727 void 728 ddt_unload(spa_t *spa) 729 { 730 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 731 if (spa->spa_ddt[c]) { 732 ddt_table_free(spa->spa_ddt[c]); 733 spa->spa_ddt[c] = NULL; 734 } 735 } 736 } 737 738 ddt_entry_t * 739 ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) 740 { 741 ddt_key_t ddk; 742 ddt_entry_t *dde; 743 744 ddt_key_fill(&ddk, bp); 745 746 dde = ddt_alloc(&ddk); 747 748 for (enum ddt_type type = 0; type < DDT_TYPES; type++) { 749 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { 750 /* 751 * We can only do repair if there are multiple copies 752 * of the block. For anything in the UNIQUE class, 753 * there's definitely only one copy, so don't even try. 754 */ 755 if (class != DDT_CLASS_UNIQUE && 756 ddt_object_lookup(ddt, type, class, dde) == 0) 757 return (dde); 758 } 759 } 760 761 bzero(dde->dde_phys, sizeof (dde->dde_phys)); 762 763 return (dde); 764 } 765 766 void 767 ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) 768 { 769 avl_index_t where; 770 771 ddt_enter(ddt); 772 773 if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) && 774 avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL) 775 avl_insert(&ddt->ddt_repair_tree, dde, where); 776 else 777 ddt_free(dde); 778 779 ddt_exit(ddt); 780 } 781 782 static void 783 ddt_repair_entry_done(zio_t *zio) 784 { 785 ddt_entry_t *rdde = zio->io_private; 786 787 ddt_free(rdde); 788 } 789 790 static void 791 ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) 792 { 793 ddt_phys_t *ddp = dde->dde_phys; 794 ddt_phys_t *rddp = rdde->dde_phys; 795 ddt_key_t *ddk = &dde->dde_key; 796 ddt_key_t *rddk = &rdde->dde_key; 797 zio_t *zio; 798 blkptr_t blk; 799 800 zio = zio_null(rio, rio->io_spa, NULL, 801 ddt_repair_entry_done, rdde, rio->io_flags); 802 803 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) { 804 if (ddp->ddp_phys_birth == 0 || 805 ddp->ddp_phys_birth != rddp->ddp_phys_birth || 806 bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva))) 807 continue; 808 ddt_bp_create(ddt, ddk, ddp, &blk); 809 zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, 810 rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL, 811 ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL)); 812 } 813 814 zio_nowait(zio); 815 } 816 817 static void 818 ddt_repair_table(ddt_t *ddt, zio_t *rio) 819 { 820 spa_t *spa = ddt->ddt_spa; 821 ddt_entry_t *dde, *rdde_next, *rdde; 822 avl_tree_t *t = &ddt->ddt_repair_tree; 823 blkptr_t blk; 824 825 if (spa_sync_pass(spa) > 1) 826 return; 827 828 ddt_enter(ddt); 829 for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) { 830 rdde_next = AVL_NEXT(t, rdde); 831 avl_remove(&ddt->ddt_repair_tree, rdde); 832 ddt_exit(ddt); 833 ddt_bp_create(ddt, &rdde->dde_key, NULL, &blk); 834 dde = ddt_repair_start(ddt, &blk); 835 ddt_repair_entry(ddt, dde, rdde, rio); 836 ddt_repair_done(ddt, dde); 837 ddt_enter(ddt); 838 } 839 ddt_exit(ddt); 840 } 841 842 static void 843 ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) 844 { 845 ddt_phys_t *ddp = dde->dde_phys; 846 ddt_key_t *ddk = &dde->dde_key; 847 enum ddt_type otype = dde->dde_type; 848 enum ddt_type ntype = DDT_TYPE_CURRENT; 849 enum ddt_class oclass = dde->dde_class; 850 enum ddt_class nclass; 851 uint64_t total_refcnt = 0; 852 853 ASSERT(dde->dde_loaded); 854 ASSERT(!dde->dde_loading); 855 856 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 857 ASSERT(dde->dde_lead_zio[p] == NULL); 858 ASSERT((int64_t)ddp->ddp_refcnt >= 0); 859 if (ddp->ddp_phys_birth == 0) { 860 ASSERT(ddp->ddp_refcnt == 0); 861 continue; 862 } 863 if (p == DDT_PHYS_DITTO) { 864 if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0) 865 ddt_phys_free(ddt, ddk, ddp, txg); 866 continue; 867 } 868 if (ddp->ddp_refcnt == 0) 869 ddt_phys_free(ddt, ddk, ddp, txg); 870 total_refcnt += ddp->ddp_refcnt; 871 } 872 873 if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0) 874 nclass = DDT_CLASS_DITTO; 875 else if (total_refcnt > 1) 876 nclass = DDT_CLASS_DUPLICATE; 877 else 878 nclass = DDT_CLASS_UNIQUE; 879 880 if (otype != DDT_TYPES && 881 (otype != ntype || oclass != nclass || total_refcnt == 0)) { 882 VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0); 883 ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT); 884 } 885 886 if (total_refcnt != 0) { 887 dde->dde_type = ntype; 888 dde->dde_class = nclass; 889 ddt_stat_update(ddt, dde, 0); 890 if (!ddt_object_exists(ddt, ntype, nclass)) 891 ddt_object_create(ddt, ntype, nclass, tx); 892 VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0); 893 } 894 } 895 896 static void 897 ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) 898 { 899 spa_t *spa = ddt->ddt_spa; 900 ddt_entry_t *dde; 901 void *cookie = NULL; 902 903 if (avl_numnodes(&ddt->ddt_tree) == 0) 904 return; 905 906 ASSERT(spa_sync_pass(spa) == 1); 907 ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP); 908 909 if (spa->spa_ddt_stat_object == 0) { 910 spa->spa_ddt_stat_object = zap_create(ddt->ddt_os, 911 DMU_OT_DDT_STATS, DMU_OT_NONE, 0, tx); 912 VERIFY(zap_add(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, 913 DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, 914 &spa->spa_ddt_stat_object, tx) == 0); 915 } 916 917 while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { 918 ddt_sync_entry(ddt, dde, tx, txg); 919 ddt_free(dde); 920 } 921 922 for (enum ddt_type type = 0; type < DDT_TYPES; type++) { 923 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { 924 if (!ddt_object_exists(ddt, type, class)) 925 continue; 926 ddt_object_sync(ddt, type, class, tx); 927 if (ddt_object_count(ddt, type, class) == 0) 928 ddt_object_destroy(ddt, type, class, tx); 929 } 930 } 931 } 932 933 void 934 ddt_sync(spa_t *spa, uint64_t txg) 935 { 936 dmu_tx_t *tx; 937 zio_t *rio = zio_root(spa, NULL, NULL, 938 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 939 940 ASSERT(spa_syncing_txg(spa) == txg); 941 942 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 943 944 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 945 ddt_t *ddt = spa->spa_ddt[c]; 946 if (ddt == NULL) 947 continue; 948 ddt_sync_table(ddt, tx, txg); 949 ddt_repair_table(ddt, rio); 950 } 951 952 (void) zio_wait(rio); 953 954 dmu_tx_commit(tx); 955 } 956