1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/zfs_context.h> 28 #include <sys/spa.h> 29 #include <sys/spa_impl.h> 30 #include <sys/zio.h> 31 #include <sys/ddt.h> 32 #include <sys/zap.h> 33 #include <sys/dmu_tx.h> 34 #include <sys/arc.h> 35 #include <sys/zio_checksum.h> 36 #include <sys/zio_compress.h> 37 38 static const ddt_ops_t *ddt_ops[DDT_TYPES] = { 39 &ddt_zap_ops, 40 }; 41 42 static const char *ddt_class_name[DDT_CLASSES] = { 43 "ditto", 44 "duplicate", 45 "unique", 46 }; 47 48 static void 49 ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 50 dmu_tx_t *tx) 51 { 52 spa_t *spa = ddt->ddt_spa; 53 objset_t *os = ddt->ddt_os; 54 uint64_t *objectp = &ddt->ddt_object[type][class]; 55 boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup; 56 char name[DDT_NAMELEN]; 57 58 ddt_object_name(ddt, type, class, name); 59 60 ASSERT(*objectp == 0); 61 VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0); 62 ASSERT(*objectp != 0); 63 64 VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name, 65 sizeof (uint64_t), 1, objectp, tx) == 0); 66 67 VERIFY(zap_add(os, spa->spa_ddt_stat_object, name, 68 sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), 69 &ddt->ddt_histogram[type][class], tx) == 0); 70 } 71 72 static void 73 ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 74 dmu_tx_t *tx) 75 { 76 spa_t *spa = ddt->ddt_spa; 77 objset_t *os = ddt->ddt_os; 78 uint64_t *objectp = &ddt->ddt_object[type][class]; 79 char name[DDT_NAMELEN]; 80 81 ddt_object_name(ddt, type, class, name); 82 83 ASSERT(*objectp != 0); 84 ASSERT(ddt_object_count(ddt, type, class) == 0); 85 ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class])); 86 VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0); 87 VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0); 88 VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0); 89 90 *objectp = 0; 91 } 92 93 static int 94 ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class) 95 { 96 char name[DDT_NAMELEN]; 97 int error; 98 99 ddt_object_name(ddt, type, class, name); 100 101 error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, 102 sizeof (uint64_t), 1, &ddt->ddt_object[type][class]); 103 104 if (error) 105 return (error); 106 107 error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, 108 sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), 109 &ddt->ddt_histogram[type][class]); 110 111 ASSERT(error == 0); 112 return (error); 113 } 114 115 static void 116 ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 117 dmu_tx_t *tx) 118 { 119 char name[DDT_NAMELEN]; 120 121 ddt_object_name(ddt, type, class, name); 122 123 VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, 124 sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), 125 &ddt->ddt_histogram[type][class], tx) == 0); 126 } 127 128 static int 129 ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 130 ddt_entry_t *dde) 131 { 132 if (!ddt_object_exists(ddt, type, class)) 133 return (ENOENT); 134 135 return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, 136 ddt->ddt_object[type][class], dde)); 137 } 138 139 static int 140 ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 141 ddt_entry_t *dde, dmu_tx_t *tx) 142 { 143 ASSERT(ddt_object_exists(ddt, type, class)); 144 145 return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, 146 ddt->ddt_object[type][class], dde, tx)); 147 } 148 149 static int 150 ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 151 ddt_entry_t *dde, dmu_tx_t *tx) 152 { 153 ASSERT(ddt_object_exists(ddt, type, class)); 154 155 return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os, 156 ddt->ddt_object[type][class], dde, tx)); 157 } 158 159 int 160 ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 161 ddt_entry_t *dde, uint64_t *walk) 162 { 163 ASSERT(ddt_object_exists(ddt, type, class)); 164 165 return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os, 166 ddt->ddt_object[type][class], dde, walk)); 167 } 168 169 uint64_t 170 ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class) 171 { 172 ASSERT(ddt_object_exists(ddt, type, class)); 173 174 return (ddt_ops[type]->ddt_op_count(ddt->ddt_os, 175 ddt->ddt_object[type][class])); 176 } 177 178 int 179 ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 180 dmu_object_info_t *doi) 181 { 182 if (!ddt_object_exists(ddt, type, class)) 183 return (ENOENT); 184 185 return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class], 186 doi)); 187 } 188 189 boolean_t 190 ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class) 191 { 192 return (!!ddt->ddt_object[type][class]); 193 } 194 195 void 196 ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 197 char *name) 198 { 199 (void) sprintf(name, DMU_POOL_DDT, 200 zio_checksum_table[ddt->ddt_checksum].ci_name, 201 ddt_ops[type]->ddt_op_name, ddt_class_name[class]); 202 } 203 204 void 205 ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) 206 { 207 ASSERT(txg != 0); 208 209 for (int d = 0; d < SPA_DVAS_PER_BP; d++) 210 bp->blk_dva[d] = ddp->ddp_dva[d]; 211 BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth); 212 } 213 214 void 215 ddt_bp_create(const ddt_t *ddt, const ddt_key_t *ddk, const ddt_phys_t *ddp, 216 blkptr_t *bp) 217 { 218 BP_ZERO(bp); 219 220 if (ddp != NULL) 221 ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth); 222 223 bp->blk_cksum = ddk->ddk_cksum; 224 225 BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk)); 226 BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk)); 227 BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk)); 228 BP_SET_CHECKSUM(bp, ddt->ddt_checksum); 229 BP_SET_TYPE(bp, DMU_OT_NONE); 230 BP_SET_LEVEL(bp, 0); 231 BP_SET_DEDUP(bp, 0); 232 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 233 } 234 235 void 236 ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) 237 { 238 ddk->ddk_cksum = bp->blk_cksum; 239 ddk->ddk_prop = 0; 240 241 DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp)); 242 DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp)); 243 DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp)); 244 } 245 246 void 247 ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp) 248 { 249 ASSERT(ddp->ddp_phys_birth == 0); 250 251 for (int d = 0; d < SPA_DVAS_PER_BP; d++) 252 ddp->ddp_dva[d] = bp->blk_dva[d]; 253 ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp); 254 } 255 256 void 257 ddt_phys_clear(ddt_phys_t *ddp) 258 { 259 bzero(ddp, sizeof (*ddp)); 260 } 261 262 void 263 ddt_phys_addref(ddt_phys_t *ddp) 264 { 265 ddp->ddp_refcnt++; 266 } 267 268 void 269 ddt_phys_decref(ddt_phys_t *ddp) 270 { 271 ASSERT((int64_t)ddp->ddp_refcnt > 0); 272 ddp->ddp_refcnt--; 273 } 274 275 void 276 ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) 277 { 278 blkptr_t blk; 279 280 ddt_bp_create(ddt, ddk, ddp, &blk); 281 ddt_phys_clear(ddp); 282 zio_free(ddt->ddt_spa, txg, &blk); 283 } 284 285 ddt_phys_t * 286 ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp) 287 { 288 ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys; 289 290 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 291 if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) && 292 BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth) 293 return (ddp); 294 } 295 return (NULL); 296 } 297 298 uint64_t 299 ddt_phys_total_refcnt(const ddt_entry_t *dde) 300 { 301 uint64_t refcnt = 0; 302 303 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) 304 refcnt += dde->dde_phys[p].ddp_refcnt; 305 306 return (refcnt); 307 } 308 309 static void 310 ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) 311 { 312 spa_t *spa = ddt->ddt_spa; 313 ddt_phys_t *ddp = dde->dde_phys; 314 ddt_key_t *ddk = &dde->dde_key; 315 uint64_t lsize = DDK_GET_LSIZE(ddk); 316 uint64_t psize = DDK_GET_PSIZE(ddk); 317 318 bzero(dds, sizeof (*dds)); 319 320 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 321 uint64_t dsize = 0; 322 uint64_t refcnt = ddp->ddp_refcnt; 323 324 if (ddp->ddp_phys_birth == 0) 325 continue; 326 327 for (int d = 0; d < SPA_DVAS_PER_BP; d++) 328 dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); 329 330 dds->dds_blocks += 1; 331 dds->dds_lsize += lsize; 332 dds->dds_psize += psize; 333 dds->dds_dsize += dsize; 334 335 dds->dds_ref_blocks += refcnt; 336 dds->dds_ref_lsize += lsize * refcnt; 337 dds->dds_ref_psize += psize * refcnt; 338 dds->dds_ref_dsize += dsize * refcnt; 339 } 340 } 341 342 void 343 ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg) 344 { 345 const uint64_t *s = (const uint64_t *)src; 346 uint64_t *d = (uint64_t *)dst; 347 uint64_t *d_end = (uint64_t *)(dst + 1); 348 349 ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */ 350 351 while (d < d_end) 352 *d++ += (*s++ ^ neg) - neg; 353 } 354 355 static void 356 ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg) 357 { 358 ddt_stat_t dds; 359 ddt_histogram_t *ddh; 360 int bucket; 361 362 ddt_stat_generate(ddt, dde, &dds); 363 364 bucket = highbit(dds.dds_ref_blocks) - 1; 365 ASSERT(bucket >= 0); 366 367 ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; 368 369 ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg); 370 } 371 372 void 373 ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src) 374 { 375 for (int h = 0; h < 64; h++) 376 ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0); 377 } 378 379 void 380 ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh) 381 { 382 bzero(dds, sizeof (*dds)); 383 384 for (int h = 0; h < 64; h++) 385 ddt_stat_add(dds, &ddh->ddh_stat[h], 0); 386 } 387 388 boolean_t 389 ddt_histogram_empty(const ddt_histogram_t *ddh) 390 { 391 const uint64_t *s = (const uint64_t *)ddh; 392 const uint64_t *s_end = (const uint64_t *)(ddh + 1); 393 394 while (s < s_end) 395 if (*s++ != 0) 396 return (B_FALSE); 397 398 return (B_TRUE); 399 } 400 401 static void 402 ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total) 403 { 404 ddt_histogram_t ddh_total = { 0 }; 405 406 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 407 ddt_t *ddt = spa->spa_ddt[c]; 408 for (enum ddt_type type = 0; type < DDT_TYPES; type++) { 409 for (enum ddt_class class = 0; class < DDT_CLASSES; 410 class++) { 411 ddt_histogram_add(&ddh_total, 412 &ddt->ddt_histogram[type][class]); 413 } 414 } 415 } 416 417 ddt_histogram_stat(dds_total, &ddh_total); 418 } 419 420 uint64_t 421 ddt_get_dedup_dspace(spa_t *spa) 422 { 423 ddt_stat_t dds_total = { 0 }; 424 425 ddt_get_dedup_stats(spa, &dds_total); 426 return (dds_total.dds_ref_dsize - dds_total.dds_dsize); 427 } 428 429 uint64_t 430 ddt_get_pool_dedup_ratio(spa_t *spa) 431 { 432 ddt_stat_t dds_total = { 0 }; 433 434 ddt_get_dedup_stats(spa, &dds_total); 435 if (dds_total.dds_dsize == 0) 436 return (100); 437 438 return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize); 439 } 440 441 int 442 ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref) 443 { 444 spa_t *spa = ddt->ddt_spa; 445 uint64_t total_refcnt = 0; 446 uint64_t ditto = spa->spa_dedup_ditto; 447 int total_copies = 0; 448 int desired_copies = 0; 449 450 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 451 ddt_phys_t *ddp = &dde->dde_phys[p]; 452 zio_t *zio = dde->dde_lead_zio[p]; 453 uint64_t refcnt = ddp->ddp_refcnt; /* committed refs */ 454 if (zio != NULL) 455 refcnt += zio->io_parent_count; /* pending refs */ 456 if (ddp == ddp_willref) 457 refcnt++; /* caller's ref */ 458 if (refcnt != 0) { 459 total_refcnt += refcnt; 460 total_copies += p; 461 } 462 } 463 464 if (ditto == 0 || ditto > UINT32_MAX) 465 ditto = UINT32_MAX; 466 467 if (total_refcnt >= 1) 468 desired_copies++; 469 if (total_refcnt >= ditto) 470 desired_copies++; 471 if (total_refcnt >= ditto * ditto) 472 desired_copies++; 473 474 return (MAX(desired_copies, total_copies) - total_copies); 475 } 476 477 int 478 ddt_ditto_copies_present(ddt_entry_t *dde) 479 { 480 ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO]; 481 dva_t *dva = ddp->ddp_dva; 482 int copies = 0 - DVA_GET_GANG(dva); 483 484 for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++) 485 if (DVA_IS_VALID(dva)) 486 copies++; 487 488 ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP); 489 490 return (copies); 491 } 492 493 size_t 494 ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len) 495 { 496 uchar_t *version = dst++; 497 int cpfunc = ZIO_COMPRESS_ZLE; 498 zio_compress_info_t *ci = &zio_compress_table[cpfunc]; 499 size_t c_len; 500 501 ASSERT(d_len >= s_len + 1); /* no compression plus version byte */ 502 503 c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level); 504 505 if (c_len == s_len) { 506 cpfunc = ZIO_COMPRESS_OFF; 507 bcopy(src, dst, s_len); 508 } 509 510 *version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc; 511 512 return (c_len + 1); 513 } 514 515 void 516 ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len) 517 { 518 uchar_t version = *src++; 519 int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK; 520 zio_compress_info_t *ci = &zio_compress_table[cpfunc]; 521 522 if (ci->ci_decompress != NULL) 523 (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level); 524 else 525 bcopy(src, dst, d_len); 526 527 if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK) 528 byteswap_uint64_array(dst, d_len); 529 } 530 531 ddt_t * 532 ddt_select_by_checksum(spa_t *spa, enum zio_checksum c) 533 { 534 return (spa->spa_ddt[c]); 535 } 536 537 ddt_t * 538 ddt_select(spa_t *spa, const blkptr_t *bp) 539 { 540 return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]); 541 } 542 543 void 544 ddt_enter(ddt_t *ddt) 545 { 546 mutex_enter(&ddt->ddt_lock); 547 } 548 549 void 550 ddt_exit(ddt_t *ddt) 551 { 552 mutex_exit(&ddt->ddt_lock); 553 } 554 555 static ddt_entry_t * 556 ddt_alloc(const ddt_key_t *ddk) 557 { 558 ddt_entry_t *dde; 559 560 dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP); 561 cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); 562 563 dde->dde_key = *ddk; 564 565 return (dde); 566 } 567 568 static void 569 ddt_free(ddt_entry_t *dde) 570 { 571 ASSERT(!dde->dde_loading); 572 573 for (int p = 0; p < DDT_PHYS_TYPES; p++) 574 ASSERT(dde->dde_lead_zio[p] == NULL); 575 576 if (dde->dde_repair_data != NULL) 577 zio_buf_free(dde->dde_repair_data, 578 DDK_GET_PSIZE(&dde->dde_key)); 579 580 cv_destroy(&dde->dde_cv); 581 kmem_free(dde, sizeof (*dde)); 582 } 583 584 void 585 ddt_remove(ddt_t *ddt, ddt_entry_t *dde) 586 { 587 ASSERT(MUTEX_HELD(&ddt->ddt_lock)); 588 589 avl_remove(&ddt->ddt_tree, dde); 590 ddt_free(dde); 591 } 592 593 ddt_entry_t * 594 ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) 595 { 596 ddt_entry_t *dde, dde_search; 597 enum ddt_type type; 598 enum ddt_class class; 599 avl_index_t where; 600 int error; 601 602 ASSERT(MUTEX_HELD(&ddt->ddt_lock)); 603 604 ddt_key_fill(&dde_search.dde_key, bp); 605 606 dde = avl_find(&ddt->ddt_tree, &dde_search, &where); 607 if (dde == NULL) { 608 if (!add) 609 return (NULL); 610 dde = ddt_alloc(&dde_search.dde_key); 611 avl_insert(&ddt->ddt_tree, dde, where); 612 } 613 614 while (dde->dde_loading) 615 cv_wait(&dde->dde_cv, &ddt->ddt_lock); 616 617 if (dde->dde_loaded) 618 return (dde); 619 620 dde->dde_loading = B_TRUE; 621 622 ddt_exit(ddt); 623 624 error = ENOENT; 625 626 for (type = 0; type < DDT_TYPES; type++) { 627 for (class = 0; class < DDT_CLASSES; class++) { 628 error = ddt_object_lookup(ddt, type, class, dde); 629 if (error != ENOENT) 630 break; 631 } 632 if (error != ENOENT) 633 break; 634 } 635 636 ASSERT(error == 0 || error == ENOENT); 637 638 ddt_enter(ddt); 639 640 ASSERT(dde->dde_loaded == B_FALSE); 641 ASSERT(dde->dde_loading == B_TRUE); 642 643 dde->dde_type = type; /* will be DDT_TYPES if no entry found */ 644 dde->dde_class = class; /* will be DDT_CLASSES if no entry found */ 645 dde->dde_loaded = B_TRUE; 646 dde->dde_loading = B_FALSE; 647 648 if (error == 0) 649 ddt_stat_update(ddt, dde, -1ULL); 650 651 cv_broadcast(&dde->dde_cv); 652 653 return (dde); 654 } 655 656 int 657 ddt_entry_compare(const void *x1, const void *x2) 658 { 659 const ddt_entry_t *dde1 = x1; 660 const ddt_entry_t *dde2 = x2; 661 const uint64_t *u1 = (const uint64_t *)&dde1->dde_key; 662 const uint64_t *u2 = (const uint64_t *)&dde2->dde_key; 663 664 for (int i = 0; i < DDT_KEY_WORDS; i++) { 665 if (u1[i] < u2[i]) 666 return (-1); 667 if (u1[i] > u2[i]) 668 return (1); 669 } 670 671 return (0); 672 } 673 674 static ddt_t * 675 ddt_table_alloc(spa_t *spa, enum zio_checksum c) 676 { 677 ddt_t *ddt; 678 679 ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP); 680 681 mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL); 682 avl_create(&ddt->ddt_tree, ddt_entry_compare, 683 sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); 684 avl_create(&ddt->ddt_repair_tree, ddt_entry_compare, 685 sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); 686 ddt->ddt_checksum = c; 687 ddt->ddt_spa = spa; 688 ddt->ddt_os = spa->spa_meta_objset; 689 690 return (ddt); 691 } 692 693 static void 694 ddt_table_free(ddt_t *ddt) 695 { 696 ASSERT(avl_numnodes(&ddt->ddt_tree) == 0); 697 ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0); 698 avl_destroy(&ddt->ddt_tree); 699 avl_destroy(&ddt->ddt_repair_tree); 700 mutex_destroy(&ddt->ddt_lock); 701 kmem_free(ddt, sizeof (*ddt)); 702 } 703 704 void 705 ddt_create(spa_t *spa) 706 { 707 spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM; 708 709 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) 710 spa->spa_ddt[c] = ddt_table_alloc(spa, c); 711 } 712 713 int 714 ddt_load(spa_t *spa) 715 { 716 int error; 717 718 ddt_create(spa); 719 720 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 721 DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, 722 &spa->spa_ddt_stat_object); 723 724 if (error) 725 return (error == ENOENT ? 0 : error); 726 727 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 728 for (enum ddt_type type = 0; type < DDT_TYPES; type++) { 729 for (enum ddt_class class = 0; class < DDT_CLASSES; 730 class++) { 731 ddt_t *ddt = spa->spa_ddt[c]; 732 error = ddt_object_load(ddt, type, class); 733 if (error != 0 && error != ENOENT) 734 return (error); 735 } 736 } 737 } 738 739 return (0); 740 } 741 742 void 743 ddt_unload(spa_t *spa) 744 { 745 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 746 if (spa->spa_ddt[c]) { 747 ddt_table_free(spa->spa_ddt[c]); 748 spa->spa_ddt[c] = NULL; 749 } 750 } 751 } 752 753 ddt_entry_t * 754 ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) 755 { 756 ddt_key_t ddk; 757 ddt_entry_t *dde; 758 759 ddt_key_fill(&ddk, bp); 760 761 dde = ddt_alloc(&ddk); 762 763 for (enum ddt_type type = 0; type < DDT_TYPES; type++) { 764 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { 765 /* 766 * We can only do repair if there are multiple copies 767 * of the block. For anything in the UNIQUE class, 768 * there's definitely only one copy, so don't even try. 769 */ 770 if (class != DDT_CLASS_UNIQUE && 771 ddt_object_lookup(ddt, type, class, dde) == 0) 772 return (dde); 773 } 774 } 775 776 bzero(dde->dde_phys, sizeof (dde->dde_phys)); 777 778 return (dde); 779 } 780 781 void 782 ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) 783 { 784 avl_index_t where; 785 786 ddt_enter(ddt); 787 788 if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) && 789 avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL) 790 avl_insert(&ddt->ddt_repair_tree, dde, where); 791 else 792 ddt_free(dde); 793 794 ddt_exit(ddt); 795 } 796 797 static void 798 ddt_repair_entry_done(zio_t *zio) 799 { 800 ddt_entry_t *rdde = zio->io_private; 801 802 ddt_free(rdde); 803 } 804 805 static void 806 ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) 807 { 808 ddt_phys_t *ddp = dde->dde_phys; 809 ddt_phys_t *rddp = rdde->dde_phys; 810 ddt_key_t *ddk = &dde->dde_key; 811 ddt_key_t *rddk = &rdde->dde_key; 812 zio_t *zio; 813 blkptr_t blk; 814 815 zio = zio_null(rio, rio->io_spa, NULL, 816 ddt_repair_entry_done, rdde, rio->io_flags); 817 818 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) { 819 if (ddp->ddp_phys_birth == 0 || 820 ddp->ddp_phys_birth != rddp->ddp_phys_birth || 821 bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva))) 822 continue; 823 ddt_bp_create(ddt, ddk, ddp, &blk); 824 zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, 825 rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL, 826 ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL)); 827 } 828 829 zio_nowait(zio); 830 } 831 832 static void 833 ddt_repair_table(ddt_t *ddt, zio_t *rio) 834 { 835 spa_t *spa = ddt->ddt_spa; 836 ddt_entry_t *dde, *rdde_next, *rdde; 837 avl_tree_t *t = &ddt->ddt_repair_tree; 838 blkptr_t blk; 839 840 if (spa_sync_pass(spa) > 1) 841 return; 842 843 ddt_enter(ddt); 844 for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) { 845 rdde_next = AVL_NEXT(t, rdde); 846 avl_remove(&ddt->ddt_repair_tree, rdde); 847 ddt_exit(ddt); 848 ddt_bp_create(ddt, &rdde->dde_key, NULL, &blk); 849 dde = ddt_repair_start(ddt, &blk); 850 ddt_repair_entry(ddt, dde, rdde, rio); 851 ddt_repair_done(ddt, dde); 852 ddt_enter(ddt); 853 } 854 ddt_exit(ddt); 855 } 856 857 static void 858 ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) 859 { 860 ddt_phys_t *ddp = dde->dde_phys; 861 ddt_key_t *ddk = &dde->dde_key; 862 enum ddt_type otype = dde->dde_type; 863 enum ddt_type ntype = DDT_TYPE_CURRENT; 864 enum ddt_class oclass = dde->dde_class; 865 enum ddt_class nclass; 866 uint64_t total_refcnt = 0; 867 868 ASSERT(dde->dde_loaded); 869 ASSERT(!dde->dde_loading); 870 871 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 872 ASSERT(dde->dde_lead_zio[p] == NULL); 873 ASSERT((int64_t)ddp->ddp_refcnt >= 0); 874 if (ddp->ddp_phys_birth == 0) { 875 ASSERT(ddp->ddp_refcnt == 0); 876 continue; 877 } 878 if (p == DDT_PHYS_DITTO) { 879 if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0) 880 ddt_phys_free(ddt, ddk, ddp, txg); 881 continue; 882 } 883 if (ddp->ddp_refcnt == 0) 884 ddt_phys_free(ddt, ddk, ddp, txg); 885 total_refcnt += ddp->ddp_refcnt; 886 } 887 888 if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0) 889 nclass = DDT_CLASS_DITTO; 890 else if (total_refcnt > 1) 891 nclass = DDT_CLASS_DUPLICATE; 892 else 893 nclass = DDT_CLASS_UNIQUE; 894 895 if (otype != DDT_TYPES && 896 (otype != ntype || oclass != nclass || total_refcnt == 0)) { 897 VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0); 898 ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT); 899 } 900 901 if (total_refcnt != 0) { 902 dde->dde_type = ntype; 903 dde->dde_class = nclass; 904 ddt_stat_update(ddt, dde, 0); 905 if (!ddt_object_exists(ddt, ntype, nclass)) 906 ddt_object_create(ddt, ntype, nclass, tx); 907 VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0); 908 } 909 } 910 911 static void 912 ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) 913 { 914 spa_t *spa = ddt->ddt_spa; 915 ddt_entry_t *dde; 916 void *cookie = NULL; 917 918 if (avl_numnodes(&ddt->ddt_tree) == 0) 919 return; 920 921 ASSERT(spa_sync_pass(spa) == 1); 922 ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP); 923 924 if (spa->spa_ddt_stat_object == 0) { 925 spa->spa_ddt_stat_object = zap_create(ddt->ddt_os, 926 DMU_OT_DDT_STATS, DMU_OT_NONE, 0, tx); 927 VERIFY(zap_add(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, 928 DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, 929 &spa->spa_ddt_stat_object, tx) == 0); 930 } 931 932 while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { 933 ddt_sync_entry(ddt, dde, tx, txg); 934 ddt_free(dde); 935 } 936 937 for (enum ddt_type type = 0; type < DDT_TYPES; type++) { 938 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { 939 if (!ddt_object_exists(ddt, type, class)) 940 continue; 941 ddt_object_sync(ddt, type, class, tx); 942 if (ddt_object_count(ddt, type, class) == 0) 943 ddt_object_destroy(ddt, type, class, tx); 944 } 945 } 946 } 947 948 void 949 ddt_sync(spa_t *spa, uint64_t txg) 950 { 951 dmu_tx_t *tx; 952 zio_t *rio = zio_root(spa, NULL, NULL, 953 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 954 955 ASSERT(spa_syncing_txg(spa) == txg); 956 957 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 958 959 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 960 ddt_t *ddt = spa->spa_ddt[c]; 961 if (ddt == NULL) 962 continue; 963 ddt_sync_table(ddt, tx, txg); 964 ddt_repair_table(ddt, rio); 965 } 966 967 (void) zio_wait(rio); 968 969 dmu_tx_commit(tx); 970 } 971