// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 */

#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/zfs_context.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>

/*
 * Deadlist concurrency:
 *
 * Deadlists can only be modified from the syncing thread.
 *
 * Except for dsl_deadlist_insert(), it can only be modified with the
 * dp_config_rwlock held with RW_WRITER.
 *
 * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can
 * be called concurrently, from open context, with the dp_config_rwlock held
 * with RW_READER.
 *
 * Therefore, we only need to provide locking between dsl_deadlist_insert() and
 * the accessors, protecting:
 *     dl_phys->dl_used,comp,uncomp
 * and protecting the dl_tree from being loaded.
 * The locking is provided by dl_lock.  Note that the bpobj_t
 * provides its own locking, and dl_oldfmt is immutable.
 */
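/*
 * A minimal usage sketch of the accessor pattern described above (error
 * handling and surrounding context omitted; open-context callers are
 * expected to hold the pool config lock as a reader):
 *
 *	uint64_t used, comp, uncomp;
 *	dsl_pool_config_enter(dp, FTAG);
 *	dsl_deadlist_space(&ds->ds_deadlist, &used, &comp, &uncomp);
 *	dsl_pool_config_exit(dp, FTAG);
 */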
/*
 * Livelist Overview
 * ================
 *
 * Livelists use the same 'deadlist_t' struct as deadlists and are also used
 * to track blkptrs over the lifetime of a dataset. Livelists, however, belong
 * to clones and track the blkptrs that are clone-specific (were born after
 * the clone's creation). The exception is embedded block pointers which are
 * not included in livelists because they do not need to be freed.
 *
 * When it comes time to delete the clone, the livelist provides a quick
 * reference as to what needs to be freed. For this reason, livelists also
 * track when clone-specific blkptrs are freed before deletion to prevent
 * double frees. Each blkptr in a livelist is marked as a FREE or an ALLOC and
 * the deletion algorithm iterates backwards over the livelist, matching
 * FREE/ALLOC pairs and then freeing those ALLOCs which remain. Livelists
 * are also updated in the case when blkptrs are remapped: the old version
 * of the blkptr is cancelled out with a FREE and the new version is tracked
 * with an ALLOC.
 *
 * To bound the amount of memory required for deletion, livelists over a
 * certain size are spread over multiple entries. Entries are grouped by
 * birth txg so we can be sure the ALLOC/FREE pair for a given blkptr will
 * be in the same entry. This allows us to delete livelists incrementally
 * over multiple syncs, one entry at a time.
 *
 * During the lifetime of the clone, livelists can get extremely large.
 * Their size is managed by periodic condensing (preemptively cancelling out
 * FREE/ALLOC pairs). Livelists are disabled when a clone is promoted or when
 * the shared space between the clone and its origin is so small that it
 * doesn't make sense to use livelists anymore.
 */

/*
 * The threshold sublist size at which we create a new sub-livelist for the
 * next txg. However, since blkptrs of the same transaction group must be in
 * the same sub-list, the actual sublist size may exceed this. When picking the
 * size we had to balance the fact that larger sublists mean fewer sublists
 * (decreasing the cost of insertion) against the consideration that sublists
 * will be loaded into memory and shouldn't take up an inordinate amount of
 * space. We settled on ~500000 entries, corresponding to roughly 128M.
 */
uint64_t zfs_livelist_max_entries = 500000;

/*
 * We can approximate how much of a performance gain a livelist will give us
 * based on the percentage of blocks shared between the clone and its origin.
 * 0 percent shared means that the clone has completely diverged and that the
 * old method is maximally effective: every read from the block tree will
 * result in lots of frees. Livelists give us gains when they track blocks
 * scattered across the tree, when one read in the old method might only
 * result in a few frees. Once the clone has been overwritten enough,
 * writes are no longer sparse and we'll no longer get much of a benefit from
 * tracking them with a livelist. We chose a lower limit of 75 percent shared
 * (25 percent overwritten). This means that 1/4 of all block pointers will be
 * freed (e.g. each read frees 256, out of a max of 1024) so we expect livelists
 * to make deletion 4x faster. Once the amount of shared space drops below this
 * threshold, the clone will revert to the old deletion method.
 */
int zfs_livelist_min_percent_shared = 75;

static int
dsl_deadlist_compare(const void *arg1, const void *arg2)
{
	const dsl_deadlist_entry_t *dle1 = arg1;
	const dsl_deadlist_entry_t *dle2 = arg2;

	return (TREE_CMP(dle1->dle_mintxg, dle2->dle_mintxg));
}

static int
dsl_deadlist_cache_compare(const void *arg1, const void *arg2)
{
	const dsl_deadlist_cache_entry_t *dlce1 = arg1;
	const dsl_deadlist_cache_entry_t *dlce2 = arg2;

	return (TREE_CMP(dlce1->dlce_mintxg, dlce2->dlce_mintxg));
}
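/*
 * Load the deadlist's AVL tree (dl_tree) from the on-disk ZAP, opening one
 * bpobj per entry.  Any existing dl_cache is discarded first: callers that
 * load the tree may modify it, which would make the cache stale.
 */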
static void
dsl_deadlist_load_tree(dsl_deadlist_t *dl)
{
	zap_cursor_t zc;
	zap_attribute_t *za;
	int error;

	ASSERT(MUTEX_HELD(&dl->dl_lock));

	ASSERT(!dl->dl_oldfmt);
	if (dl->dl_havecache) {
		/*
		 * After loading the tree, the caller may modify the tree,
		 * e.g. to add or remove nodes, or to make a node no longer
		 * refer to the empty_bpobj.  These changes would make the
		 * dl_cache incorrect.  Therefore we discard the cache here,
		 * so that it can't become incorrect.
		 */
		dsl_deadlist_cache_entry_t *dlce;
		void *cookie = NULL;
		while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie))
		    != NULL) {
			kmem_free(dlce, sizeof (*dlce));
		}
		avl_destroy(&dl->dl_cache);
		dl->dl_havecache = B_FALSE;
	}
	if (dl->dl_havetree)
		return;

	za = zap_attribute_alloc();
	avl_create(&dl->dl_tree, dsl_deadlist_compare,
	    sizeof (dsl_deadlist_entry_t),
	    offsetof(dsl_deadlist_entry_t, dle_node));
	for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
	    (error = zap_cursor_retrieve(&zc, za)) == 0;
	    zap_cursor_advance(&zc)) {
		dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
		dle->dle_mintxg = zfs_strtonum(za->za_name, NULL);

		/*
		 * Prefetch all the bpobj's so that we do that i/o
		 * in parallel.  Then open them all in a second pass.
		 */
		dle->dle_bpobj.bpo_object = za->za_first_integer;
		dmu_prefetch_dnode(dl->dl_os, dle->dle_bpobj.bpo_object,
		    ZIO_PRIORITY_SYNC_READ);

		avl_add(&dl->dl_tree, dle);
	}
	VERIFY3U(error, ==, ENOENT);
	zap_cursor_fini(&zc);
	zap_attribute_free(za);

	for (dsl_deadlist_entry_t *dle = avl_first(&dl->dl_tree);
	    dle != NULL; dle = AVL_NEXT(&dl->dl_tree, dle)) {
		VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os,
		    dle->dle_bpobj.bpo_object));
	}
	dl->dl_havetree = B_TRUE;
}

/*
 * Load only the non-empty bpobj's into the dl_cache.  The cache is an analog
 * of the dl_tree, but contains only non-empty_bpobj nodes from the ZAP.  It
 * is used only for gathering space statistics.  The dl_cache has two
 * advantages over the dl_tree:
 *
 * 1. Loading the dl_cache is ~5x faster than loading the dl_tree (if it's
 * mostly empty_bpobj's), due to less CPU overhead to open the empty_bpobj
 * many times and to inquire about its (zero) space stats many times.
 *
 * 2. The dl_cache uses less memory than the dl_tree.  We only need to load
 * the dl_tree of snapshots when deleting a snapshot, after which we free the
 * dl_tree with dsl_deadlist_discard_tree().
 */
static void
dsl_deadlist_load_cache(dsl_deadlist_t *dl)
{
	zap_cursor_t zc;
	zap_attribute_t *za;
	int error;

	ASSERT(MUTEX_HELD(&dl->dl_lock));

	ASSERT(!dl->dl_oldfmt);
	if (dl->dl_havecache)
		return;

	uint64_t empty_bpobj = dmu_objset_pool(dl->dl_os)->dp_empty_bpobj;

	avl_create(&dl->dl_cache, dsl_deadlist_cache_compare,
	    sizeof (dsl_deadlist_cache_entry_t),
	    offsetof(dsl_deadlist_cache_entry_t, dlce_node));
	za = zap_attribute_alloc();
	for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
	    (error = zap_cursor_retrieve(&zc, za)) == 0;
	    zap_cursor_advance(&zc)) {
		if (za->za_first_integer == empty_bpobj)
			continue;
		dsl_deadlist_cache_entry_t *dlce =
		    kmem_zalloc(sizeof (*dlce), KM_SLEEP);
		dlce->dlce_mintxg = zfs_strtonum(za->za_name, NULL);

		/*
		 * Prefetch all the bpobj's so that we do that i/o
		 * in parallel.  Then open them all in a second pass.
		 */
		dlce->dlce_bpobj = za->za_first_integer;
		dmu_prefetch_dnode(dl->dl_os, dlce->dlce_bpobj,
		    ZIO_PRIORITY_SYNC_READ);
		avl_add(&dl->dl_cache, dlce);
	}
	VERIFY3U(error, ==, ENOENT);
	zap_cursor_fini(&zc);
	zap_attribute_free(za);

	for (dsl_deadlist_cache_entry_t *dlce = avl_first(&dl->dl_cache);
	    dlce != NULL; dlce = AVL_NEXT(&dl->dl_cache, dlce)) {
		bpobj_t bpo;
		VERIFY0(bpobj_open(&bpo, dl->dl_os, dlce->dlce_bpobj));

		VERIFY0(bpobj_space(&bpo,
		    &dlce->dlce_bytes, &dlce->dlce_comp, &dlce->dlce_uncomp));
		bpobj_close(&bpo);
	}
	dl->dl_havecache = B_TRUE;
}

/*
 * Discard the tree to save memory.
 */
void
dsl_deadlist_discard_tree(dsl_deadlist_t *dl)
{
	mutex_enter(&dl->dl_lock);

	if (!dl->dl_havetree) {
		mutex_exit(&dl->dl_lock);
		return;
	}
	dsl_deadlist_entry_t *dle;
	void *cookie = NULL;
	while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) != NULL) {
		bpobj_close(&dle->dle_bpobj);
		kmem_free(dle, sizeof (*dle));
	}
	avl_destroy(&dl->dl_tree);

	dl->dl_havetree = B_FALSE;
	mutex_exit(&dl->dl_lock);
}
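/*
 * Invoke func(args, dle) on each entry of the deadlist, in mintxg order,
 * stopping early if the callback returns nonzero.
 */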
void
dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *args)
{
	dsl_deadlist_entry_t *dle;

	ASSERT(dsl_deadlist_is_open(dl));

	mutex_enter(&dl->dl_lock);
	dsl_deadlist_load_tree(dl);
	mutex_exit(&dl->dl_lock);
	for (dle = avl_first(&dl->dl_tree); dle != NULL;
	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
		if (func(args, dle) != 0)
			break;
	}
}
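/*
 * Open a deadlist object.  Old-format deadlists (a plain bpobj) are detected
 * by object type and accessed through dl_bpobj; new-format deadlists keep
 * their header (dl_phys) in the bonus buffer of the ZAP object.
 */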
int
dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object)
{
	dmu_object_info_t doi;
	int err;

	ASSERT(!dsl_deadlist_is_open(dl));

	mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL);
	dl->dl_os = os;
	dl->dl_object = object;
	err = dmu_bonus_hold(os, object, dl, &dl->dl_dbuf);
	if (err != 0)
		return (err);
	dmu_object_info_from_db(dl->dl_dbuf, &doi);
	if (doi.doi_type == DMU_OT_BPOBJ) {
		dmu_buf_rele(dl->dl_dbuf, dl);
		dl->dl_dbuf = NULL;
		dl->dl_oldfmt = B_TRUE;
		return (bpobj_open(&dl->dl_bpobj, os, object));
	}

	dl->dl_oldfmt = B_FALSE;
	dl->dl_phys = dl->dl_dbuf->db_data;
	dl->dl_havetree = B_FALSE;
	dl->dl_havecache = B_FALSE;
	return (0);
}

boolean_t
dsl_deadlist_is_open(dsl_deadlist_t *dl)
{
	return (dl->dl_os != NULL);
}

void
dsl_deadlist_close(dsl_deadlist_t *dl)
{
	ASSERT(dsl_deadlist_is_open(dl));
	mutex_destroy(&dl->dl_lock);

	if (dl->dl_oldfmt) {
		dl->dl_oldfmt = B_FALSE;
		bpobj_close(&dl->dl_bpobj);
		dl->dl_os = NULL;
		dl->dl_object = 0;
		return;
	}

	if (dl->dl_havetree) {
		dsl_deadlist_entry_t *dle;
		void *cookie = NULL;
		while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie))
		    != NULL) {
			bpobj_close(&dle->dle_bpobj);
			kmem_free(dle, sizeof (*dle));
		}
		avl_destroy(&dl->dl_tree);
	}
	if (dl->dl_havecache) {
		dsl_deadlist_cache_entry_t *dlce;
		void *cookie = NULL;
		while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie))
		    != NULL) {
			kmem_free(dlce, sizeof (*dlce));
		}
		avl_destroy(&dl->dl_cache);
	}
	dmu_buf_rele(dl->dl_dbuf, dl);
	dl->dl_dbuf = NULL;
	dl->dl_phys = NULL;
	dl->dl_os = NULL;
	dl->dl_object = 0;
}
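/*
 * Allocate a new deadlist object: a mintxg-keyed ZAP on pools that support
 * deadlists, or a plain bpobj on pools that predate SPA_VERSION_DEADLISTS.
 * dsl_deadlist_free() below handles both formats.
 */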
uint64_t
dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
{
	if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
		return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx));
	return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
	    sizeof (dsl_deadlist_phys_t), tx));
}

void
dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
{
	dmu_object_info_t doi;
	zap_cursor_t zc;
	zap_attribute_t *za;
	int error;

	VERIFY0(dmu_object_info(os, dlobj, &doi));
	if (doi.doi_type == DMU_OT_BPOBJ) {
		bpobj_free(os, dlobj, tx);
		return;
	}

	za = zap_attribute_alloc();
	for (zap_cursor_init(&zc, os, dlobj);
	    (error = zap_cursor_retrieve(&zc, za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t obj = za->za_first_integer;
		if (obj == dmu_objset_pool(os)->dp_empty_bpobj)
			bpobj_decr_empty(os, tx);
		else
			bpobj_free(os, obj, tx);
	}
	VERIFY3U(error, ==, ENOENT);
	zap_cursor_fini(&zc);
	zap_attribute_free(za);
	VERIFY0(dmu_object_free(os, dlobj, tx));
}
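/*
 * Append a block pointer to an entry's bpobj.  If the entry still references
 * the shared empty_bpobj, allocate a private bpobj for it first and update
 * the ZAP to point at the new object.  dle_enqueue_subobj() does the same
 * for whole sub-bpobjs.
 */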
static void
dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
    const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
{
	ASSERT(MUTEX_HELD(&dl->dl_lock));
	if (dle->dle_bpobj.bpo_object ==
	    dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
		uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
		bpobj_close(&dle->dle_bpobj);
		bpobj_decr_empty(dl->dl_os, tx);
		VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
		VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object,
		    dle->dle_mintxg, obj, tx));
	}
	bpobj_enqueue(&dle->dle_bpobj, bp, bp_freed, tx);
}

static void
dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
    uint64_t obj, dmu_tx_t *tx)
{
	ASSERT(MUTEX_HELD(&dl->dl_lock));
	if (dle->dle_bpobj.bpo_object !=
	    dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
		bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
	} else {
		bpobj_close(&dle->dle_bpobj);
		bpobj_decr_empty(dl->dl_os, tx);
		VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
		VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object,
		    dle->dle_mintxg, obj, tx));
	}
}

/*
 * Prefetch metadata required for dle_enqueue_subobj().
 */
static void
dle_prefetch_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
    uint64_t obj)
{
	if (dle->dle_bpobj.bpo_object !=
	    dmu_objset_pool(dl->dl_os)->dp_empty_bpobj)
		bpobj_prefetch_subobj(&dle->dle_bpobj, obj);
}
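/*
 * Insert a block pointer into the entry whose key range covers its birth
 * txg, and adjust the deadlist's cached space totals.  When bp_freed is set
 * (used by livelists), the bp is recorded as a FREE and the totals are
 * decremented instead of incremented.
 */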
void
dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
	dsl_deadlist_entry_t dle_tofind;
	dsl_deadlist_entry_t *dle;
	avl_index_t where;

	if (dl->dl_oldfmt) {
		bpobj_enqueue(&dl->dl_bpobj, bp, bp_freed, tx);
		return;
	}

	mutex_enter(&dl->dl_lock);
	dsl_deadlist_load_tree(dl);

	dmu_buf_will_dirty(dl->dl_dbuf, tx);

	int sign = bp_freed ? -1 : +1;
	dl->dl_phys->dl_used +=
	    sign * bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
	dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp);
	dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp);

	dle_tofind.dle_mintxg = BP_GET_LOGICAL_BIRTH(bp);
	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
	if (dle == NULL)
		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
	else
		dle = AVL_PREV(&dl->dl_tree, dle);

	if (dle == NULL) {
		zfs_panic_recover("blkptr at %p has invalid BLK_BIRTH %llu",
		    bp, (longlong_t)BP_GET_LOGICAL_BIRTH(bp));
		dle = avl_first(&dl->dl_tree);
	}

	ASSERT3P(dle, !=, NULL);
	dle_enqueue(dl, dle, bp, bp_freed, tx);
	mutex_exit(&dl->dl_lock);
}

int
dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	dsl_deadlist_t *dl = arg;
	dsl_deadlist_insert(dl, bp, B_FALSE, tx);
	return (0);
}

int
dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	dsl_deadlist_t *dl = arg;
	dsl_deadlist_insert(dl, bp, B_TRUE, tx);
	return (0);
}

/*
 * Insert new key in deadlist, which must be > all current entries.
 * mintxg is not inclusive.
 */
void
dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
{
	uint64_t obj;
	dsl_deadlist_entry_t *dle;

	if (dl->dl_oldfmt)
		return;

	dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
	dle->dle_mintxg = mintxg;

	mutex_enter(&dl->dl_lock);
	dsl_deadlist_load_tree(dl);

	obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
	VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
	avl_add(&dl->dl_tree, dle);

	VERIFY0(zap_add_int_key(dl->dl_os, dl->dl_object,
	    mintxg, obj, tx));
	mutex_exit(&dl->dl_lock);
}

/*
 * Remove this key, merging its entries into the previous key.
 */
void
dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
{
	dsl_deadlist_entry_t dle_tofind;
	dsl_deadlist_entry_t *dle, *dle_prev;

	if (dl->dl_oldfmt)
		return;
	mutex_enter(&dl->dl_lock);
	dsl_deadlist_load_tree(dl);

	dle_tofind.dle_mintxg = mintxg;
	dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
	ASSERT3P(dle, !=, NULL);
	dle_prev = AVL_PREV(&dl->dl_tree, dle);
	ASSERT3P(dle_prev, !=, NULL);

	dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx);

	avl_remove(&dl->dl_tree, dle);
	bpobj_close(&dle->dle_bpobj);
	kmem_free(dle, sizeof (*dle));

	VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx));
	mutex_exit(&dl->dl_lock);
}

/*
 * Remove a deadlist entry and all of its contents by removing the entry from
 * the deadlist's avl tree, freeing the entry's bpobj and adjusting the
 * deadlist's space accounting accordingly.
 */
void
dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
{
	uint64_t used, comp, uncomp;
	dsl_deadlist_entry_t dle_tofind;
	dsl_deadlist_entry_t *dle;
	objset_t *os = dl->dl_os;

	if (dl->dl_oldfmt)
		return;

	mutex_enter(&dl->dl_lock);
	dsl_deadlist_load_tree(dl);

	dle_tofind.dle_mintxg = mintxg;
	dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
	VERIFY3P(dle, !=, NULL);

	avl_remove(&dl->dl_tree, dle);
	VERIFY0(zap_remove_int(os, dl->dl_object, mintxg, tx));
	VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp));
	dmu_buf_will_dirty(dl->dl_dbuf, tx);
	dl->dl_phys->dl_used -= used;
	dl->dl_phys->dl_comp -= comp;
	dl->dl_phys->dl_uncomp -= uncomp;
	if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) {
		bpobj_decr_empty(os, tx);
	} else {
		bpobj_free(os, dle->dle_bpobj.bpo_object, tx);
	}
	bpobj_close(&dle->dle_bpobj);
	kmem_free(dle, sizeof (*dle));
	mutex_exit(&dl->dl_lock);
}

/*
 * Clear out the contents of a deadlist_entry by freeing its bpobj,
 * replacing it with an empty bpobj and adjusting the deadlist's
 * space accounting.
 */
void
dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl,
    dmu_tx_t *tx)
{
	uint64_t new_obj, used, comp, uncomp;
	objset_t *os = dl->dl_os;

	mutex_enter(&dl->dl_lock);
	VERIFY0(zap_remove_int(os, dl->dl_object, dle->dle_mintxg, tx));
	VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp));
	dmu_buf_will_dirty(dl->dl_dbuf, tx);
	dl->dl_phys->dl_used -= used;
	dl->dl_phys->dl_comp -= comp;
	dl->dl_phys->dl_uncomp -= uncomp;
	if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj)
		bpobj_decr_empty(os, tx);
	else
		bpobj_free(os, dle->dle_bpobj.bpo_object, tx);
	bpobj_close(&dle->dle_bpobj);
	new_obj = bpobj_alloc_empty(os, SPA_OLD_MAXBLOCKSIZE, tx);
	VERIFY0(bpobj_open(&dle->dle_bpobj, os, new_obj));
	VERIFY0(zap_add_int_key(os, dl->dl_object, dle->dle_mintxg,
	    new_obj, tx));
	ASSERT(bpobj_is_empty(&dle->dle_bpobj));
	mutex_exit(&dl->dl_lock);
}

/*
 * Return the first entry in deadlist's avl tree.
 */
dsl_deadlist_entry_t *
dsl_deadlist_first(dsl_deadlist_t *dl)
{
	dsl_deadlist_entry_t *dle;

	mutex_enter(&dl->dl_lock);
	dsl_deadlist_load_tree(dl);
	dle = avl_first(&dl->dl_tree);
	mutex_exit(&dl->dl_lock);

	return (dle);
}

/*
 * Return the last entry in deadlist's avl tree.
 */
dsl_deadlist_entry_t *
dsl_deadlist_last(dsl_deadlist_t *dl)
{
	dsl_deadlist_entry_t *dle;

	mutex_enter(&dl->dl_lock);
	dsl_deadlist_load_tree(dl);
	dle = avl_last(&dl->dl_tree);
	mutex_exit(&dl->dl_lock);

	return (dle);
}

/*
 * Walk ds's snapshots to regenerate the ZAP & AVL.
 */
static void
dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj,
    uint64_t mrs_obj, dmu_tx_t *tx)
{
	dsl_deadlist_t dl = { 0 };
	dsl_pool_t *dp = dmu_objset_pool(os);

	VERIFY0(dsl_deadlist_open(&dl, os, dlobj));
	if (dl.dl_oldfmt) {
		dsl_deadlist_close(&dl);
		return;
	}

	while (mrs_obj != 0) {
		dsl_dataset_t *ds;
		VERIFY0(dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds));
		dsl_deadlist_add_key(&dl,
		    dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
		mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
		dsl_dataset_rele(ds, FTAG);
	}
	dsl_deadlist_close(&dl);
}
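/*
 * Create a new deadlist object containing an empty bpobj for every key in dl
 * that is below maxtxg.  For old-format deadlists the keys are regenerated by
 * walking the snapshot chain backwards from mrs_obj.
 */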
uint64_t
dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
    uint64_t mrs_obj, dmu_tx_t *tx)
{
	dsl_deadlist_entry_t *dle;
	uint64_t newobj;

	newobj = dsl_deadlist_alloc(dl->dl_os, tx);

	if (dl->dl_oldfmt) {
		dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx);
		return (newobj);
	}

	mutex_enter(&dl->dl_lock);
	dsl_deadlist_load_tree(dl);

	for (dle = avl_first(&dl->dl_tree); dle;
	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
		uint64_t obj;

		if (dle->dle_mintxg >= maxtxg)
			break;

		obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
		VERIFY0(zap_add_int_key(dl->dl_os, newobj,
		    dle->dle_mintxg, obj, tx));
	}
	mutex_exit(&dl->dl_lock);
	return (newobj);
}
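/*
 * Return the total space (used, compressed, uncompressed) tracked by this
 * deadlist.  For old-format deadlists this is obtained from the bpobj;
 * otherwise it comes from the cached totals in dl_phys.
 */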
void
dsl_deadlist_space(dsl_deadlist_t *dl,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	ASSERT(dsl_deadlist_is_open(dl));
	if (dl->dl_oldfmt) {
		VERIFY0(bpobj_space(&dl->dl_bpobj,
		    usedp, compp, uncompp));
		return;
	}

	mutex_enter(&dl->dl_lock);
	*usedp = dl->dl_phys->dl_used;
	*compp = dl->dl_phys->dl_comp;
	*uncompp = dl->dl_phys->dl_uncomp;
	mutex_exit(&dl->dl_lock);
}

/*
 * Return the space used in the range (mintxg, maxtxg].
 * Includes maxtxg, does not include mintxg.
 * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is
 * UINT64_MAX).
 */
void
dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	dsl_deadlist_cache_entry_t *dlce;
	dsl_deadlist_cache_entry_t dlce_tofind;
	avl_index_t where;

	if (dl->dl_oldfmt) {
		VERIFY0(bpobj_space_range(&dl->dl_bpobj,
		    mintxg, maxtxg, usedp, compp, uncompp));
		return;
	}

	*usedp = *compp = *uncompp = 0;

	mutex_enter(&dl->dl_lock);
	dsl_deadlist_load_cache(dl);
	dlce_tofind.dlce_mintxg = mintxg;
	dlce = avl_find(&dl->dl_cache, &dlce_tofind, &where);

	/*
	 * If this mintxg doesn't exist, it may be an empty_bpobj which
	 * is omitted from the sparse tree.  Start at the next non-empty
	 * entry.
	 */
	if (dlce == NULL)
		dlce = avl_nearest(&dl->dl_cache, where, AVL_AFTER);

	for (; dlce && dlce->dlce_mintxg < maxtxg;
	    dlce = AVL_NEXT(&dl->dl_cache, dlce)) {
		*usedp += dlce->dlce_bytes;
		*compp += dlce->dlce_comp;
		*uncompp += dlce->dlce_uncomp;
	}

	mutex_exit(&dl->dl_lock);
}
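/*
 * Fold the bpobj 'obj' into dl: add its space to the deadlist's totals and
 * enqueue it as a subobj of the entry covering txg 'birth'.
 */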
static void
dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
    dmu_tx_t *tx)
{
	dsl_deadlist_entry_t dle_tofind;
	dsl_deadlist_entry_t *dle;
	avl_index_t where;
	uint64_t used, comp, uncomp;
	bpobj_t bpo;

	ASSERT(MUTEX_HELD(&dl->dl_lock));

	VERIFY0(bpobj_open(&bpo, dl->dl_os, obj));
	VERIFY0(bpobj_space(&bpo, &used, &comp, &uncomp));
	bpobj_close(&bpo);

	dsl_deadlist_load_tree(dl);

	dmu_buf_will_dirty(dl->dl_dbuf, tx);
	dl->dl_phys->dl_used += used;
	dl->dl_phys->dl_comp += comp;
	dl->dl_phys->dl_uncomp += uncomp;

	dle_tofind.dle_mintxg = birth;
	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
	if (dle == NULL)
		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
	dle_enqueue_subobj(dl, dle, obj, tx);
}

/*
 * Prefetch metadata required for dsl_deadlist_insert_bpobj().
 */
static void
dsl_deadlist_prefetch_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth)
{
	dsl_deadlist_entry_t dle_tofind;
	dsl_deadlist_entry_t *dle;
	avl_index_t where;

	ASSERT(MUTEX_HELD(&dl->dl_lock));

	dsl_deadlist_load_tree(dl);

	dle_tofind.dle_mintxg = birth;
	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
	if (dle == NULL)
		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
	dle_prefetch_subobj(dl, dle, obj);
}

static int
dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
	dsl_deadlist_t *dl = arg;
	dsl_deadlist_insert(dl, bp, bp_freed, tx);
	return (0);
}

/*
 * Merge the deadlist pointed to by 'obj' into dl.  obj will be left as
 * an empty deadlist.
 */
void
dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
{
	zap_cursor_t zc, pzc;
	zap_attribute_t *za, *pza;
	dmu_buf_t *bonus;
	dsl_deadlist_phys_t *dlp;
	dmu_object_info_t doi;
	int error, perror, i;

	VERIFY0(dmu_object_info(dl->dl_os, obj, &doi));
	if (doi.doi_type == DMU_OT_BPOBJ) {
		bpobj_t bpo;
		VERIFY0(bpobj_open(&bpo, dl->dl_os, obj));
		VERIFY0(bpobj_iterate(&bpo, dsl_deadlist_insert_cb, dl, tx));
		bpobj_close(&bpo);
		return;
	}

	za = zap_attribute_alloc();
	pza = zap_attribute_alloc();

	mutex_enter(&dl->dl_lock);
	/*
	 * Prefetch up to 128 deadlists first and then more as we progress.
	 * The limit is a balance between ARC use and diminishing returns.
	 */
	for (zap_cursor_init(&pzc, dl->dl_os, obj), i = 0;
	    (perror = zap_cursor_retrieve(&pzc, pza)) == 0 && i < 128;
	    zap_cursor_advance(&pzc), i++) {
		dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer,
		    zfs_strtonum(pza->za_name, NULL));
	}
	for (zap_cursor_init(&zc, dl->dl_os, obj);
	    (error = zap_cursor_retrieve(&zc, za)) == 0;
	    zap_cursor_advance(&zc)) {
		dsl_deadlist_insert_bpobj(dl, za->za_first_integer,
		    zfs_strtonum(za->za_name, NULL), tx);
		VERIFY0(zap_remove(dl->dl_os, obj, za->za_name, tx));
		if (perror == 0) {
			dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer,
			    zfs_strtonum(pza->za_name, NULL));
			zap_cursor_advance(&pzc);
			perror = zap_cursor_retrieve(&pzc, pza);
		}
	}
	VERIFY3U(error, ==, ENOENT);
	zap_cursor_fini(&zc);
	zap_cursor_fini(&pzc);

	VERIFY0(dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
	dlp = bonus->db_data;
	dmu_buf_will_dirty(bonus, tx);
	memset(dlp, 0, sizeof (*dlp));
	dmu_buf_rele(bonus, FTAG);
	mutex_exit(&dl->dl_lock);

	zap_attribute_free(za);
	zap_attribute_free(pza);
}

/*
 * Remove entries on dl that are born > mintxg, and put them on the bpobj.
 */
void
dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
    dmu_tx_t *tx)
{
	dsl_deadlist_entry_t dle_tofind;
	dsl_deadlist_entry_t *dle, *pdle;
	avl_index_t where;
	int i;

	ASSERT(!dl->dl_oldfmt);

	mutex_enter(&dl->dl_lock);
	dmu_buf_will_dirty(dl->dl_dbuf, tx);
	dsl_deadlist_load_tree(dl);

	dle_tofind.dle_mintxg = mintxg;
	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
	if (dle == NULL)
		dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
	/*
	 * Prefetch up to 128 deadlists first and then more as we progress.
	 * The limit is a balance between ARC use and diminishing returns.
	 */
	for (pdle = dle, i = 0; pdle && i < 128; i++) {
		bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object);
		pdle = AVL_NEXT(&dl->dl_tree, pdle);
	}
	while (dle) {
		uint64_t used, comp, uncomp;
		dsl_deadlist_entry_t *dle_next;

		bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);
		if (pdle) {
			bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object);
			pdle = AVL_NEXT(&dl->dl_tree, pdle);
		}

		VERIFY0(bpobj_space(&dle->dle_bpobj,
		    &used, &comp, &uncomp));
		ASSERT3U(dl->dl_phys->dl_used, >=, used);
		ASSERT3U(dl->dl_phys->dl_comp, >=, comp);
		ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp);
		dl->dl_phys->dl_used -= used;
		dl->dl_phys->dl_comp -= comp;
		dl->dl_phys->dl_uncomp -= uncomp;

		VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object,
		    dle->dle_mintxg, tx));

		dle_next = AVL_NEXT(&dl->dl_tree, dle);
		avl_remove(&dl->dl_tree, dle);
		bpobj_close(&dle->dle_bpobj);
		kmem_free(dle, sizeof (*dle));
		dle = dle_next;
	}
	mutex_exit(&dl->dl_lock);
}
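/*
 * An entry in the AVL tree used by dsl_livelist_iterate() to match ALLOC and
 * FREE block pointers.  Entries are keyed by DVA[0]; le_refcnt counts FREEs
 * for the same blkptr that have not yet been matched by an ALLOC.
 */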
typedef struct livelist_entry {
	blkptr_t le_bp;
	uint32_t le_refcnt;
	avl_node_t le_node;
} livelist_entry_t;

static int
livelist_compare(const void *larg, const void *rarg)
{
	const blkptr_t *l = &((livelist_entry_t *)larg)->le_bp;
	const blkptr_t *r = &((livelist_entry_t *)rarg)->le_bp;

	/* Sort them according to dva[0] */
	uint64_t l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]);
	uint64_t r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]);

	if (l_dva0_vdev != r_dva0_vdev)
		return (TREE_CMP(l_dva0_vdev, r_dva0_vdev));

	/* if vdevs are equal, sort by offsets. */
	uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]);
	uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]);
	return (TREE_CMP(l_dva0_offset, r_dva0_offset));
}

struct livelist_iter_arg {
	avl_tree_t *avl;
	bplist_t *to_free;
	zthr_t *t;
};

/*
 * Expects an AVL tree which is incrementally filled with FREE blkptrs
 * and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a
 * corresponding FREE are stored in the supplied bplist.
 *
 * Note that multiple FREE and ALLOC entries for the same blkptr may be
 * encountered when dedup or block cloning is involved.  For this reason we
 * keep a refcount for all the FREE entries of each blkptr and ensure that
 * each of those FREE entries has a corresponding ALLOC preceding it.
 */
static int
dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
	struct livelist_iter_arg *lia = arg;
	avl_tree_t *avl = lia->avl;
	bplist_t *to_free = lia->to_free;
	zthr_t *t = lia->t;
	ASSERT(tx == NULL);

	if ((t != NULL) && (zthr_has_waiters(t) || zthr_iscancelled(t)))
		return (SET_ERROR(EINTR));

	livelist_entry_t node;
	node.le_bp = *bp;
	livelist_entry_t *found = avl_find(avl, &node, NULL);
	if (found) {
		ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(&found->le_bp));
		ASSERT3U(BP_GET_CHECKSUM(bp), ==,
		    BP_GET_CHECKSUM(&found->le_bp));
		ASSERT3U(BP_GET_BIRTH(bp), ==, BP_GET_BIRTH(&found->le_bp));
	}
	if (bp_freed) {
		if (found == NULL) {
			/* first free entry for this blkptr */
			livelist_entry_t *e =
			    kmem_alloc(sizeof (livelist_entry_t), KM_SLEEP);
			e->le_bp = *bp;
			e->le_refcnt = 1;
			avl_add(avl, e);
		} else {
			/*
			 * Deduped or cloned block free.  We could assert D bit
			 * for dedup, but there is no such one for cloning.
			 */
			ASSERT3U(found->le_refcnt + 1, >, found->le_refcnt);
			found->le_refcnt++;
		}
	} else {
		if (found == NULL) {
			/* block is currently marked as allocated */
			bplist_append(to_free, bp);
		} else {
			/* alloc matches a free entry */
			ASSERT3U(found->le_refcnt, !=, 0);
			found->le_refcnt--;
			if (found->le_refcnt == 0) {
				/* all tracked free pairs have been matched */
				avl_remove(avl, found);
				kmem_free(found, sizeof (livelist_entry_t));
			}
		}
	}
	return (0);
}

/*
 * Accepts a bpobj and a bplist.  Will insert into the bplist the blkptrs
 * which have an ALLOC entry but no matching FREE.
 */
int
dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t,
    uint64_t *size)
{
	avl_tree_t avl;
	avl_create(&avl, livelist_compare, sizeof (livelist_entry_t),
	    offsetof(livelist_entry_t, le_node));

	/* process the sublist */
	struct livelist_iter_arg arg = {
	    .avl = &avl,
	    .to_free = to_free,
	    .t = t
	};
	int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size);
	VERIFY(err != 0 || avl_numnodes(&avl) == 0);

	void *cookie = NULL;
	livelist_entry_t *le = NULL;
	while ((le = avl_destroy_nodes(&avl, &cookie)) != NULL) {
		kmem_free(le, sizeof (livelist_entry_t));
	}
	avl_destroy(&avl);
	return (err);
}

ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, max_entries, U64, ZMOD_RW,
	"Size to start the next sub-livelist in a livelist");

ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, min_percent_shared, INT, ZMOD_RW,
	"Threshold at which livelist is disabled");