// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2017 Datto Inc.
 */

#include <sys/bpobj.h>
#include <sys/zfs_context.h>
#include <sys/zfs_refcount.h>
#include <sys/dsl_pool.h>
#include <sys/zfeature.h>
#include <sys/zap.h>

/*
 * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj).
 */
uint64_t
bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
{
	spa_t *spa = dmu_objset_spa(os);
	dsl_pool_t *dp = dmu_objset_pool(os);

	if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
		if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
			ASSERT0(dp->dp_empty_bpobj);
			dp->dp_empty_bpobj =
			    bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
			VERIFY(zap_add(os,
			    DMU_POOL_DIRECTORY_OBJECT,
			    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
			    &dp->dp_empty_bpobj, tx) == 0);
		}
		spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx);
		ASSERT(dp->dp_empty_bpobj != 0);
		return (dp->dp_empty_bpobj);
	} else {
		return (bpobj_alloc(os, blocksize, tx));
	}
}

void
bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dmu_objset_pool(os);

	spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx);
	if (!spa_feature_is_active(dmu_objset_spa(os),
	    SPA_FEATURE_EMPTY_BPOBJ)) {
		VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_EMPTY_BPOBJ, tx));
		VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
		dp->dp_empty_bpobj = 0;
	}
}

uint64_t
bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
{
	int size;

	if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
		size = BPOBJ_SIZE_V0;
	else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
		size = BPOBJ_SIZE_V1;
	else if (!spa_feature_is_active(dmu_objset_spa(os),
	    SPA_FEATURE_LIVELIST))
		size = BPOBJ_SIZE_V2;
	else
		size = sizeof (bpobj_phys_t);

	return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
	    DMU_OT_BPOBJ_HDR, size, tx));
}

void
bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
{
	int64_t i;
	bpobj_t bpo;
	dmu_object_info_t doi;
	int epb;
	dmu_buf_t *dbuf = NULL;

	ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
	VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));

	mutex_enter(&bpo.bpo_lock);

	if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
		goto out;

	VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
	epb = doi.doi_data_block_size / sizeof (uint64_t);

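	/*
	 * Walk the array of sub-bpobj object numbers in reverse, holding
	 * one block of the array at a time and freeing each sub-bpobj
	 * in turn (bpobj_free() recurses into nested subobjs).
	 */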
	for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
		uint64_t *objarray;
		uint64_t offset, blkoff;

		offset = i * sizeof (uint64_t);
		blkoff = P2PHASE(i, epb);

		if (dbuf == NULL || dbuf->db_offset > offset) {
			if (dbuf)
				dmu_buf_rele(dbuf, FTAG);
			VERIFY3U(0, ==, dmu_buf_hold(os,
			    bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
		}

		ASSERT3U(offset, >=, dbuf->db_offset);
		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

		objarray = dbuf->db_data;
		bpobj_free(os, objarray[blkoff], tx);
	}
	if (dbuf) {
		dmu_buf_rele(dbuf, FTAG);
		dbuf = NULL;
	}
	VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));

out:
	mutex_exit(&bpo.bpo_lock);
	bpobj_close(&bpo);

	VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
}

int
bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
{
	dmu_object_info_t doi;
	int err;

	err = dmu_object_info(os, object, &doi);
	if (err)
		return (err);

	memset(bpo, 0, sizeof (*bpo));
	mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);

	ASSERT(bpo->bpo_dbuf == NULL);
	ASSERT(bpo->bpo_phys == NULL);
	ASSERT(object != 0);
	ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
	ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);

	err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
	if (err)
		return (err);

	bpo->bpo_os = os;
	bpo->bpo_object = object;
	bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
	bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
	bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
	bpo->bpo_havefreed = (doi.doi_bonus_size > BPOBJ_SIZE_V2);
	bpo->bpo_phys = bpo->bpo_dbuf->db_data;
	return (0);
}

boolean_t
bpobj_is_open(const bpobj_t *bpo)
{
	return (bpo->bpo_object != 0);
}

void
bpobj_close(bpobj_t *bpo)
{
	/* Lame workaround for closing a bpobj that was never opened. */
	if (bpo->bpo_object == 0)
		return;

	dmu_buf_rele(bpo->bpo_dbuf, bpo);
	if (bpo->bpo_cached_dbuf != NULL)
		dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
	bpo->bpo_dbuf = NULL;
	bpo->bpo_phys = NULL;
	bpo->bpo_cached_dbuf = NULL;
	bpo->bpo_object = 0;

	mutex_destroy(&bpo->bpo_lock);
}

static boolean_t
bpobj_is_empty_impl(bpobj_t *bpo)
{
	ASSERT(MUTEX_HELD(&bpo->bpo_lock));
	return (bpo->bpo_phys->bpo_num_blkptrs == 0 &&
	    (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0));
}

boolean_t
bpobj_is_empty(bpobj_t *bpo)
{
	mutex_enter(&bpo->bpo_lock);
	boolean_t is_empty = bpobj_is_empty_impl(bpo);
	mutex_exit(&bpo->bpo_lock);
	return (is_empty);
}

/*
 * A recursive iteration of the bpobjs would be nice here but we run the risk
 * of overflowing function stack space.  Instead, find each subobj and add it
 * to the head of our list so it can be scanned for subobjs.  Like a
 * recursive implementation, the "deepest" subobjs will be freed first.
 * When a subobj is found to have no additional subobjs, free it.
 */
typedef struct bpobj_info {
	bpobj_t *bpi_bpo;
	/*
	 * This object is a subobj of bpi_parent,
	 * at bpi_index in its subobj array.
	 */
	struct bpobj_info *bpi_parent;
	uint64_t bpi_index;
	/* How many of our subobj's are left to process. */
	uint64_t bpi_unprocessed_subobjs;
	/* True after having visited this bpo's directly referenced BPs. */
	boolean_t bpi_visited;
	list_node_t bpi_node;
} bpobj_info_t;

static bpobj_info_t *
bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index)
{
	bpobj_info_t *bpi = kmem_zalloc(sizeof (bpobj_info_t), KM_SLEEP);
	bpi->bpi_bpo = bpo;
	bpi->bpi_parent = parent;
	bpi->bpi_index = index;
	if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
		bpi->bpi_unprocessed_subobjs = bpo->bpo_phys->bpo_num_subobjs;
	}
	return (bpi);
}

/*
 * Update bpobj and all of its parents with new space accounting.
 */
static void
propagate_space_reduction(bpobj_info_t *bpi, int64_t freed,
    int64_t comp_freed, int64_t uncomp_freed, dmu_tx_t *tx)
{

	for (; bpi != NULL; bpi = bpi->bpi_parent) {
		bpobj_t *p = bpi->bpi_bpo;
		ASSERT(dmu_buf_is_dirty(p->bpo_dbuf, tx));
		p->bpo_phys->bpo_bytes -= freed;
		ASSERT3S(p->bpo_phys->bpo_bytes, >=, 0);
		if (p->bpo_havecomp) {
			p->bpo_phys->bpo_comp -= comp_freed;
			p->bpo_phys->bpo_uncomp -= uncomp_freed;
		}
	}
}

static int
bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
    int64_t start, dmu_tx_t *tx, boolean_t free)
{
	int err = 0;
	int64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
	dmu_buf_t *dbuf = NULL;
	bpobj_t *bpo = bpi->bpi_bpo;

	int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1;
	uint64_t pe = P2ALIGN_TYPED(i, bpo->bpo_epb, uint64_t) *
	    sizeof (blkptr_t);
	uint64_t ps = start * sizeof (blkptr_t);
	uint64_t pb = MAX((pe > dmu_prefetch_max) ? pe - dmu_prefetch_max : 0,
	    ps);
	if (pe > pb) {
		dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, pb, pe - pb,
		    ZIO_PRIORITY_ASYNC_READ);
	}
	for (; i >= start; i--) {
		uint64_t offset = i * sizeof (blkptr_t);
		uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);

		if (dbuf == NULL || dbuf->db_offset > offset) {
			if (dbuf)
				dmu_buf_rele(dbuf, FTAG);
			err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
			    offset, FTAG, &dbuf, DMU_READ_NO_PREFETCH);
			if (err)
				break;
			pe = pb;
			pb = MAX((dbuf->db_offset > dmu_prefetch_max) ?
			    dbuf->db_offset - dmu_prefetch_max : 0, ps);
			if (pe > pb) {
				dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0,
				    pb, pe - pb, ZIO_PRIORITY_ASYNC_READ);
			}
		}

		ASSERT3U(offset, >=, dbuf->db_offset);
		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

		blkptr_t *bparray = dbuf->db_data;
		blkptr_t *bp = &bparray[blkoff];

		boolean_t bp_freed = BP_GET_FREE(bp);
		err = func(arg, bp, bp_freed, tx);
		if (err)
			break;

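		/*
		 * When freeing, back this entry's contribution out of the
		 * bpobj's space accounting; entries recorded as frees
		 * (bp_freed) count with the opposite sign.
		 */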
		if (free) {
			int sign = bp_freed ? -1 : +1;
			spa_t *spa = dmu_objset_spa(bpo->bpo_os);
			freed += sign * bp_get_dsize_sync(spa, bp);
			comp_freed += sign * BP_GET_PSIZE(bp);
			uncomp_freed += sign * BP_GET_UCSIZE(bp);
			ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx));
			bpo->bpo_phys->bpo_num_blkptrs--;
			ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
			if (bp_freed) {
				ASSERT(bpo->bpo_havefreed);
				bpo->bpo_phys->bpo_num_freed--;
				ASSERT3S(bpo->bpo_phys->bpo_num_freed, >=, 0);
			}
		}
	}
	if (free) {
		propagate_space_reduction(bpi, freed, comp_freed,
		    uncomp_freed, tx);
		VERIFY0(dmu_free_range(bpo->bpo_os,
		    bpo->bpo_object,
		    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
		    DMU_OBJECT_END, tx));
	}
	if (dbuf) {
		dmu_buf_rele(dbuf, FTAG);
		dbuf = NULL;
	}
	return (err);
}

/*
 * Given an initial bpo, start by freeing the BPs that are directly referenced
 * by that bpo. If the bpo has subobjs, read in its last subobj and push the
 * subobj to our stack. By popping items off our stack, eventually we will
 * encounter a bpo that has no subobjs.  We can free its bpobj_info_t, and if
 * requested also free the now-empty bpo from disk and decrement
 * its parent's subobj count. We continue popping each subobj from our stack,
 * visiting its last subobj until they too have no more subobjs, and so on.
 */
static int
bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
    dmu_tx_t *tx, boolean_t free, uint64_t *bpobj_size)
{
	list_t stack;
	bpobj_info_t *bpi;
	int err = 0;

	/*
	 * Create a "stack" for us to work with without worrying about
	 * stack overflows. Initialize it with the initial_bpo.
	 */
	list_create(&stack, sizeof (bpobj_info_t),
	    offsetof(bpobj_info_t, bpi_node));
	mutex_enter(&initial_bpo->bpo_lock);

	if (bpobj_size != NULL)
		*bpobj_size = initial_bpo->bpo_phys->bpo_num_blkptrs;

	list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0));

	while ((bpi = list_head(&stack)) != NULL) {
		bpobj_t *bpo = bpi->bpi_bpo;

		ASSERT3P(bpo, !=, NULL);
		ASSERT(MUTEX_HELD(&bpo->bpo_lock));
		ASSERT(bpobj_is_open(bpo));

		if (free)
			dmu_buf_will_dirty(bpo->bpo_dbuf, tx);

		if (bpi->bpi_visited == B_FALSE) {
			err = bpobj_iterate_blkptrs(bpi, func, arg, 0, tx,
			    free);
			bpi->bpi_visited = B_TRUE;
			if (err != 0)
				break;
		}
		/*
		 * We've finished with this bpo's directly-referenced BP's and
		 * it has no more unprocessed subobjs. We can free its
		 * bpobj_info_t (unless it is the topmost, initial_bpo).
		 * If we are freeing from disk, we can also do that.
		 */
		if (bpi->bpi_unprocessed_subobjs == 0) {
			/*
			 * If there are no entries, there should
			 * be no bytes.
			 */
			if (bpobj_is_empty_impl(bpo)) {
				ASSERT0(bpo->bpo_phys->bpo_bytes);
				ASSERT0(bpo->bpo_phys->bpo_comp);
				ASSERT0(bpo->bpo_phys->bpo_uncomp);
			}

			/* The initial_bpo has no parent and is not closed. */
			if (bpi->bpi_parent != NULL) {
				if (free) {
					bpobj_t *p = bpi->bpi_parent->bpi_bpo;

					ASSERT0(bpo->bpo_phys->bpo_num_blkptrs);
					ASSERT3U(p->bpo_phys->bpo_num_subobjs,
					    >, 0);
					ASSERT3U(bpi->bpi_index, ==,
					    p->bpo_phys->bpo_num_subobjs - 1);
					ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf,
					    tx));

					p->bpo_phys->bpo_num_subobjs--;

					VERIFY0(dmu_free_range(p->bpo_os,
					    p->bpo_phys->bpo_subobjs,
					    bpi->bpi_index * sizeof (uint64_t),
					    sizeof (uint64_t), tx));

					/* eliminate the empty subobj list */
					if (bpo->bpo_havesubobj &&
					    bpo->bpo_phys->bpo_subobjs != 0) {
						ASSERT0(bpo->bpo_phys->
						    bpo_num_subobjs);
						err = dmu_object_free(
						    bpo->bpo_os,
						    bpo->bpo_phys->bpo_subobjs,
						    tx);
						if (err)
							break;
						bpo->bpo_phys->bpo_subobjs = 0;
					}
					err = dmu_object_free(p->bpo_os,
					    bpo->bpo_object, tx);
					if (err)
						break;
				}

				mutex_exit(&bpo->bpo_lock);
				bpobj_close(bpo);
				kmem_free(bpo, sizeof (bpobj_t));
			} else {
				mutex_exit(&bpo->bpo_lock);
			}

			/*
			 * Finished processing this bpo. Unlock, and free
			 * our "stack" info.
			 */
			list_remove_head(&stack);
			kmem_free(bpi, sizeof (bpobj_info_t));
		} else {
			/*
			 * We have unprocessed subobjs. Process the next one.
			 */
			ASSERT(bpo->bpo_havecomp);
			ASSERT3P(bpobj_size, ==, NULL);

			/* Add the last subobj to stack. */
			int64_t i = bpi->bpi_unprocessed_subobjs - 1;
			uint64_t offset = i * sizeof (uint64_t);

			uint64_t subobj;
			err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
			    offset, sizeof (uint64_t), &subobj,
			    DMU_READ_NO_PREFETCH);
			if (err)
				break;

			bpobj_t *subbpo = kmem_alloc(sizeof (bpobj_t),
			    KM_SLEEP);
			err = bpobj_open(subbpo, bpo->bpo_os, subobj);
			if (err) {
				kmem_free(subbpo, sizeof (bpobj_t));
				break;
			}

			if (subbpo->bpo_havesubobj &&
			    subbpo->bpo_phys->bpo_subobjs != 0) {
				dmu_prefetch(subbpo->bpo_os,
				    subbpo->bpo_phys->bpo_subobjs, 0, 0, 0,
				    ZIO_PRIORITY_ASYNC_READ);
			}

			list_insert_head(&stack, bpi_alloc(subbpo, bpi, i));
			mutex_enter(&subbpo->bpo_lock);
			bpi->bpi_unprocessed_subobjs--;
		}
	}
	/*
	 * Cleanup anything left on the "stack" after we left the loop.
	 * Every bpo on the stack is locked so we must remember to undo
	 * that now (in LIFO order).
	 */
	while ((bpi = list_remove_head(&stack)) != NULL) {
		bpobj_t *bpo = bpi->bpi_bpo;
		ASSERT(err != 0);
		ASSERT3P(bpo, !=, NULL);

		mutex_exit(&bpo->bpo_lock);

		/* do not free the initial_bpo */
		if (bpi->bpi_parent != NULL) {
			bpobj_close(bpi->bpi_bpo);
			kmem_free(bpi->bpi_bpo, sizeof (bpobj_t));
		}
		kmem_free(bpi, sizeof (bpobj_info_t));
	}

	list_destroy(&stack);

	return (err);
}

/*
 * Iterate and remove the entries. If func returns nonzero, iteration
 * will stop and that entry will not be removed.
 */
int
bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
{
	return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE, NULL));
}

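/*
 * Illustrative example (hypothetical callback, not used anywhere in this
 * file): callers of the iterators supply a bpobj_itor_t, e.g. to count the
 * entries without removing them via bpobj_iterate_nofree() below:
 *
 *	static int
 *	count_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
 *	    dmu_tx_t *tx)
 *	{
 *		(void) bp, (void) bp_freed, (void) tx;
 *		(*(uint64_t *)arg)++;
 *		return (0);
 *	}
 *
 *	uint64_t count = 0;
 *	VERIFY0(bpobj_iterate_nofree(bpo, count_cb, &count, NULL));
 */
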
/*
 * Iterate the entries.  If func returns nonzero, iteration will stop.
 *
 * If there are no subobjs:
 *
 * *bpobj_size can be used to return the number of block pointers in the
 * bpobj.  Note that this may be different from the number of block pointers
 * that are iterated over, if iteration is terminated early (e.g. by the func
 * returning nonzero).
 *
 * If there are concurrent (or subsequent) modifications to the bpobj then the
 * returned *bpobj_size can be passed as "start" to
 * livelist_bpobj_iterate_from_nofree() to iterate the newly added entries.
 */
int
bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
    uint64_t *bpobj_size)
{
	return (bpobj_iterate_impl(bpo, func, arg, NULL, B_FALSE, bpobj_size));
}

/*
 * Iterate over the blkptrs in the bpobj beginning at index start. If func
 * returns nonzero, iteration will stop. This is a livelist specific function
 * since it assumes that there are no subobjs present.
 */
int
livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
    int64_t start)
{
	if (bpo->bpo_havesubobj)
		VERIFY0(bpo->bpo_phys->bpo_subobjs);
	bpobj_info_t *bpi = bpi_alloc(bpo, NULL, 0);
	int err = bpobj_iterate_blkptrs(bpi, func, arg, start, NULL, B_FALSE);
	kmem_free(bpi, sizeof (bpobj_info_t));
	return (err);
}

/*
 * Logically add subobj's contents to the parent bpobj.
 *
 * In the most general case, this is accomplished in constant time by adding
 * a reference to subobj.  This case is used when enqueuing a large subobj:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj |
 * +----+----+----+----+----+              +-----+-----+-----+
 *
 * +--------------+                        +--------------+
 * | sub-bpobj    |----------------------> | subsubobj    |
 * +----+----+----+----+---------+----+    +-----+-----+--+--------+-----+
 * | bp | bp | bp | bp |   ...   | bp |    | obj | obj |    ...    | obj |
 * +----+----+----+----+---------+----+    +-----+-----+-----------+-----+
 *
 * Result: sub-bpobj added to parent's subobj list.
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+-----+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj | OBJ |
 * +----+----+----+----+----+              +-----+-----+-----+--|--+
 *                                                              |
 *       /-----------------------------------------------------/
 *       v
 * +--------------+                        +--------------+
 * | sub-bpobj    |----------------------> | subsubobj    |
 * +----+----+----+----+---------+----+    +-----+-----+--+--------+-----+
 * | bp | bp | bp | bp |   ...   | bp |    | obj | obj |    ...    | obj |
 * +----+----+----+----+---------+----+    +-----+-----+-----------+-----+
 *
 *
 * In a common case, the subobj is small: its bp's and its list of subobj's
 * are each stored in a single block.  In this case we copy the subobj's
 * contents to the parent:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj |
 * +----+----+----+----+----+              +-----+-----+-----+
 *                     ^                                   ^
 * +--------------+    |                   +--------------+|
 * | sub-bpobj    |----^------------------>| subsubobj    |^
 * +----+----+----+    |                   +-----+-----+--+|
 * | BP | BP |-->-->-->/                   | OBJ | OBJ |-->/
 * +----+----+                             +-----+-----+
 *
 * Result: subobj destroyed, contents copied to parent:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+----+----+    +-----+-----+--+--+-----+-----+
 * | bp | bp | bp | bp | bp | BP | BP |    | obj | obj | obj | OBJ | OBJ |
 * +----+----+----+----+----+----+----+    +-----+-----+-----+-----+-----+
 *
 *
 * If the subobj has many BP's but few subobj's, we can copy the sub-subobj's
 * but retain the sub-bpobj:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj |
 * +----+----+----+----+----+              +-----+-----+-----+
 *                                                           ^
 * +--------------+                        +--------------+  |
 * | sub-bpobj    |----------------------> | subsubobj    |  ^
 * +----+----+----+----+---------+----+    +-----+-----+--+  |
 * | bp | bp | bp | bp |   ...   | bp |    | OBJ | OBJ |-->-/
 * +----+----+----+----+---------+----+    +-----+-----+
 *
 * Result: sub-sub-bpobjs and subobj added to parent's subobj list.
 * +--------------+                      +--------------+
 * | bpobj        |--------------------->| subobj list  |
 * +----+----+----+----+----+            +-----+-----+--+--+-----+-----+------+
 * | bp | bp | bp | bp | bp |            | obj | obj | obj | OBJ | OBJ | OBJ* |
 * +----+----+----+----+----+            +-----+-----+-----+-----+-----+--|---+
 *                                                                        |
 *      /----------------------------------------------------------------/
 *      v
 * +--------------+
 * | sub-bpobj    |
 * +----+----+----+----+---------+----+
 * | bp | bp | bp | bp |   ...   | bp |
 * +----+----+----+----+---------+----+
 */
void
bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
{
	bpobj_t subbpo;
	uint64_t used, comp, uncomp, subsubobjs;
	boolean_t copy_subsub = B_TRUE;
	boolean_t copy_bps = B_TRUE;

	ASSERT(bpobj_is_open(bpo));
	ASSERT(subobj != 0);
	ASSERT(bpo->bpo_havesubobj);
	ASSERT(bpo->bpo_havecomp);
	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

	if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
		bpobj_decr_empty(bpo->bpo_os, tx);
		return;
	}

	VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
	if (bpobj_is_empty(&subbpo)) {
		/* No point in having an empty subobj. */
		bpobj_close(&subbpo);
		bpobj_free(bpo->bpo_os, subobj, tx);
		return;
	}
	VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));

	mutex_enter(&bpo->bpo_lock);
	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);

	dmu_object_info_t doi;

	if (bpo->bpo_phys->bpo_subobjs != 0) {
		ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
		    &doi));
		ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
	}

	/*
	 * If subobj has only one block of subobjs, then move subobj's
	 * subobjs to bpo's subobj list directly.  This reduces recursion in
	 * bpobj_iterate due to nested subobjs.
	 */
	subsubobjs = subbpo.bpo_phys->bpo_subobjs;
	if (subsubobjs != 0) {
		VERIFY0(dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
		if (doi.doi_max_offset > doi.doi_data_block_size) {
			copy_subsub = B_FALSE;
		}
	}

	/*
	 * If, in addition to having only one block of subobj's, subobj has
	 * only one block of bp's, then move subobj's bp's to bpo's bp list
	 * directly. This reduces recursion in bpobj_iterate due to nested
	 * subobjs.
	 */
	VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subobj, &doi));
	if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) {
		copy_bps = B_FALSE;
	}

	if (copy_subsub && subsubobjs != 0) {
		dmu_buf_t *subdb;
		uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;

		VERIFY0(dmu_buf_hold(bpo->bpo_os, subsubobjs,
		    0, FTAG, &subdb, 0));
		/*
		 * Make sure that we are not asking dmu_write()
		 * to write more data than we have in our buffer.
		 */
		VERIFY3U(subdb->db_size, >=,
		    numsubsub * sizeof (subobj));
		if (bpo->bpo_phys->bpo_subobjs == 0) {
			bpo->bpo_phys->bpo_subobjs =
			    dmu_object_alloc(bpo->bpo_os,
			    DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
			    DMU_OT_NONE, 0, tx);
		}
		dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
		    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
		    numsubsub * sizeof (subobj), subdb->db_data, tx);
		dmu_buf_rele(subdb, FTAG);
		bpo->bpo_phys->bpo_num_subobjs += numsubsub;

		dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
		subbpo.bpo_phys->bpo_subobjs = 0;
		VERIFY0(dmu_object_free(bpo->bpo_os, subsubobjs, tx));
	}

	if (copy_bps) {
		dmu_buf_t *bps;
		uint64_t numbps = subbpo.bpo_phys->bpo_num_blkptrs;

		ASSERT(copy_subsub);
		VERIFY0(dmu_buf_hold(bpo->bpo_os, subobj,
		    0, FTAG, &bps, 0));

		/*
		 * Make sure that we are not asking dmu_write()
		 * to write more data than we have in our buffer.
		 */
		VERIFY3U(bps->db_size, >=, numbps * sizeof (blkptr_t));
		dmu_write(bpo->bpo_os, bpo->bpo_object,
		    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
		    numbps * sizeof (blkptr_t),
		    bps->db_data, tx);
		dmu_buf_rele(bps, FTAG);
		bpo->bpo_phys->bpo_num_blkptrs += numbps;

		bpobj_close(&subbpo);
		VERIFY0(dmu_object_free(bpo->bpo_os, subobj, tx));
	} else {
		bpobj_close(&subbpo);
		if (bpo->bpo_phys->bpo_subobjs == 0) {
			bpo->bpo_phys->bpo_subobjs =
			    dmu_object_alloc(bpo->bpo_os,
			    DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
			    DMU_OT_NONE, 0, tx);
		}

		dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
		    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
		    sizeof (subobj), &subobj, tx);
		bpo->bpo_phys->bpo_num_subobjs++;
	}

	bpo->bpo_phys->bpo_bytes += used;
	bpo->bpo_phys->bpo_comp += comp;
	bpo->bpo_phys->bpo_uncomp += uncomp;
	mutex_exit(&bpo->bpo_lock);

}

/*
 * Prefetch metadata required for bpobj_enqueue_subobj().
 */
void
bpobj_prefetch_subobj(bpobj_t *bpo, uint64_t subobj)
{
	dmu_object_info_t doi;
	bpobj_t subbpo;
	uint64_t subsubobjs;
	boolean_t copy_subsub = B_TRUE;
	boolean_t copy_bps = B_TRUE;

	ASSERT(bpobj_is_open(bpo));
	ASSERT(subobj != 0);

	if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj)
		return;

	if (bpobj_open(&subbpo, bpo->bpo_os, subobj) != 0)
		return;
	if (bpobj_is_empty(&subbpo)) {
		bpobj_close(&subbpo);
		return;
	}
	subsubobjs = subbpo.bpo_phys->bpo_subobjs;
	bpobj_close(&subbpo);

	if (subsubobjs != 0) {
		if (dmu_object_info(bpo->bpo_os, subsubobjs, &doi) != 0)
			return;
		if (doi.doi_max_offset > doi.doi_data_block_size)
			copy_subsub = B_FALSE;
	}

	if (dmu_object_info(bpo->bpo_os, subobj, &doi) != 0)
		return;
	if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub)
		copy_bps = B_FALSE;

	if (copy_subsub && subsubobjs != 0) {
		if (bpo->bpo_phys->bpo_subobjs) {
			dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0,
			    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1,
			    ZIO_PRIORITY_ASYNC_READ);
		}
		dmu_prefetch(bpo->bpo_os, subsubobjs, 0, 0, 1,
		    ZIO_PRIORITY_ASYNC_READ);
	}

	if (copy_bps) {
		dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0,
		    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), 1,
		    ZIO_PRIORITY_ASYNC_READ);
		dmu_prefetch(bpo->bpo_os, subobj, 0, 0, 1,
		    ZIO_PRIORITY_ASYNC_READ);
	} else if (bpo->bpo_phys->bpo_subobjs) {
		dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0,
		    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1,
		    ZIO_PRIORITY_ASYNC_READ);
	}
}

void
bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
	blkptr_t stored_bp = *bp;
	uint64_t offset;
	int blkoff;
	blkptr_t *bparray;

	ASSERT(bpobj_is_open(bpo));
	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

	if (BP_IS_EMBEDDED(bp)) {
		/*
		 * The bpobj will compress better without the payload.
		 *
		 * Note that we store EMBEDDED bp's because they have an
		 * uncompressed size, which must be accounted for.  An
		 * alternative would be to add their size to bpo_uncomp
		 * without storing the bp, but that would create additional
		 * complications: bpo_uncomp would be inconsistent with the
		 * set of BP's stored, and bpobj_iterate() wouldn't visit
		 * all the space accounted for in the bpobj.
		 */
		memset(&stored_bp, 0, sizeof (stored_bp));
		stored_bp.blk_prop = bp->blk_prop;
		BP_SET_LOGICAL_BIRTH(&stored_bp, BP_GET_LOGICAL_BIRTH(bp));
	} else if (!BP_GET_DEDUP(bp)) {
		/* The bpobj will compress better without the checksum */
		memset(&stored_bp.blk_cksum, 0, sizeof (stored_bp.blk_cksum));
	}

	stored_bp.blk_fill = 0;
	BP_SET_FREE(&stored_bp, bp_freed);

	mutex_enter(&bpo->bpo_lock);

	offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
	blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);

	if (bpo->bpo_cached_dbuf == NULL ||
	    offset < bpo->bpo_cached_dbuf->db_offset ||
	    offset >= bpo->bpo_cached_dbuf->db_offset +
	    bpo->bpo_cached_dbuf->db_size) {
		if (bpo->bpo_cached_dbuf)
			dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
		VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
		    offset, bpo, &bpo->bpo_cached_dbuf, 0));
		ASSERT3P(bpo->bpo_cached_dbuf, !=, NULL);
	}

	dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
	bparray = bpo->bpo_cached_dbuf->db_data;
	bparray[blkoff] = stored_bp;

	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
	bpo->bpo_phys->bpo_num_blkptrs++;
	int sign = bp_freed ? -1 : +1;
	bpo->bpo_phys->bpo_bytes += sign *
	    bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
	if (bpo->bpo_havecomp) {
		bpo->bpo_phys->bpo_comp += sign * BP_GET_PSIZE(bp);
		bpo->bpo_phys->bpo_uncomp += sign * BP_GET_UCSIZE(bp);
	}
	if (bp_freed) {
		ASSERT(bpo->bpo_havefreed);
		bpo->bpo_phys->bpo_num_freed++;
	}
	mutex_exit(&bpo->bpo_lock);
}

struct space_range_arg {
	spa_t *spa;
	uint64_t mintxg;
	uint64_t maxtxg;
	uint64_t used;
	uint64_t comp;
	uint64_t uncomp;
};

static int
space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
{
	(void) bp_freed, (void) tx;
	struct space_range_arg *sra = arg;

	if (BP_GET_LOGICAL_BIRTH(bp) > sra->mintxg &&
	    BP_GET_LOGICAL_BIRTH(bp) <= sra->maxtxg) {
		if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
			sra->used += bp_get_dsize_sync(sra->spa, bp);
		else
			sra->used += bp_get_dsize(sra->spa, bp);
		sra->comp += BP_GET_PSIZE(bp);
		sra->uncomp += BP_GET_UCSIZE(bp);
	}
	return (0);
}

int
bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	ASSERT(bpobj_is_open(bpo));
	mutex_enter(&bpo->bpo_lock);

	*usedp = bpo->bpo_phys->bpo_bytes;
	if (bpo->bpo_havecomp) {
		*compp = bpo->bpo_phys->bpo_comp;
		*uncompp = bpo->bpo_phys->bpo_uncomp;
		mutex_exit(&bpo->bpo_lock);
		return (0);
	} else {
		mutex_exit(&bpo->bpo_lock);
		return (bpobj_space_range(bpo, 0, UINT64_MAX,
		    usedp, compp, uncompp));
	}
}

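/*
 * Illustrative example (hypothetical "snap_txg" variable): to account only
 * for the entries born after a given txg, a caller might use the range
 * variant defined below:
 *
 *	uint64_t used, comp, uncomp;
 *	VERIFY0(bpobj_space_range(bpo, snap_txg, UINT64_MAX,
 *	    &used, &comp, &uncomp));
 */
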
/*
 * Return the amount of space in the bpobj which is:
 * mintxg < logical birth <= maxtxg
 */
int
bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	struct space_range_arg sra = { 0 };
	int err;

	ASSERT(bpobj_is_open(bpo));

	/*
	 * As an optimization, if they want the whole txg range, just
	 * get bpo_bytes rather than iterating over the bps.
	 */
	if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
		return (bpobj_space(bpo, usedp, compp, uncompp));

	sra.spa = dmu_objset_spa(bpo->bpo_os);
	sra.mintxg = mintxg;
	sra.maxtxg = maxtxg;

	err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
	*usedp = sra.used;
	*compp = sra.comp;
	*uncompp = sra.uncomp;
	return (err);
}

/*
 * A bpobj_itor_t to append blkptrs to a bplist. Note that while blkptrs in a
 * bpobj are designated as free or allocated that information is not preserved
 * in bplists.
 */
int
bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
	(void) bp_freed, (void) tx;
	bplist_t *bpl = arg;
	bplist_append(bpl, bp);
	return (0);
}