/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2017 Datto Inc.
 */

#include <sys/bpobj.h>
#include <sys/zfs_context.h>
#include <sys/zfs_refcount.h>
#include <sys/dsl_pool.h>
#include <sys/zfeature.h>
#include <sys/zap.h>

/*
 * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj).
 */
uint64_t
bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
{
	spa_t *spa = dmu_objset_spa(os);
	dsl_pool_t *dp = dmu_objset_pool(os);

	if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
		if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
			ASSERT0(dp->dp_empty_bpobj);
			dp->dp_empty_bpobj =
			    bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
			VERIFY(zap_add(os,
			    DMU_POOL_DIRECTORY_OBJECT,
			    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
			    &dp->dp_empty_bpobj, tx) == 0);
		}
		spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx);
		ASSERT(dp->dp_empty_bpobj != 0);
		return (dp->dp_empty_bpobj);
	} else {
		return (bpobj_alloc(os, blocksize, tx));
	}
}

void
bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dmu_objset_pool(os);

	spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx);
	if (!spa_feature_is_active(dmu_objset_spa(os),
	    SPA_FEATURE_EMPTY_BPOBJ)) {
		VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_EMPTY_BPOBJ, tx));
		VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
		dp->dp_empty_bpobj = 0;
	}
}

uint64_t
bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
{
	int size;

	if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
		size = BPOBJ_SIZE_V0;
	else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
		size = BPOBJ_SIZE_V1;
	else if (!spa_feature_is_active(dmu_objset_spa(os),
	    SPA_FEATURE_LIVELIST))
		size = BPOBJ_SIZE_V2;
	else
		size = sizeof (bpobj_phys_t);

	return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
	    DMU_OT_BPOBJ_HDR, size, tx));
}

void
bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
{
	int64_t i;
	bpobj_t bpo;
	dmu_object_info_t doi;
	int epb;
	dmu_buf_t *dbuf = NULL;

	ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
	VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));

	mutex_enter(&bpo.bpo_lock);

	if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
		goto out;

	VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
	epb = doi.doi_data_block_size / sizeof (uint64_t);

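	/*
	 * Recursively free each subobj, walking backward through the
	 * subobj array one cached block at a time.
	 */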
	for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
		uint64_t *objarray;
		uint64_t offset, blkoff;

		offset = i * sizeof (uint64_t);
		blkoff = P2PHASE(i, epb);

		if (dbuf == NULL || dbuf->db_offset > offset) {
			if (dbuf)
				dmu_buf_rele(dbuf, FTAG);
			VERIFY3U(0, ==, dmu_buf_hold(os,
			    bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
		}

		ASSERT3U(offset, >=, dbuf->db_offset);
		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

		objarray = dbuf->db_data;
		bpobj_free(os, objarray[blkoff], tx);
	}
	if (dbuf) {
		dmu_buf_rele(dbuf, FTAG);
		dbuf = NULL;
	}
	VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));

out:
	mutex_exit(&bpo.bpo_lock);
	bpobj_close(&bpo);

	VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
}

int
bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
{
	dmu_object_info_t doi;
	int err;

	err = dmu_object_info(os, object, &doi);
	if (err)
		return (err);

	memset(bpo, 0, sizeof (*bpo));
	mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);

	ASSERT(bpo->bpo_dbuf == NULL);
	ASSERT(bpo->bpo_phys == NULL);
	ASSERT(object != 0);
	ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
	ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);

	err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
	if (err)
		return (err);

	bpo->bpo_os = os;
	bpo->bpo_object = object;
	bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
	bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
	bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
	bpo->bpo_havefreed = (doi.doi_bonus_size > BPOBJ_SIZE_V2);
	bpo->bpo_phys = bpo->bpo_dbuf->db_data;
	return (0);
}

boolean_t
bpobj_is_open(const bpobj_t *bpo)
{
	return (bpo->bpo_object != 0);
}

void
bpobj_close(bpobj_t *bpo)
{
	/* Lame workaround for closing a bpobj that was never opened. */
	if (bpo->bpo_object == 0)
		return;

	dmu_buf_rele(bpo->bpo_dbuf, bpo);
	if (bpo->bpo_cached_dbuf != NULL)
		dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
	bpo->bpo_dbuf = NULL;
	bpo->bpo_phys = NULL;
	bpo->bpo_cached_dbuf = NULL;
	bpo->bpo_object = 0;

	mutex_destroy(&bpo->bpo_lock);
}

static boolean_t
bpobj_is_empty_impl(bpobj_t *bpo)
{
	ASSERT(MUTEX_HELD(&bpo->bpo_lock));
	return (bpo->bpo_phys->bpo_num_blkptrs == 0 &&
	    (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0));
}

boolean_t
bpobj_is_empty(bpobj_t *bpo)
{
	mutex_enter(&bpo->bpo_lock);
	boolean_t is_empty = bpobj_is_empty_impl(bpo);
	mutex_exit(&bpo->bpo_lock);
	return (is_empty);
}

/*
 * A recursive iteration of the bpobjs would be nice here but we run the risk
 * of overflowing function stack space.  Instead, find each subobj and add it
 * to the head of our list so it can be scanned for subobjs.  Like a
 * recursive implementation, the "deepest" subobjs will be freed first.
 * When a subobj is found to have no additional subobjs, free it.
 */
typedef struct bpobj_info {
	bpobj_t *bpi_bpo;
	/*
	 * This object is a subobj of bpi_parent,
	 * at bpi_index in its subobj array.
	 */
	struct bpobj_info *bpi_parent;
	uint64_t bpi_index;
	/* How many of our subobjs are left to process. */
	uint64_t bpi_unprocessed_subobjs;
	/* True after having visited this bpo's directly referenced BPs. */
	boolean_t bpi_visited;
	list_node_t bpi_node;
} bpobj_info_t;

static bpobj_info_t *
bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index)
{
	bpobj_info_t *bpi = kmem_zalloc(sizeof (bpobj_info_t), KM_SLEEP);
	bpi->bpi_bpo = bpo;
	bpi->bpi_parent = parent;
	bpi->bpi_index = index;
	if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
		bpi->bpi_unprocessed_subobjs = bpo->bpo_phys->bpo_num_subobjs;
	}
	return (bpi);
}

/*
 * Update bpobj and all of its parents with new space accounting.
 */
static void
propagate_space_reduction(bpobj_info_t *bpi, int64_t freed,
    int64_t comp_freed, int64_t uncomp_freed, dmu_tx_t *tx)
{

	for (; bpi != NULL; bpi = bpi->bpi_parent) {
		bpobj_t *p = bpi->bpi_bpo;
		ASSERT(dmu_buf_is_dirty(p->bpo_dbuf, tx));
		p->bpo_phys->bpo_bytes -= freed;
		ASSERT3S(p->bpo_phys->bpo_bytes, >=, 0);
		if (p->bpo_havecomp) {
			p->bpo_phys->bpo_comp -= comp_freed;
			p->bpo_phys->bpo_uncomp -= uncomp_freed;
		}
	}
}

static int
bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
    int64_t start, dmu_tx_t *tx, boolean_t free)
{
	int err = 0;
	int64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
	dmu_buf_t *dbuf = NULL;
	bpobj_t *bpo = bpi->bpi_bpo;

	int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1;
	uint64_t pe = P2ALIGN_TYPED(i, bpo->bpo_epb, uint64_t) *
	    sizeof (blkptr_t);
	uint64_t ps = start * sizeof (blkptr_t);
	uint64_t pb = MAX((pe > dmu_prefetch_max) ? pe - dmu_prefetch_max : 0,
	    ps);
	if (pe > pb) {
		dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, pb, pe - pb,
		    ZIO_PRIORITY_ASYNC_READ);
	}
	for (; i >= start; i--) {
		uint64_t offset = i * sizeof (blkptr_t);
		uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);

		if (dbuf == NULL || dbuf->db_offset > offset) {
			if (dbuf)
				dmu_buf_rele(dbuf, FTAG);
			err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
			    offset, FTAG, &dbuf, DMU_READ_NO_PREFETCH);
			if (err)
				break;
			pe = pb;
			pb = MAX((dbuf->db_offset > dmu_prefetch_max) ?
			    dbuf->db_offset - dmu_prefetch_max : 0, ps);
			if (pe > pb) {
				dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0,
				    pb, pe - pb, ZIO_PRIORITY_ASYNC_READ);
			}
		}

		ASSERT3U(offset, >=, dbuf->db_offset);
		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

		blkptr_t *bparray = dbuf->db_data;
		blkptr_t *bp = &bparray[blkoff];

		boolean_t bp_freed = BP_GET_FREE(bp);
		err = func(arg, bp, bp_freed, tx);
		if (err)
			break;

		if (free) {
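			/*
			 * Entries marked FREE (recorded by livelists) were
			 * enqueued with negative space accounting, so they
			 * contribute with the opposite sign when removed.
			 */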
			int sign = bp_freed ? -1 : +1;
			spa_t *spa = dmu_objset_spa(bpo->bpo_os);
			freed += sign * bp_get_dsize_sync(spa, bp);
			comp_freed += sign * BP_GET_PSIZE(bp);
			uncomp_freed += sign * BP_GET_UCSIZE(bp);
			ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx));
			bpo->bpo_phys->bpo_num_blkptrs--;
			ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
			if (bp_freed) {
				ASSERT(bpo->bpo_havefreed);
				bpo->bpo_phys->bpo_num_freed--;
				ASSERT3S(bpo->bpo_phys->bpo_num_freed, >=, 0);
			}
		}
	}
	if (free) {
		propagate_space_reduction(bpi, freed, comp_freed,
		    uncomp_freed, tx);
		VERIFY0(dmu_free_range(bpo->bpo_os,
		    bpo->bpo_object,
		    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
		    DMU_OBJECT_END, tx));
	}
	if (dbuf) {
		dmu_buf_rele(dbuf, FTAG);
		dbuf = NULL;
	}
	return (err);
}

/*
 * Given an initial bpo, start by freeing the BPs that are directly referenced
 * by that bpo. If the bpo has subobjs, read in its last subobj and push the
 * subobj to our stack. By popping items off our stack, eventually we will
 * encounter a bpo that has no subobjs. We can free its bpobj_info_t, and if
 * requested also free the now-empty bpo from disk and decrement
 * its parent's subobj count. We continue popping each subobj from our stack,
 * visiting its last subobj until they too have no more subobjs, and so on.
 */
static int
bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
    dmu_tx_t *tx, boolean_t free, uint64_t *bpobj_size)
{
	list_t stack;
	bpobj_info_t *bpi;
	int err = 0;

	/*
	 * Create a "stack" for us to work with without worrying about
	 * stack overflows. Initialize it with the initial_bpo.
	 */
	list_create(&stack, sizeof (bpobj_info_t),
	    offsetof(bpobj_info_t, bpi_node));
	mutex_enter(&initial_bpo->bpo_lock);

	if (bpobj_size != NULL)
		*bpobj_size = initial_bpo->bpo_phys->bpo_num_blkptrs;

	list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0));

	while ((bpi = list_head(&stack)) != NULL) {
		bpobj_t *bpo = bpi->bpi_bpo;

		ASSERT3P(bpo, !=, NULL);
		ASSERT(MUTEX_HELD(&bpo->bpo_lock));
		ASSERT(bpobj_is_open(bpo));

		if (free)
			dmu_buf_will_dirty(bpo->bpo_dbuf, tx);

		if (bpi->bpi_visited == B_FALSE) {
			err = bpobj_iterate_blkptrs(bpi, func, arg, 0, tx,
			    free);
			bpi->bpi_visited = B_TRUE;
			if (err != 0)
				break;
		}
		/*
		 * We've finished with this bpo's directly-referenced BP's and
		 * it has no more unprocessed subobjs. We can free its
		 * bpobj_info_t (unless it is the topmost, initial_bpo).
		 * If we are freeing from disk, we can also do that.
		 */
		if (bpi->bpi_unprocessed_subobjs == 0) {
			/*
			 * If there are no entries, there should
			 * be no bytes.
			 */
			if (bpobj_is_empty_impl(bpo)) {
				ASSERT0(bpo->bpo_phys->bpo_bytes);
				ASSERT0(bpo->bpo_phys->bpo_comp);
				ASSERT0(bpo->bpo_phys->bpo_uncomp);
			}

			/* The initial_bpo has no parent and is not closed. */
			if (bpi->bpi_parent != NULL) {
				if (free) {
					bpobj_t *p = bpi->bpi_parent->bpi_bpo;

					ASSERT0(bpo->bpo_phys->bpo_num_blkptrs);
					ASSERT3U(p->bpo_phys->bpo_num_subobjs,
					    >, 0);
					ASSERT3U(bpi->bpi_index, ==,
					    p->bpo_phys->bpo_num_subobjs - 1);
					ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf,
					    tx));

					p->bpo_phys->bpo_num_subobjs--;

					VERIFY0(dmu_free_range(p->bpo_os,
					    p->bpo_phys->bpo_subobjs,
					    bpi->bpi_index * sizeof (uint64_t),
					    sizeof (uint64_t), tx));

					/* eliminate the empty subobj list */
					if (bpo->bpo_havesubobj &&
					    bpo->bpo_phys->bpo_subobjs != 0) {
						ASSERT0(bpo->bpo_phys->
						    bpo_num_subobjs);
						err = dmu_object_free(
						    bpo->bpo_os,
						    bpo->bpo_phys->bpo_subobjs,
						    tx);
						if (err)
							break;
						bpo->bpo_phys->bpo_subobjs = 0;
					}
					err = dmu_object_free(p->bpo_os,
					    bpo->bpo_object, tx);
					if (err)
						break;
				}

				mutex_exit(&bpo->bpo_lock);
				bpobj_close(bpo);
				kmem_free(bpo, sizeof (bpobj_t));
			} else {
				mutex_exit(&bpo->bpo_lock);
			}

			/*
			 * Finished processing this bpo. Unlock, and free
			 * our "stack" info.
			 */
			list_remove_head(&stack);
			kmem_free(bpi, sizeof (bpobj_info_t));
		} else {
			/*
			 * We have unprocessed subobjs. Process the next one.
			 */
			ASSERT(bpo->bpo_havecomp);
			ASSERT3P(bpobj_size, ==, NULL);

			/* Add the last subobj to stack. */
			int64_t i = bpi->bpi_unprocessed_subobjs - 1;
			uint64_t offset = i * sizeof (uint64_t);

			uint64_t subobj;
			err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
			    offset, sizeof (uint64_t), &subobj,
			    DMU_READ_NO_PREFETCH);
			if (err)
				break;

			bpobj_t *subbpo = kmem_alloc(sizeof (bpobj_t),
			    KM_SLEEP);
			err = bpobj_open(subbpo, bpo->bpo_os, subobj);
			if (err) {
				kmem_free(subbpo, sizeof (bpobj_t));
				break;
			}

			if (subbpo->bpo_havesubobj &&
			    subbpo->bpo_phys->bpo_subobjs != 0) {
				dmu_prefetch(subbpo->bpo_os,
				    subbpo->bpo_phys->bpo_subobjs, 0, 0, 0,
				    ZIO_PRIORITY_ASYNC_READ);
			}

			list_insert_head(&stack, bpi_alloc(subbpo, bpi, i));
			mutex_enter(&subbpo->bpo_lock);
			bpi->bpi_unprocessed_subobjs--;
		}
	}
	/*
	 * Cleanup anything left on the "stack" after we left the loop.
	 * Every bpo on the stack is locked so we must remember to undo
	 * that now (in LIFO order).
	 */
	while ((bpi = list_remove_head(&stack)) != NULL) {
		bpobj_t *bpo = bpi->bpi_bpo;
		ASSERT(err != 0);
		ASSERT3P(bpo, !=, NULL);

		mutex_exit(&bpo->bpo_lock);

		/* do not free the initial_bpo */
		if (bpi->bpi_parent != NULL) {
			bpobj_close(bpi->bpi_bpo);
			kmem_free(bpi->bpi_bpo, sizeof (bpobj_t));
		}
		kmem_free(bpi, sizeof (bpobj_info_t));
	}

	list_destroy(&stack);

	return (err);
}

/*
 * Iterate and remove the entries. If func returns nonzero, iteration
 * will stop and that entry will not be removed.
 */
int
bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
{
	return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE, NULL));
}

/*
 * Iterate the entries. If func returns nonzero, iteration will stop.
 *
 * If there are no subobjs:
 *
 * *bpobj_size can be used to return the number of block pointers in the
 * bpobj. Note that this may be different from the number of block pointers
 * that are iterated over, if iteration is terminated early (e.g. by the func
 * returning nonzero).
 *
 * If there are concurrent (or subsequent) modifications to the bpobj then the
 * returned *bpobj_size can be passed as "start" to
 * livelist_bpobj_iterate_from_nofree() to iterate the newly added entries.
 */
int
bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
    uint64_t *bpobj_size)
{
	return (bpobj_iterate_impl(bpo, func, arg, NULL, B_FALSE, bpobj_size));
}

/*
 * Iterate over the blkptrs in the bpobj beginning at index start. If func
 * returns nonzero, iteration will stop. This is a livelist specific function
 * since it assumes that there are no subobjs present.
 */
int
livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
    int64_t start)
{
	if (bpo->bpo_havesubobj)
		VERIFY0(bpo->bpo_phys->bpo_subobjs);
	bpobj_info_t *bpi = bpi_alloc(bpo, NULL, 0);
	int err = bpobj_iterate_blkptrs(bpi, func, arg, start, NULL, B_FALSE);
	kmem_free(bpi, sizeof (bpobj_info_t));
	return (err);
}

/*
 * Logically add subobj's contents to the parent bpobj.
 *
 * In the most general case, this is accomplished in constant time by adding
 * a reference to subobj.  This case is used when enqueuing a large subobj:
 *     +--------------+                        +--------------+
 *     | bpobj        |----------------------->| subobj list  |
 *     +----+----+----+----+----+              +-----+-----+--+--+
 *     | bp | bp | bp | bp | bp |              | obj | obj | obj |
 *     +----+----+----+----+----+              +-----+-----+-----+
 *
 *     +--------------+                        +--------------+
 *     | sub-bpobj    |----------------------> | subsubobj    |
 *     +----+----+----+----+---------+----+    +-----+-----+--+--------+-----+
 *     | bp | bp | bp | bp |   ...   | bp |    | obj | obj |    ...    | obj |
 *     +----+----+----+----+---------+----+    +-----+-----+-----------+-----+
 *
 * Result: sub-bpobj added to parent's subobj list.
 *     +--------------+                        +--------------+
 *     | bpobj        |----------------------->| subobj list  |
 *     +----+----+----+----+----+              +-----+-----+--+--+-----+
 *     | bp | bp | bp | bp | bp |              | obj | obj | obj | OBJ |
 *     +----+----+----+----+----+              +-----+-----+-----+--|--+
 *                                                                  |
 *       /----------------------------------------------------------/
 *       v
 *     +--------------+                        +--------------+
 *     | sub-bpobj    |----------------------> | subsubobj    |
 *     +----+----+----+----+---------+----+    +-----+-----+--+--------+-----+
 *     | bp | bp | bp | bp |   ...   | bp |    | obj | obj |    ...    | obj |
 *     +----+----+----+----+---------+----+    +-----+-----+-----------+-----+
 *
 *
 * In a common case, the subobj is small: its bp's and its list of subobj's
 * are each stored in a single block.
 * In this case we copy the subobj's contents to the parent:
 *     +--------------+                        +--------------+
 *     | bpobj        |----------------------->| subobj list  |
 *     +----+----+----+----+----+              +-----+-----+--+--+
 *     | bp | bp | bp | bp | bp |              | obj | obj | obj |
 *     +----+----+----+----+----+              +-----+-----+-----+
 *                              ^                                ^
 *     +--------------+         |              +--------------+  |
 *     | sub-bpobj    |---------^------------> | subsubobj    |  ^
 *     +----+----+----+         |              +-----+-----+--+  |
 *     | BP | BP |-->-->-->-->-/               | OBJ | OBJ |-->-/
 *     +----+----+                             +-----+-----+
 *
 * Result: subobj destroyed, contents copied to parent:
 *     +--------------+                        +--------------+
 *     | bpobj        |----------------------->| subobj list  |
 *     +----+----+----+----+----+----+----+    +-----+-----+--+--+-----+-----+
 *     | bp | bp | bp | bp | bp | BP | BP |    | obj | obj | obj | OBJ | OBJ |
 *     +----+----+----+----+----+----+----+    +-----+-----+-----+-----+-----+
 *
 *
 * If the subobj has many BP's but few subobj's, we can copy the sub-subobj's
 * but retain the sub-bpobj:
 *     +--------------+                        +--------------+
 *     | bpobj        |----------------------->| subobj list  |
 *     +----+----+----+----+----+              +-----+-----+--+--+
 *     | bp | bp | bp | bp | bp |              | obj | obj | obj |
 *     +----+----+----+----+----+              +-----+-----+-----+
 *                                                               ^
 *     +--------------+                        +--------------+  |
 *     | sub-bpobj    |----------------------> | subsubobj    |  ^
 *     +----+----+----+----+---------+----+    +-----+-----+--+  |
 *     | bp | bp | bp | bp |   ...   | bp |    | OBJ | OBJ |-->-/
 *     +----+----+----+----+---------+----+    +-----+-----+
 *
 * Result: sub-sub-bpobjs and subobj added to parent's subobj list.
 *     +--------------+                     +--------------+
 *     | bpobj        |-------------------->| subobj list  |
 *     +----+----+----+----+----+           +-----+-----+--+--+-----+-----+------+
 *     | bp | bp | bp | bp | bp |           | obj | obj | obj | OBJ | OBJ | OBJ* |
 *     +----+----+----+----+----+           +-----+-----+-----+-----+-----+--|---+
 *                                                                           |
 *       /-------------------------------------------------------------------/
 *       v
 *     +--------------+
 *     | sub-bpobj    |
 *     +----+----+----+----+---------+----+
 *     | bp | bp | bp | bp |   ...   | bp |
 *     +----+----+----+----+---------+----+
 */
void
bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
{
	bpobj_t subbpo;
	uint64_t used, comp, uncomp, subsubobjs;
	boolean_t copy_subsub = B_TRUE;
	boolean_t copy_bps = B_TRUE;

	ASSERT(bpobj_is_open(bpo));
	ASSERT(subobj != 0);
	ASSERT(bpo->bpo_havesubobj);
	ASSERT(bpo->bpo_havecomp);
	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

	if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
		bpobj_decr_empty(bpo->bpo_os, tx);
		return;
	}

	VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
	if (bpobj_is_empty(&subbpo)) {
		/* No point in having an empty subobj. */
		bpobj_close(&subbpo);
		bpobj_free(bpo->bpo_os, subobj, tx);
		return;
	}
	VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));

	mutex_enter(&bpo->bpo_lock);
	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);

	dmu_object_info_t doi;

	if (bpo->bpo_phys->bpo_subobjs != 0) {
		ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
		    &doi));
		ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
	}

	/*
	 * If subobj has only one block of subobjs, then move subobj's
	 * subobjs to bpo's subobj list directly.
	 * This reduces recursion in bpobj_iterate due to nested subobjs.
	 */
	subsubobjs = subbpo.bpo_phys->bpo_subobjs;
	if (subsubobjs != 0) {
		VERIFY0(dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
		if (doi.doi_max_offset > doi.doi_data_block_size) {
			copy_subsub = B_FALSE;
		}
	}

	/*
	 * If, in addition to having only one block of subobj's, subobj has
	 * only one block of bp's, then move subobj's bp's to bpo's bp list
	 * directly. This reduces recursion in bpobj_iterate due to nested
	 * subobjs.
	 */
	VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subobj, &doi));
	if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) {
		copy_bps = B_FALSE;
	}

	if (copy_subsub && subsubobjs != 0) {
		dmu_buf_t *subdb;
		uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;

		VERIFY0(dmu_buf_hold(bpo->bpo_os, subsubobjs,
		    0, FTAG, &subdb, 0));
		/*
		 * Make sure that we are not asking dmu_write()
		 * to write more data than we have in our buffer.
		 */
		VERIFY3U(subdb->db_size, >=,
		    numsubsub * sizeof (subobj));
		if (bpo->bpo_phys->bpo_subobjs == 0) {
			bpo->bpo_phys->bpo_subobjs =
			    dmu_object_alloc(bpo->bpo_os,
			    DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
			    DMU_OT_NONE, 0, tx);
		}
		dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
		    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
		    numsubsub * sizeof (subobj), subdb->db_data, tx);
		dmu_buf_rele(subdb, FTAG);
		bpo->bpo_phys->bpo_num_subobjs += numsubsub;

		dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
		subbpo.bpo_phys->bpo_subobjs = 0;
		VERIFY0(dmu_object_free(bpo->bpo_os, subsubobjs, tx));
	}

	if (copy_bps) {
		dmu_buf_t *bps;
		uint64_t numbps = subbpo.bpo_phys->bpo_num_blkptrs;

		ASSERT(copy_subsub);
		VERIFY0(dmu_buf_hold(bpo->bpo_os, subobj,
		    0, FTAG, &bps, 0));

		/*
		 * Make sure that we are not asking dmu_write()
		 * to write more data than we have in our buffer.
		 */
		VERIFY3U(bps->db_size, >=, numbps * sizeof (blkptr_t));
		dmu_write(bpo->bpo_os, bpo->bpo_object,
		    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
		    numbps * sizeof (blkptr_t),
		    bps->db_data, tx);
		dmu_buf_rele(bps, FTAG);
		bpo->bpo_phys->bpo_num_blkptrs += numbps;

		bpobj_close(&subbpo);
		VERIFY0(dmu_object_free(bpo->bpo_os, subobj, tx));
	} else {
		bpobj_close(&subbpo);
		if (bpo->bpo_phys->bpo_subobjs == 0) {
			bpo->bpo_phys->bpo_subobjs =
			    dmu_object_alloc(bpo->bpo_os,
			    DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
			    DMU_OT_NONE, 0, tx);
		}

		dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
		    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
		    sizeof (subobj), &subobj, tx);
		bpo->bpo_phys->bpo_num_subobjs++;
	}

	bpo->bpo_phys->bpo_bytes += used;
	bpo->bpo_phys->bpo_comp += comp;
	bpo->bpo_phys->bpo_uncomp += uncomp;
	mutex_exit(&bpo->bpo_lock);

}

/*
 * Prefetch metadata required for bpobj_enqueue_subobj().
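 * The copy_subsub and copy_bps decisions below mirror the ones made in
 * bpobj_enqueue_subobj(), so the blocks prefetched here are the ones that
 * function will read or rewrite.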
 */
void
bpobj_prefetch_subobj(bpobj_t *bpo, uint64_t subobj)
{
	dmu_object_info_t doi;
	bpobj_t subbpo;
	uint64_t subsubobjs;
	boolean_t copy_subsub = B_TRUE;
	boolean_t copy_bps = B_TRUE;

	ASSERT(bpobj_is_open(bpo));
	ASSERT(subobj != 0);

	if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj)
		return;

	if (bpobj_open(&subbpo, bpo->bpo_os, subobj) != 0)
		return;
	if (bpobj_is_empty(&subbpo)) {
		bpobj_close(&subbpo);
		return;
	}
	subsubobjs = subbpo.bpo_phys->bpo_subobjs;
	bpobj_close(&subbpo);

	if (subsubobjs != 0) {
		if (dmu_object_info(bpo->bpo_os, subsubobjs, &doi) != 0)
			return;
		if (doi.doi_max_offset > doi.doi_data_block_size)
			copy_subsub = B_FALSE;
	}

	if (dmu_object_info(bpo->bpo_os, subobj, &doi) != 0)
		return;
	if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub)
		copy_bps = B_FALSE;

	if (copy_subsub && subsubobjs != 0) {
		if (bpo->bpo_phys->bpo_subobjs) {
			dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0,
			    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1,
			    ZIO_PRIORITY_ASYNC_READ);
		}
		dmu_prefetch(bpo->bpo_os, subsubobjs, 0, 0, 1,
		    ZIO_PRIORITY_ASYNC_READ);
	}

	if (copy_bps) {
		dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0,
		    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), 1,
		    ZIO_PRIORITY_ASYNC_READ);
		dmu_prefetch(bpo->bpo_os, subobj, 0, 0, 1,
		    ZIO_PRIORITY_ASYNC_READ);
	} else if (bpo->bpo_phys->bpo_subobjs) {
		dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0,
		    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1,
		    ZIO_PRIORITY_ASYNC_READ);
	}
}

void
bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
	blkptr_t stored_bp = *bp;
	uint64_t offset;
	int blkoff;
	blkptr_t *bparray;

	ASSERT(bpobj_is_open(bpo));
	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

	if (BP_IS_EMBEDDED(bp)) {
		/*
		 * The bpobj will compress better without the payload.
		 *
		 * Note that we store EMBEDDED bp's because they have an
		 * uncompressed size, which must be accounted for. An
		 * alternative would be to add their size to bpo_uncomp
		 * without storing the bp, but that would create additional
		 * complications: bpo_uncomp would be inconsistent with the
		 * set of BP's stored, and bpobj_iterate() wouldn't visit
		 * all the space accounted for in the bpobj.
		 */
		memset(&stored_bp, 0, sizeof (stored_bp));
		stored_bp.blk_prop = bp->blk_prop;
		BP_SET_LOGICAL_BIRTH(&stored_bp, BP_GET_LOGICAL_BIRTH(bp));
	} else if (!BP_GET_DEDUP(bp)) {
		/* The bpobj will compress better without the checksum */
		memset(&stored_bp.blk_cksum, 0, sizeof (stored_bp.blk_cksum));
	}

	stored_bp.blk_fill = 0;
	BP_SET_FREE(&stored_bp, bp_freed);

	mutex_enter(&bpo->bpo_lock);

	offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
	blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);

	if (bpo->bpo_cached_dbuf == NULL ||
	    offset < bpo->bpo_cached_dbuf->db_offset ||
	    offset >= bpo->bpo_cached_dbuf->db_offset +
	    bpo->bpo_cached_dbuf->db_size) {
		if (bpo->bpo_cached_dbuf)
			dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
		VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
		    offset, bpo, &bpo->bpo_cached_dbuf, 0));
		ASSERT3P(bpo->bpo_cached_dbuf, !=, NULL);
	}

	dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
	bparray = bpo->bpo_cached_dbuf->db_data;
	bparray[blkoff] = stored_bp;

	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
	bpo->bpo_phys->bpo_num_blkptrs++;
	int sign = bp_freed ? -1 : +1;
	bpo->bpo_phys->bpo_bytes += sign *
	    bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
	if (bpo->bpo_havecomp) {
		bpo->bpo_phys->bpo_comp += sign * BP_GET_PSIZE(bp);
		bpo->bpo_phys->bpo_uncomp += sign * BP_GET_UCSIZE(bp);
	}
	if (bp_freed) {
		ASSERT(bpo->bpo_havefreed);
		bpo->bpo_phys->bpo_num_freed++;
	}
	mutex_exit(&bpo->bpo_lock);
}

struct space_range_arg {
	spa_t *spa;
	uint64_t mintxg;
	uint64_t maxtxg;
	uint64_t used;
	uint64_t comp;
	uint64_t uncomp;
};

static int
space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
{
	(void) bp_freed, (void) tx;
	struct space_range_arg *sra = arg;

	if (BP_GET_LOGICAL_BIRTH(bp) > sra->mintxg &&
	    BP_GET_LOGICAL_BIRTH(bp) <= sra->maxtxg) {
		if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
			sra->used += bp_get_dsize_sync(sra->spa, bp);
		else
			sra->used += bp_get_dsize(sra->spa, bp);
		sra->comp += BP_GET_PSIZE(bp);
		sra->uncomp += BP_GET_UCSIZE(bp);
	}
	return (0);
}

int
bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	ASSERT(bpobj_is_open(bpo));
	mutex_enter(&bpo->bpo_lock);

	*usedp = bpo->bpo_phys->bpo_bytes;
	if (bpo->bpo_havecomp) {
		*compp = bpo->bpo_phys->bpo_comp;
		*uncompp = bpo->bpo_phys->bpo_uncomp;
		mutex_exit(&bpo->bpo_lock);
		return (0);
	} else {
		mutex_exit(&bpo->bpo_lock);
		return (bpobj_space_range(bpo, 0, UINT64_MAX,
		    usedp, compp, uncompp));
	}
}

/*
 * Return the amount of space in the bpobj which is:
 * mintxg < logical birth <= maxtxg
 */
int
bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	struct space_range_arg sra = { 0 };
	int err;

	ASSERT(bpobj_is_open(bpo));

	/*
	 * As an optimization, if they want the whole txg range, just
	 * get bpo_bytes rather than iterating over the bps.
	 */
	if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
		return (bpobj_space(bpo, usedp, compp, uncompp));

	sra.spa = dmu_objset_spa(bpo->bpo_os);
	sra.mintxg = mintxg;
	sra.maxtxg = maxtxg;

	err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
	*usedp = sra.used;
	*compp = sra.comp;
	*uncompp = sra.uncomp;
	return (err);
}

/*
 * A bpobj_itor_t to append blkptrs to a bplist. Note that while blkptrs in a
 * bpobj are designated as free or allocated, that information is not
 * preserved in bplists.
 */
int
bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
    dmu_tx_t *tx)
{
	(void) bp_freed, (void) tx;
	bplist_t *bpl = arg;
	bplist_append(bpl, bp);
	return (0);
}