1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 24 * Copyright (c) 2017 Datto Inc. 25 */ 26 27 #include <sys/bpobj.h> 28 #include <sys/zfs_context.h> 29 #include <sys/zfs_refcount.h> 30 #include <sys/dsl_pool.h> 31 #include <sys/zfeature.h> 32 #include <sys/zap.h> 33 34 /* 35 * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj). 36 */ 37 uint64_t 38 bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx) 39 { 40 spa_t *spa = dmu_objset_spa(os); 41 dsl_pool_t *dp = dmu_objset_pool(os); 42 43 if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) { 44 if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) { 45 ASSERT0(dp->dp_empty_bpobj); 46 dp->dp_empty_bpobj = 47 bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx); 48 VERIFY(zap_add(os, 49 DMU_POOL_DIRECTORY_OBJECT, 50 DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1, 51 &dp->dp_empty_bpobj, tx) == 0); 52 } 53 spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx); 54 ASSERT(dp->dp_empty_bpobj != 0); 55 return (dp->dp_empty_bpobj); 56 } else { 57 return (bpobj_alloc(os, blocksize, tx)); 58 } 59 } 60 61 void 62 bpobj_decr_empty(objset_t *os, dmu_tx_t *tx) 63 { 64 dsl_pool_t *dp = dmu_objset_pool(os); 65 66 spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx); 67 if (!spa_feature_is_active(dmu_objset_spa(os), 68 SPA_FEATURE_EMPTY_BPOBJ)) { 69 VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset, 70 DMU_POOL_DIRECTORY_OBJECT, 71 DMU_POOL_EMPTY_BPOBJ, tx)); 72 VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx)); 73 dp->dp_empty_bpobj = 0; 74 } 75 } 76 77 uint64_t 78 bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) 79 { 80 int size; 81 82 if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT) 83 size = BPOBJ_SIZE_V0; 84 else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) 85 size = BPOBJ_SIZE_V1; 86 else if (!spa_feature_is_active(dmu_objset_spa(os), 87 SPA_FEATURE_LIVELIST)) 88 size = BPOBJ_SIZE_V2; 89 else 90 size = sizeof (bpobj_phys_t); 91 92 return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize, 93 DMU_OT_BPOBJ_HDR, size, tx)); 94 } 95 96 void 97 bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) 98 { 99 int64_t i; 100 bpobj_t bpo; 101 dmu_object_info_t doi; 102 int epb; 103 dmu_buf_t *dbuf = NULL; 104 105 ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj); 106 VERIFY3U(0, ==, bpobj_open(&bpo, os, obj)); 107 108 mutex_enter(&bpo.bpo_lock); 109 110 if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0) 111 goto out; 112 113 VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi)); 114 epb = doi.doi_data_block_size / sizeof (uint64_t); 115 116 for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { 117 uint64_t *objarray; 118 uint64_t offset, blkoff; 119 120 offset = i * sizeof (uint64_t); 121 blkoff = P2PHASE(i, epb); 122 123 if (dbuf == NULL || dbuf->db_offset > offset) { 124 if (dbuf) 125 dmu_buf_rele(dbuf, FTAG); 126 VERIFY3U(0, ==, dmu_buf_hold(os, 127 bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0)); 128 } 129 130 ASSERT3U(offset, >=, dbuf->db_offset); 131 ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); 132 133 objarray = dbuf->db_data; 134 bpobj_free(os, objarray[blkoff], tx); 135 } 136 if (dbuf) { 137 dmu_buf_rele(dbuf, FTAG); 138 dbuf = NULL; 139 } 140 VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx)); 141 142 out: 143 mutex_exit(&bpo.bpo_lock); 144 bpobj_close(&bpo); 145 146 VERIFY3U(0, ==, dmu_object_free(os, obj, tx)); 147 } 148 149 int 150 bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) 151 { 152 dmu_object_info_t doi; 153 int err; 154 155 err = dmu_object_info(os, object, &doi); 156 if (err) 157 return (err); 158 159 memset(bpo, 0, sizeof (*bpo)); 160 mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL); 161 162 ASSERT(bpo->bpo_dbuf == NULL); 163 ASSERT(bpo->bpo_phys == NULL); 164 ASSERT(object != 0); 165 ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ); 166 ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR); 167 168 err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf); 169 if (err) 170 return (err); 171 172 bpo->bpo_os = os; 173 bpo->bpo_object = object; 174 bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT; 175 bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0); 176 bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1); 177 bpo->bpo_havefreed = (doi.doi_bonus_size > BPOBJ_SIZE_V2); 178 bpo->bpo_phys = bpo->bpo_dbuf->db_data; 179 return (0); 180 } 181 182 boolean_t 183 bpobj_is_open(const bpobj_t *bpo) 184 { 185 return (bpo->bpo_object != 0); 186 } 187 188 void 189 bpobj_close(bpobj_t *bpo) 190 { 191 /* Lame workaround for closing a bpobj that was never opened. */ 192 if (bpo->bpo_object == 0) 193 return; 194 195 dmu_buf_rele(bpo->bpo_dbuf, bpo); 196 if (bpo->bpo_cached_dbuf != NULL) 197 dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); 198 bpo->bpo_dbuf = NULL; 199 bpo->bpo_phys = NULL; 200 bpo->bpo_cached_dbuf = NULL; 201 bpo->bpo_object = 0; 202 203 mutex_destroy(&bpo->bpo_lock); 204 } 205 206 static boolean_t 207 bpobj_is_empty_impl(bpobj_t *bpo) 208 { 209 ASSERT(MUTEX_HELD(&bpo->bpo_lock)); 210 return (bpo->bpo_phys->bpo_num_blkptrs == 0 && 211 (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0)); 212 } 213 214 boolean_t 215 bpobj_is_empty(bpobj_t *bpo) 216 { 217 mutex_enter(&bpo->bpo_lock); 218 boolean_t is_empty = bpobj_is_empty_impl(bpo); 219 mutex_exit(&bpo->bpo_lock); 220 return (is_empty); 221 } 222 223 /* 224 * A recursive iteration of the bpobjs would be nice here but we run the risk 225 * of overflowing function stack space. Instead, find each subobj and add it 226 * to the head of our list so it can be scanned for subjobjs. Like a 227 * recursive implementation, the "deepest" subobjs will be freed first. 228 * When a subobj is found to have no additional subojs, free it. 229 */ 230 typedef struct bpobj_info { 231 bpobj_t *bpi_bpo; 232 /* 233 * This object is a subobj of bpi_parent, 234 * at bpi_index in its subobj array. 235 */ 236 struct bpobj_info *bpi_parent; 237 uint64_t bpi_index; 238 /* How many of our subobj's are left to process. */ 239 uint64_t bpi_unprocessed_subobjs; 240 /* True after having visited this bpo's directly referenced BPs. */ 241 boolean_t bpi_visited; 242 list_node_t bpi_node; 243 } bpobj_info_t; 244 245 static bpobj_info_t * 246 bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index) 247 { 248 bpobj_info_t *bpi = kmem_zalloc(sizeof (bpobj_info_t), KM_SLEEP); 249 bpi->bpi_bpo = bpo; 250 bpi->bpi_parent = parent; 251 bpi->bpi_index = index; 252 if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { 253 bpi->bpi_unprocessed_subobjs = bpo->bpo_phys->bpo_num_subobjs; 254 } 255 return (bpi); 256 } 257 258 /* 259 * Update bpobj and all of its parents with new space accounting. 260 */ 261 static void 262 propagate_space_reduction(bpobj_info_t *bpi, int64_t freed, 263 int64_t comp_freed, int64_t uncomp_freed, dmu_tx_t *tx) 264 { 265 266 for (; bpi != NULL; bpi = bpi->bpi_parent) { 267 bpobj_t *p = bpi->bpi_bpo; 268 ASSERT(dmu_buf_is_dirty(p->bpo_dbuf, tx)); 269 p->bpo_phys->bpo_bytes -= freed; 270 ASSERT3S(p->bpo_phys->bpo_bytes, >=, 0); 271 if (p->bpo_havecomp) { 272 p->bpo_phys->bpo_comp -= comp_freed; 273 p->bpo_phys->bpo_uncomp -= uncomp_freed; 274 } 275 } 276 } 277 278 static int 279 bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg, 280 int64_t start, dmu_tx_t *tx, boolean_t free) 281 { 282 int err = 0; 283 int64_t freed = 0, comp_freed = 0, uncomp_freed = 0; 284 dmu_buf_t *dbuf = NULL; 285 bpobj_t *bpo = bpi->bpi_bpo; 286 287 for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) { 288 uint64_t offset = i * sizeof (blkptr_t); 289 uint64_t blkoff = P2PHASE(i, bpo->bpo_epb); 290 291 if (dbuf == NULL || dbuf->db_offset > offset) { 292 if (dbuf) 293 dmu_buf_rele(dbuf, FTAG); 294 err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, 295 offset, FTAG, &dbuf, 0); 296 if (err) 297 break; 298 } 299 300 ASSERT3U(offset, >=, dbuf->db_offset); 301 ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); 302 303 blkptr_t *bparray = dbuf->db_data; 304 blkptr_t *bp = &bparray[blkoff]; 305 306 boolean_t bp_freed = BP_GET_FREE(bp); 307 err = func(arg, bp, bp_freed, tx); 308 if (err) 309 break; 310 311 if (free) { 312 int sign = bp_freed ? -1 : +1; 313 spa_t *spa = dmu_objset_spa(bpo->bpo_os); 314 freed += sign * bp_get_dsize_sync(spa, bp); 315 comp_freed += sign * BP_GET_PSIZE(bp); 316 uncomp_freed += sign * BP_GET_UCSIZE(bp); 317 ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx)); 318 bpo->bpo_phys->bpo_num_blkptrs--; 319 ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0); 320 if (bp_freed) { 321 ASSERT(bpo->bpo_havefreed); 322 bpo->bpo_phys->bpo_num_freed--; 323 ASSERT3S(bpo->bpo_phys->bpo_num_freed, >=, 0); 324 } 325 } 326 } 327 if (free) { 328 propagate_space_reduction(bpi, freed, comp_freed, 329 uncomp_freed, tx); 330 VERIFY0(dmu_free_range(bpo->bpo_os, 331 bpo->bpo_object, 332 bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), 333 DMU_OBJECT_END, tx)); 334 } 335 if (dbuf) { 336 dmu_buf_rele(dbuf, FTAG); 337 dbuf = NULL; 338 } 339 return (err); 340 } 341 342 /* 343 * Given an initial bpo, start by freeing the BPs that are directly referenced 344 * by that bpo. If the bpo has subobjs, read in its last subobj and push the 345 * subobj to our stack. By popping items off our stack, eventually we will 346 * encounter a bpo that has no subobjs. We can free its bpobj_info_t, and if 347 * requested also free the now-empty bpo from disk and decrement 348 * its parent's subobj count. We continue popping each subobj from our stack, 349 * visiting its last subobj until they too have no more subobjs, and so on. 350 */ 351 static int 352 bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, 353 dmu_tx_t *tx, boolean_t free, uint64_t *bpobj_size) 354 { 355 list_t stack; 356 bpobj_info_t *bpi; 357 int err = 0; 358 359 /* 360 * Create a "stack" for us to work with without worrying about 361 * stack overflows. Initialize it with the initial_bpo. 362 */ 363 list_create(&stack, sizeof (bpobj_info_t), 364 offsetof(bpobj_info_t, bpi_node)); 365 mutex_enter(&initial_bpo->bpo_lock); 366 367 if (bpobj_size != NULL) 368 *bpobj_size = initial_bpo->bpo_phys->bpo_num_blkptrs; 369 370 list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0)); 371 372 while ((bpi = list_head(&stack)) != NULL) { 373 bpobj_t *bpo = bpi->bpi_bpo; 374 375 ASSERT3P(bpo, !=, NULL); 376 ASSERT(MUTEX_HELD(&bpo->bpo_lock)); 377 ASSERT(bpobj_is_open(bpo)); 378 379 if (free) 380 dmu_buf_will_dirty(bpo->bpo_dbuf, tx); 381 382 if (bpi->bpi_visited == B_FALSE) { 383 err = bpobj_iterate_blkptrs(bpi, func, arg, 0, tx, 384 free); 385 bpi->bpi_visited = B_TRUE; 386 if (err != 0) 387 break; 388 } 389 /* 390 * We've finished with this bpo's directly-referenced BP's and 391 * it has no more unprocessed subobjs. We can free its 392 * bpobj_info_t (unless it is the topmost, initial_bpo). 393 * If we are freeing from disk, we can also do that. 394 */ 395 if (bpi->bpi_unprocessed_subobjs == 0) { 396 /* 397 * If there are no entries, there should 398 * be no bytes. 399 */ 400 if (bpobj_is_empty_impl(bpo)) { 401 ASSERT0(bpo->bpo_phys->bpo_bytes); 402 ASSERT0(bpo->bpo_phys->bpo_comp); 403 ASSERT0(bpo->bpo_phys->bpo_uncomp); 404 } 405 406 /* The initial_bpo has no parent and is not closed. */ 407 if (bpi->bpi_parent != NULL) { 408 if (free) { 409 bpobj_t *p = bpi->bpi_parent->bpi_bpo; 410 411 ASSERT0(bpo->bpo_phys->bpo_num_blkptrs); 412 ASSERT3U(p->bpo_phys->bpo_num_subobjs, 413 >, 0); 414 ASSERT3U(bpi->bpi_index, ==, 415 p->bpo_phys->bpo_num_subobjs - 1); 416 ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, 417 tx)); 418 419 p->bpo_phys->bpo_num_subobjs--; 420 421 VERIFY0(dmu_free_range(p->bpo_os, 422 p->bpo_phys->bpo_subobjs, 423 bpi->bpi_index * sizeof (uint64_t), 424 sizeof (uint64_t), tx)); 425 426 /* eliminate the empty subobj list */ 427 if (bpo->bpo_havesubobj && 428 bpo->bpo_phys->bpo_subobjs != 0) { 429 ASSERT0(bpo->bpo_phys-> 430 bpo_num_subobjs); 431 err = dmu_object_free( 432 bpo->bpo_os, 433 bpo->bpo_phys->bpo_subobjs, 434 tx); 435 if (err) 436 break; 437 bpo->bpo_phys->bpo_subobjs = 0; 438 } 439 err = dmu_object_free(p->bpo_os, 440 bpo->bpo_object, tx); 441 if (err) 442 break; 443 } 444 445 mutex_exit(&bpo->bpo_lock); 446 bpobj_close(bpo); 447 kmem_free(bpo, sizeof (bpobj_t)); 448 } else { 449 mutex_exit(&bpo->bpo_lock); 450 } 451 452 /* 453 * Finished processing this bpo. Unlock, and free 454 * our "stack" info. 455 */ 456 list_remove_head(&stack); 457 kmem_free(bpi, sizeof (bpobj_info_t)); 458 } else { 459 /* 460 * We have unprocessed subobjs. Process the next one. 461 */ 462 ASSERT(bpo->bpo_havecomp); 463 ASSERT3P(bpobj_size, ==, NULL); 464 465 /* Add the last subobj to stack. */ 466 int64_t i = bpi->bpi_unprocessed_subobjs - 1; 467 uint64_t offset = i * sizeof (uint64_t); 468 469 uint64_t obj_from_sublist; 470 err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 471 offset, sizeof (uint64_t), &obj_from_sublist, 472 DMU_READ_PREFETCH); 473 if (err) 474 break; 475 bpobj_t *sublist = kmem_alloc(sizeof (bpobj_t), 476 KM_SLEEP); 477 478 err = bpobj_open(sublist, bpo->bpo_os, 479 obj_from_sublist); 480 if (err) 481 break; 482 483 list_insert_head(&stack, bpi_alloc(sublist, bpi, i)); 484 mutex_enter(&sublist->bpo_lock); 485 bpi->bpi_unprocessed_subobjs--; 486 } 487 } 488 /* 489 * Cleanup anything left on the "stack" after we left the loop. 490 * Every bpo on the stack is locked so we must remember to undo 491 * that now (in LIFO order). 492 */ 493 while ((bpi = list_remove_head(&stack)) != NULL) { 494 bpobj_t *bpo = bpi->bpi_bpo; 495 ASSERT(err != 0); 496 ASSERT3P(bpo, !=, NULL); 497 498 mutex_exit(&bpo->bpo_lock); 499 500 /* do not free the initial_bpo */ 501 if (bpi->bpi_parent != NULL) { 502 bpobj_close(bpi->bpi_bpo); 503 kmem_free(bpi->bpi_bpo, sizeof (bpobj_t)); 504 } 505 kmem_free(bpi, sizeof (bpobj_info_t)); 506 } 507 508 list_destroy(&stack); 509 510 return (err); 511 } 512 513 /* 514 * Iterate and remove the entries. If func returns nonzero, iteration 515 * will stop and that entry will not be removed. 516 */ 517 int 518 bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) 519 { 520 return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE, NULL)); 521 } 522 523 /* 524 * Iterate the entries. If func returns nonzero, iteration will stop. 525 * 526 * If there are no subobjs: 527 * 528 * *bpobj_size can be used to return the number of block pointers in the 529 * bpobj. Note that this may be different from the number of block pointers 530 * that are iterated over, if iteration is terminated early (e.g. by the func 531 * returning nonzero). 532 * 533 * If there are concurrent (or subsequent) modifications to the bpobj then the 534 * returned *bpobj_size can be passed as "start" to 535 * livelist_bpobj_iterate_from_nofree() to iterate the newly added entries. 536 */ 537 int 538 bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, 539 uint64_t *bpobj_size) 540 { 541 return (bpobj_iterate_impl(bpo, func, arg, NULL, B_FALSE, bpobj_size)); 542 } 543 544 /* 545 * Iterate over the blkptrs in the bpobj beginning at index start. If func 546 * returns nonzero, iteration will stop. This is a livelist specific function 547 * since it assumes that there are no subobjs present. 548 */ 549 int 550 livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, 551 int64_t start) 552 { 553 if (bpo->bpo_havesubobj) 554 VERIFY0(bpo->bpo_phys->bpo_subobjs); 555 bpobj_info_t *bpi = bpi_alloc(bpo, NULL, 0); 556 int err = bpobj_iterate_blkptrs(bpi, func, arg, start, NULL, B_FALSE); 557 kmem_free(bpi, sizeof (bpobj_info_t)); 558 return (err); 559 } 560 561 /* 562 * Logically add subobj's contents to the parent bpobj. 563 * 564 * In the most general case, this is accomplished in constant time by adding 565 * a reference to subobj. This case is used when enqueuing a large subobj: 566 * +--------------+ +--------------+ 567 * | bpobj |----------------------->| subobj list | 568 * +----+----+----+----+----+ +-----+-----+--+--+ 569 * | bp | bp | bp | bp | bp | | obj | obj | obj | 570 * +----+----+----+----+----+ +-----+-----+-----+ 571 * 572 * +--------------+ +--------------+ 573 * | sub-bpobj |----------------------> | subsubobj | 574 * +----+----+----+----+---------+----+ +-----+-----+--+--------+-----+ 575 * | bp | bp | bp | bp | ... | bp | | obj | obj | ... | obj | 576 * +----+----+----+----+---------+----+ +-----+-----+-----------+-----+ 577 * 578 * Result: sub-bpobj added to parent's subobj list. 579 * +--------------+ +--------------+ 580 * | bpobj |----------------------->| subobj list | 581 * +----+----+----+----+----+ +-----+-----+--+--+-----+ 582 * | bp | bp | bp | bp | bp | | obj | obj | obj | OBJ | 583 * +----+----+----+----+----+ +-----+-----+-----+--|--+ 584 * | 585 * /-----------------------------------------------------/ 586 * v 587 * +--------------+ +--------------+ 588 * | sub-bpobj |----------------------> | subsubobj | 589 * +----+----+----+----+---------+----+ +-----+-----+--+--------+-----+ 590 * | bp | bp | bp | bp | ... | bp | | obj | obj | ... | obj | 591 * +----+----+----+----+---------+----+ +-----+-----+-----------+-----+ 592 * 593 * 594 * In a common case, the subobj is small: its bp's and its list of subobj's 595 * are each stored in a single block. In this case we copy the subobj's 596 * contents to the parent: 597 * +--------------+ +--------------+ 598 * | bpobj |----------------------->| subobj list | 599 * +----+----+----+----+----+ +-----+-----+--+--+ 600 * | bp | bp | bp | bp | bp | | obj | obj | obj | 601 * +----+----+----+----+----+ +-----+-----+-----+ 602 * ^ ^ 603 * +--------------+ | +--------------+ | 604 * | sub-bpobj |---------^------------> | subsubobj | ^ 605 * +----+----+----+ | +-----+-----+--+ | 606 * | BP | BP |-->-->-->-->-/ | OBJ | OBJ |-->-/ 607 * +----+----+ +-----+-----+ 608 * 609 * Result: subobj destroyed, contents copied to parent: 610 * +--------------+ +--------------+ 611 * | bpobj |----------------------->| subobj list | 612 * +----+----+----+----+----+----+----+ +-----+-----+--+--+-----+-----+ 613 * | bp | bp | bp | bp | bp | BP | BP | | obj | obj | obj | OBJ | OBJ | 614 * +----+----+----+----+----+----+----+ +-----+-----+-----+-----+-----+ 615 * 616 * 617 * If the subobj has many BP's but few subobj's, we can copy the sub-subobj's 618 * but retain the sub-bpobj: 619 * +--------------+ +--------------+ 620 * | bpobj |----------------------->| subobj list | 621 * +----+----+----+----+----+ +-----+-----+--+--+ 622 * | bp | bp | bp | bp | bp | | obj | obj | obj | 623 * +----+----+----+----+----+ +-----+-----+-----+ 624 * ^ 625 * +--------------+ +--------------+ | 626 * | sub-bpobj |----------------------> | subsubobj | ^ 627 * +----+----+----+----+---------+----+ +-----+-----+--+ | 628 * | bp | bp | bp | bp | ... | bp | | OBJ | OBJ |-->-/ 629 * +----+----+----+----+---------+----+ +-----+-----+ 630 * 631 * Result: sub-sub-bpobjs and subobj added to parent's subobj list. 632 * +--------------+ +--------------+ 633 * | bpobj |-------------------->| subobj list | 634 * +----+----+----+----+----+ +-----+-----+--+--+-----+-----+------+ 635 * | bp | bp | bp | bp | bp | | obj | obj | obj | OBJ | OBJ | OBJ* | 636 * +----+----+----+----+----+ +-----+-----+-----+-----+-----+--|---+ 637 * | 638 * /--------------------------------------------------------------/ 639 * v 640 * +--------------+ 641 * | sub-bpobj | 642 * +----+----+----+----+---------+----+ 643 * | bp | bp | bp | bp | ... | bp | 644 * +----+----+----+----+---------+----+ 645 */ 646 void 647 bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) 648 { 649 bpobj_t subbpo; 650 uint64_t used, comp, uncomp, subsubobjs; 651 boolean_t copy_subsub = B_TRUE; 652 boolean_t copy_bps = B_TRUE; 653 654 ASSERT(bpobj_is_open(bpo)); 655 ASSERT(subobj != 0); 656 ASSERT(bpo->bpo_havesubobj); 657 ASSERT(bpo->bpo_havecomp); 658 ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); 659 660 if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) { 661 bpobj_decr_empty(bpo->bpo_os, tx); 662 return; 663 } 664 665 VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj)); 666 if (bpobj_is_empty(&subbpo)) { 667 /* No point in having an empty subobj. */ 668 bpobj_close(&subbpo); 669 bpobj_free(bpo->bpo_os, subobj, tx); 670 return; 671 } 672 VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); 673 674 mutex_enter(&bpo->bpo_lock); 675 dmu_buf_will_dirty(bpo->bpo_dbuf, tx); 676 677 dmu_object_info_t doi; 678 679 if (bpo->bpo_phys->bpo_subobjs != 0) { 680 ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 681 &doi)); 682 ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ); 683 } 684 685 /* 686 * If subobj has only one block of subobjs, then move subobj's 687 * subobjs to bpo's subobj list directly. This reduces recursion in 688 * bpobj_iterate due to nested subobjs. 689 */ 690 subsubobjs = subbpo.bpo_phys->bpo_subobjs; 691 if (subsubobjs != 0) { 692 VERIFY0(dmu_object_info(bpo->bpo_os, subsubobjs, &doi)); 693 if (doi.doi_max_offset > doi.doi_data_block_size) { 694 copy_subsub = B_FALSE; 695 } 696 } 697 698 /* 699 * If, in addition to having only one block of subobj's, subobj has 700 * only one block of bp's, then move subobj's bp's to bpo's bp list 701 * directly. This reduces recursion in bpobj_iterate due to nested 702 * subobjs. 703 */ 704 VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subobj, &doi)); 705 if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) { 706 copy_bps = B_FALSE; 707 } 708 709 if (copy_subsub && subsubobjs != 0) { 710 dmu_buf_t *subdb; 711 uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs; 712 713 VERIFY0(dmu_buf_hold(bpo->bpo_os, subsubobjs, 714 0, FTAG, &subdb, 0)); 715 /* 716 * Make sure that we are not asking dmu_write() 717 * to write more data than we have in our buffer. 718 */ 719 VERIFY3U(subdb->db_size, >=, 720 numsubsub * sizeof (subobj)); 721 if (bpo->bpo_phys->bpo_subobjs == 0) { 722 bpo->bpo_phys->bpo_subobjs = 723 dmu_object_alloc(bpo->bpo_os, 724 DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE, 725 DMU_OT_NONE, 0, tx); 726 } 727 dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 728 bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 729 numsubsub * sizeof (subobj), subdb->db_data, tx); 730 dmu_buf_rele(subdb, FTAG); 731 bpo->bpo_phys->bpo_num_subobjs += numsubsub; 732 733 dmu_buf_will_dirty(subbpo.bpo_dbuf, tx); 734 subbpo.bpo_phys->bpo_subobjs = 0; 735 VERIFY0(dmu_object_free(bpo->bpo_os, subsubobjs, tx)); 736 } 737 738 if (copy_bps) { 739 dmu_buf_t *bps; 740 uint64_t numbps = subbpo.bpo_phys->bpo_num_blkptrs; 741 742 ASSERT(copy_subsub); 743 VERIFY0(dmu_buf_hold(bpo->bpo_os, subobj, 744 0, FTAG, &bps, 0)); 745 746 /* 747 * Make sure that we are not asking dmu_write() 748 * to write more data than we have in our buffer. 749 */ 750 VERIFY3U(bps->db_size, >=, numbps * sizeof (blkptr_t)); 751 dmu_write(bpo->bpo_os, bpo->bpo_object, 752 bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), 753 numbps * sizeof (blkptr_t), 754 bps->db_data, tx); 755 dmu_buf_rele(bps, FTAG); 756 bpo->bpo_phys->bpo_num_blkptrs += numbps; 757 758 bpobj_close(&subbpo); 759 VERIFY0(dmu_object_free(bpo->bpo_os, subobj, tx)); 760 } else { 761 bpobj_close(&subbpo); 762 if (bpo->bpo_phys->bpo_subobjs == 0) { 763 bpo->bpo_phys->bpo_subobjs = 764 dmu_object_alloc(bpo->bpo_os, 765 DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE, 766 DMU_OT_NONE, 0, tx); 767 } 768 769 dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 770 bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 771 sizeof (subobj), &subobj, tx); 772 bpo->bpo_phys->bpo_num_subobjs++; 773 } 774 775 bpo->bpo_phys->bpo_bytes += used; 776 bpo->bpo_phys->bpo_comp += comp; 777 bpo->bpo_phys->bpo_uncomp += uncomp; 778 mutex_exit(&bpo->bpo_lock); 779 780 } 781 782 /* 783 * Prefetch metadata required for bpobj_enqueue_subobj(). 784 */ 785 void 786 bpobj_prefetch_subobj(bpobj_t *bpo, uint64_t subobj) 787 { 788 dmu_object_info_t doi; 789 bpobj_t subbpo; 790 uint64_t subsubobjs; 791 boolean_t copy_subsub = B_TRUE; 792 boolean_t copy_bps = B_TRUE; 793 794 ASSERT(bpobj_is_open(bpo)); 795 ASSERT(subobj != 0); 796 797 if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) 798 return; 799 800 if (bpobj_open(&subbpo, bpo->bpo_os, subobj) != 0) 801 return; 802 if (bpobj_is_empty(&subbpo)) { 803 bpobj_close(&subbpo); 804 return; 805 } 806 subsubobjs = subbpo.bpo_phys->bpo_subobjs; 807 bpobj_close(&subbpo); 808 809 if (subsubobjs != 0) { 810 if (dmu_object_info(bpo->bpo_os, subsubobjs, &doi) != 0) 811 return; 812 if (doi.doi_max_offset > doi.doi_data_block_size) 813 copy_subsub = B_FALSE; 814 } 815 816 if (dmu_object_info(bpo->bpo_os, subobj, &doi) != 0) 817 return; 818 if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) 819 copy_bps = B_FALSE; 820 821 if (copy_subsub && subsubobjs != 0) { 822 if (bpo->bpo_phys->bpo_subobjs) { 823 dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0, 824 bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1, 825 ZIO_PRIORITY_ASYNC_READ); 826 } 827 dmu_prefetch(bpo->bpo_os, subsubobjs, 0, 0, 1, 828 ZIO_PRIORITY_ASYNC_READ); 829 } 830 831 if (copy_bps) { 832 dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, 833 bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), 1, 834 ZIO_PRIORITY_ASYNC_READ); 835 dmu_prefetch(bpo->bpo_os, subobj, 0, 0, 1, 836 ZIO_PRIORITY_ASYNC_READ); 837 } else if (bpo->bpo_phys->bpo_subobjs) { 838 dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0, 839 bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1, 840 ZIO_PRIORITY_ASYNC_READ); 841 } 842 } 843 844 void 845 bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed, 846 dmu_tx_t *tx) 847 { 848 blkptr_t stored_bp = *bp; 849 uint64_t offset; 850 int blkoff; 851 blkptr_t *bparray; 852 853 ASSERT(bpobj_is_open(bpo)); 854 ASSERT(!BP_IS_HOLE(bp)); 855 ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); 856 857 if (BP_IS_EMBEDDED(bp)) { 858 /* 859 * The bpobj will compress better without the payload. 860 * 861 * Note that we store EMBEDDED bp's because they have an 862 * uncompressed size, which must be accounted for. An 863 * alternative would be to add their size to bpo_uncomp 864 * without storing the bp, but that would create additional 865 * complications: bpo_uncomp would be inconsistent with the 866 * set of BP's stored, and bpobj_iterate() wouldn't visit 867 * all the space accounted for in the bpobj. 868 */ 869 memset(&stored_bp, 0, sizeof (stored_bp)); 870 stored_bp.blk_prop = bp->blk_prop; 871 stored_bp.blk_birth = bp->blk_birth; 872 } else if (!BP_GET_DEDUP(bp)) { 873 /* The bpobj will compress better without the checksum */ 874 memset(&stored_bp.blk_cksum, 0, sizeof (stored_bp.blk_cksum)); 875 } 876 877 stored_bp.blk_fill = 0; 878 BP_SET_FREE(&stored_bp, bp_freed); 879 880 mutex_enter(&bpo->bpo_lock); 881 882 offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp); 883 blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb); 884 885 if (bpo->bpo_cached_dbuf == NULL || 886 offset < bpo->bpo_cached_dbuf->db_offset || 887 offset >= bpo->bpo_cached_dbuf->db_offset + 888 bpo->bpo_cached_dbuf->db_size) { 889 if (bpo->bpo_cached_dbuf) 890 dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); 891 VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, 892 offset, bpo, &bpo->bpo_cached_dbuf, 0)); 893 ASSERT3P(bpo->bpo_cached_dbuf, !=, NULL); 894 } 895 896 dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx); 897 bparray = bpo->bpo_cached_dbuf->db_data; 898 bparray[blkoff] = stored_bp; 899 900 dmu_buf_will_dirty(bpo->bpo_dbuf, tx); 901 bpo->bpo_phys->bpo_num_blkptrs++; 902 int sign = bp_freed ? -1 : +1; 903 bpo->bpo_phys->bpo_bytes += sign * 904 bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); 905 if (bpo->bpo_havecomp) { 906 bpo->bpo_phys->bpo_comp += sign * BP_GET_PSIZE(bp); 907 bpo->bpo_phys->bpo_uncomp += sign * BP_GET_UCSIZE(bp); 908 } 909 if (bp_freed) { 910 ASSERT(bpo->bpo_havefreed); 911 bpo->bpo_phys->bpo_num_freed++; 912 } 913 mutex_exit(&bpo->bpo_lock); 914 } 915 916 struct space_range_arg { 917 spa_t *spa; 918 uint64_t mintxg; 919 uint64_t maxtxg; 920 uint64_t used; 921 uint64_t comp; 922 uint64_t uncomp; 923 }; 924 925 static int 926 space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) 927 { 928 (void) bp_freed, (void) tx; 929 struct space_range_arg *sra = arg; 930 931 if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) { 932 if (dsl_pool_sync_context(spa_get_dsl(sra->spa))) 933 sra->used += bp_get_dsize_sync(sra->spa, bp); 934 else 935 sra->used += bp_get_dsize(sra->spa, bp); 936 sra->comp += BP_GET_PSIZE(bp); 937 sra->uncomp += BP_GET_UCSIZE(bp); 938 } 939 return (0); 940 } 941 942 int 943 bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) 944 { 945 ASSERT(bpobj_is_open(bpo)); 946 mutex_enter(&bpo->bpo_lock); 947 948 *usedp = bpo->bpo_phys->bpo_bytes; 949 if (bpo->bpo_havecomp) { 950 *compp = bpo->bpo_phys->bpo_comp; 951 *uncompp = bpo->bpo_phys->bpo_uncomp; 952 mutex_exit(&bpo->bpo_lock); 953 return (0); 954 } else { 955 mutex_exit(&bpo->bpo_lock); 956 return (bpobj_space_range(bpo, 0, UINT64_MAX, 957 usedp, compp, uncompp)); 958 } 959 } 960 961 /* 962 * Return the amount of space in the bpobj which is: 963 * mintxg < blk_birth <= maxtxg 964 */ 965 int 966 bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, 967 uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) 968 { 969 struct space_range_arg sra = { 0 }; 970 int err; 971 972 ASSERT(bpobj_is_open(bpo)); 973 974 /* 975 * As an optimization, if they want the whole txg range, just 976 * get bpo_bytes rather than iterating over the bps. 977 */ 978 if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp) 979 return (bpobj_space(bpo, usedp, compp, uncompp)); 980 981 sra.spa = dmu_objset_spa(bpo->bpo_os); 982 sra.mintxg = mintxg; 983 sra.maxtxg = maxtxg; 984 985 err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL); 986 *usedp = sra.used; 987 *compp = sra.comp; 988 *uncompp = sra.uncomp; 989 return (err); 990 } 991 992 /* 993 * A bpobj_itor_t to append blkptrs to a bplist. Note that while blkptrs in a 994 * bpobj are designated as free or allocated that information is not preserved 995 * in bplists. 996 */ 997 int 998 bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 999 dmu_tx_t *tx) 1000 { 1001 (void) bp_freed, (void) tx; 1002 bplist_t *bpl = arg; 1003 bplist_append(bpl, bp); 1004 return (0); 1005 } 1006