1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012 by Delphix. All rights reserved. 24 */ 25 26 #include <sys/bpobj.h> 27 #include <sys/zfs_context.h> 28 #include <sys/refcount.h> 29 #include <sys/dsl_pool.h> 30 #include <sys/zfeature.h> 31 #include <sys/zap.h> 32 33 /* 34 * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj). 35 */ 36 uint64_t 37 bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx) 38 { 39 zfeature_info_t *empty_bpobj_feat = 40 &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ]; 41 spa_t *spa = dmu_objset_spa(os); 42 dsl_pool_t *dp = dmu_objset_pool(os); 43 44 if (spa_feature_is_enabled(spa, empty_bpobj_feat)) { 45 if (!spa_feature_is_active(spa, empty_bpobj_feat)) { 46 ASSERT3U(dp->dp_empty_bpobj, ==, 0); 47 dp->dp_empty_bpobj = 48 bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx); 49 VERIFY(zap_add(os, 50 DMU_POOL_DIRECTORY_OBJECT, 51 DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1, 52 &dp->dp_empty_bpobj, tx) == 0); 53 } 54 spa_feature_incr(spa, empty_bpobj_feat, tx); 55 ASSERT(dp->dp_empty_bpobj != 0); 56 return (dp->dp_empty_bpobj); 57 } else { 58 return (bpobj_alloc(os, blocksize, tx)); 59 } 60 } 61 62 void 63 bpobj_decr_empty(objset_t *os, dmu_tx_t *tx) 64 { 65 zfeature_info_t *empty_bpobj_feat = 66 &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ]; 67 dsl_pool_t *dp = dmu_objset_pool(os); 68 69 spa_feature_decr(dmu_objset_spa(os), empty_bpobj_feat, tx); 70 if (!spa_feature_is_active(dmu_objset_spa(os), empty_bpobj_feat)) { 71 VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset, 72 DMU_POOL_DIRECTORY_OBJECT, 73 DMU_POOL_EMPTY_BPOBJ, tx)); 74 VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx)); 75 dp->dp_empty_bpobj = 0; 76 } 77 } 78 79 uint64_t 80 bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) 81 { 82 int size; 83 84 if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT) 85 size = BPOBJ_SIZE_V0; 86 else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) 87 size = BPOBJ_SIZE_V1; 88 else 89 size = sizeof (bpobj_phys_t); 90 91 return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize, 92 DMU_OT_BPOBJ_HDR, size, tx)); 93 } 94 95 void 96 bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) 97 { 98 int64_t i; 99 bpobj_t bpo; 100 dmu_object_info_t doi; 101 int epb; 102 dmu_buf_t *dbuf = NULL; 103 104 ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj); 105 VERIFY3U(0, ==, bpobj_open(&bpo, os, obj)); 106 107 mutex_enter(&bpo.bpo_lock); 108 109 if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0) 110 goto out; 111 112 VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi)); 113 epb = doi.doi_data_block_size / sizeof (uint64_t); 114 115 for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { 116 uint64_t *objarray; 117 uint64_t offset, blkoff; 118 119 offset = i * sizeof (uint64_t); 120 blkoff = P2PHASE(i, epb); 121 122 if (dbuf == NULL || dbuf->db_offset > offset) { 123 if (dbuf) 124 dmu_buf_rele(dbuf, FTAG); 125 VERIFY3U(0, ==, dmu_buf_hold(os, 126 bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0)); 127 } 128 129 ASSERT3U(offset, >=, dbuf->db_offset); 130 ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); 131 132 objarray = dbuf->db_data; 133 bpobj_free(os, objarray[blkoff], tx); 134 } 135 if (dbuf) { 136 dmu_buf_rele(dbuf, FTAG); 137 dbuf = NULL; 138 } 139 VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx)); 140 141 out: 142 mutex_exit(&bpo.bpo_lock); 143 bpobj_close(&bpo); 144 145 VERIFY3U(0, ==, dmu_object_free(os, obj, tx)); 146 } 147 148 int 149 bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) 150 { 151 dmu_object_info_t doi; 152 int err; 153 154 err = dmu_object_info(os, object, &doi); 155 if (err) 156 return (err); 157 158 bzero(bpo, sizeof (*bpo)); 159 mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL); 160 161 ASSERT(bpo->bpo_dbuf == NULL); 162 ASSERT(bpo->bpo_phys == NULL); 163 ASSERT(object != 0); 164 ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ); 165 ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR); 166 167 err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf); 168 if (err) 169 return (err); 170 171 bpo->bpo_os = os; 172 bpo->bpo_object = object; 173 bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT; 174 bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0); 175 bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1); 176 bpo->bpo_phys = bpo->bpo_dbuf->db_data; 177 return (0); 178 } 179 180 void 181 bpobj_close(bpobj_t *bpo) 182 { 183 /* Lame workaround for closing a bpobj that was never opened. */ 184 if (bpo->bpo_object == 0) 185 return; 186 187 dmu_buf_rele(bpo->bpo_dbuf, bpo); 188 if (bpo->bpo_cached_dbuf != NULL) 189 dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); 190 bpo->bpo_dbuf = NULL; 191 bpo->bpo_phys = NULL; 192 bpo->bpo_cached_dbuf = NULL; 193 bpo->bpo_object = 0; 194 195 mutex_destroy(&bpo->bpo_lock); 196 } 197 198 static int 199 bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, 200 boolean_t free) 201 { 202 dmu_object_info_t doi; 203 int epb; 204 int64_t i; 205 int err = 0; 206 dmu_buf_t *dbuf = NULL; 207 208 mutex_enter(&bpo->bpo_lock); 209 210 if (free) 211 dmu_buf_will_dirty(bpo->bpo_dbuf, tx); 212 213 for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) { 214 blkptr_t *bparray; 215 blkptr_t *bp; 216 uint64_t offset, blkoff; 217 218 offset = i * sizeof (blkptr_t); 219 blkoff = P2PHASE(i, bpo->bpo_epb); 220 221 if (dbuf == NULL || dbuf->db_offset > offset) { 222 if (dbuf) 223 dmu_buf_rele(dbuf, FTAG); 224 err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset, 225 FTAG, &dbuf, 0); 226 if (err) 227 break; 228 } 229 230 ASSERT3U(offset, >=, dbuf->db_offset); 231 ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); 232 233 bparray = dbuf->db_data; 234 bp = &bparray[blkoff]; 235 err = func(arg, bp, tx); 236 if (err) 237 break; 238 if (free) { 239 bpo->bpo_phys->bpo_bytes -= 240 bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); 241 ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0); 242 if (bpo->bpo_havecomp) { 243 bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp); 244 bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp); 245 } 246 bpo->bpo_phys->bpo_num_blkptrs--; 247 ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0); 248 } 249 } 250 if (dbuf) { 251 dmu_buf_rele(dbuf, FTAG); 252 dbuf = NULL; 253 } 254 if (free) { 255 i++; 256 VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object, 257 i * sizeof (blkptr_t), -1ULL, tx)); 258 } 259 if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0) 260 goto out; 261 262 ASSERT(bpo->bpo_havecomp); 263 err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi); 264 if (err) { 265 mutex_exit(&bpo->bpo_lock); 266 return (err); 267 } 268 epb = doi.doi_data_block_size / sizeof (uint64_t); 269 270 for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { 271 uint64_t *objarray; 272 uint64_t offset, blkoff; 273 bpobj_t sublist; 274 uint64_t used_before, comp_before, uncomp_before; 275 uint64_t used_after, comp_after, uncomp_after; 276 277 offset = i * sizeof (uint64_t); 278 blkoff = P2PHASE(i, epb); 279 280 if (dbuf == NULL || dbuf->db_offset > offset) { 281 if (dbuf) 282 dmu_buf_rele(dbuf, FTAG); 283 err = dmu_buf_hold(bpo->bpo_os, 284 bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0); 285 if (err) 286 break; 287 } 288 289 ASSERT3U(offset, >=, dbuf->db_offset); 290 ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); 291 292 objarray = dbuf->db_data; 293 err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]); 294 if (err) 295 break; 296 if (free) { 297 err = bpobj_space(&sublist, 298 &used_before, &comp_before, &uncomp_before); 299 if (err) 300 break; 301 } 302 err = bpobj_iterate_impl(&sublist, func, arg, tx, free); 303 if (free) { 304 VERIFY3U(0, ==, bpobj_space(&sublist, 305 &used_after, &comp_after, &uncomp_after)); 306 bpo->bpo_phys->bpo_bytes -= used_before - used_after; 307 ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0); 308 bpo->bpo_phys->bpo_comp -= comp_before - comp_after; 309 bpo->bpo_phys->bpo_uncomp -= 310 uncomp_before - uncomp_after; 311 } 312 313 bpobj_close(&sublist); 314 if (err) 315 break; 316 if (free) { 317 err = dmu_object_free(bpo->bpo_os, 318 objarray[blkoff], tx); 319 if (err) 320 break; 321 bpo->bpo_phys->bpo_num_subobjs--; 322 ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0); 323 } 324 } 325 if (dbuf) { 326 dmu_buf_rele(dbuf, FTAG); 327 dbuf = NULL; 328 } 329 if (free) { 330 VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, 331 bpo->bpo_phys->bpo_subobjs, 332 (i + 1) * sizeof (uint64_t), -1ULL, tx)); 333 } 334 335 out: 336 /* If there are no entries, there should be no bytes. */ 337 ASSERT(bpo->bpo_phys->bpo_num_blkptrs > 0 || 338 (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs > 0) || 339 bpo->bpo_phys->bpo_bytes == 0); 340 341 mutex_exit(&bpo->bpo_lock); 342 return (err); 343 } 344 345 /* 346 * Iterate and remove the entries. If func returns nonzero, iteration 347 * will stop and that entry will not be removed. 348 */ 349 int 350 bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) 351 { 352 return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE)); 353 } 354 355 /* 356 * Iterate the entries. If func returns nonzero, iteration will stop. 357 */ 358 int 359 bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) 360 { 361 return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE)); 362 } 363 364 void 365 bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) 366 { 367 bpobj_t subbpo; 368 uint64_t used, comp, uncomp, subsubobjs; 369 370 ASSERT(bpo->bpo_havesubobj); 371 ASSERT(bpo->bpo_havecomp); 372 ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); 373 374 if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) { 375 bpobj_decr_empty(bpo->bpo_os, tx); 376 return; 377 } 378 379 VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj)); 380 VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); 381 382 if (used == 0) { 383 /* No point in having an empty subobj. */ 384 bpobj_close(&subbpo); 385 bpobj_free(bpo->bpo_os, subobj, tx); 386 return; 387 } 388 389 dmu_buf_will_dirty(bpo->bpo_dbuf, tx); 390 if (bpo->bpo_phys->bpo_subobjs == 0) { 391 bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os, 392 DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx); 393 } 394 395 mutex_enter(&bpo->bpo_lock); 396 dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 397 bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 398 sizeof (subobj), &subobj, tx); 399 bpo->bpo_phys->bpo_num_subobjs++; 400 401 /* 402 * If subobj has only one block of subobjs, then move subobj's 403 * subobjs to bpo's subobj list directly. This reduces 404 * recursion in bpobj_iterate due to nested subobjs. 405 */ 406 subsubobjs = subbpo.bpo_phys->bpo_subobjs; 407 if (subsubobjs != 0) { 408 dmu_object_info_t doi; 409 410 VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi)); 411 if (doi.doi_max_offset == doi.doi_data_block_size) { 412 dmu_buf_t *subdb; 413 uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs; 414 415 VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs, 416 0, FTAG, &subdb, 0)); 417 dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 418 bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 419 numsubsub * sizeof (subobj), subdb->db_data, tx); 420 dmu_buf_rele(subdb, FTAG); 421 bpo->bpo_phys->bpo_num_subobjs += numsubsub; 422 423 dmu_buf_will_dirty(subbpo.bpo_dbuf, tx); 424 subbpo.bpo_phys->bpo_subobjs = 0; 425 VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os, 426 subsubobjs, tx)); 427 } 428 } 429 bpo->bpo_phys->bpo_bytes += used; 430 bpo->bpo_phys->bpo_comp += comp; 431 bpo->bpo_phys->bpo_uncomp += uncomp; 432 mutex_exit(&bpo->bpo_lock); 433 434 bpobj_close(&subbpo); 435 } 436 437 void 438 bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) 439 { 440 blkptr_t stored_bp = *bp; 441 uint64_t offset; 442 int blkoff; 443 blkptr_t *bparray; 444 445 ASSERT(!BP_IS_HOLE(bp)); 446 ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); 447 448 /* We never need the fill count. */ 449 stored_bp.blk_fill = 0; 450 451 /* The bpobj will compress better if we can leave off the checksum */ 452 if (!BP_GET_DEDUP(bp)) 453 bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum)); 454 455 mutex_enter(&bpo->bpo_lock); 456 457 offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp); 458 blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb); 459 460 if (bpo->bpo_cached_dbuf == NULL || 461 offset < bpo->bpo_cached_dbuf->db_offset || 462 offset >= bpo->bpo_cached_dbuf->db_offset + 463 bpo->bpo_cached_dbuf->db_size) { 464 if (bpo->bpo_cached_dbuf) 465 dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); 466 VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, 467 offset, bpo, &bpo->bpo_cached_dbuf, 0)); 468 } 469 470 dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx); 471 bparray = bpo->bpo_cached_dbuf->db_data; 472 bparray[blkoff] = stored_bp; 473 474 dmu_buf_will_dirty(bpo->bpo_dbuf, tx); 475 bpo->bpo_phys->bpo_num_blkptrs++; 476 bpo->bpo_phys->bpo_bytes += 477 bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); 478 if (bpo->bpo_havecomp) { 479 bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp); 480 bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp); 481 } 482 mutex_exit(&bpo->bpo_lock); 483 } 484 485 struct space_range_arg { 486 spa_t *spa; 487 uint64_t mintxg; 488 uint64_t maxtxg; 489 uint64_t used; 490 uint64_t comp; 491 uint64_t uncomp; 492 }; 493 494 /* ARGSUSED */ 495 static int 496 space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 497 { 498 struct space_range_arg *sra = arg; 499 500 if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) { 501 if (dsl_pool_sync_context(spa_get_dsl(sra->spa))) 502 sra->used += bp_get_dsize_sync(sra->spa, bp); 503 else 504 sra->used += bp_get_dsize(sra->spa, bp); 505 sra->comp += BP_GET_PSIZE(bp); 506 sra->uncomp += BP_GET_UCSIZE(bp); 507 } 508 return (0); 509 } 510 511 int 512 bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) 513 { 514 mutex_enter(&bpo->bpo_lock); 515 516 *usedp = bpo->bpo_phys->bpo_bytes; 517 if (bpo->bpo_havecomp) { 518 *compp = bpo->bpo_phys->bpo_comp; 519 *uncompp = bpo->bpo_phys->bpo_uncomp; 520 mutex_exit(&bpo->bpo_lock); 521 return (0); 522 } else { 523 mutex_exit(&bpo->bpo_lock); 524 return (bpobj_space_range(bpo, 0, UINT64_MAX, 525 usedp, compp, uncompp)); 526 } 527 } 528 529 /* 530 * Return the amount of space in the bpobj which is: 531 * mintxg < blk_birth <= maxtxg 532 */ 533 int 534 bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, 535 uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) 536 { 537 struct space_range_arg sra = { 0 }; 538 int err; 539 540 /* 541 * As an optimization, if they want the whole txg range, just 542 * get bpo_bytes rather than iterating over the bps. 543 */ 544 if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp) 545 return (bpobj_space(bpo, usedp, compp, uncompp)); 546 547 sra.spa = dmu_objset_spa(bpo->bpo_os); 548 sra.mintxg = mintxg; 549 sra.maxtxg = maxtxg; 550 551 err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL); 552 *usedp = sra.used; 553 *compp = sra.comp; 554 *uncompp = sra.uncomp; 555 return (err); 556 } 557