1eda14cbcSMatt Macy /* 2eda14cbcSMatt Macy * CDDL HEADER START 3eda14cbcSMatt Macy * 4eda14cbcSMatt Macy * The contents of this file are subject to the terms of the 5eda14cbcSMatt Macy * Common Development and Distribution License (the "License"). 6eda14cbcSMatt Macy * You may not use this file except in compliance with the License. 7eda14cbcSMatt Macy * 8eda14cbcSMatt Macy * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9271171e0SMartin Matuska * or https://opensource.org/licenses/CDDL-1.0. 10eda14cbcSMatt Macy * See the License for the specific language governing permissions 11eda14cbcSMatt Macy * and limitations under the License. 12eda14cbcSMatt Macy * 13eda14cbcSMatt Macy * When distributing Covered Code, include this CDDL HEADER in each 14eda14cbcSMatt Macy * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15eda14cbcSMatt Macy * If applicable, add the following below this CDDL HEADER, with the 16eda14cbcSMatt Macy * fields enclosed by brackets "[]" replaced with your own identifying 17eda14cbcSMatt Macy * information: Portions Copyright [yyyy] [name of copyright owner] 18eda14cbcSMatt Macy * 19eda14cbcSMatt Macy * CDDL HEADER END 20eda14cbcSMatt Macy */ 21eda14cbcSMatt Macy /* 22eda14cbcSMatt Macy * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23eda14cbcSMatt Macy * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 24eda14cbcSMatt Macy * Copyright (c) 2017 Datto Inc. 25eda14cbcSMatt Macy */ 26eda14cbcSMatt Macy 27eda14cbcSMatt Macy #include <sys/bpobj.h> 28eda14cbcSMatt Macy #include <sys/zfs_context.h> 29eda14cbcSMatt Macy #include <sys/zfs_refcount.h> 30eda14cbcSMatt Macy #include <sys/dsl_pool.h> 31eda14cbcSMatt Macy #include <sys/zfeature.h> 32eda14cbcSMatt Macy #include <sys/zap.h> 33eda14cbcSMatt Macy 34eda14cbcSMatt Macy /* 35eda14cbcSMatt Macy * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj). 36eda14cbcSMatt Macy */ 37eda14cbcSMatt Macy uint64_t 38eda14cbcSMatt Macy bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx) 39eda14cbcSMatt Macy { 40eda14cbcSMatt Macy spa_t *spa = dmu_objset_spa(os); 41eda14cbcSMatt Macy dsl_pool_t *dp = dmu_objset_pool(os); 42eda14cbcSMatt Macy 43eda14cbcSMatt Macy if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) { 44eda14cbcSMatt Macy if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) { 45eda14cbcSMatt Macy ASSERT0(dp->dp_empty_bpobj); 46eda14cbcSMatt Macy dp->dp_empty_bpobj = 47eda14cbcSMatt Macy bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx); 48eda14cbcSMatt Macy VERIFY(zap_add(os, 49eda14cbcSMatt Macy DMU_POOL_DIRECTORY_OBJECT, 50eda14cbcSMatt Macy DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1, 51eda14cbcSMatt Macy &dp->dp_empty_bpobj, tx) == 0); 52eda14cbcSMatt Macy } 53eda14cbcSMatt Macy spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx); 54eda14cbcSMatt Macy ASSERT(dp->dp_empty_bpobj != 0); 55eda14cbcSMatt Macy return (dp->dp_empty_bpobj); 56eda14cbcSMatt Macy } else { 57eda14cbcSMatt Macy return (bpobj_alloc(os, blocksize, tx)); 58eda14cbcSMatt Macy } 59eda14cbcSMatt Macy } 60eda14cbcSMatt Macy 61eda14cbcSMatt Macy void 62eda14cbcSMatt Macy bpobj_decr_empty(objset_t *os, dmu_tx_t *tx) 63eda14cbcSMatt Macy { 64eda14cbcSMatt Macy dsl_pool_t *dp = dmu_objset_pool(os); 65eda14cbcSMatt Macy 66eda14cbcSMatt Macy spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx); 67eda14cbcSMatt Macy if (!spa_feature_is_active(dmu_objset_spa(os), 68eda14cbcSMatt Macy SPA_FEATURE_EMPTY_BPOBJ)) { 69eda14cbcSMatt Macy VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset, 70eda14cbcSMatt Macy DMU_POOL_DIRECTORY_OBJECT, 71eda14cbcSMatt Macy DMU_POOL_EMPTY_BPOBJ, tx)); 72eda14cbcSMatt Macy VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx)); 73eda14cbcSMatt Macy dp->dp_empty_bpobj = 0; 74eda14cbcSMatt Macy } 75eda14cbcSMatt Macy } 76eda14cbcSMatt Macy 77eda14cbcSMatt Macy uint64_t 78eda14cbcSMatt Macy bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) 79eda14cbcSMatt Macy { 80eda14cbcSMatt Macy int size; 81eda14cbcSMatt Macy 82eda14cbcSMatt Macy if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT) 83eda14cbcSMatt Macy size = BPOBJ_SIZE_V0; 84eda14cbcSMatt Macy else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) 85eda14cbcSMatt Macy size = BPOBJ_SIZE_V1; 86eda14cbcSMatt Macy else if (!spa_feature_is_active(dmu_objset_spa(os), 87eda14cbcSMatt Macy SPA_FEATURE_LIVELIST)) 88eda14cbcSMatt Macy size = BPOBJ_SIZE_V2; 89eda14cbcSMatt Macy else 90eda14cbcSMatt Macy size = sizeof (bpobj_phys_t); 91eda14cbcSMatt Macy 92eda14cbcSMatt Macy return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize, 93eda14cbcSMatt Macy DMU_OT_BPOBJ_HDR, size, tx)); 94eda14cbcSMatt Macy } 95eda14cbcSMatt Macy 96eda14cbcSMatt Macy void 97eda14cbcSMatt Macy bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) 98eda14cbcSMatt Macy { 99eda14cbcSMatt Macy int64_t i; 100eda14cbcSMatt Macy bpobj_t bpo; 101eda14cbcSMatt Macy dmu_object_info_t doi; 102eda14cbcSMatt Macy int epb; 103eda14cbcSMatt Macy dmu_buf_t *dbuf = NULL; 104eda14cbcSMatt Macy 105eda14cbcSMatt Macy ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj); 106eda14cbcSMatt Macy VERIFY3U(0, ==, bpobj_open(&bpo, os, obj)); 107eda14cbcSMatt Macy 108eda14cbcSMatt Macy mutex_enter(&bpo.bpo_lock); 109eda14cbcSMatt Macy 110eda14cbcSMatt Macy if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0) 111eda14cbcSMatt Macy goto out; 112eda14cbcSMatt Macy 113eda14cbcSMatt Macy VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi)); 114eda14cbcSMatt Macy epb = doi.doi_data_block_size / sizeof (uint64_t); 115eda14cbcSMatt Macy 116eda14cbcSMatt Macy for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { 117eda14cbcSMatt Macy uint64_t *objarray; 118eda14cbcSMatt Macy uint64_t offset, blkoff; 119eda14cbcSMatt Macy 120eda14cbcSMatt Macy offset = i * sizeof (uint64_t); 121eda14cbcSMatt Macy blkoff = P2PHASE(i, epb); 122eda14cbcSMatt Macy 123eda14cbcSMatt Macy if (dbuf == NULL || dbuf->db_offset > offset) { 124eda14cbcSMatt Macy if (dbuf) 125eda14cbcSMatt Macy dmu_buf_rele(dbuf, FTAG); 126eda14cbcSMatt Macy VERIFY3U(0, ==, dmu_buf_hold(os, 127eda14cbcSMatt Macy bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0)); 128eda14cbcSMatt Macy } 129eda14cbcSMatt Macy 130eda14cbcSMatt Macy ASSERT3U(offset, >=, dbuf->db_offset); 131eda14cbcSMatt Macy ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); 132eda14cbcSMatt Macy 133eda14cbcSMatt Macy objarray = dbuf->db_data; 134eda14cbcSMatt Macy bpobj_free(os, objarray[blkoff], tx); 135eda14cbcSMatt Macy } 136eda14cbcSMatt Macy if (dbuf) { 137eda14cbcSMatt Macy dmu_buf_rele(dbuf, FTAG); 138eda14cbcSMatt Macy dbuf = NULL; 139eda14cbcSMatt Macy } 140eda14cbcSMatt Macy VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx)); 141eda14cbcSMatt Macy 142eda14cbcSMatt Macy out: 143eda14cbcSMatt Macy mutex_exit(&bpo.bpo_lock); 144eda14cbcSMatt Macy bpobj_close(&bpo); 145eda14cbcSMatt Macy 146eda14cbcSMatt Macy VERIFY3U(0, ==, dmu_object_free(os, obj, tx)); 147eda14cbcSMatt Macy } 148eda14cbcSMatt Macy 149eda14cbcSMatt Macy int 150eda14cbcSMatt Macy bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) 151eda14cbcSMatt Macy { 152eda14cbcSMatt Macy dmu_object_info_t doi; 153eda14cbcSMatt Macy int err; 154eda14cbcSMatt Macy 155eda14cbcSMatt Macy err = dmu_object_info(os, object, &doi); 156eda14cbcSMatt Macy if (err) 157eda14cbcSMatt Macy return (err); 158eda14cbcSMatt Macy 159da5137abSMartin Matuska memset(bpo, 0, sizeof (*bpo)); 160eda14cbcSMatt Macy mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL); 161eda14cbcSMatt Macy 162eda14cbcSMatt Macy ASSERT(bpo->bpo_dbuf == NULL); 163eda14cbcSMatt Macy ASSERT(bpo->bpo_phys == NULL); 164eda14cbcSMatt Macy ASSERT(object != 0); 165eda14cbcSMatt Macy ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ); 166eda14cbcSMatt Macy ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR); 167eda14cbcSMatt Macy 168eda14cbcSMatt Macy err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf); 169eda14cbcSMatt Macy if (err) 170eda14cbcSMatt Macy return (err); 171eda14cbcSMatt Macy 172eda14cbcSMatt Macy bpo->bpo_os = os; 173eda14cbcSMatt Macy bpo->bpo_object = object; 174eda14cbcSMatt Macy bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT; 175eda14cbcSMatt Macy bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0); 176eda14cbcSMatt Macy bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1); 177eda14cbcSMatt Macy bpo->bpo_havefreed = (doi.doi_bonus_size > BPOBJ_SIZE_V2); 178eda14cbcSMatt Macy bpo->bpo_phys = bpo->bpo_dbuf->db_data; 179eda14cbcSMatt Macy return (0); 180eda14cbcSMatt Macy } 181eda14cbcSMatt Macy 182eda14cbcSMatt Macy boolean_t 183eda14cbcSMatt Macy bpobj_is_open(const bpobj_t *bpo) 184eda14cbcSMatt Macy { 185eda14cbcSMatt Macy return (bpo->bpo_object != 0); 186eda14cbcSMatt Macy } 187eda14cbcSMatt Macy 188eda14cbcSMatt Macy void 189eda14cbcSMatt Macy bpobj_close(bpobj_t *bpo) 190eda14cbcSMatt Macy { 191eda14cbcSMatt Macy /* Lame workaround for closing a bpobj that was never opened. */ 192eda14cbcSMatt Macy if (bpo->bpo_object == 0) 193eda14cbcSMatt Macy return; 194eda14cbcSMatt Macy 195eda14cbcSMatt Macy dmu_buf_rele(bpo->bpo_dbuf, bpo); 196eda14cbcSMatt Macy if (bpo->bpo_cached_dbuf != NULL) 197eda14cbcSMatt Macy dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); 198eda14cbcSMatt Macy bpo->bpo_dbuf = NULL; 199eda14cbcSMatt Macy bpo->bpo_phys = NULL; 200eda14cbcSMatt Macy bpo->bpo_cached_dbuf = NULL; 201eda14cbcSMatt Macy bpo->bpo_object = 0; 202eda14cbcSMatt Macy 203eda14cbcSMatt Macy mutex_destroy(&bpo->bpo_lock); 204eda14cbcSMatt Macy } 205eda14cbcSMatt Macy 206eda14cbcSMatt Macy static boolean_t 207eda14cbcSMatt Macy bpobj_is_empty_impl(bpobj_t *bpo) 208eda14cbcSMatt Macy { 209eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&bpo->bpo_lock)); 210eda14cbcSMatt Macy return (bpo->bpo_phys->bpo_num_blkptrs == 0 && 211eda14cbcSMatt Macy (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0)); 212eda14cbcSMatt Macy } 213eda14cbcSMatt Macy 214eda14cbcSMatt Macy boolean_t 215eda14cbcSMatt Macy bpobj_is_empty(bpobj_t *bpo) 216eda14cbcSMatt Macy { 217eda14cbcSMatt Macy mutex_enter(&bpo->bpo_lock); 218eda14cbcSMatt Macy boolean_t is_empty = bpobj_is_empty_impl(bpo); 219eda14cbcSMatt Macy mutex_exit(&bpo->bpo_lock); 220eda14cbcSMatt Macy return (is_empty); 221eda14cbcSMatt Macy } 222eda14cbcSMatt Macy 223eda14cbcSMatt Macy /* 224eda14cbcSMatt Macy * A recursive iteration of the bpobjs would be nice here but we run the risk 225eda14cbcSMatt Macy * of overflowing function stack space. Instead, find each subobj and add it 226eda14cbcSMatt Macy * to the head of our list so it can be scanned for subjobjs. Like a 227eda14cbcSMatt Macy * recursive implementation, the "deepest" subobjs will be freed first. 228eda14cbcSMatt Macy * When a subobj is found to have no additional subojs, free it. 229eda14cbcSMatt Macy */ 230eda14cbcSMatt Macy typedef struct bpobj_info { 231eda14cbcSMatt Macy bpobj_t *bpi_bpo; 232eda14cbcSMatt Macy /* 233eda14cbcSMatt Macy * This object is a subobj of bpi_parent, 234eda14cbcSMatt Macy * at bpi_index in its subobj array. 235eda14cbcSMatt Macy */ 236eda14cbcSMatt Macy struct bpobj_info *bpi_parent; 237eda14cbcSMatt Macy uint64_t bpi_index; 238eda14cbcSMatt Macy /* How many of our subobj's are left to process. */ 239eda14cbcSMatt Macy uint64_t bpi_unprocessed_subobjs; 240eda14cbcSMatt Macy /* True after having visited this bpo's directly referenced BPs. */ 241eda14cbcSMatt Macy boolean_t bpi_visited; 242eda14cbcSMatt Macy list_node_t bpi_node; 243eda14cbcSMatt Macy } bpobj_info_t; 244eda14cbcSMatt Macy 245eda14cbcSMatt Macy static bpobj_info_t * 246eda14cbcSMatt Macy bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index) 247eda14cbcSMatt Macy { 248eda14cbcSMatt Macy bpobj_info_t *bpi = kmem_zalloc(sizeof (bpobj_info_t), KM_SLEEP); 249eda14cbcSMatt Macy bpi->bpi_bpo = bpo; 250eda14cbcSMatt Macy bpi->bpi_parent = parent; 251eda14cbcSMatt Macy bpi->bpi_index = index; 252eda14cbcSMatt Macy if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { 253eda14cbcSMatt Macy bpi->bpi_unprocessed_subobjs = bpo->bpo_phys->bpo_num_subobjs; 254eda14cbcSMatt Macy } 255eda14cbcSMatt Macy return (bpi); 256eda14cbcSMatt Macy } 257eda14cbcSMatt Macy 258eda14cbcSMatt Macy /* 259eda14cbcSMatt Macy * Update bpobj and all of its parents with new space accounting. 260eda14cbcSMatt Macy */ 261eda14cbcSMatt Macy static void 262eda14cbcSMatt Macy propagate_space_reduction(bpobj_info_t *bpi, int64_t freed, 263eda14cbcSMatt Macy int64_t comp_freed, int64_t uncomp_freed, dmu_tx_t *tx) 264eda14cbcSMatt Macy { 265eda14cbcSMatt Macy 266eda14cbcSMatt Macy for (; bpi != NULL; bpi = bpi->bpi_parent) { 267eda14cbcSMatt Macy bpobj_t *p = bpi->bpi_bpo; 268eda14cbcSMatt Macy ASSERT(dmu_buf_is_dirty(p->bpo_dbuf, tx)); 269eda14cbcSMatt Macy p->bpo_phys->bpo_bytes -= freed; 270eda14cbcSMatt Macy ASSERT3S(p->bpo_phys->bpo_bytes, >=, 0); 271eda14cbcSMatt Macy if (p->bpo_havecomp) { 272eda14cbcSMatt Macy p->bpo_phys->bpo_comp -= comp_freed; 273eda14cbcSMatt Macy p->bpo_phys->bpo_uncomp -= uncomp_freed; 274eda14cbcSMatt Macy } 275eda14cbcSMatt Macy } 276eda14cbcSMatt Macy } 277eda14cbcSMatt Macy 278eda14cbcSMatt Macy static int 279eda14cbcSMatt Macy bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg, 280eda14cbcSMatt Macy int64_t start, dmu_tx_t *tx, boolean_t free) 281eda14cbcSMatt Macy { 282eda14cbcSMatt Macy int err = 0; 283eda14cbcSMatt Macy int64_t freed = 0, comp_freed = 0, uncomp_freed = 0; 284eda14cbcSMatt Macy dmu_buf_t *dbuf = NULL; 285eda14cbcSMatt Macy bpobj_t *bpo = bpi->bpi_bpo; 286eda14cbcSMatt Macy 287eda14cbcSMatt Macy for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) { 288eda14cbcSMatt Macy uint64_t offset = i * sizeof (blkptr_t); 289eda14cbcSMatt Macy uint64_t blkoff = P2PHASE(i, bpo->bpo_epb); 290eda14cbcSMatt Macy 291eda14cbcSMatt Macy if (dbuf == NULL || dbuf->db_offset > offset) { 292eda14cbcSMatt Macy if (dbuf) 293eda14cbcSMatt Macy dmu_buf_rele(dbuf, FTAG); 294eda14cbcSMatt Macy err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, 295eda14cbcSMatt Macy offset, FTAG, &dbuf, 0); 296eda14cbcSMatt Macy if (err) 297eda14cbcSMatt Macy break; 298eda14cbcSMatt Macy } 299eda14cbcSMatt Macy 300eda14cbcSMatt Macy ASSERT3U(offset, >=, dbuf->db_offset); 301eda14cbcSMatt Macy ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); 302eda14cbcSMatt Macy 303eda14cbcSMatt Macy blkptr_t *bparray = dbuf->db_data; 304eda14cbcSMatt Macy blkptr_t *bp = &bparray[blkoff]; 305eda14cbcSMatt Macy 306eda14cbcSMatt Macy boolean_t bp_freed = BP_GET_FREE(bp); 307eda14cbcSMatt Macy err = func(arg, bp, bp_freed, tx); 308eda14cbcSMatt Macy if (err) 309eda14cbcSMatt Macy break; 310eda14cbcSMatt Macy 311eda14cbcSMatt Macy if (free) { 312eda14cbcSMatt Macy int sign = bp_freed ? -1 : +1; 313eda14cbcSMatt Macy spa_t *spa = dmu_objset_spa(bpo->bpo_os); 314eda14cbcSMatt Macy freed += sign * bp_get_dsize_sync(spa, bp); 315eda14cbcSMatt Macy comp_freed += sign * BP_GET_PSIZE(bp); 316eda14cbcSMatt Macy uncomp_freed += sign * BP_GET_UCSIZE(bp); 317eda14cbcSMatt Macy ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx)); 318eda14cbcSMatt Macy bpo->bpo_phys->bpo_num_blkptrs--; 319eda14cbcSMatt Macy ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0); 320eda14cbcSMatt Macy if (bp_freed) { 321eda14cbcSMatt Macy ASSERT(bpo->bpo_havefreed); 322eda14cbcSMatt Macy bpo->bpo_phys->bpo_num_freed--; 323eda14cbcSMatt Macy ASSERT3S(bpo->bpo_phys->bpo_num_freed, >=, 0); 324eda14cbcSMatt Macy } 325eda14cbcSMatt Macy } 326eda14cbcSMatt Macy } 327eda14cbcSMatt Macy if (free) { 328eda14cbcSMatt Macy propagate_space_reduction(bpi, freed, comp_freed, 329eda14cbcSMatt Macy uncomp_freed, tx); 330eda14cbcSMatt Macy VERIFY0(dmu_free_range(bpo->bpo_os, 331eda14cbcSMatt Macy bpo->bpo_object, 332eda14cbcSMatt Macy bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), 333eda14cbcSMatt Macy DMU_OBJECT_END, tx)); 334eda14cbcSMatt Macy } 335eda14cbcSMatt Macy if (dbuf) { 336eda14cbcSMatt Macy dmu_buf_rele(dbuf, FTAG); 337eda14cbcSMatt Macy dbuf = NULL; 338eda14cbcSMatt Macy } 339eda14cbcSMatt Macy return (err); 340eda14cbcSMatt Macy } 341eda14cbcSMatt Macy 342eda14cbcSMatt Macy /* 343eda14cbcSMatt Macy * Given an initial bpo, start by freeing the BPs that are directly referenced 344eda14cbcSMatt Macy * by that bpo. If the bpo has subobjs, read in its last subobj and push the 345eda14cbcSMatt Macy * subobj to our stack. By popping items off our stack, eventually we will 346eda14cbcSMatt Macy * encounter a bpo that has no subobjs. We can free its bpobj_info_t, and if 347eda14cbcSMatt Macy * requested also free the now-empty bpo from disk and decrement 348eda14cbcSMatt Macy * its parent's subobj count. We continue popping each subobj from our stack, 349eda14cbcSMatt Macy * visiting its last subobj until they too have no more subobjs, and so on. 350eda14cbcSMatt Macy */ 351eda14cbcSMatt Macy static int 352eda14cbcSMatt Macy bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, 353eda14cbcSMatt Macy dmu_tx_t *tx, boolean_t free, uint64_t *bpobj_size) 354eda14cbcSMatt Macy { 355eda14cbcSMatt Macy list_t stack; 356eda14cbcSMatt Macy bpobj_info_t *bpi; 357eda14cbcSMatt Macy int err = 0; 358eda14cbcSMatt Macy 359eda14cbcSMatt Macy /* 360eda14cbcSMatt Macy * Create a "stack" for us to work with without worrying about 361eda14cbcSMatt Macy * stack overflows. Initialize it with the initial_bpo. 362eda14cbcSMatt Macy */ 363eda14cbcSMatt Macy list_create(&stack, sizeof (bpobj_info_t), 364eda14cbcSMatt Macy offsetof(bpobj_info_t, bpi_node)); 365eda14cbcSMatt Macy mutex_enter(&initial_bpo->bpo_lock); 366eda14cbcSMatt Macy 367eda14cbcSMatt Macy if (bpobj_size != NULL) 368eda14cbcSMatt Macy *bpobj_size = initial_bpo->bpo_phys->bpo_num_blkptrs; 369eda14cbcSMatt Macy 370eda14cbcSMatt Macy list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0)); 371eda14cbcSMatt Macy 372eda14cbcSMatt Macy while ((bpi = list_head(&stack)) != NULL) { 373eda14cbcSMatt Macy bpobj_t *bpo = bpi->bpi_bpo; 374eda14cbcSMatt Macy 375eda14cbcSMatt Macy ASSERT3P(bpo, !=, NULL); 376eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&bpo->bpo_lock)); 377eda14cbcSMatt Macy ASSERT(bpobj_is_open(bpo)); 378eda14cbcSMatt Macy 379eda14cbcSMatt Macy if (free) 380eda14cbcSMatt Macy dmu_buf_will_dirty(bpo->bpo_dbuf, tx); 381eda14cbcSMatt Macy 382eda14cbcSMatt Macy if (bpi->bpi_visited == B_FALSE) { 383eda14cbcSMatt Macy err = bpobj_iterate_blkptrs(bpi, func, arg, 0, tx, 384eda14cbcSMatt Macy free); 385eda14cbcSMatt Macy bpi->bpi_visited = B_TRUE; 386eda14cbcSMatt Macy if (err != 0) 387eda14cbcSMatt Macy break; 388eda14cbcSMatt Macy } 389eda14cbcSMatt Macy /* 390eda14cbcSMatt Macy * We've finished with this bpo's directly-referenced BP's and 391eda14cbcSMatt Macy * it has no more unprocessed subobjs. We can free its 392eda14cbcSMatt Macy * bpobj_info_t (unless it is the topmost, initial_bpo). 393eda14cbcSMatt Macy * If we are freeing from disk, we can also do that. 394eda14cbcSMatt Macy */ 395eda14cbcSMatt Macy if (bpi->bpi_unprocessed_subobjs == 0) { 396eda14cbcSMatt Macy /* 397eda14cbcSMatt Macy * If there are no entries, there should 398eda14cbcSMatt Macy * be no bytes. 399eda14cbcSMatt Macy */ 400eda14cbcSMatt Macy if (bpobj_is_empty_impl(bpo)) { 401eda14cbcSMatt Macy ASSERT0(bpo->bpo_phys->bpo_bytes); 402eda14cbcSMatt Macy ASSERT0(bpo->bpo_phys->bpo_comp); 403eda14cbcSMatt Macy ASSERT0(bpo->bpo_phys->bpo_uncomp); 404eda14cbcSMatt Macy } 405eda14cbcSMatt Macy 406eda14cbcSMatt Macy /* The initial_bpo has no parent and is not closed. */ 407eda14cbcSMatt Macy if (bpi->bpi_parent != NULL) { 408eda14cbcSMatt Macy if (free) { 409eda14cbcSMatt Macy bpobj_t *p = bpi->bpi_parent->bpi_bpo; 410eda14cbcSMatt Macy 411eda14cbcSMatt Macy ASSERT0(bpo->bpo_phys->bpo_num_blkptrs); 412eda14cbcSMatt Macy ASSERT3U(p->bpo_phys->bpo_num_subobjs, 413eda14cbcSMatt Macy >, 0); 414eda14cbcSMatt Macy ASSERT3U(bpi->bpi_index, ==, 415eda14cbcSMatt Macy p->bpo_phys->bpo_num_subobjs - 1); 416eda14cbcSMatt Macy ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, 417eda14cbcSMatt Macy tx)); 418eda14cbcSMatt Macy 419eda14cbcSMatt Macy p->bpo_phys->bpo_num_subobjs--; 420eda14cbcSMatt Macy 421eda14cbcSMatt Macy VERIFY0(dmu_free_range(p->bpo_os, 422eda14cbcSMatt Macy p->bpo_phys->bpo_subobjs, 423eda14cbcSMatt Macy bpi->bpi_index * sizeof (uint64_t), 424eda14cbcSMatt Macy sizeof (uint64_t), tx)); 425eda14cbcSMatt Macy 426eda14cbcSMatt Macy /* eliminate the empty subobj list */ 427eda14cbcSMatt Macy if (bpo->bpo_havesubobj && 428eda14cbcSMatt Macy bpo->bpo_phys->bpo_subobjs != 0) { 429eda14cbcSMatt Macy ASSERT0(bpo->bpo_phys-> 430eda14cbcSMatt Macy bpo_num_subobjs); 431eda14cbcSMatt Macy err = dmu_object_free( 432eda14cbcSMatt Macy bpo->bpo_os, 433eda14cbcSMatt Macy bpo->bpo_phys->bpo_subobjs, 434eda14cbcSMatt Macy tx); 435eda14cbcSMatt Macy if (err) 436eda14cbcSMatt Macy break; 437eda14cbcSMatt Macy bpo->bpo_phys->bpo_subobjs = 0; 438eda14cbcSMatt Macy } 439eda14cbcSMatt Macy err = dmu_object_free(p->bpo_os, 440eda14cbcSMatt Macy bpo->bpo_object, tx); 441eda14cbcSMatt Macy if (err) 442eda14cbcSMatt Macy break; 443eda14cbcSMatt Macy } 444eda14cbcSMatt Macy 445eda14cbcSMatt Macy mutex_exit(&bpo->bpo_lock); 446eda14cbcSMatt Macy bpobj_close(bpo); 447eda14cbcSMatt Macy kmem_free(bpo, sizeof (bpobj_t)); 448eda14cbcSMatt Macy } else { 449eda14cbcSMatt Macy mutex_exit(&bpo->bpo_lock); 450eda14cbcSMatt Macy } 451eda14cbcSMatt Macy 452eda14cbcSMatt Macy /* 453eda14cbcSMatt Macy * Finished processing this bpo. Unlock, and free 454eda14cbcSMatt Macy * our "stack" info. 455eda14cbcSMatt Macy */ 456eda14cbcSMatt Macy list_remove_head(&stack); 457eda14cbcSMatt Macy kmem_free(bpi, sizeof (bpobj_info_t)); 458eda14cbcSMatt Macy } else { 459eda14cbcSMatt Macy /* 460eda14cbcSMatt Macy * We have unprocessed subobjs. Process the next one. 461eda14cbcSMatt Macy */ 462eda14cbcSMatt Macy ASSERT(bpo->bpo_havecomp); 463eda14cbcSMatt Macy ASSERT3P(bpobj_size, ==, NULL); 464eda14cbcSMatt Macy 465eda14cbcSMatt Macy /* Add the last subobj to stack. */ 466eda14cbcSMatt Macy int64_t i = bpi->bpi_unprocessed_subobjs - 1; 467eda14cbcSMatt Macy uint64_t offset = i * sizeof (uint64_t); 468eda14cbcSMatt Macy 469eda14cbcSMatt Macy uint64_t obj_from_sublist; 470eda14cbcSMatt Macy err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 471eda14cbcSMatt Macy offset, sizeof (uint64_t), &obj_from_sublist, 472eda14cbcSMatt Macy DMU_READ_PREFETCH); 473eda14cbcSMatt Macy if (err) 474eda14cbcSMatt Macy break; 475eda14cbcSMatt Macy bpobj_t *sublist = kmem_alloc(sizeof (bpobj_t), 476eda14cbcSMatt Macy KM_SLEEP); 477eda14cbcSMatt Macy 478eda14cbcSMatt Macy err = bpobj_open(sublist, bpo->bpo_os, 479eda14cbcSMatt Macy obj_from_sublist); 480eda14cbcSMatt Macy if (err) 481eda14cbcSMatt Macy break; 482eda14cbcSMatt Macy 483eda14cbcSMatt Macy list_insert_head(&stack, bpi_alloc(sublist, bpi, i)); 484eda14cbcSMatt Macy mutex_enter(&sublist->bpo_lock); 485eda14cbcSMatt Macy bpi->bpi_unprocessed_subobjs--; 486eda14cbcSMatt Macy } 487eda14cbcSMatt Macy } 488eda14cbcSMatt Macy /* 489eda14cbcSMatt Macy * Cleanup anything left on the "stack" after we left the loop. 490eda14cbcSMatt Macy * Every bpo on the stack is locked so we must remember to undo 491eda14cbcSMatt Macy * that now (in LIFO order). 492eda14cbcSMatt Macy */ 493eda14cbcSMatt Macy while ((bpi = list_remove_head(&stack)) != NULL) { 494eda14cbcSMatt Macy bpobj_t *bpo = bpi->bpi_bpo; 495eda14cbcSMatt Macy ASSERT(err != 0); 496eda14cbcSMatt Macy ASSERT3P(bpo, !=, NULL); 497eda14cbcSMatt Macy 498eda14cbcSMatt Macy mutex_exit(&bpo->bpo_lock); 499eda14cbcSMatt Macy 500eda14cbcSMatt Macy /* do not free the initial_bpo */ 501eda14cbcSMatt Macy if (bpi->bpi_parent != NULL) { 502eda14cbcSMatt Macy bpobj_close(bpi->bpi_bpo); 503eda14cbcSMatt Macy kmem_free(bpi->bpi_bpo, sizeof (bpobj_t)); 504eda14cbcSMatt Macy } 505eda14cbcSMatt Macy kmem_free(bpi, sizeof (bpobj_info_t)); 506eda14cbcSMatt Macy } 507eda14cbcSMatt Macy 508eda14cbcSMatt Macy list_destroy(&stack); 509eda14cbcSMatt Macy 510eda14cbcSMatt Macy return (err); 511eda14cbcSMatt Macy } 512eda14cbcSMatt Macy 513eda14cbcSMatt Macy /* 514eda14cbcSMatt Macy * Iterate and remove the entries. If func returns nonzero, iteration 515eda14cbcSMatt Macy * will stop and that entry will not be removed. 516eda14cbcSMatt Macy */ 517eda14cbcSMatt Macy int 518eda14cbcSMatt Macy bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) 519eda14cbcSMatt Macy { 520eda14cbcSMatt Macy return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE, NULL)); 521eda14cbcSMatt Macy } 522eda14cbcSMatt Macy 523eda14cbcSMatt Macy /* 524eda14cbcSMatt Macy * Iterate the entries. If func returns nonzero, iteration will stop. 525eda14cbcSMatt Macy * 526eda14cbcSMatt Macy * If there are no subobjs: 527eda14cbcSMatt Macy * 528eda14cbcSMatt Macy * *bpobj_size can be used to return the number of block pointers in the 529eda14cbcSMatt Macy * bpobj. Note that this may be different from the number of block pointers 530eda14cbcSMatt Macy * that are iterated over, if iteration is terminated early (e.g. by the func 531eda14cbcSMatt Macy * returning nonzero). 532eda14cbcSMatt Macy * 533eda14cbcSMatt Macy * If there are concurrent (or subsequent) modifications to the bpobj then the 534eda14cbcSMatt Macy * returned *bpobj_size can be passed as "start" to 535eda14cbcSMatt Macy * livelist_bpobj_iterate_from_nofree() to iterate the newly added entries. 536eda14cbcSMatt Macy */ 537eda14cbcSMatt Macy int 538eda14cbcSMatt Macy bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, 539eda14cbcSMatt Macy uint64_t *bpobj_size) 540eda14cbcSMatt Macy { 541eda14cbcSMatt Macy return (bpobj_iterate_impl(bpo, func, arg, NULL, B_FALSE, bpobj_size)); 542eda14cbcSMatt Macy } 543eda14cbcSMatt Macy 544eda14cbcSMatt Macy /* 545eda14cbcSMatt Macy * Iterate over the blkptrs in the bpobj beginning at index start. If func 546eda14cbcSMatt Macy * returns nonzero, iteration will stop. This is a livelist specific function 547eda14cbcSMatt Macy * since it assumes that there are no subobjs present. 548eda14cbcSMatt Macy */ 549eda14cbcSMatt Macy int 550eda14cbcSMatt Macy livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, 551eda14cbcSMatt Macy int64_t start) 552eda14cbcSMatt Macy { 553eda14cbcSMatt Macy if (bpo->bpo_havesubobj) 554eda14cbcSMatt Macy VERIFY0(bpo->bpo_phys->bpo_subobjs); 555eda14cbcSMatt Macy bpobj_info_t *bpi = bpi_alloc(bpo, NULL, 0); 556eda14cbcSMatt Macy int err = bpobj_iterate_blkptrs(bpi, func, arg, start, NULL, B_FALSE); 557eda14cbcSMatt Macy kmem_free(bpi, sizeof (bpobj_info_t)); 558eda14cbcSMatt Macy return (err); 559eda14cbcSMatt Macy } 560eda14cbcSMatt Macy 561eda14cbcSMatt Macy /* 562eda14cbcSMatt Macy * Logically add subobj's contents to the parent bpobj. 563eda14cbcSMatt Macy * 564eda14cbcSMatt Macy * In the most general case, this is accomplished in constant time by adding 565eda14cbcSMatt Macy * a reference to subobj. This case is used when enqueuing a large subobj: 566eda14cbcSMatt Macy * +--------------+ +--------------+ 567eda14cbcSMatt Macy * | bpobj |----------------------->| subobj list | 568eda14cbcSMatt Macy * +----+----+----+----+----+ +-----+-----+--+--+ 569eda14cbcSMatt Macy * | bp | bp | bp | bp | bp | | obj | obj | obj | 570eda14cbcSMatt Macy * +----+----+----+----+----+ +-----+-----+-----+ 571eda14cbcSMatt Macy * 572eda14cbcSMatt Macy * +--------------+ +--------------+ 573eda14cbcSMatt Macy * | sub-bpobj |----------------------> | subsubobj | 574eda14cbcSMatt Macy * +----+----+----+----+---------+----+ +-----+-----+--+--------+-----+ 575eda14cbcSMatt Macy * | bp | bp | bp | bp | ... | bp | | obj | obj | ... | obj | 576eda14cbcSMatt Macy * +----+----+----+----+---------+----+ +-----+-----+-----------+-----+ 577eda14cbcSMatt Macy * 578eda14cbcSMatt Macy * Result: sub-bpobj added to parent's subobj list. 579eda14cbcSMatt Macy * +--------------+ +--------------+ 580eda14cbcSMatt Macy * | bpobj |----------------------->| subobj list | 581eda14cbcSMatt Macy * +----+----+----+----+----+ +-----+-----+--+--+-----+ 582eda14cbcSMatt Macy * | bp | bp | bp | bp | bp | | obj | obj | obj | OBJ | 583eda14cbcSMatt Macy * +----+----+----+----+----+ +-----+-----+-----+--|--+ 584eda14cbcSMatt Macy * | 585eda14cbcSMatt Macy * /-----------------------------------------------------/ 586eda14cbcSMatt Macy * v 587eda14cbcSMatt Macy * +--------------+ +--------------+ 588eda14cbcSMatt Macy * | sub-bpobj |----------------------> | subsubobj | 589eda14cbcSMatt Macy * +----+----+----+----+---------+----+ +-----+-----+--+--------+-----+ 590eda14cbcSMatt Macy * | bp | bp | bp | bp | ... | bp | | obj | obj | ... | obj | 591eda14cbcSMatt Macy * +----+----+----+----+---------+----+ +-----+-----+-----------+-----+ 592eda14cbcSMatt Macy * 593eda14cbcSMatt Macy * 594eda14cbcSMatt Macy * In a common case, the subobj is small: its bp's and its list of subobj's 595eda14cbcSMatt Macy * are each stored in a single block. In this case we copy the subobj's 596eda14cbcSMatt Macy * contents to the parent: 597eda14cbcSMatt Macy * +--------------+ +--------------+ 598eda14cbcSMatt Macy * | bpobj |----------------------->| subobj list | 599eda14cbcSMatt Macy * +----+----+----+----+----+ +-----+-----+--+--+ 600eda14cbcSMatt Macy * | bp | bp | bp | bp | bp | | obj | obj | obj | 601eda14cbcSMatt Macy * +----+----+----+----+----+ +-----+-----+-----+ 602eda14cbcSMatt Macy * ^ ^ 603eda14cbcSMatt Macy * +--------------+ | +--------------+ | 604eda14cbcSMatt Macy * | sub-bpobj |---------^------------> | subsubobj | ^ 605eda14cbcSMatt Macy * +----+----+----+ | +-----+-----+--+ | 606eda14cbcSMatt Macy * | BP | BP |-->-->-->-->-/ | OBJ | OBJ |-->-/ 607eda14cbcSMatt Macy * +----+----+ +-----+-----+ 608eda14cbcSMatt Macy * 609eda14cbcSMatt Macy * Result: subobj destroyed, contents copied to parent: 610eda14cbcSMatt Macy * +--------------+ +--------------+ 611eda14cbcSMatt Macy * | bpobj |----------------------->| subobj list | 612eda14cbcSMatt Macy * +----+----+----+----+----+----+----+ +-----+-----+--+--+-----+-----+ 613eda14cbcSMatt Macy * | bp | bp | bp | bp | bp | BP | BP | | obj | obj | obj | OBJ | OBJ | 614eda14cbcSMatt Macy * +----+----+----+----+----+----+----+ +-----+-----+-----+-----+-----+ 615eda14cbcSMatt Macy * 616eda14cbcSMatt Macy * 617eda14cbcSMatt Macy * If the subobj has many BP's but few subobj's, we can copy the sub-subobj's 618eda14cbcSMatt Macy * but retain the sub-bpobj: 619eda14cbcSMatt Macy * +--------------+ +--------------+ 620eda14cbcSMatt Macy * | bpobj |----------------------->| subobj list | 621eda14cbcSMatt Macy * +----+----+----+----+----+ +-----+-----+--+--+ 622eda14cbcSMatt Macy * | bp | bp | bp | bp | bp | | obj | obj | obj | 623eda14cbcSMatt Macy * +----+----+----+----+----+ +-----+-----+-----+ 624eda14cbcSMatt Macy * ^ 625eda14cbcSMatt Macy * +--------------+ +--------------+ | 626eda14cbcSMatt Macy * | sub-bpobj |----------------------> | subsubobj | ^ 627eda14cbcSMatt Macy * +----+----+----+----+---------+----+ +-----+-----+--+ | 628eda14cbcSMatt Macy * | bp | bp | bp | bp | ... | bp | | OBJ | OBJ |-->-/ 629eda14cbcSMatt Macy * +----+----+----+----+---------+----+ +-----+-----+ 630eda14cbcSMatt Macy * 631eda14cbcSMatt Macy * Result: sub-sub-bpobjs and subobj added to parent's subobj list. 632eda14cbcSMatt Macy * +--------------+ +--------------+ 633eda14cbcSMatt Macy * | bpobj |-------------------->| subobj list | 634eda14cbcSMatt Macy * +----+----+----+----+----+ +-----+-----+--+--+-----+-----+------+ 635eda14cbcSMatt Macy * | bp | bp | bp | bp | bp | | obj | obj | obj | OBJ | OBJ | OBJ* | 636eda14cbcSMatt Macy * +----+----+----+----+----+ +-----+-----+-----+-----+-----+--|---+ 637eda14cbcSMatt Macy * | 638eda14cbcSMatt Macy * /--------------------------------------------------------------/ 639eda14cbcSMatt Macy * v 640eda14cbcSMatt Macy * +--------------+ 641eda14cbcSMatt Macy * | sub-bpobj | 642eda14cbcSMatt Macy * +----+----+----+----+---------+----+ 643eda14cbcSMatt Macy * | bp | bp | bp | bp | ... | bp | 644eda14cbcSMatt Macy * +----+----+----+----+---------+----+ 645eda14cbcSMatt Macy */ 646eda14cbcSMatt Macy void 647eda14cbcSMatt Macy bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) 648eda14cbcSMatt Macy { 649eda14cbcSMatt Macy bpobj_t subbpo; 650eda14cbcSMatt Macy uint64_t used, comp, uncomp, subsubobjs; 651eda14cbcSMatt Macy boolean_t copy_subsub = B_TRUE; 652eda14cbcSMatt Macy boolean_t copy_bps = B_TRUE; 653eda14cbcSMatt Macy 654eda14cbcSMatt Macy ASSERT(bpobj_is_open(bpo)); 655eda14cbcSMatt Macy ASSERT(subobj != 0); 656eda14cbcSMatt Macy ASSERT(bpo->bpo_havesubobj); 657eda14cbcSMatt Macy ASSERT(bpo->bpo_havecomp); 658eda14cbcSMatt Macy ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); 659eda14cbcSMatt Macy 660eda14cbcSMatt Macy if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) { 661eda14cbcSMatt Macy bpobj_decr_empty(bpo->bpo_os, tx); 662eda14cbcSMatt Macy return; 663eda14cbcSMatt Macy } 664eda14cbcSMatt Macy 665eda14cbcSMatt Macy VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj)); 666eda14cbcSMatt Macy if (bpobj_is_empty(&subbpo)) { 667eda14cbcSMatt Macy /* No point in having an empty subobj. */ 668eda14cbcSMatt Macy bpobj_close(&subbpo); 669eda14cbcSMatt Macy bpobj_free(bpo->bpo_os, subobj, tx); 670eda14cbcSMatt Macy return; 671eda14cbcSMatt Macy } 672c9539b89SMartin Matuska VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); 673eda14cbcSMatt Macy 674eda14cbcSMatt Macy mutex_enter(&bpo->bpo_lock); 675eda14cbcSMatt Macy dmu_buf_will_dirty(bpo->bpo_dbuf, tx); 676eda14cbcSMatt Macy 677eda14cbcSMatt Macy dmu_object_info_t doi; 678eda14cbcSMatt Macy 679eda14cbcSMatt Macy if (bpo->bpo_phys->bpo_subobjs != 0) { 680eda14cbcSMatt Macy ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 681eda14cbcSMatt Macy &doi)); 682eda14cbcSMatt Macy ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ); 683eda14cbcSMatt Macy } 684eda14cbcSMatt Macy 685eda14cbcSMatt Macy /* 686eda14cbcSMatt Macy * If subobj has only one block of subobjs, then move subobj's 687eda14cbcSMatt Macy * subobjs to bpo's subobj list directly. This reduces recursion in 688eda14cbcSMatt Macy * bpobj_iterate due to nested subobjs. 689eda14cbcSMatt Macy */ 690eda14cbcSMatt Macy subsubobjs = subbpo.bpo_phys->bpo_subobjs; 691eda14cbcSMatt Macy if (subsubobjs != 0) { 692eda14cbcSMatt Macy VERIFY0(dmu_object_info(bpo->bpo_os, subsubobjs, &doi)); 693eda14cbcSMatt Macy if (doi.doi_max_offset > doi.doi_data_block_size) { 694eda14cbcSMatt Macy copy_subsub = B_FALSE; 695eda14cbcSMatt Macy } 696eda14cbcSMatt Macy } 697eda14cbcSMatt Macy 698eda14cbcSMatt Macy /* 699eda14cbcSMatt Macy * If, in addition to having only one block of subobj's, subobj has 700eda14cbcSMatt Macy * only one block of bp's, then move subobj's bp's to bpo's bp list 701eda14cbcSMatt Macy * directly. This reduces recursion in bpobj_iterate due to nested 702eda14cbcSMatt Macy * subobjs. 703eda14cbcSMatt Macy */ 704eda14cbcSMatt Macy VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subobj, &doi)); 705eda14cbcSMatt Macy if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) { 706eda14cbcSMatt Macy copy_bps = B_FALSE; 707eda14cbcSMatt Macy } 708eda14cbcSMatt Macy 709eda14cbcSMatt Macy if (copy_subsub && subsubobjs != 0) { 710eda14cbcSMatt Macy dmu_buf_t *subdb; 711eda14cbcSMatt Macy uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs; 712eda14cbcSMatt Macy 713eda14cbcSMatt Macy VERIFY0(dmu_buf_hold(bpo->bpo_os, subsubobjs, 714eda14cbcSMatt Macy 0, FTAG, &subdb, 0)); 715eda14cbcSMatt Macy /* 716eda14cbcSMatt Macy * Make sure that we are not asking dmu_write() 717eda14cbcSMatt Macy * to write more data than we have in our buffer. 718eda14cbcSMatt Macy */ 719eda14cbcSMatt Macy VERIFY3U(subdb->db_size, >=, 720eda14cbcSMatt Macy numsubsub * sizeof (subobj)); 721eda14cbcSMatt Macy if (bpo->bpo_phys->bpo_subobjs == 0) { 722eda14cbcSMatt Macy bpo->bpo_phys->bpo_subobjs = 723eda14cbcSMatt Macy dmu_object_alloc(bpo->bpo_os, 724eda14cbcSMatt Macy DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE, 725eda14cbcSMatt Macy DMU_OT_NONE, 0, tx); 726eda14cbcSMatt Macy } 727eda14cbcSMatt Macy dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 728eda14cbcSMatt Macy bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 729eda14cbcSMatt Macy numsubsub * sizeof (subobj), subdb->db_data, tx); 730eda14cbcSMatt Macy dmu_buf_rele(subdb, FTAG); 731eda14cbcSMatt Macy bpo->bpo_phys->bpo_num_subobjs += numsubsub; 732eda14cbcSMatt Macy 733eda14cbcSMatt Macy dmu_buf_will_dirty(subbpo.bpo_dbuf, tx); 734eda14cbcSMatt Macy subbpo.bpo_phys->bpo_subobjs = 0; 735eda14cbcSMatt Macy VERIFY0(dmu_object_free(bpo->bpo_os, subsubobjs, tx)); 736eda14cbcSMatt Macy } 737eda14cbcSMatt Macy 738eda14cbcSMatt Macy if (copy_bps) { 739eda14cbcSMatt Macy dmu_buf_t *bps; 740eda14cbcSMatt Macy uint64_t numbps = subbpo.bpo_phys->bpo_num_blkptrs; 741eda14cbcSMatt Macy 742eda14cbcSMatt Macy ASSERT(copy_subsub); 743eda14cbcSMatt Macy VERIFY0(dmu_buf_hold(bpo->bpo_os, subobj, 744eda14cbcSMatt Macy 0, FTAG, &bps, 0)); 745eda14cbcSMatt Macy 746eda14cbcSMatt Macy /* 747eda14cbcSMatt Macy * Make sure that we are not asking dmu_write() 748eda14cbcSMatt Macy * to write more data than we have in our buffer. 749eda14cbcSMatt Macy */ 750eda14cbcSMatt Macy VERIFY3U(bps->db_size, >=, numbps * sizeof (blkptr_t)); 751eda14cbcSMatt Macy dmu_write(bpo->bpo_os, bpo->bpo_object, 752eda14cbcSMatt Macy bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), 753eda14cbcSMatt Macy numbps * sizeof (blkptr_t), 754eda14cbcSMatt Macy bps->db_data, tx); 755eda14cbcSMatt Macy dmu_buf_rele(bps, FTAG); 756eda14cbcSMatt Macy bpo->bpo_phys->bpo_num_blkptrs += numbps; 757eda14cbcSMatt Macy 758eda14cbcSMatt Macy bpobj_close(&subbpo); 759eda14cbcSMatt Macy VERIFY0(dmu_object_free(bpo->bpo_os, subobj, tx)); 760eda14cbcSMatt Macy } else { 761eda14cbcSMatt Macy bpobj_close(&subbpo); 762eda14cbcSMatt Macy if (bpo->bpo_phys->bpo_subobjs == 0) { 763eda14cbcSMatt Macy bpo->bpo_phys->bpo_subobjs = 764eda14cbcSMatt Macy dmu_object_alloc(bpo->bpo_os, 765eda14cbcSMatt Macy DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE, 766eda14cbcSMatt Macy DMU_OT_NONE, 0, tx); 767eda14cbcSMatt Macy } 768eda14cbcSMatt Macy 769eda14cbcSMatt Macy dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 770eda14cbcSMatt Macy bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 771eda14cbcSMatt Macy sizeof (subobj), &subobj, tx); 772eda14cbcSMatt Macy bpo->bpo_phys->bpo_num_subobjs++; 773eda14cbcSMatt Macy } 774eda14cbcSMatt Macy 775eda14cbcSMatt Macy bpo->bpo_phys->bpo_bytes += used; 776eda14cbcSMatt Macy bpo->bpo_phys->bpo_comp += comp; 777eda14cbcSMatt Macy bpo->bpo_phys->bpo_uncomp += uncomp; 778eda14cbcSMatt Macy mutex_exit(&bpo->bpo_lock); 779eda14cbcSMatt Macy 780eda14cbcSMatt Macy } 781eda14cbcSMatt Macy 782c9539b89SMartin Matuska /* 783c9539b89SMartin Matuska * Prefetch metadata required for bpobj_enqueue_subobj(). 784c9539b89SMartin Matuska */ 785c9539b89SMartin Matuska void 786c9539b89SMartin Matuska bpobj_prefetch_subobj(bpobj_t *bpo, uint64_t subobj) 787c9539b89SMartin Matuska { 788c9539b89SMartin Matuska dmu_object_info_t doi; 789c9539b89SMartin Matuska bpobj_t subbpo; 790c9539b89SMartin Matuska uint64_t subsubobjs; 791c9539b89SMartin Matuska boolean_t copy_subsub = B_TRUE; 792c9539b89SMartin Matuska boolean_t copy_bps = B_TRUE; 793c9539b89SMartin Matuska 794c9539b89SMartin Matuska ASSERT(bpobj_is_open(bpo)); 795c9539b89SMartin Matuska ASSERT(subobj != 0); 796c9539b89SMartin Matuska 797c9539b89SMartin Matuska if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) 798c9539b89SMartin Matuska return; 799c9539b89SMartin Matuska 800c9539b89SMartin Matuska if (bpobj_open(&subbpo, bpo->bpo_os, subobj) != 0) 801c9539b89SMartin Matuska return; 802c9539b89SMartin Matuska if (bpobj_is_empty(&subbpo)) { 803c9539b89SMartin Matuska bpobj_close(&subbpo); 804c9539b89SMartin Matuska return; 805c9539b89SMartin Matuska } 806c9539b89SMartin Matuska subsubobjs = subbpo.bpo_phys->bpo_subobjs; 807c9539b89SMartin Matuska bpobj_close(&subbpo); 808c9539b89SMartin Matuska 809c9539b89SMartin Matuska if (subsubobjs != 0) { 810c9539b89SMartin Matuska if (dmu_object_info(bpo->bpo_os, subsubobjs, &doi) != 0) 811c9539b89SMartin Matuska return; 812c9539b89SMartin Matuska if (doi.doi_max_offset > doi.doi_data_block_size) 813c9539b89SMartin Matuska copy_subsub = B_FALSE; 814c9539b89SMartin Matuska } 815c9539b89SMartin Matuska 816c9539b89SMartin Matuska if (dmu_object_info(bpo->bpo_os, subobj, &doi) != 0) 817c9539b89SMartin Matuska return; 818c9539b89SMartin Matuska if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) 819c9539b89SMartin Matuska copy_bps = B_FALSE; 820c9539b89SMartin Matuska 821c9539b89SMartin Matuska if (copy_subsub && subsubobjs != 0) { 822c9539b89SMartin Matuska if (bpo->bpo_phys->bpo_subobjs) { 823c9539b89SMartin Matuska dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0, 824c9539b89SMartin Matuska bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1, 825c9539b89SMartin Matuska ZIO_PRIORITY_ASYNC_READ); 826c9539b89SMartin Matuska } 827c9539b89SMartin Matuska dmu_prefetch(bpo->bpo_os, subsubobjs, 0, 0, 1, 828c9539b89SMartin Matuska ZIO_PRIORITY_ASYNC_READ); 829c9539b89SMartin Matuska } 830c9539b89SMartin Matuska 831c9539b89SMartin Matuska if (copy_bps) { 832c9539b89SMartin Matuska dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, 833c9539b89SMartin Matuska bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), 1, 834c9539b89SMartin Matuska ZIO_PRIORITY_ASYNC_READ); 835c9539b89SMartin Matuska dmu_prefetch(bpo->bpo_os, subobj, 0, 0, 1, 836c9539b89SMartin Matuska ZIO_PRIORITY_ASYNC_READ); 837c9539b89SMartin Matuska } else if (bpo->bpo_phys->bpo_subobjs) { 838c9539b89SMartin Matuska dmu_prefetch(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 0, 839c9539b89SMartin Matuska bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 1, 840c9539b89SMartin Matuska ZIO_PRIORITY_ASYNC_READ); 841c9539b89SMartin Matuska } 842c9539b89SMartin Matuska } 843c9539b89SMartin Matuska 844eda14cbcSMatt Macy void 845eda14cbcSMatt Macy bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed, 846eda14cbcSMatt Macy dmu_tx_t *tx) 847eda14cbcSMatt Macy { 848eda14cbcSMatt Macy blkptr_t stored_bp = *bp; 849eda14cbcSMatt Macy uint64_t offset; 850eda14cbcSMatt Macy int blkoff; 851eda14cbcSMatt Macy blkptr_t *bparray; 852eda14cbcSMatt Macy 853eda14cbcSMatt Macy ASSERT(bpobj_is_open(bpo)); 854eda14cbcSMatt Macy ASSERT(!BP_IS_HOLE(bp)); 855eda14cbcSMatt Macy ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); 856eda14cbcSMatt Macy 857eda14cbcSMatt Macy if (BP_IS_EMBEDDED(bp)) { 858eda14cbcSMatt Macy /* 859eda14cbcSMatt Macy * The bpobj will compress better without the payload. 860eda14cbcSMatt Macy * 861eda14cbcSMatt Macy * Note that we store EMBEDDED bp's because they have an 862eda14cbcSMatt Macy * uncompressed size, which must be accounted for. An 863eda14cbcSMatt Macy * alternative would be to add their size to bpo_uncomp 864eda14cbcSMatt Macy * without storing the bp, but that would create additional 865eda14cbcSMatt Macy * complications: bpo_uncomp would be inconsistent with the 866eda14cbcSMatt Macy * set of BP's stored, and bpobj_iterate() wouldn't visit 867eda14cbcSMatt Macy * all the space accounted for in the bpobj. 868eda14cbcSMatt Macy */ 869da5137abSMartin Matuska memset(&stored_bp, 0, sizeof (stored_bp)); 870eda14cbcSMatt Macy stored_bp.blk_prop = bp->blk_prop; 871eda14cbcSMatt Macy stored_bp.blk_birth = bp->blk_birth; 872eda14cbcSMatt Macy } else if (!BP_GET_DEDUP(bp)) { 873eda14cbcSMatt Macy /* The bpobj will compress better without the checksum */ 874da5137abSMartin Matuska memset(&stored_bp.blk_cksum, 0, sizeof (stored_bp.blk_cksum)); 875eda14cbcSMatt Macy } 876eda14cbcSMatt Macy 877eda14cbcSMatt Macy stored_bp.blk_fill = 0; 878eda14cbcSMatt Macy BP_SET_FREE(&stored_bp, bp_freed); 879eda14cbcSMatt Macy 880eda14cbcSMatt Macy mutex_enter(&bpo->bpo_lock); 881eda14cbcSMatt Macy 882eda14cbcSMatt Macy offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp); 883eda14cbcSMatt Macy blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb); 884eda14cbcSMatt Macy 885eda14cbcSMatt Macy if (bpo->bpo_cached_dbuf == NULL || 886eda14cbcSMatt Macy offset < bpo->bpo_cached_dbuf->db_offset || 887eda14cbcSMatt Macy offset >= bpo->bpo_cached_dbuf->db_offset + 888eda14cbcSMatt Macy bpo->bpo_cached_dbuf->db_size) { 889eda14cbcSMatt Macy if (bpo->bpo_cached_dbuf) 890eda14cbcSMatt Macy dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); 891eda14cbcSMatt Macy VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, 892eda14cbcSMatt Macy offset, bpo, &bpo->bpo_cached_dbuf, 0)); 893*2a58b312SMartin Matuska ASSERT3P(bpo->bpo_cached_dbuf, !=, NULL); 894eda14cbcSMatt Macy } 895eda14cbcSMatt Macy 896eda14cbcSMatt Macy dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx); 897eda14cbcSMatt Macy bparray = bpo->bpo_cached_dbuf->db_data; 898eda14cbcSMatt Macy bparray[blkoff] = stored_bp; 899eda14cbcSMatt Macy 900eda14cbcSMatt Macy dmu_buf_will_dirty(bpo->bpo_dbuf, tx); 901eda14cbcSMatt Macy bpo->bpo_phys->bpo_num_blkptrs++; 902eda14cbcSMatt Macy int sign = bp_freed ? -1 : +1; 903eda14cbcSMatt Macy bpo->bpo_phys->bpo_bytes += sign * 904eda14cbcSMatt Macy bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); 905eda14cbcSMatt Macy if (bpo->bpo_havecomp) { 906eda14cbcSMatt Macy bpo->bpo_phys->bpo_comp += sign * BP_GET_PSIZE(bp); 907eda14cbcSMatt Macy bpo->bpo_phys->bpo_uncomp += sign * BP_GET_UCSIZE(bp); 908eda14cbcSMatt Macy } 909eda14cbcSMatt Macy if (bp_freed) { 910eda14cbcSMatt Macy ASSERT(bpo->bpo_havefreed); 911eda14cbcSMatt Macy bpo->bpo_phys->bpo_num_freed++; 912eda14cbcSMatt Macy } 913eda14cbcSMatt Macy mutex_exit(&bpo->bpo_lock); 914eda14cbcSMatt Macy } 915eda14cbcSMatt Macy 916eda14cbcSMatt Macy struct space_range_arg { 917eda14cbcSMatt Macy spa_t *spa; 918eda14cbcSMatt Macy uint64_t mintxg; 919eda14cbcSMatt Macy uint64_t maxtxg; 920eda14cbcSMatt Macy uint64_t used; 921eda14cbcSMatt Macy uint64_t comp; 922eda14cbcSMatt Macy uint64_t uncomp; 923eda14cbcSMatt Macy }; 924eda14cbcSMatt Macy 925eda14cbcSMatt Macy static int 926eda14cbcSMatt Macy space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) 927eda14cbcSMatt Macy { 928e92ffd9bSMartin Matuska (void) bp_freed, (void) tx; 929eda14cbcSMatt Macy struct space_range_arg *sra = arg; 930eda14cbcSMatt Macy 931eda14cbcSMatt Macy if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) { 932eda14cbcSMatt Macy if (dsl_pool_sync_context(spa_get_dsl(sra->spa))) 933eda14cbcSMatt Macy sra->used += bp_get_dsize_sync(sra->spa, bp); 934eda14cbcSMatt Macy else 935eda14cbcSMatt Macy sra->used += bp_get_dsize(sra->spa, bp); 936eda14cbcSMatt Macy sra->comp += BP_GET_PSIZE(bp); 937eda14cbcSMatt Macy sra->uncomp += BP_GET_UCSIZE(bp); 938eda14cbcSMatt Macy } 939eda14cbcSMatt Macy return (0); 940eda14cbcSMatt Macy } 941eda14cbcSMatt Macy 942eda14cbcSMatt Macy int 943eda14cbcSMatt Macy bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) 944eda14cbcSMatt Macy { 945eda14cbcSMatt Macy ASSERT(bpobj_is_open(bpo)); 946eda14cbcSMatt Macy mutex_enter(&bpo->bpo_lock); 947eda14cbcSMatt Macy 948eda14cbcSMatt Macy *usedp = bpo->bpo_phys->bpo_bytes; 949eda14cbcSMatt Macy if (bpo->bpo_havecomp) { 950eda14cbcSMatt Macy *compp = bpo->bpo_phys->bpo_comp; 951eda14cbcSMatt Macy *uncompp = bpo->bpo_phys->bpo_uncomp; 952eda14cbcSMatt Macy mutex_exit(&bpo->bpo_lock); 953eda14cbcSMatt Macy return (0); 954eda14cbcSMatt Macy } else { 955eda14cbcSMatt Macy mutex_exit(&bpo->bpo_lock); 956eda14cbcSMatt Macy return (bpobj_space_range(bpo, 0, UINT64_MAX, 957eda14cbcSMatt Macy usedp, compp, uncompp)); 958eda14cbcSMatt Macy } 959eda14cbcSMatt Macy } 960eda14cbcSMatt Macy 961eda14cbcSMatt Macy /* 962eda14cbcSMatt Macy * Return the amount of space in the bpobj which is: 963eda14cbcSMatt Macy * mintxg < blk_birth <= maxtxg 964eda14cbcSMatt Macy */ 965eda14cbcSMatt Macy int 966eda14cbcSMatt Macy bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, 967eda14cbcSMatt Macy uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) 968eda14cbcSMatt Macy { 969eda14cbcSMatt Macy struct space_range_arg sra = { 0 }; 970eda14cbcSMatt Macy int err; 971eda14cbcSMatt Macy 972eda14cbcSMatt Macy ASSERT(bpobj_is_open(bpo)); 973eda14cbcSMatt Macy 974eda14cbcSMatt Macy /* 975eda14cbcSMatt Macy * As an optimization, if they want the whole txg range, just 976eda14cbcSMatt Macy * get bpo_bytes rather than iterating over the bps. 977eda14cbcSMatt Macy */ 978eda14cbcSMatt Macy if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp) 979eda14cbcSMatt Macy return (bpobj_space(bpo, usedp, compp, uncompp)); 980eda14cbcSMatt Macy 981eda14cbcSMatt Macy sra.spa = dmu_objset_spa(bpo->bpo_os); 982eda14cbcSMatt Macy sra.mintxg = mintxg; 983eda14cbcSMatt Macy sra.maxtxg = maxtxg; 984eda14cbcSMatt Macy 985eda14cbcSMatt Macy err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL); 986eda14cbcSMatt Macy *usedp = sra.used; 987eda14cbcSMatt Macy *compp = sra.comp; 988eda14cbcSMatt Macy *uncompp = sra.uncomp; 989eda14cbcSMatt Macy return (err); 990eda14cbcSMatt Macy } 991eda14cbcSMatt Macy 992eda14cbcSMatt Macy /* 993eda14cbcSMatt Macy * A bpobj_itor_t to append blkptrs to a bplist. Note that while blkptrs in a 994eda14cbcSMatt Macy * bpobj are designated as free or allocated that information is not preserved 995eda14cbcSMatt Macy * in bplists. 996eda14cbcSMatt Macy */ 997eda14cbcSMatt Macy int 998eda14cbcSMatt Macy bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 999eda14cbcSMatt Macy dmu_tx_t *tx) 1000eda14cbcSMatt Macy { 1001e92ffd9bSMartin Matuska (void) bp_freed, (void) tx; 1002eda14cbcSMatt Macy bplist_t *bpl = arg; 1003eda14cbcSMatt Macy bplist_append(bpl, bp); 1004eda14cbcSMatt Macy return (0); 1005eda14cbcSMatt Macy } 1006