1*eda14cbcSMatt Macy /* 2*eda14cbcSMatt Macy * CDDL HEADER START 3*eda14cbcSMatt Macy * 4*eda14cbcSMatt Macy * The contents of this file are subject to the terms of the 5*eda14cbcSMatt Macy * Common Development and Distribution License (the "License"). 6*eda14cbcSMatt Macy * You may not use this file except in compliance with the License. 7*eda14cbcSMatt Macy * 8*eda14cbcSMatt Macy * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9*eda14cbcSMatt Macy * or http://www.opensolaris.org/os/licensing. 10*eda14cbcSMatt Macy * See the License for the specific language governing permissions 11*eda14cbcSMatt Macy * and limitations under the License. 12*eda14cbcSMatt Macy * 13*eda14cbcSMatt Macy * When distributing Covered Code, include this CDDL HEADER in each 14*eda14cbcSMatt Macy * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15*eda14cbcSMatt Macy * If applicable, add the following below this CDDL HEADER, with the 16*eda14cbcSMatt Macy * fields enclosed by brackets "[]" replaced with your own identifying 17*eda14cbcSMatt Macy * information: Portions Copyright [yyyy] [name of copyright owner] 18*eda14cbcSMatt Macy * 19*eda14cbcSMatt Macy * CDDL HEADER END 20*eda14cbcSMatt Macy */ 21*eda14cbcSMatt Macy /* 22*eda14cbcSMatt Macy * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23*eda14cbcSMatt Macy * Copyright (c) 2011, 2018 by Delphix. All rights reserved. 24*eda14cbcSMatt Macy * Copyright (c) 2017 Datto Inc. 25*eda14cbcSMatt Macy */ 26*eda14cbcSMatt Macy 27*eda14cbcSMatt Macy #include <sys/bpobj.h> 28*eda14cbcSMatt Macy #include <sys/zfs_context.h> 29*eda14cbcSMatt Macy #include <sys/zfs_refcount.h> 30*eda14cbcSMatt Macy #include <sys/dsl_pool.h> 31*eda14cbcSMatt Macy #include <sys/zfeature.h> 32*eda14cbcSMatt Macy #include <sys/zap.h> 33*eda14cbcSMatt Macy 34*eda14cbcSMatt Macy /* 35*eda14cbcSMatt Macy * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj). 36*eda14cbcSMatt Macy */ 37*eda14cbcSMatt Macy uint64_t 38*eda14cbcSMatt Macy bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx) 39*eda14cbcSMatt Macy { 40*eda14cbcSMatt Macy spa_t *spa = dmu_objset_spa(os); 41*eda14cbcSMatt Macy dsl_pool_t *dp = dmu_objset_pool(os); 42*eda14cbcSMatt Macy 43*eda14cbcSMatt Macy if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) { 44*eda14cbcSMatt Macy if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) { 45*eda14cbcSMatt Macy ASSERT0(dp->dp_empty_bpobj); 46*eda14cbcSMatt Macy dp->dp_empty_bpobj = 47*eda14cbcSMatt Macy bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx); 48*eda14cbcSMatt Macy VERIFY(zap_add(os, 49*eda14cbcSMatt Macy DMU_POOL_DIRECTORY_OBJECT, 50*eda14cbcSMatt Macy DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1, 51*eda14cbcSMatt Macy &dp->dp_empty_bpobj, tx) == 0); 52*eda14cbcSMatt Macy } 53*eda14cbcSMatt Macy spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx); 54*eda14cbcSMatt Macy ASSERT(dp->dp_empty_bpobj != 0); 55*eda14cbcSMatt Macy return (dp->dp_empty_bpobj); 56*eda14cbcSMatt Macy } else { 57*eda14cbcSMatt Macy return (bpobj_alloc(os, blocksize, tx)); 58*eda14cbcSMatt Macy } 59*eda14cbcSMatt Macy } 60*eda14cbcSMatt Macy 61*eda14cbcSMatt Macy void 62*eda14cbcSMatt Macy bpobj_decr_empty(objset_t *os, dmu_tx_t *tx) 63*eda14cbcSMatt Macy { 64*eda14cbcSMatt Macy dsl_pool_t *dp = dmu_objset_pool(os); 65*eda14cbcSMatt Macy 66*eda14cbcSMatt Macy spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx); 67*eda14cbcSMatt Macy if (!spa_feature_is_active(dmu_objset_spa(os), 68*eda14cbcSMatt Macy SPA_FEATURE_EMPTY_BPOBJ)) { 69*eda14cbcSMatt Macy VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset, 70*eda14cbcSMatt Macy DMU_POOL_DIRECTORY_OBJECT, 71*eda14cbcSMatt Macy DMU_POOL_EMPTY_BPOBJ, tx)); 72*eda14cbcSMatt Macy VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx)); 73*eda14cbcSMatt Macy dp->dp_empty_bpobj = 0; 74*eda14cbcSMatt Macy } 75*eda14cbcSMatt Macy } 76*eda14cbcSMatt Macy 77*eda14cbcSMatt Macy uint64_t 78*eda14cbcSMatt Macy bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) 79*eda14cbcSMatt Macy { 80*eda14cbcSMatt Macy int size; 81*eda14cbcSMatt Macy 82*eda14cbcSMatt Macy if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT) 83*eda14cbcSMatt Macy size = BPOBJ_SIZE_V0; 84*eda14cbcSMatt Macy else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) 85*eda14cbcSMatt Macy size = BPOBJ_SIZE_V1; 86*eda14cbcSMatt Macy else if (!spa_feature_is_active(dmu_objset_spa(os), 87*eda14cbcSMatt Macy SPA_FEATURE_LIVELIST)) 88*eda14cbcSMatt Macy size = BPOBJ_SIZE_V2; 89*eda14cbcSMatt Macy else 90*eda14cbcSMatt Macy size = sizeof (bpobj_phys_t); 91*eda14cbcSMatt Macy 92*eda14cbcSMatt Macy return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize, 93*eda14cbcSMatt Macy DMU_OT_BPOBJ_HDR, size, tx)); 94*eda14cbcSMatt Macy } 95*eda14cbcSMatt Macy 96*eda14cbcSMatt Macy void 97*eda14cbcSMatt Macy bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) 98*eda14cbcSMatt Macy { 99*eda14cbcSMatt Macy int64_t i; 100*eda14cbcSMatt Macy bpobj_t bpo; 101*eda14cbcSMatt Macy dmu_object_info_t doi; 102*eda14cbcSMatt Macy int epb; 103*eda14cbcSMatt Macy dmu_buf_t *dbuf = NULL; 104*eda14cbcSMatt Macy 105*eda14cbcSMatt Macy ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj); 106*eda14cbcSMatt Macy VERIFY3U(0, ==, bpobj_open(&bpo, os, obj)); 107*eda14cbcSMatt Macy 108*eda14cbcSMatt Macy mutex_enter(&bpo.bpo_lock); 109*eda14cbcSMatt Macy 110*eda14cbcSMatt Macy if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0) 111*eda14cbcSMatt Macy goto out; 112*eda14cbcSMatt Macy 113*eda14cbcSMatt Macy VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi)); 114*eda14cbcSMatt Macy epb = doi.doi_data_block_size / sizeof (uint64_t); 115*eda14cbcSMatt Macy 116*eda14cbcSMatt Macy for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { 117*eda14cbcSMatt Macy uint64_t *objarray; 118*eda14cbcSMatt Macy uint64_t offset, blkoff; 119*eda14cbcSMatt Macy 120*eda14cbcSMatt Macy offset = i * sizeof (uint64_t); 121*eda14cbcSMatt Macy blkoff = P2PHASE(i, epb); 122*eda14cbcSMatt Macy 123*eda14cbcSMatt Macy if (dbuf == NULL || dbuf->db_offset > offset) { 124*eda14cbcSMatt Macy if (dbuf) 125*eda14cbcSMatt Macy dmu_buf_rele(dbuf, FTAG); 126*eda14cbcSMatt Macy VERIFY3U(0, ==, dmu_buf_hold(os, 127*eda14cbcSMatt Macy bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0)); 128*eda14cbcSMatt Macy } 129*eda14cbcSMatt Macy 130*eda14cbcSMatt Macy ASSERT3U(offset, >=, dbuf->db_offset); 131*eda14cbcSMatt Macy ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); 132*eda14cbcSMatt Macy 133*eda14cbcSMatt Macy objarray = dbuf->db_data; 134*eda14cbcSMatt Macy bpobj_free(os, objarray[blkoff], tx); 135*eda14cbcSMatt Macy } 136*eda14cbcSMatt Macy if (dbuf) { 137*eda14cbcSMatt Macy dmu_buf_rele(dbuf, FTAG); 138*eda14cbcSMatt Macy dbuf = NULL; 139*eda14cbcSMatt Macy } 140*eda14cbcSMatt Macy VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx)); 141*eda14cbcSMatt Macy 142*eda14cbcSMatt Macy out: 143*eda14cbcSMatt Macy mutex_exit(&bpo.bpo_lock); 144*eda14cbcSMatt Macy bpobj_close(&bpo); 145*eda14cbcSMatt Macy 146*eda14cbcSMatt Macy VERIFY3U(0, ==, dmu_object_free(os, obj, tx)); 147*eda14cbcSMatt Macy } 148*eda14cbcSMatt Macy 149*eda14cbcSMatt Macy int 150*eda14cbcSMatt Macy bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) 151*eda14cbcSMatt Macy { 152*eda14cbcSMatt Macy dmu_object_info_t doi; 153*eda14cbcSMatt Macy int err; 154*eda14cbcSMatt Macy 155*eda14cbcSMatt Macy err = dmu_object_info(os, object, &doi); 156*eda14cbcSMatt Macy if (err) 157*eda14cbcSMatt Macy return (err); 158*eda14cbcSMatt Macy 159*eda14cbcSMatt Macy bzero(bpo, sizeof (*bpo)); 160*eda14cbcSMatt Macy mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL); 161*eda14cbcSMatt Macy 162*eda14cbcSMatt Macy ASSERT(bpo->bpo_dbuf == NULL); 163*eda14cbcSMatt Macy ASSERT(bpo->bpo_phys == NULL); 164*eda14cbcSMatt Macy ASSERT(object != 0); 165*eda14cbcSMatt Macy ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ); 166*eda14cbcSMatt Macy ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR); 167*eda14cbcSMatt Macy 168*eda14cbcSMatt Macy err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf); 169*eda14cbcSMatt Macy if (err) 170*eda14cbcSMatt Macy return (err); 171*eda14cbcSMatt Macy 172*eda14cbcSMatt Macy bpo->bpo_os = os; 173*eda14cbcSMatt Macy bpo->bpo_object = object; 174*eda14cbcSMatt Macy bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT; 175*eda14cbcSMatt Macy bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0); 176*eda14cbcSMatt Macy bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1); 177*eda14cbcSMatt Macy bpo->bpo_havefreed = (doi.doi_bonus_size > BPOBJ_SIZE_V2); 178*eda14cbcSMatt Macy bpo->bpo_phys = bpo->bpo_dbuf->db_data; 179*eda14cbcSMatt Macy return (0); 180*eda14cbcSMatt Macy } 181*eda14cbcSMatt Macy 182*eda14cbcSMatt Macy boolean_t 183*eda14cbcSMatt Macy bpobj_is_open(const bpobj_t *bpo) 184*eda14cbcSMatt Macy { 185*eda14cbcSMatt Macy return (bpo->bpo_object != 0); 186*eda14cbcSMatt Macy } 187*eda14cbcSMatt Macy 188*eda14cbcSMatt Macy void 189*eda14cbcSMatt Macy bpobj_close(bpobj_t *bpo) 190*eda14cbcSMatt Macy { 191*eda14cbcSMatt Macy /* Lame workaround for closing a bpobj that was never opened. */ 192*eda14cbcSMatt Macy if (bpo->bpo_object == 0) 193*eda14cbcSMatt Macy return; 194*eda14cbcSMatt Macy 195*eda14cbcSMatt Macy dmu_buf_rele(bpo->bpo_dbuf, bpo); 196*eda14cbcSMatt Macy if (bpo->bpo_cached_dbuf != NULL) 197*eda14cbcSMatt Macy dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); 198*eda14cbcSMatt Macy bpo->bpo_dbuf = NULL; 199*eda14cbcSMatt Macy bpo->bpo_phys = NULL; 200*eda14cbcSMatt Macy bpo->bpo_cached_dbuf = NULL; 201*eda14cbcSMatt Macy bpo->bpo_object = 0; 202*eda14cbcSMatt Macy 203*eda14cbcSMatt Macy mutex_destroy(&bpo->bpo_lock); 204*eda14cbcSMatt Macy } 205*eda14cbcSMatt Macy 206*eda14cbcSMatt Macy static boolean_t 207*eda14cbcSMatt Macy bpobj_is_empty_impl(bpobj_t *bpo) 208*eda14cbcSMatt Macy { 209*eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&bpo->bpo_lock)); 210*eda14cbcSMatt Macy return (bpo->bpo_phys->bpo_num_blkptrs == 0 && 211*eda14cbcSMatt Macy (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0)); 212*eda14cbcSMatt Macy } 213*eda14cbcSMatt Macy 214*eda14cbcSMatt Macy boolean_t 215*eda14cbcSMatt Macy bpobj_is_empty(bpobj_t *bpo) 216*eda14cbcSMatt Macy { 217*eda14cbcSMatt Macy mutex_enter(&bpo->bpo_lock); 218*eda14cbcSMatt Macy boolean_t is_empty = bpobj_is_empty_impl(bpo); 219*eda14cbcSMatt Macy mutex_exit(&bpo->bpo_lock); 220*eda14cbcSMatt Macy return (is_empty); 221*eda14cbcSMatt Macy } 222*eda14cbcSMatt Macy 223*eda14cbcSMatt Macy /* 224*eda14cbcSMatt Macy * A recursive iteration of the bpobjs would be nice here but we run the risk 225*eda14cbcSMatt Macy * of overflowing function stack space. Instead, find each subobj and add it 226*eda14cbcSMatt Macy * to the head of our list so it can be scanned for subjobjs. Like a 227*eda14cbcSMatt Macy * recursive implementation, the "deepest" subobjs will be freed first. 228*eda14cbcSMatt Macy * When a subobj is found to have no additional subojs, free it. 229*eda14cbcSMatt Macy */ 230*eda14cbcSMatt Macy typedef struct bpobj_info { 231*eda14cbcSMatt Macy bpobj_t *bpi_bpo; 232*eda14cbcSMatt Macy /* 233*eda14cbcSMatt Macy * This object is a subobj of bpi_parent, 234*eda14cbcSMatt Macy * at bpi_index in its subobj array. 235*eda14cbcSMatt Macy */ 236*eda14cbcSMatt Macy struct bpobj_info *bpi_parent; 237*eda14cbcSMatt Macy uint64_t bpi_index; 238*eda14cbcSMatt Macy /* How many of our subobj's are left to process. */ 239*eda14cbcSMatt Macy uint64_t bpi_unprocessed_subobjs; 240*eda14cbcSMatt Macy /* True after having visited this bpo's directly referenced BPs. */ 241*eda14cbcSMatt Macy boolean_t bpi_visited; 242*eda14cbcSMatt Macy list_node_t bpi_node; 243*eda14cbcSMatt Macy } bpobj_info_t; 244*eda14cbcSMatt Macy 245*eda14cbcSMatt Macy static bpobj_info_t * 246*eda14cbcSMatt Macy bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index) 247*eda14cbcSMatt Macy { 248*eda14cbcSMatt Macy bpobj_info_t *bpi = kmem_zalloc(sizeof (bpobj_info_t), KM_SLEEP); 249*eda14cbcSMatt Macy bpi->bpi_bpo = bpo; 250*eda14cbcSMatt Macy bpi->bpi_parent = parent; 251*eda14cbcSMatt Macy bpi->bpi_index = index; 252*eda14cbcSMatt Macy if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { 253*eda14cbcSMatt Macy bpi->bpi_unprocessed_subobjs = bpo->bpo_phys->bpo_num_subobjs; 254*eda14cbcSMatt Macy } 255*eda14cbcSMatt Macy return (bpi); 256*eda14cbcSMatt Macy } 257*eda14cbcSMatt Macy 258*eda14cbcSMatt Macy /* 259*eda14cbcSMatt Macy * Update bpobj and all of its parents with new space accounting. 260*eda14cbcSMatt Macy */ 261*eda14cbcSMatt Macy static void 262*eda14cbcSMatt Macy propagate_space_reduction(bpobj_info_t *bpi, int64_t freed, 263*eda14cbcSMatt Macy int64_t comp_freed, int64_t uncomp_freed, dmu_tx_t *tx) 264*eda14cbcSMatt Macy { 265*eda14cbcSMatt Macy 266*eda14cbcSMatt Macy for (; bpi != NULL; bpi = bpi->bpi_parent) { 267*eda14cbcSMatt Macy bpobj_t *p = bpi->bpi_bpo; 268*eda14cbcSMatt Macy ASSERT(dmu_buf_is_dirty(p->bpo_dbuf, tx)); 269*eda14cbcSMatt Macy p->bpo_phys->bpo_bytes -= freed; 270*eda14cbcSMatt Macy ASSERT3S(p->bpo_phys->bpo_bytes, >=, 0); 271*eda14cbcSMatt Macy if (p->bpo_havecomp) { 272*eda14cbcSMatt Macy p->bpo_phys->bpo_comp -= comp_freed; 273*eda14cbcSMatt Macy p->bpo_phys->bpo_uncomp -= uncomp_freed; 274*eda14cbcSMatt Macy } 275*eda14cbcSMatt Macy } 276*eda14cbcSMatt Macy } 277*eda14cbcSMatt Macy 278*eda14cbcSMatt Macy static int 279*eda14cbcSMatt Macy bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg, 280*eda14cbcSMatt Macy int64_t start, dmu_tx_t *tx, boolean_t free) 281*eda14cbcSMatt Macy { 282*eda14cbcSMatt Macy int err = 0; 283*eda14cbcSMatt Macy int64_t freed = 0, comp_freed = 0, uncomp_freed = 0; 284*eda14cbcSMatt Macy dmu_buf_t *dbuf = NULL; 285*eda14cbcSMatt Macy bpobj_t *bpo = bpi->bpi_bpo; 286*eda14cbcSMatt Macy 287*eda14cbcSMatt Macy for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) { 288*eda14cbcSMatt Macy uint64_t offset = i * sizeof (blkptr_t); 289*eda14cbcSMatt Macy uint64_t blkoff = P2PHASE(i, bpo->bpo_epb); 290*eda14cbcSMatt Macy 291*eda14cbcSMatt Macy if (dbuf == NULL || dbuf->db_offset > offset) { 292*eda14cbcSMatt Macy if (dbuf) 293*eda14cbcSMatt Macy dmu_buf_rele(dbuf, FTAG); 294*eda14cbcSMatt Macy err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, 295*eda14cbcSMatt Macy offset, FTAG, &dbuf, 0); 296*eda14cbcSMatt Macy if (err) 297*eda14cbcSMatt Macy break; 298*eda14cbcSMatt Macy } 299*eda14cbcSMatt Macy 300*eda14cbcSMatt Macy ASSERT3U(offset, >=, dbuf->db_offset); 301*eda14cbcSMatt Macy ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); 302*eda14cbcSMatt Macy 303*eda14cbcSMatt Macy blkptr_t *bparray = dbuf->db_data; 304*eda14cbcSMatt Macy blkptr_t *bp = &bparray[blkoff]; 305*eda14cbcSMatt Macy 306*eda14cbcSMatt Macy boolean_t bp_freed = BP_GET_FREE(bp); 307*eda14cbcSMatt Macy err = func(arg, bp, bp_freed, tx); 308*eda14cbcSMatt Macy if (err) 309*eda14cbcSMatt Macy break; 310*eda14cbcSMatt Macy 311*eda14cbcSMatt Macy if (free) { 312*eda14cbcSMatt Macy int sign = bp_freed ? -1 : +1; 313*eda14cbcSMatt Macy spa_t *spa = dmu_objset_spa(bpo->bpo_os); 314*eda14cbcSMatt Macy freed += sign * bp_get_dsize_sync(spa, bp); 315*eda14cbcSMatt Macy comp_freed += sign * BP_GET_PSIZE(bp); 316*eda14cbcSMatt Macy uncomp_freed += sign * BP_GET_UCSIZE(bp); 317*eda14cbcSMatt Macy ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx)); 318*eda14cbcSMatt Macy bpo->bpo_phys->bpo_num_blkptrs--; 319*eda14cbcSMatt Macy ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0); 320*eda14cbcSMatt Macy if (bp_freed) { 321*eda14cbcSMatt Macy ASSERT(bpo->bpo_havefreed); 322*eda14cbcSMatt Macy bpo->bpo_phys->bpo_num_freed--; 323*eda14cbcSMatt Macy ASSERT3S(bpo->bpo_phys->bpo_num_freed, >=, 0); 324*eda14cbcSMatt Macy } 325*eda14cbcSMatt Macy } 326*eda14cbcSMatt Macy } 327*eda14cbcSMatt Macy if (free) { 328*eda14cbcSMatt Macy propagate_space_reduction(bpi, freed, comp_freed, 329*eda14cbcSMatt Macy uncomp_freed, tx); 330*eda14cbcSMatt Macy VERIFY0(dmu_free_range(bpo->bpo_os, 331*eda14cbcSMatt Macy bpo->bpo_object, 332*eda14cbcSMatt Macy bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), 333*eda14cbcSMatt Macy DMU_OBJECT_END, tx)); 334*eda14cbcSMatt Macy } 335*eda14cbcSMatt Macy if (dbuf) { 336*eda14cbcSMatt Macy dmu_buf_rele(dbuf, FTAG); 337*eda14cbcSMatt Macy dbuf = NULL; 338*eda14cbcSMatt Macy } 339*eda14cbcSMatt Macy return (err); 340*eda14cbcSMatt Macy } 341*eda14cbcSMatt Macy 342*eda14cbcSMatt Macy /* 343*eda14cbcSMatt Macy * Given an initial bpo, start by freeing the BPs that are directly referenced 344*eda14cbcSMatt Macy * by that bpo. If the bpo has subobjs, read in its last subobj and push the 345*eda14cbcSMatt Macy * subobj to our stack. By popping items off our stack, eventually we will 346*eda14cbcSMatt Macy * encounter a bpo that has no subobjs. We can free its bpobj_info_t, and if 347*eda14cbcSMatt Macy * requested also free the now-empty bpo from disk and decrement 348*eda14cbcSMatt Macy * its parent's subobj count. We continue popping each subobj from our stack, 349*eda14cbcSMatt Macy * visiting its last subobj until they too have no more subobjs, and so on. 350*eda14cbcSMatt Macy */ 351*eda14cbcSMatt Macy static int 352*eda14cbcSMatt Macy bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, 353*eda14cbcSMatt Macy dmu_tx_t *tx, boolean_t free, uint64_t *bpobj_size) 354*eda14cbcSMatt Macy { 355*eda14cbcSMatt Macy list_t stack; 356*eda14cbcSMatt Macy bpobj_info_t *bpi; 357*eda14cbcSMatt Macy int err = 0; 358*eda14cbcSMatt Macy 359*eda14cbcSMatt Macy /* 360*eda14cbcSMatt Macy * Create a "stack" for us to work with without worrying about 361*eda14cbcSMatt Macy * stack overflows. Initialize it with the initial_bpo. 362*eda14cbcSMatt Macy */ 363*eda14cbcSMatt Macy list_create(&stack, sizeof (bpobj_info_t), 364*eda14cbcSMatt Macy offsetof(bpobj_info_t, bpi_node)); 365*eda14cbcSMatt Macy mutex_enter(&initial_bpo->bpo_lock); 366*eda14cbcSMatt Macy 367*eda14cbcSMatt Macy if (bpobj_size != NULL) 368*eda14cbcSMatt Macy *bpobj_size = initial_bpo->bpo_phys->bpo_num_blkptrs; 369*eda14cbcSMatt Macy 370*eda14cbcSMatt Macy list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0)); 371*eda14cbcSMatt Macy 372*eda14cbcSMatt Macy while ((bpi = list_head(&stack)) != NULL) { 373*eda14cbcSMatt Macy bpobj_t *bpo = bpi->bpi_bpo; 374*eda14cbcSMatt Macy 375*eda14cbcSMatt Macy ASSERT3P(bpo, !=, NULL); 376*eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&bpo->bpo_lock)); 377*eda14cbcSMatt Macy ASSERT(bpobj_is_open(bpo)); 378*eda14cbcSMatt Macy 379*eda14cbcSMatt Macy if (free) 380*eda14cbcSMatt Macy dmu_buf_will_dirty(bpo->bpo_dbuf, tx); 381*eda14cbcSMatt Macy 382*eda14cbcSMatt Macy if (bpi->bpi_visited == B_FALSE) { 383*eda14cbcSMatt Macy err = bpobj_iterate_blkptrs(bpi, func, arg, 0, tx, 384*eda14cbcSMatt Macy free); 385*eda14cbcSMatt Macy bpi->bpi_visited = B_TRUE; 386*eda14cbcSMatt Macy if (err != 0) 387*eda14cbcSMatt Macy break; 388*eda14cbcSMatt Macy } 389*eda14cbcSMatt Macy /* 390*eda14cbcSMatt Macy * We've finished with this bpo's directly-referenced BP's and 391*eda14cbcSMatt Macy * it has no more unprocessed subobjs. We can free its 392*eda14cbcSMatt Macy * bpobj_info_t (unless it is the topmost, initial_bpo). 393*eda14cbcSMatt Macy * If we are freeing from disk, we can also do that. 394*eda14cbcSMatt Macy */ 395*eda14cbcSMatt Macy if (bpi->bpi_unprocessed_subobjs == 0) { 396*eda14cbcSMatt Macy /* 397*eda14cbcSMatt Macy * If there are no entries, there should 398*eda14cbcSMatt Macy * be no bytes. 399*eda14cbcSMatt Macy */ 400*eda14cbcSMatt Macy if (bpobj_is_empty_impl(bpo)) { 401*eda14cbcSMatt Macy ASSERT0(bpo->bpo_phys->bpo_bytes); 402*eda14cbcSMatt Macy ASSERT0(bpo->bpo_phys->bpo_comp); 403*eda14cbcSMatt Macy ASSERT0(bpo->bpo_phys->bpo_uncomp); 404*eda14cbcSMatt Macy } 405*eda14cbcSMatt Macy 406*eda14cbcSMatt Macy /* The initial_bpo has no parent and is not closed. */ 407*eda14cbcSMatt Macy if (bpi->bpi_parent != NULL) { 408*eda14cbcSMatt Macy if (free) { 409*eda14cbcSMatt Macy bpobj_t *p = bpi->bpi_parent->bpi_bpo; 410*eda14cbcSMatt Macy 411*eda14cbcSMatt Macy ASSERT0(bpo->bpo_phys->bpo_num_blkptrs); 412*eda14cbcSMatt Macy ASSERT3U(p->bpo_phys->bpo_num_subobjs, 413*eda14cbcSMatt Macy >, 0); 414*eda14cbcSMatt Macy ASSERT3U(bpi->bpi_index, ==, 415*eda14cbcSMatt Macy p->bpo_phys->bpo_num_subobjs - 1); 416*eda14cbcSMatt Macy ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, 417*eda14cbcSMatt Macy tx)); 418*eda14cbcSMatt Macy 419*eda14cbcSMatt Macy p->bpo_phys->bpo_num_subobjs--; 420*eda14cbcSMatt Macy 421*eda14cbcSMatt Macy VERIFY0(dmu_free_range(p->bpo_os, 422*eda14cbcSMatt Macy p->bpo_phys->bpo_subobjs, 423*eda14cbcSMatt Macy bpi->bpi_index * sizeof (uint64_t), 424*eda14cbcSMatt Macy sizeof (uint64_t), tx)); 425*eda14cbcSMatt Macy 426*eda14cbcSMatt Macy /* eliminate the empty subobj list */ 427*eda14cbcSMatt Macy if (bpo->bpo_havesubobj && 428*eda14cbcSMatt Macy bpo->bpo_phys->bpo_subobjs != 0) { 429*eda14cbcSMatt Macy ASSERT0(bpo->bpo_phys-> 430*eda14cbcSMatt Macy bpo_num_subobjs); 431*eda14cbcSMatt Macy err = dmu_object_free( 432*eda14cbcSMatt Macy bpo->bpo_os, 433*eda14cbcSMatt Macy bpo->bpo_phys->bpo_subobjs, 434*eda14cbcSMatt Macy tx); 435*eda14cbcSMatt Macy if (err) 436*eda14cbcSMatt Macy break; 437*eda14cbcSMatt Macy bpo->bpo_phys->bpo_subobjs = 0; 438*eda14cbcSMatt Macy } 439*eda14cbcSMatt Macy err = dmu_object_free(p->bpo_os, 440*eda14cbcSMatt Macy bpo->bpo_object, tx); 441*eda14cbcSMatt Macy if (err) 442*eda14cbcSMatt Macy break; 443*eda14cbcSMatt Macy } 444*eda14cbcSMatt Macy 445*eda14cbcSMatt Macy mutex_exit(&bpo->bpo_lock); 446*eda14cbcSMatt Macy bpobj_close(bpo); 447*eda14cbcSMatt Macy kmem_free(bpo, sizeof (bpobj_t)); 448*eda14cbcSMatt Macy } else { 449*eda14cbcSMatt Macy mutex_exit(&bpo->bpo_lock); 450*eda14cbcSMatt Macy } 451*eda14cbcSMatt Macy 452*eda14cbcSMatt Macy /* 453*eda14cbcSMatt Macy * Finished processing this bpo. Unlock, and free 454*eda14cbcSMatt Macy * our "stack" info. 455*eda14cbcSMatt Macy */ 456*eda14cbcSMatt Macy list_remove_head(&stack); 457*eda14cbcSMatt Macy kmem_free(bpi, sizeof (bpobj_info_t)); 458*eda14cbcSMatt Macy } else { 459*eda14cbcSMatt Macy /* 460*eda14cbcSMatt Macy * We have unprocessed subobjs. Process the next one. 461*eda14cbcSMatt Macy */ 462*eda14cbcSMatt Macy ASSERT(bpo->bpo_havecomp); 463*eda14cbcSMatt Macy ASSERT3P(bpobj_size, ==, NULL); 464*eda14cbcSMatt Macy 465*eda14cbcSMatt Macy /* Add the last subobj to stack. */ 466*eda14cbcSMatt Macy int64_t i = bpi->bpi_unprocessed_subobjs - 1; 467*eda14cbcSMatt Macy uint64_t offset = i * sizeof (uint64_t); 468*eda14cbcSMatt Macy 469*eda14cbcSMatt Macy uint64_t obj_from_sublist; 470*eda14cbcSMatt Macy err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 471*eda14cbcSMatt Macy offset, sizeof (uint64_t), &obj_from_sublist, 472*eda14cbcSMatt Macy DMU_READ_PREFETCH); 473*eda14cbcSMatt Macy if (err) 474*eda14cbcSMatt Macy break; 475*eda14cbcSMatt Macy bpobj_t *sublist = kmem_alloc(sizeof (bpobj_t), 476*eda14cbcSMatt Macy KM_SLEEP); 477*eda14cbcSMatt Macy 478*eda14cbcSMatt Macy err = bpobj_open(sublist, bpo->bpo_os, 479*eda14cbcSMatt Macy obj_from_sublist); 480*eda14cbcSMatt Macy if (err) 481*eda14cbcSMatt Macy break; 482*eda14cbcSMatt Macy 483*eda14cbcSMatt Macy list_insert_head(&stack, bpi_alloc(sublist, bpi, i)); 484*eda14cbcSMatt Macy mutex_enter(&sublist->bpo_lock); 485*eda14cbcSMatt Macy bpi->bpi_unprocessed_subobjs--; 486*eda14cbcSMatt Macy } 487*eda14cbcSMatt Macy } 488*eda14cbcSMatt Macy /* 489*eda14cbcSMatt Macy * Cleanup anything left on the "stack" after we left the loop. 490*eda14cbcSMatt Macy * Every bpo on the stack is locked so we must remember to undo 491*eda14cbcSMatt Macy * that now (in LIFO order). 492*eda14cbcSMatt Macy */ 493*eda14cbcSMatt Macy while ((bpi = list_remove_head(&stack)) != NULL) { 494*eda14cbcSMatt Macy bpobj_t *bpo = bpi->bpi_bpo; 495*eda14cbcSMatt Macy ASSERT(err != 0); 496*eda14cbcSMatt Macy ASSERT3P(bpo, !=, NULL); 497*eda14cbcSMatt Macy 498*eda14cbcSMatt Macy mutex_exit(&bpo->bpo_lock); 499*eda14cbcSMatt Macy 500*eda14cbcSMatt Macy /* do not free the initial_bpo */ 501*eda14cbcSMatt Macy if (bpi->bpi_parent != NULL) { 502*eda14cbcSMatt Macy bpobj_close(bpi->bpi_bpo); 503*eda14cbcSMatt Macy kmem_free(bpi->bpi_bpo, sizeof (bpobj_t)); 504*eda14cbcSMatt Macy } 505*eda14cbcSMatt Macy kmem_free(bpi, sizeof (bpobj_info_t)); 506*eda14cbcSMatt Macy } 507*eda14cbcSMatt Macy 508*eda14cbcSMatt Macy list_destroy(&stack); 509*eda14cbcSMatt Macy 510*eda14cbcSMatt Macy return (err); 511*eda14cbcSMatt Macy } 512*eda14cbcSMatt Macy 513*eda14cbcSMatt Macy /* 514*eda14cbcSMatt Macy * Iterate and remove the entries. If func returns nonzero, iteration 515*eda14cbcSMatt Macy * will stop and that entry will not be removed. 516*eda14cbcSMatt Macy */ 517*eda14cbcSMatt Macy int 518*eda14cbcSMatt Macy bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) 519*eda14cbcSMatt Macy { 520*eda14cbcSMatt Macy return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE, NULL)); 521*eda14cbcSMatt Macy } 522*eda14cbcSMatt Macy 523*eda14cbcSMatt Macy /* 524*eda14cbcSMatt Macy * Iterate the entries. If func returns nonzero, iteration will stop. 525*eda14cbcSMatt Macy * 526*eda14cbcSMatt Macy * If there are no subobjs: 527*eda14cbcSMatt Macy * 528*eda14cbcSMatt Macy * *bpobj_size can be used to return the number of block pointers in the 529*eda14cbcSMatt Macy * bpobj. Note that this may be different from the number of block pointers 530*eda14cbcSMatt Macy * that are iterated over, if iteration is terminated early (e.g. by the func 531*eda14cbcSMatt Macy * returning nonzero). 532*eda14cbcSMatt Macy * 533*eda14cbcSMatt Macy * If there are concurrent (or subsequent) modifications to the bpobj then the 534*eda14cbcSMatt Macy * returned *bpobj_size can be passed as "start" to 535*eda14cbcSMatt Macy * livelist_bpobj_iterate_from_nofree() to iterate the newly added entries. 536*eda14cbcSMatt Macy */ 537*eda14cbcSMatt Macy int 538*eda14cbcSMatt Macy bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, 539*eda14cbcSMatt Macy uint64_t *bpobj_size) 540*eda14cbcSMatt Macy { 541*eda14cbcSMatt Macy return (bpobj_iterate_impl(bpo, func, arg, NULL, B_FALSE, bpobj_size)); 542*eda14cbcSMatt Macy } 543*eda14cbcSMatt Macy 544*eda14cbcSMatt Macy /* 545*eda14cbcSMatt Macy * Iterate over the blkptrs in the bpobj beginning at index start. If func 546*eda14cbcSMatt Macy * returns nonzero, iteration will stop. This is a livelist specific function 547*eda14cbcSMatt Macy * since it assumes that there are no subobjs present. 548*eda14cbcSMatt Macy */ 549*eda14cbcSMatt Macy int 550*eda14cbcSMatt Macy livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, 551*eda14cbcSMatt Macy int64_t start) 552*eda14cbcSMatt Macy { 553*eda14cbcSMatt Macy if (bpo->bpo_havesubobj) 554*eda14cbcSMatt Macy VERIFY0(bpo->bpo_phys->bpo_subobjs); 555*eda14cbcSMatt Macy bpobj_info_t *bpi = bpi_alloc(bpo, NULL, 0); 556*eda14cbcSMatt Macy int err = bpobj_iterate_blkptrs(bpi, func, arg, start, NULL, B_FALSE); 557*eda14cbcSMatt Macy kmem_free(bpi, sizeof (bpobj_info_t)); 558*eda14cbcSMatt Macy return (err); 559*eda14cbcSMatt Macy } 560*eda14cbcSMatt Macy 561*eda14cbcSMatt Macy /* 562*eda14cbcSMatt Macy * Logically add subobj's contents to the parent bpobj. 563*eda14cbcSMatt Macy * 564*eda14cbcSMatt Macy * In the most general case, this is accomplished in constant time by adding 565*eda14cbcSMatt Macy * a reference to subobj. This case is used when enqueuing a large subobj: 566*eda14cbcSMatt Macy * +--------------+ +--------------+ 567*eda14cbcSMatt Macy * | bpobj |----------------------->| subobj list | 568*eda14cbcSMatt Macy * +----+----+----+----+----+ +-----+-----+--+--+ 569*eda14cbcSMatt Macy * | bp | bp | bp | bp | bp | | obj | obj | obj | 570*eda14cbcSMatt Macy * +----+----+----+----+----+ +-----+-----+-----+ 571*eda14cbcSMatt Macy * 572*eda14cbcSMatt Macy * +--------------+ +--------------+ 573*eda14cbcSMatt Macy * | sub-bpobj |----------------------> | subsubobj | 574*eda14cbcSMatt Macy * +----+----+----+----+---------+----+ +-----+-----+--+--------+-----+ 575*eda14cbcSMatt Macy * | bp | bp | bp | bp | ... | bp | | obj | obj | ... | obj | 576*eda14cbcSMatt Macy * +----+----+----+----+---------+----+ +-----+-----+-----------+-----+ 577*eda14cbcSMatt Macy * 578*eda14cbcSMatt Macy * Result: sub-bpobj added to parent's subobj list. 579*eda14cbcSMatt Macy * +--------------+ +--------------+ 580*eda14cbcSMatt Macy * | bpobj |----------------------->| subobj list | 581*eda14cbcSMatt Macy * +----+----+----+----+----+ +-----+-----+--+--+-----+ 582*eda14cbcSMatt Macy * | bp | bp | bp | bp | bp | | obj | obj | obj | OBJ | 583*eda14cbcSMatt Macy * +----+----+----+----+----+ +-----+-----+-----+--|--+ 584*eda14cbcSMatt Macy * | 585*eda14cbcSMatt Macy * /-----------------------------------------------------/ 586*eda14cbcSMatt Macy * v 587*eda14cbcSMatt Macy * +--------------+ +--------------+ 588*eda14cbcSMatt Macy * | sub-bpobj |----------------------> | subsubobj | 589*eda14cbcSMatt Macy * +----+----+----+----+---------+----+ +-----+-----+--+--------+-----+ 590*eda14cbcSMatt Macy * | bp | bp | bp | bp | ... | bp | | obj | obj | ... | obj | 591*eda14cbcSMatt Macy * +----+----+----+----+---------+----+ +-----+-----+-----------+-----+ 592*eda14cbcSMatt Macy * 593*eda14cbcSMatt Macy * 594*eda14cbcSMatt Macy * In a common case, the subobj is small: its bp's and its list of subobj's 595*eda14cbcSMatt Macy * are each stored in a single block. In this case we copy the subobj's 596*eda14cbcSMatt Macy * contents to the parent: 597*eda14cbcSMatt Macy * +--------------+ +--------------+ 598*eda14cbcSMatt Macy * | bpobj |----------------------->| subobj list | 599*eda14cbcSMatt Macy * +----+----+----+----+----+ +-----+-----+--+--+ 600*eda14cbcSMatt Macy * | bp | bp | bp | bp | bp | | obj | obj | obj | 601*eda14cbcSMatt Macy * +----+----+----+----+----+ +-----+-----+-----+ 602*eda14cbcSMatt Macy * ^ ^ 603*eda14cbcSMatt Macy * +--------------+ | +--------------+ | 604*eda14cbcSMatt Macy * | sub-bpobj |---------^------------> | subsubobj | ^ 605*eda14cbcSMatt Macy * +----+----+----+ | +-----+-----+--+ | 606*eda14cbcSMatt Macy * | BP | BP |-->-->-->-->-/ | OBJ | OBJ |-->-/ 607*eda14cbcSMatt Macy * +----+----+ +-----+-----+ 608*eda14cbcSMatt Macy * 609*eda14cbcSMatt Macy * Result: subobj destroyed, contents copied to parent: 610*eda14cbcSMatt Macy * +--------------+ +--------------+ 611*eda14cbcSMatt Macy * | bpobj |----------------------->| subobj list | 612*eda14cbcSMatt Macy * +----+----+----+----+----+----+----+ +-----+-----+--+--+-----+-----+ 613*eda14cbcSMatt Macy * | bp | bp | bp | bp | bp | BP | BP | | obj | obj | obj | OBJ | OBJ | 614*eda14cbcSMatt Macy * +----+----+----+----+----+----+----+ +-----+-----+-----+-----+-----+ 615*eda14cbcSMatt Macy * 616*eda14cbcSMatt Macy * 617*eda14cbcSMatt Macy * If the subobj has many BP's but few subobj's, we can copy the sub-subobj's 618*eda14cbcSMatt Macy * but retain the sub-bpobj: 619*eda14cbcSMatt Macy * +--------------+ +--------------+ 620*eda14cbcSMatt Macy * | bpobj |----------------------->| subobj list | 621*eda14cbcSMatt Macy * +----+----+----+----+----+ +-----+-----+--+--+ 622*eda14cbcSMatt Macy * | bp | bp | bp | bp | bp | | obj | obj | obj | 623*eda14cbcSMatt Macy * +----+----+----+----+----+ +-----+-----+-----+ 624*eda14cbcSMatt Macy * ^ 625*eda14cbcSMatt Macy * +--------------+ +--------------+ | 626*eda14cbcSMatt Macy * | sub-bpobj |----------------------> | subsubobj | ^ 627*eda14cbcSMatt Macy * +----+----+----+----+---------+----+ +-----+-----+--+ | 628*eda14cbcSMatt Macy * | bp | bp | bp | bp | ... | bp | | OBJ | OBJ |-->-/ 629*eda14cbcSMatt Macy * +----+----+----+----+---------+----+ +-----+-----+ 630*eda14cbcSMatt Macy * 631*eda14cbcSMatt Macy * Result: sub-sub-bpobjs and subobj added to parent's subobj list. 632*eda14cbcSMatt Macy * +--------------+ +--------------+ 633*eda14cbcSMatt Macy * | bpobj |-------------------->| subobj list | 634*eda14cbcSMatt Macy * +----+----+----+----+----+ +-----+-----+--+--+-----+-----+------+ 635*eda14cbcSMatt Macy * | bp | bp | bp | bp | bp | | obj | obj | obj | OBJ | OBJ | OBJ* | 636*eda14cbcSMatt Macy * +----+----+----+----+----+ +-----+-----+-----+-----+-----+--|---+ 637*eda14cbcSMatt Macy * | 638*eda14cbcSMatt Macy * /--------------------------------------------------------------/ 639*eda14cbcSMatt Macy * v 640*eda14cbcSMatt Macy * +--------------+ 641*eda14cbcSMatt Macy * | sub-bpobj | 642*eda14cbcSMatt Macy * +----+----+----+----+---------+----+ 643*eda14cbcSMatt Macy * | bp | bp | bp | bp | ... | bp | 644*eda14cbcSMatt Macy * +----+----+----+----+---------+----+ 645*eda14cbcSMatt Macy */ 646*eda14cbcSMatt Macy void 647*eda14cbcSMatt Macy bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) 648*eda14cbcSMatt Macy { 649*eda14cbcSMatt Macy bpobj_t subbpo; 650*eda14cbcSMatt Macy uint64_t used, comp, uncomp, subsubobjs; 651*eda14cbcSMatt Macy boolean_t copy_subsub = B_TRUE; 652*eda14cbcSMatt Macy boolean_t copy_bps = B_TRUE; 653*eda14cbcSMatt Macy 654*eda14cbcSMatt Macy ASSERT(bpobj_is_open(bpo)); 655*eda14cbcSMatt Macy ASSERT(subobj != 0); 656*eda14cbcSMatt Macy ASSERT(bpo->bpo_havesubobj); 657*eda14cbcSMatt Macy ASSERT(bpo->bpo_havecomp); 658*eda14cbcSMatt Macy ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); 659*eda14cbcSMatt Macy 660*eda14cbcSMatt Macy if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) { 661*eda14cbcSMatt Macy bpobj_decr_empty(bpo->bpo_os, tx); 662*eda14cbcSMatt Macy return; 663*eda14cbcSMatt Macy } 664*eda14cbcSMatt Macy 665*eda14cbcSMatt Macy VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj)); 666*eda14cbcSMatt Macy VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); 667*eda14cbcSMatt Macy 668*eda14cbcSMatt Macy if (bpobj_is_empty(&subbpo)) { 669*eda14cbcSMatt Macy /* No point in having an empty subobj. */ 670*eda14cbcSMatt Macy bpobj_close(&subbpo); 671*eda14cbcSMatt Macy bpobj_free(bpo->bpo_os, subobj, tx); 672*eda14cbcSMatt Macy return; 673*eda14cbcSMatt Macy } 674*eda14cbcSMatt Macy 675*eda14cbcSMatt Macy mutex_enter(&bpo->bpo_lock); 676*eda14cbcSMatt Macy dmu_buf_will_dirty(bpo->bpo_dbuf, tx); 677*eda14cbcSMatt Macy 678*eda14cbcSMatt Macy dmu_object_info_t doi; 679*eda14cbcSMatt Macy 680*eda14cbcSMatt Macy if (bpo->bpo_phys->bpo_subobjs != 0) { 681*eda14cbcSMatt Macy ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 682*eda14cbcSMatt Macy &doi)); 683*eda14cbcSMatt Macy ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ); 684*eda14cbcSMatt Macy } 685*eda14cbcSMatt Macy 686*eda14cbcSMatt Macy /* 687*eda14cbcSMatt Macy * If subobj has only one block of subobjs, then move subobj's 688*eda14cbcSMatt Macy * subobjs to bpo's subobj list directly. This reduces recursion in 689*eda14cbcSMatt Macy * bpobj_iterate due to nested subobjs. 690*eda14cbcSMatt Macy */ 691*eda14cbcSMatt Macy subsubobjs = subbpo.bpo_phys->bpo_subobjs; 692*eda14cbcSMatt Macy if (subsubobjs != 0) { 693*eda14cbcSMatt Macy VERIFY0(dmu_object_info(bpo->bpo_os, subsubobjs, &doi)); 694*eda14cbcSMatt Macy if (doi.doi_max_offset > doi.doi_data_block_size) { 695*eda14cbcSMatt Macy copy_subsub = B_FALSE; 696*eda14cbcSMatt Macy } 697*eda14cbcSMatt Macy } 698*eda14cbcSMatt Macy 699*eda14cbcSMatt Macy /* 700*eda14cbcSMatt Macy * If, in addition to having only one block of subobj's, subobj has 701*eda14cbcSMatt Macy * only one block of bp's, then move subobj's bp's to bpo's bp list 702*eda14cbcSMatt Macy * directly. This reduces recursion in bpobj_iterate due to nested 703*eda14cbcSMatt Macy * subobjs. 704*eda14cbcSMatt Macy */ 705*eda14cbcSMatt Macy VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subobj, &doi)); 706*eda14cbcSMatt Macy if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) { 707*eda14cbcSMatt Macy copy_bps = B_FALSE; 708*eda14cbcSMatt Macy } 709*eda14cbcSMatt Macy 710*eda14cbcSMatt Macy if (copy_subsub && subsubobjs != 0) { 711*eda14cbcSMatt Macy dmu_buf_t *subdb; 712*eda14cbcSMatt Macy uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs; 713*eda14cbcSMatt Macy 714*eda14cbcSMatt Macy VERIFY0(dmu_buf_hold(bpo->bpo_os, subsubobjs, 715*eda14cbcSMatt Macy 0, FTAG, &subdb, 0)); 716*eda14cbcSMatt Macy /* 717*eda14cbcSMatt Macy * Make sure that we are not asking dmu_write() 718*eda14cbcSMatt Macy * to write more data than we have in our buffer. 719*eda14cbcSMatt Macy */ 720*eda14cbcSMatt Macy VERIFY3U(subdb->db_size, >=, 721*eda14cbcSMatt Macy numsubsub * sizeof (subobj)); 722*eda14cbcSMatt Macy if (bpo->bpo_phys->bpo_subobjs == 0) { 723*eda14cbcSMatt Macy bpo->bpo_phys->bpo_subobjs = 724*eda14cbcSMatt Macy dmu_object_alloc(bpo->bpo_os, 725*eda14cbcSMatt Macy DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE, 726*eda14cbcSMatt Macy DMU_OT_NONE, 0, tx); 727*eda14cbcSMatt Macy } 728*eda14cbcSMatt Macy dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 729*eda14cbcSMatt Macy bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 730*eda14cbcSMatt Macy numsubsub * sizeof (subobj), subdb->db_data, tx); 731*eda14cbcSMatt Macy dmu_buf_rele(subdb, FTAG); 732*eda14cbcSMatt Macy bpo->bpo_phys->bpo_num_subobjs += numsubsub; 733*eda14cbcSMatt Macy 734*eda14cbcSMatt Macy dmu_buf_will_dirty(subbpo.bpo_dbuf, tx); 735*eda14cbcSMatt Macy subbpo.bpo_phys->bpo_subobjs = 0; 736*eda14cbcSMatt Macy VERIFY0(dmu_object_free(bpo->bpo_os, subsubobjs, tx)); 737*eda14cbcSMatt Macy } 738*eda14cbcSMatt Macy 739*eda14cbcSMatt Macy if (copy_bps) { 740*eda14cbcSMatt Macy dmu_buf_t *bps; 741*eda14cbcSMatt Macy uint64_t numbps = subbpo.bpo_phys->bpo_num_blkptrs; 742*eda14cbcSMatt Macy 743*eda14cbcSMatt Macy ASSERT(copy_subsub); 744*eda14cbcSMatt Macy VERIFY0(dmu_buf_hold(bpo->bpo_os, subobj, 745*eda14cbcSMatt Macy 0, FTAG, &bps, 0)); 746*eda14cbcSMatt Macy 747*eda14cbcSMatt Macy /* 748*eda14cbcSMatt Macy * Make sure that we are not asking dmu_write() 749*eda14cbcSMatt Macy * to write more data than we have in our buffer. 750*eda14cbcSMatt Macy */ 751*eda14cbcSMatt Macy VERIFY3U(bps->db_size, >=, numbps * sizeof (blkptr_t)); 752*eda14cbcSMatt Macy dmu_write(bpo->bpo_os, bpo->bpo_object, 753*eda14cbcSMatt Macy bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), 754*eda14cbcSMatt Macy numbps * sizeof (blkptr_t), 755*eda14cbcSMatt Macy bps->db_data, tx); 756*eda14cbcSMatt Macy dmu_buf_rele(bps, FTAG); 757*eda14cbcSMatt Macy bpo->bpo_phys->bpo_num_blkptrs += numbps; 758*eda14cbcSMatt Macy 759*eda14cbcSMatt Macy bpobj_close(&subbpo); 760*eda14cbcSMatt Macy VERIFY0(dmu_object_free(bpo->bpo_os, subobj, tx)); 761*eda14cbcSMatt Macy } else { 762*eda14cbcSMatt Macy bpobj_close(&subbpo); 763*eda14cbcSMatt Macy if (bpo->bpo_phys->bpo_subobjs == 0) { 764*eda14cbcSMatt Macy bpo->bpo_phys->bpo_subobjs = 765*eda14cbcSMatt Macy dmu_object_alloc(bpo->bpo_os, 766*eda14cbcSMatt Macy DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE, 767*eda14cbcSMatt Macy DMU_OT_NONE, 0, tx); 768*eda14cbcSMatt Macy } 769*eda14cbcSMatt Macy 770*eda14cbcSMatt Macy dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, 771*eda14cbcSMatt Macy bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), 772*eda14cbcSMatt Macy sizeof (subobj), &subobj, tx); 773*eda14cbcSMatt Macy bpo->bpo_phys->bpo_num_subobjs++; 774*eda14cbcSMatt Macy } 775*eda14cbcSMatt Macy 776*eda14cbcSMatt Macy bpo->bpo_phys->bpo_bytes += used; 777*eda14cbcSMatt Macy bpo->bpo_phys->bpo_comp += comp; 778*eda14cbcSMatt Macy bpo->bpo_phys->bpo_uncomp += uncomp; 779*eda14cbcSMatt Macy mutex_exit(&bpo->bpo_lock); 780*eda14cbcSMatt Macy 781*eda14cbcSMatt Macy } 782*eda14cbcSMatt Macy 783*eda14cbcSMatt Macy void 784*eda14cbcSMatt Macy bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed, 785*eda14cbcSMatt Macy dmu_tx_t *tx) 786*eda14cbcSMatt Macy { 787*eda14cbcSMatt Macy blkptr_t stored_bp = *bp; 788*eda14cbcSMatt Macy uint64_t offset; 789*eda14cbcSMatt Macy int blkoff; 790*eda14cbcSMatt Macy blkptr_t *bparray; 791*eda14cbcSMatt Macy 792*eda14cbcSMatt Macy ASSERT(bpobj_is_open(bpo)); 793*eda14cbcSMatt Macy ASSERT(!BP_IS_HOLE(bp)); 794*eda14cbcSMatt Macy ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); 795*eda14cbcSMatt Macy 796*eda14cbcSMatt Macy if (BP_IS_EMBEDDED(bp)) { 797*eda14cbcSMatt Macy /* 798*eda14cbcSMatt Macy * The bpobj will compress better without the payload. 799*eda14cbcSMatt Macy * 800*eda14cbcSMatt Macy * Note that we store EMBEDDED bp's because they have an 801*eda14cbcSMatt Macy * uncompressed size, which must be accounted for. An 802*eda14cbcSMatt Macy * alternative would be to add their size to bpo_uncomp 803*eda14cbcSMatt Macy * without storing the bp, but that would create additional 804*eda14cbcSMatt Macy * complications: bpo_uncomp would be inconsistent with the 805*eda14cbcSMatt Macy * set of BP's stored, and bpobj_iterate() wouldn't visit 806*eda14cbcSMatt Macy * all the space accounted for in the bpobj. 807*eda14cbcSMatt Macy */ 808*eda14cbcSMatt Macy bzero(&stored_bp, sizeof (stored_bp)); 809*eda14cbcSMatt Macy stored_bp.blk_prop = bp->blk_prop; 810*eda14cbcSMatt Macy stored_bp.blk_birth = bp->blk_birth; 811*eda14cbcSMatt Macy } else if (!BP_GET_DEDUP(bp)) { 812*eda14cbcSMatt Macy /* The bpobj will compress better without the checksum */ 813*eda14cbcSMatt Macy bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum)); 814*eda14cbcSMatt Macy } 815*eda14cbcSMatt Macy 816*eda14cbcSMatt Macy stored_bp.blk_fill = 0; 817*eda14cbcSMatt Macy BP_SET_FREE(&stored_bp, bp_freed); 818*eda14cbcSMatt Macy 819*eda14cbcSMatt Macy mutex_enter(&bpo->bpo_lock); 820*eda14cbcSMatt Macy 821*eda14cbcSMatt Macy offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp); 822*eda14cbcSMatt Macy blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb); 823*eda14cbcSMatt Macy 824*eda14cbcSMatt Macy if (bpo->bpo_cached_dbuf == NULL || 825*eda14cbcSMatt Macy offset < bpo->bpo_cached_dbuf->db_offset || 826*eda14cbcSMatt Macy offset >= bpo->bpo_cached_dbuf->db_offset + 827*eda14cbcSMatt Macy bpo->bpo_cached_dbuf->db_size) { 828*eda14cbcSMatt Macy if (bpo->bpo_cached_dbuf) 829*eda14cbcSMatt Macy dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); 830*eda14cbcSMatt Macy VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, 831*eda14cbcSMatt Macy offset, bpo, &bpo->bpo_cached_dbuf, 0)); 832*eda14cbcSMatt Macy } 833*eda14cbcSMatt Macy 834*eda14cbcSMatt Macy dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx); 835*eda14cbcSMatt Macy bparray = bpo->bpo_cached_dbuf->db_data; 836*eda14cbcSMatt Macy bparray[blkoff] = stored_bp; 837*eda14cbcSMatt Macy 838*eda14cbcSMatt Macy dmu_buf_will_dirty(bpo->bpo_dbuf, tx); 839*eda14cbcSMatt Macy bpo->bpo_phys->bpo_num_blkptrs++; 840*eda14cbcSMatt Macy int sign = bp_freed ? -1 : +1; 841*eda14cbcSMatt Macy bpo->bpo_phys->bpo_bytes += sign * 842*eda14cbcSMatt Macy bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); 843*eda14cbcSMatt Macy if (bpo->bpo_havecomp) { 844*eda14cbcSMatt Macy bpo->bpo_phys->bpo_comp += sign * BP_GET_PSIZE(bp); 845*eda14cbcSMatt Macy bpo->bpo_phys->bpo_uncomp += sign * BP_GET_UCSIZE(bp); 846*eda14cbcSMatt Macy } 847*eda14cbcSMatt Macy if (bp_freed) { 848*eda14cbcSMatt Macy ASSERT(bpo->bpo_havefreed); 849*eda14cbcSMatt Macy bpo->bpo_phys->bpo_num_freed++; 850*eda14cbcSMatt Macy } 851*eda14cbcSMatt Macy mutex_exit(&bpo->bpo_lock); 852*eda14cbcSMatt Macy } 853*eda14cbcSMatt Macy 854*eda14cbcSMatt Macy struct space_range_arg { 855*eda14cbcSMatt Macy spa_t *spa; 856*eda14cbcSMatt Macy uint64_t mintxg; 857*eda14cbcSMatt Macy uint64_t maxtxg; 858*eda14cbcSMatt Macy uint64_t used; 859*eda14cbcSMatt Macy uint64_t comp; 860*eda14cbcSMatt Macy uint64_t uncomp; 861*eda14cbcSMatt Macy }; 862*eda14cbcSMatt Macy 863*eda14cbcSMatt Macy /* ARGSUSED */ 864*eda14cbcSMatt Macy static int 865*eda14cbcSMatt Macy space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) 866*eda14cbcSMatt Macy { 867*eda14cbcSMatt Macy struct space_range_arg *sra = arg; 868*eda14cbcSMatt Macy 869*eda14cbcSMatt Macy if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) { 870*eda14cbcSMatt Macy if (dsl_pool_sync_context(spa_get_dsl(sra->spa))) 871*eda14cbcSMatt Macy sra->used += bp_get_dsize_sync(sra->spa, bp); 872*eda14cbcSMatt Macy else 873*eda14cbcSMatt Macy sra->used += bp_get_dsize(sra->spa, bp); 874*eda14cbcSMatt Macy sra->comp += BP_GET_PSIZE(bp); 875*eda14cbcSMatt Macy sra->uncomp += BP_GET_UCSIZE(bp); 876*eda14cbcSMatt Macy } 877*eda14cbcSMatt Macy return (0); 878*eda14cbcSMatt Macy } 879*eda14cbcSMatt Macy 880*eda14cbcSMatt Macy int 881*eda14cbcSMatt Macy bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) 882*eda14cbcSMatt Macy { 883*eda14cbcSMatt Macy ASSERT(bpobj_is_open(bpo)); 884*eda14cbcSMatt Macy mutex_enter(&bpo->bpo_lock); 885*eda14cbcSMatt Macy 886*eda14cbcSMatt Macy *usedp = bpo->bpo_phys->bpo_bytes; 887*eda14cbcSMatt Macy if (bpo->bpo_havecomp) { 888*eda14cbcSMatt Macy *compp = bpo->bpo_phys->bpo_comp; 889*eda14cbcSMatt Macy *uncompp = bpo->bpo_phys->bpo_uncomp; 890*eda14cbcSMatt Macy mutex_exit(&bpo->bpo_lock); 891*eda14cbcSMatt Macy return (0); 892*eda14cbcSMatt Macy } else { 893*eda14cbcSMatt Macy mutex_exit(&bpo->bpo_lock); 894*eda14cbcSMatt Macy return (bpobj_space_range(bpo, 0, UINT64_MAX, 895*eda14cbcSMatt Macy usedp, compp, uncompp)); 896*eda14cbcSMatt Macy } 897*eda14cbcSMatt Macy } 898*eda14cbcSMatt Macy 899*eda14cbcSMatt Macy /* 900*eda14cbcSMatt Macy * Return the amount of space in the bpobj which is: 901*eda14cbcSMatt Macy * mintxg < blk_birth <= maxtxg 902*eda14cbcSMatt Macy */ 903*eda14cbcSMatt Macy int 904*eda14cbcSMatt Macy bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, 905*eda14cbcSMatt Macy uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) 906*eda14cbcSMatt Macy { 907*eda14cbcSMatt Macy struct space_range_arg sra = { 0 }; 908*eda14cbcSMatt Macy int err; 909*eda14cbcSMatt Macy 910*eda14cbcSMatt Macy ASSERT(bpobj_is_open(bpo)); 911*eda14cbcSMatt Macy 912*eda14cbcSMatt Macy /* 913*eda14cbcSMatt Macy * As an optimization, if they want the whole txg range, just 914*eda14cbcSMatt Macy * get bpo_bytes rather than iterating over the bps. 915*eda14cbcSMatt Macy */ 916*eda14cbcSMatt Macy if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp) 917*eda14cbcSMatt Macy return (bpobj_space(bpo, usedp, compp, uncompp)); 918*eda14cbcSMatt Macy 919*eda14cbcSMatt Macy sra.spa = dmu_objset_spa(bpo->bpo_os); 920*eda14cbcSMatt Macy sra.mintxg = mintxg; 921*eda14cbcSMatt Macy sra.maxtxg = maxtxg; 922*eda14cbcSMatt Macy 923*eda14cbcSMatt Macy err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL); 924*eda14cbcSMatt Macy *usedp = sra.used; 925*eda14cbcSMatt Macy *compp = sra.comp; 926*eda14cbcSMatt Macy *uncompp = sra.uncomp; 927*eda14cbcSMatt Macy return (err); 928*eda14cbcSMatt Macy } 929*eda14cbcSMatt Macy 930*eda14cbcSMatt Macy /* 931*eda14cbcSMatt Macy * A bpobj_itor_t to append blkptrs to a bplist. Note that while blkptrs in a 932*eda14cbcSMatt Macy * bpobj are designated as free or allocated that information is not preserved 933*eda14cbcSMatt Macy * in bplists. 934*eda14cbcSMatt Macy */ 935*eda14cbcSMatt Macy /* ARGSUSED */ 936*eda14cbcSMatt Macy int 937*eda14cbcSMatt Macy bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 938*eda14cbcSMatt Macy dmu_tx_t *tx) 939*eda14cbcSMatt Macy { 940*eda14cbcSMatt Macy bplist_t *bpl = arg; 941*eda14cbcSMatt Macy bplist_append(bpl, bp); 942*eda14cbcSMatt Macy return (0); 943*eda14cbcSMatt Macy } 944